/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>  // AVX2

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

// Reduce the four 32-bit accumulator vectors to four scalar SADs and store
// them in sad_array.
static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
                                          uint32_t sad_array[4]) {
  const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
  const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
  const __m256i t2 = _mm256_hadd_epi32(t0, t1);
  const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
                                    _mm256_extractf128_si256(t2, 1));
  _mm_storeu_si128((__m128i *)sad_array, sum);
}

// Accumulate 16-bit SADs of 64-pixel-wide rows: for each of the four
// references, sum |ref - src| over 'height' rows into sums_16[0..3].
static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/,
                                               const uint16_t *src,
                                               int src_stride,
                                               uint16_t *refs[4],
                                               int ref_stride, int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // load src and all ref[]
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
    int x;

    for (x = 0; x < 4; ++x) {
      __m256i r[4];
      r[0] = _mm256_loadu_si256((const __m256i *)refs[x]);
      r[1] = _mm256_loadu_si256((const __m256i *)(refs[x] + 16));
      r[2] = _mm256_loadu_si256((const __m256i *)(refs[x] + 32));
      r[3] = _mm256_loadu_si256((const __m256i *)(refs[x] + 48));

      // absolute differences between every ref[] to src
      r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s0));
      r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s1));
      r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s2));
      r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s3));

      // sum every abs diff
      sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[0], r[1]));
      sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[2], r[3]));
    }

    src += src_stride;
    refs[0] += ref_stride;
    refs[1] += ref_stride;
    refs[2] += ref_stride;
    refs[3] += ref_stride;
  }
}

#define HIGHBD_SAD64XNX4D(n)                                                   \
  void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride,  \
                                      const uint8_t *const ref_array[4],       \
                                      int ref_stride, uint32_t sad_array[4]) { \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                        \
    uint16_t *refs[4];                                                         \
    __m256i sums_16[4];                                                        \
    __m256i sums_32[4];                                                        \
    int i;                                                                     \
                                                                               \
    refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);                               \
    refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);                               \
    refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);                               \
    refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);                               \
    sums_32[0] = _mm256_setzero_si256();                                       \
    sums_32[1] = _mm256_setzero_si256();                                       \
    sums_32[2] = _mm256_setzero_si256();                                       \
    sums_32[3] = _mm256_setzero_si256();                                       \
                                                                               \
    for (i = 0; i < (n / 2); ++i) {                                            \
      sums_16[0] = _mm256_setzero_si256();                                     \
      sums_16[1] = _mm256_setzero_si256();                                     \
      sums_16[2] = _mm256_setzero_si256();                                     \
      sums_16[3] = _mm256_setzero_si256();                                     \
                                                                               \
      highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2);        \
                                                                               \
      /* sums_16 can overflow with more rows, so add the current sums_16 to   \
       * sums_32. */                                                           \
      sums_32[0] = _mm256_add_epi32(                                           \
          sums_32[0],                                                          \
          _mm256_add_epi32(                                                    \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),       \
              _mm256_cvtepu16_epi32(                                           \
                  _mm256_extractf128_si256(sums_16[0], 1))));                  \
      sums_32[1] = _mm256_add_epi32(                                           \
          sums_32[1],                                                          \
          _mm256_add_epi32(                                                    \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),       \
              _mm256_cvtepu16_epi32(                                           \
                  _mm256_extractf128_si256(sums_16[1], 1))));                  \
      sums_32[2] = _mm256_add_epi32(                                           \
          sums_32[2],                                                          \
          _mm256_add_epi32(                                                    \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),       \
              _mm256_cvtepu16_epi32(                                           \
                  _mm256_extractf128_si256(sums_16[2], 1))));                  \
      sums_32[3] = _mm256_add_epi32(                                           \
          sums_32[3],                                                          \
          _mm256_add_epi32(                                                    \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),       \
              _mm256_cvtepu16_epi32(                                           \
                  _mm256_extractf128_si256(sums_16[3], 1))));                  \
                                                                               \
      src += src_stride << 1;                                                  \
    }                                                                          \
    calc_final_4(sums_32, sad_array);                                          \
  }

// 64x64
HIGHBD_SAD64XNX4D(64)

// 64x32
HIGHBD_SAD64XNX4D(32)
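
// Accumulate 16-bit SADs of 32-pixel-wide rows: for each of the four
// references, sum |ref - src| over 'height' rows into sums_16[0..3].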
static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/,
                                               const uint16_t *src,
                                               int src_stride,
                                               uint16_t *refs[4],
                                               int ref_stride, int height) {
  int i;
  for (i = 0; i < height; i++) {
    __m256i r[8];

    // load src and all ref[]
    const __m256i s = _mm256_load_si256((const __m256i *)src);
    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 16));
    r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
    r[1] = _mm256_loadu_si256((const __m256i *)(refs[0] + 16));
    r[2] = _mm256_loadu_si256((const __m256i *)refs[1]);
    r[3] = _mm256_loadu_si256((const __m256i *)(refs[1] + 16));
    r[4] = _mm256_loadu_si256((const __m256i *)refs[2]);
    r[5] = _mm256_loadu_si256((const __m256i *)(refs[2] + 16));
    r[6] = _mm256_loadu_si256((const __m256i *)refs[3]);
    r[7] = _mm256_loadu_si256((const __m256i *)(refs[3] + 16));

    // absolute differences between every ref[] to src
    r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
    r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s2));
    r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
    r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s2));
    r[4] = _mm256_abs_epi16(_mm256_sub_epi16(r[4], s));
    r[5] = _mm256_abs_epi16(_mm256_sub_epi16(r[5], s2));
    r[6] = _mm256_abs_epi16(_mm256_sub_epi16(r[6], s));
    r[7] = _mm256_abs_epi16(_mm256_sub_epi16(r[7], s2));

    // sum every abs diff
    sums_16[0] = _mm256_add_epi16(sums_16[0], _mm256_add_epi16(r[0], r[1]));
    sums_16[1] = _mm256_add_epi16(sums_16[1], _mm256_add_epi16(r[2], r[3]));
    sums_16[2] = _mm256_add_epi16(sums_16[2], _mm256_add_epi16(r[4], r[5]));
    sums_16[3] = _mm256_add_epi16(sums_16[3], _mm256_add_epi16(r[6], r[7]));

    src += src_stride;
    refs[0] += ref_stride;
    refs[1] += ref_stride;
    refs[2] += ref_stride;
    refs[3] += ref_stride;
  }
}

#define HIGHBD_SAD32XNX4D(n)                                                   \
  void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride,  \
                                      const uint8_t *const ref_array[4],       \
                                      int ref_stride, uint32_t sad_array[4]) { \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                        \
    uint16_t *refs[4];                                                         \
    __m256i sums_16[4];                                                        \
    __m256i sums_32[4];                                                        \
    int i;                                                                     \
                                                                               \
    refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);                               \
    refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);                               \
    refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);                               \
    refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);                               \
    sums_32[0] = _mm256_setzero_si256();                                       \
    sums_32[1] = _mm256_setzero_si256();                                       \
    sums_32[2] = _mm256_setzero_si256();                                       \
    sums_32[3] = _mm256_setzero_si256();                                       \
                                                                               \
    for (i = 0; i < (n / 8); ++i) {                                            \
      sums_16[0] = _mm256_setzero_si256();                                     \
      sums_16[1] = _mm256_setzero_si256();                                     \
      sums_16[2] = _mm256_setzero_si256();                                     \
      sums_16[3] = _mm256_setzero_si256();                                     \
                                                                               \
      highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);        \
                                                                               \
      /* sums_16 can overflow after 8 rows, so add the current sums_16 to     \
       * sums_32. */                                                           \
      sums_32[0] = _mm256_add_epi32(                                           \
          sums_32[0],                                                          \
          _mm256_add_epi32(                                                    \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),       \
              _mm256_cvtepu16_epi32(                                           \
                  _mm256_extractf128_si256(sums_16[0], 1))));                  \
      sums_32[1] = _mm256_add_epi32(                                           \
          sums_32[1],                                                          \
          _mm256_add_epi32(                                                    \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),       \
              _mm256_cvtepu16_epi32(                                           \
                  _mm256_extractf128_si256(sums_16[1], 1))));                  \
      sums_32[2] = _mm256_add_epi32(                                           \
          sums_32[2],                                                          \
          _mm256_add_epi32(                                                    \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),       \
              _mm256_cvtepu16_epi32(                                           \
                  _mm256_extractf128_si256(sums_16[2], 1))));                  \
      sums_32[3] = _mm256_add_epi32(                                           \
          sums_32[3],                                                          \
          _mm256_add_epi32(                                                    \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),       \
              _mm256_cvtepu16_epi32(                                           \
                  _mm256_extractf128_si256(sums_16[3], 1))));                  \
                                                                               \
      src += src_stride << 3;                                                  \
    }                                                                          \
    calc_final_4(sums_32, sad_array);                                          \
  }

// 32x64
HIGHBD_SAD32XNX4D(64)

// 32x32
HIGHBD_SAD32XNX4D(32)

// 32x16
HIGHBD_SAD32XNX4D(16)
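
// Accumulate 16-bit SADs of 16-pixel-wide rows into sums_16[0..3], one
// accumulator per reference.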
static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/,
                                               const uint16_t *src,
                                               int src_stride,
                                               uint16_t *refs[4],
                                               int ref_stride, int height) {
  int i;
  for (i = 0; i < height; i++) {
    __m256i r[4];

    // load src and all ref[]
    const __m256i s = _mm256_load_si256((const __m256i *)src);
    r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
    r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
    r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
    r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);

    // absolute differences between every ref[] to src
    r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
    r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s));
    r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
    r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s));

    // sum every abs diff
    sums_16[0] = _mm256_add_epi16(sums_16[0], r[0]);
    sums_16[1] = _mm256_add_epi16(sums_16[1], r[1]);
    sums_16[2] = _mm256_add_epi16(sums_16[2], r[2]);
    sums_16[3] = _mm256_add_epi16(sums_16[3], r[3]);

    src += src_stride;
    refs[0] += ref_stride;
    refs[1] += ref_stride;
    refs[2] += ref_stride;
    refs[3] += ref_stride;
  }
}

void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
                                 const uint8_t *const ref_array[4],
                                 int ref_stride, uint32_t sad_array[4]) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *refs[4];
  __m256i sums_16[4];
  __m256i sums_32[4];
  int i;

  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
  sums_32[0] = _mm256_setzero_si256();
  sums_32[1] = _mm256_setzero_si256();
  sums_32[2] = _mm256_setzero_si256();
  sums_32[3] = _mm256_setzero_si256();

  for (i = 0; i < 2; ++i) {
    sums_16[0] = _mm256_setzero_si256();
    sums_16[1] = _mm256_setzero_si256();
    sums_16[2] = _mm256_setzero_si256();
    sums_16[3] = _mm256_setzero_si256();

    highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16);

    // sums_16 can overflow after 16 rows, so add the current sums_16 to
    // sums_32.
    sums_32[0] = _mm256_add_epi32(
        sums_32[0],
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
    sums_32[1] = _mm256_add_epi32(
        sums_32[1],
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
    sums_32[2] = _mm256_add_epi32(
        sums_32[2],
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
    sums_32[3] = _mm256_add_epi32(
        sums_32[3],
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));

    src += src_stride << 4;
  }
  calc_final_4(sums_32, sad_array);
}
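
// 16x16 and 16x8: the 16-bit partial sums do not overflow at these heights,
// so the whole block is accumulated in one pass and widened to 32 bits once
// at the end.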
void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride,
                                 const uint8_t *const ref_array[4],
                                 int ref_stride, uint32_t sad_array[4]) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *refs[4];
  __m256i sums_16[4];

  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
  sums_16[0] = _mm256_setzero_si256();
  sums_16[1] = _mm256_setzero_si256();
  sums_16[2] = _mm256_setzero_si256();
  sums_16[3] = _mm256_setzero_si256();

  highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16);

  {
    __m256i sums_32[4];
    sums_32[0] = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
    sums_32[1] = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
    sums_32[2] = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
    sums_32[3] = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
    calc_final_4(sums_32, sad_array);
  }
}

void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride,
                                const uint8_t *const ref_array[4],
                                int ref_stride, uint32_t sad_array[4]) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *refs[4];
  __m256i sums_16[4];

  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
  sums_16[0] = _mm256_setzero_si256();
  sums_16[1] = _mm256_setzero_si256();
  sums_16[2] = _mm256_setzero_si256();
  sums_16[3] = _mm256_setzero_si256();

  highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);

  {
    __m256i sums_32[4];
    sums_32[0] = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
    sums_32[1] = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
    sums_32[2] = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
    sums_32[3] = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
    calc_final_4(sums_32, sad_array);
  }
}