/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

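// High bit depth SAD, accumulated in 16-bit lanes and widened to 32 bits
// before the lanes can overflow. With 12-bit input the largest per-pixel
// difference is 4095, so a 16-bit lane can hold at most 16 such terms
// (16 * 4095 = 65520 <= 65535); each kernel below folds sums_16 into
// sums_32 within that budget.
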
static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) {
  // Fold the eight 32-bit lane sums: first within each 128-bit half, then
  // across the two halves.
  const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8));
  const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4));
  const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1),
                                    _mm256_extractf128_si256(t1, 1));
  return (unsigned int)_mm_cvtsi128_si32(sum);
}

static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16,
                                            const uint16_t *src, int src_stride,
                                            uint16_t *ref, int ref_stride,
                                            int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // load a 64-pixel row of src (aligned) and ref (unaligned)
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
    const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
    // absolute differences between ref and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
    const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2));
    const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3));
    // accumulate the absolute differences in 16-bit lanes
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));

    src += src_stride;
    ref += ref_stride;
  }
}

#define HIGHBD_SAD64XN(n)                                                    \
  unsigned int vpx_highbd_sad64x##n##_avx2(                                  \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
      int ref_stride) {                                                      \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                      \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                            \
    __m256i sums_32 = _mm256_setzero_si256();                                \
    int i;                                                                   \
                                                                             \
    for (i = 0; i < (n / 2); ++i) {                                          \
      __m256i sums_16 = _mm256_setzero_si256();                              \
                                                                             \
      highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2);         \
                                                                             \
      /* sums_16 could overflow with more rows, so fold the current 2-row   \
       * total into sums_32 */                                               \
      sums_32 = _mm256_add_epi32(                                            \
          sums_32,                                                           \
          _mm256_add_epi32(                                                  \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),        \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
                                                                             \
      src += src_stride << 1;                                                \
      ref += ref_stride << 1;                                                \
    }                                                                        \
    return calc_final(sums_32);                                              \
  }
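
// Lane budget for the 64-wide kernels: a ymm register holds 16 of the 64
// pixels in a row, so each 16-bit lane of sums_16 gains 4 difference terms
// per row. Two rows add at most 8 * 4095 = 32760 per lane, well inside
// uint16_t range, hence the flush into sums_32 every 2 rows above.
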
// 64x64
HIGHBD_SAD64XN(64)

// 64x32
HIGHBD_SAD64XN(32)

static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16,
                                            const uint16_t *src, int src_stride,
                                            uint16_t *ref, int ref_stride,
                                            int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // load a 32-pixel row of src (aligned) and ref (unaligned)
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    // absolute differences between ref and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
    // accumulate the absolute differences in 16-bit lanes
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);

    src += src_stride;
    ref += ref_stride;
  }
}

#define HIGHBD_SAD32XN(n)                                                    \
  unsigned int vpx_highbd_sad32x##n##_avx2(                                  \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
      int ref_stride) {                                                      \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                      \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                            \
    __m256i sums_32 = _mm256_setzero_si256();                                \
    int i;                                                                   \
                                                                             \
    for (i = 0; i < (n / 8); ++i) {                                          \
      __m256i sums_16 = _mm256_setzero_si256();                              \
                                                                             \
      highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8);         \
                                                                             \
      /* sums_16 could overflow with more rows, so fold the current 8-row   \
       * total into sums_32 */                                               \
      sums_32 = _mm256_add_epi32(                                            \
          sums_32,                                                           \
          _mm256_add_epi32(                                                  \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),        \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
                                                                             \
      src += src_stride << 3;                                                \
      ref += ref_stride << 3;                                                \
    }                                                                        \
    return calc_final(sums_32);                                              \
  }
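
// Lane budget for the 32-wide kernels: two ymm registers cover a row, so
// each 16-bit lane of sums_16 gains 2 difference terms per row. Eight rows
// add at most 16 * 4095 = 65520 per lane, just under the uint16_t limit,
// hence the flush into sums_32 every 8 rows above.
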
// 32x64
HIGHBD_SAD32XN(64)

// 32x32
HIGHBD_SAD32XN(32)

// 32x16
HIGHBD_SAD32XN(16)

static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16,
                                            const uint16_t *src, int src_stride,
                                            uint16_t *ref, int ref_stride,
                                            int height) {
  int i;
  for (i = 0; i < height; i += 2) {
    // load two 16-pixel rows of src (aligned) and ref (unaligned)
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
    // absolute differences between ref and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
    // accumulate the absolute differences in 16-bit lanes
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);

    src += src_stride << 1;
    ref += ref_stride << 1;
  }
}
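
// In the 16-wide case one ymm register covers a whole row, so the kernel
// above consumes two rows per iteration and each 16-bit lane gains one
// difference term per row; 16 rows (16 * 4095 = 65520) are the most that
// fit in a lane, so the callers below widen sums_16 at least every 16 rows.
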
unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *ref_ptr, int ref_stride) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i sums_32 = _mm256_setzero_si256();
  int i;

  for (i = 0; i < 2; ++i) {
    __m256i sums_16 = _mm256_setzero_si256();

    highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);

    // sums_16 could overflow with more rows, so fold the current 16-row
    // total into sums_32
    sums_32 = _mm256_add_epi32(
        sums_32,
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));

    src += src_stride << 4;
    ref += ref_stride << 4;
  }
  return calc_final(sums_32);
}

unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *ref_ptr, int ref_stride) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);

  {
    // 16 rows fit in the 16-bit lanes, so widen to 32 bits only once here
    const __m256i sums_32 = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
    return calc_final(sums_32);
  }
}

unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride,
                                     const uint8_t *ref_ptr, int ref_stride) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8);

  {
    const __m256i sums_32 = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
    return calc_final(sums_32);
  }
}

// AVG -------------------------------------------------------------------------
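// The _avg variants average ref with a second predictor before taking the
// SAD against src. _mm256_avg_epu16 computes (a + b + 1) >> 1 per lane,
// the same rounding as ROUND_POWER_OF_TWO(a + b, 1) in the scalar
// reference code.
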
static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16,
                                                const uint16_t *src,
                                                int src_stride, uint16_t *ref,
                                                int ref_stride, uint16_t *sec,
                                                int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // load a 64-pixel row of src (aligned), ref, and the second predictor
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
    const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
    const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32));
    const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48));
    // average ref with the second predictor, rounding to nearest
    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
    const __m256i avg2 = _mm256_avg_epu16(r2, x2);
    const __m256i avg3 = _mm256_avg_epu16(r3, x3);
    // absolute differences between each ref/pred average and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
    const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2));
    const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(avg3, s3));
    // accumulate the absolute differences in 16-bit lanes
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
    *sums_16 =
        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));

    src += src_stride;
    ref += ref_stride;
    sec += 64;
  }
}

#define HIGHBD_SAD64XN_AVG(n)                                                 \
  unsigned int vpx_highbd_sad64x##n##_avg_avx2(                               \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                       \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                             \
    uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);                         \
    __m256i sums_32 = _mm256_setzero_si256();                                 \
    int i;                                                                    \
                                                                              \
    for (i = 0; i < (n / 2); ++i) {                                           \
      __m256i sums_16 = _mm256_setzero_si256();                               \
                                                                              \
      highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \
                                                                              \
      /* sums_16 could overflow with more rows, so fold the current 2-row    \
       * total into sums_32 */                                                \
      sums_32 = _mm256_add_epi32(                                             \
          sums_32,                                                            \
          _mm256_add_epi32(                                                   \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),         \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));  \
                                                                              \
      src += src_stride << 1;                                                 \
      ref += ref_stride << 1;                                                 \
      sec += 64 << 1;                                                         \
    }                                                                         \
    return calc_final(sums_32);                                               \
  }
// 64x64
HIGHBD_SAD64XN_AVG(64)

// 64x32
HIGHBD_SAD64XN_AVG(32)

static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16,
                                                const uint16_t *src,
                                                int src_stride, uint16_t *ref,
                                                int ref_stride, uint16_t *sec,
                                                int height) {
  int i;
  for (i = 0; i < height; ++i) {
    // load a 32-pixel row of src (aligned), ref, and the second predictor
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
    // average ref with the second predictor, rounding to nearest
    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
    // absolute differences between each ref/pred average and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
    // accumulate the absolute differences in 16-bit lanes
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);

    src += src_stride;
    ref += ref_stride;
    sec += 32;
  }
}

#define HIGHBD_SAD32XN_AVG(n)                                                 \
  unsigned int vpx_highbd_sad32x##n##_avg_avx2(                               \
      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
      int ref_stride, const uint8_t *second_pred) {                           \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                       \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                             \
    uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);                         \
    __m256i sums_32 = _mm256_setzero_si256();                                 \
    int i;                                                                    \
                                                                              \
    for (i = 0; i < (n / 8); ++i) {                                           \
      __m256i sums_16 = _mm256_setzero_si256();                               \
                                                                              \
      highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \
                                                                              \
      /* sums_16 could overflow with more rows, so fold the current 8-row    \
       * total into sums_32 */                                                \
      sums_32 = _mm256_add_epi32(                                             \
          sums_32,                                                            \
          _mm256_add_epi32(                                                   \
              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),         \
              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));  \
                                                                              \
      src += src_stride << 3;                                                 \
      ref += ref_stride << 3;                                                 \
      sec += 32 << 3;                                                         \
    }                                                                         \
    return calc_final(sums_32);                                               \
  }
// 32x64
HIGHBD_SAD32XN_AVG(64)

// 32x32
HIGHBD_SAD32XN_AVG(32)

// 32x16
HIGHBD_SAD32XN_AVG(16)

static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16,
                                                const uint16_t *src,
                                                int src_stride, uint16_t *ref,
                                                int ref_stride, uint16_t *sec,
                                                int height) {
  int i;
  for (i = 0; i < height; i += 2) {
    // load two 16-pixel rows of src (aligned), ref, and the second predictor
    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
    // average ref with the second predictor, rounding to nearest
    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
    // absolute differences between each ref/pred average and src
    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
    // accumulate the absolute differences in 16-bit lanes
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);

    src += src_stride << 1;
    ref += ref_stride << 1;
    sec += 32;
  }
}

unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride,
                                          const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
  __m256i sums_32 = _mm256_setzero_si256();
  int i;

  for (i = 0; i < 2; ++i) {
    __m256i sums_16 = _mm256_setzero_si256();

    highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);

    // sums_16 could overflow with more rows, so fold the current 16-row
    // total into sums_32
    sums_32 = _mm256_add_epi32(
        sums_32,
        _mm256_add_epi32(
            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));

    src += src_stride << 4;
    ref += ref_stride << 4;
    sec += 16 << 4;
  }
  return calc_final(sums_32);
}

unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride,
                                          const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);

  {
    const __m256i sums_32 = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
    return calc_final(sums_32);
  }
}

unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         const uint8_t *second_pred) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
  __m256i sums_16 = _mm256_setzero_si256();

  highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8);

  {
    const __m256i sums_32 = _mm256_add_epi32(
        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
    return calc_final(sums_32);
  }
}
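
// Illustrative direct call (hypothetical buffers; in-tree these kernels are
// reached through the vpx_dsp_rtcd dispatch table). The uint8_t pointers
// are CONVERT_TO_BYTEPTR-wrapped 16-bit buffers, and src must be 32-byte
// aligned because the kernels use aligned loads on it:
//
//   DECLARE_ALIGNED(32, uint16_t, src16[16 * 16]);
//   DECLARE_ALIGNED(32, uint16_t, ref16[16 * 16]);
//   const unsigned int sad = vpx_highbd_sad16x16_avx2(
//       CONVERT_TO_BYTEPTR(src16), 16, CONVERT_TO_BYTEPTR(ref16), 16);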