/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_util/loongson_intrinsics.h"

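/* Accumulates the SAD contribution of two 16-byte source/reference vector
 * pairs: absolute byte differences are summed pair-wise into unsigned
 * halfwords and returned as a single halfword accumulator vector. */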
static INLINE __m128i sad_ub2_uh(__m128i in0, __m128i in1, __m128i ref0,
                                 __m128i ref1) {
  __m128i diff0_m, diff1_m, sad_m0;
  __m128i sad_m = __lsx_vldi(0);

  diff0_m = __lsx_vabsd_bu(in0, ref0);
  diff1_m = __lsx_vabsd_bu(in1, ref1);

  sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m);
  sad_m = __lsx_vadd_h(sad_m, sad_m0);
  sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m);
  sad_m = __lsx_vadd_h(sad_m, sad_m0);

  return sad_m;
}

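/* Horizontally reduces the four unsigned word lanes of a vector to a single
 * 32-bit sum. */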
static INLINE uint32_t hadd_uw_u32(__m128i in) {
  __m128i res0_m;
  uint32_t sum_m;

  res0_m = __lsx_vhaddw_du_wu(in, in);
  res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m);
  sum_m = __lsx_vpickve2gr_w(res0_m, 0);

  return sum_m;
}

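/* Horizontally reduces the eight unsigned halfword lanes of a vector to a
 * single 32-bit sum. */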
static INLINE uint32_t hadd_uh_u32(__m128i in) {
  __m128i res_m;
  uint32_t sum_m;

  res_m = __lsx_vhaddw_wu_hu(in, in);
  sum_m = hadd_uw_u32(res_m);

  return sum_m;
}

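/* Horizontally reduces the four signed word lanes of a vector to a single
 * 32-bit sum. */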
static INLINE int32_t hadd_sw_s32(__m128i in) {
  __m128i res0_m;
  int32_t sum_m;

  res0_m = __lsx_vhaddw_d_w(in, in);
  res0_m = __lsx_vhaddw_q_d(res0_m, res0_m);
  sum_m = __lsx_vpickve2gr_w(res0_m, 0);

  return sum_m;
}

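/* SAD of an 8-pixel-wide block: four rows are loaded per loop iteration and
 * packed in pairs (vpickev_d) into full 16-byte vectors before accumulation. */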
static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t res;
  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, sad_tmp;
  __m128i sad = __lsx_vldi(0);

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
    src += src_stride;
    ref += ref_stride;
    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src1, ref1);
    src += src_stride;
    ref += ref_stride;
    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src2, ref2);
    src += src_stride;
    ref += ref_stride;
    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src3, ref3);
    src += src_stride;
    ref += ref_stride;
    DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2,
              src0, src1, ref0, ref1);
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);
  }
  res = hadd_uh_u32(sad);
  return res;
}

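/* SAD of a 16-pixel-wide block, four rows (two pairs) per loop iteration. */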
static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt = (height >> 2);
  uint32_t res;
  __m128i src0, src1, ref0, ref1, sad_tmp;
  __m128i sad = __lsx_vldi(0);
  int32_t src_stride2 = src_stride << 1;
  int32_t ref_stride2 = ref_stride << 1;

  for (; ht_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1);
    src += src_stride2;
    ref += ref_stride2;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);

    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1);
    src += src_stride2;
    ref += ref_stride2;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);
  }

  res = hadd_uh_u32(sad);
  return res;
}

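/* SAD of a 32-pixel-wide block: each row is two 16-byte loads, four rows per
 * loop iteration. */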
static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt = (height >> 2);
  uint32_t res;
  __m128i src0, src1, ref0, ref1;
  __m128i sad_tmp;
  __m128i sad = __lsx_vldi(0);

  for (; ht_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
    ref += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);

    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
    ref += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);

    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
    ref += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);

    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
    ref += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad = __lsx_vadd_h(sad, sad_tmp);
  }
  res = hadd_uh_u32(sad);
  return res;
}

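/* SAD of a 64-pixel-wide block: each row is four 16-byte loads, two rows per
 * loop iteration, accumulated into two separate halfword vectors. */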
static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt = (height >> 1);
  uint32_t sad = 0;
  __m128i src0, src1, src2, src3;
  __m128i ref0, ref1, ref2, ref3;
  __m128i sad_tmp;
  __m128i sad0 = __lsx_vldi(0);
  __m128i sad1 = sad0;

  for (; ht_cnt--;) {
    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
              ref3);
    ref += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);

    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
              ref3);
    ref += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);
  }

  sad = hadd_uh_u32(sad0);
  sad += hadd_uh_u32(sad1);

  return sad;
}

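/* SAD of one 8-pixel-wide source block against four reference blocks at once
 * (x4d): four rows are processed per loop iteration, with pairs of 8-byte
 * rows packed into full vectors before accumulation. */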
static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  int32_t ht_cnt = (height >> 2);
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  __m128i src0, src1, src2, src3, sad_tmp;
  __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  __m128i ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  __m128i sad0 = __lsx_vldi(0);
  __m128i sad1 = sad0;
  __m128i sad2 = sad0;
  __m128i sad3 = sad0;
  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride2 + src_stride;
  int32_t src_stride4 = src_stride2 << 1;
  int32_t ref_stride2 = ref_stride << 1;
  int32_t ref_stride3 = ref_stride2 + ref_stride;
  int32_t ref_stride4 = ref_stride2 << 1;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (; ht_cnt--;) {
    src0 = __lsx_vld(src_ptr, 0);
    DUP2_ARG2(__lsx_vldx, src_ptr, src_stride, src_ptr, src_stride2, src1,
              src2);
    src3 = __lsx_vldx(src_ptr, src_stride3);
    src_ptr += src_stride4;
    ref0 = __lsx_vld(ref0_ptr, 0);
    DUP2_ARG2(__lsx_vldx, ref0_ptr, ref_stride, ref0_ptr, ref_stride2, ref1,
              ref2);
    ref3 = __lsx_vldx(ref0_ptr, ref_stride3);
    ref0_ptr += ref_stride4;
    ref4 = __lsx_vld(ref1_ptr, 0);
    DUP2_ARG2(__lsx_vldx, ref1_ptr, ref_stride, ref1_ptr, ref_stride2, ref5,
              ref6);
    ref7 = __lsx_vldx(ref1_ptr, ref_stride3);
    ref1_ptr += ref_stride4;
    ref8 = __lsx_vld(ref2_ptr, 0);
    DUP2_ARG2(__lsx_vldx, ref2_ptr, ref_stride, ref2_ptr, ref_stride2, ref9,
              ref10);
    ref11 = __lsx_vldx(ref2_ptr, ref_stride3);
    ref2_ptr += ref_stride4;
    ref12 = __lsx_vld(ref3_ptr, 0);
    DUP2_ARG2(__lsx_vldx, ref3_ptr, ref_stride, ref3_ptr, ref_stride2, ref13,
              ref14);
    ref15 = __lsx_vldx(ref3_ptr, ref_stride3);
    ref3_ptr += ref_stride4;

    DUP2_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, src0, src1);
    DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);

    DUP2_ARG2(__lsx_vpickev_d, ref5, ref4, ref7, ref6, ref0, ref1);
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);

    DUP2_ARG2(__lsx_vpickev_d, ref9, ref8, ref11, ref10, ref0, ref1);
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad2 = __lsx_vadd_h(sad2, sad_tmp);

    DUP2_ARG2(__lsx_vpickev_d, ref13, ref12, ref15, ref14, ref0, ref1);
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad3 = __lsx_vadd_h(sad3, sad_tmp);
  }
  sad_array[0] = hadd_uh_u32(sad0);
  sad_array[1] = hadd_uh_u32(sad1);
  sad_array[2] = hadd_uh_u32(sad2);
  sad_array[3] = hadd_uh_u32(sad3);
}

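/* SAD of one 16-pixel-wide source block against four reference blocks at
 * once; each source row is compared with the matching row of every
 * reference, two rows per loop iteration. */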
static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  int32_t ht_cnt = (height >> 1);
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  __m128i src, ref0, ref1, ref2, ref3, diff, sad_tmp;
  __m128i sad0 = __lsx_vldi(0);
  __m128i sad1 = sad0;
  __m128i sad2 = sad0;
  __m128i sad3 = sad0;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (; ht_cnt--;) {
    src = __lsx_vld(src_ptr, 0);
    src_ptr += src_stride;
    ref0 = __lsx_vld(ref0_ptr, 0);
    ref0_ptr += ref_stride;
    ref1 = __lsx_vld(ref1_ptr, 0);
    ref1_ptr += ref_stride;
    ref2 = __lsx_vld(ref2_ptr, 0);
    ref2_ptr += ref_stride;
    ref3 = __lsx_vld(ref3_ptr, 0);
    ref3_ptr += ref_stride;

    diff = __lsx_vabsd_bu(src, ref0);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    diff = __lsx_vabsd_bu(src, ref1);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);
    diff = __lsx_vabsd_bu(src, ref2);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad2 = __lsx_vadd_h(sad2, sad_tmp);
    diff = __lsx_vabsd_bu(src, ref3);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad3 = __lsx_vadd_h(sad3, sad_tmp);

    src = __lsx_vld(src_ptr, 0);
    src_ptr += src_stride;
    ref0 = __lsx_vld(ref0_ptr, 0);
    ref0_ptr += ref_stride;
    ref1 = __lsx_vld(ref1_ptr, 0);
    ref1_ptr += ref_stride;
    ref2 = __lsx_vld(ref2_ptr, 0);
    ref2_ptr += ref_stride;
    ref3 = __lsx_vld(ref3_ptr, 0);
    ref3_ptr += ref_stride;

    diff = __lsx_vabsd_bu(src, ref0);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    diff = __lsx_vabsd_bu(src, ref1);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);
    diff = __lsx_vabsd_bu(src, ref2);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad2 = __lsx_vadd_h(sad2, sad_tmp);
    diff = __lsx_vabsd_bu(src, ref3);
    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
    sad3 = __lsx_vadd_h(sad3, sad_tmp);
  }
  sad_array[0] = hadd_uh_u32(sad0);
  sad_array[1] = hadd_uh_u32(sad1);
  sad_array[2] = hadd_uh_u32(sad2);
  sad_array[3] = hadd_uh_u32(sad3);
}

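/* SAD of one 32-pixel-wide source block against four reference blocks at
 * once, one row per loop iteration. */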
static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt = height;
  __m128i src0, src1, ref0, ref1, sad_tmp;
  __m128i sad0 = __lsx_vldi(0);
  __m128i sad1 = sad0;
  __m128i sad2 = sad0;
  __m128i sad3 = sad0;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (; ht_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
    src += src_stride;

    DUP2_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);

    DUP2_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);

    DUP2_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad2 = __lsx_vadd_h(sad2, sad_tmp);

    DUP2_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad3 = __lsx_vadd_h(sad3, sad_tmp);
  }
  sad_array[0] = hadd_uh_u32(sad0);
  sad_array[1] = hadd_uh_u32(sad1);
  sad_array[2] = hadd_uh_u32(sad2);
  sad_array[3] = hadd_uh_u32(sad3);
}

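/* SAD of one 64-pixel-wide source block against four reference blocks at
 * once, one row per loop iteration; each reference keeps two halfword
 * accumulators that are widened to words before the final reduction. */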
static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt = height;
  __m128i src0, src1, src2, src3;
  __m128i ref0, ref1, ref2, ref3;
  __m128i sad, sad_tmp;

  __m128i sad0_0 = __lsx_vldi(0);
  __m128i sad0_1 = sad0_0;
  __m128i sad1_0 = sad0_0;
  __m128i sad1_1 = sad0_0;
  __m128i sad2_0 = sad0_0;
  __m128i sad2_1 = sad0_0;
  __m128i sad3_0 = sad0_0;
  __m128i sad3_1 = sad0_0;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (; ht_cnt--;) {
    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;

    DUP4_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0_ptr, 32, ref0_ptr, 48,
              ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad0_0 = __lsx_vadd_h(sad0_0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
    sad0_1 = __lsx_vadd_h(sad0_1, sad_tmp);

    DUP4_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref1_ptr, 32, ref1_ptr, 48,
              ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad1_0 = __lsx_vadd_h(sad1_0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
    sad1_1 = __lsx_vadd_h(sad1_1, sad_tmp);

    DUP4_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref2_ptr, 32, ref2_ptr, 48,
              ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad2_0 = __lsx_vadd_h(sad2_0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
    sad2_1 = __lsx_vadd_h(sad2_1, sad_tmp);

    DUP4_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref3_ptr, 32, ref3_ptr, 48,
              ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
    sad3_0 = __lsx_vadd_h(sad3_0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
    sad3_1 = __lsx_vadd_h(sad3_1, sad_tmp);
  }
  sad = __lsx_vhaddw_wu_hu(sad0_0, sad0_0);
  sad_tmp = __lsx_vhaddw_wu_hu(sad0_1, sad0_1);
  sad = __lsx_vadd_w(sad, sad_tmp);
  sad_array[0] = hadd_uw_u32(sad);

  sad = __lsx_vhaddw_wu_hu(sad1_0, sad1_0);
  sad_tmp = __lsx_vhaddw_wu_hu(sad1_1, sad1_1);
  sad = __lsx_vadd_w(sad, sad_tmp);
  sad_array[1] = hadd_uw_u32(sad);

  sad = __lsx_vhaddw_wu_hu(sad2_0, sad2_0);
  sad_tmp = __lsx_vhaddw_wu_hu(sad2_1, sad2_1);
  sad = __lsx_vadd_w(sad, sad_tmp);
  sad_array[2] = hadd_uw_u32(sad);

  sad = __lsx_vhaddw_wu_hu(sad3_0, sad3_0);
  sad_tmp = __lsx_vhaddw_wu_hu(sad3_1, sad3_1);
  sad = __lsx_vadd_w(sad, sad_tmp);
  sad_array[3] = hadd_uw_u32(sad);
}

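/* SAD of a 32-pixel-wide block against the rounded average of the reference
 * and a second predictor (sec_pred), four rows per loop iteration. */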
static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t res, ht_cnt = (height >> 2);
  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
  __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  __m128i comp0, comp1, sad_tmp;
  __m128i sad = __lsx_vldi(0);
  uint8_t *src_tmp, *ref_tmp;
  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride2 + src_stride;
  int32_t src_stride4 = src_stride2 << 1;
  int32_t ref_stride2 = ref_stride << 1;
  int32_t ref_stride3 = ref_stride2 + ref_stride;
  int32_t ref_stride4 = ref_stride2 << 1;

  for (; ht_cnt--;) {
    src_tmp = (uint8_t *)src + 16;
    src0 = __lsx_vld(src, 0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
    src6 = __lsx_vldx(src, src_stride3);
    src1 = __lsx_vld(src_tmp, 0);
    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src3,
              src5);
    src7 = __lsx_vldx(src_tmp, src_stride3);
    src += src_stride4;

    ref_tmp = (uint8_t *)ref + 16;
    ref0 = __lsx_vld(ref, 0);
    DUP2_ARG2(__lsx_vldx, ref, ref_stride, ref, ref_stride2, ref2, ref4);
    ref6 = __lsx_vldx(ref, ref_stride3);
    ref1 = __lsx_vld(ref_tmp, 0);
    DUP2_ARG2(__lsx_vldx, ref_tmp, ref_stride, ref_tmp, ref_stride2, ref3,
              ref5);
    ref7 = __lsx_vldx(ref_tmp, ref_stride3);
    ref += ref_stride4;

    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 32, sec_pred, 64, sec_pred, 96,
              pred0, pred2, pred4, pred6);
    DUP4_ARG2(__lsx_vld, sec_pred, 16, sec_pred, 48, sec_pred, 80, sec_pred,
              112, pred1, pred3, pred5, pred7);
    sec_pred += 128;

    DUP2_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, comp0, comp1);
    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
    sad = __lsx_vadd_h(sad, sad_tmp);
    DUP2_ARG2(__lsx_vavgr_bu, pred2, ref2, pred3, ref3, comp0, comp1);
    sad_tmp = sad_ub2_uh(src2, src3, comp0, comp1);
    sad = __lsx_vadd_h(sad, sad_tmp);
    DUP2_ARG2(__lsx_vavgr_bu, pred4, ref4, pred5, ref5, comp0, comp1);
    sad_tmp = sad_ub2_uh(src4, src5, comp0, comp1);
    sad = __lsx_vadd_h(sad, sad_tmp);
    DUP2_ARG2(__lsx_vavgr_bu, pred6, ref6, pred7, ref7, comp0, comp1);
    sad_tmp = sad_ub2_uh(src6, src7, comp0, comp1);
    sad = __lsx_vadd_h(sad, sad_tmp);
  }
  res = hadd_uh_u32(sad);
  return res;
}

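/* SAD of a 64-pixel-wide block against the rounded average of the reference
 * and a second predictor (sec_pred), four rows per loop iteration. */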
static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t res, ht_cnt = (height >> 2);
  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  __m128i comp0, comp1, comp2, comp3, pred0, pred1, pred2, pred3;
  __m128i sad, sad_tmp;
  __m128i sad0 = __lsx_vldi(0);
  __m128i sad1 = sad0;

  for (; ht_cnt--;) {
    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
              ref3);
    ref += ref_stride;
    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
              pred0, pred1, pred2, pred3);
    sec_pred += 64;
    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
              ref3, comp0, comp1, comp2, comp3);
    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);

    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
              ref3);
    ref += ref_stride;
    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
              pred0, pred1, pred2, pred3);
    sec_pred += 64;
    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
              ref3, comp0, comp1, comp2, comp3);
    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);

    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
              ref3);
    ref += ref_stride;
    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
              pred0, pred1, pred2, pred3);
    sec_pred += 64;
    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
              ref3, comp0, comp1, comp2, comp3);
    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);

    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
              ref3);
    ref += ref_stride;
    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
              pred0, pred1, pred2, pred3);
    sec_pred += 64;
    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
              ref3, comp0, comp1, comp2, comp3);
    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
    sad0 = __lsx_vadd_h(sad0, sad_tmp);
    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
    sad1 = __lsx_vadd_h(sad1, sad_tmp);
  }
  sad = __lsx_vhaddw_wu_hu(sad0, sad0);
  sad_tmp = __lsx_vhaddw_wu_hu(sad1, sad1);
  sad = __lsx_vadd_w(sad, sad_tmp);

  res = hadd_sw_s32(sad);
  return res;
}

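/* Thin wrappers that instantiate the kernels above as the
 * vpx_sad<w>x<h>_lsx, vpx_sad<w>x<h>x4d_lsx and vpx_sad<w>x<h>_avg_lsx
 * functions used by the run-time CPU dispatch (vpx_dsp_rtcd). */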
#define VPX_SAD_8xHT_LSX(height)                                              \
  uint32_t vpx_sad8x##height##_lsx(const uint8_t *src, int32_t src_stride,    \
                                   const uint8_t *ref, int32_t ref_stride) {  \
    return sad_8width_lsx(src, src_stride, ref, ref_stride, height);          \
  }

#define VPX_SAD_16xHT_LSX(height)                                             \
  uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_16width_lsx(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_32xHT_LSX(height)                                             \
  uint32_t vpx_sad32x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_32width_lsx(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_64xHT_LSX(height)                                             \
  uint32_t vpx_sad64x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_64width_lsx(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_8xHTx4D_LSX(height)                                       \
  void vpx_sad8x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[4],           \
                                  int32_t ref_stride, uint32_t sads[4]) { \
    sad_8width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_16xHTx4D_LSX(height)                                       \
  void vpx_sad16x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_16width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_32xHTx4D_LSX(height)                                       \
  void vpx_sad32x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_32width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_64xHTx4D_LSX(height)                                       \
  void vpx_sad64x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_64width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_AVGSAD_32xHT_LSX(height)                                      \
  uint32_t vpx_sad32x##height##_avg_lsx(                                  \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,         \
      int32_t ref_stride, const uint8_t *second_pred) {                   \
    return avgsad_32width_lsx(src, src_stride, ref, ref_stride, height,   \
                              second_pred);                               \
  }

#define VPX_AVGSAD_64xHT_LSX(height)                                      \
  uint32_t vpx_sad64x##height##_avg_lsx(                                  \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,         \
      int32_t ref_stride, const uint8_t *second_pred) {                   \
    return avgsad_64width_lsx(src, src_stride, ref, ref_stride, height,   \
                              second_pred);                               \
  }

#define SAD64                                                              \
  VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_SAD_64xHTx4D_LSX(32) \
      VPX_AVGSAD_64xHT_LSX(64)

SAD64

#define SAD32                                                              \
  VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_SAD_32xHTx4D_LSX(64) \
      VPX_AVGSAD_32xHT_LSX(32)

SAD32

#define SAD16 VPX_SAD_16xHT_LSX(16) VPX_SAD_16xHTx4D_LSX(16)

SAD16

#define SAD8 VPX_SAD_8xHT_LSX(8) VPX_SAD_8xHTx4D_LSX(8)

SAD8

#undef SAD64
#undef SAD32
#undef SAD16
#undef SAD8