/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/*!
 **************************************************************************
 * \file ih264d_resamp_svc.c
 *
 * \brief
 *    Contains SSE4.2 intrinsic routines for SVC intra resampling
 *
 **************************************************************************
 */
#include <immintrin.h>

#include "ih264_typedefs.h"
#include "ih264_debug.h"
#include "isvc_intra_resample.h"
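
/* Dyadic (2x) luma upsampling: the collocated reference array (stride
 * DYADIC_REF_W_Y) is interpolated to a 16x16 luma block in two separable
 * passes. The vertical pass filters into a 16-bit temporary buffer of
 * 16 rows x 12 columns; the horizontal pass filters across each temporary
 * row, rounds and packs to 8 bits. Both passes alternate between the 4-tap
 * phase-4 filter {-3, 28, 8, -1} and the phase-12 filter {-1, 8, 28, -3};
 * the taps 8 and -1 are realized with a shift and a subtract, so only -3
 * and 28 need multiplier registers. */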
void isvc_interpolate_base_luma_dyadic_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                             UWORD8 *pu1_out_buf, WORD32 i4_out_stride)
{
    WORD32 i4_y;
    WORD32 i4_filt_stride, i4_src_stride;
    UWORD8 *pu1_inp, *pu1_out;
    WORD16 *pi2_tmp;

    __m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3;
    __m128i i4_samp_8x16b_0, i4_samp_8x16b_1, i4_samp_8x16b_2, i4_samp_8x16b_3;
    __m128i i4_res_8x16b_r1_1, i4_res_8x16b_r1_2, i4_res_8x16b_r1_3;
    __m128i i4_res_8x16b_r2_1, i4_res_8x16b_r2_2, i4_res_8x16b_r2_3;

    /* Filter coefficient values for phase 4 */
    __m128i i4_coeff_8x16b_0 = _mm_set1_epi16(-3);
    __m128i i4_coeff_8x16b_1 = _mm_set1_epi16(28);

    i4_filt_stride = 12;
    i4_src_stride = DYADIC_REF_W_Y;

    /* Initializing pointers */
    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    pu1_out = pu1_out_buf;

    /* Vertical interpolation */
    /* First 8 columns (64 bits of input per row) */
    /* y = 0, y_phase = 12 */
    i4_samp_16x8b_0 = _mm_loadl_epi64((__m128i *) (pu1_inp));
    i4_samp_16x8b_1 = _mm_loadl_epi64((__m128i *) (pu1_inp + i4_src_stride));
    i4_samp_16x8b_2 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1)));
    i4_samp_16x8b_3 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
    pu1_inp += (i4_src_stride << 2);

    i4_samp_8x16b_0 = _mm_cvtepu8_epi16(i4_samp_16x8b_0);
    i4_samp_8x16b_1 = _mm_cvtepu8_epi16(i4_samp_16x8b_1);
    i4_samp_8x16b_2 = _mm_cvtepu8_epi16(i4_samp_16x8b_2);
    i4_samp_8x16b_3 = _mm_cvtepu8_epi16(i4_samp_16x8b_3);

    /* y_phase is 12 for y = 0 */
    /* Multiply by 8 => left shift by 3 */
    i4_res_8x16b_r1_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
    i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
    i4_res_8x16b_r1_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
    i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_0);
    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

    _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
    pi2_tmp += i4_filt_stride;

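    /* Each iteration below produces two vertically filtered rows. Written as
     * scalar arithmetic, with s0..s3 denoting four consecutive input rows:
     *   odd y  (phase  4): res = -3*s0 + 28*s1 + 8*s2 - 1*s3
     *   even y (phase 12): res = -1*s0 +  8*s1 + 28*s2 - 3*s3 */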
    for(i4_y = 1; i4_y < 15; i4_y += 2)
    {
        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;
    } /* End of loop over y */

    /* y = 15, y_phase = 4 */
    i4_samp_8x16b_0 = i4_samp_8x16b_1;
    i4_samp_8x16b_1 = i4_samp_8x16b_2;
    i4_samp_8x16b_2 = i4_samp_8x16b_3;
    i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

    i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
    i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
    i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
    i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);

    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

    /* Store the output */
    _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);

    /* Reinitializing the ptrs */
    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;

    /* Remaining 4 columns (32 bits of input per row) */
    pu1_inp += 8;
    pi2_tmp += 8;

    /* y = 0, y_phase = 12 */
    i4_samp_16x8b_0 = _mm_loadl_epi64((__m128i *) (pu1_inp));
    i4_samp_16x8b_1 = _mm_loadl_epi64((__m128i *) (pu1_inp + i4_src_stride));
    i4_samp_16x8b_2 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1)));
    i4_samp_16x8b_3 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
    pu1_inp += (i4_src_stride << 2);

    i4_samp_8x16b_0 = _mm_cvtepu8_epi16(i4_samp_16x8b_0);
    i4_samp_8x16b_1 = _mm_cvtepu8_epi16(i4_samp_16x8b_1);
    i4_samp_8x16b_2 = _mm_cvtepu8_epi16(i4_samp_16x8b_2);
    i4_samp_8x16b_3 = _mm_cvtepu8_epi16(i4_samp_16x8b_3);

    /* y_phase is 12 for y = 0 */
    /* Multiply by 8 => left shift by 3 */
    i4_res_8x16b_r1_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
    i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
    i4_res_8x16b_r1_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
    i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_0);
    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

    _mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
    pi2_tmp += i4_filt_stride;

    for(i4_y = 1; i4_y < 15; i4_y += 2)
    {
        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storel_epi64((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;
    } /* End of loop over y */

    /* y = 15, y_phase = 4 */
    i4_samp_8x16b_0 = i4_samp_8x16b_1;
    i4_samp_8x16b_1 = i4_samp_8x16b_2;
    i4_samp_8x16b_2 = i4_samp_8x16b_3;
    i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

    i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
    i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
    i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
    i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);

    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

    /* Store the output */
    _mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);

    /* Reinitializing the ptrs */
    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;

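    /* Horizontal pass: the same phase-4/phase-12 filters are applied across
     * each temporary row. The 16-bit samples are interleaved into
     * (s[i], s[i + 1]) pairs so that _mm_madd_epi16 evaluates two taps per
     * 32-bit lane; the four coefficient vectors below hold the tap pairs for
     * both phases. Results are rounded with +512 and >> 10, since the
     * combined vertical and horizontal filter gain is 32 * 32 = 1024. */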
    {
        __m128i coeff_c0_c1_8x16b = _mm_set_epi16(28, -3, 28, -3, 28, -3, 28, -3);
        __m128i coeff_c2_c3_8x16b = _mm_set_epi16(-1, 8, -1, 8, -1, 8, -1, 8);
        __m128i coeff_c3_c2_8x16b = _mm_set_epi16(8, -1, 8, -1, 8, -1, 8, -1);
        __m128i coeff_c1_c0_8x16b = _mm_set_epi16(-3, 28, -3, 28, -3, 28, -3, 28);

        __m128i i4_samp_8x16b_rpart1_0, i4_samp_8x16b_rpart2_0;
        __m128i i4_samp_8x16b_rpart1_1, i4_samp_8x16b_rpart2_1;
        __m128i i4_samp_8x16b_rpart1_2, i4_samp_8x16b_rpart2_2;
        __m128i i4_samp_8x16b_rpart1_3, i4_samp_8x16b_rpart2_3;
        __m128i i4_samp_8x16b_rpart1_4, i4_samp_8x16b_rpart2_4;

        __m128i i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart2_0;
        __m128i i4_res_4x32b_rpart1_1, i4_res_4x32b_rpart2_1;
        __m128i i4_res_4x32b_rpart1_2, i4_res_4x32b_rpart2_2;
        __m128i i4_res_4x32b_rpart1_3, i4_res_4x32b_rpart2_3;

        __m128i res_512 = _mm_set1_epi32(512);

        /* Horizontal interpolation */
        for(i4_y = 0; i4_y < 16; i4_y++)
        {
            i4_samp_8x16b_rpart1_0 = _mm_loadu_si128((__m128i *) pi2_tmp);
            i4_samp_8x16b_rpart2_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 4));

            i4_samp_8x16b_rpart1_1 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 2);
            i4_samp_8x16b_rpart1_2 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 4);
            i4_samp_8x16b_rpart1_3 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 6);
            i4_samp_8x16b_rpart1_4 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 8);

            i4_samp_8x16b_rpart2_1 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 2);
            i4_samp_8x16b_rpart2_2 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 4);
            i4_samp_8x16b_rpart2_3 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 6);
            i4_samp_8x16b_rpart2_4 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 8);

            i4_samp_8x16b_rpart1_0 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_0, i4_samp_8x16b_rpart1_1);
            i4_samp_8x16b_rpart1_1 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_1, i4_samp_8x16b_rpart1_2);
            i4_samp_8x16b_rpart1_2 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_2, i4_samp_8x16b_rpart1_3);
            i4_samp_8x16b_rpart1_3 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_3, i4_samp_8x16b_rpart1_4);

            i4_samp_8x16b_rpart2_0 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_0, i4_samp_8x16b_rpart2_1);
            i4_samp_8x16b_rpart2_1 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_1, i4_samp_8x16b_rpart2_2);
            i4_samp_8x16b_rpart2_2 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_2, i4_samp_8x16b_rpart2_3);
            i4_samp_8x16b_rpart2_3 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_3, i4_samp_8x16b_rpart2_4);

            i4_res_4x32b_rpart1_0 = _mm_madd_epi16(i4_samp_8x16b_rpart1_0, coeff_c3_c2_8x16b);
            i4_res_4x32b_rpart1_2 = _mm_madd_epi16(i4_samp_8x16b_rpart1_2, coeff_c1_c0_8x16b);

            i4_res_4x32b_rpart1_1 = _mm_madd_epi16(i4_samp_8x16b_rpart1_1, coeff_c0_c1_8x16b);
            i4_res_4x32b_rpart1_3 = _mm_madd_epi16(i4_samp_8x16b_rpart1_3, coeff_c2_c3_8x16b);

            i4_res_4x32b_rpart2_0 = _mm_madd_epi16(i4_samp_8x16b_rpart2_0, coeff_c3_c2_8x16b);
            i4_res_4x32b_rpart2_2 = _mm_madd_epi16(i4_samp_8x16b_rpart2_2, coeff_c1_c0_8x16b);

            i4_res_4x32b_rpart2_1 = _mm_madd_epi16(i4_samp_8x16b_rpart2_1, coeff_c0_c1_8x16b);
            i4_res_4x32b_rpart2_3 = _mm_madd_epi16(i4_samp_8x16b_rpart2_3, coeff_c2_c3_8x16b);

            i4_res_4x32b_rpart1_0 = _mm_add_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_2);
            i4_res_4x32b_rpart1_1 = _mm_add_epi32(i4_res_4x32b_rpart1_1, i4_res_4x32b_rpart1_3);

            i4_res_4x32b_rpart2_0 = _mm_add_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_2);
            i4_res_4x32b_rpart2_1 = _mm_add_epi32(i4_res_4x32b_rpart2_1, i4_res_4x32b_rpart2_3);

            i4_res_4x32b_rpart1_2 =
                _mm_unpacklo_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1);
            i4_res_4x32b_rpart1_3 =
                _mm_unpackhi_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1);

            i4_res_4x32b_rpart2_2 =
                _mm_unpacklo_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1);
            i4_res_4x32b_rpart2_3 =
                _mm_unpackhi_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1);

            i4_res_4x32b_rpart1_0 = _mm_add_epi32(i4_res_4x32b_rpart1_2, res_512);
            i4_res_4x32b_rpart1_1 = _mm_add_epi32(i4_res_4x32b_rpart1_3, res_512);

            i4_res_4x32b_rpart1_0 = _mm_srai_epi32(i4_res_4x32b_rpart1_0, 10);
            i4_res_4x32b_rpart1_1 = _mm_srai_epi32(i4_res_4x32b_rpart1_1, 10);

            i4_res_4x32b_rpart2_0 = _mm_add_epi32(i4_res_4x32b_rpart2_2, res_512);
            i4_res_4x32b_rpart2_1 = _mm_add_epi32(i4_res_4x32b_rpart2_3, res_512);

            i4_res_4x32b_rpart2_0 = _mm_srai_epi32(i4_res_4x32b_rpart2_0, 10);
            i4_res_4x32b_rpart2_1 = _mm_srai_epi32(i4_res_4x32b_rpart2_1, 10);

            _mm_storeu_si128(
                (__m128i *) pu1_out,
                _mm_packus_epi16(_mm_packus_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1),
                                 _mm_packus_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1)));

            pi2_tmp += i4_filt_stride;
            pu1_out += i4_out_stride;
        } /* End of loop over y */
    }
}

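/* Dyadic (2x) chroma upsampling, vertical pass: six reference rows (stride
 * DYADIC_REF_W_C) are filtered into a 16-bit temporary buffer of 8 rows x 6
 * columns. Chroma uses a 2-tap bilinear filter whose taps are derived from
 * the resampling phases, {16 - i4_phase_0, i4_phase_0} and
 * {16 - i4_phase_1, i4_phase_1}, applied to alternating output rows.
 * Unsigned 8-bit samples from adjacent rows are byte-interleaved so that
 * _mm_maddubs_epi16 applies both taps in a single instruction. */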
void isvc_vert_interpol_chroma_dyadic_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                            WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    WORD8 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_filt_stride, i4_src_stride;
    UWORD8 *pu1_inp;
    WORD16 *pi2_tmp;
    __m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3, i4_samp_16x8b_4,
        i4_samp_16x8b_5;
    __m128i i4_res_8x16b_r0, i4_res_8x16b_r1, i4_res_8x16b_r2, i4_res_8x16b_r3, i4_res_8x16b_r4,
        i4_res_8x16b_r5, i4_res_8x16b_r6, i4_res_8x16b_r7;
    __m128i i4_res_8x16b_r7_temp;
    __m128i i4_c0_c1_16x8b, i4_c2_c3_16x8b;

    i4_coeff_0 = (WORD8) (16 - i4_phase_0);
    i4_coeff_1 = (WORD8) (i4_phase_0);
    i4_coeff_2 = (WORD8) (16 - i4_phase_1);
    i4_coeff_3 = (WORD8) (i4_phase_1);

    i4_c0_c1_16x8b =
        _mm_set_epi8(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
                     i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
                     i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0);
    i4_c2_c3_16x8b =
        _mm_set_epi8(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
                     i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
                     i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2);

    /* Initializing pointers */
    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    i4_filt_stride = 6;
    i4_src_stride = DYADIC_REF_W_C;

    i4_samp_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_inp));
    i4_samp_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_inp + i4_src_stride));
    i4_samp_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1)));
    i4_samp_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
    i4_samp_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 2)));
    i4_samp_16x8b_5 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 2) + i4_src_stride));

    i4_samp_16x8b_0 = _mm_unpacklo_epi8(i4_samp_16x8b_0, i4_samp_16x8b_1);
    i4_res_8x16b_r0 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp), i4_res_8x16b_r0);

    i4_samp_16x8b_1 = _mm_unpacklo_epi8(i4_samp_16x8b_1, i4_samp_16x8b_2);
    i4_res_8x16b_r1 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r1);

    i4_res_8x16b_r2 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1)), i4_res_8x16b_r2);

    i4_samp_16x8b_2 = _mm_unpacklo_epi8(i4_samp_16x8b_2, i4_samp_16x8b_3);
    i4_res_8x16b_r3 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1) + i4_filt_stride),
                     i4_res_8x16b_r3);

    i4_res_8x16b_r4 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2)), i4_res_8x16b_r4);

    i4_samp_16x8b_3 = _mm_unpacklo_epi8(i4_samp_16x8b_3, i4_samp_16x8b_4);
    i4_res_8x16b_r5 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + i4_filt_stride),
                     i4_res_8x16b_r5);

    i4_res_8x16b_r6 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c0_c1_16x8b);
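    /* The temporary buffer holds 8 rows x 6 values = 48 WORD16. Row 6 gets
     * only its first 4 results through a 64-bit store; the remaining values
     * of rows 6 and 7 are then assembled with shuffles and a blend so that
     * the final 16-byte store fills elements 40..47 exactly, without writing
     * past the end of the buffer. */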
    _mm_storel_epi64((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1)),
                     i4_res_8x16b_r6);

    i4_res_8x16b_r6 = _mm_shuffle_epi32(i4_res_8x16b_r6, 78);

    i4_samp_16x8b_4 = _mm_unpacklo_epi8(i4_samp_16x8b_4, i4_samp_16x8b_5);

    i4_res_8x16b_r7 = _mm_maddubs_epi16(i4_samp_16x8b_4, i4_c2_c3_16x8b);

    i4_res_8x16b_r7 = _mm_shuffle_epi32(i4_res_8x16b_r7, 147);

    i4_res_8x16b_r7_temp = _mm_blend_epi16(i4_res_8x16b_r6, i4_res_8x16b_r7, 252);

    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1) + 4),
                     i4_res_8x16b_r7_temp);
}

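/* Dyadic (2x) chroma upsampling, horizontal pass: each of the 8 temporary
 * rows is filtered with the same phase-derived 2-tap pairs, rounded with
 * +128 and >> 8 (the combined 2-D bilinear gain is 16 * 16 = 256), and the
 * results are packed back to 8 bits. The output holds interleaved chroma
 * (Cb/Cr in alternating bytes), so the existing rows are loaded, the other
 * component's bytes are preserved with a mask, and the new values are
 * merged into the remaining byte lanes. */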
void isvc_horz_interpol_chroma_dyadic_sse42(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
                                            WORD32 i4_out_stride, WORD32 i4_phase_0,
                                            WORD32 i4_phase_1)
{
    WORD32 i4_dst_stride, i4_dst_stride2, i4_dst_stride4;
    UWORD8 *pu1_out;
    WORD16 *pi2_tmp;

    __m128i i4_samp_8x16b_r1_0, i4_samp_8x16b_r1_1, i4_samp_8x16b_r1_2;
    __m128i i4_samp_8x16b_r2_0, i4_samp_8x16b_r2_1, i4_samp_8x16b_r2_2;
    __m128i i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1, i4_samp_8x16b_r3_2;
    __m128i i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1, i4_samp_8x16b_r4_2;
    __m128i i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1, i4_samp_8x16b_r5_2;
    __m128i i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1, i4_samp_8x16b_r6_2;
    __m128i i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1, i4_samp_8x16b_r7_2;
    __m128i i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1, i4_samp_8x16b_r8_2;

    __m128i i4_res_4x32b_r1_0, i4_res_4x32b_r1_1;
    __m128i i4_res_4x32b_r2_0, i4_res_4x32b_r2_1;
    __m128i i4_res_4x32b_r3_0, i4_res_4x32b_r3_1;
    __m128i i4_res_4x32b_r4_0, i4_res_4x32b_r4_1;
    __m128i i4_res_4x32b_r5_0, i4_res_4x32b_r5_1;
    __m128i i4_res_4x32b_r6_0, i4_res_4x32b_r6_1;
    __m128i i4_res_4x32b_r7_0, i4_res_4x32b_r7_1;
    __m128i i4_res_4x32b_r8_0, i4_res_4x32b_r8_1;

    __m128i i4_res_final_8x16b_r1, i4_res_final_8x16b_r2, i4_res_final_8x16b_r3,
        i4_res_final_8x16b_r4, i4_res_final_8x16b_r5, i4_res_final_8x16b_r6,
        i4_res_final_8x16b_r7, i4_res_final_8x16b_r8;

    __m128i out_16x8b_r1, out_16x8b_r2, out_16x8b_r3, out_16x8b_r4, out_16x8b_r5, out_16x8b_r6,
        out_16x8b_r7, out_16x8b_r8;

    __m128i i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1;
    __m128i i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1;
    __m128i i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1;
    __m128i i4_res_final_8x16b_r78_0, i4_res_final_8x16b_r78_1;
    __m128i chroma_mask, chroma_mask2;

    WORD32 i4_coeff_0 = 16 - i4_phase_0;
    WORD32 i4_coeff_1 = i4_phase_0;
    WORD32 i4_coeff_2 = 16 - i4_phase_1;
    WORD32 i4_coeff_3 = i4_phase_1;
    __m128i coeff_c0_c1_8x16b = _mm_set_epi16(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
                                              i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0);
    __m128i coeff_c2_c3_8x16b = _mm_set_epi16(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
                                              i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2);
    __m128i res_128 = _mm_set1_epi32(128);
    UWORD32 u4_norm_factor = 8;

    /* Initializing pointers */
    pu1_out = pu1_out_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    i4_dst_stride = i4_out_stride;

    i4_dst_stride2 = i4_dst_stride << 1;
    i4_dst_stride4 = i4_dst_stride << 2;

    /* Horizontal interpolation */
    i4_samp_8x16b_r1_0 = _mm_loadu_si128((__m128i *) pi2_tmp);
    i4_samp_8x16b_r2_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 6));
    i4_samp_8x16b_r3_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 12));
    i4_samp_8x16b_r4_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 18));
    i4_samp_8x16b_r5_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 24));
    i4_samp_8x16b_r6_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 30));
    i4_samp_8x16b_r7_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 36));
    i4_samp_8x16b_r8_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 42));

    i4_samp_8x16b_r1_1 = _mm_srli_si128(i4_samp_8x16b_r1_0, 2);
    i4_samp_8x16b_r1_2 = _mm_srli_si128(i4_samp_8x16b_r1_0, 4);

    i4_samp_8x16b_r2_1 = _mm_srli_si128(i4_samp_8x16b_r2_0, 2);
    i4_samp_8x16b_r2_2 = _mm_srli_si128(i4_samp_8x16b_r2_0, 4);

    i4_samp_8x16b_r3_1 = _mm_srli_si128(i4_samp_8x16b_r3_0, 2);
    i4_samp_8x16b_r3_2 = _mm_srli_si128(i4_samp_8x16b_r3_0, 4);

    i4_samp_8x16b_r4_1 = _mm_srli_si128(i4_samp_8x16b_r4_0, 2);
    i4_samp_8x16b_r4_2 = _mm_srli_si128(i4_samp_8x16b_r4_0, 4);

    i4_samp_8x16b_r5_1 = _mm_srli_si128(i4_samp_8x16b_r5_0, 2);
    i4_samp_8x16b_r5_2 = _mm_srli_si128(i4_samp_8x16b_r5_0, 4);

    i4_samp_8x16b_r6_1 = _mm_srli_si128(i4_samp_8x16b_r6_0, 2);
    i4_samp_8x16b_r6_2 = _mm_srli_si128(i4_samp_8x16b_r6_0, 4);

    i4_samp_8x16b_r7_1 = _mm_srli_si128(i4_samp_8x16b_r7_0, 2);
    i4_samp_8x16b_r7_2 = _mm_srli_si128(i4_samp_8x16b_r7_0, 4);

    i4_samp_8x16b_r8_1 = _mm_srli_si128(i4_samp_8x16b_r8_0, 2);
    i4_samp_8x16b_r8_2 = _mm_srli_si128(i4_samp_8x16b_r8_0, 4);

    i4_samp_8x16b_r1_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r1_0, i4_samp_8x16b_r1_1);
    i4_samp_8x16b_r2_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r2_0, i4_samp_8x16b_r2_1);
    i4_samp_8x16b_r3_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1);
    i4_samp_8x16b_r4_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1);
    i4_samp_8x16b_r5_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1);
    i4_samp_8x16b_r6_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1);
    i4_samp_8x16b_r7_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1);
    i4_samp_8x16b_r8_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1);

    i4_samp_8x16b_r1_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r1_1, i4_samp_8x16b_r1_2);
    i4_samp_8x16b_r2_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r2_1, i4_samp_8x16b_r2_2);
    i4_samp_8x16b_r3_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r3_1, i4_samp_8x16b_r3_2);
    i4_samp_8x16b_r4_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r4_1, i4_samp_8x16b_r4_2);
    i4_samp_8x16b_r5_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r5_1, i4_samp_8x16b_r5_2);
    i4_samp_8x16b_r6_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r6_1, i4_samp_8x16b_r6_2);
    i4_samp_8x16b_r7_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r7_1, i4_samp_8x16b_r7_2);
    i4_samp_8x16b_r8_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r8_1, i4_samp_8x16b_r8_2);

    /* a0*c0 + a1*c1, a1*c0 + a2*c1, a2*c0 + a3*c1, a3*c0 + a4*c1 */
    i4_res_4x32b_r1_0 = _mm_madd_epi16(i4_samp_8x16b_r1_0, coeff_c0_c1_8x16b);
    /* b0*c0 + b1*c1, b1*c0 + b2*c1, b2*c0 + b3*c1, b3*c0 + b4*c1 */
    i4_res_4x32b_r2_0 = _mm_madd_epi16(i4_samp_8x16b_r2_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r3_0 = _mm_madd_epi16(i4_samp_8x16b_r3_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r4_0 = _mm_madd_epi16(i4_samp_8x16b_r4_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r5_0 = _mm_madd_epi16(i4_samp_8x16b_r5_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r6_0 = _mm_madd_epi16(i4_samp_8x16b_r6_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r7_0 = _mm_madd_epi16(i4_samp_8x16b_r7_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r8_0 = _mm_madd_epi16(i4_samp_8x16b_r8_0, coeff_c0_c1_8x16b);

    /* a1*c2 + a2*c3, a2*c2 + a3*c3, a3*c2 + a4*c3, a4*c2 + a5*c3 */
    i4_res_4x32b_r1_1 = _mm_madd_epi16(i4_samp_8x16b_r1_1, coeff_c2_c3_8x16b);
    /* b1*c2 + b2*c3, b2*c2 + b3*c3, b3*c2 + b4*c3, b4*c2 + b5*c3 */
    i4_res_4x32b_r2_1 = _mm_madd_epi16(i4_samp_8x16b_r2_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r3_1 = _mm_madd_epi16(i4_samp_8x16b_r3_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r4_1 = _mm_madd_epi16(i4_samp_8x16b_r4_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r5_1 = _mm_madd_epi16(i4_samp_8x16b_r5_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r6_1 = _mm_madd_epi16(i4_samp_8x16b_r6_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r7_1 = _mm_madd_epi16(i4_samp_8x16b_r7_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r8_1 = _mm_madd_epi16(i4_samp_8x16b_r8_1, coeff_c2_c3_8x16b);

    i4_res_4x32b_r1_0 = _mm_add_epi32(i4_res_4x32b_r1_0, res_128);
    i4_res_4x32b_r2_0 = _mm_add_epi32(i4_res_4x32b_r2_0, res_128);
    i4_res_4x32b_r3_0 = _mm_add_epi32(i4_res_4x32b_r3_0, res_128);
    i4_res_4x32b_r4_0 = _mm_add_epi32(i4_res_4x32b_r4_0, res_128);
    i4_res_4x32b_r5_0 = _mm_add_epi32(i4_res_4x32b_r5_0, res_128);
    i4_res_4x32b_r6_0 = _mm_add_epi32(i4_res_4x32b_r6_0, res_128);
    i4_res_4x32b_r7_0 = _mm_add_epi32(i4_res_4x32b_r7_0, res_128);
    i4_res_4x32b_r8_0 = _mm_add_epi32(i4_res_4x32b_r8_0, res_128);

    i4_res_4x32b_r1_1 = _mm_add_epi32(i4_res_4x32b_r1_1, res_128);
    i4_res_4x32b_r2_1 = _mm_add_epi32(i4_res_4x32b_r2_1, res_128);
    i4_res_4x32b_r3_1 = _mm_add_epi32(i4_res_4x32b_r3_1, res_128);
    i4_res_4x32b_r4_1 = _mm_add_epi32(i4_res_4x32b_r4_1, res_128);
    i4_res_4x32b_r5_1 = _mm_add_epi32(i4_res_4x32b_r5_1, res_128);
    i4_res_4x32b_r6_1 = _mm_add_epi32(i4_res_4x32b_r6_1, res_128);
    i4_res_4x32b_r7_1 = _mm_add_epi32(i4_res_4x32b_r7_1, res_128);
    i4_res_4x32b_r8_1 = _mm_add_epi32(i4_res_4x32b_r8_1, res_128);

    i4_res_4x32b_r1_0 = _mm_srai_epi32(i4_res_4x32b_r1_0, u4_norm_factor);
    i4_res_4x32b_r2_0 = _mm_srai_epi32(i4_res_4x32b_r2_0, u4_norm_factor);
    i4_res_4x32b_r3_0 = _mm_srai_epi32(i4_res_4x32b_r3_0, u4_norm_factor);
    i4_res_4x32b_r4_0 = _mm_srai_epi32(i4_res_4x32b_r4_0, u4_norm_factor);
    i4_res_4x32b_r5_0 = _mm_srai_epi32(i4_res_4x32b_r5_0, u4_norm_factor);
    i4_res_4x32b_r6_0 = _mm_srai_epi32(i4_res_4x32b_r6_0, u4_norm_factor);
    i4_res_4x32b_r7_0 = _mm_srai_epi32(i4_res_4x32b_r7_0, u4_norm_factor);
    i4_res_4x32b_r8_0 = _mm_srai_epi32(i4_res_4x32b_r8_0, u4_norm_factor);

    i4_res_4x32b_r1_1 = _mm_srai_epi32(i4_res_4x32b_r1_1, u4_norm_factor);
    i4_res_4x32b_r2_1 = _mm_srai_epi32(i4_res_4x32b_r2_1, u4_norm_factor);
    i4_res_4x32b_r3_1 = _mm_srai_epi32(i4_res_4x32b_r3_1, u4_norm_factor);
    i4_res_4x32b_r4_1 = _mm_srai_epi32(i4_res_4x32b_r4_1, u4_norm_factor);
    i4_res_4x32b_r5_1 = _mm_srai_epi32(i4_res_4x32b_r5_1, u4_norm_factor);
    i4_res_4x32b_r6_1 = _mm_srai_epi32(i4_res_4x32b_r6_1, u4_norm_factor);
    i4_res_4x32b_r7_1 = _mm_srai_epi32(i4_res_4x32b_r7_1, u4_norm_factor);
    i4_res_4x32b_r8_1 = _mm_srai_epi32(i4_res_4x32b_r8_1, u4_norm_factor);

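    /* The _0 vectors hold the results filtered with the (c0, c1) tap pair
     * and the _1 vectors those filtered with (c2, c3). Packing row pairs to
     * 16 bits and interleaving with unpacklo/unpackhi arranges the two
     * phases into their natural left-to-right output order for each row. */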
    i4_res_final_8x16b_r12_0 = _mm_packs_epi32(i4_res_4x32b_r1_0, i4_res_4x32b_r2_0);
    i4_res_final_8x16b_r34_0 = _mm_packs_epi32(i4_res_4x32b_r3_0, i4_res_4x32b_r4_0);
    i4_res_final_8x16b_r56_0 = _mm_packs_epi32(i4_res_4x32b_r5_0, i4_res_4x32b_r6_0);
    i4_res_final_8x16b_r78_0 = _mm_packs_epi32(i4_res_4x32b_r7_0, i4_res_4x32b_r8_0);

    i4_res_final_8x16b_r12_1 = _mm_packs_epi32(i4_res_4x32b_r1_1, i4_res_4x32b_r2_1);
    i4_res_final_8x16b_r34_1 = _mm_packs_epi32(i4_res_4x32b_r3_1, i4_res_4x32b_r4_1);
    i4_res_final_8x16b_r56_1 = _mm_packs_epi32(i4_res_4x32b_r5_1, i4_res_4x32b_r6_1);
    i4_res_final_8x16b_r78_1 = _mm_packs_epi32(i4_res_4x32b_r7_1, i4_res_4x32b_r8_1);

    i4_res_final_8x16b_r1 = _mm_unpacklo_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
    i4_res_final_8x16b_r2 = _mm_unpackhi_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
    i4_res_final_8x16b_r3 = _mm_unpacklo_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
    i4_res_final_8x16b_r4 = _mm_unpackhi_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
    i4_res_final_8x16b_r5 = _mm_unpacklo_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
    i4_res_final_8x16b_r6 = _mm_unpackhi_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
    i4_res_final_8x16b_r7 = _mm_unpacklo_epi16(i4_res_final_8x16b_r78_0, i4_res_final_8x16b_r78_1);
    i4_res_final_8x16b_r8 = _mm_unpackhi_epi16(i4_res_final_8x16b_r78_0, i4_res_final_8x16b_r78_1);

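    /* Merge into the interleaved chroma output: keep the other component's
     * bytes (mask 0xFF00), keep only the low byte of each 16-bit result
     * (mask 0x00FF), and combine the two with an 8-bit add before storing. */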
    chroma_mask = _mm_set1_epi16(0xFF00);
    chroma_mask2 = _mm_set1_epi16(0x00FF);

    out_16x8b_r1 = _mm_loadu_si128((__m128i *) (&pu1_out[0]));
    out_16x8b_r2 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride]));
    out_16x8b_r3 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2]));
    out_16x8b_r4 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2 + i4_dst_stride]));
    out_16x8b_r5 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4]));
    out_16x8b_r6 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride]));
    out_16x8b_r7 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2]));
    out_16x8b_r8 =
        _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2 + i4_dst_stride]));

    out_16x8b_r1 = _mm_and_si128(out_16x8b_r1, chroma_mask);
    out_16x8b_r2 = _mm_and_si128(out_16x8b_r2, chroma_mask);
    out_16x8b_r3 = _mm_and_si128(out_16x8b_r3, chroma_mask);
    out_16x8b_r4 = _mm_and_si128(out_16x8b_r4, chroma_mask);
    out_16x8b_r5 = _mm_and_si128(out_16x8b_r5, chroma_mask);
    out_16x8b_r6 = _mm_and_si128(out_16x8b_r6, chroma_mask);
    out_16x8b_r7 = _mm_and_si128(out_16x8b_r7, chroma_mask);
    out_16x8b_r8 = _mm_and_si128(out_16x8b_r8, chroma_mask);

    i4_res_final_8x16b_r1 = _mm_and_si128(i4_res_final_8x16b_r1, chroma_mask2);
    i4_res_final_8x16b_r2 = _mm_and_si128(i4_res_final_8x16b_r2, chroma_mask2);
    i4_res_final_8x16b_r3 = _mm_and_si128(i4_res_final_8x16b_r3, chroma_mask2);
    i4_res_final_8x16b_r4 = _mm_and_si128(i4_res_final_8x16b_r4, chroma_mask2);
    i4_res_final_8x16b_r5 = _mm_and_si128(i4_res_final_8x16b_r5, chroma_mask2);
    i4_res_final_8x16b_r6 = _mm_and_si128(i4_res_final_8x16b_r6, chroma_mask2);
    i4_res_final_8x16b_r7 = _mm_and_si128(i4_res_final_8x16b_r7, chroma_mask2);
    i4_res_final_8x16b_r8 = _mm_and_si128(i4_res_final_8x16b_r8, chroma_mask2);

    out_16x8b_r1 = _mm_add_epi8(i4_res_final_8x16b_r1, out_16x8b_r1);
    out_16x8b_r2 = _mm_add_epi8(i4_res_final_8x16b_r2, out_16x8b_r2);
    out_16x8b_r3 = _mm_add_epi8(i4_res_final_8x16b_r3, out_16x8b_r3);
    out_16x8b_r4 = _mm_add_epi8(i4_res_final_8x16b_r4, out_16x8b_r4);
    out_16x8b_r5 = _mm_add_epi8(i4_res_final_8x16b_r5, out_16x8b_r5);
    out_16x8b_r6 = _mm_add_epi8(i4_res_final_8x16b_r6, out_16x8b_r6);
    out_16x8b_r7 = _mm_add_epi8(i4_res_final_8x16b_r7, out_16x8b_r7);
    out_16x8b_r8 = _mm_add_epi8(i4_res_final_8x16b_r8, out_16x8b_r8);

    _mm_storeu_si128((__m128i *) pu1_out, out_16x8b_r1);
    _mm_storeu_si128((__m128i *) (pu1_out + i4_dst_stride), out_16x8b_r2);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 2)), out_16x8b_r3);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 3)), out_16x8b_r4);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 4)), out_16x8b_r5);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 5)), out_16x8b_r6);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 6)), out_16x8b_r7);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 7)), out_16x8b_r8);
}