unplugged-system/external/libavc/decoder/x86/svc/isvcd_intra_resamp_sse42.c

1926 lines
105 KiB
C
Raw Normal View History

/******************************************************************************
*
* Copyright (C) 2022 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
* isvcd_intra_resamp_sse42.c
*
* @brief
* Contains function definitions for intra resampling functions
*
* @author
* Kishore
*
* @par List of Functions:
* - isvcd_interpolate_base_luma_dyadic_sse42
* - isvcd_vert_interpol_chroma_dyadic_1_sse42
* - isvcd_vert_interpol_chroma_dyadic_2_sse42
* - isvcd_vert_interpol_chroma_dyadic_3_sse42
* - isvcd_horz_interpol_chroma_dyadic_1_sse42
* - isvcd_horz_interpol_chroma_dyadic_2_sse42
*
* @remarks
* None
*
*******************************************************************************
*/
#include <immintrin.h>
#include <smmintrin.h>
#include <emmintrin.h>
/* User include files */
#include "ih264_typedefs.h"
#include "isvcd_structs.h"
/*****************************************************************************/
/* */
/* Function Name : isvcd_interpolate_base_luma_dyadic_sse42 */
/* */
/* Description : This function takes the reference array buffer & performs*/
/* intra resampling for dyadic scaling ratios */
/* Inputs : pu1_inp_buf : ptr to the 12x12 reference sample buffer */
/* pi2_tmp_filt_buf : ptr to the 12x16 buffer to hold the */
/* vertically interpolated data */
/* pu1_out_buf : output buffer pointer */
/* i4_out_stride : output buffer stride */
/* Globals : none */
/* Processing : it does the interpolation in vertical direction followed */
/* by horizontal direction */
/* Outputs : resampled pixels */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 05 21 2021 Dolan creation */
/* */
/*****************************************************************************/
void isvcd_interpolate_base_luma_dyadic_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
UWORD8 *pu1_out_buf, WORD32 i4_out_stride)
{
WORD32 i4_x, i4_y;
WORD32 i4_filt_stride, i4_src_stride;
UWORD8 *pu1_inp, *pu1_out;
WORD16 *pi2_tmp;
__m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3;
__m128i i4_samp_8x16b_0, i4_samp_8x16b_1, i4_samp_8x16b_2, i4_samp_8x16b_3;
__m128i i4_res_8x16b_r1_1, i4_res_8x16b_r1_2, i4_res_8x16b_r1_3;
__m128i i4_res_8x16b_r2_1, i4_res_8x16b_r2_2, i4_res_8x16b_r2_3;
/* Filter coefficient values for phase 4 */
__m128i i4_coeff_8x16b_0 = _mm_set1_epi16(-3);
__m128i i4_coeff_8x16b_1 = _mm_set1_epi16(28);
i4_filt_stride = 12;
i4_src_stride = DYADIC_REF_W_Y;
pu1_inp = pu1_inp_buf;
pi2_tmp = pi2_tmp_filt_buf;
pu1_out = pu1_out_buf;
/* Vertical interpolation */
/*First 64 bit */
for(i4_x = 0; i4_x < 1; i4_x++)
{
/* y = 0, y_phase = 12 */
i4_samp_16x8b_0 = _mm_loadl_epi64((__m128i *) (pu1_inp));
i4_samp_16x8b_1 = _mm_loadl_epi64((__m128i *) (pu1_inp + i4_src_stride));
i4_samp_16x8b_2 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1)));
i4_samp_16x8b_3 =
_mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
pu1_inp += (i4_src_stride << 2);
i4_samp_8x16b_0 = _mm_cvtepu8_epi16(i4_samp_16x8b_0);
i4_samp_8x16b_1 = _mm_cvtepu8_epi16(i4_samp_16x8b_1);
i4_samp_8x16b_2 = _mm_cvtepu8_epi16(i4_samp_16x8b_2);
i4_samp_8x16b_3 = _mm_cvtepu8_epi16(i4_samp_16x8b_3);
/* since y_phase 12 for y = 0 */
/*Multiply by 8 => left shift by 3*/
i4_res_8x16b_r1_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
i4_res_8x16b_r1_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_0);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
_mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
pi2_tmp += i4_filt_stride;
i4_samp_8x16b_0 = i4_samp_8x16b_1;
i4_samp_8x16b_1 = i4_samp_8x16b_2;
i4_samp_8x16b_2 = i4_samp_8x16b_3;
i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));
/* y_phase is 4 for odd values of y */
/* and 12 for even values of y */
/*Multiply by 8 => left shift by 3*/
i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);
i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);
/* Storing the results */
_mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
_mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
pi2_tmp += (i4_filt_stride << 1);
pu1_inp += i4_src_stride;
i4_samp_8x16b_0 = i4_samp_8x16b_1;
i4_samp_8x16b_1 = i4_samp_8x16b_2;
i4_samp_8x16b_2 = i4_samp_8x16b_3;
i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));
/* y_phase is 4 for odd values of y */
/* and 12 for even values of y */
/*Multiply by 8 => left shift by 3*/
i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);
i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);
/* Storing the results */
_mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
_mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
pi2_tmp += (i4_filt_stride << 1);
pu1_inp += i4_src_stride;
i4_samp_8x16b_0 = i4_samp_8x16b_1;
i4_samp_8x16b_1 = i4_samp_8x16b_2;
i4_samp_8x16b_2 = i4_samp_8x16b_3;
i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));
/* y_phase is 4 for odd values of y */
/* and 12 for even values of y */
/*Multiply by 8 => left shift by 3*/
i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);
i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);
/* Storing the results */
_mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
_mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
pi2_tmp += (i4_filt_stride << 1);
pu1_inp += i4_src_stride;
i4_samp_8x16b_0 = i4_samp_8x16b_1;
i4_samp_8x16b_1 = i4_samp_8x16b_2;
i4_samp_8x16b_2 = i4_samp_8x16b_3;
i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));
/* y_phase is 4 for odd values of y */
/* and 12 for even values of y */
/*Multiply by 8 => left shift by 3*/
i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);
i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);
/* Storing the results */
_mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
_mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
pi2_tmp += (i4_filt_stride << 1);
pu1_inp += i4_src_stride;
i4_samp_8x16b_0 = i4_samp_8x16b_1;
i4_samp_8x16b_1 = i4_samp_8x16b_2;
i4_samp_8x16b_2 = i4_samp_8x16b_3;
i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));
/* y_phase is 4 for odd values of y */
/* and 12 for even values of y */
/*Multiply by 8 => left shift by 3*/
i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);
i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);
/* Storing the results */
_mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
_mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
pi2_tmp += (i4_filt_stride << 1);
pu1_inp += i4_src_stride;
i4_samp_8x16b_0 = i4_samp_8x16b_1;
i4_samp_8x16b_1 = i4_samp_8x16b_2;
i4_samp_8x16b_2 = i4_samp_8x16b_3;
i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));
/* y_phase is 4 for odd values of y */
/* and 12 for even values of y */
/*Multiply by 8 => left shift by 3*/
i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);
i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);
/* Storing the results */
_mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
_mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
pi2_tmp += (i4_filt_stride << 1);
pu1_inp += i4_src_stride;
i4_samp_8x16b_0 = i4_samp_8x16b_1;
i4_samp_8x16b_1 = i4_samp_8x16b_2;
i4_samp_8x16b_2 = i4_samp_8x16b_3;
i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));
/* y_phase is 4 for odd values of y */
/* and 12 for even values of y */
/*Multiply by 8 => left shift by 3*/
i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);
i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);
/* Storing the results */
_mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
_mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
pi2_tmp += (i4_filt_stride << 1);
pu1_inp += i4_src_stride;
/* y = 15, y_phase = 4 */
i4_samp_8x16b_0 = i4_samp_8x16b_1;
i4_samp_8x16b_1 = i4_samp_8x16b_2;
i4_samp_8x16b_2 = i4_samp_8x16b_3;
i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));
i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
/* Store the output */
_mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
/* Reinitializing the ptrs */
pu1_inp = pu1_inp_buf;
pi2_tmp = pi2_tmp_filt_buf;
} /* End of loop over x */
/*Remaining 32 bit */
pu1_inp += 8;
pi2_tmp += 8;
for(i4_x = 0; i4_x < 1; i4_x++)
{
/* y = 0, y_phase = 12 */
i4_samp_16x8b_0 = _mm_loadl_epi64((__m128i *) (pu1_inp));
i4_samp_16x8b_1 = _mm_loadl_epi64((__m128i *) (pu1_inp + i4_src_stride));
i4_samp_16x8b_2 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1)));
i4_samp_16x8b_3 =
_mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
pu1_inp += (i4_src_stride << 2);
i4_samp_8x16b_0 = _mm_cvtepu8_epi16(i4_samp_16x8b_0);
i4_samp_8x16b_1 = _mm_cvtepu8_epi16(i4_samp_16x8b_1);
i4_samp_8x16b_2 = _mm_cvtepu8_epi16(i4_samp_16x8b_2);
i4_samp_8x16b_3 = _mm_cvtepu8_epi16(i4_samp_16x8b_3);
/* since y_phase 12 for y = 0 */
/*Multiply by 8 => left shift by 3*/
i4_res_8x16b_r1_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
i4_res_8x16b_r1_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_0);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
_mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
pi2_tmp += i4_filt_stride;
for(i4_y = 1; i4_y < 15; i4_y += 2)
{
i4_samp_8x16b_0 = i4_samp_8x16b_1;
i4_samp_8x16b_1 = i4_samp_8x16b_2;
i4_samp_8x16b_2 = i4_samp_8x16b_3;
i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));
/* y_phase is 4 for odd values of y */
/* and 12 for even values of y */
/*Multiply by 8 => left shift by 3*/
i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);
i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);
/* Storing the results */
_mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
_mm_storel_epi64((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
pi2_tmp += (i4_filt_stride << 1);
pu1_inp += i4_src_stride;
} /* End of loop over y */
/* y = 15, y_phase = 4 */
i4_samp_8x16b_0 = i4_samp_8x16b_1;
i4_samp_8x16b_1 = i4_samp_8x16b_2;
i4_samp_8x16b_2 = i4_samp_8x16b_3;
i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));
i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
/* Store the output */
_mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
/* Reinitializing the ptrs */
pu1_inp = pu1_inp_buf;
pi2_tmp = pi2_tmp_filt_buf;
}
{
__m128i coeff_c0_c1_8x16b = _mm_set_epi16(28, -3, 28, -3, 28, -3, 28, -3);
__m128i coeff_c2_c3_8x16b = _mm_set_epi16(-1, 8, -1, 8, -1, 8, -1, 8);
__m128i coeff_c3_c2_8x16b = _mm_set_epi16(8, -1, 8, -1, 8, -1, 8, -1);
__m128i coeff_c1_c0_8x16b = _mm_set_epi16(-3, 28, -3, 28, -3, 28, -3, 28);
__m128i i4_samp_8x16b_rpart1_0, i4_samp_8x16b_rpart2_0;
__m128i i4_samp_8x16b_rpart1_1, i4_samp_8x16b_rpart2_1;
__m128i i4_samp_8x16b_rpart1_2, i4_samp_8x16b_rpart2_2;
__m128i i4_samp_8x16b_rpart1_3, i4_samp_8x16b_rpart2_3;
__m128i i4_samp_8x16b_rpart1_4, i4_samp_8x16b_rpart2_4;
__m128i i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart2_0;
__m128i i4_res_4x32b_rpart1_1, i4_res_4x32b_rpart2_1;
__m128i i4_res_4x32b_rpart1_2, i4_res_4x32b_rpart2_2;
__m128i i4_res_4x32b_rpart1_3, i4_res_4x32b_rpart2_3;
__m128i res_512 = _mm_set1_epi32(512);
/* Horizontal interpolation */
for(i4_y = 0; i4_y < 16; i4_y++)
{
// a0 a1 a2 a3 a4 a5 a6 a7
i4_samp_8x16b_rpart1_0 = _mm_loadu_si128((__m128i *) pi2_tmp);
// a4 a5 a6 a7 a8 a9 a10 a11
i4_samp_8x16b_rpart2_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 4));
// a1 a2 a3 a4 a5 a6 a7 0
i4_samp_8x16b_rpart1_1 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 2);
// a2 a3 a4 a5 a6 a7 0 0
i4_samp_8x16b_rpart1_2 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 4);
// a3 a4 a5 a6 a7 0 0 0
i4_samp_8x16b_rpart1_3 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 6);
// a4 a5 a6 a7 0 0 0 0
i4_samp_8x16b_rpart1_4 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 8);
// a5 a6 a7 a8 a9 a10 a11 0
i4_samp_8x16b_rpart2_1 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 2);
// a6 a7 a8 a9 a10 a11 0 0
i4_samp_8x16b_rpart2_2 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 4);
// a7 a8 a9 a10 a11 0 0 0
i4_samp_8x16b_rpart2_3 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 6);
// a8 a9 a10 a11 0 0 0 0
i4_samp_8x16b_rpart2_4 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 8);
// a0 a1 a1 a2 a2 a3 a3 a4
i4_samp_8x16b_rpart1_0 =
_mm_unpacklo_epi16(i4_samp_8x16b_rpart1_0, i4_samp_8x16b_rpart1_1);
// a1 a2 a2 a3 a3 a4 a4 a5
i4_samp_8x16b_rpart1_1 =
_mm_unpacklo_epi16(i4_samp_8x16b_rpart1_1, i4_samp_8x16b_rpart1_2);
// a2 a3 a3 a4 a4 a5 a5 a6
i4_samp_8x16b_rpart1_2 =
_mm_unpacklo_epi16(i4_samp_8x16b_rpart1_2, i4_samp_8x16b_rpart1_3);
// a3 a4 a4 a5 a5 a6 a6 a7
i4_samp_8x16b_rpart1_3 =
_mm_unpacklo_epi16(i4_samp_8x16b_rpart1_3, i4_samp_8x16b_rpart1_4);
// a4 a5 a5 a6 a6 a7 a7 a8
i4_samp_8x16b_rpart2_0 =
_mm_unpacklo_epi16(i4_samp_8x16b_rpart2_0, i4_samp_8x16b_rpart2_1);
// a5 a6 a6 a7 a7 a8 a8 a9
i4_samp_8x16b_rpart2_1 =
_mm_unpacklo_epi16(i4_samp_8x16b_rpart2_1, i4_samp_8x16b_rpart2_2);
// a6 a7 a7 a8 a8 a9 a9 a10
i4_samp_8x16b_rpart2_2 =
_mm_unpacklo_epi16(i4_samp_8x16b_rpart2_2, i4_samp_8x16b_rpart2_3);
// a7 a8 a8 a9 a9 a10 a10 a11
i4_samp_8x16b_rpart2_3 =
_mm_unpacklo_epi16(i4_samp_8x16b_rpart2_3, i4_samp_8x16b_rpart2_4);
// a0c3+a1c2 a1c3+a2c2 a2c3+a3c2 a3c3+a4c2
i4_res_4x32b_rpart1_0 = _mm_madd_epi16(i4_samp_8x16b_rpart1_0, coeff_c3_c2_8x16b);
// a2c1+a3c0 a3c1+a4c0 a4c1+a5c0 a5c1+a6c0
i4_res_4x32b_rpart1_2 = _mm_madd_epi16(i4_samp_8x16b_rpart1_2, coeff_c1_c0_8x16b);
// a1c0+a2c1 a2c0+a3c1 a3c0+a4c1 a4c0+a5c1
i4_res_4x32b_rpart1_1 = _mm_madd_epi16(i4_samp_8x16b_rpart1_1, coeff_c0_c1_8x16b);
// a3c2+a4c3 a5c2+a5c3 a5c2+a6c3 a6c2+a7c3
i4_res_4x32b_rpart1_3 = _mm_madd_epi16(i4_samp_8x16b_rpart1_3, coeff_c2_c3_8x16b);
// a4c3+a5c2 a5a3+a6c2 a6c3+a7c2 a7c3+a8c2
i4_res_4x32b_rpart2_0 = _mm_madd_epi16(i4_samp_8x16b_rpart2_0, coeff_c3_c2_8x16b);
// a6c1+a7c0 a7c1+a8c0 a8c1+a9c0 a9c1+a10c0
i4_res_4x32b_rpart2_2 = _mm_madd_epi16(i4_samp_8x16b_rpart2_2, coeff_c1_c0_8x16b);
// a5c0+a6c1 a6c0+a7c1 a7c0+a8c1 a8c0+a9c1
i4_res_4x32b_rpart2_1 = _mm_madd_epi16(i4_samp_8x16b_rpart2_1, coeff_c0_c1_8x16b);
// a7c2+a8c3 a8c2+a9c3 a9c2+a10c3 a10c2+a11c3
i4_res_4x32b_rpart2_3 = _mm_madd_epi16(i4_samp_8x16b_rpart2_3, coeff_c2_c3_8x16b);
// a0c3+a1c2 + a2c1+a3c0 a1c3+a2c2 + a3c1+a4c0 a2c3+a3c2 + a4c1+a5c0
// a3c3+a4c2 +a5c1+a6c0
i4_res_4x32b_rpart1_0 = _mm_add_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_2);
// a1c0+a2c1 + a3c2+a4c3 a2c0+a3c1 + a5c2+a5c3 a3c0+a4c1 + a5c2+a6c3
// a4c0+a5c1 + a6c2+a7c3
i4_res_4x32b_rpart1_1 = _mm_add_epi32(i4_res_4x32b_rpart1_1, i4_res_4x32b_rpart1_3);
// a4c3+a5c2 + a6c1+a7c0 a5a3+a6c2 + a7c1+a8c0 a6c3+a7c2 + a8c1+a9c0
// a7c3+a8c2+ a9c1+a10c0
i4_res_4x32b_rpart2_0 = _mm_add_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_2);
// a5c0+a6c1 + a7c2+a8c3 a6c0+a7c1 + a8c2+a9c3 a7c0+a8c1 + a9c2+a10c3
// a8c0+a9c1 + a10c2+a11c3
i4_res_4x32b_rpart2_1 = _mm_add_epi32(i4_res_4x32b_rpart2_1, i4_res_4x32b_rpart2_3);
i4_res_4x32b_rpart1_2 =
_mm_unpacklo_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1);
i4_res_4x32b_rpart1_3 =
_mm_unpackhi_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1);
i4_res_4x32b_rpart2_2 =
_mm_unpacklo_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1);
i4_res_4x32b_rpart2_3 =
_mm_unpackhi_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1);
i4_res_4x32b_rpart1_0 = _mm_add_epi32(i4_res_4x32b_rpart1_2, res_512);
i4_res_4x32b_rpart1_1 = _mm_add_epi32(i4_res_4x32b_rpart1_3, res_512);
i4_res_4x32b_rpart1_0 = _mm_srai_epi32(i4_res_4x32b_rpart1_0, 10);
i4_res_4x32b_rpart1_1 = _mm_srai_epi32(i4_res_4x32b_rpart1_1, 10);
i4_res_4x32b_rpart2_0 = _mm_add_epi32(i4_res_4x32b_rpart2_2, res_512);
i4_res_4x32b_rpart2_1 = _mm_add_epi32(i4_res_4x32b_rpart2_3, res_512);
i4_res_4x32b_rpart2_0 = _mm_srai_epi32(i4_res_4x32b_rpart2_0, 10);
i4_res_4x32b_rpart2_1 = _mm_srai_epi32(i4_res_4x32b_rpart2_1, 10);
_mm_storeu_si128(
(__m128i *) pu1_out,
_mm_packus_epi16(_mm_packus_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1),
_mm_packus_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1)));
pi2_tmp += i4_filt_stride;
pu1_out += i4_out_stride;
} /* End of loop over y */
}
} /* isvcd_interpolate_base_luma_dyadic */
/*****************************************************************************/
/* */
/* Function Name : isvcd_interpolate_intra_base_sse42 */
/* */
/* Description : This function takes the reference array buffer & performs*/
/* interpolation of a component to find the intra */
/* resampled value */
/* Inputs : pv_intra_samp_ctxt : intra sampling context */
/* pu1_out : output buffer pointer */
/* i4_out_stride : output buffer stride */
/* i4_refarray_wd : reference array width */
/* i4_x_offset : offset in reference layer in horz direction*/
/* ps_coord : current mb co-ordinate */
/* i4_chroma_flag : chroma processing flag */
/* Globals : none */
/* Processing : it does the interpolation in vertical direction followed */
/* by horizontal direction */
/* Outputs : resampled pixels */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 06 09 2021 Kishore creation */
/* */
/*****************************************************************************/
void isvcd_interpolate_intra_base_sse42(void *pv_intra_samp_ctxt, UWORD8 *pu1_out,
WORD32 i4_out_stride, WORD32 i4_refarray_wd, WORD32 i4_mb_x,
WORD32 i4_mb_y, WORD32 i4_chroma_flag,
WORD32 i4_refarray_flag)
{
/* --------------------------------------------------------------------- */
/* Index Parameters */
/* --------------------------------------------------------------------- */
intra_sampling_ctxt_t *ps_ctxt;
intra_samp_map_ctxt_t *ps_map_ctxt;
intra_samp_lyr_ctxt *ps_lyr_ctxt;
WORD32 i4_x, i4_y;
WORD32 i4_frm_mb_x, i4_frm_mb_y;
UWORD8 *pu1_refarray = NULL;
ref_pixel_map_t *ps_x_pos_phase;
ref_pixel_map_t *ps_y_pos_phase;
WORD32 i4_temp_array_ht;
WORD32 *pi4_interp_buff;
WORD32 i4_mb_wd;
WORD32 i4_mb_ht;
WORD32 i4_x_min;
ref_min_max_map_t *ps_x_min_max;
WORD8 arr_y_ref_pos_luma[16] = {0};
WORD8 arr_x_ref_pos_luma[16] = {0};
WORD8 arr_x_ref_pos_luma_low[16] = {0};
WORD8 arr_x_ref_pos_luma_high[16] = {0};
WORD8 arr_phase_luma[32] = {0};
WORD8 *pi4_y_ref_pos_luma;
WORD8 *pi4_x_ref_pos_luma_low;
WORD8 *pi4_x_ref_pos_luma_high;
WORD8 *pi4_phase_luma;
UWORD8 *pu1_refarray_temp;
/* --------------------------------------------------------------------- */
/* Extracting pointers from the context */
/* --------------------------------------------------------------------- */
ps_ctxt = (intra_sampling_ctxt_t *) pv_intra_samp_ctxt;
ps_lyr_ctxt = &ps_ctxt->as_res_lyrs[ps_ctxt->i4_res_lyr_id];
if(0 == i4_refarray_flag)
{
pu1_refarray = ps_ctxt->pu1_refarray_buffer;
}
else if(1 == i4_refarray_flag)
{
pu1_refarray = ps_ctxt->pu1_refarray_cb;
}
/* --------------------------------------------------------------------- */
/* LUMA or CHROMA */
/* --------------------------------------------------------------------- */
if(1 == i4_chroma_flag)
ps_map_ctxt = &(ps_lyr_ctxt->s_chroma_map_ctxt);
else
ps_map_ctxt = &(ps_lyr_ctxt->s_luma_map_ctxt);
i4_mb_wd = MB_WIDTH >> i4_chroma_flag;
i4_mb_ht = MB_HEIGHT >> i4_chroma_flag;
ps_x_min_max = ps_map_ctxt->ps_x_min_max;
i4_frm_mb_y = i4_mb_y * i4_mb_ht;
i4_frm_mb_x = i4_mb_x * i4_mb_wd;
/* get the min position */
i4_x_min = ps_x_min_max[i4_mb_x].i2_min_pos;
/* --------------------------------------------------------------------- */
/* Projected frame level pointers */
/* --------------------------------------------------------------------- */
ps_x_pos_phase = ps_map_ctxt->ps_x_pos_phase;
ps_y_pos_phase = ps_map_ctxt->ps_y_pos_phase;
/* --------------------------------------------------------------------- */
/* Pointers and Dimenstion of the temporary buffer */
/* --------------------------------------------------------------------- */
i4_temp_array_ht = i4_mb_ht;
pi4_interp_buff = ps_ctxt->pi4_temp_interpolation_buffer;
if(i4_chroma_flag == 0)
{
/* --------------------------------------------------------------------- */
/* Loop for interpolation in vertical direction */
/* --------------------------------------------------------------------- */
WORD16 *pi2_interp_buff_temp;
pi2_interp_buff_temp = (WORD16 *) pi4_interp_buff;
{
__m128i out_res_8x16b_0, out_res_8x16b_1;
__m128i inp_8x16b_r0, inp_8x16b_r01_0, phs_mask_16x8b_r0, phs_mask_16x8b_r01_0,
out_res_8x16b_r01_0;
__m128i inp_8x16b_r1, inp_8x16b_r23_0, phs_mask_16x8b_r1, phs_mask_16x8b_r23_0,
out_res_8x16b_r01_1;
__m128i inp_8x16b_r2, inp_8x16b_r01_1, phs_mask_16x8b_r2, phs_mask_16x8b_r01_1,
out_res_8x16b_r23_0;
__m128i inp_8x16b_r3, inp_8x16b_r23_1, phs_mask_16x8b_r3, phs_mask_16x8b_r23_1,
out_res_8x16b_r23_1;
for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
{
arr_phase_luma[i4_y] = (WORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
arr_y_ref_pos_luma[i4_y] = (WORD8) (ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos);
}
pi4_y_ref_pos_luma = arr_y_ref_pos_luma;
pi4_phase_luma = arr_phase_luma;
for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
{
pu1_refarray_temp =
pu1_refarray + (pi4_y_ref_pos_luma[i4_y] * i4_refarray_wd) + (i4_x_min - 1);
inp_8x16b_r0 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp - i4_refarray_wd));
inp_8x16b_r1 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp));
inp_8x16b_r2 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp + i4_refarray_wd));
inp_8x16b_r3 =
_mm_loadu_si128((__m128i *) (pu1_refarray_temp + 2 * i4_refarray_wd));
inp_8x16b_r01_0 = _mm_unpacklo_epi8(inp_8x16b_r0, inp_8x16b_r1);
inp_8x16b_r23_0 = _mm_unpacklo_epi8(inp_8x16b_r2, inp_8x16b_r3);
inp_8x16b_r01_1 = _mm_unpackhi_epi8(inp_8x16b_r0, inp_8x16b_r1);
inp_8x16b_r23_1 = _mm_unpackhi_epi8(inp_8x16b_r2, inp_8x16b_r3);
phs_mask_16x8b_r0 = _mm_set1_epi8(g_ai1_interp_filter_luma[pi4_phase_luma[i4_y]]);
phs_mask_16x8b_r1 =
_mm_set1_epi8(g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 16]);
phs_mask_16x8b_r2 =
_mm_set1_epi8(g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 32]);
phs_mask_16x8b_r3 =
_mm_set1_epi8(g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 48]);
phs_mask_16x8b_r01_0 = _mm_unpacklo_epi8(phs_mask_16x8b_r0, phs_mask_16x8b_r1);
phs_mask_16x8b_r23_0 = _mm_unpacklo_epi8(phs_mask_16x8b_r2, phs_mask_16x8b_r3);
phs_mask_16x8b_r01_1 = _mm_unpackhi_epi8(phs_mask_16x8b_r0, phs_mask_16x8b_r1);
phs_mask_16x8b_r23_1 = _mm_unpackhi_epi8(phs_mask_16x8b_r2, phs_mask_16x8b_r3);
out_res_8x16b_r01_0 = _mm_maddubs_epi16(inp_8x16b_r01_0, phs_mask_16x8b_r01_0);
out_res_8x16b_r01_1 = _mm_maddubs_epi16(inp_8x16b_r01_1, phs_mask_16x8b_r01_1);
out_res_8x16b_r23_0 = _mm_maddubs_epi16(inp_8x16b_r23_0, phs_mask_16x8b_r23_0);
out_res_8x16b_r23_1 = _mm_maddubs_epi16(inp_8x16b_r23_1, phs_mask_16x8b_r23_1);
out_res_8x16b_0 = _mm_add_epi16(out_res_8x16b_r01_0, out_res_8x16b_r23_0);
out_res_8x16b_1 = _mm_add_epi16(out_res_8x16b_r01_1, out_res_8x16b_r23_1);
_mm_storeu_si128(
(__m128i *) (pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1)),
out_res_8x16b_0);
_mm_storeu_si128((__m128i *) (pi2_interp_buff_temp + (i4_y * i4_refarray_wd) +
(i4_x_min - 1) + 8),
out_res_8x16b_1);
}
}
/* --------------------------------------------------------------------- */
/* Loop for interpolation in horizontal direction */
/* --------------------------------------------------------------------- */
{
WORD32 strt_indx = 10, strt_indx_h = 0;
__m128i inp_8x16b_0;
__m128i inp_8x16b_1;
__m128i phs_mask_16x8b_0;
__m128i phs_mask_16x8b_1;
__m128i x_ref_pos_luma_mask_r0_0, x_ref_pos_luma_mask_r0_1, x_ref_pos_luma_mask_r1_0,
x_ref_pos_luma_mask_r1_1, x_ref_pos_luma_mask_r2_0, x_ref_pos_luma_mask_r2_1,
x_ref_pos_luma_mask_r3_0, x_ref_pos_luma_mask_r3_1;
__m128i inp_8x16b_2, inp_8x16b_3;
WORD32 i4_x2 = 0;
WORD32 i4_mb_wd_hlf = (i4_mb_wd >> 1);
__m128i twos = _mm_set1_epi8(2);
strt_indx = ps_x_pos_phase[0 + i4_frm_mb_x].i2_ref_pos - 1;
strt_indx_h = (ps_x_pos_phase[8 + i4_frm_mb_x].i2_ref_pos - strt_indx - 1);
for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
{
arr_x_ref_pos_luma[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
arr_phase_luma[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
arr_x_ref_pos_luma[i4_x] = arr_x_ref_pos_luma[i4_x] - strt_indx - 1;
}
for(i4_x = 0; i4_x < i4_mb_wd_hlf; i4_x++)
{
i4_x2 = i4_x << 1;
arr_x_ref_pos_luma_low[i4_x2] = (arr_x_ref_pos_luma[i4_x]) << 1;
arr_x_ref_pos_luma_low[i4_x2 + 1] = arr_x_ref_pos_luma_low[i4_x2] + 1;
}
for(i4_x = i4_mb_wd_hlf; i4_x < i4_mb_wd; i4_x++)
{
i4_x2 = (i4_x - i4_mb_wd_hlf) << 1;
arr_x_ref_pos_luma_high[i4_x2] = ((arr_x_ref_pos_luma[i4_x] - strt_indx_h) << 1);
arr_x_ref_pos_luma_high[i4_x2 + 1] = arr_x_ref_pos_luma_high[i4_x2] + 1;
}
pi4_x_ref_pos_luma_low = arr_x_ref_pos_luma_low;
pi4_x_ref_pos_luma_high = arr_x_ref_pos_luma_high;
pi4_phase_luma = arr_phase_luma;
phs_mask_16x8b_0 = _mm_loadu_si128((__m128i *) (pi4_phase_luma));
phs_mask_16x8b_1 = _mm_loadu_si128((__m128i *) (pi4_phase_luma + 8));
x_ref_pos_luma_mask_r0_0 = _mm_loadu_si128((__m128i *) (pi4_x_ref_pos_luma_low));
x_ref_pos_luma_mask_r0_1 = _mm_loadu_si128((__m128i *) (pi4_x_ref_pos_luma_high));
x_ref_pos_luma_mask_r1_0 = _mm_add_epi8(x_ref_pos_luma_mask_r0_0, twos);
x_ref_pos_luma_mask_r1_1 = _mm_add_epi8(x_ref_pos_luma_mask_r0_1, twos);
x_ref_pos_luma_mask_r2_0 = _mm_add_epi8(x_ref_pos_luma_mask_r1_0, twos);
x_ref_pos_luma_mask_r2_1 = _mm_add_epi8(x_ref_pos_luma_mask_r1_1, twos);
x_ref_pos_luma_mask_r3_0 = x_ref_pos_luma_mask_r0_0;
x_ref_pos_luma_mask_r3_1 = x_ref_pos_luma_mask_r0_1;
{
__m128i ip_filt_16x8b_r0, ip_filt_8x16b_r0_0, ip_filt_8x16b_r0_1,
ip_filt_8x16b_r01_l_0, ip_filt_8x16b_r01_h_0;
__m128i ip_filt_16x8b_r1, ip_filt_8x16b_r1_0, ip_filt_8x16b_r1_1,
ip_filt_8x16b_r23_l_0, ip_filt_8x16b_r23_h_0;
__m128i ip_filt_16x8b_r2, ip_filt_8x16b_r2_0, ip_filt_8x16b_r2_1,
ip_filt_8x16b_r01_l_1, ip_filt_8x16b_r01_h_1;
__m128i ip_filt_16x8b_r3, ip_filt_8x16b_r3_0, ip_filt_8x16b_r3_1,
ip_filt_8x16b_r23_l_1, ip_filt_8x16b_r23_h_1;
__m128i inp_8x16b_r0_0, inp_8x16b_r2_0, inp_8x16b_r01_l_0, inp_8x16b_r01_h_0,
out_res_4x32b_r01_l_0, out_res_4x32b_r01_h_0;
__m128i inp_8x16b_r0_1, inp_8x16b_r2_1, inp_8x16b_r23_l_0, inp_8x16b_r23_h_0,
out_res_4x32b_r01_l_1, out_res_4x32b_r01_h_1;
__m128i inp_8x16b_r1_0, inp_8x16b_r3_0, inp_8x16b_r01_l_1, inp_8x16b_r01_h_1,
out_res_4x32b_r23_l_0, out_res_4x32b_r23_h_0;
__m128i inp_8x16b_r1_1, inp_8x16b_r3_1, inp_8x16b_r23_l_1, inp_8x16b_r23_h_1,
out_res_4x32b_r23_l_1, out_res_4x32b_r23_h_1;
__m128i out_res_4x32b_l_0;
__m128i out_res_4x32b_l_1;
__m128i out_res_4x32b_h_0;
__m128i out_res_4x32b_h_1;
__m128i out_res_8x16b_l;
__m128i out_res_8x16b_h;
__m128i out_res_16x8b;
__m128i const_512 = _mm_set1_epi32(512);
ip_filt_16x8b_r0 = _mm_loadu_si128((__m128i *) (g_ai1_interp_filter_luma));
ip_filt_16x8b_r1 = _mm_loadu_si128((__m128i *) (g_ai1_interp_filter_luma + 16));
ip_filt_16x8b_r2 = _mm_loadu_si128((__m128i *) (g_ai1_interp_filter_luma + 32));
ip_filt_16x8b_r3 = _mm_loadu_si128((__m128i *) (g_ai1_interp_filter_luma + 48));
ip_filt_8x16b_r0_0 =
_mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r0, phs_mask_16x8b_0));
ip_filt_8x16b_r1_0 =
_mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r1, phs_mask_16x8b_0));
ip_filt_8x16b_r2_0 =
_mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r2, phs_mask_16x8b_0));
ip_filt_8x16b_r3_0 =
_mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r3, phs_mask_16x8b_0));
ip_filt_8x16b_r0_1 =
_mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r0, phs_mask_16x8b_1));
ip_filt_8x16b_r1_1 =
_mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r1, phs_mask_16x8b_1));
ip_filt_8x16b_r2_1 =
_mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r2, phs_mask_16x8b_1));
ip_filt_8x16b_r3_1 =
_mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r3, phs_mask_16x8b_1));
ip_filt_8x16b_r01_l_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r0_0, ip_filt_8x16b_r1_0);
ip_filt_8x16b_r23_l_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r2_0, ip_filt_8x16b_r3_0);
ip_filt_8x16b_r01_l_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r0_0, ip_filt_8x16b_r1_0);
ip_filt_8x16b_r23_l_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r2_0, ip_filt_8x16b_r3_0);
ip_filt_8x16b_r01_h_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r0_1, ip_filt_8x16b_r1_1);
ip_filt_8x16b_r23_h_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r2_1, ip_filt_8x16b_r3_1);
ip_filt_8x16b_r01_h_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r0_1, ip_filt_8x16b_r1_1);
ip_filt_8x16b_r23_h_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r2_1, ip_filt_8x16b_r3_1);
for(i4_y = 0; i4_y < i4_temp_array_ht; i4_y++)
{
inp_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_interp_buff_temp + strt_indx));
inp_8x16b_1 = _mm_loadu_si128(
(__m128i *) (pi2_interp_buff_temp + strt_indx + strt_indx_h));
inp_8x16b_2 =
_mm_loadu_si128((__m128i *) (pi2_interp_buff_temp + strt_indx + 3));
inp_8x16b_3 = _mm_loadu_si128(
(__m128i *) (pi2_interp_buff_temp + strt_indx + strt_indx_h + 3));
pi2_interp_buff_temp += i4_refarray_wd;
inp_8x16b_r0_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r0_0);
inp_8x16b_r0_1 = _mm_shuffle_epi8(inp_8x16b_1, x_ref_pos_luma_mask_r0_1);
inp_8x16b_r1_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r1_0);
inp_8x16b_r1_1 = _mm_shuffle_epi8(inp_8x16b_1, x_ref_pos_luma_mask_r1_1);
inp_8x16b_r2_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r2_0);
inp_8x16b_r2_1 = _mm_shuffle_epi8(inp_8x16b_1, x_ref_pos_luma_mask_r2_1);
inp_8x16b_r3_0 = _mm_shuffle_epi8(inp_8x16b_2, x_ref_pos_luma_mask_r3_0);
inp_8x16b_r3_1 = _mm_shuffle_epi8(inp_8x16b_3, x_ref_pos_luma_mask_r3_1);
inp_8x16b_r01_l_0 = _mm_unpacklo_epi16(inp_8x16b_r0_0, inp_8x16b_r1_0);
inp_8x16b_r23_l_0 = _mm_unpacklo_epi16(inp_8x16b_r2_0, inp_8x16b_r3_0);
inp_8x16b_r01_l_1 = _mm_unpackhi_epi16(inp_8x16b_r0_0, inp_8x16b_r1_0);
inp_8x16b_r23_l_1 = _mm_unpackhi_epi16(inp_8x16b_r2_0, inp_8x16b_r3_0);
inp_8x16b_r01_h_0 = _mm_unpacklo_epi16(inp_8x16b_r0_1, inp_8x16b_r1_1);
inp_8x16b_r23_h_0 = _mm_unpacklo_epi16(inp_8x16b_r2_1, inp_8x16b_r3_1);
inp_8x16b_r01_h_1 = _mm_unpackhi_epi16(inp_8x16b_r0_1, inp_8x16b_r1_1);
inp_8x16b_r23_h_1 = _mm_unpackhi_epi16(inp_8x16b_r2_1, inp_8x16b_r3_1);
out_res_4x32b_r01_l_0 =
_mm_madd_epi16(inp_8x16b_r01_l_0, ip_filt_8x16b_r01_l_0);
out_res_4x32b_r01_l_1 =
_mm_madd_epi16(inp_8x16b_r01_l_1, ip_filt_8x16b_r01_l_1);
out_res_4x32b_r23_l_0 =
_mm_madd_epi16(inp_8x16b_r23_l_0, ip_filt_8x16b_r23_l_0);
out_res_4x32b_r23_l_1 =
_mm_madd_epi16(inp_8x16b_r23_l_1, ip_filt_8x16b_r23_l_1);
out_res_4x32b_r01_h_0 =
_mm_madd_epi16(inp_8x16b_r01_h_0, ip_filt_8x16b_r01_h_0);
out_res_4x32b_r01_h_1 =
_mm_madd_epi16(inp_8x16b_r01_h_1, ip_filt_8x16b_r01_h_1);
out_res_4x32b_r23_h_0 =
_mm_madd_epi16(inp_8x16b_r23_h_0, ip_filt_8x16b_r23_h_0);
out_res_4x32b_r23_h_1 =
_mm_madd_epi16(inp_8x16b_r23_h_1, ip_filt_8x16b_r23_h_1);
out_res_4x32b_l_0 = _mm_add_epi32(out_res_4x32b_r01_l_0, out_res_4x32b_r23_l_0);
out_res_4x32b_l_1 = _mm_add_epi32(out_res_4x32b_r01_l_1, out_res_4x32b_r23_l_1);
out_res_4x32b_h_0 = _mm_add_epi32(out_res_4x32b_r01_h_0, out_res_4x32b_r23_h_0);
out_res_4x32b_h_1 = _mm_add_epi32(out_res_4x32b_r01_h_1, out_res_4x32b_r23_h_1);
out_res_4x32b_l_0 =
_mm_srai_epi32(_mm_add_epi32(out_res_4x32b_l_0, const_512), 10);
out_res_4x32b_l_1 =
_mm_srai_epi32(_mm_add_epi32(out_res_4x32b_l_1, const_512), 10);
out_res_4x32b_h_0 =
_mm_srai_epi32(_mm_add_epi32(out_res_4x32b_h_0, const_512), 10);
out_res_4x32b_h_1 =
_mm_srai_epi32(_mm_add_epi32(out_res_4x32b_h_1, const_512), 10);
out_res_8x16b_l = _mm_packs_epi32(out_res_4x32b_l_0, out_res_4x32b_l_1);
out_res_8x16b_h = _mm_packs_epi32(out_res_4x32b_h_0, out_res_4x32b_h_1);
out_res_16x8b = _mm_packus_epi16(out_res_8x16b_l, out_res_8x16b_h);
_mm_storeu_si128((__m128i *) (pu1_out + (i4_y * i4_out_stride)), out_res_16x8b);
}
}
}
}
else
{
WORD16 *pi2_interp_buff_temp;
pi2_interp_buff_temp = (WORD16 *) pi4_interp_buff;
{
__m128i inp_8x16b_r0, inp_8x16b_r01_0, phs_mask_16x8b_r0, phs_mask_16x8b_r01_0,
out_res_8x16b_r01_0;
__m128i inp_8x16b_r1, phs_mask_16x8b_r1, out_res_8x16b_r01_1;
__m128i inp_8x16b_r01_1, phs_mask_16x8b_r01_1;
for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
{
arr_y_ref_pos_luma[i4_y] = (WORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos;
arr_phase_luma[i4_y] = (WORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
}
pi4_y_ref_pos_luma = arr_y_ref_pos_luma;
pi4_phase_luma = arr_phase_luma;
for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
{
pu1_refarray_temp =
pu1_refarray + (pi4_y_ref_pos_luma[i4_y] * i4_refarray_wd) + (i4_x_min - 1);
inp_8x16b_r0 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp));
inp_8x16b_r1 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp + i4_refarray_wd));
inp_8x16b_r01_0 = _mm_unpacklo_epi8(inp_8x16b_r0, inp_8x16b_r1);
inp_8x16b_r01_1 = _mm_unpackhi_epi8(inp_8x16b_r0, inp_8x16b_r1);
phs_mask_16x8b_r0 = _mm_set1_epi8(g_au1_interp_filter_chroma[pi4_phase_luma[i4_y]]);
phs_mask_16x8b_r1 =
_mm_set1_epi8(g_au1_interp_filter_chroma[pi4_phase_luma[i4_y] + 16]);
phs_mask_16x8b_r01_0 = _mm_unpacklo_epi8(phs_mask_16x8b_r0, phs_mask_16x8b_r1);
phs_mask_16x8b_r01_1 = _mm_unpackhi_epi8(phs_mask_16x8b_r0, phs_mask_16x8b_r1);
out_res_8x16b_r01_0 = _mm_maddubs_epi16(inp_8x16b_r01_0, phs_mask_16x8b_r01_0);
out_res_8x16b_r01_1 = _mm_maddubs_epi16(inp_8x16b_r01_1, phs_mask_16x8b_r01_1);
_mm_storeu_si128(
(__m128i *) (pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1)),
out_res_8x16b_r01_0);
_mm_storeu_si128((__m128i *) (pi2_interp_buff_temp + (i4_y * i4_refarray_wd) +
(i4_x_min - 1) + 8),
out_res_8x16b_r01_1);
}
}
{
WORD32 strt_indx = 10;
__m128i inp_8x16b_0, inp_8x16b_r0_0;
__m128i phs_mask_16x8b_0;
__m128i x_ref_pos_luma_mask_r0_0, x_ref_pos_luma_mask_r1_0;
__m128i ip_filt_16x8b_r0, ip_filt_8x16b_r0_0, ip_filt_8x16b_r01_l_0;
__m128i ip_filt_16x8b_r1, ip_filt_8x16b_r1_0, ip_filt_8x16b_r01_l_1;
__m128i inp_8x16b_r1_0, inp_8x16b_r01_l_0, out_res_4x32b_r01_l_0;
__m128i inp_8x16b_r01_l_1, out_res_4x32b_r01_l_1;
__m128i out_res_4x32b_l_0;
__m128i out_res_4x32b_l_1;
__m128i out_res_8x16b_l;
__m128i out_16x8b_r1;
__m128i chroma_mask;
__m128i const_512 = _mm_set1_epi32(512);
WORD32 i4_x2 = 0;
__m128i twos = _mm_set1_epi8(2);
strt_indx = ps_x_pos_phase[0 + i4_frm_mb_x].i2_ref_pos;
for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
{
arr_x_ref_pos_luma[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
arr_phase_luma[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
arr_x_ref_pos_luma[i4_x] = arr_x_ref_pos_luma[i4_x] - strt_indx;
i4_x2 = i4_x << 1;
arr_x_ref_pos_luma_low[i4_x2] = (arr_x_ref_pos_luma[i4_x]) << 1;
arr_x_ref_pos_luma_low[i4_x2 + 1] = arr_x_ref_pos_luma_low[i4_x2] + 1;
}
pi4_x_ref_pos_luma_low = arr_x_ref_pos_luma_low;
pi4_phase_luma = arr_phase_luma;
phs_mask_16x8b_0 = _mm_loadu_si128((__m128i *) (pi4_phase_luma));
x_ref_pos_luma_mask_r0_0 = _mm_loadu_si128((__m128i *) (pi4_x_ref_pos_luma_low));
x_ref_pos_luma_mask_r1_0 = _mm_add_epi8(x_ref_pos_luma_mask_r0_0, twos);
ip_filt_16x8b_r0 = _mm_loadu_si128((__m128i *) (g_au1_interp_filter_chroma));
ip_filt_16x8b_r1 = _mm_loadu_si128((__m128i *) (g_au1_interp_filter_chroma + 16));
ip_filt_8x16b_r0_0 =
_mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r0, phs_mask_16x8b_0));
ip_filt_8x16b_r1_0 =
_mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r1, phs_mask_16x8b_0));
ip_filt_8x16b_r01_l_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r0_0, ip_filt_8x16b_r1_0);
ip_filt_8x16b_r01_l_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r0_0, ip_filt_8x16b_r1_0);
for(i4_y = 0; i4_y < i4_temp_array_ht; i4_y++)
{
inp_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_interp_buff_temp + strt_indx));
pi2_interp_buff_temp += i4_refarray_wd;
inp_8x16b_r0_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r0_0);
inp_8x16b_r1_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r1_0);
inp_8x16b_r01_l_0 = _mm_unpacklo_epi16(inp_8x16b_r0_0, inp_8x16b_r1_0);
inp_8x16b_r01_l_1 = _mm_unpackhi_epi16(inp_8x16b_r0_0, inp_8x16b_r1_0);
out_res_4x32b_r01_l_0 = _mm_madd_epi16(inp_8x16b_r01_l_0, ip_filt_8x16b_r01_l_0);
out_res_4x32b_r01_l_1 = _mm_madd_epi16(inp_8x16b_r01_l_1, ip_filt_8x16b_r01_l_1);
out_res_4x32b_l_0 =
_mm_srai_epi32(_mm_add_epi32(out_res_4x32b_r01_l_0, const_512), 10);
out_res_4x32b_l_1 =
_mm_srai_epi32(_mm_add_epi32(out_res_4x32b_r01_l_1, const_512), 10);
out_res_8x16b_l = _mm_packs_epi32(out_res_4x32b_l_0, out_res_4x32b_l_1);
chroma_mask = _mm_set1_epi16(0xFF00);
out_16x8b_r1 = _mm_loadu_si128((__m128i *) (pu1_out + (i4_y * i4_out_stride)));
out_16x8b_r1 = _mm_and_si128(out_16x8b_r1, chroma_mask);
out_16x8b_r1 = _mm_add_epi8(out_res_8x16b_l, out_16x8b_r1);
_mm_storeu_si128((__m128i *) (pu1_out + (i4_y * i4_out_stride)), out_16x8b_r1);
}
}
}
return;
} /* End of Interpolation Function */
/*****************************************************************************/
/* */
/* Function Name : isvcd_vert_interpol_chroma_dyadic_1_sse42 */
/* */
/* Description : This function takes the reference array buffer & performs*/
/* interpolation of a component to find the intra */
/* resampled value */
/* Inputs : pv_intra_samp_ctxt : intra sampling context */
/* pu1_out : output buffer pointer */
/* i4_out_stride : output buffer stride */
/* i4_refarray_wd : reference array width */
/* i4_x_offset : offset in reference layer in horz direction*/
/* ps_coord : current mb co-ordinate */
/* i4_chroma_flag : chroma processing flag */
/* Globals : none */
/* Processing : it does the interpolation on horizontal direction */
/* Outputs : resampled pixels */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 06 09 2021 Kishore creation */
/* */
/*****************************************************************************/
void isvcd_vert_interpol_chroma_dyadic_1_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
WORD32 i4_phase_0, WORD32 i4_phase_1)
{
WORD8 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
WORD32 i4_filt_stride, i4_src_stride;
UWORD8 *pu1_inp;
WORD16 *pi2_tmp;
__m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3, i4_samp_16x8b_4,
i4_samp_16x8b_5;
__m128i i4_res_8x16b_r0, i4_res_8x16b_r1, i4_res_8x16b_r2, i4_res_8x16b_r3, i4_res_8x16b_r4,
i4_res_8x16b_r5, i4_res_8x16b_r6, i4_res_8x16b_r7;
__m128i i4_res_8x16b_r7_temp;
__m128i i4_c0_c1_16x8b, i4_c2_c3_16x8b;
i4_coeff_0 = (WORD8) (8 - i4_phase_0);
i4_coeff_1 = (WORD8) (i4_phase_0);
i4_coeff_2 = (WORD8) (8 - i4_phase_1);
i4_coeff_3 = (WORD8) (i4_phase_1);
i4_c0_c1_16x8b =
_mm_set_epi8(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0);
i4_c2_c3_16x8b =
_mm_set_epi8(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2);
pu1_inp = pu1_inp_buf;
pi2_tmp = pi2_tmp_filt_buf;
i4_filt_stride = 6;
i4_src_stride = DYADIC_REF_W_C;
i4_samp_16x8b_0 = _mm_loadl_epi64((__m128i *) (pu1_inp));
i4_samp_16x8b_1 = _mm_loadl_epi64((__m128i *) (pu1_inp + i4_src_stride));
i4_samp_16x8b_2 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1)));
i4_samp_16x8b_3 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
i4_samp_16x8b_4 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 2)));
i4_samp_16x8b_5 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 2) + i4_src_stride));
i4_samp_16x8b_0 = _mm_unpacklo_epi8(i4_samp_16x8b_0, i4_samp_16x8b_1);
i4_res_8x16b_r0 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c0_c1_16x8b);
_mm_storeu_si128((__m128i *) (pi2_tmp), i4_res_8x16b_r0);
i4_samp_16x8b_1 = _mm_unpacklo_epi8(i4_samp_16x8b_1, i4_samp_16x8b_2);
i4_res_8x16b_r1 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c2_c3_16x8b);
_mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r1);
i4_res_8x16b_r2 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c0_c1_16x8b);
_mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1)), i4_res_8x16b_r2);
i4_samp_16x8b_2 = _mm_unpacklo_epi8(i4_samp_16x8b_2, i4_samp_16x8b_3);
i4_res_8x16b_r3 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c2_c3_16x8b);
_mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1) + i4_filt_stride),
i4_res_8x16b_r3);
i4_res_8x16b_r4 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c0_c1_16x8b);
_mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2)), i4_res_8x16b_r4);
i4_samp_16x8b_3 = _mm_unpacklo_epi8(i4_samp_16x8b_3, i4_samp_16x8b_4);
i4_res_8x16b_r5 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c2_c3_16x8b);
_mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + i4_filt_stride),
i4_res_8x16b_r5);
i4_res_8x16b_r6 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c0_c1_16x8b);
_mm_storel_epi64((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1)),
i4_res_8x16b_r6);
i4_res_8x16b_r6 = _mm_shuffle_epi32(i4_res_8x16b_r6, 78);
i4_samp_16x8b_4 = _mm_unpacklo_epi8(i4_samp_16x8b_4, i4_samp_16x8b_5);
i4_res_8x16b_r7 = _mm_maddubs_epi16(i4_samp_16x8b_4, i4_c2_c3_16x8b);
i4_res_8x16b_r7 = _mm_shuffle_epi32(i4_res_8x16b_r7, 147);
i4_res_8x16b_r7_temp = _mm_blend_epi16(i4_res_8x16b_r6, i4_res_8x16b_r7, 252);
_mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1) + 4),
i4_res_8x16b_r7_temp);
}
/*****************************************************************************/
/* */
/* Function Name : isvcd_vert_interpol_chroma_dyadic_2_sse42 */
/* */
/* Description : This function takes the reference array buffer & performs*/
/* vertical intra resampling for dyadic scaling ratios for */
/* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
/* chroma_phase_y_plus1: */
/* ref_lyr cur_lyr */
/* 0 1 */
/* 0 2 */
/* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */
/* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the */
/* vertically interpolated data */
/* i4_phase_0 : y phase for even values of y */
/* i4_phase_1 : y phase for odd values of y */
/* Globals : none */
/* Processing : it does the interpolation in vertical direction */
/* Outputs : vertically resampled samples */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 21 05 2021 Dolan creation */
/* */
/*****************************************************************************/
void isvcd_vert_interpol_chroma_dyadic_2_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
WORD32 i4_phase_0, WORD32 i4_phase_1)
{
WORD8 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
WORD32 i4_filt_stride, i4_src_stride;
UWORD8 *pu1_inp;
WORD16 *pi2_tmp;
__m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3, i4_samp_16x8b_4;
__m128i i4_res_8x16b_r0, i4_res_8x16b_r1, i4_res_8x16b_r2, i4_res_8x16b_r3, i4_res_8x16b_r4,
i4_res_8x16b_r5, i4_res_8x16b_r6, i4_res_8x16b_r7;
__m128i i4_res_8x16b_r7_temp, i4_c0_c1_16x8b, i4_c2_c3_16x8b;
i4_coeff_0 = (WORD8) (8 - i4_phase_0);
i4_coeff_1 = (WORD8) (i4_phase_0);
i4_coeff_2 = (WORD8) (8 - i4_phase_1);
i4_coeff_3 = (WORD8) (i4_phase_1);
i4_c0_c1_16x8b =
_mm_set_epi8(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0);
i4_c2_c3_16x8b =
_mm_set_epi8(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2);
pi2_tmp = pi2_tmp_filt_buf;
i4_filt_stride = 6;
i4_src_stride = DYADIC_REF_W_C;
pu1_inp = pu1_inp_buf + i4_src_stride;
i4_samp_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_inp));
i4_samp_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_inp + i4_src_stride));
i4_samp_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1)));
i4_samp_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
i4_samp_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 2)));
i4_samp_16x8b_0 = _mm_unpacklo_epi8(i4_samp_16x8b_0, i4_samp_16x8b_1);
i4_res_8x16b_r0 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c0_c1_16x8b);
_mm_storeu_si128((__m128i *) (pi2_tmp), i4_res_8x16b_r0);
i4_res_8x16b_r1 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c2_c3_16x8b);
_mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r1);
i4_samp_16x8b_1 = _mm_unpacklo_epi8(i4_samp_16x8b_1, i4_samp_16x8b_2);
i4_res_8x16b_r2 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c0_c1_16x8b);
_mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1)), i4_res_8x16b_r2);
i4_res_8x16b_r3 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c2_c3_16x8b);
_mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1) + i4_filt_stride),
i4_res_8x16b_r3);
i4_samp_16x8b_2 = _mm_unpacklo_epi8(i4_samp_16x8b_2, i4_samp_16x8b_3);
i4_res_8x16b_r4 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c0_c1_16x8b);
_mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2)), i4_res_8x16b_r4);
i4_res_8x16b_r5 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c2_c3_16x8b);
_mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + i4_filt_stride),
i4_res_8x16b_r5);
i4_samp_16x8b_3 = _mm_unpacklo_epi8(i4_samp_16x8b_3, i4_samp_16x8b_4);
i4_res_8x16b_r6 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c0_c1_16x8b);
_mm_storel_epi64((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1)),
i4_res_8x16b_r6);
i4_res_8x16b_r7 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c2_c3_16x8b);
i4_res_8x16b_r6 = _mm_shuffle_epi32(i4_res_8x16b_r6, 78);
i4_res_8x16b_r7 = _mm_shuffle_epi32(i4_res_8x16b_r7, 147);
i4_res_8x16b_r7_temp = _mm_blend_epi16(i4_res_8x16b_r6, i4_res_8x16b_r7, 252);
_mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1) + 4),
i4_res_8x16b_r7_temp);
}
/*****************************************************************************/
/* */
/* Function Name : isvcd_vert_interpol_chroma_dyadic_3_sse42 */
/* */
/* Description : This function takes the reference array buffer & performs*/
/* vertical intra resampling for dyadic scaling ratios for */
/* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
/* chroma_phase_y_plus1: */
/* ref_lyr cur_lyr */
/* 2 0 */
/* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */
/* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the */
/* vertically interpolated data */
/* i4_phase_0 : y phase for even values of y */
/* i4_phase_1 : y phase for odd values of y */
/* Globals : none */
/* Processing : it does the interpolation in vertical direction */
/* Outputs : vertically resampled samples */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 21 05 2021 Dolan creation */
/* */
/*****************************************************************************/
void isvcd_vert_interpol_chroma_dyadic_3_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
WORD32 i4_phase_0, WORD32 i4_phase_1)
{
WORD8 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
WORD32 i4_filt_stride, i4_src_stride;
UWORD8 *pu1_inp;
WORD16 *pi2_tmp;
__m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3, i4_samp_16x8b_4;
__m128i i4_res_8x16b_r0, i4_res_8x16b_r1, i4_res_8x16b_r2, i4_res_8x16b_r3, i4_res_8x16b_r4,
i4_res_8x16b_r5, i4_res_8x16b_r6, i4_res_8x16b_r7;
__m128i i4_res_8x16b_r7_temp, i4_c0_c1_16x8b, i4_c2_c3_16x8b;
i4_coeff_0 = (WORD8) (8 - i4_phase_0);
i4_coeff_1 = (WORD8) (i4_phase_0);
i4_coeff_2 = (WORD8) (8 - i4_phase_1);
i4_coeff_3 = (WORD8) (i4_phase_1);
i4_c0_c1_16x8b =
_mm_set_epi8(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0);
i4_c2_c3_16x8b =
_mm_set_epi8(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2);
pi2_tmp = pi2_tmp_filt_buf;
i4_filt_stride = 6;
i4_src_stride = DYADIC_REF_W_C;
pu1_inp = pu1_inp_buf;
i4_samp_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_inp));
i4_samp_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_inp + i4_src_stride));
i4_samp_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1)));
i4_samp_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
i4_samp_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 2)));
i4_samp_16x8b_0 = _mm_unpacklo_epi8(i4_samp_16x8b_0, i4_samp_16x8b_1);
i4_res_8x16b_r0 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c0_c1_16x8b);
_mm_storeu_si128((__m128i *) (pi2_tmp), i4_res_8x16b_r0);
i4_res_8x16b_r1 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c2_c3_16x8b);
_mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r1);
i4_samp_16x8b_1 = _mm_unpacklo_epi8(i4_samp_16x8b_1, i4_samp_16x8b_2);
i4_res_8x16b_r2 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c0_c1_16x8b);
_mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1)), i4_res_8x16b_r2);
i4_res_8x16b_r3 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c2_c3_16x8b);
_mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1) + i4_filt_stride),
i4_res_8x16b_r3);
i4_samp_16x8b_2 = _mm_unpacklo_epi8(i4_samp_16x8b_2, i4_samp_16x8b_3);
i4_res_8x16b_r4 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c0_c1_16x8b);
_mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2)), i4_res_8x16b_r4);
i4_res_8x16b_r5 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c2_c3_16x8b);
_mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + i4_filt_stride),
i4_res_8x16b_r5);
i4_samp_16x8b_3 = _mm_unpacklo_epi8(i4_samp_16x8b_3, i4_samp_16x8b_4);
i4_res_8x16b_r6 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c0_c1_16x8b);
_mm_storel_epi64((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1)),
i4_res_8x16b_r6);
i4_res_8x16b_r7 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c2_c3_16x8b);
i4_res_8x16b_r6 = _mm_shuffle_epi32(i4_res_8x16b_r6, 78);
i4_res_8x16b_r7 = _mm_shuffle_epi32(i4_res_8x16b_r7, 147);
i4_res_8x16b_r7_temp = _mm_blend_epi16(i4_res_8x16b_r6, i4_res_8x16b_r7, 252);
_mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1) + 4),
i4_res_8x16b_r7_temp);
}
/*****************************************************************************/
/* */
/* Function Name : isvcd_horz_interpol_chroma_dyadic_1_sse42 */
/* */
/* Description : This function takes the reference array buffer & performs*/
/* vertical intra resampling for dyadic scaling ratios for */
/* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
/* chroma_phase_y_plus1: */
/* ref_lyr cur_lyr */
/* 2 0 */
/* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */
/* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold */
/* vertically interpolated data */
/* i4_phase_0 : y phase for even values of y */
/* i4_phase_1 : y phase for odd values of y */
/* Globals : none */
/* Processing : it does the interpolation in vertical direction */
/* Outputs : vertically resampled samples */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 21 05 2021 Dolan creation */
/* */
/*****************************************************************************/
void isvcd_horz_interpol_chroma_dyadic_1_sse42(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
WORD32 i4_out_stride, WORD32 i4_phase_0,
WORD32 i4_phase_1)
{
WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
WORD32 i4_dst_stride, i4_dst_stride2, i4_dst_stride4;
UWORD8 *pu1_out;
WORD16 *pi2_tmp;
__m128i i4_samp_8x16b_r1_0, i4_samp_8x16b_r1_1, i4_samp_8x16b_r1_2;
__m128i i4_samp_8x16b_r2_0, i4_samp_8x16b_r2_1, i4_samp_8x16b_r2_2;
__m128i i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1, i4_samp_8x16b_r3_2;
__m128i i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1, i4_samp_8x16b_r4_2;
__m128i i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1, i4_samp_8x16b_r5_2;
__m128i i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1, i4_samp_8x16b_r6_2;
__m128i i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1, i4_samp_8x16b_r7_2;
__m128i i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1, i4_samp_8x16b_r8_2;
__m128i i4_res_4x32b_r1_0, i4_res_4x32b_r1_1;
__m128i i4_res_4x32b_r2_0, i4_res_4x32b_r2_1;
__m128i i4_res_4x32b_r3_0, i4_res_4x32b_r3_1;
__m128i i4_res_4x32b_r4_0, i4_res_4x32b_r4_1;
__m128i i4_res_4x32b_r5_0, i4_res_4x32b_r5_1;
__m128i i4_res_4x32b_r6_0, i4_res_4x32b_r6_1;
__m128i i4_res_4x32b_r7_0, i4_res_4x32b_r7_1;
__m128i i4_res_4x32b_r8_0, i4_res_4x32b_r8_1;
__m128i i4_res_final_8x16b_r1;
__m128i i4_res_final_8x16b_r2;
__m128i i4_res_final_8x16b_r3;
__m128i i4_res_final_8x16b_r4;
__m128i i4_res_final_8x16b_r5;
__m128i i4_res_final_8x16b_r6;
__m128i i4_res_final_8x16b_r7;
__m128i i4_res_final_8x16b_r8;
__m128i out_16x8b_r1;
__m128i out_16x8b_r2;
__m128i out_16x8b_r3;
__m128i out_16x8b_r4;
__m128i out_16x8b_r5;
__m128i out_16x8b_r6;
__m128i out_16x8b_r7;
__m128i out_16x8b_r8;
__m128i i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1;
__m128i i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1;
__m128i i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1;
__m128i i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1;
__m128i chroma_mask, chroma_mask2;
__m128i coeff_c0_c1_8x16b, coeff_c2_c3_8x16b, res_32;
i4_coeff_0 = 8 - i4_phase_0;
i4_coeff_1 = i4_phase_0;
i4_coeff_2 = 8 - i4_phase_1;
i4_coeff_3 = i4_phase_1;
coeff_c0_c1_8x16b = _mm_set_epi16(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1,
i4_coeff_0, i4_coeff_1, i4_coeff_0);
coeff_c2_c3_8x16b = _mm_set_epi16(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3,
i4_coeff_2, i4_coeff_3, i4_coeff_2);
res_32 = _mm_set1_epi32(32);
pu1_out = pu1_out_buf;
pi2_tmp = pi2_tmp_filt_buf;
i4_dst_stride = i4_out_stride;
i4_dst_stride2 = i4_dst_stride << 1;
i4_dst_stride4 = i4_dst_stride << 2;
/* Horizontal interpolation */
/* x = 0, x_phase = phase_0 */
i4_samp_8x16b_r1_0 = _mm_loadu_si128((__m128i *) pi2_tmp); // a0 a1 a2 a3 a4 a5 a6 a7
i4_samp_8x16b_r2_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 6)); // b0 b1 b2 b3 b4 b5 b6 b7
i4_samp_8x16b_r3_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 12)); // b0 b1 b2 b3 b4 b5 b6 b7
i4_samp_8x16b_r4_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 18)); // b0 b1 b2 b3 b4 b5 b6 b7
i4_samp_8x16b_r5_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 24)); // b0 b1 b2 b3 b4 b5 b6 b7
i4_samp_8x16b_r6_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 30)); // b0 b1 b2 b3 b4 b5 b6 b7
i4_samp_8x16b_r7_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 36)); // b0 b1 b2 b3 b4 b5 b6 b7
i4_samp_8x16b_r8_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 42)); // b0 b1 b2 b3 b4 b5 b6 b7
i4_samp_8x16b_r1_1 = _mm_srli_si128(i4_samp_8x16b_r1_0, 2); // a1 a2 a3 a4 a5 a6 a7 0
i4_samp_8x16b_r1_2 = _mm_srli_si128(i4_samp_8x16b_r1_0, 4); // a2 a3 a4 a5 a6 a7 0 0
i4_samp_8x16b_r2_1 = _mm_srli_si128(i4_samp_8x16b_r2_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
i4_samp_8x16b_r2_2 = _mm_srli_si128(i4_samp_8x16b_r2_0, 4); // b2 b3 b4 b5 b6 b7 0 0
i4_samp_8x16b_r3_1 = _mm_srli_si128(i4_samp_8x16b_r3_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
i4_samp_8x16b_r3_2 = _mm_srli_si128(i4_samp_8x16b_r3_0, 4); // b2 b3 b4 b5 b6 b7 0 0
i4_samp_8x16b_r4_1 = _mm_srli_si128(i4_samp_8x16b_r4_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
i4_samp_8x16b_r4_2 = _mm_srli_si128(i4_samp_8x16b_r4_0, 4); // b2 b3 b4 b5 b6 b7 0 0
i4_samp_8x16b_r5_1 = _mm_srli_si128(i4_samp_8x16b_r5_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
i4_samp_8x16b_r5_2 = _mm_srli_si128(i4_samp_8x16b_r5_0, 4); // b2 b3 b4 b5 b6 b7 0 0
i4_samp_8x16b_r6_1 = _mm_srli_si128(i4_samp_8x16b_r6_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
i4_samp_8x16b_r6_2 = _mm_srli_si128(i4_samp_8x16b_r6_0, 4); // b2 b3 b4 b5 b6 b7 0 0
i4_samp_8x16b_r7_1 = _mm_srli_si128(i4_samp_8x16b_r7_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
i4_samp_8x16b_r7_2 = _mm_srli_si128(i4_samp_8x16b_r7_0, 4); // b2 b3 b4 b5 b6 b7 0 0
i4_samp_8x16b_r8_1 = _mm_srli_si128(i4_samp_8x16b_r8_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
i4_samp_8x16b_r8_2 = _mm_srli_si128(i4_samp_8x16b_r8_0, 4); // b2 b3 b4 b5 b6 b7 0 0
i4_samp_8x16b_r1_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r1_0,
i4_samp_8x16b_r1_1); // a0 a1 a1 a2 a2 a3 a3 a4
i4_samp_8x16b_r2_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r2_0,
i4_samp_8x16b_r2_1); // b0 b1 b1 b2 b2 b3 b3 b4
i4_samp_8x16b_r3_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1);
i4_samp_8x16b_r4_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1);
i4_samp_8x16b_r5_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1);
i4_samp_8x16b_r6_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1);
i4_samp_8x16b_r7_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1);
i4_samp_8x16b_r8_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1);
i4_samp_8x16b_r1_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r1_1,
i4_samp_8x16b_r1_2); // a1 a2 a2 a3 a3 a4 a4 a5
i4_samp_8x16b_r2_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r2_1,
i4_samp_8x16b_r2_2); // b1 b2 b2 b3 b3 b4 b4 b5
i4_samp_8x16b_r3_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r3_1, i4_samp_8x16b_r3_2);
i4_samp_8x16b_r4_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r4_1, i4_samp_8x16b_r4_2);
i4_samp_8x16b_r5_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r5_1, i4_samp_8x16b_r5_2);
i4_samp_8x16b_r6_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r6_1, i4_samp_8x16b_r6_2);
i4_samp_8x16b_r7_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r7_1, i4_samp_8x16b_r7_2);
i4_samp_8x16b_r8_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r8_1, i4_samp_8x16b_r8_2);
// a0c0+a1c1 a1c0+a2c1 a2c0+a3c1 a3c0+a4c1
i4_res_4x32b_r1_0 = _mm_madd_epi16(i4_samp_8x16b_r1_0, coeff_c0_c1_8x16b);
// b0c0+b1c1 b1c0+b2c1 b2c0+b3c1 b3c0+b4c1
i4_res_4x32b_r2_0 = _mm_madd_epi16(i4_samp_8x16b_r2_0, coeff_c0_c1_8x16b);
i4_res_4x32b_r3_0 = _mm_madd_epi16(i4_samp_8x16b_r3_0, coeff_c0_c1_8x16b);
i4_res_4x32b_r4_0 = _mm_madd_epi16(i4_samp_8x16b_r4_0, coeff_c0_c1_8x16b);
i4_res_4x32b_r5_0 = _mm_madd_epi16(i4_samp_8x16b_r5_0, coeff_c0_c1_8x16b);
i4_res_4x32b_r6_0 = _mm_madd_epi16(i4_samp_8x16b_r6_0, coeff_c0_c1_8x16b);
i4_res_4x32b_r7_0 = _mm_madd_epi16(i4_samp_8x16b_r7_0, coeff_c0_c1_8x16b);
i4_res_4x32b_r8_0 = _mm_madd_epi16(i4_samp_8x16b_r8_0, coeff_c0_c1_8x16b);
// a1c2+a2c3 a2c2+a3c3 a3c2+a4c3 a4c2+a5c3
i4_res_4x32b_r1_1 = _mm_madd_epi16(i4_samp_8x16b_r1_1, coeff_c2_c3_8x16b);
// b1c2+b2c3 b2c2+b3c3 b3c2+b4c3 b4c2+b5c3
i4_res_4x32b_r2_1 = _mm_madd_epi16(i4_samp_8x16b_r2_1, coeff_c2_c3_8x16b);
i4_res_4x32b_r3_1 = _mm_madd_epi16(i4_samp_8x16b_r3_1, coeff_c2_c3_8x16b);
i4_res_4x32b_r4_1 = _mm_madd_epi16(i4_samp_8x16b_r4_1, coeff_c2_c3_8x16b);
i4_res_4x32b_r5_1 = _mm_madd_epi16(i4_samp_8x16b_r5_1, coeff_c2_c3_8x16b);
i4_res_4x32b_r6_1 = _mm_madd_epi16(i4_samp_8x16b_r6_1, coeff_c2_c3_8x16b);
i4_res_4x32b_r7_1 = _mm_madd_epi16(i4_samp_8x16b_r7_1, coeff_c2_c3_8x16b);
i4_res_4x32b_r8_1 = _mm_madd_epi16(i4_samp_8x16b_r8_1, coeff_c2_c3_8x16b);
i4_res_4x32b_r1_0 = _mm_add_epi32(i4_res_4x32b_r1_0, res_32);
i4_res_4x32b_r2_0 = _mm_add_epi32(i4_res_4x32b_r2_0, res_32);
i4_res_4x32b_r3_0 = _mm_add_epi32(i4_res_4x32b_r3_0, res_32);
i4_res_4x32b_r4_0 = _mm_add_epi32(i4_res_4x32b_r4_0, res_32);
i4_res_4x32b_r5_0 = _mm_add_epi32(i4_res_4x32b_r5_0, res_32);
i4_res_4x32b_r6_0 = _mm_add_epi32(i4_res_4x32b_r6_0, res_32);
i4_res_4x32b_r7_0 = _mm_add_epi32(i4_res_4x32b_r7_0, res_32);
i4_res_4x32b_r8_0 = _mm_add_epi32(i4_res_4x32b_r8_0, res_32);
i4_res_4x32b_r1_1 = _mm_add_epi32(i4_res_4x32b_r1_1, res_32);
i4_res_4x32b_r2_1 = _mm_add_epi32(i4_res_4x32b_r2_1, res_32);
i4_res_4x32b_r3_1 = _mm_add_epi32(i4_res_4x32b_r3_1, res_32);
i4_res_4x32b_r4_1 = _mm_add_epi32(i4_res_4x32b_r4_1, res_32);
i4_res_4x32b_r5_1 = _mm_add_epi32(i4_res_4x32b_r5_1, res_32);
i4_res_4x32b_r6_1 = _mm_add_epi32(i4_res_4x32b_r6_1, res_32);
i4_res_4x32b_r7_1 = _mm_add_epi32(i4_res_4x32b_r7_1, res_32);
i4_res_4x32b_r8_1 = _mm_add_epi32(i4_res_4x32b_r8_1, res_32);
i4_res_4x32b_r1_0 = _mm_srai_epi32(i4_res_4x32b_r1_0, 6);
i4_res_4x32b_r2_0 = _mm_srai_epi32(i4_res_4x32b_r2_0, 6);
i4_res_4x32b_r3_0 = _mm_srai_epi32(i4_res_4x32b_r3_0, 6);
i4_res_4x32b_r4_0 = _mm_srai_epi32(i4_res_4x32b_r4_0, 6);
i4_res_4x32b_r5_0 = _mm_srai_epi32(i4_res_4x32b_r5_0, 6);
i4_res_4x32b_r6_0 = _mm_srai_epi32(i4_res_4x32b_r6_0, 6);
i4_res_4x32b_r7_0 = _mm_srai_epi32(i4_res_4x32b_r7_0, 6);
i4_res_4x32b_r8_0 = _mm_srai_epi32(i4_res_4x32b_r8_0, 6);
i4_res_4x32b_r1_1 = _mm_srai_epi32(i4_res_4x32b_r1_1, 6);
i4_res_4x32b_r2_1 = _mm_srai_epi32(i4_res_4x32b_r2_1, 6);
i4_res_4x32b_r3_1 = _mm_srai_epi32(i4_res_4x32b_r3_1, 6);
i4_res_4x32b_r4_1 = _mm_srai_epi32(i4_res_4x32b_r4_1, 6);
i4_res_4x32b_r5_1 = _mm_srai_epi32(i4_res_4x32b_r5_1, 6);
i4_res_4x32b_r6_1 = _mm_srai_epi32(i4_res_4x32b_r6_1, 6);
i4_res_4x32b_r7_1 = _mm_srai_epi32(i4_res_4x32b_r7_1, 6);
i4_res_4x32b_r8_1 = _mm_srai_epi32(i4_res_4x32b_r8_1, 6);
i4_res_final_8x16b_r12_0 = _mm_packs_epi32(i4_res_4x32b_r1_0, i4_res_4x32b_r2_0);
i4_res_final_8x16b_r34_0 = _mm_packs_epi32(i4_res_4x32b_r3_0, i4_res_4x32b_r4_0);
i4_res_final_8x16b_r56_0 = _mm_packs_epi32(i4_res_4x32b_r5_0, i4_res_4x32b_r6_0);
i4_res_final_8x16b_r67_0 = _mm_packs_epi32(i4_res_4x32b_r7_0, i4_res_4x32b_r8_0);
i4_res_final_8x16b_r12_1 = _mm_packs_epi32(i4_res_4x32b_r1_1, i4_res_4x32b_r2_1);
i4_res_final_8x16b_r34_1 = _mm_packs_epi32(i4_res_4x32b_r3_1, i4_res_4x32b_r4_1);
i4_res_final_8x16b_r56_1 = _mm_packs_epi32(i4_res_4x32b_r5_1, i4_res_4x32b_r6_1);
i4_res_final_8x16b_r67_1 = _mm_packs_epi32(i4_res_4x32b_r7_1, i4_res_4x32b_r8_1);
i4_res_final_8x16b_r1 = _mm_unpacklo_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
i4_res_final_8x16b_r2 = _mm_unpackhi_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
i4_res_final_8x16b_r3 = _mm_unpacklo_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
i4_res_final_8x16b_r4 = _mm_unpackhi_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
i4_res_final_8x16b_r5 = _mm_unpacklo_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
i4_res_final_8x16b_r6 = _mm_unpackhi_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
i4_res_final_8x16b_r7 = _mm_unpacklo_epi16(i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1);
i4_res_final_8x16b_r8 = _mm_unpackhi_epi16(i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1);
chroma_mask = _mm_set1_epi16(0xFF00);
chroma_mask2 = _mm_set1_epi16(0x00FF);
out_16x8b_r1 = _mm_loadu_si128((__m128i *) (&pu1_out[0]));
out_16x8b_r2 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride]));
out_16x8b_r3 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2]));
out_16x8b_r4 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2 + i4_dst_stride]));
out_16x8b_r5 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4]));
out_16x8b_r6 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride]));
out_16x8b_r7 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2]));
out_16x8b_r8 =
_mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2 + i4_dst_stride]));
out_16x8b_r1 = _mm_and_si128(out_16x8b_r1, chroma_mask);
out_16x8b_r2 = _mm_and_si128(out_16x8b_r2, chroma_mask);
out_16x8b_r3 = _mm_and_si128(out_16x8b_r3, chroma_mask);
out_16x8b_r4 = _mm_and_si128(out_16x8b_r4, chroma_mask);
out_16x8b_r5 = _mm_and_si128(out_16x8b_r5, chroma_mask);
out_16x8b_r6 = _mm_and_si128(out_16x8b_r6, chroma_mask);
out_16x8b_r7 = _mm_and_si128(out_16x8b_r7, chroma_mask);
out_16x8b_r8 = _mm_and_si128(out_16x8b_r8, chroma_mask);
i4_res_final_8x16b_r1 = _mm_and_si128(i4_res_final_8x16b_r1, chroma_mask2);
i4_res_final_8x16b_r2 = _mm_and_si128(i4_res_final_8x16b_r2, chroma_mask2);
i4_res_final_8x16b_r3 = _mm_and_si128(i4_res_final_8x16b_r3, chroma_mask2);
i4_res_final_8x16b_r4 = _mm_and_si128(i4_res_final_8x16b_r4, chroma_mask2);
i4_res_final_8x16b_r5 = _mm_and_si128(i4_res_final_8x16b_r5, chroma_mask2);
i4_res_final_8x16b_r6 = _mm_and_si128(i4_res_final_8x16b_r6, chroma_mask2);
i4_res_final_8x16b_r7 = _mm_and_si128(i4_res_final_8x16b_r7, chroma_mask2);
i4_res_final_8x16b_r8 = _mm_and_si128(i4_res_final_8x16b_r8, chroma_mask2);
out_16x8b_r1 = _mm_add_epi8(i4_res_final_8x16b_r1, out_16x8b_r1);
out_16x8b_r2 = _mm_add_epi8(i4_res_final_8x16b_r2, out_16x8b_r2);
out_16x8b_r3 = _mm_add_epi8(i4_res_final_8x16b_r3, out_16x8b_r3);
out_16x8b_r4 = _mm_add_epi8(i4_res_final_8x16b_r4, out_16x8b_r4);
out_16x8b_r5 = _mm_add_epi8(i4_res_final_8x16b_r5, out_16x8b_r5);
out_16x8b_r6 = _mm_add_epi8(i4_res_final_8x16b_r6, out_16x8b_r6);
out_16x8b_r7 = _mm_add_epi8(i4_res_final_8x16b_r7, out_16x8b_r7);
out_16x8b_r8 = _mm_add_epi8(i4_res_final_8x16b_r8, out_16x8b_r8);
_mm_storeu_si128((__m128i *) pu1_out, out_16x8b_r1);
_mm_storeu_si128((__m128i *) (pu1_out + i4_dst_stride), out_16x8b_r2);
_mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride << 1)), out_16x8b_r3);
_mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 3)), out_16x8b_r4);
_mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride << 2)), out_16x8b_r5);
_mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 5)), out_16x8b_r6);
_mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 6)), out_16x8b_r7);
_mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 7)), out_16x8b_r8);
/* End of loop over x */
} /* isvcd_horz_interpol_chroma_dyadic_1_sse42 */
/*****************************************************************************/
/* */
/* Function Name : isvcd_horz_interpol_chroma_dyadic_2_sse42 */
/* */
/* Description : This function takes the reference array buffer & performs*/
/* vertical intra resampling for dyadic scaling ratios for */
/* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
/* chroma_phase_y_plus1: */
/* ref_lyr cur_lyr */
/* 2 0 */
/* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */
/* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the */
/* vertically interpolated data */
/* i4_phase_0 : y phase for even values of y */
/* i4_phase_1 : y phase for odd values of y */
/* Globals : none */
/* Processing : it does the interpolation in vertical direction */
/* Outputs : vertically resampled samples */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 21 05 2021 Dolan creation */
/* */
/*****************************************************************************/
void isvcd_horz_interpol_chroma_dyadic_2_sse42(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
WORD32 i4_out_stride, WORD32 i4_phase_0,
WORD32 i4_phase_1)
{
WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
WORD32 i4_dst_stride, i4_dst_stride2, i4_dst_stride4;
UWORD8 *pu1_out;
WORD16 *pi2_tmp;
__m128i i4_samp_8x16b_r1_0, i4_samp_8x16b_r1_1;
__m128i i4_samp_8x16b_r2_0, i4_samp_8x16b_r2_1;
__m128i i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1;
__m128i i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1;
__m128i i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1;
__m128i i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1;
__m128i i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1;
__m128i i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1;
__m128i i4_res_4x32b_r1_0, i4_res_4x32b_r1_1;
__m128i i4_res_4x32b_r2_0, i4_res_4x32b_r2_1;
__m128i i4_res_4x32b_r3_0, i4_res_4x32b_r3_1;
__m128i i4_res_4x32b_r4_0, i4_res_4x32b_r4_1;
__m128i i4_res_4x32b_r5_0, i4_res_4x32b_r5_1;
__m128i i4_res_4x32b_r6_0, i4_res_4x32b_r6_1;
__m128i i4_res_4x32b_r7_0, i4_res_4x32b_r7_1;
__m128i i4_res_4x32b_r8_0, i4_res_4x32b_r8_1;
__m128i i4_res_final_8x16b_r1;
__m128i i4_res_final_8x16b_r2;
__m128i i4_res_final_8x16b_r3;
__m128i i4_res_final_8x16b_r4;
__m128i i4_res_final_8x16b_r5;
__m128i i4_res_final_8x16b_r6;
__m128i i4_res_final_8x16b_r7;
__m128i i4_res_final_8x16b_r8;
__m128i out_16x8b_r1;
__m128i out_16x8b_r2;
__m128i out_16x8b_r3;
__m128i out_16x8b_r4;
__m128i out_16x8b_r5;
__m128i out_16x8b_r6;
__m128i out_16x8b_r7;
__m128i out_16x8b_r8;
__m128i i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1;
__m128i i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1;
__m128i i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1;
__m128i i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1;
__m128i chroma_mask, chroma_mask2;
__m128i coeff_c0_c1_8x16b, coeff_c2_c3_8x16b, res_32;
i4_coeff_0 = 8 - i4_phase_0;
i4_coeff_1 = i4_phase_0;
i4_coeff_2 = 8 - i4_phase_1;
i4_coeff_3 = i4_phase_1;
coeff_c0_c1_8x16b = _mm_set_epi16(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1,
i4_coeff_0, i4_coeff_1, i4_coeff_0);
coeff_c2_c3_8x16b = _mm_set_epi16(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3,
i4_coeff_2, i4_coeff_3, i4_coeff_2);
res_32 = _mm_set1_epi32(32);
pu1_out = pu1_out_buf;
pi2_tmp = pi2_tmp_filt_buf + 1;
i4_dst_stride = i4_out_stride;
i4_dst_stride2 = i4_dst_stride << 1;
i4_dst_stride4 = i4_dst_stride << 2;
/* Horizontal interpolation */
/* x = 0, x_phase = phase_0 */
i4_samp_8x16b_r1_0 = _mm_loadu_si128((__m128i *) pi2_tmp); // a0 a1 a2 a3 a4 a5 a6 a7
i4_samp_8x16b_r2_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 6)); // b0 b1 b2 b3 b4 b5 b6 b7
i4_samp_8x16b_r3_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 12)); // b0 b1 b2 b3 b4 b5 b6 b7
i4_samp_8x16b_r4_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 18)); // b0 b1 b2 b3 b4 b5 b6 b7
i4_samp_8x16b_r5_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 24)); // b0 b1 b2 b3 b4 b5 b6 b7
i4_samp_8x16b_r6_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 30)); // b0 b1 b2 b3 b4 b5 b6 b7
i4_samp_8x16b_r7_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 36)); // b0 b1 b2 b3 b4 b5 b6 b7
i4_samp_8x16b_r8_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 42)); // b0 b1 b2 b3 b4 b5 b6 b7
i4_samp_8x16b_r1_1 = _mm_srli_si128(i4_samp_8x16b_r1_0, 2); // a1 a2 a3 a4 a5 a6 a7 0
i4_samp_8x16b_r2_1 = _mm_srli_si128(i4_samp_8x16b_r2_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
i4_samp_8x16b_r3_1 = _mm_srli_si128(i4_samp_8x16b_r3_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
i4_samp_8x16b_r4_1 = _mm_srli_si128(i4_samp_8x16b_r4_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
i4_samp_8x16b_r5_1 = _mm_srli_si128(i4_samp_8x16b_r5_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
i4_samp_8x16b_r6_1 = _mm_srli_si128(i4_samp_8x16b_r6_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
i4_samp_8x16b_r7_1 = _mm_srli_si128(i4_samp_8x16b_r7_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
i4_samp_8x16b_r8_1 = _mm_srli_si128(i4_samp_8x16b_r8_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
i4_samp_8x16b_r1_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r1_0,
i4_samp_8x16b_r1_1); // a0 a1 a1 a2 a2 a3 a3 a4
i4_samp_8x16b_r2_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r2_0,
i4_samp_8x16b_r2_1); // b0 b1 b1 b2 b2 b3 b3 b4
i4_samp_8x16b_r3_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1);
i4_samp_8x16b_r4_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1);
i4_samp_8x16b_r5_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1);
i4_samp_8x16b_r6_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1);
i4_samp_8x16b_r7_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1);
i4_samp_8x16b_r8_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1);
// a0c0+a1c1 a1c0+a2c1 a2c0+a3c1 a3c0+a4c1
i4_res_4x32b_r1_0 = _mm_madd_epi16(i4_samp_8x16b_r1_0, coeff_c0_c1_8x16b);
// b0c0+b1c1 b1c0+b2c1 b2c0+b3c1 b3c0+b4c1
i4_res_4x32b_r2_0 = _mm_madd_epi16(i4_samp_8x16b_r2_0, coeff_c0_c1_8x16b);
i4_res_4x32b_r3_0 = _mm_madd_epi16(i4_samp_8x16b_r3_0, coeff_c0_c1_8x16b);
i4_res_4x32b_r4_0 = _mm_madd_epi16(i4_samp_8x16b_r4_0, coeff_c0_c1_8x16b);
i4_res_4x32b_r5_0 = _mm_madd_epi16(i4_samp_8x16b_r5_0, coeff_c0_c1_8x16b);
i4_res_4x32b_r6_0 = _mm_madd_epi16(i4_samp_8x16b_r6_0, coeff_c0_c1_8x16b);
i4_res_4x32b_r7_0 = _mm_madd_epi16(i4_samp_8x16b_r7_0, coeff_c0_c1_8x16b);
i4_res_4x32b_r8_0 = _mm_madd_epi16(i4_samp_8x16b_r8_0, coeff_c0_c1_8x16b);
// a1c2+a2c3 a2c2+a3c3 a3c2+a4c3 a4c2+a5c3
i4_res_4x32b_r1_1 = _mm_madd_epi16(i4_samp_8x16b_r1_0, coeff_c2_c3_8x16b);
// b1c2+b2c3 b2c2+b3c3 b3c2+b4c3 b4c2+b5c3
i4_res_4x32b_r2_1 = _mm_madd_epi16(i4_samp_8x16b_r2_0, coeff_c2_c3_8x16b);
i4_res_4x32b_r3_1 = _mm_madd_epi16(i4_samp_8x16b_r3_0, coeff_c2_c3_8x16b);
i4_res_4x32b_r4_1 = _mm_madd_epi16(i4_samp_8x16b_r4_0, coeff_c2_c3_8x16b);
i4_res_4x32b_r5_1 = _mm_madd_epi16(i4_samp_8x16b_r5_0, coeff_c2_c3_8x16b);
i4_res_4x32b_r6_1 = _mm_madd_epi16(i4_samp_8x16b_r6_0, coeff_c2_c3_8x16b);
i4_res_4x32b_r7_1 = _mm_madd_epi16(i4_samp_8x16b_r7_0, coeff_c2_c3_8x16b);
i4_res_4x32b_r8_1 = _mm_madd_epi16(i4_samp_8x16b_r8_0, coeff_c2_c3_8x16b);
i4_res_4x32b_r1_0 = _mm_add_epi32(i4_res_4x32b_r1_0, res_32);
i4_res_4x32b_r2_0 = _mm_add_epi32(i4_res_4x32b_r2_0, res_32);
i4_res_4x32b_r3_0 = _mm_add_epi32(i4_res_4x32b_r3_0, res_32);
i4_res_4x32b_r4_0 = _mm_add_epi32(i4_res_4x32b_r4_0, res_32);
i4_res_4x32b_r5_0 = _mm_add_epi32(i4_res_4x32b_r5_0, res_32);
i4_res_4x32b_r6_0 = _mm_add_epi32(i4_res_4x32b_r6_0, res_32);
i4_res_4x32b_r7_0 = _mm_add_epi32(i4_res_4x32b_r7_0, res_32);
i4_res_4x32b_r8_0 = _mm_add_epi32(i4_res_4x32b_r8_0, res_32);
i4_res_4x32b_r1_1 = _mm_add_epi32(i4_res_4x32b_r1_1, res_32);
i4_res_4x32b_r2_1 = _mm_add_epi32(i4_res_4x32b_r2_1, res_32);
i4_res_4x32b_r3_1 = _mm_add_epi32(i4_res_4x32b_r3_1, res_32);
i4_res_4x32b_r4_1 = _mm_add_epi32(i4_res_4x32b_r4_1, res_32);
i4_res_4x32b_r5_1 = _mm_add_epi32(i4_res_4x32b_r5_1, res_32);
i4_res_4x32b_r6_1 = _mm_add_epi32(i4_res_4x32b_r6_1, res_32);
i4_res_4x32b_r7_1 = _mm_add_epi32(i4_res_4x32b_r7_1, res_32);
i4_res_4x32b_r8_1 = _mm_add_epi32(i4_res_4x32b_r8_1, res_32);
i4_res_4x32b_r1_0 = _mm_srai_epi32(i4_res_4x32b_r1_0, 6);
i4_res_4x32b_r2_0 = _mm_srai_epi32(i4_res_4x32b_r2_0, 6);
i4_res_4x32b_r3_0 = _mm_srai_epi32(i4_res_4x32b_r3_0, 6);
i4_res_4x32b_r4_0 = _mm_srai_epi32(i4_res_4x32b_r4_0, 6);
i4_res_4x32b_r5_0 = _mm_srai_epi32(i4_res_4x32b_r5_0, 6);
i4_res_4x32b_r6_0 = _mm_srai_epi32(i4_res_4x32b_r6_0, 6);
i4_res_4x32b_r7_0 = _mm_srai_epi32(i4_res_4x32b_r7_0, 6);
i4_res_4x32b_r8_0 = _mm_srai_epi32(i4_res_4x32b_r8_0, 6);
i4_res_4x32b_r1_1 = _mm_srai_epi32(i4_res_4x32b_r1_1, 6);
i4_res_4x32b_r2_1 = _mm_srai_epi32(i4_res_4x32b_r2_1, 6);
i4_res_4x32b_r3_1 = _mm_srai_epi32(i4_res_4x32b_r3_1, 6);
i4_res_4x32b_r4_1 = _mm_srai_epi32(i4_res_4x32b_r4_1, 6);
i4_res_4x32b_r5_1 = _mm_srai_epi32(i4_res_4x32b_r5_1, 6);
i4_res_4x32b_r6_1 = _mm_srai_epi32(i4_res_4x32b_r6_1, 6);
i4_res_4x32b_r7_1 = _mm_srai_epi32(i4_res_4x32b_r7_1, 6);
i4_res_4x32b_r8_1 = _mm_srai_epi32(i4_res_4x32b_r8_1, 6);
i4_res_final_8x16b_r12_0 = _mm_packs_epi32(i4_res_4x32b_r1_0, i4_res_4x32b_r2_0);
i4_res_final_8x16b_r34_0 = _mm_packs_epi32(i4_res_4x32b_r3_0, i4_res_4x32b_r4_0);
i4_res_final_8x16b_r56_0 = _mm_packs_epi32(i4_res_4x32b_r5_0, i4_res_4x32b_r6_0);
i4_res_final_8x16b_r67_0 = _mm_packs_epi32(i4_res_4x32b_r7_0, i4_res_4x32b_r8_0);
i4_res_final_8x16b_r12_1 = _mm_packs_epi32(i4_res_4x32b_r1_1, i4_res_4x32b_r2_1);
i4_res_final_8x16b_r34_1 = _mm_packs_epi32(i4_res_4x32b_r3_1, i4_res_4x32b_r4_1);
i4_res_final_8x16b_r56_1 = _mm_packs_epi32(i4_res_4x32b_r5_1, i4_res_4x32b_r6_1);
i4_res_final_8x16b_r67_1 = _mm_packs_epi32(i4_res_4x32b_r7_1, i4_res_4x32b_r8_1);
i4_res_final_8x16b_r1 = _mm_unpacklo_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
i4_res_final_8x16b_r2 = _mm_unpackhi_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
i4_res_final_8x16b_r3 = _mm_unpacklo_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
i4_res_final_8x16b_r4 = _mm_unpackhi_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
i4_res_final_8x16b_r5 = _mm_unpacklo_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
i4_res_final_8x16b_r6 = _mm_unpackhi_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
i4_res_final_8x16b_r7 = _mm_unpacklo_epi16(i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1);
i4_res_final_8x16b_r8 = _mm_unpackhi_epi16(i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1);
chroma_mask = _mm_set1_epi16(0xFF00);
chroma_mask2 = _mm_set1_epi16(0x00FF);
out_16x8b_r1 = _mm_loadu_si128((__m128i *) (&pu1_out[0]));
out_16x8b_r2 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride]));
out_16x8b_r3 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2]));
out_16x8b_r4 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2 + i4_dst_stride]));
out_16x8b_r5 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4]));
out_16x8b_r6 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride]));
out_16x8b_r7 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2]));
out_16x8b_r8 =
_mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2 + i4_dst_stride]));
out_16x8b_r1 = _mm_and_si128(out_16x8b_r1, chroma_mask);
out_16x8b_r2 = _mm_and_si128(out_16x8b_r2, chroma_mask);
out_16x8b_r3 = _mm_and_si128(out_16x8b_r3, chroma_mask);
out_16x8b_r4 = _mm_and_si128(out_16x8b_r4, chroma_mask);
out_16x8b_r5 = _mm_and_si128(out_16x8b_r5, chroma_mask);
out_16x8b_r6 = _mm_and_si128(out_16x8b_r6, chroma_mask);
out_16x8b_r7 = _mm_and_si128(out_16x8b_r7, chroma_mask);
out_16x8b_r8 = _mm_and_si128(out_16x8b_r8, chroma_mask);
i4_res_final_8x16b_r1 = _mm_and_si128(i4_res_final_8x16b_r1, chroma_mask2);
i4_res_final_8x16b_r2 = _mm_and_si128(i4_res_final_8x16b_r2, chroma_mask2);
i4_res_final_8x16b_r3 = _mm_and_si128(i4_res_final_8x16b_r3, chroma_mask2);
i4_res_final_8x16b_r4 = _mm_and_si128(i4_res_final_8x16b_r4, chroma_mask2);
i4_res_final_8x16b_r5 = _mm_and_si128(i4_res_final_8x16b_r5, chroma_mask2);
i4_res_final_8x16b_r6 = _mm_and_si128(i4_res_final_8x16b_r6, chroma_mask2);
i4_res_final_8x16b_r7 = _mm_and_si128(i4_res_final_8x16b_r7, chroma_mask2);
i4_res_final_8x16b_r8 = _mm_and_si128(i4_res_final_8x16b_r8, chroma_mask2);
out_16x8b_r1 = _mm_add_epi8(i4_res_final_8x16b_r1, out_16x8b_r1);
out_16x8b_r2 = _mm_add_epi8(i4_res_final_8x16b_r2, out_16x8b_r2);
out_16x8b_r3 = _mm_add_epi8(i4_res_final_8x16b_r3, out_16x8b_r3);
out_16x8b_r4 = _mm_add_epi8(i4_res_final_8x16b_r4, out_16x8b_r4);
out_16x8b_r5 = _mm_add_epi8(i4_res_final_8x16b_r5, out_16x8b_r5);
out_16x8b_r6 = _mm_add_epi8(i4_res_final_8x16b_r6, out_16x8b_r6);
out_16x8b_r7 = _mm_add_epi8(i4_res_final_8x16b_r7, out_16x8b_r7);
out_16x8b_r8 = _mm_add_epi8(i4_res_final_8x16b_r8, out_16x8b_r8);
_mm_storeu_si128((__m128i *) pu1_out, out_16x8b_r1);
_mm_storeu_si128((__m128i *) (pu1_out + i4_dst_stride), out_16x8b_r2);
_mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride << 1)), out_16x8b_r3);
_mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 3)), out_16x8b_r4);
_mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride << 2)), out_16x8b_r5);
_mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 5)), out_16x8b_r6);
_mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 6)), out_16x8b_r7);
_mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 7)), out_16x8b_r8);
/* End of loop over x */
}