1926 lines
105 KiB
C
1926 lines
105 KiB
C
|
|
/******************************************************************************
|
||
|
|
*
|
||
|
|
* Copyright (C) 2022 The Android Open Source Project
|
||
|
|
*
|
||
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
|
* you may not use this file except in compliance with the License.
|
||
|
|
* You may obtain a copy of the License at:
|
||
|
|
*
|
||
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||
|
|
*
|
||
|
|
* Unless required by applicable law or agreed to in writing, software
|
||
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
|
* See the License for the specific language governing permissions and
|
||
|
|
* limitations under the License.
|
||
|
|
*
|
||
|
|
*****************************************************************************
|
||
|
|
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
||
|
|
*/
|
||
|
|
/**
|
||
|
|
*******************************************************************************
|
||
|
|
* @file
|
||
|
|
* isvcd_intra_resamp_sse42.c
|
||
|
|
*
|
||
|
|
* @brief
|
||
|
|
* Contains function definitions for intra resampling functions
|
||
|
|
*
|
||
|
|
* @author
|
||
|
|
* Kishore
|
||
|
|
*
|
||
|
|
* @par List of Functions:
|
||
|
|
* - isvcd_interpolate_base_luma_dyadic_sse42
|
||
|
|
* - isvcd_vert_interpol_chroma_dyadic_1_sse42
|
||
|
|
* - isvcd_vert_interpol_chroma_dyadic_2_sse42
|
||
|
|
* - isvcd_vert_interpol_chroma_dyadic_3_sse42
|
||
|
|
* - isvcd_horz_interpol_chroma_dyadic_1_sse42
|
||
|
|
* - isvcd_horz_interpol_chroma_dyadic_2_sse42
|
||
|
|
*
|
||
|
|
* @remarks
|
||
|
|
* None
|
||
|
|
*
|
||
|
|
*******************************************************************************
|
||
|
|
*/
|
||
|
|
#include <immintrin.h>
|
||
|
|
#include <smmintrin.h>
|
||
|
|
#include <emmintrin.h>
|
||
|
|
/* User include files */
|
||
|
|
#include "ih264_typedefs.h"
|
||
|
|
#include "isvcd_structs.h"
|
||
|
|
|
||
|
|
/*****************************************************************************/
|
||
|
|
/* */
|
||
|
|
/* Function Name : isvcd_interpolate_base_luma_dyadic_sse42 */
|
||
|
|
/* */
|
||
|
|
/* Description : This function takes the reference array buffer & performs*/
|
||
|
|
/* intra resampling for dyadic scaling ratios */
|
||
|
|
/* Inputs : pu1_inp_buf : ptr to the 12x12 reference sample buffer */
|
||
|
|
/* pi2_tmp_filt_buf : ptr to the 12x16 buffer to hold the */
|
||
|
|
/* vertically interpolated data */
|
||
|
|
/* pu1_out_buf : output buffer pointer */
|
||
|
|
/* i4_out_stride : output buffer stride */
|
||
|
|
/* Globals : none */
|
||
|
|
/* Processing : it does the interpolation in vertical direction followed */
|
||
|
|
/* by horizontal direction */
|
||
|
|
/* Outputs : resampled pixels */
|
||
|
|
/* Returns : none */
|
||
|
|
/* */
|
||
|
|
/* Issues : none */
|
||
|
|
/* */
|
||
|
|
/* Revision History: */
|
||
|
|
/* */
|
||
|
|
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
|
||
|
|
/* 05 21 2021 Dolan creation */
|
||
|
|
/* */
|
||
|
|
/*****************************************************************************/
|
||
|
|
|
||
|
|
/*
 * Dyadic (2x) intra upsampling of one 16x16 luma MB from a 12x12 reference
 * block, SSE4.2 path.
 *
 * Pass 1 (vertical): filters the 12 reference rows down the columns into a
 * 12-wide x 16-tall WORD16 intermediate buffer (pi2_tmp_filt_buf).  The
 * 4-tap half-pel filters for the two dyadic phases are:
 *   phase  4: {-3, 28,  8, -1}
 *   phase 12: {-1,  8, 28, -3}   (mirror of phase 4)
 * Multiplications by 8 are done as left shifts by 3; the -1 taps are done
 * as saturating subtractions of the raw sample.
 *
 * Pass 2 (horizontal): applies the same filter pair across each of the 16
 * intermediate rows with _mm_madd_epi16, adds the rounding offset 512,
 * shifts right by 10 (vertical and horizontal filter gains combined), and
 * packs/saturates to 16 output bytes per row.
 *
 * Parameters:
 *   pu1_inp_buf      : 12x12 reference sample buffer (stride DYADIC_REF_W_Y)
 *   pi2_tmp_filt_buf : 12x16 WORD16 scratch for vertically filtered samples
 *   pu1_out_buf      : 16x16 output buffer
 *   i4_out_stride    : output buffer stride
 *
 * Returns: none.
 */
void isvcd_interpolate_base_luma_dyadic_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                              UWORD8 *pu1_out_buf, WORD32 i4_out_stride)
{
    WORD32 i4_x, i4_y;
    WORD32 i4_filt_stride, i4_src_stride;
    UWORD8 *pu1_inp, *pu1_out;
    WORD16 *pi2_tmp;

    __m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3;
    __m128i i4_samp_8x16b_0, i4_samp_8x16b_1, i4_samp_8x16b_2, i4_samp_8x16b_3;
    __m128i i4_res_8x16b_r1_1, i4_res_8x16b_r1_2, i4_res_8x16b_r1_3;
    __m128i i4_res_8x16b_r2_1, i4_res_8x16b_r2_2, i4_res_8x16b_r2_3;

    /* Filter coefficient values for phase 4: the +/-3 and 28 taps are done
       with multiplies; the 8 tap is a shift and the -1 tap a subtraction */
    __m128i i4_coeff_8x16b_0 = _mm_set1_epi16(-3);
    __m128i i4_coeff_8x16b_1 = _mm_set1_epi16(28);
    i4_filt_stride = 12;
    i4_src_stride = DYADIC_REF_W_Y;

    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    pu1_out = pu1_out_buf;

    /* Vertical interpolation */
    /* First 64 bit: columns 0..7 of the 12-wide reference, fully unrolled
       over the 16 output rows */
    for(i4_x = 0; i4_x < 1; i4_x++)
    {
        /* y = 0, y_phase = 12 */
        i4_samp_16x8b_0 = _mm_loadl_epi64((__m128i *) (pu1_inp));
        i4_samp_16x8b_1 = _mm_loadl_epi64((__m128i *) (pu1_inp + i4_src_stride));
        i4_samp_16x8b_2 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1)));
        i4_samp_16x8b_3 =
            _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
        pu1_inp += (i4_src_stride << 2);
        /* Widen the four 8-sample rows to 16 bit for the filter arithmetic */
        i4_samp_8x16b_0 = _mm_cvtepu8_epi16(i4_samp_16x8b_0);
        i4_samp_8x16b_1 = _mm_cvtepu8_epi16(i4_samp_16x8b_1);
        i4_samp_8x16b_2 = _mm_cvtepu8_epi16(i4_samp_16x8b_2);
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(i4_samp_16x8b_3);

        /* since y_phase is 12 for y = 0: taps are {-1, 8, 28, -3} */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_0);
        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        pi2_tmp += i4_filt_stride;

        /* Rows y = 1 and 2: slide the 4-row sample window down by one and
           load the next reference row */
        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */

        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        /* Rows y = 3 and 4 (identical unrolled step) */
        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */

        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        /* Rows y = 5 and 6 */
        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */

        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        /* Rows y = 7 and 8 */
        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        /* Rows y = 9 and 10 */
        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));
        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        /* Rows y = 11 and 12 */
        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));
        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */

        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        /* Rows y = 13 and 14 */
        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));
        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */

        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        /* y = 15, y_phase = 4: taps are {-3, 28, 8, -1} */
        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

        /* Store the output */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);

        /* Reinitializing the ptrs */
        pu1_inp = pu1_inp_buf;
        pi2_tmp = pi2_tmp_filt_buf;
    } /* End of loop over x */

    /* Remaining 32 bit: columns 8..11, same vertical filter but only four
       WORD16 results stored per row (the wide stores above already covered
       columns 0..7; these narrow stores fill the rest of each 12-wide row) */
    pu1_inp += 8;
    pi2_tmp += 8;
    for(i4_x = 0; i4_x < 1; i4_x++)
    {
        /* y = 0, y_phase = 12 */
        i4_samp_16x8b_0 = _mm_loadl_epi64((__m128i *) (pu1_inp));
        i4_samp_16x8b_1 = _mm_loadl_epi64((__m128i *) (pu1_inp + i4_src_stride));
        i4_samp_16x8b_2 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1)));
        i4_samp_16x8b_3 =
            _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
        pu1_inp += (i4_src_stride << 2);
        i4_samp_8x16b_0 = _mm_cvtepu8_epi16(i4_samp_16x8b_0);
        i4_samp_8x16b_1 = _mm_cvtepu8_epi16(i4_samp_16x8b_1);
        i4_samp_8x16b_2 = _mm_cvtepu8_epi16(i4_samp_16x8b_2);
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(i4_samp_16x8b_3);

        /* since y_phase is 12 for y = 0 */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_0);
        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

        _mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        pi2_tmp += i4_filt_stride;

        /* Rows 1..14, two output rows per iteration (rolled form of the
           unrolled pass above) */
        for(i4_y = 1; i4_y < 15; i4_y += 2)
        {
            i4_samp_8x16b_0 = i4_samp_8x16b_1;
            i4_samp_8x16b_1 = i4_samp_8x16b_2;
            i4_samp_8x16b_2 = i4_samp_8x16b_3;
            i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));
            /* y_phase is 4 for odd values of y */
            /* and 12 for even values of y */
            /* Multiply by 8 => left shift by 3 */

            i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
            i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
            i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

            i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
            i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
            i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

            i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
            i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

            i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
            i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

            i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
            i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

            /* Storing the results */
            _mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
            _mm_storel_epi64((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
            pi2_tmp += (i4_filt_stride << 1);
            pu1_inp += i4_src_stride;
        } /* End of loop over y */

        /* y = 15, y_phase = 4 */
        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

        /* Store the output */
        _mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);

        /* Reinitializing the ptrs */
        pu1_inp = pu1_inp_buf;
        pi2_tmp = pi2_tmp_filt_buf;
    }

    {
        /* Horizontal pass coefficient pairs, interleaved for _mm_madd_epi16:
           c0 = -3, c1 = 28, c2 = 8, c3 = -1 (phase 4 / phase 12 mirror) */
        __m128i coeff_c0_c1_8x16b = _mm_set_epi16(28, -3, 28, -3, 28, -3, 28, -3);
        __m128i coeff_c2_c3_8x16b = _mm_set_epi16(-1, 8, -1, 8, -1, 8, -1, 8);
        __m128i coeff_c3_c2_8x16b = _mm_set_epi16(8, -1, 8, -1, 8, -1, 8, -1);
        __m128i coeff_c1_c0_8x16b = _mm_set_epi16(-3, 28, -3, 28, -3, 28, -3, 28);

        __m128i i4_samp_8x16b_rpart1_0, i4_samp_8x16b_rpart2_0;
        __m128i i4_samp_8x16b_rpart1_1, i4_samp_8x16b_rpart2_1;
        __m128i i4_samp_8x16b_rpart1_2, i4_samp_8x16b_rpart2_2;
        __m128i i4_samp_8x16b_rpart1_3, i4_samp_8x16b_rpart2_3;
        __m128i i4_samp_8x16b_rpart1_4, i4_samp_8x16b_rpart2_4;

        __m128i i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart2_0;
        __m128i i4_res_4x32b_rpart1_1, i4_res_4x32b_rpart2_1;
        __m128i i4_res_4x32b_rpart1_2, i4_res_4x32b_rpart2_2;
        __m128i i4_res_4x32b_rpart1_3, i4_res_4x32b_rpart2_3;

        /* Rounding offset for the combined (2 * 10-bit gain) >> 10 stage */
        __m128i res_512 = _mm_set1_epi32(512);
        /* Horizontal interpolation: each intermediate row of 12 WORD16
           samples produces 16 output pixels */
        for(i4_y = 0; i4_y < 16; i4_y++)
        {
            // a0 a1 a2 a3 a4 a5 a6 a7
            i4_samp_8x16b_rpart1_0 = _mm_loadu_si128((__m128i *) pi2_tmp);
            // a4 a5 a6 a7 a8 a9 a10 a11
            i4_samp_8x16b_rpart2_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 4));
            // a1 a2 a3 a4 a5 a6 a7 0
            i4_samp_8x16b_rpart1_1 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 2);
            // a2 a3 a4 a5 a6 a7 0 0
            i4_samp_8x16b_rpart1_2 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 4);
            // a3 a4 a5 a6 a7 0 0 0
            i4_samp_8x16b_rpart1_3 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 6);
            // a4 a5 a6 a7 0 0 0 0
            i4_samp_8x16b_rpart1_4 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 8);

            // a5 a6 a7 a8 a9 a10 a11 0
            i4_samp_8x16b_rpart2_1 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 2);
            // a6 a7 a8 a9 a10 a11 0 0
            i4_samp_8x16b_rpart2_2 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 4);
            // a7 a8 a9 a10 a11 0 0 0
            i4_samp_8x16b_rpart2_3 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 6);
            // a8 a9 a10 a11 0 0 0 0
            i4_samp_8x16b_rpart2_4 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 8);
            // a0 a1 a1 a2 a2 a3 a3 a4
            i4_samp_8x16b_rpart1_0 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_0, i4_samp_8x16b_rpart1_1);
            // a1 a2 a2 a3 a3 a4 a4 a5
            i4_samp_8x16b_rpart1_1 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_1, i4_samp_8x16b_rpart1_2);
            // a2 a3 a3 a4 a4 a5 a5 a6
            i4_samp_8x16b_rpart1_2 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_2, i4_samp_8x16b_rpart1_3);
            // a3 a4 a4 a5 a5 a6 a6 a7
            i4_samp_8x16b_rpart1_3 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_3, i4_samp_8x16b_rpart1_4);
            // a4 a5 a5 a6 a6 a7 a7 a8
            i4_samp_8x16b_rpart2_0 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_0, i4_samp_8x16b_rpart2_1);
            // a5 a6 a6 a7 a7 a8 a8 a9
            i4_samp_8x16b_rpart2_1 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_1, i4_samp_8x16b_rpart2_2);
            // a6 a7 a7 a8 a8 a9 a9 a10
            i4_samp_8x16b_rpart2_2 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_2, i4_samp_8x16b_rpart2_3);
            // a7 a8 a8 a9 a9 a10 a10 a11
            i4_samp_8x16b_rpart2_3 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_3, i4_samp_8x16b_rpart2_4);
            // a0c3+a1c2 a1c3+a2c2 a2c3+a3c2 a3c3+a4c2
            i4_res_4x32b_rpart1_0 = _mm_madd_epi16(i4_samp_8x16b_rpart1_0, coeff_c3_c2_8x16b);
            // a2c1+a3c0 a3c1+a4c0 a4c1+a5c0 a5c1+a6c0
            i4_res_4x32b_rpart1_2 = _mm_madd_epi16(i4_samp_8x16b_rpart1_2, coeff_c1_c0_8x16b);
            // a1c0+a2c1 a2c0+a3c1 a3c0+a4c1 a4c0+a5c1
            i4_res_4x32b_rpart1_1 = _mm_madd_epi16(i4_samp_8x16b_rpart1_1, coeff_c0_c1_8x16b);
            // a3c2+a4c3 a4c2+a5c3 a5c2+a6c3 a6c2+a7c3
            i4_res_4x32b_rpart1_3 = _mm_madd_epi16(i4_samp_8x16b_rpart1_3, coeff_c2_c3_8x16b);
            // a4c3+a5c2 a5c3+a6c2 a6c3+a7c2 a7c3+a8c2
            i4_res_4x32b_rpart2_0 = _mm_madd_epi16(i4_samp_8x16b_rpart2_0, coeff_c3_c2_8x16b);
            // a6c1+a7c0 a7c1+a8c0 a8c1+a9c0 a9c1+a10c0
            i4_res_4x32b_rpart2_2 = _mm_madd_epi16(i4_samp_8x16b_rpart2_2, coeff_c1_c0_8x16b);
            // a5c0+a6c1 a6c0+a7c1 a7c0+a8c1 a8c0+a9c1
            i4_res_4x32b_rpart2_1 = _mm_madd_epi16(i4_samp_8x16b_rpart2_1, coeff_c0_c1_8x16b);
            // a7c2+a8c3 a8c2+a9c3 a9c2+a10c3 a10c2+a11c3
            i4_res_4x32b_rpart2_3 = _mm_madd_epi16(i4_samp_8x16b_rpart2_3, coeff_c2_c3_8x16b);
            // a0c3+a1c2 + a2c1+a3c0   a1c3+a2c2 + a3c1+a4c0   a2c3+a3c2 + a4c1+a5c0
            // a3c3+a4c2 + a5c1+a6c0  -> full 4-tap results, even phases
            i4_res_4x32b_rpart1_0 = _mm_add_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_2);
            // a1c0+a2c1 + a3c2+a4c3   a2c0+a3c1 + a4c2+a5c3   a3c0+a4c1 + a5c2+a6c3
            // a4c0+a5c1 + a6c2+a7c3  -> full 4-tap results, odd phases
            i4_res_4x32b_rpart1_1 = _mm_add_epi32(i4_res_4x32b_rpart1_1, i4_res_4x32b_rpart1_3);
            // a4c3+a5c2 + a6c1+a7c0   a5c3+a6c2 + a7c1+a8c0   a6c3+a7c2 + a8c1+a9c0
            // a7c3+a8c2 + a9c1+a10c0
            i4_res_4x32b_rpart2_0 = _mm_add_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_2);
            // a5c0+a6c1 + a7c2+a8c3   a6c0+a7c1 + a8c2+a9c3   a7c0+a8c1 + a9c2+a10c3
            // a8c0+a9c1 + a10c2+a11c3
            i4_res_4x32b_rpart2_1 = _mm_add_epi32(i4_res_4x32b_rpart2_1, i4_res_4x32b_rpart2_3);

            /* Interleave even- and odd-phase results back into pixel order */
            i4_res_4x32b_rpart1_2 =
                _mm_unpacklo_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1);
            i4_res_4x32b_rpart1_3 =
                _mm_unpackhi_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1);

            i4_res_4x32b_rpart2_2 =
                _mm_unpacklo_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1);
            i4_res_4x32b_rpart2_3 =
                _mm_unpackhi_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1);

            /* Round and normalize: (res + 512) >> 10 */
            i4_res_4x32b_rpart1_0 = _mm_add_epi32(i4_res_4x32b_rpart1_2, res_512);
            i4_res_4x32b_rpart1_1 = _mm_add_epi32(i4_res_4x32b_rpart1_3, res_512);

            i4_res_4x32b_rpart1_0 = _mm_srai_epi32(i4_res_4x32b_rpart1_0, 10);
            i4_res_4x32b_rpart1_1 = _mm_srai_epi32(i4_res_4x32b_rpart1_1, 10);

            i4_res_4x32b_rpart2_0 = _mm_add_epi32(i4_res_4x32b_rpart2_2, res_512);
            i4_res_4x32b_rpart2_1 = _mm_add_epi32(i4_res_4x32b_rpart2_3, res_512);

            i4_res_4x32b_rpart2_0 = _mm_srai_epi32(i4_res_4x32b_rpart2_0, 10);
            i4_res_4x32b_rpart2_1 = _mm_srai_epi32(i4_res_4x32b_rpart2_1, 10);

            /* Pack 16 x 32-bit results to 16 unsigned bytes with saturation
               and store one full output row */
            _mm_storeu_si128(
                (__m128i *) pu1_out,
                _mm_packus_epi16(_mm_packus_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1),
                                 _mm_packus_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1)));

            pi2_tmp += i4_filt_stride;
            pu1_out += i4_out_stride;
        } /* End of loop over y */
    }
} /* isvcd_interpolate_base_luma_dyadic_sse42 */
|
||
|
|
|
||
|
|
/*****************************************************************************/
|
||
|
|
/* */
|
||
|
|
/* Function Name : isvcd_interpolate_intra_base_sse42 */
|
||
|
|
/* */
|
||
|
|
/* Description : This function takes the reference array buffer & performs*/
|
||
|
|
/* interpolation of a component to find the intra */
|
||
|
|
/* resampled value */
|
||
|
|
/* Inputs : pv_intra_samp_ctxt : intra sampling context */
|
||
|
|
/* pu1_out : output buffer pointer */
|
||
|
|
/* i4_out_stride : output buffer stride */
|
||
|
|
/* i4_refarray_wd : reference array width */
|
||
|
|
/* i4_x_offset : offset in reference layer in horz direction*/
|
||
|
|
/* ps_coord : current mb co-ordinate */
|
||
|
|
/* i4_chroma_flag : chroma processing flag */
|
||
|
|
/* Globals : none */
|
||
|
|
/* Processing : it does the interpolation in vertical direction followed */
|
||
|
|
/* by horizontal direction */
|
||
|
|
/* Outputs : resampled pixels */
|
||
|
|
/* Returns : none */
|
||
|
|
/* */
|
||
|
|
/* Issues : none */
|
||
|
|
/* */
|
||
|
|
/* Revision History: */
|
||
|
|
/* */
|
||
|
|
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
|
||
|
|
/* 06 09 2021 Kishore creation */
|
||
|
|
/* */
|
||
|
|
/*****************************************************************************/
|
||
|
|
void isvcd_interpolate_intra_base_sse42(void *pv_intra_samp_ctxt, UWORD8 *pu1_out,
                                        WORD32 i4_out_stride, WORD32 i4_refarray_wd, WORD32 i4_mb_x,
                                        WORD32 i4_mb_y, WORD32 i4_chroma_flag,
                                        WORD32 i4_refarray_flag)
{
    /* --------------------------------------------------------------------- */
    /* Index Parameters                                                      */
    /* --------------------------------------------------------------------- */
    intra_sampling_ctxt_t *ps_ctxt;
    intra_samp_map_ctxt_t *ps_map_ctxt;
    intra_samp_lyr_ctxt *ps_lyr_ctxt;
    WORD32 i4_x, i4_y;
    WORD32 i4_frm_mb_x, i4_frm_mb_y;
    UWORD8 *pu1_refarray = NULL;
    ref_pixel_map_t *ps_x_pos_phase;
    ref_pixel_map_t *ps_y_pos_phase;
    WORD32 i4_temp_array_ht;
    WORD32 *pi4_interp_buff;
    WORD32 i4_mb_wd;
    WORD32 i4_mb_ht;

    WORD32 i4_x_min;
    ref_min_max_map_t *ps_x_min_max;
    /* Projected reference positions and filter phases for each output row/  */
    /* column, narrowed to bytes so they can drive pshufb-based gathers and  */
    /* per-row filter selection below.                                       */
    WORD8 arr_y_ref_pos_luma[16] = {0};
    WORD8 arr_x_ref_pos_luma[16] = {0};
    WORD8 arr_x_ref_pos_luma_low[16] = {0};
    WORD8 arr_x_ref_pos_luma_high[16] = {0};
    WORD8 arr_phase_luma[32] = {0};
    WORD8 *pi4_y_ref_pos_luma;
    WORD8 *pi4_x_ref_pos_luma_low;
    WORD8 *pi4_x_ref_pos_luma_high;
    WORD8 *pi4_phase_luma;
    UWORD8 *pu1_refarray_temp;

    /* --------------------------------------------------------------------- */
    /* Extracting pointers from the context                                  */
    /* --------------------------------------------------------------------- */
    ps_ctxt = (intra_sampling_ctxt_t *) pv_intra_samp_ctxt;
    ps_lyr_ctxt = &ps_ctxt->as_res_lyrs[ps_ctxt->i4_res_lyr_id];

    /* Select the reference array: 0 -> luma/packed buffer, 1 -> Cb buffer */
    if(0 == i4_refarray_flag)
    {
        pu1_refarray = ps_ctxt->pu1_refarray_buffer;
    }
    else if(1 == i4_refarray_flag)
    {
        pu1_refarray = ps_ctxt->pu1_refarray_cb;
    }

    /* --------------------------------------------------------------------- */
    /* LUMA or CHROMA                                                        */
    /* --------------------------------------------------------------------- */

    if(1 == i4_chroma_flag)
        ps_map_ctxt = &(ps_lyr_ctxt->s_chroma_map_ctxt);
    else
        ps_map_ctxt = &(ps_lyr_ctxt->s_luma_map_ctxt);

    /* Chroma works on half the MB dimensions in each direction */
    i4_mb_wd = MB_WIDTH >> i4_chroma_flag;
    i4_mb_ht = MB_HEIGHT >> i4_chroma_flag;

    ps_x_min_max = ps_map_ctxt->ps_x_min_max;

    i4_frm_mb_y = i4_mb_y * i4_mb_ht;
    i4_frm_mb_x = i4_mb_x * i4_mb_wd;
    /* get the min position */
    i4_x_min = ps_x_min_max[i4_mb_x].i2_min_pos;

    /* --------------------------------------------------------------------- */
    /* Projected frame level pointers                                        */
    /* --------------------------------------------------------------------- */
    ps_x_pos_phase = ps_map_ctxt->ps_x_pos_phase;
    ps_y_pos_phase = ps_map_ctxt->ps_y_pos_phase;

    /* --------------------------------------------------------------------- */
    /* Pointers and Dimenstion of the temporary buffer                       */
    /* --------------------------------------------------------------------- */
    i4_temp_array_ht = i4_mb_ht;
    pi4_interp_buff = ps_ctxt->pi4_temp_interpolation_buffer;

    if(i4_chroma_flag == 0)
    {
        /* --------------------------------------------------------------------- */
        /* Loop for interpolation in vertical direction (luma, 4-tap filter)     */
        /* --------------------------------------------------------------------- */
        WORD16 *pi2_interp_buff_temp;
        pi2_interp_buff_temp = (WORD16 *) pi4_interp_buff;
        {
            __m128i out_res_8x16b_0, out_res_8x16b_1;

            __m128i inp_8x16b_r0, inp_8x16b_r01_0, phs_mask_16x8b_r0, phs_mask_16x8b_r01_0,
                out_res_8x16b_r01_0;
            __m128i inp_8x16b_r1, inp_8x16b_r23_0, phs_mask_16x8b_r1, phs_mask_16x8b_r23_0,
                out_res_8x16b_r01_1;
            __m128i inp_8x16b_r2, inp_8x16b_r01_1, phs_mask_16x8b_r2, phs_mask_16x8b_r01_1,
                out_res_8x16b_r23_0;
            __m128i inp_8x16b_r3, inp_8x16b_r23_1, phs_mask_16x8b_r3, phs_mask_16x8b_r23_1,
                out_res_8x16b_r23_1;

            /* Cache the per-row vertical phase and reference row index */
            for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
            {
                arr_phase_luma[i4_y] = (WORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
                arr_y_ref_pos_luma[i4_y] = (WORD8) (ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos);
            }
            pi4_y_ref_pos_luma = arr_y_ref_pos_luma;
            pi4_phase_luma = arr_phase_luma;

            for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
            {
                /* Four consecutive reference rows centred on the projected row */
                pu1_refarray_temp =
                    pu1_refarray + (pi4_y_ref_pos_luma[i4_y] * i4_refarray_wd) + (i4_x_min - 1);
                inp_8x16b_r0 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp - i4_refarray_wd));
                inp_8x16b_r1 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp));
                inp_8x16b_r2 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp + i4_refarray_wd));
                inp_8x16b_r3 =
                    _mm_loadu_si128((__m128i *) (pu1_refarray_temp + 2 * i4_refarray_wd));

                /* Interleave row pairs so maddubs can apply two taps at once */
                inp_8x16b_r01_0 = _mm_unpacklo_epi8(inp_8x16b_r0, inp_8x16b_r1);
                inp_8x16b_r23_0 = _mm_unpacklo_epi8(inp_8x16b_r2, inp_8x16b_r3);
                inp_8x16b_r01_1 = _mm_unpackhi_epi8(inp_8x16b_r0, inp_8x16b_r1);
                inp_8x16b_r23_1 = _mm_unpackhi_epi8(inp_8x16b_r2, inp_8x16b_r3);

                /* Broadcast the four luma filter taps for this row's phase    */
                /* (one tap per 16-entry row of g_ai1_interp_filter_luma)      */
                phs_mask_16x8b_r0 = _mm_set1_epi8(g_ai1_interp_filter_luma[pi4_phase_luma[i4_y]]);
                phs_mask_16x8b_r1 =
                    _mm_set1_epi8(g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 16]);
                phs_mask_16x8b_r2 =
                    _mm_set1_epi8(g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 32]);
                phs_mask_16x8b_r3 =
                    _mm_set1_epi8(g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 48]);

                phs_mask_16x8b_r01_0 = _mm_unpacklo_epi8(phs_mask_16x8b_r0, phs_mask_16x8b_r1);
                phs_mask_16x8b_r23_0 = _mm_unpacklo_epi8(phs_mask_16x8b_r2, phs_mask_16x8b_r3);
                phs_mask_16x8b_r01_1 = _mm_unpackhi_epi8(phs_mask_16x8b_r0, phs_mask_16x8b_r1);
                phs_mask_16x8b_r23_1 = _mm_unpackhi_epi8(phs_mask_16x8b_r2, phs_mask_16x8b_r3);

                /* 4-tap vertical filter = sum of two 2-tap maddubs results */
                out_res_8x16b_r01_0 = _mm_maddubs_epi16(inp_8x16b_r01_0, phs_mask_16x8b_r01_0);
                out_res_8x16b_r01_1 = _mm_maddubs_epi16(inp_8x16b_r01_1, phs_mask_16x8b_r01_1);
                out_res_8x16b_r23_0 = _mm_maddubs_epi16(inp_8x16b_r23_0, phs_mask_16x8b_r23_0);
                out_res_8x16b_r23_1 = _mm_maddubs_epi16(inp_8x16b_r23_1, phs_mask_16x8b_r23_1);

                out_res_8x16b_0 = _mm_add_epi16(out_res_8x16b_r01_0, out_res_8x16b_r23_0);
                out_res_8x16b_1 = _mm_add_epi16(out_res_8x16b_r01_1, out_res_8x16b_r23_1);

                /* 16 intermediate 16-bit samples per output row */
                _mm_storeu_si128(
                    (__m128i *) (pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1)),
                    out_res_8x16b_0);
                _mm_storeu_si128((__m128i *) (pi2_interp_buff_temp + (i4_y * i4_refarray_wd) +
                                              (i4_x_min - 1) + 8),
                                 out_res_8x16b_1);
            }
        }
        /* --------------------------------------------------------------------- */
        /* Loop for interpolation in horizontal direction (luma, 4-tap filter)   */
        /* --------------------------------------------------------------------- */
        {
            WORD32 strt_indx = 10, strt_indx_h = 0;

            __m128i inp_8x16b_0;
            __m128i inp_8x16b_1;

            __m128i phs_mask_16x8b_0;
            __m128i phs_mask_16x8b_1;
            __m128i x_ref_pos_luma_mask_r0_0, x_ref_pos_luma_mask_r0_1, x_ref_pos_luma_mask_r1_0,
                x_ref_pos_luma_mask_r1_1, x_ref_pos_luma_mask_r2_0, x_ref_pos_luma_mask_r2_1,
                x_ref_pos_luma_mask_r3_0, x_ref_pos_luma_mask_r3_1;

            __m128i inp_8x16b_2, inp_8x16b_3;

            WORD32 i4_x2 = 0;
            WORD32 i4_mb_wd_hlf = (i4_mb_wd >> 1);
            __m128i twos = _mm_set1_epi8(2);

            /* Base reference column for the left half; strt_indx_h is the     */
            /* extra offset at which the right half of the row is loaded.      */
            strt_indx = ps_x_pos_phase[0 + i4_frm_mb_x].i2_ref_pos - 1;
            strt_indx_h = (ps_x_pos_phase[8 + i4_frm_mb_x].i2_ref_pos - strt_indx - 1);
            for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
            {
                arr_x_ref_pos_luma[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
                arr_phase_luma[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
                arr_x_ref_pos_luma[i4_x] = arr_x_ref_pos_luma[i4_x] - strt_indx - 1;
            }

            /* Expand 16-bit element indices to byte-pair indices for pshufb  */
            for(i4_x = 0; i4_x < i4_mb_wd_hlf; i4_x++)
            {
                i4_x2 = i4_x << 1;
                arr_x_ref_pos_luma_low[i4_x2] = (arr_x_ref_pos_luma[i4_x]) << 1;
                arr_x_ref_pos_luma_low[i4_x2 + 1] = arr_x_ref_pos_luma_low[i4_x2] + 1;
            }
            for(i4_x = i4_mb_wd_hlf; i4_x < i4_mb_wd; i4_x++)
            {
                i4_x2 = (i4_x - i4_mb_wd_hlf) << 1;
                arr_x_ref_pos_luma_high[i4_x2] = ((arr_x_ref_pos_luma[i4_x] - strt_indx_h) << 1);
                arr_x_ref_pos_luma_high[i4_x2 + 1] = arr_x_ref_pos_luma_high[i4_x2] + 1;
            }
            pi4_x_ref_pos_luma_low = arr_x_ref_pos_luma_low;
            pi4_x_ref_pos_luma_high = arr_x_ref_pos_luma_high;
            pi4_phase_luma = arr_phase_luma;

            phs_mask_16x8b_0 = _mm_loadu_si128((__m128i *) (pi4_phase_luma));
            phs_mask_16x8b_1 = _mm_loadu_si128((__m128i *) (pi4_phase_luma + 8));

            /* Shuffle masks for taps 0..2 step by +2 bytes (one WORD16);     */
            /* tap 3 reuses the tap-0 mask on a row loaded 3 samples later.   */
            x_ref_pos_luma_mask_r0_0 = _mm_loadu_si128((__m128i *) (pi4_x_ref_pos_luma_low));
            x_ref_pos_luma_mask_r0_1 = _mm_loadu_si128((__m128i *) (pi4_x_ref_pos_luma_high));
            x_ref_pos_luma_mask_r1_0 = _mm_add_epi8(x_ref_pos_luma_mask_r0_0, twos);
            x_ref_pos_luma_mask_r1_1 = _mm_add_epi8(x_ref_pos_luma_mask_r0_1, twos);
            x_ref_pos_luma_mask_r2_0 = _mm_add_epi8(x_ref_pos_luma_mask_r1_0, twos);
            x_ref_pos_luma_mask_r2_1 = _mm_add_epi8(x_ref_pos_luma_mask_r1_1, twos);
            x_ref_pos_luma_mask_r3_0 = x_ref_pos_luma_mask_r0_0;
            x_ref_pos_luma_mask_r3_1 = x_ref_pos_luma_mask_r0_1;

            {
                __m128i ip_filt_16x8b_r0, ip_filt_8x16b_r0_0, ip_filt_8x16b_r0_1,
                    ip_filt_8x16b_r01_l_0, ip_filt_8x16b_r01_h_0;
                __m128i ip_filt_16x8b_r1, ip_filt_8x16b_r1_0, ip_filt_8x16b_r1_1,
                    ip_filt_8x16b_r23_l_0, ip_filt_8x16b_r23_h_0;
                __m128i ip_filt_16x8b_r2, ip_filt_8x16b_r2_0, ip_filt_8x16b_r2_1,
                    ip_filt_8x16b_r01_l_1, ip_filt_8x16b_r01_h_1;
                __m128i ip_filt_16x8b_r3, ip_filt_8x16b_r3_0, ip_filt_8x16b_r3_1,
                    ip_filt_8x16b_r23_l_1, ip_filt_8x16b_r23_h_1;

                __m128i inp_8x16b_r0_0, inp_8x16b_r2_0, inp_8x16b_r01_l_0, inp_8x16b_r01_h_0,
                    out_res_4x32b_r01_l_0, out_res_4x32b_r01_h_0;
                __m128i inp_8x16b_r0_1, inp_8x16b_r2_1, inp_8x16b_r23_l_0, inp_8x16b_r23_h_0,
                    out_res_4x32b_r01_l_1, out_res_4x32b_r01_h_1;
                __m128i inp_8x16b_r1_0, inp_8x16b_r3_0, inp_8x16b_r01_l_1, inp_8x16b_r01_h_1,
                    out_res_4x32b_r23_l_0, out_res_4x32b_r23_h_0;
                __m128i inp_8x16b_r1_1, inp_8x16b_r3_1, inp_8x16b_r23_l_1, inp_8x16b_r23_h_1,
                    out_res_4x32b_r23_l_1, out_res_4x32b_r23_h_1;

                __m128i out_res_4x32b_l_0;
                __m128i out_res_4x32b_l_1;
                __m128i out_res_4x32b_h_0;
                __m128i out_res_4x32b_h_1;

                __m128i out_res_8x16b_l;
                __m128i out_res_8x16b_h;

                __m128i out_res_16x8b;
                /* Rounding offset for the final (add 512) >> 10 normalization */
                __m128i const_512 = _mm_set1_epi32(512);

                /* Gather per-column taps from the phase-indexed filter table, */
                /* widened to 16 bits for madd                                 */
                ip_filt_16x8b_r0 = _mm_loadu_si128((__m128i *) (g_ai1_interp_filter_luma));
                ip_filt_16x8b_r1 = _mm_loadu_si128((__m128i *) (g_ai1_interp_filter_luma + 16));
                ip_filt_16x8b_r2 = _mm_loadu_si128((__m128i *) (g_ai1_interp_filter_luma + 32));
                ip_filt_16x8b_r3 = _mm_loadu_si128((__m128i *) (g_ai1_interp_filter_luma + 48));

                ip_filt_8x16b_r0_0 =
                    _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r0, phs_mask_16x8b_0));
                ip_filt_8x16b_r1_0 =
                    _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r1, phs_mask_16x8b_0));
                ip_filt_8x16b_r2_0 =
                    _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r2, phs_mask_16x8b_0));
                ip_filt_8x16b_r3_0 =
                    _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r3, phs_mask_16x8b_0));

                ip_filt_8x16b_r0_1 =
                    _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r0, phs_mask_16x8b_1));
                ip_filt_8x16b_r1_1 =
                    _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r1, phs_mask_16x8b_1));
                ip_filt_8x16b_r2_1 =
                    _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r2, phs_mask_16x8b_1));
                ip_filt_8x16b_r3_1 =
                    _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r3, phs_mask_16x8b_1));

                /* Pair taps (0,1) and (2,3) for madd_epi16 */
                ip_filt_8x16b_r01_l_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r0_0, ip_filt_8x16b_r1_0);
                ip_filt_8x16b_r23_l_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r2_0, ip_filt_8x16b_r3_0);
                ip_filt_8x16b_r01_l_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r0_0, ip_filt_8x16b_r1_0);
                ip_filt_8x16b_r23_l_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r2_0, ip_filt_8x16b_r3_0);

                ip_filt_8x16b_r01_h_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r0_1, ip_filt_8x16b_r1_1);
                ip_filt_8x16b_r23_h_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r2_1, ip_filt_8x16b_r3_1);
                ip_filt_8x16b_r01_h_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r0_1, ip_filt_8x16b_r1_1);
                ip_filt_8x16b_r23_h_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r2_1, ip_filt_8x16b_r3_1);

                for(i4_y = 0; i4_y < i4_temp_array_ht; i4_y++)
                {
                    /* Row data for left/right output halves, plus copies      */
                    /* shifted by 3 samples used for the 4th filter tap        */
                    inp_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_interp_buff_temp + strt_indx));
                    inp_8x16b_1 = _mm_loadu_si128(
                        (__m128i *) (pi2_interp_buff_temp + strt_indx + strt_indx_h));
                    inp_8x16b_2 =
                        _mm_loadu_si128((__m128i *) (pi2_interp_buff_temp + strt_indx + 3));
                    inp_8x16b_3 = _mm_loadu_si128(
                        (__m128i *) (pi2_interp_buff_temp + strt_indx + strt_indx_h + 3));
                    pi2_interp_buff_temp += i4_refarray_wd;

                    inp_8x16b_r0_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r0_0);
                    inp_8x16b_r0_1 = _mm_shuffle_epi8(inp_8x16b_1, x_ref_pos_luma_mask_r0_1);
                    inp_8x16b_r1_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r1_0);
                    inp_8x16b_r1_1 = _mm_shuffle_epi8(inp_8x16b_1, x_ref_pos_luma_mask_r1_1);

                    inp_8x16b_r2_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r2_0);
                    inp_8x16b_r2_1 = _mm_shuffle_epi8(inp_8x16b_1, x_ref_pos_luma_mask_r2_1);
                    inp_8x16b_r3_0 = _mm_shuffle_epi8(inp_8x16b_2, x_ref_pos_luma_mask_r3_0);
                    inp_8x16b_r3_1 = _mm_shuffle_epi8(inp_8x16b_3, x_ref_pos_luma_mask_r3_1);

                    inp_8x16b_r01_l_0 = _mm_unpacklo_epi16(inp_8x16b_r0_0, inp_8x16b_r1_0);
                    inp_8x16b_r23_l_0 = _mm_unpacklo_epi16(inp_8x16b_r2_0, inp_8x16b_r3_0);
                    inp_8x16b_r01_l_1 = _mm_unpackhi_epi16(inp_8x16b_r0_0, inp_8x16b_r1_0);
                    inp_8x16b_r23_l_1 = _mm_unpackhi_epi16(inp_8x16b_r2_0, inp_8x16b_r3_0);

                    inp_8x16b_r01_h_0 = _mm_unpacklo_epi16(inp_8x16b_r0_1, inp_8x16b_r1_1);
                    inp_8x16b_r23_h_0 = _mm_unpacklo_epi16(inp_8x16b_r2_1, inp_8x16b_r3_1);
                    inp_8x16b_r01_h_1 = _mm_unpackhi_epi16(inp_8x16b_r0_1, inp_8x16b_r1_1);
                    inp_8x16b_r23_h_1 = _mm_unpackhi_epi16(inp_8x16b_r2_1, inp_8x16b_r3_1);

                    /* 4-tap horizontal filter = sum of two 2-tap madd results */
                    out_res_4x32b_r01_l_0 =
                        _mm_madd_epi16(inp_8x16b_r01_l_0, ip_filt_8x16b_r01_l_0);
                    out_res_4x32b_r01_l_1 =
                        _mm_madd_epi16(inp_8x16b_r01_l_1, ip_filt_8x16b_r01_l_1);
                    out_res_4x32b_r23_l_0 =
                        _mm_madd_epi16(inp_8x16b_r23_l_0, ip_filt_8x16b_r23_l_0);
                    out_res_4x32b_r23_l_1 =
                        _mm_madd_epi16(inp_8x16b_r23_l_1, ip_filt_8x16b_r23_l_1);

                    out_res_4x32b_r01_h_0 =
                        _mm_madd_epi16(inp_8x16b_r01_h_0, ip_filt_8x16b_r01_h_0);
                    out_res_4x32b_r01_h_1 =
                        _mm_madd_epi16(inp_8x16b_r01_h_1, ip_filt_8x16b_r01_h_1);
                    out_res_4x32b_r23_h_0 =
                        _mm_madd_epi16(inp_8x16b_r23_h_0, ip_filt_8x16b_r23_h_0);
                    out_res_4x32b_r23_h_1 =
                        _mm_madd_epi16(inp_8x16b_r23_h_1, ip_filt_8x16b_r23_h_1);

                    out_res_4x32b_l_0 = _mm_add_epi32(out_res_4x32b_r01_l_0, out_res_4x32b_r23_l_0);
                    out_res_4x32b_l_1 = _mm_add_epi32(out_res_4x32b_r01_l_1, out_res_4x32b_r23_l_1);
                    out_res_4x32b_h_0 = _mm_add_epi32(out_res_4x32b_r01_h_0, out_res_4x32b_r23_h_0);
                    out_res_4x32b_h_1 = _mm_add_epi32(out_res_4x32b_r01_h_1, out_res_4x32b_r23_h_1);

                    /* Round and normalize the combined vert+horz filter gain */
                    out_res_4x32b_l_0 =
                        _mm_srai_epi32(_mm_add_epi32(out_res_4x32b_l_0, const_512), 10);
                    out_res_4x32b_l_1 =
                        _mm_srai_epi32(_mm_add_epi32(out_res_4x32b_l_1, const_512), 10);
                    out_res_4x32b_h_0 =
                        _mm_srai_epi32(_mm_add_epi32(out_res_4x32b_h_0, const_512), 10);
                    out_res_4x32b_h_1 =
                        _mm_srai_epi32(_mm_add_epi32(out_res_4x32b_h_1, const_512), 10);

                    out_res_8x16b_l = _mm_packs_epi32(out_res_4x32b_l_0, out_res_4x32b_l_1);
                    out_res_8x16b_h = _mm_packs_epi32(out_res_4x32b_h_0, out_res_4x32b_h_1);

                    /* Saturate to 8 bits and write one full output row */
                    out_res_16x8b = _mm_packus_epi16(out_res_8x16b_l, out_res_8x16b_h);
                    _mm_storeu_si128((__m128i *) (pu1_out + (i4_y * i4_out_stride)), out_res_16x8b);
                }
            }
        }
    }
    else
    {
        /* --------------------------------------------------------------------- */
        /* Chroma path: 2-tap vertical filter into the 16-bit temp buffer        */
        /* --------------------------------------------------------------------- */
        WORD16 *pi2_interp_buff_temp;
        pi2_interp_buff_temp = (WORD16 *) pi4_interp_buff;

        {
            __m128i inp_8x16b_r0, inp_8x16b_r01_0, phs_mask_16x8b_r0, phs_mask_16x8b_r01_0,
                out_res_8x16b_r01_0;
            __m128i inp_8x16b_r1, phs_mask_16x8b_r1, out_res_8x16b_r01_1;
            __m128i inp_8x16b_r01_1, phs_mask_16x8b_r01_1;

            for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
            {
                arr_y_ref_pos_luma[i4_y] = (WORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos;
                arr_phase_luma[i4_y] = (WORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
            }
            pi4_y_ref_pos_luma = arr_y_ref_pos_luma;
            pi4_phase_luma = arr_phase_luma;

            for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
            {
                pu1_refarray_temp =
                    pu1_refarray + (pi4_y_ref_pos_luma[i4_y] * i4_refarray_wd) + (i4_x_min - 1);
                inp_8x16b_r0 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp));
                inp_8x16b_r1 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp + i4_refarray_wd));

                inp_8x16b_r01_0 = _mm_unpacklo_epi8(inp_8x16b_r0, inp_8x16b_r1);
                inp_8x16b_r01_1 = _mm_unpackhi_epi8(inp_8x16b_r0, inp_8x16b_r1);

                /* 2-tap chroma filter selected by the vertical phase */
                phs_mask_16x8b_r0 = _mm_set1_epi8(g_au1_interp_filter_chroma[pi4_phase_luma[i4_y]]);
                phs_mask_16x8b_r1 =
                    _mm_set1_epi8(g_au1_interp_filter_chroma[pi4_phase_luma[i4_y] + 16]);

                phs_mask_16x8b_r01_0 = _mm_unpacklo_epi8(phs_mask_16x8b_r0, phs_mask_16x8b_r1);
                phs_mask_16x8b_r01_1 = _mm_unpackhi_epi8(phs_mask_16x8b_r0, phs_mask_16x8b_r1);

                out_res_8x16b_r01_0 = _mm_maddubs_epi16(inp_8x16b_r01_0, phs_mask_16x8b_r01_0);
                out_res_8x16b_r01_1 = _mm_maddubs_epi16(inp_8x16b_r01_1, phs_mask_16x8b_r01_1);

                _mm_storeu_si128(
                    (__m128i *) (pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1)),
                    out_res_8x16b_r01_0);
                _mm_storeu_si128((__m128i *) (pi2_interp_buff_temp + (i4_y * i4_refarray_wd) +
                                              (i4_x_min - 1) + 8),
                                 out_res_8x16b_r01_1);
            }
        }

        /* --------------------------------------------------------------------- */
        /* Chroma path: 2-tap horizontal filter, merged into interleaved output  */
        /* --------------------------------------------------------------------- */
        {
            WORD32 strt_indx = 10;
            __m128i inp_8x16b_0, inp_8x16b_r0_0;
            __m128i phs_mask_16x8b_0;
            __m128i x_ref_pos_luma_mask_r0_0, x_ref_pos_luma_mask_r1_0;
            __m128i ip_filt_16x8b_r0, ip_filt_8x16b_r0_0, ip_filt_8x16b_r01_l_0;
            __m128i ip_filt_16x8b_r1, ip_filt_8x16b_r1_0, ip_filt_8x16b_r01_l_1;
            __m128i inp_8x16b_r1_0, inp_8x16b_r01_l_0, out_res_4x32b_r01_l_0;
            __m128i inp_8x16b_r01_l_1, out_res_4x32b_r01_l_1;

            __m128i out_res_4x32b_l_0;
            __m128i out_res_4x32b_l_1;
            __m128i out_res_8x16b_l;
            __m128i out_16x8b_r1;
            __m128i chroma_mask;
            __m128i const_512 = _mm_set1_epi32(512);

            WORD32 i4_x2 = 0;
            __m128i twos = _mm_set1_epi8(2);
            strt_indx = ps_x_pos_phase[0 + i4_frm_mb_x].i2_ref_pos;
            for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
            {
                arr_x_ref_pos_luma[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
                arr_phase_luma[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
                arr_x_ref_pos_luma[i4_x] = arr_x_ref_pos_luma[i4_x] - strt_indx;
                i4_x2 = i4_x << 1;
                arr_x_ref_pos_luma_low[i4_x2] = (arr_x_ref_pos_luma[i4_x]) << 1;
                arr_x_ref_pos_luma_low[i4_x2 + 1] = arr_x_ref_pos_luma_low[i4_x2] + 1;
            }

            pi4_x_ref_pos_luma_low = arr_x_ref_pos_luma_low;
            pi4_phase_luma = arr_phase_luma;
            phs_mask_16x8b_0 = _mm_loadu_si128((__m128i *) (pi4_phase_luma));
            x_ref_pos_luma_mask_r0_0 = _mm_loadu_si128((__m128i *) (pi4_x_ref_pos_luma_low));
            x_ref_pos_luma_mask_r1_0 = _mm_add_epi8(x_ref_pos_luma_mask_r0_0, twos);

            ip_filt_16x8b_r0 = _mm_loadu_si128((__m128i *) (g_au1_interp_filter_chroma));
            ip_filt_16x8b_r1 = _mm_loadu_si128((__m128i *) (g_au1_interp_filter_chroma + 16));

            ip_filt_8x16b_r0_0 =
                _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r0, phs_mask_16x8b_0));
            ip_filt_8x16b_r1_0 =
                _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r1, phs_mask_16x8b_0));

            ip_filt_8x16b_r01_l_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r0_0, ip_filt_8x16b_r1_0);
            ip_filt_8x16b_r01_l_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r0_0, ip_filt_8x16b_r1_0);

            for(i4_y = 0; i4_y < i4_temp_array_ht; i4_y++)
            {
                inp_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_interp_buff_temp + strt_indx));
                pi2_interp_buff_temp += i4_refarray_wd;

                inp_8x16b_r0_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r0_0);
                inp_8x16b_r1_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r1_0);

                inp_8x16b_r01_l_0 = _mm_unpacklo_epi16(inp_8x16b_r0_0, inp_8x16b_r1_0);
                inp_8x16b_r01_l_1 = _mm_unpackhi_epi16(inp_8x16b_r0_0, inp_8x16b_r1_0);

                out_res_4x32b_r01_l_0 = _mm_madd_epi16(inp_8x16b_r01_l_0, ip_filt_8x16b_r01_l_0);
                out_res_4x32b_r01_l_1 = _mm_madd_epi16(inp_8x16b_r01_l_1, ip_filt_8x16b_r01_l_1);

                out_res_4x32b_l_0 =
                    _mm_srai_epi32(_mm_add_epi32(out_res_4x32b_r01_l_0, const_512), 10);
                out_res_4x32b_l_1 =
                    _mm_srai_epi32(_mm_add_epi32(out_res_4x32b_r01_l_1, const_512), 10);

                out_res_8x16b_l = _mm_packs_epi32(out_res_4x32b_l_0, out_res_4x32b_l_1);

                /* Keep the other interleaved chroma component (high byte of  */
                /* each 16-bit lane) and insert the filtered result.          */
                /* NOTE(review): relies on each filtered sample fitting in    */
                /* the low byte of its lane — confirm against filter gains.   */
                chroma_mask = _mm_set1_epi16(0xFF00);
                out_16x8b_r1 = _mm_loadu_si128((__m128i *) (pu1_out + (i4_y * i4_out_stride)));
                out_16x8b_r1 = _mm_and_si128(out_16x8b_r1, chroma_mask);
                out_16x8b_r1 = _mm_add_epi8(out_res_8x16b_l, out_16x8b_r1);
                _mm_storeu_si128((__m128i *) (pu1_out + (i4_y * i4_out_stride)), out_16x8b_r1);
            }
        }
    }
    return;
} /* End of Interpolation Function */
|
||
|
|
|
||
|
|
/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_vert_interpol_chroma_dyadic_1_sse42                */
/*                                                                           */
/*  Description   : This function takes the reference array buffer &        */
/*                  performs vertical intra resampling for dyadic scaling   */
/*                  ratios for chroma                                        */
/*  Inputs        : pu1_inp_buf : ptr to the 6x6 reference sample buffer     */
/*                  pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the     */
/*                  vertically interpolated data                             */
/*                  i4_phase_0 : y phase for even values of y                */
/*                  i4_phase_1 : y phase for odd values of y                 */
/*  Globals       : none                                                     */
/*  Processing    : it does the interpolation in vertical direction          */
/*  Outputs       : vertically resampled samples                             */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         06 09 2021   Kishore              creation                        */
/*                                                                           */
/*****************************************************************************/
|
||
|
|
void isvcd_vert_interpol_chroma_dyadic_1_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                               WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    /* Vertical 2-tap chroma interpolation for dyadic scaling: 6 reference  */
    /* rows produce 8 intermediate rows of 6 WORD16 samples each.  Output   */
    /* row 2k uses weights (8 - phase0, phase0) on source rows (k, k+1);    */
    /* row 2k+1 uses (8 - phase1, phase1) on source rows (k+1, k+2).        */
    WORD32 i4_dst_stride;
    WORD32 i4_ref_stride;
    UWORD8 *pu1_src;
    WORD16 *pi2_dst;
    __m128i u8_row0, u8_row1, u8_row2, u8_row3, u8_row4, u8_row5;
    __m128i i2_out0, i2_out1, i2_out2, i2_out3, i2_out4, i2_out5, i2_out6, i2_out7;
    __m128i i2_out7_merged;
    __m128i coeff_even_8x16b, coeff_odd_8x16b;

    /* Each 16-bit lane packs the byte pair (c0 | c1 << 8) so it matches    */
    /* the interleaved source bytes consumed by maddubs.  Phases lie in     */
    /* [0, 8], so no sign-extension issues arise from the narrowing cast.   */
    coeff_even_8x16b = _mm_set1_epi16((WORD16) ((i4_phase_0 << 8) | (8 - i4_phase_0)));
    coeff_odd_8x16b = _mm_set1_epi16((WORD16) ((i4_phase_1 << 8) | (8 - i4_phase_1)));

    pu1_src = pu1_inp_buf;
    pi2_dst = pi2_tmp_filt_buf;
    i4_dst_stride = 6;
    i4_ref_stride = DYADIC_REF_W_C;

    /* Six source rows of the reference block (8 bytes loaded, 6 used) */
    u8_row0 = _mm_loadl_epi64((__m128i *) (pu1_src));
    u8_row1 = _mm_loadl_epi64((__m128i *) (pu1_src + i4_ref_stride));
    u8_row2 = _mm_loadl_epi64((__m128i *) (pu1_src + (i4_ref_stride * 2)));
    u8_row3 = _mm_loadl_epi64((__m128i *) (pu1_src + (i4_ref_stride * 3)));
    u8_row4 = _mm_loadl_epi64((__m128i *) (pu1_src + (i4_ref_stride * 4)));
    u8_row5 = _mm_loadl_epi64((__m128i *) (pu1_src + (i4_ref_stride * 5)));

    /* Output row 0 <- rows (0,1) with even-phase weights */
    u8_row0 = _mm_unpacklo_epi8(u8_row0, u8_row1);
    i2_out0 = _mm_maddubs_epi16(u8_row0, coeff_even_8x16b);
    _mm_storeu_si128((__m128i *) (pi2_dst), i2_out0);

    /* Output rows 1 and 2 both read rows (1,2) */
    u8_row1 = _mm_unpacklo_epi8(u8_row1, u8_row2);
    i2_out1 = _mm_maddubs_epi16(u8_row1, coeff_odd_8x16b);
    _mm_storeu_si128((__m128i *) (pi2_dst + i4_dst_stride), i2_out1);

    i2_out2 = _mm_maddubs_epi16(u8_row1, coeff_even_8x16b);
    _mm_storeu_si128((__m128i *) (pi2_dst + (i4_dst_stride * 2)), i2_out2);

    /* Output rows 3 and 4 both read rows (2,3) */
    u8_row2 = _mm_unpacklo_epi8(u8_row2, u8_row3);
    i2_out3 = _mm_maddubs_epi16(u8_row2, coeff_odd_8x16b);
    _mm_storeu_si128((__m128i *) (pi2_dst + (i4_dst_stride * 3)), i2_out3);

    i2_out4 = _mm_maddubs_epi16(u8_row2, coeff_even_8x16b);
    _mm_storeu_si128((__m128i *) (pi2_dst + (i4_dst_stride * 4)), i2_out4);

    /* Output rows 5 and 6 both read rows (3,4) */
    u8_row3 = _mm_unpacklo_epi8(u8_row3, u8_row4);
    i2_out5 = _mm_maddubs_epi16(u8_row3, coeff_odd_8x16b);
    _mm_storeu_si128((__m128i *) (pi2_dst + (i4_dst_stride * 5)), i2_out5);

    /* Row 6: a full 16-byte store would run past the 6x8 buffer, so write  */
    /* the first four samples with a 64-bit store ...                       */
    i2_out6 = _mm_maddubs_epi16(u8_row3, coeff_even_8x16b);
    _mm_storel_epi64((__m128i *) (pi2_dst + (i4_dst_stride * 6)), i2_out6);

    /* ... then swap the 64-bit halves so lanes 4,5 sit at the front        */
    i2_out6 = _mm_shuffle_epi32(i2_out6, 78);

    /* Output row 7 reads rows (4,5) */
    u8_row4 = _mm_unpacklo_epi8(u8_row4, u8_row5);
    i2_out7 = _mm_maddubs_epi16(u8_row4, coeff_odd_8x16b);

    /* Move the low three dwords of row 7 up by one dword position */
    i2_out7 = _mm_shuffle_epi32(i2_out7, 147);

    /* Words 0,1 <- tail of row 6; words 2..7 <- the six row-7 samples,     */
    /* written as one 16-byte store ending exactly at the buffer's end.     */
    i2_out7_merged = _mm_blend_epi16(i2_out6, i2_out7, 252);

    _mm_storeu_si128((__m128i *) (pi2_dst + (i4_dst_stride * 6) + 4), i2_out7_merged);
}
|
||
|
|
|
||
|
|
/*****************************************************************************/
|
||
|
|
/* */
|
||
|
|
/* Function Name : isvcd_vert_interpol_chroma_dyadic_2_sse42 */
|
||
|
|
/* */
|
||
|
|
/* Description : This function takes the reference array buffer & performs*/
|
||
|
|
/* vertical intra resampling for dyadic scaling ratios for */
|
||
|
|
/* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
|
||
|
|
/* chroma_phase_y_plus1: */
|
||
|
|
/* ref_lyr cur_lyr */
|
||
|
|
/* 0 1 */
|
||
|
|
/* 0 2 */
|
||
|
|
/* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */
|
||
|
|
/* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the */
|
||
|
|
/* vertically interpolated data */
|
||
|
|
/* i4_phase_0 : y phase for even values of y */
|
||
|
|
/* i4_phase_1 : y phase for odd values of y */
|
||
|
|
/* Globals : none */
|
||
|
|
/* Processing : it does the interpolation in vertical direction */
|
||
|
|
/* Outputs : vertically resampled samples */
|
||
|
|
/* Returns : none */
|
||
|
|
/* */
|
||
|
|
/* Issues : none */
|
||
|
|
/* */
|
||
|
|
/* Revision History: */
|
||
|
|
/* */
|
||
|
|
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
|
||
|
|
/* 21 05 2021 Dolan creation */
|
||
|
|
/* */
|
||
|
|
/*****************************************************************************/
|
||
|
|
void isvcd_vert_interpol_chroma_dyadic_2_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                               WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    /* Vertical 2-tap chroma interpolation for dyadic (2x) scaling.
     * Reads 5 consecutive rows of the reference buffer, starting one row
     * into pu1_inp_buf, and writes 8 rows of 6 WORD16 samples into
     * pi2_tmp_filt_buf (6x8 = 48 elements).  Each adjacent input-row pair
     * yields two output rows: one weighted (8 - phase_0, phase_0) and one
     * weighted (8 - phase_1, phase_1). */
    WORD8 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_filt_stride, i4_src_stride;
    UWORD8 *pu1_inp;
    WORD16 *pi2_tmp;
    __m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3, i4_samp_16x8b_4;
    __m128i i4_res_8x16b_r0, i4_res_8x16b_r1, i4_res_8x16b_r2, i4_res_8x16b_r3, i4_res_8x16b_r4,
        i4_res_8x16b_r5, i4_res_8x16b_r6, i4_res_8x16b_r7;
    __m128i i4_res_8x16b_r7_temp, i4_c0_c1_16x8b, i4_c2_c3_16x8b;

    /* Two 2-tap filter weight pairs derived from the y phases; weights sum to 8 */
    i4_coeff_0 = (WORD8) (8 - i4_phase_0);
    i4_coeff_1 = (WORD8) (i4_phase_0);
    i4_coeff_2 = (WORD8) (8 - i4_phase_1);
    i4_coeff_3 = (WORDD8) (i4_phase_1);

    /* (c0, c1) byte pairs replicated across the register for _mm_maddubs_epi16 */
    i4_c0_c1_16x8b =
        _mm_set_epi8(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
                     i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
                     i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0);
    i4_c2_c3_16x8b =
        _mm_set_epi8(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
                     i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
                     i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2);

    pi2_tmp = pi2_tmp_filt_buf;
    i4_filt_stride = 6;                /* output rows are 6 samples wide */
    i4_src_stride = DYADIC_REF_W_C;
    /* For this phase combination the filter taps start at input row 1 */
    pu1_inp = pu1_inp_buf + i4_src_stride;

    /* Load the 5 input rows (only the low 6 bytes of each are used) */
    i4_samp_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_inp));
    i4_samp_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_inp + i4_src_stride));
    i4_samp_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1)));
    i4_samp_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
    i4_samp_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 2)));

    /* Rows 0-1: interleave input rows 0/1 bytewise, then maddubs computes
     * s[n]*c0 + s[n+stride]*c1 per 16-bit lane.  The 16-byte store writes
     * 8 results; the 2 beyond the 6-sample row are overwritten by the next
     * row's store. */
    i4_samp_16x8b_0 = _mm_unpacklo_epi8(i4_samp_16x8b_0, i4_samp_16x8b_1);
    i4_res_8x16b_r0 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp), i4_res_8x16b_r0);

    i4_res_8x16b_r1 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r1);

    /* Rows 2-3 from input rows 1/2 */
    i4_samp_16x8b_1 = _mm_unpacklo_epi8(i4_samp_16x8b_1, i4_samp_16x8b_2);
    i4_res_8x16b_r2 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1)), i4_res_8x16b_r2);

    i4_res_8x16b_r3 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1) + i4_filt_stride),
                     i4_res_8x16b_r3);

    /* Rows 4-5 from input rows 2/3 */
    i4_samp_16x8b_2 = _mm_unpacklo_epi8(i4_samp_16x8b_2, i4_samp_16x8b_3);
    i4_res_8x16b_r4 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2)), i4_res_8x16b_r4);

    i4_res_8x16b_r5 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + i4_filt_stride),
                     i4_res_8x16b_r5);

    /* Rows 6-7 from input rows 3/4.  A full 16-byte store for row 6 would
     * run past the 48-element buffer, so: store row 6's first 4 samples
     * with a 64-bit store, then merge row 6's last 2 samples with row 7's
     * 6 samples into one register and store those 8 at offset 40 — the
     * final write ends exactly at element 48. */
    i4_samp_16x8b_3 = _mm_unpacklo_epi8(i4_samp_16x8b_3, i4_samp_16x8b_4);
    i4_res_8x16b_r6 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c0_c1_16x8b);
    _mm_storel_epi64((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1)),
                     i4_res_8x16b_r6);

    i4_res_8x16b_r7 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c2_c3_16x8b);
    /* 78 = 0b01001110: swap 64-bit halves so row 6's samples 4-5 land in lanes 0-1 */
    i4_res_8x16b_r6 = _mm_shuffle_epi32(i4_res_8x16b_r6, 78);
    /* 147 = 0b10010011: rotate dwords so row 7's samples 0-5 land in lanes 2-7 */
    i4_res_8x16b_r7 = _mm_shuffle_epi32(i4_res_8x16b_r7, 147);
    /* 252 = 0b11111100: lanes 0-1 from r6 (row 6 tail), lanes 2-7 from r7 */
    i4_res_8x16b_r7_temp = _mm_blend_epi16(i4_res_8x16b_r6, i4_res_8x16b_r7, 252);

    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1) + 4),
                     i4_res_8x16b_r7_temp);
}
|
||
|
|
|
||
|
|
/*****************************************************************************/
|
||
|
|
/* */
|
||
|
|
/* Function Name : isvcd_vert_interpol_chroma_dyadic_3_sse42 */
|
||
|
|
/* */
|
||
|
|
/* Description : This function takes the reference array buffer & performs*/
|
||
|
|
/* vertical intra resampling for dyadic scaling ratios for */
|
||
|
|
/* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
|
||
|
|
/* chroma_phase_y_plus1: */
|
||
|
|
/* ref_lyr cur_lyr */
|
||
|
|
/* 2 0 */
|
||
|
|
/* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */
|
||
|
|
/* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the */
|
||
|
|
/* vertically interpolated data */
|
||
|
|
/* i4_phase_0 : y phase for even values of y */
|
||
|
|
/* i4_phase_1 : y phase for odd values of y */
|
||
|
|
/* Globals : none */
|
||
|
|
/* Processing : it does the interpolation in vertical direction */
|
||
|
|
/* Outputs : vertically resampled samples */
|
||
|
|
/* Returns : none */
|
||
|
|
/* */
|
||
|
|
/* Issues : none */
|
||
|
|
/* */
|
||
|
|
/* Revision History: */
|
||
|
|
/* */
|
||
|
|
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
|
||
|
|
/* 21 05 2021 Dolan creation */
|
||
|
|
/* */
|
||
|
|
/*****************************************************************************/
|
||
|
|
void isvcd_vert_interpol_chroma_dyadic_3_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                               WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    /* Vertical 2-tap chroma interpolation for dyadic (2x) scaling.
     * Identical to isvcd_vert_interpol_chroma_dyadic_2_sse42 except that
     * for this phase combination the filter taps start at input row 0
     * (no initial row offset).  Reads 5 rows of pu1_inp_buf and writes
     * 8 rows of 6 WORD16 samples into pi2_tmp_filt_buf (48 elements). */
    WORD8 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_filt_stride, i4_src_stride;
    UWORD8 *pu1_inp;
    WORD16 *pi2_tmp;
    __m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3, i4_samp_16x8b_4;
    __m128i i4_res_8x16b_r0, i4_res_8x16b_r1, i4_res_8x16b_r2, i4_res_8x16b_r3, i4_res_8x16b_r4,
        i4_res_8x16b_r5, i4_res_8x16b_r6, i4_res_8x16b_r7;
    __m128i i4_res_8x16b_r7_temp, i4_c0_c1_16x8b, i4_c2_c3_16x8b;

    /* Two 2-tap filter weight pairs derived from the y phases; weights sum to 8 */
    i4_coeff_0 = (WORD8) (8 - i4_phase_0);
    i4_coeff_1 = (WORD8) (i4_phase_0);
    i4_coeff_2 = (WORD8) (8 - i4_phase_1);
    i4_coeff_3 = (WORD8) (i4_phase_1);

    /* (c0, c1) byte pairs replicated across the register for _mm_maddubs_epi16 */
    i4_c0_c1_16x8b =
        _mm_set_epi8(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
                     i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
                     i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0);
    i4_c2_c3_16x8b =
        _mm_set_epi8(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
                     i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
                     i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2);

    pi2_tmp = pi2_tmp_filt_buf;
    i4_filt_stride = 6;                /* output rows are 6 samples wide */
    i4_src_stride = DYADIC_REF_W_C;
    pu1_inp = pu1_inp_buf;             /* taps start at input row 0 here */

    /* Load the 5 input rows (only the low 6 bytes of each are used) */
    i4_samp_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_inp));
    i4_samp_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_inp + i4_src_stride));
    i4_samp_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1)));
    i4_samp_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
    i4_samp_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 2)));

    /* Rows 0-1: interleave input rows 0/1 bytewise, maddubs computes
     * s[n]*c0 + s[n+stride]*c1; the 2 lanes stored past the 6-sample row
     * are overwritten by the following row's store. */
    i4_samp_16x8b_0 = _mm_unpacklo_epi8(i4_samp_16x8b_0, i4_samp_16x8b_1);
    i4_res_8x16b_r0 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp), i4_res_8x16b_r0);

    i4_res_8x16b_r1 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r1);

    /* Rows 2-3 from input rows 1/2 */
    i4_samp_16x8b_1 = _mm_unpacklo_epi8(i4_samp_16x8b_1, i4_samp_16x8b_2);
    i4_res_8x16b_r2 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1)), i4_res_8x16b_r2);

    i4_res_8x16b_r3 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1) + i4_filt_stride),
                     i4_res_8x16b_r3);

    /* Rows 4-5 from input rows 2/3 */
    i4_samp_16x8b_2 = _mm_unpacklo_epi8(i4_samp_16x8b_2, i4_samp_16x8b_3);
    i4_res_8x16b_r4 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2)), i4_res_8x16b_r4);

    i4_res_8x16b_r5 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + i4_filt_stride),
                     i4_res_8x16b_r5);

    /* Rows 6-7 from input rows 3/4.  Store row 6's first 4 samples with a
     * 64-bit store, then merge row 6's last 2 samples with row 7's 6
     * samples into one register and store at offset 40, so the last write
     * ends exactly at element 48 of the temporary buffer. */
    i4_samp_16x8b_3 = _mm_unpacklo_epi8(i4_samp_16x8b_3, i4_samp_16x8b_4);
    i4_res_8x16b_r6 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c0_c1_16x8b);
    _mm_storel_epi64((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1)),
                     i4_res_8x16b_r6);

    i4_res_8x16b_r7 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c2_c3_16x8b);
    /* 78 = 0b01001110: swap 64-bit halves so row 6's samples 4-5 land in lanes 0-1 */
    i4_res_8x16b_r6 = _mm_shuffle_epi32(i4_res_8x16b_r6, 78);
    /* 147 = 0b10010011: rotate dwords so row 7's samples 0-5 land in lanes 2-7 */
    i4_res_8x16b_r7 = _mm_shuffle_epi32(i4_res_8x16b_r7, 147);
    /* 252 = 0b11111100: lanes 0-1 from r6 (row 6 tail), lanes 2-7 from r7 */
    i4_res_8x16b_r7_temp = _mm_blend_epi16(i4_res_8x16b_r6, i4_res_8x16b_r7, 252);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1) + 4),
                     i4_res_8x16b_r7_temp);
}
|
||
|
|
|
||
|
|
/*****************************************************************************/
|
||
|
|
/* */
|
||
|
|
/* Function Name : isvcd_horz_interpol_chroma_dyadic_1_sse42 */
|
||
|
|
/* */
|
||
|
|
/* Description : This function takes the reference array buffer & performs*/
|
||
|
|
/*                     horizontal intra resampling for dyadic scaling ratios */
|
||
|
|
/* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
|
||
|
|
/* chroma_phase_y_plus1: */
|
||
|
|
/* ref_lyr cur_lyr */
|
||
|
|
/* 2 0 */
|
||
|
|
/* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */
|
||
|
|
/* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold */
|
||
|
|
/* vertically interpolated data */
|
||
|
|
/*                     i4_phase_0 : x phase for even values of x            */
|
||
|
|
/*                     i4_phase_1 : x phase for odd values of x             */
|
||
|
|
/* Globals : none */
|
||
|
|
/*  Processing       : it does the interpolation in horizontal direction    */
|
||
|
|
/*  Outputs          : horizontally resampled samples                        */
|
||
|
|
/* Returns : none */
|
||
|
|
/* */
|
||
|
|
/* Issues : none */
|
||
|
|
/* */
|
||
|
|
/* Revision History: */
|
||
|
|
/* */
|
||
|
|
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
|
||
|
|
/* 21 05 2021 Dolan creation */
|
||
|
|
/* */
|
||
|
|
/*****************************************************************************/
|
||
|
|
void isvcd_horz_interpol_chroma_dyadic_1_sse42(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
                                               WORD32 i4_out_stride, WORD32 i4_phase_0,
                                               WORD32 i4_phase_1)
{
    /* Horizontal 2-tap chroma interpolation for dyadic (2x) scaling.
     * Takes the 6x8 vertically interpolated WORD16 buffer (rows are 6
     * samples at stride 6) and produces an 8x8 block of chroma samples.
     * Output positions alternate between the (c0, c1) and (c2, c3)
     * weight pairs.  The final values are rounded with (+32) >> 6 (the
     * vertical pass already scaled by up to 8, the horizontal weights by
     * up to 8, so 64 total) and written into the even (Cb/Cr-interleaved)
     * bytes of pu1_out_buf while the odd bytes — the other chroma
     * component — are preserved via masking. */
    WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_dst_stride, i4_dst_stride2, i4_dst_stride4;
    UWORD8 *pu1_out;
    WORD16 *pi2_tmp;

    /* Per input row rN: _0 = row, _1 = row shifted by 1 sample, _2 = by 2 */
    __m128i i4_samp_8x16b_r1_0, i4_samp_8x16b_r1_1, i4_samp_8x16b_r1_2;
    __m128i i4_samp_8x16b_r2_0, i4_samp_8x16b_r2_1, i4_samp_8x16b_r2_2;
    __m128i i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1, i4_samp_8x16b_r3_2;
    __m128i i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1, i4_samp_8x16b_r4_2;
    __m128i i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1, i4_samp_8x16b_r5_2;
    __m128i i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1, i4_samp_8x16b_r6_2;
    __m128i i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1, i4_samp_8x16b_r7_2;
    __m128i i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1, i4_samp_8x16b_r8_2;

    /* Per row: _0 = even output columns (c0/c1), _1 = odd columns (c2/c3) */
    __m128i i4_res_4x32b_r1_0, i4_res_4x32b_r1_1;
    __m128i i4_res_4x32b_r2_0, i4_res_4x32b_r2_1;
    __m128i i4_res_4x32b_r3_0, i4_res_4x32b_r3_1;
    __m128i i4_res_4x32b_r4_0, i4_res_4x32b_r4_1;
    __m128i i4_res_4x32b_r5_0, i4_res_4x32b_r5_1;
    __m128i i4_res_4x32b_r6_0, i4_res_4x32b_r6_1;
    __m128i i4_res_4x32b_r7_0, i4_res_4x32b_r7_1;
    __m128i i4_res_4x32b_r8_0, i4_res_4x32b_r8_1;

    __m128i i4_res_final_8x16b_r1;
    __m128i i4_res_final_8x16b_r2;
    __m128i i4_res_final_8x16b_r3;
    __m128i i4_res_final_8x16b_r4;
    __m128i i4_res_final_8x16b_r5;
    __m128i i4_res_final_8x16b_r6;
    __m128i i4_res_final_8x16b_r7;
    __m128i i4_res_final_8x16b_r8;

    __m128i out_16x8b_r1;
    __m128i out_16x8b_r2;
    __m128i out_16x8b_r3;
    __m128i out_16x8b_r4;
    __m128i out_16x8b_r5;
    __m128i out_16x8b_r6;
    __m128i out_16x8b_r7;
    __m128i out_16x8b_r8;
    /* NOTE: "_r67_" actually pairs rows 7 and 8 (naming is off by one) */
    __m128i i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1;
    __m128i i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1;
    __m128i i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1;
    __m128i i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1;
    __m128i chroma_mask, chroma_mask2;
    __m128i coeff_c0_c1_8x16b, coeff_c2_c3_8x16b, res_32;

    /* 2-tap weight pairs from the x phases; each pair sums to 8 */
    i4_coeff_0 = 8 - i4_phase_0;
    i4_coeff_1 = i4_phase_0;
    i4_coeff_2 = 8 - i4_phase_1;
    i4_coeff_3 = i4_phase_1;
    coeff_c0_c1_8x16b = _mm_set_epi16(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1,
                                      i4_coeff_0, i4_coeff_1, i4_coeff_0);
    coeff_c2_c3_8x16b = _mm_set_epi16(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3,
                                      i4_coeff_2, i4_coeff_3, i4_coeff_2);
    res_32 = _mm_set1_epi32(32);       /* rounding offset for >> 6 */
    pu1_out = pu1_out_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    i4_dst_stride = i4_out_stride;

    i4_dst_stride2 = i4_dst_stride << 1;
    i4_dst_stride4 = i4_dst_stride << 2;

    /* Horizontal interpolation */
    /* x = 0, x_phase = phase_0 */
    /* Load the 8 rows of the temporary buffer (stride 6 WORD16s) */
    i4_samp_8x16b_r1_0 = _mm_loadu_si128((__m128i *) pi2_tmp);        // a0 a1 a2 a3 a4 a5 a6 a7
    i4_samp_8x16b_r2_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 6));  // b0 b1 b2 b3 b4 b5 b6 b7
    i4_samp_8x16b_r3_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 12));
    i4_samp_8x16b_r4_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 18));
    i4_samp_8x16b_r5_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 24));
    i4_samp_8x16b_r6_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 30));
    i4_samp_8x16b_r7_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 36));
    i4_samp_8x16b_r8_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 42));

    /* Shift each row by 1 and 2 samples to line up the filter taps */
    i4_samp_8x16b_r1_1 = _mm_srli_si128(i4_samp_8x16b_r1_0, 2);  // a1 a2 a3 a4 a5 a6 a7 0
    i4_samp_8x16b_r1_2 = _mm_srli_si128(i4_samp_8x16b_r1_0, 4);  // a2 a3 a4 a5 a6 a7 0 0

    i4_samp_8x16b_r2_1 = _mm_srli_si128(i4_samp_8x16b_r2_0, 2);
    i4_samp_8x16b_r2_2 = _mm_srli_si128(i4_samp_8x16b_r2_0, 4);

    i4_samp_8x16b_r3_1 = _mm_srli_si128(i4_samp_8x16b_r3_0, 2);
    i4_samp_8x16b_r3_2 = _mm_srli_si128(i4_samp_8x16b_r3_0, 4);

    i4_samp_8x16b_r4_1 = _mm_srli_si128(i4_samp_8x16b_r4_0, 2);
    i4_samp_8x16b_r4_2 = _mm_srli_si128(i4_samp_8x16b_r4_0, 4);

    i4_samp_8x16b_r5_1 = _mm_srli_si128(i4_samp_8x16b_r5_0, 2);
    i4_samp_8x16b_r5_2 = _mm_srli_si128(i4_samp_8x16b_r5_0, 4);

    i4_samp_8x16b_r6_1 = _mm_srli_si128(i4_samp_8x16b_r6_0, 2);
    i4_samp_8x16b_r6_2 = _mm_srli_si128(i4_samp_8x16b_r6_0, 4);

    i4_samp_8x16b_r7_1 = _mm_srli_si128(i4_samp_8x16b_r7_0, 2);
    i4_samp_8x16b_r7_2 = _mm_srli_si128(i4_samp_8x16b_r7_0, 4);

    i4_samp_8x16b_r8_1 = _mm_srli_si128(i4_samp_8x16b_r8_0, 2);
    i4_samp_8x16b_r8_2 = _mm_srli_si128(i4_samp_8x16b_r8_0, 4);

    /* Interleave sample pairs (s[x], s[x+1]) for _mm_madd_epi16 */
    i4_samp_8x16b_r1_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r1_0,
                                            i4_samp_8x16b_r1_1);  // a0 a1 a1 a2 a2 a3 a3 a4
    i4_samp_8x16b_r2_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r2_0,
                                            i4_samp_8x16b_r2_1);  // b0 b1 b1 b2 b2 b3 b3 b4
    i4_samp_8x16b_r3_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1);
    i4_samp_8x16b_r4_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1);
    i4_samp_8x16b_r5_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1);
    i4_samp_8x16b_r6_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1);
    i4_samp_8x16b_r7_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1);
    i4_samp_8x16b_r8_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1);

    /* Same, shifted one sample: pairs (s[x+1], s[x+2]) for the odd columns */
    i4_samp_8x16b_r1_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r1_1,
                                            i4_samp_8x16b_r1_2);  // a1 a2 a2 a3 a3 a4 a4 a5
    i4_samp_8x16b_r2_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r2_1,
                                            i4_samp_8x16b_r2_2);  // b1 b2 b2 b3 b3 b4 b4 b5
    i4_samp_8x16b_r3_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r3_1, i4_samp_8x16b_r3_2);
    i4_samp_8x16b_r4_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r4_1, i4_samp_8x16b_r4_2);
    i4_samp_8x16b_r5_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r5_1, i4_samp_8x16b_r5_2);
    i4_samp_8x16b_r6_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r6_1, i4_samp_8x16b_r6_2);
    i4_samp_8x16b_r7_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r7_1, i4_samp_8x16b_r7_2);
    i4_samp_8x16b_r8_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r8_1, i4_samp_8x16b_r8_2);

    // a0c0+a1c1 a1c0+a2c1 a2c0+a3c1 a3c0+a4c1
    i4_res_4x32b_r1_0 = _mm_madd_epi16(i4_samp_8x16b_r1_0, coeff_c0_c1_8x16b);
    // b0c0+b1c1 b1c0+b2c1 b2c0+b3c1 b3c0+b4c1
    i4_res_4x32b_r2_0 = _mm_madd_epi16(i4_samp_8x16b_r2_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r3_0 = _mm_madd_epi16(i4_samp_8x16b_r3_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r4_0 = _mm_madd_epi16(i4_samp_8x16b_r4_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r5_0 = _mm_madd_epi16(i4_samp_8x16b_r5_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r6_0 = _mm_madd_epi16(i4_samp_8x16b_r6_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r7_0 = _mm_madd_epi16(i4_samp_8x16b_r7_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r8_0 = _mm_madd_epi16(i4_samp_8x16b_r8_0, coeff_c0_c1_8x16b);

    // a1c2+a2c3 a2c2+a3c3 a3c2+a4c3 a4c2+a5c3
    i4_res_4x32b_r1_1 = _mm_madd_epi16(i4_samp_8x16b_r1_1, coeff_c2_c3_8x16b);
    // b1c2+b2c3 b2c2+b3c3 b3c2+b4c3 b4c2+b5c3
    i4_res_4x32b_r2_1 = _mm_madd_epi16(i4_samp_8x16b_r2_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r3_1 = _mm_madd_epi16(i4_samp_8x16b_r3_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r4_1 = _mm_madd_epi16(i4_samp_8x16b_r4_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r5_1 = _mm_madd_epi16(i4_samp_8x16b_r5_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r6_1 = _mm_madd_epi16(i4_samp_8x16b_r6_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r7_1 = _mm_madd_epi16(i4_samp_8x16b_r7_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r8_1 = _mm_madd_epi16(i4_samp_8x16b_r8_1, coeff_c2_c3_8x16b);

    /* Round: add 32, then arithmetic shift right by 6 */
    i4_res_4x32b_r1_0 = _mm_add_epi32(i4_res_4x32b_r1_0, res_32);
    i4_res_4x32b_r2_0 = _mm_add_epi32(i4_res_4x32b_r2_0, res_32);
    i4_res_4x32b_r3_0 = _mm_add_epi32(i4_res_4x32b_r3_0, res_32);
    i4_res_4x32b_r4_0 = _mm_add_epi32(i4_res_4x32b_r4_0, res_32);
    i4_res_4x32b_r5_0 = _mm_add_epi32(i4_res_4x32b_r5_0, res_32);
    i4_res_4x32b_r6_0 = _mm_add_epi32(i4_res_4x32b_r6_0, res_32);
    i4_res_4x32b_r7_0 = _mm_add_epi32(i4_res_4x32b_r7_0, res_32);
    i4_res_4x32b_r8_0 = _mm_add_epi32(i4_res_4x32b_r8_0, res_32);

    i4_res_4x32b_r1_1 = _mm_add_epi32(i4_res_4x32b_r1_1, res_32);
    i4_res_4x32b_r2_1 = _mm_add_epi32(i4_res_4x32b_r2_1, res_32);
    i4_res_4x32b_r3_1 = _mm_add_epi32(i4_res_4x32b_r3_1, res_32);
    i4_res_4x32b_r4_1 = _mm_add_epi32(i4_res_4x32b_r4_1, res_32);
    i4_res_4x32b_r5_1 = _mm_add_epi32(i4_res_4x32b_r5_1, res_32);
    i4_res_4x32b_r6_1 = _mm_add_epi32(i4_res_4x32b_r6_1, res_32);
    i4_res_4x32b_r7_1 = _mm_add_epi32(i4_res_4x32b_r7_1, res_32);
    i4_res_4x32b_r8_1 = _mm_add_epi32(i4_res_4x32b_r8_1, res_32);

    i4_res_4x32b_r1_0 = _mm_srai_epi32(i4_res_4x32b_r1_0, 6);
    i4_res_4x32b_r2_0 = _mm_srai_epi32(i4_res_4x32b_r2_0, 6);
    i4_res_4x32b_r3_0 = _mm_srai_epi32(i4_res_4x32b_r3_0, 6);
    i4_res_4x32b_r4_0 = _mm_srai_epi32(i4_res_4x32b_r4_0, 6);
    i4_res_4x32b_r5_0 = _mm_srai_epi32(i4_res_4x32b_r5_0, 6);
    i4_res_4x32b_r6_0 = _mm_srai_epi32(i4_res_4x32b_r6_0, 6);
    i4_res_4x32b_r7_0 = _mm_srai_epi32(i4_res_4x32b_r7_0, 6);
    i4_res_4x32b_r8_0 = _mm_srai_epi32(i4_res_4x32b_r8_0, 6);

    i4_res_4x32b_r1_1 = _mm_srai_epi32(i4_res_4x32b_r1_1, 6);
    i4_res_4x32b_r2_1 = _mm_srai_epi32(i4_res_4x32b_r2_1, 6);
    i4_res_4x32b_r3_1 = _mm_srai_epi32(i4_res_4x32b_r3_1, 6);
    i4_res_4x32b_r4_1 = _mm_srai_epi32(i4_res_4x32b_r4_1, 6);
    i4_res_4x32b_r5_1 = _mm_srai_epi32(i4_res_4x32b_r5_1, 6);
    i4_res_4x32b_r6_1 = _mm_srai_epi32(i4_res_4x32b_r6_1, 6);
    i4_res_4x32b_r7_1 = _mm_srai_epi32(i4_res_4x32b_r7_1, 6);
    i4_res_4x32b_r8_1 = _mm_srai_epi32(i4_res_4x32b_r8_1, 6);

    /* Pack 32-bit results to 16-bit, two rows per register */
    i4_res_final_8x16b_r12_0 = _mm_packs_epi32(i4_res_4x32b_r1_0, i4_res_4x32b_r2_0);
    i4_res_final_8x16b_r34_0 = _mm_packs_epi32(i4_res_4x32b_r3_0, i4_res_4x32b_r4_0);
    i4_res_final_8x16b_r56_0 = _mm_packs_epi32(i4_res_4x32b_r5_0, i4_res_4x32b_r6_0);
    i4_res_final_8x16b_r67_0 = _mm_packs_epi32(i4_res_4x32b_r7_0, i4_res_4x32b_r8_0);

    i4_res_final_8x16b_r12_1 = _mm_packs_epi32(i4_res_4x32b_r1_1, i4_res_4x32b_r2_1);
    i4_res_final_8x16b_r34_1 = _mm_packs_epi32(i4_res_4x32b_r3_1, i4_res_4x32b_r4_1);
    i4_res_final_8x16b_r56_1 = _mm_packs_epi32(i4_res_4x32b_r5_1, i4_res_4x32b_r6_1);
    i4_res_final_8x16b_r67_1 = _mm_packs_epi32(i4_res_4x32b_r7_1, i4_res_4x32b_r8_1);

    /* Interleave even (_0) and odd (_1) columns back into pixel order.
     * Each 16-bit lane now holds a value in [0,255] in its low byte, so it
     * doubles as the interleaved-chroma byte layout. */
    i4_res_final_8x16b_r1 = _mm_unpacklo_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
    i4_res_final_8x16b_r2 = _mm_unpackhi_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
    i4_res_final_8x16b_r3 = _mm_unpacklo_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
    i4_res_final_8x16b_r4 = _mm_unpackhi_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
    i4_res_final_8x16b_r5 = _mm_unpacklo_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
    i4_res_final_8x16b_r6 = _mm_unpackhi_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
    i4_res_final_8x16b_r7 = _mm_unpacklo_epi16(i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1);
    i4_res_final_8x16b_r8 = _mm_unpackhi_epi16(i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1);

    /* Merge with the interleaved output: keep the other chroma component
     * (odd bytes, mask 0xFF00) and insert our results (even bytes, 0x00FF) */
    chroma_mask = _mm_set1_epi16(0xFF00);
    chroma_mask2 = _mm_set1_epi16(0x00FF);
    out_16x8b_r1 = _mm_loadu_si128((__m128i *) (&pu1_out[0]));
    out_16x8b_r2 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride]));
    out_16x8b_r3 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2]));
    out_16x8b_r4 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2 + i4_dst_stride]));
    out_16x8b_r5 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4]));
    out_16x8b_r6 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride]));
    out_16x8b_r7 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2]));
    out_16x8b_r8 =
        _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2 + i4_dst_stride]));

    out_16x8b_r1 = _mm_and_si128(out_16x8b_r1, chroma_mask);
    out_16x8b_r2 = _mm_and_si128(out_16x8b_r2, chroma_mask);
    out_16x8b_r3 = _mm_and_si128(out_16x8b_r3, chroma_mask);
    out_16x8b_r4 = _mm_and_si128(out_16x8b_r4, chroma_mask);
    out_16x8b_r5 = _mm_and_si128(out_16x8b_r5, chroma_mask);
    out_16x8b_r6 = _mm_and_si128(out_16x8b_r6, chroma_mask);
    out_16x8b_r7 = _mm_and_si128(out_16x8b_r7, chroma_mask);
    out_16x8b_r8 = _mm_and_si128(out_16x8b_r8, chroma_mask);

    i4_res_final_8x16b_r1 = _mm_and_si128(i4_res_final_8x16b_r1, chroma_mask2);
    i4_res_final_8x16b_r2 = _mm_and_si128(i4_res_final_8x16b_r2, chroma_mask2);
    i4_res_final_8x16b_r3 = _mm_and_si128(i4_res_final_8x16b_r3, chroma_mask2);
    i4_res_final_8x16b_r4 = _mm_and_si128(i4_res_final_8x16b_r4, chroma_mask2);
    i4_res_final_8x16b_r5 = _mm_and_si128(i4_res_final_8x16b_r5, chroma_mask2);
    i4_res_final_8x16b_r6 = _mm_and_si128(i4_res_final_8x16b_r6, chroma_mask2);
    i4_res_final_8x16b_r7 = _mm_and_si128(i4_res_final_8x16b_r7, chroma_mask2);
    i4_res_final_8x16b_r8 = _mm_and_si128(i4_res_final_8x16b_r8, chroma_mask2);

    /* Disjoint byte lanes, so add == bitwise OR here */
    out_16x8b_r1 = _mm_add_epi8(i4_res_final_8x16b_r1, out_16x8b_r1);
    out_16x8b_r2 = _mm_add_epi8(i4_res_final_8x16b_r2, out_16x8b_r2);
    out_16x8b_r3 = _mm_add_epi8(i4_res_final_8x16b_r3, out_16x8b_r3);
    out_16x8b_r4 = _mm_add_epi8(i4_res_final_8x16b_r4, out_16x8b_r4);
    out_16x8b_r5 = _mm_add_epi8(i4_res_final_8x16b_r5, out_16x8b_r5);
    out_16x8b_r6 = _mm_add_epi8(i4_res_final_8x16b_r6, out_16x8b_r6);
    out_16x8b_r7 = _mm_add_epi8(i4_res_final_8x16b_r7, out_16x8b_r7);
    out_16x8b_r8 = _mm_add_epi8(i4_res_final_8x16b_r8, out_16x8b_r8);

    _mm_storeu_si128((__m128i *) pu1_out, out_16x8b_r1);
    _mm_storeu_si128((__m128i *) (pu1_out + i4_dst_stride), out_16x8b_r2);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride << 1)), out_16x8b_r3);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 3)), out_16x8b_r4);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride << 2)), out_16x8b_r5);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 5)), out_16x8b_r6);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 6)), out_16x8b_r7);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 7)), out_16x8b_r8);
    /* End of loop over x */
} /* isvcd_horz_interpol_chroma_dyadic_1_sse42 */
|
||
|
|
|
||
|
|
/*****************************************************************************/
|
||
|
|
/* */
|
||
|
|
/* Function Name : isvcd_horz_interpol_chroma_dyadic_2_sse42 */
|
||
|
|
/* */
|
||
|
|
/* Description : This function takes the reference array buffer & performs*/
|
||
|
|
/*                     horizontal intra resampling for dyadic scaling ratios */
|
||
|
|
/* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
|
||
|
|
/* chroma_phase_y_plus1: */
|
||
|
|
/* ref_lyr cur_lyr */
|
||
|
|
/* 2 0 */
|
||
|
|
/* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */
|
||
|
|
/* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the */
|
||
|
|
/* vertically interpolated data */
|
||
|
|
/*                     i4_phase_0 : x phase for even values of x            */
|
||
|
|
/*                     i4_phase_1 : x phase for odd values of x             */
|
||
|
|
/* Globals : none */
|
||
|
|
/*  Processing       : it does the interpolation in horizontal direction    */
|
||
|
|
/*  Outputs          : horizontally resampled samples                        */
|
||
|
|
/* Returns : none */
|
||
|
|
/* */
|
||
|
|
/* Issues : none */
|
||
|
|
/* */
|
||
|
|
/* Revision History: */
|
||
|
|
/* */
|
||
|
|
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
|
||
|
|
/* 21 05 2021 Dolan creation */
|
||
|
|
/* */
|
||
|
|
/*****************************************************************************/
|
||
|
|
void isvcd_horz_interpol_chroma_dyadic_2_sse42(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
|
||
|
|
WORD32 i4_out_stride, WORD32 i4_phase_0,
|
||
|
|
WORD32 i4_phase_1)
|
||
|
|
{
|
||
|
|
WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
|
||
|
|
WORD32 i4_dst_stride, i4_dst_stride2, i4_dst_stride4;
|
||
|
|
UWORD8 *pu1_out;
|
||
|
|
WORD16 *pi2_tmp;
|
||
|
|
|
||
|
|
__m128i i4_samp_8x16b_r1_0, i4_samp_8x16b_r1_1;
|
||
|
|
__m128i i4_samp_8x16b_r2_0, i4_samp_8x16b_r2_1;
|
||
|
|
__m128i i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1;
|
||
|
|
__m128i i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1;
|
||
|
|
__m128i i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1;
|
||
|
|
__m128i i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1;
|
||
|
|
__m128i i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1;
|
||
|
|
__m128i i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1;
|
||
|
|
|
||
|
|
__m128i i4_res_4x32b_r1_0, i4_res_4x32b_r1_1;
|
||
|
|
__m128i i4_res_4x32b_r2_0, i4_res_4x32b_r2_1;
|
||
|
|
__m128i i4_res_4x32b_r3_0, i4_res_4x32b_r3_1;
|
||
|
|
__m128i i4_res_4x32b_r4_0, i4_res_4x32b_r4_1;
|
||
|
|
__m128i i4_res_4x32b_r5_0, i4_res_4x32b_r5_1;
|
||
|
|
__m128i i4_res_4x32b_r6_0, i4_res_4x32b_r6_1;
|
||
|
|
__m128i i4_res_4x32b_r7_0, i4_res_4x32b_r7_1;
|
||
|
|
__m128i i4_res_4x32b_r8_0, i4_res_4x32b_r8_1;
|
||
|
|
|
||
|
|
__m128i i4_res_final_8x16b_r1;
|
||
|
|
__m128i i4_res_final_8x16b_r2;
|
||
|
|
__m128i i4_res_final_8x16b_r3;
|
||
|
|
__m128i i4_res_final_8x16b_r4;
|
||
|
|
__m128i i4_res_final_8x16b_r5;
|
||
|
|
__m128i i4_res_final_8x16b_r6;
|
||
|
|
__m128i i4_res_final_8x16b_r7;
|
||
|
|
__m128i i4_res_final_8x16b_r8;
|
||
|
|
|
||
|
|
__m128i out_16x8b_r1;
|
||
|
|
__m128i out_16x8b_r2;
|
||
|
|
__m128i out_16x8b_r3;
|
||
|
|
__m128i out_16x8b_r4;
|
||
|
|
__m128i out_16x8b_r5;
|
||
|
|
__m128i out_16x8b_r6;
|
||
|
|
__m128i out_16x8b_r7;
|
||
|
|
__m128i out_16x8b_r8;
|
||
|
|
__m128i i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1;
|
||
|
|
__m128i i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1;
|
||
|
|
__m128i i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1;
|
||
|
|
__m128i i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1;
|
||
|
|
__m128i chroma_mask, chroma_mask2;
|
||
|
|
__m128i coeff_c0_c1_8x16b, coeff_c2_c3_8x16b, res_32;
|
||
|
|
|
||
|
|
i4_coeff_0 = 8 - i4_phase_0;
|
||
|
|
i4_coeff_1 = i4_phase_0;
|
||
|
|
i4_coeff_2 = 8 - i4_phase_1;
|
||
|
|
i4_coeff_3 = i4_phase_1;
|
||
|
|
coeff_c0_c1_8x16b = _mm_set_epi16(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1,
|
||
|
|
i4_coeff_0, i4_coeff_1, i4_coeff_0);
|
||
|
|
coeff_c2_c3_8x16b = _mm_set_epi16(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3,
|
||
|
|
i4_coeff_2, i4_coeff_3, i4_coeff_2);
|
||
|
|
res_32 = _mm_set1_epi32(32);
|
||
|
|
pu1_out = pu1_out_buf;
|
||
|
|
pi2_tmp = pi2_tmp_filt_buf + 1;
|
||
|
|
i4_dst_stride = i4_out_stride;
|
||
|
|
|
||
|
|
i4_dst_stride2 = i4_dst_stride << 1;
|
||
|
|
i4_dst_stride4 = i4_dst_stride << 2;
|
||
|
|
|
||
|
|
/* Horizontal interpolation */
|
||
|
|
/* x = 0, x_phase = phase_0 */
|
||
|
|
i4_samp_8x16b_r1_0 = _mm_loadu_si128((__m128i *) pi2_tmp); // a0 a1 a2 a3 a4 a5 a6 a7
|
||
|
|
i4_samp_8x16b_r2_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 6)); // b0 b1 b2 b3 b4 b5 b6 b7
|
||
|
|
i4_samp_8x16b_r3_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 12)); // b0 b1 b2 b3 b4 b5 b6 b7
|
||
|
|
i4_samp_8x16b_r4_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 18)); // b0 b1 b2 b3 b4 b5 b6 b7
|
||
|
|
i4_samp_8x16b_r5_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 24)); // b0 b1 b2 b3 b4 b5 b6 b7
|
||
|
|
i4_samp_8x16b_r6_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 30)); // b0 b1 b2 b3 b4 b5 b6 b7
|
||
|
|
i4_samp_8x16b_r7_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 36)); // b0 b1 b2 b3 b4 b5 b6 b7
|
||
|
|
i4_samp_8x16b_r8_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 42)); // b0 b1 b2 b3 b4 b5 b6 b7
|
||
|
|
|
||
|
|
i4_samp_8x16b_r1_1 = _mm_srli_si128(i4_samp_8x16b_r1_0, 2); // a1 a2 a3 a4 a5 a6 a7 0
|
||
|
|
i4_samp_8x16b_r2_1 = _mm_srli_si128(i4_samp_8x16b_r2_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
|
||
|
|
i4_samp_8x16b_r3_1 = _mm_srli_si128(i4_samp_8x16b_r3_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
|
||
|
|
i4_samp_8x16b_r4_1 = _mm_srli_si128(i4_samp_8x16b_r4_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
|
||
|
|
i4_samp_8x16b_r5_1 = _mm_srli_si128(i4_samp_8x16b_r5_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
|
||
|
|
i4_samp_8x16b_r6_1 = _mm_srli_si128(i4_samp_8x16b_r6_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
|
||
|
|
i4_samp_8x16b_r7_1 = _mm_srli_si128(i4_samp_8x16b_r7_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
|
||
|
|
i4_samp_8x16b_r8_1 = _mm_srli_si128(i4_samp_8x16b_r8_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
|
||
|
|
|
||
|
|
i4_samp_8x16b_r1_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r1_0,
|
||
|
|
i4_samp_8x16b_r1_1); // a0 a1 a1 a2 a2 a3 a3 a4
|
||
|
|
i4_samp_8x16b_r2_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r2_0,
|
||
|
|
i4_samp_8x16b_r2_1); // b0 b1 b1 b2 b2 b3 b3 b4
|
||
|
|
i4_samp_8x16b_r3_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1);
|
||
|
|
i4_samp_8x16b_r4_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1);
|
||
|
|
i4_samp_8x16b_r5_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1);
|
||
|
|
i4_samp_8x16b_r6_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1);
|
||
|
|
i4_samp_8x16b_r7_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1);
|
||
|
|
i4_samp_8x16b_r8_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1);
|
||
|
|
|
||
|
|
// a0c0+a1c1 a1c0+a2c1 a2c0+a3c1 a3c0+a4c1
|
||
|
|
i4_res_4x32b_r1_0 = _mm_madd_epi16(i4_samp_8x16b_r1_0, coeff_c0_c1_8x16b);
|
||
|
|
// b0c0+b1c1 b1c0+b2c1 b2c0+b3c1 b3c0+b4c1
|
||
|
|
i4_res_4x32b_r2_0 = _mm_madd_epi16(i4_samp_8x16b_r2_0, coeff_c0_c1_8x16b);
|
||
|
|
i4_res_4x32b_r3_0 = _mm_madd_epi16(i4_samp_8x16b_r3_0, coeff_c0_c1_8x16b);
|
||
|
|
i4_res_4x32b_r4_0 = _mm_madd_epi16(i4_samp_8x16b_r4_0, coeff_c0_c1_8x16b);
|
||
|
|
i4_res_4x32b_r5_0 = _mm_madd_epi16(i4_samp_8x16b_r5_0, coeff_c0_c1_8x16b);
|
||
|
|
i4_res_4x32b_r6_0 = _mm_madd_epi16(i4_samp_8x16b_r6_0, coeff_c0_c1_8x16b);
|
||
|
|
i4_res_4x32b_r7_0 = _mm_madd_epi16(i4_samp_8x16b_r7_0, coeff_c0_c1_8x16b);
|
||
|
|
i4_res_4x32b_r8_0 = _mm_madd_epi16(i4_samp_8x16b_r8_0, coeff_c0_c1_8x16b);
|
||
|
|
|
||
|
|
// a1c2+a2c3 a2c2+a3c3 a3c2+a4c3 a4c2+a5c3
|
||
|
|
i4_res_4x32b_r1_1 = _mm_madd_epi16(i4_samp_8x16b_r1_0, coeff_c2_c3_8x16b);
|
||
|
|
// b1c2+b2c3 b2c2+b3c3 b3c2+b4c3 b4c2+b5c3
|
||
|
|
i4_res_4x32b_r2_1 = _mm_madd_epi16(i4_samp_8x16b_r2_0, coeff_c2_c3_8x16b);
|
||
|
|
i4_res_4x32b_r3_1 = _mm_madd_epi16(i4_samp_8x16b_r3_0, coeff_c2_c3_8x16b);
|
||
|
|
i4_res_4x32b_r4_1 = _mm_madd_epi16(i4_samp_8x16b_r4_0, coeff_c2_c3_8x16b);
|
||
|
|
i4_res_4x32b_r5_1 = _mm_madd_epi16(i4_samp_8x16b_r5_0, coeff_c2_c3_8x16b);
|
||
|
|
i4_res_4x32b_r6_1 = _mm_madd_epi16(i4_samp_8x16b_r6_0, coeff_c2_c3_8x16b);
|
||
|
|
i4_res_4x32b_r7_1 = _mm_madd_epi16(i4_samp_8x16b_r7_0, coeff_c2_c3_8x16b);
|
||
|
|
i4_res_4x32b_r8_1 = _mm_madd_epi16(i4_samp_8x16b_r8_0, coeff_c2_c3_8x16b);
|
||
|
|
|
||
|
|
i4_res_4x32b_r1_0 = _mm_add_epi32(i4_res_4x32b_r1_0, res_32);
|
||
|
|
i4_res_4x32b_r2_0 = _mm_add_epi32(i4_res_4x32b_r2_0, res_32);
|
||
|
|
i4_res_4x32b_r3_0 = _mm_add_epi32(i4_res_4x32b_r3_0, res_32);
|
||
|
|
i4_res_4x32b_r4_0 = _mm_add_epi32(i4_res_4x32b_r4_0, res_32);
|
||
|
|
i4_res_4x32b_r5_0 = _mm_add_epi32(i4_res_4x32b_r5_0, res_32);
|
||
|
|
i4_res_4x32b_r6_0 = _mm_add_epi32(i4_res_4x32b_r6_0, res_32);
|
||
|
|
i4_res_4x32b_r7_0 = _mm_add_epi32(i4_res_4x32b_r7_0, res_32);
|
||
|
|
i4_res_4x32b_r8_0 = _mm_add_epi32(i4_res_4x32b_r8_0, res_32);
|
||
|
|
|
||
|
|
i4_res_4x32b_r1_1 = _mm_add_epi32(i4_res_4x32b_r1_1, res_32);
|
||
|
|
i4_res_4x32b_r2_1 = _mm_add_epi32(i4_res_4x32b_r2_1, res_32);
|
||
|
|
i4_res_4x32b_r3_1 = _mm_add_epi32(i4_res_4x32b_r3_1, res_32);
|
||
|
|
i4_res_4x32b_r4_1 = _mm_add_epi32(i4_res_4x32b_r4_1, res_32);
|
||
|
|
i4_res_4x32b_r5_1 = _mm_add_epi32(i4_res_4x32b_r5_1, res_32);
|
||
|
|
i4_res_4x32b_r6_1 = _mm_add_epi32(i4_res_4x32b_r6_1, res_32);
|
||
|
|
i4_res_4x32b_r7_1 = _mm_add_epi32(i4_res_4x32b_r7_1, res_32);
|
||
|
|
i4_res_4x32b_r8_1 = _mm_add_epi32(i4_res_4x32b_r8_1, res_32);
|
||
|
|
|
||
|
|
i4_res_4x32b_r1_0 = _mm_srai_epi32(i4_res_4x32b_r1_0, 6);
|
||
|
|
i4_res_4x32b_r2_0 = _mm_srai_epi32(i4_res_4x32b_r2_0, 6);
|
||
|
|
i4_res_4x32b_r3_0 = _mm_srai_epi32(i4_res_4x32b_r3_0, 6);
|
||
|
|
i4_res_4x32b_r4_0 = _mm_srai_epi32(i4_res_4x32b_r4_0, 6);
|
||
|
|
i4_res_4x32b_r5_0 = _mm_srai_epi32(i4_res_4x32b_r5_0, 6);
|
||
|
|
i4_res_4x32b_r6_0 = _mm_srai_epi32(i4_res_4x32b_r6_0, 6);
|
||
|
|
i4_res_4x32b_r7_0 = _mm_srai_epi32(i4_res_4x32b_r7_0, 6);
|
||
|
|
i4_res_4x32b_r8_0 = _mm_srai_epi32(i4_res_4x32b_r8_0, 6);
|
||
|
|
|
||
|
|
i4_res_4x32b_r1_1 = _mm_srai_epi32(i4_res_4x32b_r1_1, 6);
|
||
|
|
i4_res_4x32b_r2_1 = _mm_srai_epi32(i4_res_4x32b_r2_1, 6);
|
||
|
|
i4_res_4x32b_r3_1 = _mm_srai_epi32(i4_res_4x32b_r3_1, 6);
|
||
|
|
i4_res_4x32b_r4_1 = _mm_srai_epi32(i4_res_4x32b_r4_1, 6);
|
||
|
|
i4_res_4x32b_r5_1 = _mm_srai_epi32(i4_res_4x32b_r5_1, 6);
|
||
|
|
i4_res_4x32b_r6_1 = _mm_srai_epi32(i4_res_4x32b_r6_1, 6);
|
||
|
|
i4_res_4x32b_r7_1 = _mm_srai_epi32(i4_res_4x32b_r7_1, 6);
|
||
|
|
i4_res_4x32b_r8_1 = _mm_srai_epi32(i4_res_4x32b_r8_1, 6);
|
||
|
|
|
||
|
|
i4_res_final_8x16b_r12_0 = _mm_packs_epi32(i4_res_4x32b_r1_0, i4_res_4x32b_r2_0);
|
||
|
|
i4_res_final_8x16b_r34_0 = _mm_packs_epi32(i4_res_4x32b_r3_0, i4_res_4x32b_r4_0);
|
||
|
|
i4_res_final_8x16b_r56_0 = _mm_packs_epi32(i4_res_4x32b_r5_0, i4_res_4x32b_r6_0);
|
||
|
|
i4_res_final_8x16b_r67_0 = _mm_packs_epi32(i4_res_4x32b_r7_0, i4_res_4x32b_r8_0);
|
||
|
|
|
||
|
|
i4_res_final_8x16b_r12_1 = _mm_packs_epi32(i4_res_4x32b_r1_1, i4_res_4x32b_r2_1);
|
||
|
|
i4_res_final_8x16b_r34_1 = _mm_packs_epi32(i4_res_4x32b_r3_1, i4_res_4x32b_r4_1);
|
||
|
|
i4_res_final_8x16b_r56_1 = _mm_packs_epi32(i4_res_4x32b_r5_1, i4_res_4x32b_r6_1);
|
||
|
|
i4_res_final_8x16b_r67_1 = _mm_packs_epi32(i4_res_4x32b_r7_1, i4_res_4x32b_r8_1);
|
||
|
|
|
||
|
|
i4_res_final_8x16b_r1 = _mm_unpacklo_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
|
||
|
|
i4_res_final_8x16b_r2 = _mm_unpackhi_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
|
||
|
|
i4_res_final_8x16b_r3 = _mm_unpacklo_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
|
||
|
|
i4_res_final_8x16b_r4 = _mm_unpackhi_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
|
||
|
|
i4_res_final_8x16b_r5 = _mm_unpacklo_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
|
||
|
|
i4_res_final_8x16b_r6 = _mm_unpackhi_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
|
||
|
|
i4_res_final_8x16b_r7 = _mm_unpacklo_epi16(i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1);
|
||
|
|
i4_res_final_8x16b_r8 = _mm_unpackhi_epi16(i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1);
|
||
|
|
|
||
|
|
chroma_mask = _mm_set1_epi16(0xFF00);
|
||
|
|
chroma_mask2 = _mm_set1_epi16(0x00FF);
|
||
|
|
out_16x8b_r1 = _mm_loadu_si128((__m128i *) (&pu1_out[0]));
|
||
|
|
out_16x8b_r2 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride]));
|
||
|
|
out_16x8b_r3 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2]));
|
||
|
|
out_16x8b_r4 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2 + i4_dst_stride]));
|
||
|
|
out_16x8b_r5 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4]));
|
||
|
|
out_16x8b_r6 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride]));
|
||
|
|
out_16x8b_r7 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2]));
|
||
|
|
out_16x8b_r8 =
|
||
|
|
_mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2 + i4_dst_stride]));
|
||
|
|
|
||
|
|
out_16x8b_r1 = _mm_and_si128(out_16x8b_r1, chroma_mask);
|
||
|
|
out_16x8b_r2 = _mm_and_si128(out_16x8b_r2, chroma_mask);
|
||
|
|
out_16x8b_r3 = _mm_and_si128(out_16x8b_r3, chroma_mask);
|
||
|
|
out_16x8b_r4 = _mm_and_si128(out_16x8b_r4, chroma_mask);
|
||
|
|
out_16x8b_r5 = _mm_and_si128(out_16x8b_r5, chroma_mask);
|
||
|
|
out_16x8b_r6 = _mm_and_si128(out_16x8b_r6, chroma_mask);
|
||
|
|
out_16x8b_r7 = _mm_and_si128(out_16x8b_r7, chroma_mask);
|
||
|
|
out_16x8b_r8 = _mm_and_si128(out_16x8b_r8, chroma_mask);
|
||
|
|
|
||
|
|
i4_res_final_8x16b_r1 = _mm_and_si128(i4_res_final_8x16b_r1, chroma_mask2);
|
||
|
|
i4_res_final_8x16b_r2 = _mm_and_si128(i4_res_final_8x16b_r2, chroma_mask2);
|
||
|
|
i4_res_final_8x16b_r3 = _mm_and_si128(i4_res_final_8x16b_r3, chroma_mask2);
|
||
|
|
i4_res_final_8x16b_r4 = _mm_and_si128(i4_res_final_8x16b_r4, chroma_mask2);
|
||
|
|
i4_res_final_8x16b_r5 = _mm_and_si128(i4_res_final_8x16b_r5, chroma_mask2);
|
||
|
|
i4_res_final_8x16b_r6 = _mm_and_si128(i4_res_final_8x16b_r6, chroma_mask2);
|
||
|
|
i4_res_final_8x16b_r7 = _mm_and_si128(i4_res_final_8x16b_r7, chroma_mask2);
|
||
|
|
i4_res_final_8x16b_r8 = _mm_and_si128(i4_res_final_8x16b_r8, chroma_mask2);
|
||
|
|
|
||
|
|
out_16x8b_r1 = _mm_add_epi8(i4_res_final_8x16b_r1, out_16x8b_r1);
|
||
|
|
out_16x8b_r2 = _mm_add_epi8(i4_res_final_8x16b_r2, out_16x8b_r2);
|
||
|
|
out_16x8b_r3 = _mm_add_epi8(i4_res_final_8x16b_r3, out_16x8b_r3);
|
||
|
|
out_16x8b_r4 = _mm_add_epi8(i4_res_final_8x16b_r4, out_16x8b_r4);
|
||
|
|
out_16x8b_r5 = _mm_add_epi8(i4_res_final_8x16b_r5, out_16x8b_r5);
|
||
|
|
out_16x8b_r6 = _mm_add_epi8(i4_res_final_8x16b_r6, out_16x8b_r6);
|
||
|
|
out_16x8b_r7 = _mm_add_epi8(i4_res_final_8x16b_r7, out_16x8b_r7);
|
||
|
|
out_16x8b_r8 = _mm_add_epi8(i4_res_final_8x16b_r8, out_16x8b_r8);
|
||
|
|
|
||
|
|
_mm_storeu_si128((__m128i *) pu1_out, out_16x8b_r1);
|
||
|
|
_mm_storeu_si128((__m128i *) (pu1_out + i4_dst_stride), out_16x8b_r2);
|
||
|
|
_mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride << 1)), out_16x8b_r3);
|
||
|
|
_mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 3)), out_16x8b_r4);
|
||
|
|
_mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride << 2)), out_16x8b_r5);
|
||
|
|
_mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 5)), out_16x8b_r6);
|
||
|
|
_mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 6)), out_16x8b_r7);
|
||
|
|
_mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 7)), out_16x8b_r8);
|
||
|
|
|
||
|
|
/* End of loop over x */
|
||
|
|
}
|