/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvcd_intra_resamp_neonintr.c
 *
 * @brief
 *  Contains the NEON intrinsics routines for SVC intra resampling
 *
 * @author
 *  Kishore
 *
 * @par List of Functions:
 *  - isvcd_interpolate_base_luma_dyadic_neonintr()
 *  - isvcd_interpolate_intra_base_neonintr()
 *  - isvcd_horz_interpol_chroma_dyadic_1_neonintr()
 *  - isvcd_horz_interpol_chroma_dyadic_2_neonintr()
 *  - isvcd_vert_interpol_chroma_dyadic_1_neonintr()
 *  - isvcd_vert_interpol_chroma_dyadic_2_neonintr()
 *  - isvcd_vert_interpol_chroma_dyadic_3_neonintr()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

/* System include files (restored; the original bracketed header names were   */
/* lost -- <arm_neon.h> is required by the intrinsics below, the others are   */
/* the usual system headers assumed here)                                     */
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <arm_neon.h>

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "isvcd_structs.h"
#include "ih264_debug.h"

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_interpolate_base_luma_dyadic_neonintr              */
/*                                                                           */
/*  Description   : This function takes the reference array buffer & performs*/
/*                  intra resampling for dyadic scaling ratios               */
/*  Inputs        : pu1_inp_buf : ptr to the 12x12 reference sample buffer   */
/*                  pi2_tmp_filt_buf : ptr to the 12x16 buffer to hold the   */
/*                      vertically interpolated data                         */
/*                  pu1_out_buf : output buffer pointer                      */
/*                  i4_out_stride : output buffer stride                     */
/*  Globals       : none                                                     */
/*  Processing    : it does the interpolation in vertical direction followed */
/*                  by horizontal direction                                  */
/*  Outputs       : resampled pixels                                         */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         21 05 2021   Dolan           creation                             */
/*                                                                           */
/*****************************************************************************/
void isvcd_interpolate_base_luma_dyadic_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                                 UWORD8 *pu1_out_buf, WORD32 i4_out_stride)
{
    WORD32 i4_y;
    WORD16 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_filt_stride, i4_src_stride;
    UWORD8 *pu1_inp, *pu1_out;
    WORD16 *pi2_tmp;

    int16x4_t i4_rslt_vert_16x4_1, i4_rslt_vert_16x4_2;
    uint8x8_t i4_samp_vert_8x8_0, i4_samp_vert_8x8_1, i4_samp_vert_8x8_2, i4_samp_vert_8x8_3;
    int16x8_t i4_rslt_vert_16x8_0, i4_rslt_vert_16x8_2;

    /* Horizontal interpolation */
    int32x4_t const_512_32x4 = vdupq_n_s32(512);
    int32x4_t i4_rslt_horz_r0_1, i4_rslt_horz_r1_1, i4_rslt_horz_r0_2, i4_rslt_horz_r1_2;
    uint16x4_t i4_rslt_horz_r0_1_tmp, i4_rslt_horz_r1_1_tmp, i4_rslt_horz_r0_2_tmp,
        i4_rslt_horz_r1_2_tmp;
    uint16x8_t rslt_16x8_t_1, rslt_16x8_t_2;
    int32x4x2_t i4_rslt_horz_32x4x2_t;
    int16x4_t i4_samp_horz_16x4_0, i4_samp_horz_16x4_1,
i4_samp_horz_16x4_2, i4_samp_horz_16x4_3, i4_samp_horz_16x4_4; int16x4_t i4_samp_horz_16x4_5, i4_samp_horz_16x4_6, i4_samp_horz_16x4_7, i4_samp_horz_16x4_8; int16_t i4_coeff_c0 = -3; int16_t i4_coeff_c1 = 28; int16_t i4_coeff_c2 = 8; int16_t i4_coeff_c3 = -1; int32x4_t i4_rslt_horz_r0_1_tmp32, i4_rslt_horz_r1_1_tmp32, i4_rslt_horz_r0_2_tmp32, i4_rslt_horz_r1_2_tmp32; /* Filter coefficient values for phase 4 */ i4_coeff_0 = -3; i4_coeff_1 = 28; i4_coeff_2 = 8; i4_coeff_3 = -1; i4_filt_stride = 12; i4_src_stride = DYADIC_REF_W_Y; pu1_inp = pu1_inp_buf; pi2_tmp = pi2_tmp_filt_buf; pu1_out = pu1_out_buf; /* Vertical interpolation */ // First 64 bits i4_samp_vert_8x8_0 = vld1_u8((const uint8_t *) pu1_inp); pu1_inp += i4_src_stride; i4_samp_vert_8x8_1 = vld1_u8((const uint8_t *) pu1_inp); pu1_inp += i4_src_stride; i4_samp_vert_8x8_2 = vld1_u8((const uint8_t *) pu1_inp); pu1_inp += i4_src_stride; i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp); pu1_inp += i4_src_stride; i4_rslt_vert_16x8_0 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_3); i4_rslt_vert_16x8_0 = vmlaq_n_s16( i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_2); i4_rslt_vert_16x8_0 = vmlaq_n_s16( i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_1); i4_rslt_vert_16x8_0 = vmlaq_n_s16( i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_0); vst1q_s16(pi2_tmp, i4_rslt_vert_16x8_0); pi2_tmp += i4_filt_stride; for(i4_y = 1; i4_y < 15; i4_y += 2) { i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1; i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2; i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3; i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp); i4_rslt_vert_16x8_0 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_0); i4_rslt_vert_16x8_0 = vmlaq_n_s16( i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_1); i4_rslt_vert_16x8_0 = vmlaq_n_s16( i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_2); i4_rslt_vert_16x8_0 = vmlaq_n_s16( i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_3); i4_rslt_vert_16x8_2 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_3); i4_rslt_vert_16x8_2 = vmlaq_n_s16( i4_rslt_vert_16x8_2, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_2); i4_rslt_vert_16x8_2 = vmlaq_n_s16( i4_rslt_vert_16x8_2, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_1); i4_rslt_vert_16x8_2 = vmlaq_n_s16( i4_rslt_vert_16x8_2, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_0); /* Storing the results */ vst1q_s16(pi2_tmp, (i4_rslt_vert_16x8_0)); pi2_tmp += i4_filt_stride; vst1q_s16(pi2_tmp, (i4_rslt_vert_16x8_2)); pi2_tmp += i4_filt_stride; pu1_inp += i4_src_stride; } /*End of Loop over y*/ /* y = 15, y_phase = 4 */ i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1; i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2; i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3; i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp); i4_rslt_vert_16x8_0 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_0); i4_rslt_vert_16x8_0 = vmlaq_n_s16( i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_1); i4_rslt_vert_16x8_0 = vmlaq_n_s16( i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_2); i4_rslt_vert_16x8_0 = vmlaq_n_s16( i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), 
i4_coeff_3); vst1q_s16(pi2_tmp, (i4_rslt_vert_16x8_0)); /* End of loop over x */ // Remaining 32 bits pu1_inp = pu1_inp_buf + 8; pi2_tmp = pi2_tmp_filt_buf + 8; i4_samp_vert_8x8_0 = vld1_u8((const uint8_t *) pu1_inp); pu1_inp += i4_src_stride; i4_samp_vert_8x8_1 = vld1_u8((const uint8_t *) pu1_inp); pu1_inp += i4_src_stride; i4_samp_vert_8x8_2 = vld1_u8((const uint8_t *) pu1_inp); pu1_inp += i4_src_stride; i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp); pu1_inp += i4_src_stride; i4_rslt_vert_16x4_1 = vmul_n_s16(vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_3); i4_rslt_vert_16x4_1 = vmla_n_s16(i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))), i4_coeff_2); i4_rslt_vert_16x4_1 = vmla_n_s16(i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))), i4_coeff_1); i4_rslt_vert_16x4_1 = vmla_n_s16(i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))), i4_coeff_0); vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_1)); pi2_tmp += i4_filt_stride; for(i4_y = 1; i4_y < 15; i4_y += 2) { i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1; i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2; i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3; i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp); i4_rslt_vert_16x4_1 = vmul_n_s16( vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_0); i4_rslt_vert_16x4_1 = vmla_n_s16( i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))), i4_coeff_1); i4_rslt_vert_16x4_1 = vmla_n_s16( i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))), i4_coeff_2); i4_rslt_vert_16x4_1 = vmla_n_s16( i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))), i4_coeff_3); i4_rslt_vert_16x4_2 = vmul_n_s16( vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_3); i4_rslt_vert_16x4_2 = vmla_n_s16( i4_rslt_vert_16x4_2, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))), i4_coeff_2); i4_rslt_vert_16x4_2 = vmla_n_s16( i4_rslt_vert_16x4_2, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))), i4_coeff_1); i4_rslt_vert_16x4_2 = vmla_n_s16( i4_rslt_vert_16x4_2, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))), i4_coeff_0); vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_1)); pi2_tmp += i4_filt_stride; vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_2)); pi2_tmp += i4_filt_stride; pu1_inp += i4_src_stride; } i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1; i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2; i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3; i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp); i4_rslt_vert_16x4_1 = vmul_n_s16(vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_0); i4_rslt_vert_16x4_1 = vmla_n_s16(i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))), i4_coeff_1); i4_rslt_vert_16x4_1 = vmla_n_s16(i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))), i4_coeff_2); i4_rslt_vert_16x4_1 = vmla_n_s16(i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))), i4_coeff_3); vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_1)); /* Reinitializing the ptrs */ pu1_inp = pu1_inp_buf; pi2_tmp = pi2_tmp_filt_buf; /* Horizontal interpolation */ for(i4_y = 0; i4_y < 16; i4_y++) { i4_samp_horz_16x4_0 = vld1_s16(pi2_tmp); i4_samp_horz_16x4_1 = vld1_s16(pi2_tmp + 1); i4_samp_horz_16x4_2 = vld1_s16(pi2_tmp + 2); i4_samp_horz_16x4_3 = vld1_s16(pi2_tmp + 3); 
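        /* Nine overlapping int16x4 loads (pi2_tmp + 0 .. pi2_tmp + 8) cover the 12   */
        /* vertically filtered samples of this row; the two interleaved 4-tap         */
        /* filterings below turn them into 16 horizontal outputs, which are then      */
        /* rounded with +512 and shifted right by 10 (the combined gain of the two    */
        /* passes).                                                                   */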
i4_samp_horz_16x4_4 = vld1_s16(pi2_tmp + 4); i4_samp_horz_16x4_5 = vld1_s16(pi2_tmp + 5); i4_samp_horz_16x4_6 = vld1_s16(pi2_tmp + 6); i4_samp_horz_16x4_7 = vld1_s16(pi2_tmp + 7); i4_samp_horz_16x4_8 = vld1_s16(pi2_tmp + 8); i4_rslt_horz_r0_1 = vmull_n_s16(i4_samp_horz_16x4_0, i4_coeff_c3); // a0c3 a1c3 a2c3 a3c3 i4_rslt_horz_r0_1 = vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_1, i4_coeff_c2); // a0c0+a1c1 a1c0+a2c1 a2c0+a3c1 a3c0+a4c1 i4_rslt_horz_r0_1 = vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_2, i4_coeff_c1); i4_rslt_horz_r0_1 = vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_3, i4_coeff_c0); i4_rslt_horz_r1_1 = vmull_n_s16(i4_samp_horz_16x4_1, i4_coeff_c0); // a0c0 a1c0 a2c0 a3c0 i4_rslt_horz_r1_1 = vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_2, i4_coeff_c1); // a0c0+a1c1 a1c0+a2c1 a2c0+a3c1 a3c0+a4c1 i4_rslt_horz_r1_1 = vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_3, i4_coeff_c2); i4_rslt_horz_r1_1 = vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_4, i4_coeff_c3); i4_rslt_horz_r0_2 = vmull_n_s16(i4_samp_horz_16x4_4, i4_coeff_c3); // a0c3 a1c3 a2c3 a3c3 i4_rslt_horz_r0_2 = vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_5, i4_coeff_c2); // a0c0+a1c1 a1c0+a2c1 a2c0+a3c1 a3c0+a4c1 i4_rslt_horz_r0_2 = vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_6, i4_coeff_c1); i4_rslt_horz_r0_2 = vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_7, i4_coeff_c0); i4_rslt_horz_r1_2 = vmull_n_s16(i4_samp_horz_16x4_5, i4_coeff_c0); // a0c0 a1c0 a2c0 a3c0 i4_rslt_horz_r1_2 = vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_6, i4_coeff_c1); // a0c0+a1c1 a1c0+a2c1 a2c0+a3c1 a3c0+a4c1 i4_rslt_horz_r1_2 = vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_7, i4_coeff_c2); i4_rslt_horz_r1_2 = vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_8, i4_coeff_c3); i4_rslt_horz_32x4x2_t = vzipq_s32(i4_rslt_horz_r0_1, i4_rslt_horz_r1_1); i4_rslt_horz_r0_1_tmp32 = i4_rslt_horz_32x4x2_t.val[0]; // 0 to 3 i4_rslt_horz_r1_1_tmp32 = i4_rslt_horz_32x4x2_t.val[1]; // 4 to 7 i4_rslt_horz_32x4x2_t = vzipq_s32(i4_rslt_horz_r0_2, i4_rslt_horz_r1_2); i4_rslt_horz_r0_2_tmp32 = i4_rslt_horz_32x4x2_t.val[0]; // 8 to 11 i4_rslt_horz_r1_2_tmp32 = i4_rslt_horz_32x4x2_t.val[1]; // 12 to 15 i4_rslt_horz_r0_1 = vaddq_s32(i4_rslt_horz_r0_1_tmp32, const_512_32x4); i4_rslt_horz_r1_1 = vaddq_s32(i4_rslt_horz_r1_1_tmp32, const_512_32x4); i4_rslt_horz_r0_2 = vaddq_s32(i4_rslt_horz_r0_2_tmp32, const_512_32x4); i4_rslt_horz_r1_2 = vaddq_s32(i4_rslt_horz_r1_2_tmp32, const_512_32x4); i4_rslt_horz_r0_1_tmp = vqshrun_n_s32(i4_rslt_horz_r0_1, 10); i4_rslt_horz_r1_1_tmp = vqshrun_n_s32(i4_rslt_horz_r1_1, 10); i4_rslt_horz_r0_2_tmp = vqshrun_n_s32(i4_rslt_horz_r0_2, 10); i4_rslt_horz_r1_2_tmp = vqshrun_n_s32(i4_rslt_horz_r1_2, 10); rslt_16x8_t_1 = vcombine_u16(i4_rslt_horz_r0_1_tmp, i4_rslt_horz_r1_1_tmp); // 0 to 7 rslt_16x8_t_2 = vcombine_u16(i4_rslt_horz_r0_2_tmp, i4_rslt_horz_r1_2_tmp); // 8 to 15 vst1_u8(pu1_out, vqmovn_u16(rslt_16x8_t_1)); vst1_u8(pu1_out + 8, vqmovn_u16(rslt_16x8_t_2)); pu1_out += i4_out_stride; pi2_tmp += i4_filt_stride; } } /*****************************************************************************/ /* */ /* Function Name : isvcd_interpolate_intra_base_neonintr */ /* */ /* Description : This function takes the reference array buffer & performs*/ /* interpolation of a component to find the intra */ /* resampled value */ /* Inputs : pv_intra_samp_ctxt : intra sampling context */ /* pu1_out : output buffer pointer */ /* i4_out_stride : output buffer stride */ /* i4_refarray_wd : reference array width */ /* 
i4_x_offset : offset in reference layer in horz direction*/ /* ps_coord : current mb co-ordinate */ /* i4_chroma_flag : chroma processing flag */ /* Globals : none */ /* Processing : it does the interpolation in vertical direction followed */ /* by horizontal direction */ /* Outputs : resampled pixels */ /* Returns : none */ /* */ /* Issues : none */ /* */ /* Revision History: */ /* */ /* DD MM YYYY Author(s) Changes (Describe the changes made) */ /* 26 06 2009 vijayakumar creation */ /* */ /*****************************************************************************/ void isvcd_interpolate_intra_base_neonintr(void *pv_intra_samp_ctxt, UWORD8 *pu1_out, WORD32 i4_out_stride, WORD32 i4_refarray_wd, WORD32 i4_mb_x, WORD32 i4_mb_y, WORD32 i4_chroma_flag, WORD32 i4_refarray_flag) { /* --------------------------------------------------------------------- */ /* Index Parameters */ /* --------------------------------------------------------------------- */ intra_sampling_ctxt_t *ps_ctxt; intra_samp_map_ctxt_t *ps_map_ctxt; intra_samp_lyr_ctxt *ps_lyr_ctxt; WORD32 i4_x, i4_y; WORD32 i4_frm_mb_x, i4_frm_mb_y; UWORD8 *pu1_refarray = NULL; ref_pixel_map_t *ps_x_pos_phase; ref_pixel_map_t *ps_y_pos_phase; WORD32 i4_temp_array_ht; WORD32 *pi4_interp_buff; UWORD8 arr_y_ref_pos_luma[16] = {0}; UWORD8 arr_x_ref_pos_luma[16] = {0}; UWORD8 arr_x_ref_pos_luma_low[16] = {0}; UWORD8 arr_x_ref_pos_luma_high[16] = {0}; UWORD8 arr_phase_luma[16] = {0}; UWORD8 *pi4_y_ref_pos_luma; UWORD8 *pi4_x_ref_pos_luma_low; UWORD8 *pi4_x_ref_pos_luma_high; UWORD8 *pi4_phase_luma; WORD16 *pi2_interp_buff_temp; WORD32 i4_mb_wd; WORD32 i4_mb_ht; WORD32 i4_x_min; ref_min_max_map_t *ps_x_min_max; UWORD8 *pu1_refarray_temp; ps_ctxt = (intra_sampling_ctxt_t *) pv_intra_samp_ctxt; ps_lyr_ctxt = &ps_ctxt->as_res_lyrs[ps_ctxt->i4_res_lyr_id]; if(0 == i4_refarray_flag) { pu1_refarray = ps_ctxt->pu1_refarray_buffer; } else if(1 == i4_refarray_flag) { pu1_refarray = ps_ctxt->pu1_refarray_cb; } /* --------------------------------------------------------------------- */ /* LUMA or CHROMA */ /* --------------------------------------------------------------------- */ if(1 == i4_chroma_flag) ps_map_ctxt = &(ps_lyr_ctxt->s_chroma_map_ctxt); else ps_map_ctxt = &(ps_lyr_ctxt->s_luma_map_ctxt); i4_mb_wd = MB_WIDTH >> i4_chroma_flag; i4_mb_ht = MB_HEIGHT >> i4_chroma_flag; ps_x_min_max = ps_map_ctxt->ps_x_min_max; i4_frm_mb_y = i4_mb_y * i4_mb_ht; i4_frm_mb_x = i4_mb_x * i4_mb_wd; /* get the min and max positions */ i4_x_min = ps_x_min_max[i4_mb_x].i2_min_pos; /* --------------------------------------------------------------------- */ /* Projected frame level pointers */ /* --------------------------------------------------------------------- */ ps_x_pos_phase = ps_map_ctxt->ps_x_pos_phase; ps_y_pos_phase = ps_map_ctxt->ps_y_pos_phase; /* --------------------------------------------------------------------- */ /* Pointers and Dimenstion of the temporary buffer */ /* --------------------------------------------------------------------- */ i4_temp_array_ht = i4_mb_ht; pi4_interp_buff = ps_ctxt->pi4_temp_interpolation_buffer; pi2_interp_buff_temp = (WORD16 *) pi4_interp_buff; /* --------------------------------------------------------------------- */ /* Loop for interpolation in vertical direction */ /* --------------------------------------------------------------------- */ if(i4_chroma_flag == 0) { { uint8x8_t inp_8x8_r0, inp_8x8_r0_1; uint8x8_t inp_8x8_r1, inp_8x8_r1_1; uint8x8_t inp_8x8_r2, inp_8x8_r2_1; uint8x8_t inp_8x8_r3, inp_8x8_r3_1; 
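            /* Vertical pass (luma): each of the 16 output rows filters the four      */
            /* reference rows ref_pos - 1 .. ref_pos + 2 with the 4-tap filter        */
            /* selected by i2_phase; g_ai1_interp_filter_luma is laid out tap-major   */
            /* (stride 16 between taps), and the result is kept at 16-bit precision   */
            /* in pi2_interp_buff_temp for the horizontal pass.                       */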
int16x8_t out_res_16x8_r0_0, out_res_16x8_r0_1; for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++) { arr_phase_luma[i4_y] = (UWORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase; arr_y_ref_pos_luma[i4_y] = (UWORD8) (ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos); } pi4_y_ref_pos_luma = arr_y_ref_pos_luma; pi4_phase_luma = arr_phase_luma; for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++) { pu1_refarray_temp = pu1_refarray + (pi4_y_ref_pos_luma[i4_y] * i4_refarray_wd) + (i4_x_min - 1); inp_8x8_r0 = vld1_u8((pu1_refarray_temp - i4_refarray_wd)); inp_8x8_r1 = vld1_u8((pu1_refarray_temp)); inp_8x8_r2 = vld1_u8((pu1_refarray_temp + i4_refarray_wd)); inp_8x8_r3 = vld1_u8((pu1_refarray_temp + 2 * i4_refarray_wd)); inp_8x8_r0_1 = vld1_u8((pu1_refarray_temp + 8 - i4_refarray_wd)); inp_8x8_r1_1 = vld1_u8((pu1_refarray_temp + 8)); inp_8x8_r2_1 = vld1_u8((pu1_refarray_temp + 8 + i4_refarray_wd)); inp_8x8_r3_1 = vld1_u8((pu1_refarray_temp + 8 + 2 * i4_refarray_wd)); out_res_16x8_r0_0 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0)), g_ai1_interp_filter_luma[pi4_phase_luma[i4_y]]); out_res_16x8_r0_0 = vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1)), g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 16]); out_res_16x8_r0_0 = vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r2)), g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 32]); out_res_16x8_r0_0 = vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r3)), g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 48]); out_res_16x8_r0_1 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0_1)), g_ai1_interp_filter_luma[pi4_phase_luma[i4_y]]); out_res_16x8_r0_1 = vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1_1)), g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 16]); out_res_16x8_r0_1 = vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r2_1)), g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 32]); out_res_16x8_r0_1 = vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r3_1)), g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 48]); vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1)), out_res_16x8_r0_0); vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1) + 8), out_res_16x8_r0_1); } } /*Horizontal Interpolation*/ { WORD32 strt_indx = 10; uint8x16_t phs_mask_8x8_0; uint8x16_t x_ref_pos_luma_mask_r0_0; uint8x16_t x_ref_pos_luma_mask_r0_1; uint8x16_t x_ref_pos_luma_mask_r1_0; uint8x16_t x_ref_pos_luma_mask_r1_1; uint8x16_t x_ref_pos_luma_mask_r2_0; uint8x16_t x_ref_pos_luma_mask_r2_1; uint8x16_t x_ref_pos_luma_mask_r3_0; uint8x16_t x_ref_pos_luma_mask_r3_1; WORD32 strt_indx_h = 0, i4_x2 = 0; WORD32 i4_mb_wd_hlf = (i4_mb_wd >> 1); uint8x16_t twos = vdupq_n_u8(2); strt_indx = ps_x_pos_phase[0 + i4_frm_mb_x].i2_ref_pos - 1; strt_indx_h = (ps_x_pos_phase[8 + i4_frm_mb_x].i2_ref_pos - strt_indx - 1); for(i4_x = 0; i4_x < i4_mb_wd; i4_x++) { arr_x_ref_pos_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos; arr_phase_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase; arr_x_ref_pos_luma[i4_x] = arr_x_ref_pos_luma[i4_x] - strt_indx - 1; } for(i4_x = 0; i4_x < i4_mb_wd_hlf; i4_x++) { i4_x2 = i4_x << 1; arr_x_ref_pos_luma_low[i4_x2] = (arr_x_ref_pos_luma[i4_x]) << 1; arr_x_ref_pos_luma_low[i4_x2 + 1] = arr_x_ref_pos_luma_low[i4_x2] + 1; } for(i4_x = i4_mb_wd_hlf; i4_x < i4_mb_wd; i4_x++) { i4_x2 = (i4_x - i4_mb_wd_hlf) << 1; arr_x_ref_pos_luma_high[i4_x2] = ((arr_x_ref_pos_luma[i4_x] - 
strt_indx_h) << 1); arr_x_ref_pos_luma_high[i4_x2 + 1] = arr_x_ref_pos_luma_high[i4_x2] + 1; } pi4_x_ref_pos_luma_low = arr_x_ref_pos_luma_low; pi4_x_ref_pos_luma_high = arr_x_ref_pos_luma_high; pi4_phase_luma = arr_phase_luma; phs_mask_8x8_0 = vld1q_u8((const uint8_t *) pi4_phase_luma); x_ref_pos_luma_mask_r0_0 = vld1q_u8(pi4_x_ref_pos_luma_low); x_ref_pos_luma_mask_r0_1 = vld1q_u8(pi4_x_ref_pos_luma_high); x_ref_pos_luma_mask_r1_0 = vaddq_u8(x_ref_pos_luma_mask_r0_0, twos); x_ref_pos_luma_mask_r1_1 = vaddq_u8(x_ref_pos_luma_mask_r0_1, twos); x_ref_pos_luma_mask_r2_0 = vaddq_u8(x_ref_pos_luma_mask_r1_0, twos); x_ref_pos_luma_mask_r2_1 = vaddq_u8(x_ref_pos_luma_mask_r1_1, twos); x_ref_pos_luma_mask_r3_0 = x_ref_pos_luma_mask_r0_0; x_ref_pos_luma_mask_r3_1 = x_ref_pos_luma_mask_r0_1; { int8x16_t ip_filt_8x16_r0; int8x16_t ip_filt_8x16_r1; int8x16_t ip_filt_8x16_r2; int8x16_t ip_filt_8x16_r3; int16x8_t ip_filt_16x8_r0_0, ip_filt_16x8_r0_1; int16x8_t ip_filt_16x8_r1_0, ip_filt_16x8_r1_1; int16x8_t ip_filt_16x8_r2_0, ip_filt_16x8_r2_1; int16x8_t ip_filt_16x8_r3_0, ip_filt_16x8_r3_1; int16x8_t inp_16x8_0; int16x8_t inp_16x8_1; int16x8_t inp_16x8_2; int16x8_t inp_16x8_3; int16x8_t inp_16x8_r0_0, inp_16x8_r2_0; int16x8_t inp_16x8_r0_1, inp_16x8_r2_1; int16x8_t inp_16x8_r1_0, inp_16x8_r3_0; int16x8_t inp_16x8_r1_1, inp_16x8_r3_1; int16x4_t inp_16x4_r0_0, inp_16x4_r2_0; int16x4_t inp_16x4_r0_1, inp_16x4_r2_1; int16x4_t inp_16x4_r1_0, inp_16x4_r3_0; int16x4_t inp_16x4_r1_1, inp_16x4_r3_1; int32x4_t out_res_32x4_r0_l_0; int32x4_t out_res_32x4_r0_l_1; int32x4_t out_res_32x4_r0_h_0; int32x4_t out_res_32x4_r0_h_1; uint16x4_t out_res_16x4_r0_l_0; uint16x4_t out_res_16x4_r0_l_1; uint16x4_t out_res_16x4_r0_h_0; uint16x4_t out_res_16x4_r0_h_1; uint8x8_t out_res_8x8_r0_l, out_res_8x8_r0_h; uint8x8x2_t u1_temp_8x8x2_t; uint8x8_t u1_temp_8x8_t0, u1_temp_8x8_t1; ip_filt_8x16_r0 = vld1q_s8((g_ai1_interp_filter_luma)); ip_filt_8x16_r1 = vld1q_s8((g_ai1_interp_filter_luma + 16)); ip_filt_8x16_r2 = vld1q_s8((g_ai1_interp_filter_luma + 32)); ip_filt_8x16_r3 = vld1q_s8((g_ai1_interp_filter_luma + 48)); u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r0)); u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r0)); u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0)); u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0)); ip_filt_8x16_r0 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)); u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r1)); u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r1)); u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0)); u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0)); ip_filt_8x16_r1 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)); u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r2)); u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r2)); u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0)); u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0)); ip_filt_8x16_r2 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)); u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r3)); u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r3)); u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0)); u1_temp_8x8_t1 
= vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0)); ip_filt_8x16_r3 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)); ip_filt_16x8_r0_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r0)); ip_filt_16x8_r1_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r1)); ip_filt_16x8_r2_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r2)); ip_filt_16x8_r3_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r3)); ip_filt_16x8_r0_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r0)); ip_filt_16x8_r1_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r1)); ip_filt_16x8_r2_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r2)); ip_filt_16x8_r3_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r3)); for(i4_y = 0; i4_y < i4_temp_array_ht; i4_y++) { inp_16x8_0 = vld1q_s16((pi2_interp_buff_temp + strt_indx)); inp_16x8_1 = vld1q_s16((pi2_interp_buff_temp + strt_indx + strt_indx_h)); inp_16x8_2 = vld1q_s16((pi2_interp_buff_temp + strt_indx + 3)); inp_16x8_3 = vld1q_s16((pi2_interp_buff_temp + strt_indx + strt_indx_h + 3)); pi2_interp_buff_temp += i4_refarray_wd; u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0))); u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0))); u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r0_0)); u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r0_0)); inp_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8( vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_1))); u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_1))); u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r0_1)); u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r0_1)); inp_16x8_r0_1 = vreinterpretq_s16_s8(vcombine_s8( vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0))); u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0))); u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r1_0)); u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r1_0)); inp_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8( vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_1))); u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_1))); u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r1_1)); u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r1_1)); inp_16x8_r1_1 = vreinterpretq_s16_s8(vcombine_s8( vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0))); u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0))); u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r2_0)); u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r2_0)); inp_16x8_r2_0 = vreinterpretq_s16_s8(vcombine_s8( vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_1))); u1_temp_8x8x2_t.val[1] = 
vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_1))); u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r2_1)); u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r2_1)); inp_16x8_r2_1 = vreinterpretq_s16_s8(vcombine_s8( vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_2))); u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_2))); u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r3_0)); u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r3_0)); inp_16x8_r3_0 = vreinterpretq_s16_s8(vcombine_s8( vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_3))); u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_3))); u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r3_1)); u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r3_1)); inp_16x8_r3_1 = vreinterpretq_s16_s8(vcombine_s8( vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); inp_16x4_r0_0 = vget_low_s16(inp_16x8_r0_0); inp_16x4_r0_1 = vget_low_s16(inp_16x8_r0_1); inp_16x4_r1_0 = vget_low_s16(inp_16x8_r1_0); inp_16x4_r1_1 = vget_low_s16(inp_16x8_r1_1); inp_16x4_r2_0 = vget_low_s16(inp_16x8_r2_0); inp_16x4_r2_1 = vget_low_s16(inp_16x8_r2_1); inp_16x4_r3_0 = vget_low_s16(inp_16x8_r3_0); inp_16x4_r3_1 = vget_low_s16(inp_16x8_r3_1); out_res_32x4_r0_l_0 = vmull_s16(inp_16x4_r0_0, vget_low_s16(ip_filt_16x8_r0_0)); out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r1_0, vget_low_s16(ip_filt_16x8_r1_0)); out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r2_0, vget_low_s16(ip_filt_16x8_r2_0)); out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r3_0, vget_low_s16(ip_filt_16x8_r3_0)); out_res_32x4_r0_l_1 = vmull_s16(vget_high_s16(inp_16x8_r0_0), vget_high_s16(ip_filt_16x8_r0_0)); out_res_32x4_r0_l_1 = vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r1_0), vget_high_s16(ip_filt_16x8_r1_0)); out_res_32x4_r0_l_1 = vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r2_0), vget_high_s16(ip_filt_16x8_r2_0)); out_res_32x4_r0_l_1 = vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r3_0), vget_high_s16(ip_filt_16x8_r3_0)); out_res_32x4_r0_h_0 = vmull_s16(inp_16x4_r0_1, vget_low_s16(ip_filt_16x8_r0_1)); out_res_32x4_r0_h_0 = vmlal_s16(out_res_32x4_r0_h_0, inp_16x4_r1_1, vget_low_s16(ip_filt_16x8_r1_1)); out_res_32x4_r0_h_0 = vmlal_s16(out_res_32x4_r0_h_0, inp_16x4_r2_1, vget_low_s16(ip_filt_16x8_r2_1)); out_res_32x4_r0_h_0 = vmlal_s16(out_res_32x4_r0_h_0, inp_16x4_r3_1, vget_low_s16(ip_filt_16x8_r3_1)); out_res_32x4_r0_h_1 = vmull_s16(vget_high_s16(inp_16x8_r0_1), vget_high_s16(ip_filt_16x8_r0_1)); out_res_32x4_r0_h_1 = vmlal_s16(out_res_32x4_r0_h_1, vget_high_s16(inp_16x8_r1_1), vget_high_s16(ip_filt_16x8_r1_1)); out_res_32x4_r0_h_1 = vmlal_s16(out_res_32x4_r0_h_1, vget_high_s16(inp_16x8_r2_1), vget_high_s16(ip_filt_16x8_r2_1)); out_res_32x4_r0_h_1 = vmlal_s16(out_res_32x4_r0_h_1, vget_high_s16(inp_16x8_r3_1), vget_high_s16(ip_filt_16x8_r3_1)); out_res_16x4_r0_l_0 = vqrshrun_n_s32(out_res_32x4_r0_l_0, 10); out_res_16x4_r0_l_1 = vqrshrun_n_s32(out_res_32x4_r0_l_1, 10); out_res_16x4_r0_h_0 = vqrshrun_n_s32(out_res_32x4_r0_h_0, 10); 
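                /* vqrshrun_n_s32(., 10) applies the implicit +512 rounding and the   */
                /* >> 10 that undoes the combined gain of the two 4-tap passes        */
                /* (32 x 32), saturating to an unsigned range; vqmovn_u16 below       */
                /* clips the result to 8-bit pixels.                                  */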
out_res_16x4_r0_h_1 = vqrshrun_n_s32(out_res_32x4_r0_h_1, 10); out_res_8x8_r0_l = vqmovn_u16(vcombine_u16(out_res_16x4_r0_l_0, out_res_16x4_r0_l_1)); out_res_8x8_r0_h = vqmovn_u16(vcombine_u16(out_res_16x4_r0_h_0, out_res_16x4_r0_h_1)); vst1q_u8((pu1_out + (i4_y * i4_out_stride)), vcombine_u8(out_res_8x8_r0_l, out_res_8x8_r0_h)); } } } } else { for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++) { arr_y_ref_pos_luma[i4_y] = (UWORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos; arr_phase_luma[i4_y] = (UWORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase; } pi4_y_ref_pos_luma = arr_y_ref_pos_luma; pi4_phase_luma = arr_phase_luma; { uint8x8_t inp_8x8_r0, inp_8x8_r0_1; uint8x8_t inp_8x8_r1, inp_8x8_r1_1; int16x8_t out_res_16x8_r0_0, out_res_16x8_r0_1; for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++) { pu1_refarray_temp = pu1_refarray + (pi4_y_ref_pos_luma[i4_y] * i4_refarray_wd) + (i4_x_min - 1); inp_8x8_r0 = vld1_u8((pu1_refarray_temp)); inp_8x8_r1 = vld1_u8((pu1_refarray_temp + i4_refarray_wd)); inp_8x8_r0_1 = vld1_u8((pu1_refarray_temp + 8)); inp_8x8_r1_1 = vld1_u8((pu1_refarray_temp + 8 + i4_refarray_wd)); out_res_16x8_r0_0 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0)), g_au1_interp_filter_chroma[pi4_phase_luma[i4_y]]); out_res_16x8_r0_0 = vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1)), g_au1_interp_filter_chroma[pi4_phase_luma[i4_y] + 16]); out_res_16x8_r0_1 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0_1)), g_au1_interp_filter_chroma[pi4_phase_luma[i4_y]]); out_res_16x8_r0_1 = vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1_1)), g_au1_interp_filter_chroma[pi4_phase_luma[i4_y] + 16]); vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1)), out_res_16x8_r0_0); vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1) + 8), out_res_16x8_r0_1); } } { WORD32 strt_indx = 10; uint8x16_t phs_mask_8x8_0; uint8x16_t x_ref_pos_luma_mask_r0_0; uint8x16_t x_ref_pos_luma_mask_r1_0; WORD32 i4_x2 = 0; uint8x16_t twos = vdupq_n_u8(2); strt_indx = ps_x_pos_phase[0 + i4_frm_mb_x].i2_ref_pos; for(i4_x = 0; i4_x < i4_mb_wd; i4_x++) { arr_x_ref_pos_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos; arr_phase_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase; arr_x_ref_pos_luma[i4_x] = arr_x_ref_pos_luma[i4_x] - strt_indx; i4_x2 = i4_x << 1; arr_x_ref_pos_luma_low[i4_x2] = (arr_x_ref_pos_luma[i4_x]) << 1; arr_x_ref_pos_luma_low[i4_x2 + 1] = arr_x_ref_pos_luma_low[i4_x2] + 1; } pi4_x_ref_pos_luma_low = arr_x_ref_pos_luma_low; pi4_phase_luma = arr_phase_luma; phs_mask_8x8_0 = vld1q_u8(pi4_phase_luma); x_ref_pos_luma_mask_r0_0 = vld1q_u8(pi4_x_ref_pos_luma_low); x_ref_pos_luma_mask_r1_0 = vaddq_u8(x_ref_pos_luma_mask_r0_0, twos); { uint8x16_t ip_filt_8x16_r0; uint8x16_t ip_filt_8x16_r1; int16x8_t ip_filt_16x8_r0_0; int16x8_t ip_filt_16x8_r1_0; int16x8_t inp_16x8_0; int16x8_t inp_16x8_r0_0; int16x8_t inp_16x8_r1_0; int16x4_t inp_16x4_r0_0; int16x4_t inp_16x4_r1_0; int32x4_t out_res_32x4_r0_l_0; int32x4_t out_res_32x4_r0_l_1; uint16x4_t out_res_16x4_r0_l_0; uint16x4_t out_res_16x4_r0_l_1; uint16x8_t out_res_16x8_r0_l; uint8x16_t out_8x16_r0; uint8x8x2_t u1_incr_8x8x2_t; uint8x8_t u1_incr_8x8_t0, u1_incr_8x8_t1; uint8x8x2_t u1_temp_8x8x2_t; uint8x8_t u1_temp_8x8_t0, u1_temp_8x8_t1; uint8x16_t chroma_mask_8x16 = vreinterpretq_u8_u16(vdupq_n_u16(0x00ff)); ip_filt_8x16_r0 = vld1q_u8((g_au1_interp_filter_chroma)); ip_filt_8x16_r1 = vld1q_u8((g_au1_interp_filter_chroma + 16)); 
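                /* g_au1_interp_filter_chroma stores the two chroma taps for all 16   */
                /* phases (tap-major, stride 16); the vtbl2 lookups below gather, per */
                /* output column, the tap pair selected by that column's x phase.     */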
u1_incr_8x8x2_t.val[0] = vget_low_u8(ip_filt_8x16_r0); u1_incr_8x8x2_t.val[1] = vget_high_u8(ip_filt_8x16_r0); u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(phs_mask_8x8_0)); u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(phs_mask_8x8_0)); ip_filt_8x16_r0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1); u1_incr_8x8x2_t.val[0] = vget_low_u8(ip_filt_8x16_r1); u1_incr_8x8x2_t.val[1] = vget_high_u8(ip_filt_8x16_r1); u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(phs_mask_8x8_0)); u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(phs_mask_8x8_0)); ip_filt_8x16_r1 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1); ip_filt_16x8_r0_0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(ip_filt_8x16_r0))); ip_filt_16x8_r1_0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(ip_filt_8x16_r1))); for(i4_y = 0; i4_y < i4_temp_array_ht; i4_y++) { inp_16x8_0 = vld1q_s16((pi2_interp_buff_temp + strt_indx)); pi2_interp_buff_temp += i4_refarray_wd; u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0))); u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0))); u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r0_0)); u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r0_0)); inp_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8( vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0))); u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0))); u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r1_0)); u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r1_0)); inp_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8( vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1))); inp_16x4_r0_0 = vget_low_s16(inp_16x8_r0_0); inp_16x4_r1_0 = vget_low_s16(inp_16x8_r1_0); out_res_32x4_r0_l_0 = vmull_s16(inp_16x4_r0_0, vget_low_s16(ip_filt_16x8_r0_0)); out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r1_0, vget_low_s16(ip_filt_16x8_r1_0)); out_res_32x4_r0_l_1 = vmull_s16(vget_high_s16(inp_16x8_r0_0), vget_high_s16(ip_filt_16x8_r0_0)); out_res_32x4_r0_l_1 = vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r1_0), vget_high_s16(ip_filt_16x8_r1_0)); out_res_16x4_r0_l_0 = vqrshrun_n_s32(out_res_32x4_r0_l_0, 10); out_res_16x4_r0_l_1 = vqrshrun_n_s32(out_res_32x4_r0_l_1, 10); out_res_16x8_r0_l = vcombine_u16(out_res_16x4_r0_l_0, out_res_16x4_r0_l_1); out_8x16_r0 = vld1q_u8(pu1_out + (i4_y * i4_out_stride)); out_8x16_r0 = vbslq_u8(chroma_mask_8x16, vreinterpretq_u8_u16(out_res_16x8_r0_l), out_8x16_r0); vst1q_u8((pu1_out + (i4_y * i4_out_stride)), out_8x16_r0); } } } } return; } /* End of Interpolation Function */ /*****************************************************************************/ /* */ /* Function Name : isvcd_horz_interpol_chroma_dyadic_1_neonintr */ /* */ /* Description : This function takes the reference array buffer & performs*/ /* interpolation of a component to find the intra */ /* resampled value */ /* Inputs : pv_intra_samp_ctxt : intra sampling context */ /* pu1_out : output buffer pointer */ /* i4_out_stride : output buffer stride */ /* i4_refarray_wd : reference array width */ /* i4_x_offset : offset in reference layer in horz direction*/ /* ps_coord : current mb co-ordinate */ /* i4_chroma_flag : chroma processing flag */ /* Globals : none */ /* Processing : it 
does the interpolation on horizontal direction */ /* Outputs : resampled pixels */ /* Returns : none */ /* */ /* Issues : none */ /* */ /* Revision History: */ /* */ /* DD MM YYYY Author(s) Changes (Describe the changes made) */ /* 26 06 2009 vijayakumar creation */ /* */ /*****************************************************************************/ void isvcd_horz_interpol_chroma_dyadic_1_neonintr(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf, WORD32 i4_out_stride, WORD32 i4_phase_0, WORD32 i4_phase_1) { WORD32 i4_y; WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3; WORD32 i4_filt_stride, i4_dst_stride; UWORD8 *pu1_out; WORD16 *pi2_tmp; int16x8_t i4_samp_horz_16x8_r0_0, i4_samp_horz_16x8_r0_1, i4_samp_horz_16x8_r0_2; int16x8_t i4_samp_horz_16x8_r1_0, i4_samp_horz_16x8_r1_1, i4_samp_horz_16x8_r1_2; int16x8_t i4_rslt_horz_r0_1, i4_rslt_horz_r0_2; int16x8_t i4_rslt_horz_r1_1, i4_rslt_horz_r1_2; int16x8_t final_horz_16x8_r0_1; int16x8_t final_horz_16x8_r1_1; uint8x16_t i4_out_horz_8x16_r0, i4_out_horz_8x16_r1; uint8x16_t chroma_mask_8x16 = vreinterpretq_u8_u16(vdupq_n_u16(0x00ff)); i4_coeff_0 = 8 - i4_phase_0; i4_coeff_1 = i4_phase_0; i4_coeff_2 = 8 - i4_phase_1; i4_coeff_3 = i4_phase_1; pu1_out = pu1_out_buf; pi2_tmp = pi2_tmp_filt_buf; i4_filt_stride = 6; i4_dst_stride = i4_out_stride; /* Horizontal interpolation */ for(i4_y = 0; i4_y < 8; i4_y += 2) { i4_samp_horz_16x8_r0_0 = vld1q_s16(pi2_tmp); // a0 a1 a2 a3 a4 a5 a6 a7 i4_samp_horz_16x8_r0_1 = vld1q_s16(pi2_tmp + 1); // a1 a2 a3 a4 i4_samp_horz_16x8_r0_2 = vld1q_s16(pi2_tmp + 2); // a2 a3 a4 a5 i4_samp_horz_16x8_r1_0 = vld1q_s16(pi2_tmp + i4_filt_stride); i4_samp_horz_16x8_r1_1 = vld1q_s16(pi2_tmp + i4_filt_stride + 1); i4_samp_horz_16x8_r1_2 = vld1q_s16(pi2_tmp + (i4_filt_stride + 2)); i4_rslt_horz_r0_1 = vmulq_n_s16(i4_samp_horz_16x8_r0_0, i4_coeff_0); // a0c0 a1c0 a2c0 a3c0 i4_rslt_horz_r0_2 = vmulq_n_s16(i4_samp_horz_16x8_r0_1, i4_coeff_2); // a1c2 a2c2 a3c2 a4c2 i4_rslt_horz_r0_1 = vmlaq_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x8_r0_1, i4_coeff_1); // a0c0+a1c1 a1c0+a2c1 a2c0+a3c1 a3c0+a4c1 i4_rslt_horz_r0_2 = vmlaq_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x8_r0_2, i4_coeff_3); // a1c2+a2c3 a2c2+a3c3 a3c2+a4c3 a4c2+a5c3 i4_rslt_horz_r1_1 = vmulq_n_s16(i4_samp_horz_16x8_r1_0, i4_coeff_0); i4_rslt_horz_r1_2 = vmulq_n_s16(i4_samp_horz_16x8_r1_1, i4_coeff_2); i4_rslt_horz_r1_1 = vmlaq_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x8_r1_1, i4_coeff_1); i4_rslt_horz_r1_2 = vmlaq_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x8_r1_2, i4_coeff_3); final_horz_16x8_r0_1 = vzipq_s16(i4_rslt_horz_r0_1, i4_rslt_horz_r0_2).val[0]; final_horz_16x8_r1_1 = vzipq_s16(i4_rslt_horz_r1_1, i4_rslt_horz_r1_2).val[0]; final_horz_16x8_r0_1 = vrshrq_n_s16(final_horz_16x8_r0_1, 6); final_horz_16x8_r1_1 = vrshrq_n_s16(final_horz_16x8_r1_1, 6); i4_out_horz_8x16_r0 = vld1q_u8(pu1_out); i4_out_horz_8x16_r1 = vld1q_u8(pu1_out + i4_dst_stride); i4_out_horz_8x16_r0 = vbslq_u8(chroma_mask_8x16, vreinterpretq_u8_s16(final_horz_16x8_r0_1), i4_out_horz_8x16_r0); i4_out_horz_8x16_r1 = vbslq_u8(chroma_mask_8x16, vreinterpretq_u8_s16(final_horz_16x8_r1_1), i4_out_horz_8x16_r1); vst1q_u8(pu1_out, i4_out_horz_8x16_r0); vst1q_u8(pu1_out + i4_dst_stride, i4_out_horz_8x16_r1); /* Incrementing ptr */ pi2_tmp += (i4_filt_stride << 1); pu1_out += (i4_dst_stride << 1); } /* End of loop over y */ } /*****************************************************************************/ /* */ /* Function Name : isvcd_horz_interpol_chroma_dyadic_2_neonintr */ /* */ /* Description : This function 
takes the reference array buffer & performs*/ /* vertical intra resampling for dyadic scaling ratios for */ /* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/ /* chroma_phase_y_plus1: */ /* ref_lyr cur_lyr */ /* 0 1 */ /* 0 2 */ /* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */ /* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the */ /* vertically interpolated data */ /* i4_phase_0 : y phase for even values of y */ /* i4_phase_1 : y phase for odd values of y */ /* Globals : none */ /* Processing : it does the interpolation in vertical direction */ /* Outputs : vertically resampled samples */ /* Returns : none */ /* */ /* Issues : none */ /* */ /* Revision History: */ /* */ /* DD MM YYYY Author(s) Changes (Describe the changes made) */ /* 21 05 2021 Dolan creation */ /* */ /*****************************************************************************/ void isvcd_horz_interpol_chroma_dyadic_2_neonintr(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf, WORD32 i4_out_stride, WORD32 i4_phase_0, WORD32 i4_phase_1) { WORD32 i4_y; WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3; WORD32 i4_filt_stride, i4_dst_stride; UWORD8 *pu1_out; WORD16 *pi2_tmp; int16x8_t i4_samp_horz_16x8_r0_0, i4_samp_horz_16x8_r0_1; int16x8_t i4_samp_horz_16x8_r1_0, i4_samp_horz_16x8_r1_1; int16x8_t i4_rslt_horz_r0_1, i4_rslt_horz_r0_2; int16x8_t i4_rslt_horz_r1_1, i4_rslt_horz_r1_2; int16x8_t final_horz_16x8_r0_1; int16x8_t final_horz_16x8_r1_1; uint8x16_t i4_out_horz_8x16_r0, i4_out_horz_8x16_r1; uint8x16_t chroma_mask_8x16 = vreinterpretq_u8_u16(vdupq_n_u16(0x00ff)); i4_coeff_0 = 8 - i4_phase_0; i4_coeff_1 = i4_phase_0; i4_coeff_2 = 8 - i4_phase_1; i4_coeff_3 = i4_phase_1; pu1_out = pu1_out_buf; pi2_tmp = pi2_tmp_filt_buf + 1; i4_filt_stride = 6; i4_dst_stride = i4_out_stride; /* Horizontal interpolation */ for(i4_y = 0; i4_y < 8; i4_y += 2) { i4_samp_horz_16x8_r0_0 = vld1q_s16(pi2_tmp); // a0 a1 a2 a3 a4 a5 a6 a7 i4_samp_horz_16x8_r0_1 = vld1q_s16(pi2_tmp + 1); // a1 a2 a3 a4 i4_samp_horz_16x8_r1_0 = vld1q_s16(pi2_tmp + i4_filt_stride); i4_samp_horz_16x8_r1_1 = vld1q_s16(pi2_tmp + i4_filt_stride + 1); i4_rslt_horz_r0_1 = vmulq_n_s16(i4_samp_horz_16x8_r0_0, i4_coeff_0); // a0c0 a1c0 a2c0 a3c0 i4_rslt_horz_r0_2 = vmulq_n_s16(i4_samp_horz_16x8_r0_0, i4_coeff_2); // a1c2 a2c2 a3c2 a4c2 i4_rslt_horz_r0_1 = vmlaq_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x8_r0_1, i4_coeff_1); // a0c0+a1c1 a1c0+a2c1 a2c0+a3c1 a3c0+a4c1 i4_rslt_horz_r0_2 = vmlaq_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x8_r0_1, i4_coeff_3); // a1c2+a2c3 a2c2+a3c3 a3c2+a4c3 a4c2+a5c3 i4_rslt_horz_r1_1 = vmulq_n_s16(i4_samp_horz_16x8_r1_0, i4_coeff_0); i4_rslt_horz_r1_2 = vmulq_n_s16(i4_samp_horz_16x8_r1_0, i4_coeff_2); i4_rslt_horz_r1_1 = vmlaq_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x8_r1_1, i4_coeff_1); i4_rslt_horz_r1_2 = vmlaq_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x8_r1_1, i4_coeff_3); final_horz_16x8_r0_1 = vzipq_s16(i4_rslt_horz_r0_1, i4_rslt_horz_r0_2).val[0]; final_horz_16x8_r1_1 = vzipq_s16(i4_rslt_horz_r1_1, i4_rslt_horz_r1_2).val[0]; final_horz_16x8_r0_1 = vrshrq_n_s16(final_horz_16x8_r0_1, 6); final_horz_16x8_r1_1 = vrshrq_n_s16(final_horz_16x8_r1_1, 6); i4_out_horz_8x16_r0 = vld1q_u8(pu1_out); i4_out_horz_8x16_r1 = vld1q_u8(pu1_out + i4_dst_stride); i4_out_horz_8x16_r0 = vbslq_u8(chroma_mask_8x16, vreinterpretq_u8_s16(final_horz_16x8_r0_1), i4_out_horz_8x16_r0); i4_out_horz_8x16_r1 = vbslq_u8(chroma_mask_8x16, vreinterpretq_u8_s16(final_horz_16x8_r1_1), i4_out_horz_8x16_r1); vst1q_u8(pu1_out, i4_out_horz_8x16_r0); 
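        /* The vbslq_u8 selects above merged the filtered samples into only every     */
        /* other byte of the interleaved CbCr row (the chroma component being         */
        /* processed); the remaining bytes keep the values loaded from pu1_out, and   */
        /* these stores write the merged rows back.                                   */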
vst1q_u8(pu1_out + i4_dst_stride, i4_out_horz_8x16_r1); /* Incrementing ptr */ pi2_tmp += (i4_filt_stride << 1); pu1_out += (i4_dst_stride << 1); } /* End of loop over y */ } /*****************************************************************************/ /* */ /* Function Name : isvcd_vert_interpol_chroma_dyadic_1_neonintr */ /* */ /* Description : This function takes the reference array buffer & performs*/ /* vertical intra resampling for dyadic scaling ratios for */ /* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/ /* chroma_phase_y_plus1: */ /* ref_lyr cur_lyr */ /* 2 0 */ /* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */ /* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold */ /* vertically interpolated data */ /* i4_phase_0 : y phase for even values of y */ /* i4_phase_1 : y phase for odd values of y */ /* Globals : none */ /* Processing : it does the interpolation in vertical direction */ /* Outputs : vertically resampled samples */ /* Returns : none */ /* */ /* Issues : none */ /* */ /* Revision History: */ /* */ /* DD MM YYYY Author(s) Changes (Describe the changes made) */ /* 21 05 2021 Dolan creation */ /* */ /*****************************************************************************/ void isvcd_vert_interpol_chroma_dyadic_1_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf, WORD32 i4_phase_0, WORD32 i4_phase_1) { WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3; WORD32 i4_src_stride; UWORD8 *pu1_inp; WORD16 *pi2_tmp; uint8x8_t i4_samp_vert_8x8_r0, i4_samp_vert_8x8_r1, i4_samp_vert_8x8_r2; uint8x8_t i4_samp_vert_8x8_r3, i4_samp_vert_8x8_r4, i4_samp_vert_8x8_r5; int16x8_t i4_rslt_vert_16x8_r0, i4_rslt_vert_16x8_r1, i4_rslt_vert_16x8_r2, i4_rslt_vert_16x8_r3; int16x8_t i4_rslt_vert_16x8_r4, i4_rslt_vert_16x8_r5, i4_rslt_vert_16x8_r6, i4_rslt_vert_16x8_r7; i4_coeff_0 = 8 - i4_phase_0; i4_coeff_1 = i4_phase_0; i4_coeff_2 = 8 - i4_phase_1; i4_coeff_3 = i4_phase_1; pu1_inp = pu1_inp_buf; pi2_tmp = pi2_tmp_filt_buf; i4_src_stride = DYADIC_REF_W_C; /* Vertical interpolation */ i4_samp_vert_8x8_r0 = vld1_u8(pu1_inp); pu1_inp += i4_src_stride; i4_samp_vert_8x8_r1 = vld1_u8(pu1_inp); pu1_inp += i4_src_stride; i4_samp_vert_8x8_r2 = vld1_u8(pu1_inp); pu1_inp += i4_src_stride; i4_samp_vert_8x8_r3 = vld1_u8(pu1_inp); pu1_inp += i4_src_stride; i4_samp_vert_8x8_r4 = vld1_u8(pu1_inp); pu1_inp += i4_src_stride; i4_samp_vert_8x8_r5 = vld1_u8(pu1_inp); pu1_inp += i4_src_stride; i4_rslt_vert_16x8_r0 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r0)), i4_coeff_0); i4_rslt_vert_16x8_r0 = vmlaq_n_s16( i4_rslt_vert_16x8_r0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_1); vst1q_s16(pi2_tmp, i4_rslt_vert_16x8_r0); i4_rslt_vert_16x8_r1 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_2); i4_rslt_vert_16x8_r1 = vmlaq_n_s16( i4_rslt_vert_16x8_r1, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_3); vst1q_s16(pi2_tmp + 6, i4_rslt_vert_16x8_r1); i4_rslt_vert_16x8_r2 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_0); i4_rslt_vert_16x8_r2 = vmlaq_n_s16( i4_rslt_vert_16x8_r2, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_1); vst1q_s16(pi2_tmp + 12, i4_rslt_vert_16x8_r2); i4_rslt_vert_16x8_r3 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_2); i4_rslt_vert_16x8_r3 = vmlaq_n_s16( i4_rslt_vert_16x8_r3, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_3); vst1q_s16(pi2_tmp + 18, i4_rslt_vert_16x8_r3); 
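    /* Output rows alternate between the (8 - phase_0, phase_0) and the               */
    /* (8 - phase_1, phase_1) tap pairs; the 16-bit intermediate rows are stored      */
    /* with a stride of 6 to match the 6-wide chroma reference block.                 */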
i4_rslt_vert_16x8_r4 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_0); i4_rslt_vert_16x8_r4 = vmlaq_n_s16( i4_rslt_vert_16x8_r4, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_1); vst1q_s16(pi2_tmp + 24, i4_rslt_vert_16x8_r4); i4_rslt_vert_16x8_r5 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_2); i4_rslt_vert_16x8_r5 = vmlaq_n_s16( i4_rslt_vert_16x8_r5, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r4)), i4_coeff_3); vst1q_s16(pi2_tmp + 30, i4_rslt_vert_16x8_r5); i4_rslt_vert_16x8_r6 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_0); i4_rslt_vert_16x8_r6 = vmlaq_n_s16( i4_rslt_vert_16x8_r6, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r4)), i4_coeff_1); vst1q_s16(pi2_tmp + 36, i4_rslt_vert_16x8_r6); i4_rslt_vert_16x8_r7 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r4)), i4_coeff_2); i4_rslt_vert_16x8_r7 = vmlaq_n_s16( i4_rslt_vert_16x8_r7, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r5)), i4_coeff_3); vst1_s16(pi2_tmp + 42, vget_low_s16(i4_rslt_vert_16x8_r7)); vst1q_lane_s16(pi2_tmp + 46, i4_rslt_vert_16x8_r7, 4); vst1q_lane_s16(pi2_tmp + 47, i4_rslt_vert_16x8_r7, 5); } /*****************************************************************************/ /* */ /* Function Name : isvcd_vert_interpol_chroma_dyadic_2_neonintr */ /* */ /* Description : This function takes the reference array buffer & performs*/ /* vertical intra resampling for dyadic scaling ratios for */ /* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/ /* chroma_phase_y_plus1: */ /* ref_lyr cur_lyr */ /* 2 0 */ /* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */ /* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the */ /* vertically interpolated data */ /* i4_phase_0 : y phase for even values of y */ /* i4_phase_1 : y phase for odd values of y */ /* Globals : none */ /* Processing : it does the interpolation in vertical direction */ /* Outputs : vertically resampled samples */ /* Returns : none */ /* */ /* Issues : none */ /* */ /* Revision History: */ /* */ /* DD MM YYYY Author(s) Changes (Describe the changes made) */ /* 21 05 2021 Dolan creation */ /* */ /*****************************************************************************/ void isvcd_vert_interpol_chroma_dyadic_2_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf, WORD32 i4_phase_0, WORD32 i4_phase_1) { WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3; WORD32 i4_src_stride; UWORD8 *pu1_inp; WORD16 *pi2_tmp; uint8x8_t i4_samp_vert_8x8_r0, i4_samp_vert_8x8_r1, i4_samp_vert_8x8_r2, i4_samp_vert_8x8_r3; uint8x8_t i4_samp_vert_8x8_r4; int16x8_t i4_rslt_vert_16x8_r0, i4_rslt_vert_16x8_r1, i4_rslt_vert_16x8_r2, i4_rslt_vert_16x8_r3; int16x8_t i4_rslt_vert_16x8_r4, i4_rslt_vert_16x8_r5, i4_rslt_vert_16x8_r6, i4_rslt_vert_16x8_r7; i4_coeff_0 = 8 - i4_phase_0; i4_coeff_1 = i4_phase_0; i4_coeff_2 = 8 - i4_phase_1; i4_coeff_3 = i4_phase_1; pi2_tmp = pi2_tmp_filt_buf; i4_src_stride = DYADIC_REF_W_C; pu1_inp = pu1_inp_buf + i4_src_stride; /* Vertical interpolation */ i4_samp_vert_8x8_r0 = vld1_u8(pu1_inp); pu1_inp += i4_src_stride; i4_samp_vert_8x8_r1 = vld1_u8(pu1_inp); pu1_inp += i4_src_stride; i4_samp_vert_8x8_r2 = vld1_u8(pu1_inp); pu1_inp += i4_src_stride; i4_samp_vert_8x8_r3 = vld1_u8(pu1_inp); pu1_inp += i4_src_stride; i4_samp_vert_8x8_r4 = vld1_u8(pu1_inp); pu1_inp += i4_src_stride; /* since y_phase = phase_0 for y = 0 */ i4_rslt_vert_16x8_r0 = 
vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r0)), i4_coeff_0); i4_rslt_vert_16x8_r0 = vmlaq_n_s16( i4_rslt_vert_16x8_r0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_1); vst1q_s16(pi2_tmp, i4_rslt_vert_16x8_r0); i4_rslt_vert_16x8_r1 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r0)), i4_coeff_2); i4_rslt_vert_16x8_r1 = vmlaq_n_s16( i4_rslt_vert_16x8_r1, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_3); vst1q_s16(pi2_tmp + 6, i4_rslt_vert_16x8_r1); i4_rslt_vert_16x8_r2 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_0); i4_rslt_vert_16x8_r2 = vmlaq_n_s16( i4_rslt_vert_16x8_r2, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_1); vst1q_s16(pi2_tmp + 12, i4_rslt_vert_16x8_r2); i4_rslt_vert_16x8_r3 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_2); i4_rslt_vert_16x8_r3 = vmlaq_n_s16( i4_rslt_vert_16x8_r3, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_3); vst1q_s16(pi2_tmp + 18, i4_rslt_vert_16x8_r3); i4_rslt_vert_16x8_r4 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_0); i4_rslt_vert_16x8_r4 = vmlaq_n_s16( i4_rslt_vert_16x8_r4, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_1); vst1q_s16(pi2_tmp + 24, i4_rslt_vert_16x8_r4); i4_rslt_vert_16x8_r5 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_2); i4_rslt_vert_16x8_r5 = vmlaq_n_s16( i4_rslt_vert_16x8_r5, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_3); vst1q_s16(pi2_tmp + 30, i4_rslt_vert_16x8_r5); i4_rslt_vert_16x8_r6 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_0); i4_rslt_vert_16x8_r6 = vmlaq_n_s16( i4_rslt_vert_16x8_r6, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r4)), i4_coeff_1); vst1q_s16(pi2_tmp + 36, i4_rslt_vert_16x8_r6); i4_rslt_vert_16x8_r7 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_2); i4_rslt_vert_16x8_r7 = vmlaq_n_s16( i4_rslt_vert_16x8_r7, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r4)), i4_coeff_3); vst1_s16(pi2_tmp + 42, vget_low_s16(i4_rslt_vert_16x8_r7)); vst1q_lane_s16(pi2_tmp + 46, i4_rslt_vert_16x8_r7, 4); vst1q_lane_s16(pi2_tmp + 47, i4_rslt_vert_16x8_r7, 5); } /*****************************************************************************/ /* */ /* Function Name : isvcd_vert_interpol_chroma_dyadic_3_neonintr */ /* */ /* Description : This function takes the reference array buffer & performs*/ /* vertical intra resampling for dyadic scaling ratios for */ /* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/ /* chroma_phase_y_plus1: */ /* ref_lyr cur_lyr */ /* 2 0 */ /* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */ /* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the */ /* vertically interpolated data */ /* i4_phase_0 : y phase for even values of y */ /* i4_phase_1 : y phase for odd values of y */ /* Globals : none */ /* Processing : it does the interpolation in vertical direction */ /* Outputs : vertically resampled samples */ /* Returns : none */ /* */ /* Issues : none */ /* */ /* Revision History: */ /* */ /* DD MM YYYY Author(s) Changes (Describe the changes made) */ /* 21 05 2021 Dolan creation */ /* */ /*****************************************************************************/ void isvcd_vert_interpol_chroma_dyadic_3_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf, WORD32 i4_phase_0, WORD32 i4_phase_1) { WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3; 
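    /* Same two-tap filtering sequence as isvcd_vert_interpol_chroma_dyadic_2_neonintr,*/
    /* but the source pointer below starts at row 0 of the reference block instead    */
    /* of row 1.                                                                      */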
WORD32 i4_src_stride; UWORD8 *pu1_inp; WORD16 *pi2_tmp; uint8x8_t i4_samp_vert_8x8_r0, i4_samp_vert_8x8_r1, i4_samp_vert_8x8_r2; uint8x8_t i4_samp_vert_8x8_r3, i4_samp_vert_8x8_r4; int16x8_t i4_rslt_vert_16x8_r0, i4_rslt_vert_16x8_r1, i4_rslt_vert_16x8_r2, i4_rslt_vert_16x8_r3; int16x8_t i4_rslt_vert_16x8_r4, i4_rslt_vert_16x8_r5, i4_rslt_vert_16x8_r6, i4_rslt_vert_16x8_r7; i4_coeff_0 = 8 - i4_phase_0; i4_coeff_1 = i4_phase_0; i4_coeff_2 = 8 - i4_phase_1; i4_coeff_3 = i4_phase_1; pi2_tmp = pi2_tmp_filt_buf; i4_src_stride = DYADIC_REF_W_C; pu1_inp = pu1_inp_buf; /* Vertical interpolation */ /* y = 0, y_phase = phase_0 */ i4_samp_vert_8x8_r0 = vld1_u8(pu1_inp); pu1_inp += i4_src_stride; i4_samp_vert_8x8_r1 = vld1_u8(pu1_inp); pu1_inp += i4_src_stride; i4_samp_vert_8x8_r2 = vld1_u8(pu1_inp); pu1_inp += i4_src_stride; i4_samp_vert_8x8_r3 = vld1_u8(pu1_inp); pu1_inp += i4_src_stride; i4_samp_vert_8x8_r4 = vld1_u8(pu1_inp); pu1_inp += i4_src_stride; /* since y_phase = phase_0 for y = 0 */ i4_rslt_vert_16x8_r0 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r0)), i4_coeff_0); i4_rslt_vert_16x8_r0 = vmlaq_n_s16( i4_rslt_vert_16x8_r0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_1); vst1q_s16(pi2_tmp, i4_rslt_vert_16x8_r0); i4_rslt_vert_16x8_r1 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r0)), i4_coeff_2); i4_rslt_vert_16x8_r1 = vmlaq_n_s16( i4_rslt_vert_16x8_r1, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_3); vst1q_s16(pi2_tmp + 6, i4_rslt_vert_16x8_r1); i4_rslt_vert_16x8_r2 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_0); i4_rslt_vert_16x8_r2 = vmlaq_n_s16( i4_rslt_vert_16x8_r2, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_1); vst1q_s16(pi2_tmp + 12, i4_rslt_vert_16x8_r2); i4_rslt_vert_16x8_r3 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_2); i4_rslt_vert_16x8_r3 = vmlaq_n_s16( i4_rslt_vert_16x8_r3, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_3); vst1q_s16(pi2_tmp + 18, i4_rslt_vert_16x8_r3); i4_rslt_vert_16x8_r4 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_0); i4_rslt_vert_16x8_r4 = vmlaq_n_s16( i4_rslt_vert_16x8_r4, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_1); vst1q_s16(pi2_tmp + 24, i4_rslt_vert_16x8_r4); i4_rslt_vert_16x8_r5 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_2); i4_rslt_vert_16x8_r5 = vmlaq_n_s16( i4_rslt_vert_16x8_r5, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_3); vst1q_s16(pi2_tmp + 30, i4_rslt_vert_16x8_r5); i4_rslt_vert_16x8_r6 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_0); i4_rslt_vert_16x8_r6 = vmlaq_n_s16( i4_rslt_vert_16x8_r6, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r4)), i4_coeff_1); vst1q_s16(pi2_tmp + 36, i4_rslt_vert_16x8_r6); i4_rslt_vert_16x8_r7 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_2); i4_rslt_vert_16x8_r7 = vmlaq_n_s16( i4_rslt_vert_16x8_r7, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r4)), i4_coeff_3); vst1_s16(pi2_tmp + 42, vget_low_s16(i4_rslt_vert_16x8_r7)); vst1q_lane_s16(pi2_tmp + 46, i4_rslt_vert_16x8_r7, 4); vst1q_lane_s16(pi2_tmp + 47, i4_rslt_vert_16x8_r7, 5); }
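
/*****************************************************************************/
/* The block below is an illustrative scalar sketch (compiled out) of the   */
/* computation performed by isvcd_interpolate_base_luma_dyadic_neonintr     */
/* above: a 4-tap vertical pass over the 12x12 reference block followed by  */
/* a 4-tap horizontal pass, with a single (x + 512) >> 10 rounding at the   */
/* end. The function name and its stride parameters are illustrative only;  */
/* it is a reading aid derived from the NEON code, not part of the decoder. */
/*****************************************************************************/
#if 0
static void scalar_dyadic_luma_sketch(const UWORD8 *pu1_inp, WORD32 i4_inp_stride,
                                      UWORD8 *pu1_out, WORD32 i4_out_stride)
{
    WORD16 ai2_tmp[16][12];
    WORD32 i4_x, i4_y;

    /* Vertical 4-tap pass: 12 reference rows -> 16 intermediate rows.      */
    /* Even rows use taps (-1, 8, 28, -3), odd rows use (-3, 28, 8, -1).    */
    for(i4_y = 0; i4_y < 16; i4_y++)
    {
        WORD32 i4_base = (i4_y + 1) >> 1;
        const UWORD8 *pu1_src = pu1_inp + (i4_base * i4_inp_stride);

        for(i4_x = 0; i4_x < 12; i4_x++)
        {
            WORD32 i4_s0 = pu1_src[i4_x];
            WORD32 i4_s1 = pu1_src[i4_x + i4_inp_stride];
            WORD32 i4_s2 = pu1_src[i4_x + (2 * i4_inp_stride)];
            WORD32 i4_s3 = pu1_src[i4_x + (3 * i4_inp_stride)];

            if(i4_y & 1)
                ai2_tmp[i4_y][i4_x] =
                    (WORD16) ((-3 * i4_s0) + (28 * i4_s1) + (8 * i4_s2) - i4_s3);
            else
                ai2_tmp[i4_y][i4_x] =
                    (WORD16) ((-i4_s0) + (8 * i4_s1) + (28 * i4_s2) - (3 * i4_s3));
        }
    }

    /* Horizontal 4-tap pass with the single rounding: 12 columns -> 16 outputs */
    for(i4_y = 0; i4_y < 16; i4_y++)
    {
        for(i4_x = 0; i4_x < 16; i4_x++)
        {
            WORD32 i4_base = (i4_x + 1) >> 1;
            WORD32 i4_t0 = ai2_tmp[i4_y][i4_base];
            WORD32 i4_t1 = ai2_tmp[i4_y][i4_base + 1];
            WORD32 i4_t2 = ai2_tmp[i4_y][i4_base + 2];
            WORD32 i4_t3 = ai2_tmp[i4_y][i4_base + 3];
            WORD32 i4_res;

            if(i4_x & 1)
                i4_res = (-3 * i4_t0) + (28 * i4_t1) + (8 * i4_t2) - i4_t3;
            else
                i4_res = (-i4_t0) + (8 * i4_t1) + (28 * i4_t2) - (3 * i4_t3);

            i4_res = (i4_res + 512) >> 10;
            if(i4_res < 0) i4_res = 0;
            if(i4_res > 255) i4_res = 255;
            pu1_out[(i4_y * i4_out_stride) + i4_x] = (UWORD8) i4_res;
        }
    }
}
#endif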