/*
 * File: unplugged-system/external/libavc/decoder/arm/svc/isvcd_intra_resamp_neon.c
 * (1549 lines, 80 KiB, C)
 */

/******************************************************************************
*
* Copyright (C) 2022 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
* isvcd_intra_resamp_neonintr.c
*
* @brief
* Contains routines that resample for SVC resampling
*
* @author
* Kishore
*
* @par List of Functions:
* - isvcd_interpolate_base_luma_dyadic_neonintr()
* - isvcd_interpolate_intra_base_neonintr()
* - isvcd_horz_interpol_chroma_dyadic_1_neonintr()
* - isvcd_horz_interpol_chroma_dyadic_2_neonintr()
* - isvcd_vert_interpol_chroma_dyadic_1_neonintr()
* - isvcd_vert_interpol_chroma_dyadic_2_neonintr()
* - isvcd_vert_interpol_chroma_dyadic_3_neonintr()
*
* @remarks
* None
*
*******************************************************************************
*/
#include <assert.h>
#include <string.h>
#include <arm_neon.h>
#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "isvcd_structs.h"
#include "ih264_debug.h"
/*****************************************************************************/
/* */
/* Function Name : isvcd_interpolate_base_luma_dyadic_neonintr */
/* */
/* Description : This function takes the reference array buffer & performs*/
/* intra resampling for dyadic scaling ratios */
/* Inputs : pu1_inp_buf : ptr to the 12x12 reference sample buffer */
/* pi2_tmp_filt_buf : ptr to the 12x16 buffer to hold the */
/* vertically interpolated data */
/* pu1_out_buf : output buffer pointer */
/* i4_out_stride : output buffer stride */
/* Globals : none */
/* Processing : it does the interpolation in vertical direction followed */
/* by horizontal direction */
/* Outputs : resampled pixels */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 05 21 2021 Dolan creation */
/* */
/*****************************************************************************/
void isvcd_interpolate_base_luma_dyadic_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                                 UWORD8 *pu1_out_buf, WORD32 i4_out_stride)
{
    WORD32 i4_y;
    /* 4-tap filter taps; applied forward as (c0,c1,c2,c3) for one output row of a
       pair and mirrored as (c3,c2,c1,c0) for the other (dyadic half-pel phases) */
    WORD16 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_filt_stride, i4_src_stride;
    UWORD8 *pu1_inp, *pu1_out;
    WORD16 *pi2_tmp;
    int16x4_t i4_rslt_vert_16x4_1, i4_rslt_vert_16x4_2;
    /* Sliding 4-row window of input samples for the vertical filter */
    uint8x8_t i4_samp_vert_8x8_0, i4_samp_vert_8x8_1, i4_samp_vert_8x8_2, i4_samp_vert_8x8_3;
    int16x8_t i4_rslt_vert_16x8_0, i4_rslt_vert_16x8_2;
    /* Horizontal interpolation */
    int32x4_t const_512_32x4 = vdupq_n_s32(512); /* rounding offset for the final >> 10 */
    int32x4_t i4_rslt_horz_r0_1, i4_rslt_horz_r1_1, i4_rslt_horz_r0_2, i4_rslt_horz_r1_2;
    uint16x4_t i4_rslt_horz_r0_1_tmp, i4_rslt_horz_r1_1_tmp, i4_rslt_horz_r0_2_tmp,
        i4_rslt_horz_r1_2_tmp;
    uint16x8_t rslt_16x8_t_1, rslt_16x8_t_2;
    int32x4x2_t i4_rslt_horz_32x4x2_t;
    /* Nine overlapping 4-sample loads per row: pi2_tmp + 0 .. pi2_tmp + 8 */
    int16x4_t i4_samp_horz_16x4_0, i4_samp_horz_16x4_1, i4_samp_horz_16x4_2, i4_samp_horz_16x4_3,
        i4_samp_horz_16x4_4;
    int16x4_t i4_samp_horz_16x4_5, i4_samp_horz_16x4_6, i4_samp_horz_16x4_7, i4_samp_horz_16x4_8;
    /* Same taps as i4_coeff_0..3, as int16_t for the vmull_n/vmlal_n intrinsics */
    int16_t i4_coeff_c0 = -3;
    int16_t i4_coeff_c1 = 28;
    int16_t i4_coeff_c2 = 8;
    int16_t i4_coeff_c3 = -1;
    int32x4_t i4_rslt_horz_r0_1_tmp32, i4_rslt_horz_r1_1_tmp32, i4_rslt_horz_r0_2_tmp32,
        i4_rslt_horz_r1_2_tmp32;
    /* Filter coefficient values for phase 4 */
    i4_coeff_0 = -3;
    i4_coeff_1 = 28;
    i4_coeff_2 = 8;
    i4_coeff_3 = -1;
    i4_filt_stride = 12;            /* width of the 12x16 intermediate buffer */
    i4_src_stride = DYADIC_REF_W_Y; /* stride of the 12x12 reference buffer */
    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    pu1_out = pu1_out_buf;
    /* ------------------------------------------------------------------ */
    /* Vertical interpolation: 12 input rows -> 16 intermediate rows of   */
    /* 16-bit sums (gain 32).  Columns 0..7 are done with 128-bit ops     */
    /* first; columns 8..11 are done afterwards with 64-bit ops.          */
    /* ------------------------------------------------------------------ */
    // First 64 bits (input columns 0..7)
    i4_samp_vert_8x8_0 = vld1_u8((const uint8_t *) pu1_inp);
    pu1_inp += i4_src_stride;
    i4_samp_vert_8x8_1 = vld1_u8((const uint8_t *) pu1_inp);
    pu1_inp += i4_src_stride;
    i4_samp_vert_8x8_2 = vld1_u8((const uint8_t *) pu1_inp);
    pu1_inp += i4_src_stride;
    i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp);
    pu1_inp += i4_src_stride;
    /* Output row 0: mirrored taps (c3,c2,c1,c0) over input rows 0..3 */
    i4_rslt_vert_16x8_0 =
        vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_3);
    i4_rslt_vert_16x8_0 = vmlaq_n_s16(
        i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_2);
    i4_rslt_vert_16x8_0 = vmlaq_n_s16(
        i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_1);
    i4_rslt_vert_16x8_0 = vmlaq_n_s16(
        i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_0);
    vst1q_s16(pi2_tmp, i4_rslt_vert_16x8_0);
    pi2_tmp += i4_filt_stride;
    /* Rows 1..14: each new input row yields two output rows (forward taps,
       then mirrored taps), so the window slides by one input row per pair. */
    for(i4_y = 1; i4_y < 15; i4_y += 2)
    {
        /* Slide the 4-row window down by one input row */
        i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
        i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
        i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
        i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp);
        /* Even output row of the pair: forward taps (c0,c1,c2,c3) */
        i4_rslt_vert_16x8_0 =
            vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_0);
        i4_rslt_vert_16x8_0 = vmlaq_n_s16(
            i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_1);
        i4_rslt_vert_16x8_0 = vmlaq_n_s16(
            i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_2);
        i4_rslt_vert_16x8_0 = vmlaq_n_s16(
            i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_3);
        /* Odd output row of the pair: mirrored taps (c3,c2,c1,c0) */
        i4_rslt_vert_16x8_2 =
            vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_3);
        i4_rslt_vert_16x8_2 = vmlaq_n_s16(
            i4_rslt_vert_16x8_2, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_2);
        i4_rslt_vert_16x8_2 = vmlaq_n_s16(
            i4_rslt_vert_16x8_2, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_1);
        i4_rslt_vert_16x8_2 = vmlaq_n_s16(
            i4_rslt_vert_16x8_2, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_0);
        /* Storing the results */
        vst1q_s16(pi2_tmp, (i4_rslt_vert_16x8_0));
        pi2_tmp += i4_filt_stride;
        vst1q_s16(pi2_tmp, (i4_rslt_vert_16x8_2));
        pi2_tmp += i4_filt_stride;
        pu1_inp += i4_src_stride;
    } /*End of Loop over y*/
    /* y = 15, y_phase = 4: last output row, forward taps */
    i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
    i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
    i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
    i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp);
    i4_rslt_vert_16x8_0 =
        vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_0);
    i4_rslt_vert_16x8_0 = vmlaq_n_s16(
        i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_1);
    i4_rslt_vert_16x8_0 = vmlaq_n_s16(
        i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_2);
    i4_rslt_vert_16x8_0 = vmlaq_n_s16(
        i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_3);
    vst1q_s16(pi2_tmp, (i4_rslt_vert_16x8_0));
    /* End of loop over x */
    // Remaining 32 bits: same schedule for input columns 8..11 (loads fetch 8
    // bytes but only the low 4 filtered results are stored)
    pu1_inp = pu1_inp_buf + 8;
    pi2_tmp = pi2_tmp_filt_buf + 8;
    i4_samp_vert_8x8_0 = vld1_u8((const uint8_t *) pu1_inp);
    pu1_inp += i4_src_stride;
    i4_samp_vert_8x8_1 = vld1_u8((const uint8_t *) pu1_inp);
    pu1_inp += i4_src_stride;
    i4_samp_vert_8x8_2 = vld1_u8((const uint8_t *) pu1_inp);
    pu1_inp += i4_src_stride;
    i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp);
    pu1_inp += i4_src_stride;
    /* Output row 0: mirrored taps */
    i4_rslt_vert_16x4_1 =
        vmul_n_s16(vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_3);
    i4_rslt_vert_16x4_1 =
        vmla_n_s16(i4_rslt_vert_16x4_1,
                   vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))), i4_coeff_2);
    i4_rslt_vert_16x4_1 =
        vmla_n_s16(i4_rslt_vert_16x4_1,
                   vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))), i4_coeff_1);
    i4_rslt_vert_16x4_1 =
        vmla_n_s16(i4_rslt_vert_16x4_1,
                   vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))), i4_coeff_0);
    vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_1));
    pi2_tmp += i4_filt_stride;
    for(i4_y = 1; i4_y < 15; i4_y += 2)
    {
        /* Slide the 4-row window down by one input row */
        i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
        i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
        i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
        i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp);
        /* Even output row: forward taps */
        i4_rslt_vert_16x4_1 = vmul_n_s16(
            vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_0);
        i4_rslt_vert_16x4_1 = vmla_n_s16(
            i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))),
            i4_coeff_1);
        i4_rslt_vert_16x4_1 = vmla_n_s16(
            i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))),
            i4_coeff_2);
        i4_rslt_vert_16x4_1 = vmla_n_s16(
            i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))),
            i4_coeff_3);
        /* Odd output row: mirrored taps */
        i4_rslt_vert_16x4_2 = vmul_n_s16(
            vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_3);
        i4_rslt_vert_16x4_2 = vmla_n_s16(
            i4_rslt_vert_16x4_2, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))),
            i4_coeff_2);
        i4_rslt_vert_16x4_2 = vmla_n_s16(
            i4_rslt_vert_16x4_2, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))),
            i4_coeff_1);
        i4_rslt_vert_16x4_2 = vmla_n_s16(
            i4_rslt_vert_16x4_2, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))),
            i4_coeff_0);
        vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_1));
        pi2_tmp += i4_filt_stride;
        vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_2));
        pi2_tmp += i4_filt_stride;
        pu1_inp += i4_src_stride;
    }
    /* Last output row (y = 15): forward taps */
    i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
    i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
    i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
    i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp);
    i4_rslt_vert_16x4_1 =
        vmul_n_s16(vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_0);
    i4_rslt_vert_16x4_1 =
        vmla_n_s16(i4_rslt_vert_16x4_1,
                   vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))), i4_coeff_1);
    i4_rslt_vert_16x4_1 =
        vmla_n_s16(i4_rslt_vert_16x4_1,
                   vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))), i4_coeff_2);
    i4_rslt_vert_16x4_1 =
        vmla_n_s16(i4_rslt_vert_16x4_1,
                   vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))), i4_coeff_3);
    vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_1));
    /* Reinitializing the ptrs (pu1_inp is reset for symmetry; only pi2_tmp is
       read below) */
    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    /* ------------------------------------------------------------------ */
    /* Horizontal interpolation: each intermediate row of 12 16-bit sums   */
    /* produces 16 output pixels.  Even outputs use mirrored taps starting */
    /* at sample x, odd outputs use forward taps starting at sample x+1;   */
    /* vzip interleaves the two result vectors into pixel order.  Final    */
    /* scale: (sum + 512) >> 10 (vertical gain 32 * horizontal gain 32),   */
    /* saturated to [0, 255].                                              */
    /* ------------------------------------------------------------------ */
    for(i4_y = 0; i4_y < 16; i4_y++)
    {
        /* Nine overlapping 4-sample windows covering samples 0..11 */
        i4_samp_horz_16x4_0 = vld1_s16(pi2_tmp);
        i4_samp_horz_16x4_1 = vld1_s16(pi2_tmp + 1);
        i4_samp_horz_16x4_2 = vld1_s16(pi2_tmp + 2);
        i4_samp_horz_16x4_3 = vld1_s16(pi2_tmp + 3);
        i4_samp_horz_16x4_4 = vld1_s16(pi2_tmp + 4);
        i4_samp_horz_16x4_5 = vld1_s16(pi2_tmp + 5);
        i4_samp_horz_16x4_6 = vld1_s16(pi2_tmp + 6);
        i4_samp_horz_16x4_7 = vld1_s16(pi2_tmp + 7);
        i4_samp_horz_16x4_8 = vld1_s16(pi2_tmp + 8);
        /* Even outputs 0,2,4,6: s[x]*c3 + s[x+1]*c2 + s[x+2]*c1 + s[x+3]*c0 */
        i4_rslt_horz_r0_1 = vmull_n_s16(i4_samp_horz_16x4_0, i4_coeff_c3);
        i4_rslt_horz_r0_1 = vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_1, i4_coeff_c2);
        i4_rslt_horz_r0_1 = vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_2, i4_coeff_c1);
        i4_rslt_horz_r0_1 = vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_3, i4_coeff_c0);
        /* Odd outputs 1,3,5,7: s[x+1]*c0 + s[x+2]*c1 + s[x+3]*c2 + s[x+4]*c3 */
        i4_rslt_horz_r1_1 = vmull_n_s16(i4_samp_horz_16x4_1, i4_coeff_c0);
        i4_rslt_horz_r1_1 = vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_2, i4_coeff_c1);
        i4_rslt_horz_r1_1 = vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_3, i4_coeff_c2);
        i4_rslt_horz_r1_1 = vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_4, i4_coeff_c3);
        /* Even outputs 8,10,12,14 */
        i4_rslt_horz_r0_2 = vmull_n_s16(i4_samp_horz_16x4_4, i4_coeff_c3);
        i4_rslt_horz_r0_2 = vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_5, i4_coeff_c2);
        i4_rslt_horz_r0_2 = vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_6, i4_coeff_c1);
        i4_rslt_horz_r0_2 = vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_7, i4_coeff_c0);
        /* Odd outputs 9,11,13,15 */
        i4_rslt_horz_r1_2 = vmull_n_s16(i4_samp_horz_16x4_5, i4_coeff_c0);
        i4_rslt_horz_r1_2 = vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_6, i4_coeff_c1);
        i4_rslt_horz_r1_2 = vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_7, i4_coeff_c2);
        i4_rslt_horz_r1_2 = vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_8, i4_coeff_c3);
        /* Interleave even/odd results into pixel order */
        i4_rslt_horz_32x4x2_t = vzipq_s32(i4_rslt_horz_r0_1, i4_rslt_horz_r1_1);
        i4_rslt_horz_r0_1_tmp32 = i4_rslt_horz_32x4x2_t.val[0]; // 0 to 3
        i4_rslt_horz_r1_1_tmp32 = i4_rslt_horz_32x4x2_t.val[1]; // 4 to 7
        i4_rslt_horz_32x4x2_t = vzipq_s32(i4_rslt_horz_r0_2, i4_rslt_horz_r1_2);
        i4_rslt_horz_r0_2_tmp32 = i4_rslt_horz_32x4x2_t.val[0]; // 8 to 11
        i4_rslt_horz_r1_2_tmp32 = i4_rslt_horz_32x4x2_t.val[1]; // 12 to 15
        /* Round, shift down by 10 and saturate-narrow to unsigned 16-bit */
        i4_rslt_horz_r0_1 = vaddq_s32(i4_rslt_horz_r0_1_tmp32, const_512_32x4);
        i4_rslt_horz_r1_1 = vaddq_s32(i4_rslt_horz_r1_1_tmp32, const_512_32x4);
        i4_rslt_horz_r0_2 = vaddq_s32(i4_rslt_horz_r0_2_tmp32, const_512_32x4);
        i4_rslt_horz_r1_2 = vaddq_s32(i4_rslt_horz_r1_2_tmp32, const_512_32x4);
        i4_rslt_horz_r0_1_tmp = vqshrun_n_s32(i4_rslt_horz_r0_1, 10);
        i4_rslt_horz_r1_1_tmp = vqshrun_n_s32(i4_rslt_horz_r1_1, 10);
        i4_rslt_horz_r0_2_tmp = vqshrun_n_s32(i4_rslt_horz_r0_2, 10);
        i4_rslt_horz_r1_2_tmp = vqshrun_n_s32(i4_rslt_horz_r1_2, 10);
        rslt_16x8_t_1 = vcombine_u16(i4_rslt_horz_r0_1_tmp, i4_rslt_horz_r1_1_tmp); // 0 to 7
        rslt_16x8_t_2 = vcombine_u16(i4_rslt_horz_r0_2_tmp, i4_rslt_horz_r1_2_tmp); // 8 to 15
        /* Saturate-narrow to 8-bit and store the 16 output pixels of this row */
        vst1_u8(pu1_out, vqmovn_u16(rslt_16x8_t_1));
        vst1_u8(pu1_out + 8, vqmovn_u16(rslt_16x8_t_2));
        pu1_out += i4_out_stride;
        pi2_tmp += i4_filt_stride;
    }
}
/*****************************************************************************/
/* */
/* Function Name : isvcd_interpolate_intra_base_neonintr */
/* */
/* Description : This function takes the reference array buffer & performs*/
/* interpolation of a component to find the intra */
/* resampled value */
/* Inputs : pv_intra_samp_ctxt : intra sampling context */
/* pu1_out : output buffer pointer */
/* i4_out_stride : output buffer stride */
/* i4_refarray_wd : reference array width */
/* i4_x_offset : offset in reference layer in horz direction*/
/* ps_coord : current mb co-ordinate */
/* i4_chroma_flag : chroma processing flag */
/* Globals : none */
/* Processing : it does the interpolation in vertical direction followed */
/* by horizontal direction */
/* Outputs : resampled pixels */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 26 06 2009 vijayakumar creation */
/* */
/*****************************************************************************/
void isvcd_interpolate_intra_base_neonintr(void *pv_intra_samp_ctxt, UWORD8 *pu1_out,
WORD32 i4_out_stride, WORD32 i4_refarray_wd,
WORD32 i4_mb_x, WORD32 i4_mb_y, WORD32 i4_chroma_flag,
WORD32 i4_refarray_flag)
{
/* --------------------------------------------------------------------- */
/* Index Parameters */
/* --------------------------------------------------------------------- */
intra_sampling_ctxt_t *ps_ctxt;
intra_samp_map_ctxt_t *ps_map_ctxt;
intra_samp_lyr_ctxt *ps_lyr_ctxt;
WORD32 i4_x, i4_y;
WORD32 i4_frm_mb_x, i4_frm_mb_y;
UWORD8 *pu1_refarray = NULL;
ref_pixel_map_t *ps_x_pos_phase;
ref_pixel_map_t *ps_y_pos_phase;
WORD32 i4_temp_array_ht;
WORD32 *pi4_interp_buff;
UWORD8 arr_y_ref_pos_luma[16] = {0};
UWORD8 arr_x_ref_pos_luma[16] = {0};
UWORD8 arr_x_ref_pos_luma_low[16] = {0};
UWORD8 arr_x_ref_pos_luma_high[16] = {0};
UWORD8 arr_phase_luma[16] = {0};
UWORD8 *pi4_y_ref_pos_luma;
UWORD8 *pi4_x_ref_pos_luma_low;
UWORD8 *pi4_x_ref_pos_luma_high;
UWORD8 *pi4_phase_luma;
WORD16 *pi2_interp_buff_temp;
WORD32 i4_mb_wd;
WORD32 i4_mb_ht;
WORD32 i4_x_min;
ref_min_max_map_t *ps_x_min_max;
UWORD8 *pu1_refarray_temp;
ps_ctxt = (intra_sampling_ctxt_t *) pv_intra_samp_ctxt;
ps_lyr_ctxt = &ps_ctxt->as_res_lyrs[ps_ctxt->i4_res_lyr_id];
if(0 == i4_refarray_flag)
{
pu1_refarray = ps_ctxt->pu1_refarray_buffer;
}
else if(1 == i4_refarray_flag)
{
pu1_refarray = ps_ctxt->pu1_refarray_cb;
}
/* --------------------------------------------------------------------- */
/* LUMA or CHROMA */
/* --------------------------------------------------------------------- */
if(1 == i4_chroma_flag)
ps_map_ctxt = &(ps_lyr_ctxt->s_chroma_map_ctxt);
else
ps_map_ctxt = &(ps_lyr_ctxt->s_luma_map_ctxt);
i4_mb_wd = MB_WIDTH >> i4_chroma_flag;
i4_mb_ht = MB_HEIGHT >> i4_chroma_flag;
ps_x_min_max = ps_map_ctxt->ps_x_min_max;
i4_frm_mb_y = i4_mb_y * i4_mb_ht;
i4_frm_mb_x = i4_mb_x * i4_mb_wd;
/* get the min and max positions */
i4_x_min = ps_x_min_max[i4_mb_x].i2_min_pos;
/* --------------------------------------------------------------------- */
/* Projected frame level pointers */
/* --------------------------------------------------------------------- */
ps_x_pos_phase = ps_map_ctxt->ps_x_pos_phase;
ps_y_pos_phase = ps_map_ctxt->ps_y_pos_phase;
/* --------------------------------------------------------------------- */
/* Pointers and Dimenstion of the temporary buffer */
/* --------------------------------------------------------------------- */
i4_temp_array_ht = i4_mb_ht;
pi4_interp_buff = ps_ctxt->pi4_temp_interpolation_buffer;
pi2_interp_buff_temp = (WORD16 *) pi4_interp_buff;
/* --------------------------------------------------------------------- */
/* Loop for interpolation in vertical direction */
/* --------------------------------------------------------------------- */
if(i4_chroma_flag == 0)
{
{
uint8x8_t inp_8x8_r0, inp_8x8_r0_1;
uint8x8_t inp_8x8_r1, inp_8x8_r1_1;
uint8x8_t inp_8x8_r2, inp_8x8_r2_1;
uint8x8_t inp_8x8_r3, inp_8x8_r3_1;
int16x8_t out_res_16x8_r0_0, out_res_16x8_r0_1;
for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
{
arr_phase_luma[i4_y] = (UWORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
arr_y_ref_pos_luma[i4_y] = (UWORD8) (ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos);
}
pi4_y_ref_pos_luma = arr_y_ref_pos_luma;
pi4_phase_luma = arr_phase_luma;
for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
{
pu1_refarray_temp =
pu1_refarray + (pi4_y_ref_pos_luma[i4_y] * i4_refarray_wd) + (i4_x_min - 1);
inp_8x8_r0 = vld1_u8((pu1_refarray_temp - i4_refarray_wd));
inp_8x8_r1 = vld1_u8((pu1_refarray_temp));
inp_8x8_r2 = vld1_u8((pu1_refarray_temp + i4_refarray_wd));
inp_8x8_r3 = vld1_u8((pu1_refarray_temp + 2 * i4_refarray_wd));
inp_8x8_r0_1 = vld1_u8((pu1_refarray_temp + 8 - i4_refarray_wd));
inp_8x8_r1_1 = vld1_u8((pu1_refarray_temp + 8));
inp_8x8_r2_1 = vld1_u8((pu1_refarray_temp + 8 + i4_refarray_wd));
inp_8x8_r3_1 = vld1_u8((pu1_refarray_temp + 8 + 2 * i4_refarray_wd));
out_res_16x8_r0_0 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0)),
g_ai1_interp_filter_luma[pi4_phase_luma[i4_y]]);
out_res_16x8_r0_0 =
vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1)),
g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 16]);
out_res_16x8_r0_0 =
vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r2)),
g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 32]);
out_res_16x8_r0_0 =
vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r3)),
g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 48]);
out_res_16x8_r0_1 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0_1)),
g_ai1_interp_filter_luma[pi4_phase_luma[i4_y]]);
out_res_16x8_r0_1 =
vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1_1)),
g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 16]);
out_res_16x8_r0_1 =
vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r2_1)),
g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 32]);
out_res_16x8_r0_1 =
vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r3_1)),
g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 48]);
vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1)),
out_res_16x8_r0_0);
vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1) + 8),
out_res_16x8_r0_1);
}
}
/*Horizontal Interpolation*/
{
WORD32 strt_indx = 10;
uint8x16_t phs_mask_8x8_0;
uint8x16_t x_ref_pos_luma_mask_r0_0;
uint8x16_t x_ref_pos_luma_mask_r0_1;
uint8x16_t x_ref_pos_luma_mask_r1_0;
uint8x16_t x_ref_pos_luma_mask_r1_1;
uint8x16_t x_ref_pos_luma_mask_r2_0;
uint8x16_t x_ref_pos_luma_mask_r2_1;
uint8x16_t x_ref_pos_luma_mask_r3_0;
uint8x16_t x_ref_pos_luma_mask_r3_1;
WORD32 strt_indx_h = 0, i4_x2 = 0;
WORD32 i4_mb_wd_hlf = (i4_mb_wd >> 1);
uint8x16_t twos = vdupq_n_u8(2);
strt_indx = ps_x_pos_phase[0 + i4_frm_mb_x].i2_ref_pos - 1;
strt_indx_h = (ps_x_pos_phase[8 + i4_frm_mb_x].i2_ref_pos - strt_indx - 1);
for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
{
arr_x_ref_pos_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
arr_phase_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
arr_x_ref_pos_luma[i4_x] = arr_x_ref_pos_luma[i4_x] - strt_indx - 1;
}
for(i4_x = 0; i4_x < i4_mb_wd_hlf; i4_x++)
{
i4_x2 = i4_x << 1;
arr_x_ref_pos_luma_low[i4_x2] = (arr_x_ref_pos_luma[i4_x]) << 1;
arr_x_ref_pos_luma_low[i4_x2 + 1] = arr_x_ref_pos_luma_low[i4_x2] + 1;
}
for(i4_x = i4_mb_wd_hlf; i4_x < i4_mb_wd; i4_x++)
{
i4_x2 = (i4_x - i4_mb_wd_hlf) << 1;
arr_x_ref_pos_luma_high[i4_x2] = ((arr_x_ref_pos_luma[i4_x] - strt_indx_h) << 1);
arr_x_ref_pos_luma_high[i4_x2 + 1] = arr_x_ref_pos_luma_high[i4_x2] + 1;
}
pi4_x_ref_pos_luma_low = arr_x_ref_pos_luma_low;
pi4_x_ref_pos_luma_high = arr_x_ref_pos_luma_high;
pi4_phase_luma = arr_phase_luma;
phs_mask_8x8_0 = vld1q_u8((const uint8_t *) pi4_phase_luma);
x_ref_pos_luma_mask_r0_0 = vld1q_u8(pi4_x_ref_pos_luma_low);
x_ref_pos_luma_mask_r0_1 = vld1q_u8(pi4_x_ref_pos_luma_high);
x_ref_pos_luma_mask_r1_0 = vaddq_u8(x_ref_pos_luma_mask_r0_0, twos);
x_ref_pos_luma_mask_r1_1 = vaddq_u8(x_ref_pos_luma_mask_r0_1, twos);
x_ref_pos_luma_mask_r2_0 = vaddq_u8(x_ref_pos_luma_mask_r1_0, twos);
x_ref_pos_luma_mask_r2_1 = vaddq_u8(x_ref_pos_luma_mask_r1_1, twos);
x_ref_pos_luma_mask_r3_0 = x_ref_pos_luma_mask_r0_0;
x_ref_pos_luma_mask_r3_1 = x_ref_pos_luma_mask_r0_1;
{
int8x16_t ip_filt_8x16_r0;
int8x16_t ip_filt_8x16_r1;
int8x16_t ip_filt_8x16_r2;
int8x16_t ip_filt_8x16_r3;
int16x8_t ip_filt_16x8_r0_0, ip_filt_16x8_r0_1;
int16x8_t ip_filt_16x8_r1_0, ip_filt_16x8_r1_1;
int16x8_t ip_filt_16x8_r2_0, ip_filt_16x8_r2_1;
int16x8_t ip_filt_16x8_r3_0, ip_filt_16x8_r3_1;
int16x8_t inp_16x8_0;
int16x8_t inp_16x8_1;
int16x8_t inp_16x8_2;
int16x8_t inp_16x8_3;
int16x8_t inp_16x8_r0_0, inp_16x8_r2_0;
int16x8_t inp_16x8_r0_1, inp_16x8_r2_1;
int16x8_t inp_16x8_r1_0, inp_16x8_r3_0;
int16x8_t inp_16x8_r1_1, inp_16x8_r3_1;
int16x4_t inp_16x4_r0_0, inp_16x4_r2_0;
int16x4_t inp_16x4_r0_1, inp_16x4_r2_1;
int16x4_t inp_16x4_r1_0, inp_16x4_r3_0;
int16x4_t inp_16x4_r1_1, inp_16x4_r3_1;
int32x4_t out_res_32x4_r0_l_0;
int32x4_t out_res_32x4_r0_l_1;
int32x4_t out_res_32x4_r0_h_0;
int32x4_t out_res_32x4_r0_h_1;
uint16x4_t out_res_16x4_r0_l_0;
uint16x4_t out_res_16x4_r0_l_1;
uint16x4_t out_res_16x4_r0_h_0;
uint16x4_t out_res_16x4_r0_h_1;
uint8x8_t out_res_8x8_r0_l, out_res_8x8_r0_h;
uint8x8x2_t u1_temp_8x8x2_t;
uint8x8_t u1_temp_8x8_t0, u1_temp_8x8_t1;
ip_filt_8x16_r0 = vld1q_s8((g_ai1_interp_filter_luma));
ip_filt_8x16_r1 = vld1q_s8((g_ai1_interp_filter_luma + 16));
ip_filt_8x16_r2 = vld1q_s8((g_ai1_interp_filter_luma + 32));
ip_filt_8x16_r3 = vld1q_s8((g_ai1_interp_filter_luma + 48));
u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r0));
u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r0));
u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0));
u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0));
ip_filt_8x16_r0 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0),
vreinterpret_s8_u8(u1_temp_8x8_t1));
u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r1));
u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r1));
u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0));
u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0));
ip_filt_8x16_r1 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0),
vreinterpret_s8_u8(u1_temp_8x8_t1));
u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r2));
u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r2));
u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0));
u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0));
ip_filt_8x16_r2 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0),
vreinterpret_s8_u8(u1_temp_8x8_t1));
u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r3));
u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r3));
u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0));
u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0));
ip_filt_8x16_r3 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0),
vreinterpret_s8_u8(u1_temp_8x8_t1));
ip_filt_16x8_r0_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r0));
ip_filt_16x8_r1_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r1));
ip_filt_16x8_r2_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r2));
ip_filt_16x8_r3_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r3));
ip_filt_16x8_r0_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r0));
ip_filt_16x8_r1_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r1));
ip_filt_16x8_r2_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r2));
ip_filt_16x8_r3_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r3));
for(i4_y = 0; i4_y < i4_temp_array_ht; i4_y++)
{
inp_16x8_0 = vld1q_s16((pi2_interp_buff_temp + strt_indx));
inp_16x8_1 = vld1q_s16((pi2_interp_buff_temp + strt_indx + strt_indx_h));
inp_16x8_2 = vld1q_s16((pi2_interp_buff_temp + strt_indx + 3));
inp_16x8_3 = vld1q_s16((pi2_interp_buff_temp + strt_indx + strt_indx_h + 3));
pi2_interp_buff_temp += i4_refarray_wd;
u1_temp_8x8x2_t.val[0] =
vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0)));
u1_temp_8x8x2_t.val[1] =
vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0)));
u1_temp_8x8_t0 =
vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r0_0));
u1_temp_8x8_t1 =
vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r0_0));
inp_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8(
vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
u1_temp_8x8x2_t.val[0] =
vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_1)));
u1_temp_8x8x2_t.val[1] =
vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_1)));
u1_temp_8x8_t0 =
vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r0_1));
u1_temp_8x8_t1 =
vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r0_1));
inp_16x8_r0_1 = vreinterpretq_s16_s8(vcombine_s8(
vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
u1_temp_8x8x2_t.val[0] =
vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0)));
u1_temp_8x8x2_t.val[1] =
vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0)));
u1_temp_8x8_t0 =
vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r1_0));
u1_temp_8x8_t1 =
vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r1_0));
inp_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8(
vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
u1_temp_8x8x2_t.val[0] =
vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_1)));
u1_temp_8x8x2_t.val[1] =
vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_1)));
u1_temp_8x8_t0 =
vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r1_1));
u1_temp_8x8_t1 =
vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r1_1));
inp_16x8_r1_1 = vreinterpretq_s16_s8(vcombine_s8(
vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
u1_temp_8x8x2_t.val[0] =
vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0)));
u1_temp_8x8x2_t.val[1] =
vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0)));
u1_temp_8x8_t0 =
vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r2_0));
u1_temp_8x8_t1 =
vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r2_0));
inp_16x8_r2_0 = vreinterpretq_s16_s8(vcombine_s8(
vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
u1_temp_8x8x2_t.val[0] =
vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_1)));
u1_temp_8x8x2_t.val[1] =
vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_1)));
u1_temp_8x8_t0 =
vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r2_1));
u1_temp_8x8_t1 =
vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r2_1));
inp_16x8_r2_1 = vreinterpretq_s16_s8(vcombine_s8(
vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
u1_temp_8x8x2_t.val[0] =
vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_2)));
u1_temp_8x8x2_t.val[1] =
vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_2)));
u1_temp_8x8_t0 =
vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r3_0));
u1_temp_8x8_t1 =
vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r3_0));
inp_16x8_r3_0 = vreinterpretq_s16_s8(vcombine_s8(
vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
u1_temp_8x8x2_t.val[0] =
vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_3)));
u1_temp_8x8x2_t.val[1] =
vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_3)));
u1_temp_8x8_t0 =
vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r3_1));
u1_temp_8x8_t1 =
vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r3_1));
inp_16x8_r3_1 = vreinterpretq_s16_s8(vcombine_s8(
vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
inp_16x4_r0_0 = vget_low_s16(inp_16x8_r0_0);
inp_16x4_r0_1 = vget_low_s16(inp_16x8_r0_1);
inp_16x4_r1_0 = vget_low_s16(inp_16x8_r1_0);
inp_16x4_r1_1 = vget_low_s16(inp_16x8_r1_1);
inp_16x4_r2_0 = vget_low_s16(inp_16x8_r2_0);
inp_16x4_r2_1 = vget_low_s16(inp_16x8_r2_1);
inp_16x4_r3_0 = vget_low_s16(inp_16x8_r3_0);
inp_16x4_r3_1 = vget_low_s16(inp_16x8_r3_1);
out_res_32x4_r0_l_0 = vmull_s16(inp_16x4_r0_0, vget_low_s16(ip_filt_16x8_r0_0));
out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r1_0,
vget_low_s16(ip_filt_16x8_r1_0));
out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r2_0,
vget_low_s16(ip_filt_16x8_r2_0));
out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r3_0,
vget_low_s16(ip_filt_16x8_r3_0));
out_res_32x4_r0_l_1 =
vmull_s16(vget_high_s16(inp_16x8_r0_0), vget_high_s16(ip_filt_16x8_r0_0));
out_res_32x4_r0_l_1 =
vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r1_0),
vget_high_s16(ip_filt_16x8_r1_0));
out_res_32x4_r0_l_1 =
vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r2_0),
vget_high_s16(ip_filt_16x8_r2_0));
out_res_32x4_r0_l_1 =
vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r3_0),
vget_high_s16(ip_filt_16x8_r3_0));
out_res_32x4_r0_h_0 = vmull_s16(inp_16x4_r0_1, vget_low_s16(ip_filt_16x8_r0_1));
out_res_32x4_r0_h_0 = vmlal_s16(out_res_32x4_r0_h_0, inp_16x4_r1_1,
vget_low_s16(ip_filt_16x8_r1_1));
out_res_32x4_r0_h_0 = vmlal_s16(out_res_32x4_r0_h_0, inp_16x4_r2_1,
vget_low_s16(ip_filt_16x8_r2_1));
out_res_32x4_r0_h_0 = vmlal_s16(out_res_32x4_r0_h_0, inp_16x4_r3_1,
vget_low_s16(ip_filt_16x8_r3_1));
out_res_32x4_r0_h_1 =
vmull_s16(vget_high_s16(inp_16x8_r0_1), vget_high_s16(ip_filt_16x8_r0_1));
out_res_32x4_r0_h_1 =
vmlal_s16(out_res_32x4_r0_h_1, vget_high_s16(inp_16x8_r1_1),
vget_high_s16(ip_filt_16x8_r1_1));
out_res_32x4_r0_h_1 =
vmlal_s16(out_res_32x4_r0_h_1, vget_high_s16(inp_16x8_r2_1),
vget_high_s16(ip_filt_16x8_r2_1));
out_res_32x4_r0_h_1 =
vmlal_s16(out_res_32x4_r0_h_1, vget_high_s16(inp_16x8_r3_1),
vget_high_s16(ip_filt_16x8_r3_1));
out_res_16x4_r0_l_0 = vqrshrun_n_s32(out_res_32x4_r0_l_0, 10);
out_res_16x4_r0_l_1 = vqrshrun_n_s32(out_res_32x4_r0_l_1, 10);
out_res_16x4_r0_h_0 = vqrshrun_n_s32(out_res_32x4_r0_h_0, 10);
out_res_16x4_r0_h_1 = vqrshrun_n_s32(out_res_32x4_r0_h_1, 10);
out_res_8x8_r0_l =
vqmovn_u16(vcombine_u16(out_res_16x4_r0_l_0, out_res_16x4_r0_l_1));
out_res_8x8_r0_h =
vqmovn_u16(vcombine_u16(out_res_16x4_r0_h_0, out_res_16x4_r0_h_1));
vst1q_u8((pu1_out + (i4_y * i4_out_stride)),
vcombine_u8(out_res_8x8_r0_l, out_res_8x8_r0_h));
}
}
}
}
else
{
for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
{
arr_y_ref_pos_luma[i4_y] = (UWORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos;
arr_phase_luma[i4_y] = (UWORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
}
pi4_y_ref_pos_luma = arr_y_ref_pos_luma;
pi4_phase_luma = arr_phase_luma;
{
uint8x8_t inp_8x8_r0, inp_8x8_r0_1;
uint8x8_t inp_8x8_r1, inp_8x8_r1_1;
int16x8_t out_res_16x8_r0_0, out_res_16x8_r0_1;
for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
{
pu1_refarray_temp =
pu1_refarray + (pi4_y_ref_pos_luma[i4_y] * i4_refarray_wd) + (i4_x_min - 1);
inp_8x8_r0 = vld1_u8((pu1_refarray_temp));
inp_8x8_r1 = vld1_u8((pu1_refarray_temp + i4_refarray_wd));
inp_8x8_r0_1 = vld1_u8((pu1_refarray_temp + 8));
inp_8x8_r1_1 = vld1_u8((pu1_refarray_temp + 8 + i4_refarray_wd));
out_res_16x8_r0_0 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0)),
g_au1_interp_filter_chroma[pi4_phase_luma[i4_y]]);
out_res_16x8_r0_0 =
vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1)),
g_au1_interp_filter_chroma[pi4_phase_luma[i4_y] + 16]);
out_res_16x8_r0_1 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0_1)),
g_au1_interp_filter_chroma[pi4_phase_luma[i4_y]]);
out_res_16x8_r0_1 =
vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1_1)),
g_au1_interp_filter_chroma[pi4_phase_luma[i4_y] + 16]);
vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1)),
out_res_16x8_r0_0);
vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1) + 8),
out_res_16x8_r0_1);
}
}
{
WORD32 strt_indx = 10;
uint8x16_t phs_mask_8x8_0;
uint8x16_t x_ref_pos_luma_mask_r0_0;
uint8x16_t x_ref_pos_luma_mask_r1_0;
WORD32 i4_x2 = 0;
uint8x16_t twos = vdupq_n_u8(2);
strt_indx = ps_x_pos_phase[0 + i4_frm_mb_x].i2_ref_pos;
for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
{
arr_x_ref_pos_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
arr_phase_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
arr_x_ref_pos_luma[i4_x] = arr_x_ref_pos_luma[i4_x] - strt_indx;
i4_x2 = i4_x << 1;
arr_x_ref_pos_luma_low[i4_x2] = (arr_x_ref_pos_luma[i4_x]) << 1;
arr_x_ref_pos_luma_low[i4_x2 + 1] = arr_x_ref_pos_luma_low[i4_x2] + 1;
}
pi4_x_ref_pos_luma_low = arr_x_ref_pos_luma_low;
pi4_phase_luma = arr_phase_luma;
phs_mask_8x8_0 = vld1q_u8(pi4_phase_luma);
x_ref_pos_luma_mask_r0_0 = vld1q_u8(pi4_x_ref_pos_luma_low);
x_ref_pos_luma_mask_r1_0 = vaddq_u8(x_ref_pos_luma_mask_r0_0, twos);
{
uint8x16_t ip_filt_8x16_r0;
uint8x16_t ip_filt_8x16_r1;
int16x8_t ip_filt_16x8_r0_0;
int16x8_t ip_filt_16x8_r1_0;
int16x8_t inp_16x8_0;
int16x8_t inp_16x8_r0_0;
int16x8_t inp_16x8_r1_0;
int16x4_t inp_16x4_r0_0;
int16x4_t inp_16x4_r1_0;
int32x4_t out_res_32x4_r0_l_0;
int32x4_t out_res_32x4_r0_l_1;
uint16x4_t out_res_16x4_r0_l_0;
uint16x4_t out_res_16x4_r0_l_1;
uint16x8_t out_res_16x8_r0_l;
uint8x16_t out_8x16_r0;
uint8x8x2_t u1_incr_8x8x2_t;
uint8x8_t u1_incr_8x8_t0, u1_incr_8x8_t1;
uint8x8x2_t u1_temp_8x8x2_t;
uint8x8_t u1_temp_8x8_t0, u1_temp_8x8_t1;
uint8x16_t chroma_mask_8x16 = vreinterpretq_u8_u16(vdupq_n_u16(0x00ff));
ip_filt_8x16_r0 = vld1q_u8((g_au1_interp_filter_chroma));
ip_filt_8x16_r1 = vld1q_u8((g_au1_interp_filter_chroma + 16));
u1_incr_8x8x2_t.val[0] = vget_low_u8(ip_filt_8x16_r0);
u1_incr_8x8x2_t.val[1] = vget_high_u8(ip_filt_8x16_r0);
u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(phs_mask_8x8_0));
u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(phs_mask_8x8_0));
ip_filt_8x16_r0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1);
u1_incr_8x8x2_t.val[0] = vget_low_u8(ip_filt_8x16_r1);
u1_incr_8x8x2_t.val[1] = vget_high_u8(ip_filt_8x16_r1);
u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(phs_mask_8x8_0));
u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(phs_mask_8x8_0));
ip_filt_8x16_r1 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1);
ip_filt_16x8_r0_0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(ip_filt_8x16_r0)));
ip_filt_16x8_r1_0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(ip_filt_8x16_r1)));
for(i4_y = 0; i4_y < i4_temp_array_ht; i4_y++)
{
inp_16x8_0 = vld1q_s16((pi2_interp_buff_temp + strt_indx));
pi2_interp_buff_temp += i4_refarray_wd;
u1_temp_8x8x2_t.val[0] =
vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0)));
u1_temp_8x8x2_t.val[1] =
vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0)));
u1_temp_8x8_t0 =
vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r0_0));
u1_temp_8x8_t1 =
vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r0_0));
inp_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8(
vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
u1_temp_8x8x2_t.val[0] =
vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0)));
u1_temp_8x8x2_t.val[1] =
vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0)));
u1_temp_8x8_t0 =
vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r1_0));
u1_temp_8x8_t1 =
vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r1_0));
inp_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8(
vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
inp_16x4_r0_0 = vget_low_s16(inp_16x8_r0_0);
inp_16x4_r1_0 = vget_low_s16(inp_16x8_r1_0);
out_res_32x4_r0_l_0 = vmull_s16(inp_16x4_r0_0, vget_low_s16(ip_filt_16x8_r0_0));
out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r1_0,
vget_low_s16(ip_filt_16x8_r1_0));
out_res_32x4_r0_l_1 =
vmull_s16(vget_high_s16(inp_16x8_r0_0), vget_high_s16(ip_filt_16x8_r0_0));
out_res_32x4_r0_l_1 =
vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r1_0),
vget_high_s16(ip_filt_16x8_r1_0));
out_res_16x4_r0_l_0 = vqrshrun_n_s32(out_res_32x4_r0_l_0, 10);
out_res_16x4_r0_l_1 = vqrshrun_n_s32(out_res_32x4_r0_l_1, 10);
out_res_16x8_r0_l = vcombine_u16(out_res_16x4_r0_l_0, out_res_16x4_r0_l_1);
out_8x16_r0 = vld1q_u8(pu1_out + (i4_y * i4_out_stride));
out_8x16_r0 = vbslq_u8(chroma_mask_8x16,
vreinterpretq_u8_u16(out_res_16x8_r0_l), out_8x16_r0);
vst1q_u8((pu1_out + (i4_y * i4_out_stride)), out_8x16_r0);
}
}
}
}
return;
} /* End of Interpolation Function */
/*****************************************************************************/
/* */
/* Function Name : isvcd_horz_interpol_chroma_dyadic_1_neonintr */
/* */
/* Description : This function takes the reference array buffer & performs*/
/* interpolation of a component to find the intra */
/* resampled value */
/*  Inputs         : pi2_tmp_filt_buf : ptr to the vertically             */
/*                                      interpolated data                 */
/*                   pu1_out_buf : output buffer pointer                  */
/*                   i4_out_stride : output buffer stride                 */
/*                   i4_phase_0 : x phase for even values of x            */
/*                   i4_phase_1 : x phase for odd values of x             */
/* Globals : none */
/* Processing : it does the interpolation on horizontal direction */
/* Outputs : resampled pixels */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 26 06 2009 vijayakumar creation */
/* */
/*****************************************************************************/
void isvcd_horz_interpol_chroma_dyadic_1_neonintr(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
                                                  WORD32 i4_out_stride, WORD32 i4_phase_0,
                                                  WORD32 i4_phase_1)
{
    /* Horizontal 2-tap interpolation of the vertically filtered chroma samples:
     * even output columns apply taps (8 - phase_0, phase_0) on samples (x, x + 1),
     * odd output columns apply taps (8 - phase_1, phase_1) on samples (x + 1, x + 2).
     * Each result is rounding-shifted right by 6 and merged into alternate bytes
     * of the interleaved output rows (mask 0x00ff per 16-bit lane).
     */
    WORD32 i4_y;
    WORD32 i4_c0, i4_c1, i4_c2, i4_c3;
    WORD16 *pi2_row;
    UWORD8 *pu1_dst;
    const WORD32 i4_tmp_stride = 6; /* width of the intermediate buffer */
    const uint8x16_t even_byte_mask = vreinterpretq_u8_u16(vdupq_n_u16(0x00ff));

    i4_c0 = 8 - i4_phase_0;
    i4_c1 = i4_phase_0;
    i4_c2 = 8 - i4_phase_1;
    i4_c3 = i4_phase_1;
    pi2_row = pi2_tmp_filt_buf;
    pu1_dst = pu1_out_buf;

    /* Two output rows are produced per iteration */
    for(i4_y = 0; i4_y < 8; i4_y += 2)
    {
        int16x8_t s_a0, s_a1, s_a2; /* row y, loaded at offsets x, x + 1, x + 2 */
        int16x8_t s_b0, s_b1, s_b2; /* row y + 1 */
        int16x8_t acc_even_a, acc_odd_a, acc_even_b, acc_odd_b;
        int16x8_t res_a, res_b;
        uint8x16_t dst_a, dst_b;

        s_a0 = vld1q_s16(pi2_row);
        s_a1 = vld1q_s16(pi2_row + 1);
        s_a2 = vld1q_s16(pi2_row + 2);
        s_b0 = vld1q_s16(pi2_row + i4_tmp_stride);
        s_b1 = vld1q_s16(pi2_row + i4_tmp_stride + 1);
        s_b2 = vld1q_s16(pi2_row + i4_tmp_stride + 2);

        /* even columns: s[x]*c0 + s[x+1]*c1 ; odd columns: s[x+1]*c2 + s[x+2]*c3 */
        acc_even_a = vmlaq_n_s16(vmulq_n_s16(s_a0, i4_c0), s_a1, i4_c1);
        acc_odd_a = vmlaq_n_s16(vmulq_n_s16(s_a1, i4_c2), s_a2, i4_c3);
        acc_even_b = vmlaq_n_s16(vmulq_n_s16(s_b0, i4_c0), s_b1, i4_c1);
        acc_odd_b = vmlaq_n_s16(vmulq_n_s16(s_b1, i4_c2), s_b2, i4_c3);

        /* Interleave even/odd column results, then round and normalize (>> 6) */
        res_a = vrshrq_n_s16(vzipq_s16(acc_even_a, acc_odd_a).val[0], 6);
        res_b = vrshrq_n_s16(vzipq_s16(acc_even_b, acc_odd_b).val[0], 6);

        /* Read-modify-write so the other component's bytes stay untouched */
        dst_a = vld1q_u8(pu1_dst);
        dst_b = vld1q_u8(pu1_dst + i4_out_stride);
        dst_a = vbslq_u8(even_byte_mask, vreinterpretq_u8_s16(res_a), dst_a);
        dst_b = vbslq_u8(even_byte_mask, vreinterpretq_u8_s16(res_b), dst_b);
        vst1q_u8(pu1_dst, dst_a);
        vst1q_u8(pu1_dst + i4_out_stride, dst_b);

        pi2_row += (i4_tmp_stride << 1);
        pu1_dst += (i4_out_stride << 1);
    } /* End of loop over y */
}
/*****************************************************************************/
/* */
/* Function Name : isvcd_horz_interpol_chroma_dyadic_2_neonintr */
/* */
/* Description : This function takes the reference array buffer & performs*/
/*                   horizontal intra resampling for dyadic scaling       */
/*                   ratios for chroma for the following                  */
/*                   ref_lyr_chroma_phase_x_plus1_flag and                */
/*                   chroma_phase_x_plus1_flag:                           */
/*                       ref_lyr         cur_lyr                          */
/*                          0               1                             */
/*                          0               2                             */
/*  Inputs         : pi2_tmp_filt_buf : ptr to the vertically             */
/*                                      interpolated data                 */
/*                   pu1_out_buf : output buffer pointer                  */
/*                   i4_out_stride : output buffer stride                 */
/*                   i4_phase_0 : x phase for even values of x            */
/*                   i4_phase_1 : x phase for odd values of x             */
/*  Globals        : none                                                 */
/*  Processing     : it does the interpolation in horizontal direction    */
/*  Outputs        : horizontally resampled samples                       */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 21 05 2021 Dolan creation */
/* */
/*****************************************************************************/
void isvcd_horz_interpol_chroma_dyadic_2_neonintr(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
                                                  WORD32 i4_out_stride, WORD32 i4_phase_0,
                                                  WORD32 i4_phase_1)
{
    /* Horizontal 2-tap interpolation of the vertically filtered chroma samples.
     * For this phase combination both the even and the odd output columns read
     * the same sample pair (x, x + 1); only the tap weights differ:
     * even columns use (8 - phase_0, phase_0), odd columns (8 - phase_1, phase_1).
     * Results are rounding-shifted right by 6 and merged into alternate bytes
     * of the interleaved output rows (mask 0x00ff per 16-bit lane).
     */
    WORD32 i4_y;
    WORD32 i4_c0, i4_c1, i4_c2, i4_c3;
    WORD16 *pi2_row;
    UWORD8 *pu1_dst;
    const WORD32 i4_tmp_stride = 6; /* width of the intermediate buffer */
    const uint8x16_t even_byte_mask = vreinterpretq_u8_u16(vdupq_n_u16(0x00ff));

    i4_c0 = 8 - i4_phase_0;
    i4_c1 = i4_phase_0;
    i4_c2 = 8 - i4_phase_1;
    i4_c3 = i4_phase_1;
    /* filtering starts one sample into the intermediate buffer */
    pi2_row = pi2_tmp_filt_buf + 1;
    pu1_dst = pu1_out_buf;

    /* Two output rows are produced per iteration */
    for(i4_y = 0; i4_y < 8; i4_y += 2)
    {
        int16x8_t s_a0, s_a1; /* row y, loaded at offsets x and x + 1 */
        int16x8_t s_b0, s_b1; /* row y + 1 */
        int16x8_t acc_even_a, acc_odd_a, acc_even_b, acc_odd_b;
        int16x8_t res_a, res_b;
        uint8x16_t dst_a, dst_b;

        s_a0 = vld1q_s16(pi2_row);
        s_a1 = vld1q_s16(pi2_row + 1);
        s_b0 = vld1q_s16(pi2_row + i4_tmp_stride);
        s_b1 = vld1q_s16(pi2_row + i4_tmp_stride + 1);

        /* even columns: s[x]*c0 + s[x+1]*c1 ; odd columns: s[x]*c2 + s[x+1]*c3 */
        acc_even_a = vmlaq_n_s16(vmulq_n_s16(s_a0, i4_c0), s_a1, i4_c1);
        acc_odd_a = vmlaq_n_s16(vmulq_n_s16(s_a0, i4_c2), s_a1, i4_c3);
        acc_even_b = vmlaq_n_s16(vmulq_n_s16(s_b0, i4_c0), s_b1, i4_c1);
        acc_odd_b = vmlaq_n_s16(vmulq_n_s16(s_b0, i4_c2), s_b1, i4_c3);

        /* Interleave even/odd column results, then round and normalize (>> 6) */
        res_a = vrshrq_n_s16(vzipq_s16(acc_even_a, acc_odd_a).val[0], 6);
        res_b = vrshrq_n_s16(vzipq_s16(acc_even_b, acc_odd_b).val[0], 6);

        /* Read-modify-write so the other component's bytes stay untouched */
        dst_a = vld1q_u8(pu1_dst);
        dst_b = vld1q_u8(pu1_dst + i4_out_stride);
        dst_a = vbslq_u8(even_byte_mask, vreinterpretq_u8_s16(res_a), dst_a);
        dst_b = vbslq_u8(even_byte_mask, vreinterpretq_u8_s16(res_b), dst_b);
        vst1q_u8(pu1_dst, dst_a);
        vst1q_u8(pu1_dst + i4_out_stride, dst_b);

        pi2_row += (i4_tmp_stride << 1);
        pu1_dst += (i4_out_stride << 1);
    } /* End of loop over y */
}
/*****************************************************************************/
/* */
/* Function Name : isvcd_vert_interpol_chroma_dyadic_1_neonintr */
/* */
/* Description : This function takes the reference array buffer & performs*/
/* vertical intra resampling for dyadic scaling ratios for */
/* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
/* chroma_phase_y_plus1: */
/* ref_lyr cur_lyr */
/* 2 0 */
/* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */
/* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold */
/* vertically interpolated data */
/* i4_phase_0 : y phase for even values of y */
/* i4_phase_1 : y phase for odd values of y */
/* Globals : none */
/* Processing : it does the interpolation in vertical direction */
/* Outputs : vertically resampled samples */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 21 05 2021 Dolan creation */
/* */
/*****************************************************************************/
void isvcd_vert_interpol_chroma_dyadic_1_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                                  WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    /* Vertical 2-tap interpolation producing a 6x8 (width x height) block of
     * 16-bit intermediate samples from six 8-bit reference rows.
     * Output row j interpolates reference rows ((j + 1) >> 1) and
     * ((j + 1) >> 1) + 1; even rows use taps (8 - phase_0, phase_0),
     * odd rows use taps (8 - phase_1, phase_1).
     */
    WORD32 i4_j;
    WORD32 i4_c_even_0, i4_c_even_1, i4_c_odd_0, i4_c_odd_1;
    UWORD8 *pu1_src;
    WORD16 *pi2_dst;
    int16x8_t ref_rows[6]; /* widened reference rows */
    int16x8_t i4_rslt;

    i4_c_even_0 = 8 - i4_phase_0;
    i4_c_even_1 = i4_phase_0;
    i4_c_odd_0 = 8 - i4_phase_1;
    i4_c_odd_1 = i4_phase_1;
    pu1_src = pu1_inp_buf;
    pi2_dst = pi2_tmp_filt_buf;

    /* Load the six reference rows and widen u8 -> s16 */
    for(i4_j = 0; i4_j < 6; i4_j++)
    {
        ref_rows[i4_j] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pu1_src)));
        pu1_src += DYADIC_REF_W_C;
    }

    /* Vertical interpolation, one output row per iteration */
    for(i4_j = 0; i4_j < 8; i4_j++)
    {
        WORD32 i4_top = (i4_j + 1) >> 1;
        WORD32 i4_c_a = (i4_j & 1) ? i4_c_odd_0 : i4_c_even_0;
        WORD32 i4_c_b = (i4_j & 1) ? i4_c_odd_1 : i4_c_even_1;

        i4_rslt =
            vmlaq_n_s16(vmulq_n_s16(ref_rows[i4_top], i4_c_a), ref_rows[i4_top + 1], i4_c_b);
        if(i4_j < 7)
        {
            /* Full 8-lane store; the 2-sample spill past the 6-wide row is
               overwritten by the following row's store */
            vst1q_s16(pi2_dst + 6 * i4_j, i4_rslt);
        }
        else
        {
            /* Last row: write exactly six samples to stay inside the buffer */
            vst1_s16(pi2_dst + 42, vget_low_s16(i4_rslt));
            vst1q_lane_s16(pi2_dst + 46, i4_rslt, 4);
            vst1q_lane_s16(pi2_dst + 47, i4_rslt, 5);
        }
    }
}
/*****************************************************************************/
/* */
/* Function Name : isvcd_vert_interpol_chroma_dyadic_2_neonintr */
/* */
/* Description : This function takes the reference array buffer & performs*/
/* vertical intra resampling for dyadic scaling ratios for */
/* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
/* chroma_phase_y_plus1: */
/* ref_lyr cur_lyr */
/* 2 0 */
/* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */
/* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the */
/* vertically interpolated data */
/* i4_phase_0 : y phase for even values of y */
/* i4_phase_1 : y phase for odd values of y */
/* Globals : none */
/* Processing : it does the interpolation in vertical direction */
/* Outputs : vertically resampled samples */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 21 05 2021 Dolan creation */
/* */
/*****************************************************************************/
void isvcd_vert_interpol_chroma_dyadic_2_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                                  WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    /* Vertical 2-tap interpolation producing a 6x8 (width x height) block of
     * 16-bit intermediate samples from five 8-bit reference rows, starting
     * one row into the input buffer.
     * Output row j interpolates reference rows (j >> 1) and (j >> 1) + 1;
     * even rows use taps (8 - phase_0, phase_0), odd rows use
     * taps (8 - phase_1, phase_1).
     */
    WORD32 i4_j;
    WORD32 i4_c_even_0, i4_c_even_1, i4_c_odd_0, i4_c_odd_1;
    UWORD8 *pu1_src;
    WORD16 *pi2_dst;
    int16x8_t ref_rows[5]; /* widened reference rows */
    int16x8_t i4_rslt;

    i4_c_even_0 = 8 - i4_phase_0;
    i4_c_even_1 = i4_phase_0;
    i4_c_odd_0 = 8 - i4_phase_1;
    i4_c_odd_1 = i4_phase_1;
    /* the first reference row is not used for this phase combination */
    pu1_src = pu1_inp_buf + DYADIC_REF_W_C;
    pi2_dst = pi2_tmp_filt_buf;

    /* Load the five reference rows and widen u8 -> s16 */
    for(i4_j = 0; i4_j < 5; i4_j++)
    {
        ref_rows[i4_j] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pu1_src)));
        pu1_src += DYADIC_REF_W_C;
    }

    /* Vertical interpolation, one output row per iteration */
    for(i4_j = 0; i4_j < 8; i4_j++)
    {
        WORD32 i4_top = i4_j >> 1;
        WORD32 i4_c_a = (i4_j & 1) ? i4_c_odd_0 : i4_c_even_0;
        WORD32 i4_c_b = (i4_j & 1) ? i4_c_odd_1 : i4_c_even_1;

        i4_rslt =
            vmlaq_n_s16(vmulq_n_s16(ref_rows[i4_top], i4_c_a), ref_rows[i4_top + 1], i4_c_b);
        if(i4_j < 7)
        {
            /* Full 8-lane store; the 2-sample spill past the 6-wide row is
               overwritten by the following row's store */
            vst1q_s16(pi2_dst + 6 * i4_j, i4_rslt);
        }
        else
        {
            /* Last row: write exactly six samples to stay inside the buffer */
            vst1_s16(pi2_dst + 42, vget_low_s16(i4_rslt));
            vst1q_lane_s16(pi2_dst + 46, i4_rslt, 4);
            vst1q_lane_s16(pi2_dst + 47, i4_rslt, 5);
        }
    }
}
/*****************************************************************************/
/* */
/* Function Name : isvcd_vert_interpol_chroma_dyadic_3_neonintr */
/* */
/* Description : This function takes the reference array buffer & performs*/
/* vertical intra resampling for dyadic scaling ratios for */
/* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
/* chroma_phase_y_plus1: */
/* ref_lyr cur_lyr */
/* 2 0 */
/* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */
/* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the */
/* vertically interpolated data */
/* i4_phase_0 : y phase for even values of y */
/* i4_phase_1 : y phase for odd values of y */
/* Globals : none */
/* Processing : it does the interpolation in vertical direction */
/* Outputs : vertically resampled samples */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 21 05 2021 Dolan creation */
/* */
/*****************************************************************************/
void isvcd_vert_interpol_chroma_dyadic_3_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                                  WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    /* Vertical 2-tap interpolation producing a 6x8 (width x height) block of
     * 16-bit intermediate samples from five 8-bit reference rows, starting
     * at the first row of the input buffer (this is the only difference from
     * the dyadic_2 variant, which starts one row in).
     * Output row j interpolates reference rows (j >> 1) and (j >> 1) + 1;
     * even rows use taps (8 - phase_0, phase_0), odd rows use
     * taps (8 - phase_1, phase_1).
     */
    WORD32 i4_j;
    WORD32 i4_c_even_0, i4_c_even_1, i4_c_odd_0, i4_c_odd_1;
    UWORD8 *pu1_src;
    WORD16 *pi2_dst;
    int16x8_t ref_rows[5]; /* widened reference rows */
    int16x8_t i4_rslt;

    i4_c_even_0 = 8 - i4_phase_0;
    i4_c_even_1 = i4_phase_0;
    i4_c_odd_0 = 8 - i4_phase_1;
    i4_c_odd_1 = i4_phase_1;
    pu1_src = pu1_inp_buf;
    pi2_dst = pi2_tmp_filt_buf;

    /* Load the five reference rows and widen u8 -> s16 */
    for(i4_j = 0; i4_j < 5; i4_j++)
    {
        ref_rows[i4_j] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pu1_src)));
        pu1_src += DYADIC_REF_W_C;
    }

    /* Vertical interpolation, one output row per iteration */
    for(i4_j = 0; i4_j < 8; i4_j++)
    {
        WORD32 i4_top = i4_j >> 1;
        WORD32 i4_c_a = (i4_j & 1) ? i4_c_odd_0 : i4_c_even_0;
        WORD32 i4_c_b = (i4_j & 1) ? i4_c_odd_1 : i4_c_even_1;

        i4_rslt =
            vmlaq_n_s16(vmulq_n_s16(ref_rows[i4_top], i4_c_a), ref_rows[i4_top + 1], i4_c_b);
        if(i4_j < 7)
        {
            /* Full 8-lane store; the 2-sample spill past the 6-wide row is
               overwritten by the following row's store */
            vst1q_s16(pi2_dst + 6 * i4_j, i4_rslt);
        }
        else
        {
            /* Last row: write exactly six samples to stay inside the buffer */
            vst1_s16(pi2_dst + 42, vget_low_s16(i4_rslt));
            vst1q_lane_s16(pi2_dst + 46, i4_rslt, 4);
            vst1q_lane_s16(pi2_dst + 47, i4_rslt, 5);
        }
    }
}