/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */

/**
 ******************************************************************************
 * @file
 *  isvce_downscaler_neon.c
 *
 * @brief
 *  This file contains the ARM NEON SIMD version of the function which does
 *  horizontal scaling and transpose
 *
 * @author
 *  Ittiam
 *
 * @par List of Functions:
 *  - isvce_horizontal_downscale_and_transpose_neon()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System include files */
#include <arm_neon.h>

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "isvc_defs.h"
#include "isvce_defs.h"
#include "isvc_structs.h"
#include "isvce_downscaler_private_defs.h"
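/* Illustrative scalar model (not part of the original file; the helper name is
 * hypothetical) of the arithmetic each NEON kernel below vectorises: one output
 * sample is an 8-tap FIR over consecutive source pixels with Q7 coefficients,
 * rounded and saturated to 8 bits, mirroring vqrshrun_n_s32(..., 7) followed by
 * vqmovn_u16. For luma the taps run over consecutive bytes; for chroma they run
 * over deinterleaved Cb or Cr samples. Kept static and unused; reference only. */
static __attribute__((unused)) UWORD8 isvce_downscale_one_sample_ref(const UWORD8 *pu1_src,
                                                                     const WORD8 *pi1_filter)
{
    WORD32 i4_sum = 0;
    WORD32 k;

    /* 8 filter taps, one per source pixel */
    for(k = 0; k < 8; k++)
    {
        i4_sum += pu1_src[k] * pi1_filter[k];
    }

    /* Rounding right-shift out of Q7, then clamp to the 8-bit pel range */
    i4_sum = (i4_sum + 64) >> 7;

    return (UWORD8)((i4_sum < 0) ? 0 : ((i4_sum > 255) ? 255 : i4_sum));
}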
void isvce_horizontal_downscale_and_transpose_neon(downscaler_ctxt_t *ps_scaler,
                                                   buffer_container_t *ps_src,
                                                   buffer_container_t *ps_dst,
                                                   FILTER_COEFF_ARRAY pai1_filters,
                                                   UWORD32 u4_blk_wd, UWORD32 u4_blk_ht,
                                                   UWORD8 u1_is_chroma)
{
    WORD32 i, j;
    UWORD8 u1_phase;
    UWORD8 *pu1_src_j, *pu1_dst_j;
    UWORD8 *pu1_in_pixel;
    UWORD8 *pu1_out_pixel;
    WORD8 *pi1_filter_grid;
    UWORD16 u2_full_pixel_inc;
    UWORD32 u4_num_iterations_vertical_by_16, u4_num_iterations_vertical_by_8;
    UWORD32 u4_rem_vert_loop_by_8, u4_rem_vert_loop_by_4;
    UWORD32 u4_rem_vert_loop;
    UWORD32 u4_height_finished;

    uint8x8_t reg_8x8_src_r0, reg_8x8_src_r1, reg_8x8_src_r2, reg_8x8_src_r3, reg_8x8_src_r4,
        reg_8x8_src_r5, reg_8x8_src_r6, reg_8x8_src_r7;
    uint16x8_t reg_16x8_src_r0, reg_16x8_src_r1, reg_16x8_src_r2, reg_16x8_src_r3, reg_16x8_src_r4,
        reg_16x8_src_r5, reg_16x8_src_r6, reg_16x8_src_r7;
    int16x8_t reg_16x8_mul_r0, reg_16x8_mul_r1, reg_16x8_mul_r2, reg_16x8_mul_r3, reg_16x8_mul_r4,
        reg_16x8_mul_r5, reg_16x8_mul_r6, reg_16x8_mul_r7;
    int32x4_t reg_32x4_sum_r0, reg_32x4_sum_r1, reg_32x4_sum_r2, reg_32x4_sum_r3, reg_32x4_sum_r4,
        reg_32x4_sum_r5, reg_32x4_sum_r6, reg_32x4_sum_r7;
    int32x4_t reg_32x4_sum_r01, reg_32x4_sum_r23, reg_32x4_sum_r45, reg_32x4_sum_r67,
        reg_32x4_sum_r89, reg_32x4_sum_r1011, reg_32x4_sum_r1213, reg_32x4_sum_r1415;
    uint8x8_t reg_8x8_src_r8, reg_8x8_src_r9, reg_8x8_src_r10, reg_8x8_src_r11, reg_8x8_src_r12,
        reg_8x8_src_r13, reg_8x8_src_r14, reg_8x8_src_r15;
    uint16x8_t reg_16x8_src_r8, reg_16x8_src_r9, reg_16x8_src_r10, reg_16x8_src_r11,
        reg_16x8_src_r12, reg_16x8_src_r13, reg_16x8_src_r14, reg_16x8_src_r15;
    int16x8_t reg_16x8_mul_r8, reg_16x8_mul_r9, reg_16x8_mul_r10, reg_16x8_mul_r11,
        reg_16x8_mul_r12, reg_16x8_mul_r13, reg_16x8_mul_r14, reg_16x8_mul_r15;
    int32x4_t reg_32x4_sum_r8, reg_32x4_sum_r9, reg_32x4_sum_r10, reg_32x4_sum_r11,
        reg_32x4_sum_r12, reg_32x4_sum_r13, reg_32x4_sum_r14, reg_32x4_sum_r15;
    uint8x16_t reg_8x16_src_r0, reg_8x16_src_r1, reg_8x16_src_r2, reg_8x16_src_r3, reg_8x16_src_r4,
        reg_8x16_src_r5, reg_8x16_src_r6, reg_8x16_src_r7;
    uint16x8_t reg_16x8_src_cb_r0, reg_16x8_src_cb_r1, reg_16x8_src_cb_r2, reg_16x8_src_cb_r3,
        reg_16x8_src_cb_r4, reg_16x8_src_cb_r5, reg_16x8_src_cb_r6, reg_16x8_src_cb_r7;
    uint16x8_t reg_16x8_src_cr_r0, reg_16x8_src_cr_r1, reg_16x8_src_cr_r2, reg_16x8_src_cr_r3,
        reg_16x8_src_cr_r4, reg_16x8_src_cr_r5, reg_16x8_src_cr_r6, reg_16x8_src_cr_r7;
    int16x8_t reg_16x8_mul_cb_r0, reg_16x8_mul_cb_r1, reg_16x8_mul_cb_r2, reg_16x8_mul_cb_r3,
        reg_16x8_mul_cb_r4, reg_16x8_mul_cb_r5, reg_16x8_mul_cb_r6, reg_16x8_mul_cb_r7;
    int16x8_t reg_16x8_mul_cr_r0, reg_16x8_mul_cr_r1, reg_16x8_mul_cr_r2, reg_16x8_mul_cr_r3,
        reg_16x8_mul_cr_r4, reg_16x8_mul_cr_r5, reg_16x8_mul_cr_r6, reg_16x8_mul_cr_r7;
    int32x4_t reg_32x4_sum_cb_r0, reg_32x4_sum_cb_r1, reg_32x4_sum_cb_r2, reg_32x4_sum_cb_r3,
        reg_32x4_sum_cb_r4, reg_32x4_sum_cb_r5, reg_32x4_sum_cb_r6, reg_32x4_sum_cb_r7;
    int32x4_t reg_32x4_sum_cr_r0, reg_32x4_sum_cr_r1, reg_32x4_sum_cr_r2, reg_32x4_sum_cr_r3,
        reg_32x4_sum_cr_r4, reg_32x4_sum_cr_r5, reg_32x4_sum_cr_r6, reg_32x4_sum_cr_r7;
    int32x4_t reg_32x4_sum_cb_r01, reg_32x4_sum_cb_r23, reg_32x4_sum_cb_r45, reg_32x4_sum_cb_r67;
    uint16x4_t reg_16x4_sum_cb_r01_23, reg_16x4_sum_cb_r45_67;
    uint16x8_t reg_16x8_sum_cb_r0_r7;
    uint8x8_t reg_8x8_sum_cb_r0_r7;
    int32x4_t reg_32x4_sum_cr_r01, reg_32x4_sum_cr_r23, reg_32x4_sum_cr_r45, reg_32x4_sum_cr_r67;
    uint16x4_t reg_16x4_sum_cr_r01_23, reg_16x4_sum_cr_r45_67;
    uint16x8_t reg_16x8_sum_cr_r0_r7;
    uint8x8_t reg_8x8_sum_cr_r0_r7;
    uint16x8_t reg_16x8_sum_cb_cr_r0_r3;
    uint8x8_t reg_8x8_sum_cb_cr_r0_r3;
    int32x4_t reg_32x4_sum_cb_cr_r0;
    uint16x4_t reg_16x4_sum_cb_cr_r0;
    int32x4_t reg_32x4_zero = vdupq_n_s32(0);
    uint16x4_t reg_16x4_sum_r01_23, reg_16x4_sum_r45_67;
    uint16x4_t reg_16x4_sum_r8_r11, reg_16x4_sum_r12_r15;
    uint16x8_t reg_16x8_sum_r0_r7, reg_16x8_sum_r8_r15;
    uint8x8_t reg_8x8_sum_r0_r7, reg_8x8_sum_r8_r15;
    uint8x16_t reg_8x16_sum_r0_r15;
    int8x8_t reg_8x8_filt_coeff_grid;
    int16x8_t reg_16x8_filt_coeff_grid;
    int32x4x2_t reg_32x4x2_sum_r01, reg_32x4x2_sum_r23, reg_32x4x2_sum_r45, reg_32x4x2_sum_r67;
    int32x4x2_t reg_32x4x2_sum_r89, reg_32x4x2_sum_r1011, reg_32x4x2_sum_r1213,
        reg_32x4x2_sum_r1415;
    uint8x16x2_t reg_8x16x2_src_r0, reg_8x16x2_src_r1, reg_8x16x2_src_r2, reg_8x16x2_src_r3;

    downscaler_state_t *ps_scaler_state = (downscaler_state_t *) ps_scaler->pv_scaler_state;

    UWORD32 u4_center_pixel_pos = ps_scaler_state->i4_init_offset;
    UWORD32 u4_src_vert_increments = ps_scaler_state->u4_vert_increment;
    UWORD32 u4_src_horz_increments = ps_scaler_state->u4_horz_increment;

    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD32 u4_in_stride = ps_src->i4_data_stride;
    UWORD8 *pu1_dst = (UWORD8 *) ps_dst->pv_data;
    UWORD32 u4_out_stride = ps_dst->i4_data_stride;
    UWORD32 u4_center_pixel_pos_src = u4_center_pixel_pos;

    /* Offset the input so that the input pixel to be processed coincides with
     * the centre of the filter (4th coefficient) */
    pu1_src += (1 + u1_is_chroma);

    ASSERT((1 << DOWNSCALER_Q) == u4_src_vert_increments);

    if(!u1_is_chroma)
    {
        u4_num_iterations_vertical_by_16 = u4_blk_ht >> 4;
        u4_rem_vert_loop = u4_blk_ht % 16;

        for(j = 0; j < (WORD32) u4_num_iterations_vertical_by_16; j++)
        {
            pu1_src_j = pu1_src + ((j << 4) * u4_in_stride);
            pu1_dst_j = pu1_dst + (j << 4);

            u4_center_pixel_pos = u4_center_pixel_pos_src;
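            /* u4_center_pixel_pos walks the source row in DOWNSCALER_Q fixed
             * point: its fractional part selects one of the polyphase rows of
             * pai1_filters via get_filter_phase(), while the integer part
             * (>> DOWNSCALER_Q) is the source column on which the 8-tap
             * filter is centred. */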
            for(i = 0; i < (WORD32) u4_blk_wd; i++)
            {
                u1_phase = get_filter_phase(u4_center_pixel_pos);
                pi1_filter_grid = pai1_filters[u1_phase];

                /* Do the calculation for the current loop count */
                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                reg_8x8_filt_coeff_grid = vld1_s8(pi1_filter_grid);

                /******************************************************/
                /* This loop walks vertically down the source, but    */
                /* the output pixels are stored along the horizontal  */
                /* direction, i.e. in a transposed manner             */
                /******************************************************/

                /* r0-r7 */
                reg_8x8_src_r0 = vld1_u8(pu1_in_pixel);
                reg_8x8_src_r1 = vld1_u8(pu1_in_pixel + u4_in_stride);
                reg_8x8_src_r2 = vld1_u8(pu1_in_pixel + 2 * u4_in_stride);
                reg_8x8_src_r3 = vld1_u8(pu1_in_pixel + 3 * u4_in_stride);
                reg_8x8_src_r4 = vld1_u8(pu1_in_pixel + 4 * u4_in_stride);
                reg_8x8_src_r5 = vld1_u8(pu1_in_pixel + 5 * u4_in_stride);
                reg_8x8_src_r6 = vld1_u8(pu1_in_pixel + 6 * u4_in_stride);
                reg_8x8_src_r7 = vld1_u8(pu1_in_pixel + 7 * u4_in_stride);

                /* r0-r7 */
                reg_16x8_src_r0 = vmovl_u8(reg_8x8_src_r0);
                reg_16x8_src_r1 = vmovl_u8(reg_8x8_src_r1);
                reg_16x8_src_r2 = vmovl_u8(reg_8x8_src_r2);
                reg_16x8_src_r3 = vmovl_u8(reg_8x8_src_r3);
                reg_16x8_src_r4 = vmovl_u8(reg_8x8_src_r4);
                reg_16x8_src_r5 = vmovl_u8(reg_8x8_src_r5);
                reg_16x8_src_r6 = vmovl_u8(reg_8x8_src_r6);
                reg_16x8_src_r7 = vmovl_u8(reg_8x8_src_r7);

                /* r8-r15 */
                reg_8x8_src_r8 = vld1_u8(pu1_in_pixel + 8 * u4_in_stride);
                reg_8x8_src_r9 = vld1_u8(pu1_in_pixel + 9 * u4_in_stride);
                reg_8x8_src_r10 = vld1_u8(pu1_in_pixel + 10 * u4_in_stride);
                reg_8x8_src_r11 = vld1_u8(pu1_in_pixel + 11 * u4_in_stride);
                reg_8x8_src_r12 = vld1_u8(pu1_in_pixel + 12 * u4_in_stride);
                reg_8x8_src_r13 = vld1_u8(pu1_in_pixel + 13 * u4_in_stride);
                reg_8x8_src_r14 = vld1_u8(pu1_in_pixel + 14 * u4_in_stride);
                reg_8x8_src_r15 = vld1_u8(pu1_in_pixel + 15 * u4_in_stride);

                reg_16x8_filt_coeff_grid = vmovl_s8(reg_8x8_filt_coeff_grid);

                /* r0-r7 */
                reg_16x8_mul_r0 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r0), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r1 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r1), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r2 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r2), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r3 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r3), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r4 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r4), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r5 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r5), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r6 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r6), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r7 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r7), reg_16x8_filt_coeff_grid);

                /* r8-r15 */
                reg_16x8_src_r8 = vmovl_u8(reg_8x8_src_r8);
                reg_16x8_src_r9 = vmovl_u8(reg_8x8_src_r9);
                reg_16x8_src_r10 = vmovl_u8(reg_8x8_src_r10);
                reg_16x8_src_r11 = vmovl_u8(reg_8x8_src_r11);
                reg_16x8_src_r12 = vmovl_u8(reg_8x8_src_r12);
                reg_16x8_src_r13 = vmovl_u8(reg_8x8_src_r13);
                reg_16x8_src_r14 = vmovl_u8(reg_8x8_src_r14);
                reg_16x8_src_r15 = vmovl_u8(reg_8x8_src_r15);
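                /* Each reg_16x8_mul_rN holds the eight tap products of one
                 * source row; vpaddlq_s16 and the two vuzpq/vaddq stages
                 * below reduce them to a single 32-bit sum per row, packing
                 * four row sums per vector. */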
                /* r0-r7 */
                reg_32x4_sum_r0 = vpaddlq_s16(reg_16x8_mul_r0);
                reg_32x4_sum_r1 = vpaddlq_s16(reg_16x8_mul_r1);
                reg_32x4_sum_r2 = vpaddlq_s16(reg_16x8_mul_r2);
                reg_32x4_sum_r3 = vpaddlq_s16(reg_16x8_mul_r3);
                reg_32x4_sum_r4 = vpaddlq_s16(reg_16x8_mul_r4);
                reg_32x4_sum_r5 = vpaddlq_s16(reg_16x8_mul_r5);
                reg_32x4_sum_r6 = vpaddlq_s16(reg_16x8_mul_r6);
                reg_32x4_sum_r7 = vpaddlq_s16(reg_16x8_mul_r7);

                /* r8-r15 */
                reg_16x8_mul_r8 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r8), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r9 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r9), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r10 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r10), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r11 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r11), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r12 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r12), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r13 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r13), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r14 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r14), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_r15 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r15), reg_16x8_filt_coeff_grid);

                /* r0-r7 */
                reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_r0, reg_32x4_sum_r1);
                reg_32x4x2_sum_r23 = vuzpq_s32(reg_32x4_sum_r2, reg_32x4_sum_r3);
                reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_r4, reg_32x4_sum_r5);
                reg_32x4x2_sum_r67 = vuzpq_s32(reg_32x4_sum_r6, reg_32x4_sum_r7);

                reg_32x4_sum_r01 = vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                reg_32x4_sum_r23 = vaddq_s32(reg_32x4x2_sum_r23.val[0], reg_32x4x2_sum_r23.val[1]);
                reg_32x4_sum_r45 = vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);
                reg_32x4_sum_r67 = vaddq_s32(reg_32x4x2_sum_r67.val[0], reg_32x4x2_sum_r67.val[1]);

                /* r8-r15 */
                reg_32x4_sum_r8 = vpaddlq_s16(reg_16x8_mul_r8);
                reg_32x4_sum_r9 = vpaddlq_s16(reg_16x8_mul_r9);
                reg_32x4_sum_r10 = vpaddlq_s16(reg_16x8_mul_r10);
                reg_32x4_sum_r11 = vpaddlq_s16(reg_16x8_mul_r11);
                reg_32x4_sum_r12 = vpaddlq_s16(reg_16x8_mul_r12);
                reg_32x4_sum_r13 = vpaddlq_s16(reg_16x8_mul_r13);
                reg_32x4_sum_r14 = vpaddlq_s16(reg_16x8_mul_r14);
                reg_32x4_sum_r15 = vpaddlq_s16(reg_16x8_mul_r15);

                /* r0-r7 */
                reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_r01, reg_32x4_sum_r23);
                reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_r45, reg_32x4_sum_r67);

                reg_32x4_sum_r01 = vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                reg_32x4_sum_r45 = vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);

                /* r8-r15 */
                reg_32x4x2_sum_r89 = vuzpq_s32(reg_32x4_sum_r8, reg_32x4_sum_r9);
                reg_32x4x2_sum_r1011 = vuzpq_s32(reg_32x4_sum_r10, reg_32x4_sum_r11);
                reg_32x4x2_sum_r1213 = vuzpq_s32(reg_32x4_sum_r12, reg_32x4_sum_r13);
                reg_32x4x2_sum_r1415 = vuzpq_s32(reg_32x4_sum_r14, reg_32x4_sum_r15);

                reg_32x4_sum_r89 = vaddq_s32(reg_32x4x2_sum_r89.val[0], reg_32x4x2_sum_r89.val[1]);
                reg_32x4_sum_r1011 =
                    vaddq_s32(reg_32x4x2_sum_r1011.val[0], reg_32x4x2_sum_r1011.val[1]);
                reg_32x4_sum_r1213 =
                    vaddq_s32(reg_32x4x2_sum_r1213.val[0], reg_32x4x2_sum_r1213.val[1]);
                reg_32x4_sum_r1415 =
                    vaddq_s32(reg_32x4x2_sum_r1415.val[0], reg_32x4x2_sum_r1415.val[1]);

                /* r0-r7 */
                reg_16x4_sum_r01_23 = vqrshrun_n_s32(reg_32x4_sum_r01, 7);
                reg_16x4_sum_r45_67 = vqrshrun_n_s32(reg_32x4_sum_r45, 7);

                /* r8-r15 */
                reg_32x4x2_sum_r89 = vuzpq_s32(reg_32x4_sum_r89, reg_32x4_sum_r1011);
                reg_32x4x2_sum_r1213 = vuzpq_s32(reg_32x4_sum_r1213, reg_32x4_sum_r1415);

                reg_32x4_sum_r89 = vaddq_s32(reg_32x4x2_sum_r89.val[0], reg_32x4x2_sum_r89.val[1]);
                reg_32x4_sum_r1213 =
                    vaddq_s32(reg_32x4x2_sum_r1213.val[0], reg_32x4x2_sum_r1213.val[1]);

                /* r0-r7 */
                reg_16x8_sum_r0_r7 = vcombine_u16(reg_16x4_sum_r01_23, reg_16x4_sum_r45_67);
                reg_8x8_sum_r0_r7 = vqmovn_u16(reg_16x8_sum_r0_r7);

                reg_16x4_sum_r8_r11 = vqrshrun_n_s32(reg_32x4_sum_r89, 7);
                reg_16x4_sum_r12_r15 = vqrshrun_n_s32(reg_32x4_sum_r1213, 7);
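                /* vqrshrun_n_s32(.., 7) rounds the Q7 sums back to pel range
                 * with unsigned saturation; the vqmovn_u16/vcombine_u8 steps
                 * below pack all 16 row results so that one vst1q_u8 writes a
                 * full transposed output row. */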
                reg_16x8_sum_r8_r15 = vcombine_u16(reg_16x4_sum_r8_r11, reg_16x4_sum_r12_r15);
                reg_8x8_sum_r8_r15 = vqmovn_u16(reg_16x8_sum_r8_r15);

                reg_8x16_sum_r0_r15 = vcombine_u8(reg_8x8_sum_r0_r7, reg_8x8_sum_r8_r15);

                /* r0-r7 */
                vst1q_u8(pu1_out_pixel, reg_8x16_sum_r0_r15);

                pu1_out_pixel += 16;
                pu1_in_pixel += (u4_src_vert_increments * (u4_in_stride << 4)) >> DOWNSCALER_Q;

                /* Update the context for the next loop count */
                u4_center_pixel_pos += u4_src_horz_increments;
            }
        }

        /* Loop for the remaining height less than 16 */
        if(u4_rem_vert_loop)
        {
            u4_rem_vert_loop_by_8 = u4_rem_vert_loop >> 3;
            u4_rem_vert_loop = u4_rem_vert_loop % 8;

            u4_height_finished = (u4_num_iterations_vertical_by_16 << 4);

            pu1_src_j = pu1_src + u4_height_finished * u4_in_stride;
            pu1_dst_j = pu1_dst + u4_height_finished;

            u4_center_pixel_pos = u4_center_pixel_pos_src;
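            /* The leftover rows are handled as one optional 8-row pass
             * followed by a per-row tail; each pass restarts
             * u4_center_pixel_pos from u4_center_pixel_pos_src so its
             * horizontal walk matches the main 16-row passes above. */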
            /* 8 <= remaining height < 16 */
            if(u4_rem_vert_loop_by_8)
            {
                for(i = 0; i < (WORD32) u4_blk_wd; i++)
                {
                    u1_phase = get_filter_phase(u4_center_pixel_pos);
                    pi1_filter_grid = pai1_filters[u1_phase];

                    u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                    pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
                    pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                    reg_8x8_filt_coeff_grid = vld1_s8(pi1_filter_grid);

                    for(j = u4_rem_vert_loop_by_8; j > 0; j--)
                    {
                        /******************************************************/
                        /* This loop walks vertically down the source, but    */
                        /* the output pixels are stored along the horizontal  */
                        /* direction, i.e. in a transposed manner             */
                        /******************************************************/
                        reg_8x8_src_r0 = vld1_u8(pu1_in_pixel);
                        reg_8x8_src_r1 = vld1_u8(pu1_in_pixel + u4_in_stride);
                        reg_8x8_src_r2 = vld1_u8(pu1_in_pixel + 2 * u4_in_stride);
                        reg_8x8_src_r3 = vld1_u8(pu1_in_pixel + 3 * u4_in_stride);
                        reg_8x8_src_r4 = vld1_u8(pu1_in_pixel + 4 * u4_in_stride);
                        reg_8x8_src_r5 = vld1_u8(pu1_in_pixel + 5 * u4_in_stride);
                        reg_8x8_src_r6 = vld1_u8(pu1_in_pixel + 6 * u4_in_stride);
                        reg_8x8_src_r7 = vld1_u8(pu1_in_pixel + 7 * u4_in_stride);

                        reg_16x8_src_r0 = vmovl_u8(reg_8x8_src_r0);
                        reg_16x8_src_r1 = vmovl_u8(reg_8x8_src_r1);
                        reg_16x8_src_r2 = vmovl_u8(reg_8x8_src_r2);
                        reg_16x8_src_r3 = vmovl_u8(reg_8x8_src_r3);
                        reg_16x8_src_r4 = vmovl_u8(reg_8x8_src_r4);
                        reg_16x8_src_r5 = vmovl_u8(reg_8x8_src_r5);
                        reg_16x8_src_r6 = vmovl_u8(reg_8x8_src_r6);
                        reg_16x8_src_r7 = vmovl_u8(reg_8x8_src_r7);

                        reg_16x8_filt_coeff_grid = vmovl_s8(reg_8x8_filt_coeff_grid);

                        reg_16x8_mul_r0 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r0),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r1 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r1),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r2 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r2),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r3 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r3),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r4 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r4),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r5 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r5),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r6 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r6),
                                                    reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_r7 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r7),
                                                    reg_16x8_filt_coeff_grid);

                        reg_32x4_sum_r0 = vpaddlq_s16(reg_16x8_mul_r0);
                        reg_32x4_sum_r1 = vpaddlq_s16(reg_16x8_mul_r1);
                        reg_32x4_sum_r2 = vpaddlq_s16(reg_16x8_mul_r2);
                        reg_32x4_sum_r3 = vpaddlq_s16(reg_16x8_mul_r3);
                        reg_32x4_sum_r4 = vpaddlq_s16(reg_16x8_mul_r4);
                        reg_32x4_sum_r5 = vpaddlq_s16(reg_16x8_mul_r5);
                        reg_32x4_sum_r6 = vpaddlq_s16(reg_16x8_mul_r6);
                        reg_32x4_sum_r7 = vpaddlq_s16(reg_16x8_mul_r7);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_r0, reg_32x4_sum_r1);
                        reg_32x4x2_sum_r23 = vuzpq_s32(reg_32x4_sum_r2, reg_32x4_sum_r3);
                        reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_r4, reg_32x4_sum_r5);
                        reg_32x4x2_sum_r67 = vuzpq_s32(reg_32x4_sum_r6, reg_32x4_sum_r7);

                        reg_32x4_sum_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                        reg_32x4_sum_r23 =
                            vaddq_s32(reg_32x4x2_sum_r23.val[0], reg_32x4x2_sum_r23.val[1]);
                        reg_32x4_sum_r45 =
                            vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);
                        reg_32x4_sum_r67 =
                            vaddq_s32(reg_32x4x2_sum_r67.val[0], reg_32x4x2_sum_r67.val[1]);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_r01, reg_32x4_sum_r23);
                        reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_r45, reg_32x4_sum_r67);

                        reg_32x4_sum_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                        reg_32x4_sum_r45 =
                            vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);

                        reg_16x4_sum_r01_23 = vqrshrun_n_s32(reg_32x4_sum_r01, 7);
                        reg_16x4_sum_r45_67 = vqrshrun_n_s32(reg_32x4_sum_r45, 7);

                        reg_16x8_sum_r0_r7 = vcombine_u16(reg_16x4_sum_r01_23, reg_16x4_sum_r45_67);
                        reg_8x8_sum_r0_r7 = vqmovn_u16(reg_16x8_sum_r0_r7);

                        vst1_u8(pu1_out_pixel, reg_8x8_sum_r0_r7);

                        pu1_out_pixel += 8;
                        pu1_in_pixel +=
                            (u4_src_vert_increments * (u4_in_stride << 3)) >> DOWNSCALER_Q;
                    }

                    /* Update the context for the next loop count */
                    u4_center_pixel_pos += u4_src_horz_increments;
                }
            }

            /* 1 <= remaining height < 8 */
            if(u4_rem_vert_loop)
            {
                u4_height_finished =
                    ((u4_num_iterations_vertical_by_16 << 4) + (u4_rem_vert_loop_by_8 << 3));

                pu1_src_j = pu1_src + u4_height_finished * u4_in_stride;
                pu1_dst_j = pu1_dst + u4_height_finished;

                u4_center_pixel_pos = u4_center_pixel_pos_src;

                for(i = 0; i < (WORD32) u4_blk_wd; i++)
                {
                    u1_phase = get_filter_phase(u4_center_pixel_pos);
                    pi1_filter_grid = pai1_filters[u1_phase];

                    u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                    pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
                    pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                    reg_8x8_filt_coeff_grid = vld1_s8(pi1_filter_grid);

                    for(j = u4_rem_vert_loop; j > 0; j--)
                    {
                        /******************************************************/
                        /* This loop walks vertically down the source, but    */
                        /* the output pixels are stored along the horizontal  */
                        /* direction, i.e. in a transposed manner             */
                        /******************************************************/
                        reg_8x8_src_r0 = vld1_u8(pu1_in_pixel);
                        reg_16x8_src_r0 = vmovl_u8(reg_8x8_src_r0);

                        reg_16x8_filt_coeff_grid = vmovl_s8(reg_8x8_filt_coeff_grid);

                        reg_16x8_mul_r0 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_r0),
                                                    reg_16x8_filt_coeff_grid);

                        reg_32x4_sum_r0 = vpaddlq_s16(reg_16x8_mul_r0);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_r0, reg_32x4_zero);
                        reg_32x4_sum_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_r01, reg_32x4_zero);
                        reg_32x4_sum_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);

                        reg_16x4_sum_r01_23 = vqrshrun_n_s32(reg_32x4_sum_r01, 7);

                        vst1_lane_u8(pu1_out_pixel, vreinterpret_u8_u16(reg_16x4_sum_r01_23), 0);

                        pu1_out_pixel += 1;
                        pu1_in_pixel += (u4_src_vert_increments * u4_in_stride) >> DOWNSCALER_Q;
                    }

                    /* Update the context for the next loop count */
                    u4_center_pixel_pos += u4_src_horz_increments;
                }
            }
        }
    }
    /* for chroma */
    else
    {
        u4_num_iterations_vertical_by_8 = u4_blk_ht >> 3;
        u4_rem_vert_loop = u4_blk_ht % 8;
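        /* Chroma is interleaved in memory (Cb/Cr byte pairs, as the register
         * naming below indicates), so each 16-byte load covers 8 two-byte
         * pairs; vuzpq_u8 separates them into a Cb vector (val[0]) and a Cr
         * vector (val[1]) before filtering. */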
        for(j = 0; j < (WORD32) u4_num_iterations_vertical_by_8; j++)
        {
            pu1_src_j = pu1_src + ((j << 3) * u4_in_stride);
            pu1_dst_j = pu1_dst + (j << 3);

            u4_center_pixel_pos = u4_center_pixel_pos_src;

            for(i = 0; i < (WORD32) u4_blk_wd; i++)
            {
                u1_phase = get_filter_phase(u4_center_pixel_pos);
                pi1_filter_grid = pai1_filters[u1_phase];

                /* Do the calculation for the current loop count */
                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                reg_8x8_filt_coeff_grid = vld1_s8(pi1_filter_grid);

                /******************************************************/
                /* This loop walks vertically down the source, but    */
                /* the output pixels are stored along the horizontal  */
                /* direction, i.e. in a transposed manner             */
                /******************************************************/
                reg_8x16_src_r0 = vld1q_u8(pu1_in_pixel);
                reg_8x16_src_r1 = vld1q_u8(pu1_in_pixel + u4_in_stride);
                reg_8x16_src_r2 = vld1q_u8(pu1_in_pixel + 2 * u4_in_stride);
                reg_8x16_src_r3 = vld1q_u8(pu1_in_pixel + 3 * u4_in_stride);
                reg_8x16_src_r4 = vld1q_u8(pu1_in_pixel + 4 * u4_in_stride);
                reg_8x16_src_r5 = vld1q_u8(pu1_in_pixel + 5 * u4_in_stride);
                reg_8x16_src_r6 = vld1q_u8(pu1_in_pixel + 6 * u4_in_stride);
                reg_8x16_src_r7 = vld1q_u8(pu1_in_pixel + 7 * u4_in_stride);

                reg_8x16x2_src_r0 = vuzpq_u8(reg_8x16_src_r0, reg_8x16_src_r1);
                reg_8x16x2_src_r1 = vuzpq_u8(reg_8x16_src_r2, reg_8x16_src_r3);
                reg_8x16x2_src_r2 = vuzpq_u8(reg_8x16_src_r4, reg_8x16_src_r5);
                reg_8x16x2_src_r3 = vuzpq_u8(reg_8x16_src_r6, reg_8x16_src_r7);
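                /* After the vuzpq_u8 pairs above, val[0] of each struct holds
                 * the Cb samples of two source rows and val[1] the Cr samples;
                 * vget_low/vget_high split them back into per-row vectors for
                 * widening. */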
                reg_16x8_src_cb_r0 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r0.val[0]));
                reg_16x8_src_cb_r1 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r0.val[0]));
                reg_16x8_src_cb_r2 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r1.val[0]));
                reg_16x8_src_cb_r3 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r1.val[0]));
                reg_16x8_src_cb_r4 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r2.val[0]));
                reg_16x8_src_cb_r5 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r2.val[0]));
                reg_16x8_src_cb_r6 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r3.val[0]));
                reg_16x8_src_cb_r7 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r3.val[0]));

                reg_16x8_src_cr_r0 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r0.val[1]));
                reg_16x8_src_cr_r1 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r0.val[1]));
                reg_16x8_src_cr_r2 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r1.val[1]));
                reg_16x8_src_cr_r3 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r1.val[1]));
                reg_16x8_src_cr_r4 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r2.val[1]));
                reg_16x8_src_cr_r5 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r2.val[1]));
                reg_16x8_src_cr_r6 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r3.val[1]));
                reg_16x8_src_cr_r7 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r3.val[1]));

                reg_16x8_filt_coeff_grid = vmovl_s8(reg_8x8_filt_coeff_grid);

                reg_16x8_mul_cb_r0 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r0), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r1 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r1), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r2 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r2), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r3 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r3), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r4 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r4), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r5 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r5), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r6 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r6), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cb_r7 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r7), reg_16x8_filt_coeff_grid);

                reg_16x8_mul_cr_r0 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r0), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r1 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r1), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r2 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r2), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r3 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r3), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r4 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r4), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r5 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r5), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r6 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r6), reg_16x8_filt_coeff_grid);
                reg_16x8_mul_cr_r7 =
                    vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r7), reg_16x8_filt_coeff_grid);

                reg_32x4_sum_cb_r0 = vpaddlq_s16(reg_16x8_mul_cb_r0);
                reg_32x4_sum_cb_r1 = vpaddlq_s16(reg_16x8_mul_cb_r1);
                reg_32x4_sum_cb_r2 = vpaddlq_s16(reg_16x8_mul_cb_r2);
                reg_32x4_sum_cb_r3 = vpaddlq_s16(reg_16x8_mul_cb_r3);
                reg_32x4_sum_cb_r4 = vpaddlq_s16(reg_16x8_mul_cb_r4);
                reg_32x4_sum_cb_r5 = vpaddlq_s16(reg_16x8_mul_cb_r5);
                reg_32x4_sum_cb_r6 = vpaddlq_s16(reg_16x8_mul_cb_r6);
                reg_32x4_sum_cb_r7 = vpaddlq_s16(reg_16x8_mul_cb_r7);

                reg_32x4_sum_cr_r0 = vpaddlq_s16(reg_16x8_mul_cr_r0);
                reg_32x4_sum_cr_r1 = vpaddlq_s16(reg_16x8_mul_cr_r1);
                reg_32x4_sum_cr_r2 = vpaddlq_s16(reg_16x8_mul_cr_r2);
                reg_32x4_sum_cr_r3 = vpaddlq_s16(reg_16x8_mul_cr_r3);
                reg_32x4_sum_cr_r4 = vpaddlq_s16(reg_16x8_mul_cr_r4);
                reg_32x4_sum_cr_r5 = vpaddlq_s16(reg_16x8_mul_cr_r5);
                reg_32x4_sum_cr_r6 = vpaddlq_s16(reg_16x8_mul_cr_r6);
                reg_32x4_sum_cr_r7 = vpaddlq_s16(reg_16x8_mul_cr_r7);

                reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cb_r0, reg_32x4_sum_cb_r1);
                reg_32x4x2_sum_r23 = vuzpq_s32(reg_32x4_sum_cb_r2, reg_32x4_sum_cb_r3);
                reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_cb_r4, reg_32x4_sum_cb_r5);
                reg_32x4x2_sum_r67 = vuzpq_s32(reg_32x4_sum_cb_r6, reg_32x4_sum_cb_r7);

                reg_32x4_sum_cb_r01 =
                    vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                reg_32x4_sum_cb_r23 =
                    vaddq_s32(reg_32x4x2_sum_r23.val[0], reg_32x4x2_sum_r23.val[1]);
                reg_32x4_sum_cb_r45 =
                    vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);
                reg_32x4_sum_cb_r67 =
                    vaddq_s32(reg_32x4x2_sum_r67.val[0], reg_32x4x2_sum_r67.val[1]);

                reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cb_r01, reg_32x4_sum_cb_r23);
                reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_cb_r45, reg_32x4_sum_cb_r67);

                reg_32x4_sum_cb_r01 =
                    vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                reg_32x4_sum_cb_r45 =
                    vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);

                reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cr_r0, reg_32x4_sum_cr_r1);
                reg_32x4x2_sum_r23 = vuzpq_s32(reg_32x4_sum_cr_r2, reg_32x4_sum_cr_r3);
                reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_cr_r4, reg_32x4_sum_cr_r5);
                reg_32x4x2_sum_r67 = vuzpq_s32(reg_32x4_sum_cr_r6, reg_32x4_sum_cr_r7);

                reg_32x4_sum_cr_r01 =
                    vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                reg_32x4_sum_cr_r23 =
                    vaddq_s32(reg_32x4x2_sum_r23.val[0], reg_32x4x2_sum_r23.val[1]);
                reg_32x4_sum_cr_r45 =
                    vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);
                reg_32x4_sum_cr_r67 =
                    vaddq_s32(reg_32x4x2_sum_r67.val[0], reg_32x4x2_sum_r67.val[1]);

                reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cr_r01, reg_32x4_sum_cr_r23);
                reg_32x4x2_sum_r45 = vuzpq_s32(reg_32x4_sum_cr_r45, reg_32x4_sum_cr_r67);

                reg_32x4_sum_cr_r01 =
                    vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                reg_32x4_sum_cr_r45 =
                    vaddq_s32(reg_32x4x2_sum_r45.val[0], reg_32x4x2_sum_r45.val[1]);
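                /* One column of chroma pairs yields two transposed output
                 * rows: the Cb results go to pu1_out_pixel and the Cr results
                 * to the row below it (pu1_out_pixel + u4_out_stride). */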
                reg_16x4_sum_cb_r01_23 = vqrshrun_n_s32(reg_32x4_sum_cb_r01, 7);
                reg_16x4_sum_cb_r45_67 = vqrshrun_n_s32(reg_32x4_sum_cb_r45, 7);
                reg_16x4_sum_cr_r01_23 = vqrshrun_n_s32(reg_32x4_sum_cr_r01, 7);
                reg_16x4_sum_cr_r45_67 = vqrshrun_n_s32(reg_32x4_sum_cr_r45, 7);

                reg_16x8_sum_cb_r0_r7 =
                    vcombine_u16(reg_16x4_sum_cb_r01_23, reg_16x4_sum_cb_r45_67);
                reg_16x8_sum_cr_r0_r7 =
                    vcombine_u16(reg_16x4_sum_cr_r01_23, reg_16x4_sum_cr_r45_67);

                reg_8x8_sum_cb_r0_r7 = vqmovn_u16(reg_16x8_sum_cb_r0_r7);
                reg_8x8_sum_cr_r0_r7 = vqmovn_u16(reg_16x8_sum_cr_r0_r7);

                vst1_u8(pu1_out_pixel, reg_8x8_sum_cb_r0_r7);
                vst1_u8(pu1_out_pixel + u4_out_stride, reg_8x8_sum_cr_r0_r7);

                pu1_out_pixel += 8;
                pu1_in_pixel += (u4_src_vert_increments * (u4_in_stride << 3)) >> DOWNSCALER_Q;

                /* Update the context for the next loop count */
                u4_center_pixel_pos += u4_src_horz_increments;
            }
        }

        /* Loop for the remaining height less than 8 */
        if(u4_rem_vert_loop)
        {
            u4_rem_vert_loop_by_4 = u4_rem_vert_loop >> 2;
            u4_rem_vert_loop = u4_rem_vert_loop % 4;

            u4_height_finished = (u4_num_iterations_vertical_by_8 << 3);

            pu1_src_j = pu1_src + u4_height_finished * u4_in_stride;
            pu1_dst_j = pu1_dst + u4_height_finished;

            u4_center_pixel_pos = u4_center_pixel_pos_src;
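            /* Mirrors the luma remainder handling: an optional 4-row pass,
             * then a per-row tail for the last 1-3 rows. */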
            /* 4 <= remaining height < 8 */
            if(u4_rem_vert_loop_by_4)
            {
                for(i = 0; i < (WORD32) u4_blk_wd; i++)
                {
                    u1_phase = get_filter_phase(u4_center_pixel_pos);
                    pi1_filter_grid = pai1_filters[u1_phase];

                    u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                    pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
                    pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                    reg_8x8_filt_coeff_grid = vld1_s8(pi1_filter_grid);

                    for(j = u4_rem_vert_loop_by_4; j > 0; j--)
                    {
                        /******************************************************/
                        /* This loop walks vertically down the source, but    */
                        /* the output pixels are stored along the horizontal  */
                        /* direction, i.e. in a transposed manner             */
                        /******************************************************/
                        reg_8x16_src_r0 = vld1q_u8(pu1_in_pixel);
                        reg_8x16_src_r1 = vld1q_u8(pu1_in_pixel + u4_in_stride);
                        reg_8x16_src_r2 = vld1q_u8(pu1_in_pixel + 2 * u4_in_stride);
                        reg_8x16_src_r3 = vld1q_u8(pu1_in_pixel + 3 * u4_in_stride);

                        reg_8x16x2_src_r0 = vuzpq_u8(reg_8x16_src_r0, reg_8x16_src_r1);
                        reg_8x16x2_src_r1 = vuzpq_u8(reg_8x16_src_r2, reg_8x16_src_r3);

                        reg_16x8_src_cb_r0 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r0.val[0]));
                        reg_16x8_src_cb_r1 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r0.val[0]));
                        reg_16x8_src_cb_r2 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r1.val[0]));
                        reg_16x8_src_cb_r3 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r1.val[0]));

                        reg_16x8_src_cr_r0 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r0.val[1]));
                        reg_16x8_src_cr_r1 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r0.val[1]));
                        reg_16x8_src_cr_r2 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r1.val[1]));
                        reg_16x8_src_cr_r3 = vmovl_u8(vget_high_u8(reg_8x16x2_src_r1.val[1]));

                        reg_16x8_filt_coeff_grid = vmovl_s8(reg_8x8_filt_coeff_grid);

                        reg_16x8_mul_cb_r0 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r0),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cb_r1 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r1),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cb_r2 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r2),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cb_r3 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r3),
                                                       reg_16x8_filt_coeff_grid);

                        reg_16x8_mul_cr_r0 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r0),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cr_r1 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r1),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cr_r2 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r2),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cr_r3 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r3),
                                                       reg_16x8_filt_coeff_grid);

                        reg_32x4_sum_cb_r0 = vpaddlq_s16(reg_16x8_mul_cb_r0);
                        reg_32x4_sum_cb_r1 = vpaddlq_s16(reg_16x8_mul_cb_r1);
                        reg_32x4_sum_cb_r2 = vpaddlq_s16(reg_16x8_mul_cb_r2);
                        reg_32x4_sum_cb_r3 = vpaddlq_s16(reg_16x8_mul_cb_r3);

                        reg_32x4_sum_cr_r0 = vpaddlq_s16(reg_16x8_mul_cr_r0);
                        reg_32x4_sum_cr_r1 = vpaddlq_s16(reg_16x8_mul_cr_r1);
                        reg_32x4_sum_cr_r2 = vpaddlq_s16(reg_16x8_mul_cr_r2);
                        reg_32x4_sum_cr_r3 = vpaddlq_s16(reg_16x8_mul_cr_r3);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cb_r0, reg_32x4_sum_cb_r1);
                        reg_32x4x2_sum_r23 = vuzpq_s32(reg_32x4_sum_cb_r2, reg_32x4_sum_cb_r3);

                        reg_32x4_sum_cb_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                        reg_32x4_sum_cb_r23 =
                            vaddq_s32(reg_32x4x2_sum_r23.val[0], reg_32x4x2_sum_r23.val[1]);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cb_r01, reg_32x4_sum_cb_r23);
                        reg_32x4_sum_cb_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cr_r0, reg_32x4_sum_cr_r1);
                        reg_32x4x2_sum_r23 = vuzpq_s32(reg_32x4_sum_cr_r2, reg_32x4_sum_cr_r3);

                        reg_32x4_sum_cr_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);
                        reg_32x4_sum_cr_r23 =
                            vaddq_s32(reg_32x4x2_sum_r23.val[0], reg_32x4x2_sum_r23.val[1]);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cr_r01, reg_32x4_sum_cr_r23);
                        reg_32x4_sum_cr_r01 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);

                        reg_16x4_sum_cb_r01_23 = vqrshrun_n_s32(reg_32x4_sum_cb_r01, 7);
                        reg_16x4_sum_cr_r01_23 = vqrshrun_n_s32(reg_32x4_sum_cr_r01, 7);

                        reg_16x8_sum_cb_cr_r0_r3 =
                            vcombine_u16(reg_16x4_sum_cb_r01_23, reg_16x4_sum_cr_r01_23);
                        reg_8x8_sum_cb_cr_r0_r3 = vmovn_u16(reg_16x8_sum_cb_cr_r0_r3);

                        vst1_lane_u32((uint32_t *) (pu1_out_pixel),
                                      vreinterpret_u32_u8(reg_8x8_sum_cb_cr_r0_r3), 0);
                        vst1_lane_u32((uint32_t *) (pu1_out_pixel + u4_out_stride),
                                      vreinterpret_u32_u8(reg_8x8_sum_cb_cr_r0_r3), 1);

                        pu1_out_pixel += 4;
                        pu1_in_pixel +=
                            (u4_src_vert_increments * (u4_in_stride << 2)) >> DOWNSCALER_Q;
                    }

                    /* Update the context for the next loop count */
                    u4_center_pixel_pos += u4_src_horz_increments;
                }
            }

            /* 1 <= remaining height < 4 */
            if(u4_rem_vert_loop)
            {
                u4_height_finished =
                    ((u4_num_iterations_vertical_by_8 << 3) + (u4_rem_vert_loop_by_4 << 2));

                pu1_src_j = pu1_src + u4_height_finished * u4_in_stride;
                pu1_dst_j = pu1_dst + u4_height_finished;

                u4_center_pixel_pos = u4_center_pixel_pos_src;

                for(i = 0; i < (WORD32) u4_blk_wd; i++)
                {
                    u1_phase = get_filter_phase(u4_center_pixel_pos);
                    pi1_filter_grid = pai1_filters[u1_phase];

                    u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                    pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
                    pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                    reg_8x8_filt_coeff_grid = vld1_s8(pi1_filter_grid);
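                    /* In the per-row tail below, the final vuzpq/vaddq pair
                     * collapses the 8-tap sums of both planes into one
                     * vector: after vqrshrun, byte lane 0 carries the Cb
                     * result and byte lane 2 the Cr result, hence the two
                     * vst1_lane_u8 indices. */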
                    for(j = u4_rem_vert_loop; j > 0; j--)
                    {
                        /******************************************************/
                        /* This loop walks vertically down the source, but    */
                        /* the output pixels are stored along the horizontal  */
                        /* direction, i.e. in a transposed manner             */
                        /******************************************************/
                        reg_8x16_src_r0 = vld1q_u8(pu1_in_pixel);
                        reg_8x16x2_src_r0 = vuzpq_u8(reg_8x16_src_r0, reg_8x16_src_r0);

                        reg_16x8_src_cb_r0 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r0.val[0]));
                        reg_16x8_src_cr_r0 = vmovl_u8(vget_low_u8(reg_8x16x2_src_r0.val[1]));

                        reg_16x8_filt_coeff_grid = vmovl_s8(reg_8x8_filt_coeff_grid);

                        reg_16x8_mul_cb_r0 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cb_r0),
                                                       reg_16x8_filt_coeff_grid);
                        reg_16x8_mul_cr_r0 = vmulq_s16(vreinterpretq_s16_u16(reg_16x8_src_cr_r0),
                                                       reg_16x8_filt_coeff_grid);

                        reg_32x4_sum_cb_r0 = vpaddlq_s16(reg_16x8_mul_cb_r0);
                        reg_32x4_sum_cr_r0 = vpaddlq_s16(reg_16x8_mul_cr_r0);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cb_r0, reg_32x4_sum_cr_r0);
                        reg_32x4_sum_cb_cr_r0 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);

                        reg_32x4x2_sum_r01 = vuzpq_s32(reg_32x4_sum_cb_cr_r0, reg_32x4_zero);
                        reg_32x4_sum_cb_cr_r0 =
                            vaddq_s32(reg_32x4x2_sum_r01.val[0], reg_32x4x2_sum_r01.val[1]);

                        reg_16x4_sum_cb_cr_r0 = vqrshrun_n_s32(reg_32x4_sum_cb_cr_r0, 7);

                        vst1_lane_u8(pu1_out_pixel, vreinterpret_u8_u16(reg_16x4_sum_cb_cr_r0), 0);
                        vst1_lane_u8(pu1_out_pixel + u4_out_stride,
                                     vreinterpret_u8_u16(reg_16x4_sum_cb_cr_r0), 2);

                        pu1_out_pixel += 1;
                        pu1_in_pixel += (u4_src_vert_increments * u4_in_stride) >> DOWNSCALER_Q;
                    }

                    /* Update the context for the next loop count */
                    u4_center_pixel_pos += u4_src_horz_increments;
                }
            }
        }
    }
}