/* * Copyright (c) 2022, Alliance for Open Media. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/intrapred_common.h" // ----------------------------------------------------------------------------- // DC static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, const uint16_t *above, const uint16_t *left) { assert(bw >= 4); assert(IS_POWER_OF_TWO(bw)); int expected_dc, sum = 0; const int count = bw * 2; uint32x4_t sum_q = vdupq_n_u32(0); uint32x2_t sum_d; uint16_t *dst_1; if (bw >= 8) { for (int i = 0; i < bw; i += 8) { sum_q = vpadalq_u16(sum_q, vld1q_u16(above)); sum_q = vpadalq_u16(sum_q, vld1q_u16(left)); above += 8; left += 8; } sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q)); sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0); expected_dc = (sum + (count >> 1)) / count; const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc); for (int r = 0; r < bw; r++) { dst_1 = dst; for (int i = 0; i < bw; i += 8) { vst1q_u16(dst_1, dc); dst_1 += 8; } dst += stride; } } else { // 4x4 sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left)); sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q)); sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0); expected_dc = (sum + (count >> 1)) / count; const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc); for (int r = 0; r < bw; r++) { vst1_u16(dst, dc); dst += stride; } } } #define INTRA_PRED_HIGHBD_SIZED_NEON(type, width) \ void aom_highbd_##type##_predictor_##width##x##width##_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ highbd_##type##_predictor(dst, stride, width, above, left); \ } #define INTRA_PRED_SQUARE(type) \ INTRA_PRED_HIGHBD_SIZED_NEON(type, 4) \ INTRA_PRED_HIGHBD_SIZED_NEON(type, 8) \ INTRA_PRED_HIGHBD_SIZED_NEON(type, 16) \ INTRA_PRED_HIGHBD_SIZED_NEON(type, 32) \ INTRA_PRED_HIGHBD_SIZED_NEON(type, 64) INTRA_PRED_SQUARE(dc) #undef INTRA_PRED_SQUARE // ----------------------------------------------------------------------------- // V_PRED #define HIGHBD_V_NXM(W, H) \ void aom_highbd_v_predictor_##W##x##H##_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)left; \ (void)bd; \ vertical##W##xh_neon(dst, stride, above, H); \ } static INLINE uint16x8x2_t load_uint16x8x2(uint16_t const *ptr) { uint16x8x2_t x; // Clang/gcc uses ldp here. x.val[0] = vld1q_u16(ptr); x.val[1] = vld1q_u16(ptr + 8); return x; } static INLINE void store_uint16x8x2(uint16_t *ptr, uint16x8x2_t x) { vst1q_u16(ptr, x.val[0]); vst1q_u16(ptr + 8, x.val[1]); } static INLINE void vertical4xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const above, int height) { const uint16x4_t row = vld1_u16(above); int y = height; do { vst1_u16(dst, row); vst1_u16(dst + stride, row); dst += stride << 1; y -= 2; } while (y != 0); } static INLINE void vertical8xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const above, int height) { const uint16x8_t row = vld1q_u16(above); int y = height; do { vst1q_u16(dst, row); vst1q_u16(dst + stride, row); dst += stride << 1; y -= 2; } while (y != 0); } static INLINE void vertical16xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const above, int height) { const uint16x8x2_t row = load_uint16x8x2(above); int y = height; do { store_uint16x8x2(dst, row); store_uint16x8x2(dst + stride, row); dst += stride << 1; y -= 2; } while (y != 0); } static INLINE uint16x8x4_t load_uint16x8x4(uint16_t const *ptr) { uint16x8x4_t x; // Clang/gcc uses ldp here. x.val[0] = vld1q_u16(ptr); x.val[1] = vld1q_u16(ptr + 8); x.val[2] = vld1q_u16(ptr + 16); x.val[3] = vld1q_u16(ptr + 24); return x; } static INLINE void store_uint16x8x4(uint16_t *ptr, uint16x8x4_t x) { vst1q_u16(ptr, x.val[0]); vst1q_u16(ptr + 8, x.val[1]); vst1q_u16(ptr + 16, x.val[2]); vst1q_u16(ptr + 24, x.val[3]); } static INLINE void vertical32xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const above, int height) { const uint16x8x4_t row = load_uint16x8x4(above); int y = height; do { store_uint16x8x4(dst, row); store_uint16x8x4(dst + stride, row); dst += stride << 1; y -= 2; } while (y != 0); } static INLINE void vertical64xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const above, int height) { uint16_t *dst32 = dst + 32; const uint16x8x4_t row = load_uint16x8x4(above); const uint16x8x4_t row32 = load_uint16x8x4(above + 32); int y = height; do { store_uint16x8x4(dst, row); store_uint16x8x4(dst32, row32); store_uint16x8x4(dst + stride, row); store_uint16x8x4(dst32 + stride, row32); dst += stride << 1; dst32 += stride << 1; y -= 2; } while (y != 0); } HIGHBD_V_NXM(4, 4) HIGHBD_V_NXM(4, 8) HIGHBD_V_NXM(4, 16) HIGHBD_V_NXM(8, 4) HIGHBD_V_NXM(8, 8) HIGHBD_V_NXM(8, 16) HIGHBD_V_NXM(8, 32) HIGHBD_V_NXM(16, 4) HIGHBD_V_NXM(16, 8) HIGHBD_V_NXM(16, 16) HIGHBD_V_NXM(16, 32) HIGHBD_V_NXM(16, 64) HIGHBD_V_NXM(32, 8) HIGHBD_V_NXM(32, 16) HIGHBD_V_NXM(32, 32) HIGHBD_V_NXM(32, 64) HIGHBD_V_NXM(64, 16) HIGHBD_V_NXM(64, 32) HIGHBD_V_NXM(64, 64) // ----------------------------------------------------------------------------- // PAETH static INLINE void highbd_paeth_4or8_x_h_neon(uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, int width, int height) { const uint16x8_t top_left = vdupq_n_u16(top_row[-1]); const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]); uint16x8_t top; if (width == 4) { top = vcombine_u16(vld1_u16(top_row), vdup_n_u16(0)); } else { // width == 8 top = vld1q_u16(top_row); } for (int y = 0; y < height; ++y) { const uint16x8_t left = vdupq_n_u16(left_column[y]); const uint16x8_t left_dist = vabdq_u16(top, top_left); const uint16x8_t top_dist = vabdq_u16(left, top_left); const uint16x8_t top_left_dist = vabdq_u16(vaddq_u16(top, left), top_left_x2); const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist); const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist); const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist); // if (left_dist <= top_dist && left_dist <= top_left_dist) const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left); // dest[x] = left_column[y]; // Fill all the unused spaces with 'top'. They will be overwritten when // the positions for top_left are known. uint16x8_t result = vbslq_u16(left_mask, left, top); // else if (top_dist <= top_left_dist) // dest[x] = top_row[x]; // Add these values to the mask. They were already set. const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left); // else // dest[x] = top_left; result = vbslq_u16(left_or_top_mask, result, top_left); if (width == 4) { vst1_u16(dest, vget_low_u16(result)); } else { // width == 8 vst1q_u16(dest, result); } dest += stride; } } #define HIGHBD_PAETH_NXM(W, H) \ void aom_highbd_paeth_predictor_##W##x##H##_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ highbd_paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \ } HIGHBD_PAETH_NXM(4, 4) HIGHBD_PAETH_NXM(4, 8) HIGHBD_PAETH_NXM(4, 16) HIGHBD_PAETH_NXM(8, 4) HIGHBD_PAETH_NXM(8, 8) HIGHBD_PAETH_NXM(8, 16) HIGHBD_PAETH_NXM(8, 32) // Select the closest values and collect them. static INLINE uint16x8_t select_paeth(const uint16x8_t top, const uint16x8_t left, const uint16x8_t top_left, const uint16x8_t left_le_top, const uint16x8_t left_le_top_left, const uint16x8_t top_le_top_left) { // if (left_dist <= top_dist && left_dist <= top_left_dist) const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left); // dest[x] = left_column[y]; // Fill all the unused spaces with 'top'. They will be overwritten when // the positions for top_left are known. const uint16x8_t result = vbslq_u16(left_mask, left, top); // else if (top_dist <= top_left_dist) // dest[x] = top_row[x]; // Add these values to the mask. They were already set. const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left); // else // dest[x] = top_left; return vbslq_u16(left_or_top_mask, result, top_left); } #define PAETH_PREDICTOR(num) \ do { \ const uint16x8_t left_dist = vabdq_u16(top[num], top_left); \ const uint16x8_t top_left_dist = \ vabdq_u16(vaddq_u16(top[num], left), top_left_x2); \ const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist); \ const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist); \ const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist); \ const uint16x8_t result = \ select_paeth(top[num], left, top_left, left_le_top, left_le_top_left, \ top_le_top_left); \ vst1q_u16(dest + (num * 8), result); \ } while (0) #define LOAD_TOP_ROW(num) vld1q_u16(top_row + (num * 8)) static INLINE void highbd_paeth16_plus_x_h_neon( uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, int width, int height) { const uint16x8_t top_left = vdupq_n_u16(top_row[-1]); const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]); uint16x8_t top[8]; top[0] = LOAD_TOP_ROW(0); top[1] = LOAD_TOP_ROW(1); if (width > 16) { top[2] = LOAD_TOP_ROW(2); top[3] = LOAD_TOP_ROW(3); if (width == 64) { top[4] = LOAD_TOP_ROW(4); top[5] = LOAD_TOP_ROW(5); top[6] = LOAD_TOP_ROW(6); top[7] = LOAD_TOP_ROW(7); } } for (int y = 0; y < height; ++y) { const uint16x8_t left = vdupq_n_u16(left_column[y]); const uint16x8_t top_dist = vabdq_u16(left, top_left); PAETH_PREDICTOR(0); PAETH_PREDICTOR(1); if (width > 16) { PAETH_PREDICTOR(2); PAETH_PREDICTOR(3); if (width == 64) { PAETH_PREDICTOR(4); PAETH_PREDICTOR(5); PAETH_PREDICTOR(6); PAETH_PREDICTOR(7); } } dest += stride; } } #define HIGHBD_PAETH_NXM_WIDE(W, H) \ void aom_highbd_paeth_predictor_##W##x##H##_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ highbd_paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \ } HIGHBD_PAETH_NXM_WIDE(16, 4) HIGHBD_PAETH_NXM_WIDE(16, 8) HIGHBD_PAETH_NXM_WIDE(16, 16) HIGHBD_PAETH_NXM_WIDE(16, 32) HIGHBD_PAETH_NXM_WIDE(16, 64) HIGHBD_PAETH_NXM_WIDE(32, 8) HIGHBD_PAETH_NXM_WIDE(32, 16) HIGHBD_PAETH_NXM_WIDE(32, 32) HIGHBD_PAETH_NXM_WIDE(32, 64) HIGHBD_PAETH_NXM_WIDE(64, 16) HIGHBD_PAETH_NXM_WIDE(64, 32) HIGHBD_PAETH_NXM_WIDE(64, 64) // ----------------------------------------------------------------------------- // SMOOTH // 256 - v = vneg_s8(v) static INLINE uint16x4_t negate_s8(const uint16x4_t v) { return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v))); } static INLINE void highbd_smooth_4xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, const int height) { const uint16_t top_right = top_row[3]; const uint16_t bottom_left = left_column[height - 1]; const uint16_t *const weights_y = smooth_weights_u16 + height - 4; const uint16x4_t top_v = vld1_u16(top_row); const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); const uint16x4_t weights_x_v = vld1_u16(smooth_weights_u16); const uint16x4_t scaled_weights_x = negate_s8(weights_x_v); const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right); for (int y = 0; y < height; ++y) { // Each variable in the running summation is named for the last item to be // accumulated. const uint32x4_t weighted_top = vmlal_n_u16(weighted_tr, top_v, weights_y[y]); const uint32x4_t weighted_left = vmlal_n_u16(weighted_top, weights_x_v, left_column[y]); const uint32x4_t weighted_bl = vmlal_n_u16(weighted_left, bottom_left_v, 256 - weights_y[y]); const uint16x4_t pred = vrshrn_n_u32(weighted_bl, SMOOTH_WEIGHT_LOG2_SCALE + 1); vst1_u16(dst, pred); dst += stride; } } // Common code between 8xH and [16|32|64]xH. static INLINE void highbd_calculate_pred8( uint16_t *dst, const uint32x4_t weighted_corners_low, const uint32x4_t weighted_corners_high, const uint16x4x2_t top_vals, const uint16x4x2_t weights_x, const uint16_t left_y, const uint16_t weight_y) { // Each variable in the running summation is named for the last item to be // accumulated. const uint32x4_t weighted_top_low = vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y); const uint32x4_t weighted_edges_low = vmlal_n_u16(weighted_top_low, weights_x.val[0], left_y); const uint16x4_t pred_low = vrshrn_n_u32(weighted_edges_low, SMOOTH_WEIGHT_LOG2_SCALE + 1); vst1_u16(dst, pred_low); const uint32x4_t weighted_top_high = vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y); const uint32x4_t weighted_edges_high = vmlal_n_u16(weighted_top_high, weights_x.val[1], left_y); const uint16x4_t pred_high = vrshrn_n_u32(weighted_edges_high, SMOOTH_WEIGHT_LOG2_SCALE + 1); vst1_u16(dst + 4, pred_high); } static void highbd_smooth_8xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, const int height) { const uint16_t top_right = top_row[7]; const uint16_t bottom_left = left_column[height - 1]; const uint16_t *const weights_y = smooth_weights_u16 + height - 4; const uint16x4x2_t top_vals = { { vld1_u16(top_row), vld1_u16(top_row + 4) } }; const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4), vld1_u16(smooth_weights_u16 + 8) } }; const uint32x4_t weighted_tr_low = vmull_n_u16(negate_s8(weights_x.val[0]), top_right); const uint32x4_t weighted_tr_high = vmull_n_u16(negate_s8(weights_x.val[1]), top_right); for (int y = 0; y < height; ++y) { const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); const uint32x4_t weighted_corners_low = vaddq_u32(weighted_bl, weighted_tr_low); const uint32x4_t weighted_corners_high = vaddq_u32(weighted_bl, weighted_tr_high); highbd_calculate_pred8(dst, weighted_corners_low, weighted_corners_high, top_vals, weights_x, left_column[y], weights_y[y]); dst += stride; } } #define HIGHBD_SMOOTH_NXM(W, H) \ void aom_highbd_smooth_predictor_##W##x##H##_neon( \ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H); \ } HIGHBD_SMOOTH_NXM(4, 4) HIGHBD_SMOOTH_NXM(4, 8) HIGHBD_SMOOTH_NXM(8, 4) HIGHBD_SMOOTH_NXM(8, 8) HIGHBD_SMOOTH_NXM(4, 16) HIGHBD_SMOOTH_NXM(8, 16) HIGHBD_SMOOTH_NXM(8, 32) #undef HIGHBD_SMOOTH_NXM // For width 16 and above. #define HIGHBD_SMOOTH_PREDICTOR(W) \ static void highbd_smooth_##W##xh_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, \ const uint16_t *const left_column, const int height) { \ const uint16_t top_right = top_row[(W)-1]; \ const uint16_t bottom_left = left_column[height - 1]; \ const uint16_t *const weights_y = smooth_weights_u16 + height - 4; \ \ /* Precompute weighted values that don't vary with |y|. */ \ uint32x4_t weighted_tr_low[(W) >> 3]; \ uint32x4_t weighted_tr_high[(W) >> 3]; \ for (int i = 0; i < (W) >> 3; ++i) { \ const int x = i << 3; \ const uint16x4_t weights_x_low = \ vld1_u16(smooth_weights_u16 + (W)-4 + x); \ weighted_tr_low[i] = vmull_n_u16(negate_s8(weights_x_low), top_right); \ const uint16x4_t weights_x_high = \ vld1_u16(smooth_weights_u16 + (W) + x); \ weighted_tr_high[i] = vmull_n_u16(negate_s8(weights_x_high), top_right); \ } \ \ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); \ for (int y = 0; y < height; ++y) { \ const uint32x4_t weighted_bl = \ vmull_n_u16(bottom_left_v, 256 - weights_y[y]); \ uint16_t *dst_x = dst; \ for (int i = 0; i < (W) >> 3; ++i) { \ const int x = i << 3; \ const uint16x4x2_t top_vals = { { vld1_u16(top_row + x), \ vld1_u16(top_row + x + 4) } }; \ const uint32x4_t weighted_corners_low = \ vaddq_u32(weighted_bl, weighted_tr_low[i]); \ const uint32x4_t weighted_corners_high = \ vaddq_u32(weighted_bl, weighted_tr_high[i]); \ /* Accumulate weighted edge values and store. */ \ const uint16x4x2_t weights_x = { \ { vld1_u16(smooth_weights_u16 + (W)-4 + x), \ vld1_u16(smooth_weights_u16 + (W) + x) } \ }; \ highbd_calculate_pred8(dst_x, weighted_corners_low, \ weighted_corners_high, top_vals, weights_x, \ left_column[y], weights_y[y]); \ dst_x += 8; \ } \ dst += stride; \ } \ } HIGHBD_SMOOTH_PREDICTOR(16) HIGHBD_SMOOTH_PREDICTOR(32) HIGHBD_SMOOTH_PREDICTOR(64) #undef HIGHBD_SMOOTH_PREDICTOR #define HIGHBD_SMOOTH_NXM_WIDE(W, H) \ void aom_highbd_smooth_predictor_##W##x##H##_neon( \ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H); \ } HIGHBD_SMOOTH_NXM_WIDE(16, 4) HIGHBD_SMOOTH_NXM_WIDE(16, 8) HIGHBD_SMOOTH_NXM_WIDE(16, 16) HIGHBD_SMOOTH_NXM_WIDE(16, 32) HIGHBD_SMOOTH_NXM_WIDE(16, 64) HIGHBD_SMOOTH_NXM_WIDE(32, 8) HIGHBD_SMOOTH_NXM_WIDE(32, 16) HIGHBD_SMOOTH_NXM_WIDE(32, 32) HIGHBD_SMOOTH_NXM_WIDE(32, 64) HIGHBD_SMOOTH_NXM_WIDE(64, 16) HIGHBD_SMOOTH_NXM_WIDE(64, 32) HIGHBD_SMOOTH_NXM_WIDE(64, 64) #undef HIGHBD_SMOOTH_NXM_WIDE static void highbd_smooth_v_4xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, const int height) { const uint16_t bottom_left = left_column[height - 1]; const uint16_t *const weights_y = smooth_weights_u16 + height - 4; const uint16x4_t top_v = vld1_u16(top_row); const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); for (int y = 0; y < height; ++y) { const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); const uint32x4_t weighted_top = vmlal_n_u16(weighted_bl, top_v, weights_y[y]); vst1_u16(dst, vrshrn_n_u32(weighted_top, SMOOTH_WEIGHT_LOG2_SCALE)); dst += stride; } } static void highbd_smooth_v_8xh_neon(uint16_t *dst, const ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, const int height) { const uint16_t bottom_left = left_column[height - 1]; const uint16_t *const weights_y = smooth_weights_u16 + height - 4; const uint16x4_t top_low = vld1_u16(top_row); const uint16x4_t top_high = vld1_u16(top_row + 4); const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); for (int y = 0; y < height; ++y) { const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); const uint32x4_t weighted_top_low = vmlal_n_u16(weighted_bl, top_low, weights_y[y]); vst1_u16(dst, vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE)); const uint32x4_t weighted_top_high = vmlal_n_u16(weighted_bl, top_high, weights_y[y]); vst1_u16(dst + 4, vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE)); dst += stride; } } #define HIGHBD_SMOOTH_V_NXM(W, H) \ void aom_highbd_smooth_v_predictor_##W##x##H##_neon( \ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \ } HIGHBD_SMOOTH_V_NXM(4, 4) HIGHBD_SMOOTH_V_NXM(4, 8) HIGHBD_SMOOTH_V_NXM(4, 16) HIGHBD_SMOOTH_V_NXM(8, 4) HIGHBD_SMOOTH_V_NXM(8, 8) HIGHBD_SMOOTH_V_NXM(8, 16) HIGHBD_SMOOTH_V_NXM(8, 32) #undef HIGHBD_SMOOTH_V_NXM // For width 16 and above. #define HIGHBD_SMOOTH_V_PREDICTOR(W) \ static void highbd_smooth_v_##W##xh_neon( \ uint16_t *dst, const ptrdiff_t stride, const uint16_t *const top_row, \ const uint16_t *const left_column, const int height) { \ const uint16_t bottom_left = left_column[height - 1]; \ const uint16_t *const weights_y = smooth_weights_u16 + height - 4; \ \ uint16x4x2_t top_vals[(W) >> 3]; \ for (int i = 0; i < (W) >> 3; ++i) { \ const int x = i << 3; \ top_vals[i].val[0] = vld1_u16(top_row + x); \ top_vals[i].val[1] = vld1_u16(top_row + x + 4); \ } \ \ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); \ for (int y = 0; y < height; ++y) { \ const uint32x4_t weighted_bl = \ vmull_n_u16(bottom_left_v, 256 - weights_y[y]); \ \ uint16_t *dst_x = dst; \ for (int i = 0; i < (W) >> 3; ++i) { \ const uint32x4_t weighted_top_low = \ vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]); \ vst1_u16(dst_x, \ vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE)); \ \ const uint32x4_t weighted_top_high = \ vmlal_n_u16(weighted_bl, top_vals[i].val[1], weights_y[y]); \ vst1_u16(dst_x + 4, \ vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE)); \ dst_x += 8; \ } \ dst += stride; \ } \ } HIGHBD_SMOOTH_V_PREDICTOR(16) HIGHBD_SMOOTH_V_PREDICTOR(32) HIGHBD_SMOOTH_V_PREDICTOR(64) #undef HIGHBD_SMOOTH_V_PREDICTOR #define HIGHBD_SMOOTH_V_NXM_WIDE(W, H) \ void aom_highbd_smooth_v_predictor_##W##x##H##_neon( \ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \ } HIGHBD_SMOOTH_V_NXM_WIDE(16, 4) HIGHBD_SMOOTH_V_NXM_WIDE(16, 8) HIGHBD_SMOOTH_V_NXM_WIDE(16, 16) HIGHBD_SMOOTH_V_NXM_WIDE(16, 32) HIGHBD_SMOOTH_V_NXM_WIDE(16, 64) HIGHBD_SMOOTH_V_NXM_WIDE(32, 8) HIGHBD_SMOOTH_V_NXM_WIDE(32, 16) HIGHBD_SMOOTH_V_NXM_WIDE(32, 32) HIGHBD_SMOOTH_V_NXM_WIDE(32, 64) HIGHBD_SMOOTH_V_NXM_WIDE(64, 16) HIGHBD_SMOOTH_V_NXM_WIDE(64, 32) HIGHBD_SMOOTH_V_NXM_WIDE(64, 64) #undef HIGHBD_SMOOTH_V_NXM_WIDE static INLINE void highbd_smooth_h_4xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, const int height) { const uint16_t top_right = top_row[3]; const uint16x4_t weights_x = vld1_u16(smooth_weights_u16); const uint16x4_t scaled_weights_x = negate_s8(weights_x); const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right); for (int y = 0; y < height; ++y) { const uint32x4_t weighted_left = vmlal_n_u16(weighted_tr, weights_x, left_column[y]); vst1_u16(dst, vrshrn_n_u32(weighted_left, SMOOTH_WEIGHT_LOG2_SCALE)); dst += stride; } } static INLINE void highbd_smooth_h_8xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, const int height) { const uint16_t top_right = top_row[7]; const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4), vld1_u16(smooth_weights_u16 + 8) } }; const uint32x4_t weighted_tr_low = vmull_n_u16(negate_s8(weights_x.val[0]), top_right); const uint32x4_t weighted_tr_high = vmull_n_u16(negate_s8(weights_x.val[1]), top_right); for (int y = 0; y < height; ++y) { const uint16_t left_y = left_column[y]; const uint32x4_t weighted_left_low = vmlal_n_u16(weighted_tr_low, weights_x.val[0], left_y); vst1_u16(dst, vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE)); const uint32x4_t weighted_left_high = vmlal_n_u16(weighted_tr_high, weights_x.val[1], left_y); vst1_u16(dst + 4, vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE)); dst += stride; } } #define HIGHBD_SMOOTH_H_NXM(W, H) \ void aom_highbd_smooth_h_predictor_##W##x##H##_neon( \ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \ } HIGHBD_SMOOTH_H_NXM(4, 4) HIGHBD_SMOOTH_H_NXM(4, 8) HIGHBD_SMOOTH_H_NXM(4, 16) HIGHBD_SMOOTH_H_NXM(8, 4) HIGHBD_SMOOTH_H_NXM(8, 8) HIGHBD_SMOOTH_H_NXM(8, 16) HIGHBD_SMOOTH_H_NXM(8, 32) #undef HIGHBD_SMOOTH_H_NXM // For width 16 and above. #define HIGHBD_SMOOTH_H_PREDICTOR(W) \ void highbd_smooth_h_##W##xh_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, \ const uint16_t *const left_column, const int height) { \ const uint16_t top_right = top_row[(W)-1]; \ \ uint16x4_t weights_x_low[(W) >> 3]; \ uint16x4_t weights_x_high[(W) >> 3]; \ uint32x4_t weighted_tr_low[(W) >> 3]; \ uint32x4_t weighted_tr_high[(W) >> 3]; \ for (int i = 0; i < (W) >> 3; ++i) { \ const int x = i << 3; \ weights_x_low[i] = vld1_u16(smooth_weights_u16 + (W)-4 + x); \ weighted_tr_low[i] = \ vmull_n_u16(negate_s8(weights_x_low[i]), top_right); \ weights_x_high[i] = vld1_u16(smooth_weights_u16 + (W) + x); \ weighted_tr_high[i] = \ vmull_n_u16(negate_s8(weights_x_high[i]), top_right); \ } \ \ for (int y = 0; y < height; ++y) { \ uint16_t *dst_x = dst; \ const uint16_t left_y = left_column[y]; \ for (int i = 0; i < (W) >> 3; ++i) { \ const uint32x4_t weighted_left_low = \ vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y); \ vst1_u16(dst_x, \ vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE)); \ \ const uint32x4_t weighted_left_high = \ vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y); \ vst1_u16(dst_x + 4, \ vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE)); \ dst_x += 8; \ } \ dst += stride; \ } \ } HIGHBD_SMOOTH_H_PREDICTOR(16) HIGHBD_SMOOTH_H_PREDICTOR(32) HIGHBD_SMOOTH_H_PREDICTOR(64) #undef HIGHBD_SMOOTH_H_PREDICTOR #define HIGHBD_SMOOTH_H_NXM_WIDE(W, H) \ void aom_highbd_smooth_h_predictor_##W##x##H##_neon( \ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \ } HIGHBD_SMOOTH_H_NXM_WIDE(16, 4) HIGHBD_SMOOTH_H_NXM_WIDE(16, 8) HIGHBD_SMOOTH_H_NXM_WIDE(16, 16) HIGHBD_SMOOTH_H_NXM_WIDE(16, 32) HIGHBD_SMOOTH_H_NXM_WIDE(16, 64) HIGHBD_SMOOTH_H_NXM_WIDE(32, 8) HIGHBD_SMOOTH_H_NXM_WIDE(32, 16) HIGHBD_SMOOTH_H_NXM_WIDE(32, 32) HIGHBD_SMOOTH_H_NXM_WIDE(32, 64) HIGHBD_SMOOTH_H_NXM_WIDE(64, 16) HIGHBD_SMOOTH_H_NXM_WIDE(64, 32) HIGHBD_SMOOTH_H_NXM_WIDE(64, 64) #undef HIGHBD_SMOOTH_H_NXM_WIDE