| /* | 
 |  * Copyright (c) 2022, Alliance for Open Media. All rights reserved | 
 |  * | 
 |  * This source code is subject to the terms of the BSD 2 Clause License and | 
 |  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License | 
 |  * was not distributed with this source code in the LICENSE file, you can | 
 |  * obtain it at www.aomedia.org/license/software. If the Alliance for Open | 
 |  * Media Patent License 1.0 was not distributed with this source code in the | 
 |  * PATENTS file, you can obtain it at www.aomedia.org/license/patent. | 
 |  */ | 
 |  | 
 | #include <arm_neon.h> | 
 |  | 
 | #include "config/aom_config.h" | 
 | #include "config/aom_dsp_rtcd.h" | 
 |  | 
 | #include "aom/aom_integer.h" | 
 | #include "aom_dsp/intrapred_common.h" | 
 |  | 
 | // ----------------------------------------------------------------------------- | 
 | // DC | 
 |  | 
 | // Fills a bw x bw block at dst with the rounded average of the bw samples in | 
 | // above and the bw samples in left. bw must be a power of two and at least 4 | 
 | // (both are asserted). | 
 | static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, | 
 |                                        const uint16_t *above, | 
 |                                        const uint16_t *left) { | 
 |   assert(bw >= 4); | 
 |   assert(IS_POWER_OF_TWO(bw)); | 
 |   const int num_samples = bw * 2; | 
 |   const int rounding = num_samples >> 1; | 
 |  | 
 |   if (bw < 8) {  // 4x4 | 
 |     // A widening add leaves four 32-bit partial sums (above[i] + left[i]). | 
 |     const uint32x4_t partial = vaddl_u16(vld1_u16(above), vld1_u16(left)); | 
 |     const uint32x2_t folded = | 
 |         vadd_u32(vget_low_u32(partial), vget_high_u32(partial)); | 
 |     const int total = | 
 |         vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(folded)), 0); | 
 |     const int mean = (total + rounding) / num_samples; | 
 |     const uint16x4_t fill = vdup_n_u16((uint16_t)mean); | 
 |     for (int row = 0; row < bw; ++row) { | 
 |       vst1_u16(dst, fill); | 
 |       dst += stride; | 
 |     } | 
 |     return; | 
 |   } | 
 |  | 
 |   // Eight samples of each edge per step, pairwise-accumulated into 32-bit | 
 |   // lanes. | 
 |   uint32x4_t partial = vdupq_n_u32(0); | 
 |   for (int x = 0; x < bw; x += 8) { | 
 |     partial = vpadalq_u16(partial, vld1q_u16(above + x)); | 
 |     partial = vpadalq_u16(partial, vld1q_u16(left + x)); | 
 |   } | 
 |   const uint32x2_t folded = | 
 |       vadd_u32(vget_low_u32(partial), vget_high_u32(partial)); | 
 |   const int total = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(folded)), 0); | 
 |   const int mean = (total + rounding) / num_samples; | 
 |   const uint16x8_t fill = vdupq_n_u16((uint16_t)mean); | 
 |   for (int row = 0; row < bw; ++row) { | 
 |     for (int x = 0; x < bw; x += 8) { | 
 |       vst1q_u16(dst + x, fill); | 
 |     } | 
 |     dst += stride; | 
 |   } | 
 | } | 
 |  | 
 | // Emits aom_highbd_<mode>_predictor_<size>x<size>_neon(): a square-block entry | 
 | // point that ignores bd and forwards to highbd_<mode>_predictor() with the | 
 | // block width. | 
 | #define INTRA_PRED_HIGHBD_SIZED_NEON(mode, size)                \ | 
 |   void aom_highbd_##mode##_predictor_##size##x##size##_neon(    \ | 
 |       uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \ | 
 |       const uint16_t *left, int bd) {                           \ | 
 |     highbd_##mode##_predictor(dst, stride, size, above, left);  \ | 
 |     (void)bd;                                                   \ | 
 |   } | 
 |  | 
 | // Square DC predictors, one public entry point per block size from 4x4 to | 
 | // 64x64. | 
 | INTRA_PRED_HIGHBD_SIZED_NEON(dc, 4) | 
 | INTRA_PRED_HIGHBD_SIZED_NEON(dc, 8) | 
 | INTRA_PRED_HIGHBD_SIZED_NEON(dc, 16) | 
 | INTRA_PRED_HIGHBD_SIZED_NEON(dc, 32) | 
 | INTRA_PRED_HIGHBD_SIZED_NEON(dc, 64) | 
 |  | 
 | // ----------------------------------------------------------------------------- | 
 | // V_PRED | 
 |  | 
 | // Emits aom_highbd_v_predictor_<WIDTH>x<HEIGHT>_neon(). The left column and | 
 | // the bit depth are unused; the work is done by vertical<WIDTH>xh_neon(). | 
 | #define HIGHBD_V_NXM(WIDTH, HEIGHT)                           \ | 
 |   void aom_highbd_v_predictor_##WIDTH##x##HEIGHT##_neon(      \ | 
 |       uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ | 
 |       const uint16_t *left, int bd) {                         \ | 
 |     (void)bd;                                                 \ | 
 |     (void)left;                                               \ | 
 |     vertical##WIDTH##xh_neon(dst, stride, above, HEIGHT);     \ | 
 |   } | 
 |  | 
 | // Loads 16 consecutive uint16_t values starting at ptr as two 8-lane vectors. | 
 | static INLINE uint16x8x2_t load_uint16x8x2(uint16_t const *ptr) { | 
 |   // Clang/gcc uses ldp here. | 
 |   const uint16x8x2_t pair = { { vld1q_u16(ptr), vld1q_u16(ptr + 8) } }; | 
 |   return pair; | 
 | } | 
 |  | 
 | // Stores the two 8-lane vectors of x to 16 consecutive uint16_t values at ptr. | 
 | static INLINE void store_uint16x8x2(uint16_t *ptr, uint16x8x2_t x) { | 
 |   uint16_t *const second_half = ptr + 8; | 
 |   vst1q_u16(ptr, x.val[0]); | 
 |   vst1q_u16(second_half, x.val[1]); | 
 | } | 
 |  | 
 | // Copies the 4 samples at above into each of height rows of dst. Rows are | 
 | // written two at a time and the loop only ends when the counter reaches | 
 | // exactly zero, so height must be a positive even number. | 
 | static INLINE void vertical4xh_neon(uint16_t *dst, ptrdiff_t stride, | 
 |                                     const uint16_t *const above, int height) { | 
 |   const uint16x4_t above_px = vld1_u16(above); | 
 |   int rows_left = height; | 
 |   do { | 
 |     uint16_t *const next_row = dst + stride; | 
 |     vst1_u16(dst, above_px); | 
 |     vst1_u16(next_row, above_px); | 
 |     dst = next_row + stride; | 
 |     rows_left -= 2; | 
 |   } while (rows_left != 0); | 
 | } | 
 |  | 
 | // Copies the 8 samples at above into each of height rows of dst. Rows are | 
 | // written two at a time and the loop only ends when the counter reaches | 
 | // exactly zero, so height must be a positive even number. | 
 | static INLINE void vertical8xh_neon(uint16_t *dst, ptrdiff_t stride, | 
 |                                     const uint16_t *const above, int height) { | 
 |   const uint16x8_t above_px = vld1q_u16(above); | 
 |   int rows_left = height; | 
 |   do { | 
 |     uint16_t *const next_row = dst + stride; | 
 |     vst1q_u16(dst, above_px); | 
 |     vst1q_u16(next_row, above_px); | 
 |     dst = next_row + stride; | 
 |     rows_left -= 2; | 
 |   } while (rows_left != 0); | 
 | } | 
 |  | 
 | // Copies the 16 samples at above into each of height rows of dst. Rows are | 
 | // written two at a time and the loop only ends when the counter reaches | 
 | // exactly zero, so height must be a positive even number. | 
 | static INLINE void vertical16xh_neon(uint16_t *dst, ptrdiff_t stride, | 
 |                                      const uint16_t *const above, int height) { | 
 |   const uint16x8x2_t above_px = load_uint16x8x2(above); | 
 |   int rows_left = height; | 
 |   do { | 
 |     uint16_t *const next_row = dst + stride; | 
 |     store_uint16x8x2(dst, above_px); | 
 |     store_uint16x8x2(next_row, above_px); | 
 |     dst = next_row + stride; | 
 |     rows_left -= 2; | 
 |   } while (rows_left != 0); | 
 | } | 
 |  | 
 | // Loads 32 consecutive uint16_t values starting at ptr as four 8-lane vectors. | 
 | static INLINE uint16x8x4_t load_uint16x8x4(uint16_t const *ptr) { | 
 |   // Clang/gcc uses ldp here. | 
 |   const uint16x8x4_t quad = { { vld1q_u16(ptr), vld1q_u16(ptr + 8), | 
 |                                 vld1q_u16(ptr + 16), vld1q_u16(ptr + 24) } }; | 
 |   return quad; | 
 | } | 
 |  | 
 | // Stores the four 8-lane vectors of x to 32 consecutive uint16_t values at | 
 | // ptr, lowest vector first. | 
 | static INLINE void store_uint16x8x4(uint16_t *ptr, uint16x8x4_t x) { | 
 |   for (int i = 0; i < 4; ++i) { | 
 |     vst1q_u16(ptr + 8 * i, x.val[i]); | 
 |   } | 
 | } | 
 |  | 
 | // Copies the 32 samples at above into each of height rows of dst. Rows are | 
 | // written two at a time and the loop only ends when the counter reaches | 
 | // exactly zero, so height must be a positive even number. | 
 | static INLINE void vertical32xh_neon(uint16_t *dst, ptrdiff_t stride, | 
 |                                      const uint16_t *const above, int height) { | 
 |   const uint16x8x4_t above_px = load_uint16x8x4(above); | 
 |   int rows_left = height; | 
 |   do { | 
 |     uint16_t *const next_row = dst + stride; | 
 |     store_uint16x8x4(dst, above_px); | 
 |     store_uint16x8x4(next_row, above_px); | 
 |     dst = next_row + stride; | 
 |     rows_left -= 2; | 
 |   } while (rows_left != 0); | 
 | } | 
 |  | 
 | // Copies the 64 samples at above into each of height rows of dst, handling | 
 | // each row as two 32-sample halves. Rows are written two at a time and the | 
 | // loop only ends when the counter reaches exactly zero, so height must be a | 
 | // positive even number. | 
 | static INLINE void vertical64xh_neon(uint16_t *dst, ptrdiff_t stride, | 
 |                                      const uint16_t *const above, int height) { | 
 |   const uint16x8x4_t left_half = load_uint16x8x4(above); | 
 |   const uint16x8x4_t right_half = load_uint16x8x4(above + 32); | 
 |   int rows_left = height; | 
 |   do { | 
 |     uint16_t *const next_row = dst + stride; | 
 |     store_uint16x8x4(dst, left_half); | 
 |     store_uint16x8x4(dst + 32, right_half); | 
 |     store_uint16x8x4(next_row, left_half); | 
 |     store_uint16x8x4(next_row + 32, right_half); | 
 |     dst = next_row + stride; | 
 |     rows_left -= 2; | 
 |   } while (rows_left != 0); | 
 | } | 
 |  | 
 | // Public vertical predictors, grouped by block width. | 
 |  | 
 | // Width 4. | 
 | HIGHBD_V_NXM(4, 4) | 
 | HIGHBD_V_NXM(4, 8) | 
 | HIGHBD_V_NXM(4, 16) | 
 |  | 
 | // Width 8. | 
 | HIGHBD_V_NXM(8, 4) | 
 | HIGHBD_V_NXM(8, 8) | 
 | HIGHBD_V_NXM(8, 16) | 
 | HIGHBD_V_NXM(8, 32) | 
 |  | 
 | // Width 16. | 
 | HIGHBD_V_NXM(16, 4) | 
 | HIGHBD_V_NXM(16, 8) | 
 | HIGHBD_V_NXM(16, 16) | 
 | HIGHBD_V_NXM(16, 32) | 
 | HIGHBD_V_NXM(16, 64) | 
 |  | 
 | // Width 32. | 
 | HIGHBD_V_NXM(32, 8) | 
 | HIGHBD_V_NXM(32, 16) | 
 | HIGHBD_V_NXM(32, 32) | 
 | HIGHBD_V_NXM(32, 64) | 
 |  | 
 | // Width 64. | 
 | HIGHBD_V_NXM(64, 16) | 
 | HIGHBD_V_NXM(64, 32) | 
 | HIGHBD_V_NXM(64, 64) | 
 |  | 
 | // ----------------------------------------------------------------------------- | 
 | // PAETH | 
 |  | 
 | // Paeth predictor for blocks 4 or 8 samples wide. | 
 | // | 
 | // For each output sample the candidates are the sample to the left | 
 | // (left_column[y]), the sample above (top_row[x]) and the top-left sample | 
 | // (top_row[-1]). With base = top + left - top_left, the candidate with the | 
 | // smallest |base - candidate| is written; ties prefer left, then top. | 
 | // | 
 | // width must be 4 or 8: any value other than 4 takes the 8-wide load and | 
 | // store paths. | 
 | static INLINE void highbd_paeth_4or8_x_h_neon(uint16_t *dest, ptrdiff_t stride, | 
 |                                               const uint16_t *const top_row, | 
 |                                               const uint16_t *const left_column, | 
 |                                               int width, int height) { | 
 |   const int is_4_wide = (width == 4); | 
 |   const uint16_t corner = top_row[-1]; | 
 |   const uint16x8_t top_left = vdupq_n_u16(corner); | 
 |   const uint16x8_t top_left_x2 = vdupq_n_u16(corner + corner); | 
 |   // For width 4 only four samples are loaded; the upper lanes are zero and | 
 |   // never stored. | 
 |   const uint16x8_t top = is_4_wide | 
 |                              ? vcombine_u16(vld1_u16(top_row), vdup_n_u16(0)) | 
 |                              : vld1q_u16(top_row); | 
 |   // |base - left| == |top - top_left|, which does not change from row to row. | 
 |   const uint16x8_t dist_to_left = vabdq_u16(top, top_left); | 
 |  | 
 |   for (int row = 0; row < height; ++row) { | 
 |     const uint16x8_t left = vdupq_n_u16(left_column[row]); | 
 |     // |base - top| == |left - top_left|. | 
 |     const uint16x8_t dist_to_top = vabdq_u16(left, top_left); | 
 |     // |base - top_left| == |top + left - 2 * top_left|. | 
 |     const uint16x8_t dist_to_top_left = | 
 |         vabdq_u16(vaddq_u16(top, left), top_left_x2); | 
 |  | 
 |     // Left wins when it is at least as close as both other candidates. | 
 |     const uint16x8_t pick_left = | 
 |         vandq_u16(vcleq_u16(dist_to_left, dist_to_top), | 
 |                   vcleq_u16(dist_to_left, dist_to_top_left)); | 
 |     // Among the remaining two, top wins when at least as close as top_left. | 
 |     const uint16x8_t top_or_corner = vbslq_u16( | 
 |         vcleq_u16(dist_to_top, dist_to_top_left), top, top_left); | 
 |     const uint16x8_t pred = vbslq_u16(pick_left, left, top_or_corner); | 
 |  | 
 |     if (is_4_wide) { | 
 |       vst1_u16(dest, vget_low_u16(pred)); | 
 |     } else {  // width == 8 | 
 |       vst1q_u16(dest, pred); | 
 |     } | 
 |     dest += stride; | 
 |   } | 
 | } | 
 |  | 
 | // Emits aom_highbd_paeth_predictor_<WIDTH>x<HEIGHT>_neon() for the narrow | 
 | // (4- and 8-wide) block sizes; bd is unused. | 
 | #define HIGHBD_PAETH_NXM(WIDTH, HEIGHT)                                  \ | 
 |   void aom_highbd_paeth_predictor_##WIDTH##x##HEIGHT##_neon(             \ | 
 |       uint16_t *dst, ptrdiff_t stride, const uint16_t *above,            \ | 
 |       const uint16_t *left, int bd) {                                    \ | 
 |     highbd_paeth_4or8_x_h_neon(dst, stride, above, left, WIDTH, HEIGHT); \ | 
 |     (void)bd;                                                            \ | 
 |   } | 
 |  | 
 | // Width 4. | 
 | HIGHBD_PAETH_NXM(4, 4) | 
 | HIGHBD_PAETH_NXM(4, 8) | 
 | HIGHBD_PAETH_NXM(4, 16) | 
 | // Width 8. | 
 | HIGHBD_PAETH_NXM(8, 4) | 
 | HIGHBD_PAETH_NXM(8, 8) | 
 | HIGHBD_PAETH_NXM(8, 16) | 
 | HIGHBD_PAETH_NXM(8, 32) | 
 |  | 
 | // Picks, per lane, the Paeth candidate indicated by the three comparison | 
 | // masks: | 
 | //   left      when left_le_top and left_le_top_left are both set, | 
 | //   top       otherwise, when top_le_top_left is set, | 
 | //   top_left  otherwise. | 
 | // Each mask lane is expected to be all ones or all zeros, as produced by the | 
 | // NEON compare intrinsics. | 
 | static INLINE uint16x8_t select_paeth(const uint16x8_t top, | 
 |                                       const uint16x8_t left, | 
 |                                       const uint16x8_t top_left, | 
 |                                       const uint16x8_t left_le_top, | 
 |                                       const uint16x8_t left_le_top_left, | 
 |                                       const uint16x8_t top_le_top_left) { | 
 |   // Resolve the fallback first: top if it beats (or ties) top_left. | 
 |   const uint16x8_t top_or_corner = vbslq_u16(top_le_top_left, top, top_left); | 
 |   // Left overrides the fallback wherever it is closest (ties included). | 
 |   const uint16x8_t pick_left = vandq_u16(left_le_top, left_le_top_left); | 
 |   return vbslq_u16(pick_left, left, top_or_corner); | 
 | } | 
 |  | 
 | // Computes and stores eight Paeth-predicted samples for vector number num of | 
 | // the current row. Expects top[], top_left, top_left_x2, left, top_dist and | 
 | // dest to be in scope at the point of use. | 
 | #define PAETH_PREDICTOR(num)                                                \ | 
 |   do {                                                                      \ | 
 |     const uint16x8_t paeth_top = top[(num)];                                \ | 
 |     const uint16x8_t dist_left = vabdq_u16(paeth_top, top_left);            \ | 
 |     const uint16x8_t dist_corner =                                          \ | 
 |         vabdq_u16(vaddq_u16(paeth_top, left), top_left_x2);                 \ | 
 |     vst1q_u16(dest + ((num) * 8),                                           \ | 
 |               select_paeth(paeth_top, left, top_left,                       \ | 
 |                            vcleq_u16(dist_left, top_dist),                  \ | 
 |                            vcleq_u16(dist_left, dist_corner),               \ | 
 |                            vcleq_u16(top_dist, dist_corner)));              \ | 
 |   } while (0) | 
 |  | 
 | // Loads vector number num (eight samples) of the row at top_row, which must | 
 | // be in scope at the point of use. | 
 | #define LOAD_TOP_ROW(num) vld1q_u16(top_row + ((num) * 8)) | 
 |  | 
 | // Paeth predictor for blocks 16, 32 or 64 samples wide, processed as 2, 4 or | 
 | // 8 vectors of eight samples per row. The candidates are left_column[y], | 
 | // top_row[x] and top_row[-1]; see PAETH_PREDICTOR and select_paeth() for the | 
 | // per-vector selection. | 
 | static INLINE void highbd_paeth16_plus_x_h_neon( | 
 |     uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row, | 
 |     const uint16_t *const left_column, int width, int height) { | 
 |   const uint16x8_t top_left = vdupq_n_u16(top_row[-1]); | 
 |   const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]); | 
 |   // Two vectors are always handled, four for any width above 16, and eight | 
 |   // only when the width is exactly 64. | 
 |   const int num_vectors = (width > 16) ? ((width == 64) ? 8 : 4) : 2; | 
 |  | 
 |   // Only the first num_vectors entries are written and read. | 
 |   uint16x8_t top[8]; | 
 |   for (int v = 0; v < num_vectors; ++v) { | 
 |     top[v] = LOAD_TOP_ROW(v); | 
 |   } | 
 |  | 
 |   for (int row = 0; row < height; ++row) { | 
 |     const uint16x8_t left = vdupq_n_u16(left_column[row]); | 
 |     // |base - top| == |left - top_left| is shared by every vector in the row. | 
 |     const uint16x8_t top_dist = vabdq_u16(left, top_left); | 
 |     for (int v = 0; v < num_vectors; ++v) { | 
 |       PAETH_PREDICTOR(v); | 
 |     } | 
 |     dest += stride; | 
 |   } | 
 | } | 
 |  | 
 | // Emits aom_highbd_paeth_predictor_<WIDTH>x<HEIGHT>_neon() for block widths | 
 | // of 16 and above; bd is unused. | 
 | #define HIGHBD_PAETH_NXM_WIDE(WIDTH, HEIGHT)                               \ | 
 |   void aom_highbd_paeth_predictor_##WIDTH##x##HEIGHT##_neon(               \ | 
 |       uint16_t *dst, ptrdiff_t stride, const uint16_t *above,              \ | 
 |       const uint16_t *left, int bd) {                                      \ | 
 |     highbd_paeth16_plus_x_h_neon(dst, stride, above, left, WIDTH, HEIGHT); \ | 
 |     (void)bd;                                                              \ | 
 |   } | 
 |  | 
 | // Width 16. | 
 | HIGHBD_PAETH_NXM_WIDE(16, 4) | 
 | HIGHBD_PAETH_NXM_WIDE(16, 8) | 
 | HIGHBD_PAETH_NXM_WIDE(16, 16) | 
 | HIGHBD_PAETH_NXM_WIDE(16, 32) | 
 | HIGHBD_PAETH_NXM_WIDE(16, 64) | 
 | // Width 32. | 
 | HIGHBD_PAETH_NXM_WIDE(32, 8) | 
 | HIGHBD_PAETH_NXM_WIDE(32, 16) | 
 | HIGHBD_PAETH_NXM_WIDE(32, 32) | 
 | HIGHBD_PAETH_NXM_WIDE(32, 64) | 
 | // Width 64. | 
 | HIGHBD_PAETH_NXM_WIDE(64, 16) | 
 | HIGHBD_PAETH_NXM_WIDE(64, 32) | 
 | HIGHBD_PAETH_NXM_WIDE(64, 64) | 
 |  | 
 | // ----------------------------------------------------------------------------- | 
 | // SMOOTH | 
 |  | 
 | // Negates every byte of v as a signed 8-bit value. For a 16-bit lane holding | 
 | // a value in [1, 255] the high byte is zero and stays zero while the low | 
 | // byte becomes 256 - v, so the lane as a whole becomes 256 - v. | 
 | static INLINE uint16x4_t negate_s8(const uint16x4_t v) { | 
 |   const int8x8_t as_bytes = vreinterpret_s8_u16(v); | 
 |   const int8x8_t negated = vneg_s8(as_bytes); | 
 |   return vreinterpret_u16_s8(negated); | 
 | } | 
 |  | 
 | // Smooth predictor for blocks 4 samples wide. Each output sample is | 
 | //   (w_y * top[x] + (256 - w_y) * bottom_left + | 
 | //    w_x * left[y] + (256 - w_x) * top_right) | 
 | // shifted right with rounding by SMOOTH_WEIGHT_LOG2_SCALE + 1, where w_x is | 
 | // read from the start of smooth_weights_u16 and w_y from offset height - 4. | 
 | static INLINE void highbd_smooth_4xh_neon(uint16_t *dst, ptrdiff_t stride, | 
 |                                           const uint16_t *const top_row, | 
 |                                           const uint16_t *const left_column, | 
 |                                           const int height) { | 
 |   const uint16_t *const weights_y = smooth_weights_u16 + height - 4; | 
 |   const uint16x4_t weights_x = vld1_u16(smooth_weights_u16); | 
 |   const uint16x4_t top = vld1_u16(top_row); | 
 |   const uint16x4_t bottom_left = vdup_n_u16(left_column[height - 1]); | 
 |   // The top-right contribution, (256 - w_x) * top_row[3], is the same for | 
 |   // every row. | 
 |   const uint32x4_t weighted_tr = vmull_n_u16(negate_s8(weights_x), top_row[3]); | 
 |  | 
 |   for (int row = 0; row < height; ++row) { | 
 |     const uint16_t w_y = weights_y[row]; | 
 |     // Accumulate the top, left and bottom-left terms onto the top-right term. | 
 |     uint32x4_t acc = vmlal_n_u16(weighted_tr, top, w_y); | 
 |     acc = vmlal_n_u16(acc, weights_x, left_column[row]); | 
 |     acc = vmlal_n_u16(acc, bottom_left, 256 - w_y); | 
 |  | 
 |     vst1_u16(dst, vrshrn_n_u32(acc, SMOOTH_WEIGHT_LOG2_SCALE + 1)); | 
 |     dst += stride; | 
 |   } | 
 | } | 
 |  | 
 | // Common code between 8xH and [16|32|64]xH. | 
 | // | 
 | // Produces eight smooth-predicted samples at dst as two groups of four. For | 
 | // each group the caller-supplied weighted corner sum is extended with | 
 | // top * weight_y and weights_x * left_y, then shifted right with rounding by | 
 | // SMOOTH_WEIGHT_LOG2_SCALE + 1 and stored. | 
 | static INLINE void highbd_calculate_pred8( | 
 |     uint16_t *dst, const uint32x4_t weighted_corners_low, | 
 |     const uint32x4_t weighted_corners_high, const uint16x4x2_t top_vals, | 
 |     const uint16x4x2_t weights_x, const uint16_t left_y, | 
 |     const uint16_t weight_y) { | 
 |   const uint32x4_t corners[2] = { weighted_corners_low, | 
 |                                   weighted_corners_high }; | 
 |   for (int half = 0; half < 2; ++half) { | 
 |     uint32x4_t acc = vmlal_n_u16(corners[half], top_vals.val[half], weight_y); | 
 |     acc = vmlal_n_u16(acc, weights_x.val[half], left_y); | 
 |     vst1_u16(dst + 4 * half, vrshrn_n_u32(acc, SMOOTH_WEIGHT_LOG2_SCALE + 1)); | 
 |   } | 
 | } | 
 |  | 
 | // Smooth predictor for blocks 8 samples wide. The horizontal weights are the | 
 | // eight entries of smooth_weights_u16 starting at offset 4 and the vertical | 
 | // weights start at offset height - 4. Per row, the two corner terms | 
 | // ((256 - w_y) * bottom_left and (256 - w_x) * top_right) are summed here and | 
 | // highbd_calculate_pred8() adds the edge terms and stores the result. | 
 | static void highbd_smooth_8xh_neon(uint16_t *dst, ptrdiff_t stride, | 
 |                                    const uint16_t *const top_row, | 
 |                                    const uint16_t *const left_column, | 
 |                                    const int height) { | 
 |   const uint16_t *const weights_y = smooth_weights_u16 + height - 4; | 
 |   const uint16_t *const weights_x_ptr = smooth_weights_u16 + 4; | 
 |   const uint16_t top_right = top_row[7]; | 
 |   const uint16x4_t bottom_left = vdup_n_u16(left_column[height - 1]); | 
 |  | 
 |   const uint16x4x2_t top = { { vld1_u16(top_row), vld1_u16(top_row + 4) } }; | 
 |   const uint16x4x2_t w_x = { { vld1_u16(weights_x_ptr), | 
 |                                vld1_u16(weights_x_ptr + 4) } }; | 
 |   // (256 - w_x) * top_right does not depend on the row. | 
 |   const uint32x4_t tr_low = vmull_n_u16(negate_s8(w_x.val[0]), top_right); | 
 |   const uint32x4_t tr_high = vmull_n_u16(negate_s8(w_x.val[1]), top_right); | 
 |  | 
 |   for (int row = 0; row < height; ++row) { | 
 |     const uint16_t w_y = weights_y[row]; | 
 |     const uint32x4_t weighted_bl = vmull_n_u16(bottom_left, 256 - w_y); | 
 |     highbd_calculate_pred8(dst, vaddq_u32(weighted_bl, tr_low), | 
 |                            vaddq_u32(weighted_bl, tr_high), top, w_x, | 
 |                            left_column[row], w_y); | 
 |     dst += stride; | 
 |   } | 
 | } | 
 |  | 
 | // Emits aom_highbd_smooth_predictor_<WIDTH>x<HEIGHT>_neon() for the 4- and | 
 | // 8-wide block sizes; bd is unused. | 
 | #define HIGHBD_SMOOTH_NXM(WIDTH, HEIGHT)                             \ | 
 |   void aom_highbd_smooth_predictor_##WIDTH##x##HEIGHT##_neon(        \ | 
 |       uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,      \ | 
 |       const uint16_t *left, int bd) {                                \ | 
 |     highbd_smooth_##WIDTH##xh_neon(dst, y_stride, above, left, HEIGHT); \ | 
 |     (void)bd;                                                        \ | 
 |   } | 
 |  | 
 | // Width 4. | 
 | HIGHBD_SMOOTH_NXM(4, 4) | 
 | HIGHBD_SMOOTH_NXM(4, 8) | 
 | HIGHBD_SMOOTH_NXM(4, 16) | 
 | // Width 8. | 
 | HIGHBD_SMOOTH_NXM(8, 4) | 
 | HIGHBD_SMOOTH_NXM(8, 8) | 
 | HIGHBD_SMOOTH_NXM(8, 16) | 
 | HIGHBD_SMOOTH_NXM(8, 32) | 
 |  | 
 | #undef HIGHBD_SMOOTH_NXM | 
 |  | 
 | // For width 16 and above. | 
 | // | 
 | // Emits highbd_smooth_<W>xh_neon(), the smooth predictor for blocks W samples | 
 | // wide, working on eight samples at a time. Horizontal weights are read from | 
 | // smooth_weights_u16 starting at offset W - 4 and vertical weights at offset | 
 | // height - 4. The per-column (256 - w_x) * top_right products are computed | 
 | // once up front; each row then adds (256 - w_y) * bottom_left and hands the | 
 | // edge terms to highbd_calculate_pred8(). | 
 | #define HIGHBD_SMOOTH_PREDICTOR(W)                                          \ | 
 |   static void highbd_smooth_##W##xh_neon(                                   \ | 
 |       uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row,       \ | 
 |       const uint16_t *const left_column, const int height) {                \ | 
 |     const uint16_t *const weights_x = smooth_weights_u16 + (W)-4;           \ | 
 |     const uint16_t *const weights_y = smooth_weights_u16 + height - 4;      \ | 
 |     const uint16_t top_right = top_row[(W)-1];                              \ | 
 |     const uint16x4_t bottom_left_v = vdup_n_u16(left_column[height - 1]);   \ | 
 |                                                                             \ | 
 |     /* Top-right products, one low/high pair per group of 8 columns. */     \ | 
 |     uint32x4_t tr_low[(W) >> 3];                                            \ | 
 |     uint32x4_t tr_high[(W) >> 3];                                           \ | 
 |     for (int x = 0; x < (W); x += 8) {                                      \ | 
 |       const uint16x4_t w_low = vld1_u16(weights_x + x);                     \ | 
 |       const uint16x4_t w_high = vld1_u16(weights_x + x + 4);                \ | 
 |       tr_low[x >> 3] = vmull_n_u16(negate_s8(w_low), top_right);            \ | 
 |       tr_high[x >> 3] = vmull_n_u16(negate_s8(w_high), top_right);          \ | 
 |     }                                                                       \ | 
 |                                                                             \ | 
 |     for (int row = 0; row < height; ++row) {                                \ | 
 |       const uint16_t w_y = weights_y[row];                                  \ | 
 |       const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - w_y); \ | 
 |       for (int x = 0; x < (W); x += 8) {                                    \ | 
 |         const uint16x4x2_t top_vals = { { vld1_u16(top_row + x),            \ | 
 |                                           vld1_u16(top_row + x + 4) } };    \ | 
 |         const uint16x4x2_t w_x = { { vld1_u16(weights_x + x),               \ | 
 |                                      vld1_u16(weights_x + x + 4) } };       \ | 
 |         /* Accumulate weighted edge values and store. */                    \ | 
 |         highbd_calculate_pred8(dst + x,                                     \ | 
 |                                vaddq_u32(weighted_bl, tr_low[x >> 3]),      \ | 
 |                                vaddq_u32(weighted_bl, tr_high[x >> 3]),     \ | 
 |                                top_vals, w_x, left_column[row], w_y);       \ | 
 |       }                                                                     \ | 
 |       dst += stride;                                                        \ | 
 |     }                                                                       \ | 
 |   } | 
 |  | 
 | HIGHBD_SMOOTH_PREDICTOR(16) | 
 | HIGHBD_SMOOTH_PREDICTOR(32) | 
 | HIGHBD_SMOOTH_PREDICTOR(64) | 
 |  | 
 | #undef HIGHBD_SMOOTH_PREDICTOR | 
 |  | 
 | // Emits aom_highbd_smooth_predictor_<WIDTH>x<HEIGHT>_neon() for block widths | 
 | // of 16 and above; bd is unused. | 
 | #define HIGHBD_SMOOTH_NXM_WIDE(WIDTH, HEIGHT)                           \ | 
 |   void aom_highbd_smooth_predictor_##WIDTH##x##HEIGHT##_neon(           \ | 
 |       uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,         \ | 
 |       const uint16_t *left, int bd) {                                   \ | 
 |     highbd_smooth_##WIDTH##xh_neon(dst, y_stride, above, left, HEIGHT); \ | 
 |     (void)bd;                                                           \ | 
 |   } | 
 |  | 
 | // Width 16. | 
 | HIGHBD_SMOOTH_NXM_WIDE(16, 4) | 
 | HIGHBD_SMOOTH_NXM_WIDE(16, 8) | 
 | HIGHBD_SMOOTH_NXM_WIDE(16, 16) | 
 | HIGHBD_SMOOTH_NXM_WIDE(16, 32) | 
 | HIGHBD_SMOOTH_NXM_WIDE(16, 64) | 
 | // Width 32. | 
 | HIGHBD_SMOOTH_NXM_WIDE(32, 8) | 
 | HIGHBD_SMOOTH_NXM_WIDE(32, 16) | 
 | HIGHBD_SMOOTH_NXM_WIDE(32, 32) | 
 | HIGHBD_SMOOTH_NXM_WIDE(32, 64) | 
 | // Width 64. | 
 | HIGHBD_SMOOTH_NXM_WIDE(64, 16) | 
 | HIGHBD_SMOOTH_NXM_WIDE(64, 32) | 
 | HIGHBD_SMOOTH_NXM_WIDE(64, 64) | 
 |  | 
 | #undef HIGHBD_SMOOTH_NXM_WIDE | 
 |  | 
 | // Vertical-only smooth predictor for blocks 4 samples wide. Each output | 
 | // sample is (w_y * top[x] + (256 - w_y) * bottom_left) shifted right with | 
 | // rounding by SMOOTH_WEIGHT_LOG2_SCALE, where bottom_left is the last entry | 
 | // of left_column and w_y is read from smooth_weights_u16 at offset | 
 | // height - 4. | 
 | static void highbd_smooth_v_4xh_neon(uint16_t *dst, ptrdiff_t stride, | 
 |                                      const uint16_t *const top_row, | 
 |                                      const uint16_t *const left_column, | 
 |                                      const int height) { | 
 |   const uint16_t *const weights_y = smooth_weights_u16 + height - 4; | 
 |   const uint16x4_t top = vld1_u16(top_row); | 
 |   const uint16x4_t bottom_left = vdup_n_u16(left_column[height - 1]); | 
 |  | 
 |   for (int row = 0; row < height; ++row) { | 
 |     const uint16_t w_y = weights_y[row]; | 
 |     uint32x4_t acc = vmull_n_u16(bottom_left, 256 - w_y); | 
 |     acc = vmlal_n_u16(acc, top, w_y); | 
 |     vst1_u16(dst, vrshrn_n_u32(acc, SMOOTH_WEIGHT_LOG2_SCALE)); | 
 |     dst += stride; | 
 |   } | 
 | } | 
 |  | 
 | // Vertical-only smooth predictor for blocks 8 samples wide, handled as two | 
 | // groups of four columns. Each output sample is | 
 | // (w_y * top[x] + (256 - w_y) * bottom_left) shifted right with rounding by | 
 | // SMOOTH_WEIGHT_LOG2_SCALE, where bottom_left is the last entry of | 
 | // left_column and w_y is read from smooth_weights_u16 at offset height - 4. | 
 | static void highbd_smooth_v_8xh_neon(uint16_t *dst, const ptrdiff_t stride, | 
 |                                      const uint16_t *const top_row, | 
 |                                      const uint16_t *const left_column, | 
 |                                      const int height) { | 
 |   const uint16_t *const weights_y = smooth_weights_u16 + height - 4; | 
 |   const uint16x4_t top[2] = { vld1_u16(top_row), vld1_u16(top_row + 4) }; | 
 |   const uint16x4_t bottom_left = vdup_n_u16(left_column[height - 1]); | 
 |  | 
 |   for (int row = 0; row < height; ++row) { | 
 |     const uint16_t w_y = weights_y[row]; | 
 |     // The bottom-left term is shared by both halves of the row. | 
 |     const uint32x4_t weighted_bl = vmull_n_u16(bottom_left, 256 - w_y); | 
 |     for (int half = 0; half < 2; ++half) { | 
 |       const uint32x4_t acc = vmlal_n_u16(weighted_bl, top[half], w_y); | 
 |       vst1_u16(dst + 4 * half, vrshrn_n_u32(acc, SMOOTH_WEIGHT_LOG2_SCALE)); | 
 |     } | 
 |     dst += stride; | 
 |   } | 
 | } | 
 |  | 
 | // Emits aom_highbd_smooth_v_predictor_<WIDTH>x<HEIGHT>_neon() for the 4- and | 
 | // 8-wide block sizes; bd is unused. | 
 | #define HIGHBD_SMOOTH_V_NXM(WIDTH, HEIGHT)                                \ | 
 |   void aom_highbd_smooth_v_predictor_##WIDTH##x##HEIGHT##_neon(           \ | 
 |       uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,           \ | 
 |       const uint16_t *left, int bd) {                                     \ | 
 |     highbd_smooth_v_##WIDTH##xh_neon(dst, y_stride, above, left, HEIGHT); \ | 
 |     (void)bd;                                                             \ | 
 |   } | 
 |  | 
 | // Width 4. | 
 | HIGHBD_SMOOTH_V_NXM(4, 4) | 
 | HIGHBD_SMOOTH_V_NXM(4, 8) | 
 | HIGHBD_SMOOTH_V_NXM(4, 16) | 
 | // Width 8. | 
 | HIGHBD_SMOOTH_V_NXM(8, 4) | 
 | HIGHBD_SMOOTH_V_NXM(8, 8) | 
 | HIGHBD_SMOOTH_V_NXM(8, 16) | 
 | HIGHBD_SMOOTH_V_NXM(8, 32) | 
 |  | 
 | #undef HIGHBD_SMOOTH_V_NXM | 
 |  | 
 | // For width 16 and above. | 
 | // | 
 | // Emits highbd_smooth_v_<W>xh_neon(), the vertical-only smooth predictor for | 
 | // blocks W samples wide. The top row is cached as 4-lane vectors; each row | 
 | // then computes (w_y * top[x] + (256 - w_y) * bottom_left), shifts right with | 
 | // rounding by SMOOTH_WEIGHT_LOG2_SCALE and stores four samples at a time, | 
 | // left to right. w_y is read from smooth_weights_u16 at offset height - 4. | 
 | #define HIGHBD_SMOOTH_V_PREDICTOR(W)                                        \ | 
 |   static void highbd_smooth_v_##W##xh_neon(                                 \ | 
 |       uint16_t *dst, const ptrdiff_t stride, const uint16_t *const top_row, \ | 
 |       const uint16_t *const left_column, const int height) {                \ | 
 |     const uint16_t *const weights_y = smooth_weights_u16 + height - 4;      \ | 
 |     const uint16x4_t bottom_left_v = vdup_n_u16(left_column[height - 1]);   \ | 
 |                                                                             \ | 
 |     uint16x4_t top_v[(W) >> 2];                                             \ | 
 |     for (int x = 0; x < (W); x += 4) {                                      \ | 
 |       top_v[x >> 2] = vld1_u16(top_row + x);                                \ | 
 |     }                                                                       \ | 
 |                                                                             \ | 
 |     for (int row = 0; row < height; ++row) {                                \ | 
 |       const uint16_t w_y = weights_y[row];                                  \ | 
 |       const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - w_y); \ | 
 |       for (int x = 0; x < (W); x += 4) {                                    \ | 
 |         const uint32x4_t weighted_top =                                     \ | 
 |             vmlal_n_u16(weighted_bl, top_v[x >> 2], w_y);                   \ | 
 |         vst1_u16(dst + x,                                                   \ | 
 |                  vrshrn_n_u32(weighted_top, SMOOTH_WEIGHT_LOG2_SCALE));     \ | 
 |       }                                                                     \ | 
 |       dst += stride;                                                        \ | 
 |     }                                                                       \ | 
 |   } | 
 |  | 
 | HIGHBD_SMOOTH_V_PREDICTOR(16) | 
 | HIGHBD_SMOOTH_V_PREDICTOR(32) | 
 | HIGHBD_SMOOTH_V_PREDICTOR(64) | 
 |  | 
 | #undef HIGHBD_SMOOTH_V_PREDICTOR | 
 |  | 
 | // Emits aom_highbd_smooth_v_predictor_<WIDTH>x<HEIGHT>_neon() for block | 
 | // widths of 16 and above; bd is unused. | 
 | #define HIGHBD_SMOOTH_V_NXM_WIDE(WIDTH, HEIGHT)                           \ | 
 |   void aom_highbd_smooth_v_predictor_##WIDTH##x##HEIGHT##_neon(           \ | 
 |       uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,           \ | 
 |       const uint16_t *left, int bd) {                                     \ | 
 |     highbd_smooth_v_##WIDTH##xh_neon(dst, y_stride, above, left, HEIGHT); \ | 
 |     (void)bd;                                                             \ | 
 |   } | 
 |  | 
 | // Width 16. | 
 | HIGHBD_SMOOTH_V_NXM_WIDE(16, 4) | 
 | HIGHBD_SMOOTH_V_NXM_WIDE(16, 8) | 
 | HIGHBD_SMOOTH_V_NXM_WIDE(16, 16) | 
 | HIGHBD_SMOOTH_V_NXM_WIDE(16, 32) | 
 | HIGHBD_SMOOTH_V_NXM_WIDE(16, 64) | 
 | // Width 32. | 
 | HIGHBD_SMOOTH_V_NXM_WIDE(32, 8) | 
 | HIGHBD_SMOOTH_V_NXM_WIDE(32, 16) | 
 | HIGHBD_SMOOTH_V_NXM_WIDE(32, 32) | 
 | HIGHBD_SMOOTH_V_NXM_WIDE(32, 64) | 
 | // Width 64. | 
 | HIGHBD_SMOOTH_V_NXM_WIDE(64, 16) | 
 | HIGHBD_SMOOTH_V_NXM_WIDE(64, 32) | 
 | HIGHBD_SMOOTH_V_NXM_WIDE(64, 64) | 
 |  | 
 | #undef HIGHBD_SMOOTH_V_NXM_WIDE | 
 |  | 
 | // Smooth-H predictor for a 4-wide block of |height| rows. Pixel x of row y is | 
 | //   w[x] * left_column[y] + negate_s8(w)[x] * top_row[3] | 
 | // rounded and shifted right by SMOOTH_WEIGHT_LOG2_SCALE, where w holds the | 
 | // first four entries of smooth_weights_u16. Rows are written |stride| | 
 | // elements apart, starting at |dst|. | 
 | static INLINE void highbd_smooth_h_4xh_neon(uint16_t *dst, ptrdiff_t stride, | 
 |                                             const uint16_t *const top_row, | 
 |                                             const uint16_t *const left_column, | 
 |                                             const int height) { | 
 |   const uint16x4_t w = vld1_u16(smooth_weights_u16); | 
 |   // The top-right contribution is identical for every row; compute it once. | 
 |   const uint32x4_t tr_term = vmull_n_u16(negate_s8(w), top_row[3]); | 
 |  | 
 |   uint16_t *row = dst; | 
 |   const uint16_t *left = left_column; | 
 |   for (int remaining = height; remaining > 0; --remaining) { | 
 |     // Accumulate the left-pixel contribution on top of the fixed term. | 
 |     const uint32x4_t blend = vmlal_n_u16(tr_term, w, *left++); | 
 |     vst1_u16(row, vrshrn_n_u32(blend, SMOOTH_WEIGHT_LOG2_SCALE)); | 
 |     row += stride; | 
 |   } | 
 | } | 
 |  | 
 | // Smooth-H predictor for an 8-wide block of |height| rows. Works like the | 
 | // 4-wide version but on two 4-lane halves: the weights are the eight entries | 
 | // starting at smooth_weights_u16 + 4 and the fixed pixel is top_row[7]. | 
 | static INLINE void highbd_smooth_h_8xh_neon(uint16_t *dst, ptrdiff_t stride, | 
 |                                             const uint16_t *const top_row, | 
 |                                             const uint16_t *const left_column, | 
 |                                             const int height) { | 
 |   // Weights for columns 0-3 and columns 4-7. | 
 |   const uint16x4_t w_lo = vld1_u16(smooth_weights_u16 + 4); | 
 |   const uint16x4_t w_hi = vld1_u16(smooth_weights_u16 + 8); | 
 |  | 
 |   // Row-independent top-right contribution for each half. | 
 |   const uint16_t top_right = top_row[7]; | 
 |   const uint32x4_t tr_lo = vmull_n_u16(negate_s8(w_lo), top_right); | 
 |   const uint32x4_t tr_hi = vmull_n_u16(negate_s8(w_hi), top_right); | 
 |  | 
 |   uint16_t *row = dst; | 
 |   int y = 0; | 
 |   while (y < height) { | 
 |     const uint16_t left_px = left_column[y++]; | 
 |     const uint32x4_t blend_lo = vmlal_n_u16(tr_lo, w_lo, left_px); | 
 |     const uint32x4_t blend_hi = vmlal_n_u16(tr_hi, w_hi, left_px); | 
 |     vst1_u16(row, vrshrn_n_u32(blend_lo, SMOOTH_WEIGHT_LOG2_SCALE)); | 
 |     vst1_u16(row + 4, vrshrn_n_u32(blend_hi, SMOOTH_WEIGHT_LOG2_SCALE)); | 
 |     row += stride; | 
 |   } | 
 | } | 
 | /* | 
 |  * Defines the non-static aom_highbd_smooth_h_predictor_<W>x<H>_neon() | 
 |  * function for one block size. Each generated function ignores bd and | 
 |  * forwards dst, y_stride, above and left to highbd_smooth_h_<W>xh_neon(), | 
 |  * passing H as the row count. Used here for the 4- and 8-wide sizes. | 
 |  */ | 
 | #define HIGHBD_SMOOTH_H_NXM(W, H)                                \ | 
 |   void aom_highbd_smooth_h_predictor_##W##x##H##_neon(           \ | 
 |       uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \ | 
 |       const uint16_t *left, int bd) {                            \ | 
 |     (void)bd; /* bd is intentionally unused */                   \ | 
 |     highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \ | 
 |   } | 
 |  | 
 | HIGHBD_SMOOTH_H_NXM(4, 4) | 
 | HIGHBD_SMOOTH_H_NXM(4, 8) | 
 | HIGHBD_SMOOTH_H_NXM(4, 16) | 
 | HIGHBD_SMOOTH_H_NXM(8, 4) | 
 | HIGHBD_SMOOTH_H_NXM(8, 8) | 
 | HIGHBD_SMOOTH_H_NXM(8, 16) | 
 | HIGHBD_SMOOTH_H_NXM(8, 32) | 
 |  | 
 | #undef HIGHBD_SMOOTH_H_NXM | 
 |  | 
 | // For width 16 and above. | 
 | // Defines highbd_smooth_h_<W>xh_neon(), which writes |height| rows of W | 
 | // pixels, |stride| elements apart, starting at |dst|. Pixel x of row y is | 
 | //   w[x] * left_column[y] + negate_s8(w)[x] * top_row[W - 1] | 
 | // rounded and shifted right by SMOOTH_WEIGHT_LOG2_SCALE, with w[x] read from | 
 | // smooth_weights_u16 + W - 4 + x. The generated helpers are called by the | 
 | // size-specific wrappers in this file and are not declared in a header, so | 
 | // they are given internal linkage like the 4- and 8-wide helpers. | 
 | #define HIGHBD_SMOOTH_H_PREDICTOR(W)                                          \ | 
 |   static void highbd_smooth_h_##W##xh_neon(                                   \ | 
 |       uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row,         \ | 
 |       const uint16_t *const left_column, const int height) {                  \ | 
 |     const uint16_t top_right = top_row[(W)-1];                                \ | 
 |                                                                               \ | 
 |     /* One slot per group of 8 output columns. */                             \ | 
 |     uint16x4_t weights_x_low[(W) >> 3];                                       \ | 
 |     uint16x4_t weights_x_high[(W) >> 3];                                      \ | 
 |     uint32x4_t weighted_tr_low[(W) >> 3];                                     \ | 
 |     uint32x4_t weighted_tr_high[(W) >> 3];                                    \ | 
 |     for (int i = 0; i < (W) >> 3; ++i) {                                      \ | 
 |       const int x = i << 3;                                                   \ | 
 |       weights_x_low[i] = vld1_u16(smooth_weights_u16 + (W)-4 + x);            \ | 
 |       weighted_tr_low[i] =                                                    \ | 
 |           vmull_n_u16(negate_s8(weights_x_low[i]), top_right);                \ | 
 |       weights_x_high[i] = vld1_u16(smooth_weights_u16 + (W) + x);             \ | 
 |       weighted_tr_high[i] =                                                   \ | 
 |           vmull_n_u16(negate_s8(weights_x_high[i]), top_right);               \ | 
 |     }                                                                         \ | 
 |                                                                               \ | 
 |     for (int y = 0; y < height; ++y) {                                        \ | 
 |       uint16_t *dst_x = dst;                                                  \ | 
 |       const uint16_t left_y = left_column[y];                                 \ | 
 |       for (int i = 0; i < (W) >> 3; ++i) {                                    \ | 
 |         const uint32x4_t weighted_left_low =                                  \ | 
 |             vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y);        \ | 
 |         vst1_u16(dst_x,                                                       \ | 
 |                  vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE));  \ | 
 |                                                                               \ | 
 |         const uint32x4_t weighted_left_high =                                 \ | 
 |             vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y);      \ | 
 |         vst1_u16(dst_x + 4,                                                   \ | 
 |                  vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE)); \ | 
 |         dst_x += 8;                                                           \ | 
 |       }                                                                       \ | 
 |       dst += stride;                                                          \ | 
 |     }                                                                         \ | 
 |   } | 
 |  | 
 | HIGHBD_SMOOTH_H_PREDICTOR(16) | 
 | HIGHBD_SMOOTH_H_PREDICTOR(32) | 
 | HIGHBD_SMOOTH_H_PREDICTOR(64) | 
 |  | 
 | #undef HIGHBD_SMOOTH_H_PREDICTOR | 
 | /* | 
 |  * Defines the non-static aom_highbd_smooth_h_predictor_<W>x<H>_neon() | 
 |  * function for one block size of width 16, 32 or 64. Each generated | 
 |  * function ignores bd and forwards dst, y_stride, above and left to | 
 |  * highbd_smooth_h_<W>xh_neon(), passing H as the row count. | 
 |  */ | 
 | #define HIGHBD_SMOOTH_H_NXM_WIDE(W, H)                           \ | 
 |   void aom_highbd_smooth_h_predictor_##W##x##H##_neon(           \ | 
 |       uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \ | 
 |       const uint16_t *left, int bd) {                            \ | 
 |     (void)bd; /* bd is intentionally unused */                   \ | 
 |     highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \ | 
 |   } | 
 |  | 
 | HIGHBD_SMOOTH_H_NXM_WIDE(16, 4) | 
 | HIGHBD_SMOOTH_H_NXM_WIDE(16, 8) | 
 | HIGHBD_SMOOTH_H_NXM_WIDE(16, 16) | 
 | HIGHBD_SMOOTH_H_NXM_WIDE(16, 32) | 
 | HIGHBD_SMOOTH_H_NXM_WIDE(16, 64) | 
 | HIGHBD_SMOOTH_H_NXM_WIDE(32, 8) | 
 | HIGHBD_SMOOTH_H_NXM_WIDE(32, 16) | 
 | HIGHBD_SMOOTH_H_NXM_WIDE(32, 32) | 
 | HIGHBD_SMOOTH_H_NXM_WIDE(32, 64) | 
 | HIGHBD_SMOOTH_H_NXM_WIDE(64, 16) | 
 | HIGHBD_SMOOTH_H_NXM_WIDE(64, 32) | 
 | HIGHBD_SMOOTH_H_NXM_WIDE(64, 64) | 
 |  | 
 | #undef HIGHBD_SMOOTH_H_NXM_WIDE | 