intrapred_neon: add aom_smooth_v_predictor_NxM_neon

ported from libgav1 @ v0.17.0-83-ge54abf5c

Bug: b/231616924
Change-Id: I813d48e386823b457085370cf71776609b405243
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index c82b622..0b19b92 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -183,19 +183,19 @@
 specialize qw/aom_smooth_predictor_64x64 neon ssse3/;
 specialize qw/aom_smooth_predictor_64x32 neon ssse3/;
 
-specialize qw/aom_smooth_v_predictor_4x4 ssse3/;
-specialize qw/aom_smooth_v_predictor_4x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x4 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x64 ssse3/;
-specialize qw/aom_smooth_v_predictor_64x64 ssse3/;
-specialize qw/aom_smooth_v_predictor_64x32 ssse3/;
+specialize qw/aom_smooth_v_predictor_4x4 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_4x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x4 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x32 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x32 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x64 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_64x64 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_64x32 neon ssse3/;
 
 specialize qw/aom_smooth_h_predictor_4x4 ssse3/;
 specialize qw/aom_smooth_h_predictor_4x8 ssse3/;
@@ -260,12 +260,12 @@
 specialize qw/aom_smooth_predictor_32x8 neon ssse3/;
 specialize qw/aom_smooth_predictor_64x16 neon ssse3/;
 
-specialize qw/aom_smooth_v_predictor_4x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x4 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x64 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_64x16 ssse3/;
+specialize qw/aom_smooth_v_predictor_4x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x32 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x4 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x64 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_64x16 neon ssse3/;
 
 specialize qw/aom_smooth_h_predictor_4x16 ssse3/;
 specialize qw/aom_smooth_h_predictor_8x32 ssse3/;
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index bdc6a29..db383f2 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -2884,6 +2884,159 @@
 #undef SMOOTH_NXM_WIDE
 
 // -----------------------------------------------------------------------------
+// SMOOTH_V_PRED
+
+// For widths 4 and 8.
+#define SMOOTH_V_PREDICTOR(W)                                         \
+  static void smooth_v_##W##xh_neon(                                  \
+      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,   \
+      const uint8_t *const left_column, const int height) {           \
+    const uint8_t bottom_left = left_column[height - 1];              \
+    const uint8_t *const weights_y = smooth_weights + height - 4;     \
+                                                                      \
+    uint8x8_t UNINITIALIZED_IS_SAFE(top_v);                           \
+    if ((W) == 4) {                                                   \
+      load_u8_4x1(top_row, &top_v, 0);                                \
+    } else { /* width == 8 */                                         \
+      top_v = vld1_u8(top_row);                                       \
+    }                                                                 \
+                                                                      \
+    const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);           \
+                                                                      \
+    for (int y = 0; y < height; ++y) {                                \
+      const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);          \
+      const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);      \
+                                                                      \
+      const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);   \
+      const uint16x8_t weighted_top_bl =                              \
+          vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v);    \
+      const uint8x8_t pred =                                          \
+          vrshrn_n_u16(weighted_top_bl, SMOOTH_WEIGHT_LOG2_SCALE);    \
+                                                                      \
+      if ((W) == 4) {                                                 \
+        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0); \
+      } else { /* width == 8 */                                       \
+        vst1_u8(dst, pred);                                           \
+      }                                                               \
+      dst += stride;                                                  \
+    }                                                                 \
+  }
+
+SMOOTH_V_PREDICTOR(4)
+SMOOTH_V_PREDICTOR(8)
+
+#undef SMOOTH_V_PREDICTOR
+
+#define SMOOTH_V_NXM(W, H)                                    \
+  void aom_smooth_v_predictor_##W##x##H##_neon(               \
+      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+      const uint8_t *left) {                                  \
+    smooth_v_##W##xh_neon(dst, y_stride, above, left, H);     \
+  }
+
+SMOOTH_V_NXM(4, 4)
+SMOOTH_V_NXM(4, 8)
+SMOOTH_V_NXM(4, 16)
+SMOOTH_V_NXM(8, 4)
+SMOOTH_V_NXM(8, 8)
+SMOOTH_V_NXM(8, 16)
+SMOOTH_V_NXM(8, 32)
+
+#undef SMOOTH_V_NXM
+
+static INLINE uint8x16_t calculate_vertical_weights_and_pred(
+    const uint8x16_t top, const uint8x8_t weights_y,
+    const uint16x8_t weighted_bl) {
+  const uint16x8_t pred_low =
+      vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
+  const uint16x8_t pred_high =
+      vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
+  const uint8x8_t pred_scaled_low =
+      vrshrn_n_u16(pred_low, SMOOTH_WEIGHT_LOG2_SCALE);
+  const uint8x8_t pred_scaled_high =
+      vrshrn_n_u16(pred_high, SMOOTH_WEIGHT_LOG2_SCALE);
+  return vcombine_u8(pred_scaled_low, pred_scaled_high);
+}
+
+// For width 16 and above.
+#define SMOOTH_V_PREDICTOR(W)                                            \
+  static void smooth_v_##W##xh_neon(                                     \
+      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,      \
+      const uint8_t *const left_column, const int height) {              \
+    const uint8_t bottom_left = left_column[height - 1];                 \
+    const uint8_t *const weights_y = smooth_weights + height - 4;        \
+                                                                         \
+    uint8x16_t top_v[4];                                                 \
+    top_v[0] = vld1q_u8(top_row);                                        \
+    if ((W) > 16) {                                                      \
+      top_v[1] = vld1q_u8(top_row + 16);                                 \
+      if ((W) == 64) {                                                   \
+        top_v[2] = vld1q_u8(top_row + 32);                               \
+        top_v[3] = vld1q_u8(top_row + 48);                               \
+      }                                                                  \
+    }                                                                    \
+                                                                         \
+    const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);              \
+                                                                         \
+    for (int y = 0; y < height; ++y) {                                   \
+      const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);             \
+      const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);         \
+      const uint16x8_t weighted_bl =                                     \
+          vmull_u8(scaled_weights_y, bottom_left_v);                     \
+                                                                         \
+      const uint8x16_t pred_0 = calculate_vertical_weights_and_pred(     \
+          top_v[0], weights_y_v, weighted_bl);                           \
+      vst1q_u8(dst, pred_0);                                             \
+                                                                         \
+      if ((W) > 16) {                                                    \
+        const uint8x16_t pred_1 = calculate_vertical_weights_and_pred(   \
+            top_v[1], weights_y_v, weighted_bl);                         \
+        vst1q_u8(dst + 16, pred_1);                                      \
+                                                                         \
+        if ((W) == 64) {                                                 \
+          const uint8x16_t pred_2 = calculate_vertical_weights_and_pred( \
+              top_v[2], weights_y_v, weighted_bl);                       \
+          vst1q_u8(dst + 32, pred_2);                                    \
+                                                                         \
+          const uint8x16_t pred_3 = calculate_vertical_weights_and_pred( \
+              top_v[3], weights_y_v, weighted_bl);                       \
+          vst1q_u8(dst + 48, pred_3);                                    \
+        }                                                                \
+      }                                                                  \
+                                                                         \
+      dst += stride;                                                     \
+    }                                                                    \
+  }
+
+SMOOTH_V_PREDICTOR(16)
+SMOOTH_V_PREDICTOR(32)
+SMOOTH_V_PREDICTOR(64)
+
+#undef SMOOTH_V_PREDICTOR
+
+#define SMOOTH_V_NXM_WIDE(W, H)                               \
+  void aom_smooth_v_predictor_##W##x##H##_neon(               \
+      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+      const uint8_t *left) {                                  \
+    smooth_v_##W##xh_neon(dst, y_stride, above, left, H);     \
+  }
+
+SMOOTH_V_NXM_WIDE(16, 4)
+SMOOTH_V_NXM_WIDE(16, 8)
+SMOOTH_V_NXM_WIDE(16, 16)
+SMOOTH_V_NXM_WIDE(16, 32)
+SMOOTH_V_NXM_WIDE(16, 64)
+SMOOTH_V_NXM_WIDE(32, 8)
+SMOOTH_V_NXM_WIDE(32, 16)
+SMOOTH_V_NXM_WIDE(32, 32)
+SMOOTH_V_NXM_WIDE(32, 64)
+SMOOTH_V_NXM_WIDE(64, 16)
+SMOOTH_V_NXM_WIDE(64, 32)
+SMOOTH_V_NXM_WIDE(64, 64)
+
+#undef SMOOTH_V_NXM_WIDE
+
+// -----------------------------------------------------------------------------
 // PAETH
 
 static INLINE void paeth_4or8_x_h_neon(uint8_t *dest, ptrdiff_t stride,
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index a219427..f96906d 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -333,25 +333,38 @@
 
 #if HAVE_NEON
 const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorNeon[] = {
-  lowbd_entry(smooth, 4, 4, neon),   lowbd_entry(smooth, 4, 8, neon),
-  lowbd_entry(smooth, 4, 16, neon),  lowbd_entry(smooth, 8, 4, neon),
-  lowbd_entry(smooth, 8, 8, neon),   lowbd_entry(smooth, 8, 16, neon),
-  lowbd_entry(smooth, 8, 32, neon),  lowbd_entry(smooth, 16, 4, neon),
-  lowbd_entry(smooth, 16, 8, neon),  lowbd_entry(smooth, 16, 16, neon),
-  lowbd_entry(smooth, 16, 32, neon), lowbd_entry(smooth, 16, 64, neon),
-  lowbd_entry(smooth, 32, 8, neon),  lowbd_entry(smooth, 32, 16, neon),
-  lowbd_entry(smooth, 32, 32, neon), lowbd_entry(smooth, 32, 64, neon),
-  lowbd_entry(smooth, 64, 16, neon), lowbd_entry(smooth, 64, 32, neon),
-  lowbd_entry(smooth, 64, 64, neon), lowbd_entry(paeth, 4, 4, neon),
-  lowbd_entry(paeth, 4, 8, neon),    lowbd_entry(paeth, 4, 16, neon),
-  lowbd_entry(paeth, 8, 4, neon),    lowbd_entry(paeth, 8, 8, neon),
-  lowbd_entry(paeth, 8, 16, neon),   lowbd_entry(paeth, 8, 32, neon),
-  lowbd_entry(paeth, 16, 4, neon),   lowbd_entry(paeth, 16, 8, neon),
-  lowbd_entry(paeth, 16, 16, neon),  lowbd_entry(paeth, 16, 32, neon),
-  lowbd_entry(paeth, 16, 64, neon),  lowbd_entry(paeth, 32, 8, neon),
-  lowbd_entry(paeth, 32, 16, neon),  lowbd_entry(paeth, 32, 32, neon),
-  lowbd_entry(paeth, 32, 64, neon),  lowbd_entry(paeth, 64, 16, neon),
-  lowbd_entry(paeth, 64, 32, neon),  lowbd_entry(paeth, 64, 64, neon),
+  lowbd_entry(smooth, 4, 4, neon),     lowbd_entry(smooth, 4, 8, neon),
+  lowbd_entry(smooth, 4, 16, neon),    lowbd_entry(smooth, 8, 4, neon),
+  lowbd_entry(smooth, 8, 8, neon),     lowbd_entry(smooth, 8, 16, neon),
+  lowbd_entry(smooth, 8, 32, neon),    lowbd_entry(smooth, 16, 4, neon),
+  lowbd_entry(smooth, 16, 8, neon),    lowbd_entry(smooth, 16, 16, neon),
+  lowbd_entry(smooth, 16, 32, neon),   lowbd_entry(smooth, 16, 64, neon),
+  lowbd_entry(smooth, 32, 8, neon),    lowbd_entry(smooth, 32, 16, neon),
+  lowbd_entry(smooth, 32, 32, neon),   lowbd_entry(smooth, 32, 64, neon),
+  lowbd_entry(smooth, 64, 16, neon),   lowbd_entry(smooth, 64, 32, neon),
+  lowbd_entry(smooth, 64, 64, neon),
+
+  lowbd_entry(smooth_v, 4, 4, neon),   lowbd_entry(smooth_v, 4, 8, neon),
+  lowbd_entry(smooth_v, 4, 16, neon),  lowbd_entry(smooth_v, 8, 4, neon),
+  lowbd_entry(smooth_v, 8, 8, neon),   lowbd_entry(smooth_v, 8, 16, neon),
+  lowbd_entry(smooth_v, 8, 32, neon),  lowbd_entry(smooth_v, 16, 4, neon),
+  lowbd_entry(smooth_v, 16, 8, neon),  lowbd_entry(smooth_v, 16, 16, neon),
+  lowbd_entry(smooth_v, 16, 32, neon), lowbd_entry(smooth_v, 16, 64, neon),
+  lowbd_entry(smooth_v, 32, 8, neon),  lowbd_entry(smooth_v, 32, 16, neon),
+  lowbd_entry(smooth_v, 32, 32, neon), lowbd_entry(smooth_v, 32, 64, neon),
+  lowbd_entry(smooth_v, 64, 16, neon), lowbd_entry(smooth_v, 64, 32, neon),
+  lowbd_entry(smooth_v, 64, 64, neon),
+
+  lowbd_entry(paeth, 4, 4, neon),      lowbd_entry(paeth, 4, 8, neon),
+  lowbd_entry(paeth, 4, 16, neon),     lowbd_entry(paeth, 8, 4, neon),
+  lowbd_entry(paeth, 8, 8, neon),      lowbd_entry(paeth, 8, 16, neon),
+  lowbd_entry(paeth, 8, 32, neon),     lowbd_entry(paeth, 16, 4, neon),
+  lowbd_entry(paeth, 16, 8, neon),     lowbd_entry(paeth, 16, 16, neon),
+  lowbd_entry(paeth, 16, 32, neon),    lowbd_entry(paeth, 16, 64, neon),
+  lowbd_entry(paeth, 32, 8, neon),     lowbd_entry(paeth, 32, 16, neon),
+  lowbd_entry(paeth, 32, 32, neon),    lowbd_entry(paeth, 32, 64, neon),
+  lowbd_entry(paeth, 64, 16, neon),    lowbd_entry(paeth, 64, 32, neon),
+  lowbd_entry(paeth, 64, 64, neon),
 };
 
 INSTANTIATE_TEST_SUITE_P(NEON, LowbdIntraPredTest,
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 50cc542..cbaca11 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -468,13 +468,14 @@
                 aom_dc_left_predictor_4x4_neon, aom_dc_top_predictor_4x4_neon,
                 aom_dc_128_predictor_4x4_neon, aom_v_predictor_4x4_neon,
                 aom_h_predictor_4x4_neon, aom_paeth_predictor_4x4_neon,
-                aom_smooth_predictor_4x4_neon, NULL, NULL)
+                aom_smooth_predictor_4x4_neon, aom_smooth_v_predictor_4x4_neon,
+                NULL)
 INTRA_PRED_TEST(NEON, TX_4X8, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_4x8_neon, aom_smooth_predictor_4x8_neon,
-                NULL, NULL)
+                aom_smooth_v_predictor_4x8_neon, NULL)
 INTRA_PRED_TEST(NEON, TX_4X16, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_4x16_neon, aom_smooth_predictor_4x16_neon,
-                NULL, NULL)
+                aom_smooth_v_predictor_4x16_neon, NULL)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
@@ -559,16 +560,17 @@
                 aom_dc_left_predictor_8x8_neon, aom_dc_top_predictor_8x8_neon,
                 aom_dc_128_predictor_8x8_neon, aom_v_predictor_8x8_neon,
                 aom_h_predictor_8x8_neon, aom_paeth_predictor_8x8_neon,
-                aom_smooth_predictor_8x8_neon, NULL, NULL)
+                aom_smooth_predictor_8x8_neon, aom_smooth_v_predictor_8x8_neon,
+                NULL)
 INTRA_PRED_TEST(NEON, TX_8X4, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_8x4_neon, aom_smooth_predictor_8x4_neon,
-                NULL, NULL)
+                aom_smooth_v_predictor_8x4_neon, NULL)
 INTRA_PRED_TEST(NEON, TX_8X16, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_8x16_neon, aom_smooth_predictor_8x16_neon,
-                NULL, NULL)
+                aom_smooth_v_predictor_8x16_neon, NULL)
 INTRA_PRED_TEST(NEON, TX_8X32, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_8x32_neon, aom_smooth_predictor_8x32_neon,
-                NULL, NULL)
+                aom_smooth_v_predictor_8x32_neon, NULL)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
@@ -686,19 +688,20 @@
                 aom_dc_top_predictor_16x16_neon,
                 aom_dc_128_predictor_16x16_neon, aom_v_predictor_16x16_neon,
                 aom_h_predictor_16x16_neon, aom_paeth_predictor_16x16_neon,
-                aom_smooth_predictor_16x16_neon, NULL, NULL)
+                aom_smooth_predictor_16x16_neon,
+                aom_smooth_v_predictor_16x16_neon, NULL)
 INTRA_PRED_TEST(NEON, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x8_neon, aom_smooth_predictor_16x8_neon,
-                NULL, NULL)
+                aom_smooth_v_predictor_16x8_neon, NULL)
 INTRA_PRED_TEST(NEON, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x32_neon, aom_smooth_predictor_16x32_neon,
-                NULL, NULL)
+                aom_smooth_v_predictor_16x32_neon, NULL)
 INTRA_PRED_TEST(NEON, TX_16X4, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x4_neon, aom_smooth_predictor_16x4_neon,
-                NULL, NULL)
+                aom_smooth_v_predictor_16x4_neon, NULL)
 INTRA_PRED_TEST(NEON, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x64_neon, aom_smooth_predictor_16x64_neon,
-                NULL, NULL)
+                aom_smooth_v_predictor_16x64_neon, NULL)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
@@ -805,16 +808,17 @@
                 aom_dc_top_predictor_32x32_neon,
                 aom_dc_128_predictor_32x32_neon, aom_v_predictor_32x32_neon,
                 aom_h_predictor_32x32_neon, aom_paeth_predictor_32x32_neon,
-                aom_smooth_predictor_32x32_neon, NULL, NULL)
+                aom_smooth_predictor_32x32_neon,
+                aom_smooth_v_predictor_32x32_neon, NULL)
 INTRA_PRED_TEST(NEON, TX_32X16, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_32x16_neon, aom_smooth_predictor_32x16_neon,
-                NULL, NULL)
+                aom_smooth_v_predictor_32x16_neon, NULL)
 INTRA_PRED_TEST(NEON, TX_32X64, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_32x64_neon, aom_smooth_predictor_32x64_neon,
-                NULL, NULL)
+                aom_smooth_v_predictor_32x64_neon, NULL)
 INTRA_PRED_TEST(NEON, TX_32X8, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_32x8_neon, aom_smooth_predictor_32x8_neon,
-                NULL, NULL)
+                aom_smooth_v_predictor_32x8_neon, NULL)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
@@ -903,13 +907,13 @@
 #if HAVE_NEON
 INTRA_PRED_TEST(NEON, TX_64X64, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_64x64_neon, aom_smooth_predictor_64x64_neon,
-                NULL, NULL)
+                aom_smooth_v_predictor_64x64_neon, NULL)
 INTRA_PRED_TEST(NEON, TX_64X32, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_64x32_neon, aom_smooth_predictor_64x32_neon,
-                NULL, NULL)
+                aom_smooth_v_predictor_64x32_neon, NULL)
 INTRA_PRED_TEST(NEON, TX_64X16, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_64x16_neon, aom_smooth_predictor_64x16_neon,
-                NULL, NULL)
+                aom_smooth_v_predictor_64x16_neon, NULL)
 #endif  // HAVE_NEON
 
 #if CONFIG_AV1_HIGHBITDEPTH