Add Neon implementations for remaining v predictors

We already have Neon implementations of the v predictors for all square
block sizes except 64x64, so add the remaining cases and update
tests/speed to match.

On Neoverse V1, these new implementations are worth about a 10%
improvement for the largest and smallest cases on both Clang 15 and GCC
12, and around parity for the remainder with both compilers.

Change-Id: I945f550608e75e978a48b253c2d81e04955c0be5
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index dab303c..0bf1a9c 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -146,24 +146,24 @@
 specialize qw/aom_dc_128_predictor_64x64 neon sse2 avx2/;
 
 specialize qw/aom_v_predictor_4x4 neon sse2/;
-specialize qw/aom_v_predictor_4x8 sse2/;
-specialize qw/aom_v_predictor_4x16 sse2/;
-specialize qw/aom_v_predictor_8x4 sse2/;
+specialize qw/aom_v_predictor_4x8 neon sse2/;
+specialize qw/aom_v_predictor_4x16 neon sse2/;
+specialize qw/aom_v_predictor_8x4 neon sse2/;
 specialize qw/aom_v_predictor_8x8 neon sse2/;
-specialize qw/aom_v_predictor_8x16 sse2/;
-specialize qw/aom_v_predictor_8x32 sse2/;
-specialize qw/aom_v_predictor_16x4 sse2/;
-specialize qw/aom_v_predictor_16x8 sse2/;
+specialize qw/aom_v_predictor_8x16 neon sse2/;
+specialize qw/aom_v_predictor_8x32 neon sse2/;
+specialize qw/aom_v_predictor_16x4 neon sse2/;
+specialize qw/aom_v_predictor_16x8 neon sse2/;
 specialize qw/aom_v_predictor_16x16 neon sse2/;
-specialize qw/aom_v_predictor_16x32 sse2/;
-specialize qw/aom_v_predictor_16x64 sse2/;
-specialize qw/aom_v_predictor_32x8 sse2/;
-specialize qw/aom_v_predictor_32x16 sse2 avx2/;
+specialize qw/aom_v_predictor_16x32 neon sse2/;
+specialize qw/aom_v_predictor_16x64 neon sse2/;
+specialize qw/aom_v_predictor_32x8 neon sse2/;
+specialize qw/aom_v_predictor_32x16 neon sse2 avx2/;
 specialize qw/aom_v_predictor_32x32 neon sse2 avx2/;
-specialize qw/aom_v_predictor_32x64 sse2 avx2/;
-specialize qw/aom_v_predictor_64x16 sse2 avx2/;
-specialize qw/aom_v_predictor_64x32 sse2 avx2/;
-specialize qw/aom_v_predictor_64x64 sse2 avx2/;
+specialize qw/aom_v_predictor_32x64 neon sse2 avx2/;
+specialize qw/aom_v_predictor_64x16 neon sse2 avx2/;
+specialize qw/aom_v_predictor_64x32 neon sse2 avx2/;
+specialize qw/aom_v_predictor_64x64 neon sse2 avx2/;
 
 specialize qw/aom_h_predictor_4x4 neon sse2/;
 specialize qw/aom_h_predictor_4x8 sse2/;
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index c724fa5..5aa401b 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -603,52 +603,186 @@
   vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
 }
 
+// -----------------------------------------------------------------------------
+
+static INLINE void v_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
+                               uint8x8_t d0) {
+  for (int i = 0; i < h; ++i) {
+    store_u8_4x1(dst + i * stride, d0, 0);
+  }
+}
+
+static INLINE void v_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
+                               uint8x8_t d0) {
+  for (int i = 0; i < h; ++i) {
+    vst1_u8(dst + i * stride, d0);
+  }
+}
+
+static INLINE void v_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
+                                uint8x16_t d0) {
+  for (int i = 0; i < h; ++i) {
+    vst1q_u8(dst + i * stride, d0);
+  }
+}
+
+static INLINE void v_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
+                                uint8x16_t d0, uint8x16_t d1) {
+  for (int i = 0; i < h; ++i) {
+    vst1q_u8(dst + 0, d0);
+    vst1q_u8(dst + 16, d1);
+    dst += stride;
+  }
+}
+
+static INLINE void v_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
+                                uint8x16_t d0, uint8x16_t d1, uint8x16_t d2,
+                                uint8x16_t d3) {
+  for (int i = 0; i < h; ++i) {
+    vst1q_u8(dst + 0, d0);
+    vst1q_u8(dst + 16, d1);
+    vst1q_u8(dst + 32, d2);
+    vst1q_u8(dst + 48, d3);
+    dst += stride;
+  }
+}
+
 void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
-  int i;
-  uint32x2_t d0u32 = vdup_n_u32(0);
   (void)left;
-
-  d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
-  for (i = 0; i < 4; i++, dst += stride)
-    vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+  v_store_4xh(dst, stride, 4, load_u8_4x1_lane0(above));
 }
 
 void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
-  int i;
-  uint8x8_t d0u8 = vdup_n_u8(0);
   (void)left;
-
-  d0u8 = vld1_u8(above);
-  for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
+  v_store_8xh(dst, stride, 8, vld1_u8(above));
 }
 
 void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
-  int i;
-  uint8x16_t q0u8 = vdupq_n_u8(0);
   (void)left;
-
-  q0u8 = vld1q_u8(above);
-  for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
+  v_store_16xh(dst, stride, 16, vld1q_u8(above));
 }
 
 void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
-  int i;
-  uint8x16_t q0u8 = vdupq_n_u8(0);
-  uint8x16_t q1u8 = vdupq_n_u8(0);
+  const uint8x16_t d0 = vld1q_u8(above);
+  const uint8x16_t d1 = vld1q_u8(above + 16);
   (void)left;
-
-  q0u8 = vld1q_u8(above);
-  q1u8 = vld1q_u8(above + 16);
-  for (i = 0; i < 32; i++, dst += stride) {
-    vst1q_u8(dst, q0u8);
-    vst1q_u8(dst + 16, q1u8);
-  }
+  v_store_32xh(dst, stride, 32, d0, d1);
 }
 
+void aom_v_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_store_4xh(dst, stride, 8, load_u8_4x1_lane0(above));
+}
+
+void aom_v_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_store_4xh(dst, stride, 16, load_u8_4x1_lane0(above));
+}
+
+void aom_v_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_store_8xh(dst, stride, 4, vld1_u8(above));
+}
+
+void aom_v_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_store_8xh(dst, stride, 16, vld1_u8(above));
+}
+
+void aom_v_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_store_8xh(dst, stride, 32, vld1_u8(above));
+}
+
+void aom_v_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_store_16xh(dst, stride, 4, vld1q_u8(above));
+}
+
+void aom_v_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_store_16xh(dst, stride, 8, vld1q_u8(above));
+}
+
+void aom_v_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_store_16xh(dst, stride, 32, vld1q_u8(above));
+}
+
+void aom_v_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_store_16xh(dst, stride, 64, vld1q_u8(above));
+}
+
+void aom_v_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d0 = vld1q_u8(above);
+  const uint8x16_t d1 = vld1q_u8(above + 16);
+  (void)left;
+  v_store_32xh(dst, stride, 8, d0, d1);
+}
+
+void aom_v_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d0 = vld1q_u8(above);
+  const uint8x16_t d1 = vld1q_u8(above + 16);
+  (void)left;
+  v_store_32xh(dst, stride, 16, d0, d1);
+}
+
+void aom_v_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d0 = vld1q_u8(above);
+  const uint8x16_t d1 = vld1q_u8(above + 16);
+  (void)left;
+  v_store_32xh(dst, stride, 64, d0, d1);
+}
+
+void aom_v_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d0 = vld1q_u8(above);
+  const uint8x16_t d1 = vld1q_u8(above + 16);
+  const uint8x16_t d2 = vld1q_u8(above + 32);
+  const uint8x16_t d3 = vld1q_u8(above + 48);
+  (void)left;
+  v_store_64xh(dst, stride, 16, d0, d1, d2, d3);
+}
+
+void aom_v_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d0 = vld1q_u8(above);
+  const uint8x16_t d1 = vld1q_u8(above + 16);
+  const uint8x16_t d2 = vld1q_u8(above + 32);
+  const uint8x16_t d3 = vld1q_u8(above + 48);
+  (void)left;
+  v_store_64xh(dst, stride, 32, d0, d1, d2, d3);
+}
+
+void aom_v_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d0 = vld1q_u8(above);
+  const uint8x16_t d1 = vld1q_u8(above + 16);
+  const uint8x16_t d2 = vld1q_u8(above + 32);
+  const uint8x16_t d3 = vld1q_u8(above + 48);
+  (void)left;
+  v_store_64xh(dst, stride, 64, d0, d1, d2, d3);
+}
+
+// -----------------------------------------------------------------------------
+
 void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
   uint8x8_t d0u8 = vdup_n_u8(0);
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index 01a5dc2..47f35f5 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -342,9 +342,7 @@
 const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorNeon[] = {
   lowbd_intrapred(dc, neon),       lowbd_intrapred(dc_top, neon),
   lowbd_intrapred(dc_left, neon),  lowbd_intrapred(dc_128, neon),
-
-  lowbd_entry(v, 4, 4, neon),      lowbd_entry(v, 8, 8, neon),
-  lowbd_entry(v, 16, 16, neon),    lowbd_entry(v, 32, 32, neon),
+  lowbd_intrapred(v, neon),
 
   lowbd_entry(h, 4, 4, neon),      lowbd_entry(h, 8, 8, neon),
   lowbd_entry(h, 16, 16, neon),    lowbd_entry(h, 32, 32, neon),
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 526f678..e3751fb 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -470,14 +470,15 @@
                 aom_smooth_h_predictor_4x4_neon)
 INTRA_PRED_TEST(NEON, TX_4X8, aom_dc_predictor_4x8_neon,
                 aom_dc_left_predictor_4x8_neon, aom_dc_top_predictor_4x8_neon,
-                aom_dc_128_predictor_4x8_neon, nullptr, nullptr,
-                aom_paeth_predictor_4x8_neon, aom_smooth_predictor_4x8_neon,
-                aom_smooth_v_predictor_4x8_neon,
+                aom_dc_128_predictor_4x8_neon, aom_v_predictor_4x8_neon,
+                nullptr, aom_paeth_predictor_4x8_neon,
+                aom_smooth_predictor_4x8_neon, aom_smooth_v_predictor_4x8_neon,
                 aom_smooth_h_predictor_4x8_neon)
 INTRA_PRED_TEST(NEON, TX_4X16, aom_dc_predictor_4x16_neon,
                 aom_dc_left_predictor_4x16_neon, aom_dc_top_predictor_4x16_neon,
-                aom_dc_128_predictor_4x16_neon, nullptr, nullptr,
-                aom_paeth_predictor_4x16_neon, aom_smooth_predictor_4x16_neon,
+                aom_dc_128_predictor_4x16_neon, aom_v_predictor_4x16_neon,
+                nullptr, aom_paeth_predictor_4x16_neon,
+                aom_smooth_predictor_4x16_neon,
                 aom_smooth_v_predictor_4x16_neon,
                 aom_smooth_h_predictor_4x16_neon)
 #endif  // HAVE_NEON
@@ -560,20 +561,22 @@
                 aom_smooth_h_predictor_8x8_neon)
 INTRA_PRED_TEST(NEON, TX_8X4, aom_dc_predictor_8x4_neon,
                 aom_dc_left_predictor_8x4_neon, aom_dc_top_predictor_8x4_neon,
-                aom_dc_128_predictor_8x4_neon, nullptr, nullptr,
-                aom_paeth_predictor_8x4_neon, aom_smooth_predictor_8x4_neon,
-                aom_smooth_v_predictor_8x4_neon,
+                aom_dc_128_predictor_8x4_neon, aom_v_predictor_8x4_neon,
+                nullptr, aom_paeth_predictor_8x4_neon,
+                aom_smooth_predictor_8x4_neon, aom_smooth_v_predictor_8x4_neon,
                 aom_smooth_h_predictor_8x4_neon)
 INTRA_PRED_TEST(NEON, TX_8X16, aom_dc_predictor_8x16_neon,
                 aom_dc_left_predictor_8x16_neon, aom_dc_top_predictor_8x16_neon,
-                aom_dc_128_predictor_8x16_neon, nullptr, nullptr,
-                aom_paeth_predictor_8x16_neon, aom_smooth_predictor_8x16_neon,
+                aom_dc_128_predictor_8x16_neon, aom_v_predictor_8x16_neon,
+                nullptr, aom_paeth_predictor_8x16_neon,
+                aom_smooth_predictor_8x16_neon,
                 aom_smooth_v_predictor_8x16_neon,
                 aom_smooth_h_predictor_8x16_neon)
 INTRA_PRED_TEST(NEON, TX_8X32, aom_dc_predictor_8x32_neon,
                 aom_dc_left_predictor_8x32_neon, aom_dc_top_predictor_8x32_neon,
-                aom_dc_128_predictor_8x32_neon, nullptr, nullptr,
-                aom_paeth_predictor_8x32_neon, aom_smooth_predictor_8x32_neon,
+                aom_dc_128_predictor_8x32_neon, aom_v_predictor_8x32_neon,
+                nullptr, aom_paeth_predictor_8x32_neon,
+                aom_smooth_predictor_8x32_neon,
                 aom_smooth_v_predictor_8x32_neon,
                 aom_smooth_h_predictor_8x32_neon)
 #endif  // HAVE_NEON
@@ -692,28 +695,32 @@
                 aom_smooth_h_predictor_16x16_neon)
 INTRA_PRED_TEST(NEON, TX_16X8, aom_dc_predictor_16x8_neon,
                 aom_dc_left_predictor_16x8_neon, aom_dc_top_predictor_16x8_neon,
-                aom_dc_128_predictor_16x8_neon, nullptr, nullptr,
-                aom_paeth_predictor_16x8_neon, aom_smooth_predictor_16x8_neon,
+                aom_dc_128_predictor_16x8_neon, aom_v_predictor_16x8_neon,
+                nullptr, aom_paeth_predictor_16x8_neon,
+                aom_smooth_predictor_16x8_neon,
                 aom_smooth_v_predictor_16x8_neon,
                 aom_smooth_h_predictor_16x8_neon)
 INTRA_PRED_TEST(NEON, TX_16X32, aom_dc_predictor_16x32_neon,
                 aom_dc_left_predictor_16x32_neon,
                 aom_dc_top_predictor_16x32_neon,
-                aom_dc_128_predictor_16x32_neon, nullptr, nullptr,
-                aom_paeth_predictor_16x32_neon, aom_smooth_predictor_16x32_neon,
+                aom_dc_128_predictor_16x32_neon, aom_v_predictor_16x32_neon,
+                nullptr, aom_paeth_predictor_16x32_neon,
+                aom_smooth_predictor_16x32_neon,
                 aom_smooth_v_predictor_16x32_neon,
                 aom_smooth_h_predictor_16x32_neon)
 INTRA_PRED_TEST(NEON, TX_16X4, aom_dc_predictor_16x4_neon,
                 aom_dc_left_predictor_16x4_neon, aom_dc_top_predictor_16x4_neon,
-                aom_dc_128_predictor_16x4_neon, nullptr, nullptr,
-                aom_paeth_predictor_16x4_neon, aom_smooth_predictor_16x4_neon,
+                aom_dc_128_predictor_16x4_neon, aom_v_predictor_16x4_neon,
+                nullptr, aom_paeth_predictor_16x4_neon,
+                aom_smooth_predictor_16x4_neon,
                 aom_smooth_v_predictor_16x4_neon,
                 aom_smooth_h_predictor_16x4_neon)
 INTRA_PRED_TEST(NEON, TX_16X64, aom_dc_predictor_16x64_neon,
                 aom_dc_left_predictor_16x64_neon,
                 aom_dc_top_predictor_16x64_neon,
-                aom_dc_128_predictor_16x64_neon, nullptr, nullptr,
-                aom_paeth_predictor_16x64_neon, aom_smooth_predictor_16x64_neon,
+                aom_dc_128_predictor_16x64_neon, aom_v_predictor_16x64_neon,
+                nullptr, aom_paeth_predictor_16x64_neon,
+                aom_smooth_predictor_16x64_neon,
                 aom_smooth_v_predictor_16x64_neon,
                 aom_smooth_h_predictor_16x64_neon)
 #endif  // HAVE_NEON
@@ -824,21 +831,24 @@
 INTRA_PRED_TEST(NEON, TX_32X16, aom_dc_predictor_32x16_neon,
                 aom_dc_left_predictor_32x16_neon,
                 aom_dc_top_predictor_32x16_neon,
-                aom_dc_128_predictor_32x16_neon, nullptr, nullptr,
-                aom_paeth_predictor_32x16_neon, aom_smooth_predictor_32x16_neon,
+                aom_dc_128_predictor_32x16_neon, aom_v_predictor_32x16_neon,
+                nullptr, aom_paeth_predictor_32x16_neon,
+                aom_smooth_predictor_32x16_neon,
                 aom_smooth_v_predictor_32x16_neon,
                 aom_smooth_h_predictor_32x16_neon)
 INTRA_PRED_TEST(NEON, TX_32X64, aom_dc_predictor_32x64_neon,
                 aom_dc_left_predictor_32x64_neon,
                 aom_dc_top_predictor_32x64_neon,
-                aom_dc_128_predictor_32x64_neon, nullptr, nullptr,
-                aom_paeth_predictor_32x64_neon, aom_smooth_predictor_32x64_neon,
+                aom_dc_128_predictor_32x64_neon, aom_v_predictor_32x64_neon,
+                nullptr, aom_paeth_predictor_32x64_neon,
+                aom_smooth_predictor_32x64_neon,
                 aom_smooth_v_predictor_32x64_neon,
                 aom_smooth_h_predictor_32x64_neon)
 INTRA_PRED_TEST(NEON, TX_32X8, aom_dc_predictor_32x8_neon,
                 aom_dc_left_predictor_32x8_neon, aom_dc_top_predictor_32x8_neon,
-                aom_dc_128_predictor_32x8_neon, nullptr, nullptr,
-                aom_paeth_predictor_32x8_neon, aom_smooth_predictor_32x8_neon,
+                aom_dc_128_predictor_32x8_neon, aom_v_predictor_32x8_neon,
+                nullptr, aom_paeth_predictor_32x8_neon,
+                aom_smooth_predictor_32x8_neon,
                 aom_smooth_v_predictor_32x8_neon,
                 aom_smooth_h_predictor_32x8_neon)
 #endif  // HAVE_NEON
@@ -926,22 +936,25 @@
 INTRA_PRED_TEST(NEON, TX_64X64, aom_dc_predictor_64x64_neon,
                 aom_dc_left_predictor_64x64_neon,
                 aom_dc_top_predictor_64x64_neon,
-                aom_dc_128_predictor_64x64_neon, nullptr, nullptr,
-                aom_paeth_predictor_64x64_neon, aom_smooth_predictor_64x64_neon,
+                aom_dc_128_predictor_64x64_neon, aom_v_predictor_64x64_neon,
+                nullptr, aom_paeth_predictor_64x64_neon,
+                aom_smooth_predictor_64x64_neon,
                 aom_smooth_v_predictor_64x64_neon,
                 aom_smooth_h_predictor_64x64_neon)
 INTRA_PRED_TEST(NEON, TX_64X32, aom_dc_predictor_64x32_neon,
                 aom_dc_left_predictor_64x32_neon,
                 aom_dc_top_predictor_64x32_neon,
-                aom_dc_128_predictor_64x32_neon, nullptr, nullptr,
-                aom_paeth_predictor_64x32_neon, aom_smooth_predictor_64x32_neon,
+                aom_dc_128_predictor_64x32_neon, aom_v_predictor_64x32_neon,
+                nullptr, aom_paeth_predictor_64x32_neon,
+                aom_smooth_predictor_64x32_neon,
                 aom_smooth_v_predictor_64x32_neon,
                 aom_smooth_h_predictor_64x32_neon)
 INTRA_PRED_TEST(NEON, TX_64X16, aom_dc_predictor_64x16_neon,
                 aom_dc_left_predictor_64x16_neon,
                 aom_dc_top_predictor_64x16_neon,
-                aom_dc_128_predictor_64x16_neon, nullptr, nullptr,
-                aom_paeth_predictor_64x16_neon, aom_smooth_predictor_64x16_neon,
+                aom_dc_128_predictor_64x16_neon, aom_v_predictor_64x16_neon,
+                nullptr, aom_paeth_predictor_64x16_neon,
+                aom_smooth_predictor_64x16_neon,
                 aom_smooth_v_predictor_64x16_neon,
                 aom_smooth_h_predictor_64x16_neon)
 #endif  // HAVE_NEON