Add Neon implementations for remaining v predictors

We already have Neon implementations of the v predictors for all square
block sizes except 64x64, so add the remaining cases (64x64 and the
rectangular block sizes) and update the tests and speed tests to match.

On Neoverse V1, the new implementations are about 10% faster for the
largest and smallest cases with both Clang 15 and GCC 12, and roughly at
parity for the remainder.

Change-Id: I945f550608e75e978a48b253c2d81e04955c0be5
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 526f678..e3751fb 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -470,14 +470,15 @@
                 aom_smooth_h_predictor_4x4_neon)
 INTRA_PRED_TEST(NEON, TX_4X8, aom_dc_predictor_4x8_neon,
                 aom_dc_left_predictor_4x8_neon, aom_dc_top_predictor_4x8_neon,
-                aom_dc_128_predictor_4x8_neon, nullptr, nullptr,
-                aom_paeth_predictor_4x8_neon, aom_smooth_predictor_4x8_neon,
-                aom_smooth_v_predictor_4x8_neon,
+                aom_dc_128_predictor_4x8_neon, aom_v_predictor_4x8_neon,
+                nullptr, aom_paeth_predictor_4x8_neon,
+                aom_smooth_predictor_4x8_neon, aom_smooth_v_predictor_4x8_neon,
                 aom_smooth_h_predictor_4x8_neon)
 INTRA_PRED_TEST(NEON, TX_4X16, aom_dc_predictor_4x16_neon,
                 aom_dc_left_predictor_4x16_neon, aom_dc_top_predictor_4x16_neon,
-                aom_dc_128_predictor_4x16_neon, nullptr, nullptr,
-                aom_paeth_predictor_4x16_neon, aom_smooth_predictor_4x16_neon,
+                aom_dc_128_predictor_4x16_neon, aom_v_predictor_4x16_neon,
+                nullptr, aom_paeth_predictor_4x16_neon,
+                aom_smooth_predictor_4x16_neon,
                 aom_smooth_v_predictor_4x16_neon,
                 aom_smooth_h_predictor_4x16_neon)
 #endif  // HAVE_NEON
@@ -560,20 +561,22 @@
                 aom_smooth_h_predictor_8x8_neon)
 INTRA_PRED_TEST(NEON, TX_8X4, aom_dc_predictor_8x4_neon,
                 aom_dc_left_predictor_8x4_neon, aom_dc_top_predictor_8x4_neon,
-                aom_dc_128_predictor_8x4_neon, nullptr, nullptr,
-                aom_paeth_predictor_8x4_neon, aom_smooth_predictor_8x4_neon,
-                aom_smooth_v_predictor_8x4_neon,
+                aom_dc_128_predictor_8x4_neon, aom_v_predictor_8x4_neon,
+                nullptr, aom_paeth_predictor_8x4_neon,
+                aom_smooth_predictor_8x4_neon, aom_smooth_v_predictor_8x4_neon,
                 aom_smooth_h_predictor_8x4_neon)
 INTRA_PRED_TEST(NEON, TX_8X16, aom_dc_predictor_8x16_neon,
                 aom_dc_left_predictor_8x16_neon, aom_dc_top_predictor_8x16_neon,
-                aom_dc_128_predictor_8x16_neon, nullptr, nullptr,
-                aom_paeth_predictor_8x16_neon, aom_smooth_predictor_8x16_neon,
+                aom_dc_128_predictor_8x16_neon, aom_v_predictor_8x16_neon,
+                nullptr, aom_paeth_predictor_8x16_neon,
+                aom_smooth_predictor_8x16_neon,
                 aom_smooth_v_predictor_8x16_neon,
                 aom_smooth_h_predictor_8x16_neon)
 INTRA_PRED_TEST(NEON, TX_8X32, aom_dc_predictor_8x32_neon,
                 aom_dc_left_predictor_8x32_neon, aom_dc_top_predictor_8x32_neon,
-                aom_dc_128_predictor_8x32_neon, nullptr, nullptr,
-                aom_paeth_predictor_8x32_neon, aom_smooth_predictor_8x32_neon,
+                aom_dc_128_predictor_8x32_neon, aom_v_predictor_8x32_neon,
+                nullptr, aom_paeth_predictor_8x32_neon,
+                aom_smooth_predictor_8x32_neon,
                 aom_smooth_v_predictor_8x32_neon,
                 aom_smooth_h_predictor_8x32_neon)
 #endif  // HAVE_NEON
@@ -692,28 +695,32 @@
                 aom_smooth_h_predictor_16x16_neon)
 INTRA_PRED_TEST(NEON, TX_16X8, aom_dc_predictor_16x8_neon,
                 aom_dc_left_predictor_16x8_neon, aom_dc_top_predictor_16x8_neon,
-                aom_dc_128_predictor_16x8_neon, nullptr, nullptr,
-                aom_paeth_predictor_16x8_neon, aom_smooth_predictor_16x8_neon,
+                aom_dc_128_predictor_16x8_neon, aom_v_predictor_16x8_neon,
+                nullptr, aom_paeth_predictor_16x8_neon,
+                aom_smooth_predictor_16x8_neon,
                 aom_smooth_v_predictor_16x8_neon,
                 aom_smooth_h_predictor_16x8_neon)
 INTRA_PRED_TEST(NEON, TX_16X32, aom_dc_predictor_16x32_neon,
                 aom_dc_left_predictor_16x32_neon,
                 aom_dc_top_predictor_16x32_neon,
-                aom_dc_128_predictor_16x32_neon, nullptr, nullptr,
-                aom_paeth_predictor_16x32_neon, aom_smooth_predictor_16x32_neon,
+                aom_dc_128_predictor_16x32_neon, aom_v_predictor_16x32_neon,
+                nullptr, aom_paeth_predictor_16x32_neon,
+                aom_smooth_predictor_16x32_neon,
                 aom_smooth_v_predictor_16x32_neon,
                 aom_smooth_h_predictor_16x32_neon)
 INTRA_PRED_TEST(NEON, TX_16X4, aom_dc_predictor_16x4_neon,
                 aom_dc_left_predictor_16x4_neon, aom_dc_top_predictor_16x4_neon,
-                aom_dc_128_predictor_16x4_neon, nullptr, nullptr,
-                aom_paeth_predictor_16x4_neon, aom_smooth_predictor_16x4_neon,
+                aom_dc_128_predictor_16x4_neon, aom_v_predictor_16x4_neon,
+                nullptr, aom_paeth_predictor_16x4_neon,
+                aom_smooth_predictor_16x4_neon,
                 aom_smooth_v_predictor_16x4_neon,
                 aom_smooth_h_predictor_16x4_neon)
 INTRA_PRED_TEST(NEON, TX_16X64, aom_dc_predictor_16x64_neon,
                 aom_dc_left_predictor_16x64_neon,
                 aom_dc_top_predictor_16x64_neon,
-                aom_dc_128_predictor_16x64_neon, nullptr, nullptr,
-                aom_paeth_predictor_16x64_neon, aom_smooth_predictor_16x64_neon,
+                aom_dc_128_predictor_16x64_neon, aom_v_predictor_16x64_neon,
+                nullptr, aom_paeth_predictor_16x64_neon,
+                aom_smooth_predictor_16x64_neon,
                 aom_smooth_v_predictor_16x64_neon,
                 aom_smooth_h_predictor_16x64_neon)
 #endif  // HAVE_NEON
@@ -824,21 +831,24 @@
 INTRA_PRED_TEST(NEON, TX_32X16, aom_dc_predictor_32x16_neon,
                 aom_dc_left_predictor_32x16_neon,
                 aom_dc_top_predictor_32x16_neon,
-                aom_dc_128_predictor_32x16_neon, nullptr, nullptr,
-                aom_paeth_predictor_32x16_neon, aom_smooth_predictor_32x16_neon,
+                aom_dc_128_predictor_32x16_neon, aom_v_predictor_32x16_neon,
+                nullptr, aom_paeth_predictor_32x16_neon,
+                aom_smooth_predictor_32x16_neon,
                 aom_smooth_v_predictor_32x16_neon,
                 aom_smooth_h_predictor_32x16_neon)
 INTRA_PRED_TEST(NEON, TX_32X64, aom_dc_predictor_32x64_neon,
                 aom_dc_left_predictor_32x64_neon,
                 aom_dc_top_predictor_32x64_neon,
-                aom_dc_128_predictor_32x64_neon, nullptr, nullptr,
-                aom_paeth_predictor_32x64_neon, aom_smooth_predictor_32x64_neon,
+                aom_dc_128_predictor_32x64_neon, aom_v_predictor_32x64_neon,
+                nullptr, aom_paeth_predictor_32x64_neon,
+                aom_smooth_predictor_32x64_neon,
                 aom_smooth_v_predictor_32x64_neon,
                 aom_smooth_h_predictor_32x64_neon)
 INTRA_PRED_TEST(NEON, TX_32X8, aom_dc_predictor_32x8_neon,
                 aom_dc_left_predictor_32x8_neon, aom_dc_top_predictor_32x8_neon,
-                aom_dc_128_predictor_32x8_neon, nullptr, nullptr,
-                aom_paeth_predictor_32x8_neon, aom_smooth_predictor_32x8_neon,
+                aom_dc_128_predictor_32x8_neon, aom_v_predictor_32x8_neon,
+                nullptr, aom_paeth_predictor_32x8_neon,
+                aom_smooth_predictor_32x8_neon,
                 aom_smooth_v_predictor_32x8_neon,
                 aom_smooth_h_predictor_32x8_neon)
 #endif  // HAVE_NEON
@@ -926,22 +936,25 @@
 INTRA_PRED_TEST(NEON, TX_64X64, aom_dc_predictor_64x64_neon,
                 aom_dc_left_predictor_64x64_neon,
                 aom_dc_top_predictor_64x64_neon,
-                aom_dc_128_predictor_64x64_neon, nullptr, nullptr,
-                aom_paeth_predictor_64x64_neon, aom_smooth_predictor_64x64_neon,
+                aom_dc_128_predictor_64x64_neon, aom_v_predictor_64x64_neon,
+                nullptr, aom_paeth_predictor_64x64_neon,
+                aom_smooth_predictor_64x64_neon,
                 aom_smooth_v_predictor_64x64_neon,
                 aom_smooth_h_predictor_64x64_neon)
 INTRA_PRED_TEST(NEON, TX_64X32, aom_dc_predictor_64x32_neon,
                 aom_dc_left_predictor_64x32_neon,
                 aom_dc_top_predictor_64x32_neon,
-                aom_dc_128_predictor_64x32_neon, nullptr, nullptr,
-                aom_paeth_predictor_64x32_neon, aom_smooth_predictor_64x32_neon,
+                aom_dc_128_predictor_64x32_neon, aom_v_predictor_64x32_neon,
+                nullptr, aom_paeth_predictor_64x32_neon,
+                aom_smooth_predictor_64x32_neon,
                 aom_smooth_v_predictor_64x32_neon,
                 aom_smooth_h_predictor_64x32_neon)
 INTRA_PRED_TEST(NEON, TX_64X16, aom_dc_predictor_64x16_neon,
                 aom_dc_left_predictor_64x16_neon,
                 aom_dc_top_predictor_64x16_neon,
-                aom_dc_128_predictor_64x16_neon, nullptr, nullptr,
-                aom_paeth_predictor_64x16_neon, aom_smooth_predictor_64x16_neon,
+                aom_dc_128_predictor_64x16_neon, aom_v_predictor_64x16_neon,
+                nullptr, aom_paeth_predictor_64x16_neon,
+                aom_smooth_predictor_64x16_neon,
                 aom_smooth_v_predictor_64x16_neon,
                 aom_smooth_h_predictor_64x16_neon)
 #endif  // HAVE_NEON