Add highbd_dc_top_predictor Neon implementation and tests

Add Neon implementations of highbd_dc_top_predictor for all block sizes.
Also add the corresponding tests and benchmarks.

This implementation is mostly a duplicate of the dc_left macro;
however, it reads from `above` rather than `left`, and the shift values
differ because they are `log2(w)` rather than `log2(h)`.

Change-Id: I4e78c97c1f1b7174ee69b6b1fa8350bcf0604f18
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 3ad8c94..8723d3d 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -381,16 +381,25 @@
   specialize qw/aom_highbd_dc_left_predictor_64x32 neon/;
   specialize qw/aom_highbd_dc_left_predictor_64x64 neon/;
 
-  specialize qw/aom_highbd_dc_top_predictor_4x4 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_4x8 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_8x4 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_8x8 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_8x16 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_16x8 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_16x16 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_16x32 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_32x16 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_32x32 sse2/;
+  specialize qw/aom_highbd_dc_top_predictor_4x4 sse2 neon/;
+  specialize qw/aom_highbd_dc_top_predictor_4x8 sse2 neon/;
+  specialize qw/aom_highbd_dc_top_predictor_4x16 neon/;
+  specialize qw/aom_highbd_dc_top_predictor_8x4 sse2 neon/;
+  specialize qw/aom_highbd_dc_top_predictor_8x8 sse2 neon/;
+  specialize qw/aom_highbd_dc_top_predictor_8x16 sse2 neon/;
+  specialize qw/aom_highbd_dc_top_predictor_8x32 neon/;
+  specialize qw/aom_highbd_dc_top_predictor_16x4 neon/;
+  specialize qw/aom_highbd_dc_top_predictor_16x8 sse2 neon/;
+  specialize qw/aom_highbd_dc_top_predictor_16x16 sse2 neon/;
+  specialize qw/aom_highbd_dc_top_predictor_16x32 sse2 neon/;
+  specialize qw/aom_highbd_dc_top_predictor_16x64 neon/;
+  specialize qw/aom_highbd_dc_top_predictor_32x8 neon/;
+  specialize qw/aom_highbd_dc_top_predictor_32x16 sse2 neon/;
+  specialize qw/aom_highbd_dc_top_predictor_32x32 sse2 neon/;
+  specialize qw/aom_highbd_dc_top_predictor_32x64 neon/;
+  specialize qw/aom_highbd_dc_top_predictor_64x16 neon/;
+  specialize qw/aom_highbd_dc_top_predictor_64x32 neon/;
+  specialize qw/aom_highbd_dc_top_predictor_64x64 neon/;
 
   specialize qw/aom_highbd_paeth_predictor_4x4 neon/;
   specialize qw/aom_highbd_paeth_predictor_4x8 neon/;
diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c
index 799feb7..aa0ba85 100644
--- a/aom_dsp/arm/highbd_intrapred_neon.c
+++ b/aom_dsp/arm/highbd_intrapred_neon.c
@@ -259,6 +259,42 @@
 #undef DC_PREDICTOR_LEFT
 
 // -----------------------------------------------------------------------------
+// DC_TOP: DC prediction from the `above` row only; `shift` is log2(w).
+
+#define DC_PREDICTOR_TOP(w, h, shift, q)                                   \
+  void aom_highbd_dc_top_predictor_##w##x##h##_neon(                       \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,              \
+      const uint16_t *left, int bd) {                                      \
+    (void)bd;                                                              \
+    (void)left;                                                            \
+    const uint32x4_t sum = highbd_dc_load_sum_##w(above);                  \
+    const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift));                     \
+    highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \
+  }
+
+DC_PREDICTOR_TOP(4, 4, 2, )
+DC_PREDICTOR_TOP(4, 8, 2, )
+DC_PREDICTOR_TOP(4, 16, 2, )
+DC_PREDICTOR_TOP(8, 4, 3, q)
+DC_PREDICTOR_TOP(8, 8, 3, q)
+DC_PREDICTOR_TOP(8, 16, 3, q)
+DC_PREDICTOR_TOP(8, 32, 3, q)
+DC_PREDICTOR_TOP(16, 4, 4, q)
+DC_PREDICTOR_TOP(16, 8, 4, q)
+DC_PREDICTOR_TOP(16, 16, 4, q)
+DC_PREDICTOR_TOP(16, 32, 4, q)
+DC_PREDICTOR_TOP(16, 64, 4, q)
+DC_PREDICTOR_TOP(32, 8, 5, q)
+DC_PREDICTOR_TOP(32, 16, 5, q)
+DC_PREDICTOR_TOP(32, 32, 5, q)
+DC_PREDICTOR_TOP(32, 64, 5, q)
+DC_PREDICTOR_TOP(64, 16, 6, q)
+DC_PREDICTOR_TOP(64, 32, 6, q)
+DC_PREDICTOR_TOP(64, 64, 6, q)
+
+#undef DC_PREDICTOR_TOP
+
+// -----------------------------------------------------------------------------
 // V_PRED
 
 #define HIGHBD_V_NXM(W, H)                                    \
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index b8a1512..ecbf47c 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -405,10 +405,11 @@
   highbd_entry(dc, 16, 16, neon, 8),    highbd_entry(dc, 32, 32, neon, 8),
   highbd_entry(dc, 64, 64, neon, 8),
 
-  highbd_intrapred(dc_left, neon, 12),  highbd_intrapred(dc_128, neon, 12),
-  highbd_intrapred(v, neon, 12),        highbd_intrapred(h, neon, 12),
-  highbd_intrapred(paeth, neon, 12),    highbd_intrapred(smooth, neon, 12),
-  highbd_intrapred(smooth_v, neon, 12), highbd_intrapred(smooth_h, neon, 12),
+  highbd_intrapred(dc_top, neon, 12),   highbd_intrapred(dc_left, neon, 12),
+  highbd_intrapred(dc_128, neon, 12),   highbd_intrapred(v, neon, 12),
+  highbd_intrapred(h, neon, 12),        highbd_intrapred(paeth, neon, 12),
+  highbd_intrapred(smooth, neon, 12),   highbd_intrapred(smooth_v, neon, 12),
+  highbd_intrapred(smooth_h, neon, 12),
 };
 
 INSTANTIATE_TEST_SUITE_P(NEON, HighbdIntraPredTest,
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 9dc633b..e7d698b 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -1306,7 +1306,8 @@
 #endif
 #if HAVE_NEON
 HIGHBD_INTRA_PRED_TEST(NEON, TX_4X4, aom_highbd_dc_predictor_4x4_neon,
-                       aom_highbd_dc_left_predictor_4x4_neon, nullptr,
+                       aom_highbd_dc_left_predictor_4x4_neon,
+                       aom_highbd_dc_top_predictor_4x4_neon,
                        aom_highbd_dc_128_predictor_4x4_neon,
                        aom_highbd_v_predictor_4x4_neon,
                        aom_highbd_h_predictor_4x4_neon,
@@ -1314,17 +1315,16 @@
                        aom_highbd_smooth_predictor_4x4_neon,
                        aom_highbd_smooth_v_predictor_4x4_neon,
                        aom_highbd_smooth_h_predictor_4x4_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_4X8, nullptr,
-                       aom_highbd_dc_left_predictor_4x8_neon, nullptr,
-                       aom_highbd_dc_128_predictor_4x8_neon,
-                       aom_highbd_v_predictor_4x8_neon,
-                       aom_highbd_h_predictor_4x8_neon,
-                       aom_highbd_paeth_predictor_4x8_neon,
-                       aom_highbd_smooth_predictor_4x8_neon,
-                       aom_highbd_smooth_v_predictor_4x8_neon,
-                       aom_highbd_smooth_h_predictor_4x8_neon)
+HIGHBD_INTRA_PRED_TEST(
+    NEON, TX_4X8, nullptr, aom_highbd_dc_left_predictor_4x8_neon,
+    aom_highbd_dc_top_predictor_4x8_neon, aom_highbd_dc_128_predictor_4x8_neon,
+    aom_highbd_v_predictor_4x8_neon, aom_highbd_h_predictor_4x8_neon,
+    aom_highbd_paeth_predictor_4x8_neon, aom_highbd_smooth_predictor_4x8_neon,
+    aom_highbd_smooth_v_predictor_4x8_neon,
+    aom_highbd_smooth_h_predictor_4x8_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_4X16, nullptr,
-                       aom_highbd_dc_left_predictor_4x16_neon, nullptr,
+                       aom_highbd_dc_left_predictor_4x16_neon,
+                       aom_highbd_dc_top_predictor_4x16_neon,
                        aom_highbd_dc_128_predictor_4x16_neon,
                        aom_highbd_v_predictor_4x16_neon,
                        aom_highbd_h_predictor_4x16_neon,
@@ -1397,7 +1397,8 @@
 
 #if HAVE_NEON
 HIGHBD_INTRA_PRED_TEST(NEON, TX_8X8, aom_highbd_dc_predictor_8x8_neon,
-                       aom_highbd_dc_left_predictor_8x8_neon, nullptr,
+                       aom_highbd_dc_left_predictor_8x8_neon,
+                       aom_highbd_dc_top_predictor_8x8_neon,
                        aom_highbd_dc_128_predictor_8x8_neon,
                        aom_highbd_v_predictor_8x8_neon,
                        aom_highbd_h_predictor_8x8_neon,
@@ -1405,17 +1406,16 @@
                        aom_highbd_smooth_predictor_8x8_neon,
                        aom_highbd_smooth_v_predictor_8x8_neon,
                        aom_highbd_smooth_h_predictor_8x8_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_8X4, nullptr,
-                       aom_highbd_dc_left_predictor_8x4_neon, nullptr,
-                       aom_highbd_dc_128_predictor_8x4_neon,
-                       aom_highbd_v_predictor_8x4_neon,
-                       aom_highbd_h_predictor_8x4_neon,
-                       aom_highbd_paeth_predictor_8x4_neon,
-                       aom_highbd_smooth_predictor_8x4_neon,
-                       aom_highbd_smooth_v_predictor_8x4_neon,
-                       aom_highbd_smooth_h_predictor_8x4_neon)
+HIGHBD_INTRA_PRED_TEST(
+    NEON, TX_8X4, nullptr, aom_highbd_dc_left_predictor_8x4_neon,
+    aom_highbd_dc_top_predictor_8x4_neon, aom_highbd_dc_128_predictor_8x4_neon,
+    aom_highbd_v_predictor_8x4_neon, aom_highbd_h_predictor_8x4_neon,
+    aom_highbd_paeth_predictor_8x4_neon, aom_highbd_smooth_predictor_8x4_neon,
+    aom_highbd_smooth_v_predictor_8x4_neon,
+    aom_highbd_smooth_h_predictor_8x4_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_8X16, nullptr,
-                       aom_highbd_dc_left_predictor_8x16_neon, nullptr,
+                       aom_highbd_dc_left_predictor_8x16_neon,
+                       aom_highbd_dc_top_predictor_8x16_neon,
                        aom_highbd_dc_128_predictor_8x16_neon,
                        aom_highbd_v_predictor_8x16_neon,
                        aom_highbd_h_predictor_8x16_neon,
@@ -1424,7 +1424,8 @@
                        aom_highbd_smooth_v_predictor_8x16_neon,
                        aom_highbd_smooth_h_predictor_8x16_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_8X32, nullptr,
-                       aom_highbd_dc_left_predictor_8x32_neon, nullptr,
+                       aom_highbd_dc_left_predictor_8x32_neon,
+                       aom_highbd_dc_top_predictor_8x32_neon,
                        aom_highbd_dc_128_predictor_8x32_neon,
                        aom_highbd_v_predictor_8x32_neon,
                        aom_highbd_h_predictor_8x32_neon,
@@ -1515,7 +1516,8 @@
 
 #if HAVE_NEON
 HIGHBD_INTRA_PRED_TEST(NEON, TX_16X16, aom_highbd_dc_predictor_16x16_neon,
-                       aom_highbd_dc_left_predictor_16x16_neon, nullptr,
+                       aom_highbd_dc_left_predictor_16x16_neon,
+                       aom_highbd_dc_top_predictor_16x16_neon,
                        aom_highbd_dc_128_predictor_16x16_neon,
                        aom_highbd_v_predictor_16x16_neon,
                        aom_highbd_h_predictor_16x16_neon,
@@ -1524,7 +1526,8 @@
                        aom_highbd_smooth_v_predictor_16x16_neon,
                        aom_highbd_smooth_h_predictor_16x16_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_16X8, nullptr,
-                       aom_highbd_dc_left_predictor_16x8_neon, nullptr,
+                       aom_highbd_dc_left_predictor_16x8_neon,
+                       aom_highbd_dc_top_predictor_16x8_neon,
                        aom_highbd_dc_128_predictor_16x8_neon,
                        aom_highbd_v_predictor_16x8_neon,
                        aom_highbd_h_predictor_16x8_neon,
@@ -1533,7 +1536,8 @@
                        aom_highbd_smooth_v_predictor_16x8_neon,
                        aom_highbd_smooth_h_predictor_16x8_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_16X32, nullptr,
-                       aom_highbd_dc_left_predictor_16x32_neon, nullptr,
+                       aom_highbd_dc_left_predictor_16x32_neon,
+                       aom_highbd_dc_top_predictor_16x32_neon,
                        aom_highbd_dc_128_predictor_16x32_neon,
                        aom_highbd_v_predictor_16x32_neon,
                        aom_highbd_h_predictor_16x32_neon,
@@ -1542,7 +1546,8 @@
                        aom_highbd_smooth_v_predictor_16x32_neon,
                        aom_highbd_smooth_h_predictor_16x32_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_16X4, nullptr,
-                       aom_highbd_dc_left_predictor_16x4_neon, nullptr,
+                       aom_highbd_dc_left_predictor_16x4_neon,
+                       aom_highbd_dc_top_predictor_16x4_neon,
                        aom_highbd_dc_128_predictor_16x4_neon,
                        aom_highbd_v_predictor_16x4_neon,
                        aom_highbd_h_predictor_16x4_neon,
@@ -1551,7 +1556,8 @@
                        aom_highbd_smooth_v_predictor_16x4_neon,
                        aom_highbd_smooth_h_predictor_16x4_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_16X64, nullptr,
-                       aom_highbd_dc_left_predictor_16x64_neon, nullptr,
+                       aom_highbd_dc_left_predictor_16x64_neon,
+                       aom_highbd_dc_top_predictor_16x64_neon,
                        aom_highbd_dc_128_predictor_16x64_neon,
                        aom_highbd_v_predictor_16x64_neon,
                        aom_highbd_h_predictor_16x64_neon,
@@ -1625,7 +1631,8 @@
 
 #if HAVE_NEON
 HIGHBD_INTRA_PRED_TEST(NEON, TX_32X32, aom_highbd_dc_predictor_32x32_neon,
-                       aom_highbd_dc_left_predictor_32x32_neon, nullptr,
+                       aom_highbd_dc_left_predictor_32x32_neon,
+                       aom_highbd_dc_top_predictor_32x32_neon,
                        aom_highbd_dc_128_predictor_32x32_neon,
                        aom_highbd_v_predictor_32x32_neon,
                        aom_highbd_h_predictor_32x32_neon,
@@ -1634,7 +1641,8 @@
                        aom_highbd_smooth_v_predictor_32x32_neon,
                        aom_highbd_smooth_h_predictor_32x32_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_32X16, nullptr,
-                       aom_highbd_dc_left_predictor_32x16_neon, nullptr,
+                       aom_highbd_dc_left_predictor_32x16_neon,
+                       aom_highbd_dc_top_predictor_32x16_neon,
                        aom_highbd_dc_128_predictor_32x16_neon,
                        aom_highbd_v_predictor_32x16_neon,
                        aom_highbd_h_predictor_32x16_neon,
@@ -1643,7 +1651,8 @@
                        aom_highbd_smooth_v_predictor_32x16_neon,
                        aom_highbd_smooth_h_predictor_32x16_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_32X64, nullptr,
-                       aom_highbd_dc_left_predictor_32x64_neon, nullptr,
+                       aom_highbd_dc_left_predictor_32x64_neon,
+                       aom_highbd_dc_top_predictor_32x64_neon,
                        aom_highbd_dc_128_predictor_32x64_neon,
                        aom_highbd_v_predictor_32x64_neon,
                        aom_highbd_h_predictor_32x64_neon,
@@ -1652,7 +1661,8 @@
                        aom_highbd_smooth_v_predictor_32x64_neon,
                        aom_highbd_smooth_h_predictor_32x64_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_32X8, nullptr,
-                       aom_highbd_dc_left_predictor_32x8_neon, nullptr,
+                       aom_highbd_dc_left_predictor_32x8_neon,
+                       aom_highbd_dc_top_predictor_32x8_neon,
                        aom_highbd_dc_128_predictor_32x8_neon,
                        aom_highbd_v_predictor_32x8_neon,
                        aom_highbd_h_predictor_32x8_neon,
@@ -1689,7 +1699,8 @@
 
 #if HAVE_NEON
 HIGHBD_INTRA_PRED_TEST(NEON, TX_64X64, aom_highbd_dc_predictor_64x64_neon,
-                       aom_highbd_dc_left_predictor_64x64_neon, nullptr,
+                       aom_highbd_dc_left_predictor_64x64_neon,
+                       aom_highbd_dc_top_predictor_64x64_neon,
                        aom_highbd_dc_128_predictor_64x64_neon,
                        aom_highbd_v_predictor_64x64_neon,
                        aom_highbd_h_predictor_64x64_neon,
@@ -1698,7 +1709,8 @@
                        aom_highbd_smooth_v_predictor_64x64_neon,
                        aom_highbd_smooth_h_predictor_64x64_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_64X32, nullptr,
-                       aom_highbd_dc_left_predictor_64x32_neon, nullptr,
+                       aom_highbd_dc_left_predictor_64x32_neon,
+                       aom_highbd_dc_top_predictor_64x32_neon,
                        aom_highbd_dc_128_predictor_64x32_neon,
                        aom_highbd_v_predictor_64x32_neon,
                        aom_highbd_h_predictor_64x32_neon,
@@ -1707,7 +1719,8 @@
                        aom_highbd_smooth_v_predictor_64x32_neon,
                        aom_highbd_smooth_h_predictor_64x32_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_64X16, nullptr,
-                       aom_highbd_dc_left_predictor_64x16_neon, nullptr,
+                       aom_highbd_dc_left_predictor_64x16_neon,
+                       aom_highbd_dc_top_predictor_64x16_neon,
                        aom_highbd_dc_128_predictor_64x16_neon,
                        aom_highbd_v_predictor_64x16_neon,
                        aom_highbd_h_predictor_64x16_neon,