Add highbd_dc_128_predictor Neon implementation and tests

Add Neon implementations of highbd_dc_128_predictor for all block sizes.
Also add the corresponding tests and benchmarks.

We also take this opportunity to clean up the ordering of the specialize
calls in aom_dsp_rtcd_defs.pl, grouping them by predictor type instead of
interleaving them by block size, and register the new dc_128 Neon functions.

Change-Id: I97de6e91a76c9cdda740563975424d514623eade
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index e05c276..16bb4a9 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -341,36 +341,47 @@
   specialize qw/aom_highbd_h_predictor_64x32 neon/;
   specialize qw/aom_highbd_h_predictor_64x64 neon/;
 
+  specialize qw/aom_highbd_dc_128_predictor_4x4 sse2 neon/;
+  specialize qw/aom_highbd_dc_128_predictor_4x8 sse2 neon/;
+  specialize qw/aom_highbd_dc_128_predictor_4x16 neon/;
+  specialize qw/aom_highbd_dc_128_predictor_8x4 sse2 neon/;
+  specialize qw/aom_highbd_dc_128_predictor_8x8 sse2 neon/;
+  specialize qw/aom_highbd_dc_128_predictor_8x16 sse2 neon/;
+  specialize qw/aom_highbd_dc_128_predictor_8x32 neon/;
+  specialize qw/aom_highbd_dc_128_predictor_16x4 neon/;
+  specialize qw/aom_highbd_dc_128_predictor_16x8 sse2 neon/;
+  specialize qw/aom_highbd_dc_128_predictor_16x16 sse2 neon/;
+  specialize qw/aom_highbd_dc_128_predictor_16x32 sse2 neon/;
+  specialize qw/aom_highbd_dc_128_predictor_16x64 neon/;
+  specialize qw/aom_highbd_dc_128_predictor_32x8 neon/;
+  specialize qw/aom_highbd_dc_128_predictor_32x16 sse2 neon/;
+  specialize qw/aom_highbd_dc_128_predictor_32x32 sse2 neon/;
+  specialize qw/aom_highbd_dc_128_predictor_32x64 neon/;
+  specialize qw/aom_highbd_dc_128_predictor_64x16 neon/;
+  specialize qw/aom_highbd_dc_128_predictor_64x32 neon/;
+  specialize qw/aom_highbd_dc_128_predictor_64x64 neon/;
+
   specialize qw/aom_highbd_dc_left_predictor_4x4 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_4x4 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_4x4 sse2/;
   specialize qw/aom_highbd_dc_left_predictor_4x8 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_4x8 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_4x8 sse2/;
   specialize qw/aom_highbd_dc_left_predictor_8x4 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_8x4 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_8x4 sse2/;
   specialize qw/aom_highbd_dc_left_predictor_8x8 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_8x8 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_8x8 sse2/;
   specialize qw/aom_highbd_dc_left_predictor_8x16 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_8x16 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_8x16 sse2/;
   specialize qw/aom_highbd_dc_left_predictor_16x8 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_16x8 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_16x8 sse2/;
   specialize qw/aom_highbd_dc_left_predictor_16x16 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_16x16 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_16x16 sse2/;
   specialize qw/aom_highbd_dc_left_predictor_16x32 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_16x32 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_16x32 sse2/;
   specialize qw/aom_highbd_dc_left_predictor_32x16 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_32x16 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_32x16 sse2/;
   specialize qw/aom_highbd_dc_left_predictor_32x32 sse2/;
+
+  specialize qw/aom_highbd_dc_top_predictor_4x4 sse2/;
+  specialize qw/aom_highbd_dc_top_predictor_4x8 sse2/;
+  specialize qw/aom_highbd_dc_top_predictor_8x4 sse2/;
+  specialize qw/aom_highbd_dc_top_predictor_8x8 sse2/;
+  specialize qw/aom_highbd_dc_top_predictor_8x16 sse2/;
+  specialize qw/aom_highbd_dc_top_predictor_16x8 sse2/;
+  specialize qw/aom_highbd_dc_top_predictor_16x16 sse2/;
+  specialize qw/aom_highbd_dc_top_predictor_16x32 sse2/;
+  specialize qw/aom_highbd_dc_top_predictor_32x16 sse2/;
   specialize qw/aom_highbd_dc_top_predictor_32x32 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_32x32 sse2/;
 
   specialize qw/aom_highbd_paeth_predictor_4x4 neon/;
   specialize qw/aom_highbd_paeth_predictor_4x8 neon/;
diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c
index 8363399..af212cd 100644
--- a/aom_dsp/arm/highbd_intrapred_neon.c
+++ b/aom_dsp/arm/highbd_intrapred_neon.c
@@ -82,6 +82,88 @@
 #undef INTRA_PRED_SQUARE
 
 // -----------------------------------------------------------------------------
+// DC_128
+
+// Fills a 4-wide block: stores the four lanes of dc at the start of each of
+// the h rows, with consecutive rows stride elements apart.
+static INLINE void highbd_dc_store_4xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                       uint16x4_t dc) {
+  for (int row = 0; row < h; ++row) vst1_u16(&dst[row * stride], dc);
+}
+
+// Fills an 8-wide block: stores the eight lanes of dc at the start of each of
+// the h rows, with consecutive rows stride elements apart.
+static INLINE void highbd_dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                       uint16x8_t dc) {
+  for (int row = 0; row < h; ++row) vst1q_u16(&dst[row * stride], dc);
+}
+
+// Fills a 16-wide block: each of the h rows receives dc twice (2 x 8 lanes).
+static INLINE void highbd_dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                        uint16x8_t dc) {
+  for (int row = 0; row < h; ++row) {
+    for (int x = 0; x < 16; x += 8) vst1q_u16(dst + row * stride + x, dc);
+  }
+}
+
+// Fills a 32-wide block: each of the h rows receives dc four times
+// (4 x 8 lanes). Consecutive rows are stride elements apart.
+static INLINE void highbd_dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                        uint16x8_t dc) {
+  for (int row = 0; row < h; ++row) {
+    uint16_t *const out = dst + row * stride;
+    for (int x = 0; x < 32; x += 8) vst1q_u16(out + x, dc);
+  }
+}
+
+// Fills a 64-wide block: each of the h rows receives dc eight times
+// (8 x 8 lanes), covering element offsets 0 through 63. Consecutive rows are
+// stride elements apart.
+static INLINE void highbd_dc_store_64xh(uint16_t *dst, ptrdiff_t stride, int h,
+                                        uint16x8_t dc) {
+  for (int row = 0; row < h; ++row) {
+    uint16_t *const out = dst + row * stride;
+    // Eight 8-lane stores per row, in ascending address order.
+    for (int x = 0; x < 64; x += 8) {
+      vst1q_u16(out + x, dc);
+    }
+  }
+}
+
+#define HIGHBD_DC_PREDICTOR_128(w, h, q)                        \
+  void aom_highbd_dc_128_predictor_##w##x##h##_neon(            \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
+      const uint16_t *left, int bd) {                           \
+    /* Fill value depends only on bd; edges are not read. */    \
+    (void)above;                                                \
+    (void)left;                                                 \
+    highbd_dc_store_##w##xh(dst, stride, (h),                   \
+                            vdup##q##_n_u16(0x80 << (bd - 8))); \
+  }
+
+// q selects the dup intrinsic: empty -> vdup_n_u16 (w == 4), q -> vdupq_n_u16.
+HIGHBD_DC_PREDICTOR_128(4, 4, )
+HIGHBD_DC_PREDICTOR_128(4, 8, )
+HIGHBD_DC_PREDICTOR_128(4, 16, )
+HIGHBD_DC_PREDICTOR_128(8, 4, q)
+HIGHBD_DC_PREDICTOR_128(8, 8, q)
+HIGHBD_DC_PREDICTOR_128(8, 16, q)
+HIGHBD_DC_PREDICTOR_128(8, 32, q)
+HIGHBD_DC_PREDICTOR_128(16, 4, q)
+HIGHBD_DC_PREDICTOR_128(16, 8, q)
+HIGHBD_DC_PREDICTOR_128(16, 16, q)
+HIGHBD_DC_PREDICTOR_128(16, 32, q)
+HIGHBD_DC_PREDICTOR_128(16, 64, q)
+HIGHBD_DC_PREDICTOR_128(32, 8, q)
+HIGHBD_DC_PREDICTOR_128(32, 16, q)
+HIGHBD_DC_PREDICTOR_128(32, 32, q)
+HIGHBD_DC_PREDICTOR_128(32, 64, q)
+HIGHBD_DC_PREDICTOR_128(64, 16, q)
+HIGHBD_DC_PREDICTOR_128(64, 32, q)
+HIGHBD_DC_PREDICTOR_128(64, 64, q)
+#undef HIGHBD_DC_PREDICTOR_128
+
+// -----------------------------------------------------------------------------
 // V_PRED
 
 #define HIGHBD_V_NXM(W, H)                                    \
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index 8839450..4ae0798 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -405,9 +405,10 @@
   highbd_entry(dc, 16, 16, neon, 8),    highbd_entry(dc, 32, 32, neon, 8),
   highbd_entry(dc, 64, 64, neon, 8),
 
-  highbd_intrapred(v, neon, 12),        highbd_intrapred(h, neon, 12),
-  highbd_intrapred(paeth, neon, 12),    highbd_intrapred(smooth, neon, 12),
-  highbd_intrapred(smooth_v, neon, 12), highbd_intrapred(smooth_h, neon, 12),
+  highbd_intrapred(dc_128, neon, 12),   highbd_intrapred(v, neon, 12),
+  highbd_intrapred(h, neon, 12),        highbd_intrapred(paeth, neon, 12),
+  highbd_intrapred(smooth, neon, 12),   highbd_intrapred(smooth_v, neon, 12),
+  highbd_intrapred(smooth_h, neon, 12),
 };
 
 INSTANTIATE_TEST_SUITE_P(NEON, HighbdIntraPredTest,
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 05462a2..8812bed 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -1306,20 +1306,23 @@
 #endif
 #if HAVE_NEON
 HIGHBD_INTRA_PRED_TEST(NEON, TX_4X4, aom_highbd_dc_predictor_4x4_neon, nullptr,
-                       nullptr, nullptr, aom_highbd_v_predictor_4x4_neon,
+                       nullptr, aom_highbd_dc_128_predictor_4x4_neon,
+                       aom_highbd_v_predictor_4x4_neon,
                        aom_highbd_h_predictor_4x4_neon,
                        aom_highbd_paeth_predictor_4x4_neon,
                        aom_highbd_smooth_predictor_4x4_neon,
                        aom_highbd_smooth_v_predictor_4x4_neon,
                        aom_highbd_smooth_h_predictor_4x4_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_4X8, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_4X8, nullptr, nullptr, nullptr,
+                       aom_highbd_dc_128_predictor_4x8_neon,
                        aom_highbd_v_predictor_4x8_neon,
                        aom_highbd_h_predictor_4x8_neon,
                        aom_highbd_paeth_predictor_4x8_neon,
                        aom_highbd_smooth_predictor_4x8_neon,
                        aom_highbd_smooth_v_predictor_4x8_neon,
                        aom_highbd_smooth_h_predictor_4x8_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_4X16, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_4X16, nullptr, nullptr, nullptr,
+                       aom_highbd_dc_128_predictor_4x16_neon,
                        aom_highbd_v_predictor_4x16_neon,
                        aom_highbd_h_predictor_4x16_neon,
                        aom_highbd_paeth_predictor_4x16_neon,
@@ -1391,27 +1394,31 @@
 
 #if HAVE_NEON
 HIGHBD_INTRA_PRED_TEST(NEON, TX_8X8, aom_highbd_dc_predictor_8x8_neon, nullptr,
-                       nullptr, nullptr, aom_highbd_v_predictor_8x8_neon,
+                       nullptr, aom_highbd_dc_128_predictor_8x8_neon,
+                       aom_highbd_v_predictor_8x8_neon,
                        aom_highbd_h_predictor_8x8_neon,
                        aom_highbd_paeth_predictor_8x8_neon,
                        aom_highbd_smooth_predictor_8x8_neon,
                        aom_highbd_smooth_v_predictor_8x8_neon,
                        aom_highbd_smooth_h_predictor_8x8_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_8X4, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_8X4, nullptr, nullptr, nullptr,
+                       aom_highbd_dc_128_predictor_8x4_neon,
                        aom_highbd_v_predictor_8x4_neon,
                        aom_highbd_h_predictor_8x4_neon,
                        aom_highbd_paeth_predictor_8x4_neon,
                        aom_highbd_smooth_predictor_8x4_neon,
                        aom_highbd_smooth_v_predictor_8x4_neon,
                        aom_highbd_smooth_h_predictor_8x4_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_8X16, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_8X16, nullptr, nullptr, nullptr,
+                       aom_highbd_dc_128_predictor_8x16_neon,
                        aom_highbd_v_predictor_8x16_neon,
                        aom_highbd_h_predictor_8x16_neon,
                        aom_highbd_paeth_predictor_8x16_neon,
                        aom_highbd_smooth_predictor_8x16_neon,
                        aom_highbd_smooth_v_predictor_8x16_neon,
                        aom_highbd_smooth_h_predictor_8x16_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_8X32, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_8X32, nullptr, nullptr, nullptr,
+                       aom_highbd_dc_128_predictor_8x32_neon,
                        aom_highbd_v_predictor_8x32_neon,
                        aom_highbd_h_predictor_8x32_neon,
                        aom_highbd_paeth_predictor_8x32_neon,
@@ -1501,35 +1508,39 @@
 
 #if HAVE_NEON
 HIGHBD_INTRA_PRED_TEST(NEON, TX_16X16, aom_highbd_dc_predictor_16x16_neon,
-                       nullptr, nullptr, nullptr,
+                       nullptr, nullptr, aom_highbd_dc_128_predictor_16x16_neon,
                        aom_highbd_v_predictor_16x16_neon,
                        aom_highbd_h_predictor_16x16_neon,
                        aom_highbd_paeth_predictor_16x16_neon,
                        aom_highbd_smooth_predictor_16x16_neon,
                        aom_highbd_smooth_v_predictor_16x16_neon,
                        aom_highbd_smooth_h_predictor_16x16_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_16X8, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X8, nullptr, nullptr, nullptr,
+                       aom_highbd_dc_128_predictor_16x8_neon,
                        aom_highbd_v_predictor_16x8_neon,
                        aom_highbd_h_predictor_16x8_neon,
                        aom_highbd_paeth_predictor_16x8_neon,
                        aom_highbd_smooth_predictor_16x8_neon,
                        aom_highbd_smooth_v_predictor_16x8_neon,
                        aom_highbd_smooth_h_predictor_16x8_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_16X32, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X32, nullptr, nullptr, nullptr,
+                       aom_highbd_dc_128_predictor_16x32_neon,
                        aom_highbd_v_predictor_16x32_neon,
                        aom_highbd_h_predictor_16x32_neon,
                        aom_highbd_paeth_predictor_16x32_neon,
                        aom_highbd_smooth_predictor_16x32_neon,
                        aom_highbd_smooth_v_predictor_16x32_neon,
                        aom_highbd_smooth_h_predictor_16x32_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_16X4, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X4, nullptr, nullptr, nullptr,
+                       aom_highbd_dc_128_predictor_16x4_neon,
                        aom_highbd_v_predictor_16x4_neon,
                        aom_highbd_h_predictor_16x4_neon,
                        aom_highbd_paeth_predictor_16x4_neon,
                        aom_highbd_smooth_predictor_16x4_neon,
                        aom_highbd_smooth_v_predictor_16x4_neon,
                        aom_highbd_smooth_h_predictor_16x4_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_16X64, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X64, nullptr, nullptr, nullptr,
+                       aom_highbd_dc_128_predictor_16x64_neon,
                        aom_highbd_v_predictor_16x64_neon,
                        aom_highbd_h_predictor_16x64_neon,
                        aom_highbd_paeth_predictor_16x64_neon,
@@ -1602,28 +1613,31 @@
 
 #if HAVE_NEON
 HIGHBD_INTRA_PRED_TEST(NEON, TX_32X32, aom_highbd_dc_predictor_32x32_neon,
-                       nullptr, nullptr, nullptr,
+                       nullptr, nullptr, aom_highbd_dc_128_predictor_32x32_neon,
                        aom_highbd_v_predictor_32x32_neon,
                        aom_highbd_h_predictor_32x32_neon,
                        aom_highbd_paeth_predictor_32x32_neon,
                        aom_highbd_smooth_predictor_32x32_neon,
                        aom_highbd_smooth_v_predictor_32x32_neon,
                        aom_highbd_smooth_h_predictor_32x32_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_32X16, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_32X16, nullptr, nullptr, nullptr,
+                       aom_highbd_dc_128_predictor_32x16_neon,
                        aom_highbd_v_predictor_32x16_neon,
                        aom_highbd_h_predictor_32x16_neon,
                        aom_highbd_paeth_predictor_32x16_neon,
                        aom_highbd_smooth_predictor_32x16_neon,
                        aom_highbd_smooth_v_predictor_32x16_neon,
                        aom_highbd_smooth_h_predictor_32x16_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_32X64, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_32X64, nullptr, nullptr, nullptr,
+                       aom_highbd_dc_128_predictor_32x64_neon,
                        aom_highbd_v_predictor_32x64_neon,
                        aom_highbd_h_predictor_32x64_neon,
                        aom_highbd_paeth_predictor_32x64_neon,
                        aom_highbd_smooth_predictor_32x64_neon,
                        aom_highbd_smooth_v_predictor_32x64_neon,
                        aom_highbd_smooth_h_predictor_32x64_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_32X8, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_32X8, nullptr, nullptr, nullptr,
+                       aom_highbd_dc_128_predictor_32x8_neon,
                        aom_highbd_v_predictor_32x8_neon,
                        aom_highbd_h_predictor_32x8_neon,
                        aom_highbd_paeth_predictor_32x8_neon,
@@ -1659,21 +1673,23 @@
 
 #if HAVE_NEON
 HIGHBD_INTRA_PRED_TEST(NEON, TX_64X64, aom_highbd_dc_predictor_64x64_neon,
-                       nullptr, nullptr, nullptr,
+                       nullptr, nullptr, aom_highbd_dc_128_predictor_64x64_neon,
                        aom_highbd_v_predictor_64x64_neon,
                        aom_highbd_h_predictor_64x64_neon,
                        aom_highbd_paeth_predictor_64x64_neon,
                        aom_highbd_smooth_predictor_64x64_neon,
                        aom_highbd_smooth_v_predictor_64x64_neon,
                        aom_highbd_smooth_h_predictor_64x64_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_64X32, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_64X32, nullptr, nullptr, nullptr,
+                       aom_highbd_dc_128_predictor_64x32_neon,
                        aom_highbd_v_predictor_64x32_neon,
                        aom_highbd_h_predictor_64x32_neon,
                        aom_highbd_paeth_predictor_64x32_neon,
                        aom_highbd_smooth_predictor_64x32_neon,
                        aom_highbd_smooth_v_predictor_64x32_neon,
                        aom_highbd_smooth_h_predictor_64x32_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_64X16, nullptr, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_64X16, nullptr, nullptr, nullptr,
+                       aom_highbd_dc_128_predictor_64x16_neon,
                        aom_highbd_v_predictor_64x16_neon,
                        aom_highbd_h_predictor_64x16_neon,
                        aom_highbd_paeth_predictor_64x16_neon,