Add highbd_dc_left_predictor Neon implementation and tests

Add Neon implementations of highbd_dc_left_predictor for all block
sizes. Also add the corresponding tests and benchmarks.

Change-Id: I83f8229acdda2a17f17edb7d4f422fbae6b2ddd1
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 604bb5a..3ad8c94 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -361,16 +361,25 @@
   specialize qw/aom_highbd_dc_128_predictor_64x32 neon/;
   specialize qw/aom_highbd_dc_128_predictor_64x64 neon/;
 
-  specialize qw/aom_highbd_dc_left_predictor_4x4 sse2/;
-  specialize qw/aom_highbd_dc_left_predictor_4x8 sse2/;
-  specialize qw/aom_highbd_dc_left_predictor_8x4 sse2/;
-  specialize qw/aom_highbd_dc_left_predictor_8x8 sse2/;
-  specialize qw/aom_highbd_dc_left_predictor_8x16 sse2/;
-  specialize qw/aom_highbd_dc_left_predictor_16x8 sse2/;
-  specialize qw/aom_highbd_dc_left_predictor_16x16 sse2/;
-  specialize qw/aom_highbd_dc_left_predictor_16x32 sse2/;
-  specialize qw/aom_highbd_dc_left_predictor_32x16 sse2/;
-  specialize qw/aom_highbd_dc_left_predictor_32x32 sse2/;
+  specialize qw/aom_highbd_dc_left_predictor_4x4 sse2 neon/;
+  specialize qw/aom_highbd_dc_left_predictor_4x8 sse2 neon/;
+  specialize qw/aom_highbd_dc_left_predictor_4x16 neon/;
+  specialize qw/aom_highbd_dc_left_predictor_8x4 sse2 neon/;
+  specialize qw/aom_highbd_dc_left_predictor_8x8 sse2 neon/;
+  specialize qw/aom_highbd_dc_left_predictor_8x16 sse2 neon/;
+  specialize qw/aom_highbd_dc_left_predictor_8x32 neon/;
+  specialize qw/aom_highbd_dc_left_predictor_16x4 neon/;
+  specialize qw/aom_highbd_dc_left_predictor_16x8 sse2 neon/;
+  specialize qw/aom_highbd_dc_left_predictor_16x16 sse2 neon/;
+  specialize qw/aom_highbd_dc_left_predictor_16x32 sse2 neon/;
+  specialize qw/aom_highbd_dc_left_predictor_16x64 neon/;
+  specialize qw/aom_highbd_dc_left_predictor_32x8 neon/;
+  specialize qw/aom_highbd_dc_left_predictor_32x16 sse2 neon/;
+  specialize qw/aom_highbd_dc_left_predictor_32x32 sse2 neon/;
+  specialize qw/aom_highbd_dc_left_predictor_32x64 neon/;
+  specialize qw/aom_highbd_dc_left_predictor_64x16 neon/;
+  specialize qw/aom_highbd_dc_left_predictor_64x32 neon/;
+  specialize qw/aom_highbd_dc_left_predictor_64x64 neon/;
 
   specialize qw/aom_highbd_dc_top_predictor_4x4 sse2/;
   specialize qw/aom_highbd_dc_top_predictor_4x8 sse2/;
diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c
index af212cd..799feb7 100644
--- a/aom_dsp/arm/highbd_intrapred_neon.c
+++ b/aom_dsp/arm/highbd_intrapred_neon.c
@@ -164,6 +164,101 @@
 #undef HIGHBD_DC_PREDICTOR_128
 
 // -----------------------------------------------------------------------------
+// DC_LEFT
+
+// Sum the eight u16 lanes of `a`; every u32 lane of the result holds the total.
+static INLINE uint32x4_t horizontal_add_and_broadcast_long_u16x8(uint16x8_t a) {
+  const uint32x4_t pairs = vpaddlq_u16(a);  // four widened pairwise sums
+#if AOM_ARCH_AARCH64
+  const uint32x4_t quads = vpaddq_u32(pairs, pairs);
+  return vpaddq_u32(quads, quads);
+#else
+  const uint32x2_t half = vadd_u32(vget_low_u32(pairs), vget_high_u32(pairs));
+  return vdupq_lane_u32(vpadd_u32(half, half), 0);
+#endif
+}
+
+// Sum of left[0..3] as u32 in lanes 0 and 1; lanes 2 and 3 are zero.
+static INLINE uint32x4_t highbd_dc_load_sum_4(const uint16_t *left) {
+  const uint16x4_t px = vld1_u16(left);  // up to 12 bits per lane
+  return vcombine_u32(vpaddl_u16(vpadd_u16(px, px)), vdup_n_u32(0));
+}
+
+static INLINE uint32x4_t highbd_dc_load_sum_8(const uint16_t *left) {
+  return horizontal_add_and_broadcast_long_u16x8(vld1q_u16(left));  // 8 pixels
+}
+
+// Sum of left[0..15], replicated in all four u32 lanes.
+static INLINE uint32x4_t highbd_dc_load_sum_16(const uint16_t *left) {
+  const uint16x8_t lo = vld1q_u16(left + 0);  // up to 12 bits per lane, so
+  const uint16x8_t hi = vld1q_u16(left + 8);  // lo + hi needs at most 13 bits
+  return horizontal_add_and_broadcast_long_u16x8(vaddq_u16(lo, hi));
+}
+
+// Sum of left[0..31], replicated in all four u32 lanes. Assumes pixels are at
+// most 12 bits wide, so the u16 partial sums below cannot wrap.
+static INLINE uint32x4_t highbd_dc_load_sum_32(const uint16_t *left) {
+  // Sums of adjacent vectors: up to 13 bits per lane.
+  const uint16x8_t s01 = vaddq_u16(vld1q_u16(left + 0), vld1q_u16(left + 8));
+  const uint16x8_t s23 = vaddq_u16(vld1q_u16(left + 16), vld1q_u16(left + 24));
+  // Up to 14 bits per lane.
+  const uint16x8_t total = vaddq_u16(s01, s23);
+  return horizontal_add_and_broadcast_long_u16x8(total);
+}
+
+// Sum of left[0..63], replicated in all four u32 lanes.
+static INLINE uint32x4_t highbd_dc_load_sum_64(const uint16_t *left) {
+  // Assumes pixels are at most 12 bits wide: each level of u16 additions below
+  // adds one bit, so the partial sums peak at 15 bits and cannot wrap.
+
+  // Level 1: sums of adjacent vectors, up to 13 bits per lane.
+  const uint16x8_t s01 = vaddq_u16(vld1q_u16(left + 0), vld1q_u16(left + 8));
+  const uint16x8_t s23 = vaddq_u16(vld1q_u16(left + 16), vld1q_u16(left + 24));
+  const uint16x8_t s45 = vaddq_u16(vld1q_u16(left + 32), vld1q_u16(left + 40));
+  const uint16x8_t s67 = vaddq_u16(vld1q_u16(left + 48), vld1q_u16(left + 56));
+
+  // Level 2: up to 14 bits per lane.
+  const uint16x8_t s0123 = vaddq_u16(s01, s23);
+  const uint16x8_t s4567 = vaddq_u16(s45, s67);
+
+  // Level 3: up to 15 bits per lane, then widen and reduce across lanes.
+  return horizontal_add_and_broadcast_long_u16x8(vaddq_u16(s0123, s4567));
+}
+
+// dc = rounded (sum of left[0..h-1]) >> shift; callers pass shift = log2(h).
+#define DC_PREDICTOR_LEFT(w, h, shift, q)                                      \
+  void aom_highbd_dc_left_predictor_##w##x##h##_neon(                          \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,                  \
+      const uint16_t *left, int bd) {                                          \
+    (void)above;                                                               \
+    (void)bd;                                                                  \
+    const uint16x4_t dc = vrshrn_n_u32(highbd_dc_load_sum_##h(left), (shift)); \
+    highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc, 0));      \
+  }
+
+DC_PREDICTOR_LEFT(4, 4, 2, )  // empty q: dc replicated via vdup_lane_u16
+DC_PREDICTOR_LEFT(4, 8, 3, )
+DC_PREDICTOR_LEFT(4, 16, 4, )
+DC_PREDICTOR_LEFT(8, 4, 2, q)  // q: dc replicated via vdupq_lane_u16
+DC_PREDICTOR_LEFT(8, 8, 3, q)
+DC_PREDICTOR_LEFT(8, 16, 4, q)
+DC_PREDICTOR_LEFT(8, 32, 5, q)
+DC_PREDICTOR_LEFT(16, 4, 2, q)
+DC_PREDICTOR_LEFT(16, 8, 3, q)
+DC_PREDICTOR_LEFT(16, 16, 4, q)
+DC_PREDICTOR_LEFT(16, 32, 5, q)
+DC_PREDICTOR_LEFT(16, 64, 6, q)
+DC_PREDICTOR_LEFT(32, 8, 3, q)
+DC_PREDICTOR_LEFT(32, 16, 4, q)
+DC_PREDICTOR_LEFT(32, 32, 5, q)
+DC_PREDICTOR_LEFT(32, 64, 6, q)
+DC_PREDICTOR_LEFT(64, 16, 4, q)
+DC_PREDICTOR_LEFT(64, 32, 5, q)
+DC_PREDICTOR_LEFT(64, 64, 6, q)
+
+#undef DC_PREDICTOR_LEFT
+
+// -----------------------------------------------------------------------------
 // V_PRED
 
 #define HIGHBD_V_NXM(W, H)                                    \
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index 4ae0798..b8a1512 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -405,10 +405,10 @@
   highbd_entry(dc, 16, 16, neon, 8),    highbd_entry(dc, 32, 32, neon, 8),
   highbd_entry(dc, 64, 64, neon, 8),
 
-  highbd_intrapred(dc_128, neon, 12),   highbd_intrapred(v, neon, 12),
-  highbd_intrapred(h, neon, 12),        highbd_intrapred(paeth, neon, 12),
-  highbd_intrapred(smooth, neon, 12),   highbd_intrapred(smooth_v, neon, 12),
-  highbd_intrapred(smooth_h, neon, 12),
+  highbd_intrapred(dc_left, neon, 12),  highbd_intrapred(dc_128, neon, 12),
+  highbd_intrapred(v, neon, 12),        highbd_intrapred(h, neon, 12),
+  highbd_intrapred(paeth, neon, 12),    highbd_intrapred(smooth, neon, 12),
+  highbd_intrapred(smooth_v, neon, 12), highbd_intrapred(smooth_h, neon, 12),
 };
 
 INSTANTIATE_TEST_SUITE_P(NEON, HighbdIntraPredTest,
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 8812bed..9dc633b 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -1305,15 +1305,17 @@
                        nullptr, nullptr)
 #endif
 #if HAVE_NEON
-HIGHBD_INTRA_PRED_TEST(NEON, TX_4X4, aom_highbd_dc_predictor_4x4_neon, nullptr,
-                       nullptr, aom_highbd_dc_128_predictor_4x4_neon,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_4X4, aom_highbd_dc_predictor_4x4_neon,
+                       aom_highbd_dc_left_predictor_4x4_neon, nullptr,
+                       aom_highbd_dc_128_predictor_4x4_neon,
                        aom_highbd_v_predictor_4x4_neon,
                        aom_highbd_h_predictor_4x4_neon,
                        aom_highbd_paeth_predictor_4x4_neon,
                        aom_highbd_smooth_predictor_4x4_neon,
                        aom_highbd_smooth_v_predictor_4x4_neon,
                        aom_highbd_smooth_h_predictor_4x4_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_4X8, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_4X8, nullptr,
+                       aom_highbd_dc_left_predictor_4x8_neon, nullptr,
                        aom_highbd_dc_128_predictor_4x8_neon,
                        aom_highbd_v_predictor_4x8_neon,
                        aom_highbd_h_predictor_4x8_neon,
@@ -1321,7 +1323,8 @@
                        aom_highbd_smooth_predictor_4x8_neon,
                        aom_highbd_smooth_v_predictor_4x8_neon,
                        aom_highbd_smooth_h_predictor_4x8_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_4X16, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_4X16, nullptr,
+                       aom_highbd_dc_left_predictor_4x16_neon, nullptr,
                        aom_highbd_dc_128_predictor_4x16_neon,
                        aom_highbd_v_predictor_4x16_neon,
                        aom_highbd_h_predictor_4x16_neon,
@@ -1393,15 +1396,17 @@
 #endif
 
 #if HAVE_NEON
-HIGHBD_INTRA_PRED_TEST(NEON, TX_8X8, aom_highbd_dc_predictor_8x8_neon, nullptr,
-                       nullptr, aom_highbd_dc_128_predictor_8x8_neon,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_8X8, aom_highbd_dc_predictor_8x8_neon,
+                       aom_highbd_dc_left_predictor_8x8_neon, nullptr,
+                       aom_highbd_dc_128_predictor_8x8_neon,
                        aom_highbd_v_predictor_8x8_neon,
                        aom_highbd_h_predictor_8x8_neon,
                        aom_highbd_paeth_predictor_8x8_neon,
                        aom_highbd_smooth_predictor_8x8_neon,
                        aom_highbd_smooth_v_predictor_8x8_neon,
                        aom_highbd_smooth_h_predictor_8x8_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_8X4, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_8X4, nullptr,
+                       aom_highbd_dc_left_predictor_8x4_neon, nullptr,
                        aom_highbd_dc_128_predictor_8x4_neon,
                        aom_highbd_v_predictor_8x4_neon,
                        aom_highbd_h_predictor_8x4_neon,
@@ -1409,7 +1414,8 @@
                        aom_highbd_smooth_predictor_8x4_neon,
                        aom_highbd_smooth_v_predictor_8x4_neon,
                        aom_highbd_smooth_h_predictor_8x4_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_8X16, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_8X16, nullptr,
+                       aom_highbd_dc_left_predictor_8x16_neon, nullptr,
                        aom_highbd_dc_128_predictor_8x16_neon,
                        aom_highbd_v_predictor_8x16_neon,
                        aom_highbd_h_predictor_8x16_neon,
@@ -1417,7 +1423,8 @@
                        aom_highbd_smooth_predictor_8x16_neon,
                        aom_highbd_smooth_v_predictor_8x16_neon,
                        aom_highbd_smooth_h_predictor_8x16_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_8X32, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_8X32, nullptr,
+                       aom_highbd_dc_left_predictor_8x32_neon, nullptr,
                        aom_highbd_dc_128_predictor_8x32_neon,
                        aom_highbd_v_predictor_8x32_neon,
                        aom_highbd_h_predictor_8x32_neon,
@@ -1508,14 +1515,16 @@
 
 #if HAVE_NEON
 HIGHBD_INTRA_PRED_TEST(NEON, TX_16X16, aom_highbd_dc_predictor_16x16_neon,
-                       nullptr, nullptr, aom_highbd_dc_128_predictor_16x16_neon,
+                       aom_highbd_dc_left_predictor_16x16_neon, nullptr,
+                       aom_highbd_dc_128_predictor_16x16_neon,
                        aom_highbd_v_predictor_16x16_neon,
                        aom_highbd_h_predictor_16x16_neon,
                        aom_highbd_paeth_predictor_16x16_neon,
                        aom_highbd_smooth_predictor_16x16_neon,
                        aom_highbd_smooth_v_predictor_16x16_neon,
                        aom_highbd_smooth_h_predictor_16x16_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_16X8, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X8, nullptr,
+                       aom_highbd_dc_left_predictor_16x8_neon, nullptr,
                        aom_highbd_dc_128_predictor_16x8_neon,
                        aom_highbd_v_predictor_16x8_neon,
                        aom_highbd_h_predictor_16x8_neon,
@@ -1523,7 +1532,8 @@
                        aom_highbd_smooth_predictor_16x8_neon,
                        aom_highbd_smooth_v_predictor_16x8_neon,
                        aom_highbd_smooth_h_predictor_16x8_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_16X32, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X32, nullptr,
+                       aom_highbd_dc_left_predictor_16x32_neon, nullptr,
                        aom_highbd_dc_128_predictor_16x32_neon,
                        aom_highbd_v_predictor_16x32_neon,
                        aom_highbd_h_predictor_16x32_neon,
@@ -1531,7 +1541,8 @@
                        aom_highbd_smooth_predictor_16x32_neon,
                        aom_highbd_smooth_v_predictor_16x32_neon,
                        aom_highbd_smooth_h_predictor_16x32_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_16X4, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X4, nullptr,
+                       aom_highbd_dc_left_predictor_16x4_neon, nullptr,
                        aom_highbd_dc_128_predictor_16x4_neon,
                        aom_highbd_v_predictor_16x4_neon,
                        aom_highbd_h_predictor_16x4_neon,
@@ -1539,7 +1550,8 @@
                        aom_highbd_smooth_predictor_16x4_neon,
                        aom_highbd_smooth_v_predictor_16x4_neon,
                        aom_highbd_smooth_h_predictor_16x4_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_16X64, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_16X64, nullptr,
+                       aom_highbd_dc_left_predictor_16x64_neon, nullptr,
                        aom_highbd_dc_128_predictor_16x64_neon,
                        aom_highbd_v_predictor_16x64_neon,
                        aom_highbd_h_predictor_16x64_neon,
@@ -1613,14 +1625,16 @@
 
 #if HAVE_NEON
 HIGHBD_INTRA_PRED_TEST(NEON, TX_32X32, aom_highbd_dc_predictor_32x32_neon,
-                       nullptr, nullptr, aom_highbd_dc_128_predictor_32x32_neon,
+                       aom_highbd_dc_left_predictor_32x32_neon, nullptr,
+                       aom_highbd_dc_128_predictor_32x32_neon,
                        aom_highbd_v_predictor_32x32_neon,
                        aom_highbd_h_predictor_32x32_neon,
                        aom_highbd_paeth_predictor_32x32_neon,
                        aom_highbd_smooth_predictor_32x32_neon,
                        aom_highbd_smooth_v_predictor_32x32_neon,
                        aom_highbd_smooth_h_predictor_32x32_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_32X16, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_32X16, nullptr,
+                       aom_highbd_dc_left_predictor_32x16_neon, nullptr,
                        aom_highbd_dc_128_predictor_32x16_neon,
                        aom_highbd_v_predictor_32x16_neon,
                        aom_highbd_h_predictor_32x16_neon,
@@ -1628,7 +1642,8 @@
                        aom_highbd_smooth_predictor_32x16_neon,
                        aom_highbd_smooth_v_predictor_32x16_neon,
                        aom_highbd_smooth_h_predictor_32x16_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_32X64, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_32X64, nullptr,
+                       aom_highbd_dc_left_predictor_32x64_neon, nullptr,
                        aom_highbd_dc_128_predictor_32x64_neon,
                        aom_highbd_v_predictor_32x64_neon,
                        aom_highbd_h_predictor_32x64_neon,
@@ -1636,7 +1651,8 @@
                        aom_highbd_smooth_predictor_32x64_neon,
                        aom_highbd_smooth_v_predictor_32x64_neon,
                        aom_highbd_smooth_h_predictor_32x64_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_32X8, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_32X8, nullptr,
+                       aom_highbd_dc_left_predictor_32x8_neon, nullptr,
                        aom_highbd_dc_128_predictor_32x8_neon,
                        aom_highbd_v_predictor_32x8_neon,
                        aom_highbd_h_predictor_32x8_neon,
@@ -1673,14 +1689,16 @@
 
 #if HAVE_NEON
 HIGHBD_INTRA_PRED_TEST(NEON, TX_64X64, aom_highbd_dc_predictor_64x64_neon,
-                       nullptr, nullptr, aom_highbd_dc_128_predictor_64x64_neon,
+                       aom_highbd_dc_left_predictor_64x64_neon, nullptr,
+                       aom_highbd_dc_128_predictor_64x64_neon,
                        aom_highbd_v_predictor_64x64_neon,
                        aom_highbd_h_predictor_64x64_neon,
                        aom_highbd_paeth_predictor_64x64_neon,
                        aom_highbd_smooth_predictor_64x64_neon,
                        aom_highbd_smooth_v_predictor_64x64_neon,
                        aom_highbd_smooth_h_predictor_64x64_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_64X32, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_64X32, nullptr,
+                       aom_highbd_dc_left_predictor_64x32_neon, nullptr,
                        aom_highbd_dc_128_predictor_64x32_neon,
                        aom_highbd_v_predictor_64x32_neon,
                        aom_highbd_h_predictor_64x32_neon,
@@ -1688,7 +1706,8 @@
                        aom_highbd_smooth_predictor_64x32_neon,
                        aom_highbd_smooth_v_predictor_64x32_neon,
                        aom_highbd_smooth_h_predictor_64x32_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TX_64X16, nullptr, nullptr, nullptr,
+HIGHBD_INTRA_PRED_TEST(NEON, TX_64X16, nullptr,
+                       aom_highbd_dc_left_predictor_64x16_neon, nullptr,
                        aom_highbd_dc_128_predictor_64x16_neon,
                        aom_highbd_v_predictor_64x16_neon,
                        aom_highbd_h_predictor_64x16_neon,