Add highbd_h_predictor Neon implementation and tests

Add Neon implementations of highbd_h_predictor for all block sizes.
Also add the corresponding tests and benchmarks.

Change-Id: I2b4ca0a968d58e445a21589ea64dee0b696179f3
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 7321222..8f087b5 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -321,16 +321,26 @@
   specialize qw/aom_highbd_dc_predictor_32x32 sse2 neon/;
   specialize qw/aom_highbd_dc_predictor_64x64 neon/;
 
-  specialize qw/aom_highbd_h_predictor_4x4 sse2/;
-  specialize qw/aom_highbd_h_predictor_4x8 sse2/;
-  specialize qw/aom_highbd_h_predictor_8x4 sse2/;
-  specialize qw/aom_highbd_h_predictor_8x8 sse2/;
-  specialize qw/aom_highbd_h_predictor_8x16 sse2/;
-  specialize qw/aom_highbd_h_predictor_16x8 sse2/;
-  specialize qw/aom_highbd_h_predictor_16x16 sse2/;
-  specialize qw/aom_highbd_h_predictor_16x32 sse2/;
-  specialize qw/aom_highbd_h_predictor_32x16 sse2/;
-  specialize qw/aom_highbd_h_predictor_32x32 sse2/;
+  specialize qw/aom_highbd_h_predictor_4x4 sse2 neon/;
+  specialize qw/aom_highbd_h_predictor_4x8 sse2 neon/;
+  specialize qw/aom_highbd_h_predictor_4x16 neon/;
+  specialize qw/aom_highbd_h_predictor_8x4 sse2 neon/;
+  specialize qw/aom_highbd_h_predictor_8x8 sse2 neon/;
+  specialize qw/aom_highbd_h_predictor_8x16 sse2 neon/;
+  specialize qw/aom_highbd_h_predictor_8x32 neon/;
+  specialize qw/aom_highbd_h_predictor_16x4 neon/;
+  specialize qw/aom_highbd_h_predictor_16x8 sse2 neon/;
+  specialize qw/aom_highbd_h_predictor_16x16 sse2 neon/;
+  specialize qw/aom_highbd_h_predictor_16x32 sse2 neon/;
+  specialize qw/aom_highbd_h_predictor_16x64 neon/;
+  specialize qw/aom_highbd_h_predictor_32x8 neon/;
+  specialize qw/aom_highbd_h_predictor_32x16 sse2 neon/;
+  specialize qw/aom_highbd_h_predictor_32x32 sse2 neon/;
+  specialize qw/aom_highbd_h_predictor_32x64 neon/;
+  specialize qw/aom_highbd_h_predictor_64x16 neon/;
+  specialize qw/aom_highbd_h_predictor_64x32 neon/;
+  specialize qw/aom_highbd_h_predictor_64x64 neon/;
+
   specialize qw/aom_highbd_dc_left_predictor_4x4 sse2/;
   specialize qw/aom_highbd_dc_top_predictor_4x4 sse2/;
   specialize qw/aom_highbd_dc_128_predictor_4x4 sse2/;
diff --git a/aom_dsp/arm/highbd_intrapred_neon.c b/aom_dsp/arm/highbd_intrapred_neon.c
index fa2f11e..8363399 100644
--- a/aom_dsp/arm/highbd_intrapred_neon.c
+++ b/aom_dsp/arm/highbd_intrapred_neon.c
@@ -213,6 +213,170 @@
 HIGHBD_V_NXM(64, 64)
 
 // -----------------------------------------------------------------------------
+// H_PRED
+
+static INLINE void highbd_h_store_4x4(uint16_t *dst, ptrdiff_t stride,
+                                      uint16x4_t left) {
+  vst1_u16(dst + 0 * stride, vdup_lane_u16(left, 0));
+  vst1_u16(dst + 1 * stride, vdup_lane_u16(left, 1));
+  vst1_u16(dst + 2 * stride, vdup_lane_u16(left, 2));
+  vst1_u16(dst + 3 * stride, vdup_lane_u16(left, 3));
+}
+
+static INLINE void highbd_h_store_8x4(uint16_t *dst, ptrdiff_t stride,
+                                      uint16x4_t left) {
+  vst1q_u16(dst + 0 * stride, vdupq_lane_u16(left, 0));
+  vst1q_u16(dst + 1 * stride, vdupq_lane_u16(left, 1));
+  vst1q_u16(dst + 2 * stride, vdupq_lane_u16(left, 2));
+  vst1q_u16(dst + 3 * stride, vdupq_lane_u16(left, 3));
+}
+
+static INLINE void highbd_h_store_16x1(uint16_t *dst, uint16x8_t left) {
+  vst1q_u16(dst + 0, left);
+  vst1q_u16(dst + 8, left);
+}
+
+static INLINE void highbd_h_store_16x4(uint16_t *dst, ptrdiff_t stride,
+                                       uint16x4_t left) {
+  highbd_h_store_16x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
+  highbd_h_store_16x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
+  highbd_h_store_16x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
+  highbd_h_store_16x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
+}
+
+static INLINE void highbd_h_store_32x1(uint16_t *dst, uint16x8_t left) {
+  vst1q_u16(dst + 0, left);
+  vst1q_u16(dst + 8, left);
+  vst1q_u16(dst + 16, left);
+  vst1q_u16(dst + 24, left);
+}
+
+static INLINE void highbd_h_store_32x4(uint16_t *dst, ptrdiff_t stride,
+                                       uint16x4_t left) {
+  highbd_h_store_32x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
+  highbd_h_store_32x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
+  highbd_h_store_32x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
+  highbd_h_store_32x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
+}
+
+static INLINE void highbd_h_store_64x1(uint16_t *dst, uint16x8_t left) {
+  vst1q_u16(dst + 0, left);
+  vst1q_u16(dst + 8, left);
+  vst1q_u16(dst + 16, left);
+  vst1q_u16(dst + 24, left);
+  vst1q_u16(dst + 32, left);
+  vst1q_u16(dst + 40, left);
+  vst1q_u16(dst + 48, left);
+  vst1q_u16(dst + 56, left);
+}
+
+static INLINE void highbd_h_store_64x4(uint16_t *dst, ptrdiff_t stride,
+                                       uint16x4_t left) {
+  highbd_h_store_64x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
+  highbd_h_store_64x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
+  highbd_h_store_64x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
+  highbd_h_store_64x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
+}
+
+void aom_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  (void)above;
+  (void)bd;
+  highbd_h_store_4x4(dst, stride, vld1_u16(left));
+}
+
+void aom_highbd_h_predictor_4x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  (void)above;
+  (void)bd;
+  uint16x8_t l = vld1q_u16(left);
+  highbd_h_store_4x4(dst + 0 * stride, stride, vget_low_u16(l));
+  highbd_h_store_4x4(dst + 4 * stride, stride, vget_high_u16(l));
+}
+
+void aom_highbd_h_predictor_8x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  (void)above;
+  (void)bd;
+  highbd_h_store_8x4(dst, stride, vld1_u16(left));
+}
+
+void aom_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  (void)above;
+  (void)bd;
+  uint16x8_t l = vld1q_u16(left);
+  highbd_h_store_8x4(dst + 0 * stride, stride, vget_low_u16(l));
+  highbd_h_store_8x4(dst + 4 * stride, stride, vget_high_u16(l));
+}
+
+void aom_highbd_h_predictor_16x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  (void)above;
+  (void)bd;
+  highbd_h_store_16x4(dst, stride, vld1_u16(left));
+}
+
+void aom_highbd_h_predictor_16x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  (void)above;
+  (void)bd;
+  uint16x8_t l = vld1q_u16(left);
+  highbd_h_store_16x4(dst + 0 * stride, stride, vget_low_u16(l));
+  highbd_h_store_16x4(dst + 4 * stride, stride, vget_high_u16(l));
+}
+
+void aom_highbd_h_predictor_32x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  (void)above;
+  (void)bd;
+  uint16x8_t l = vld1q_u16(left);
+  highbd_h_store_32x4(dst + 0 * stride, stride, vget_low_u16(l));
+  highbd_h_store_32x4(dst + 4 * stride, stride, vget_high_u16(l));
+}
+
+// For cases where height >= 16 we use pairs of loads to get LDP instructions.
+#define HIGHBD_H_WXH_LARGE(w, h)                                            \
+  void aom_highbd_h_predictor_##w##x##h##_neon(                             \
+      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,               \
+      const uint16_t *left, int bd) {                                       \
+    (void)above;                                                            \
+    (void)bd;                                                               \
+    for (int i = 0; i < (h) / 16; ++i) {                                    \
+      uint16x8_t l0 = vld1q_u16(left + 0);                                  \
+      uint16x8_t l1 = vld1q_u16(left + 8);                                  \
+      highbd_h_store_##w##x4(dst + 0 * stride, stride, vget_low_u16(l0));   \
+      highbd_h_store_##w##x4(dst + 4 * stride, stride, vget_high_u16(l0));  \
+      highbd_h_store_##w##x4(dst + 8 * stride, stride, vget_low_u16(l1));   \
+      highbd_h_store_##w##x4(dst + 12 * stride, stride, vget_high_u16(l1)); \
+      left += 16;                                                           \
+      dst += 16 * stride;                                                   \
+    }                                                                       \
+  }
+
+HIGHBD_H_WXH_LARGE(4, 16)
+HIGHBD_H_WXH_LARGE(8, 16)
+HIGHBD_H_WXH_LARGE(8, 32)
+HIGHBD_H_WXH_LARGE(16, 16)
+HIGHBD_H_WXH_LARGE(16, 32)
+HIGHBD_H_WXH_LARGE(16, 64)
+HIGHBD_H_WXH_LARGE(32, 16)
+HIGHBD_H_WXH_LARGE(32, 32)
+HIGHBD_H_WXH_LARGE(32, 64)
+HIGHBD_H_WXH_LARGE(64, 16)
+HIGHBD_H_WXH_LARGE(64, 32)
+HIGHBD_H_WXH_LARGE(64, 64)
+
+#undef HIGHBD_H_WXH_LARGE
+
+// -----------------------------------------------------------------------------
 // PAETH
 
 static INLINE void highbd_paeth_4or8_x_h_neon(uint16_t *dest, ptrdiff_t stride,
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index c258492..8839450 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -405,9 +405,9 @@
   highbd_entry(dc, 16, 16, neon, 8),    highbd_entry(dc, 32, 32, neon, 8),
   highbd_entry(dc, 64, 64, neon, 8),
 
-  highbd_intrapred(v, neon, 12),        highbd_intrapred(paeth, neon, 12),
-  highbd_intrapred(smooth, neon, 12),   highbd_intrapred(smooth_v, neon, 12),
-  highbd_intrapred(smooth_h, neon, 12),
+  highbd_intrapred(v, neon, 12),        highbd_intrapred(h, neon, 12),
+  highbd_intrapred(paeth, neon, 12),    highbd_intrapred(smooth, neon, 12),
+  highbd_intrapred(smooth_v, neon, 12), highbd_intrapred(smooth_h, neon, 12),
 };
 
 INSTANTIATE_TEST_SUITE_P(NEON, HighbdIntraPredTest,
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 08370b6..05462a2 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -1307,18 +1307,21 @@
 #if HAVE_NEON
 HIGHBD_INTRA_PRED_TEST(NEON, TX_4X4, aom_highbd_dc_predictor_4x4_neon, nullptr,
                        nullptr, nullptr, aom_highbd_v_predictor_4x4_neon,
-                       nullptr, aom_highbd_paeth_predictor_4x4_neon,
+                       aom_highbd_h_predictor_4x4_neon,
+                       aom_highbd_paeth_predictor_4x4_neon,
                        aom_highbd_smooth_predictor_4x4_neon,
                        aom_highbd_smooth_v_predictor_4x4_neon,
                        aom_highbd_smooth_h_predictor_4x4_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_4X8, nullptr, nullptr, nullptr, nullptr,
-                       aom_highbd_v_predictor_4x8_neon, nullptr,
+                       aom_highbd_v_predictor_4x8_neon,
+                       aom_highbd_h_predictor_4x8_neon,
                        aom_highbd_paeth_predictor_4x8_neon,
                        aom_highbd_smooth_predictor_4x8_neon,
                        aom_highbd_smooth_v_predictor_4x8_neon,
                        aom_highbd_smooth_h_predictor_4x8_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_4X16, nullptr, nullptr, nullptr, nullptr,
-                       aom_highbd_v_predictor_4x16_neon, nullptr,
+                       aom_highbd_v_predictor_4x16_neon,
+                       aom_highbd_h_predictor_4x16_neon,
                        aom_highbd_paeth_predictor_4x16_neon,
                        aom_highbd_smooth_predictor_4x16_neon,
                        aom_highbd_smooth_v_predictor_4x16_neon,
@@ -1389,24 +1392,28 @@
 #if HAVE_NEON
 HIGHBD_INTRA_PRED_TEST(NEON, TX_8X8, aom_highbd_dc_predictor_8x8_neon, nullptr,
                        nullptr, nullptr, aom_highbd_v_predictor_8x8_neon,
-                       nullptr, aom_highbd_paeth_predictor_8x8_neon,
+                       aom_highbd_h_predictor_8x8_neon,
+                       aom_highbd_paeth_predictor_8x8_neon,
                        aom_highbd_smooth_predictor_8x8_neon,
                        aom_highbd_smooth_v_predictor_8x8_neon,
                        aom_highbd_smooth_h_predictor_8x8_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_8X4, nullptr, nullptr, nullptr, nullptr,
-                       aom_highbd_v_predictor_8x4_neon, nullptr,
+                       aom_highbd_v_predictor_8x4_neon,
+                       aom_highbd_h_predictor_8x4_neon,
                        aom_highbd_paeth_predictor_8x4_neon,
                        aom_highbd_smooth_predictor_8x4_neon,
                        aom_highbd_smooth_v_predictor_8x4_neon,
                        aom_highbd_smooth_h_predictor_8x4_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_8X16, nullptr, nullptr, nullptr, nullptr,
-                       aom_highbd_v_predictor_8x16_neon, nullptr,
+                       aom_highbd_v_predictor_8x16_neon,
+                       aom_highbd_h_predictor_8x16_neon,
                        aom_highbd_paeth_predictor_8x16_neon,
                        aom_highbd_smooth_predictor_8x16_neon,
                        aom_highbd_smooth_v_predictor_8x16_neon,
                        aom_highbd_smooth_h_predictor_8x16_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_8X32, nullptr, nullptr, nullptr, nullptr,
-                       aom_highbd_v_predictor_8x32_neon, nullptr,
+                       aom_highbd_v_predictor_8x32_neon,
+                       aom_highbd_h_predictor_8x32_neon,
                        aom_highbd_paeth_predictor_8x32_neon,
                        aom_highbd_smooth_predictor_8x32_neon,
                        aom_highbd_smooth_v_predictor_8x32_neon,
@@ -1495,31 +1502,36 @@
 #if HAVE_NEON
 HIGHBD_INTRA_PRED_TEST(NEON, TX_16X16, aom_highbd_dc_predictor_16x16_neon,
                        nullptr, nullptr, nullptr,
-                       aom_highbd_v_predictor_16x16_neon, nullptr,
+                       aom_highbd_v_predictor_16x16_neon,
+                       aom_highbd_h_predictor_16x16_neon,
                        aom_highbd_paeth_predictor_16x16_neon,
                        aom_highbd_smooth_predictor_16x16_neon,
                        aom_highbd_smooth_v_predictor_16x16_neon,
                        aom_highbd_smooth_h_predictor_16x16_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_16X8, nullptr, nullptr, nullptr, nullptr,
-                       aom_highbd_v_predictor_16x8_neon, nullptr,
+                       aom_highbd_v_predictor_16x8_neon,
+                       aom_highbd_h_predictor_16x8_neon,
                        aom_highbd_paeth_predictor_16x8_neon,
                        aom_highbd_smooth_predictor_16x8_neon,
                        aom_highbd_smooth_v_predictor_16x8_neon,
                        aom_highbd_smooth_h_predictor_16x8_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_16X32, nullptr, nullptr, nullptr, nullptr,
-                       aom_highbd_v_predictor_16x32_neon, nullptr,
+                       aom_highbd_v_predictor_16x32_neon,
+                       aom_highbd_h_predictor_16x32_neon,
                        aom_highbd_paeth_predictor_16x32_neon,
                        aom_highbd_smooth_predictor_16x32_neon,
                        aom_highbd_smooth_v_predictor_16x32_neon,
                        aom_highbd_smooth_h_predictor_16x32_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_16X4, nullptr, nullptr, nullptr, nullptr,
-                       aom_highbd_v_predictor_16x4_neon, nullptr,
+                       aom_highbd_v_predictor_16x4_neon,
+                       aom_highbd_h_predictor_16x4_neon,
                        aom_highbd_paeth_predictor_16x4_neon,
                        aom_highbd_smooth_predictor_16x4_neon,
                        aom_highbd_smooth_v_predictor_16x4_neon,
                        aom_highbd_smooth_h_predictor_16x4_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_16X64, nullptr, nullptr, nullptr, nullptr,
-                       aom_highbd_v_predictor_16x64_neon, nullptr,
+                       aom_highbd_v_predictor_16x64_neon,
+                       aom_highbd_h_predictor_16x64_neon,
                        aom_highbd_paeth_predictor_16x64_neon,
                        aom_highbd_smooth_predictor_16x64_neon,
                        aom_highbd_smooth_v_predictor_16x64_neon,
@@ -1591,25 +1603,29 @@
 #if HAVE_NEON
 HIGHBD_INTRA_PRED_TEST(NEON, TX_32X32, aom_highbd_dc_predictor_32x32_neon,
                        nullptr, nullptr, nullptr,
-                       aom_highbd_v_predictor_32x32_neon, nullptr,
+                       aom_highbd_v_predictor_32x32_neon,
+                       aom_highbd_h_predictor_32x32_neon,
                        aom_highbd_paeth_predictor_32x32_neon,
                        aom_highbd_smooth_predictor_32x32_neon,
                        aom_highbd_smooth_v_predictor_32x32_neon,
                        aom_highbd_smooth_h_predictor_32x32_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_32X16, nullptr, nullptr, nullptr, nullptr,
-                       aom_highbd_v_predictor_32x16_neon, nullptr,
+                       aom_highbd_v_predictor_32x16_neon,
+                       aom_highbd_h_predictor_32x16_neon,
                        aom_highbd_paeth_predictor_32x16_neon,
                        aom_highbd_smooth_predictor_32x16_neon,
                        aom_highbd_smooth_v_predictor_32x16_neon,
                        aom_highbd_smooth_h_predictor_32x16_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_32X64, nullptr, nullptr, nullptr, nullptr,
-                       aom_highbd_v_predictor_32x64_neon, nullptr,
+                       aom_highbd_v_predictor_32x64_neon,
+                       aom_highbd_h_predictor_32x64_neon,
                        aom_highbd_paeth_predictor_32x64_neon,
                        aom_highbd_smooth_predictor_32x64_neon,
                        aom_highbd_smooth_v_predictor_32x64_neon,
                        aom_highbd_smooth_h_predictor_32x64_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_32X8, nullptr, nullptr, nullptr, nullptr,
-                       aom_highbd_v_predictor_32x8_neon, nullptr,
+                       aom_highbd_v_predictor_32x8_neon,
+                       aom_highbd_h_predictor_32x8_neon,
                        aom_highbd_paeth_predictor_32x8_neon,
                        aom_highbd_smooth_predictor_32x8_neon,
                        aom_highbd_smooth_v_predictor_32x8_neon,
@@ -1644,19 +1660,22 @@
 #if HAVE_NEON
 HIGHBD_INTRA_PRED_TEST(NEON, TX_64X64, aom_highbd_dc_predictor_64x64_neon,
                        nullptr, nullptr, nullptr,
-                       aom_highbd_v_predictor_64x64_neon, nullptr,
+                       aom_highbd_v_predictor_64x64_neon,
+                       aom_highbd_h_predictor_64x64_neon,
                        aom_highbd_paeth_predictor_64x64_neon,
                        aom_highbd_smooth_predictor_64x64_neon,
                        aom_highbd_smooth_v_predictor_64x64_neon,
                        aom_highbd_smooth_h_predictor_64x64_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_64X32, nullptr, nullptr, nullptr, nullptr,
-                       aom_highbd_v_predictor_64x32_neon, nullptr,
+                       aom_highbd_v_predictor_64x32_neon,
+                       aom_highbd_h_predictor_64x32_neon,
                        aom_highbd_paeth_predictor_64x32_neon,
                        aom_highbd_smooth_predictor_64x32_neon,
                        aom_highbd_smooth_v_predictor_64x32_neon,
                        aom_highbd_smooth_h_predictor_64x32_neon)
 HIGHBD_INTRA_PRED_TEST(NEON, TX_64X16, nullptr, nullptr, nullptr, nullptr,
-                       aom_highbd_v_predictor_64x16_neon, nullptr,
+                       aom_highbd_v_predictor_64x16_neon,
+                       aom_highbd_h_predictor_64x16_neon,
                        aom_highbd_paeth_predictor_64x16_neon,
                        aom_highbd_smooth_predictor_64x16_neon,
                        aom_highbd_smooth_v_predictor_64x16_neon,