Add Neon implementations for remaining h predictors

We already have Neon implementations of the h predictors for all square
block sizes except 64x64, so add the remaining cases and update
tests/speed to match. We also clean up and refactor the existing cases
to expose some helper functions we can reuse in the new implementations.

On Neoverse N1 these new implementations are worth about a 33% geomean
performance improvement over the C versions on both Clang 15 and GCC 12.
On Neoverse V1 the gains are smaller: a geomean improvement of around
3% on Clang 15 and 15% on GCC 12.
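
For context, the horizontal (h) predictor simply broadcasts each
left-neighbour pixel across its row; the Neon helpers below vectorise
this eight rows at a time. A minimal scalar sketch (the function name
is illustrative, not part of this change):

    #include <stddef.h>  /* ptrdiff_t */
    #include <stdint.h>  /* uint8_t */
    #include <string.h>  /* memset */

    static void h_predictor_sketch(uint8_t *dst, ptrdiff_t stride, int bw,
                                   int bh, const uint8_t *above,
                                   const uint8_t *left) {
      (void)above;  /* the h predictor only reads the left column */
      for (int r = 0; r < bh; ++r) {
        memset(dst, left[r], bw);  /* fill row r with left[r] */
        dst += stride;
      }
    }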

Change-Id: Ib9dd1e2c924c41d620635b8e3acb813242642cb3
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 0bf1a9c..832c8d1 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -166,24 +166,24 @@
 specialize qw/aom_v_predictor_64x64 neon sse2 avx2/;
 
 specialize qw/aom_h_predictor_4x4 neon sse2/;
-specialize qw/aom_h_predictor_4x8 sse2/;
-specialize qw/aom_h_predictor_4x16 sse2/;
-specialize qw/aom_h_predictor_8x4 sse2/;
+specialize qw/aom_h_predictor_4x8 neon sse2/;
+specialize qw/aom_h_predictor_4x16 neon sse2/;
+specialize qw/aom_h_predictor_8x4 neon sse2/;
 specialize qw/aom_h_predictor_8x8 neon sse2/;
-specialize qw/aom_h_predictor_8x16 sse2/;
-specialize qw/aom_h_predictor_8x32 sse2/;
-specialize qw/aom_h_predictor_16x4 sse2/;
-specialize qw/aom_h_predictor_16x8 sse2/;
+specialize qw/aom_h_predictor_8x16 neon sse2/;
+specialize qw/aom_h_predictor_8x32 neon sse2/;
+specialize qw/aom_h_predictor_16x4 neon sse2/;
+specialize qw/aom_h_predictor_16x8 neon sse2/;
 specialize qw/aom_h_predictor_16x16 neon sse2/;
-specialize qw/aom_h_predictor_16x32 sse2/;
-specialize qw/aom_h_predictor_16x64 sse2/;
-specialize qw/aom_h_predictor_32x8 sse2/;
-specialize qw/aom_h_predictor_32x16 sse2/;
+specialize qw/aom_h_predictor_16x32 neon sse2/;
+specialize qw/aom_h_predictor_16x64 neon sse2/;
+specialize qw/aom_h_predictor_32x8 neon sse2/;
+specialize qw/aom_h_predictor_32x16 neon sse2/;
 specialize qw/aom_h_predictor_32x32 neon sse2 avx2/;
-specialize qw/aom_h_predictor_32x64 sse2/;
-specialize qw/aom_h_predictor_64x16 sse2/;
-specialize qw/aom_h_predictor_64x32 sse2/;
-specialize qw/aom_h_predictor_64x64 sse2/;
+specialize qw/aom_h_predictor_32x64 neon sse2/;
+specialize qw/aom_h_predictor_64x16 neon sse2/;
+specialize qw/aom_h_predictor_64x32 neon sse2/;
+specialize qw/aom_h_predictor_64x64 neon sse2/;
 
 specialize qw/aom_paeth_predictor_4x4 ssse3 neon/;
 specialize qw/aom_paeth_predictor_4x8 ssse3 neon/;
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index 5aa401b..c6c2849 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -783,143 +783,293 @@
 
 // -----------------------------------------------------------------------------
 
+static INLINE void h_store_4x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+  store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0), 0);
+  store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1), 0);
+  store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2), 0);
+  store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3), 0);
+  store_u8_4x1(dst + 4 * stride, vdup_lane_u8(d0, 4), 0);
+  store_u8_4x1(dst + 5 * stride, vdup_lane_u8(d0, 5), 0);
+  store_u8_4x1(dst + 6 * stride, vdup_lane_u8(d0, 6), 0);
+  store_u8_4x1(dst + 7 * stride, vdup_lane_u8(d0, 7), 0);
+}
+
+static INLINE void h_store_8x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+  vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
+  vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
+  vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2));
+  vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3));
+  vst1_u8(dst + 4 * stride, vdup_lane_u8(d0, 4));
+  vst1_u8(dst + 5 * stride, vdup_lane_u8(d0, 5));
+  vst1_u8(dst + 6 * stride, vdup_lane_u8(d0, 6));
+  vst1_u8(dst + 7 * stride, vdup_lane_u8(d0, 7));
+}
+
+static INLINE void h_store_16x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+  vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
+  vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
+  vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2));
+  vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3));
+  vst1q_u8(dst + 4 * stride, vdupq_lane_u8(d0, 4));
+  vst1q_u8(dst + 5 * stride, vdupq_lane_u8(d0, 5));
+  vst1q_u8(dst + 6 * stride, vdupq_lane_u8(d0, 6));
+  vst1q_u8(dst + 7 * stride, vdupq_lane_u8(d0, 7));
+}
+
+static INLINE void h_store_32x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
+  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
+  dst += stride;
+  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
+  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
+  dst += stride;
+  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
+  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
+  dst += stride;
+  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
+  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
+  dst += stride;
+  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
+  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
+  dst += stride;
+  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
+  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
+  dst += stride;
+  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
+  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
+  dst += stride;
+  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
+  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
+}
+
+static INLINE void h_store_64x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
+  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
+  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 0));
+  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 0));
+  dst += stride;
+  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
+  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
+  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 1));
+  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 1));
+  dst += stride;
+  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
+  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
+  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 2));
+  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 2));
+  dst += stride;
+  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
+  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
+  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 3));
+  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 3));
+  dst += stride;
+  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
+  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
+  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 4));
+  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 4));
+  dst += stride;
+  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
+  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
+  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 5));
+  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 5));
+  dst += stride;
+  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
+  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
+  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 6));
+  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 6));
+  dst += stride;
+  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
+  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
+  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 7));
+  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 7));
+}
+
 void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
-  uint8x8_t d0u8 = vdup_n_u8(0);
-  uint32x2_t d1u32 = vdup_n_u32(0);
+  const uint8x8_t d0 = load_u8_4x1_lane0(left);
   (void)above;
-
-  d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
-
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
-  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
-  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
-  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
-  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+  store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0), 0);
+  store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1), 0);
+  store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2), 0);
+  store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3), 0);
 }
 
 void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
-  uint8x8_t d0u8 = vdup_n_u8(0);
-  uint64x1_t d1u64 = vdup_n_u64(0);
+  const uint8x8_t d0 = vld1_u8(left);
   (void)above;
-
-  d1u64 = vld1_u64((const uint64_t *)left);
-
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
-  vst1_u8(dst, d0u8);
+  h_store_8x8(dst, stride, d0);
 }
 
 void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
-  int j;
-  uint8x8_t d2u8 = vdup_n_u8(0);
-  uint8x16_t q0u8 = vdupq_n_u8(0);
-  uint8x16_t q1u8 = vdupq_n_u8(0);
+  const uint8x16_t d0 = vld1q_u8(left);
   (void)above;
-
-  q1u8 = vld1q_u8(left);
-  d2u8 = vget_low_u8(q1u8);
-  for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
-    q0u8 = vdupq_lane_u8(d2u8, 0);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 1);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 2);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 3);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 4);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 5);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 6);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 7);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-  }
+  h_store_16x8(dst, stride, vget_low_u8(d0));
+  h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
 }
 
 void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
-  int j, k;
-  uint8x8_t d2u8 = vdup_n_u8(0);
-  uint8x16_t q0u8 = vdupq_n_u8(0);
-  uint8x16_t q1u8 = vdupq_n_u8(0);
+  const uint8x16_t d0 = vld1q_u8(left);
+  const uint8x16_t d1 = vld1q_u8(left + 16);
   (void)above;
+  h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
+  h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
+  h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1));
+  h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1));
+}
 
-  for (k = 0; k < 2; k++, left += 16) {
-    q1u8 = vld1q_u8(left);
-    d2u8 = vget_low_u8(q1u8);
-    for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
-      q0u8 = vdupq_lane_u8(d2u8, 0);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 1);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 2);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 3);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 4);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 5);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 6);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 7);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-    }
+void aom_h_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t d0 = vld1_u8(left);
+  (void)above;
+  h_store_4x8(dst, stride, d0);
+}
+
+void aom_h_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d0 = vld1q_u8(left);
+  (void)above;
+  h_store_4x8(dst + 0 * stride, stride, vget_low_u8(d0));
+  h_store_4x8(dst + 8 * stride, stride, vget_high_u8(d0));
+}
+
+void aom_h_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t d0 = load_u8_4x1_lane0(left);
+  (void)above;
+  vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
+  vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
+  vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2));
+  vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3));
+}
+
+void aom_h_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d0 = vld1q_u8(left);
+  (void)above;
+  h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0));
+  h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0));
+}
+
+void aom_h_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d0 = vld1q_u8(left);
+  const uint8x16_t d1 = vld1q_u8(left + 16);
+  (void)above;
+  h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0));
+  h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0));
+  h_store_8x8(dst + 16 * stride, stride, vget_low_u8(d1));
+  h_store_8x8(dst + 24 * stride, stride, vget_high_u8(d1));
+}
+
+void aom_h_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t d0 = load_u8_4x1_lane0(left);
+  (void)above;
+  vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
+  vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
+  vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2));
+  vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3));
+}
+
+void aom_h_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t d0 = vld1_u8(left);
+  (void)above;
+  h_store_16x8(dst, stride, d0);
+}
+
+void aom_h_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d0 = vld1q_u8(left);
+  const uint8x16_t d1 = vld1q_u8(left + 16);
+  (void)above;
+  h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0));
+  h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
+  h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1));
+  h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1));
+}
+
+void aom_h_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d0 = vld1q_u8(left);
+  const uint8x16_t d1 = vld1q_u8(left + 16);
+  const uint8x16_t d2 = vld1q_u8(left + 32);
+  const uint8x16_t d3 = vld1q_u8(left + 48);
+  (void)above;
+  h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0));
+  h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
+  h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1));
+  h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1));
+  h_store_16x8(dst + 32 * stride, stride, vget_low_u8(d2));
+  h_store_16x8(dst + 40 * stride, stride, vget_high_u8(d2));
+  h_store_16x8(dst + 48 * stride, stride, vget_low_u8(d3));
+  h_store_16x8(dst + 56 * stride, stride, vget_high_u8(d3));
+}
+
+void aom_h_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t d0 = vld1_u8(left);
+  (void)above;
+  h_store_32x8(dst, stride, d0);
+}
+
+void aom_h_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d0 = vld1q_u8(left);
+  (void)above;
+  h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
+  h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
+}
+
+void aom_h_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d0 = vld1q_u8(left + 0);
+  const uint8x16_t d1 = vld1q_u8(left + 16);
+  const uint8x16_t d2 = vld1q_u8(left + 32);
+  const uint8x16_t d3 = vld1q_u8(left + 48);
+  (void)above;
+  h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
+  h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
+  h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1));
+  h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1));
+  h_store_32x8(dst + 32 * stride, stride, vget_low_u8(d2));
+  h_store_32x8(dst + 40 * stride, stride, vget_high_u8(d2));
+  h_store_32x8(dst + 48 * stride, stride, vget_low_u8(d3));
+  h_store_32x8(dst + 56 * stride, stride, vget_high_u8(d3));
+}
+
+void aom_h_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d0 = vld1q_u8(left);
+  (void)above;
+  h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
+  h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
+}
+
+void aom_h_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  for (int i = 0; i < 2; ++i) {
+    const uint8x16_t d0 = vld1q_u8(left);
+    h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
+    h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
+    left += 16;
+    dst += 16 * stride;
+  }
+}
+
+void aom_h_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  for (int i = 0; i < 4; ++i) {
+    const uint8x16_t d0 = vld1q_u8(left);
+    h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
+    h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
+    left += 16;
+    dst += 16 * stride;
   }
 }
 
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index 47f35f5..c258492 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -342,11 +342,7 @@
 const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorNeon[] = {
   lowbd_intrapred(dc, neon),       lowbd_intrapred(dc_top, neon),
   lowbd_intrapred(dc_left, neon),  lowbd_intrapred(dc_128, neon),
-  lowbd_intrapred(v, neon),
-
-  lowbd_entry(h, 4, 4, neon),      lowbd_entry(h, 8, 8, neon),
-  lowbd_entry(h, 16, 16, neon),    lowbd_entry(h, 32, 32, neon),
-
+  lowbd_intrapred(v, neon),        lowbd_intrapred(h, neon),
   lowbd_intrapred(smooth, neon),   lowbd_intrapred(smooth_v, neon),
   lowbd_intrapred(smooth_h, neon), lowbd_intrapred(paeth, neon),
 };
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index e3751fb..08370b6 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -471,13 +471,13 @@
 INTRA_PRED_TEST(NEON, TX_4X8, aom_dc_predictor_4x8_neon,
                 aom_dc_left_predictor_4x8_neon, aom_dc_top_predictor_4x8_neon,
                 aom_dc_128_predictor_4x8_neon, aom_v_predictor_4x8_neon,
-                nullptr, aom_paeth_predictor_4x8_neon,
+                aom_h_predictor_4x8_neon, aom_paeth_predictor_4x8_neon,
                 aom_smooth_predictor_4x8_neon, aom_smooth_v_predictor_4x8_neon,
                 aom_smooth_h_predictor_4x8_neon)
 INTRA_PRED_TEST(NEON, TX_4X16, aom_dc_predictor_4x16_neon,
                 aom_dc_left_predictor_4x16_neon, aom_dc_top_predictor_4x16_neon,
                 aom_dc_128_predictor_4x16_neon, aom_v_predictor_4x16_neon,
-                nullptr, aom_paeth_predictor_4x16_neon,
+                aom_h_predictor_4x16_neon, aom_paeth_predictor_4x16_neon,
                 aom_smooth_predictor_4x16_neon,
                 aom_smooth_v_predictor_4x16_neon,
                 aom_smooth_h_predictor_4x16_neon)
@@ -562,20 +562,20 @@
 INTRA_PRED_TEST(NEON, TX_8X4, aom_dc_predictor_8x4_neon,
                 aom_dc_left_predictor_8x4_neon, aom_dc_top_predictor_8x4_neon,
                 aom_dc_128_predictor_8x4_neon, aom_v_predictor_8x4_neon,
-                nullptr, aom_paeth_predictor_8x4_neon,
+                aom_h_predictor_8x4_neon, aom_paeth_predictor_8x4_neon,
                 aom_smooth_predictor_8x4_neon, aom_smooth_v_predictor_8x4_neon,
                 aom_smooth_h_predictor_8x4_neon)
 INTRA_PRED_TEST(NEON, TX_8X16, aom_dc_predictor_8x16_neon,
                 aom_dc_left_predictor_8x16_neon, aom_dc_top_predictor_8x16_neon,
                 aom_dc_128_predictor_8x16_neon, aom_v_predictor_8x16_neon,
-                nullptr, aom_paeth_predictor_8x16_neon,
+                aom_h_predictor_8x16_neon, aom_paeth_predictor_8x16_neon,
                 aom_smooth_predictor_8x16_neon,
                 aom_smooth_v_predictor_8x16_neon,
                 aom_smooth_h_predictor_8x16_neon)
 INTRA_PRED_TEST(NEON, TX_8X32, aom_dc_predictor_8x32_neon,
                 aom_dc_left_predictor_8x32_neon, aom_dc_top_predictor_8x32_neon,
                 aom_dc_128_predictor_8x32_neon, aom_v_predictor_8x32_neon,
-                nullptr, aom_paeth_predictor_8x32_neon,
+                aom_h_predictor_8x32_neon, aom_paeth_predictor_8x32_neon,
                 aom_smooth_predictor_8x32_neon,
                 aom_smooth_v_predictor_8x32_neon,
                 aom_smooth_h_predictor_8x32_neon)
@@ -696,7 +696,7 @@
 INTRA_PRED_TEST(NEON, TX_16X8, aom_dc_predictor_16x8_neon,
                 aom_dc_left_predictor_16x8_neon, aom_dc_top_predictor_16x8_neon,
                 aom_dc_128_predictor_16x8_neon, aom_v_predictor_16x8_neon,
-                nullptr, aom_paeth_predictor_16x8_neon,
+                aom_h_predictor_16x8_neon, aom_paeth_predictor_16x8_neon,
                 aom_smooth_predictor_16x8_neon,
                 aom_smooth_v_predictor_16x8_neon,
                 aom_smooth_h_predictor_16x8_neon)
@@ -704,14 +704,14 @@
                 aom_dc_left_predictor_16x32_neon,
                 aom_dc_top_predictor_16x32_neon,
                 aom_dc_128_predictor_16x32_neon, aom_v_predictor_16x32_neon,
-                nullptr, aom_paeth_predictor_16x32_neon,
+                aom_h_predictor_16x32_neon, aom_paeth_predictor_16x32_neon,
                 aom_smooth_predictor_16x32_neon,
                 aom_smooth_v_predictor_16x32_neon,
                 aom_smooth_h_predictor_16x32_neon)
 INTRA_PRED_TEST(NEON, TX_16X4, aom_dc_predictor_16x4_neon,
                 aom_dc_left_predictor_16x4_neon, aom_dc_top_predictor_16x4_neon,
                 aom_dc_128_predictor_16x4_neon, aom_v_predictor_16x4_neon,
-                nullptr, aom_paeth_predictor_16x4_neon,
+                aom_h_predictor_16x4_neon, aom_paeth_predictor_16x4_neon,
                 aom_smooth_predictor_16x4_neon,
                 aom_smooth_v_predictor_16x4_neon,
                 aom_smooth_h_predictor_16x4_neon)
@@ -719,7 +719,7 @@
                 aom_dc_left_predictor_16x64_neon,
                 aom_dc_top_predictor_16x64_neon,
                 aom_dc_128_predictor_16x64_neon, aom_v_predictor_16x64_neon,
-                nullptr, aom_paeth_predictor_16x64_neon,
+                aom_h_predictor_16x64_neon, aom_paeth_predictor_16x64_neon,
                 aom_smooth_predictor_16x64_neon,
                 aom_smooth_v_predictor_16x64_neon,
                 aom_smooth_h_predictor_16x64_neon)
@@ -832,7 +832,7 @@
                 aom_dc_left_predictor_32x16_neon,
                 aom_dc_top_predictor_32x16_neon,
                 aom_dc_128_predictor_32x16_neon, aom_v_predictor_32x16_neon,
-                nullptr, aom_paeth_predictor_32x16_neon,
+                aom_h_predictor_32x16_neon, aom_paeth_predictor_32x16_neon,
                 aom_smooth_predictor_32x16_neon,
                 aom_smooth_v_predictor_32x16_neon,
                 aom_smooth_h_predictor_32x16_neon)
@@ -840,14 +840,14 @@
                 aom_dc_left_predictor_32x64_neon,
                 aom_dc_top_predictor_32x64_neon,
                 aom_dc_128_predictor_32x64_neon, aom_v_predictor_32x64_neon,
-                nullptr, aom_paeth_predictor_32x64_neon,
+                aom_h_predictor_32x64_neon, aom_paeth_predictor_32x64_neon,
                 aom_smooth_predictor_32x64_neon,
                 aom_smooth_v_predictor_32x64_neon,
                 aom_smooth_h_predictor_32x64_neon)
 INTRA_PRED_TEST(NEON, TX_32X8, aom_dc_predictor_32x8_neon,
                 aom_dc_left_predictor_32x8_neon, aom_dc_top_predictor_32x8_neon,
                 aom_dc_128_predictor_32x8_neon, aom_v_predictor_32x8_neon,
-                nullptr, aom_paeth_predictor_32x8_neon,
+                aom_h_predictor_32x8_neon, aom_paeth_predictor_32x8_neon,
                 aom_smooth_predictor_32x8_neon,
                 aom_smooth_v_predictor_32x8_neon,
                 aom_smooth_h_predictor_32x8_neon)
@@ -937,7 +937,7 @@
                 aom_dc_left_predictor_64x64_neon,
                 aom_dc_top_predictor_64x64_neon,
                 aom_dc_128_predictor_64x64_neon, aom_v_predictor_64x64_neon,
-                nullptr, aom_paeth_predictor_64x64_neon,
+                aom_h_predictor_64x64_neon, aom_paeth_predictor_64x64_neon,
                 aom_smooth_predictor_64x64_neon,
                 aom_smooth_v_predictor_64x64_neon,
                 aom_smooth_h_predictor_64x64_neon)
@@ -945,7 +945,7 @@
                 aom_dc_left_predictor_64x32_neon,
                 aom_dc_top_predictor_64x32_neon,
                 aom_dc_128_predictor_64x32_neon, aom_v_predictor_64x32_neon,
-                nullptr, aom_paeth_predictor_64x32_neon,
+                aom_h_predictor_64x32_neon, aom_paeth_predictor_64x32_neon,
                 aom_smooth_predictor_64x32_neon,
                 aom_smooth_v_predictor_64x32_neon,
                 aom_smooth_h_predictor_64x32_neon)
@@ -953,7 +953,7 @@
                 aom_dc_left_predictor_64x16_neon,
                 aom_dc_top_predictor_64x16_neon,
                 aom_dc_128_predictor_64x16_neon, aom_v_predictor_64x16_neon,
-                nullptr, aom_paeth_predictor_64x16_neon,
+                aom_h_predictor_64x16_neon, aom_paeth_predictor_64x16_neon,
                 aom_smooth_predictor_64x16_neon,
                 aom_smooth_v_predictor_64x16_neon,
                 aom_smooth_h_predictor_64x16_neon)