Add Neon implementations for rectangular dc predictors

We already have Neon implementations of the dc predictors for all square
block sizes, so add the remaining rectangular cases and update the unit
tests and speed tests to match.

On Neoverse V1, this gives a small improvement over the C code when
built with Clang 15 (~3.5%) and a significant improvement when built
with GCC 12 (~30%).

Change-Id: Iab9fed1610c40dbb1698abed4156bf01806919ed
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index cbcc24d..dab303c 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -86,63 +86,63 @@
 }
 
 specialize qw/aom_dc_top_predictor_4x4 neon sse2/;
-specialize qw/aom_dc_top_predictor_4x8 sse2/;
-specialize qw/aom_dc_top_predictor_4x16 sse2/;
-specialize qw/aom_dc_top_predictor_8x4 sse2/;
+specialize qw/aom_dc_top_predictor_4x8 neon sse2/;
+specialize qw/aom_dc_top_predictor_4x16 neon sse2/;
+specialize qw/aom_dc_top_predictor_8x4 neon sse2/;
 specialize qw/aom_dc_top_predictor_8x8 neon sse2/;
-specialize qw/aom_dc_top_predictor_8x16 sse2/;
-specialize qw/aom_dc_top_predictor_8x32 sse2/;
-specialize qw/aom_dc_top_predictor_16x4 sse2/;
-specialize qw/aom_dc_top_predictor_16x8 sse2/;
+specialize qw/aom_dc_top_predictor_8x16 neon sse2/;
+specialize qw/aom_dc_top_predictor_8x32 neon sse2/;
+specialize qw/aom_dc_top_predictor_16x4 neon sse2/;
+specialize qw/aom_dc_top_predictor_16x8 neon sse2/;
 specialize qw/aom_dc_top_predictor_16x16 neon sse2/;
-specialize qw/aom_dc_top_predictor_16x32 sse2/;
-specialize qw/aom_dc_top_predictor_16x64 sse2/;
-specialize qw/aom_dc_top_predictor_32x8 sse2/;
-specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_16x32 neon sse2/;
+specialize qw/aom_dc_top_predictor_16x64 neon sse2/;
+specialize qw/aom_dc_top_predictor_32x8 neon sse2/;
+specialize qw/aom_dc_top_predictor_32x16 neon sse2 avx2/;
 specialize qw/aom_dc_top_predictor_32x32 neon sse2 avx2/;
-specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_32x64 neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x16 neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x32 neon sse2 avx2/;
 specialize qw/aom_dc_top_predictor_64x64 neon sse2 avx2/;
 
 specialize qw/aom_dc_left_predictor_4x4 neon sse2/;
-specialize qw/aom_dc_left_predictor_4x8 sse2/;
-specialize qw/aom_dc_left_predictor_4x16 sse2/;
-specialize qw/aom_dc_left_predictor_8x4 sse2/;
+specialize qw/aom_dc_left_predictor_4x8 neon sse2/;
+specialize qw/aom_dc_left_predictor_4x16 neon sse2/;
+specialize qw/aom_dc_left_predictor_8x4 neon sse2/;
 specialize qw/aom_dc_left_predictor_8x8 neon sse2/;
-specialize qw/aom_dc_left_predictor_8x16 sse2/;
-specialize qw/aom_dc_left_predictor_8x32 sse2/;
-specialize qw/aom_dc_left_predictor_16x4 sse2/;
-specialize qw/aom_dc_left_predictor_16x8 sse2/;
+specialize qw/aom_dc_left_predictor_8x16 neon sse2/;
+specialize qw/aom_dc_left_predictor_8x32 neon sse2/;
+specialize qw/aom_dc_left_predictor_16x4 neon sse2/;
+specialize qw/aom_dc_left_predictor_16x8 neon sse2/;
 specialize qw/aom_dc_left_predictor_16x16 neon sse2/;
-specialize qw/aom_dc_left_predictor_16x32 sse2/;
-specialize qw/aom_dc_left_predictor_16x64 sse2/;
-specialize qw/aom_dc_left_predictor_32x8 sse2/;
-specialize qw/aom_dc_left_predictor_32x16 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_16x32 neon sse2/;
+specialize qw/aom_dc_left_predictor_16x64 neon sse2/;
+specialize qw/aom_dc_left_predictor_32x8 neon sse2/;
+specialize qw/aom_dc_left_predictor_32x16 neon sse2 avx2/;
 specialize qw/aom_dc_left_predictor_32x32 neon sse2 avx2/;
-specialize qw/aom_dc_left_predictor_32x64 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_64x16 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_32x64 neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x16 neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x32 neon sse2 avx2/;
 specialize qw/aom_dc_left_predictor_64x64 neon sse2 avx2/;
 
 specialize qw/aom_dc_128_predictor_4x4 neon sse2/;
-specialize qw/aom_dc_128_predictor_4x8 sse2/;
-specialize qw/aom_dc_128_predictor_4x16 sse2/;
-specialize qw/aom_dc_128_predictor_8x4 sse2/;
+specialize qw/aom_dc_128_predictor_4x8 neon sse2/;
+specialize qw/aom_dc_128_predictor_4x16 neon sse2/;
+specialize qw/aom_dc_128_predictor_8x4 neon sse2/;
 specialize qw/aom_dc_128_predictor_8x8 neon sse2/;
-specialize qw/aom_dc_128_predictor_8x16 sse2/;
-specialize qw/aom_dc_128_predictor_8x32 sse2/;
-specialize qw/aom_dc_128_predictor_16x4 sse2/;
-specialize qw/aom_dc_128_predictor_16x8 sse2/;
+specialize qw/aom_dc_128_predictor_8x16 neon sse2/;
+specialize qw/aom_dc_128_predictor_8x32 neon sse2/;
+specialize qw/aom_dc_128_predictor_16x4 neon sse2/;
+specialize qw/aom_dc_128_predictor_16x8 neon sse2/;
 specialize qw/aom_dc_128_predictor_16x16 neon sse2/;
-specialize qw/aom_dc_128_predictor_16x32 sse2/;
-specialize qw/aom_dc_128_predictor_16x64 sse2/;
-specialize qw/aom_dc_128_predictor_32x8 sse2/;
-specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_16x32 neon sse2/;
+specialize qw/aom_dc_128_predictor_16x64 neon sse2/;
+specialize qw/aom_dc_128_predictor_32x8 neon sse2/;
+specialize qw/aom_dc_128_predictor_32x16 neon sse2 avx2/;
 specialize qw/aom_dc_128_predictor_32x32 neon sse2 avx2/;
-specialize qw/aom_dc_128_predictor_32x64 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_32x64 neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x16 neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x32 neon sse2 avx2/;
 specialize qw/aom_dc_128_predictor_64x64 neon sse2 avx2/;
 
 specialize qw/aom_v_predictor_4x4 neon sse2/;
@@ -268,24 +268,24 @@
 # TODO(yunqingwang): optimize rectangular DC_PRED to replace division
 # by multiply and shift.
 specialize qw/aom_dc_predictor_4x4 neon sse2/;
-specialize qw/aom_dc_predictor_4x8 sse2/;
-specialize qw/aom_dc_predictor_4x16 sse2/;
-specialize qw/aom_dc_predictor_8x4 sse2/;
+specialize qw/aom_dc_predictor_4x8 neon sse2/;
+specialize qw/aom_dc_predictor_4x16 neon sse2/;
+specialize qw/aom_dc_predictor_8x4 neon sse2/;
 specialize qw/aom_dc_predictor_8x8 neon sse2/;
-specialize qw/aom_dc_predictor_8x16 sse2/;
-specialize qw/aom_dc_predictor_8x32 sse2/;
-specialize qw/aom_dc_predictor_16x4 sse2/;
-specialize qw/aom_dc_predictor_16x8 sse2/;
+specialize qw/aom_dc_predictor_8x16 neon sse2/;
+specialize qw/aom_dc_predictor_8x32 neon sse2/;
+specialize qw/aom_dc_predictor_16x4 neon sse2/;
+specialize qw/aom_dc_predictor_16x8 neon sse2/;
 specialize qw/aom_dc_predictor_16x16 neon sse2/;
-specialize qw/aom_dc_predictor_16x32 sse2/;
-specialize qw/aom_dc_predictor_16x64 sse2/;
-specialize qw/aom_dc_predictor_32x8 sse2/;
-specialize qw/aom_dc_predictor_32x16 sse2 avx2/;
+specialize qw/aom_dc_predictor_16x32 neon sse2/;
+specialize qw/aom_dc_predictor_16x64 neon sse2/;
+specialize qw/aom_dc_predictor_32x8 neon sse2/;
+specialize qw/aom_dc_predictor_32x16 neon sse2 avx2/;
 specialize qw/aom_dc_predictor_32x32 neon sse2 avx2/;
-specialize qw/aom_dc_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_predictor_32x64 neon sse2 avx2/;
 specialize qw/aom_dc_predictor_64x64 neon sse2 avx2/;
-specialize qw/aom_dc_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_predictor_64x16 sse2 avx2/;
+specialize qw/aom_dc_predictor_64x32 neon sse2 avx2/;
+specialize qw/aom_dc_predictor_64x16 neon sse2 avx2/;
 if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
   specialize qw/aom_highbd_v_predictor_4x4 sse2 neon/;
   specialize qw/aom_highbd_v_predictor_4x8 sse2 neon/;
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index 5f43802..c724fa5 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -24,15 +24,15 @@
 // DC 4x4
 
 static INLINE uint16x8_t dc_load_sum_4(const uint8_t *in) {
-  const uint8x8_t a = vld1_u8(in);
+  const uint8x8_t a = load_u8_4x1_lane0(in);
   const uint16x4_t p0 = vpaddl_u8(a);
   const uint16x4_t p1 = vpadd_u16(p0, p0);
   return vcombine_u16(p1, vdup_n_u16(0));
 }
 
-static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride, uint8x8_t dc0) {
-  const uint8x8_t dc = vdup_lane_u8(dc0, 0);
-  for (int i = 0; i < 4; ++i) {
+static INLINE void dc_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
+                                uint8x8_t dc) {
+  for (int i = 0; i < h; ++i) {
     store_u8_4x1(dst + i * stride, dc, 0);
   }
 }
@@ -43,7 +43,7 @@
   const uint16x8_t sum_left = dc_load_sum_4(left);
   const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
   const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);
-  dc_store_4x4(dst, stride, dc0);
+  dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
 }
 
 void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
@@ -51,7 +51,7 @@
   const uint16x8_t sum_left = dc_load_sum_4(left);
   const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 2);
   (void)above;
-  dc_store_4x4(dst, stride, dc0);
+  dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
 }
 
 void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
@@ -59,7 +59,7 @@
   const uint16x8_t sum_top = dc_load_sum_4(above);
   const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 2);
   (void)left;
-  dc_store_4x4(dst, stride, dc0);
+  dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
 }
 
 void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
@@ -67,7 +67,7 @@
   const uint8x8_t dc0 = vdup_n_u8(0x80);
   (void)above;
   (void)left;
-  dc_store_4x4(dst, stride, dc0);
+  dc_store_4xh(dst, stride, 4, dc0);
 }
 
 //------------------------------------------------------------------------------
@@ -100,9 +100,9 @@
 #endif
 }
 
-static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t dc0) {
-  const uint8x8_t dc = vdup_lane_u8(dc0, 0);
-  for (int i = 0; i < 8; ++i) {
+static INLINE void dc_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
+                                uint8x8_t dc) {
+  for (int i = 0; i < h; ++i) {
     vst1_u8(dst + i * stride, dc);
   }
 }
@@ -114,7 +114,7 @@
   uint16x8_t sum = vaddl_u8(sum_left, sum_top);
   sum = horizontal_add_and_broadcast_u16x8(sum);
   const uint8x8_t dc0 = vrshrn_n_u16(sum, 4);
-  dc_store_8x8(dst, stride, dc0);
+  dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
 }
 
 void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
@@ -122,7 +122,7 @@
   const uint16x8_t sum_left = dc_load_sum_8(left);
   const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 3);
   (void)above;
-  dc_store_8x8(dst, stride, dc0);
+  dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
 }
 
 void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
@@ -130,7 +130,7 @@
   const uint16x8_t sum_top = dc_load_sum_8(above);
   const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 3);
   (void)left;
-  dc_store_8x8(dst, stride, dc0);
+  dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
 }
 
 void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
@@ -138,13 +138,13 @@
   const uint8x8_t dc0 = vdup_n_u8(0x80);
   (void)above;
   (void)left;
-  dc_store_8x8(dst, stride, dc0);
+  dc_store_8xh(dst, stride, 8, dc0);
 }
 
 //------------------------------------------------------------------------------
 // DC 16x16
 
-static INLINE uint16x8_t dc_load_sum_16(const uint8_t *in) {
+static INLINE uint16x8_t dc_load_partial_sum_16(const uint8_t *in) {
   const uint8x16_t a = vld1q_u8(in);
   // delay the remainder of the reduction until
   // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
@@ -152,57 +152,58 @@
   return vpaddlq_u8(a);
 }
 
-static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride,
-                                  uint8x8_t dc0) {
-  const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
-  for (int i = 0; i < 16; ++i) {
+static INLINE uint16x8_t dc_load_sum_16(const uint8_t *in) {
+  return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_16(in));
+}
+
+static INLINE void dc_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
+                                 uint8x16_t dc) {
+  for (int i = 0; i < h; ++i) {
     vst1q_u8(dst + i * stride, dc);
   }
 }
 
 void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
-  const uint16x8_t sum_top = dc_load_sum_16(above);
-  const uint16x8_t sum_left = dc_load_sum_16(left);
+  const uint16x8_t sum_top = dc_load_partial_sum_16(above);
+  const uint16x8_t sum_left = dc_load_partial_sum_16(left);
   uint16x8_t sum = vaddq_u16(sum_left, sum_top);
   sum = horizontal_add_and_broadcast_u16x8(sum);
   const uint8x8_t dc0 = vrshrn_n_u16(sum, 5);
-  dc_store_16x16(dst, stride, dc0);
+  dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
 }
 
 void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
-  uint16x8_t sum_left = dc_load_sum_16(left);
-  sum_left = horizontal_add_and_broadcast_u16x8(sum_left);
+  const uint16x8_t sum_left = dc_load_sum_16(left);
   const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 4);
   (void)above;
-  dc_store_16x16(dst, stride, dc0);
+  dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
 }
 
 void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
-  uint16x8_t sum_top = dc_load_sum_16(above);
-  sum_top = horizontal_add_and_broadcast_u16x8(sum_top);
+  const uint16x8_t sum_top = dc_load_sum_16(above);
   const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 4);
   (void)left;
-  dc_store_16x16(dst, stride, dc0);
+  dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
 }
 
 void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
-  const uint8x8_t dc0 = vdup_n_u8(0x80);
+  const uint8x16_t dc0 = vdupq_n_u8(0x80);
   (void)above;
   (void)left;
-  dc_store_16x16(dst, stride, dc0);
+  dc_store_16xh(dst, stride, 16, dc0);
 }
 
 //------------------------------------------------------------------------------
 // DC 32x32
 
-static INLINE uint16x8_t dc_load_sum_32(const uint8_t *in) {
+static INLINE uint16x8_t dc_load_partial_sum_32(const uint8_t *in) {
   const uint8x16_t a0 = vld1q_u8(in);
   const uint8x16_t a1 = vld1q_u8(in + 16);
   // delay the remainder of the reduction until
@@ -211,10 +212,13 @@
   return vpadalq_u8(vpaddlq_u8(a0), a1);
 }
 
-static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride,
-                                  uint8x8_t dc0) {
-  const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
-  for (int i = 0; i < 32; ++i) {
+static INLINE uint16x8_t dc_load_sum_32(const uint8_t *in) {
+  return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_32(in));
+}
+
+static INLINE void dc_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
+                                 uint8x16_t dc) {
+  for (int i = 0; i < h; ++i) {
     vst1q_u8(dst + i * stride, dc);
     vst1q_u8(dst + i * stride + 16, dc);
   }
@@ -222,47 +226,45 @@
 
 void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
-  const uint16x8_t sum_top = dc_load_sum_32(above);
-  const uint16x8_t sum_left = dc_load_sum_32(left);
+  const uint16x8_t sum_top = dc_load_partial_sum_32(above);
+  const uint16x8_t sum_left = dc_load_partial_sum_32(left);
   uint16x8_t sum = vaddq_u16(sum_left, sum_top);
   sum = horizontal_add_and_broadcast_u16x8(sum);
   const uint8x8_t dc0 = vrshrn_n_u16(sum, 6);
-  dc_store_32x32(dst, stride, dc0);
+  dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
 }
 
 void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
-  uint16x8_t sum_left = dc_load_sum_32(left);
-  sum_left = horizontal_add_and_broadcast_u16x8(sum_left);
+  const uint16x8_t sum_left = dc_load_sum_32(left);
   const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 5);
   (void)above;
-  dc_store_32x32(dst, stride, dc0);
+  dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
 }
 
 void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
-  uint16x8_t sum_top = dc_load_sum_32(above);
-  sum_top = horizontal_add_and_broadcast_u16x8(sum_top);
+  const uint16x8_t sum_top = dc_load_sum_32(above);
   const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 5);
   (void)left;
-  dc_store_32x32(dst, stride, dc0);
+  dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
 }
 
 void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
-  const uint8x8_t dc0 = vdup_n_u8(0x80);
+  const uint8x16_t dc0 = vdupq_n_u8(0x80);
   (void)above;
   (void)left;
-  dc_store_32x32(dst, stride, dc0);
+  dc_store_32xh(dst, stride, 32, dc0);
 }
 
 //------------------------------------------------------------------------------
 // DC 64x64
 
-static INLINE uint16x8_t dc_load_sum_64(const uint8_t *in) {
+static INLINE uint16x8_t dc_load_partial_sum_64(const uint8_t *in) {
   const uint8x16_t a0 = vld1q_u8(in);
   const uint8x16_t a1 = vld1q_u8(in + 16);
   const uint8x16_t a2 = vld1q_u8(in + 32);
@@ -275,10 +277,13 @@
   return vaddq_u16(p01, p23);
 }
 
-static INLINE void dc_store_64x64(uint8_t *dst, ptrdiff_t stride,
-                                  uint8x8_t dc0) {
-  const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
-  for (int i = 0; i < 64; ++i) {
+static INLINE uint16x8_t dc_load_sum_64(const uint8_t *in) {
+  return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_64(in));
+}
+
+static INLINE void dc_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
+                                 uint8x16_t dc) {
+  for (int i = 0; i < h; ++i) {
     vst1q_u8(dst + i * stride, dc);
     vst1q_u8(dst + i * stride + 16, dc);
     vst1q_u8(dst + i * stride + 32, dc);
@@ -288,43 +293,286 @@
 
 void aom_dc_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
-  const uint16x8_t sum_top = dc_load_sum_64(above);
-  const uint16x8_t sum_left = dc_load_sum_64(left);
+  const uint16x8_t sum_top = dc_load_partial_sum_64(above);
+  const uint16x8_t sum_left = dc_load_partial_sum_64(left);
   uint16x8_t sum = vaddq_u16(sum_left, sum_top);
   sum = horizontal_add_and_broadcast_u16x8(sum);
   const uint8x8_t dc0 = vrshrn_n_u16(sum, 7);
-  dc_store_64x64(dst, stride, dc0);
+  dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
 }
 
 void aom_dc_left_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
-  uint16x8_t sum_left = dc_load_sum_64(left);
-  sum_left = horizontal_add_and_broadcast_u16x8(sum_left);
+  const uint16x8_t sum_left = dc_load_sum_64(left);
   const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 6);
   (void)above;
-  dc_store_64x64(dst, stride, dc0);
+  dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
 }
 
 void aom_dc_top_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
-  uint16x8_t sum_top = dc_load_sum_64(above);
-  sum_top = horizontal_add_and_broadcast_u16x8(sum_top);
+  const uint16x8_t sum_top = dc_load_sum_64(above);
   const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 6);
   (void)left;
-  dc_store_64x64(dst, stride, dc0);
+  dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
 }
 
 void aom_dc_128_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
-  const uint8x8_t dc0 = vdup_n_u8(0x80);
+  const uint8x16_t dc0 = vdupq_n_u8(0x80);
   (void)above;
   (void)left;
-  dc_store_64x64(dst, stride, dc0);
+  dc_store_64xh(dst, stride, 64, dc0);
 }
 
+//------------------------------------------------------------------------------
+// DC rectangular cases
+
+#define DC_MULTIPLIER_1X2 0x5556
+#define DC_MULTIPLIER_1X4 0x3334
+
+#define DC_SHIFT2 16
+
+static INLINE int divide_using_multiply_shift(int num, int shift1,
+                                              int multiplier, int shift2) {
+  const int interm = num >> shift1;
+  return interm * multiplier >> shift2;
+}
+
+static INLINE int calculate_dc_from_sum(int bw, int bh, uint32_t sum,
+                                        int shift1, int multiplier) {
+  const int expected_dc = divide_using_multiply_shift(
+      sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
+  assert(expected_dc < (1 << 8));
+  return expected_dc;
+}
+
+#undef DC_SHIFT2
+
+void aom_dc_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  uint8x8_t a = load_u8_4x1_lane0(above);
+  uint8x8_t l = vld1_u8(left);
+  uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
+  uint32_t dc = calculate_dc_from_sum(4, 8, sum, 2, DC_MULTIPLIER_1X2);
+  dc_store_4xh(dst, stride, 8, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  uint8x8_t a = vld1_u8(above);
+  uint8x8_t l = load_u8_4x1_lane0(left);
+  uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
+  uint32_t dc = calculate_dc_from_sum(8, 4, sum, 2, DC_MULTIPLIER_1X2);
+  dc_store_8xh(dst, stride, 4, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  uint8x8_t a = load_u8_4x1_lane0(above);
+  uint8x16_t l = vld1q_u8(left);
+  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
+  uint32_t sum = horizontal_add_u16x8(sum_al);
+  uint32_t dc = calculate_dc_from_sum(4, 16, sum, 2, DC_MULTIPLIER_1X4);
+  dc_store_4xh(dst, stride, 16, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  uint8x16_t a = vld1q_u8(above);
+  uint8x8_t l = load_u8_4x1_lane0(left);
+  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
+  uint32_t sum = horizontal_add_u16x8(sum_al);
+  uint32_t dc = calculate_dc_from_sum(16, 4, sum, 2, DC_MULTIPLIER_1X4);
+  dc_store_16xh(dst, stride, 4, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  uint8x8_t a = vld1_u8(above);
+  uint8x16_t l = vld1q_u8(left);
+  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
+  uint32_t sum = horizontal_add_u16x8(sum_al);
+  uint32_t dc = calculate_dc_from_sum(8, 16, sum, 3, DC_MULTIPLIER_1X2);
+  dc_store_8xh(dst, stride, 16, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  uint8x16_t a = vld1q_u8(above);
+  uint8x8_t l = vld1_u8(left);
+  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
+  uint32_t sum = horizontal_add_u16x8(sum_al);
+  uint32_t dc = calculate_dc_from_sum(16, 8, sum, 3, DC_MULTIPLIER_1X2);
+  dc_store_16xh(dst, stride, 8, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  uint8x8_t a = vld1_u8(above);
+  uint16x8_t sum_left = dc_load_partial_sum_32(left);
+  uint16x8_t sum_al = vaddw_u8(sum_left, a);
+  uint32_t sum = horizontal_add_u16x8(sum_al);
+  uint32_t dc = calculate_dc_from_sum(8, 32, sum, 3, DC_MULTIPLIER_1X4);
+  dc_store_8xh(dst, stride, 32, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  uint16x8_t sum_top = dc_load_partial_sum_32(above);
+  uint8x8_t l = vld1_u8(left);
+  uint16x8_t sum_al = vaddw_u8(sum_top, l);
+  uint32_t sum = horizontal_add_u16x8(sum_al);
+  uint32_t dc = calculate_dc_from_sum(32, 8, sum, 3, DC_MULTIPLIER_1X4);
+  dc_store_32xh(dst, stride, 8, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  uint16x8_t sum_above = dc_load_partial_sum_16(above);
+  uint16x8_t sum_left = dc_load_partial_sum_32(left);
+  uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
+  uint32_t sum = horizontal_add_u16x8(sum_al);
+  uint32_t dc = calculate_dc_from_sum(16, 32, sum, 4, DC_MULTIPLIER_1X2);
+  dc_store_16xh(dst, stride, 32, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  uint16x8_t sum_above = dc_load_partial_sum_32(above);
+  uint16x8_t sum_left = dc_load_partial_sum_16(left);
+  uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
+  uint32_t sum = horizontal_add_u16x8(sum_al);
+  uint32_t dc = calculate_dc_from_sum(32, 16, sum, 4, DC_MULTIPLIER_1X2);
+  dc_store_32xh(dst, stride, 16, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  uint16x8_t sum_above = dc_load_partial_sum_16(above);
+  uint16x8_t sum_left = dc_load_partial_sum_64(left);
+  uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
+  uint32_t sum = horizontal_add_u16x8(sum_al);
+  uint32_t dc = calculate_dc_from_sum(16, 64, sum, 4, DC_MULTIPLIER_1X4);
+  dc_store_16xh(dst, stride, 64, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  uint16x8_t sum_above = dc_load_partial_sum_64(above);
+  uint16x8_t sum_left = dc_load_partial_sum_16(left);
+  uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
+  uint32_t sum = horizontal_add_u16x8(sum_al);
+  uint32_t dc = calculate_dc_from_sum(64, 16, sum, 4, DC_MULTIPLIER_1X4);
+  dc_store_64xh(dst, stride, 16, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  uint16x8_t sum_above = dc_load_partial_sum_32(above);
+  uint16x8_t sum_left = dc_load_partial_sum_64(left);
+  uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
+  uint32_t sum = horizontal_add_u16x8(sum_al);
+  uint32_t dc = calculate_dc_from_sum(32, 64, sum, 5, DC_MULTIPLIER_1X2);
+  dc_store_32xh(dst, stride, 64, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  uint16x8_t sum_above = dc_load_partial_sum_64(above);
+  uint16x8_t sum_left = dc_load_partial_sum_32(left);
+  uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
+  uint32_t sum = horizontal_add_u16x8(sum_al);
+  uint32_t dc = calculate_dc_from_sum(64, 32, sum, 5, DC_MULTIPLIER_1X2);
+  dc_store_64xh(dst, stride, 32, vdupq_n_u8(dc));
+}
+
+#undef DC_MULTIPLIER_1X2
+#undef DC_MULTIPLIER_1X4
+
+#define DC_PREDICTOR_128(w, h, q)                                            \
+  void aom_dc_128_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
+                                             const uint8_t *above,           \
+                                             const uint8_t *left) {          \
+    (void)above;                                                             \
+    (void)left;                                                              \
+    dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u8(0x80));                \
+  }
+
+DC_PREDICTOR_128(4, 8, )
+DC_PREDICTOR_128(4, 16, )
+DC_PREDICTOR_128(8, 4, )
+DC_PREDICTOR_128(8, 16, )
+DC_PREDICTOR_128(8, 32, )
+DC_PREDICTOR_128(16, 4, q)
+DC_PREDICTOR_128(16, 8, q)
+DC_PREDICTOR_128(16, 32, q)
+DC_PREDICTOR_128(16, 64, q)
+DC_PREDICTOR_128(32, 8, q)
+DC_PREDICTOR_128(32, 16, q)
+DC_PREDICTOR_128(32, 64, q)
+DC_PREDICTOR_128(64, 32, q)
+DC_PREDICTOR_128(64, 16, q)
+
+#undef DC_PREDICTOR_128
+
+#define DC_PREDICTOR_LEFT(w, h, shift, q)                                     \
+  void aom_dc_left_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
+                                              const uint8_t *above,           \
+                                              const uint8_t *left) {          \
+    (void)above;                                                              \
+    const uint16x8_t sum = dc_load_sum_##h(left);                             \
+    const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift));                         \
+    dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0));            \
+  }
+
+DC_PREDICTOR_LEFT(4, 8, 3, )
+DC_PREDICTOR_LEFT(8, 4, 2, )
+DC_PREDICTOR_LEFT(8, 16, 4, )
+DC_PREDICTOR_LEFT(16, 8, 3, q)
+DC_PREDICTOR_LEFT(16, 32, 5, q)
+DC_PREDICTOR_LEFT(32, 16, 4, q)
+DC_PREDICTOR_LEFT(32, 64, 6, q)
+DC_PREDICTOR_LEFT(64, 32, 5, q)
+DC_PREDICTOR_LEFT(4, 16, 4, )
+DC_PREDICTOR_LEFT(16, 4, 2, q)
+DC_PREDICTOR_LEFT(8, 32, 5, )
+DC_PREDICTOR_LEFT(32, 8, 3, q)
+DC_PREDICTOR_LEFT(16, 64, 6, q)
+DC_PREDICTOR_LEFT(64, 16, 4, q)
+
+#undef DC_PREDICTOR_LEFT
+
+#define DC_PREDICTOR_TOP(w, h, shift, q)                                     \
+  void aom_dc_top_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
+                                             const uint8_t *above,           \
+                                             const uint8_t *left) {          \
+    (void)left;                                                              \
+    const uint16x8_t sum = dc_load_sum_##w(above);                           \
+    const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift));                        \
+    dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0));           \
+  }
+
+DC_PREDICTOR_TOP(4, 8, 2, )
+DC_PREDICTOR_TOP(4, 16, 2, )
+DC_PREDICTOR_TOP(8, 4, 3, )
+DC_PREDICTOR_TOP(8, 16, 3, )
+DC_PREDICTOR_TOP(8, 32, 3, )
+DC_PREDICTOR_TOP(16, 4, 4, q)
+DC_PREDICTOR_TOP(16, 8, 4, q)
+DC_PREDICTOR_TOP(16, 32, 4, q)
+DC_PREDICTOR_TOP(16, 64, 4, q)
+DC_PREDICTOR_TOP(32, 8, 5, q)
+DC_PREDICTOR_TOP(32, 16, 5, q)
+DC_PREDICTOR_TOP(32, 64, 5, q)
+DC_PREDICTOR_TOP(64, 16, 6, q)
+DC_PREDICTOR_TOP(64, 32, 6, q)
+
+#undef DC_PREDICTOR_TOP
+
 // -----------------------------------------------------------------------------
 
 void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 58f5226..30328f7 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -94,6 +94,13 @@
         vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \
   } while (0)
 
+// Load four bytes into the low half of a uint8x8_t, zero the upper half.
+static INLINE uint8x8_t load_u8_4x1_lane0(const uint8_t *p) {
+  uint8x8_t ret = vdup_n_u8(0);
+  load_u8_4x1(p, &ret, 0);
+  return ret;
+}
+
 // Load 2 sets of 4 bytes when alignment is guaranteed.
 static INLINE uint8x8_t load_u8(const uint8_t *buf, ptrdiff_t stride) {
   uint32x2_t a = vdup_n_u32(0);
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index 4804476..01a5dc2 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -340,30 +340,17 @@
 
 #if HAVE_NEON
 const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorNeon[] = {
-  lowbd_entry(dc, 4, 4, neon),        lowbd_entry(dc, 8, 8, neon),
-  lowbd_entry(dc, 16, 16, neon),      lowbd_entry(dc, 32, 32, neon),
-  lowbd_entry(dc, 64, 64, neon),
+  lowbd_intrapred(dc, neon),       lowbd_intrapred(dc_top, neon),
+  lowbd_intrapred(dc_left, neon),  lowbd_intrapred(dc_128, neon),
 
-  lowbd_entry(dc_top, 4, 4, neon),    lowbd_entry(dc_top, 8, 8, neon),
-  lowbd_entry(dc_top, 16, 16, neon),  lowbd_entry(dc_top, 32, 32, neon),
-  lowbd_entry(dc_top, 64, 64, neon),
+  lowbd_entry(v, 4, 4, neon),      lowbd_entry(v, 8, 8, neon),
+  lowbd_entry(v, 16, 16, neon),    lowbd_entry(v, 32, 32, neon),
 
-  lowbd_entry(dc_left, 4, 4, neon),   lowbd_entry(dc_left, 8, 8, neon),
-  lowbd_entry(dc_left, 16, 16, neon), lowbd_entry(dc_left, 32, 32, neon),
-  lowbd_entry(dc_left, 64, 64, neon),
+  lowbd_entry(h, 4, 4, neon),      lowbd_entry(h, 8, 8, neon),
+  lowbd_entry(h, 16, 16, neon),    lowbd_entry(h, 32, 32, neon),
 
-  lowbd_entry(dc_128, 4, 4, neon),    lowbd_entry(dc_128, 8, 8, neon),
-  lowbd_entry(dc_128, 16, 16, neon),  lowbd_entry(dc_128, 32, 32, neon),
-  lowbd_entry(dc_128, 64, 64, neon),
-
-  lowbd_entry(v, 4, 4, neon),         lowbd_entry(v, 8, 8, neon),
-  lowbd_entry(v, 16, 16, neon),       lowbd_entry(v, 32, 32, neon),
-
-  lowbd_entry(h, 4, 4, neon),         lowbd_entry(h, 8, 8, neon),
-  lowbd_entry(h, 16, 16, neon),       lowbd_entry(h, 32, 32, neon),
-
-  lowbd_intrapred(smooth, neon),      lowbd_intrapred(smooth_v, neon),
-  lowbd_intrapred(smooth_h, neon),    lowbd_intrapred(paeth, neon),
+  lowbd_intrapred(smooth, neon),   lowbd_intrapred(smooth_v, neon),
+  lowbd_intrapred(smooth_h, neon), lowbd_intrapred(paeth, neon),
 };
 
 INSTANTIATE_TEST_SUITE_P(NEON, LowbdIntraPredTest,
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 82765ce..526f678 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -468,13 +468,16 @@
                 aom_h_predictor_4x4_neon, aom_paeth_predictor_4x4_neon,
                 aom_smooth_predictor_4x4_neon, aom_smooth_v_predictor_4x4_neon,
                 aom_smooth_h_predictor_4x4_neon)
-INTRA_PRED_TEST(NEON, TX_4X8, nullptr, nullptr, nullptr, nullptr, nullptr,
-                nullptr, aom_paeth_predictor_4x8_neon,
-                aom_smooth_predictor_4x8_neon, aom_smooth_v_predictor_4x8_neon,
+INTRA_PRED_TEST(NEON, TX_4X8, aom_dc_predictor_4x8_neon,
+                aom_dc_left_predictor_4x8_neon, aom_dc_top_predictor_4x8_neon,
+                aom_dc_128_predictor_4x8_neon, nullptr, nullptr,
+                aom_paeth_predictor_4x8_neon, aom_smooth_predictor_4x8_neon,
+                aom_smooth_v_predictor_4x8_neon,
                 aom_smooth_h_predictor_4x8_neon)
-INTRA_PRED_TEST(NEON, TX_4X16, nullptr, nullptr, nullptr, nullptr, nullptr,
-                nullptr, aom_paeth_predictor_4x16_neon,
-                aom_smooth_predictor_4x16_neon,
+INTRA_PRED_TEST(NEON, TX_4X16, aom_dc_predictor_4x16_neon,
+                aom_dc_left_predictor_4x16_neon, aom_dc_top_predictor_4x16_neon,
+                aom_dc_128_predictor_4x16_neon, nullptr, nullptr,
+                aom_paeth_predictor_4x16_neon, aom_smooth_predictor_4x16_neon,
                 aom_smooth_v_predictor_4x16_neon,
                 aom_smooth_h_predictor_4x16_neon)
 #endif  // HAVE_NEON
@@ -555,18 +558,22 @@
                 aom_h_predictor_8x8_neon, aom_paeth_predictor_8x8_neon,
                 aom_smooth_predictor_8x8_neon, aom_smooth_v_predictor_8x8_neon,
                 aom_smooth_h_predictor_8x8_neon)
-INTRA_PRED_TEST(NEON, TX_8X4, nullptr, nullptr, nullptr, nullptr, nullptr,
-                nullptr, aom_paeth_predictor_8x4_neon,
-                aom_smooth_predictor_8x4_neon, aom_smooth_v_predictor_8x4_neon,
+INTRA_PRED_TEST(NEON, TX_8X4, aom_dc_predictor_8x4_neon,
+                aom_dc_left_predictor_8x4_neon, aom_dc_top_predictor_8x4_neon,
+                aom_dc_128_predictor_8x4_neon, nullptr, nullptr,
+                aom_paeth_predictor_8x4_neon, aom_smooth_predictor_8x4_neon,
+                aom_smooth_v_predictor_8x4_neon,
                 aom_smooth_h_predictor_8x4_neon)
-INTRA_PRED_TEST(NEON, TX_8X16, nullptr, nullptr, nullptr, nullptr, nullptr,
-                nullptr, aom_paeth_predictor_8x16_neon,
-                aom_smooth_predictor_8x16_neon,
+INTRA_PRED_TEST(NEON, TX_8X16, aom_dc_predictor_8x16_neon,
+                aom_dc_left_predictor_8x16_neon, aom_dc_top_predictor_8x16_neon,
+                aom_dc_128_predictor_8x16_neon, nullptr, nullptr,
+                aom_paeth_predictor_8x16_neon, aom_smooth_predictor_8x16_neon,
                 aom_smooth_v_predictor_8x16_neon,
                 aom_smooth_h_predictor_8x16_neon)
-INTRA_PRED_TEST(NEON, TX_8X32, nullptr, nullptr, nullptr, nullptr, nullptr,
-                nullptr, aom_paeth_predictor_8x32_neon,
-                aom_smooth_predictor_8x32_neon,
+INTRA_PRED_TEST(NEON, TX_8X32, aom_dc_predictor_8x32_neon,
+                aom_dc_left_predictor_8x32_neon, aom_dc_top_predictor_8x32_neon,
+                aom_dc_128_predictor_8x32_neon, nullptr, nullptr,
+                aom_paeth_predictor_8x32_neon, aom_smooth_predictor_8x32_neon,
                 aom_smooth_v_predictor_8x32_neon,
                 aom_smooth_h_predictor_8x32_neon)
 #endif  // HAVE_NEON
@@ -683,24 +690,30 @@
                 aom_smooth_predictor_16x16_neon,
                 aom_smooth_v_predictor_16x16_neon,
                 aom_smooth_h_predictor_16x16_neon)
-INTRA_PRED_TEST(NEON, TX_16X8, nullptr, nullptr, nullptr, nullptr, nullptr,
-                nullptr, aom_paeth_predictor_16x8_neon,
-                aom_smooth_predictor_16x8_neon,
+INTRA_PRED_TEST(NEON, TX_16X8, aom_dc_predictor_16x8_neon,
+                aom_dc_left_predictor_16x8_neon, aom_dc_top_predictor_16x8_neon,
+                aom_dc_128_predictor_16x8_neon, nullptr, nullptr,
+                aom_paeth_predictor_16x8_neon, aom_smooth_predictor_16x8_neon,
                 aom_smooth_v_predictor_16x8_neon,
                 aom_smooth_h_predictor_16x8_neon)
-INTRA_PRED_TEST(NEON, TX_16X32, nullptr, nullptr, nullptr, nullptr, nullptr,
-                nullptr, aom_paeth_predictor_16x32_neon,
-                aom_smooth_predictor_16x32_neon,
+INTRA_PRED_TEST(NEON, TX_16X32, aom_dc_predictor_16x32_neon,
+                aom_dc_left_predictor_16x32_neon,
+                aom_dc_top_predictor_16x32_neon,
+                aom_dc_128_predictor_16x32_neon, nullptr, nullptr,
+                aom_paeth_predictor_16x32_neon, aom_smooth_predictor_16x32_neon,
                 aom_smooth_v_predictor_16x32_neon,
                 aom_smooth_h_predictor_16x32_neon)
-INTRA_PRED_TEST(NEON, TX_16X4, nullptr, nullptr, nullptr, nullptr, nullptr,
-                nullptr, aom_paeth_predictor_16x4_neon,
-                aom_smooth_predictor_16x4_neon,
+INTRA_PRED_TEST(NEON, TX_16X4, aom_dc_predictor_16x4_neon,
+                aom_dc_left_predictor_16x4_neon, aom_dc_top_predictor_16x4_neon,
+                aom_dc_128_predictor_16x4_neon, nullptr, nullptr,
+                aom_paeth_predictor_16x4_neon, aom_smooth_predictor_16x4_neon,
                 aom_smooth_v_predictor_16x4_neon,
                 aom_smooth_h_predictor_16x4_neon)
-INTRA_PRED_TEST(NEON, TX_16X64, nullptr, nullptr, nullptr, nullptr, nullptr,
-                nullptr, aom_paeth_predictor_16x64_neon,
-                aom_smooth_predictor_16x64_neon,
+INTRA_PRED_TEST(NEON, TX_16X64, aom_dc_predictor_16x64_neon,
+                aom_dc_left_predictor_16x64_neon,
+                aom_dc_top_predictor_16x64_neon,
+                aom_dc_128_predictor_16x64_neon, nullptr, nullptr,
+                aom_paeth_predictor_16x64_neon, aom_smooth_predictor_16x64_neon,
                 aom_smooth_v_predictor_16x64_neon,
                 aom_smooth_h_predictor_16x64_neon)
 #endif  // HAVE_NEON
@@ -808,19 +821,24 @@
                 aom_smooth_predictor_32x32_neon,
                 aom_smooth_v_predictor_32x32_neon,
                 aom_smooth_h_predictor_32x32_neon)
-INTRA_PRED_TEST(NEON, TX_32X16, nullptr, nullptr, nullptr, nullptr, nullptr,
-                nullptr, aom_paeth_predictor_32x16_neon,
-                aom_smooth_predictor_32x16_neon,
+INTRA_PRED_TEST(NEON, TX_32X16, aom_dc_predictor_32x16_neon,
+                aom_dc_left_predictor_32x16_neon,
+                aom_dc_top_predictor_32x16_neon,
+                aom_dc_128_predictor_32x16_neon, nullptr, nullptr,
+                aom_paeth_predictor_32x16_neon, aom_smooth_predictor_32x16_neon,
                 aom_smooth_v_predictor_32x16_neon,
                 aom_smooth_h_predictor_32x16_neon)
-INTRA_PRED_TEST(NEON, TX_32X64, nullptr, nullptr, nullptr, nullptr, nullptr,
-                nullptr, aom_paeth_predictor_32x64_neon,
-                aom_smooth_predictor_32x64_neon,
+INTRA_PRED_TEST(NEON, TX_32X64, aom_dc_predictor_32x64_neon,
+                aom_dc_left_predictor_32x64_neon,
+                aom_dc_top_predictor_32x64_neon,
+                aom_dc_128_predictor_32x64_neon, nullptr, nullptr,
+                aom_paeth_predictor_32x64_neon, aom_smooth_predictor_32x64_neon,
                 aom_smooth_v_predictor_32x64_neon,
                 aom_smooth_h_predictor_32x64_neon)
-INTRA_PRED_TEST(NEON, TX_32X8, nullptr, nullptr, nullptr, nullptr, nullptr,
-                nullptr, aom_paeth_predictor_32x8_neon,
-                aom_smooth_predictor_32x8_neon,
+INTRA_PRED_TEST(NEON, TX_32X8, aom_dc_predictor_32x8_neon,
+                aom_dc_left_predictor_32x8_neon, aom_dc_top_predictor_32x8_neon,
+                aom_dc_128_predictor_32x8_neon, nullptr, nullptr,
+                aom_paeth_predictor_32x8_neon, aom_smooth_predictor_32x8_neon,
                 aom_smooth_v_predictor_32x8_neon,
                 aom_smooth_h_predictor_32x8_neon)
 #endif  // HAVE_NEON
@@ -912,14 +930,18 @@
                 aom_paeth_predictor_64x64_neon, aom_smooth_predictor_64x64_neon,
                 aom_smooth_v_predictor_64x64_neon,
                 aom_smooth_h_predictor_64x64_neon)
-INTRA_PRED_TEST(NEON, TX_64X32, nullptr, nullptr, nullptr, nullptr, nullptr,
-                nullptr, aom_paeth_predictor_64x32_neon,
-                aom_smooth_predictor_64x32_neon,
+INTRA_PRED_TEST(NEON, TX_64X32, aom_dc_predictor_64x32_neon,
+                aom_dc_left_predictor_64x32_neon,
+                aom_dc_top_predictor_64x32_neon,
+                aom_dc_128_predictor_64x32_neon, nullptr, nullptr,
+                aom_paeth_predictor_64x32_neon, aom_smooth_predictor_64x32_neon,
                 aom_smooth_v_predictor_64x32_neon,
                 aom_smooth_h_predictor_64x32_neon)
-INTRA_PRED_TEST(NEON, TX_64X16, nullptr, nullptr, nullptr, nullptr, nullptr,
-                nullptr, aom_paeth_predictor_64x16_neon,
-                aom_smooth_predictor_64x16_neon,
+INTRA_PRED_TEST(NEON, TX_64X16, aom_dc_predictor_64x16_neon,
+                aom_dc_left_predictor_64x16_neon,
+                aom_dc_top_predictor_64x16_neon,
+                aom_dc_128_predictor_64x16_neon, nullptr, nullptr,
+                aom_paeth_predictor_64x16_neon, aom_smooth_predictor_64x16_neon,
                 aom_smooth_v_predictor_64x16_neon,
                 aom_smooth_h_predictor_64x16_neon)
 #endif  // HAVE_NEON