Add Neon implementation for 64x64 dc predictors

We already have Neon implementations of the dc predictors for all other
square block sizes, so add 64x64 and update tests/speed to match.

On Neoverse V1, this gives a small improvement over the C code when
built with Clang 15 (~3-5%) and a slightly larger improvement when
built with GCC 12 (~8-15%).

Change-Id: I225f13e2f6355e90fc66a04fa46c5d1775c32704
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 230faae..d2efcaf 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -103,7 +103,7 @@
 specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/;
 specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/;
 specialize qw/aom_dc_top_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x64 neon sse2 avx2/;
 
 specialize qw/aom_dc_left_predictor_4x4 neon sse2/;
 specialize qw/aom_dc_left_predictor_4x8 sse2/;
@@ -123,7 +123,7 @@
 specialize qw/aom_dc_left_predictor_32x64 sse2 avx2/;
 specialize qw/aom_dc_left_predictor_64x16 sse2 avx2/;
 specialize qw/aom_dc_left_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x64 neon sse2 avx2/;
 
 specialize qw/aom_dc_128_predictor_4x4 neon sse2/;
 specialize qw/aom_dc_128_predictor_4x8 sse2/;
@@ -143,7 +143,7 @@
 specialize qw/aom_dc_128_predictor_32x64 sse2 avx2/;
 specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/;
 specialize qw/aom_dc_128_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x64 neon sse2 avx2/;
 
 specialize qw/aom_v_predictor_4x4 neon sse2/;
 specialize qw/aom_v_predictor_4x8 sse2/;
@@ -283,7 +283,7 @@
 specialize qw/aom_dc_predictor_32x16 sse2 avx2/;
 specialize qw/aom_dc_predictor_32x32 neon sse2 avx2/;
 specialize qw/aom_dc_predictor_32x64 sse2 avx2/;
-specialize qw/aom_dc_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_predictor_64x64 neon sse2 avx2/;
 specialize qw/aom_dc_predictor_64x32 sse2 avx2/;
 specialize qw/aom_dc_predictor_64x16 sse2 avx2/;
 if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index f6348f4..a0ce62d 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -17,6 +17,7 @@
 
 #include "aom/aom_integer.h"
 #include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
 #include "aom_dsp/intrapred_common.h"
 
 //------------------------------------------------------------------------------
@@ -316,6 +317,95 @@
   dc_32x32(dst, stride, NULL, NULL, 0, 0);
 }
 
+//------------------------------------------------------------------------------
+// DC 64x64
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_64x64(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left,
+                            int do_above, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+
+  if (do_above) {
+    const uint8x16_t a0 = vld1q_u8(above);  // top row
+    const uint8x16_t a1 = vld1q_u8(above + 16);
+    const uint8x16_t a2 = vld1q_u8(above + 32);
+    const uint8x16_t a3 = vld1q_u8(above + 48);
+    const uint16x8_t p0 = vpaddlq_u8(a0);  // cascading summation of the top
+    const uint16x8_t p1 = vpaddlq_u8(a1);
+    const uint16x8_t p2 = vpaddlq_u8(a2);
+    const uint16x8_t p3 = vpaddlq_u8(a3);
+    const uint16x8_t p01 = vaddq_u16(p0, p1);
+    const uint16x8_t p23 = vaddq_u16(p2, p3);
+    sum_top = vdupq_n_u16(horizontal_add_u16x8(vaddq_u16(p01, p23)));
+  }
+
+  if (do_left) {
+    const uint8x16_t l0 = vld1q_u8(left);  // left row
+    const uint8x16_t l1 = vld1q_u8(left + 16);
+    const uint8x16_t l2 = vld1q_u8(left + 32);
+    const uint8x16_t l3 = vld1q_u8(left + 48);
+    const uint16x8_t p0 = vpaddlq_u8(l0);  // cascading summation of the left
+    const uint16x8_t p1 = vpaddlq_u8(l1);
+    const uint16x8_t p2 = vpaddlq_u8(l2);
+    const uint16x8_t p3 = vpaddlq_u8(l3);
+    const uint16x8_t p01 = vaddq_u16(p0, p1);
+    const uint16x8_t p23 = vaddq_u16(p2, p3);
+    sum_left = vdupq_n_u16(horizontal_add_u16x8(vaddq_u16(p01, p23)));
+  }
+
+  if (do_above && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 7);
+  } else if (do_above) {
+    dc0 = vrshrn_n_u16(sum_top, 6);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 6);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 64; ++i) {
+      vst1q_u8(dst + i * stride, dc);
+      vst1q_u8(dst + i * stride + 16, dc);
+      vst1q_u8(dst + i * stride + 32, dc);
+      vst1q_u8(dst + i * stride + 48, dc);
+    }
+  }
+}
+
+void aom_dc_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  dc_64x64(dst, stride, above, left, 1, 1);
+}
+
+void aom_dc_left_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  dc_64x64(dst, stride, NULL, left, 0, 1);
+}
+
+void aom_dc_top_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  dc_64x64(dst, stride, above, NULL, 1, 0);
+}
+
+void aom_dc_128_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  dc_64x64(dst, stride, NULL, NULL, 0, 0);
+}
+
 // -----------------------------------------------------------------------------
 
 void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index 3da9293..4804476 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -342,15 +342,19 @@
 const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorNeon[] = {
   lowbd_entry(dc, 4, 4, neon),        lowbd_entry(dc, 8, 8, neon),
   lowbd_entry(dc, 16, 16, neon),      lowbd_entry(dc, 32, 32, neon),
+  lowbd_entry(dc, 64, 64, neon),
 
   lowbd_entry(dc_top, 4, 4, neon),    lowbd_entry(dc_top, 8, 8, neon),
   lowbd_entry(dc_top, 16, 16, neon),  lowbd_entry(dc_top, 32, 32, neon),
+  lowbd_entry(dc_top, 64, 64, neon),
 
   lowbd_entry(dc_left, 4, 4, neon),   lowbd_entry(dc_left, 8, 8, neon),
   lowbd_entry(dc_left, 16, 16, neon), lowbd_entry(dc_left, 32, 32, neon),
+  lowbd_entry(dc_left, 64, 64, neon),
 
   lowbd_entry(dc_128, 4, 4, neon),    lowbd_entry(dc_128, 8, 8, neon),
   lowbd_entry(dc_128, 16, 16, neon),  lowbd_entry(dc_128, 32, 32, neon),
+  lowbd_entry(dc_128, 64, 64, neon),
 
   lowbd_entry(v, 4, 4, neon),         lowbd_entry(v, 8, 8, neon),
   lowbd_entry(v, 16, 16, neon),       lowbd_entry(v, 32, 32, neon),
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index bf90d4a..82765ce 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -905,9 +905,11 @@
 #endif
 
 #if HAVE_NEON
-INTRA_PRED_TEST(NEON, TX_64X64, nullptr, nullptr, nullptr, nullptr, nullptr,
-                nullptr, aom_paeth_predictor_64x64_neon,
-                aom_smooth_predictor_64x64_neon,
+INTRA_PRED_TEST(NEON, TX_64X64, aom_dc_predictor_64x64_neon,
+                aom_dc_left_predictor_64x64_neon,
+                aom_dc_top_predictor_64x64_neon,
+                aom_dc_128_predictor_64x64_neon, nullptr, nullptr,
+                aom_paeth_predictor_64x64_neon, aom_smooth_predictor_64x64_neon,
                 aom_smooth_v_predictor_64x64_neon,
                 aom_smooth_h_predictor_64x64_neon)
 INTRA_PRED_TEST(NEON, TX_64X32, nullptr, nullptr, nullptr, nullptr, nullptr,