Add Neon implementation of aom_highbd_avg functions

Add a Neon implementation of aom_highbd_avg_8x8, as well as the
corresponding tests. Also refactor the existing Neon implementation of
aom_highbd_avg_4x4 and move it out of aom_dsp/arm/avg_neon.c into a
separate high bit-depth file, aom_dsp/arm/highbd_avg_neon.c.

This is a straightforward port of this libvpx patch:
https://chromium-review.googlesource.com/c/webm/libvpx/+/4359871

Change-Id: Ib3e19047f309480abffe49740e3b21c912988022
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 6dae95e..884dd14 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -289,6 +289,7 @@
                 "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c")
 
     list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
+                "${AOM_ROOT}/aom_dsp/arm/highbd_avg_neon.c"
                 "${AOM_ROOT}/aom_dsp/arm/highbd_quantize_neon.c"
                 "${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon.c")
   endif()
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 19925d5..fb2d934 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1189,6 +1189,7 @@
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
     add_proto qw/unsigned int aom_highbd_avg_8x8/, "const uint8_t *, int p";
+    specialize qw/aom_highbd_avg_8x8 neon/;
     add_proto qw/unsigned int aom_highbd_avg_4x4/, "const uint8_t *, int p";
     specialize qw/aom_highbd_avg_4x4 neon/;
     add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index bac50ca..fa9a141 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -192,31 +192,6 @@
   return var;
 }
 
-#if CONFIG_AV1_HIGHBITDEPTH
-unsigned int aom_highbd_avg_4x4_neon(const uint8_t *s, int p) {
-  const uint16_t *src = CONVERT_TO_SHORTPTR(s);
-  const uint16x4_t r0 = vld1_u16(src);
-  src += p;
-  uint16x4_t r1, r2, r3;
-  r1 = vld1_u16(src);
-  src += p;
-  r2 = vld1_u16(src);
-  src += p;
-  r3 = vld1_u16(src);
-  const uint16x4_t s1 = vadd_u16(r0, r1);
-  const uint16x4_t s2 = vadd_u16(r2, r3);
-  const uint16x4_t s3 = vadd_u16(s1, s2);
-#if defined(__aarch64__)
-  return (vaddv_u16(s3) + 8) >> 4;
-#else
-  const uint16x4_t h1 = vpadd_u16(s3, s3);
-  const uint16x4_t h2 = vpadd_u16(h1, h1);
-  const uint16x4_t res = vrshr_n_u16(h2, 4);
-  return vget_lane_u16(res, 0);
-#endif
-}
-#endif  // CONFIG_AV1_HIGHBITDEPTH
-
 void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int *min, int *max) {
   // Load and concatenate.
diff --git a/aom_dsp/arm/highbd_avg_neon.c b/aom_dsp/arm/highbd_avg_neon.c
new file mode 100644
index 0000000..41c8903
--- /dev/null
+++ b/aom_dsp/arm/highbd_avg_neon.c
@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ *  This source code is subject to the terms of the BSD 2 Clause License and
+ *  the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ *  was not distributed with this source code in the LICENSE file, you can
+ *  obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ *  Media Patent License 1.0 was not distributed with this source code in the
+ *  PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_ports/mem.h"
+
+uint32_t aom_highbd_avg_4x4_neon(const uint8_t *a, int a_stride) {
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a);
+  uint16x4_t sum, a0, a1, a2, a3;
+  // Computes the rounded mean of the 16 samples that load_u16_4x4 fetches.
+  load_u16_4x4(a_ptr, a_stride, &a0, &a1, &a2, &a3);
+  // Lane-wise 16-bit adds; NOTE(review): assumes lane totals fit in uint16_t.
+  sum = vadd_u16(a0, a1);
+  sum = vadd_u16(sum, a2);
+  sum = vadd_u16(sum, a3);
+  // Adding 1 << 3 (half of 16) before the >> 4 rounds the quotient to nearest.
+  return (horizontal_add_u16x4(sum) + (1 << 3)) >> 4;
+}
+
+uint32_t aom_highbd_avg_8x8_neon(const uint8_t *a, int a_stride) {
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a);
+  uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7;
+  // Computes the rounded mean of 64 samples: eight rows of eight uint16_t.
+  load_u16_8x8(a_ptr, a_stride, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+  // Lane-wise 16-bit adds; NOTE(review): assumes lane totals fit in uint16_t.
+  sum = vaddq_u16(a0, a1);
+  sum = vaddq_u16(sum, a2);
+  sum = vaddq_u16(sum, a3);
+  sum = vaddq_u16(sum, a4);
+  sum = vaddq_u16(sum, a5);
+  sum = vaddq_u16(sum, a6);
+  sum = vaddq_u16(sum, a7);
+  // Adding 1 << 5 (half of 64) before the >> 6 rounds the quotient to nearest.
+  return (horizontal_add_u16x8(sum) + (1 << 5)) >> 6;
+}
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 91eb159..fb92d9b 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -878,6 +878,27 @@
   *s3 = vld1q_u8(s);
 }
 
+static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
+                                uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
+                                uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
+                                uint16x8_t *s6, uint16x8_t *s7) {
+  *s0 = vld1q_u16(s);  // Row 0: the eight uint16_t values starting at s.
+  s += p;              // Step p elements forward to the next row.
+  *s1 = vld1q_u16(s);  // Rows 1..7 repeat the same load-then-step pattern.
+  s += p;
+  *s2 = vld1q_u16(s);
+  s += p;
+  *s3 = vld1q_u16(s);
+  s += p;
+  *s4 = vld1q_u16(s);
+  s += p;
+  *s5 = vld1q_u16(s);
+  s += p;
+  *s6 = vld1q_u16(s);
+  s += p;
+  *s7 = vld1q_u16(s);  // Last row, so no further step follows.
+}
+
 static INLINE void load_u16_16x4(const uint16_t *s, ptrdiff_t p,
                                  uint16x8_t *const s0, uint16x8_t *const s1,
                                  uint16x8_t *const s2, uint16x8_t *const s3,
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 4e86f06..8865915 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -847,7 +847,13 @@
                       make_tuple(32, 32, 10, 15, 4, &aom_highbd_avg_4x4_neon),
                       make_tuple(16, 16, 12, 0, 4, &aom_highbd_avg_4x4_neon),
                       make_tuple(16, 16, 12, 5, 4, &aom_highbd_avg_4x4_neon),
-                      make_tuple(32, 32, 12, 15, 4, &aom_highbd_avg_4x4_neon)));
+                      make_tuple(32, 32, 12, 15, 4, &aom_highbd_avg_4x4_neon),
+                      make_tuple(16, 16, 10, 0, 8, &aom_highbd_avg_8x8_neon),
+                      make_tuple(16, 16, 10, 5, 8, &aom_highbd_avg_8x8_neon),
+                      make_tuple(32, 32, 10, 15, 8, &aom_highbd_avg_8x8_neon),
+                      make_tuple(16, 16, 12, 0, 8, &aom_highbd_avg_8x8_neon),
+                      make_tuple(16, 16, 12, 5, 8, &aom_highbd_avg_8x8_neon),
+                      make_tuple(32, 32, 12, 15, 8, &aom_highbd_avg_8x8_neon)));
 #endif  // HAVE_NEON
 #endif  // CONFIG_AV1_HIGHBITDEPTH