Add Neon implementations of aom_avg_8x8 and aom_avg_4x4

Ported from the vp9 Neon implementation in libvpx.
0.5% overall speedup.

Change-Id: Ife883b070fd923c1ce9c3cb0e6a43304dcde472c
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 471bff9..1e45597 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -244,7 +244,8 @@
               "${AOM_ROOT}/aom_dsp/arm/sad_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
               "${AOM_ROOT}/aom_dsp/arm/variance_neon.c"
-              "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c")
+              "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c"
+              "${AOM_ROOT}/aom_dsp/arm/avg_neon.c")
 
   list(APPEND AOM_DSP_ENCODER_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/sad_msa.c"
               "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c"
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 65a6ba9..8c51e7f 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -857,10 +857,10 @@
   # Avg
   #
   add_proto qw/unsigned int aom_avg_8x8/, "const uint8_t *, int p";
-  specialize qw/aom_avg_8x8 sse2/;
+  specialize qw/aom_avg_8x8 sse2 neon/;
 
   add_proto qw/unsigned int aom_avg_4x4/, "const uint8_t *, int p";
-  specialize qw/aom_avg_4x4 sse2/;
+  specialize qw/aom_avg_4x4 sse2 neon/;
 
   add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
   specialize qw/aom_minmax_8x8 sse2/;
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
new file mode 100644
index 0000000..455f01d
--- /dev/null
+++ b/aom_dsp/arm/avg_neon.c
@@ -0,0 +1,54 @@
+/*
+ *  Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
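+// Average of a 4x4 block: the 16 loaded bytes are widened to 16 bits, summed,
+// and rounded to match the C reference (sum + 8) >> 4.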
+unsigned int aom_avg_4x4_neon(const uint8_t *a, int a_stride) {
+  const uint8x16_t b = load_unaligned_u8q(a, a_stride);
+  const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
+#if defined(__aarch64__)
+  const uint32_t d = vaddlvq_u16(c);
+  return (d + 8) >> 4;
+#else
+  const uint32x2_t d = horizontal_add_u16x8(c);
+  return vget_lane_u32(vrshr_n_u32(d, 4), 0);
+#endif
+}
+
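+// Average of an 8x8 block: each uint16 lane of `sum` accumulates one column
+// (at most 8 * 255, so it cannot overflow); the lanes are then reduced and
+// rounded to match the C reference (sum + 32) >> 6.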
+unsigned int aom_avg_8x8_neon(const uint8_t *a, int a_stride) {
+  uint16x8_t sum;
+  uint32x2_t d;
+  uint8x8_t b = vld1_u8(a);
+  a += a_stride;
+  uint8x8_t c = vld1_u8(a);
+  a += a_stride;
+  sum = vaddl_u8(b, c);
+
+  for (int i = 0; i < 6; ++i) {
+    const uint8x8_t e = vld1_u8(a);
+    a += a_stride;
+    sum = vaddw_u8(sum, e);
+  }
+
+  d = horizontal_add_u16x8(sum);
+
+  return vget_lane_u32(vrshr_n_u32(d, 6), 0);
+}
diff --git a/aom_dsp/arm/sum_neon.h b/aom_dsp/arm/sum_neon.h
new file mode 100644
index 0000000..809e51c
--- /dev/null
+++ b/aom_dsp/arm/sum_neon.h
@@ -0,0 +1,47 @@
+/*
+ *  Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_SUM_NEON_H_
+#define AOM_AOM_DSP_ARM_SUM_NEON_H_
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
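+// Reduces the eight int16 lanes to a single int via pairwise widening adds.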
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+  const int32x4_t a = vpaddlq_s16(v_16x8);
+  const int64x2_t b = vpaddlq_s32(a);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+  const int64x2_t b = vpaddlq_s32(v_32x4);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+}
+
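+// Reduces all eight uint16 lanes; lane 0 of the returned vector holds the
+// 32-bit total and lane 1 is zero.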
+static INLINE uint32x2_t horizontal_add_u16x8(const uint16x8_t a) {
+  const uint32x4_t b = vpaddlq_u16(a);
+  const uint64x2_t c = vpaddlq_u32(b);
+  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+                  vreinterpret_u32_u64(vget_high_u64(c)));
+}
+
+#endif  // AOM_AOM_DSP_ARM_SUM_NEON_H_
diff --git a/aom_dsp/arm/variance_neon.c b/aom_dsp/arm/variance_neon.c
index 74385a6..d94f6e5 100644
--- a/aom_dsp/arm/variance_neon.c
+++ b/aom_dsp/arm/variance_neon.c
@@ -13,25 +13,10 @@
 
 #include "config/aom_dsp_rtcd.h"
 #include "config/aom_config.h"
-
+#include "aom_dsp/arm/sum_neon.h"
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
 
-static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
-  const int32x4_t a = vpaddlq_s16(v_16x8);
-  const int64x2_t b = vpaddlq_s32(a);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
-}
-
-static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
-  const int64x2_t b = vpaddlq_s32(v_32x4);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
-}
-
 // w * h must be less than 2048 or local variable v_sum may overflow.
 static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b,
                              int b_stride, int w, int h, uint32_t *sse,
diff --git a/av1/common/arm/mem_neon.h b/av1/common/arm/mem_neon.h
index 10e8a16..171055f 100644
--- a/av1/common/arm/mem_neon.h
+++ b/av1/common/arm/mem_neon.h
@@ -316,6 +316,29 @@
   *s3 = vld1q_s16(s);
 }
 
+// Load 4 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
+  uint32_t a;
+  uint32x4_t a_u32 = vdupq_n_u32(0);
+  if (stride == 4) return vld1q_u8(buf);
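+  // Copy each row through memcpy rather than a pointer cast so that no
+  // unaligned 32-bit load is issued (undefined behavior); compilers typically
+  // fold each memcpy into a single load.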
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vsetq_lane_u32(a, a_u32, 0);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vsetq_lane_u32(a, a_u32, 1);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vsetq_lane_u32(a, a_u32, 2);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vsetq_lane_u32(a, a_u32, 3);
+  return vreinterpretq_u8_u32(a_u32);
+}
+
 static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
                                          uint32x2_t *tu0, uint32x2_t *tu1,
                                          uint32x2_t *tu2, uint32x2_t *tu3) {