Add Neon implementations of SAD functions for all block sizes

Add Neon implementations of aom_sad<w>x<h> for the following block
sizes, which previously used only the scalar C implementation on Arm:
 * 128x64
 * 64x128, 64x32, 64x16
 * 32x64, 32x16, 32x8
 * 16x64, 16x32, 16x4
 * 8x32, 8x4
 * 4x16, 4x8

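Also replace the hand-written wrappers and the per-width FSADS<w>_H
macros with two generic macros, SAD_WXH_NEON and SAD_SKIP_WXH_NEON,
and add test coverage for the new Neon functions.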

Change-Id: I113b790033a8cb12502a9e707a16c573dd567a34
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 1928de0..8349af7 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -766,28 +766,28 @@
   add_proto qw/uint64_t aom_sum_sse_2d_i16/, "const int16_t *src, int src_stride, int width, int height, int *sum";
   specialize qw/aom_sum_sse_2d_i16 sse2 avx2/;
   specialize qw/aom_sad128x128    avx2 neon     sse2/;
-  specialize qw/aom_sad128x64     avx2          sse2/;
-  specialize qw/aom_sad64x128     avx2          sse2/;
+  specialize qw/aom_sad128x64     avx2 neon     sse2/;
+  specialize qw/aom_sad64x128     avx2 neon     sse2/;
   specialize qw/aom_sad64x64      avx2 neon msa sse2/;
-  specialize qw/aom_sad64x32      avx2      msa sse2/;
-  specialize qw/aom_sad32x64      avx2      msa sse2/;
+  specialize qw/aom_sad64x32      avx2 neon msa sse2/;
+  specialize qw/aom_sad32x64      avx2 neon msa sse2/;
   specialize qw/aom_sad32x32      avx2 neon msa sse2/;
-  specialize qw/aom_sad32x16      avx2      msa sse2/;
-  specialize qw/aom_sad16x32                msa sse2/;
+  specialize qw/aom_sad32x16      avx2 neon msa sse2/;
+  specialize qw/aom_sad16x32           neon msa sse2/;
   specialize qw/aom_sad16x16           neon msa sse2/;
   specialize qw/aom_sad16x8            neon msa sse2/;
   specialize qw/aom_sad8x16            neon msa sse2/;
   specialize qw/aom_sad8x8             neon msa sse2/;
-  specialize qw/aom_sad8x4                  msa sse2/;
-  specialize qw/aom_sad4x8                  msa sse2/;
+  specialize qw/aom_sad8x4             neon msa sse2/;
+  specialize qw/aom_sad4x8             neon msa sse2/;
   specialize qw/aom_sad4x4             neon msa sse2/;
 
-  specialize qw/aom_sad4x16                     sse2/;
-  specialize qw/aom_sad16x4                     sse2/;
-  specialize qw/aom_sad8x32                     sse2/;
-  specialize qw/aom_sad32x8                     sse2/;
-  specialize qw/aom_sad16x64                    sse2/;
-  specialize qw/aom_sad64x16                    sse2/;
+  specialize qw/aom_sad4x16            neon     sse2/;
+  specialize qw/aom_sad16x4            neon     sse2/;
+  specialize qw/aom_sad8x32            neon     sse2/;
+  specialize qw/aom_sad32x8            neon     sse2/;
+  specialize qw/aom_sad16x64           neon     sse2/;
+  specialize qw/aom_sad64x16           neon     sse2/;
 
   specialize qw/aom_sad_skip_128x128    avx2          sse2  neon/;
   specialize qw/aom_sad_skip_128x64     avx2          sse2  neon/;
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
index df66275..6cb7a03 100644
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c
@@ -232,128 +232,73 @@
   return horizontal_add_u16x8(sum);
 }
 
-unsigned int aom_sad128x128_neon(const uint8_t *src, int src_stride,
-                                 const uint8_t *ref, int ref_stride) {
-  return sad128xh_neon(src, src_stride, ref, ref_stride, 128);
-}
-
-unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride) {
-  return sad64xh_neon(src, src_stride, ref, ref_stride, 64);
-}
-
-unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride) {
-  return sad32xh_neon(src, src_stride, ref, ref_stride, 32);
-}
-
-unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride) {
-  return sad16xh_neon(src, src_stride, ref, ref_stride, 16);
-}
-
-unsigned int aom_sad16x8_neon(const uint8_t *src, int src_stride,
-                              const uint8_t *ref, int ref_stride) {
-  return sad16xh_neon(src, src_stride, ref, ref_stride, 8);
-}
-
-unsigned int aom_sad8x16_neon(const uint8_t *src, int src_stride,
-                              const uint8_t *ref, int ref_stride) {
-  return sad8xh_neon(src, src_stride, ref, ref_stride, 16);
-}
-
-unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride,
-                             const uint8_t *ref, int ref_stride) {
-  return sad8xh_neon(src, src_stride, ref, ref_stride, 8);
-}
-
-unsigned int aom_sad4x4_neon(const uint8_t *src, int src_stride,
-                             const uint8_t *ref, int ref_stride) {
-  return sad4xh_neon(src, src_stride, ref, ref_stride, 4);
-}
-
-#define FSADS128_H(h)                                                    \
-  unsigned int aom_sad_skip_128x##h##_neon(                              \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,    \
-      int ref_stride) {                                                  \
-    const uint32_t sum = sad128xh_neon(src_ptr, 2 * src_stride, ref_ptr, \
-                                       2 * ref_stride, h / 2);           \
-    return 2 * sum;                                                      \
+#define SAD_WXH_NEON(w, h)                                                   \
+  unsigned int aom_sad##w##x##h##_neon(const uint8_t *src, int src_stride,   \
+                                       const uint8_t *ref, int ref_stride) { \
+    return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h));           \
   }
 
-FSADS128_H(128)
-FSADS128_H(64)
+SAD_WXH_NEON(4, 4)
+SAD_WXH_NEON(4, 8)
+SAD_WXH_NEON(4, 16)
 
-#undef FSADS128_H
+SAD_WXH_NEON(8, 4)
+SAD_WXH_NEON(8, 8)
+SAD_WXH_NEON(8, 16)
+SAD_WXH_NEON(8, 32)
 
-#define FSADS64_H(h)                                                          \
-  unsigned int aom_sad_skip_64x##h##_neon(                                    \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
-      int ref_stride) {                                                       \
-    return 2 * sad64xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
-                            h / 2);                                           \
+SAD_WXH_NEON(16, 4)
+SAD_WXH_NEON(16, 8)
+SAD_WXH_NEON(16, 16)
+SAD_WXH_NEON(16, 32)
+SAD_WXH_NEON(16, 64)
+
+SAD_WXH_NEON(32, 8)
+SAD_WXH_NEON(32, 16)
+SAD_WXH_NEON(32, 32)
+SAD_WXH_NEON(32, 64)
+
+SAD_WXH_NEON(64, 16)
+SAD_WXH_NEON(64, 32)
+SAD_WXH_NEON(64, 64)
+SAD_WXH_NEON(64, 128)
+
+SAD_WXH_NEON(128, 64)
+SAD_WXH_NEON(128, 128)
+
+#undef SAD_WXH_NEON
+
+#define SAD_SKIP_WXH_NEON(w, h)                                                \
+  unsigned int aom_sad_skip_##w##x##h##_neon(                                  \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                  \
+      int ref_stride) {                                                        \
+    return 2 *                                                                 \
+           sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \
   }
 
-FSADS64_H(128)
-FSADS64_H(64)
-FSADS64_H(32)
-FSADS64_H(16)
+SAD_SKIP_WXH_NEON(4, 8)
+SAD_SKIP_WXH_NEON(4, 16)
 
-#undef FSADS64_H
+SAD_SKIP_WXH_NEON(8, 8)
+SAD_SKIP_WXH_NEON(8, 16)
+SAD_SKIP_WXH_NEON(8, 32)
 
-#define FSADS32_H(h)                                                          \
-  unsigned int aom_sad_skip_32x##h##_neon(                                    \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
-      int ref_stride) {                                                       \
-    return 2 * sad32xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
-                            h / 2);                                           \
-  }
+SAD_SKIP_WXH_NEON(16, 8)
+SAD_SKIP_WXH_NEON(16, 16)
+SAD_SKIP_WXH_NEON(16, 32)
+SAD_SKIP_WXH_NEON(16, 64)
 
-FSADS32_H(64)
-FSADS32_H(32)
-FSADS32_H(16)
-FSADS32_H(8)
+SAD_SKIP_WXH_NEON(32, 8)
+SAD_SKIP_WXH_NEON(32, 16)
+SAD_SKIP_WXH_NEON(32, 32)
+SAD_SKIP_WXH_NEON(32, 64)
 
-#undef FSADS32_H
+SAD_SKIP_WXH_NEON(64, 16)
+SAD_SKIP_WXH_NEON(64, 32)
+SAD_SKIP_WXH_NEON(64, 64)
+SAD_SKIP_WXH_NEON(64, 128)
 
-#define FSADS16_H(h)                                                          \
-  unsigned int aom_sad_skip_16x##h##_neon(                                    \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
-      int ref_stride) {                                                       \
-    return 2 * sad16xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
-                            h / 2);                                           \
-  }
+SAD_SKIP_WXH_NEON(128, 64)
+SAD_SKIP_WXH_NEON(128, 128)
 
-FSADS16_H(64)
-FSADS16_H(32)
-FSADS16_H(16)
-FSADS16_H(8)
-
-#undef FSADS16_H
-
-#define FSADS8_H(h)                                                          \
-  unsigned int aom_sad_skip_8x##h##_neon(                                    \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
-      int ref_stride) {                                                      \
-    return 2 * sad8xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
-                           h / 2);                                           \
-  }
-
-FSADS8_H(32)
-FSADS8_H(16)
-FSADS8_H(8)
-
-#undef FSADS8_H
-
-#define FSADS4_H(h)                                                          \
-  unsigned int aom_sad_skip_4x##h##_neon(                                    \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
-      int ref_stride) {                                                      \
-    return 2 * sad4xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
-                           h / 2);                                           \
-  }
-
-FSADS4_H(16)
-FSADS4_H(8)
-
-#undef FSADS4_H
+#undef SAD_SKIP_WXH_NEON
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 55897f2..294105b 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -1790,13 +1790,29 @@
 #if HAVE_NEON
 const SadMxNParam neon_tests[] = {
   make_tuple(128, 128, &aom_sad128x128_neon, -1),
+  make_tuple(128, 64, &aom_sad128x64_neon, -1),
+  make_tuple(64, 128, &aom_sad64x128_neon, -1),
   make_tuple(64, 64, &aom_sad64x64_neon, -1),
+  make_tuple(64, 32, &aom_sad64x32_neon, -1),
+  make_tuple(32, 64, &aom_sad32x64_neon, -1),
   make_tuple(32, 32, &aom_sad32x32_neon, -1),
+  make_tuple(32, 16, &aom_sad32x16_neon, -1),
+  make_tuple(16, 32, &aom_sad16x32_neon, -1),
   make_tuple(16, 16, &aom_sad16x16_neon, -1),
   make_tuple(16, 8, &aom_sad16x8_neon, -1),
   make_tuple(8, 16, &aom_sad8x16_neon, -1),
   make_tuple(8, 8, &aom_sad8x8_neon, -1),
+  make_tuple(8, 4, &aom_sad8x4_neon, -1),
+  make_tuple(4, 8, &aom_sad4x8_neon, -1),
   make_tuple(4, 4, &aom_sad4x4_neon, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad64x16_neon, -1),
+  make_tuple(32, 8, &aom_sad32x8_neon, -1),
+  make_tuple(16, 64, &aom_sad16x64_neon, -1),
+  make_tuple(16, 4, &aom_sad16x4_neon, -1),
+  make_tuple(8, 32, &aom_sad8x32_neon, -1),
+  make_tuple(4, 16, &aom_sad4x16_neon, -1),
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));