Add Neon implementations of SAD4D functions for all block sizes

Add a Neon implementation of aom_sad<w>x<h>x4d for the following block
sizes that previously only used a scalar C implementation on Arm:
 * 128x64, 128x128
 * 64x16, 64x32, 64x128
 * 32x8, 32x16, 32x64
 * 16x4, 16x8, 16x32, 16x64
 * 8x4, 8x8, 8x16, 8x32
 * 4x4, 4x8, 4x16, 4x32

Also add test coverage for the new Neon functions.

Change-Id: I7bda537c335edffbd527cb661f4397a4c59ea6de
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index fc9dab1..5e081a7 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1002,34 +1002,31 @@
     add_proto qw/void/, "aom_masked_sad${w}x${h}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[4], int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[4]";
   }
 
-  specialize qw/aom_sad128x128x4d avx2          sse2/;
-  specialize qw/aom_sad128x64x4d  avx2          sse2/;
-  specialize qw/aom_sad64x128x4d  avx2          sse2/;
+  specialize qw/aom_sad128x128x4d avx2 neon     sse2/;
+  specialize qw/aom_sad128x64x4d  avx2 neon     sse2/;
+  specialize qw/aom_sad64x128x4d  avx2 neon     sse2/;
   specialize qw/aom_sad64x64x4d   avx2 neon msa sse2/;
-  specialize qw/aom_sad64x32x4d   avx2      msa sse2/;
-  specialize qw/aom_sad64x16x4d   avx2          sse2/;
-  specialize qw/aom_sad32x64x4d   avx2      msa sse2/;
+  specialize qw/aom_sad64x32x4d   avx2 neon msa sse2/;
+  specialize qw/aom_sad32x64x4d   avx2 neon msa sse2/;
   specialize qw/aom_sad32x32x4d   avx2 neon msa sse2/;
-  specialize qw/aom_sad32x16x4d   avx2      msa sse2/;
-  specialize qw/aom_sad32x8x4d    avx2          sse2/;
-  specialize qw/aom_sad16x64x4d                 sse2/;
-  specialize qw/aom_sad16x32x4d             msa sse2/;
-  specialize qw/aom_sad16x16x4d         neon msa sse2/;
-  specialize qw/aom_sad16x8x4d               msa sse2/;
+  specialize qw/aom_sad32x16x4d   avx2 neon msa sse2/;
+  specialize qw/aom_sad16x32x4d        neon msa sse2/;
+  specialize qw/aom_sad16x16x4d        neon msa sse2/;
+  specialize qw/aom_sad16x8x4d         neon msa sse2/;
 
-  specialize qw/aom_sad8x16x4d              msa sse2/;
-  specialize qw/aom_sad8x8x4d               msa sse2/;
-  specialize qw/aom_sad8x4x4d               msa sse2/;
-  specialize qw/aom_sad4x16x4d              msa sse2/;
-  specialize qw/aom_sad4x8x4d               msa sse2/;
-  specialize qw/aom_sad4x4x4d               msa sse2/;
+  specialize qw/aom_sad8x16x4d         neon msa sse2/;
+  specialize qw/aom_sad8x8x4d          neon msa sse2/;
+  specialize qw/aom_sad8x4x4d          neon msa sse2/;
+  specialize qw/aom_sad4x32x4d         neon     sse2/;
+  specialize qw/aom_sad4x8x4d          neon msa sse2/;
+  specialize qw/aom_sad4x4x4d          neon msa sse2/;
 
-  specialize qw/aom_sad4x32x4d  sse2/;
-  specialize qw/aom_sad4x16x4d  sse2/;
-  specialize qw/aom_sad16x4x4d  sse2/;
-  specialize qw/aom_sad8x32x4d  sse2/;
-  specialize qw/aom_sad32x8x4d  sse2/;
-  specialize qw/aom_sad64x16x4d sse2/;
+  specialize qw/aom_sad64x16x4d   avx2 neon     sse2/;
+  specialize qw/aom_sad32x8x4d    avx2 neon     sse2/;
+  specialize qw/aom_sad16x64x4d        neon     sse2/;
+  specialize qw/aom_sad16x4x4d         neon     sse2/;
+  specialize qw/aom_sad8x32x4d         neon     sse2/;
+  specialize qw/aom_sad4x16x4d         neon msa sse2/;
 
   specialize qw/aom_sad_skip_128x128x4d avx2 sse2 neon/;
   specialize qw/aom_sad_skip_128x64x4d  avx2 sse2 neon/;
@@ -1046,15 +1043,14 @@
   specialize qw/aom_sad_skip_16x32x4d        sse2 neon/;
   specialize qw/aom_sad_skip_16x16x4d        sse2 neon/;
   specialize qw/aom_sad_skip_16x8x4d         sse2 neon/;
+  specialize qw/aom_sad_skip_8x32x4d         sse2 neon/;
   specialize qw/aom_sad_skip_8x16x4d         sse2 neon/;
   specialize qw/aom_sad_skip_8x8x4d          sse2 neon/;
-  specialize qw/aom_sad_skip_4x16x4d         sse2 neon/;
-  specialize qw/aom_sad_skip_4x8x4d          sse2 neon/;
   specialize qw/aom_sad_skip_4x32x4d         sse2 neon/;
   specialize qw/aom_sad_skip_4x16x4d         sse2 neon/;
-  specialize qw/aom_sad_skip_8x32x4d         sse2 neon/;
+  specialize qw/aom_sad_skip_4x8x4d          sse2 neon/;
   specialize qw/aom_sad_skip_32x8x4d         sse2 neon/;
   specialize qw/aom_sad_skip_64x16x4d        sse2 neon/;
 
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
     specialize qw/aom_sad128x128x4d_avg sse2/;
diff --git a/aom_dsp/arm/sad4d_neon.c b/aom_dsp/arm/sad4d_neon.c
index bbc0507..94b0b9c 100644
--- a/aom_dsp/arm/sad4d_neon.c
+++ b/aom_dsp/arm/sad4d_neon.c
@@ -277,23 +277,43 @@
   res[3] = horizontal_add_u16x8(sum[3]);
 }
 
-void aom_sad64x64x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
-  sad64xhx4d_neon(src, src_stride, ref, ref_stride, res, 64);
-}
+#define SAD_WXH_4D_NEON(w, h)                                                  \
+  void aom_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride,          \
+                                  const uint8_t *const ref[4], int ref_stride, \
+                                  uint32_t res[4]) {                           \
+    sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h));            \
+  }
 
-void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
-  sad32xhx4d_neon(src, src_stride, ref, ref_stride, res, 32);
-}
+SAD_WXH_4D_NEON(4, 4)
+SAD_WXH_4D_NEON(4, 8)
+SAD_WXH_4D_NEON(4, 16)
+SAD_WXH_4D_NEON(4, 32)
 
-void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
-  sad16xhx4d_neon(src, src_stride, ref, ref_stride, res, 16);
-}
+SAD_WXH_4D_NEON(8, 4)
+SAD_WXH_4D_NEON(8, 8)
+SAD_WXH_4D_NEON(8, 16)
+SAD_WXH_4D_NEON(8, 32)
+
+SAD_WXH_4D_NEON(16, 4)
+SAD_WXH_4D_NEON(16, 8)
+SAD_WXH_4D_NEON(16, 16)
+SAD_WXH_4D_NEON(16, 32)
+SAD_WXH_4D_NEON(16, 64)
+
+SAD_WXH_4D_NEON(32, 8)
+SAD_WXH_4D_NEON(32, 16)
+SAD_WXH_4D_NEON(32, 32)
+SAD_WXH_4D_NEON(32, 64)
+
+SAD_WXH_4D_NEON(64, 16)
+SAD_WXH_4D_NEON(64, 32)
+SAD_WXH_4D_NEON(64, 64)
+SAD_WXH_4D_NEON(64, 128)
+
+SAD_WXH_4D_NEON(128, 64)
+SAD_WXH_4D_NEON(128, 128)
+
+#undef SAD_WXH_4D_NEON
 
 #define SAD_SKIP_WXH_4D_NEON(w, h)                                          \
   void aom_sad_skip_##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 294105b..055da2f 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -1817,9 +1817,31 @@
 INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));
 
 const SadMxNx4Param x4d_neon_tests[] = {
+  make_tuple(128, 128, &aom_sad128x128x4d_neon, -1),
+  make_tuple(128, 64, &aom_sad128x64x4d_neon, -1),
+  make_tuple(64, 128, &aom_sad64x128x4d_neon, -1),
   make_tuple(64, 64, &aom_sad64x64x4d_neon, -1),
+  make_tuple(64, 32, &aom_sad64x32x4d_neon, -1),
+  make_tuple(32, 64, &aom_sad32x64x4d_neon, -1),
   make_tuple(32, 32, &aom_sad32x32x4d_neon, -1),
+  make_tuple(32, 16, &aom_sad32x16x4d_neon, -1),
+  make_tuple(16, 32, &aom_sad16x32x4d_neon, -1),
   make_tuple(16, 16, &aom_sad16x16x4d_neon, -1),
+  make_tuple(16, 8, &aom_sad16x8x4d_neon, -1),
+  make_tuple(8, 16, &aom_sad8x16x4d_neon, -1),
+  make_tuple(8, 8, &aom_sad8x8x4d_neon, -1),
+  make_tuple(8, 4, &aom_sad8x4x4d_neon, -1),
+  make_tuple(4, 32, &aom_sad4x32x4d_neon, -1),
+  make_tuple(4, 8, &aom_sad4x8x4d_neon, -1),
+  make_tuple(4, 4, &aom_sad4x4x4d_neon, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad64x16x4d_neon, -1),
+  make_tuple(32, 8, &aom_sad32x8x4d_neon, -1),
+  make_tuple(16, 64, &aom_sad16x64x4d_neon, -1),
+  make_tuple(16, 4, &aom_sad16x4x4d_neon, -1),
+  make_tuple(8, 32, &aom_sad8x32x4d_neon, -1),
+  make_tuple(4, 16, &aom_sad4x16x4d_neon, -1),
+#endif
 };
 INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
 const SadSkipMxNParam skip_neon_tests[] = {