Use run-time feature detection for Neon DotProd sad_avg

Arm Neon DotProd implementations of aom_sad<w>x<h>_avg are currently
enabled only at compile time, since they are guarded by
__ARM_FEATURE_DOTPROD preprocessor checks. Now that run-time feature
detection has been enabled for Arm platforms, move these code paths
from sad_neon.c to sad_neon_dotprod.c, expose them under distinct
*_neon_dotprod names and wire them up in rtcd.pl. Also add test cases
for the newly exposed functions.
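
As a rough illustration (not part of this change), the run-time
dispatch generated from rtcd.pl amounts to selecting a function
pointer per kernel at init time; have_neon_dotprod() and
setup_rtcd_sad_avg() below are hypothetical stand-ins for the real
CPU-flag query and the generated setup code:

  #include <stdint.h>

  /* Specializations named in this change (baseline Neon and DotProd). */
  unsigned int aom_sad16x16_avg_neon(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     const uint8_t *second_pred);
  unsigned int aom_sad16x16_avg_neon_dotprod(const uint8_t *src,
                                             int src_stride,
                                             const uint8_t *ref,
                                             int ref_stride,
                                             const uint8_t *second_pred);

  /* Hypothetical run-time CPU feature query. */
  int have_neon_dotprod(void);

  /* Dispatch pointer filled in once at init instead of relying on
   * compile-time #ifdef guards. */
  static unsigned int (*aom_sad16x16_avg)(const uint8_t *, int,
                                          const uint8_t *, int,
                                          const uint8_t *);

  static void setup_rtcd_sad_avg(void) {
    aom_sad16x16_avg = aom_sad16x16_avg_neon;  /* always-available baseline */
    if (have_neon_dotprod()) {
      /* Use the DotProd kernel only on CPUs that support it. */
      aom_sad16x16_avg = aom_sad16x16_avg_neon_dotprod;
    }
  }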

Change-Id: I447e88bfb5c804acd8f7002d392a038465b06b9b
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index cf545b4..2292b9f 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -864,17 +864,17 @@
   specialize qw/aom_sad_skip_16x64           sse2 neon neon_dotprod/;
   specialize qw/aom_sad_skip_64x16           sse2 neon neon_dotprod/;
 
-  specialize qw/aom_sad128x128_avg avx2 sse2 neon/;
-  specialize qw/aom_sad128x64_avg  avx2 sse2 neon/;
-  specialize qw/aom_sad64x128_avg  avx2 sse2 neon/;
-  specialize qw/aom_sad64x64_avg   avx2 sse2 neon/;
-  specialize qw/aom_sad64x32_avg   avx2 sse2 neon/;
-  specialize qw/aom_sad32x64_avg   avx2 sse2 neon/;
-  specialize qw/aom_sad32x32_avg   avx2 sse2 neon/;
-  specialize qw/aom_sad32x16_avg   avx2 sse2 neon/;
-  specialize qw/aom_sad16x32_avg        sse2 neon/;
-  specialize qw/aom_sad16x16_avg        sse2 neon/;
-  specialize qw/aom_sad16x8_avg         sse2 neon/;
+  specialize qw/aom_sad128x128_avg avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad128x64_avg  avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x128_avg  avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x64_avg   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x32_avg   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x64_avg   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x32_avg   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad32x16_avg   avx2 sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x32_avg        sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x16_avg        sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x8_avg         sse2 neon neon_dotprod/;
   specialize qw/aom_sad8x16_avg         sse2 neon/;
   specialize qw/aom_sad8x8_avg          sse2 neon/;
   specialize qw/aom_sad8x4_avg          sse2 neon/;
@@ -882,11 +882,11 @@
   specialize qw/aom_sad4x4_avg          sse2 neon/;
 
   specialize qw/aom_sad4x16_avg         sse2 neon/;
-  specialize qw/aom_sad16x4_avg         sse2 neon/;
+  specialize qw/aom_sad16x4_avg         sse2 neon neon_dotprod/;
   specialize qw/aom_sad8x32_avg         sse2 neon/;
-  specialize qw/aom_sad32x8_avg         sse2 neon/;
-  specialize qw/aom_sad16x64_avg        sse2 neon/;
-  specialize qw/aom_sad64x16_avg        sse2 neon/;
+  specialize qw/aom_sad32x8_avg         sse2 neon neon_dotprod/;
+  specialize qw/aom_sad16x64_avg        sse2 neon neon_dotprod/;
+  specialize qw/aom_sad64x16_avg        sse2 neon neon_dotprod/;
 
   specialize qw/aom_dist_wtd_sad128x128_avg sse2/;
   specialize qw/aom_dist_wtd_sad128x64_avg  sse2/;
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
index 2359ce9..320715e 100644
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c
@@ -298,114 +298,6 @@
 
 #undef SAD_SKIP_WXH_NEON
 
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE unsigned int sadwxh_avg_neon(const uint8_t *src_ptr,
-                                           int src_stride,
-                                           const uint8_t *ref_ptr,
-                                           int ref_stride, int w, int h,
-                                           const uint8_t *second_pred) {
-  // Only two accumulators are required for optimal instruction throughput of
-  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
-  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
-  int i = h;
-  do {
-    int j = 0;
-    do {
-      uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
-
-      s0 = vld1q_u8(src_ptr + j);
-      r0 = vld1q_u8(ref_ptr + j);
-      p0 = vld1q_u8(second_pred);
-      avg0 = vrhaddq_u8(r0, p0);
-      diff0 = vabdq_u8(s0, avg0);
-      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
-      s1 = vld1q_u8(src_ptr + j + 16);
-      r1 = vld1q_u8(ref_ptr + j + 16);
-      p1 = vld1q_u8(second_pred + 16);
-      avg1 = vrhaddq_u8(r1, p1);
-      diff1 = vabdq_u8(s1, avg1);
-      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
-      j += 32;
-      second_pred += 32;
-    } while (j < w);
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  } while (--i != 0);
-
-  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
-                                             int src_stride,
-                                             const uint8_t *ref_ptr,
-                                             int ref_stride, int h,
-                                             const uint8_t *second_pred) {
-  return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128, h,
-                         second_pred);
-}
-
-static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
-                                            int src_stride,
-                                            const uint8_t *ref_ptr,
-                                            int ref_stride, int h,
-                                            const uint8_t *second_pred) {
-  return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
-                         second_pred);
-}
-
-static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
-                                            int src_stride,
-                                            const uint8_t *ref_ptr,
-                                            int ref_stride, int h,
-                                            const uint8_t *second_pred) {
-  return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
-                         second_pred);
-}
-
-static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
-                                            int src_stride,
-                                            const uint8_t *ref_ptr,
-                                            int ref_stride, int h,
-                                            const uint8_t *second_pred) {
-  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
-  int i = h / 2;
-  do {
-    uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
-
-    s0 = vld1q_u8(src_ptr);
-    r0 = vld1q_u8(ref_ptr);
-    p0 = vld1q_u8(second_pred);
-    avg0 = vrhaddq_u8(r0, p0);
-    diff0 = vabdq_u8(s0, avg0);
-    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-    second_pred += 16;
-
-    s1 = vld1q_u8(src_ptr);
-    r1 = vld1q_u8(ref_ptr);
-    p1 = vld1q_u8(second_pred);
-    avg1 = vrhaddq_u8(r1, p1);
-    diff1 = vabdq_u8(s1, avg1);
-    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-    second_pred += 16;
-  } while (--i != 0);
-
-  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-#else  // !defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
                                              int src_stride,
                                              const uint8_t *ref_ptr,
@@ -612,8 +504,6 @@
   return horizontal_add_u16x8(sum);
 }
 
-#endif  // defined(__ARM_FEATURE_DOTPROD)
-
 static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
diff --git a/aom_dsp/arm/sad_neon_dotprod.c b/aom_dsp/arm/sad_neon_dotprod.c
index 1b95e34..2642bd0 100644
--- a/aom_dsp/arm/sad_neon_dotprod.c
+++ b/aom_dsp/arm/sad_neon_dotprod.c
@@ -164,4 +164,132 @@
 SAD_SKIP_WXH_NEON_DOTPROD(64, 16)
 #endif  // !CONFIG_REALTIME_ONLY
 
-#undef SAD_SKIP_WXH_NEON_DOTPROD
\ No newline at end of file
+#undef SAD_SKIP_WXH_NEON_DOTPROD
+
+static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr,
+                                                   int src_stride,
+                                                   const uint8_t *ref_ptr,
+                                                   int ref_stride, int w, int h,
+                                                   const uint8_t *second_pred) {
+  // Only two accumulators are required for optimal instruction throughput of
+  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+      s0 = vld1q_u8(src_ptr + j);
+      r0 = vld1q_u8(ref_ptr + j);
+      p0 = vld1q_u8(second_pred);
+      avg0 = vrhaddq_u8(r0, p0);
+      diff0 = vabdq_u8(s0, avg0);
+      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+      s1 = vld1q_u8(src_ptr + j + 16);
+      r1 = vld1q_u8(ref_ptr + j + 16);
+      p1 = vld1q_u8(second_pred + 16);
+      avg1 = vrhaddq_u8(r1, p1);
+      diff1 = vabdq_u8(s1, avg1);
+      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+      j += 32;
+      second_pred += 32;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad128xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128,
+                                 h, second_pred);
+}
+
+static INLINE unsigned int sad64xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+                                 h, second_pred);
+}
+
+static INLINE unsigned int sad32xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32,
+                                 h, second_pred);
+}
+
+static INLINE unsigned int sad16xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+    s0 = vld1q_u8(src_ptr);
+    r0 = vld1q_u8(ref_ptr);
+    p0 = vld1q_u8(second_pred);
+    avg0 = vrhaddq_u8(r0, p0);
+    diff0 = vabdq_u8(s0, avg0);
+    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
+
+    s1 = vld1q_u8(src_ptr);
+    r1 = vld1q_u8(ref_ptr);
+    p1 = vld1q_u8(second_pred);
+    avg1 = vrhaddq_u8(r1, p1);
+    diff1 = vabdq_u8(s1, avg1);
+    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
+  } while (--i != 0);
+
+  return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#define SAD_WXH_AVG_NEON_DOTPROD(w, h)                                        \
+  unsigned int aom_sad##w##x##h##_avg_neon_dotprod(                           \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred) {                                           \
+    return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \
+                                       second_pred);                          \
+  }
+
+SAD_WXH_AVG_NEON_DOTPROD(16, 8)
+SAD_WXH_AVG_NEON_DOTPROD(16, 16)
+SAD_WXH_AVG_NEON_DOTPROD(16, 32)
+
+SAD_WXH_AVG_NEON_DOTPROD(32, 16)
+SAD_WXH_AVG_NEON_DOTPROD(32, 32)
+SAD_WXH_AVG_NEON_DOTPROD(32, 64)
+
+SAD_WXH_AVG_NEON_DOTPROD(64, 32)
+SAD_WXH_AVG_NEON_DOTPROD(64, 64)
+SAD_WXH_AVG_NEON_DOTPROD(64, 128)
+
+SAD_WXH_AVG_NEON_DOTPROD(128, 64)
+SAD_WXH_AVG_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_AVG_NEON_DOTPROD(16, 4)
+SAD_WXH_AVG_NEON_DOTPROD(16, 64)
+SAD_WXH_AVG_NEON_DOTPROD(32, 8)
+SAD_WXH_AVG_NEON_DOTPROD(64, 16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_AVG_NEON_DOTPROD
diff --git a/test/sad_test.cc b/test/sad_test.cc
index c05da35..594e4b9 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -2452,6 +2452,28 @@
 };
 INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipTest,
                          ::testing::ValuesIn(skip_neon_dotprod_tests));
+
+const SadMxNAvgParam avg_neon_dotprod_tests[] = {
+  make_tuple(128, 128, &aom_sad128x128_avg_neon_dotprod, -1),
+  make_tuple(128, 64, &aom_sad128x64_avg_neon_dotprod, -1),
+  make_tuple(64, 128, &aom_sad64x128_avg_neon_dotprod, -1),
+  make_tuple(64, 64, &aom_sad64x64_avg_neon_dotprod, -1),
+  make_tuple(64, 32, &aom_sad64x32_avg_neon_dotprod, -1),
+  make_tuple(32, 64, &aom_sad32x64_avg_neon_dotprod, -1),
+  make_tuple(32, 32, &aom_sad32x32_avg_neon_dotprod, -1),
+  make_tuple(32, 16, &aom_sad32x16_avg_neon_dotprod, -1),
+  make_tuple(16, 32, &aom_sad16x32_avg_neon_dotprod, -1),
+  make_tuple(16, 16, &aom_sad16x16_avg_neon_dotprod, -1),
+  make_tuple(16, 8, &aom_sad16x8_avg_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(64, 16, &aom_sad64x16_avg_neon_dotprod, -1),
+  make_tuple(32, 8, &aom_sad32x8_avg_neon_dotprod, -1),
+  make_tuple(16, 64, &aom_sad16x64_avg_neon_dotprod, -1),
+  make_tuple(16, 4, &aom_sad16x4_avg_neon_dotprod, -1),
+#endif  // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADavgTest,
+                         ::testing::ValuesIn(avg_neon_dotprod_tests));
 #endif  // HAVE_NEON_DOTPROD
 
 //------------------------------------------------------------------------------