Use run-time feature detection for Neon DotProd sad_avg
Arm Neon DotProd implementations of aom_sad<w>x<h>_avg currently need
to be enabled at compile time because they are guarded by #ifdef
feature macros. Now that run-time feature detection has been enabled
for Arm platforms, expose these implementations with distinct
*_neon_dotprod names, wire them up to rtcd.pl, and add test cases
covering the new functions.
Change-Id: I447e88bfb5c804acd8f7002d392a038465b06b9b
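
Note: the run-time selection itself happens in the rtcd-generated
dispatch code, not in this patch. The sketch below is an illustrative
approximation (not the actual generated header) of how such a
dispatcher picks a kernel once the *_neon_dotprod specializations are
registered in rtcd.pl; the capability query aom_arm_cpu_caps() and the
flag names HAS_NEON / HAS_NEON_DOTPROD are assumed to come from
aom_ports/arm.h and may differ between versions.

  #include <stdint.h>

  #include "aom_ports/arm.h"  // assumed source of aom_arm_cpu_caps(),
                              // HAS_NEON and HAS_NEON_DOTPROD

  // Per-ISA kernels declared by the rtcd machinery; the signature matches
  // the aom_sad<w>x<h>_avg prototype used throughout aom_dsp.
  unsigned int aom_sad16x16_avg_c(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  const uint8_t *second_pred);
  unsigned int aom_sad16x16_avg_neon(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     const uint8_t *second_pred);
  unsigned int aom_sad16x16_avg_neon_dotprod(const uint8_t *src,
                                             int src_stride,
                                             const uint8_t *ref,
                                             int ref_stride,
                                             const uint8_t *second_pred);

  // Function pointer used by callers; filled in once at init time.
  static unsigned int (*aom_sad16x16_avg)(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride,
                                          const uint8_t *second_pred);

  static void setup_rtcd_internal(void) {
    const int flags = aom_arm_cpu_caps();  // query CPU features at run time
    aom_sad16x16_avg = aom_sad16x16_avg_c;
    if (flags & HAS_NEON) aom_sad16x16_avg = aom_sad16x16_avg_neon;
    if (flags & HAS_NEON_DOTPROD) {
      aom_sad16x16_avg = aom_sad16x16_avg_neon_dotprod;
    }
  }

With the #ifdef guards gone, the *_neon_dotprod kernels are compiled
unconditionally into sad_neon_dotprod.c and are only reached when the
run-time flag check above selects them.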
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index cf545b4..2292b9f 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -864,17 +864,17 @@
specialize qw/aom_sad_skip_16x64 sse2 neon neon_dotprod/;
specialize qw/aom_sad_skip_64x16 sse2 neon neon_dotprod/;
- specialize qw/aom_sad128x128_avg avx2 sse2 neon/;
- specialize qw/aom_sad128x64_avg avx2 sse2 neon/;
- specialize qw/aom_sad64x128_avg avx2 sse2 neon/;
- specialize qw/aom_sad64x64_avg avx2 sse2 neon/;
- specialize qw/aom_sad64x32_avg avx2 sse2 neon/;
- specialize qw/aom_sad32x64_avg avx2 sse2 neon/;
- specialize qw/aom_sad32x32_avg avx2 sse2 neon/;
- specialize qw/aom_sad32x16_avg avx2 sse2 neon/;
- specialize qw/aom_sad16x32_avg sse2 neon/;
- specialize qw/aom_sad16x16_avg sse2 neon/;
- specialize qw/aom_sad16x8_avg sse2 neon/;
+ specialize qw/aom_sad128x128_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad128x64_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x128_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x64_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x32_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x64_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x32_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x16_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x32_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x16_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x8_avg sse2 neon neon_dotprod/;
specialize qw/aom_sad8x16_avg sse2 neon/;
specialize qw/aom_sad8x8_avg sse2 neon/;
specialize qw/aom_sad8x4_avg sse2 neon/;
@@ -882,11 +882,11 @@
specialize qw/aom_sad4x4_avg sse2 neon/;
specialize qw/aom_sad4x16_avg sse2 neon/;
- specialize qw/aom_sad16x4_avg sse2 neon/;
+ specialize qw/aom_sad16x4_avg sse2 neon neon_dotprod/;
specialize qw/aom_sad8x32_avg sse2 neon/;
- specialize qw/aom_sad32x8_avg sse2 neon/;
- specialize qw/aom_sad16x64_avg sse2 neon/;
- specialize qw/aom_sad64x16_avg sse2 neon/;
+ specialize qw/aom_sad32x8_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x64_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x16_avg sse2 neon neon_dotprod/;
specialize qw/aom_dist_wtd_sad128x128_avg sse2/;
specialize qw/aom_dist_wtd_sad128x64_avg sse2/;
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
index 2359ce9..320715e 100644
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c
@@ -298,114 +298,6 @@
#undef SAD_SKIP_WXH_NEON
-#if defined(__ARM_FEATURE_DOTPROD)
-
-static INLINE unsigned int sadwxh_avg_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride, int w, int h,
- const uint8_t *second_pred) {
- // Only two accumulators are required for optimal instruction throughput of
- // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
- uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
- int i = h;
- do {
- int j = 0;
- do {
- uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
-
- s0 = vld1q_u8(src_ptr + j);
- r0 = vld1q_u8(ref_ptr + j);
- p0 = vld1q_u8(second_pred);
- avg0 = vrhaddq_u8(r0, p0);
- diff0 = vabdq_u8(s0, avg0);
- sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
- s1 = vld1q_u8(src_ptr + j + 16);
- r1 = vld1q_u8(ref_ptr + j + 16);
- p1 = vld1q_u8(second_pred + 16);
- avg1 = vrhaddq_u8(r1, p1);
- diff1 = vabdq_u8(s1, avg1);
- sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
- j += 32;
- second_pred += 32;
- } while (j < w);
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- } while (--i != 0);
-
- return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride, int h,
- const uint8_t *second_pred) {
- return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128, h,
- second_pred);
-}
-
-static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride, int h,
- const uint8_t *second_pred) {
- return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
- second_pred);
-}
-
-static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride, int h,
- const uint8_t *second_pred) {
- return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
- second_pred);
-}
-
-static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *ref_ptr,
- int ref_stride, int h,
- const uint8_t *second_pred) {
- uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
- int i = h / 2;
- do {
- uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
-
- s0 = vld1q_u8(src_ptr);
- r0 = vld1q_u8(ref_ptr);
- p0 = vld1q_u8(second_pred);
- avg0 = vrhaddq_u8(r0, p0);
- diff0 = vabdq_u8(s0, avg0);
- sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- second_pred += 16;
-
- s1 = vld1q_u8(src_ptr);
- r1 = vld1q_u8(ref_ptr);
- p1 = vld1q_u8(second_pred);
- avg1 = vrhaddq_u8(r1, p1);
- diff1 = vabdq_u8(s1, avg1);
- sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- second_pred += 16;
- } while (--i != 0);
-
- return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
-}
-
-#else // !defined(__ARM_FEATURE_DOTPROD)
-
static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
@@ -612,8 +504,6 @@
return horizontal_add_u16x8(sum);
}
-#endif // defined(__ARM_FEATURE_DOTPROD)
-
static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
diff --git a/aom_dsp/arm/sad_neon_dotprod.c b/aom_dsp/arm/sad_neon_dotprod.c
index 1b95e34..2642bd0 100644
--- a/aom_dsp/arm/sad_neon_dotprod.c
+++ b/aom_dsp/arm/sad_neon_dotprod.c
@@ -164,4 +164,132 @@
SAD_SKIP_WXH_NEON_DOTPROD(64, 16)
#endif // !CONFIG_REALTIME_ONLY
-#undef SAD_SKIP_WXH_NEON_DOTPROD
\ No newline at end of file
+#undef SAD_SKIP_WXH_NEON_DOTPROD
+
+static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h,
+ const uint8_t *second_pred) {
+ // Only two accumulators are required for optimal instruction throughput of
+ // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr + j);
+ r0 = vld1q_u8(ref_ptr + j);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ s1 = vld1q_u8(src_ptr + j + 16);
+ r1 = vld1q_u8(ref_ptr + j + 16);
+ p1 = vld1q_u8(second_pred + 16);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ j += 32;
+ second_pred += 32;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad128xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128,
+ h, second_pred);
+}
+
+static INLINE unsigned int sad64xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ h, second_pred);
+}
+
+static INLINE unsigned int sad32xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32,
+ h, second_pred);
+}
+
+static INLINE unsigned int sad16xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+
+ s1 = vld1q_u8(src_ptr);
+ r1 = vld1q_u8(ref_ptr);
+ p1 = vld1q_u8(second_pred);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#define SAD_WXH_AVG_NEON_DOTPROD(w, h) \
+ unsigned int aom_sad##w##x##h##_avg_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \
+ second_pred); \
+ }
+
+SAD_WXH_AVG_NEON_DOTPROD(16, 8)
+SAD_WXH_AVG_NEON_DOTPROD(16, 16)
+SAD_WXH_AVG_NEON_DOTPROD(16, 32)
+
+SAD_WXH_AVG_NEON_DOTPROD(32, 16)
+SAD_WXH_AVG_NEON_DOTPROD(32, 32)
+SAD_WXH_AVG_NEON_DOTPROD(32, 64)
+
+SAD_WXH_AVG_NEON_DOTPROD(64, 32)
+SAD_WXH_AVG_NEON_DOTPROD(64, 64)
+SAD_WXH_AVG_NEON_DOTPROD(64, 128)
+
+SAD_WXH_AVG_NEON_DOTPROD(128, 64)
+SAD_WXH_AVG_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_AVG_NEON_DOTPROD(16, 4)
+SAD_WXH_AVG_NEON_DOTPROD(16, 64)
+SAD_WXH_AVG_NEON_DOTPROD(32, 8)
+SAD_WXH_AVG_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_AVG_NEON_DOTPROD
diff --git a/test/sad_test.cc b/test/sad_test.cc
index c05da35..594e4b9 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -2452,6 +2452,28 @@
};
INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipTest,
::testing::ValuesIn(skip_neon_dotprod_tests));
+
+const SadMxNAvgParam avg_neon_dotprod_tests[] = {
+ make_tuple(128, 128, &aom_sad128x128_avg_neon_dotprod, -1),
+ make_tuple(128, 64, &aom_sad128x64_avg_neon_dotprod, -1),
+ make_tuple(64, 128, &aom_sad64x128_avg_neon_dotprod, -1),
+ make_tuple(64, 64, &aom_sad64x64_avg_neon_dotprod, -1),
+ make_tuple(64, 32, &aom_sad64x32_avg_neon_dotprod, -1),
+ make_tuple(32, 64, &aom_sad32x64_avg_neon_dotprod, -1),
+ make_tuple(32, 32, &aom_sad32x32_avg_neon_dotprod, -1),
+ make_tuple(32, 16, &aom_sad32x16_avg_neon_dotprod, -1),
+ make_tuple(16, 32, &aom_sad16x32_avg_neon_dotprod, -1),
+ make_tuple(16, 16, &aom_sad16x16_avg_neon_dotprod, -1),
+ make_tuple(16, 8, &aom_sad16x8_avg_neon_dotprod, -1),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_sad64x16_avg_neon_dotprod, -1),
+ make_tuple(32, 8, &aom_sad32x8_avg_neon_dotprod, -1),
+ make_tuple(16, 64, &aom_sad16x64_avg_neon_dotprod, -1),
+ make_tuple(16, 4, &aom_sad16x4_avg_neon_dotprod, -1),
+#endif // !CONFIG_REALTIME_ONLY
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADavgTest,
+ ::testing::ValuesIn(avg_neon_dotprod_tests));
#endif // HAVE_NEON_DOTPROD
//------------------------------------------------------------------------------