Add Neon implementations of SAD functions for all block sizes
Add Neon implementations of aom_sad<w>x<h> for the following block
sizes, which previously used only a scalar C implementation on Arm:
* 128x64
* 64x128, 64x32, 64x16
* 32x64, 32x16, 32x8
* 16x64, 16x32, 16x4
* 8x32, 8x4
* 4x16, 4x8
Also add test coverage for the new Neon functions.
Change-Id: I113b790033a8cb12502a9e707a16c573dd567a34
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 1928de0..8349af7 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -766,28 +766,28 @@
add_proto qw/uint64_t aom_sum_sse_2d_i16/, "const int16_t *src, int src_stride, int width, int height, int *sum";
specialize qw/aom_sum_sse_2d_i16 sse2 avx2/;
specialize qw/aom_sad128x128 avx2 neon sse2/;
- specialize qw/aom_sad128x64 avx2 sse2/;
- specialize qw/aom_sad64x128 avx2 sse2/;
+ specialize qw/aom_sad128x64 avx2 neon sse2/;
+ specialize qw/aom_sad64x128 avx2 neon sse2/;
specialize qw/aom_sad64x64 avx2 neon msa sse2/;
- specialize qw/aom_sad64x32 avx2 msa sse2/;
- specialize qw/aom_sad32x64 avx2 msa sse2/;
+ specialize qw/aom_sad64x32 avx2 neon msa sse2/;
+ specialize qw/aom_sad32x64 avx2 neon msa sse2/;
specialize qw/aom_sad32x32 avx2 neon msa sse2/;
- specialize qw/aom_sad32x16 avx2 msa sse2/;
- specialize qw/aom_sad16x32 msa sse2/;
+ specialize qw/aom_sad32x16 avx2 neon msa sse2/;
+ specialize qw/aom_sad16x32 neon msa sse2/;
specialize qw/aom_sad16x16 neon msa sse2/;
specialize qw/aom_sad16x8 neon msa sse2/;
specialize qw/aom_sad8x16 neon msa sse2/;
specialize qw/aom_sad8x8 neon msa sse2/;
- specialize qw/aom_sad8x4 msa sse2/;
- specialize qw/aom_sad4x8 msa sse2/;
+ specialize qw/aom_sad8x4 neon msa sse2/;
+ specialize qw/aom_sad4x8 neon msa sse2/;
specialize qw/aom_sad4x4 neon msa sse2/;
- specialize qw/aom_sad4x16 sse2/;
- specialize qw/aom_sad16x4 sse2/;
- specialize qw/aom_sad8x32 sse2/;
- specialize qw/aom_sad32x8 sse2/;
- specialize qw/aom_sad16x64 sse2/;
- specialize qw/aom_sad64x16 sse2/;
+ specialize qw/aom_sad4x16 neon sse2/;
+ specialize qw/aom_sad16x4 neon sse2/;
+ specialize qw/aom_sad8x32 neon sse2/;
+ specialize qw/aom_sad32x8 neon sse2/;
+ specialize qw/aom_sad16x64 neon sse2/;
+ specialize qw/aom_sad64x16 neon sse2/;
specialize qw/aom_sad_skip_128x128 avx2 sse2 neon/;
specialize qw/aom_sad_skip_128x64 avx2 sse2 neon/;
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
index df66275..6cb7a03 100644
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c
@@ -232,128 +232,73 @@
return horizontal_add_u16x8(sum);
}
-unsigned int aom_sad128x128_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- return sad128xh_neon(src, src_stride, ref, ref_stride, 128);
-}
-
-unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- return sad64xh_neon(src, src_stride, ref, ref_stride, 64);
-}
-
-unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- return sad32xh_neon(src, src_stride, ref, ref_stride, 32);
-}
-
-unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- return sad16xh_neon(src, src_stride, ref, ref_stride, 16);
-}
-
-unsigned int aom_sad16x8_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- return sad16xh_neon(src, src_stride, ref, ref_stride, 8);
-}
-
-unsigned int aom_sad8x16_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- return sad8xh_neon(src, src_stride, ref, ref_stride, 16);
-}
-
-unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- return sad8xh_neon(src, src_stride, ref, ref_stride, 8);
-}
-
-unsigned int aom_sad4x4_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- return sad4xh_neon(src, src_stride, ref, ref_stride, 4);
-}
-
-#define FSADS128_H(h) \
- unsigned int aom_sad_skip_128x##h##_neon( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride) { \
- const uint32_t sum = sad128xh_neon(src_ptr, 2 * src_stride, ref_ptr, \
- 2 * ref_stride, h / 2); \
- return 2 * sum; \
+#define SAD_WXH_NEON(w, h) \
+ unsigned int aom_sad##w##x##h##_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \
}
-FSADS128_H(128)
-FSADS128_H(64)
+SAD_WXH_NEON(4, 4)
+SAD_WXH_NEON(4, 8)
+SAD_WXH_NEON(4, 16)
-#undef FSADS128_H
+SAD_WXH_NEON(8, 4)
+SAD_WXH_NEON(8, 8)
+SAD_WXH_NEON(8, 16)
+SAD_WXH_NEON(8, 32)
-#define FSADS64_H(h) \
- unsigned int aom_sad_skip_64x##h##_neon( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride) { \
- return 2 * sad64xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
- h / 2); \
+SAD_WXH_NEON(16, 4)
+SAD_WXH_NEON(16, 8)
+SAD_WXH_NEON(16, 16)
+SAD_WXH_NEON(16, 32)
+SAD_WXH_NEON(16, 64)
+
+SAD_WXH_NEON(32, 8)
+SAD_WXH_NEON(32, 16)
+SAD_WXH_NEON(32, 32)
+SAD_WXH_NEON(32, 64)
+
+SAD_WXH_NEON(64, 16)
+SAD_WXH_NEON(64, 32)
+SAD_WXH_NEON(64, 64)
+SAD_WXH_NEON(64, 128)
+
+SAD_WXH_NEON(128, 64)
+SAD_WXH_NEON(128, 128)
+
+#undef SAD_WXH_NEON
+
+#define SAD_SKIP_WXH_NEON(w, h) \
+ unsigned int aom_sad_skip_##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * \
+ sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \
}
-FSADS64_H(128)
-FSADS64_H(64)
-FSADS64_H(32)
-FSADS64_H(16)
+SAD_SKIP_WXH_NEON(4, 8)
+SAD_SKIP_WXH_NEON(4, 16)
-#undef FSADS64_H
+SAD_SKIP_WXH_NEON(8, 8)
+SAD_SKIP_WXH_NEON(8, 16)
+SAD_SKIP_WXH_NEON(8, 32)
-#define FSADS32_H(h) \
- unsigned int aom_sad_skip_32x##h##_neon( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride) { \
- return 2 * sad32xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
- h / 2); \
- }
+SAD_SKIP_WXH_NEON(16, 8)
+SAD_SKIP_WXH_NEON(16, 16)
+SAD_SKIP_WXH_NEON(16, 32)
+SAD_SKIP_WXH_NEON(16, 64)
-FSADS32_H(64)
-FSADS32_H(32)
-FSADS32_H(16)
-FSADS32_H(8)
+SAD_SKIP_WXH_NEON(32, 8)
+SAD_SKIP_WXH_NEON(32, 16)
+SAD_SKIP_WXH_NEON(32, 32)
+SAD_SKIP_WXH_NEON(32, 64)
-#undef FSADS32_H
+SAD_SKIP_WXH_NEON(64, 16)
+SAD_SKIP_WXH_NEON(64, 32)
+SAD_SKIP_WXH_NEON(64, 64)
+SAD_SKIP_WXH_NEON(64, 128)
-#define FSADS16_H(h) \
- unsigned int aom_sad_skip_16x##h##_neon( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride) { \
- return 2 * sad16xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
- h / 2); \
- }
+SAD_SKIP_WXH_NEON(128, 64)
+SAD_SKIP_WXH_NEON(128, 128)
-FSADS16_H(64)
-FSADS16_H(32)
-FSADS16_H(16)
-FSADS16_H(8)
-
-#undef FSADS16_H
-
-#define FSADS8_H(h) \
- unsigned int aom_sad_skip_8x##h##_neon( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride) { \
- return 2 * sad8xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
- h / 2); \
- }
-
-FSADS8_H(32)
-FSADS8_H(16)
-FSADS8_H(8)
-
-#undef FSADS8_H
-
-#define FSADS4_H(h) \
- unsigned int aom_sad_skip_4x##h##_neon( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride) { \
- return 2 * sad4xh_neon(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
- h / 2); \
- }
-
-FSADS4_H(16)
-FSADS4_H(8)
-
-#undef FSADS4_H
+#undef SAD_SKIP_WXH_NEON
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 55897f2..294105b 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -1790,13 +1790,29 @@
#if HAVE_NEON
const SadMxNParam neon_tests[] = {
make_tuple(128, 128, &aom_sad128x128_neon, -1),
+ make_tuple(128, 64, &aom_sad128x64_neon, -1),
+ make_tuple(64, 128, &aom_sad64x128_neon, -1),
make_tuple(64, 64, &aom_sad64x64_neon, -1),
+ make_tuple(64, 32, &aom_sad64x32_neon, -1),
+ make_tuple(32, 64, &aom_sad32x64_neon, -1),
make_tuple(32, 32, &aom_sad32x32_neon, -1),
+ make_tuple(32, 16, &aom_sad32x16_neon, -1),
+ make_tuple(16, 32, &aom_sad16x32_neon, -1),
make_tuple(16, 16, &aom_sad16x16_neon, -1),
make_tuple(16, 8, &aom_sad16x8_neon, -1),
make_tuple(8, 16, &aom_sad8x16_neon, -1),
make_tuple(8, 8, &aom_sad8x8_neon, -1),
+ make_tuple(8, 4, &aom_sad8x4_neon, -1),
+ make_tuple(4, 8, &aom_sad4x8_neon, -1),
make_tuple(4, 4, &aom_sad4x4_neon, -1),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_sad64x16_neon, -1),
+ make_tuple(32, 8, &aom_sad32x8_neon, -1),
+ make_tuple(16, 64, &aom_sad16x64_neon, -1),
+ make_tuple(16, 4, &aom_sad16x4_neon, -1),
+ make_tuple(8, 32, &aom_sad8x32_neon, -1),
+ make_tuple(4, 16, &aom_sad4x16_neon, -1),
+#endif
};
INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));