Add Neon implementations of SAD4D functions for all block sizes
Add a Neon implementation of aom_sad<w>x<h>x4d for the following block
sizes, which previously only used a scalar C implementation on Arm:
* 128x64, 128x128
* 64x16, 64x32, 64x128
* 32x8, 32x16, 32x64
* 16x4, 16x8, 16x32, 16x64
* 8x4, 8x8, 8x16, 8x32
* 4x4, 4x8, 4x16, 4x32
Also add test coverage for the new Neon functions.
Change-Id: I7bda537c335edffbd527cb661f4397a4c59ea6de
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index fc9dab1..5e081a7 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1002,34 +1002,31 @@
add_proto qw/void/, "aom_masked_sad${w}x${h}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[4], int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[4]";
}
- specialize qw/aom_sad128x128x4d avx2 sse2/;
- specialize qw/aom_sad128x64x4d avx2 sse2/;
- specialize qw/aom_sad64x128x4d avx2 sse2/;
+ specialize qw/aom_sad128x128x4d avx2 neon sse2/;
+ specialize qw/aom_sad128x64x4d avx2 neon sse2/;
+ specialize qw/aom_sad64x128x4d avx2 neon sse2/;
specialize qw/aom_sad64x64x4d avx2 neon msa sse2/;
- specialize qw/aom_sad64x32x4d avx2 msa sse2/;
- specialize qw/aom_sad64x16x4d avx2 sse2/;
- specialize qw/aom_sad32x64x4d avx2 msa sse2/;
+ specialize qw/aom_sad64x32x4d avx2 neon msa sse2/;
+ specialize qw/aom_sad32x64x4d avx2 neon msa sse2/;
specialize qw/aom_sad32x32x4d avx2 neon msa sse2/;
- specialize qw/aom_sad32x16x4d avx2 msa sse2/;
- specialize qw/aom_sad32x8x4d avx2 sse2/;
- specialize qw/aom_sad16x64x4d sse2/;
- specialize qw/aom_sad16x32x4d msa sse2/;
- specialize qw/aom_sad16x16x4d neon msa sse2/;
- specialize qw/aom_sad16x8x4d msa sse2/;
+ specialize qw/aom_sad32x16x4d avx2 neon msa sse2/;
+ specialize qw/aom_sad16x32x4d neon msa sse2/;
+ specialize qw/aom_sad16x16x4d neon msa sse2/;
+ specialize qw/aom_sad16x8x4d neon msa sse2/;
- specialize qw/aom_sad8x16x4d msa sse2/;
- specialize qw/aom_sad8x8x4d msa sse2/;
- specialize qw/aom_sad8x4x4d msa sse2/;
- specialize qw/aom_sad4x16x4d msa sse2/;
- specialize qw/aom_sad4x8x4d msa sse2/;
- specialize qw/aom_sad4x4x4d msa sse2/;
+ specialize qw/aom_sad8x16x4d neon msa sse2/;
+ specialize qw/aom_sad8x8x4d neon msa sse2/;
+ specialize qw/aom_sad8x4x4d neon msa sse2/;
+ specialize qw/aom_sad4x32x4d neon sse2/;
+ specialize qw/aom_sad4x8x4d neon msa sse2/;
+ specialize qw/aom_sad4x4x4d neon msa sse2/;
- specialize qw/aom_sad4x32x4d sse2/;
- specialize qw/aom_sad4x16x4d sse2/;
- specialize qw/aom_sad16x4x4d sse2/;
- specialize qw/aom_sad8x32x4d sse2/;
- specialize qw/aom_sad32x8x4d sse2/;
- specialize qw/aom_sad64x16x4d sse2/;
+ specialize qw/aom_sad64x16x4d avx2 neon sse2/;
+ specialize qw/aom_sad32x8x4d avx2 neon sse2/;
+ specialize qw/aom_sad16x64x4d neon sse2/;
+ specialize qw/aom_sad16x4x4d neon sse2/;
+ specialize qw/aom_sad8x32x4d neon sse2/;
+ specialize qw/aom_sad4x16x4d neon msa sse2/;
specialize qw/aom_sad_skip_128x128x4d avx2 sse2 neon/;
specialize qw/aom_sad_skip_128x64x4d avx2 sse2 neon/;
@@ -1046,15 +1043,12 @@
specialize qw/aom_sad_skip_16x32x4d sse2 neon/;
specialize qw/aom_sad_skip_16x16x4d sse2 neon/;
specialize qw/aom_sad_skip_16x8x4d sse2 neon/;
+ specialize qw/aom_sad_skip_8x32x4d sse2 neon/;
specialize qw/aom_sad_skip_8x16x4d sse2 neon/;
specialize qw/aom_sad_skip_8x8x4d sse2 neon/;
- specialize qw/aom_sad_skip_4x16x4d sse2 neon/;
- specialize qw/aom_sad_skip_4x8x4d sse2 neon/;
specialize qw/aom_sad_skip_4x32x4d sse2 neon/;
specialize qw/aom_sad_skip_4x16x4d sse2 neon/;
- specialize qw/aom_sad_skip_8x32x4d sse2 neon/;
- specialize qw/aom_sad_skip_32x8x4d sse2 neon/;
- specialize qw/aom_sad_skip_64x16x4d sse2 neon/;
+ specialize qw/aom_sad_skip_4x8x4d sse2 neon/;
if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
specialize qw/aom_sad128x128x4d_avg sse2/;
diff --git a/aom_dsp/arm/sad4d_neon.c b/aom_dsp/arm/sad4d_neon.c
index bbc0507..94b0b9c 100644
--- a/aom_dsp/arm/sad4d_neon.c
+++ b/aom_dsp/arm/sad4d_neon.c
@@ -277,23 +277,43 @@
res[3] = horizontal_add_u16x8(sum[3]);
}
-void aom_sad64x64x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4]) {
- sad64xhx4d_neon(src, src_stride, ref, ref_stride, res, 64);
-}
+#define SAD_WXH_4D_NEON(w, h) \
+ void aom_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \
+ }
-void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4]) {
- sad32xhx4d_neon(src, src_stride, ref, ref_stride, res, 32);
-}
+SAD_WXH_4D_NEON(4, 4)
+SAD_WXH_4D_NEON(4, 8)
+SAD_WXH_4D_NEON(4, 16)
+SAD_WXH_4D_NEON(4, 32)
-void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4]) {
- sad16xhx4d_neon(src, src_stride, ref, ref_stride, res, 16);
-}
+SAD_WXH_4D_NEON(8, 4)
+SAD_WXH_4D_NEON(8, 8)
+SAD_WXH_4D_NEON(8, 16)
+SAD_WXH_4D_NEON(8, 32)
+
+SAD_WXH_4D_NEON(16, 4)
+SAD_WXH_4D_NEON(16, 8)
+SAD_WXH_4D_NEON(16, 16)
+SAD_WXH_4D_NEON(16, 32)
+SAD_WXH_4D_NEON(16, 64)
+
+SAD_WXH_4D_NEON(32, 8)
+SAD_WXH_4D_NEON(32, 16)
+SAD_WXH_4D_NEON(32, 32)
+SAD_WXH_4D_NEON(32, 64)
+
+SAD_WXH_4D_NEON(64, 16)
+SAD_WXH_4D_NEON(64, 32)
+SAD_WXH_4D_NEON(64, 64)
+SAD_WXH_4D_NEON(64, 128)
+
+SAD_WXH_4D_NEON(128, 64)
+SAD_WXH_4D_NEON(128, 128)
+
+#undef SAD_WXH_4D_NEON
#define SAD_SKIP_WXH_4D_NEON(w, h) \
void aom_sad_skip_##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 294105b..055da2f 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -1817,9 +1817,30 @@
INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));
const SadMxNx4Param x4d_neon_tests[] = {
+ make_tuple(128, 128, &aom_sad128x128x4d_neon, -1),
+ make_tuple(128, 64, &aom_sad128x64x4d_neon, -1),
+ make_tuple(64, 128, &aom_sad64x128x4d_neon, -1),
make_tuple(64, 64, &aom_sad64x64x4d_neon, -1),
+ make_tuple(64, 32, &aom_sad64x32x4d_neon, -1),
+ make_tuple(32, 64, &aom_sad32x64x4d_neon, -1),
make_tuple(32, 32, &aom_sad32x32x4d_neon, -1),
+ make_tuple(32, 16, &aom_sad32x16x4d_neon, -1),
+ make_tuple(16, 32, &aom_sad16x32x4d_neon, -1),
make_tuple(16, 16, &aom_sad16x16x4d_neon, -1),
+ make_tuple(16, 8, &aom_sad16x8x4d_neon, -1),
+ make_tuple(8, 16, &aom_sad8x16x4d_neon, -1),
+ make_tuple(8, 8, &aom_sad8x8x4d_neon, -1),
+ make_tuple(8, 4, &aom_sad8x4x4d_neon, -1),
+ make_tuple(4, 8, &aom_sad4x8x4d_neon, -1),
+ make_tuple(4, 4, &aom_sad4x4x4d_neon, -1),
+#if !CONFIG_REALTIME_ONLY
+ make_tuple(64, 16, &aom_sad64x16x4d_neon, -1),
+ make_tuple(32, 8, &aom_sad32x8x4d_neon, -1),
+ make_tuple(16, 64, &aom_sad16x64x4d_neon, -1),
+ make_tuple(16, 4, &aom_sad16x4x4d_neon, -1),
+ make_tuple(8, 32, &aom_sad8x32x4d_neon, -1),
+ make_tuple(4, 16, &aom_sad4x16x4d_neon, -1),
+#endif
};
INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
const SadSkipMxNParam skip_neon_tests[] = {