Add Neon sad_skip[_4d] functions for 4x4, 8x4 and 16x4 blocks
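Add Neon implementations of the sad_skip and sad_skip_x4d functions for the
4x4, 8x4 and 16x4 block sizes, and add the corresponding tests. A row loop in
sad4d_neon.c is also changed from a do/while to a pre-decrement while loop so
that its body is skipped when h is 2.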
Change-Id: I2e194eb2a2e7e56e95c440f0cedd431e5228a17e
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index b32e334..2feed0b 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -806,9 +806,12 @@
specialize qw/aom_sad_skip_16x8 sse2 neon/;
specialize qw/aom_sad_skip_8x16 sse2 neon/;
specialize qw/aom_sad_skip_8x8 sse2 neon/;
+ specialize qw/aom_sad_skip_8x4 neon/;
specialize qw/aom_sad_skip_4x8 sse2 neon/;
+ specialize qw/aom_sad_skip_4x4 neon/;
specialize qw/aom_sad_skip_4x16 sse2 neon/;
+ specialize qw/aom_sad_skip_16x4 neon/;
specialize qw/aom_sad_skip_8x32 sse2 neon/;
specialize qw/aom_sad_skip_32x8 sse2 neon/;
specialize qw/aom_sad_skip_16x64 sse2 neon/;
@@ -1034,11 +1037,14 @@
specialize qw/aom_sad_skip_16x32x4d avx2 sse2 neon/;
specialize qw/aom_sad_skip_16x16x4d avx2 sse2 neon/;
specialize qw/aom_sad_skip_16x8x4d avx2 sse2 neon/;
+ specialize qw/aom_sad_skip_16x4x4d neon/;
specialize qw/aom_sad_skip_8x32x4d sse2 neon/;
specialize qw/aom_sad_skip_8x16x4d sse2 neon/;
specialize qw/aom_sad_skip_8x8x4d sse2 neon/;
+ specialize qw/aom_sad_skip_8x4x4d neon/;
specialize qw/aom_sad_skip_4x16x4d sse2 neon/;
specialize qw/aom_sad_skip_4x8x4d sse2 neon/;
+ specialize qw/aom_sad_skip_4x4x4d neon/;
specialize qw/aom_sad128x128x3d avx2/;
specialize qw/aom_sad128x64x3d avx2/;
diff --git a/aom_dsp/arm/sad4d_neon.c b/aom_dsp/arm/sad4d_neon.c
index 7e491bb..d949e4a 100644
--- a/aom_dsp/arm/sad4d_neon.c
+++ b/aom_dsp/arm/sad4d_neon.c
@@ -287,8 +287,8 @@
src += 2 * src_stride;
int ref_offset = 2 * ref_stride;
- int i = (h - 1) / 2;
- do {
+ int i = h / 2;
+ while (--i != 0) {
s = load_unaligned_u8(src, src_stride);
r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
@@ -302,7 +302,7 @@
src += 2 * src_stride;
ref_offset += 2 * ref_stride;
- } while (--i != 0);
+ }
vst1q_u32(res, horizontal_add_4d_u16x8(sum));
}
@@ -356,13 +356,16 @@
res[3] <<= 1; \
}
+SAD_SKIP_WXH_4D_NEON(4, 4)
SAD_SKIP_WXH_4D_NEON(4, 8)
SAD_SKIP_WXH_4D_NEON(4, 16)
+SAD_SKIP_WXH_4D_NEON(8, 4)
SAD_SKIP_WXH_4D_NEON(8, 8)
SAD_SKIP_WXH_4D_NEON(8, 16)
SAD_SKIP_WXH_4D_NEON(8, 32)
+SAD_SKIP_WXH_4D_NEON(16, 4)
SAD_SKIP_WXH_4D_NEON(16, 8)
SAD_SKIP_WXH_4D_NEON(16, 16)
SAD_SKIP_WXH_4D_NEON(16, 32)
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
index 6a22289..841908f 100644
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c
@@ -348,13 +348,16 @@
sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \
}
+SAD_SKIP_WXH_NEON(4, 4)
SAD_SKIP_WXH_NEON(4, 8)
SAD_SKIP_WXH_NEON(4, 16)
+SAD_SKIP_WXH_NEON(8, 4)
SAD_SKIP_WXH_NEON(8, 8)
SAD_SKIP_WXH_NEON(8, 16)
SAD_SKIP_WXH_NEON(8, 32)
+SAD_SKIP_WXH_NEON(16, 4)
SAD_SKIP_WXH_NEON(16, 8)
SAD_SKIP_WXH_NEON(16, 16)
SAD_SKIP_WXH_NEON(16, 32)
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 707a3ec..c10e929 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -1886,11 +1886,14 @@
make_tuple(16, 8, &aom_sad_skip_16x8_neon, -1),
make_tuple(8, 16, &aom_sad_skip_8x16_neon, -1),
make_tuple(8, 8, &aom_sad_skip_8x8_neon, -1),
+ make_tuple(8, 4, &aom_sad_skip_8x4_neon, -1),
make_tuple(4, 8, &aom_sad_skip_4x8_neon, -1),
+ make_tuple(4, 4, &aom_sad_skip_4x4_neon, -1),
#if !CONFIG_REALTIME_ONLY
make_tuple(64, 16, &aom_sad_skip_64x16_neon, -1),
make_tuple(32, 8, &aom_sad_skip_32x8_neon, -1),
make_tuple(16, 64, &aom_sad_skip_16x64_neon, -1),
+ make_tuple(16, 4, &aom_sad_skip_16x4_neon, -1),
make_tuple(8, 32, &aom_sad_skip_8x32_neon, -1),
make_tuple(4, 16, &aom_sad_skip_4x16_neon, -1),
#endif
@@ -1910,13 +1913,16 @@
make_tuple(16, 32, &aom_sad_skip_16x32x4d_neon, -1),
make_tuple(16, 16, &aom_sad_skip_16x16x4d_neon, -1),
make_tuple(16, 8, &aom_sad_skip_16x8x4d_neon, -1),
- make_tuple(8, 8, &aom_sad_skip_8x8x4d_neon, -1),
make_tuple(8, 16, &aom_sad_skip_8x16x4d_neon, -1),
+ make_tuple(8, 8, &aom_sad_skip_8x8x4d_neon, -1),
+ make_tuple(8, 4, &aom_sad_skip_8x4x4d_neon, -1),
make_tuple(4, 8, &aom_sad_skip_4x8x4d_neon, -1),
+ make_tuple(4, 4, &aom_sad_skip_4x4x4d_neon, -1),
#if !CONFIG_REALTIME_ONLY
make_tuple(64, 16, &aom_sad_skip_64x16x4d_neon, -1),
make_tuple(32, 8, &aom_sad_skip_32x8x4d_neon, -1),
make_tuple(16, 64, &aom_sad_skip_16x64x4d_neon, -1),
+ make_tuple(16, 4, &aom_sad_skip_16x4x4d_neon, -1),
make_tuple(8, 32, &aom_sad_skip_8x32x4d_neon, -1),
make_tuple(4, 16, &aom_sad_skip_4x16x4d_neon, -1),
#endif