Add Neon implementation of Hadamard function for 4x4 case
Add a Neon implementation of the aom_hadamard_4x4 function,
which previously used only the scalar C implementation on Arm.
Also add test coverage for the new Neon function.
Change-Id: Ia552b97645e67a34334b3952bf4b868d5d3fca28
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index b3f8ec7..5e34fd0 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1238,7 +1238,7 @@
# hamadard transform and satd for implmenting temporal dependency model
#
add_proto qw/void aom_hadamard_4x4/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
- specialize qw/aom_hadamard_4x4 sse2/;
+ specialize qw/aom_hadamard_4x4 sse2 neon/;
add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
specialize qw/aom_hadamard_8x8 sse2 neon/;
diff --git a/aom_dsp/arm/hadamard_neon.c b/aom_dsp/arm/hadamard_neon.c
index 75dd7d6..646d981 100644
--- a/aom_dsp/arm/hadamard_neon.c
+++ b/aom_dsp/arm/hadamard_neon.c
@@ -15,6 +15,38 @@
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
+static INLINE void hadamard_4x4_one_pass(int16x4_t *a0, int16x4_t *a1,
+ int16x4_t *a2, int16x4_t *a3) {  // One butterfly pass over four 4-lane int16 vectors, updated in place.
+ const int16x4_t b0 = vhadd_s16(*a0, *a1);  // halving add: (a0 + a1) >> 1 per lane
+ const int16x4_t b1 = vhsub_s16(*a0, *a1);  // halving subtract: (a0 - a1) >> 1 per lane
+ const int16x4_t b2 = vhadd_s16(*a2, *a3);  // (a2 + a3) >> 1 per lane
+ const int16x4_t b3 = vhsub_s16(*a2, *a3);  // (a2 - a3) >> 1 per lane
+
+ *a0 = vadd_s16(b0, b2);  // second stage uses plain (non-halving) add/sub on the b terms
+ *a1 = vadd_s16(b1, b3);
+ *a2 = vsub_s16(b0, b2);
+ *a3 = vsub_s16(b1, b3);  // all four inputs have now been overwritten with the pass outputs
+}
+
+void aom_hadamard_4x4_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {  // Neon 4x4 Hadamard: reads a 4x4 int16 block from src_diff, writes 16 values to coeff.
+ int16x4_t a0 = vld1_s16(src_diff);  // row 0: four consecutive int16 values
+ int16x4_t a1 = vld1_s16(src_diff + src_stride);  // rows are src_stride elements apart
+ int16x4_t a2 = vld1_s16(src_diff + 2 * src_stride);
+ int16x4_t a3 = vld1_s16(src_diff + 3 * src_stride);
+
+ hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);  // first pass: combines the four row vectors lane by lane
+
+ transpose_s16_4x4d(&a0, &a1, &a2, &a3);  // NOTE(review): presumably an in-place 4x4 transpose so the next pass runs along the other dimension; helper is defined elsewhere - confirm
+
+ hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);  // second pass on the rearranged vectors
+
+ store_s16_to_tran_low(coeff, a0);  // each vector is widened and written as four consecutive tran_low_t values
+ store_s16_to_tran_low(coeff + 4, a1);
+ store_s16_to_tran_low(coeff + 8, a2);
+ store_s16_to_tran_low(coeff + 12, a3);  // coeff[0..15] are written in total
+}
+
static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
int16x8_t *a6, int16x8_t *a7) {
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 73a5127..4d52407 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -622,4 +622,9 @@
vst1q_s32(buf + 8, v1);
}
+static INLINE void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) {  // Store the four int16 lanes of a to buf[0..3] as 32-bit values.
+ const int32x4_t v0 = vmovl_s16(a);  // sign-extend each 16-bit lane to 32 bits
+ vst1q_s32(buf, v0);  // passing buf to vst1q_s32 relies on tran_low_t being a 32-bit integer type
+}
+
#endif // AOM_AOM_DSP_ARM_MEM_NEON_H_
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index 0fe7f42..a15231f 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -349,7 +349,8 @@
#if HAVE_NEON
INSTANTIATE_TEST_SUITE_P(
NEON, HadamardLowbdTest,
- ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_neon, 8, 8),
+ ::testing::Values(HadamardFuncWithSize(&aom_hadamard_4x4_neon, 4, 4),
+ HadamardFuncWithSize(&aom_hadamard_8x8_neon, 8, 8),
HadamardFuncWithSize(&aom_hadamard_16x16_neon, 16, 16)));
#endif // HAVE_NEON