Add Neon implementation of Hadamard function for 4x4 case

Add a Neon implementation of aom_hadamard_4x4 function
that previously only used a scalar C implementation on Arm.

Also add test coverage for the new Neon function.

Change-Id: Ia552b97645e67a34334b3952bf4b868d5d3fca28
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index b3f8ec7..5e34fd0 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1238,7 +1238,7 @@
   # hamadard transform and satd for implmenting temporal dependency model
   #
   add_proto qw/void aom_hadamard_4x4/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-  specialize qw/aom_hadamard_4x4 sse2/;
+  specialize qw/aom_hadamard_4x4 sse2 neon/;
 
   add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
   specialize qw/aom_hadamard_8x8 sse2 neon/;
diff --git a/aom_dsp/arm/hadamard_neon.c b/aom_dsp/arm/hadamard_neon.c
index 75dd7d6..646d981 100644
--- a/aom_dsp/arm/hadamard_neon.c
+++ b/aom_dsp/arm/hadamard_neon.c
@@ -15,6 +15,38 @@
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/transpose_neon.h"
 
+static INLINE void hadamard_4x4_one_pass(int16x4_t *a0, int16x4_t *a1,
+                                         int16x4_t *a2, int16x4_t *a3) {
+  const int16x4_t b0 = vhadd_s16(*a0, *a1);
+  const int16x4_t b1 = vhsub_s16(*a0, *a1);
+  const int16x4_t b2 = vhadd_s16(*a2, *a3);
+  const int16x4_t b3 = vhsub_s16(*a2, *a3);
+
+  *a0 = vadd_s16(b0, b2);
+  *a1 = vadd_s16(b1, b3);
+  *a2 = vsub_s16(b0, b2);
+  *a3 = vsub_s16(b1, b3);
+}
+
+void aom_hadamard_4x4_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                           tran_low_t *coeff) {
+  int16x4_t a0 = vld1_s16(src_diff);
+  int16x4_t a1 = vld1_s16(src_diff + src_stride);
+  int16x4_t a2 = vld1_s16(src_diff + 2 * src_stride);
+  int16x4_t a3 = vld1_s16(src_diff + 3 * src_stride);
+
+  hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);
+
+  transpose_s16_4x4d(&a0, &a1, &a2, &a3);
+
+  hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);
+
+  store_s16_to_tran_low(coeff, a0);
+  store_s16_to_tran_low(coeff + 4, a1);
+  store_s16_to_tran_low(coeff + 8, a2);
+  store_s16_to_tran_low(coeff + 12, a3);
+}
+
 static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
                                  int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
                                  int16x8_t *a6, int16x8_t *a7) {
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 73a5127..4d52407 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -622,4 +622,9 @@
   vst1q_s32(buf + 8, v1);
 }
 
+static INLINE void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) {
+  const int32x4_t v0 = vmovl_s16(a);
+  vst1q_s32(buf, v0);
+}
+
 #endif  // AOM_AOM_DSP_ARM_MEM_NEON_H_
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index 0fe7f42..a15231f 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -349,7 +349,8 @@
 #if HAVE_NEON
 INSTANTIATE_TEST_SUITE_P(
     NEON, HadamardLowbdTest,
-    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_neon, 8, 8),
+    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_4x4_neon, 4, 4),
+                      HadamardFuncWithSize(&aom_hadamard_8x8_neon, 8, 8),
                       HadamardFuncWithSize(&aom_hadamard_16x16_neon, 16, 16)));
 #endif  // HAVE_NEON