Refactor and optimize aom_hadamard_16x16_neon

Unroll the loop to remove the need for an awkward "two steps
forward, one step back" memory access pattern. This in turn makes
the large scratch buffer on the stack unnecessary, so remove that
too.
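
For reference, each iteration of the combining loop below is
equivalent to the following scalar pass (a sketch of the arithmetic
only, ignoring the exact output lane ordering; the NEON halving
intrinsics vhaddq_s16/vhsubq_s16 provide the ">> 1"):

    /* coeff[0], coeff[64], coeff[128] and coeff[192] hold matching
     * lanes of the top-left, top-right, bottom-left and bottom-right
     * 8x8 Hadamard results. */
    for (int i = 0; i < 64; ++i) {
      const tran_low_t a0 = coeff[0];
      const tran_low_t a1 = coeff[64];
      const tran_low_t a2 = coeff[128];
      const tran_low_t a3 = coeff[192];

      /* Halving adds/subtracts keep intermediates within 16 bits. */
      const tran_low_t b0 = (a0 + a1) >> 1;
      const tran_low_t b1 = (a0 - a1) >> 1;
      const tran_low_t b2 = (a2 + a3) >> 1;
      const tran_low_t b3 = (a2 - a3) >> 1;

      coeff[0] = b0 + b2;
      coeff[64] = b1 + b3;
      coeff[128] = b0 - b2;
      coeff[192] = b1 - b3;
      ++coeff;
    }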

Change-Id: I20dd066efaf20f917796dc680bc1749994eafaa9
diff --git a/aom_dsp/arm/hadamard_neon.c b/aom_dsp/arm/hadamard_neon.c
index ec9fc2c..eda5db0 100644
--- a/aom_dsp/arm/hadamard_neon.c
+++ b/aom_dsp/arm/hadamard_neon.c
@@ -186,45 +186,68 @@
 
 void aom_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
                              tran_low_t *coeff) {
-  DECLARE_ALIGNED(32, tran_low_t, temp_coeff[16 * 16]);
   /* Rearrange 16x16 to 8x32 and remove stride.
    * Top left first. */
-  aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride,
-                        temp_coeff + 0);
+  aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
   /* Top right. */
-  aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride,
-                        temp_coeff + 64);
+  aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
   /* Bottom left. */
-  aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride,
-                        temp_coeff + 128);
+  aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
   /* Bottom right. */
-  aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride,
-                        temp_coeff + 192);
+  aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
 
-  tran_low_t *t_coeff = temp_coeff;
-  for (int i = 0; i < 64; i += 8) {
-    const int16x8_t a0 = load_tran_low_to_s16q(t_coeff + 0);
-    const int16x8_t a1 = load_tran_low_to_s16q(t_coeff + 64);
-    const int16x8_t a2 = load_tran_low_to_s16q(t_coeff + 128);
-    const int16x8_t a3 = load_tran_low_to_s16q(t_coeff + 192);
+  for (int i = 0; i < 64; i += 16) {
+    const int16x8_t a00 = load_tran_low_to_s16q(coeff + 0);
+    const int16x8_t a01 = load_tran_low_to_s16q(coeff + 64);
+    const int16x8_t a02 = load_tran_low_to_s16q(coeff + 128);
+    const int16x8_t a03 = load_tran_low_to_s16q(coeff + 192);
 
-    const int16x8_t b0 = vhaddq_s16(a0, a1);
-    const int16x8_t b1 = vhsubq_s16(a0, a1);
-    const int16x8_t b2 = vhaddq_s16(a2, a3);
-    const int16x8_t b3 = vhsubq_s16(a2, a3);
+    const int16x8_t b00 = vhaddq_s16(a00, a01);
+    const int16x8_t b01 = vhsubq_s16(a00, a01);
+    const int16x8_t b02 = vhaddq_s16(a02, a03);
+    const int16x8_t b03 = vhsubq_s16(a02, a03);
 
-    const int16x8_t c0 = vaddq_s16(b0, b2);
-    const int16x8_t c1 = vaddq_s16(b1, b3);
-    const int16x8_t c2 = vsubq_s16(b0, b2);
-    const int16x8_t c3 = vsubq_s16(b1, b3);
+    const int16x8_t c00 = vaddq_s16(b00, b02);
+    const int16x8_t c01 = vaddq_s16(b01, b03);
+    const int16x8_t c02 = vsubq_s16(b00, b02);
+    const int16x8_t c03 = vsubq_s16(b01, b03);
 
-    store_s16q_to_tran_low_offset_4(coeff + 0, c0);
-    store_s16q_to_tran_low_offset_4(coeff + 64, c1);
-    store_s16q_to_tran_low_offset_4(coeff + 128, c2);
-    store_s16q_to_tran_low_offset_4(coeff + 192, c3);
+    const int16x8_t a10 = load_tran_low_to_s16q(coeff + 8 + 0);
+    const int16x8_t a11 = load_tran_low_to_s16q(coeff + 8 + 64);
+    const int16x8_t a12 = load_tran_low_to_s16q(coeff + 8 + 128);
+    const int16x8_t a13 = load_tran_low_to_s16q(coeff + 8 + 192);
 
-    t_coeff += 8;
-    coeff += (4 + (((i >> 3) & 1) << 3));
+    const int16x8_t b10 = vhaddq_s16(a10, a11);
+    const int16x8_t b11 = vhsubq_s16(a10, a11);
+    const int16x8_t b12 = vhaddq_s16(a12, a13);
+    const int16x8_t b13 = vhsubq_s16(a12, a13);
+
+    const int16x8_t c10 = vaddq_s16(b10, b12);
+    const int16x8_t c11 = vaddq_s16(b11, b13);
+    const int16x8_t c12 = vsubq_s16(b10, b12);
+    const int16x8_t c13 = vsubq_s16(b11, b13);
+
+    store_s16_to_tran_low(coeff + 0 + 0, vget_low_s16(c00));
+    store_s16_to_tran_low(coeff + 0 + 4, vget_low_s16(c10));
+    store_s16_to_tran_low(coeff + 0 + 8, vget_high_s16(c00));
+    store_s16_to_tran_low(coeff + 0 + 12, vget_high_s16(c10));
+
+    store_s16_to_tran_low(coeff + 64 + 0, vget_low_s16(c01));
+    store_s16_to_tran_low(coeff + 64 + 4, vget_low_s16(c11));
+    store_s16_to_tran_low(coeff + 64 + 8, vget_high_s16(c01));
+    store_s16_to_tran_low(coeff + 64 + 12, vget_high_s16(c11));
+
+    store_s16_to_tran_low(coeff + 128 + 0, vget_low_s16(c02));
+    store_s16_to_tran_low(coeff + 128 + 4, vget_low_s16(c12));
+    store_s16_to_tran_low(coeff + 128 + 8, vget_high_s16(c02));
+    store_s16_to_tran_low(coeff + 128 + 12, vget_high_s16(c12));
+
+    store_s16_to_tran_low(coeff + 192 + 0, vget_low_s16(c03));
+    store_s16_to_tran_low(coeff + 192 + 4, vget_low_s16(c13));
+    store_s16_to_tran_low(coeff + 192 + 8, vget_high_s16(c03));
+    store_s16_to_tran_low(coeff + 192 + 12, vget_high_s16(c13));
+
+    coeff += 16;
   }
 }
 
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 4d52407..70e88c8 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -609,19 +609,6 @@
   vst1q_s32(buf + 4, v1);
 }
 
-// Stores the second result at an offset of 8 (instead of 4) to match the output
-// with that of C implementation and the function is similar to
-// store_s16q_to_tran_low(). The offset in the function name signifies that
-// pointer should be incremented by at least 4 in the calling function after
-// store_s16q_to_tran_low_offset_4() call.
-static INLINE void store_s16q_to_tran_low_offset_4(tran_low_t *buf,
-                                                   const int16x8_t a) {
-  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
-  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
-  vst1q_s32(buf, v0);
-  vst1q_s32(buf + 8, v1);
-}
-
 static INLINE void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) {
   const int32x4_t v0 = vmovl_s16(a);
   vst1q_s32(buf, v0);