Refactor and optimize aom_hadamard_16x16_neon
Unroll the loop to remove the need for an awkward "two steps
forward, one step back" memory access pattern. This also removes the
need for a large scratch buffer on the stack, so remove that
too.
Change-Id: I20dd066efaf20f917796dc680bc1749994eafaa9
diff --git a/aom_dsp/arm/hadamard_neon.c b/aom_dsp/arm/hadamard_neon.c
index ec9fc2c..eda5db0 100644
--- a/aom_dsp/arm/hadamard_neon.c
+++ b/aom_dsp/arm/hadamard_neon.c
@@ -186,45 +186,68 @@
void aom_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
tran_low_t *coeff) {
- DECLARE_ALIGNED(32, tran_low_t, temp_coeff[16 * 16]);
/* Rearrange 16x16 to 8x32 and remove stride.
* Top left first. */
- aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride,
- temp_coeff + 0);
+ aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
/* Top right. */
- aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride,
- temp_coeff + 64);
+ aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
/* Bottom left. */
- aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride,
- temp_coeff + 128);
+ aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
/* Bottom right. */
- aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride,
- temp_coeff + 192);
+ aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
- tran_low_t *t_coeff = temp_coeff;
- for (int i = 0; i < 64; i += 8) {
- const int16x8_t a0 = load_tran_low_to_s16q(t_coeff + 0);
- const int16x8_t a1 = load_tran_low_to_s16q(t_coeff + 64);
- const int16x8_t a2 = load_tran_low_to_s16q(t_coeff + 128);
- const int16x8_t a3 = load_tran_low_to_s16q(t_coeff + 192);
+ for (int i = 0; i < 64; i += 16) {
+ const int16x8_t a00 = load_tran_low_to_s16q(coeff + 0);
+ const int16x8_t a01 = load_tran_low_to_s16q(coeff + 64);
+ const int16x8_t a02 = load_tran_low_to_s16q(coeff + 128);
+ const int16x8_t a03 = load_tran_low_to_s16q(coeff + 192);
- const int16x8_t b0 = vhaddq_s16(a0, a1);
- const int16x8_t b1 = vhsubq_s16(a0, a1);
- const int16x8_t b2 = vhaddq_s16(a2, a3);
- const int16x8_t b3 = vhsubq_s16(a2, a3);
+ const int16x8_t b00 = vhaddq_s16(a00, a01);
+ const int16x8_t b01 = vhsubq_s16(a00, a01);
+ const int16x8_t b02 = vhaddq_s16(a02, a03);
+ const int16x8_t b03 = vhsubq_s16(a02, a03);
- const int16x8_t c0 = vaddq_s16(b0, b2);
- const int16x8_t c1 = vaddq_s16(b1, b3);
- const int16x8_t c2 = vsubq_s16(b0, b2);
- const int16x8_t c3 = vsubq_s16(b1, b3);
+ const int16x8_t c00 = vaddq_s16(b00, b02);
+ const int16x8_t c01 = vaddq_s16(b01, b03);
+ const int16x8_t c02 = vsubq_s16(b00, b02);
+ const int16x8_t c03 = vsubq_s16(b01, b03);
- store_s16q_to_tran_low_offset_4(coeff + 0, c0);
- store_s16q_to_tran_low_offset_4(coeff + 64, c1);
- store_s16q_to_tran_low_offset_4(coeff + 128, c2);
- store_s16q_to_tran_low_offset_4(coeff + 192, c3);
+ const int16x8_t a10 = load_tran_low_to_s16q(coeff + 8 + 0);
+ const int16x8_t a11 = load_tran_low_to_s16q(coeff + 8 + 64);
+ const int16x8_t a12 = load_tran_low_to_s16q(coeff + 8 + 128);
+ const int16x8_t a13 = load_tran_low_to_s16q(coeff + 8 + 192);
- t_coeff += 8;
- coeff += (4 + (((i >> 3) & 1) << 3));
+ const int16x8_t b10 = vhaddq_s16(a10, a11);
+ const int16x8_t b11 = vhsubq_s16(a10, a11);
+ const int16x8_t b12 = vhaddq_s16(a12, a13);
+ const int16x8_t b13 = vhsubq_s16(a12, a13);
+
+ const int16x8_t c10 = vaddq_s16(b10, b12);
+ const int16x8_t c11 = vaddq_s16(b11, b13);
+ const int16x8_t c12 = vsubq_s16(b10, b12);
+ const int16x8_t c13 = vsubq_s16(b11, b13);
+
+ store_s16_to_tran_low(coeff + 0 + 0, vget_low_s16(c00));
+ store_s16_to_tran_low(coeff + 0 + 4, vget_low_s16(c10));
+ store_s16_to_tran_low(coeff + 0 + 8, vget_high_s16(c00));
+ store_s16_to_tran_low(coeff + 0 + 12, vget_high_s16(c10));
+
+ store_s16_to_tran_low(coeff + 64 + 0, vget_low_s16(c01));
+ store_s16_to_tran_low(coeff + 64 + 4, vget_low_s16(c11));
+ store_s16_to_tran_low(coeff + 64 + 8, vget_high_s16(c01));
+ store_s16_to_tran_low(coeff + 64 + 12, vget_high_s16(c11));
+
+ store_s16_to_tran_low(coeff + 128 + 0, vget_low_s16(c02));
+ store_s16_to_tran_low(coeff + 128 + 4, vget_low_s16(c12));
+ store_s16_to_tran_low(coeff + 128 + 8, vget_high_s16(c02));
+ store_s16_to_tran_low(coeff + 128 + 12, vget_high_s16(c12));
+
+ store_s16_to_tran_low(coeff + 192 + 0, vget_low_s16(c03));
+ store_s16_to_tran_low(coeff + 192 + 4, vget_low_s16(c13));
+ store_s16_to_tran_low(coeff + 192 + 8, vget_high_s16(c03));
+ store_s16_to_tran_low(coeff + 192 + 12, vget_high_s16(c13));
+
+ coeff += 16;
}
}
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 4d52407..70e88c8 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -609,19 +609,6 @@
vst1q_s32(buf + 4, v1);
}
-// Stores the second result at an offset of 8 (instead of 4) to match the output
-// with that of C implementation and the function is similar to
-// store_s16q_to_tran_low(). The offset in the function name signifies that
-// pointer should be incremented by at least 4 in the calling function after
-// store_s16q_to_tran_low_offset_4() call.
-static INLINE void store_s16q_to_tran_low_offset_4(tran_low_t *buf,
- const int16x8_t a) {
- const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
- const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
- vst1q_s32(buf, v0);
- vst1q_s32(buf + 8, v1);
-}
-
static INLINE void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) {
const int32x4_t v0 = vmovl_s16(a);
vst1q_s32(buf, v0);