Speed improvement in av1_convolve_2d_sr_neon for 32-bit
Observed 9% gains w.r.t. existing neon code in unit test.
Change-Id: I8eeda087f65ddae1bf69e118fa615f350fb114d5
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index ca6a45e..b2e45c0 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -960,10 +960,17 @@
width = w;
if (width <= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint16x4_t d0, d1, d2, d3;
- uint16x8_t dd0, dd1;
- uint8x8_t d01, d23;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint16x4_t d0;
+ uint16x8_t dd0;
+ uint8x8_t d01;
+
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10;
+ uint16x4_t d1, d2, d3;
+ uint16x8_t dd1;
+ uint8x8_t d23;
+#endif
d_u8 = dst_u8_ptr;
v_s = v_src_ptr;
@@ -981,6 +988,7 @@
v_s += (7 * im_stride);
do {
+#if defined(__aarch64__)
load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
v_s += (im_stride << 2);
@@ -1058,11 +1066,48 @@
s5 = s9;
s6 = s10;
height -= 4;
+#else
+ s7 = vld1_s16(v_s);
+ v_s += im_stride;
+
+ __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+ d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+
+ dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
+ d01 = vqmovn_u16(dd0);
+
+ if (w == 4) {
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 0); // 00 01 02 03
+ d_u8 += dst_stride;
+
+ } else if (w == 2) {
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 0); // 00 01
+ d_u8 += dst_stride;
+ }
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+#endif
} while (height > 0);
} else {
// if width is a multiple of 8 & height is a multiple of 4
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
- uint8x8_t res0, res1, res2, res3;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint8x8_t res0;
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10;
+ uint8x8_t res1, res2, res3;
+#endif
do {
__builtin_prefetch(v_src_ptr + 0 * im_stride);
@@ -1082,6 +1127,7 @@
height = h;
do {
+#if defined(__aarch64__)
load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
v_s += (im_stride << 2);
@@ -1126,6 +1172,28 @@
s5 = s9;
s6 = s10;
height -= 4;
+#else
+ s7 = vld1q_s16(v_s);
+ v_s += im_stride;
+
+ __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+ res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+
+ vst1_u8(d_u8, res0);
+ d_u8 += dst_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+#endif
} while (height > 0);
v_src_ptr += 8;
dst_u8_ptr += 8;