Add 4-tap specialisation to av1_convolve_y_sr_neon

Add specialised path for 4-tap filters in av1_convolve_y_sr_neon. This
gives between 20% and 40% uplift compared to using the 6-tap path.

Change-Id: I2769e58db2ab1bbfd4e69c03b74d75bd3b920ee7
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index c86215e..bd11b7c 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -460,6 +460,103 @@
   }
 }
 
+// Compute eight output pixels of a 4-tap vertical convolution.
+// s0..s3 hold four consecutive source rows widened to 16-bit; filter holds
+// the four taps, already halved by the caller. The sum is rounded, narrowed
+// and saturated to unsigned 8-bit on return.
+static INLINE uint8x8_t convolve4_8_y(const int16x8_t s0, const int16x8_t s1,
+                                      const int16x8_t s2, const int16x8_t s3,
+                                      const int16x4_t filter) {
+  int16x8_t sum = vmulq_lane_s16(s0, filter, 0);
+  sum = vmlaq_lane_s16(sum, s1, filter, 1);
+  sum = vmlaq_lane_s16(sum, s2, filter, 2);
+  sum = vmlaq_lane_s16(sum, s3, filter, 3);
+
+  // We halved the filter values so -1 from right shift.
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
+
+// 4-tap specialisation of the vertical single-reference convolution.
+// src points at the first row the filter reads (the caller has already
+// subtracted the vertical offset); filter_y is the full 8-entry filter
+// array whose four non-zero taps sit at indices 2..5.
+// NOTE(review): the loop structure assumes h is a multiple of 4, and w is
+// either 4 or a multiple of 8 — TODO confirm this matches all AV1 callers.
+static INLINE void convolve_y_sr_4tap_neon(const uint8_t *src,
+                                           const int src_stride, uint8_t *dst,
+                                           const int dst_stride, int w, int h,
+                                           const int16_t *filter_y) {
+  // All filter values are even, halve to reduce intermediate precision
+  // requirements.
+  const int16x4_t filter = vshr_n_s16(vld1_s16(filter_y + 2), 1);
+
+  if (w == 4) {
+    // Pack two 4-pixel rows into each 8-byte vector (tNM holds rows N and
+    // M), so one convolve4_8_y call produces two output rows at once.
+    uint8x8_t t01 = load_unaligned_u8(src + 0 * src_stride, src_stride);
+    uint8x8_t t12 = load_unaligned_u8(src + 1 * src_stride, src_stride);
+
+    int16x8_t s01 = vreinterpretq_s16_u16(vmovl_u8(t01));
+    int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
+
+    src += 2 * src_stride;
+
+    do {
+      uint8x8_t t23 = load_unaligned_u8(src + 0 * src_stride, src_stride);
+      uint8x8_t t34 = load_unaligned_u8(src + 1 * src_stride, src_stride);
+      uint8x8_t t45 = load_unaligned_u8(src + 2 * src_stride, src_stride);
+      uint8x8_t t56 = load_unaligned_u8(src + 3 * src_stride, src_stride);
+
+      int16x8_t s23 = vreinterpretq_s16_u16(vmovl_u8(t23));
+      int16x8_t s34 = vreinterpretq_s16_u16(vmovl_u8(t34));
+      int16x8_t s45 = vreinterpretq_s16_u16(vmovl_u8(t45));
+      int16x8_t s56 = vreinterpretq_s16_u16(vmovl_u8(t56));
+
+      // d01 covers output rows 0/1 (taps on rows 0..3 / 1..4); d23 covers
+      // output rows 2/3 (taps on rows 2..5 / 3..6).
+      uint8x8_t d01 = convolve4_8_y(s01, s12, s23, s34, filter);
+      uint8x8_t d23 = convolve4_8_y(s23, s34, s45, s56, filter);
+
+      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+      // Slide the window: rows 4/5 and 5/6 become rows 0/1 and 1/2 of the
+      // next iteration, so only four new row-pairs are loaded per pass.
+      s01 = s45;
+      s12 = s56;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    // Wide path: process the block in 8-pixel columns, 4 rows at a time.
+    do {
+      uint8x8_t t0, t1, t2;
+      load_u8_8x3(src, src_stride, &t0, &t1, &t2);
+
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+
+      int height = h;
+      const uint8_t *s = src + 3 * src_stride;
+      uint8_t *d = dst;
+
+      do {
+        uint8x8_t t3;
+        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+
+        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+        uint8x8_t d0 = convolve4_8_y(s0, s1, s2, s3, filter);
+        uint8x8_t d1 = convolve4_8_y(s1, s2, s3, s4, filter);
+        uint8x8_t d2 = convolve4_8_y(s2, s3, s4, s5, filter);
+        uint8x8_t d3 = convolve4_8_y(s3, s4, s5, s6, filter);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        // Keep the last three loaded rows as the head of the next window.
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
 static INLINE int16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1,
                                       const int16x4_t s2, const int16x4_t s3,
                                       const int16x4_t s4, const int16x4_t s5,
@@ -1033,7 +1130,7 @@
   }
 
   const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
-  const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps;
+  const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
   const int vert_offset = clamped_y_taps / 2 - 1;
 
   src -= vert_offset * src_stride;
@@ -1050,7 +1147,10 @@
   // Filter values are even so halve to reduce precision requirements.
   const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1);
 
-  if (y_filter_taps < 8) {
+  if (y_filter_taps <= 4) {
+    convolve_y_sr_4tap_neon(src, src_stride, dst, dst_stride, w, h,
+                            y_filter_ptr);
+  } else if (y_filter_taps == 6) {
     convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);
   } else {
     convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter);