Add 4-tap specialization to aom_convolve8_vert_neon

Add a specialized Neon path for 4-tap filters in
aom_convolve8_vert_neon, and use it for the neon_dotprod and neon_i8mm
versions as well. This gives between 40% and 50% uplift compared to
using the 8-tap path.

Change-Id: I93e48a62d851af5ff0c6f8015cb91f687c970802
diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c
index 5665b5e..43aef54 100644
--- a/aom_dsp/arm/aom_convolve8_neon.c
+++ b/aom_dsp/arm/aom_convolve8_neon.c
@@ -243,18 +243,6 @@
   return sum;
 }
 
-static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1,
-                                    const int16x8_t s2, const int16x8_t s3,
-                                    const int16x4_t filter) {
-  int16x8_t sum = vmulq_lane_s16(s0, filter, 0);
-  sum = vmlaq_lane_s16(sum, s1, filter, 1);
-  sum = vmlaq_lane_s16(sum, s2, filter, 2);
-  sum = vmlaq_lane_s16(sum, s3, filter, 3);
-
-  // We halved the filter values so -1 from right shift.
-  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
-}
-
 static INLINE void convolve8_horiz_4tap_neon(const uint8_t *src,
                                              ptrdiff_t src_stride, uint8_t *dst,
                                              ptrdiff_t dst_stride,
@@ -368,22 +356,13 @@
   }
 }
 
-void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4, int w,
-                             int h) {
+static INLINE void convolve8_vert_8tap_neon(const uint8_t *src,
+                                            ptrdiff_t src_stride, uint8_t *dst,
+                                            ptrdiff_t dst_stride,
+                                            const int16_t *filter_y, int w,
+                                            int h) {
   const int16x8_t filter = vld1q_s16(filter_y);
 
-  assert((intptr_t)dst % 4 == 0);
-  assert(dst_stride % 4 == 0);
-
-  (void)filter_x;
-  (void)x_step_q4;
-  (void)y_step_q4;
-
-  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
-
   if (w == 4) {
     uint8x8_t t0, t1, t2, t3, t4, t5, t6;
     load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
@@ -478,3 +457,25 @@
     } while (w != 0);
   }
 }
+
+void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x, int x_step_q4,
+                             const int16_t *filter_y, int y_step_q4, int w,
+                             int h) {
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  (void)filter_x;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+  if (get_filter_taps_convolve8(filter_y) <= 4) {
+    convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride,
+                             filter_y, w, h);
+  } else {
+    convolve8_vert_8tap_neon(src, src_stride, dst, dst_stride, filter_y, w, h);
+  }
+}
diff --git a/aom_dsp/arm/aom_convolve8_neon.h b/aom_dsp/arm/aom_convolve8_neon.h
index 0aebc6d..83fbd0a 100644
--- a/aom_dsp/arm/aom_convolve8_neon.h
+++ b/aom_dsp/arm/aom_convolve8_neon.h
@@ -103,4 +103,102 @@
   }
 }
 
+static INLINE uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1,
+                                    const int16x8_t s2, const int16x8_t s3,
+                                    const int16x4_t filter) {
+  int16x8_t sum = vmulq_lane_s16(s0, filter, 0);
+  sum = vmlaq_lane_s16(sum, s1, filter, 1);
+  sum = vmlaq_lane_s16(sum, s2, filter, 2);
+  sum = vmlaq_lane_s16(sum, s3, filter, 3);
+
+  // We halved the filter values so -1 from right shift.
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
+
+static INLINE void convolve8_vert_4tap_neon(const uint8_t *src,
+                                            ptrdiff_t src_stride, uint8_t *dst,
+                                            ptrdiff_t dst_stride,
+                                            const int16_t *filter_y, int w,
+                                            int h) {
+  // All filter values are even, halve to reduce intermediate precision
+  // requirements.
+  const int16x4_t filter = vshr_n_s16(vld1_s16(filter_y + 2), 1);
+
+  if (w == 4) {
+    uint8x8_t t01 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride);
+    uint8x8_t t12 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride);
+
+    int16x8_t s01 = vreinterpretq_s16_u16(vmovl_u8(t01));
+    int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
+
+    src += 2 * src_stride;
+
+    do {
+      uint8x8_t t23 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride);
+      uint8x8_t t34 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride);
+      uint8x8_t t45 = load_unaligned_u8(src + 2 * src_stride, (int)src_stride);
+      uint8x8_t t56 = load_unaligned_u8(src + 3 * src_stride, (int)src_stride);
+
+      int16x8_t s23 = vreinterpretq_s16_u16(vmovl_u8(t23));
+      int16x8_t s34 = vreinterpretq_s16_u16(vmovl_u8(t34));
+      int16x8_t s45 = vreinterpretq_s16_u16(vmovl_u8(t45));
+      int16x8_t s56 = vreinterpretq_s16_u16(vmovl_u8(t56));
+
+      uint8x8_t d01 = convolve4_8(s01, s12, s23, s34, filter);
+      uint8x8_t d23 = convolve4_8(s23, s34, s45, s56, filter);
+
+      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+      s01 = s45;
+      s12 = s56;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    do {
+      uint8x8_t t0, t1, t2;
+      load_u8_8x3(src, src_stride, &t0, &t1, &t2);
+
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+
+      int height = h;
+      const uint8_t *s = src + 3 * src_stride;
+      uint8_t *d = dst;
+
+      do {
+        uint8x8_t t3;
+        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+
+        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+        uint8x8_t d0 = convolve4_8(s0, s1, s2, s3, filter);
+        uint8x8_t d1 = convolve4_8(s1, s2, s3, s4, filter);
+        uint8x8_t d2 = convolve4_8(s2, s3, s4, s5, filter);
+        uint8x8_t d3 = convolve4_8(s3, s4, s5, s6, filter);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
 #endif  // AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_
diff --git a/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/aom_dsp/arm/aom_convolve8_neon_dotprod.c
index f49d33f..4d47d86 100644
--- a/aom_dsp/arm/aom_convolve8_neon_dotprod.c
+++ b/aom_dsp/arm/aom_convolve8_neon_dotprod.c
@@ -370,24 +370,13 @@
   return vqrshrun_n_s16(sum, FILTER_BITS);
 }
 
-void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
-                                     uint8_t *dst, ptrdiff_t dst_stride,
-                                     const int16_t *filter_x, int x_step_q4,
-                                     const int16_t *filter_y, int y_step_q4,
-                                     int w, int h) {
+static INLINE void convolve8_vert_8tap_neon_dotprod(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) {
   const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
   const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
   int8x16x2_t samples_LUT;
 
-  assert((intptr_t)dst % 4 == 0);
-  assert(dst_stride % 4 == 0);
-
-  (void)filter_x;
-  (void)x_step_q4;
-  (void)y_step_q4;
-
-  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
-
   if (w == 4) {
     uint8x8_t t0, t1, t2, t3, t4, t5, t6;
     load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
@@ -536,3 +525,26 @@
     } while (w != 0);
   }
 }
+
+void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x, int x_step_q4,
+                                     const int16_t *filter_y, int y_step_q4,
+                                     int w, int h) {
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  (void)filter_x;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+  if (get_filter_taps_convolve8(filter_y) <= 4) {
+    convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride,
+                             filter_y, w, h);
+  } else {
+    convolve8_vert_8tap_neon_dotprod(src, src_stride, dst, dst_stride, filter_y,
+                                     w, h);
+  }
+}
diff --git a/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
index 9727639..21a4551 100644
--- a/aom_dsp/arm/aom_convolve8_neon_i8mm.c
+++ b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
@@ -340,24 +340,13 @@
   return vqrshrun_n_s16(sum, FILTER_BITS);
 }
 
-void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int h) {
+static INLINE void convolve8_vert_8tap_neon_i8mm(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) {
   const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
   const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
   uint8x16x2_t samples_LUT;
 
-  assert((intptr_t)dst % 4 == 0);
-  assert(dst_stride % 4 == 0);
-
-  (void)filter_x;
-  (void)x_step_q4;
-  (void)y_step_q4;
-
-  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
-
   if (w == 4) {
     uint8x8_t s0, s1, s2, s3, s4, s5, s6;
     load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
@@ -478,3 +467,26 @@
     } while (w != 0);
   }
 }
+
+void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4, int w,
+                                  int h) {
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  (void)filter_x;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+  if (get_filter_taps_convolve8(filter_y) <= 4) {
+    convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride,
+                             filter_y, w, h);
+  } else {
+    convolve8_vert_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_y, w,
+                                  h);
+  }
+}
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index 32a462a..ba18700 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -174,6 +174,16 @@
   *s3 = vld1_u8(s);
 }
 
+static INLINE void load_u8_8x3(const uint8_t *s, const ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2) {
+  *s0 = vld1_u8(s);
+  s += p;
+  *s1 = vld1_u8(s);
+  s += p;
+  *s2 = vld1_u8(s);
+}
+
 static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
                                 uint16x4_t *const s0, uint16x4_t *const s1,
                                 uint16x4_t *const s2, uint16x4_t *const s3) {