Add 2-tap path for aom_highbd_convolve8_vert_neon

Add a specialized Neon implementation for 2-tap filters and use it
instead of the 4-tap implementation in both Neon and SVE Neon versions
of aom_highbd_convolve8_vert. This provides between 40% and 70% uplift
over the 4-tap implementation.

Change-Id: I0526e13599d8519f06c322e4317aeb943ebfd795
diff --git a/aom_dsp/arm/highbd_convolve8_neon.c b/aom_dsp/arm/highbd_convolve8_neon.c
index a433b95..99ad0ba 100644
--- a/aom_dsp/arm/highbd_convolve8_neon.c
+++ b/aom_dsp/arm/highbd_convolve8_neon.c
@@ -366,7 +366,12 @@
 
     src -= (SUBPEL_TAPS / 2 - 1) * src_stride;
 
-    if (get_filter_taps_convolve8(filter_y) <= 4) {
+    const int filter_taps = get_filter_taps_convolve8(filter_y);
+
+    if (filter_taps == 2) {
+      highbd_convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst,
+                                      dst_stride, filter_y, w, h, bd);
+    } else if (filter_taps == 4) {
       highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst,
                                       dst_stride, filter_y, w, h, bd);
     } else {
diff --git a/aom_dsp/arm/highbd_convolve8_neon.h b/aom_dsp/arm/highbd_convolve8_neon.h
index 0777378..b87b4ba 100644
--- a/aom_dsp/arm/highbd_convolve8_neon.h
+++ b/aom_dsp/arm/highbd_convolve8_neon.h
@@ -199,4 +199,81 @@
   }
 }
 
+static INLINE void highbd_convolve8_vert_2tap_neon(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) {
+  // Bilinear filter values are all positive and multiples of 8. Divide by 8 to
+  // reduce intermediate precision requirements and allow the use of non
+  // widening multiply.
+  const uint16x8_t f0 = vdupq_n_u16((uint16_t)x_filter_ptr[3] / 8);
+  const uint16x8_t f1 = vdupq_n_u16((uint16_t)x_filter_ptr[4] / 8);
+
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+  if (w == 4) {
+    do {
+      uint16x8_t s0 =
+          load_unaligned_u16_4x2(src_ptr + 0 * src_stride, (int)src_stride);
+      uint16x8_t s1 =
+          load_unaligned_u16_4x2(src_ptr + 1 * src_stride, (int)src_stride);
+      uint16x8_t s2 =
+          load_unaligned_u16_4x2(src_ptr + 2 * src_stride, (int)src_stride);
+      uint16x8_t s3 =
+          load_unaligned_u16_4x2(src_ptr + 3 * src_stride, (int)src_stride);
+
+      uint16x8_t sum01 = vmulq_u16(s0, f0);
+      sum01 = vmlaq_u16(sum01, s1, f1);
+      uint16x8_t sum23 = vmulq_u16(s2, f0);
+      sum23 = vmlaq_u16(sum23, s3, f1);
+
+      // We divided filter taps by 8 so subtract 3 from right shift.
+      sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
+      sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);
+
+      sum01 = vminq_u16(sum01, max);
+      sum23 = vminq_u16(sum23, max);
+
+      store_u16x4_strided_x2(dst_ptr + 0 * dst_stride, (int)dst_stride, sum01);
+      store_u16x4_strided_x2(dst_ptr + 2 * dst_stride, (int)dst_stride, sum23);
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    do {
+      int width = w;
+      const uint16_t *s = src_ptr;
+      uint16_t *d = dst_ptr;
+
+      do {
+        uint16x8_t s0, s1, s2;
+        load_u16_8x3(s, src_stride, &s0, &s1, &s2);
+
+        uint16x8_t sum01 = vmulq_u16(s0, f0);
+        sum01 = vmlaq_u16(sum01, s1, f1);
+        uint16x8_t sum23 = vmulq_u16(s1, f0);
+        sum23 = vmlaq_u16(sum23, s2, f1);
+
+        // We divided filter taps by 8 so subtract 3 from right shift.
+        sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
+        sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);
+
+        sum01 = vminq_u16(sum01, max);
+        sum23 = vminq_u16(sum23, max);
+
+        vst1q_u16(d + 0 * dst_stride, sum01);
+        vst1q_u16(d + 1 * dst_stride, sum23);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += 2 * src_stride;
+      dst_ptr += 2 * dst_stride;
+      h -= 2;
+    } while (h > 0);
+  }
+}
+
 #endif  // AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
diff --git a/aom_dsp/arm/highbd_convolve8_sve.c b/aom_dsp/arm/highbd_convolve8_sve.c
index 789b38a..f519395 100644
--- a/aom_dsp/arm/highbd_convolve8_sve.c
+++ b/aom_dsp/arm/highbd_convolve8_sve.c
@@ -556,7 +556,12 @@
 
   src -= (SUBPEL_TAPS / 2 - 1) * src_stride;
 
-  if (get_filter_taps_convolve8(filter_y) <= 4) {
+  const int filter_taps = get_filter_taps_convolve8(filter_y);
+
+  if (filter_taps == 2) {
+    highbd_convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst,
+                                    dst_stride, filter_y, width, height, bd);
+  } else if (filter_taps == 4) {
     highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst,
                                     dst_stride, filter_y, width, height, bd);
   } else {
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index ba18700..b1f6ebe 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -231,6 +231,16 @@
   *s1 = vld1q_u16(s);
 }
 
+static INLINE void load_u16_8x3(const uint16_t *s, const ptrdiff_t p,
+                                uint16x8_t *const s0, uint16x8_t *const s1,
+                                uint16x8_t *const s2) {
+  *s0 = vld1q_u16(s);
+  s += p;
+  *s1 = vld1q_u16(s);
+  s += p;
+  *s2 = vld1q_u16(s);
+}
+
 static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3) {