Add 2-tap path for aom_highbd_convolve8_vert_neon Add a specialized Neon implementation for 2-tap filters and use it instead of the 4-tap implementation in both Neon and SVE Neon versions of aom_highbd_convolve8_vert. This provides between 40% and 70% uplift over the 4-tap implementation. Change-Id: I0526e13599d8519f06c322e4317aeb943ebfd795

commit: 6e20f87defacf025c68ee81c2c825ebc7e3936c0 [log] [tgz]
author: Salome Thirot <salome.thirot@arm.com> Fri Apr 19 14:16:39 2024 +0100
committer: Jonathan Wright <jonathan.wright@arm.com> Mon Apr 22 17:00:01 2024 +0000
tree: fbf521d431dabbdc54c20a5a79809f2673dd817a
parent: 8e161f9cb1bfa330b34268b2bc2f2c000a09ec3b [diff]
diff --git a/aom_dsp/arm/highbd_convolve8_neon.c b/aom_dsp/arm/highbd_convolve8_neon.c
index a433b95..99ad0ba 100644
--- a/aom_dsp/arm/highbd_convolve8_neon.c
+++ b/aom_dsp/arm/highbd_convolve8_neon.c

@@ -366,7 +366,12 @@
 
     src -= (SUBPEL_TAPS / 2 - 1) * src_stride;
 
-    if (get_filter_taps_convolve8(filter_y) <= 4) {
+    const int filter_taps = get_filter_taps_convolve8(filter_y);
+
+    if (filter_taps == 2) {
+      highbd_convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst,
+                                      dst_stride, filter_y, w, h, bd);
+    } else if (filter_taps == 4) {
       highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst,
                                       dst_stride, filter_y, w, h, bd);
     } else {

diff --git a/aom_dsp/arm/highbd_convolve8_neon.h b/aom_dsp/arm/highbd_convolve8_neon.h
index 0777378..b87b4ba 100644
--- a/aom_dsp/arm/highbd_convolve8_neon.h
+++ b/aom_dsp/arm/highbd_convolve8_neon.h

@@ -199,4 +199,81 @@
   }
 }
 
+static INLINE void highbd_convolve8_vert_2tap_neon(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) {
+  // Bilinear filter values are all positive and multiples of 8. Divide by 8 to
+  // reduce intermediate precision requirements and allow the use of non
+  // widening multiply.
+  const uint16x8_t f0 = vdupq_n_u16((uint16_t)x_filter_ptr[3] / 8);
+  const uint16x8_t f1 = vdupq_n_u16((uint16_t)x_filter_ptr[4] / 8);
+
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+  if (w == 4) {
+    do {
+      uint16x8_t s0 =
+          load_unaligned_u16_4x2(src_ptr + 0 * src_stride, (int)src_stride);
+      uint16x8_t s1 =
+          load_unaligned_u16_4x2(src_ptr + 1 * src_stride, (int)src_stride);
+      uint16x8_t s2 =
+          load_unaligned_u16_4x2(src_ptr + 2 * src_stride, (int)src_stride);
+      uint16x8_t s3 =
+          load_unaligned_u16_4x2(src_ptr + 3 * src_stride, (int)src_stride);
+
+      uint16x8_t sum01 = vmulq_u16(s0, f0);
+      sum01 = vmlaq_u16(sum01, s1, f1);
+      uint16x8_t sum23 = vmulq_u16(s2, f0);
+      sum23 = vmlaq_u16(sum23, s3, f1);
+
+      // We divided filter taps by 8 so subtract 3 from right shift.
+      sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
+      sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);
+
+      sum01 = vminq_u16(sum01, max);
+      sum23 = vminq_u16(sum23, max);
+
+      store_u16x4_strided_x2(dst_ptr + 0 * dst_stride, (int)dst_stride, sum01);
+      store_u16x4_strided_x2(dst_ptr + 2 * dst_stride, (int)dst_stride, sum23);
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    do {
+      int width = w;
+      const uint16_t *s = src_ptr;
+      uint16_t *d = dst_ptr;
+
+      do {
+        uint16x8_t s0, s1, s2;
+        load_u16_8x3(s, src_stride, &s0, &s1, &s2);
+
+        uint16x8_t sum01 = vmulq_u16(s0, f0);
+        sum01 = vmlaq_u16(sum01, s1, f1);
+        uint16x8_t sum23 = vmulq_u16(s1, f0);
+        sum23 = vmlaq_u16(sum23, s2, f1);
+
+        // We divided filter taps by 8 so subtract 3 from right shift.
+        sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
+        sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);
+
+        sum01 = vminq_u16(sum01, max);
+        sum23 = vminq_u16(sum23, max);
+
+        vst1q_u16(d + 0 * dst_stride, sum01);
+        vst1q_u16(d + 1 * dst_stride, sum23);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += 2 * src_stride;
+      dst_ptr += 2 * dst_stride;
+      h -= 2;
+    } while (h > 0);
+  }
+}
+
 #endif  // AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_

diff --git a/aom_dsp/arm/highbd_convolve8_sve.c b/aom_dsp/arm/highbd_convolve8_sve.c
index 789b38a..f519395 100644
--- a/aom_dsp/arm/highbd_convolve8_sve.c
+++ b/aom_dsp/arm/highbd_convolve8_sve.c

@@ -556,7 +556,12 @@
 
   src -= (SUBPEL_TAPS / 2 - 1) * src_stride;
 
-  if (get_filter_taps_convolve8(filter_y) <= 4) {
+  const int filter_taps = get_filter_taps_convolve8(filter_y);
+
+  if (filter_taps == 2) {
+    highbd_convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst,
+                                    dst_stride, filter_y, width, height, bd);
+  } else if (filter_taps == 4) {
     highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst,
                                     dst_stride, filter_y, width, height, bd);
   } else {

diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index ba18700..b1f6ebe 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h

@@ -231,6 +231,16 @@
   *s1 = vld1q_u16(s);
 }
 
+static INLINE void load_u16_8x3(const uint16_t *s, const ptrdiff_t p,
+                                uint16x8_t *const s0, uint16x8_t *const s1,
+                                uint16x8_t *const s2) {
+  *s0 = vld1q_u16(s);
+  s += p;
+  *s1 = vld1q_u16(s);
+  s += p;
+  *s2 = vld1q_u16(s);
+}
+
 static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                 uint16x8_t *const s0, uint16x8_t *const s1,
                                 uint16x8_t *const s2, uint16x8_t *const s3) {
commit	6e20f87defacf025c68ee81c2c825ebc7e3936c0	[log] [tgz]
author	Salome Thirot <salome.thirot@arm.com>	Fri Apr 19 14:16:39 2024 +0100
committer	Jonathan Wright <jonathan.wright@arm.com>	Mon Apr 22 17:00:01 2024 +0000
tree	fbf521d431dabbdc54c20a5a79809f2673dd817a
parent	8e161f9cb1bfa330b34268b2bc2f2c000a09ec3b [diff]