Add 2-tap path for aom_highbd_convolve8_vert_neon
Add a specialized Neon implementation for 2-tap filters and use it
instead of the 4-tap implementation in both Neon and SVE Neon versions
of aom_highbd_convolve8_vert. This provides between 40% and 70% uplift
over the 4-tap implementation.
Change-Id: I0526e13599d8519f06c322e4317aeb943ebfd795
diff --git a/aom_dsp/arm/highbd_convolve8_neon.c b/aom_dsp/arm/highbd_convolve8_neon.c
index a433b95..99ad0ba 100644
--- a/aom_dsp/arm/highbd_convolve8_neon.c
+++ b/aom_dsp/arm/highbd_convolve8_neon.c
@@ -366,7 +366,12 @@
src -= (SUBPEL_TAPS / 2 - 1) * src_stride;
- if (get_filter_taps_convolve8(filter_y) <= 4) {
+ const int filter_taps = get_filter_taps_convolve8(filter_y);
+
+ if (filter_taps == 2) {
+ highbd_convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst,
+ dst_stride, filter_y, w, h, bd);
+ } else if (filter_taps == 4) {
highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst,
dst_stride, filter_y, w, h, bd);
} else {
diff --git a/aom_dsp/arm/highbd_convolve8_neon.h b/aom_dsp/arm/highbd_convolve8_neon.h
index 0777378..b87b4ba 100644
--- a/aom_dsp/arm/highbd_convolve8_neon.h
+++ b/aom_dsp/arm/highbd_convolve8_neon.h
@@ -199,4 +199,81 @@
}
}
+static INLINE void highbd_convolve8_vert_2tap_neon(
+ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) {
+ // Bilinear filter values are all positive and multiples of 8. Divide by 8 to
+ // reduce intermediate precision requirements and allow the use of non
+ // widening multiply.
+ const uint16x8_t f0 = vdupq_n_u16((uint16_t)x_filter_ptr[3] / 8);
+ const uint16x8_t f1 = vdupq_n_u16((uint16_t)x_filter_ptr[4] / 8);
+
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ if (w == 4) {
+ do {
+ uint16x8_t s0 =
+ load_unaligned_u16_4x2(src_ptr + 0 * src_stride, (int)src_stride);
+ uint16x8_t s1 =
+ load_unaligned_u16_4x2(src_ptr + 1 * src_stride, (int)src_stride);
+ uint16x8_t s2 =
+ load_unaligned_u16_4x2(src_ptr + 2 * src_stride, (int)src_stride);
+ uint16x8_t s3 =
+ load_unaligned_u16_4x2(src_ptr + 3 * src_stride, (int)src_stride);
+
+ uint16x8_t sum01 = vmulq_u16(s0, f0);
+ sum01 = vmlaq_u16(sum01, s1, f1);
+ uint16x8_t sum23 = vmulq_u16(s2, f0);
+ sum23 = vmlaq_u16(sum23, s3, f1);
+
+ // We divided filter taps by 8 so subtract 3 from right shift.
+ sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
+ sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);
+
+ sum01 = vminq_u16(sum01, max);
+ sum23 = vminq_u16(sum23, max);
+
+ store_u16x4_strided_x2(dst_ptr + 0 * dst_stride, (int)dst_stride, sum01);
+ store_u16x4_strided_x2(dst_ptr + 2 * dst_stride, (int)dst_stride, sum23);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ do {
+ int width = w;
+ const uint16_t *s = src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ uint16x8_t s0, s1, s2;
+ load_u16_8x3(s, src_stride, &s0, &s1, &s2);
+
+ uint16x8_t sum01 = vmulq_u16(s0, f0);
+ sum01 = vmlaq_u16(sum01, s1, f1);
+ uint16x8_t sum23 = vmulq_u16(s1, f0);
+ sum23 = vmlaq_u16(sum23, s2, f1);
+
+ // We divided filter taps by 8 so subtract 3 from right shift.
+ sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
+ sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);
+
+ sum01 = vminq_u16(sum01, max);
+ sum23 = vminq_u16(sum23, max);
+
+ vst1q_u16(d + 0 * dst_stride, sum01);
+ vst1q_u16(d + 1 * dst_stride, sum23);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src_ptr += 2 * src_stride;
+ dst_ptr += 2 * dst_stride;
+ h -= 2;
+ } while (h > 0);
+ }
+}
+
#endif // AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
diff --git a/aom_dsp/arm/highbd_convolve8_sve.c b/aom_dsp/arm/highbd_convolve8_sve.c
index 789b38a..f519395 100644
--- a/aom_dsp/arm/highbd_convolve8_sve.c
+++ b/aom_dsp/arm/highbd_convolve8_sve.c
@@ -556,7 +556,12 @@
src -= (SUBPEL_TAPS / 2 - 1) * src_stride;
- if (get_filter_taps_convolve8(filter_y) <= 4) {
+ const int filter_taps = get_filter_taps_convolve8(filter_y);
+
+ if (filter_taps == 2) {
+ highbd_convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst,
+ dst_stride, filter_y, width, height, bd);
+ } else if (filter_taps == 4) {
highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst,
dst_stride, filter_y, width, height, bd);
} else {
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index ba18700..b1f6ebe 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -231,6 +231,16 @@
*s1 = vld1q_u16(s);
}
+static INLINE void load_u16_8x3(const uint16_t *s, const ptrdiff_t p,
+ uint16x8_t *const s0, uint16x8_t *const s1,
+ uint16x8_t *const s2) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+}
+
static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
uint16x8_t *const s0, uint16x8_t *const s1,
uint16x8_t *const s2, uint16x8_t *const s3) {