Add merged impl of 6tap av1_convolve_2d_sr_neon_dotprod Merge the horizontal and vertical passes of 2D convolution for 6tap filters, avoiding the use of an intermediate buffer. This gives around 10% uplift over the split implementation. Change-Id: I89546369b9b04d460696f29a09bf2a62a9ea123c
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h index 46aa16e..1aebcf9 100644 --- a/aom_dsp/arm/mem_neon.h +++ b/aom_dsp/arm/mem_neon.h
@@ -1053,6 +1053,21 @@ *s7 = vld1q_u8(s); } +static INLINE void load_u8_16x5(const uint8_t *s, ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3, + uint8x16_t *const s4) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); + s += p; + *s4 = vld1q_u8(s); +} + static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p, uint8x16_t *const s0, uint8x16_t *const s1, uint8x16_t *const s2, uint8x16_t *const s3) {
diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c index 20da298..964270b 100644 --- a/av1/common/arm/convolve_neon_dotprod.c +++ b/av1/common/arm/convolve_neon_dotprod.c
@@ -1276,6 +1276,80 @@ } while (--height != 0); } +static INLINE void convolve_2d_sr_6tap_neon_dotprod( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) { + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + + const int bd = 8; + // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Accumulate into 128 << FILTER_BITS to account for range transform. + // Halve the total because we halved the filter values. + const int32x4_t correction = + vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); + const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1)); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + + uint8x16_t h_s0, h_s1, h_s2, h_s3, h_s4; + load_u8_16x5(s, src_stride, &h_s0, &h_s1, &h_s2, &h_s3, &h_s4); + s += 5 * src_stride; + + int16x8_t v_s0 = convolve8_8_2d_h(h_s0, x_filter, correction, permute_tbl); + int16x8_t v_s1 = convolve8_8_2d_h(h_s1, x_filter, correction, permute_tbl); + int16x8_t v_s2 = convolve8_8_2d_h(h_s2, x_filter, correction, permute_tbl); + int16x8_t v_s3 = convolve8_8_2d_h(h_s3, x_filter, correction, permute_tbl); + int16x8_t v_s4 = convolve8_8_2d_h(h_s4, x_filter, correction, permute_tbl); + + do { + uint8x16_t h_s5, h_s6, h_s7, h_s8; + load_u8_16x4(s, src_stride, &h_s5, &h_s6, &h_s7, &h_s8); + + int16x8_t v_s5 = + convolve8_8_2d_h(h_s5, x_filter, correction, permute_tbl); + int16x8_t v_s6 = + convolve8_8_2d_h(h_s6, x_filter, correction, permute_tbl); + int16x8_t v_s7 = + convolve8_8_2d_h(h_s7, x_filter, correction, permute_tbl); + int16x8_t v_s8 = + convolve8_8_2d_h(h_s8, x_filter, correction, permute_tbl); + + uint8x8_t d0 = convolve6_8_2d_v(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, + y_filter, vert_const); + uint8x8_t d1 = convolve6_8_2d_v(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, + y_filter, vert_const); + uint8x8_t d2 = convolve6_8_2d_v(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7, + y_filter, vert_const); + uint8x8_t d3 = convolve6_8_2d_v(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, + y_filter, vert_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + v_s3 = v_s7; + v_s4 = v_s8; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + void av1_convolve_2d_sr_neon_dotprod(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, @@ -1320,6 +1394,12 @@ convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter_0_7, y_filter_8_11); } else { + if (x_filter_taps >= 6 && y_filter_taps == 6) { + convolve_2d_sr_6tap_neon_dotprod(src_ptr, src_stride, dst, dst_stride, w, + h, x_filter_ptr, y_filter_ptr); + return; + } + DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);