Optimize vertical filter kernel in av1_warp_affine Some filter kernels contain zero-padded coefficients in the vertical_filter_4x1_f1 and vertical_filter_8x1_f1 functions. Skip unnecessary multiplications by zero in the Neon (Armv8.0), Neon I8MM, and SVE implementations to reduce instruction count, yielding up to ~5% performance improvement. Move the vertical_filter_4x1_f1 and vertical_filter_8x1_f1 functions into the header file, as their implementations are identical across all extensions. Change-Id: I23e8cd8e2516edd3323addd956549ce6e2a6d494
diff --git a/av1/common/arm/warp_plane_neon.c b/av1/common/arm/warp_plane_neon.c index f4c1377..f7b393a 100644 --- a/av1/common/arm/warp_plane_neon.c +++ b/av1/common/arm/warp_plane_neon.c
@@ -143,31 +143,6 @@ return horizontal_filter_8x1_f1_beta0(in, f_s16); } -static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, - int32x4_t *res, int sy) { - int16x4_t s0 = vget_low_s16(src[0]); - int16x4_t s1 = vget_low_s16(src[1]); - int16x4_t s2 = vget_low_s16(src[2]); - int16x4_t s3 = vget_low_s16(src[3]); - int16x4_t s4 = vget_low_s16(src[4]); - int16x4_t s5 = vget_low_s16(src[5]); - int16x4_t s6 = vget_low_s16(src[6]); - int16x4_t s7 = vget_low_s16(src[7]); - - int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); - - int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0); - m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1); - m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2); - m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3); - m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0); - m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1); - m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2); - m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3); - - *res = m0123; -} - static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, int sy, int gamma) { @@ -194,43 +169,6 @@ *res = horizontal_add_4d_s32x4(m0123_pairs); } -static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, - int sy) { - int16x8_t s0 = src[0]; - int16x8_t s1 = src[1]; - int16x8_t s2 = src[2]; - int16x8_t s3 = src[3]; - int16x8_t s4 = src[4]; - int16x8_t s5 = src[5]; - int16x8_t s6 = src[6]; - int16x8_t s7 = src[7]; - - int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); - - int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3); - - int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3); - - *res_low = m0123; - *res_high = m4567; -} - static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src, int32x4_t *res_low, int32x4_t *res_high, int sy,
diff --git a/av1/common/arm/warp_plane_neon.h b/av1/common/arm/warp_plane_neon.h index 6c50c41..0bf5c03 100644 --- a/av1/common/arm/warp_plane_neon.h +++ b/av1/common/arm/warp_plane_neon.h
@@ -24,23 +24,91 @@ #include "av1/common/warped_motion.h" #include "av1/common/scale.h" -static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, - int32x4_t *res, int sy); - static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, int sy, int gamma); -static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, - int sy); - static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src, int32x4_t *res_low, int32x4_t *res_high, int sy, int gamma); +static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, + int32x4_t *res, int sy) { + int16_t *f_ptr = + (int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)); + int16x8_t f = vld1q_s16(f_ptr); + + int32x4_t m0123; + if (f_ptr[0] != 0) { + m0123 = vmull_lane_s16(vget_low_s16(src[0]), vget_low_s16(f), 0); + } else { + m0123 = vdupq_n_s32(0); + } + if (f_ptr[1] != 0) { + m0123 = vmlal_lane_s16(m0123, vget_low_s16(src[1]), vget_low_s16(f), 1); + } + + m0123 = vmlal_lane_s16(m0123, vget_low_s16(src[2]), vget_low_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(src[3]), vget_low_s16(f), 3); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(src[4]), vget_high_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(src[5]), vget_high_s16(f), 1); + + if (f_ptr[6] != 0) { + m0123 = vmlal_lane_s16(m0123, vget_low_s16(src[6]), vget_high_s16(f), 2); + } + if (f_ptr[7] != 0) { + m0123 = vmlal_lane_s16(m0123, vget_low_s16(src[7]), vget_high_s16(f), 3); + } + + *res = m0123; +} + +static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *s, + int32x4_t *res_low, + int32x4_t *res_high, + int sy) { + int16_t *f_ptr = + (int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)); + int16x8_t f = vld1q_s16(f_ptr); + + int32x4_t m0123, m4567; + if (f_ptr[0] != 0) { + m0123 = vmull_lane_s16(vget_low_s16(s[0]), vget_low_s16(f), 0); + m4567 = vmull_lane_s16(vget_high_s16(s[0]), vget_low_s16(f), 0); + } else { + m0123 = vdupq_n_s32(0); + m4567 = vdupq_n_s32(0); + } + if (f_ptr[1] != 0) { + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s[1]), vget_low_s16(f), 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s[1]), vget_low_s16(f), 1); + } + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s[2]), vget_low_s16(f), 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s[2]), vget_low_s16(f), 2); + + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s[3]), vget_low_s16(f), 3); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s[3]), vget_low_s16(f), 3); + + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s[4]), vget_high_s16(f), 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s[4]), vget_high_s16(f), 0); + + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s[5]), vget_high_s16(f), 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s[5]), vget_high_s16(f), 1); + + if (f_ptr[6] != 0) { + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s[6]), vget_high_s16(f), 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s[6]), vget_high_s16(f), 2); + } + if (f_ptr[7] != 0) { + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s[7]), vget_high_s16(f), 3); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s[7]), vget_high_s16(f), 3); + } + + *res_low = m0123; + *res_high = m4567; +} + static AOM_FORCE_INLINE void load_filters_4(int16x8_t out[], int offset, int stride) { out[0] = vld1q_s16(
diff --git a/av1/common/arm/warp_plane_neon_i8mm.c b/av1/common/arm/warp_plane_neon_i8mm.c index 44689e9..3f16ddd 100644 --- a/av1/common/arm/warp_plane_neon_i8mm.c +++ b/av1/common/arm/warp_plane_neon_i8mm.c
@@ -244,31 +244,6 @@ return vreinterpretq_s16_u16(res); } -static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, - int32x4_t *res, int sy) { - int16x4_t s0 = vget_low_s16(src[0]); - int16x4_t s1 = vget_low_s16(src[1]); - int16x4_t s2 = vget_low_s16(src[2]); - int16x4_t s3 = vget_low_s16(src[3]); - int16x4_t s4 = vget_low_s16(src[4]); - int16x4_t s5 = vget_low_s16(src[5]); - int16x4_t s6 = vget_low_s16(src[6]); - int16x4_t s7 = vget_low_s16(src[7]); - - int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); - - int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0); - m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1); - m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2); - m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3); - m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0); - m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1); - m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2); - m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3); - - *res = m0123; -} - static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, int sy, int gamma) { @@ -295,43 +270,6 @@ *res = horizontal_add_4d_s32x4(m0123_pairs); } -static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, - int sy) { - int16x8_t s0 = src[0]; - int16x8_t s1 = src[1]; - int16x8_t s2 = src[2]; - int16x8_t s3 = src[3]; - int16x8_t s4 = src[4]; - int16x8_t s5 = src[5]; - int16x8_t s6 = src[6]; - int16x8_t s7 = src[7]; - - int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); - - int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3); - - int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3); - - *res_low = m0123; - *res_high = m4567; -} - static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src, int32x4_t *res_low, int32x4_t *res_high, int sy,
diff --git a/av1/common/arm/warp_plane_sve.c b/av1/common/arm/warp_plane_sve.c index 885ffe8..01a3875 100644 --- a/av1/common/arm/warp_plane_sve.c +++ b/av1/common/arm/warp_plane_sve.c
@@ -247,31 +247,6 @@ return vreinterpretq_s16_u16(res); } -static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, - int32x4_t *res, int sy) { - int16x4_t s0 = vget_low_s16(src[0]); - int16x4_t s1 = vget_low_s16(src[1]); - int16x4_t s2 = vget_low_s16(src[2]); - int16x4_t s3 = vget_low_s16(src[3]); - int16x4_t s4 = vget_low_s16(src[4]); - int16x4_t s5 = vget_low_s16(src[5]); - int16x4_t s6 = vget_low_s16(src[6]); - int16x4_t s7 = vget_low_s16(src[7]); - - int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); - - int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0); - m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1); - m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2); - m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3); - m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0); - m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1); - m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2); - m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3); - - *res = m0123; -} - static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, int sy, int gamma) { @@ -295,43 +270,6 @@ *res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); } -static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, - int sy) { - int16x8_t s0 = src[0]; - int16x8_t s1 = src[1]; - int16x8_t s2 = src[2]; - int16x8_t s3 = src[3]; - int16x8_t s4 = src[4]; - int16x8_t s5 = src[5]; - int16x8_t s6 = src[6]; - int16x8_t s7 = src[7]; - - int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); - - int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2); - m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3); - - int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2); - m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3); - - *res_low = m0123; - *res_high = m4567; -} - static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src, int32x4_t *res_low, int32x4_t *res_high, int sy,