Optimize Neon DotProd av1_convolve_y_sr 12-tap path

Decompose the narrowing right shift step in the convolution kernels to avoid needing two successive complex (and slower) saturating narrow instructions. The new sequence requires a simple truncating, narrowing shift followed by a slower saturating narrowing shift.

Change-Id: I68a64ed61d634cbfac3efac1029004e5896f6bbd

commit: e7e386aff07e0e4cc87bb2619816a5e897f47a73 [log] [tgz]
author: Jonathan Wright <jonathan.wright@arm.com> Tue Aug 26 16:31:42 2025 +0100
committer: Jonathan Wright <jonathan.wright@arm.com> Sun Aug 31 18:00:57 2025 -0700
tree: 6d876edf844af2505d326835e04a3151a5bd4c96
parent: 09f7872651c306e7820a81d0b70771f17728476e [diff]
diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c
index 2a6d7b7..9b7134b 100644
--- a/av1/common/arm/convolve_neon_dotprod.c
+++ b/av1/common/arm/convolve_neon_dotprod.c

@@ -400,7 +400,7 @@
   sum = vdotq_lane_s32(sum, s2, filters_4_11, 1);
 
   // Further narrowing and packing is performed by the caller.
-  return vqmovn_s32(sum);
+  return vshrn_n_s32(sum, 1);
 }
 
 static inline uint8x8_t convolve12_8_y(
@@ -420,8 +420,9 @@
   sum4567 = vdotq_lane_s32(sum4567, s2_hi, filters_4_11, 1);
 
   // Narrow and re-pack.
-  int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567));
-  return vqrshrun_n_s16(sum, FILTER_BITS);
+  int16x8_t sum =
+      vcombine_s16(vshrn_n_s32(sum0123, 1), vshrn_n_s32(sum4567, 1));
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
 }
 
 static inline void convolve_y_sr_12tap_neon_dotprod(
@@ -490,8 +491,8 @@
           convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11);
       int16x4_t d3 =
           convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11);
-      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
 
       store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
       store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
commit	e7e386aff07e0e4cc87bb2619816a5e897f47a73	[log] [tgz]
author	Jonathan Wright <jonathan.wright@arm.com>	Tue Aug 26 16:31:42 2025 +0100
committer	Jonathan Wright <jonathan.wright@arm.com>	Sun Aug 31 18:00:57 2025 -0700
tree	6d876edf844af2505d326835e04a3151a5bd4c96
parent	09f7872651c306e7820a81d0b70771f17728476e [diff]