Optimize Neon I8MM aom_convolve8* implementations

The convolution filters used with the aom_convolve8* functions all have
even coefficient values. Halving these filter coefficients reduces the
intermediate precision requirements inside the convolution kernels and
allows us to avoid some slower saturating narrowing instructions in the
Neon I8MM aom_convolve8* functions.

This patch applies this optimization in the following cases:
- 8-tap horizontal
- 8-tap vertical
- 6-tap vertical (covered by the 8-tap vertical code path)
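
As an illustration only (not part of the patch), a minimal scalar sketch
of the transformation, assuming all filter taps are even and
FILTER_BITS == 7; the function name is hypothetical:

  #include <stdint.h>

  #define FILTER_BITS 7

  // Scalar model of one output pixel. filter[] holds the original 16-bit
  // taps; since they are all even, (filter[k] >> 1) loses no information.
  static uint8_t convolve8_scalar_halved(const uint8_t *src,
                                         const int16_t *filter) {
    int32_t sum = 0;
    for (int k = 0; k < 8; ++k) {
      // Halved taps keep the accumulated sum within 16-bit range, which is
      // why the vector code can use vmovn_s32 instead of vqmovn_s32.
      sum += src[k] * (filter[k] >> 1);
    }
    // Compensate for the halved taps by rounding and shifting by one bit
    // less: (sum/2 + 32) >> 6 == (sum + 64) >> 7 when all taps are even.
    int32_t res = (sum + (1 << (FILTER_BITS - 2))) >> (FILTER_BITS - 1);
    // Clamp to the unsigned 8-bit output range (vqrshrun in the Neon code).
    if (res < 0) res = 0;
    if (res > 255) res = 255;
    return (uint8_t)res;
  }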

Change-Id: I55072e3be3371cc40f9ffe0f9b1fee308064a8fd
diff --git a/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
index 121e892..6d1ab96 100644
--- a/aom_dsp/arm/aom_convolve8_neon_i8mm.c
+++ b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
@@ -61,7 +61,7 @@
   sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
 
   // Further narrowing and packing is performed by the caller.
-  return vqmovn_s32(sum);
+  return vmovn_s32(sum);
 }
 
 static inline uint8x8_t convolve8_8_h(const uint8x16_t samples,
@@ -85,14 +85,16 @@
   sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
 
   // Narrow and re-pack.
-  int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, FILTER_BITS);
+  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+  // We halved the filter values so -1 from right shift.
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
 }
 
 static inline void convolve8_horiz_8tap_neon_i8mm(
     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
     ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) {
-  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
+  // Filter values are even, so halve to reduce intermediate precision reqs.
+  const int8x8_t filter = vshrn_n_s16(vld1q_s16(filter_x), 1);
 
   if (w == 4) {
     const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
@@ -104,8 +106,9 @@
       int16x4_t d1 = convolve8_4_h(s1, filter, perm_tbl);
       int16x4_t d2 = convolve8_4_h(s2, filter, perm_tbl);
       int16x4_t d3 = convolve8_4_h(s3, filter, perm_tbl);
-      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+      // We halved the filter values so -1 from right shift.
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
 
       store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
       store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
@@ -326,7 +329,7 @@
   sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
 
   // Further narrowing and packing is performed by the caller.
-  return vqmovn_s32(sum);
+  return vmovn_s32(sum);
 }
 
 static inline uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo,
@@ -344,14 +347,16 @@
   sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);
 
   // Narrow and re-pack.
-  int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
-  return vqrshrun_n_s16(sum, FILTER_BITS);
+  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+  // We halved the filter values so -1 from right shift.
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
 }
 
 static inline void convolve8_vert_8tap_neon_i8mm(
     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
     ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) {
-  const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
+  // Filter values are even, so halve to reduce intermediate precision reqs.
+  const int8x8_t filter = vshrn_n_s16(vld1q_s16(filter_y), 1);
   const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
   uint8x16x2_t samples_LUT;
 
@@ -386,8 +391,9 @@
       int16x4_t d1 = convolve8_4_v(s1234, s5678, filter);
       int16x4_t d2 = convolve8_4_v(s2345, s6789, filter);
       int16x4_t d3 = convolve8_4_v(s3456, s78910, filter);
-      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+      // We halved the filter values so -1 from right shift.
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
 
       store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
       store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);