Optimize quantize_fp_neon function for LBD and HBD

Optimize the EOB calculation by removing the iscan adjustment in get_max_lane_eob(). Move this logic to get_max_eob(), which now adds 1 to the maximum value. Use -1 instead of 0 as the fill value when selecting iscan values, to differentiate between the case where a non-zero element is at position 0 and the case where no non-zero value exists.

This is a port from SVT-AV1: gitlab.com/AOMediaCodec/SVT-AV1/-/commit/f86adc26b60e5796f560ac1971962267b3a63ed4

Change-Id: I500eb61d508284b439f1efceac42595bff5fb520

commit: b66769617a5697afc880c97c8d0056ece04eb8c0 [log] [tgz]
author: Gerda Zsejke More <gerdazsejke.more@arm.com> Tue Oct 28 09:19:49 2025 +0100
committer: Jonathan Wright <jonathan.wright@arm.com> Thu Nov 13 09:37:20 2025 -0800
tree: 24caee1e47cba1b827e80ddb1612565883c92c84
parent: c895900348669d837fbafd1ba6fa2ff94b2c6247 [diff]
diff --git a/av1/encoder/arm/av1_highbd_quantize_neon.c b/av1/encoder/arm/av1_highbd_quantize_neon.c
index c1016db..fd4749d 100644
--- a/av1/encoder/arm/av1_highbd_quantize_neon.c
+++ b/av1/encoder/arm/av1_highbd_quantize_neon.c

@@ -60,15 +60,15 @@
 static inline int16x8_t get_max_lane_eob(const int16_t *iscan,
                                          int16x8_t v_eobmax,
                                          uint16x8_t v_mask) {
-  const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
-  const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1));
-  const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0));
+  const int16x8_t v_iscan = vld1q_s16(iscan);
+  const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan, vdupq_n_s16(-1));
   return vmaxq_s16(v_eobmax, v_nz_iscan);
 }
 
 static inline uint16_t get_max_eob(int16x8_t v_eobmax) {
 #if AOM_ARCH_AARCH64
-  return (uint16_t)vmaxvq_s16(v_eobmax);
+  int16_t max_val = vmaxvq_s16(v_eobmax);
+  return (uint16_t)max_val + 1;
 #else
   const int16x4_t v_eobmax_3210 =
       vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
@@ -80,7 +80,7 @@
       vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
   const int16x4_t v_eobmax_final =
       vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
-  return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+  return (uint16_t)vget_lane_s16(v_eobmax_final, 0) + 1;
 #endif
 }
 

diff --git a/av1/encoder/arm/quantize_neon.c b/av1/encoder/arm/quantize_neon.c
index cbeafc7..8d13e57 100644
--- a/av1/encoder/arm/quantize_neon.c
+++ b/av1/encoder/arm/quantize_neon.c

@@ -29,7 +29,8 @@
 
 static inline uint16_t get_max_eob(int16x8_t v_eobmax) {
 #if AOM_ARCH_AARCH64
-  return (uint16_t)vmaxvq_s16(v_eobmax);
+  int16_t max_val = vmaxvq_s16(v_eobmax);
+  return (uint16_t)max_val + 1;
 #else
   const int16x4_t v_eobmax_3210 =
       vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
@@ -41,16 +42,15 @@
       vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
   const int16x4_t v_eobmax_final =
       vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
-  return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+  return (uint16_t)vget_lane_s16(v_eobmax_final, 0) + 1;
 #endif
 }
 
 static inline int16x8_t get_max_lane_eob(const int16_t *iscan,
                                          int16x8_t v_eobmax,
                                          uint16x8_t v_mask) {
-  const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
-  const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1));
-  const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0));
+  const int16x8_t v_iscan = vld1q_s16(iscan);
+  const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan, vdupq_n_s16(-1));
   return vmaxq_s16(v_eobmax, v_nz_iscan);
 }
 
@@ -445,7 +445,7 @@
       v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
     }
   }
-  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+  *eob_ptr = get_max_eob(v_eobmax_76543210);
 }
 
 #define QM_MULL_SHIFT(x0, x1)                                              \
@@ -580,7 +580,7 @@
       v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
     }
   }
-  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+  *eob_ptr = get_max_eob(v_eobmax_76543210);
 }
 
 static void aom_quantize_b_helper_32x32_neon(
@@ -718,7 +718,7 @@
       v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
     }
   }
-  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+  *eob_ptr = get_max_eob(v_eobmax_76543210);
 }
 
 static void aom_quantize_b_helper_64x64_neon(
@@ -867,7 +867,7 @@
       v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
     }
   }
-  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+  *eob_ptr = get_max_eob(v_eobmax_76543210);
 }
 
 void aom_quantize_b_helper_neon(
commit	b66769617a5697afc880c97c8d0056ece04eb8c0	[log] [tgz]
author	Gerda Zsejke More <gerdazsejke.more@arm.com>	Tue Oct 28 09:19:49 2025 +0100
committer	Jonathan Wright <jonathan.wright@arm.com>	Thu Nov 13 09:37:20 2025 -0800
tree	24caee1e47cba1b827e80ddb1612565883c92c84
parent	c895900348669d837fbafd1ba6fa2ff94b2c6247 [diff]