Optimize quantize_fp_neon function for LBD and HBD Optimize the EOB calculation by removing the per-vector "iscan + 1" adjustment from get_max_lane_eob(). Move this logic to get_max_eob(), which now adds 1 once to the reduced maximum value. Use -1 instead of 0 as the value for masked-out lanes when comparing against iscan values, to distinguish the case where a non-zero coefficient is at position 0 from the case where no non-zero coefficient exists. This is a port from SVT-AV1: gitlab.com/AOMediaCodec/SVT-AV1/-/commit/f86adc26b60e5796f560ac1971962267b3a63ed4 Change-Id: I500eb61d508284b439f1efceac42595bff5fb520
diff --git a/av1/encoder/arm/av1_highbd_quantize_neon.c b/av1/encoder/arm/av1_highbd_quantize_neon.c index c1016db..fd4749d 100644 --- a/av1/encoder/arm/av1_highbd_quantize_neon.c +++ b/av1/encoder/arm/av1_highbd_quantize_neon.c
@@ -60,15 +60,15 @@ static inline int16x8_t get_max_lane_eob(const int16_t *iscan, int16x8_t v_eobmax, uint16x8_t v_mask) { - const int16x8_t v_iscan = vld1q_s16(&iscan[0]); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1)); - const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0)); + const int16x8_t v_iscan = vld1q_s16(iscan); + const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan, vdupq_n_s16(-1)); return vmaxq_s16(v_eobmax, v_nz_iscan); } static inline uint16_t get_max_eob(int16x8_t v_eobmax) { #if AOM_ARCH_AARCH64 - return (uint16_t)vmaxvq_s16(v_eobmax); + int16_t max_val = vmaxvq_s16(v_eobmax); + return (uint16_t)max_val + 1; #else const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax)); @@ -80,7 +80,7 @@ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); const int16x4_t v_eobmax_final = vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); - return (uint16_t)vget_lane_s16(v_eobmax_final, 0); + return (uint16_t)vget_lane_s16(v_eobmax_final, 0) + 1; #endif }
diff --git a/av1/encoder/arm/quantize_neon.c b/av1/encoder/arm/quantize_neon.c index cbeafc7..8d13e57 100644 --- a/av1/encoder/arm/quantize_neon.c +++ b/av1/encoder/arm/quantize_neon.c
@@ -29,7 +29,8 @@ static inline uint16_t get_max_eob(int16x8_t v_eobmax) { #if AOM_ARCH_AARCH64 - return (uint16_t)vmaxvq_s16(v_eobmax); + int16_t max_val = vmaxvq_s16(v_eobmax); + return (uint16_t)max_val + 1; #else const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax)); @@ -41,16 +42,15 @@ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); const int16x4_t v_eobmax_final = vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); - return (uint16_t)vget_lane_s16(v_eobmax_final, 0); + return (uint16_t)vget_lane_s16(v_eobmax_final, 0) + 1; #endif } static inline int16x8_t get_max_lane_eob(const int16_t *iscan, int16x8_t v_eobmax, uint16x8_t v_mask) { - const int16x8_t v_iscan = vld1q_s16(&iscan[0]); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1)); - const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0)); + const int16x8_t v_iscan = vld1q_s16(iscan); + const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan, vdupq_n_s16(-1)); return vmaxq_s16(v_eobmax, v_nz_iscan); } @@ -445,7 +445,7 @@ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); } } - *eob_ptr = get_max_eob(v_eobmax_76543210) + 1; + *eob_ptr = get_max_eob(v_eobmax_76543210); } #define QM_MULL_SHIFT(x0, x1) \ @@ -580,7 +580,7 @@ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); } } - *eob_ptr = get_max_eob(v_eobmax_76543210) + 1; + *eob_ptr = get_max_eob(v_eobmax_76543210); } static void aom_quantize_b_helper_32x32_neon( @@ -718,7 +718,7 @@ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); } } - *eob_ptr = get_max_eob(v_eobmax_76543210) + 1; + *eob_ptr = get_max_eob(v_eobmax_76543210); } static void aom_quantize_b_helper_64x64_neon( @@ -867,7 +867,7 @@ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); } } - *eob_ptr = get_max_eob(v_eobmax_76543210) + 1; + *eob_ptr = get_max_eob(v_eobmax_76543210); } void aom_quantize_b_helper_neon(