[arm]: Improve av1_highbd_quantize_fp_neon().
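Apply v_mask once to v_tmp in quantize_4() instead of masking qcoeff,
dqcoeff and the eob mask separately. This saves two vector AND
instructions overall.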
Bug: b/217282899
Change-Id: I57092c8bcab69164fce3df4494cc7da9bd19e8a2
diff --git a/av1/encoder/arm/neon/highbd_quantize_neon.c b/av1/encoder/arm/neon/highbd_quantize_neon.c
index 32bbfa4..197eae0 100644
--- a/av1/encoder/arm/neon/highbd_quantize_neon.c
+++ b/av1/encoder/arm/neon/highbd_quantize_neon.c
@@ -31,8 +31,9 @@
const int32x4_t v_abs_coeff_scaled =
vshlq_s32(v_abs_coeff, vdupq_n_s32(1 + log_scale));
const uint32x4_t v_mask = vcgeq_s32(v_abs_coeff_scaled, v_dequant_s32);
- // const int64_t tmp = (int64_t)abs_coeff + log_scaled_round;
- const int32x4_t v_tmp = vaddq_s32(v_abs_coeff, v_round_s32);
+ // const int64_t tmp = v_mask ? (int64_t)abs_coeff + log_scaled_round : 0;
+ const int32x4_t v_tmp = vandq_s32(vaddq_s32(v_abs_coeff, v_round_s32),
+ vreinterpretq_s32_u32(v_mask));
// const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale));
const int32x4_t v_abs_qcoeff =
vqdmulhq_s32(vshlq_s32(v_tmp, v_log_scale), v_quant_s32);
@@ -46,12 +47,12 @@
const int32x4_t v_dqcoeff =
vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
- vst1q_s32(qcoeff_ptr, vandq_s32(v_qcoeff, vreinterpretq_s32_u32(v_mask)));
- vst1q_s32(dqcoeff_ptr, vandq_s32(v_dqcoeff, vreinterpretq_s32_u32(v_mask)));
+ vst1q_s32(qcoeff_ptr, v_qcoeff);
+ vst1q_s32(dqcoeff_ptr, v_dqcoeff);
// Used to find eob.
const uint32x4_t nz_qcoeff_mask = vcgtq_s32(v_abs_qcoeff, vdupq_n_s32(0));
- return vmovn_u32(vandq_u32(v_mask, nz_qcoeff_mask));
+ return vmovn_u32(nz_qcoeff_mask);
}
static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan,
@@ -110,12 +111,12 @@
v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32,
v_dequant_s32, v_round_s32, log_scale);
- // overwrite the DC round with AC round
+ // overwrite the DC constants with AC constants
v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
- // 4 more DC
+ // 4 more AC
v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
v_quant_s32, v_dequant_s32, v_round_s32, log_scale);