[arm]: Fix av1_highbd_quantize_fp_neon().

Use vqdmulhq_s32 instead of vmull_s32/vmull_high_s32.

Change-Id: I11f3996c1ff0c20ba8ff55c31650f6f1c5ebebfd
diff --git a/av1/encoder/arm/neon/highbd_quantize_neon.c b/av1/encoder/arm/neon/highbd_quantize_neon.c
index bd84fe5..32bbfa4 100644
--- a/av1/encoder/arm/neon/highbd_quantize_neon.c
+++ b/av1/encoder/arm/neon/highbd_quantize_neon.c
@@ -16,13 +16,16 @@
 #include "av1/common/quant_common.h"
 #include "av1/encoder/av1_quantize.h"
 
-static INLINE uint16x4_t quantize_4(
-    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32, int32x4_t v_dequant_s32,
-    int32x4_t v_round_s32, int log_scale, int shift) {
+static INLINE uint16x4_t quantize_4(const tran_low_t *coeff_ptr,
+                                    tran_low_t *qcoeff_ptr,
+                                    tran_low_t *dqcoeff_ptr,
+                                    int32x4_t v_quant_s32,
+                                    int32x4_t v_dequant_s32,
+                                    int32x4_t v_round_s32, int log_scale) {
   const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
   const int32x4_t v_coeff_sign =
       vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
+  const int32x4_t v_log_scale = vdupq_n_s32(log_scale);
   const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
   // ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01])
   const int32x4_t v_abs_coeff_scaled =
@@ -30,42 +33,15 @@
   const uint32x4_t v_mask = vcgeq_s32(v_abs_coeff_scaled, v_dequant_s32);
   // const int64_t tmp = (int64_t)abs_coeff + log_scaled_round;
   const int32x4_t v_tmp = vaddq_s32(v_abs_coeff, v_round_s32);
-  //  const int abs_qcoeff = (int)((tmp * quant) >> shift);
-  const int64x2_t v_abs_qcoeff_lo =
-      vmull_s32(vget_low_s32(v_tmp), vget_low_s32(v_quant_s32));
-#ifdef __aarch64__
-  const int64x2_t v_abs_qcoeff_hi = vmull_high_s32(v_tmp, v_quant_s32);
-#else
-  const int64x2_t v_abs_qcoeff_hi =
-      vmull_s32(vget_high_s32(v_tmp), vget_high_s32(v_quant_s32));
-#endif
-  // vshlq_s64 will shift right if shift value is negative.
-  const int64x2_t v_shift = vdupq_n_s64(-shift);
-  const int64x2_t v_abs_qcoeff_lo_sh = vshlq_s64(v_abs_qcoeff_lo, v_shift);
-  const int64x2_t v_abs_qcoeff_hi_sh = vshlq_s64(v_abs_qcoeff_hi, v_shift);
-  const int32x4_t v_abs_qcoeff = vcombine_s32(vmovn_s64(v_abs_qcoeff_lo_sh),
-                                              vmovn_s64(v_abs_qcoeff_hi_sh));
+  //  const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale));
+  const int32x4_t v_abs_qcoeff =
+      vqdmulhq_s32(vshlq_s32(v_tmp, v_log_scale), v_quant_s32);
   //  qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
   const int32x4_t v_qcoeff =
       vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
-  //  const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
-  const int64x2_t v_abs_dqcoeff_lo =
-      vmull_s32(vget_low_s32(v_abs_qcoeff), vget_low_s32(v_dequant_s32));
-#ifdef __aarch64__
-  const int64x2_t v_abs_dqcoeff_hi =
-      vmull_high_s32(v_abs_qcoeff, v_dequant_s32);
-#else
-  const int64x2_t v_abs_dqcoeff_hi =
-      vmull_s32(vget_high_s32(v_abs_qcoeff), vget_high_s32(v_dequant_s32));
-#endif
-  // vshlq_s64 will shift right if shift value is negative.
-  const int64x2_t v_log_scale = vdupq_n_s64(-log_scale);
-  const int64x2_t v_abs_dqcoeff_lo_sh =
-      vshlq_s64(v_abs_dqcoeff_lo, v_log_scale);
-  const int64x2_t v_abs_dqcoeff_hi_sh =
-      vshlq_s64(v_abs_dqcoeff_hi, v_log_scale);
-  const int32x4_t v_abs_dqcoeff = vcombine_s32(vmovn_s64(v_abs_dqcoeff_lo_sh),
-                                               vmovn_s64(v_abs_dqcoeff_hi_sh));
+  // vshlq_s32 will shift right if shift value is negative.
+  const int32x4_t v_abs_dqcoeff =
+      vshlq_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), vnegq_s32(v_log_scale));
   //  dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
   const int32x4_t v_dqcoeff =
       vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
@@ -115,7 +91,6 @@
   (void)zbin_ptr;
   (void)quant_shift_ptr;
 
-  const int shift = 16 - log_scale;
   const int16x4_t v_quant = vld1_s16(quant_ptr);
   const int16x4_t v_dequant = vld1_s16(dequant_ptr);
   const int16x4_t v_zero = vdup_n_s16(0);
@@ -126,14 +101,14 @@
   const int16x4_t v_round =
       vbsl_s16(v_round_select, v_round_log_scale, v_round_no_scale);
   int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
-  int32x4_t v_quant_s32 = vaddl_s16(v_quant, v_zero);
+  int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
   int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
   uint16x4_t v_mask_lo, v_mask_hi;
   int16x8_t v_eobmax = vdupq_n_s16(-1);
 
   // DC and first 3 AC
   v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32,
-                         v_dequant_s32, v_round_s32, log_scale, shift);
+                         v_dequant_s32, v_round_s32, log_scale);
 
   // overwrite the DC round with AC round
   v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
@@ -141,9 +116,8 @@
   v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
 
   // 4 more DC
-  v_mask_hi =
-      quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, v_quant_s32,
-                 v_dequant_s32, v_round_s32, log_scale, shift);
+  v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+                         v_quant_s32, v_dequant_s32, v_round_s32, log_scale);
 
   // Find the max lane eob for the first 8 coeffs.
   v_eobmax =
@@ -156,10 +130,9 @@
     dqcoeff_ptr += 8;
     iscan += 8;
     v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32,
-                           v_dequant_s32, v_round_s32, log_scale, shift);
-    v_mask_hi =
-        quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, v_quant_s32,
-                   v_dequant_s32, v_round_s32, log_scale, shift);
+                           v_dequant_s32, v_round_s32, log_scale);
+    v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+                           v_quant_s32, v_dequant_s32, v_round_s32, log_scale);
     // Find the max lane eob for 8 coeffs.
     v_eobmax =
         get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));