Fix high bit-depth quantization process

Scale the rounding factor according to the scaling factor applied
to the quantization step size. This resolves a compression
performance regression for transform sizes of 32x32 and above.

BUG=aomedia:599

Change-Id: Id3fc9a46c4a8843ff5d77ccaa59ee3112b12d7f4
diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c
index 63727df..84d4bbf 100644
--- a/av1/encoder/av1_quantize.c
+++ b/av1/encoder/av1_quantize.c
@@ -1547,7 +1547,7 @@
 #endif
       const int coeff_sign = (coeff >> 31);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      const int64_t tmp = abs_coeff + round_ptr[rc != 0];
+      const int64_t tmp = abs_coeff + (round_ptr[rc != 0] >> log_scale);
 #if CONFIG_AOM_QM
       const uint32_t abs_qcoeff =
           (uint32_t)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS));
diff --git a/av1/encoder/x86/av1_highbd_quantize_sse4.c b/av1/encoder/x86/av1_highbd_quantize_sse4.c
index fa56260..764c4c5 100644
--- a/av1/encoder/x86/av1_highbd_quantize_sse4.c
+++ b/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -133,7 +133,8 @@
     coeff[0] = _mm_loadu_si128((__m128i const *)src);
 
     qparam[0] =
-        _mm_set_epi32(round_ptr[1], round_ptr[1], round_ptr[1], round_ptr[0]);
+        _mm_set_epi32(round_ptr[1] >> log_scale, round_ptr[1] >> log_scale,
+                      round_ptr[1] >> log_scale, round_ptr[0] >> log_scale);
     qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[0]);
     qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[0]);