Scale PVQ input to OD_COEFF_SHIFT resolution.

This ensures we operate at the same precision that Daala uses, which matters
when activity masking is enabled, because of the gain companding.

Metrics from Patchset 4 (which had slightly incorrect rounding):

With activity masking (5 frames only):
av1_pvq_AM_ref_5f@2017-02-07T03:37:53.702Z -> av1_pvq_AM_derf_fix2_coeff_scaling_5f@2017-02-07T00:12:24.427Z

    PSNR |  PSNR Cb |  PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
  0.6653 | -12.3177 | -12.1858 |   0.3350 | 4.1013 |  2.0964 |    -4.0539

In particular, for Netflix_Crosswalk_1920x1080_60fps_8bit_420_60f.y4m (same columns as above):
 -5.0589 | -22.3077 | -21.2188 |  -7.0389 | -3.3715 | -5.7794 |  -13.1891

That is, it fixes the large regression with activity masking (AM) on this
sequence and substantially improves chroma, at a smaller cost to the other metrics.

Without activity masking (5 frames only):
av1_pvq_ref_5f@2017-02-07T03:52:51.279Z -> av1_pvq_derf_fix2_coeff_scaling_5f@2017-02-07T00:12:48.873Z

    PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
  0.0989 | -0.0322 | -0.0464 |   0.1883 | 0.0795 |  0.0579 |     0.0923

Change-Id: I46b808b7c8e4733465f8bebc8336dfd5b75783ec
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index fac99f9..5158ae7 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -1129,7 +1129,9 @@
     int tx_size, TX_TYPE tx_type, int *rate, int speed, PVQ_INFO *pvq_info) {
   const int tx_blk_size = tx_size_wide[tx_size];
   PVQ_SKIP_TYPE ac_dc_coded;
-  int quant_shift = get_tx_scale(tx_size);
+  /*TODO(tterribe): Handle CONFIG_AOM_HIGHBITDEPTH.*/
+  int coeff_shift = 3 - get_tx_scale(tx_size);
+  int rounding_mask;
   int pvq_dc_quant;
   int use_activity_masking = daala_enc->use_activity_masking;
   int tell;
@@ -1148,15 +1150,16 @@
   DECLARE_ALIGNED(16, int32_t, ref_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
   DECLARE_ALIGNED(16, int32_t, out_int32[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
 
+  assert(OD_COEFF_SHIFT >= 3);
   // DC quantizer for PVQ
   if (use_activity_masking)
     pvq_dc_quant = OD_MAXI(
-        1, (quant[0] >> quant_shift) *
+        1, (quant[0] << (OD_COEFF_SHIFT - 3)) *
                    daala_enc->state.pvq_qm_q4[plane]
                                              [od_qm_get_index(tx_size, 0)] >>
                4);
   else
-    pvq_dc_quant = OD_MAXI(1, quant[0] >> quant_shift);
+    pvq_dc_quant = OD_MAXI(1, quant[0] << (OD_COEFF_SHIFT - 3));
 
   *eob = 0;
 
@@ -1174,8 +1177,8 @@
 
   // copy int16 inputs to int32
   for (i = 0; i < tx_blk_size * tx_blk_size; i++) {
-    ref_int32[i] = ref_coeff_pvq[i];
-    in_int32[i] = coeff_pvq[i];
+    ref_int32[i] = ref_coeff_pvq[i] << (OD_COEFF_SHIFT - coeff_shift);
+    in_int32[i] = coeff_pvq[i] << (OD_COEFF_SHIFT - coeff_shift);
   }
 
 #if PVQ_CHROMA_RD
@@ -1192,8 +1195,8 @@
 
   ac_dc_coded = od_pvq_encode(
       daala_enc, ref_int32, in_int32, out_int32,
-      (int)quant[0] >> quant_shift,  // scale/quantizer
-      (int)quant[1] >> quant_shift,  // scale/quantizer
+      quant[0] << (OD_COEFF_SHIFT - 3),  // scale/quantizer
+      quant[1] << (OD_COEFF_SHIFT - 3),  // scale/quantizer
       plane, tx_size, OD_PVQ_BETA[use_activity_masking][plane][tx_size],
       OD_ROBUST_STREAM,
       0,        // is_keyframe,
@@ -1220,7 +1223,12 @@
   out_int32[0] += ref_int32[0];
 
   // copy int32 result back to int16
-  for (i = 0; i < tx_blk_size * tx_blk_size; i++) dqcoeff_pvq[i] = out_int32[i];
+  assert(OD_COEFF_SHIFT > coeff_shift);
+  rounding_mask = (1 << (OD_COEFF_SHIFT - coeff_shift - 1)) - 1;
+  for (i = 0; i < tx_blk_size * tx_blk_size; i++) {
+    dqcoeff_pvq[i] = (out_int32[i] + (out_int32[i] < 0) + rounding_mask) >>
+                     (OD_COEFF_SHIFT - coeff_shift);
+  }
 
   // Back to original coefficient order
   od_coding_order_to_raster(dqcoeff, tx_blk_size, tx_type, dqcoeff_pvq,