Trellis-based adaptive quantization

This commit combines the uniform quantizer with trellis-based
coefficient level optimization. It improves the codebase's compression
performance:

lowres 0.8%
midres 1.0%
hdres  1.6%

Note that the current trellis optimization unit is implemented in C
code. This makes the overall quantization process slower. A number of
optimizations will follow.

Change-Id: Id441dd238e4844409d0f08f82604be777f3f5282
diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c
index dfb72ea..9d0eb66 100644
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c
@@ -52,10 +52,6 @@
                      pd->dst.buf, pd->dst.stride);
 }
 
-#define RDTRUNC(RM, DM, R, D)                        \
-  (((1 << (VP9_PROB_COST_SHIFT - 1)) + (R) * (RM)) & \
-   ((1 << VP9_PROB_COST_SHIFT) - 1))
-
 typedef struct vp10_token_state {
   int           rate;
   int           error;
@@ -65,16 +61,12 @@
 } vp10_token_state;
 
 // TODO(jimbankoski): experiment to find optimal RD numbers.
-static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] ={ {9, 7}, {7, 5}, };
+static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] ={ {9, 7}, {8, 5}, };
 
 #define UPDATE_RD_COST()\
 {\
   rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\
   rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
-  if (rd_cost0 == rd_cost1) {\
-    rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\
-    rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\
-  }\
 }
 
 // This function is a place holder for now but may ultimately need
@@ -90,8 +82,8 @@
   return pt;
 }
 
-static int optimize_b(MACROBLOCK *mb, int plane, int block,
-                      TX_SIZE tx_size, int ctx) {
+int vp10_optimize_b(MACROBLOCK *mb, int plane, int block,
+                    TX_SIZE tx_size, int ctx) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   struct macroblock_plane *const p = &mb->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -961,7 +953,7 @@
                                tx_size);
 #else
           vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
-                           tx_size, VP10_XFORM_QUANT_B);
+                           tx_size, VP10_XFORM_QUANT_FP);
 #endif  // CONFIG_NEW_QUANT
         } else if (x->skip_txfm[plane][blk_index] == SKIP_TXFM_AC_ONLY) {
           // fast path forward transform and quantization
@@ -986,7 +978,7 @@
                              tx_size);
 #else
         vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
-                         tx_size, VP10_XFORM_QUANT_B);
+                         tx_size, VP10_XFORM_QUANT_FP);
 #endif  // CONFIG_NEW_QUANT
       }
     }
@@ -1021,7 +1013,7 @@
     }
 #endif
     ctx = combine_entropy_contexts(*a, *l);
-    *a = *l = optimize_b(x, plane, block, tx_size, ctx) > 0;
+    *a = *l = vp10_optimize_b(x, plane, block, tx_size, ctx) > 0;
   } else {
     *a = *l = p->eobs[block] > 0;
   }
@@ -1293,7 +1285,7 @@
     if (x->optimize) {
       int ctx;
       ctx = combine_entropy_contexts(*a, *l);
-      *a = *l = optimize_b(x, plane, block, tx_size, ctx) > 0;
+      *a = *l = vp10_optimize_b(x, plane, block, tx_size, ctx) > 0;
     } else {
       *a = *l = p->eobs[block] > 0;
     }
diff --git a/vp10/encoder/encodemb.h b/vp10/encoder/encodemb.h
index eae1db7..cef6ccc 100644
--- a/vp10/encoder/encodemb.h
+++ b/vp10/encoder/encodemb.h
@@ -56,6 +56,9 @@
                                 BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 #endif
 
+int vp10_optimize_b(MACROBLOCK *mb, int plane, int block,
+                    TX_SIZE tx_size, int ctx);
+
 void vp10_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
 void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
diff --git a/vp10/encoder/quantize.c b/vp10/encoder/quantize.c
index 2a8b33f..8789bc5 100644
--- a/vp10/encoder/quantize.c
+++ b/vp10/encoder/quantize.c
@@ -1223,10 +1223,7 @@
     const int qrounding_factor = q == 0 ? 64 : 48;
 
     for (i = 0; i < 2; ++i) {
-      int qrounding_factor_fp = i == 0 ? 48 : 42;
-      if (q == 0)
-        qrounding_factor_fp = 64;
-
+      int qrounding_factor_fp = 64;
       // y
       quant = i == 0 ? vp10_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth)
                      : vp10_ac_quant(q, 0, cm->bit_depth);
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 640a409..1cf43b5 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -1213,6 +1213,10 @@
   int rate;
   int64_t dist;
   int64_t sse;
+#if !CONFIG_NEW_QUANT
+  ENTROPY_CONTEXT coeff_ctx = combine_entropy_contexts(
+      *(args->t_above + blk_col), *(args->t_left + blk_row));
+#endif
 
   if (args->exit_early)
     return;
@@ -1264,9 +1268,10 @@
                            plane_bsize, tx_size);
 #else
       vp10_xform_quant(x, plane, block, blk_row, blk_col,
-                       plane_bsize, tx_size, VP10_XFORM_QUANT_B);
+                       plane_bsize, tx_size, VP10_XFORM_QUANT_FP);
+      vp10_optimize_b(x, plane, block, tx_size, coeff_ctx);
 #endif  // CONFIG_NEW_QUANT
-      dist_block(args->cpi, x, plane, block, blk_row, blk_col,
+     dist_block(args->cpi, x, plane, block, blk_row, blk_col,
                  tx_size, &dist, &sse);
     } else if (x->skip_txfm[plane][block >> (tx_size << 1)] ==
                SKIP_TXFM_AC_ONLY) {
@@ -1318,7 +1323,8 @@
                            tx_size);
 #else
     vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                     VP10_XFORM_QUANT_B);
+                     VP10_XFORM_QUANT_FP);
+    vp10_optimize_b(x, plane, block, tx_size, coeff_ctx);
 #endif  // CONFIG_NEW_QUANT
     dist_block(args->cpi, x, plane, block, blk_row, blk_col,
                tx_size, &dist, &sse);
@@ -3076,6 +3082,8 @@
   vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                    VP10_XFORM_QUANT_B);
 
+  vp10_optimize_b(x, plane, block, tx_size, coeff_ctx);
+
   // TODO(any): Use dist_block to compute distortion
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {