Optimize transform block rate-distortion search

The soft coefficient optimization process would monotonically
increase the transform block distortion and decrease the
coefficient rate cost. Such observation provides a lower bound
on the rate-distortion cost for the given transform block. This
commit compares this lower bound against the best available
rate-distortion cost value and skips unnecessary optimization
process. It speeds up the baseline encoding process by 15%.

Change-Id: Ida8098a2820cef60d59ec1e72f0bbb1acbd98165
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 49b8e9d..063b2f5 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1893,8 +1893,29 @@
 #else
   av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                   coeff_ctx, AV1_XFORM_QUANT_FP);
-  av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, a,
-                 l);
+
+  const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
+  tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
+  const int buffer_length = tx_size_2d[tx_size];
+  int64_t tmp_dist;
+  int64_t tmp;
+#if CONFIG_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    tmp_dist =
+        av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd) >>
+        shift;
+  else
+#endif
+    tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp) >> shift;
+
+  if (RDCOST(x->rdmult, 0, tmp_dist) + args->this_rd < args->best_rd) {
+    av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
+                   a, l);
+  } else {
+    args->exit_early = 1;
+    return;
+  }
 #endif  // DISABLE_TRELLISQ_SEARCH
 
   if (!is_inter_block(mbmi)) {
@@ -4307,18 +4328,6 @@
 
   int coeff_ctx = get_entropy_context(tx_size, a, l);
 
-#if DISABLE_TRELLISQ_SEARCH
-  av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  coeff_ctx, AV1_XFORM_QUANT_B);
-
-#else
-  av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  coeff_ctx, AV1_XFORM_QUANT_FP);
-
-  av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, a,
-                 l);
-#endif  // DISABLE_TRELLISQ_SEARCH
-
 // TODO(any): Use av1_dist_block to compute distortion
 #if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -4335,6 +4344,33 @@
                     0, bw, bh);
 #endif  // CONFIG_HIGHBITDEPTH
 
+#if DISABLE_TRELLISQ_SEARCH
+  av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                  coeff_ctx, AV1_XFORM_QUANT_B);
+
+#else
+  av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                  coeff_ctx, AV1_XFORM_QUANT_FP);
+
+  const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  const int buffer_length = tx_size_2d[tx_size];
+  int64_t tmp_dist;
+#if CONFIG_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    tmp_dist =
+        av1_highbd_block_error(coeff, dqcoeff, buffer_length, &tmp, xd->bd) >>
+        shift;
+  else
+#endif
+    tmp_dist = av1_block_error(coeff, dqcoeff, buffer_length, &tmp) >> shift;
+
+  if (RDCOST(x->rdmult, 0, tmp_dist) < rd_stats->ref_rdcost) {
+    av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
+                   a, l);
+  }
+#endif  // DISABLE_TRELLISQ_SEARCH
+
   tmp = pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col,
                         plane_bsize, txm_bsize);
 
@@ -4456,6 +4492,8 @@
                                 [coeff_ctx][EOB_TOKEN];
 #endif
 
+  rd_stats->ref_rdcost = ref_best_rd;
+  rd_stats->zero_rate = zero_blk_rate;
   if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) {
     inter_tx_size[0][0] = tx_size;