Record total rate cost in trellis

Record the total rate cost while running trellis optimization, so that
later stages can reuse it instead of recomputing the coefficient rate.

Speed impact: ~6% speedup.
Coding performance should not be affected.

Change-Id: I9e940a2d126bb55930fcf22ea04d061eee1fc944
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 1d08265..a9e1d32 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -443,7 +443,7 @@
 int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
                    int blk_row, int blk_col, int block, BLOCK_SIZE plane_bsize,
                    TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
-                   const ENTROPY_CONTEXT *l, int fast_mode) {
+                   const ENTROPY_CONTEXT *l, int fast_mode, int *rate_cost) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   struct macroblock_plane *const p = &mb->plane[plane];
   const int eob = p->eobs[block];
@@ -462,7 +462,7 @@
   TXB_CTX txb_ctx;
   get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
   return av1_optimize_txb(cpi, mb, plane, blk_row, blk_col, block, tx_size,
-                          &txb_ctx, fast_mode);
+                          &txb_ctx, fast_mode, rate_cost);
 #endif  // !CONFIG_LV_MAP
 }
 
@@ -587,6 +587,7 @@
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint8_t *dst;
   ENTROPY_CONTEXT *a, *l;
+  int dummy_rate_cost = 0;
 
   int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
   dst = &pd->dst
@@ -603,7 +604,7 @@
       av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
                       tx_size, AV1_XFORM_QUANT_FP);
       av1_optimize_b(args->cpi, x, plane, blk_row, blk_col, block, plane_bsize,
-                     tx_size, a, l, CONFIG_LV_MAP);
+                     tx_size, a, l, CONFIG_LV_MAP, &dummy_rate_cost);
     } else {
       av1_xform_quant(
           cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
@@ -884,6 +885,7 @@
   const int dst_stride = pd->dst.stride;
   uint8_t *dst =
       &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+  int dummy_rate_cost = 0;
 
   av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
 
@@ -914,7 +916,7 @@
     av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                     AV1_XFORM_QUANT_FP);
     av1_optimize_b(args->cpi, x, plane, blk_row, blk_col, block, plane_bsize,
-                   tx_size, a, l, CONFIG_LV_MAP);
+                   tx_size, a, l, CONFIG_LV_MAP, &dummy_rate_cost);
 
 #if CONFIG_TXK_SEL
     if (plane == 0 && p->eobs[block] == 0) {
diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h
index 2f7b109..a40474d 100644
--- a/av1/encoder/encodemb.h
+++ b/av1/encoder/encodemb.h
@@ -53,7 +53,7 @@
 int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
                    int blk_row, int blk_col, int block, BLOCK_SIZE plane_bsize,
                    TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
-                   const ENTROPY_CONTEXT *l, int fast_mode);
+                   const ENTROPY_CONTEXT *l, int fast_mode, int *rate_cost);
 
 void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
                       int blk_col, int blk_row, TX_SIZE tx_size);
diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c
index c68ca6f..9223b70 100644
--- a/av1/encoder/encodetxb.c
+++ b/av1/encoder/encodetxb.c
@@ -1538,7 +1538,8 @@
 #if 1
 static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
                         const LV_MAP_EOB_COST *txb_eob_costs,
-                        TxbCache *txb_cache, int dry_run, int fast_mode) {
+                        TxbCache *txb_cache, int dry_run, int fast_mode,
+                        int *rate_cost) {
   (void)fast_mode;
   (void)txb_cache;
   int update = 0;
@@ -1586,7 +1587,7 @@
                                     txb_info->tx_type);
 
   // backward optimize the level-k map
-  int64_t accu_rate = eob_cost;
+  int accu_rate = eob_cost;
   int64_t accu_dist = 0;
   int64_t prev_eob_rd_cost = INT64_MAX;
   int64_t cur_eob_rd_cost = 0;
@@ -1690,6 +1691,11 @@
     txb_info->eob = 0;
   }
 
+  // record total rate cost
+  *rate_cost = zero_blk_rd_cost <= prev_eob_rd_cost
+                   ? zero_blk_rate
+                   : accu_rate + non_zero_blk_rate;
+
 #if TEST_OPTIMIZE_TXB
   int cost_diff = 0;
   int64_t dist_diff = 0;
@@ -1718,7 +1724,8 @@
 
 #else
 static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
-                        TxbCache *txb_cache, int dry_run, int fast_mode) {
+                        TxbCache *txb_cache, int dry_run, int fast_mode,
+                        int *rate_cost) {
   int update = 0;
   if (txb_info->eob == 0) return update;
   int cost_diff = 0;
@@ -1849,12 +1856,13 @@
                   const LV_MAP_EOB_COST *txb_eob_costs,
                   const struct macroblock_plane *p, int block, int fast_mode) {
   const int16_t *scan = txb_info->scan_order->scan;
+  int dummy_rate_cost;
 
   av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height,
                       txb_info->levels);
   // The hash_based_trellis speed feature requires lv_map_multi, so always true.
-  const int update =
-      optimize_txb(txb_info, txb_costs, txb_eob_costs, NULL, 0, fast_mode);
+  const int update = optimize_txb(txb_info, txb_costs, txb_eob_costs, NULL, 0,
+                                  fast_mode, &dummy_rate_cost);
 
   if (update) {
     // Overwrite old lowest entry
@@ -2026,7 +2034,7 @@
 
 int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
                      int blk_row, int blk_col, int block, TX_SIZE tx_size,
-                     TXB_CTX *txb_ctx, int fast_mode) {
+                     TXB_CTX *txb_ctx, int fast_mode, int *rate_cost) {
   const AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   const PLANE_TYPE plane_type = get_plane_type(plane);
@@ -2109,8 +2117,8 @@
 
   av1_txb_init_levels(qcoeff, width, height, levels);
 
-  const int update =
-      optimize_txb(&txb_info, &txb_costs, &txb_eob_costs, NULL, 0, fast_mode);
+  const int update = optimize_txb(&txb_info, &txb_costs, &txb_eob_costs, NULL,
+                                  0, fast_mode, rate_cost);
 
   if (update) {
     p->eobs[block] = txb_info.eob;
@@ -2372,6 +2380,7 @@
   uint16_t best_eob = 0;
   RD_STATS best_rd_stats;
   TX_TYPE tx_type;
+  int rate_cost = 0;
 
   av1_invalid_rd_stats(&best_rd_stats);
 
@@ -2396,15 +2405,23 @@
       av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
                       tx_size, AV1_XFORM_QUANT_FP);
       av1_optimize_b(cpi, x, plane, blk_row, blk_col, block, plane_bsize,
-                     tx_size, a, l, 1);
+                     tx_size, a, l, 1, &rate_cost);
     }
     av1_dist_block(cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size,
                    &this_rd_stats.dist, &this_rd_stats.sse,
                    OUTPUT_HAS_PREDICTED_PIXELS);
+
+    const int eob = x->plane[plane].eobs[block];
     const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, mbmi);
-    this_rd_stats.rate =
-        av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block, tx_size,
-                        scan_order, a, l, use_fast_coef_costing);
+    if (eob)
+      rate_cost +=
+          av1_tx_type_cost(cm, x, xd, mbmi->sb_type, plane, tx_size, tx_type);
+    else
+      rate_cost =
+          av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block, tx_size,
+                          scan_order, a, l, use_fast_coef_costing);
+    this_rd_stats.rate = rate_cost;
+
     int64_t rd = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
 
     if (rd < best_rd) {
@@ -2436,7 +2453,7 @@
       av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
                       tx_size, AV1_XFORM_QUANT_FP);
       av1_optimize_b(cpi, x, plane, blk_row, blk_col, block, plane_bsize,
-                     tx_size, a, l, 1);
+                     tx_size, a, l, 1, &rate_cost);
     }
 
     av1_inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
diff --git a/av1/encoder/encodetxb.h b/av1/encoder/encodetxb.h
index 6ee9bd6..b867464 100644
--- a/av1/encoder/encodetxb.h
+++ b/av1/encoder/encodetxb.h
@@ -104,7 +104,7 @@
 
 int av1_optimize_txb(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int blk_row,
                      int blk_col, int block, TX_SIZE tx_size, TXB_CTX *txb_ctx,
-                     int fast_mode);
+                     int fast_mode, int *rate_cost);
 #ifdef __cplusplus
 }
 #endif
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 53aae07..2fb4340 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1855,11 +1855,20 @@
   }
 
 #if !CONFIG_TXK_SEL
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const TX_TYPE tx_type =
+      av1_get_tx_type(plane_type, xd, blk_row, blk_col, tx_size);
+  const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, mbmi);
+  int rate_cost = 0;
+
   // full forward transform and quantization
   if (cpi->sf.optimize_coefficients != FULL_TRELLIS_OPT) {
     av1_xform_quant(
         cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
         USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
+
+    rate_cost = av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block, tx_size,
+                                scan_order, a, l, args->use_fast_coef_costing);
   } else {
     av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                     AV1_XFORM_QUANT_FP);
@@ -1894,7 +1903,13 @@
 #endif
         RDCOST(x->rdmult, 0, tmp_dist) + args->this_rd < args->best_rd) {
       av1_optimize_b(cpi, x, plane, blk_row, blk_col, block, plane_bsize,
-                     tx_size, a, l, CONFIG_LV_MAP);
+                     tx_size, a, l, CONFIG_LV_MAP, &rate_cost);
+
+      const int eob = x->plane[plane].eobs[block];
+      if (!eob)
+        rate_cost =
+            av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block, tx_size,
+                            scan_order, a, l, args->use_fast_coef_costing);
     } else {
       args->exit_early = 1;
       return;
@@ -1917,14 +1932,8 @@
     args->exit_early = 1;
     return;
   }
-  const PLANE_TYPE plane_type = get_plane_type(plane);
-  const TX_TYPE tx_type =
-      av1_get_tx_type(plane_type, xd, blk_row, blk_col, tx_size);
 
-  const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, mbmi);
-  this_rd_stats.rate =
-      av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block, tx_size,
-                      scan_order, a, l, args->use_fast_coef_costing);
+  this_rd_stats.rate = rate_cost;
 #else   // !CONFIG_TXK_SEL
   av1_search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize,
                       tx_size, a, l, args->use_fast_coef_costing,
@@ -3539,6 +3548,7 @@
   const int16_t *diff =
       &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
   int txb_coeff_cost;
+  int rate_cost = 0;
 
   assert(tx_size < TX_SIZES_ALL);
 
@@ -3603,6 +3613,8 @@
         cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
         USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
 
+    rate_cost = av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block, tx_size,
+                                scan_order, a, l, 0);
   } else {
     av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                     AV1_XFORM_QUANT_FP);
@@ -3645,7 +3657,18 @@
 #endif
         RDCOST(x->rdmult, 0, tmp_dist) < rd_stats->ref_rdcost) {
       av1_optimize_b(cpi, x, plane, blk_row, blk_col, block, plane_bsize,
-                     tx_size, a, l, fast);
+                     tx_size, a, l, fast, &rate_cost);
+
+      const int eob = x->plane[plane].eobs[block];
+      if (eob) {
+#if CONFIG_TXK_SEL
+        rate_cost += av1_tx_type_cost(cm, x, xd, xd->mi[0]->mbmi.sb_type, plane,
+                                      tx_size, tx_type);
+#endif
+      } else {
+        rate_cost = av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block,
+                                    tx_size, scan_order, a, l, 0);
+      }
     } else {
       rd_stats->rate += rd_stats->zero_rate;
       rd_stats->dist += tmp << 4;
@@ -3687,8 +3710,7 @@
                      blk_row, blk_col, plane_bsize, txm_bsize);
   }
   cur_dist = tmp * 16;
-  txb_coeff_cost = av1_cost_coeffs(cpi, x, plane, blk_row, blk_col, block,
-                                   tx_size, scan_order, a, l, 0);
+  txb_coeff_cost = rate_cost;
   cur_rate = txb_coeff_cost;
   cur_skip = (eob == 0);