Merge "Clean up write_tx_type()" into nextgenv2
diff --git a/aom_dsp/bitwriter.h b/aom_dsp/bitwriter.h
index ef529fc..b437669 100644
--- a/aom_dsp/bitwriter.h
+++ b/aom_dsp/bitwriter.h
@@ -27,6 +27,10 @@
 #endif
 #include "aom_dsp/prob.h"
 
+#if CONFIG_RD_DEBUG
+#include "av1/encoder/cost.h"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -39,6 +43,8 @@
 typedef struct aom_dk_writer aom_writer;
 #endif
 
+typedef struct TOKEN_STATS { int64_t cost; } TOKEN_STATS;  // cost summed by aom_write_record()
+
 static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) {
 #if CONFIG_ANS
   (void)bc;
@@ -72,10 +78,25 @@
 #endif
 }
 
+static INLINE void aom_write_record(aom_writer *br, int bit, int probability,
+                                    TOKEN_STATS *token_stats) {  // aom_write() plus a cost tally
+  aom_write(br, bit, probability);  // the bit is written in every build
+#if CONFIG_RD_DEBUG
+  token_stats->cost += av1_cost_bit(probability, bit);  // add this bit's cost
+#else
+  (void)token_stats;  // not used when CONFIG_RD_DEBUG is off
+#endif
+}
+
 static INLINE void aom_write_bit(aom_writer *w, int bit) {
   aom_write(w, bit, 128);  // aom_prob_half
 }
 
+static INLINE void aom_write_bit_record(aom_writer *w, int bit,
+                                        TOKEN_STATS *token_stats) {  // aom_write_bit() via aom_write_record()
+  aom_write_record(w, bit, 128, token_stats);  // 128 is aom_prob_half, as in aom_write_bit()
+}
+
 static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
   int bit;
 
@@ -92,6 +113,18 @@
   } while (len);
 }
 
+static INLINE void aom_write_tree_bits_record(aom_writer *w,
+                                              const aom_tree_index *tr,
+                                              const aom_prob *probs, int bits,
+                                              int len, aom_tree_index i,
+                                              TOKEN_STATS *token_stats) {  // walk tree tr from node i, one write per bit
+  do {
+    const int bit = (bits >> --len) & 1;  // consume bits from position len-1 down to 0
+    aom_write_record(w, bit, probs[i >> 1], token_stats);  // node i is coded with probs[i >> 1]
+    i = tr[i + bit];  // step to the child chosen by this bit
+  } while (len);  // len is decremented before the first test, so it must be >= 1 on entry
+}
+
 static INLINE void aom_write_tree(aom_writer *w, const aom_tree_index *tree,
                                   const aom_prob *probs, int bits, int len,
                                   aom_tree_index i) {
@@ -102,6 +135,19 @@
 #endif
 }
 
+static INLINE void aom_write_tree_record(aom_writer *w,
+                                         const aom_tree_index *tree,
+                                         const aom_prob *probs, int bits,
+                                         int len, aom_tree_index i,
+                                         TOKEN_STATS *token_stats) {  // tree write that can also tally cost
+#if CONFIG_DAALA_EC
+  (void)token_stats;  // nothing is added to token_stats on this path
+  daala_write_tree_bits(w, tree, probs, bits, len, i);
+#else
+  aom_write_tree_bits_record(w, tree, probs, bits, len, i, token_stats);  // per-bit writes go through aom_write_record()
+#endif
+}
+
 #if CONFIG_EC_MULTISYMBOL
 static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
                                     int nsymbs) {
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 4d8f5e2..1d08cfa 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -250,6 +250,11 @@
 #if CONFIG_DELTA_Q
   int current_q_index;
 #endif
+#if CONFIG_RD_DEBUG
+  int64_t txb_coeff_cost[MAX_MB_PLANE];
+  int mi_row;
+  int mi_col;
+#endif
 } MB_MODE_INFO;
 
 typedef struct MODE_INFO {
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index 89534de..ad92150 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -139,31 +139,6 @@
   }
 }
 
-static const int16_t filter_4tap[WARPEDPIXEL_PREC_SHIFTS][4] = {
-  { 0, 128, 0, 0 },     { -1, 127, 2, 0 },    { -2, 127, 4, -1 },
-  { -3, 126, 6, -1 },   { -3, 125, 8, -2 },   { -4, 124, 11, -3 },
-  { -5, 123, 13, -3 },  { -5, 121, 15, -3 },  { -6, 120, 18, -4 },
-  { -7, 119, 20, -4 },  { -7, 118, 22, -5 },  { -8, 116, 25, -5 },
-  { -8, 115, 27, -6 },  { -9, 113, 30, -6 },  { -9, 112, 32, -7 },
-  { -9, 110, 34, -7 },  { -10, 108, 37, -7 }, { -10, 107, 39, -8 },
-  { -10, 105, 41, -8 }, { -11, 103, 44, -8 }, { -11, 101, 47, -9 },
-  { -11, 99, 49, -9 },  { -11, 97, 51, -9 },  { -11, 95, 54, -10 },
-  { -11, 93, 56, -10 }, { -12, 91, 59, -10 }, { -12, 89, 61, -10 },
-  { -12, 87, 64, -11 }, { -12, 85, 66, -11 }, { -12, 82, 69, -11 },
-  { -12, 80, 71, -11 }, { -12, 78, 73, -11 }, { -11, 75, 75, -11 },
-  { -11, 73, 78, -12 }, { -11, 71, 80, -12 }, { -11, 69, 82, -12 },
-  { -11, 66, 85, -12 }, { -11, 64, 87, -12 }, { -10, 61, 89, -12 },
-  { -10, 59, 91, -12 }, { -10, 56, 93, -11 }, { -10, 54, 95, -11 },
-  { -9, 51, 97, -11 },  { -9, 49, 99, -11 },  { -9, 47, 101, -11 },
-  { -8, 44, 103, -11 }, { -8, 41, 105, -10 }, { -8, 39, 107, -10 },
-  { -7, 37, 108, -10 }, { -7, 34, 110, -9 },  { -7, 32, 112, -9 },
-  { -6, 30, 113, -9 },  { -6, 27, 115, -8 },  { -5, 25, 116, -8 },
-  { -5, 22, 118, -7 },  { -4, 20, 119, -7 },  { -4, 18, 120, -6 },
-  { -3, 15, 121, -5 },  { -3, 13, 123, -5 },  { -3, 11, 124, -4 },
-  { -2, 8, 125, -3 },   { -1, 6, 126, -3 },   { -1, 4, 127, -2 },
-  { 0, 2, 127, -1 },
-};
-
 static const int16_t
     filter_ntap[WARPEDPIXEL_PREC_SHIFTS][WARPEDPIXEL_FILTER_TAPS] = {
       { 0, 0, 128, 0, 0, 0 },      { 0, -1, 128, 2, -1, 0 },
@@ -700,7 +675,7 @@
     g = s = scale = 0.0;
     if (i < m) {
       for (k = i; k < m; k++) scale += fabs(u[k][i]);
-      if (scale) {
+      if (scale != 0.) {
         for (k = i; k < m; k++) {
           u[k][i] /= scale;
           s += u[k][i] * u[k][i];
@@ -721,7 +696,7 @@
     g = s = scale = 0.0;
     if (i < m && i != n - 1) {
       for (k = l; k < n; k++) scale += fabs(u[i][k]);
-      if (scale) {
+      if (scale != 0.) {
         for (k = l; k < n; k++) {
           u[i][k] /= scale;
           s += u[i][k] * u[i][k];
@@ -743,7 +718,7 @@
 
   for (i = n - 1; i >= 0; i--) {
     if (i < n - 1) {
-      if (g) {
+      if (g != 0.) {
         for (j = l; j < n; j++) v[j][i] = (u[i][j] / u[i][l]) / g;
         for (j = l; j < n; j++) {
           for (s = 0.0, k = l; k < n; k++) s += u[i][k] * v[k][j];
@@ -760,7 +735,7 @@
     l = i + 1;
     g = w[i];
     for (j = l; j < n; j++) u[i][j] = 0.0;
-    if (g) {
+    if (g != 0.) {
       g = 1.0 / g;
       for (j = l; j < n; j++) {
         for (s = 0.0, k = l; k < m; k++) s += u[k][i] * u[k][j];
@@ -848,7 +823,7 @@
         }
         z = pythag(f, h);
         w[j] = z;
-        if (z) {
+        if (z != 0.) {
           z = 1.0 / z;
           c = f * z;
           s = h * z;
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index e9e1def..026dcbc 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -442,6 +442,10 @@
   // TODO(slavarnway): Generate sb_type based on bwl and bhl, instead of
   // passing bsize from decode_partition().
   xd->mi[0]->mbmi.sb_type = bsize;
+#if CONFIG_RD_DEBUG
+  xd->mi[0]->mbmi.mi_row = mi_row;
+  xd->mi[0]->mbmi.mi_col = mi_col;
+#endif
   for (y = 0; y < y_mis; ++y)
     for (x = !y; x < x_mis; ++x) xd->mi[y * cm->mi_stride + x] = xd->mi[0];
 
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 81f0c5c..7c78eda 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -717,7 +717,8 @@
 
 static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp,
                            const TOKENEXTRA *const stop,
-                           aom_bit_depth_t bit_depth, const TX_SIZE tx_size) {
+                           aom_bit_depth_t bit_depth, const TX_SIZE tx_size,
+                           TOKEN_STATS *token_stats) {
   const TOKENEXTRA *p = *tp;
 #if CONFIG_VAR_TX
   int count = 0;
@@ -745,10 +746,11 @@
 
 #if CONFIG_EC_MULTISYMBOL
     /* skip one or two nodes */
-    if (!p->skip_eob_node) aom_write(w, token != EOB_TOKEN, p->context_tree[0]);
+    if (!p->skip_eob_node)
+      aom_write_record(w, token != EOB_TOKEN, p->context_tree[0], token_stats);
 
     if (token != EOB_TOKEN) {
-      aom_write(w, token != ZERO_TOKEN, p->context_tree[1]);
+      aom_write_record(w, token != ZERO_TOKEN, p->context_tree[1], token_stats);
 
       if (token != ZERO_TOKEN) {
         aom_write_symbol(w, token - ONE_TOKEN, *p->token_cdf,
@@ -760,19 +762,21 @@
     if (p->skip_eob_node)
       coef_length -= p->skip_eob_node;
     else
-      aom_write(w, token != EOB_TOKEN, p->context_tree[0]);
+      aom_write_record(w, token != EOB_TOKEN, p->context_tree[0], token_stats);
 
     if (token != EOB_TOKEN) {
-      aom_write(w, token != ZERO_TOKEN, p->context_tree[1]);
+      aom_write_record(w, token != ZERO_TOKEN, p->context_tree[1], token_stats);
 
       if (token != ZERO_TOKEN) {
-        aom_write(w, token != ONE_TOKEN, p->context_tree[2]);
+        aom_write_record(w, token != ONE_TOKEN, p->context_tree[2],
+                         token_stats);
 
         if (token != ONE_TOKEN) {
           const int unconstrained_len = UNCONSTRAINED_NODES - p->skip_eob_node;
-          aom_write_tree(w, av1_coef_con_tree,
-                         av1_pareto8_full[p->context_tree[PIVOT_NODE] - 1],
-                         coef_value, coef_length - unconstrained_len, 0);
+          aom_write_tree_record(
+              w, av1_coef_con_tree,
+              av1_pareto8_full[p->context_tree[PIVOT_NODE] - 1], coef_value,
+              coef_length - unconstrained_len, 0, token_stats);
         }
       }
     }
@@ -800,12 +804,12 @@
             --skip_bits;
             assert(!bb);
           } else {
-            aom_write(w, bb, pb[index]);
+            aom_write_record(w, bb, pb[index], token_stats);
           }
         }
       }
 
-      aom_write_bit(w, bit_string & 1);
+      aom_write_bit_record(w, bit_string & 1, token_stats);
     }
     ++p;
 
@@ -824,7 +828,7 @@
                             MB_MODE_INFO *mbmi, int plane,
                             BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth,
                             int block, int blk_row, int blk_col,
-                            TX_SIZE tx_size) {
+                            TX_SIZE tx_size, TOKEN_STATS *token_stats) {
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
   const int tx_row = blk_row >> (1 - pd->subsampling_y);
@@ -840,7 +844,7 @@
             : mbmi->inter_tx_size[tx_row][tx_col];
 
   if (tx_size == plane_tx_size) {
-    pack_mb_tokens(w, tp, tok_end, bit_depth, tx_size);
+    pack_mb_tokens(w, tp, tok_end, bit_depth, tx_size, token_stats);
   } else {
     const int bsl = block_size_wide[bsize] >> (tx_size_wide_log2[0] + 1);
     int i;
@@ -856,7 +860,7 @@
       if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
 
       pack_txb_tokens(w, tp, tok_end, xd, mbmi, plane, plane_bsize, bit_depth,
-                      block, offsetr, offsetc, sub_txs);
+                      block, offsetr, offsetc, sub_txs, token_stats);
       block += step;
     }
   }
@@ -1712,6 +1716,20 @@
   write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col)
 #endif  // CONFIG_SUPERTX
 
+#if CONFIG_RD_DEBUG
+static void dump_mode_info(MODE_INFO *mi) {  // prints mi's fields to stdout as an "a == x && b == y" chain
+  printf("\nmi->mbmi.mi_row == %d\n", mi->mbmi.mi_row);
+  printf("&& mi->mbmi.mi_col == %d\n", mi->mbmi.mi_col);
+  printf("&& mi->mbmi.sb_type == %d\n", mi->mbmi.sb_type);
+  printf("&& mi->mbmi.tx_size == %d\n", mi->mbmi.tx_size);
+  if (mi->mbmi.sb_type >= BLOCK_8X8) {  // report the block-level mode
+    printf("&& mi->mbmi.mode == %d\n", mi->mbmi.mode);
+  } else {  // sb_type below BLOCK_8X8: report bmi[0].as_mode instead
+    printf("&& mi->bmi[0].as_mode == %d\n", mi->bmi[0].as_mode);
+  }
+}
+#endif
+
 static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
                           aom_writer *w, const TOKENEXTRA **tok,
                           const TOKENEXTRA *const tok_end,
@@ -1724,6 +1742,9 @@
   MODE_INFO *m;
   int plane;
   int bh, bw;
+#if CONFIG_RD_DEBUG
+  int64_t txb_coeff_cost[MAX_MB_PLANE] = { 0 };
+#endif
 #if CONFIG_RANS
   (void)tok;
   (void)tok_end;
@@ -1823,11 +1844,18 @@
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
       TX_SIZE tx_size =
           plane ? get_uv_tx_size(mbmi, &xd->plane[plane]) : mbmi->tx_size;
-
-      if (is_inter_block(mbmi) && !is_rect_tx(tx_size)) {
-#else
-      if (is_inter_block(mbmi)) {
 #endif
+
+      TOKEN_STATS token_stats;
+      token_stats.cost = 0;
+
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+
+      if (is_inter_block(mbmi) && !is_rect_tx(tx_size))
+#else
+      if (is_inter_block(mbmi))
+#endif
+      {
         const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
         int block = 0;
         const int step =
@@ -1837,7 +1865,8 @@
         for (row = 0; row < num_4x4_h; row += bkh) {
           for (col = 0; col < num_4x4_w; col += bkw) {
             pack_txb_tokens(w, tok, tok_end, xd, mbmi, plane, plane_bsize,
-                            cm->bit_depth, block, row, col, max_tx_size);
+                            cm->bit_depth, block, row, col, max_tx_size,
+                            &token_stats);
             block += step;
           }
         }
@@ -1849,17 +1878,34 @@
 
         for (row = 0; row < num_4x4_h; row += bkh)
           for (col = 0; col < num_4x4_w; col += bkw)
-            pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
+            pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
       }
 #else
       TX_SIZE tx =
           plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane]) : m->mbmi.tx_size;
-      pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
+      TOKEN_STATS token_stats;
+      token_stats.cost = 0;
+      pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
 #endif  // CONFIG_VAR_TX
+
+#if CONFIG_RD_DEBUG
+      txb_coeff_cost[plane] += token_stats.cost;
+#else
+      (void)token_stats;
+#endif
+
       assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
       (*tok)++;
     }
   }
+#if CONFIG_RD_DEBUG
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    if (m->mbmi.txb_coeff_cost[plane] != txb_coeff_cost[plane]) {
+      dump_mode_info(m);
+      assert(0);
+    }
+  }
+#endif
 }
 
 static void write_partition(const AV1_COMMON *const cm,
@@ -2057,9 +2103,11 @@
         BLOCK_SIZE txb_size = txsize_to_bsize[tx];
         int bw = num_4x4_blocks_wide_lookup[txb_size];
 
+        TOKEN_STATS token_stats;
+        token_stats.cost = 0;
         for (row = 0; row < num_4x4_h; row += bw)
           for (col = 0; col < num_4x4_w; col += bw)
-            pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
+            pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
         assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
         (*tok)++;
       }
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 8b0de2f..6f82e83 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -1634,6 +1634,10 @@
   set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
   mbmi = &xd->mi[0]->mbmi;
   mbmi->sb_type = bsize;
+#if CONFIG_RD_DEBUG
+  mbmi->mi_row = mi_row;
+  mbmi->mi_col = mi_col;
+#endif
 #if CONFIG_SUPERTX
   // We set tx_size here as skip blocks would otherwise not set it.
   // tx_size needs to be set at this point as supertx_enable in
@@ -6181,11 +6185,7 @@
     const struct macroblockd_plane *const pd = &xd->plane[plane];
     int coeff_ctx = 1;
     RD_STATS this_rd_stats;
-
-    this_rd_stats.rate = 0;
-    this_rd_stats.dist = 0;
-    this_rd_stats.sse = 0;
-    this_rd_stats.skip = 1;
+    av1_init_rd_stats(&this_rd_stats);
 
     tx_size = max_txsize_lookup[bsize];
     tx_size =
@@ -6240,10 +6240,7 @@
     mbmi->tx_type = tx_type;
 
 #if CONFIG_VAR_TX
-    this_rd_stats.rate = 0;
-    this_rd_stats.dist = 0;
-    this_rd_stats.sse = 0;
-    this_rd_stats.skip = 1;
+    av1_init_rd_stats(&this_rd_stats);
 
     av1_get_entropy_contexts(bsize, tx_size, pd, ctxa, ctxl);
     coeff_ctx = combine_entropy_contexts(ctxa[0], ctxl[0]);
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 217a365..d23a7ba 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -2936,6 +2936,7 @@
   int max_blocks_wide = block_size_wide[plane_bsize];
   const int diff_stride = max_blocks_wide;
   const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+  int txb_coeff_cost;
 #if CONFIG_EXT_TX
   assert(tx_size < TX_SIZES);
 #endif  // CONFIG_EXT_TX
@@ -3035,9 +3036,13 @@
     }
   }
   rd_stats->dist += tmp * 16;
-  rd_stats->rate += av1_cost_coeffs(cm, x, plane, block, coeff_ctx, tx_size,
-                                    scan_order->scan, scan_order->neighbors, 0);
+  txb_coeff_cost = av1_cost_coeffs(cm, x, plane, block, coeff_ctx, tx_size,
+                                   scan_order->scan, scan_order->neighbors, 0);
+  rd_stats->rate += txb_coeff_cost;
   rd_stats->skip &= (p->eobs[block] == 0);
+#if CONFIG_RD_DEBUG
+  rd_stats->txb_coeff_cost[plane] += txb_coeff_cost;
+#endif
 }
 
 static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
@@ -3067,12 +3072,11 @@
       txfm_partition_context(tx_above + (blk_col >> 1),
                              tx_left + (blk_row >> 1), mbmi->sb_type, tx_size);
 
-  int64_t sum_dist = 0, sum_bsse = 0;
   int64_t sum_rd = INT64_MAX;
-  int sum_rate = 0;
-  int all_skip = 1;
   int tmp_eob = 0;
   int zero_blk_rate;
+  RD_STATS sum_rd_stats;
+  av1_init_rd_stats(&sum_rd_stats);
 
 #if CONFIG_EXT_TX
   assert(tx_size < TX_SIZES);
@@ -3085,10 +3089,7 @@
 
   coeff_ctx = get_entropy_context(tx_size, pta, ptl);
 
-  rd_stats->rate = 0;
-  rd_stats->dist = 0;
-  rd_stats->sse = 0;
-  rd_stats->skip = 1;
+  av1_init_rd_stats(rd_stats);
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
@@ -3129,7 +3130,8 @@
     int this_cost_valid = 1;
     int64_t tmp_rd = 0;
 
-    sum_rate = av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1);
+    sum_rd_stats.rate =
+        av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1);
 #if CONFIG_EXT_TX
     assert(tx_size < TX_SIZES);
 #endif  // CONFIG_EXT_TX
@@ -3143,12 +3145,10 @@
                       depth + 1, plane_bsize, ta, tl, tx_above, tx_left,
                       &this_rd_stats, ref_best_rd - tmp_rd, &this_cost_valid);
 
-      sum_rate += this_rd_stats.rate;
-      sum_dist += this_rd_stats.dist;
-      sum_bsse += this_rd_stats.sse;
-      all_skip &= this_rd_stats.skip;
+      av1_merge_rd_stats(&sum_rd_stats, &this_rd_stats);
 
-      tmp_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+      tmp_rd =
+          RDCOST(x->rdmult, x->rddiv, sum_rd_stats.rate, sum_rd_stats.dist);
       if (this_rd < tmp_rd) break;
       block += sub_step;
     }
@@ -3169,10 +3169,7 @@
     if (this_rd == INT64_MAX) *is_cost_valid = 0;
     x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip;
   } else {
-    rd_stats->rate = sum_rate;
-    rd_stats->dist = sum_dist;
-    rd_stats->sse = sum_bsse;
-    rd_stats->skip = all_skip;
+    *rd_stats = sum_rd_stats;
     if (sum_rd == INT64_MAX) *is_cost_valid = 0;
   }
 }
@@ -3186,10 +3183,7 @@
 
   if (ref_best_rd < 0) is_cost_valid = 0;
 
-  rd_stats->rate = 0;
-  rd_stats->dist = 0;
-  rd_stats->sse = 0;
-  rd_stats->skip = 1;
+  av1_init_rd_stats(rd_stats);
 
   if (is_cost_valid) {
     const struct macroblockd_plane *const pd = &xd->plane[0];
@@ -3208,10 +3202,7 @@
     TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
 
     RD_STATS pn_rd_stats;
-    pn_rd_stats.rate = 0;
-    pn_rd_stats.skip = 1;
-    pn_rd_stats.dist = 0;
-    pn_rd_stats.sse = 0;
+    av1_init_rd_stats(&pn_rd_stats);
 
     av1_get_entropy_contexts(bsize, TX_4X4, pd, ctxa, ctxl);
     memcpy(tx_above, xd->above_txfm_context,
@@ -3225,10 +3216,7 @@
                         mi_height != mi_width, plane_bsize, ctxa, ctxl,
                         tx_above, tx_left, &pn_rd_stats, ref_best_rd - this_rd,
                         &is_cost_valid);
-        rd_stats->rate += pn_rd_stats.rate;
-        rd_stats->dist += pn_rd_stats.dist;
-        rd_stats->sse += pn_rd_stats.sse;
-        rd_stats->skip &= pn_rd_stats.skip;
+        av1_merge_rd_stats(rd_stats, &pn_rd_stats);
         this_rd += AOMMIN(
             RDCOST(x->rdmult, x->rddiv, pn_rd_stats.rate, pn_rd_stats.dist),
             RDCOST(x->rdmult, x->rddiv, 0, pn_rd_stats.sse));
@@ -3243,10 +3231,7 @@
 
   if (!is_cost_valid) {
     // reset cost value
-    rd_stats->rate = INT_MAX;
-    rd_stats->dist = INT64_MAX;
-    rd_stats->sse = INT64_MAX;
-    rd_stats->skip = 0;
+    av1_invalid_rd_stats(rd_stats);
   }
 }
 
@@ -3385,17 +3370,11 @@
     prune = prune_tx_types(cpi, bsize, x, xd, 0);
 #endif
 
-  rd_stats->dist = INT64_MAX;
-  rd_stats->rate = INT_MAX;
-  rd_stats->skip = 0;
-  rd_stats->sse = INT64_MAX;
+  av1_invalid_rd_stats(rd_stats);
 
   for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
     RD_STATS this_rd_stats;
-    this_rd_stats.rate = 0;
-    this_rd_stats.skip = 1;
-    this_rd_stats.dist = 0;
-    this_rd_stats.sse = 0;
+    av1_init_rd_stats(&this_rd_stats);
 #if CONFIG_EXT_TX
     if (is_inter) {
       if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue;
@@ -3438,6 +3417,10 @@
     for (idx = 0; idx < xd->n8_w; ++idx)
       mbmi->inter_tx_size[idy][idx] = best_tx_size[idy][idx];
   mbmi->tx_size = best_tx;
+#if CONFIG_RD_DEBUG
+  // record plane y's transform block coefficient cost
+  mbmi->txb_coeff_cost[0] = rd_stats->txb_coeff_cost[0];
+#endif
   memcpy(x->blk_skip[0], best_blk_skip, sizeof(best_blk_skip[0]) * n4);
 }
 
@@ -3512,10 +3495,7 @@
 
   if (ref_best_rd < 0) is_cost_valid = 0;
 
-  rd_stats->rate = 0;
-  rd_stats->dist = 0;
-  rd_stats->sse = 0;
-  rd_stats->skip = 1;
+  av1_init_rd_stats(rd_stats);
 
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
   if (is_rect_tx(mbmi->tx_size)) {
@@ -3544,10 +3524,7 @@
     ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE];
     ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE];
     RD_STATS pn_rd_stats;
-    pn_rd_stats.rate = 0;
-    pn_rd_stats.skip = 1;
-    pn_rd_stats.dist = 0;
-    pn_rd_stats.sse = 0;
+    av1_init_rd_stats(&pn_rd_stats);
 
     av1_get_entropy_contexts(bsize, TX_4X4, pd, ta, tl);
 
@@ -3564,10 +3541,7 @@
       break;
     }
 
-    rd_stats->rate += pn_rd_stats.rate;
-    rd_stats->dist += pn_rd_stats.dist;
-    rd_stats->sse += pn_rd_stats.sse;
-    rd_stats->skip &= pn_rd_stats.skip;
+    av1_merge_rd_stats(rd_stats, &pn_rd_stats);
 
     this_rd =
         AOMMIN(RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist),
@@ -3581,10 +3555,7 @@
 
   if (!is_cost_valid) {
     // reset cost value
-    rd_stats->rate = INT_MAX;
-    rd_stats->dist = INT64_MAX;
-    rd_stats->sse = INT64_MAX;
-    rd_stats->skip = 0;
+    av1_invalid_rd_stats(rd_stats);
   }
 
   return is_cost_valid;
@@ -7554,6 +7525,11 @@
 #if CONFIG_VAR_TX
       is_cost_valid_uv =
           inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, ref_best_rd - rdcosty);
+#if CONFIG_RD_DEBUG
+      // record uv planes' transform block coefficient cost
+      mbmi->txb_coeff_cost[1] = rd_stats_uv.txb_coeff_cost[1];
+      mbmi->txb_coeff_cost[2] = rd_stats_uv.txb_coeff_cost[2];
+#endif
       *rate_uv = rd_stats_uv.rate;
       distortion_uv = rd_stats_uv.dist;
       skippable_uv = rd_stats_uv.skip;
@@ -7569,8 +7545,8 @@
 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
         continue;
 #else
-      restore_dst_buf(xd, orig_dst, orig_dst_stride);
-      return INT64_MAX;
+        restore_dst_buf(xd, orig_dst, orig_dst_stride);
+        return INT64_MAX;
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
       }
 
@@ -9560,9 +9536,6 @@
         best_mbmode.mode = ZEROMV;
 #if CONFIG_EXT_INTER
     } else {
-      const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0],
-                                           best_mbmode.ref_frame[1] };
-
       if (frame_mv[NEAREST_NEARESTMV][refs[0]].as_int ==
               best_mbmode.mv[0].as_int &&
           frame_mv[NEAREST_NEARESTMV][refs[1]].as_int ==
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index 8c65770..cb9666a 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -33,7 +33,53 @@
   int64_t dist;
   int64_t sse;
   int skip;
+#if CONFIG_RD_DEBUG
+  int txb_coeff_cost[MAX_MB_PLANE];
+#endif
 } RD_STATS;
+
+static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) {  // reset to the neutral start value for av1_merge_rd_stats()
+#if CONFIG_RD_DEBUG
+  int plane;
+#endif
+  rd_stats->rate = 0;
+  rd_stats->dist = 0;
+  rd_stats->sse = 0;
+  rd_stats->skip = 1;  // starts at 1 because merging combines skip with &=
+#if CONFIG_RD_DEBUG
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane)
+    rd_stats->txb_coeff_cost[plane] = 0;  // per-plane debug totals also start at zero
+#endif
+}
+
+static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) {  // set every cost field to its type's maximum
+#if CONFIG_RD_DEBUG
+  int plane;
+#endif
+  rd_stats->rate = INT_MAX;  // NOTE(review): adding a positive value to these maxima would overflow; confirm no caller merges into an invalidated RD_STATS
+  rd_stats->dist = INT64_MAX;
+  rd_stats->sse = INT64_MAX;
+  rd_stats->skip = 0;
+#if CONFIG_RD_DEBUG
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane)
+    rd_stats->txb_coeff_cost[plane] = INT_MAX;  // debug totals are maxed out too
+#endif
+}
+
+static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
+                                      const RD_STATS *rd_stats_src) {  // fold src into dst; src is not modified
+#if CONFIG_RD_DEBUG
+  int plane;
+#endif
+  rd_stats_dst->rate += rd_stats_src->rate;  // rate, dist and sse are summed
+  rd_stats_dst->dist += rd_stats_src->dist;
+  rd_stats_dst->sse += rd_stats_src->sse;
+  rd_stats_dst->skip &= rd_stats_src->skip;  // dst->skip ends up 0 if src->skip is 0
+#if CONFIG_RD_DEBUG
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane)
+    rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane];  // summed per plane
+#endif
+}
 #endif
 
 int av1_cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,