Refactor/expand dry_run types to return coef rate

Adds the functionality to return the rate cost due to
coefficients without doing a full search of all modes.
This will subsequently be used in various experiments,
including the new_quant experiment, to search quantization
profiles at the superblock level without repeating the
full mode/partition search.

Change-Id: I4aad3f3f0c8b8dfdea38f8f4f094a98283f47f08
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 87e7d51..ea865b6 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -61,8 +61,9 @@
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
 static void encode_superblock(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
-                              int output_enabled, int mi_row, int mi_col,
-                              BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
+                              RUN_TYPE dry_run, int mi_row, int mi_col,
+                              BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                              int *rate);
 
 #if CONFIG_SUPERTX
 static int check_intra_b(PICK_MODE_CONTEXT *ctx);
@@ -80,13 +81,13 @@
 static void predict_sb_complex(AV1_COMP *cpi, ThreadData *td,
                                const TileInfo *const tile, int mi_row,
                                int mi_col, int mi_row_ori, int mi_col_ori,
-                               int output_enabled, BLOCK_SIZE bsize,
+                               RUN_TYPE dry_run, BLOCK_SIZE bsize,
                                BLOCK_SIZE top_bsize, uint8_t *dst_buf[3],
                                int dst_stride[3], PC_TREE *pc_tree);
 static void update_state_sb_supertx(AV1_COMP *cpi, ThreadData *td,
                                     const TileInfo *const tile, int mi_row,
                                     int mi_col, BLOCK_SIZE bsize,
-                                    int output_enabled, PC_TREE *pc_tree);
+                                    RUN_TYPE dry_run, PC_TREE *pc_tree);
 static void rd_supertx_sb(AV1_COMP *cpi, ThreadData *td,
                           const TileInfo *const tile, int mi_row, int mi_col,
                           BLOCK_SIZE bsize, int *tmp_rate, int64_t *tmp_dist,
@@ -1025,7 +1026,7 @@
 
 static void update_state(AV1_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx,
                          int mi_row, int mi_col, BLOCK_SIZE bsize,
-                         int output_enabled) {
+                         RUN_TYPE dry_run) {
   int i, x_idx, y;
   AV1_COMMON *const cm = &cpi->common;
   RD_COUNTS *const rdc = &td->rd_counts;
@@ -1139,7 +1140,7 @@
            sizeof(uint8_t) * ctx->num_4x4_blk);
 #endif
 
-  if (!output_enabled) return;
+  if (dry_run) return;
 
 #if CONFIG_INTERNAL_STATS
   if (frame_is_intra_only(cm)) {
@@ -1208,7 +1209,7 @@
 #if CONFIG_SUPERTX
 static void update_state_supertx(AV1_COMP *cpi, ThreadData *td,
                                  PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
-                                 BLOCK_SIZE bsize, int output_enabled) {
+                                 BLOCK_SIZE bsize, RUN_TYPE dry_run) {
   int y, x_idx;
 #if CONFIG_VAR_TX || CONFIG_REF_MV
   int i;
@@ -1316,7 +1317,7 @@
   // Turn motion variation off for supertx
   mbmi->motion_variation = SIMPLE_TRANSLATION;
 
-  if (!output_enabled) return;
+  if (dry_run) return;
 
   if (!frame_is_intra_only(cm)) {
     av1_update_mv_count(td);
@@ -1354,7 +1355,7 @@
 static void update_state_sb_supertx(AV1_COMP *cpi, ThreadData *td,
                                     const TileInfo *const tile, int mi_row,
                                     int mi_col, BLOCK_SIZE bsize,
-                                    int output_enabled, PC_TREE *pc_tree) {
+                                    RUN_TYPE dry_run, PC_TREE *pc_tree) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -1378,27 +1379,27 @@
     case PARTITION_NONE:
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
       update_state_supertx(cpi, td, &pc_tree->none, mi_row, mi_col, subsize,
-                           output_enabled);
+                           dry_run);
       break;
     case PARTITION_VERT:
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
       update_state_supertx(cpi, td, &pc_tree->vertical[0], mi_row, mi_col,
-                           subsize, output_enabled);
+                           subsize, dry_run);
       if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
         set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
         update_state_supertx(cpi, td, &pc_tree->vertical[1], mi_row,
-                             mi_col + hbs, subsize, output_enabled);
+                             mi_col + hbs, subsize, dry_run);
       }
       pmc = &pc_tree->vertical_supertx;
       break;
     case PARTITION_HORZ:
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
       update_state_supertx(cpi, td, &pc_tree->horizontal[0], mi_row, mi_col,
-                           subsize, output_enabled);
+                           subsize, dry_run);
       if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
         set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
         update_state_supertx(cpi, td, &pc_tree->horizontal[1], mi_row + hbs,
-                             mi_col, subsize, output_enabled);
+                             mi_col, subsize, dry_run);
       }
       pmc = &pc_tree->horizontal_supertx;
       break;
@@ -1406,20 +1407,20 @@
       if (bsize == BLOCK_8X8) {
         set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
         update_state_supertx(cpi, td, pc_tree->leaf_split[0], mi_row, mi_col,
-                             subsize, output_enabled);
+                             subsize, dry_run);
       } else {
         set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
-        update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, subsize,
-                                output_enabled, pc_tree->split[0]);
+        update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, subsize, dry_run,
+                                pc_tree->split[0]);
         set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
         update_state_sb_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize,
-                                output_enabled, pc_tree->split[1]);
+                                dry_run, pc_tree->split[1]);
         set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
         update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize,
-                                output_enabled, pc_tree->split[2]);
+                                dry_run, pc_tree->split[2]);
         set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, subsize);
         update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs,
-                                subsize, output_enabled, pc_tree->split[3]);
+                                subsize, dry_run, pc_tree->split[3]);
       }
       pmc = &pc_tree->split_supertx;
       break;
@@ -1427,49 +1428,49 @@
     case PARTITION_HORZ_A:
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2);
       update_state_supertx(cpi, td, &pc_tree->horizontala[0], mi_row, mi_col,
-                           bsize2, output_enabled);
+                           bsize2, dry_run);
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2);
       update_state_supertx(cpi, td, &pc_tree->horizontala[1], mi_row,
-                           mi_col + hbs, bsize2, output_enabled);
+                           mi_col + hbs, bsize2, dry_run);
       set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
       update_state_supertx(cpi, td, &pc_tree->horizontala[2], mi_row + hbs,
-                           mi_col, subsize, output_enabled);
+                           mi_col, subsize, dry_run);
       pmc = &pc_tree->horizontala_supertx;
       break;
     case PARTITION_HORZ_B:
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
       update_state_supertx(cpi, td, &pc_tree->horizontalb[0], mi_row, mi_col,
-                           subsize, output_enabled);
+                           subsize, dry_run);
       set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2);
       update_state_supertx(cpi, td, &pc_tree->horizontalb[1], mi_row + hbs,
-                           mi_col, bsize2, output_enabled);
+                           mi_col, bsize2, dry_run);
       set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2);
       update_state_supertx(cpi, td, &pc_tree->horizontalb[2], mi_row + hbs,
-                           mi_col + hbs, bsize2, output_enabled);
+                           mi_col + hbs, bsize2, dry_run);
       pmc = &pc_tree->horizontalb_supertx;
       break;
     case PARTITION_VERT_A:
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2);
       update_state_supertx(cpi, td, &pc_tree->verticala[0], mi_row, mi_col,
-                           bsize2, output_enabled);
+                           bsize2, dry_run);
       set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2);
       update_state_supertx(cpi, td, &pc_tree->verticala[1], mi_row + hbs,
-                           mi_col, bsize2, output_enabled);
+                           mi_col, bsize2, dry_run);
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
       update_state_supertx(cpi, td, &pc_tree->verticala[2], mi_row,
-                           mi_col + hbs, subsize, output_enabled);
+                           mi_col + hbs, subsize, dry_run);
       pmc = &pc_tree->verticala_supertx;
       break;
     case PARTITION_VERT_B:
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
       update_state_supertx(cpi, td, &pc_tree->verticalb[0], mi_row, mi_col,
-                           subsize, output_enabled);
+                           subsize, dry_run);
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2);
       update_state_supertx(cpi, td, &pc_tree->verticalb[1], mi_row,
-                           mi_col + hbs, bsize2, output_enabled);
+                           mi_col + hbs, bsize2, dry_run);
       set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2);
       update_state_supertx(cpi, td, &pc_tree->verticalb[2], mi_row + hbs,
-                           mi_col + hbs, bsize2, output_enabled);
+                           mi_col + hbs, bsize2, dry_run);
       pmc = &pc_tree->verticalb_supertx;
       break;
 #endif  // CONFIG_EXT_PARTITION_TYPES
@@ -2109,21 +2110,21 @@
 }
 
 static void encode_b(AV1_COMP *cpi, const TileInfo *const tile, ThreadData *td,
-                     TOKENEXTRA **tp, int mi_row, int mi_col,
-                     int output_enabled, BLOCK_SIZE bsize,
+                     TOKENEXTRA **tp, int mi_row, int mi_col, RUN_TYPE dry_run,
+                     BLOCK_SIZE bsize,
 #if CONFIG_EXT_PARTITION_TYPES
                      PARTITION_TYPE partition,
 #endif
-                     PICK_MODE_CONTEXT *ctx) {
+                     PICK_MODE_CONTEXT *ctx, int *rate) {
   MACROBLOCK *const x = &td->mb;
   set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
 #if CONFIG_EXT_PARTITION_TYPES
   x->e_mbd.mi[0]->mbmi.partition = partition;
 #endif
-  update_state(cpi, td, ctx, mi_row, mi_col, bsize, output_enabled);
-  encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
+  update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
+  encode_superblock(cpi, td, tp, dry_run, mi_row, mi_col, bsize, ctx, rate);
 
-  if (output_enabled) {
+  if (!dry_run) {
 #if CONFIG_SUPERTX
     update_stats(&cpi->common, td, 0);
 #else
@@ -2133,8 +2134,8 @@
 }
 
 static void encode_sb(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile,
-                      TOKENEXTRA **tp, int mi_row, int mi_col,
-                      int output_enabled, BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+                      TOKENEXTRA **tp, int mi_row, int mi_col, RUN_TYPE dry_run,
+                      BLOCK_SIZE bsize, PC_TREE *pc_tree, int *rate) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -2151,7 +2152,7 @@
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
 
-  if (output_enabled) td->counts->partition[ctx][partition]++;
+  if (!dry_run) td->counts->partition[ctx][partition]++;
 
 #if CONFIG_SUPERTX
   if (!frame_is_intra_only(cm) && bsize <= MAX_SUPERTX_BLOCK_SIZE &&
@@ -2167,33 +2168,34 @@
       int dst_stride[3];
       set_skip_context(xd, mi_row, mi_col);
       set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
-      update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize,
-                              output_enabled, pc_tree);
+      update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, dry_run,
+                              pc_tree);
 
       av1_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
       for (i = 0; i < MAX_MB_PLANE; i++) {
         dst_buf[i] = xd->plane[i].dst.buf;
         dst_stride[i] = xd->plane[i].dst.stride;
       }
-      predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col,
-                         output_enabled, bsize, bsize, dst_buf, dst_stride,
-                         pc_tree);
+      predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, dry_run,
+                         bsize, bsize, dst_buf, dst_stride, pc_tree);
 
       set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
       set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize);
 
       if (!x->skip) {
+        int this_rate = 0;
         x->skip_optimize = 0;
         x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
 
         av1_encode_sb_supertx(x, bsize);
-        av1_tokenize_sb_supertx(cpi, td, tp, !output_enabled, bsize);
+        av1_tokenize_sb_supertx(cpi, td, tp, dry_run, bsize, rate);
+        if (rate) *rate += this_rate;
       } else {
         xd->mi[0]->mbmi.skip = 1;
-        if (output_enabled) td->counts->skip[av1_get_skip_context(xd)][1]++;
+        if (!dry_run) td->counts->skip[av1_get_skip_context(xd)][1]++;
         reset_skip_context(xd, bsize);
       }
-      if (output_enabled) {
+      if (!dry_run) {
         for (y_idx = 0; y_idx < mi_height; y_idx++)
           for (x_idx = 0; x_idx < mi_width; x_idx++) {
             if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width >
@@ -2234,7 +2236,7 @@
 #endif  // CONFIG_VAR_TX
       return;
     } else {
-      if (output_enabled) {
+      if (!dry_run) {
         td->counts->supertx[partition_supertx_context_lookup[partition]]
                            [supertx_size][0]++;
       }
@@ -2244,93 +2246,91 @@
 
   switch (partition) {
     case PARTITION_NONE:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
 #if CONFIG_EXT_PARTITION_TYPES
                partition,
 #endif
-               &pc_tree->none);
+               &pc_tree->none, rate);
       break;
     case PARTITION_VERT:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
 #if CONFIG_EXT_PARTITION_TYPES
                partition,
 #endif
-               &pc_tree->vertical[0]);
+               &pc_tree->vertical[0], rate);
       if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
-        encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled,
-                 subsize,
+        encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
 #if CONFIG_EXT_PARTITION_TYPES
                  partition,
 #endif
-                 &pc_tree->vertical[1]);
+                 &pc_tree->vertical[1], rate);
       }
       break;
     case PARTITION_HORZ:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
 #if CONFIG_EXT_PARTITION_TYPES
                partition,
 #endif
-               &pc_tree->horizontal[0]);
+               &pc_tree->horizontal[0], rate);
       if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
-        encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled,
-                 subsize,
+        encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
 #if CONFIG_EXT_PARTITION_TYPES
                  partition,
 #endif
-                 &pc_tree->horizontal[1]);
+                 &pc_tree->horizontal[1], rate);
       }
       break;
     case PARTITION_SPLIT:
       if (bsize == BLOCK_8X8) {
-        encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+        encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
 #if CONFIG_EXT_PARTITION_TYPES
                  partition,
 #endif
-                 pc_tree->leaf_split[0]);
+                 pc_tree->leaf_split[0], rate);
       } else {
-        encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
-                  pc_tree->split[0]);
-        encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
-                  subsize, pc_tree->split[1]);
-        encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
-                  subsize, pc_tree->split[2]);
-        encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
-                  subsize, pc_tree->split[3]);
+        encode_sb(cpi, td, tile, tp, mi_row, mi_col, dry_run, subsize,
+                  pc_tree->split[0], rate);
+        encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, dry_run, subsize,
+                  pc_tree->split[1], rate);
+        encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, dry_run, subsize,
+                  pc_tree->split[2], rate);
+        encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, dry_run,
+                  subsize, pc_tree->split[3], rate);
       }
       break;
 #if CONFIG_EXT_PARTITION_TYPES
     case PARTITION_HORZ_A:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, bsize2,
-               partition, &pc_tree->horizontala[0]);
-      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, bsize2,
-               partition, &pc_tree->horizontala[1]);
-      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, subsize,
-               partition, &pc_tree->horizontala[2]);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition,
+               &pc_tree->horizontala[0], rate);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+               partition, &pc_tree->horizontala[1], rate);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+               partition, &pc_tree->horizontala[2], rate);
       break;
     case PARTITION_HORZ_B:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
-               partition, &pc_tree->horizontalb[0]);
-      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, bsize2,
-               partition, &pc_tree->horizontalb[1]);
-      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, output_enabled,
-               bsize2, partition, &pc_tree->horizontalb[2]);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition,
+               &pc_tree->horizontalb[0], rate);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+               partition, &pc_tree->horizontalb[1], rate);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2,
+               partition, &pc_tree->horizontalb[2], rate);
       break;
     case PARTITION_VERT_A:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, bsize2,
-               partition, &pc_tree->verticala[0]);
-      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, bsize2,
-               partition, &pc_tree->verticala[1]);
-      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, subsize,
-               partition, &pc_tree->verticala[2]);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition,
+               &pc_tree->verticala[0], rate);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+               partition, &pc_tree->verticala[1], rate);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+               partition, &pc_tree->verticala[2], rate);
 
       break;
     case PARTITION_VERT_B:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
-               partition, &pc_tree->verticalb[0]);
-      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, bsize2,
-               partition, &pc_tree->verticalb[1]);
-      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, output_enabled,
-               bsize2, partition, &pc_tree->verticalb[2]);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition,
+               &pc_tree->verticalb[0], rate);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+               partition, &pc_tree->verticalb[1], rate);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2,
+               partition, &pc_tree->verticalb[2], rate);
       break;
 #endif  // CONFIG_EXT_PARTITION_TYPES
     default: assert(0 && "Invalid partition type."); break;
@@ -2545,8 +2545,9 @@
 #endif
         PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
         av1_rd_cost_init(&tmp_rdc);
-        update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
-        encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+        update_state(cpi, td, ctx, mi_row, mi_col, subsize, 1);
+        encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+                          ctx, NULL);
         rd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
 #if CONFIG_SUPERTX
                          &rt_nocoef,
@@ -2587,8 +2588,9 @@
 #endif
         PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0];
         av1_rd_cost_init(&tmp_rdc);
-        update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
-        encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+        update_state(cpi, td, ctx, mi_row, mi_col, subsize, 1);
+        encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+                          ctx, NULL);
         rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
 #if CONFIG_SUPERTX
                          &rt_nocoef,
@@ -2741,8 +2743,8 @@
 #endif
 
       if (i != 3)
-        encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, 0,
-                  split_subsize, pc_tree->split[i]);
+        encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx,
+                  OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL);
 
       chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
 #if CONFIG_SUPERTX
@@ -2785,9 +2787,17 @@
     assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
 
   if (do_recon) {
-    int output_enabled = (bsize == cm->sb_size);
-    encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
-              pc_tree);
+    if (bsize == cm->sb_size) {
+      // NOTE: To get estimate for rate due to the tokens, use:
+      // int rate_coeffs = 0;
+      // encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+      //           bsize, pc_tree, &rate_coeffs);
+      encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+                pc_tree, NULL);
+    } else {
+      encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+                pc_tree, NULL);
+    }
   }
 
   *rate = chosen_rdc.rate;
@@ -3136,8 +3146,9 @@
   if (sum_rdc.rdcost < best_rdc->rdcost) {
 #endif
     PICK_MODE_CONTEXT *ctx = &ctxs[0];
-    update_state(cpi, td, ctx, mi_row0, mi_col0, subsize0, 0);
-    encode_superblock(cpi, td, tp, 0, mi_row0, mi_col0, subsize0, ctx);
+    update_state(cpi, td, ctx, mi_row0, mi_col0, subsize0, 1);
+    encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row0, mi_col0, subsize0,
+                      ctx, NULL);
 
     if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
 
@@ -3176,8 +3187,9 @@
     if (sum_rdc.rdcost < best_rdc->rdcost) {
 #endif
       PICK_MODE_CONTEXT *ctx = &ctxs[1];
-      update_state(cpi, td, ctx, mi_row1, mi_col1, subsize1, 0);
-      encode_superblock(cpi, td, tp, 0, mi_row1, mi_col1, subsize1, ctx);
+      update_state(cpi, td, ctx, mi_row1, mi_col1, subsize1, 1);
+      encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row1, mi_col1, subsize1,
+                        ctx, NULL);
 
       if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
 
@@ -3788,8 +3800,9 @@
 #endif  // CONFIG_SUPERTX
         mi_row + mi_step < cm->mi_rows && bsize > BLOCK_8X8) {
       PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
-      update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
-      encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+      update_state(cpi, td, ctx, mi_row, mi_col, subsize, 1);
+      encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+                        ctx, NULL);
 
       if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
 
@@ -3924,9 +3937,9 @@
     if (sum_rdc.rdcost < best_rdc.rdcost &&
 #endif  // CONFIG_SUPERTX
         mi_col + mi_step < cm->mi_cols && bsize > BLOCK_8X8) {
-      update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0);
-      encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize,
-                        &pc_tree->vertical[0]);
+      update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 1);
+      encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+                        &pc_tree->vertical[0], NULL);
 
       if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
 
@@ -4099,9 +4112,13 @@
 
   if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
       pc_tree->index != 3) {
-    int output_enabled = (bsize == cm->sb_size);
-    encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
-              pc_tree);
+    if (bsize == cm->sb_size) {
+      encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+                pc_tree, NULL);
+    } else {
+      encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+                pc_tree, NULL);
+    }
   }
 
   if (bsize == cm->sb_size) {
@@ -5000,8 +5017,9 @@
 #endif
 
 static void encode_superblock(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
-                              int output_enabled, int mi_row, int mi_col,
-                              BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+                              RUN_TYPE dry_run, int mi_row, int mi_col,
+                              BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                              int *rate) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -5023,12 +5041,12 @@
     mbmi->skip = 1;
     for (plane = 0; plane < MAX_MB_PLANE; ++plane)
       av1_encode_intra_block_plane(x, AOMMAX(bsize, BLOCK_8X8), plane, 1);
-    if (output_enabled)
+    if (!dry_run)
       sum_intra_stats(td->counts, mi, xd->above_mi, xd->left_mi,
                       frame_is_intra_only(cm));
 
 #if CONFIG_EXT_INTRA
-    if (output_enabled && bsize >= BLOCK_8X8) {
+    if (!dry_run && bsize >= BLOCK_8X8) {
       FRAME_COUNTS *counts = td->counts;
       if (mbmi->mode == DC_PRED && mbmi->palette_mode_info.palette_size[0] == 0)
         ++counts->ext_intra[0][mbmi->ext_intra_mode_info.use_ext_intra_mode[0]];
@@ -5046,18 +5064,18 @@
     }
 #endif  // CONFIG_EXT_INTRA
 
-    if (bsize >= BLOCK_8X8 && output_enabled) {
+    if (bsize >= BLOCK_8X8 && !dry_run) {
       for (plane = 0; plane <= 1; ++plane) {
         if (mbmi->palette_mode_info.palette_size[plane] > 0) {
           mbmi->palette_mode_info.palette_first_color_idx[plane] =
               xd->plane[plane].color_index_map[0];
           // TODO(huisu): this increases the use of token buffer. Needs stretch
           // test to verify.
-          av1_tokenize_palette_sb(td, bsize, plane, t);
+          av1_tokenize_palette_sb(cpi, td, plane, t, dry_run, bsize, rate);
         }
       }
     }
-    av1_tokenize_sb(cpi, td, t, !output_enabled, AOMMAX(bsize, BLOCK_8X8));
+    av1_tokenize_sb(cpi, td, t, dry_run, AOMMAX(bsize, BLOCK_8X8), rate);
   } else {
     int ref;
     const int is_compound = has_second_ref(mbmi);
@@ -5129,17 +5147,17 @@
 #if CONFIG_VAR_TX
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
     if (is_rect_tx(mbmi->tx_size))
-      av1_tokenize_sb(cpi, td, t, !output_enabled, AOMMAX(bsize, BLOCK_8X8));
+      av1_tokenize_sb(cpi, td, t, dry_run, AOMMAX(bsize, BLOCK_8X8), rate);
     else
 #endif
-      av1_tokenize_sb_vartx(cpi, td, t, !output_enabled, mi_row, mi_col,
-                            AOMMAX(bsize, BLOCK_8X8));
+      av1_tokenize_sb_vartx(cpi, td, t, dry_run, mi_row, mi_col,
+                            AOMMAX(bsize, BLOCK_8X8), rate);
 #else
-    av1_tokenize_sb(cpi, td, t, !output_enabled, AOMMAX(bsize, BLOCK_8X8));
+    av1_tokenize_sb(cpi, td, t, dry_run, AOMMAX(bsize, BLOCK_8X8), rate);
 #endif
   }
 
-  if (output_enabled) {
+  if (!dry_run) {
     if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 &&
         !(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) {
       const int is_inter = is_inter_block(mbmi);
@@ -5222,8 +5240,7 @@
 #if CONFIG_VAR_TX
   if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 &&
       is_inter_block(mbmi) && !(mbmi->skip || seg_skip)) {
-    if (!output_enabled)
-      tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
+    if (dry_run) tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
     if (is_rect_tx(mbmi->tx_size)) {
       set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, xd);
@@ -5419,7 +5436,7 @@
                              int mi_col_pred, int mi_row_top, int mi_col_top,
                              uint8_t *dst_buf[3], int dst_stride[3],
                              BLOCK_SIZE bsize_top, BLOCK_SIZE bsize_pred,
-                             int output_enabled, int b_sub8x8, int bextend) {
+                             RUN_TYPE dry_run, int b_sub8x8, int bextend) {
   // Used in supertx
   // (mi_row_ori, mi_col_ori): location for mv
   // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
@@ -5463,13 +5480,13 @@
 #endif  // CONFIG_EXT_INTER
                      mi_row_pred, mi_col_pred, bsize_pred, b_sub8x8, block);
 
-  if (output_enabled && !bextend) update_stats(&cpi->common, td, 1);
+  if (!dry_run && !bextend) update_stats(&cpi->common, td, 1);
 }
 
 static void extend_dir(AV1_COMP *cpi, ThreadData *td,
                        const TileInfo *const tile, int block, BLOCK_SIZE bsize,
                        BLOCK_SIZE top_bsize, int mi_row, int mi_col,
-                       int mi_row_top, int mi_col_top, int output_enabled,
+                       int mi_row_top, int mi_col_top, RUN_TYPE dry_run,
                        uint8_t *dst_buf[3], int dst_stride[3], int dir) {
   // dir: 0-lower, 1-upper, 2-left, 3-right
   //      4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright
@@ -5493,7 +5510,7 @@
 
     predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
                      mi_col_pred, mi_row_top, mi_col_top, dst_buf, dst_stride,
-                     top_bsize, extend_bsize, output_enabled, b_sub8x8, 1);
+                     top_bsize, extend_bsize, dry_run, b_sub8x8, 1);
 
     if (mi_width > unit) {
       int i;
@@ -5501,8 +5518,8 @@
         mi_col_pred += unit;
         predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
                          mi_col_pred, mi_row_top, mi_col_top, dst_buf,
-                         dst_stride, top_bsize, extend_bsize, output_enabled,
-                         b_sub8x8, 1);
+                         dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8,
+                         1);
       }
     }
   } else if (dir == 2 || dir == 3) {  // left and right
@@ -5515,7 +5532,7 @@
 
     predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
                      mi_col_pred, mi_row_top, mi_col_top, dst_buf, dst_stride,
-                     top_bsize, extend_bsize, output_enabled, b_sub8x8, 1);
+                     top_bsize, extend_bsize, dry_run, b_sub8x8, 1);
 
     if (mi_height > unit) {
       int i;
@@ -5523,8 +5540,8 @@
         mi_row_pred += unit;
         predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
                          mi_col_pred, mi_row_top, mi_col_top, dst_buf,
-                         dst_stride, top_bsize, extend_bsize, output_enabled,
-                         b_sub8x8, 1);
+                         dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8,
+                         1);
       }
     }
   } else {
@@ -5534,32 +5551,32 @@
 
     predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
                      mi_col_pred, mi_row_top, mi_col_top, dst_buf, dst_stride,
-                     top_bsize, extend_bsize, output_enabled, b_sub8x8, 1);
+                     top_bsize, extend_bsize, dry_run, b_sub8x8, 1);
   }
 }
 
 static void extend_all(AV1_COMP *cpi, ThreadData *td,
                        const TileInfo *const tile, int block, BLOCK_SIZE bsize,
                        BLOCK_SIZE top_bsize, int mi_row, int mi_col,
-                       int mi_row_top, int mi_col_top, int output_enabled,
+                       int mi_row_top, int mi_col_top, RUN_TYPE dry_run,
                        uint8_t *dst_buf[3], int dst_stride[3]) {
   assert(block >= 0 && block < 4);
   extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, output_enabled, dst_buf, dst_stride, 0);
+             mi_col_top, dry_run, dst_buf, dst_stride, 0);
   extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, output_enabled, dst_buf, dst_stride, 1);
+             mi_col_top, dry_run, dst_buf, dst_stride, 1);
   extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, output_enabled, dst_buf, dst_stride, 2);
+             mi_col_top, dry_run, dst_buf, dst_stride, 2);
   extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, output_enabled, dst_buf, dst_stride, 3);
+             mi_col_top, dry_run, dst_buf, dst_stride, 3);
   extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, output_enabled, dst_buf, dst_stride, 4);
+             mi_col_top, dry_run, dst_buf, dst_stride, 4);
   extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, output_enabled, dst_buf, dst_stride, 5);
+             mi_col_top, dry_run, dst_buf, dst_stride, 5);
   extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, output_enabled, dst_buf, dst_stride, 6);
+             mi_col_top, dry_run, dst_buf, dst_stride, 6);
   extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, output_enabled, dst_buf, dst_stride, 7);
+             mi_col_top, dry_run, dst_buf, dst_stride, 7);
 }
 
 // This function generates prediction for multiple blocks, between which
@@ -5573,7 +5590,7 @@
 static void predict_sb_complex(AV1_COMP *cpi, ThreadData *td,
                                const TileInfo *const tile, int mi_row,
                                int mi_col, int mi_row_top, int mi_col_top,
-                               int output_enabled, BLOCK_SIZE bsize,
+                               RUN_TYPE dry_run, BLOCK_SIZE bsize,
                                BLOCK_SIZE top_bsize, uint8_t *dst_buf[3],
                                int dst_stride[3], PC_TREE *pc_tree) {
   AV1_COMMON *const cm = &cpi->common;
@@ -5628,8 +5645,7 @@
   }
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
-  if (output_enabled && bsize < top_bsize)
-    cm->counts.partition[ctx][partition]++;
+  if (!dry_run && bsize < top_bsize) cm->counts.partition[ctx][partition]++;
 
   for (i = 0; i < MAX_MB_PLANE; i++) {
     xd->plane[i].dst.buf = dst_buf[i];
@@ -5641,29 +5657,27 @@
       assert(bsize < top_bsize);
       predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                        mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                       bsize, output_enabled, 0, 0);
+                       bsize, dry_run, 0, 0);
       extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-                 mi_col_top, output_enabled, dst_buf, dst_stride);
+                 mi_col_top, dry_run, dst_buf, dst_stride);
       break;
     case PARTITION_HORZ:
       if (bsize == BLOCK_8X8) {
         // Fisrt half
         predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                         BLOCK_8X8, output_enabled, 1, 0);
+                         BLOCK_8X8, dry_run, 1, 0);
         if (bsize < top_bsize)
           extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf,
-                     dst_stride);
+                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
 
         // Second half
         predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf1, dst_stride1,
-                         top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+                         top_bsize, BLOCK_8X8, dry_run, 1, 1);
         if (bsize < top_bsize)
           extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf1,
-                     dst_stride1);
+                     mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
 
         // Smooth
         xd->plane[0].dst.buf = dst_buf[0];
@@ -5676,29 +5690,26 @@
         // First half
         predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                         subsize, output_enabled, 0, 0);
+                         subsize, dry_run, 0, 0);
         if (bsize < top_bsize)
           extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf,
-                     dst_stride);
+                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
         else
           extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf,
-                     dst_stride, 0);
+                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0);
 
         if (mi_row + hbs < cm->mi_rows) {
           // Second half
           predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
                            mi_col, mi_row_top, mi_col_top, dst_buf1,
-                           dst_stride1, top_bsize, subsize, output_enabled, 0,
-                           0);
+                           dst_stride1, top_bsize, subsize, dry_run, 0, 0);
           if (bsize < top_bsize)
             extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
-                       mi_col, mi_row_top, mi_col_top, output_enabled, dst_buf1,
+                       mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1,
                        dst_stride1);
           else
             extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
-                       mi_col, mi_row_top, mi_col_top, output_enabled, dst_buf1,
+                       mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1,
                        dst_stride1, 1);
 
           // Smooth
@@ -5718,20 +5729,18 @@
         // First half
         predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                         BLOCK_8X8, output_enabled, 1, 0);
+                         BLOCK_8X8, dry_run, 1, 0);
         if (bsize < top_bsize)
           extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf,
-                     dst_stride);
+                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
 
         // Second half
         predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf1, dst_stride1,
-                         top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+                         top_bsize, BLOCK_8X8, dry_run, 1, 1);
         if (bsize < top_bsize)
           extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf1,
-                     dst_stride1);
+                     mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
 
         // Smooth
         xd->plane[0].dst.buf = dst_buf[0];
@@ -5744,29 +5753,26 @@
         // bsize: not important, not useful
         predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                         subsize, output_enabled, 0, 0);
+                         subsize, dry_run, 0, 0);
         if (bsize < top_bsize)
           extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf,
-                     dst_stride);
+                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
         else
           extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf,
-                     dst_stride, 3);
+                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3);
 
         if (mi_col + hbs < cm->mi_cols) {
           predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
                            mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
-                           dst_stride1, top_bsize, subsize, output_enabled, 0,
-                           0);
+                           dst_stride1, top_bsize, subsize, dry_run, 0, 0);
           if (bsize < top_bsize)
             extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row,
-                       mi_col + hbs, mi_row_top, mi_col_top, output_enabled,
-                       dst_buf1, dst_stride1);
+                       mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1,
+                       dst_stride1);
           else
             extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row,
-                       mi_col + hbs, mi_row_top, mi_col_top, output_enabled,
-                       dst_buf1, dst_stride1, 2);
+                       mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1,
+                       dst_stride1, 2);
 
           for (i = 0; i < MAX_MB_PLANE; i++) {
             xd->plane[i].dst.buf = dst_buf[i];
@@ -5783,46 +5789,42 @@
       if (bsize == BLOCK_8X8) {
         predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                         BLOCK_8X8, output_enabled, 1, 0);
+                         BLOCK_8X8, dry_run, 1, 0);
         predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf1, dst_stride1,
-                         top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+                         top_bsize, BLOCK_8X8, dry_run, 1, 1);
         predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf2, dst_stride2,
-                         top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+                         top_bsize, BLOCK_8X8, dry_run, 1, 1);
         predict_b_extend(cpi, td, tile, 3, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf3, dst_stride3,
-                         top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+                         top_bsize, BLOCK_8X8, dry_run, 1, 1);
 
         if (bsize < top_bsize) {
           extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf,
-                     dst_stride);
+                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
           extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf1,
-                     dst_stride1);
+                     mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
           extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf2,
-                     dst_stride2);
+                     mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
           extend_all(cpi, td, tile, 3, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf3,
-                     dst_stride3);
+                     mi_row_top, mi_col_top, dry_run, dst_buf3, dst_stride3);
         }
       } else {
         predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row_top,
-                           mi_col_top, output_enabled, subsize, top_bsize,
-                           dst_buf, dst_stride, pc_tree->split[0]);
+                           mi_col_top, dry_run, subsize, top_bsize, dst_buf,
+                           dst_stride, pc_tree->split[0]);
         if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
           predict_sb_complex(cpi, td, tile, mi_row, mi_col + hbs, mi_row_top,
-                             mi_col_top, output_enabled, subsize, top_bsize,
-                             dst_buf1, dst_stride1, pc_tree->split[1]);
+                             mi_col_top, dry_run, subsize, top_bsize, dst_buf1,
+                             dst_stride1, pc_tree->split[1]);
         if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
           predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col, mi_row_top,
-                             mi_col_top, output_enabled, subsize, top_bsize,
-                             dst_buf2, dst_stride2, pc_tree->split[2]);
+                             mi_col_top, dry_run, subsize, top_bsize, dst_buf2,
+                             dst_stride2, pc_tree->split[2]);
         if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
           predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col + hbs,
-                             mi_row_top, mi_col_top, output_enabled, subsize,
+                             mi_row_top, mi_col_top, dry_run, subsize,
                              top_bsize, dst_buf3, dst_stride3,
                              pc_tree->split[3]);
       }
@@ -5856,27 +5858,25 @@
     case PARTITION_HORZ_A:
       predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                        mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                       bsize2, output_enabled, 0, 0);
+                       bsize2, dry_run, 0, 0);
       extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col,
-                 mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+                 mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
 
       predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
                        mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
-                       dst_stride1, top_bsize, bsize2, output_enabled, 0, 0);
+                       dst_stride1, top_bsize, bsize2, dry_run, 0, 0);
       extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
-                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+                 mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
 
       predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
                        mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2,
-                       top_bsize, subsize, output_enabled, 0, 0);
+                       top_bsize, subsize, dry_run, 0, 0);
       if (bsize < top_bsize)
         extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
-                   mi_row_top, mi_col_top, output_enabled, dst_buf2,
-                   dst_stride2);
+                   mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
       else
         extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
-                   mi_row_top, mi_col_top, output_enabled, dst_buf2,
-                   dst_stride2, 1);
+                   mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 1);
 
       for (i = 0; i < MAX_MB_PLANE; i++) {
         xd->plane[i].dst.buf = dst_buf[i];
@@ -5898,27 +5898,25 @@
 
       predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                        mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                       bsize2, output_enabled, 0, 0);
+                       bsize2, dry_run, 0, 0);
       extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col,
-                 mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+                 mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
 
       predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
                        mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1,
-                       top_bsize, bsize2, output_enabled, 0, 0);
+                       top_bsize, bsize2, dry_run, 0, 0);
       extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
-                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+                 mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
 
       predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
                        mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
-                       dst_stride2, top_bsize, subsize, output_enabled, 0, 0);
+                       dst_stride2, top_bsize, subsize, dry_run, 0, 0);
       if (bsize < top_bsize)
         extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
-                   mi_row_top, mi_col_top, output_enabled, dst_buf2,
-                   dst_stride2);
+                   mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
       else
         extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
-                   mi_row_top, mi_col_top, output_enabled, dst_buf2,
-                   dst_stride2, 2);
+                   mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 2);
 
       for (i = 0; i < MAX_MB_PLANE; i++) {
         xd->plane[i].dst.buf = dst_buf[i];
@@ -5939,27 +5937,25 @@
 
       predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                        mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                       subsize, output_enabled, 0, 0);
+                       subsize, dry_run, 0, 0);
       if (bsize < top_bsize)
         extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                   mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+                   mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
       else
         extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                   mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride,
-                   0);
+                   mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0);
 
       predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
                        mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1,
-                       top_bsize, bsize2, output_enabled, 0, 0);
+                       top_bsize, bsize2, dry_run, 0, 0);
       extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
-                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+                 mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
 
       predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
                        mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
-                       dst_buf2, dst_stride2, top_bsize, bsize2, output_enabled,
-                       0, 0);
+                       dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0);
       extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs,
-                 mi_col + hbs, mi_row_top, mi_col_top, output_enabled, dst_buf2,
+                 mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2,
                  dst_stride2);
 
       for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -5983,27 +5979,25 @@
 
       predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                        mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                       subsize, output_enabled, 0, 0);
+                       subsize, dry_run, 0, 0);
       if (bsize < top_bsize)
         extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                   mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+                   mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
       else
         extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                   mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride,
-                   3);
+                   mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3);
 
       predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
                        mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
-                       dst_stride1, top_bsize, bsize2, output_enabled, 0, 0);
+                       dst_stride1, top_bsize, bsize2, dry_run, 0, 0);
       extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
-                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+                 mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
 
       predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
                        mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
-                       dst_buf2, dst_stride2, top_bsize, bsize2, output_enabled,
-                       0, 0);
+                       dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0);
       extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs,
-                 mi_col + hbs, mi_row_top, mi_col_top, output_enabled, dst_buf2,
+                 mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2,
                  dst_stride2);
 
       for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -6059,13 +6053,13 @@
 
   set_skip_context(xd, mi_row, mi_col);
   set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
-  update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, 0, pc_tree);
+  update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, 1, pc_tree);
   av1_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
   for (plane = 0; plane < MAX_MB_PLANE; plane++) {
     dst_buf[plane] = xd->plane[plane].dst.buf;
     dst_stride[plane] = xd->plane[plane].dst.stride;
   }
-  predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, 0, bsize,
+  predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, 1, bsize,
                      bsize, dst_buf, dst_stride, pc_tree);
 
   set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 86f0c8d..8be7310 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -46,6 +46,7 @@
 #include "av1/encoder/ratectrl.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
+#include "av1/encoder/tokenize.h"
 
 #if CONFIG_DUAL_FILTER
 #if CONFIG_EXT_INTERP
@@ -865,14 +866,14 @@
 }
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
-/* The trailing '0' is a terminator which is used inside cost_coeffs() to
+/* The trailing '0' is a terminator which is used inside av1_cost_coeffs() to
  * decide whether to include cost of a trailing EOB node or not (i.e. we
  * can skip this if the last coefficient in this transform block, e.g. the
  * 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block,
  * were non-zero). */
-static int cost_coeffs(MACROBLOCK *x, int plane, int block, int coeff_ctx,
-                       TX_SIZE tx_size, const int16_t *scan, const int16_t *nb,
-                       int use_fast_coef_costing) {
+int av1_cost_coeffs(MACROBLOCK *x, int plane, int block, int coeff_ctx,
+                    TX_SIZE tx_size, const int16_t *scan, const int16_t *nb,
+                    int use_fast_coef_costing) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   const struct macroblock_plane *p = &x->plane[plane];
@@ -1064,8 +1065,9 @@
 
 static int rate_block(int plane, int block, int coeff_ctx, TX_SIZE tx_size,
                       struct rdcost_block_args *args) {
-  return cost_coeffs(args->x, plane, block, coeff_ctx, tx_size, args->so->scan,
-                     args->so->neighbors, args->use_fast_coef_costing);
+  return av1_cost_coeffs(args->x, plane, block, coeff_ctx, tx_size,
+                         args->so->scan, args->so->neighbors,
+                         args->use_fast_coef_costing);
 }
 
 static uint64_t sum_squares_2d(const int16_t *diff, int diff_stride,
@@ -1946,8 +1948,9 @@
             av1_xform_quant(x, 0, block, row + idy, col + idx, BLOCK_8X8,
                             TX_4X4, AV1_XFORM_QUANT_FP);
 #endif  // CONFIG_NEW_QUANT
-            ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
-                                 so->neighbors, cpi->sf.use_fast_coef_costing);
+            ratey +=
+                av1_cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
+                                so->neighbors, cpi->sf.use_fast_coef_costing);
             *(tempa + idx) = !(p->eobs[block] == 0);
             *(templ + idy) = !(p->eobs[block] == 0);
             can_skip &= (p->eobs[block] == 0);
@@ -1971,8 +1974,9 @@
                             TX_4X4, AV1_XFORM_QUANT_FP);
 #endif  // CONFIG_NEW_QUANT
             av1_optimize_b(x, 0, block, TX_4X4, coeff_ctx);
-            ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
-                                 so->neighbors, cpi->sf.use_fast_coef_costing);
+            ratey +=
+                av1_cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
+                                so->neighbors, cpi->sf.use_fast_coef_costing);
             *(tempa + idx) = !(p->eobs[block] == 0);
             *(templ + idy) = !(p->eobs[block] == 0);
             can_skip &= (p->eobs[block] == 0);
@@ -2064,8 +2068,9 @@
           av1_xform_quant(x, 0, block, row + idy, col + idx, BLOCK_8X8, TX_4X4,
                           AV1_XFORM_QUANT_B);
 #endif  // CONFIG_NEW_QUANT
-          ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
-                               so->neighbors, cpi->sf.use_fast_coef_costing);
+          ratey +=
+              av1_cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
+                              so->neighbors, cpi->sf.use_fast_coef_costing);
           *(tempa + idx) = !(p->eobs[block] == 0);
           *(templ + idy) = !(p->eobs[block] == 0);
           can_skip &= (p->eobs[block] == 0);
@@ -2088,8 +2093,9 @@
                           AV1_XFORM_QUANT_FP);
 #endif  // CONFIG_NEW_QUANT
           av1_optimize_b(x, 0, block, TX_4X4, coeff_ctx);
-          ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
-                               so->neighbors, cpi->sf.use_fast_coef_costing);
+          ratey +=
+              av1_cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
+                              so->neighbors, cpi->sf.use_fast_coef_costing);
           *(tempa + idx) = !(p->eobs[block] == 0);
           *(templ + idy) = !(p->eobs[block] == 0);
           can_skip &= (p->eobs[block] == 0);
@@ -2964,8 +2970,8 @@
     }
   }
   *dist += tmp * 16;
-  *rate += cost_coeffs(x, plane, block, coeff_ctx, tx_size, scan_order->scan,
-                       scan_order->neighbors, 0);
+  *rate += av1_cost_coeffs(x, plane, block, coeff_ctx, tx_size,
+                           scan_order->scan, scan_order->neighbors, 0);
   *skip &= (p->eobs[block] == 0);
 }
 
@@ -4374,8 +4380,8 @@
                  &dist, &ssz);
       thisdistortion += dist;
       thissse += ssz;
-      thisrate += cost_coeffs(x, 0, block, coeff_ctx, tx_size, so->scan,
-                              so->neighbors, cpi->sf.use_fast_coef_costing);
+      thisrate += av1_cost_coeffs(x, 0, block, coeff_ctx, tx_size, so->scan,
+                                  so->neighbors, cpi->sf.use_fast_coef_costing);
       *(ta + (k & 1)) = !(p->eobs[block] == 0);
       *(tl + (k >> 1)) = !(p->eobs[block] == 0);
 #if CONFIG_EXT_TX
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index eb0ff9f..584c439 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -26,6 +26,9 @@
 struct macroblock;
 struct RD_COST;
 
+int av1_cost_coeffs(MACROBLOCK *x, int plane, int block, int coeff_ctx,
+                    TX_SIZE tx_size, const int16_t *scan, const int16_t *nb,
+                    int use_fast_coef_costing);
 void av1_rd_pick_intra_mode_sb(struct AV1_COMP *cpi, struct macroblock *x,
                                struct RD_COST *rd_cost, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd);
diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c
index 3bf2410..d659607 100644
--- a/av1/encoder/tokenize.c
+++ b/av1/encoder/tokenize.c
@@ -23,6 +23,7 @@
 
 #include "av1/encoder/cost.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/rdopt.h"
 #include "av1/encoder/tokenize.h"
 
 static const TOKENVALUE dct_cat_lt_10_value_tokens[] = {
@@ -346,8 +347,31 @@
   AV1_COMP *cpi;
   ThreadData *td;
   TOKENEXTRA **tp;
+  int this_rate;
 };
 
+static void cost_coeffs_b(int plane, int block, int blk_row, int blk_col,
+                          BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+  struct tokenize_b_args *const args = arg;  // arg is a tokenize_b_args.
+  ThreadData *const td = args->td;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  struct macroblock_plane *p = &x->plane[plane];
+  struct macroblockd_plane *pd = &xd->plane[plane];
+  const PLANE_TYPE type = pd->plane_type;
+  const int ref = is_inter_block(mbmi);  // Forwarded to get_scan() below.
+  const TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size);
+  const scan_order *const so = get_scan(tx_size, tx_type, ref);
+  int pt = get_entropy_context(tx_size, pd->above_context + blk_col,
+                               pd->left_context + blk_row);
+  int rate =  // Last argument 0: use_fast_coef_costing is disabled here.
+      av1_cost_coeffs(x, plane, block, pt, tx_size, so->scan, so->neighbors, 0);
+  args->this_rate += rate;  // Add this block's cost to the running total.
+  av1_set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0, blk_col,
+                   blk_row);  // Context flag is set only when eob > 0.
+}
+
 static void set_entropy_context_b(int plane, int block, int blk_row,
                                   int blk_col, BLOCK_SIZE plane_bsize,
                                   TX_SIZE tx_size, void *arg) {
@@ -395,8 +419,9 @@
   return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }
 
-void av1_tokenize_palette_sb(struct ThreadData *const td, BLOCK_SIZE bsize,
-                             int plane, TOKENEXTRA **t) {
+void av1_tokenize_palette_sb(AV1_COMP *cpi, struct ThreadData *const td,
+                             int plane, TOKENEXTRA **t, RUN_TYPE dry_run,
+                             BLOCK_SIZE bsize, int *rate) {
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
@@ -404,7 +429,8 @@
   PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
   int n = pmi->palette_size[plane != 0];
   int i, j, k;
-  int color_new_idx = -1, color_ctx, color_order[PALETTE_MAX_SIZE];
+  int this_rate = 0;
+  int color_idx = -1, color_ctx, color_order[PALETTE_MAX_SIZE];
   const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
                    (xd->plane[plane != 0].subsampling_y);
   const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
@@ -419,16 +445,19 @@
           av1_get_palette_color_context(color_map, cols, i, j, n, color_order);
       for (k = 0; k < n; ++k)
         if (color_map[i * cols + j] == color_order[k]) {
-          color_new_idx = k;
+          color_idx = k;
           break;
         }
-      assert(color_new_idx >= 0 && color_new_idx < n);
-      (*t)->token = color_new_idx;
+      assert(color_idx >= 0 && color_idx < n);
+      if (dry_run == DRY_RUN_COSTCOEFFS)
+        this_rate += cpi->palette_y_color_cost[n - 2][color_ctx][color_idx];
+      (*t)->token = color_idx;
       (*t)->context_tree = probs[n - 2][color_ctx];
       (*t)->skip_eob_node = 0;
       ++(*t);
     }
   }
+  if (rate) *rate += this_rate;
 }
 
 static void tokenize_b(int plane, int block, int blk_row, int blk_col,
@@ -560,7 +589,7 @@
 }
 
 #if CONFIG_VAR_TX
-void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, int dry_run,
+void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
                     TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int blk_row,
                     int blk_col, int block, int plane, void *arg) {
   MACROBLOCK *const x = &td->mb;
@@ -593,9 +622,11 @@
     BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->sb_type, pd);
     if (!dry_run)
       tokenize_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
-    else
+    else if (dry_run == DRY_RUN_NORMAL)
       set_entropy_context_b(plane, block, blk_row, blk_col, plane_bsize,
                             tx_size, arg);
+    else if (dry_run == DRY_RUN_COSTCOEFFS)
+      cost_coeffs_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
   } else {
     int bsl = b_width_log2_lookup[bsize];
     int i;
@@ -617,8 +648,8 @@
 }
 
 void av1_tokenize_sb_vartx(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
-                           int dry_run, int mi_row, int mi_col,
-                           BLOCK_SIZE bsize) {
+                           RUN_TYPE dry_run, int mi_row, int mi_col,
+                           BLOCK_SIZE bsize, int *rate) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -627,7 +658,7 @@
   const int ctx = av1_get_skip_context(xd);
   const int skip_inc =
       !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
-  struct tokenize_b_args arg = { cpi, td, t };
+  struct tokenize_b_args arg = { cpi, td, t, 0 };
   int plane;
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
 
@@ -667,11 +698,12 @@
       (*t)++;
     }
   }
+  if (rate) *rate += arg.this_rate;
 }
 #endif  // CONFIG_VAR_TX
 
-void av1_tokenize_sb(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, int dry_run,
-                     BLOCK_SIZE bsize) {
+void av1_tokenize_sb(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+                     RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -679,7 +711,7 @@
   const int ctx = av1_get_skip_context(xd);
   const int skip_inc =
       !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
-  struct tokenize_b_args arg = { cpi, td, t };
+  struct tokenize_b_args arg = { cpi, td, t, 0 };
   if (mbmi->skip) {
     if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
     reset_skip_context(xd, bsize);
@@ -697,14 +729,17 @@
       (*t)->token = EOSB_TOKEN;
       (*t)++;
     }
-  } else {
+  } else if (dry_run == DRY_RUN_NORMAL) {
     av1_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
+  } else if (dry_run == DRY_RUN_COSTCOEFFS) {
+    av1_foreach_transformed_block(xd, bsize, cost_coeffs_b, &arg);
   }
+  if (rate) *rate += arg.this_rate;
 }
 
 #if CONFIG_SUPERTX
 void av1_tokenize_sb_supertx(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
-                             int dry_run, BLOCK_SIZE bsize) {
+                             RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &td->mb.e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
@@ -712,7 +747,7 @@
   const int ctx = av1_get_skip_context(xd);
   const int skip_inc =
       !segfeature_active(&cm->seg, mbmi->segment_id_supertx, SEG_LVL_SKIP);
-  struct tokenize_b_args arg = { cpi, td, t };
+  struct tokenize_b_args arg = { cpi, td, t, 0 };
   if (mbmi->skip) {
     if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
     reset_skip_context(xd, bsize);
@@ -730,9 +765,12 @@
       (*t)->token = EOSB_TOKEN;
       (*t)++;
     }
-  } else {
+  } else if (dry_run == DRY_RUN_NORMAL) {
     av1_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
     *t = t_backup;
+  } else if (dry_run == DRY_RUN_COSTCOEFFS) {
+    av1_foreach_transformed_block(xd, bsize, cost_coeffs_b, &arg);
   }
+  if (rate) *rate += arg.this_rate;
 }
 #endif  // CONFIG_SUPERTX
diff --git a/av1/encoder/tokenize.h b/av1/encoder/tokenize.h
index a7e30d5..520e1b6 100644
--- a/av1/encoder/tokenize.h
+++ b/av1/encoder/tokenize.h
@@ -56,19 +56,31 @@
 struct AV1_COMP;
 struct ThreadData;
 
+typedef enum {
+  OUTPUT_ENABLED = 0,  // not a dry run (tested as !dry_run): blocks go through tokenize_b
+  DRY_RUN_NORMAL,      // dry run: blocks go through set_entropy_context_b
+  DRY_RUN_COSTCOEFFS,  // dry run: blocks go through cost_coeffs_b, which accumulates rate
+} RUN_TYPE;
+
+// Note: in all the tokenize functions, rate (if non-NULL) is incremented
+// with the coefficient token cost only if dry_run == DRY_RUN_COSTCOEFFS;
+// otherwise rate is not incremented.
 #if CONFIG_VAR_TX
 void av1_tokenize_sb_vartx(struct AV1_COMP *cpi, struct ThreadData *td,
-                           TOKENEXTRA **t, int dry_run, int mi_row, int mi_col,
-                           BLOCK_SIZE bsize);
+                           TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
+                           int mi_col, BLOCK_SIZE bsize, int *rate);
 #endif
 
-void av1_tokenize_palette_sb(struct ThreadData *const td, BLOCK_SIZE bsize,
-                             int plane, TOKENEXTRA **t);
+void av1_tokenize_palette_sb(struct AV1_COMP *cpi, struct ThreadData *const td,
+                             int plane, TOKENEXTRA **t, RUN_TYPE dry_run,
+                             BLOCK_SIZE bsize, int *rate);
 void av1_tokenize_sb(struct AV1_COMP *cpi, struct ThreadData *td,
-                     TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
+                     TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                     int *rate);
 #if CONFIG_SUPERTX
 void av1_tokenize_sb_supertx(struct AV1_COMP *cpi, struct ThreadData *td,
-                             TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
+                             TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                             int *rate);
 #endif
 
 extern const int16_t *av1_dct_value_cost_ptr;