Introduce runtime switch for dist_8x8

Even if 'dist-8x8' is enabled at configure time,
dist-8x8 is not actually enabled (so, no change in encoding behaviour)
until the command-line option '--enable-dist-8x8=1' is used.

The cdef-dist and daala-dist cannot yet be enabled by a command-line option.

This commit is a part of prep-work to remove DIST_8X8, CDEF_DIST,
and DAALA_DIST experimental flags.

Change-Id: I5c2df90f837b32f44e756572a19272dfb4c3dff4
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 12d2aef..ab4f72f 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -18,6 +18,9 @@
 #include "av1/encoder/encint.h"
 #endif
 #include "av1/common/mvref_common.h"
+#if CONFIG_DIST_8X8
+#include "aom/aomcx.h"
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -275,6 +278,8 @@
   int pvq_coded;  // Indicates whether pvq_info needs be stored to tokenize
 #endif
 #if CONFIG_DIST_8X8
+  int using_dist_8x8;
+  aom_tune_metric tune_metric;
 #if CONFIG_CB4X4
 #if CONFIG_HIGHBITDEPTH
   DECLARE_ALIGNED(16, uint16_t, decoded_8x8[8 * 8]);
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 9e086d6..e9aa9bb 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -3711,7 +3711,8 @@
 #endif  // CONFIG_SUPERTX
 
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-        if (bsize == BLOCK_8X8 && this_rdc.rate != INT_MAX) {
+        if (x->using_dist_8x8 && bsize == BLOCK_8X8 &&
+            this_rdc.rate != INT_MAX) {
           assert(this_rdc.dist_y < INT64_MAX);
         }
 #endif
@@ -3729,7 +3730,7 @@
           sum_rate_nocoef += this_rate_nocoef;
 #endif  // CONFIG_SUPERTX
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-          if (bsize == BLOCK_8X8) {
+          if (x->using_dist_8x8 && bsize == BLOCK_8X8) {
             assert(this_rdc.dist_y < INT64_MAX);
             sum_rdc.dist_y += this_rdc.dist_y;
           }
@@ -3739,8 +3740,8 @@
       reached_last_index = (idx == 4);
 
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-      if (reached_last_index && sum_rdc.rdcost != INT64_MAX &&
-          bsize == BLOCK_8X8) {
+      if (x->using_dist_8x8 && reached_last_index &&
+          sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
         int64_t dist_8x8;
         const int src_stride = x->plane[0].src.stride;
         uint8_t *decoded_8x8;
@@ -3900,7 +3901,7 @@
 #endif  // CONFIG_SUPERTX
 
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-      if (this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
+      if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
         update_state(cpi, td, &pc_tree->horizontal[1], mi_row + mi_step, mi_col,
                      subsize, DRY_RUN_NORMAL);
         encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row + mi_step, mi_col,
@@ -3921,11 +3922,12 @@
         sum_rate_nocoef += this_rate_nocoef;
 #endif  // CONFIG_SUPERTX
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-        sum_rdc.dist_y += this_rdc.dist_y;
+        if (x->using_dist_8x8) sum_rdc.dist_y += this_rdc.dist_y;
 #endif
       }
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-      if (sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
+      if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX &&
+          bsize == BLOCK_8X8) {
         int64_t dist_8x8;
         const int src_stride = x->plane[0].src.stride;
         uint8_t *decoded_8x8;
@@ -4080,7 +4082,7 @@
 #endif  // CONFIG_SUPERTX
 
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-      if (this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
+      if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
         update_state(cpi, td, &pc_tree->vertical[1], mi_row, mi_col + mi_step,
                      subsize, DRY_RUN_NORMAL);
         encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col + mi_step,
@@ -4101,11 +4103,12 @@
         sum_rate_nocoef += this_rate_nocoef;
 #endif  // CONFIG_SUPERTX
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-        sum_rdc.dist_y += this_rdc.dist_y;
+        if (x->using_dist_8x8) sum_rdc.dist_y += this_rdc.dist_y;
 #endif
       }
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-      if (sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
+      if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX &&
+          bsize == BLOCK_8X8) {
         int64_t dist_8x8;
         const int src_stride = x->plane[0].src.stride;
         uint8_t *decoded_8x8;
@@ -4377,7 +4380,7 @@
   *rd_cost = best_rdc;
 
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  if (bsize <= BLOCK_8X8 && rd_cost->rate != INT_MAX) {
+  if (x->using_dist_8x8 && bsize <= BLOCK_8X8 && rd_cost->rate != INT_MAX) {
     assert(rd_cost->dist_y < INT64_MAX);
   }
 #endif  // CONFIG_DIST_8X8 && CONFIG_CB4X4
@@ -4407,8 +4410,8 @@
 #endif
 
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
-      bsize == BLOCK_4X4 && pc_tree->index == 3) {
+  if (x->using_dist_8x8 && best_rdc.rate < INT_MAX &&
+      best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) {
     encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
               pc_tree, NULL);
   }
@@ -5055,6 +5058,10 @@
 
   x->min_partition_size = AOMMIN(x->min_partition_size, cm->sb_size);
   x->max_partition_size = AOMMIN(x->max_partition_size, cm->sb_size);
+#if CONFIG_DIST_8X8
+  x->using_dist_8x8 = cpi->oxcf.using_dist_8x8;
+  x->tune_metric = cpi->oxcf.tuning;
+#endif
   cm->setup_mi(cm);
 
   xd->mi = cm->mi_grid_visible;
@@ -6031,7 +6038,7 @@
   }
 
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  if (bsize < BLOCK_8X8) {
+  if (x->using_dist_8x8 && bsize < BLOCK_8X8) {
     dist_8x8_set_sub8x8_dst(x, (uint8_t *)x->decoded_8x8, bsize,
                             block_size_wide[bsize], block_size_high[bsize],
                             mi_row, mi_col);
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 506a4f0..24236b9 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -572,27 +572,34 @@
 
 #if CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT || CONFIG_MRC_TX
   dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+#endif  // CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT || CONFIG_MRC_TX
+
 #if CONFIG_PVQ || CONFIG_DIST_8X8
-  pred = &pd->pred[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+  if (CONFIG_PVQ
+#if CONFIG_DIST_8X8
+      || x->using_dist_8x8
+#endif  // CONFIG_DIST_8X8
+      ) {
+    pred = &pd->pred[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
 
 // copy uint8 orig and predicted block to int16 buffer
 // in order to use existing VP10 transform functions
 #if CONFIG_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    for (j = 0; j < txh; j++)
-      for (i = 0; i < txw; i++)
-        pred[diff_stride * j + i] =
-            CONVERT_TO_SHORTPTR(dst)[dst_stride * j + i];
-  } else {
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      for (j = 0; j < txh; j++)
+        for (i = 0; i < txw; i++)
+          pred[diff_stride * j + i] =
+              CONVERT_TO_SHORTPTR(dst)[dst_stride * j + i];
+    } else {
 #endif  // CONFIG_HIGHBITDEPTH
-    for (j = 0; j < txh; j++)
-      for (i = 0; i < txw; i++)
-        pred[diff_stride * j + i] = dst[dst_stride * j + i];
+      for (j = 0; j < txh; j++)
+        for (i = 0; i < txw; i++)
+          pred[diff_stride * j + i] = dst[dst_stride * j + i];
 #if CONFIG_HIGHBITDEPTH
-  }
+    }
 #endif  // CONFIG_HIGHBITDEPTH
+  }
 #endif  // CONFIG_PVQ || CONFIG_DIST_8X8
-#endif  // CONFIG_PVQ || CONFIG_DIST_8X8 || CONFIG_LGT || CONFIG_MRC_TX
 
   (void)ctx;
 
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 45f22d4..acc7b10 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -61,7 +61,7 @@
 #endif  // CONFIG_PVQ
 #if CONFIG_PVQ || CONFIG_DAALA_DIST
 #include "av1/common/pvq.h"
-#endif  // CONFIG_PVQ || CONFIG_DIST_8X8
+#endif  // CONFIG_PVQ
 #if CONFIG_DUAL_FILTER
 #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
 #if USE_EXTRA_FILTER
@@ -1801,7 +1801,7 @@
   assert(visible_cols > 0);
 
 #if CONFIG_DIST_8X8
-  if (plane == 0 && txb_cols >= 8 && txb_rows >= 8)
+  if (x->using_dist_8x8 && plane == 0 && txb_cols >= 8 && txb_rows >= 8)
     return av1_dist_8x8(cpi, xd, src, src_stride, dst, dst_stride, tx_bsize,
                         txb_cols, txb_rows, visible_cols, visible_rows,
                         x->qindex);
@@ -1850,7 +1850,7 @@
                      NULL, &visible_cols, &visible_rows);
 
 #if CONFIG_DIST_8X8
-  if (plane == 0 && txb_width >= 8 && txb_height >= 8)
+  if (x->using_dist_8x8 && plane == 0 && txb_width >= 8 && txb_height >= 8)
     return av1_dist_8x8_diff(xd, src, src_stride, diff, diff_stride, txb_width,
                              txb_height, visible_cols, visible_rows, x->qindex);
   else
@@ -1906,7 +1906,11 @@
   const struct macroblockd_plane *const pd = &xd->plane[plane];
 #endif  // CONFIG_DIST_8X8
 
-  if (cpi->sf.use_transform_domain_distortion && !CONFIG_DIST_8X8) {
+  if (cpi->sf.use_transform_domain_distortion
+#if CONFIG_DIST_8X8
+      && !x->using_dist_8x8
+#endif
+      ) {
     // Transform domain distortion computation is more efficient as it does
     // not involve an inverse transform, but it is less accurate.
     const int buffer_length = tx_size_2d[tx_size];
@@ -2017,7 +2021,7 @@
                                     tx_type, tx_size, recon, MAX_TX_SIZE, eob);
 
 #if CONFIG_DIST_8X8
-        if (plane == 0 && (bsw < 8 || bsh < 8)) {
+        if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) {
           // Save decoded pixels for inter block in pd->pred to avoid
           // block_8x8_rd_txfm_daala_dist() need to produce them
           // by calling av1_inverse_transform_block() again.
@@ -2133,7 +2137,7 @@
 
   if (
 #if CONFIG_DIST_8X8
-      sub8x8tx_in_gte8x8blk_in_plane0 ||
+      (x->using_dist_8x8 && sub8x8tx_in_gte8x8blk_in_plane0) ||
 #endif
       RDCOST(x->rdmult, 0, tmp_dist) + args->this_rd < args->best_rd) {
     av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
@@ -2223,7 +2227,7 @@
   args->this_rd += rd;
 
 #if CONFIG_DIST_8X8
-  if (!sub8x8tx_in_gte8x8blk_in_plane0) {
+  if (!x->using_dist_8x8 || !sub8x8tx_in_gte8x8blk_in_plane0) {
 #endif
     if (args->this_rd > args->best_rd) {
       args->exit_early = 1;
@@ -2330,7 +2334,8 @@
   av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
                                          &args);
 #if CONFIG_DIST_8X8
-  if (!args.exit_early && plane == 0 && bsize >= BLOCK_8X8 &&
+  if (x->using_dist_8x8 && !args.exit_early && plane == 0 &&
+      bsize >= BLOCK_8X8 &&
       (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4))
     dist_8x8_sub8x8_txfm_rd(cpi, x, bsize, &args);
 #endif
@@ -3803,9 +3808,11 @@
           cpi, mb, idy, idx, &best_mode, bmode_costs,
           xd->plane[0].above_context + idx, xd->plane[0].left_context + idy, &r,
           &ry, &d, bsize, tx_size, y_skip, best_rd - total_rd);
-#if !CONFIG_DIST_8X8
-      if (this_rd >= best_rd - total_rd) return INT64_MAX;
-#endif  // !CONFIG_DIST_8X8
+#if CONFIG_DIST_8X8
+      if (!cpi->oxcf.using_dist_8x8)
+#endif
+        if (this_rd >= best_rd - total_rd) return INT64_MAX;
+
       total_rd += this_rd;
       cost += r;
       total_distortion += d;
@@ -3823,7 +3830,7 @@
   mbmi->mode = mic->bmi[3].as_mode;
 
 #if CONFIG_DIST_8X8
-  {
+  if (cpi->oxcf.using_dist_8x8) {
     const struct macroblock_plane *p = &mb->plane[0];
     const struct macroblockd_plane *pd = &xd->plane[0];
     const int src_stride = p->src.stride;
@@ -4620,7 +4627,7 @@
 #endif  // CONFIG_MRC_TX
   if (
 #if CONFIG_DIST_8X8
-      sub8x8tx_in_gte8x8blk_in_plane0 ||
+      (x->using_dist_8x8 && sub8x8tx_in_gte8x8blk_in_plane0) ||
 #endif
       RDCOST(x->rdmult, 0, tmp_dist) < rd_stats->ref_rdcost) {
     av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
@@ -4647,7 +4654,7 @@
 #endif
   if (eob > 0) {
 #if CONFIG_DIST_8X8
-    if (plane == 0 && (bw < 8 && bh < 8)) {
+    if (x->using_dist_8x8 && plane == 0 && (bw < 8 && bh < 8)) {
       // Save sub8x8 luma decoded pixels
       // since 8x8 luma decoded pixels are not available for daala-dist
       // after recursive split of BLOCK_8x8 is done.
@@ -4967,20 +4974,22 @@
                       &this_rd_stats, ref_best_rd - tmp_rd, &this_cost_valid,
                       rd_stats_stack);
 #if CONFIG_DIST_8X8
-      if (plane == 0 && tx_size == TX_8X8) {
+      if (x->using_dist_8x8 && plane == 0 && tx_size == TX_8X8) {
         sub8x8_eob[i] = p->eobs[block];
       }
 #endif  // CONFIG_DIST_8X8
       av1_merge_rd_stats(&sum_rd_stats, &this_rd_stats);
 
       tmp_rd = RDCOST(x->rdmult, sum_rd_stats.rate, sum_rd_stats.dist);
-#if !CONFIG_DIST_8X8
-      if (this_rd < tmp_rd) break;
+#if CONFIG_DIST_8X8
+      if (!x->using_dist_8x8)
 #endif
+        if (this_rd < tmp_rd) break;
       block += sub_step;
     }
 #if CONFIG_DIST_8X8
-    if (this_cost_valid && plane == 0 && tx_size == TX_8X8) {
+    if (x->using_dist_8x8 && this_cost_valid && plane == 0 &&
+        tx_size == TX_8X8) {
       const int src_stride = p->src.stride;
       const int dst_stride = pd->dst.stride;
 
@@ -9956,7 +9965,7 @@
     }
     rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-    rd_cost->dist_y = dist_y;
+    if (x->using_dist_8x8) rd_cost->dist_y = dist_y;
 #endif
   } else {
     rd_cost->rate = INT_MAX;
@@ -11073,7 +11082,7 @@
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-      if (bsize < BLOCK_8X8) distortion2_y = distortion_y;
+      if (x->using_dist_8x8 && bsize < BLOCK_8X8) distortion2_y = distortion_y;
 #endif
     } else {
       int_mv backup_ref_mv[2];
@@ -11182,7 +11191,8 @@
         // combined luma and chroma dist and sse.
         // This can be seen inside motion_mode_rd(), which is called by
         // handle_inter_mode().
-        if (bsize < BLOCK_8X8) av1_init_rd_stats(&rd_stats_y);
+        if (x->using_dist_8x8 && bsize < BLOCK_8X8)
+          av1_init_rd_stats(&rd_stats_y);
 #endif
         rd_stats.rate = rate2;
 
@@ -11206,7 +11216,7 @@
         rate_y = rd_stats_y.rate;
         rate_uv = rd_stats_uv.rate;
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-        if (bsize < BLOCK_8X8) {
+        if (x->using_dist_8x8 && bsize < BLOCK_8X8) {
           if (rd_stats_y.rate != INT_MAX) {
             assert(rd_stats_y.sse < INT64_MAX);
             assert(rd_stats_y.dist < INT64_MAX);
@@ -11394,7 +11404,8 @@
             // tmp_rd_stats.skip = 1 and tmp_rd_stats.dist and .sse
             // represent combined luma and chroma .dist and .sse,
             // we should initialized tmp_rd_stats_y.
-            if (bsize < BLOCK_8X8) av1_init_rd_stats(&tmp_rd_stats_y);
+            if (x->using_dist_8x8 && bsize < BLOCK_8X8)
+              av1_init_rd_stats(&tmp_rd_stats_y);
 #endif
             // Point to variables that are not maintained between iterations
             args.single_newmv = dummy_single_newmv;
@@ -11470,7 +11481,7 @@
             backup_mbmi = *mbmi;
             backup_skip = x->skip;
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-            if (bsize < BLOCK_8X8) {
+            if (x->using_dist_8x8 && bsize < BLOCK_8X8) {
               if (tmp_rd_stats_y.rate != INT_MAX) {
                 assert(tmp_rd_stats_y.sse < INT64_MAX);
                 assert(tmp_rd_stats_y.dist < INT64_MAX);
@@ -11566,7 +11577,7 @@
           rate_y = 0;
           rate_uv = 0;
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-          if (bsize < BLOCK_8X8) {
+          if (x->using_dist_8x8 && bsize < BLOCK_8X8) {
             assert(total_sse_y < INT64_MAX);
             distortion2_y = total_sse_y;
           }
@@ -11591,9 +11602,8 @@
     }
 
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-    if ((bsize < BLOCK_8X8) && (rate2 != INT_MAX)) {
+    if (x->using_dist_8x8 && bsize < BLOCK_8X8 && rate2 != INT_MAX)
       assert(distortion2_y < INT64_MAX);
-    }
 #endif
 
     if (ref_frame == INTRA_FRAME) {
@@ -11672,7 +11682,7 @@
                                             this_skip2 || skippable);
         best_rate_uv = rate_uv;
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-        if (bsize < BLOCK_8X8) {
+        if (x->using_dist_8x8 && bsize < BLOCK_8X8) {
           assert(distortion2_y < INT64_MAX);
           rd_cost->dist_y = distortion2_y;
         }
@@ -11685,9 +11695,8 @@
       }
     }
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-    if ((bsize < BLOCK_8X8) && (rd_cost->rate != INT_MAX)) {
+    if (x->using_dist_8x8 && bsize < BLOCK_8X8 && rd_cost->rate != INT_MAX)
       assert(rd_cost->dist_y < INT64_MAX);
-    }
 #endif
     /* keep record of best compound/single-only prediction */
     if (!disable_skip && ref_frame != INTRA_FRAME) {
@@ -11820,7 +11829,7 @@
       rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
       best_skip2 = skip_blk;
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-      if (bsize < BLOCK_8X8) {
+      if (x->using_dist_8x8 && bsize < BLOCK_8X8) {
         assert(rd_cost->rate != INT_MAX);
         assert(rd_cost->dist_y < INT64_MAX);
         rd_cost->dist_y = rd_stats_y.dist;
@@ -11830,9 +11839,8 @@
   }
 
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  if ((bsize < BLOCK_8X8) && (rd_cost->rate != INT_MAX)) {
+  if (x->using_dist_8x8 && bsize < BLOCK_8X8 && rd_cost->rate != INT_MAX)
     assert(rd_cost->dist_y < INT64_MAX);
-  }
 #endif
 
   // Only try palette mode when the best mode so far is an intra mode.
@@ -12366,7 +12374,7 @@
   rd_cost->dist = distortion2;
   rd_cost->rdcost = this_rd;
 #if CONFIG_DIST_8X8 && CONFIG_CB4X4
-  if (bsize < BLOCK_8X8) rd_cost->dist_y = distortion2;
+  if (x->using_dist_8x8 && bsize < BLOCK_8X8) rd_cost->dist_y = distortion2;
 #endif
   if (this_rd >= best_rd_so_far) {
     rd_cost->rate = INT_MAX;