[dist-8x8] Enforce partition and tx size 8x8 or larger

Because of the incompleteness, unreliability and difficult maintainability of the
code, which are deeply coupled with RDO-based mode decision, CONFIG_DIST_8X8 experiment
enforces RDO mode decision to use 8x8 or larger partition and
transform block sizes.

If --tune=[daala-dist|cdef-dist] is given, min partition size is enforced
to 8x8 pixels (4x16 and 16x4 are excluded as well) and tx_size < 8x8 is
excluded during RDO search.

To see the performance change of "--tune=[daala-dist|cdef-dist]",
compare with --enable-dist-8x8, which have the same mode decision options but
RDO uses PSNR as a distortion metric.

Change-Id: If052bd66f82a01e6f3675e9202e71f878b8baf5a
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index cbb23eb..7a57128 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -355,7 +355,6 @@
 #if CONFIG_DIST_8X8
   int using_dist_8x8;
   aom_tune_metric tune_metric;
-  DECLARE_ALIGNED(16, int16_t, pred_luma[MAX_SB_SQUARE]);
 #endif  // CONFIG_DIST_8X8
   int comp_idx_cost[COMP_INDEX_CONTEXTS][2];
   int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2];
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 5167d43..a1db574 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -2350,45 +2350,6 @@
 #undef RTP_STX_TRY_ARGS
 }
 
-#if CONFIG_DIST_8X8
-static int64_t dist_8x8_yuv(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                            uint8_t *src_plane_8x8[MAX_MB_PLANE],
-                            uint8_t *dst_plane_8x8[MAX_MB_PLANE]) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int64_t dist_8x8, dist_8x8_uv, total_dist;
-  const int src_stride = x->plane[0].src.stride;
-  int plane;
-
-  const int dst_stride = xd->plane[0].dst.stride;
-  dist_8x8 =
-      av1_dist_8x8(cpi, x, src_plane_8x8[0], src_stride, dst_plane_8x8[0],
-                   dst_stride, BLOCK_8X8, 8, 8, 8, 8, x->qindex)
-      << 4;
-
-  // Compute chroma distortion for a luma 8x8 block
-  dist_8x8_uv = 0;
-
-  if (num_planes > 1) {
-    for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-      unsigned sse;
-      const int src_stride_uv = x->plane[plane].src.stride;
-      const int dst_stride_uv = xd->plane[plane].dst.stride;
-      const int ssx = xd->plane[plane].subsampling_x;
-      const int ssy = xd->plane[plane].subsampling_y;
-      const BLOCK_SIZE plane_bsize = get_plane_block_size(BLOCK_8X8, ssx, ssy);
-
-      cpi->fn_ptr[plane_bsize].vf(src_plane_8x8[plane], src_stride_uv,
-                                  dst_plane_8x8[plane], dst_stride_uv, &sse);
-      dist_8x8_uv += (int64_t)sse << 4;
-    }
-  }
-
-  return total_dist = dist_8x8 + dist_8x8_uv;
-}
-#endif  // CONFIG_DIST_8X8
-
 static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
   pc_tree->partitioning = PARTITION_NONE;
   pc_tree->cb_search_range = SEARCH_FULL_PLANE;
@@ -2492,6 +2453,11 @@
       xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
   save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
 
+#if CONFIG_DIST_8X8
+  if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8)
+    do_square_split = 0;
+#endif
+
   // PARTITION_NONE
   if (partition_none_allowed) {
     if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
@@ -2549,17 +2515,6 @@
   int64_t temp_best_rdcost = best_rdc.rdcost;
   pn_rdc = best_rdc;
 
-#if CONFIG_DIST_8X8
-  uint8_t *src_plane_8x8[MAX_MB_PLANE], *dst_plane_8x8[MAX_MB_PLANE];
-
-  if (x->using_dist_8x8 && bsize == BLOCK_8X8) {
-    for (int i = 0; i < MAX_MB_PLANE; i++) {
-      src_plane_8x8[i] = x->plane[i].src.buf;
-      dst_plane_8x8[i] = xd->plane[i].dst.buf;
-    }
-  }
-#endif  // CONFIG_DIST_8X8
-
   // PARTITION_SPLIT
   if (do_square_split) {
     int reached_last_index = 0;
@@ -2597,14 +2552,6 @@
     }
     reached_last_index = (idx == 4);
 
-#if CONFIG_DIST_8X8
-    if (x->using_dist_8x8 && reached_last_index &&
-        sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
-      sum_rdc.dist = dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
-      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-    }
-#endif  // CONFIG_DIST_8X8
-
     if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rate += partition_cost[PARTITION_SPLIT];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
@@ -2663,14 +2610,6 @@
     }
   }
 
-#if CONFIG_DIST_8X8
-  if (x->using_dist_8x8 && best_rdc.rate < INT_MAX &&
-      best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) {
-    encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
-              pc_tree, NULL);
-  }
-#endif  // CONFIG_DIST_8X8
-
   if (bsize == cm->seq_params.sb_size) {
     assert(best_rdc.rate < INT_MAX);
     assert(best_rdc.dist < INT64_MAX);
@@ -3395,6 +3334,15 @@
   // Partition block source pixel variance.
   unsigned int pb_source_variance = UINT_MAX;
 
+#if CONFIG_DIST_8X8
+  if (block_size_high[bsize] <= 8) partition_horz_allowed = 0;
+
+  if (block_size_wide[bsize] <= 8) partition_vert_allowed = 0;
+
+  if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8)
+    do_square_split = 0;
+#endif
+
   // PARTITION_NONE
   if (partition_none_allowed) {
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
@@ -3510,17 +3458,6 @@
   // store estimated motion vector
   if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
 
-#if CONFIG_DIST_8X8
-  uint8_t *src_plane_8x8[MAX_MB_PLANE], *dst_plane_8x8[MAX_MB_PLANE];
-
-  if (x->using_dist_8x8 && bsize == BLOCK_8X8) {
-    for (int i = 0; i < num_planes; i++) {
-      src_plane_8x8[i] = x->plane[i].src.buf;
-      dst_plane_8x8[i] = xd->plane[i].dst.buf;
-    }
-  }
-#endif  // CONFIG_DIST_8X8
-
   // PARTITION_SPLIT
   if (do_square_split) {
     av1_init_rd_stats(&sum_rdc);
@@ -3571,21 +3508,6 @@
     }
     const int reached_last_index = (idx == 4);
 
-#if CONFIG_DIST_8X8
-    if (x->using_dist_8x8 && reached_last_index &&
-        sum_rdc.rdcost != INT64_MAX && bsize == BLOCK_8X8) {
-      const int64_t dist_8x8 =
-          dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
-#ifdef DEBUG_DIST_8X8
-      // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled
-      if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && 0 /*!CONFIG_CFL*/)
-        assert(sum_rdc.dist == dist_8x8);
-#endif  // DEBUG_DIST_8X8
-      sum_rdc.dist = dist_8x8;
-      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-    }
-#endif  // CONFIG_DIST_8X8
-
     if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rate += partition_cost[PARTITION_SPLIT];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
@@ -3660,15 +3582,6 @@
                        best_rdc.rdcost - sum_rdc.rdcost);
       horz_rd[1] = this_rdc.rdcost;
 
-#if CONFIG_DIST_8X8
-      if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
-        update_state(cpi, tile_data, td, &pc_tree->horizontal[1],
-                     mi_row + mi_step, mi_col, subsize, DRY_RUN_NORMAL);
-        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL,
-                          mi_row + mi_step, mi_col, subsize, NULL);
-      }
-#endif  // CONFIG_DIST_8X8
-
       if (this_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
       } else {
@@ -3676,20 +3589,6 @@
         sum_rdc.dist += this_rdc.dist;
         sum_rdc.rdcost += this_rdc.rdcost;
       }
-#if CONFIG_DIST_8X8
-      if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX &&
-          bsize == BLOCK_8X8) {
-        const int64_t dist_8x8 =
-            dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
-#ifdef DEBUG_DIST_8X8
-        // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled
-        if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 && 0 /*!CONFIG_CFL*/)
-          assert(sum_rdc.dist == dist_8x8);
-#endif  // DEBUG_DIST_8X8
-        sum_rdc.dist = dist_8x8;
-        sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-      }
-#endif  // CONFIG_DIST_8X8
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
@@ -3746,15 +3645,6 @@
                        best_rdc.rdcost - sum_rdc.rdcost);
       vert_rd[1] = this_rdc.rdcost;
 
-#if CONFIG_DIST_8X8
-      if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
-        update_state(cpi, tile_data, td, &pc_tree->vertical[1], mi_row,
-                     mi_col + mi_step, subsize, DRY_RUN_NORMAL);
-        encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
-                          mi_col + mi_step, subsize, NULL);
-      }
-#endif  // CONFIG_DIST_8X8
-
       if (this_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
       } else {
@@ -3762,21 +3652,6 @@
         sum_rdc.dist += this_rdc.dist;
         sum_rdc.rdcost += this_rdc.rdcost;
       }
-#if CONFIG_DIST_8X8
-      if (x->using_dist_8x8 && sum_rdc.rdcost != INT64_MAX &&
-          bsize == BLOCK_8X8) {
-        const int64_t dist_8x8 =
-            dist_8x8_yuv(cpi, x, src_plane_8x8, dst_plane_8x8);
-#ifdef DEBUG_DIST_8X8
-        // TODO(anyone): Fix dist-8x8 assert failure here when CFL is enabled
-        if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8 &&
-            0 /* !CONFIG_CFL */)
-          assert(sum_rdc.dist == dist_8x8);
-#endif  // DEBUG_DIST_8X8
-        sum_rdc.dist = dist_8x8;
-        sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
-      }
-#endif  // CONFIG_DIST_8X8
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
@@ -3810,6 +3685,13 @@
   int horzab_partition_allowed = ext_partition_allowed;
   int vertab_partition_allowed = ext_partition_allowed;
 
+#if CONFIG_DIST_8X8
+  if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8) {
+    horzab_partition_allowed = 0;
+    vertab_partition_allowed = 0;
+  }
+#endif
+
   if (cpi->sf.prune_ext_partition_types_search_level) {
     if (cpi->sf.prune_ext_partition_types_search_level == 1) {
       // TODO(debargha,huisu@google.com): may need to tune the threshold for
@@ -4046,6 +3928,13 @@
                          &partition_vert4_allowed);
   }
 
+#if CONFIG_DIST_8X8
+  if (block_size_high[bsize] <= 16 || block_size_wide[bsize] <= 16) {
+    partition_horz4_allowed = 0;
+    partition_vert4_allowed = 0;
+  }
+#endif
+
   // PARTITION_HORZ_4
   if (partition_horz4_allowed && has_rows &&
       (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) {
@@ -4158,14 +4047,6 @@
     }
   }
 
-#if CONFIG_DIST_8X8
-  if (x->using_dist_8x8 && best_rdc.rate < INT_MAX &&
-      best_rdc.dist < INT64_MAX && bsize == BLOCK_4X4 && pc_tree->index == 3) {
-    encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
-              pc_tree, NULL);
-  }
-#endif  // CONFIG_DIST_8X8
-
   if (bsize == cm->seq_params.sb_size) {
     assert(best_rdc.rate < INT_MAX);
     assert(best_rdc.dist < INT64_MAX);
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 5cf61cd..0c50ba1 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -2018,7 +2018,7 @@
   assert(visible_cols > 0);
 
 #if CONFIG_DIST_8X8
-  if (x->using_dist_8x8 && plane == 0 && txb_cols >= 8 && txb_rows >= 8)
+  if (x->using_dist_8x8 && plane == 0)
     return (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride,
                                   tx_bsize, txb_cols, txb_rows, visible_cols,
                                   visible_rows, x->qindex);
@@ -2036,8 +2036,7 @@
 static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
                                       int blk_row, int blk_col,
                                       const BLOCK_SIZE plane_bsize,
-                                      const BLOCK_SIZE tx_bsize,
-                                      int force_sse) {
+                                      const BLOCK_SIZE tx_bsize) {
   int visible_rows, visible_cols;
   const MACROBLOCKD *xd = &x->e_mbd;
   get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
@@ -2047,8 +2046,7 @@
 #if CONFIG_DIST_8X8
   int txb_height = block_size_high[tx_bsize];
   int txb_width = block_size_wide[tx_bsize];
-  if (!force_sse && x->using_dist_8x8 && plane == 0 && txb_width >= 8 &&
-      txb_height >= 8) {
+  if (x->using_dist_8x8 && plane == 0) {
     const int src_stride = x->plane[plane].src.stride;
     const int src_idx = (blk_row * src_stride + blk_col)
                         << tx_size_wide_log2[0];
@@ -2059,8 +2057,6 @@
                          txb_width, txb_height, visible_cols, visible_rows,
                          x->qindex);
   }
-#else
-  (void)force_sse;
 #endif
   diff += ((blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]);
   return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows);
@@ -2216,29 +2212,7 @@
   av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon,
                               MAX_TX_SIZE, eob,
                               cpi->common.reduced_tx_set_used);
-#if CONFIG_DIST_8X8
-  if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) {
-    // Save decoded pixels for inter block in pd->pred to avoid
-    // block_8x8_rd_txfm_daala_dist() need to produce them
-    // by calling av1_inverse_transform_block() again.
-    const int pred_stride = block_size_wide[plane_bsize];
-    const int pred_idx = (blk_row * pred_stride + blk_col)
-                         << tx_size_wide_log2[0];
-    int16_t *pred = &x->pred_luma[pred_idx];
-    int i, j;
 
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      for (j = 0; j < bsh; j++)
-        for (i = 0; i < bsw; i++)
-          pred[j * pred_stride + i] =
-              CONVERT_TO_SHORTPTR(recon)[j * MAX_TX_SIZE + i];
-    } else {
-      for (j = 0; j < bsh; j++)
-        for (i = 0; i < bsw; i++)
-          pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i];
-    }
-  }
-#endif  // CONFIG_DIST_8X8
   return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE,
                          blk_row, blk_col, plane_bsize, tx_bsize);
 }
@@ -3069,7 +3043,6 @@
 #if CONFIG_DIST_8X8
   if (x->using_dist_8x8) use_transform_domain_distortion = 0;
 #endif
-
   int calc_pixel_domain_distortion_final =
       cpi->sf.use_transform_domain_distortion == 1 &&
       use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD &&
@@ -3082,7 +3055,7 @@
 
   const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
   int64_t block_sse =
-      pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize, 1);
+      pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize);
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
     block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
   block_sse *= 16;
@@ -3256,21 +3229,6 @@
   int64_t rd1, rd2, rd;
   RD_STATS this_rd_stats;
 
-#if CONFIG_DIST_8X8
-  // If sub8x8 tx, 8x8 or larger partition, and luma channel,
-  // dist-8x8 disables early skip, because the distortion metrics for
-  // sub8x8 tx (MSE) and reference distortion from 8x8 or larger partition
-  // (new distortion metric) are different.
-  // Exception is: dist-8x8 is enabled but still MSE is used,
-  // i.e. "--tune=" encoder option is not used.
-  int bw = block_size_wide[plane_bsize];
-  int bh = block_size_high[plane_bsize];
-  int disable_early_skip =
-      x->using_dist_8x8 && plane == AOM_PLANE_Y && bw >= 8 && bh >= 8 &&
-      (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4) &&
-      x->tune_metric != AOM_TUNE_PSNR;
-#endif  // CONFIG_DIST_8X8
-
   av1_init_rd_stats(&this_rd_stats);
 
   if (args->exit_early) return;
@@ -3314,100 +3272,11 @@
 
   args->this_rd += rd;
 
-#if CONFIG_DIST_8X8
-  if (!disable_early_skip)
-#endif
-    if (args->this_rd > args->best_rd) {
-      args->exit_early = 1;
-      return;
-    }
-}
-
-#if CONFIG_DIST_8X8
-static void dist_8x8_sub8x8_txfm_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                    BLOCK_SIZE bsize,
-                                    struct rdcost_block_args *args) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const struct macroblockd_plane *const pd = &xd->plane[0];
-  const struct macroblock_plane *const p = &x->plane[0];
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  const int src_stride = p->src.stride;
-  const int dst_stride = pd->dst.stride;
-  const uint8_t *src = &p->src.buf[0];
-  const uint8_t *dst = &pd->dst.buf[0];
-  const int16_t *pred = &x->pred_luma[0];
-  int bw = block_size_wide[bsize];
-  int bh = block_size_high[bsize];
-  int visible_w = bw;
-  int visible_h = bh;
-
-  int i, j;
-  int64_t rd, rd1, rd2;
-  int64_t sse = INT64_MAX, dist = INT64_MAX;
-  int qindex = x->qindex;
-
-  assert((bw & 0x07) == 0);
-  assert((bh & 0x07) == 0);
-
-  get_txb_dimensions(xd, 0, bsize, 0, 0, bsize, &bw, &bh, &visible_w,
-                     &visible_h);
-
-  const int diff_stride = block_size_wide[bsize];
-  const int16_t *diff = p->src_diff;
-  sse = dist_8x8_diff(x, src, src_stride, diff, diff_stride, bw, bh, visible_w,
-                      visible_h, qindex);
-  sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
-  sse *= 16;
-
-  if (!is_inter_block(mbmi)) {
-    dist = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, bsize, bw, bh,
-                        visible_w, visible_h, qindex);
-    dist *= 16;
-  } else {
-    // For inter mode, the decoded pixels are provided in x->pred_luma,
-    // while the predicted pixels are in dst.
-    uint8_t *pred8;
-    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
-
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-      pred8 = CONVERT_TO_BYTEPTR(pred16);
-    else
-      pred8 = (uint8_t *)pred16;
-
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      for (j = 0; j < bh; j++)
-        for (i = 0; i < bw; i++)
-          CONVERT_TO_SHORTPTR(pred8)[j * bw + i] = pred[j * bw + i];
-    } else {
-      for (j = 0; j < bh; j++)
-        for (i = 0; i < bw; i++) pred8[j * bw + i] = (uint8_t)pred[j * bw + i];
-    }
-
-    dist = av1_dist_8x8(cpi, x, src, src_stride, pred8, bw, bsize, bw, bh,
-                        visible_w, visible_h, qindex);
-    dist *= 16;
+  if (args->this_rd > args->best_rd) {
+    args->exit_early = 1;
+    return;
   }
-
-#ifdef DEBUG_DIST_8X8
-  if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8) {
-    assert(args->rd_stats.sse == sse);
-    assert(args->rd_stats.dist == dist);
-  }
-#endif  // DEBUG_DIST_8X8
-
-  args->rd_stats.sse = sse;
-  args->rd_stats.dist = dist;
-
-  rd1 = RDCOST(x->rdmult, args->rd_stats.rate, args->rd_stats.dist);
-  rd2 = RDCOST(x->rdmult, 0, args->rd_stats.sse);
-  rd = AOMMIN(rd1, rd2);
-
-  args->rd_stats.rdcost = rd;
-  args->this_rd = rd;
-
-  if (args->this_rd > args->best_rd) args->exit_early = 1;
 }
-#endif  // CONFIG_DIST_8X8
 
 static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
                              RD_STATS *rd_stats, int64_t ref_best_rd, int plane,
@@ -3431,15 +3300,6 @@
 
   av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
                                          &args);
-#if CONFIG_DIST_8X8
-  int bw = block_size_wide[bsize];
-  int bh = block_size_high[bsize];
-
-  if (x->using_dist_8x8 && !args.exit_early && plane == 0 && bw >= 8 &&
-      bh >= 8 && (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4))
-    dist_8x8_sub8x8_txfm_rd(cpi, x, bsize, &args);
-#endif
-
   if (args.exit_early) {
     av1_invalid_rd_stats(rd_stats);
   } else {
@@ -3611,6 +3471,11 @@
   prune_tx(cpi, bs, x, xd, EXT_TX_SET_ALL16);
 
   for (n = start_tx; depth <= MAX_TX_DEPTH; depth++, n = sub_tx_size_map[n]) {
+#if CONFIG_DIST_8X8
+    if (x->using_dist_8x8) {
+      if (tx_size_wide[n] < 8 || tx_size_high[n] < 8) continue;
+    }
+#endif
     RD_STATS this_rd_stats;
     if (mbmi->ref_mv_idx > 0) x->rd_model = LOW_TXFM_RD;
     rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, n, FTXS_NONE);
@@ -4835,11 +4700,7 @@
   RD_STATS this_rd_stats;
   int this_cost_valid = 1;
   int64_t tmp_rd = 0;
-#if CONFIG_DIST_8X8
-  int sub8x8_eob[4] = { 0, 0, 0, 0 };
-  struct macroblock_plane *const p = &x->plane[0];
-  struct macroblockd_plane *const pd = &xd->plane[0];
-#endif
+
   split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1];
 
   assert(tx_size < TX_SIZES_ALL);
@@ -4857,123 +4718,22 @@
           &this_cost_valid, ftxs_mode,
           (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL);
 
-#if CONFIG_DIST_8X8
-      if (!x->using_dist_8x8)
-#endif
-        if (!this_cost_valid) goto LOOP_EXIT;
-#if CONFIG_DIST_8X8
-      if (x->using_dist_8x8 && tx_size == TX_8X8) {
-        sub8x8_eob[2 * (r / bsh) + (c / bsw)] = p->eobs[block];
-      }
-#endif  // CONFIG_DIST_8X8
+      if (!this_cost_valid) goto LOOP_EXIT;
+
       av1_merge_rd_stats(split_rd_stats, &this_rd_stats);
 
       tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
-#if CONFIG_DIST_8X8
-      if (!x->using_dist_8x8)
-#endif
-        if (no_split_rd < tmp_rd) {
-          this_cost_valid = 0;
-          goto LOOP_EXIT;
-        }
+
+      if (no_split_rd < tmp_rd) {
+        this_cost_valid = 0;
+        goto LOOP_EXIT;
+      }
       block += sub_step;
     }
   }
 
 LOOP_EXIT : {}
 
-#if CONFIG_DIST_8X8
-  if (x->using_dist_8x8 && this_cost_valid && tx_size == TX_8X8) {
-    const int src_stride = p->src.stride;
-    const int dst_stride = pd->dst.stride;
-
-    const uint8_t *src =
-        &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
-    const uint8_t *dst =
-        &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
-
-    int64_t dist_8x8;
-    const int qindex = x->qindex;
-    const int pred_stride = block_size_wide[plane_bsize];
-    const int pred_idx = (blk_row * pred_stride + blk_col)
-                         << tx_size_wide_log2[0];
-    const int16_t *pred = &x->pred_luma[pred_idx];
-    int i, j;
-    int row, col;
-
-    uint8_t *pred8;
-    DECLARE_ALIGNED(16, uint16_t, pred8_16[8 * 8]);
-
-    dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride, BLOCK_8X8,
-                            8, 8, 8, 8, qindex) *
-               16;
-
-#ifdef DEBUG_DIST_8X8
-    if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8)
-      assert(sum_rd_stats.sse == dist_8x8);
-#endif  // DEBUG_DIST_8X8
-
-    split_rd_stats->sse = dist_8x8;
-
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-      pred8 = CONVERT_TO_BYTEPTR(pred8_16);
-    else
-      pred8 = (uint8_t *)pred8_16;
-
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      for (row = 0; row < 2; ++row) {
-        for (col = 0; col < 2; ++col) {
-          int idx = row * 2 + col;
-          int eob = sub8x8_eob[idx];
-
-          if (eob > 0) {
-            for (j = 0; j < 4; j++)
-              for (i = 0; i < 4; i++)
-                CONVERT_TO_SHORTPTR(pred8)
-                [(row * 4 + j) * 8 + 4 * col + i] =
-                    pred[(row * 4 + j) * pred_stride + 4 * col + i];
-          } else {
-            for (j = 0; j < 4; j++)
-              for (i = 0; i < 4; i++)
-                CONVERT_TO_SHORTPTR(pred8)
-                [(row * 4 + j) * 8 + 4 * col + i] = CONVERT_TO_SHORTPTR(
-                    dst)[(row * 4 + j) * dst_stride + 4 * col + i];
-          }
-        }
-      }
-    } else {
-      for (row = 0; row < 2; ++row) {
-        for (col = 0; col < 2; ++col) {
-          int idx = row * 2 + col;
-          int eob = sub8x8_eob[idx];
-
-          if (eob > 0) {
-            for (j = 0; j < 4; j++)
-              for (i = 0; i < 4; i++)
-                pred8[(row * 4 + j) * 8 + 4 * col + i] =
-                    (uint8_t)pred[(row * 4 + j) * pred_stride + 4 * col + i];
-          } else {
-            for (j = 0; j < 4; j++)
-              for (i = 0; i < 4; i++)
-                pred8[(row * 4 + j) * 8 + 4 * col + i] =
-                    dst[(row * 4 + j) * dst_stride + 4 * col + i];
-          }
-        }
-      }
-    }
-    dist_8x8 = av1_dist_8x8(cpi, x, src, src_stride, pred8, 8, BLOCK_8X8, 8, 8,
-                            8, 8, qindex) *
-               16;
-
-#ifdef DEBUG_DIST_8X8
-    if (x->tune_metric == AOM_TUNE_PSNR && xd->bd == 8)
-      assert(sum_rd_stats.dist == dist_8x8);
-#endif  // DEBUG_DIST_8X8
-
-    split_rd_stats->dist = dist_8x8;
-    tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
-  }
-#endif  // CONFIG_DIST_8X8
   if (this_cost_valid) *split_rd = tmp_rd;
 }
 
@@ -5006,7 +4766,10 @@
 
   const int try_no_split = 1;
   int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH;
-
+#if CONFIG_DIST_8X8
+  if (x->using_dist_8x8)
+    try_split &= tx_size_wide[tx_size] >= 16 && tx_size_high[tx_size] >= 16;
+#endif
   TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES };
 
   // TX no split
@@ -5705,7 +5468,7 @@
   const MACROBLOCKD *xd = &x->e_mbd;
   const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
 
-  *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, 1);
+  *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize);
   const int64_t mse = *dist / bw / bh;
   // Normalized quantizer takes the transform upscaling factor (8 for tx size
   // smaller than 32) into account.
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 1aff10f..b7851a0 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -590,5 +590,7 @@
 
 #if CONFIG_DIST_8X8
   if (sf->use_transform_domain_distortion > 0) cpi->oxcf.using_dist_8x8 = 0;
+
+  if (cpi->oxcf.using_dist_8x8) x->min_partition_size = BLOCK_8X8;
 #endif  // CONFIG_DIST_8X8
 }