Bypass tx type search for DC only blocks

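This CL adds logic to predict DC-only blocks when the
residual variance is below a qstep-based threshold.
For such blocks, the transform type search is bypassed and only DCT_DCT is
evaluated. If the residual mean is also below a DC qstep-based threshold,
the block is predicted as a skip block early.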

         Instruction Count       BD-Rate Loss(%)
cpu-used   Reduction(%)    avg.psnr  ovr.psnr  ssim
   6          1.367        0.1291    0.1265    0.0428

STATS_CHANGED

Change-Id: I2ea9565906b5cc13cbf88d6ba1846ecf9f000672
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 9f7b2a9..a712e4f 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -468,6 +468,11 @@
    * candidate, then code it as TX_MODE_SELECT.
    */
   TX_MODE tx_mode_search_type;
+
+  /*!
+   * Level of DC only block prediction (0 disables the prediction).
+   */
+  unsigned int predict_dc_level;
 } TxfmSearchParams;
 
 /*!\cond */
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 3afbc7a..14ee354 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -259,6 +259,19 @@
 };
 #endif
 
+// Writes a DC-only transform result: coeff[0] from per_px_mean, others zero.
+void av1_xform_dc_only(MACROBLOCK *x, int plane, int block,
+                       TxfmParam *txfm_param, int64_t per_px_mean) {
+  assert(per_px_mean != INT64_MAX);  // INT64_MAX marks a mean never computed
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *const coeff = p->coeff + block_offset;
+  const int n_coeffs = av1_get_max_eob(txfm_param->tx_size);
+  memset(coeff, 0, sizeof(*coeff) * n_coeffs);  // clear all coefficients
+  coeff[0] =  // mean * Q12 scale, shifted back down by 12 bits
+      (tran_low_t)((per_px_mean * dc_coeff_scale[txfm_param->tx_size]) >> 12);
+}
+
 void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
                      int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
                      QUANT_PARAM *qparam) {
diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h
index 14761db..cdf7318 100644
--- a/av1/encoder/encodemb.h
+++ b/av1/encoder/encodemb.h
@@ -81,6 +81,9 @@
                        const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
                        TX_TYPE tx_type, QUANT_PARAM *qparam);
 
+void av1_xform_dc_only(MACROBLOCK *x, int plane, int block,
+                       TxfmParam *txfm_param, int64_t per_px_mean);
+
 void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
                      int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
                      QUANT_PARAM *qparam);
@@ -153,6 +156,14 @@
     return false;
   return true;
 }
+
+// Scaling terms (precision of 12 bits) to obtain DC coefficient from block
+// residual mean
+static const uint16_t dc_coeff_scale[TX_SIZES_ALL] = {
+  1024, 2048, 4096, 4096, 0,    1448, 1448, 2896, 2896, 2896,
+  2896, 0,    0,    2048, 2048, 4096, 4096, 0,    0
+};
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index db15ba0..2b9754e 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -1797,6 +1797,13 @@
    * Corresponds to use_skip_flag_prediction speed feature.
    */
   unsigned int skip_txfm_level[MODE_EVAL_TYPES];
+
+  /*!
+   * Predict DC only txfm blocks for default, mode and winner mode evaluation.
+   * Index 0: Default mode evaluation, Winner mode processing is not applicable.
+   * Index 1: Mode evaluation, Index 2: Winner mode evaluation
+   */
+  unsigned int predict_dc_level[MODE_EVAL_TYPES];
 } WinnerModeParams;
 
 /*!
diff --git a/av1/encoder/rdopt_utils.h b/av1/encoder/rdopt_utils.h
index 98fb8f6..c7c7d17 100644
--- a/av1/encoder/rdopt_utils.h
+++ b/av1/encoder/rdopt_utils.h
@@ -479,6 +479,8 @@
       txfm_params->use_default_intra_tx_type = 0;
       txfm_params->skip_txfm_level =
           winner_mode_params->skip_txfm_level[DEFAULT_EVAL];
+      txfm_params->predict_dc_level =
+          winner_mode_params->predict_dc_level[DEFAULT_EVAL];
       // Set default transform domain distortion type
       set_tx_domain_dist_params(winner_mode_params, txfm_params, 0, 0);
 
@@ -501,7 +503,8 @@
           cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_search;
       txfm_params->skip_txfm_level =
           winner_mode_params->skip_txfm_level[MODE_EVAL];
-
+      txfm_params->predict_dc_level =
+          winner_mode_params->predict_dc_level[MODE_EVAL];
       // Set transform domain distortion type for mode evaluation
       set_tx_domain_dist_params(
           winner_mode_params, txfm_params,
@@ -530,6 +533,8 @@
       txfm_params->use_default_intra_tx_type = 0;
       txfm_params->skip_txfm_level =
           winner_mode_params->skip_txfm_level[WINNER_MODE_EVAL];
+      txfm_params->predict_dc_level =
+          winner_mode_params->predict_dc_level[WINNER_MODE_EVAL];
 
       // Set transform domain distortion type for winner mode evaluation
       set_tx_domain_dist_params(
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index fb84022..4ef32b4 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -112,6 +112,15 @@
                                                                 { 1, 1, 1 },
                                                                 { 1, 2, 1 } };
 
+// Predict DC block levels to be used for default, mode and winner mode
+// evaluation. Index 0: Default mode evaluation, Winner mode processing is not
+// applicable. Index 1: Mode evaluation, Index 2: Winner mode evaluation
+// Values indicate the aggressiveness of DC block prediction.
+// 0 : no early DC block prediction
+// 1 : Early DC block prediction based on error variance
+static unsigned int predict_dc_levels[2][MODE_EVAL_TYPES] = { { 0, 0, 0 },
+                                                              { 1, 1, 0 } };
+
 // This table holds the maximum number of reference frames for global motion.
 // The table is indexed as per the speed feature 'gm_search_type'.
 // 0 : All reference frames are allowed.
@@ -662,6 +671,7 @@
 
     sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 4 : 6;
 
+    sf->winner_mode_sf.enable_dc_only_blk_pred = 1;
     sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF;
   }
 
@@ -1192,6 +1202,7 @@
   winner_mode_sf->enable_winner_mode_for_tx_size_srch = 0;
   winner_mode_sf->enable_winner_mode_for_use_tx_domain_dist = 0;
   winner_mode_sf->multi_winner_mode_type = 0;
+  winner_mode_sf->enable_dc_only_blk_pred = 0;
 }
 
 static AOM_INLINE void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) {
@@ -1367,6 +1378,9 @@
   memcpy(winner_mode_params->tx_size_search_methods,
          tx_size_search_methods[cpi->sf.winner_mode_sf.tx_size_search_level],
          sizeof(winner_mode_params->tx_size_search_methods));
+  memcpy(winner_mode_params->predict_dc_level,
+         predict_dc_levels[cpi->sf.winner_mode_sf.enable_dc_only_blk_pred],
+         sizeof(winner_mode_params->predict_dc_level));
 
   if (cpi->oxcf.row_mt == 1 && (cpi->oxcf.max_threads > 1)) {
     if (sf->inter_sf.inter_mode_rd_model_estimation == 1) {
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 1926dec..ae252ab 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -886,6 +886,9 @@
   // 0: speed feature OFF
   // 1 / 2 : Use configured number of winner candidates
   int motion_mode_for_winner_cand;
+
+  // Early DC only txfm block prediction
+  int enable_dc_only_blk_pred;
 } WINNER_MODE_SPEED_FEATURES;
 
 typedef struct LOOP_FILTER_SPEED_FEATURES {
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index d192ead..258b44c 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -401,6 +401,37 @@
   return sse;
 }
 
+// Returns the SSE of the visible residual in the transform block. Also outputs
+// Q8 MSE, the mean scaled by 128 (sign kept), and sse - sum^2/N as block_var.
+static INLINE int64_t pixel_diff_stats(
+    MACROBLOCK *x, int plane, int blk_row, int blk_col,
+    const BLOCK_SIZE plane_bsize, const BLOCK_SIZE tx_bsize,
+    unsigned int *block_mse_q8, int64_t *per_px_mean, uint64_t *block_var) {
+  int visible_rows, visible_cols;
+  const MACROBLOCKD *xd = &x->e_mbd;
+  get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
+                     NULL, &visible_cols, &visible_rows);
+  const int diff_stride = block_size_wide[plane_bsize];
+  const int16_t *diff = x->plane[plane].src_diff;
+
+  diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2);
+  uint64_t sse = 0;
+  int sum = 0;
+  sse = aom_sum_sse_2d_i16(diff, diff_stride, visible_cols, visible_rows, &sum);
+  if (visible_cols > 0 && visible_rows > 0) {
+    double norm_factor = 1.0 / (visible_cols * visible_rows);
+    int sign_sum = sum > 0 ? 1 : -1;
+    // Conversion to transform domain: truncated |mean| scaled up by 2^7
+    *per_px_mean = (int64_t)(norm_factor * abs(sum)) << 7;
+    *per_px_mean = sign_sum * (*per_px_mean);
+    *block_mse_q8 = (unsigned int)(norm_factor * (256 * sse));
+    *block_var = (uint64_t)(sse - (uint64_t)(norm_factor * sum * sum));
+  } else {
+    *block_mse_q8 = UINT_MAX;  // per_px_mean and block_var are left untouched
+  }
+  return sse;
+}
+
 // Uses simple features on top of DCT coefficients to quickly predict
 // whether optimal RD decision is to skip encoding the residual.
 // The sse value is stored in dist.
@@ -2104,7 +2135,7 @@
                                           int block, TX_SIZE tx_size,
                                           int quant_b_adapt, int qstep,
                                           unsigned int coeff_opt_satd_threshold,
-                                          int skip_trellis) {
+                                          int skip_trellis, int dc_only_blk) {
   if (skip_trellis || (coeff_opt_satd_threshold == UINT_MAX))
     return skip_trellis;
 
@@ -2113,7 +2144,7 @@
   tran_low_t *const coeff_ptr = p->coeff + block_offset;
   const int n_coeffs = av1_get_max_eob(tx_size);
   const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size));
-  int satd = aom_satd(coeff_ptr, n_coeffs);
+  int satd = (dc_only_blk) ? abs(coeff_ptr[0]) : aom_satd(coeff_ptr, n_coeffs);
   satd = RIGHT_SIGNED_SHIFT(satd, shift);
 
   const int skip_block_trellis =
@@ -2205,22 +2236,90 @@
   int txk_map[TX_TYPES] = {
     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   };
-  // Bit mask to indicate which transform types are allowed in the RD search.
-  const uint16_t allowed_tx_mask =
-      get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  txb_ctx, ftxs_mode, ref_best_rd, &txk_allowed, txk_map);
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+  const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
 
+  const uint8_t txw = tx_size_wide[tx_size];
+  const uint8_t txh = tx_size_high[tx_size];
+  int64_t block_sse;
   unsigned int block_mse_q8;
-  int64_t block_sse = pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
-                                      txsize_to_bsize[tx_size], &block_mse_q8);
-  assert(block_mse_q8 != UINT_MAX);
+  int dc_only_blk = 0;
+  const bool predict_dc_block =  // per-eval-mode level, not the raw sf flag
+      txfm_params->predict_dc_level && txw != 64 && txh != 64;
+  int64_t per_px_mean = INT64_MAX;
+  uint64_t block_var = UINT64_MAX;
+  if (predict_dc_block) {
+    const int dc_qstep = x->plane[plane].dequant_QTX[0] >> 3;
+    block_sse = pixel_diff_stats(x, plane, blk_row, blk_col, plane_bsize,
+                                 txsize_to_bsize[tx_size], &block_mse_q8,
+                                 &per_px_mean, &block_var);
+    assert(block_mse_q8 != UINT_MAX);
+    uint64_t var_threshold = (uint64_t)(1.8 * qstep * qstep);
+    if (is_cur_buf_hbd(xd))
+      block_var = ROUND_POWER_OF_TWO(block_var, (xd->bd - 8) * 2);
+    // Early prediction of skip block if residual mean and variance are less
+    // than qstep based threshold
+    if (((llabs(per_px_mean) * dc_coeff_scale[tx_size]) < (dc_qstep << 12)) &&
+        (block_var < var_threshold)) {
+      // If the scaled residual mean is below the dc qstep and the block
+      // variance is below the ac qstep based threshold, the block is assumed
+      // to be a skip block and its rdcost is updated accordingly.
+      best_rd_stats->skip_txfm = 1;
+
+      x->plane[plane].eobs[block] = 0;
+
+      best_rd_stats->dist = block_sse << 4;
+      best_rd_stats->sse = best_rd_stats->dist;
+
+      ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+      ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+      av1_get_entropy_contexts(plane_bsize, &xd->plane[plane], ctxa, ctxl);
+      ENTROPY_CONTEXT *ta = ctxa;
+      ENTROPY_CONTEXT *tl = ctxl;
+      const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+      TXB_CTX txb_ctx_tmp;
+      const PLANE_TYPE plane_type = get_plane_type(plane);
+      get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx_tmp);
+      const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][plane_type]
+                                    .txb_skip_cost[txb_ctx_tmp.txb_skip_ctx][1];
+      best_rd_stats->rate =
+          zero_blk_rate *
+          (block_size_wide[plane_bsize] >> tx_size_wide_log2[tx_size]) *
+          (block_size_high[plane_bsize] >> tx_size_high_log2[tx_size]);
+
+      best_rd_stats->rdcost =
+          RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->sse);
+
+      x->plane[plane].txb_entropy_ctx[block] = 0;
+      return;
+    } else if (block_var < var_threshold) {
+      // Predict DC only blocks based on residual variance.
+      // For chroma plane, this early prediction is disabled for intra blocks.
+      if ((plane == 0) || (plane > 0 && is_inter_block(mbmi))) dc_only_blk = 1;
+    }
+  } else {
+    block_sse = pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
+                                txsize_to_bsize[tx_size], &block_mse_q8);
+    assert(block_mse_q8 != UINT_MAX);
+  }
+
+  // Bit mask to indicate which transform types are allowed in the RD search.
+  uint16_t tx_mask;
+
+  // Use DCT_DCT transform for DC only block.
+  if (dc_only_blk)
+    tx_mask = 1 << DCT_DCT;
+  else
+    tx_mask = get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize,
+                          tx_size, txb_ctx, ftxs_mode, ref_best_rd,
+                          &txk_allowed, txk_map);
+  const uint16_t allowed_tx_mask = tx_mask;
+
   if (is_cur_buf_hbd(xd)) {
     block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
     block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2);
   }
   block_sse *= 16;
-  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
-  const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
   // Use mse / qstep^2 based threshold logic to take decision of R-D
   // optimization of coeffs. For smaller residuals, coeff optimization
   // would be helpful. For larger residuals, R-D optimization may not be
@@ -2241,7 +2340,9 @@
       // Any 64-pt transforms only preserves half the coefficients.
       // Therefore transform domain distortion is not valid for these
       // transform sizes.
-      txsize_sqr_up_map[tx_size] != TX_64X64;
+      (txsize_sqr_up_map[tx_size] != TX_64X64) &&
+      // Use pixel domain distortion for DC only blocks
+      !dc_only_blk;
   // Flag to indicate if an extra calculation of distortion in the pixel domain
   // should be performed at the end, after the best transform type has been
   // decided.
@@ -2277,11 +2378,15 @@
     RD_STATS this_rd_stats;
     av1_invalid_rd_stats(&this_rd_stats);
 
-    av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param);
+    if (!dc_only_blk)
+      av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param);
+    else
+      av1_xform_dc_only(x, plane, block, &txfm_param, per_px_mean);
 
     skip_trellis_based_on_satd[tx_type] = skip_trellis_opt_based_on_satd(
         x, &quant_param, plane, block, tx_size, cpi->oxcf.q_cfg.quant_b_adapt,
-        qstep, txfm_params->coeff_opt_satd_threshold, skip_trellis);
+        qstep, txfm_params->coeff_opt_satd_threshold, skip_trellis,
+        dc_only_blk);
 
     av1_quant(x, plane, block, &txfm_param, &quant_param);
 
@@ -2302,6 +2407,10 @@
     if (eobs_ptr[block] == 0) {
       // When eob is 0, pixel domain distortion is more efficient and accurate.
       this_rd_stats.dist = this_rd_stats.sse = block_sse;
+    } else if (dc_only_blk) {
+      this_rd_stats.sse = block_sse;
+      this_rd_stats.dist = dist_block_px_domain(
+          cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
     } else if (use_transform_domain_distortion) {
       dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
                            &this_rd_stats.sse);
diff --git a/test/horz_superres_test.cc b/test/horz_superres_test.cc
index 4b96d44..17a0ae6 100644
--- a/test/horz_superres_test.cc
+++ b/test/horz_superres_test.cc
@@ -54,7 +54,7 @@
   { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 12.5 },
 #if CONFIG_AV1_HIGHBITDEPTH
   { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0,
-    27.84 },
+    27.74 },
 #endif
   { "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1, 20.0 },
   // Image coding (single frame).