Skip trellis opt based on SATD

Introduce a speed feature 'perform_coeff_opt_based_on_satd'
that skips trellis optimization of coefficients when the sum of
absolute transformed differences (SATD) of a transform block
exceeds a threshold. This speed feature is enabled for cpu-used 5.

             Encode Time          BD-Rate Loss
cpu-used      Reduction    avg.psnr   ovr.psnr   ssim
    5          1.728%     -0.0182%   -0.0148%   -0.0226%

STATS_CHANGED

Change-Id: I2affee4d9d678e1baf50081d07c5afef88b7111d
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 053619d..8e5fb6f 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -294,10 +294,11 @@
   // Try to prune 2d transforms based on 1d transform results.
   int prune_2d_txfm_mode;
 
-  // The following four parameters are copied from WinnerModeParams based on the
+  // The following six parameters are copied from WinnerModeParams based on the
   // current evaluation mode. See the documentation for WinnerModeParams for
   // more detail.
   unsigned int coeff_opt_dist_threshold;
+  unsigned int coeff_opt_satd_threshold;
   unsigned int tx_domain_dist_threshold;
   TX_SIZE_SEARCH_METHOD tx_size_search_method;
   unsigned int use_transform_domain_distortion;
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 2c85615..fe774ef 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -263,6 +263,25 @@
 void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
                      int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
                      QUANT_PARAM *qparam) {
+  av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, txfm_param);
+  av1_quant(x, plane, block, txfm_param, qparam);
+}
+
+void av1_xform(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col,
+               BLOCK_SIZE plane_bsize, TxfmParam *txfm_param) {
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *const coeff = p->coeff + block_offset;
+  const int diff_stride = block_size_wide[plane_bsize];
+
+  const int src_offset = (blk_row * diff_stride + blk_col);
+  const int16_t *src_diff = &p->src_diff[src_offset << MI_SIZE_LOG2];
+
+  av1_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+}
+
+void av1_quant(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param,
+               QUANT_PARAM *qparam) {
   const struct macroblock_plane *const p = &x->plane[plane];
   const SCAN_ORDER *const scan_order =
       get_scan(txfm_param->tx_size, txfm_param->tx_type);
@@ -271,12 +290,6 @@
   tran_low_t *const qcoeff = p->qcoeff + block_offset;
   tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
   uint16_t *const eob = &p->eobs[block];
-  const int diff_stride = block_size_wide[plane_bsize];
-
-  const int src_offset = (blk_row * diff_stride + blk_col);
-  const int16_t *src_diff = &p->src_diff[src_offset << MI_SIZE_LOG2];
-
-  av1_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
 
   if (qparam->xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
     const int n_coeffs = av1_get_max_eob(txfm_param->tx_size);
@@ -300,7 +313,6 @@
     p->txb_entropy_ctx[block] =
         (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
   }
-  return;
 }
 
 void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size,
diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h
index a337c83..4160f82 100644
--- a/av1/encoder/encodemb.h
+++ b/av1/encoder/encodemb.h
@@ -76,6 +76,12 @@
                      int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
                      QUANT_PARAM *qparam);
 
+void av1_xform(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col,
+               BLOCK_SIZE plane_bsize, TxfmParam *txfm_param);
+
+void av1_quant(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param,
+               QUANT_PARAM *qparam);
+
 int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
                    int block, TX_SIZE tx_size, TX_TYPE tx_type,
                    const TXB_CTX *const txb_ctx, int fast_mode, int *rate_cost);
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 1e1b7a9..db6b2f7 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -1094,6 +1094,11 @@
   // Corresponds to enable_winner_mode_for_coeff_opt speed feature.
   unsigned int coeff_opt_dist_threshold[MODE_EVAL_TYPES];
 
+  // Threshold to determine if trellis optimization is to be enabled
+  // based on SATD.
+  // Corresponds to enable_winner_mode_for_coeff_opt speed feature.
+  unsigned int coeff_opt_satd_threshold[MODE_EVAL_TYPES];
+
   // Determines the tx size search method during rdopt.
   // Corresponds to enable_winner_mode_for_tx_size_srch speed feature.
   TX_SIZE_SEARCH_METHOD tx_size_search_methods[MODE_EVAL_TYPES];
diff --git a/av1/encoder/rdopt_utils.h b/av1/encoder/rdopt_utils.h
index 532c08c..fbed908 100644
--- a/av1/encoder/rdopt_utils.h
+++ b/av1/encoder/rdopt_utils.h
@@ -481,6 +481,9 @@
       // Get default threshold for R-D optimization of coefficients
       txfm_params->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh(
           winner_mode_params->coeff_opt_dist_threshold, 0, 0);
+      txfm_params->coeff_opt_satd_threshold = get_rd_opt_coeff_thresh(
+          winner_mode_params->coeff_opt_satd_threshold, 0, 0);
+
       // Set default transform size search method
       set_tx_size_search_method(cm, winner_mode_params, txfm_params, 0, 0);
       // Set default transform type prune
@@ -505,6 +508,10 @@
       txfm_params->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh(
           winner_mode_params->coeff_opt_dist_threshold,
           sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 0);
+      txfm_params->coeff_opt_satd_threshold = get_rd_opt_coeff_thresh(
+          winner_mode_params->coeff_opt_satd_threshold,
+          sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 0);
+
       // Set the transform size search method for mode evaluation
       set_tx_size_search_method(
           cm, winner_mode_params, txfm_params,
@@ -530,6 +537,10 @@
       txfm_params->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh(
           winner_mode_params->coeff_opt_dist_threshold,
           sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 1);
+      txfm_params->coeff_opt_satd_threshold = get_rd_opt_coeff_thresh(
+          winner_mode_params->coeff_opt_satd_threshold,
+          sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 1);
+
       // Set the transform size search method for winner mode evaluation
       set_tx_size_search_method(
           cm, winner_mode_params, txfm_params,
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index c9cd96c..0495f4c 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -84,6 +84,12 @@
   { 216, 0, UINT_MAX }
 };
 
+static unsigned int coeff_opt_satd_thresholds[3][MODE_EVAL_TYPES] = {
+  { UINT_MAX, UINT_MAX, UINT_MAX },
+  { 97, 16, UINT_MAX },
+  { 25, 10, UINT_MAX },
+};
+
 // Transform size to be used for default, mode and winner mode evaluation
 // Index 0: Default mode evaluation, Winner mode processing is not applicable
 // (Eg : IntraBc) Index 1: Mode evaluation. Index 2: Winner mode evaluation.
@@ -602,6 +608,9 @@
     sf->mv_sf.reduce_search_range = 1;
 
     sf->tpl_sf.prune_starting_mv = 3;
+
+    sf->rd_sf.perform_coeff_opt_based_on_satd =
+        is_boosted_arf2_bwd_type ? 1 : 2;
   }
 
   if (speed >= 6) {
@@ -1103,6 +1112,7 @@
   rd_sf->tx_domain_dist_thres_level = 0;
   rd_sf->use_hash_based_trellis = 0;
   rd_sf->perform_coeff_opt = 0;
+  rd_sf->perform_coeff_opt_based_on_satd = 0;
 }
 
 static AOM_INLINE void init_winner_mode_sf(
@@ -1262,6 +1272,14 @@
          coeff_opt_dist_thresholds[cpi->sf.rd_sf.perform_coeff_opt],
          sizeof(winner_mode_params->coeff_opt_dist_threshold));
 
+  // assert ensures that coeff_opt_satd_thresholds is accessed correctly
+  assert(cpi->sf.rd_sf.perform_coeff_opt_based_on_satd >= 0 &&
+         cpi->sf.rd_sf.perform_coeff_opt_based_on_satd < 3);
+  memcpy(
+      winner_mode_params->coeff_opt_satd_threshold,
+      coeff_opt_satd_thresholds[cpi->sf.rd_sf.perform_coeff_opt_based_on_satd],
+      sizeof(winner_mode_params->coeff_opt_satd_threshold));
+
   // assert ensures that predict_skip_levels is accessed correctly
   assert(cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction >= 0 &&
          cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction < 3);
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 768d634..ad470cf 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -821,6 +821,11 @@
 
   // Flag used to control the extent of coeff R-D optimization
   int perform_coeff_opt;
+
+  // Controls skipping of coeff R-D opt (trellis) based on SATD values.
+  // 0    : Do not disable coeff R-D opt.
+  // 1, 2 : Disable coeff R-D opt with progressively increasing aggressiveness.
+  int perform_coeff_opt_based_on_satd;
 } RD_CALC_SPEED_FEATURES;
 
 typedef struct WINNER_MODE_SPEED_FEATURES {
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index 97d1a5f..663e272 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -74,6 +74,12 @@
   TX_8X8,   TX_8X8,   TX_16X16, TX_16X16,
 };
 
+// Look-up table for the rounded-up sqrt of a transform block's pixel count.
+// NOTE(review): 64-point entries seem to follow av1_get_max_eob(); confirm.
+static const int sqrt_tx_pixels_2d[TX_SIZES_ALL] = { 4,  8,  16, 32, 32, 6,  6,
+                                                     12, 12, 23, 23, 32, 32, 8,
+                                                     8,  16, 16, 23, 23 };
+
 static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record,
                                 const uint32_t hash) {
   // Linear search through the circular buffer to find matching hash.
@@ -2088,6 +2094,37 @@
   return cost;
 }
 
+static int skip_trellis_opt_based_on_satd(MACROBLOCK *x,
+                                          QUANT_PARAM *quant_param, int plane,
+                                          int block, TX_SIZE tx_size,
+                                          int quant_b_adapt, int qstep,
+                                          unsigned int coeff_opt_satd_threshold,
+                                          int skip_trellis) {
+  if (skip_trellis || (coeff_opt_satd_threshold == UINT_MAX))
+    return skip_trellis;
+
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *const coeff_ptr = p->coeff + block_offset;
+  const int n_coeffs = av1_get_max_eob(tx_size);
+  const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size));
+  int satd = aom_satd(coeff_ptr, n_coeffs);
+  satd = RIGHT_SIGNED_SHIFT(satd, shift);
+
+  const int skip_block_trellis =
+      ((uint64_t)satd >
+       (uint64_t)coeff_opt_satd_threshold * qstep * sqrt_tx_pixels_2d[tx_size]);
+
+  av1_setup_quant(
+      tx_size, !skip_block_trellis,
+      skip_block_trellis
+          ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP)
+          : AV1_XFORM_QUANT_FP,
+      quant_b_adapt, quant_param);
+
+  return skip_block_trellis;
+}
+
 // Search for the best transform type for a given transform block.
 // This function can be used for both inter and intra, both luma and chroma.
 static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
@@ -2215,6 +2252,7 @@
 
   TxfmParam txfm_param;
   QUANT_PARAM quant_param;
+  int skip_trellis_based_on_satd[TX_TYPES] = { 0 };
   av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
   av1_setup_quant(tx_size, !skip_trellis,
                   skip_trellis ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B
@@ -2235,8 +2273,13 @@
     RD_STATS this_rd_stats;
     av1_invalid_rd_stats(&this_rd_stats);
 
-    av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
-                    &quant_param);
+    av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param);
+
+    skip_trellis_based_on_satd[tx_type] = skip_trellis_opt_based_on_satd(
+        x, &quant_param, plane, block, tx_size, cpi->oxcf.quant_b_adapt, qstep,
+        txfm_params->coeff_opt_satd_threshold, skip_trellis);
+
+    av1_quant(x, plane, block, &txfm_param, &quant_param);
 
     // Calculate rate cost of quantized coefficients.
     if (quant_param.use_optimize_b) {
@@ -2386,6 +2429,7 @@
   if (plane == 0) update_txk_array(xd, blk_row, blk_col, tx_size, best_tx_type);
   x->plane[plane].txb_entropy_ctx[block] = best_txb_ctx;
   x->plane[plane].eobs[block] = best_eob;
+  skip_trellis = skip_trellis_based_on_satd[best_tx_type];
 
   // Point dqcoeff to the quantized coefficients corresponding to the best
   // transform type, then we can skip transform and quantization, e.g. in the
diff --git a/test/horz_superres_test.cc b/test/horz_superres_test.cc
index 836d648..09f6f47 100644
--- a/test/horz_superres_test.cc
+++ b/test/horz_superres_test.cc
@@ -54,7 +54,7 @@
   { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 25.5 },
 #if CONFIG_AV1_HIGHBITDEPTH
   { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0,
-    27.97 },
+    27.84 },
 #endif
   { "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1, 20.0 },
   // Image coding (single frame).