Enable R-D optimization of qcoeff for winner mode

For speed >= 3, R-D optimization of quantized coefficients (qcoeff) is
performed conservatively during mode evaluation and is always enabled
for the winner mode.

          Encode Time
Preset    Reduction       Quality Loss
  3         3.8%           +0.0731%
  4         2.8%           +0.0891%

STATS_CHANGED

Change-Id: I0daeec7705810005021d4d2b833477578d8a084c
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index ad468f2..067ed15 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -410,6 +410,10 @@
 
   CB_COEFF_BUFFER *cb_coef_buff;
 
+  // Threshold used to decide the applicability of R-D optimization of
+  // quantized coeffs
+  uint32_t coeff_opt_dist_threshold;
+
 #if !CONFIG_REALTIME_ONLY
   int quad_tree_idx;
   int cnn_output_valid;
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index c1c1ada..0a662f7 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -684,6 +684,11 @@
     x->edge_strength_y = ei.y;
   }
 
+  // Default initialization of the threshold for R-D optimization of
+  // coefficients for mode decision
+  x->coeff_opt_dist_threshold =
+      get_rd_opt_coeff_thresh(cpi->coeff_opt_dist_threshold, 0, 0);
+
   // Save rdmult before it might be changed, so it can be restored later.
   const int orig_rdmult = x->rdmult;
   setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index a4e4c56..255f5ef 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -976,7 +976,10 @@
 
   // Factor to control R-D optimization of coeffs based on block
   // mse.
-  unsigned int coeff_opt_dist_threshold;
+  // Index 0 is used when winner mode processing is not applicable
+  // (e.g. IntraBC) or enable_winner_mode_for_coeff_opt is OFF. Index 1 is used
+  // during mode evaluation when enable_winner_mode_for_coeff_opt is ON
+  unsigned int coeff_opt_dist_threshold[2];
 
   AV1LfSync lf_row_sync;
   AV1LrSync lr_row_sync;
diff --git a/av1/encoder/rd.h b/av1/encoder/rd.h
index d1bd07c..5162e61 100644
--- a/av1/encoder/rd.h
+++ b/av1/encoder/rd.h
@@ -256,6 +256,29 @@
   x->errorperbit += (x->errorperbit == 0);
 }
 
+// Selects the threshold that gates R-D optimization of coefficients, depending
+// on whether this is mode decision or winner mode processing
+static INLINE uint32_t get_rd_opt_coeff_thresh(
+    const uint32_t *const coeff_opt_dist_threshold,
+    int enable_winner_mode_for_coeff_opt, int is_winner_mode) {
+  // Entry [0] applies whenever enable_winner_mode_for_coeff_opt is OFF
+  uint32_t coeff_opt_thresh = coeff_opt_dist_threshold[0];
+  // TODO(any): Experiment with coeff_opt_dist_threshold values when
+  // enable_winner_mode_for_coeff_opt is ON
+  // TODO(any): Skip the winner mode processing for blocks with lower residual
+  // energy as R-D optimization of coefficients would have been enabled during
+  // mode decision
+  if (enable_winner_mode_for_coeff_opt) {
+    // Winner modes use UINT32_MAX so R-D optimization of coeffs is always
+    // performed; mode decision uses the conservative entry [1]
+    if (is_winner_mode)
+      coeff_opt_thresh = UINT32_MAX;
+    else
+      coeff_opt_thresh = coeff_opt_dist_threshold[1];
+  }
+  return coeff_opt_thresh;
+}
+
 void av1_setup_pred_block(const MACROBLOCKD *xd,
                           struct buf_2d dst[MAX_MB_PLANE],
                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 432c225..54875c2 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -3140,7 +3140,7 @@
   // coeffs. For smaller residuals, coeff optimization would be helpful. For
   // larger residuals, R-D optimization may not be effective.
   // TODO(any): Experiment with variance and mean based thresholds
-  perform_block_coeff_opt = (block_mse_q8 <= cpi->coeff_opt_dist_threshold);
+  perform_block_coeff_opt = (block_mse_q8 <= x->coeff_opt_dist_threshold);
 
   assert(IMPLIES(txk_allowed < TX_TYPES, allowed_tx_mask == 1 << txk_allowed));
 
@@ -4725,6 +4725,11 @@
   else
     x->use_default_intra_tx_type = 0;
 
+  // Get the threshold for R-D optimization of coefficients during mode decision
+  x->coeff_opt_dist_threshold =
+      get_rd_opt_coeff_thresh(cpi->coeff_opt_dist_threshold,
+                              cpi->sf.enable_winner_mode_for_coeff_opt, 0);
+
   MB_MODE_INFO best_mbmi = *mbmi;
   /* Y Search for intra prediction mode */
   for (int mode_idx = INTRA_MODE_START; mode_idx < INTRA_MODE_END; ++mode_idx) {
@@ -4802,10 +4807,18 @@
     }
   }
 
-  // If previous searches use only the default tx type, do an extra search for
-  // the best tx type.
-  if (cpi->sf.tx_type_search.fast_intra_tx_type_search &&
-      !cpi->oxcf.use_intra_default_tx_only) {
+  // If previous searches use only the default tx type/no R-D optimization of
+  // quantized coeffs, do an extra search for the best tx type/better R-D
+  // optimization of quantized coeffs
+  if ((cpi->sf.tx_type_search.fast_intra_tx_type_search &&
+       !cpi->oxcf.use_intra_default_tx_only) ||
+      (cpi->sf.enable_winner_mode_for_coeff_opt &&
+       (cpi->optimize_seg_arr[mbmi->segment_id] != NO_TRELLIS_OPT &&
+        cpi->optimize_seg_arr[mbmi->segment_id] != FINAL_PASS_TRELLIS_OPT))) {
+    // Get the threshold for R-D optimization of coefficients for winner mode
+    x->coeff_opt_dist_threshold =
+        get_rd_opt_coeff_thresh(cpi->coeff_opt_dist_threshold,
+                                cpi->sf.enable_winner_mode_for_coeff_opt, 1);
     *mbmi = best_mbmi;
     x->use_default_intra_tx_type = 0;
     intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate, rate_tokenonly,
@@ -10915,6 +10928,11 @@
       rd_pick_intra_sby_mode(cpi, x, mi_row, mi_col, &rate_y, &rate_y_tokenonly,
                              &dist_y, &y_skip, bsize, best_rd, ctx);
 
+  // Get the threshold for R-D optimization of coefficients for mode
+  // decision
+  x->coeff_opt_dist_threshold =
+      get_rd_opt_coeff_thresh(cpi->coeff_opt_dist_threshold, 0, 0);
+
   if (intra_yrd < best_rd) {
     // Only store reconstructed luma when there's chroma RDO. When there's no
     // chroma RDO, the reconstructed luma will be stored in encode_superblock().
@@ -11191,7 +11209,10 @@
         !cpi->oxcf.use_inter_dct_only && is_inter_mode(best_mbmode->mode)) ||
        (sf->tx_type_search.fast_intra_tx_type_search &&
         !cpi->oxcf.use_intra_default_tx_only && !cpi->oxcf.use_intra_dct_only &&
-        !is_inter_mode(best_mbmode->mode)))) {
+        !is_inter_mode(best_mbmode->mode)) ||
+       (cpi->sf.enable_winner_mode_for_coeff_opt &&
+        (cpi->optimize_seg_arr[mbmi->segment_id] != NO_TRELLIS_OPT &&
+         cpi->optimize_seg_arr[mbmi->segment_id] != FINAL_PASS_TRELLIS_OPT)))) {
     int skip_blk = 0;
     RD_STATS rd_stats_y, rd_stats_uv;
     const int skip_ctx = av1_get_skip_context(xd);
@@ -11199,6 +11220,11 @@
     x->use_default_inter_tx_type = 0;
     x->use_default_intra_tx_type = 0;
 
+    // Get the threshold for R-D optimization of coefficients for winner mode
+    x->coeff_opt_dist_threshold =
+        get_rd_opt_coeff_thresh(cpi->coeff_opt_dist_threshold,
+                                cpi->sf.enable_winner_mode_for_coeff_opt, 1);
+
     *mbmi = *best_mbmode;
 
     set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
@@ -11581,6 +11607,12 @@
     x->use_default_inter_tx_type = 1;
   else
     x->use_default_inter_tx_type = 0;
+
+  // Get the threshold for R-D optimization of coefficients during mode decision
+  x->coeff_opt_dist_threshold =
+      get_rd_opt_coeff_thresh(cpi->coeff_opt_dist_threshold,
+                              cpi->sf.enable_winner_mode_for_coeff_opt, 0);
+
   if (cpi->sf.skip_repeat_interpolation_filter_search) {
     x->interp_filter_stats_idx[0] = 0;
     x->interp_filter_stats_idx[1] = 0;
@@ -13030,6 +13062,10 @@
       &search_state.best_mbmode, yv12_mb, search_state.best_rate_y,
       search_state.best_rate_uv, &search_state.best_skip2);
 
+  // Get the threshold for R-D optimization of coefficients for mode evaluation
+  x->coeff_opt_dist_threshold =
+      get_rd_opt_coeff_thresh(cpi->coeff_opt_dist_threshold, 0, 0);
+
   // Only try palette mode when the best mode so far is an intra mode.
   const int try_palette =
       cpi->oxcf.enable_palette &&
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index ca67d33..3119f22 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -55,8 +55,14 @@
 // Threshold values to be used for disabling coeff RD-optimization
 // based on block MSE
 // TODO(any): Experiment the threshold logic based on variance metric
-static unsigned int coeff_opt_dist_thresholds[5] = { UINT_MAX, 442413, 162754,
-                                                     22026, 22026 };
+// Index 0 is used when winner mode processing is not applicable (e.g. IntraBC)
+// or enable_winner_mode_for_coeff_opt is OFF. Index 1 is used during mode
+// evaluation when the enable_winner_mode_for_coeff_opt speed feature is ON
+static unsigned int coeff_opt_dist_thresholds[5][2] = { { UINT_MAX, UINT_MAX },
+                                                        { 442413, 36314 },
+                                                        { 162754, 36314 },
+                                                        { 22026, 22026 },
+                                                        { 22026, 22026 } };
 // scaling values to be used for gating wedge/compound segment based on best
 // approximate rd
 static int comp_type_rd_threshold_mul[3] = { 1, 11, 12 };
@@ -325,6 +331,9 @@
     sf->prune_comp_type_by_model_rd = boosted ? 0 : 1;
     sf->disable_smooth_intra =
         !frame_is_intra_only(&cpi->common) || (cpi->rc.frames_to_key != 1);
+    // TODO(any): Experiment on the dependency of this speed feature with
+    // use_intra_txb_hash, use_inter_txb_hash and use_mb_rd_hash speed features
+    sf->enable_winner_mode_for_coeff_opt = 1;
   }
 
   if (speed >= 4) {
@@ -779,6 +788,7 @@
   sf->prune_warp_using_wmtype = 0;
   sf->disable_wedge_interintra_search = 0;
   sf->perform_coeff_opt = 0;
+  sf->enable_winner_mode_for_coeff_opt = 0;
   sf->prune_comp_type_by_model_rd = 0;
   sf->disable_smooth_intra = 0;
   sf->perform_best_rd_based_gating_for_chroma = 0;
@@ -867,8 +877,9 @@
 
   // assert ensures that coeff_opt_dist_thresholds is accessed correctly
   assert(cpi->sf.perform_coeff_opt >= 0 && cpi->sf.perform_coeff_opt < 5);
-  cpi->coeff_opt_dist_threshold =
-      coeff_opt_dist_thresholds[cpi->sf.perform_coeff_opt];
+  memcpy(cpi->coeff_opt_dist_threshold,
+         coeff_opt_dist_thresholds[cpi->sf.perform_coeff_opt],
+         sizeof(cpi->coeff_opt_dist_threshold));
 
 #if CONFIG_DIST_8X8
   if (sf->use_transform_domain_distortion > 0) cpi->oxcf.using_dist_8x8 = 0;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index d787f42..45e63e2 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -670,6 +670,10 @@
   // Flag used to control the extent of coeff R-D optimization
   int perform_coeff_opt;
 
+  // Flag used to control the winner mode processing for better R-D optimization
+  // of quantized coeffs
+  int enable_winner_mode_for_coeff_opt;
+
   // Flag used to control the speed of the eob selection in trellis.
   int trellis_eob_fast;