Introduce selective R-D optimization of coeffs

Use mse-based threshold logic to decide whether to perform R-D optimization of coeffs. The change is effective for speeds >= 2.

For speed = 2, 3, 4 and 5 presets, BD-rate impact is seen as -0.02%, 0.05%, 0.01% and 0.09% (as per AWCY runs),
with encode time reduction of 3.46%, 4.48%, 3.9% and 4.92% (averaged across multiple test cases) respectively.

STATS_CHANGED
Change-Id: I300b818f4c304ca0c08eccd4ce3a10b9d6cb4960
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index bbb8253..af02e07 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -841,6 +841,10 @@
 
   unsigned int tx_domain_dist_threshold;
 
+  // Block mse threshold: R-D optimization of coeffs is performed only for
+  // blocks whose mse does not exceed it.
+  unsigned int coeff_opt_dist_threshold;
+
   AV1LfSync lf_row_sync;
   AV1LrSync lr_row_sync;
   AV1LrStruct lr_ctxt;
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 93c8c25..2725504 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -3046,6 +3046,7 @@
   tran_low_t *best_dqcoeff = this_dqcoeff;
   const int txk_type_idx =
       av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
+  int perform_block_coeff_opt;
   av1_invalid_rd_stats(best_rd_stats);
 
   TXB_RD_INFO *intra_txb_rd_info = NULL;
@@ -3180,12 +3181,19 @@
 
   const uint16_t *eobs_ptr = x->plane[plane].eobs;
 
+  // Use mse based threshold logic to decide whether to do R-D optimization of
+  // coeffs. For smaller residuals, coeff optimization would be helpful. For
+  // larger residuals, R-D optimization may not be effective.
+  // TODO(any): Experiment with variance and mean based thresholds
+  perform_block_coeff_opt = (block_mse_q8 <= cpi->coeff_opt_dist_threshold);
+
   for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) {
     if (!(allowed_tx_mask & (1 << tx_type))) continue;
     if (plane == 0) mbmi->txk_type[txk_type_idx] = tx_type;
     RD_STATS this_rd_stats;
     av1_invalid_rd_stats(&this_rd_stats);
-    if (cpi->optimize_seg_arr[mbmi->segment_id] != FULL_TRELLIS_OPT) {
+    if ((cpi->optimize_seg_arr[mbmi->segment_id] != FULL_TRELLIS_OPT) ||
+        (!perform_block_coeff_opt)) {
       av1_xform_quant(
           cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
           USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index be317db..5f3154e 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -59,7 +59,12 @@
 static unsigned int tx_domain_dist_thresholds[MAX_TX_DOMAIN_EVAL_SPEED + 1] = {
   UINT_MAX, 162754, 22026, 0, 0, 0
 };
-
+// Threshold values to be used for disabling coeff RD-optimization
+// based on block MSE, indexed by sf->perform_coeff_opt
+// TODO(any): Extend the threshold logic for lower presets and refine the
+// thresholds
+static unsigned int coeff_opt_dist_thresholds[5] = { UINT_MAX, UINT_MAX, 162754,
+                                                     22026, 22026 };
 // scaling values to be used for gating wedge/compound segment based on best
 // approximate rd
 static int comp_type_rd_threshold_mul[3] = { 1, 11, 12 };
@@ -276,6 +281,7 @@
     sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2;
     sf->cb_pred_filter_search = 1;
     sf->use_transform_domain_distortion = boosted ? 0 : 1;
+    sf->perform_coeff_opt = boosted ? 0 : 1;
   }
 
   if (speed >= 2) {
@@ -303,6 +309,7 @@
     sf->prune_comp_type_by_comp_avg = 2;
     sf->cb_pred_filter_search = 0;
     sf->adaptive_interp_filter_search = 1;
+    sf->perform_coeff_opt = boosted ? 0 : 2;
   }
 
   if (speed >= 3) {
@@ -326,6 +333,7 @@
     // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine
     // it with cpi->sf.disable_wedge_search_var_thresh.
     sf->disable_wedge_interintra_search = 1;
+    sf->perform_coeff_opt = boosted ? 0 : 3;
   }
 
   if (speed >= 4) {
@@ -343,6 +351,7 @@
     sf->cb_partition_search = !boosted;
     sf->alt_ref_search_fp = 1;
     sf->skip_sharp_interp_filter_search = 1;
+    sf->perform_coeff_opt = boosted ? 0 : 4;
   }
 
   if (speed >= 5) {
@@ -573,6 +582,7 @@
   sf->prune_warp_using_wmtype = 0;
 
   sf->disable_wedge_interintra_search = 0;
+  sf->perform_coeff_opt = 0;
 
   if (oxcf->mode == GOOD)
     set_good_speed_features_framesize_independent(cpi, sf, oxcf->speed);
@@ -657,6 +667,11 @@
                             : oxcf->speed;
   cpi->tx_domain_dist_threshold = tx_domain_dist_thresholds[tx_domain_speed];
 
+  // assert ensures that the index into coeff_opt_dist_thresholds is in bounds
+  assert(cpi->sf.perform_coeff_opt >= 0 && cpi->sf.perform_coeff_opt < 5);
+  cpi->coeff_opt_dist_threshold =
+      coeff_opt_dist_thresholds[cpi->sf.perform_coeff_opt];
+
 #if CONFIG_DIST_8X8
   if (sf->use_transform_domain_distortion > 0) cpi->oxcf.using_dist_8x8 = 0;
 
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 9550a96..95b9def 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -646,6 +646,9 @@
 
   // Enable/disable interintra wedge search.
   int disable_wedge_interintra_search;
+
+  // Level (0-4) selecting the block mse threshold for coeff R-D optimization
+  int perform_coeff_opt;
 } SPEED_FEATURES;
 
 struct AV1_COMP;