Introduce aggressive early txfm skip

In presets 3 and 4, the transform skip flag is predicted from SSE alone
during mode evaluation. Winner mode evaluation keeps the default
(conservative) skip prediction. The aggressive SSE-only prediction is
disabled for screen content.

          Instruction Count
cpu-used       Reduction        Quality Loss
   3             0.98%           -0.0073%
   4             1.15%           +0.0440%

STATS_CHANGED

Change-Id: Ia6ab667afcee33d18d0b53c88486509c1631fa5b
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 461bdec..c99da91 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -452,6 +452,10 @@
   int tx_size_search_method;
   TX_MODE tx_mode;
 
+  // Used to control aggressiveness of skip flag prediction for mode processing
+  // (normal/winner mode)
+  unsigned int predict_skip_level;
+
   // Copy out this SB's TPL block stats.
   int valid_cost_b;
   int64_t inter_cost_b[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB];
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 2e6cdf7..8c8474a 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -1044,6 +1044,11 @@
   // speed feature is ON
   unsigned int use_transform_domain_distortion[MODE_EVAL_TYPES];
 
+  // Predict transform skip levels to be used for default, mode and winner mode
+  // evaluation. Index 0: Default mode evaluation, Winner mode processing is not
+  // applicable. Index 1: Mode evaluation, Index 2: Winner mode evaluation
+  unsigned int predict_skip_level[MODE_EVAL_TYPES];
+
   AV1LfSync lf_row_sync;
   AV1LrSync lr_row_sync;
   AV1LrStruct lr_ctxt;
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 4a024bc..0df6c44 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -4033,8 +4033,16 @@
   // smaller than 32) into account.
   const int16_t normalized_dc_q = dc_q >> 3;
   const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8;
-  // Predict not to skip when mse is larger than threshold.
-  if (mse > mse_thresh) return 0;
+  // For faster early skip decision, use dist to compare against threshold so
+  // that quality risk is less for the skip=1 decision. Otherwise, use mse
+  // since the fwd_txfm coeff checks will take care of quality
+  // TODO(any): Use dist to return 0 when predict_skip_level is 1
+  int64_t pred_err = (x->predict_skip_level >= 2) ? *dist : mse;
+  // Predict not to skip when error is larger than threshold.
+  if (pred_err > mse_thresh) return 0;
+  // Return as skip otherwise for aggressive early skip
+  else if (x->predict_skip_level >= 2)
+    return 1;
 
   const int max_tx_size = max_predict_sf_tx_size[bsize];
   const int tx_h = tx_size_high[max_tx_size];
@@ -4214,7 +4222,7 @@
   // context and terminate early.
   int64_t dist;
 
-  if (cpi->sf.tx_type_search.use_skip_flag_prediction && is_inter &&
+  if (x->predict_skip_level && is_inter &&
       (!xd->lossless[xd->mi[0]->segment_id]) &&
       predict_skip_flag(x, bs, &dist, cpi->common.reduced_tx_set_used)) {
     // Populate rdstats as per skip decision
@@ -6223,7 +6231,7 @@
   // If we predict that skip is the optimal RD decision - set the respective
   // context and terminate early.
   int64_t dist;
-  if (cpi->sf.tx_type_search.use_skip_flag_prediction &&
+  if (x->predict_skip_level &&
       predict_skip_flag(x, bsize, &dist, cm->reduced_tx_set_used)) {
     set_skip_flag(x, rd_stats, bsize, dist);
     // Save the RD search results into tx_rd_record.
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index 971ce9d..0ca7384 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -305,6 +305,7 @@
     case DEFAULT_EVAL:
       x->use_default_inter_tx_type = 0;
       x->use_default_intra_tx_type = 0;
+      x->predict_skip_level = cpi->predict_skip_level[DEFAULT_EVAL];
       // Set default transform domain distortion type
       set_tx_domain_dist_params(cpi, x, 0, 0);
 
@@ -320,6 +321,7 @@
            cpi->oxcf.use_intra_default_tx_only);
       x->use_default_inter_tx_type =
           cpi->sf.tx_type_search.fast_inter_tx_type_search;
+      x->predict_skip_level = cpi->predict_skip_level[MODE_EVAL];
 
       // Set transform domain distortion type for mode evaluation
       set_tx_domain_dist_params(
@@ -337,6 +339,7 @@
     case WINNER_MODE_EVAL:
       x->use_default_inter_tx_type = 0;
       x->use_default_intra_tx_type = 0;
+      x->predict_skip_level = cpi->predict_skip_level[WINNER_MODE_EVAL];
 
       // Set transform domain distortion type for winner mode evaluation
       set_tx_domain_dist_params(
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 04783fd..5860806 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -94,6 +94,17 @@
   { USE_LARGESTALL, USE_LARGESTALL, USE_FULL_RD }
 };
 
+// Predict transform skip levels. Rows are selected by the speed feature
+// tx_type_search.use_skip_flag_prediction (0..2). Columns: Index 0: Default
+// evaluation (winner mode processing not applicable), Index 1: Mode
+// evaluation, Index 2: Winner mode evaluation. Values give aggressiveness:
+// 0 : no early skip prediction
+// 1 : conservative early skip prediction using DCT_DCT
+// 2 : early skip prediction based on SSE
+static unsigned int predict_skip_levels[3][MODE_EVAL_TYPES] = { { 0, 0, 0 },
+                                                                { 1, 1, 1 },
+                                                                { 1, 2, 1 } };
+
 // scaling values to be used for gating wedge/compound segment based on best
 // approximate rd
 static int comp_type_rd_threshold_mul[3] = { 1, 11, 12 };
@@ -395,6 +406,8 @@
     sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
     sf->simple_motion_search_prune_agg = 1;
     sf->disable_sb_level_mv_cost_upd = 1;
+    sf->tx_type_search.use_skip_flag_prediction =
+        cm->allow_screen_content_tools ? 1 : 2;
   }
 
   if (speed >= 4) {
@@ -1006,6 +1019,13 @@
          coeff_opt_dist_thresholds[cpi->sf.perform_coeff_opt],
          sizeof(cpi->coeff_opt_dist_threshold));
 
+  // assert ensures that predict_skip_levels is accessed correctly
+  assert(cpi->sf.tx_type_search.use_skip_flag_prediction >= 0 &&
+         cpi->sf.tx_type_search.use_skip_flag_prediction < 3);
+  memcpy(cpi->predict_skip_level,
+         predict_skip_levels[cpi->sf.tx_type_search.use_skip_flag_prediction],
+         sizeof(cpi->predict_skip_level));
+
   // Override speed feature setting for user config
   if (cpi->oxcf.tx_size_search_method != USE_FULL_RD) {
     cpi->sf.enable_winner_mode_for_tx_size_srch = 0;