Correct skip rdy computation for txfm_rd_gate_level sf

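This CL fixes the skip_rdy computation for the txfm_rd_gate_level speed
feature by scaling the luma SSE by 16 (<< 4) before it is passed to RDCOST.
The transform-search gating thresholds in rdopt_utils.h and the
txfm_rd_gate_level values assigned in speed_features.c are adjusted to match.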

STATS_CHANGED

Change-Id: I16c3f62754abf9cf4938bc1f2919f17559407d33
diff --git a/av1/encoder/compound_type.c b/av1/encoder/compound_type.c
index 83ecdee..fd0f61c 100644
--- a/av1/encoder/compound_type.c
+++ b/av1/encoder/compound_type.c
@@ -1041,7 +1041,7 @@
   *rs2 += get_interinter_compound_mask_rate(x, mbmi);
   best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0);
   assert(cur_sse != UINT64_MAX);
-  int64_t skip_rd_cur = RDCOST(x->rdmult, *rs2 + rate_mv, cur_sse);
+  int64_t skip_rd_cur = RDCOST(x->rdmult, *rs2 + rate_mv, (cur_sse << 4));
 
   // Although the true rate_mv might be different after motion search, but it
   // is unlikely to be the best mode considering the transform rd cost and other
@@ -1329,7 +1329,8 @@
       int eval_txfm = 1;
       // Check if the mode is good enough based on skip rd
       if (cpi->sf.inter_sf.txfm_rd_gate_level) {
-        int64_t skip_rd = RDCOST(x->rdmult, rs2 + *rate_mv, sse_y[best_type]);
+        int64_t skip_rd =
+            RDCOST(x->rdmult, rs2 + *rate_mv, (sse_y[best_type] << 4));
         eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd,
                                     cpi->sf.inter_sf.txfm_rd_gate_level, 1);
       }
@@ -1387,7 +1388,7 @@
           // Check if the mode is good enough based on skip rd
           if (cpi->sf.inter_sf.txfm_rd_gate_level) {
             int64_t sse_y = compute_sse_plane(x, xd, PLANE_TYPE_Y, bsize);
-            int64_t skip_rd = RDCOST(x->rdmult, rs2 + *rate_mv, sse_y);
+            int64_t skip_rd = RDCOST(x->rdmult, rs2 + *rate_mv, (sse_y << 4));
             eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd,
                                         cpi->sf.inter_sf.txfm_rd_gate_level, 1);
           }
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 41de894..cbd3339 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1502,8 +1502,9 @@
       rd_stats->rdcost = est_rd;
       if (rd_stats->rdcost < *best_est_rd) {
         *best_est_rd = rd_stats->rdcost;
+        assert(sse_y >= 0);
         ref_skip_rd[1] = cpi->sf.inter_sf.txfm_rd_gate_level
-                             ? RDCOST(x->rdmult, mode_rate, sse_y)
+                             ? RDCOST(x->rdmult, mode_rate, (sse_y << 4))
                              : INT64_MAX;
       }
       if (cm->current_frame.reference_mode == SINGLE_REFERENCE) {
@@ -1531,7 +1532,7 @@
         // model_rd_sb_fn and compound type rd
         sse_y = ROUND_POWER_OF_TWO(sse_y, (xd->bd - 8) * 2);
         skip_rd = RDCOST(x->rdmult, rd_stats->rate, curr_sse);
-        skip_rdy = RDCOST(x->rdmult, rd_stats->rate, sse_y);
+        skip_rdy = RDCOST(x->rdmult, rd_stats->rate, (sse_y << 4));
         int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd[0], skip_rd,
                                         cpi->sf.inter_sf.txfm_rd_gate_level, 0);
         if (!eval_txfm) continue;
diff --git a/av1/encoder/rdopt_utils.h b/av1/encoder/rdopt_utils.h
index 4d0b05e..341ba0d 100644
--- a/av1/encoder/rdopt_utils.h
+++ b/av1/encoder/rdopt_utils.h
@@ -334,24 +334,31 @@
   // Derive aggressiveness factor for gating the transform search
   // Lower value indicates more aggressiveness. Be more conservative (high
   // value) for (i) low quantizers (ii) regions where prediction is poor
-  const int scale[4] = { INT_MAX, 3, 3, 2 };
-
-  int aggr_factor =
-      AOMMAX(1, ((MAXQ - x->qindex) * 2 + QINDEX_RANGE / 2) >> QINDEX_BITS);
+  const int scale[5] = { INT_MAX, 4, 3, 3, 2 };
+  const int qslope = 2 * (!is_luma_only);
+  int aggr_factor = 1;
+  if (!is_luma_only) {
+    aggr_factor = AOMMAX(
+        1, ((MAXQ - x->qindex) * qslope + QINDEX_RANGE / 2) >> QINDEX_BITS);
+  }
   if (best_skip_rd >
       (x->source_variance << (num_pels_log2_lookup[bsize] + RDDIV_BITS)))
     aggr_factor *= scale[level];
+  // For level setting 1, be more conservative for the non-luma-only case even
+  // when prediction is good
+  else if ((level <= 1) && !is_luma_only)
+    aggr_factor *= 2;
 
   // Be more conservative for luma only cases (called from compound type rd)
   // since best_skip_rd is computed after and skip_rd is computed (with 8-bit
   // prediction signals blended for WEDGE/DIFFWTD rather than 16-bit) before
   // interpolation filter search
-  const int luma_mul[4] = { INT_MAX, 16, 15, 11 };
-  int mul_factor = is_luma_only ? luma_mul[level] : 8;
+  const int luma_mul[5] = { INT_MAX, 32, 29, 20, 17 };
+  int mul_factor = is_luma_only ? luma_mul[level] : 16;
   int64_t rd_thresh =
       (best_skip_rd == INT64_MAX)
           ? best_skip_rd
-          : (int64_t)(best_skip_rd * aggr_factor * mul_factor >> 3);
+          : (int64_t)(best_skip_rd * aggr_factor * mul_factor >> 4);
   if (skip_rd > rd_thresh) eval_txfm = 0;
   return eval_txfm;
 }
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 6035876..5117f52 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -464,7 +464,9 @@
       sf->inter_sf.disable_smooth_interintra = boosted ? 0 : 1;
     sf->inter_sf.reuse_compound_type_decision = 1;
     sf->inter_sf.txfm_rd_gate_level =
-        (is_boosted_arf2_bwd_type || cm->allow_screen_content_tools) ? 0 : 1;
+        (boosted || cm->allow_screen_content_tools)
+            ? 0
+            : (is_boosted_arf2_bwd_type ? 1 : 2);
 
     sf->intra_sf.prune_palette_search_level = 2;
 
@@ -510,7 +512,7 @@
     sf->inter_sf.alt_ref_search_fp = 1;
     sf->inter_sf.prune_ref_mv_idx_search = 1;
     sf->inter_sf.txfm_rd_gate_level =
-        (boosted || cm->allow_screen_content_tools) ? 0 : 2;
+        (boosted || cm->allow_screen_content_tools) ? 0 : 3;
 
     sf->inter_sf.disable_smooth_interintra = 1;
 
@@ -577,7 +579,7 @@
     sf->inter_sf.disable_obmc = 1;
     sf->inter_sf.disable_onesided_comp = 1;
     sf->inter_sf.txfm_rd_gate_level =
-        (boosted || cm->allow_screen_content_tools) ? 0 : 3;
+        (boosted || cm->allow_screen_content_tools) ? 0 : 4;
     sf->inter_sf.prune_inter_modes_if_skippable = 1;
 
     sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL;