rtc: SAD-based sb-level var-based-part in rd-path

Improved rtc-derf speed 5 and 6 by tuning the qindex thresholds
using an sb-level, SSE-based kLowSad flag. For a low-SAD sb, a
separate (lower) set of qindex thresholds is selected.

               Instruction Count     BD-Rate Loss(%)
cpu  Test-set    Reduction(%)   avg.psnr  ovr.psnr    ssim
5    rtc_derf      2.309        0.2206     0.2348    0.2168
6    rtc_derf      4.439        0.3192     0.3751    0.3016

STATS_CHANGED

Change-Id: I2ab717942de521cfec9df4aa594107919947e598
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index cca23e3..9692289 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -766,7 +766,10 @@
 } SOURCE_SAD;
 
 typedef struct {
-  SOURCE_SAD source_sad;
+  //! SAD levels in non-rd path for var-based part and inter-mode search
+  SOURCE_SAD source_sad_nonrd;
+  //! SAD levels in rd-path for var-based part qindex thresholds
+  SOURCE_SAD source_sad_rd;
   int lighting_change;
   int low_sumdiff;
 } CONTENT_STATE_SB;
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 418c907..d8187f6 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -482,7 +482,7 @@
     if (!cpi->sf.rt_sf.check_scene_detection || cpi->rc.frame_source_sad > 0)
       av1_source_content_sb(cpi, x, mi_row, mi_col);
     else
-      x->content_state_sb.source_sad = kZeroSad;
+      x->content_state_sb.source_sad_nonrd = kZeroSad;
   }
 #if CONFIG_RT_ML_PARTITIONING
   if (sf->part_sf.partition_search_type == ML_BASED_PARTITION) {
@@ -637,6 +637,16 @@
   init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row, mi_col,
                     1);
 
+  // Grade the temporal variation of the sb. The grade is used to select the
+  // qindex thresholds for variance-based partitioning of coding blocks.
+  if ((sf->rt_sf.var_part_based_on_qidx >= 3) &&
+      (cm->width * cm->height <= 352 * 288)) {
+    if (cpi->rc.frame_source_sad > 0)
+      av1_source_content_sb(cpi, x, mi_row, mi_col);
+    else
+      x->content_state_sb.source_sad_rd = kZeroSad;
+  }
+
   // Encode the superblock
   if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) {
     // partition search starting from a variance-based partition
@@ -851,7 +861,8 @@
     x->color_sensitivity_sb[1] = 0;
     x->color_sensitivity[0] = 0;
     x->color_sensitivity[1] = 0;
-    x->content_state_sb.source_sad = kMedSad;
+    x->content_state_sb.source_sad_nonrd = kMedSad;
+    x->content_state_sb.source_sad_rd = kMedSad;
     x->content_state_sb.lighting_change = 0;
     x->content_state_sb.low_sumdiff = 0;
     x->force_zeromv_skip = 0;
diff --git a/av1/encoder/encodeframe_utils.c b/av1/encoder/encodeframe_utils.c
index 0128156..09e998c 100644
--- a/av1/encoder/encodeframe_utils.c
+++ b/av1/encoder/encodeframe_utils.c
@@ -1322,7 +1322,8 @@
   uint8_t *last_src_y = cpi->last_source->y_buffer;
   int last_src_ystride = cpi->last_source->y_stride;
   const int offset = cpi->source->y_stride * (mi_row << 2) + (mi_col << 2);
-  uint64_t avg_source_sse_threshold = 100000;        // ~5*5*(64*64)
+  uint64_t avg_source_sse_threshold[2] = { 100000,   // ~5*5*(64*64)
+                                           36000 };  // ~3*3*(64*64)
   uint64_t avg_source_sse_threshold_high = 1000000;  // ~15*15*(64*64)
   uint64_t sum_sq_thresh = 10000;  // sum = sqrt(thresh / 64*64)) ~1.5
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -1333,13 +1334,21 @@
   last_src_y += offset;
   tmp_variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
                                             last_src_ystride, &tmp_sse);
+  // rd thresholds
+  if (cpi->sf.rt_sf.var_part_based_on_qidx >= 3) {
+    if (tmp_sse < avg_source_sse_threshold[1])
+      x->content_state_sb.source_sad_rd = kLowSad;
+    return;
+  }
 
+  // nonrd thresholds
   if (tmp_sse == 0)
-    x->content_state_sb.source_sad = kZeroSad;
-  else if (tmp_sse < avg_source_sse_threshold)
-    x->content_state_sb.source_sad = kLowSad;
+    x->content_state_sb.source_sad_nonrd = kZeroSad;
+  else if (tmp_sse < avg_source_sse_threshold[0])
+    x->content_state_sb.source_sad_nonrd = kLowSad;
   else if (tmp_sse > avg_source_sse_threshold_high)
-    x->content_state_sb.source_sad = kHighSad;
+    x->content_state_sb.source_sad_nonrd = kHighSad;
+
   // Detect large lighting change.
   // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12)
   if (tmp_sse > 0) {
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index 46bd24c..54ddce1 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -1203,7 +1203,7 @@
     int left_mv_valid = 0;
     int above_row = INVALID_MV_ROW_COL, above_col = INVALID_MV_ROW_COL;
     int left_row = INVALID_MV_ROW_COL, left_col = INVALID_MV_ROW_COL;
-    if (bsize >= BLOCK_64X64 && content_state_sb.source_sad != kHighSad &&
+    if (bsize >= BLOCK_64X64 && content_state_sb.source_sad_nonrd != kHighSad &&
         spatial_variance < 300 &&
         (mv_row > 16 || mv_row < -16 || mv_col > 16 || mv_col < -16)) {
       this_rdc->rdcost = this_rdc->rdcost << 2;
@@ -1971,8 +1971,8 @@
     // Keep golden (longer-term) reference if sb has high source sad, for
     // frames whose average souce_sad is below threshold. This is to try to
     // capture case where only part of frame has high motion.
-    if (x->content_state_sb.source_sad >= kHighSad && bsize <= BLOCK_32X32 &&
-        cpi->rc.frame_source_sad < 50000)
+    if (x->content_state_sb.source_sad_nonrd >= kHighSad &&
+        bsize <= BLOCK_32X32 && cpi->rc.frame_source_sad < 50000)
       use_golden_ref_frame = 1;
   }
 
@@ -2094,13 +2094,13 @@
       do_early_exit_rdthresh = 0;
     }
     if (x->source_variance < AOMMAX(50, (spatial_var_thresh >> 1)) &&
-        x->content_state_sb.source_sad >= kHighSad)
+        x->content_state_sb.source_sad_nonrd >= kHighSad)
       force_intra_check = 1;
     // For big blocks worth checking intra (since only DC will be checked),
     // even if best_early_term is set.
     if (bsize >= BLOCK_32X32) best_early_term = 0;
   } else if (cpi->sf.rt_sf.source_metrics_sb_nonrd &&
-             x->content_state_sb.source_sad == kLowSad) {
+             x->content_state_sb.source_sad_nonrd == kLowSad) {
     perform_intra_pred = 0;
   }
 
@@ -2158,7 +2158,7 @@
       // For spatially flat blocks with zero motion only check
       // DC mode.
       if (cpi->sf.rt_sf.source_metrics_sb_nonrd &&
-          x->content_state_sb.source_sad == kZeroSad &&
+          x->content_state_sb.source_sad_nonrd == kZeroSad &&
           x->source_variance == 0 && this_mode != DC_PRED)
         continue;
     }
@@ -2288,7 +2288,7 @@
     return 1;
   }
 
-  if (content_state_sb.source_sad != kHighSad && bsize >= BLOCK_64X64 &&
+  if (content_state_sb.source_sad_nonrd != kHighSad && bsize >= BLOCK_64X64 &&
       force_skip_low_temp_var && mode == NEWMV) {
     return 1;
   }
@@ -2669,7 +2669,7 @@
       use_modeled_non_rd_cost =
           (quant_params->base_qindex > 120 && x->source_variance > 100 &&
            bsize <= BLOCK_16X16 && !x->content_state_sb.lighting_change &&
-           x->content_state_sb.source_sad != kHighSad);
+           x->content_state_sb.source_sad_nonrd != kHighSad);
   }
 
 #if COLLECT_PICK_MODE_STAT
@@ -2795,9 +2795,9 @@
       // has motion skip the modes with zero motion for flat blocks.
       if (cpi->sf.rt_sf.source_metrics_sb_nonrd) {
         if ((frame_mv[this_mode][ref_frame].as_int != 0 &&
-             x->content_state_sb.source_sad == kZeroSad) ||
+             x->content_state_sb.source_sad_nonrd == kZeroSad) ||
             (frame_mv[this_mode][ref_frame].as_int == 0 &&
-             x->content_state_sb.source_sad != kZeroSad &&
+             x->content_state_sb.source_sad_nonrd != kZeroSad &&
              x->source_variance == 0))
           continue;
       }
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 6167a48..8c0fbec 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1502,6 +1502,7 @@
                 FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
                 FLAG_EARLY_TERMINATE;
   sf->rt_sf.var_part_split_threshold_shift = 5;
+  if (!frame_is_intra_only(&cpi->common)) sf->rt_sf.var_part_based_on_qidx = 3;
 
   // For SVC: use better mv search on base temporal layers, and only
   // on base spatial layer if highest resolution is above 640x360.
@@ -1528,6 +1529,8 @@
     sf->rt_sf.gf_refresh_based_on_qp = 1;
     sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 1;
     sf->rt_sf.var_part_split_threshold_shift = 7;
+    if (!frame_is_intra_only(&cpi->common))
+      sf->rt_sf.var_part_based_on_qidx = 4;
   }
 
   if (speed >= 7) {
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index 0511cf7..c59208a 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -412,7 +412,8 @@
 
 static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[],
                                           int q, int content_lowsumdiff,
-                                          int source_sad, int segment_id) {
+                                          int source_sad_nonrd,
+                                          int source_sad_rd, int segment_id) {
   AV1_COMMON *const cm = &cpi->common;
   const int is_key_frame = frame_is_intra_only(cm);
   const int threshold_multiplier = is_key_frame ? 120 : 1;
@@ -484,10 +485,15 @@
   if (cm->width >= 1280 && cm->height >= 720)
     thresholds[3] = thresholds[3] << 1;
   if (cm->width * cm->height <= 352 * 288) {
-    const int qindex_thr[3][2] = { { 200, 220 }, { 200, 210 }, { 170, 220 } };
-    assert(cpi->sf.rt_sf.var_part_based_on_qidx < 3);
-    int qindex_low_thr = qindex_thr[cpi->sf.rt_sf.var_part_based_on_qidx][0];
-    int qindex_high_thr = qindex_thr[cpi->sf.rt_sf.var_part_based_on_qidx][1];
+    const int qindex_thr[5][2] = {
+      { 200, 220 }, { 200, 210 }, { 170, 220 }, { 140, 170 }, { 120, 150 }
+    };
+    int th_idx = cpi->sf.rt_sf.var_part_based_on_qidx;
+    if (cpi->sf.rt_sf.var_part_based_on_qidx >= 3)
+      th_idx =
+          (source_sad_rd <= kLowSad) ? cpi->sf.rt_sf.var_part_based_on_qidx : 0;
+    const int qindex_low_thr = qindex_thr[th_idx][0];
+    const int qindex_high_thr = qindex_thr[th_idx][1];
     if (current_qindex >= qindex_high_thr) {
       threshold_base = (5 * threshold_base) >> 1;
       thresholds[1] = threshold_base >> 3;
@@ -541,7 +547,7 @@
       thresholds[3] = INT32_MAX;
       if (segment_id == 0) {
         thresholds[1] <<= 2;
-        thresholds[2] <<= (source_sad == kLowSad) ? 5 : 4;
+        thresholds[2] <<= (source_sad_nonrd == kLowSad) ? 5 : 4;
       } else {
         thresholds[1] <<= 1;
         thresholds[2] <<= 3;
@@ -552,7 +558,8 @@
       // (i.e, cpi->rc.avg_source_sad is very large, in which case all blocks
       // have high source sad).
     } else if (cm->width * cm->height > 640 * 480 && segment_id == 0 &&
-               (source_sad != kHighSad || cpi->rc.avg_source_sad > 50000)) {
+               (source_sad_nonrd != kHighSad ||
+                cpi->rc.avg_source_sad > 50000)) {
       thresholds[0] = (3 * thresholds[0]) >> 1;
       thresholds[3] = INT32_MAX;
       if (current_qindex > QINDEX_LARGE_BLOCK_THR) {
@@ -562,7 +569,8 @@
             (int)((1 - weight) * (thresholds[2] << 1) + weight * thresholds[2]);
       }
     } else if (current_qindex > QINDEX_LARGE_BLOCK_THR && segment_id == 0 &&
-               (source_sad != kHighSad || cpi->rc.avg_source_sad > 50000)) {
+               (source_sad_nonrd != kHighSad ||
+                cpi->rc.avg_source_sad > 50000)) {
       thresholds[1] =
           (int)((1 - weight) * (thresholds[1] << 2) + weight * thresholds[1]);
       thresholds[2] =
@@ -857,7 +865,7 @@
     return;
   } else {
     set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, q, content_lowsumdiff, 0,
-                       0);
+                       0, 0);
     // The threshold below is not changed locally.
     cpi->vbp_info.threshold_minmax = 15 + (q >> 3);
   }
@@ -1145,11 +1153,13 @@
     const int q =
         av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
     set_vbp_thresholds(cpi, thresholds, q, x->content_state_sb.low_sumdiff,
-                       x->content_state_sb.source_sad, 1);
+                       x->content_state_sb.source_sad_nonrd,
+                       x->content_state_sb.source_sad_rd, 1);
   } else {
     set_vbp_thresholds(cpi, thresholds, cm->quant_params.base_qindex,
                        x->content_state_sb.low_sumdiff,
-                       x->content_state_sb.source_sad, 0);
+                       x->content_state_sb.source_sad_nonrd,
+                       x->content_state_sb.source_sad_rd, 0);
   }
 
   // For non keyframes, disable 4x4 average for low resolution when speed = 8
@@ -1211,7 +1221,7 @@
       cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
       cpi->cyclic_refresh->apply_cyclic_refresh &&
       segment_id == CR_SEGMENT_ID_BASE &&
-      x->content_state_sb.source_sad == kZeroSad &&
+      x->content_state_sb.source_sad_nonrd == kZeroSad &&
       ref_frame_partition == LAST_FRAME && xd->mi[0]->mv[0].as_int == 0 &&
       y_sad < thresh_exit_part) {
     const int block_width = mi_size_wide[cm->seq_params->sb_size];
@@ -1288,7 +1298,7 @@
                          (thresholds[2] >> 1) &&
                      maxvar_16x16[m][i] > thresholds[2]) ||
                     (cpi->sf.rt_sf.force_large_partition_blocks &&
-                     x->content_state_sb.source_sad > kLowSad &&
+                     x->content_state_sb.source_sad_nonrd > kLowSad &&
                      cpi->rc.frame_source_sad < 20000 &&
                      maxvar_16x16[m][i] > (thresholds[2] >> 4) &&
                      maxvar_16x16[m][i] > (minvar_16x16[m][i] << 2)))) {