RTC: Collect variance stats to prune compound modes

This commit consists of the following three changes:
  * The variance of residue for each (inter mode, ref_frame) combination
    is recorded.
  * An old speed feature prune_global_globalmv_with_globalmv is expanded
    to prune nearst_nearestmv as well.
  * A new speed feature prune_compoundmode_with_singlecompound_var is
    introduced -- when a compound mode's single mode counter part is
    outperformed by its peer, the compound mode is pruned.

Due to the overhead of stat collection, there is ~0.1% encoding time
increase when the speed features are not turned on. However, the
encoding time is improved by 0.9% on speed 7. The stats collected can be
used to create new speed features for speed 8 - 9 later.

Performance:
| SPD_SET | TESTSET  | AVG_PSNR | OVR_PSNR |  SSIM   | ENC_T |
|---------|----------|----------|----------|---------|-------|
|    7    |   rtc    | +0.036%  | -0.067%  | -0.013% | -0.9% |
|---------|----------|----------|----------|---------|-------|
|    8    |   rtc    | +0.006%  | -0.005%  | -0.003% | +0.0% |
|---------|----------|----------|----------|---------|-------|
|    9    |   rtc    | +0.000%  | +0.000%  | +0.000% | +0.1% |
|---------|----------|----------|----------|---------|-------|
|   10    |   rtc    | +0.000%  | +0.000%  | +0.000% | +0.1% |

STATS_CHANGED

Change-Id: Ib43b4db7bc90e89473feff8ea78879346e1777c3
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index 29cc7aa..8bbe7fd 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -778,7 +778,8 @@
 
 static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
                               MACROBLOCK *x, MACROBLOCKD *xd,
-                              RD_STATS *rd_stats, int calculate_rd) {
+                              RD_STATS *rd_stats, unsigned int *var_out,
+                              int calculate_rd) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
@@ -796,6 +797,9 @@
       p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse);
   int force_skip = 0;
   xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, x, var, sse, &force_skip);
+  if (var_out) {
+    *var_out = var;
+  }
 
   if (calculate_rd && (!force_skip || ref == INTRA_FRAME)) {
     const int bwide = block_size_wide[bsize];
@@ -1531,7 +1535,8 @@
     mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
     xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0];
     av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
-    model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, 1);
+    unsigned int var;
+    model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, &var, 1);
 
     const int16_t mode_ctx =
         av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame);
@@ -1597,6 +1602,8 @@
  *                                    for prediction re-use
  * \param[out]   this_early_term      Flag, indicating that transform can be
  *                                    skipped
+ * \param[out]   var                  The residue variance of the current
+ *                                    predictor.
  * \param[in]    use_model_yrd_large  Flag, indicating special logic to handle
  *                                    large blocks
  * \param[in]    best_sse             Best sse so far.
@@ -1611,8 +1618,8 @@
                               int mi_row, int mi_col, PRED_BUFFER *tmp,
                               BLOCK_SIZE bsize, int reuse_inter_pred,
                               PRED_BUFFER **this_mode_pred,
-                              int *this_early_term, int use_model_yrd_large,
-                              int64_t best_sse) {
+                              int *this_early_term, unsigned int *var,
+                              int use_model_yrd_large, int64_t best_sse) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblockd_plane *const pd = &xd->plane[0];
@@ -1635,17 +1642,19 @@
     mi->interp_filters.as_filters.x_filter = filters_ref_set[i].filter_x;
     mi->interp_filters.as_filters.y_filter = filters_ref_set[i].filter_y;
     av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+    unsigned int curr_var = UINT_MAX;
     if (use_model_yrd_large)
       model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
                                 &pf_rd_stats[i], this_early_term, 1, best_sse,
-                                NULL, UINT_MAX);
+                                &curr_var, UINT_MAX);
     else
-      model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], 1);
+      model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], &curr_var, 1);
     pf_rd_stats[i].rate += av1_get_switchable_rate(
         x, xd, cm->features.interp_filter, cm->seq_params->enable_dual_filter);
     cost = RDCOST(x->rdmult, pf_rd_stats[i].rate, pf_rd_stats[i].dist);
     pf_tx_size[i] = mi->tx_size;
     if (cost < best_cost) {
+      *var = curr_var;
       best_filter_index = i;
       best_cost = cost;
       best_skip = pf_rd_stats[i].skip_txfm;
@@ -1786,7 +1795,7 @@
                                   &pf_rd_stats[i], this_early_term, 1, best_sse,
                                   NULL, UINT_MAX);
       else
-        model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], 1);
+        model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], NULL, 1);
       pf_rd_stats[i].rate +=
           av1_get_switchable_rate(x, xd, cm->features.interp_filter,
                                   cm->seq_params->enable_dual_filter);
@@ -1849,7 +1858,7 @@
                                     &pf_rd_stats[i], this_early_term, 1,
                                     best_sse, NULL, UINT_MAX);
         else
-          model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], 1);
+          model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], NULL, 1);
 
         pf_rd_stats[i].rate +=
             mode_costs->motion_mode_cost[bsize][mi->motion_mode];
@@ -1883,6 +1892,7 @@
 #define COLLECT_PICK_MODE_STAT 0
 
 #if COLLECT_PICK_MODE_STAT
+#include "aom_ports/aom_timer.h"
 typedef struct _mode_search_stat {
   int32_t num_blocks[BLOCK_SIZES];
   int64_t avg_block_times[BLOCK_SIZES];
@@ -2290,7 +2300,7 @@
     compute_intra_yprediction(cm, this_mode, bsize, x, xd);
     // Look into selecting tx_size here, based on prediction residual.
     if (use_modeled_non_rd_cost)
-      model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, 1);
+      model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, NULL, 1);
     else
       av1_block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &args.skippable, bsize,
                     mi->tx_size, DCT_DCT, 0);
@@ -2694,6 +2704,62 @@
   return 1;
 }
 
+static AOM_INLINE bool previous_mode_performed_poorly(
+    PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame,
+    const unsigned int (*vars)[REF_FRAMES],
+    const int64_t (*uv_dist)[REF_FRAMES]) {
+  unsigned int best_var = UINT_MAX;
+  int64_t best_uv_dist = INT64_MAX;
+  for (int midx = 0; midx < RTC_INTER_MODES; midx++) {
+    best_var = AOMMIN(best_var, vars[midx][ref_frame]);
+    best_uv_dist = AOMMIN(best_uv_dist, uv_dist[midx][ref_frame]);
+  }
+  assert(best_var != UINT_MAX && "Invalid variance data.");
+  const float mult = 1.125f;
+  bool var_bad = mult * best_var < vars[INTER_OFFSET(mode)][ref_frame];
+  if (uv_dist[INTER_OFFSET(mode)][ref_frame] < INT64_MAX &&
+      best_uv_dist != uv_dist[INTER_OFFSET(mode)][ref_frame]) {
+    // If we have chroma info, then take it into account
+    var_bad &= mult * best_uv_dist < uv_dist[INTER_OFFSET(mode)][ref_frame];
+  }
+  return var_bad;
+}
+
+static AOM_INLINE bool prune_compoundmode_with_singlemode_var(
+    PREDICTION_MODE compound_mode, MV_REFERENCE_FRAME ref_frame,
+    MV_REFERENCE_FRAME ref_frame2, const int_mv (*frame_mv)[REF_FRAMES],
+    const uint8_t (*mode_checked)[REF_FRAMES],
+    const unsigned int (*vars)[REF_FRAMES],
+    const int64_t (*uv_dist)[REF_FRAMES]) {
+  const PREDICTION_MODE single_mode0 = compound_ref0_mode(compound_mode);
+  const PREDICTION_MODE single_mode1 = compound_ref1_mode(compound_mode);
+
+  bool first_ref_valid = false, second_ref_valid = false;
+  bool first_ref_bad = false, second_ref_bad = false;
+  if (mode_checked[single_mode0][ref_frame] &&
+      frame_mv[single_mode0][ref_frame].as_int ==
+          frame_mv[compound_mode][ref_frame].as_int &&
+      vars[INTER_OFFSET(single_mode0)][ref_frame] < UINT_MAX) {
+    first_ref_valid = true;
+    first_ref_bad =
+        previous_mode_performed_poorly(single_mode0, ref_frame, vars, uv_dist);
+  }
+  if (mode_checked[single_mode1][ref_frame2] &&
+      frame_mv[single_mode1][ref_frame2].as_int ==
+          frame_mv[compound_mode][ref_frame2].as_int &&
+      vars[INTER_OFFSET(single_mode1)][ref_frame2] < UINT_MAX) {
+    second_ref_valid = true;
+    second_ref_bad =
+        previous_mode_performed_poorly(single_mode1, ref_frame2, vars, uv_dist);
+  }
+  if (first_ref_valid && second_ref_valid) {
+    return first_ref_bad && second_ref_bad;
+  } else if (first_ref_valid || second_ref_valid) {
+    return first_ref_bad || second_ref_bad;
+  }
+  return false;
+}
+
 void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
                                   MACROBLOCK *x, RD_STATS *rd_cost,
                                   BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
@@ -2760,9 +2826,13 @@
   int use_modeled_non_rd_cost = 0;
   bool comp_use_zero_zeromv_only = 0;
   int tot_num_comp_modes = NUM_COMP_INTER_MODES_RT;
-  unsigned int zeromv_var[REF_FRAMES];
-  for (int idx = 0; idx < REF_FRAMES; idx++) {
-    zeromv_var[idx] = UINT_MAX;
+  unsigned int vars[RTC_INTER_MODES][REF_FRAMES];
+  int64_t uv_dist[RTC_INTER_MODES][REF_FRAMES];
+  for (int idx = 0; idx < RTC_INTER_MODES; idx++) {
+    for (int ref = 0; ref < REF_FRAMES; ref++) {
+      vars[idx][ref] = UINT_MAX;
+      uv_dist[idx][ref] = INT64_MAX;
+    }
   }
 #if CONFIG_AV1_TEMPORAL_DENOISING
   const int denoise_recheck_zeromv = 1;
@@ -2961,6 +3031,13 @@
          ref_frame != LAST_FRAME))
       continue;
 
+    if (cpi->sf.rt_sf.prune_compoundmode_with_singlemode_var && comp_pred &&
+        prune_compoundmode_with_singlemode_var(this_mode, ref_frame, ref_frame2,
+                                               frame_mv, mode_checked, vars,
+                                               uv_dist)) {
+      continue;
+    }
+
     force_mv_inter_layer = 0;
     if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
         ((ref_frame == LAST_FRAME && svc->skip_mvsearch_last) ||
@@ -3147,6 +3224,7 @@
 #endif
       search_filter_ref(cpi, x, &this_rdc, mi_row, mi_col, tmp, bsize,
                         reuse_inter_pred, &this_mode_pred, &this_early_term,
+                        &vars[INTER_OFFSET(this_mode)][ref_frame],
                         use_model_yrd_large, best_pickmode.best_sse);
 #if COLLECT_PICK_MODE_STAT
       aom_usec_timer_mark(&ms_stat.timer2);
@@ -3193,29 +3271,34 @@
         av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
                                       0);
 
+      unsigned int var_threshold = UINT_MAX;
+      if (cpi->sf.rt_sf.prune_compoundmode_with_singlecompound_var &&
+          comp_pred && use_model_yrd_large) {
+        const PREDICTION_MODE single_mode0 = compound_ref0_mode(this_mode);
+        const PREDICTION_MODE single_mode1 = compound_ref1_mode(this_mode);
+        var_threshold =
+            AOMMIN(var_threshold, vars[INTER_OFFSET(single_mode0)][ref_frame]);
+        var_threshold =
+            AOMMIN(var_threshold, vars[INTER_OFFSET(single_mode1)][ref_frame2]);
+      }
       if (use_model_yrd_large) {
-        unsigned int var_threshold = UINT_MAX;
-        if (cpi->sf.rt_sf.prune_global_globalmv_with_zeromv &&
-            this_mode == GLOBAL_GLOBALMV) {
-          var_threshold = AOMMIN(var_threshold, zeromv_var[ref_frame]);
-          var_threshold = AOMMIN(var_threshold, zeromv_var[ref_frame2]);
-        }
-
         model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, &this_rdc,
                                   &this_early_term, use_modeled_non_rd_cost,
                                   best_pickmode.best_sse, &var, var_threshold);
-        if (!comp_pred && frame_mv[this_mode][ref_frame].as_int == 0) {
-          zeromv_var[ref_frame] = var;
-        } else if (this_mode == GLOBAL_GLOBALMV) {
-          if (var > var_threshold) {
-            if (reuse_inter_pred) free_pred_buffer(this_mode_pred);
-            continue;
-          }
-        }
       } else {
-        model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc,
+        model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, &var,
                           use_modeled_non_rd_cost);
       }
+      if (!comp_pred) {
+        vars[INTER_OFFSET(this_mode)][ref_frame] = var;
+        if (frame_mv[this_mode][ref_frame].as_int == 0) {
+          vars[INTER_OFFSET(GLOBALMV)][ref_frame] = var;
+        }
+      }
+      if (comp_pred && var > var_threshold) {
+        if (reuse_inter_pred) free_pred_buffer(this_mode_pred);
+        continue;
+      }
 #if COLLECT_PICK_MODE_STAT
       aom_usec_timer_mark(&ms_stat.timer2);
       ms_stat.model_rd_time[bsize][this_mode] +=
@@ -3291,6 +3374,9 @@
         if (this_rdc.skip_txfm && !rdc_uv.skip_txfm &&
             nonskip_rdc.rate != INT_MAX)
           this_rdc = nonskip_rdc;
+        if (!comp_pred) {
+          uv_dist[INTER_OFFSET(this_mode)][ref_frame] = rdc_uv.dist;
+        }
         this_rdc.rate += rdc_uv.rate;
         this_rdc.dist += rdc_uv.dist;
         this_rdc.skip_txfm = this_rdc.skip_txfm && rdc_uv.skip_txfm;
@@ -3326,7 +3412,7 @@
 
     if (!comp_pred && frame_mv[this_mode][ref_frame].as_int == 0 &&
         var < UINT_MAX) {
-      zeromv_var[ref_frame] = var;
+      vars[INTER_OFFSET(GLOBALMV)][ref_frame] = var;
     }
 
     this_rdc.rate += ref_costs_single[ref_frame];
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 85cc3f7..13112ab 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1670,7 +1670,8 @@
 
     sf->winner_mode_sf.dc_blk_pred_level = 0;
     sf->rt_sf.var_part_based_on_qidx = 3;
-    sf->rt_sf.prune_global_globalmv_with_zeromv = true;
+    sf->rt_sf.prune_compoundmode_with_singlecompound_var = true;
+    sf->rt_sf.prune_compoundmode_with_singlemode_var = true;
   }
 
   if (speed >= 8) {
@@ -1685,6 +1686,7 @@
     sf->interp_sf.cb_pred_filter_search = 1;
     sf->rt_sf.var_part_based_on_qidx = 4;
     sf->rt_sf.partition_direct_merging = 1;
+    sf->rt_sf.prune_compoundmode_with_singlemode_var = false;
   }
   if (speed >= 9) {
     sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
@@ -2037,10 +2039,11 @@
   rt_sf->tx_size_level_based_on_qstep = 0;
   rt_sf->reduce_zeromv_mvres = false;
   rt_sf->vbp_prune_16x16_split_using_min_max_sub_blk_var = false;
-  rt_sf->prune_global_globalmv_with_zeromv = false;
+  rt_sf->prune_compoundmode_with_singlecompound_var = false;
   rt_sf->frame_level_mode_cost_update = false;
   rt_sf->check_only_zero_zeromv_on_large_blocks = false;
   rt_sf->disable_cdf_update_non_reference_frame = false;
+  rt_sf->prune_compoundmode_with_singlemode_var = false;
 }
 
 void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) {
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 1b285fb..96a21e4 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -1580,9 +1580,9 @@
   // Set to zero to turn off this speed feature.
   int screen_content_cdef_filter_qindex_thresh;
 
-  // Prunes global_globalmv search if its variance is \gt the globalmv's
-  // variance.
-  bool prune_global_globalmv_with_zeromv;
+  // Prune compound mode if its variance is higher than the variance of single
+  // modes.
+  bool prune_compoundmode_with_singlecompound_var;
 
   // Allow mode cost update at frame level every couple frames. This
   // overrides the command line setting --mode-cost-upd-freq=3 (never update
@@ -1596,6 +1596,9 @@
 
   // Allow for disabling cdf update for non reference frames in svc mode.
   bool disable_cdf_update_non_reference_frame;
+
+  // Prune compound modes if the single modes variances do not perform well.
+  bool prune_compoundmode_with_singlemode_var;
 } REAL_TIME_SPEED_FEATURES;
 
 /*!\endcond */