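RTC: Prune GLOBAL_GLOBALMV using the variance of GLOBALMV

In nonrd pickmode, skip the GLOBAL_GLOBALMV compound mode when its
prediction variance exceeds the smallest GLOBALMV variance already
computed for either of its two reference frames. The pruning is
controlled by the new rt_sf.prune_global_globalmv_with_globalmv
speed feature.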

Performance:
| SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR |  SSIM   |  SPD  |
|---------|---------|----------|----------|---------|-------|
|    9    |   rtc   | -0.099%  | -0.119%  | -0.115% | +0.7% |
|---------|---------|----------|----------|---------|-------|
|   10    |   rtc   | +0.074%  | +0.091%  | +0.055% | +0.7% |

STATS_CHANGED

Change-Id: I7e9ee26ae83c5fac2e23c4db8bcd965f965321a2
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index 04716ad..d5e1853 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -576,7 +576,9 @@
                                       int mi_row, int mi_col, MACROBLOCK *x,
                                       MACROBLOCKD *xd, RD_STATS *rd_stats,
                                       int *early_term, int calculate_rd,
-                                      int64_t best_sse) {
+                                      int64_t best_sse,
+                                      unsigned int *var_output,
+                                      unsigned int var_prune_threshold) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
@@ -613,6 +615,12 @@
   block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
                  4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8);
   var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
+  if (var_output) {
+    *var_output = var;
+    if (*var_output > var_prune_threshold) {
+      return;
+    }
+  }
 
   rd_stats->sse = sse;
 
@@ -1565,7 +1573,8 @@
     av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
     if (use_model_yrd_large)
       model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
-                                &pf_rd_stats[i], this_early_term, 1, best_sse);
+                                &pf_rd_stats[i], this_early_term, 1, best_sse,
+                                NULL, UINT_MAX);
     else
       model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], 1);
     pf_rd_stats[i].rate += av1_get_switchable_rate(
@@ -1710,8 +1719,8 @@
       av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, 0);
       if (use_model_yrd_large)
         model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
-                                  &pf_rd_stats[i], this_early_term, 1,
-                                  best_sse);
+                                  &pf_rd_stats[i], this_early_term, 1, best_sse,
+                                  NULL, UINT_MAX);
       else
         model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], 1);
       pf_rd_stats[i].rate +=
@@ -1774,7 +1783,7 @@
         if (use_model_yrd_large)
           model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
                                     &pf_rd_stats[i], this_early_term, 1,
-                                    best_sse);
+                                    best_sse, NULL, UINT_MAX);
         else
           model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], 1);
 
@@ -2604,6 +2613,7 @@
   int tot_num_comp_modes = 9;
   int ref_mv_idx = 0;
   int skip_comp_mode = 0;
+  unsigned int global_mv_var[REF_FRAMES] = { UINT_MAX };
 #if CONFIG_AV1_TEMPORAL_DENOISING
   const int denoise_recheck_zeromv = 1;
   AV1_PICKMODE_CTX_DEN ctx_den;
@@ -3053,9 +3063,29 @@
                                       0);
 
       if (use_model_yrd_large) {
+        unsigned int var = UINT_MAX;
+        unsigned int var_threshold = UINT_MAX;
+        if (cpi->sf.rt_sf.prune_global_globalmv_with_globalmv &&
+            this_mode == GLOBAL_GLOBALMV) {
+          if (mode_checked[GLOBALMV][ref_frame]) {
+            var_threshold = AOMMIN(var_threshold, global_mv_var[ref_frame]);
+          }
+          if (mode_checked[GLOBALMV][ref_frame2]) {
+            var_threshold = AOMMIN(var_threshold, global_mv_var[ref_frame2]);
+          }
+        }
+
         model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, &this_rdc,
                                   &this_early_term, use_modeled_non_rd_cost,
-                                  best_pickmode.best_sse);
+                                  best_pickmode.best_sse, &var, var_threshold);
+        if (this_mode == GLOBALMV) {
+          global_mv_var[ref_frame] = var;
+        } else if (this_mode == GLOBAL_GLOBALMV) {
+          if (var > var_threshold) {
+            if (reuse_inter_pred) free_pred_buffer(this_mode_pred);
+            continue;
+          }
+        }
       } else {
         model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc,
                           use_modeled_non_rd_cost);
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index b164ec3..616172e 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1683,6 +1683,7 @@
     for (int i = 0; i < BLOCK_SIZES; ++i)
       sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
     sf->rt_sf.var_part_based_on_qidx = 0;
+    sf->rt_sf.prune_global_globalmv_with_globalmv = true;
   }
   if (speed >= 10) {
     sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_4;
@@ -2020,6 +2021,7 @@
   rt_sf->tx_size_level_based_on_qstep = 0;
   rt_sf->reduce_zeromv_mvres = false;
   rt_sf->vbp_prune_16x16_split_using_min_max_sub_blk_var = false;
+  rt_sf->prune_global_globalmv_with_globalmv = false;
 }
 
 void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) {
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 4ae3e3b..2f431f0 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -1565,6 +1565,10 @@
   // with moderate increased encoding time.
   // Set to zero to turn off this speed feature.
   int screen_content_cdef_filter_qindex_thresh;
+
+  // Prunes GLOBAL_GLOBALMV if its variance is \gt the smallest GLOBALMV
+  // variance already computed for either of its reference frames.
+  bool prune_global_globalmv_with_globalmv;
 } REAL_TIME_SPEED_FEATURES;
 
 /*!\endcond */