AV1 RT: Implement low temp variance short circuit

Change-Id: Ia81376a99ee8a5e925e80e6f4fab88f534f63289
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 965e4cc..e462c9b 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -422,6 +422,7 @@
   float log_q;
 #endif
   int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
+  uint8_t variance_low[105];
 };
 
 static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index a54b12c..387537d 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -1031,6 +1031,60 @@
   }
 }
 
+static INLINE int get_force_skip_low_temp_var(uint8_t *variance_low, int mi_row,
+                                              int mi_col, BLOCK_SIZE bsize) {
+  int force_skip_low_temp_var = 0;
+  int x, y;
+  // Set force_skip_low_temp_var based on the block size and block offset.
+  switch (bsize) {
+    case BLOCK_128X128: force_skip_low_temp_var = variance_low[0]; break;
+    case BLOCK_64X64:
+    case BLOCK_32X32:
+    case BLOCK_16X16:
+      x = mi_col % 32;
+      y = mi_row % 32;
+      if (bsize == BLOCK_64X64) {
+        assert((x == 0 || x == 16) && (y == 0 || y == 16));
+      }
+      x >>= 4;
+      y >>= 4;
+      const int idx64 = y * 2 + x;
+      if (bsize == BLOCK_64X64) {
+        force_skip_low_temp_var = variance_low[1 + idx64];
+        break;
+      }
+
+      x = mi_col % 16;
+      y = mi_row % 16;
+      if (bsize == BLOCK_32X32) {
+        assert((x == 0 || x == 8) && (y == 0 || y == 8));
+      }
+      x >>= 3;
+      y >>= 3;
+      const int idx32 = y * 2 + x;
+      if (bsize == BLOCK_32X32) {
+        force_skip_low_temp_var = variance_low[5 + (idx64 << 2) + idx32];
+        break;
+      }
+
+      x = mi_col % 8;
+      y = mi_row % 8;
+      if (bsize == BLOCK_16X16) {
+        assert((x == 0 || x == 4) && (y == 0 || y == 4));
+      }
+      x >>= 2;
+      y >>= 2;
+      const int idx16 = y * 2 + x;
+      if (bsize == BLOCK_16X16) {
+        force_skip_low_temp_var =
+            variance_low[21 + (idx64 << 4) + (idx32 << 2) + idx16];
+        break;
+      }
+    default: break;
+  }
+  return force_skip_low_temp_var;
+}
+
 void av1_fast_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
                                        MACROBLOCK *x, int mi_row, int mi_col,
                                        RD_STATS *rd_cost, BLOCK_SIZE bsize,
@@ -1154,6 +1208,18 @@
     usable_ref_frame = GOLDEN_FRAME;
   }
 
+  if (cpi->sf.short_circuit_low_temp_var) {
+    force_skip_low_temp_var =
+        get_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize);
+    // If force_skip_low_temp_var is set and short_circuit_low_temp_var mode
+    // is 1 or 3, skip the golden reference.
+    if ((cpi->sf.short_circuit_low_temp_var == 1 ||
+         cpi->sf.short_circuit_low_temp_var == 3) &&
+        force_skip_low_temp_var) {
+      usable_ref_frame = LAST_FRAME;
+    }
+  }
+
   if (!(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]))
     use_golden_nonzeromv = 0;
 
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 05ae18b..ad5b305 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -636,6 +636,7 @@
     sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
     sf->tx_size_search_method = USE_FAST_RD;
     sf->estimate_motion_for_var_based_partition = 0;
+    sf->short_circuit_low_temp_var = 3;
 // TODO(kyslov) Enable when better model is available
 // It gives +5% speedup and 11% overall BDRate degradation
 // So, can not enable now until better CurvFit is there
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index e64cc86..18fb2d7 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -727,6 +727,10 @@
 
   // Filter mask to allow certain interp_filter type.
   uint16_t interp_filter_search_mask;
+
+  // Skip a number of expensive mode evaluations for blocks with very low
+  // temporal variance.
+  int short_circuit_low_temp_var;
 } SPEED_FEATURES;
 
 struct AV1_COMP;
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index 0aaac58..d17e686 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -380,6 +380,86 @@
   }
 }
 
+static void set_low_temp_var_flag(AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+                                  v128x128 *vt, int64_t thresholds[],
+                                  MV_REFERENCE_FRAME ref_frame_partition,
+                                  int mi_col, int mi_row) {
+  int i, j, k;
+  AV1_COMMON *const cm = &cpi->common;
+  const int mv_thr = cm->width > 640 ? 8 : 4;
+  // Check temporal variance for bsize >= 16x16, if LAST_FRAME was selected and
+  // int_pro mv is small. If the temporal variance is small set the flag
+  // variance_low for the block. The variance threshold can be adjusted, the
+  // higher the more aggressive.
+  if (ref_frame_partition == LAST_FRAME &&
+      (cpi->sf.short_circuit_low_temp_var == 1 ||
+       (xd->mi[0]->mv[0].as_mv.col < mv_thr &&
+        xd->mi[0]->mv[0].as_mv.col > -mv_thr &&
+        xd->mi[0]->mv[0].as_mv.row < mv_thr &&
+        xd->mi[0]->mv[0].as_mv.row > -mv_thr))) {
+    if (xd->mi[0]->sb_type == BLOCK_128X128 ||
+        xd->mi[0]->sb_type == BLOCK_64X128 ||
+        xd->mi[0]->sb_type == BLOCK_128X64) {
+      if ((vt->part_variances).none.variance < (thresholds[0] >> 1))
+        x->variance_low[0] = 1;
+    } else {
+      for (i = 0; i < 4; i++) {
+        const int idx[4][2] = { { 0, 0 }, { 0, 16 }, { 16, 0 }, { 16, 16 } };
+        const int idx_str =
+            cm->mi_stride * (mi_row + idx[i][0]) + mi_col + idx[i][1];
+        MB_MODE_INFO **mi_64 = cm->mi_grid_base + idx_str;
+
+        if (cm->mi_cols <= mi_col + idx[i][1] ||
+            cm->mi_rows <= mi_row + idx[i][0])
+          continue;
+
+        if ((*mi_64)->sb_type == BLOCK_64X64 ||
+            (*mi_64)->sb_type == BLOCK_64X32 ||
+            (*mi_64)->sb_type == BLOCK_32X64) {
+          int64_t threshold_64x64 = (cpi->sf.short_circuit_low_temp_var == 1 ||
+                                     cpi->sf.short_circuit_low_temp_var == 3)
+                                        ? ((5 * thresholds[1]) >> 3)
+                                        : (thresholds[1] >> 1);
+          if (vt->split[i].part_variances.none.variance < threshold_64x64)
+            x->variance_low[1 + i] = 1;
+        } else {
+          for (k = 0; k < 4; k++) {
+            const int idx1[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } };
+            const int idx_str1 = cm->mi_stride * idx1[k][0] + idx1[k][1];
+            MB_MODE_INFO **mi_32 = cm->mi_grid_base + idx_str + idx_str1;
+
+            if (cm->mi_cols <= mi_col + idx[i][1] + idx1[k][1] ||
+                cm->mi_rows <= mi_row + idx[i][0] + idx1[k][0])
+              continue;
+            if ((*mi_32)->sb_type == BLOCK_32X32) {
+              int64_t threshold_32x32 =
+                  (cpi->sf.short_circuit_low_temp_var == 1 ||
+                   cpi->sf.short_circuit_low_temp_var == 3)
+                      ? ((5 * thresholds[2]) >> 3)
+                      : (thresholds[2] >> 1);
+              if (vt->split[i].split[k].part_variances.none.variance <
+                  threshold_32x32)
+                x->variance_low[5 + (i << 2) + k] = 1;
+            } else if (cpi->sf.short_circuit_low_temp_var >= 2) {
+              if ((*mi_32)->sb_type == BLOCK_16X16 ||
+                  (*mi_32)->sb_type == BLOCK_32X16 ||
+                  (*mi_32)->sb_type == BLOCK_16X32) {
+                for (j = 0; j < 4; j++) {
+                  if (vt->split[i]
+                          .split[k]
+                          .split[j]
+                          .part_variances.none.variance < (thresholds[3] >> 8))
+                    x->variance_low[21 + (i << 4) + (k << 2) + j] = 1;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
 void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
                                            int content_state) {
   AV1_COMMON *const cm = &cpi->common;
@@ -454,6 +534,9 @@
   const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
   const int num_64x64_blocks = is_small_sb ? 1 : 4;
 
+  // Ref frame used in partitioning.
+  MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
+
   CHECK_MEM_ERROR(cm, vt, aom_malloc(sizeof(*vt)));
 
   int64_t thresholds[5] = { cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
@@ -486,6 +569,7 @@
   // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
   // 5-20 for the 16x16 blocks.
   force_split[0] = 0;
+  memset(x->variance_low, 0, sizeof(x->variance_low));
 
   if (!is_key_frame) {
     // TODO(kyslov): we are assuming that the ref is LAST_FRAME! Check if it
@@ -768,6 +852,11 @@
     }
   }
 
+  if (cpi->sf.short_circuit_low_temp_var && !is_small_sb) {
+    set_low_temp_var_flag(cpi, x, xd, vt, thresholds, ref_frame_partition,
+                          mi_col, mi_row);
+  }
+
   if (vt2) aom_free(vt2);
   if (vt) aom_free(vt);
   return 0;