AV1 RT: Implement low temp variance short circuit
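
Compute per-block low temporal variance flags during variance based
partitioning and use them in the non-RD mode search to short circuit
expensive mode evaluations. The flags are stored per superblock in
MACROBLOCK (variance_low) and are set for blocks whose variance falls
below a scaled partition threshold when LAST_FRAME was used for
partitioning (and, for modes 2 and 3 of the new
short_circuit_low_temp_var speed feature, the int_pro motion vector is
small). In av1_fast_nonrd_pick_inter_mode_sb, modes 1 and 3 drop
GOLDEN_FRAME from the usable references when the flag is set for the
current block. Mode 3 is enabled in speed_features.c.
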
Change-Id: Ia81376a99ee8a5e925e80e6f4fab88f534f63289
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 965e4cc..e462c9b 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -422,6 +422,7 @@
float log_q;
#endif
int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
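+ // Flags indicating low temporal variance for blocks in the current
+ // superblock: index 0 for 128x128, 1-4 for 64x64, 5-20 for 32x32 and
+ // 21-84 for 16x16 blocks.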
+ uint8_t variance_low[105];
};
static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index a54b12c..387537d 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -1031,6 +1031,60 @@
}
}
+static INLINE int get_force_skip_low_temp_var(uint8_t *variance_low, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ int force_skip_low_temp_var = 0;
+ int x, y;
+ // Set force_skip_low_temp_var based on the block size and block offset.
+ switch (bsize) {
+ case BLOCK_128X128: force_skip_low_temp_var = variance_low[0]; break;
+ case BLOCK_64X64:
+ case BLOCK_32X32:
+ case BLOCK_16X16:
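+ // mi units are 4x4 pixels, so a 128x128 superblock spans 32 mi and a
+ // 64x64 block spans 16 mi. Locate the 64x64 parent within the superblock.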
+ x = mi_col % 32;
+ y = mi_row % 32;
+ if (bsize == BLOCK_64X64) {
+ assert((x == 0 || x == 16) && (y == 0 || y == 16));
+ }
+ x >>= 4;
+ y >>= 4;
+ const int idx64 = y * 2 + x;
+ if (bsize == BLOCK_64X64) {
+ force_skip_low_temp_var = variance_low[1 + idx64];
+ break;
+ }
+
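+ // Locate the 32x32 block within its 64x64 parent (8 mi = 32 pixels).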
+ x = mi_col % 16;
+ y = mi_row % 16;
+ if (bsize == BLOCK_32X32) {
+ assert((x == 0 || x == 8) && (y == 0 || y == 8));
+ }
+ x >>= 3;
+ y >>= 3;
+ const int idx32 = y * 2 + x;
+ if (bsize == BLOCK_32X32) {
+ force_skip_low_temp_var = variance_low[5 + (idx64 << 2) + idx32];
+ break;
+ }
+
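+ // Locate the 16x16 block within its 32x32 parent (4 mi = 16 pixels).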
+ x = mi_col % 8;
+ y = mi_row % 8;
+ if (bsize == BLOCK_16X16) {
+ assert((x == 0 || x == 4) && (y == 0 || y == 4));
+ }
+ x >>= 2;
+ y >>= 2;
+ const int idx16 = y * 2 + x;
+ if (bsize == BLOCK_16X16) {
+ force_skip_low_temp_var =
+ variance_low[21 + (idx64 << 4) + (idx32 << 2) + idx16];
+ break;
+ }
+ default: break;
+ }
+ return force_skip_low_temp_var;
+}
+
void av1_fast_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
MACROBLOCK *x, int mi_row, int mi_col,
RD_STATS *rd_cost, BLOCK_SIZE bsize,
@@ -1154,6 +1208,18 @@
usable_ref_frame = GOLDEN_FRAME;
}
+ if (cpi->sf.short_circuit_low_temp_var) {
+ force_skip_low_temp_var =
+ get_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize);
+ // If force_skip_low_temp_var is set, skip the golden reference for short
+ // circuit modes 1 and 3.
+ if ((cpi->sf.short_circuit_low_temp_var == 1 ||
+ cpi->sf.short_circuit_low_temp_var == 3) &&
+ force_skip_low_temp_var) {
+ usable_ref_frame = LAST_FRAME;
+ }
+ }
+
if (!(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]))
use_golden_nonzeromv = 0;
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 05ae18b..ad5b305 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -636,6 +636,7 @@
sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
sf->tx_size_search_method = USE_FAST_RD;
sf->estimate_motion_for_var_based_partition = 0;
+ sf->short_circuit_low_temp_var = 3;
// TODO(kyslov) Enable when better model is available
// It gives +5% speedup and 11% overall BDRate degradation
// So, can not enable now until better CurvFit is there
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index e64cc86..18fb2d7 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -727,6 +727,10 @@
// Filter mask to allow certain interp_filter type.
uint16_t interp_filter_search_mask;
+
+ // Skip a number of expensive mode evaluations for blocks with very low
+ // temporal variance.
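+ // Modes 1 and 3 also drop GOLDEN_FRAME from the usable references in the
+ // non-RD pick mode; modes 2 and 3 extend the low variance flag down to
+ // 16x16 blocks.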
+ int short_circuit_low_temp_var;
} SPEED_FEATURES;
struct AV1_COMP;
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index 0aaac58..d17e686 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -380,6 +380,86 @@
}
}
+static void set_low_temp_var_flag(AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+ v128x128 *vt, int64_t thresholds[],
+ MV_REFERENCE_FRAME ref_frame_partition,
+ int mi_col, int mi_row) {
+ int i, j, k;
+ AV1_COMMON *const cm = &cpi->common;
+ const int mv_thr = cm->width > 640 ? 8 : 4;
+ // Check temporal variance for bsize >= 16x16 if LAST_FRAME was selected for
+ // partitioning and the int_pro motion vector is small. If the temporal
+ // variance is small, set the variance_low flag for the block. The variance
+ // thresholds can be adjusted; the higher they are, the more aggressive the
+ // skipping.
+ if (ref_frame_partition == LAST_FRAME &&
+ (cpi->sf.short_circuit_low_temp_var == 1 ||
+ (xd->mi[0]->mv[0].as_mv.col < mv_thr &&
+ xd->mi[0]->mv[0].as_mv.col > -mv_thr &&
+ xd->mi[0]->mv[0].as_mv.row < mv_thr &&
+ xd->mi[0]->mv[0].as_mv.row > -mv_thr))) {
+ if (xd->mi[0]->sb_type == BLOCK_128X128 ||
+ xd->mi[0]->sb_type == BLOCK_64X128 ||
+ xd->mi[0]->sb_type == BLOCK_128X64) {
+ if ((vt->part_variances).none.variance < (thresholds[0] >> 1))
+ x->variance_low[0] = 1;
+ } else {
+ for (i = 0; i < 4; i++) {
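+ // mi offsets (row, col) of the four 64x64 blocks within the 128x128
+ // superblock.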
+ const int idx[4][2] = { { 0, 0 }, { 0, 16 }, { 16, 0 }, { 16, 16 } };
+ const int idx_str =
+ cm->mi_stride * (mi_row + idx[i][0]) + mi_col + idx[i][1];
+ MB_MODE_INFO **mi_64 = cm->mi_grid_base + idx_str;
+
+ if (cm->mi_cols <= mi_col + idx[i][1] ||
+ cm->mi_rows <= mi_row + idx[i][0])
+ continue;
+
+ if ((*mi_64)->sb_type == BLOCK_64X64 ||
+ (*mi_64)->sb_type == BLOCK_64X32 ||
+ (*mi_64)->sb_type == BLOCK_32X64) {
+ int64_t threshold_64x64 = (cpi->sf.short_circuit_low_temp_var == 1 ||
+ cpi->sf.short_circuit_low_temp_var == 3)
+ ? ((5 * thresholds[1]) >> 3)
+ : (thresholds[1] >> 1);
+ if (vt->split[i].part_variances.none.variance < threshold_64x64)
+ x->variance_low[1 + i] = 1;
+ } else {
+ for (k = 0; k < 4; k++) {
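+ // mi offsets (row, col) of the four 32x32 blocks within the 64x64 block.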
+ const int idx1[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } };
+ const int idx_str1 = cm->mi_stride * idx1[k][0] + idx1[k][1];
+ MB_MODE_INFO **mi_32 = cm->mi_grid_base + idx_str + idx_str1;
+
+ if (cm->mi_cols <= mi_col + idx[i][1] + idx1[k][1] ||
+ cm->mi_rows <= mi_row + idx[i][0] + idx1[k][0])
+ continue;
+ if ((*mi_32)->sb_type == BLOCK_32X32) {
+ int64_t threshold_32x32 =
+ (cpi->sf.short_circuit_low_temp_var == 1 ||
+ cpi->sf.short_circuit_low_temp_var == 3)
+ ? ((5 * thresholds[2]) >> 3)
+ : (thresholds[2] >> 1);
+ if (vt->split[i].split[k].part_variances.none.variance <
+ threshold_32x32)
+ x->variance_low[5 + (i << 2) + k] = 1;
+ } else if (cpi->sf.short_circuit_low_temp_var >= 2) {
+ if ((*mi_32)->sb_type == BLOCK_16X16 ||
+ (*mi_32)->sb_type == BLOCK_32X16 ||
+ (*mi_32)->sb_type == BLOCK_16X32) {
+ for (j = 0; j < 4; j++) {
+ if (vt->split[i]
+ .split[k]
+ .split[j]
+ .part_variances.none.variance < (thresholds[3] >> 8))
+ x->variance_low[21 + (i << 4) + (k << 2) + j] = 1;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
int content_state) {
AV1_COMMON *const cm = &cpi->common;
@@ -454,6 +534,9 @@
const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
const int num_64x64_blocks = is_small_sb ? 1 : 4;
+ // Ref frame used in partitioning.
+ MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
+
CHECK_MEM_ERROR(cm, vt, aom_malloc(sizeof(*vt)));
int64_t thresholds[5] = { cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
@@ -486,6 +569,7 @@
// Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
// 5-20 for the 16x16 blocks.
force_split[0] = 0;
+ memset(x->variance_low, 0, sizeof(x->variance_low));
if (!is_key_frame) {
// TODO(kyslov): we are assuming that the ref is LAST_FRAME! Check if it
@@ -768,6 +852,11 @@
}
}
+ if (cpi->sf.short_circuit_low_temp_var && !is_small_sb) {
+ set_low_temp_var_flag(cpi, x, xd, vt, thresholds, ref_frame_partition,
+ mi_col, mi_row);
+ }
+
if (vt2) aom_free(vt2);
if (vt) aom_free(vt);
return 0;