rtc: Compound mode pruning for very low SAD
Improves encoding speed for HDRES content at speeds 9 and 10 by using
low values of the 64x64 block-level SAD to skip compound mode evaluation
for block sizes 64x64 and above. The thresholds are tuned conservatively
to account for the presence of noise in bigger blocks.
                 Instruction Count        BD-Rate Loss(%)
cpu   Test-set     Reduction(%)    avg.psnr   ovr.psnr    ssim
 9     HDRES          1.796         0.2792     0.4642    0.2207
10     HDRES          1.412         0.2941     0.3509    0.2986
Details of worst case drops:
cpu   Clip                    BD-Rate Loss (%) (avg. PSNR)
 9    testnoise720p           1.54
10    vidyo1_1280x720_60      1.13
STATS_CHANGED
Change-Id: I4271c34d427eb4ae456d6baf31164d650c964466
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index ad38eb5..a578e80 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -3325,6 +3325,11 @@
* File pointer to second pass log
*/
FILE *second_pass_log_stream;
+
+ /*!
+ * Buffer to store 64x64 SAD
+ */
+ uint64_t *src_sad_blk_64x64;
} AV1_COMP;
/*!
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
index 89784c5..ae99aee 100644
--- a/av1/encoder/encoder_alloc.h
+++ b/av1/encoder/encoder_alloc.h
@@ -302,6 +302,11 @@
cpi->consec_zero_mv = NULL;
}
+ if (cpi->src_sad_blk_64x64) {
+ aom_free(cpi->src_sad_blk_64x64);
+ cpi->src_sad_blk_64x64 = NULL;
+ }
+
aom_free(cpi->mb_weber_stats);
cpi->mb_weber_stats = NULL;
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index bd34b2d..e2d7841 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -2375,6 +2375,66 @@
}
}
+/*!\brief Decides whether compound-mode evaluation can be skipped for a block.
+ *
+ * Looks up the per-64x64 source SAD values stored in
+ * cpi->src_sad_blk_64x64 for the area covered by the block and compares the
+ * (averaged) SAD against a threshold. The threshold is selected by the
+ * sad_based_comp_prune speed-feature level and by whether the block is
+ * 128x128, and is then scaled by a factor picked from the block's qindex band.
+ * For blocks covering more than one 64x64 entry, pruning is rejected when the
+ * spread (max - min) of the covered SADs exceeds a fixed limit.
+ *
+ * \param[in] cpi     Encoder instance; cpi->src_sad_blk_64x64 must be
+ *                    non-NULL (this function dereferences it unchecked).
+ * \param[in] x       Macroblock state; only x->qindex is read.
+ * \param[in] mi_row  Block row in mi units; must be a multiple of 16.
+ * \param[in] mi_col  Block column in mi units; must be a multiple of 16.
+ * \param[in] bsize   Block size; 64x64, 64x128, 128x64 and 128x128 are
+ *                    handled explicitly.
+ *
+ * \return 1 if compound modes may be skipped, 0 otherwise.
+ */
+static int skip_comp_based_on_sad(AV1_COMP *cpi, MACROBLOCK *x,
+                                  const int mi_row, const int mi_col,
+                                  BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &cpi->common;
+  // The SAD grid is indexed with mi / 16, so the block must start on a
+  // 16-mi boundary.
+  assert(!(mi_row % 16) && !(mi_col % 16));
+  // Grid step in mi units: half the superblock mi size when the superblock is
+  // 128x128, the full superblock mi size otherwise.
+  const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
+                                ? (cm->seq_params->mib_size >> 1)
+                                : cm->seq_params->mib_size;
+  // Number of grid columns (row stride of src_sad_blk_64x64), rounded up.
+  const int sb_cols =
+      (cm->mi_params.mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+  // Base thresholds, indexed as [speed level][bsize == BLOCK_128X128].
+  const uint64_t sad_skp_comp_th[2][2] = { { 2700, 3100 },    // CPU 9
+                                           { 2700, 3200 } };  // CPU 10
+  // Maximum allowed (max - min) spread between the covered 64x64 SADs.
+  const uint64_t sad_blkwise_var_th = 5000;
+  // Per-band scale applied to the base threshold.
+  const float qindex_th_scale[5] = { 0.75f, 0.9f, 1.0f, 1.1f, 1.25f };
+  const int qindex_band = (5 * x->qindex) >> QINDEX_BITS;
+  assert(qindex_band < 5);
+  const int sp_idx = (cpi->sf.rt_sf.sad_based_comp_prune >= 2);
+  const int bsize_idx = (bsize == BLOCK_128X128);
+  const uint64_t sad_skp_comp_th_val = (uint64_t)(
+      sad_skp_comp_th[sp_idx][bsize_idx] * qindex_th_scale[qindex_band]);
+  uint64_t blk_sad = 0, sad00, sad01, sad10, sad11, min_sad, max_sad;
+  const int sbi_col = mi_col / 16;
+  const int sbi_row = mi_row / 16;
+  const uint64_t *cur_blk_sad =
+      &cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols];
+
+  // NOTE(review): the [1] and [sb_cols] reads below assume the right/bottom
+  // neighbour entries exist in the grid; confirm callers never pass a
+  // 128-wide or 128-tall bsize whose second half lies outside the grid.
+  if (bsize == BLOCK_128X128) {
+    sad00 = cur_blk_sad[0];
+    sad01 = cur_blk_sad[1];
+    sad10 = cur_blk_sad[sb_cols];
+    sad11 = cur_blk_sad[1 + sb_cols];
+    min_sad = AOMMIN(AOMMIN(AOMMIN(sad00, sad01), sad10), sad11);
+    max_sad = AOMMAX(AOMMAX(AOMMAX(sad00, sad01), sad10), sad11);
+    if (max_sad - min_sad > sad_blkwise_var_th) return 0;
+    // Rounded average of the four 64x64 SADs.
+    blk_sad = (sad00 + sad01 + sad10 + sad11 + 2) >> 2;
+  } else if (bsize == BLOCK_128X64) {
+    sad00 = cur_blk_sad[0];
+    sad01 = cur_blk_sad[1];
+    min_sad = AOMMIN(sad00, sad01);
+    max_sad = AOMMAX(sad00, sad01);
+    if (max_sad - min_sad > sad_blkwise_var_th) return 0;
+    // Rounded average of the two horizontally adjacent SADs.
+    blk_sad = (sad00 + sad01 + 1) >> 1;
+  } else if (bsize == BLOCK_64X128) {
+    sad00 = cur_blk_sad[0];
+    sad10 = cur_blk_sad[sb_cols];
+    min_sad = AOMMIN(sad00, sad10);
+    max_sad = AOMMAX(sad00, sad10);
+    if (max_sad - min_sad > sad_blkwise_var_th) return 0;
+    // Rounded average of the two vertically adjacent SADs.
+    blk_sad = (sad00 + sad10 + 1) >> 1;
+  } else if (bsize <= BLOCK_64X64) {
+    blk_sad = cur_blk_sad[0];
+  } else {
+    // Unsupported block size. assert() is compiled out under NDEBUG; without
+    // the explicit return, blk_sad would stay 0 and compound modes would
+    // always be pruned. Be conservative and do not prune.
+    assert(0);
+    return 0;
+  }
+
+  if (blk_sad < sad_skp_comp_th_val) return 1;
+
+  return 0;
+}
+
void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
MACROBLOCK *x, RD_STATS *rd_cost,
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
@@ -2442,6 +2502,7 @@
int num_comp_modes_ref = 0;
int tot_num_comp_modes = 9;
int ref_mv_idx = 0;
+ int skip_comp_mode = 0;
#if CONFIG_AV1_TEMPORAL_DENOISING
const int denoise_recheck_zeromv = 1;
AV1_PICKMODE_CTX_DEN ctx_den;
@@ -2573,6 +2634,12 @@
AOMMIN(max_txsize_lookup[bsize],
tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
TX_16X16);
+
+ // Skip compound mode based on sad
+ if ((cpi->sf.rt_sf.sad_based_comp_prune) && (bsize >= BLOCK_64X64) &&
+ (cpi->src_sad_blk_64x64 != NULL))
+ skip_comp_mode = skip_comp_based_on_sad(cpi, x, mi_row, mi_col, bsize);
+
for (int idx = 0; idx < num_inter_modes + tot_num_comp_modes; ++idx) {
const struct segmentation *const seg = &cm->seg;
@@ -2589,6 +2656,7 @@
sizeof(txfm_info->blk_skip[0]) * num_8x8_blocks);
if (idx >= num_inter_modes) {
+ if (skip_comp_mode) continue;
int comp_index = idx - num_inter_modes;
if (comp_index % 3 == 0) {
int i = 0;
diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c
index c1bff01..5a0c6f5 100644
--- a/av1/encoder/ratectrl.c
+++ b/av1/encoder/ratectrl.c
@@ -2684,6 +2684,14 @@
int last_src_ystride;
int last_src_width;
int last_src_height;
+ if (cm->spatial_layer_id != 0 || cm->width != cm->render_width ||
+ cm->height != cm->render_height || cpi->unscaled_source == NULL ||
+ cpi->unscaled_last_source == NULL) {
+ if (cpi->src_sad_blk_64x64) {
+ aom_free(cpi->src_sad_blk_64x64);
+ cpi->src_sad_blk_64x64 = NULL;
+ }
+ }
if (cpi->unscaled_source == NULL || cpi->unscaled_last_source == NULL) return;
src_y = unscaled_src->y_buffer;
src_ystride = unscaled_src->y_stride;
@@ -2693,7 +2701,13 @@
last_src_ystride = unscaled_last_src->y_stride;
last_src_width = unscaled_last_src->y_width;
last_src_height = unscaled_last_src->y_height;
- if (src_width != last_src_width || src_height != last_src_height) return;
+ if (src_width != last_src_width || src_height != last_src_height) {
+ if (cpi->src_sad_blk_64x64) {
+ aom_free(cpi->src_sad_blk_64x64);
+ cpi->src_sad_blk_64x64 = NULL;
+ }
+ return;
+ }
rc->high_source_sad = 0;
rc->high_num_blocks_with_motion = 0;
rc->prev_avg_source_sad = rc->avg_source_sad;
@@ -2721,6 +2735,17 @@
int light_change = 0;
// Flag to check light change or not.
const int check_light_change = 0;
+ // Store blkwise SAD for later use
+ if (cpi->sf.rt_sf.sad_based_comp_prune && (cm->spatial_layer_id == 0) &&
+ (cm->width == cm->render_width) && (cm->height == cm->render_height)) {
+ full_sampling = 1;
+ if (cpi->src_sad_blk_64x64 == NULL) {
+ cpi->src_sad_blk_64x64 = (uint64_t *)aom_malloc(
+ (sb_cols * sb_rows) * sizeof(*cpi->src_sad_blk_64x64));
+ memset(cpi->src_sad_blk_64x64, 0,
+ (sb_cols * sb_rows) * sizeof(*cpi->src_sad_blk_64x64));
+ }
+ }
for (int sbi_row = 0; sbi_row < sb_rows; ++sbi_row) {
for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) {
// Checker-board pattern, ignore boundary.
@@ -2731,6 +2756,8 @@
(sbi_row % 2 != 0 && sbi_col % 2 != 0)))) {
tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y,
last_src_ystride);
+ if (cpi->src_sad_blk_64x64 != NULL)
+ cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols] = tmp_sad;
if (check_light_change) {
unsigned int sse, variance;
variance = cpi->ppi->fn_ptr[bsize].vf(
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index c266ef3..1a16021 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1291,13 +1291,14 @@
} else {
if (speed >= 9) {
sf->rt_sf.sad_based_adp_altref_lag = 1;
+ sf->rt_sf.sad_based_comp_prune = 1;
}
if (speed >= 10) {
sf->rt_sf.sad_based_adp_altref_lag = 3;
+ sf->rt_sf.sad_based_comp_prune = 2;
}
}
if (cpi->ppi->use_svc) {
- sf->rt_sf.use_comp_ref_nonrd = 0;
if (cpi->svc.ref_frame_comp[0] || cpi->svc.ref_frame_comp[1] ||
cpi->svc.ref_frame_comp[2]) {
sf->rt_sf.use_comp_ref_nonrd = 1;
@@ -1307,6 +1308,9 @@
cpi->svc.ref_frame_comp[1] && cpi->svc.reference[LAST2_FRAME - 1];
sf->rt_sf.ref_frame_comp_nonrd[2] =
cpi->svc.ref_frame_comp[2] && cpi->svc.reference[ALTREF_FRAME - 1];
+ } else {
+ sf->rt_sf.use_comp_ref_nonrd = 0;
+ sf->rt_sf.sad_based_comp_prune = 0;
}
}
if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
@@ -1319,6 +1323,7 @@
sf->rt_sf.skip_cdef_sb = 1;
sf->rt_sf.use_rtc_tf = 0;
sf->rt_sf.use_comp_ref_nonrd = 0;
+ sf->rt_sf.sad_based_comp_prune = 0;
sf->rt_sf.source_metrics_sb_nonrd = 1;
if (cpi->rc.high_source_sad == 1) {
sf->rt_sf.force_large_partition_blocks = 0;
@@ -1953,6 +1958,7 @@
rt_sf->sad_based_adp_altref_lag = 0;
rt_sf->partition_direct_merging = 0;
rt_sf->var_part_based_on_qidx = 0;
+ rt_sf->sad_based_comp_prune = 0;
}
void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) {
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index f0be002..d0512ce 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -1479,6 +1479,8 @@
// Enable/disable partition direct merging.
int partition_direct_merging;
+ // SAD based compound mode pruning
+ int sad_based_comp_prune;
} REAL_TIME_SPEED_FEATURES;
/*!\endcond */