Bypass tx type search for DC only blocks
This CL adds logic to predict DC only blocks if the
residual variance is below a qstep based threshold.
For such blocks, transform type search is bypassed.
            Instruction Count    BD-Rate Loss(%)
cpu-used    Reduction(%)         avg.psnr    ovr.psnr    ssim
   6        1.367                0.1291      0.1265      0.0428
STATS_CHANGED
Change-Id: I2ea9565906b5cc13cbf88d6ba1846ecf9f000672
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 9f7b2a9..a712e4f 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -468,6 +468,11 @@
* candidate, then code it as TX_MODE_SELECT.
*/
TX_MODE tx_mode_search_type;
+
+ /*!
+ * Level of early DC-only block prediction (0: disabled, 1: variance based).
+ */
+ unsigned int predict_dc_level;
} TxfmSearchParams;
/*!\cond */
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 3afbc7a..14ee354 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -259,6 +259,19 @@
};
#endif
+// Fills the coefficient buffer for a DC only block: DC from the residual mean.
+void av1_xform_dc_only(MACROBLOCK *x, int plane, int block,
+ TxfmParam *txfm_param, int64_t per_px_mean) {
+ assert(per_px_mean != INT64_MAX);  // INT64_MAX appears to be the caller's "mean not computed" sentinel
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *const coeff = p->coeff + block_offset;
+ const int n_coeffs = av1_get_max_eob(txfm_param->tx_size);
+ memset(coeff, 0, sizeof(*coeff) * n_coeffs);  // clear every coefficient; only coeff[0] is set below
+ coeff[0] =
+ (tran_low_t)((per_px_mean * dc_coeff_scale[txfm_param->tx_size]) >> 12);  // scale has 12 bits of precision
+}
+
void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
QUANT_PARAM *qparam) {
diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h
index 14761db..cdf7318 100644
--- a/av1/encoder/encodemb.h
+++ b/av1/encoder/encodemb.h
@@ -81,6 +81,9 @@
const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
TX_TYPE tx_type, QUANT_PARAM *qparam);
+void av1_xform_dc_only(MACROBLOCK *x, int plane, int block,
+ TxfmParam *txfm_param, int64_t per_px_mean);
+
void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
QUANT_PARAM *qparam);
@@ -153,6 +156,14 @@
return false;
return true;
}
+
+// Scaling terms (precision of 12 bits), indexed by TX_SIZE, that convert the
+// block residual mean into the DC coefficient: dc = (mean * scale) >> 12.
+static const uint16_t dc_coeff_scale[TX_SIZES_ALL] = {
+ 1024, 2048, 4096, 4096, 0, 1448, 1448, 2896, 2896, 2896,
+ 2896, 0, 0, 2048, 2048, 4096, 4096, 0, 0  // NOTE(review): 0 entries presumably mark 64-pt sizes, which the caller excludes - confirm
+};
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index db15ba0..2b9754e 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -1797,6 +1797,13 @@
* Corresponds to use_skip_flag_prediction speed feature.
*/
unsigned int skip_txfm_level[MODE_EVAL_TYPES];
+
+ /*!
+ * Predict DC only txfm blocks for default, mode and winner mode evaluation.
+ * Index 0: Default mode evaluation, Winner mode processing is not applicable.
+ * Index 1: Mode evaluation, Index 2: Winner mode evaluation
+ */
+ unsigned int predict_dc_level[MODE_EVAL_TYPES];
} WinnerModeParams;
/*!
diff --git a/av1/encoder/rdopt_utils.h b/av1/encoder/rdopt_utils.h
index 98fb8f6..c7c7d17 100644
--- a/av1/encoder/rdopt_utils.h
+++ b/av1/encoder/rdopt_utils.h
@@ -479,6 +479,8 @@
txfm_params->use_default_intra_tx_type = 0;
txfm_params->skip_txfm_level =
winner_mode_params->skip_txfm_level[DEFAULT_EVAL];
+ txfm_params->predict_dc_level =
+ winner_mode_params->predict_dc_level[DEFAULT_EVAL];
// Set default transform domain distortion type
set_tx_domain_dist_params(winner_mode_params, txfm_params, 0, 0);
@@ -501,7 +503,8 @@
cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_search;
txfm_params->skip_txfm_level =
winner_mode_params->skip_txfm_level[MODE_EVAL];
-
+ txfm_params->predict_dc_level =
+ winner_mode_params->predict_dc_level[MODE_EVAL];
// Set transform domain distortion type for mode evaluation
set_tx_domain_dist_params(
winner_mode_params, txfm_params,
@@ -530,6 +533,8 @@
txfm_params->use_default_intra_tx_type = 0;
txfm_params->skip_txfm_level =
winner_mode_params->skip_txfm_level[WINNER_MODE_EVAL];
+ txfm_params->predict_dc_level =
+ winner_mode_params->predict_dc_level[WINNER_MODE_EVAL];
// Set transform domain distortion type for winner mode evaluation
set_tx_domain_dist_params(
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index fb84022..4ef32b4 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -112,6 +112,15 @@
{ 1, 1, 1 },
{ 1, 2, 1 } };
+// Predict DC block levels to be used for default, mode and winner mode
+// evaluation. Index 0: Default mode evaluation, Winner mode processing is not
+// applicable. Index 1: Mode evaluation, Index 2: Winner mode evaluation
+// Values indicate the aggressiveness of DC block prediction.
+// 0 : no early DC block prediction
+// 1 : Early DC block prediction based on error variance
+static unsigned int predict_dc_levels[2][MODE_EVAL_TYPES] = { { 0, 0, 0 },
+ { 1, 1, 0 } };  // row is selected by winner_mode_sf.enable_dc_only_blk_pred
+
// This table holds the maximum number of reference frames for global motion.
// The table is indexed as per the speed feature 'gm_search_type'.
// 0 : All reference frames are allowed.
@@ -662,6 +671,7 @@
sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 4 : 6;
+ sf->winner_mode_sf.enable_dc_only_blk_pred = 1;
sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF;
}
@@ -1192,6 +1202,7 @@
winner_mode_sf->enable_winner_mode_for_tx_size_srch = 0;
winner_mode_sf->enable_winner_mode_for_use_tx_domain_dist = 0;
winner_mode_sf->multi_winner_mode_type = 0;
+ winner_mode_sf->enable_dc_only_blk_pred = 0;
}
static AOM_INLINE void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) {
@@ -1367,6 +1378,9 @@
memcpy(winner_mode_params->tx_size_search_methods,
tx_size_search_methods[cpi->sf.winner_mode_sf.tx_size_search_level],
sizeof(winner_mode_params->tx_size_search_methods));
+ memcpy(winner_mode_params->predict_dc_level,
+ predict_dc_levels[cpi->sf.winner_mode_sf.enable_dc_only_blk_pred],
+ sizeof(winner_mode_params->predict_dc_level));
if (cpi->oxcf.row_mt == 1 && (cpi->oxcf.max_threads > 1)) {
if (sf->inter_sf.inter_mode_rd_model_estimation == 1) {
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 1926dec..ae252ab 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -886,6 +886,9 @@
// 0: speed feature OFF
// 1 / 2 : Use configured number of winner candidates
int motion_mode_for_winner_cand;
+
+ // Early DC only txfm block prediction
+ int enable_dc_only_blk_pred;
} WINNER_MODE_SPEED_FEATURES;
typedef struct LOOP_FILTER_SPEED_FEATURES {
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index d192ead..258b44c 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -401,6 +401,37 @@
return sse;
}
+// Returns the SSE of the visible part of the residual transform block and also
+// outputs its MSE (Q8), scaled per-pixel mean and unnormalized variance.
+static INLINE int64_t pixel_diff_stats(
+ MACROBLOCK *x, int plane, int blk_row, int blk_col,
+ const BLOCK_SIZE plane_bsize, const BLOCK_SIZE tx_bsize,
+ unsigned int *block_mse_q8, int64_t *per_px_mean, uint64_t *block_var) {
+ int visible_rows, visible_cols;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
+ NULL, &visible_cols, &visible_rows);
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int16_t *diff = x->plane[plane].src_diff;
+
+ diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2);  // offset to (blk_row, blk_col), scaled by MI_SIZE_LOG2
+ uint64_t sse = 0;
+ int sum = 0;
+ sse = aom_sum_sse_2d_i16(diff, diff_stride, visible_cols, visible_rows, &sum);
+ if (visible_cols > 0 && visible_rows > 0) {
+ double norm_factor = 1.0 / (visible_cols * visible_rows);  // 1 / number of visible pixels
+ int sign_sum = sum > 0 ? 1 : -1;
+ // Conversion to transform domain: |mean| is truncated, then scaled by 128
+ *per_px_mean = (int64_t)(norm_factor * abs(sum)) << 7;
+ *per_px_mean = sign_sum * (*per_px_mean);
+ *block_mse_q8 = (unsigned int)(norm_factor * (256 * sse));  // mean squared error with 8 fractional bits
+ *block_var = (uint64_t)(sse - (uint64_t)(norm_factor * sum * sum));  // sse - sum^2 / N (not divided by N)
+ } else {
+ *block_mse_q8 = UINT_MAX;  // sentinel for "nothing visible"; per_px_mean and block_var are left untouched
+ }
+ return sse;
+}
+
// Uses simple features on top of DCT coefficients to quickly predict
// whether optimal RD decision is to skip encoding the residual.
// The sse value is stored in dist.
@@ -2104,7 +2135,7 @@
int block, TX_SIZE tx_size,
int quant_b_adapt, int qstep,
unsigned int coeff_opt_satd_threshold,
- int skip_trellis) {
+ int skip_trellis, int dc_only_blk) {
if (skip_trellis || (coeff_opt_satd_threshold == UINT_MAX))
return skip_trellis;
@@ -2113,7 +2144,7 @@
tran_low_t *const coeff_ptr = p->coeff + block_offset;
const int n_coeffs = av1_get_max_eob(tx_size);
const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size));
- int satd = aom_satd(coeff_ptr, n_coeffs);
+ int satd = (dc_only_blk) ? abs(coeff_ptr[0]) : aom_satd(coeff_ptr, n_coeffs);
satd = RIGHT_SIGNED_SHIFT(satd, shift);
const int skip_block_trellis =
@@ -2205,22 +2236,90 @@
int txk_map[TX_TYPES] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
};
- // Bit mask to indicate which transform types are allowed in the RD search.
- const uint16_t allowed_tx_mask =
- get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
- txb_ctx, ftxs_mode, ref_best_rd, &txk_allowed, txk_map);
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+ const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
+ const uint8_t txw = tx_size_wide[tx_size];
+ const uint8_t txh = tx_size_high[tx_size];
+ int64_t block_sse;
unsigned int block_mse_q8;
- int64_t block_sse = pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
- txsize_to_bsize[tx_size], &block_mse_q8);
- assert(block_mse_q8 != UINT_MAX);
+ int dc_only_blk = 0;
+ const bool predict_dc_block =
+ cpi->sf.winner_mode_sf.enable_dc_only_blk_pred && txw != 64 && txh != 64;
+ int64_t per_px_mean = INT64_MAX;
+ uint64_t block_var = UINT64_MAX;
+ if (predict_dc_block) {
+ const int dc_qstep = x->plane[plane].dequant_QTX[0] >> 3;
+ block_sse = pixel_diff_stats(x, plane, blk_row, blk_col, plane_bsize,
+ txsize_to_bsize[tx_size], &block_mse_q8,
+ &per_px_mean, &block_var);
+ assert(block_mse_q8 != UINT_MAX);
+ uint64_t var_threshold = (uint64_t)(1.8 * qstep * qstep);
+ if (is_cur_buf_hbd(xd))
+ block_var = ROUND_POWER_OF_TWO(block_var, (xd->bd - 8) * 2);
+ // Early prediction of skip block if residual mean and variance are less
+ // than qstep based threshold
+ if (((llabs(per_px_mean) * dc_coeff_scale[tx_size]) < (dc_qstep << 12)) &&
+ (block_var < var_threshold)) {
+ // If the normalized mean of residual block is less than the dc qstep and
+ // the normalized block variance is less than ac qstep, then the block is
+ // assumed to be a skip block and its rdcost is updated accordingly.
+ best_rd_stats->skip_txfm = 1;
+
+ x->plane[plane].eobs[block] = 0;
+
+ best_rd_stats->dist = block_sse << 4;
+ best_rd_stats->sse = best_rd_stats->dist;
+
+ ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+ av1_get_entropy_contexts(plane_bsize, &xd->plane[plane], ctxa, ctxl);
+ ENTROPY_CONTEXT *ta = ctxa;
+ ENTROPY_CONTEXT *tl = ctxl;
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ TXB_CTX txb_ctx_tmp;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx_tmp);
+ const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][plane_type]
+ .txb_skip_cost[txb_ctx_tmp.txb_skip_ctx][1];
+ best_rd_stats->rate =
+ zero_blk_rate *
+ (block_size_wide[plane_bsize] >> tx_size_wide_log2[tx_size]) *
+ (block_size_high[plane_bsize] >> tx_size_high_log2[tx_size]);
+
+ best_rd_stats->rdcost =
+ RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->sse);
+
+ x->plane[plane].txb_entropy_ctx[block] = 0;
+ return;
+ } else if (block_var < var_threshold) {
+ // Predict DC only blocks based on residual variance.
+ // For chroma plane, this early prediction is disabled for intra blocks.
+ if ((plane == 0) || (plane > 0 && is_inter_block(mbmi))) dc_only_blk = 1;
+ }
+ } else {
+ block_sse = pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
+ txsize_to_bsize[tx_size], &block_mse_q8);
+ assert(block_mse_q8 != UINT_MAX);
+ }
+
+ // Bit mask to indicate which transform types are allowed in the RD search.
+ uint16_t tx_mask;
+
+ // Use DCT_DCT transform for DC only block.
+ if (dc_only_blk)
+ tx_mask = 1 << DCT_DCT;
+ else
+ tx_mask = get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, txb_ctx, ftxs_mode, ref_best_rd,
+ &txk_allowed, txk_map);
+ const uint16_t allowed_tx_mask = tx_mask;
+
if (is_cur_buf_hbd(xd)) {
block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2);
}
block_sse *= 16;
- const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
- const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
// Use mse / qstep^2 based threshold logic to take decision of R-D
// optimization of coeffs. For smaller residuals, coeff optimization
// would be helpful. For larger residuals, R-D optimization may not be
@@ -2241,7 +2340,9 @@
// Any 64-pt transforms only preserves half the coefficients.
// Therefore transform domain distortion is not valid for these
// transform sizes.
- txsize_sqr_up_map[tx_size] != TX_64X64;
+ (txsize_sqr_up_map[tx_size] != TX_64X64) &&
+ // Use pixel domain distortion for DC only blocks
+ !dc_only_blk;
// Flag to indicate if an extra calculation of distortion in the pixel domain
// should be performed at the end, after the best transform type has been
// decided.
@@ -2277,11 +2378,15 @@
RD_STATS this_rd_stats;
av1_invalid_rd_stats(&this_rd_stats);
- av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param);
+ if (!dc_only_blk)
+ av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param);
+ else
+ av1_xform_dc_only(x, plane, block, &txfm_param, per_px_mean);
skip_trellis_based_on_satd[tx_type] = skip_trellis_opt_based_on_satd(
x, &quant_param, plane, block, tx_size, cpi->oxcf.q_cfg.quant_b_adapt,
- qstep, txfm_params->coeff_opt_satd_threshold, skip_trellis);
+ qstep, txfm_params->coeff_opt_satd_threshold, skip_trellis,
+ dc_only_blk);
av1_quant(x, plane, block, &txfm_param, &quant_param);
@@ -2302,6 +2407,10 @@
if (eobs_ptr[block] == 0) {
// When eob is 0, pixel domain distortion is more efficient and accurate.
this_rd_stats.dist = this_rd_stats.sse = block_sse;
+ } else if (dc_only_blk) {
+ this_rd_stats.sse = block_sse;
+ this_rd_stats.dist = dist_block_px_domain(
+ cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
} else if (use_transform_domain_distortion) {
dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
&this_rd_stats.sse);
diff --git a/test/horz_superres_test.cc b/test/horz_superres_test.cc
index 4b96d44..17a0ae6 100644
--- a/test/horz_superres_test.cc
+++ b/test/horz_superres_test.cc
@@ -54,7 +54,7 @@
{ "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 12.5 },
#if CONFIG_AV1_HIGHBITDEPTH
{ "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0,
- 27.84 },
+ 27.74 },
#endif
{ "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1, 20.0 },
// Image coding (single frame).