Add a speed feature that prunes tx types based on estimated RD cost.
The speed feature is turned on for speed 4 and 5.
At speed 4, the encoder is ~2% slower, with a BD-rate gain of ~0.2% in
overall PSNR.
BD-rate on cpu-used=4 (%):
          avgPSNR  overPSNR    ssim    VMAF
lowres     -0.202    -0.212  -0.190  -0.535
midres     -0.162    -0.181  -0.161  -0.437
ugc360p    -0.249    -0.225  -0.136  -0.329
Also make a few implicit integer conversions explicit.
STATS_CHANGED
Change-Id: I2964dfb3cabe6f0090418f18cc9a46dd4dd578bf
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index b683e9e..9ef22d8 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -2353,6 +2353,10 @@
}
#endif // CONFIG_COLLECT_RD_STATS >= 2
#endif // CONFIG_COLLECT_RD_STATS
+
+// Pruning thresholds for prune_txk_type and prune_txk_type_separ, indexed by prune mode.
+static const int prune_factors[5] = { 200, 200, 120, 80, 40 }; // scale 1000
+static const int mul_factors[5] = { 80, 80, 70, 50, 30 }; // scale 100
// R-D costs are sorted in ascending order.
static INLINE void sort_rd(int64_t rds[], int txk[], int len) {
int i, j, k;
@@ -2422,10 +2426,10 @@
num_cand++;
}
- if (num_cand == 0) return 0xFFFF;
+ if (num_cand == 0) return (uint16_t)0xFFFF;
sort_rd(rds, txk_map, num_cand);
- uint16_t prune = ~(1 << txk_map[0]);
+ uint16_t prune = (uint16_t)(~(1 << txk_map[0]));
// 0 < prune_factor <= 1000 controls aggressiveness
int64_t factor = 0;
@@ -2439,13 +2443,13 @@
return prune;
}
-int16_t prune_txk_type_separ(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
- int block, TX_SIZE tx_size, int blk_row,
- int blk_col, BLOCK_SIZE plane_bsize, int *txk_map,
- int16_t allowed_tx_mask, int prune_factor,
- const TXB_CTX *const txb_ctx,
- int reduced_tx_set_used, int64_t ref_best_rd,
- int num_sel) {
+uint16_t prune_txk_type_separ(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize, int *txk_map,
+ int16_t allowed_tx_mask, int prune_factor,
+ const TXB_CTX *const txb_ctx,
+ int reduced_tx_set_used, int64_t ref_best_rd,
+ int num_sel) {
const AV1_COMMON *cm = &cpi->common;
int idx;
@@ -2506,7 +2510,7 @@
if (rds_h[idx] > rds_h[0] * 1.2) skip_h[idx_h[idx]] = 1;
}
- if (skip_h[idx_h[0]]) return 0xFFFF;
+ if (skip_h[idx_h[0]]) return (uint16_t)0xFFFF;
// evaluate vertical with the best horizontal chosen
rds_v[0] = rds_h[0];
@@ -2557,7 +2561,7 @@
}
sort_rd(rds, txk_map, num_cand);
- uint16_t prune = ~(1 << txk_map[0]);
+ uint16_t prune = (uint16_t)(~(1 << txk_map[0]));
num_sel = AOMMIN(num_sel, num_cand);
for (int i = 1; i < num_sel; i++) {
@@ -2744,18 +2748,37 @@
}
allowed_tx_mask &= (~prune);
}
-
for (i = 0; i < TX_TYPES; i++) {
if (allowed_tx_mask & (1 << i)) num_allowed++;
}
assert(num_allowed > 0);
- int allowed_tx_count = (x->prune_mode == PRUNE_2D_AGGRESSIVE) ? 1 : 5;
- // !fast_tx_search && txk_end != txk_start && plane == 0
- if (x->prune_mode >= PRUNE_2D_ACCURATE && is_inter &&
- num_allowed > allowed_tx_count) {
- prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type,
- x->prune_mode, txk_map, &allowed_tx_mask);
+ if (num_allowed > 2 && cpi->sf.tx_sf.tx_type_search.prune_tx_type_est_rd) {
+ int pf = prune_factors[x->prune_mode];
+ int mf = mul_factors[x->prune_mode];
+ if (num_allowed <= 7) {
+ const uint16_t prune = prune_txk_type(
+ cpi, x, plane, block, tx_size, blk_row, blk_col, plane_bsize,
+ txk_map, allowed_tx_mask, pf, txb_ctx, cm->reduced_tx_set_used);
+ allowed_tx_mask &= (~prune);
+ } else {
+ const int num_sel = (num_allowed * mf + 50) / 100;
+ const uint16_t prune = prune_txk_type_separ(
+ cpi, x, plane, block, tx_size, blk_row, blk_col, plane_bsize,
+ txk_map, allowed_tx_mask, pf, txb_ctx, cm->reduced_tx_set_used,
+ ref_best_rd, num_sel);
+
+ allowed_tx_mask &= (~prune);
+ }
+ } else {
+ assert(num_allowed > 0);
+ int allowed_tx_count = (x->prune_mode == PRUNE_2D_AGGRESSIVE) ? 1 : 5;
+ // !fast_tx_search && txk_end != txk_start && plane == 0
+ if (x->prune_mode >= PRUNE_2D_ACCURATE && is_inter &&
+ num_allowed > allowed_tx_count) {
+ prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type,
+ x->prune_mode, txk_map, &allowed_tx_mask);
+ }
}
}
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 57f56ac..c99c727 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -494,6 +494,7 @@
sf->tx_sf.tx_type_search.enable_winner_mode_tx_type_pruning = 1;
sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_MORE;
+ sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1;
// TODO(any): Experiment with enabling of this speed feature as hash state
// is reset during winner mode processing
sf->tx_sf.use_intra_txb_hash = 0;
@@ -987,6 +988,7 @@
tx_sf->tx_type_search.fast_inter_tx_type_search = 0;
tx_sf->tx_type_search.skip_tx_search = 0;
tx_sf->tx_type_search.prune_tx_type_using_stats = 0;
+ tx_sf->tx_type_search.prune_tx_type_est_rd = 0;
tx_sf->tx_type_search.enable_winner_mode_tx_type_pruning = 0;
tx_sf->txb_split_cap = 1;
tx_sf->adaptive_txb_search_level = 0;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index bb6784f..e2c7d5e 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -227,6 +227,8 @@
// Prune tx type search using previous frame stats.
int prune_tx_type_using_stats;
+ // Prune tx type search using estimated RD cost.
+ int prune_tx_type_est_rd;
// Flag used to control the winner mode processing for tx type pruning for
// inter blocks. It enables further tx type mode pruning based on ML model for