Extend early_term_luma_palette_size_search sf to speed 0
Extended the speed feature early_term_luma_palette_size_search to
speed 0, using a less aggressive header-rdcost-based breakout
condition on the speed 0 path. The existing sf=1 logic is now moved
to the sf=2 path. Also, the speed feature is renamed to
prune_luma_palette_size_search_level to indicate the level of
pruning aggressiveness.
For allintra video encode (on screen content set),
            Instruction Count          BD-Rate Loss(%)
cpu-used    Reduction(%)        avg.psnr    ovr.psnr    ssim
   0        2.490               0.0104      0.0095      0.0084
For AVIF still image encode,
            Instruction Count          BD-Rate Loss(%)
cpu-used    Reduction(%)        psnr        ssim
   0        1.937               0.0000      0.0002
BUG=aomedia:3096
STATS_CHANGED
Change-Id: Ib5e65555e53d2dfb7790953a2f0b0f00052101b1
diff --git a/av1/encoder/palette.c b/av1/encoder/palette.c
index af8a255..a76f4ef 100644
--- a/av1/encoder/palette.c
+++ b/av1/encoder/palette.c
@@ -261,9 +261,14 @@
const int palette_mode_rate =
intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost);
const int64_t header_rd = RDCOST(x->rdmult, palette_mode_rate, 0);
+ // Less aggressive pruning when prune_luma_palette_size_search_level == 1.
+ const int header_rd_shift =
+ (cpi->sf.intra_sf.prune_luma_palette_size_search_level == 1) ? 1 : 0;
// Terminate further palette_size search, if the header cost corresponding
- // to lower palette_size is more than best_rd.
- if (header_rd > *best_rd) {
+ // to lower palette_size is more than *best_rd << header_rd_shift. This
+ // logic is implemented with a right shift in the LHS to prevent a possible
+ // overflow with the left shift in RHS.
+ if ((header_rd >> header_rd_shift) > *best_rd) {
*do_header_rd_based_breakout = true;
return;
}
@@ -550,11 +555,12 @@
const int min_n = start_n_lookup_table[max_n];
const int step_size = step_size_lookup_table[max_n];
assert(min_n >= PALETTE_MIN_SIZE);
- // Header rdcost based early gating is currently enabled only for coarse
- // palette size search. For all other cases, the do_header_rd_based_gating
- // is explicitly passed as 'false'.
+ // Header rdcost based gating for early termination is currently enabled
+ // only for coarse palette size search when prune_palette_search_level
+ // is 1 and colors > PALETTE_MIN_SIZE. For finer search, the
+ // do_header_rd_based_gating parameter is explicitly passed as 'false'.
const bool do_header_rd_based_gating =
- cpi->sf.intra_sf.early_term_luma_palette_size_search != 0;
+ cpi->sf.intra_sf.prune_luma_palette_size_search_level != 0;
// Perform top color coarse palette search to find the winner candidate
const int top_color_winner = perform_top_color_palette_search(
@@ -603,10 +609,14 @@
} else if (cpi->sf.intra_sf.prune_palette_search_level == 0) {
const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE),
min_n = PALETTE_MIN_SIZE;
+ // Perform gating based on header rdcost for
+ // prune_palette_search_level == 0.
+ const bool do_header_rd_based_gating =
+ cpi->sf.intra_sf.prune_luma_palette_size_search_level != 0;
// Perform top color palette search in ascending order.
perform_top_color_palette_search(
cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1,
- 1, /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache,
+ 1, do_header_rd_based_gating, &unused, color_cache, n_cache,
best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map);
// K-means clustering.
@@ -624,7 +634,7 @@
// Perform k-means palette search in ascending order.
perform_k_means_palette_search(
cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
- min_n, max_n + 1, 1, /*do_header_rd_based_gating=*/false, &unused,
+ min_n, max_n + 1, 1, do_header_rd_based_gating, &unused,
color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
best_blk_skip, tx_type_map, color_map, rows * cols);
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 6f662b3..f4e05d3 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -311,6 +311,7 @@
sf->part_sf.use_best_rd_for_pruning = 1;
sf->intra_sf.intra_pruning_with_hog = 1;
+ sf->intra_sf.prune_luma_palette_size_search_level = 1;
sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF;
sf->intra_sf.early_term_chroma_palette_size_search = 1;
@@ -346,7 +347,7 @@
sf->mv_sf.exhaustive_searches_thresh <<= 1;
sf->intra_sf.prune_palette_search_level = 1;
- sf->intra_sf.early_term_luma_palette_size_search = 1;
+ sf->intra_sf.prune_luma_palette_size_search_level = 2;
sf->intra_sf.top_intra_model_count_allowed = 3;
sf->tx_sf.adaptive_txb_search_level = 2;
@@ -1651,7 +1652,7 @@
intra_sf->intra_pruning_with_hog = 0;
intra_sf->chroma_intra_pruning_with_hog = 0;
intra_sf->prune_palette_search_level = 0;
- intra_sf->early_term_luma_palette_size_search = 0;
+ intra_sf->prune_luma_palette_size_search_level = 0;
for (int i = 0; i < TX_SIZES; i++) {
intra_sf->intra_y_mode_mask[i] = INTRA_ALL;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 7a6c26a..84ef1fc 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -943,16 +943,22 @@
// palette colors is not the winner.
int prune_palette_search_level;
- // Terminate early in luma palette_size search.
+ // Terminate early in luma palette_size search. Speed feature values indicate
+ // increasing level of pruning.
// 0: No early termination
// 1: Terminate early for higher luma palette_size, if header rd cost of lower
+ // palette_size is more than 2 * best_rd. This level of pruning is more
+ // conservative than sf level 2, as the cases which will get pruned
+ // with sf level 1 are a subset of the cases which will get pruned with sf
+ // level 2.
+ // 2: Terminate early for higher luma palette_size, if header rd cost of lower
// palette_size is more than best_rd.
- // For allintra encode, this sf reduces instruction count by 1.07% and 2.76%
- // for speed 1 and 2 on screen content set with coding performance change less
- // than 0.01%. For AVIF image encode, this sf reduces instruction count
- // by 1.13% and 1.29% for speed 1 and 2 on a typical image dataset with coding
- // performance change less than 0.01%.
- int early_term_luma_palette_size_search;
+ // For allintra encode, this sf reduces instruction count by 2.49%, 1.07%
+ // and 2.76% for speed 0, 1 and 2 on screen content set with coding
+ // performance change less than 0.01%. For AVIF image encode, this sf reduces
+ // instruction count by 1.94%, 1.13% and 1.29% for speed 0, 1 and 2 on a
+ // typical image dataset with coding performance change less than 0.01%.
+ int prune_luma_palette_size_search_level;
// Prune chroma intra modes based on luma intra mode winner.
// 0: No pruning