Extend early_term_luma_palette_size_search sf to speed 0

Extend the speed feature early_term_luma_palette_size_search to
speed 0, using a less aggressive header rdcost based breakout
condition on the speed 0 path. The existing sf=1 logic is moved
under the sf=2 path. The speed feature is also renamed to
prune_luma_palette_size_search_level to indicate the level of
pruning aggressiveness.

For allintra video encode (on screen content set),

          Instruction Count        BD-Rate Loss(%)
cpu-used     Reduction(%)   avg.psnr  ovr.psnr    ssim
   0           2.490        0.0104    0.0095      0.0084

For AVIF still image encode,

          Instruction Count    BD-Rate Loss(%)
cpu-used     Reduction(%)      psnr       ssim
   0           1.937           0.0000     0.0002

BUG=aomedia:3096

STATS_CHANGED

Change-Id: Ib5e65555e53d2dfb7790953a2f0b0f00052101b1
diff --git a/av1/encoder/palette.c b/av1/encoder/palette.c
index af8a255..a76f4ef 100644
--- a/av1/encoder/palette.c
+++ b/av1/encoder/palette.c
@@ -261,9 +261,14 @@
     const int palette_mode_rate =
         intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost);
     const int64_t header_rd = RDCOST(x->rdmult, palette_mode_rate, 0);
+    // Less aggressive pruning when prune_luma_palette_size_search_level == 1.
+    const int header_rd_shift =
+        (cpi->sf.intra_sf.prune_luma_palette_size_search_level == 1) ? 1 : 0;
     // Terminate further palette_size search, if the header cost corresponding
-    // to lower palette_size is more than best_rd.
-    if (header_rd > *best_rd) {
+    // to lower palette_size is more than *best_rd << header_rd_shift. This
+    // logic is implemented with a right shift in the LHS to prevent a possible
+    // overflow with the left shift in RHS.
+    if ((header_rd >> header_rd_shift) > *best_rd) {
       *do_header_rd_based_breakout = true;
       return;
     }
@@ -550,11 +555,12 @@
       const int min_n = start_n_lookup_table[max_n];
       const int step_size = step_size_lookup_table[max_n];
       assert(min_n >= PALETTE_MIN_SIZE);
-      // Header rdcost based early gating is currently enabled only for coarse
-      // palette size search. For all other cases, the do_header_rd_based_gating
-      // is explicitly passed as 'false'.
+      // Header rdcost based gating for early termination is currently enabled
+      // only for coarse palette size search when prune_palette_search_level
+      // is 1 and colors > PALETTE_MIN_SIZE. For finer search, the
+      // do_header_rd_based_gating parameter is explicitly passed as 'false'.
       const bool do_header_rd_based_gating =
-          cpi->sf.intra_sf.early_term_luma_palette_size_search != 0;
+          cpi->sf.intra_sf.prune_luma_palette_size_search_level != 0;
 
       // Perform top color coarse palette search to find the winner candidate
       const int top_color_winner = perform_top_color_palette_search(
@@ -603,10 +609,14 @@
     } else if (cpi->sf.intra_sf.prune_palette_search_level == 0) {
       const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE),
                 min_n = PALETTE_MIN_SIZE;
+      // Perform gating based on header rdcost for
+      // prune_palette_search_level == 0.
+      const bool do_header_rd_based_gating =
+          cpi->sf.intra_sf.prune_luma_palette_size_search_level != 0;
       // Perform top color palette search in ascending order.
       perform_top_color_palette_search(
           cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1,
-          1, /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache,
+          1, do_header_rd_based_gating, &unused, color_cache, n_cache,
           best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
           distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map);
       // K-means clustering.
@@ -624,7 +634,7 @@
         // Perform k-means palette search in ascending order.
         perform_k_means_palette_search(
             cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
-            min_n, max_n + 1, 1, /*do_header_rd_based_gating=*/false, &unused,
+            min_n, max_n + 1, 1, do_header_rd_based_gating, &unused,
             color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
             rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
             best_blk_skip, tx_type_map, color_map, rows * cols);
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 6f662b3..f4e05d3 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -311,6 +311,7 @@
   sf->part_sf.use_best_rd_for_pruning = 1;
 
   sf->intra_sf.intra_pruning_with_hog = 1;
+  sf->intra_sf.prune_luma_palette_size_search_level = 1;
   sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF;
   sf->intra_sf.early_term_chroma_palette_size_search = 1;
 
@@ -346,7 +347,7 @@
     sf->mv_sf.exhaustive_searches_thresh <<= 1;
 
     sf->intra_sf.prune_palette_search_level = 1;
-    sf->intra_sf.early_term_luma_palette_size_search = 1;
+    sf->intra_sf.prune_luma_palette_size_search_level = 2;
     sf->intra_sf.top_intra_model_count_allowed = 3;
 
     sf->tx_sf.adaptive_txb_search_level = 2;
@@ -1651,7 +1652,7 @@
   intra_sf->intra_pruning_with_hog = 0;
   intra_sf->chroma_intra_pruning_with_hog = 0;
   intra_sf->prune_palette_search_level = 0;
-  intra_sf->early_term_luma_palette_size_search = 0;
+  intra_sf->prune_luma_palette_size_search_level = 0;
 
   for (int i = 0; i < TX_SIZES; i++) {
     intra_sf->intra_y_mode_mask[i] = INTRA_ALL;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 7a6c26a..84ef1fc 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -943,16 +943,22 @@
   // palette colors is not the winner.
   int prune_palette_search_level;
 
-  // Terminate early in luma palette_size search.
+  // Terminate early in luma palette_size search. Speed feature values indicate
+  // increasing level of pruning.
   // 0: No early termination
   // 1: Terminate early for higher luma palette_size, if header rd cost of lower
+  // palette_size is more than 2 * best_rd. This level of pruning is more
+  // conservative when compared to sf level 2 as the cases which will get pruned
+  // with sf level 1 are a subset of the cases which will get pruned with sf
+  // level 2.
+  // 2: Terminate early for higher luma palette_size, if header rd cost of lower
   // palette_size is more than best_rd.
-  // For allintra encode, this sf reduces instruction count by 1.07% and 2.76%
-  // for speed 1 and 2 on screen content set with coding performance change less
-  // than 0.01%. For AVIF image encode, this sf reduces instruction count
-  // by 1.13% and 1.29% for speed 1 and 2 on a typical image dataset with coding
-  // performance change less than 0.01%.
-  int early_term_luma_palette_size_search;
+  // For allintra encode, this sf reduces instruction count by 2.49%, 1.07%
+  // and 2.76% for speed 0, 1 and 2 on screen content set with coding
+  // performance change less than 0.01%. For AVIF image encode, this sf reduces
+  // instruction count by 1.94%, 1.13% and 1.29% for speed 0, 1 and 2 on a
+  // typical image dataset with coding performance change less than 0.01%.
+  int prune_luma_palette_size_search_level;
 
   // Prune chroma intra modes based on luma intra mode winner.
   // 0: No pruning