Extend prune_luma_palette_size_search_level sf to speed >= 3

Extended the speed feature prune_luma_palette_size_search_level to
speed >= 3 with an aggressive header rdcost based breakout.

For allintra video encode (on screen content set),

          Instruction Count       BD-Rate Loss(%)
cpu-used     Reduction(%)   avg.psnr  ovr.psnr  ssim
   3           2.305        0.0302    0.0313    0.0800
   4           1.842       -0.0287   -0.0283   -0.3707
   5           2.687        0.0155    0.0160    0.0132
   6           2.042        0.0071    0.0076    0.0097
   7           2.050        0.0211    0.0210    0.1402
   8           1.442       -0.0015   -0.0007    0.0010

For AVIF still image encode,

          Instruction Count    BD-Rate Loss(%)
cpu-used     Reduction(%)      psnr       ssim
   3            0.933          0.0004    -0.0008
   4            0.886          0.0009     0.0011
   5            1.031          0.0004     0.0028
   6            1.068          0.0002     0.0000
   7            1.196          0.0000     0.0000
   8            0.180          0.0000     0.0000

BUG=aomedia:3096

STATS_CHANGED

Change-Id: Ib0172c97942733fb94a75c9bd7f3c6e845e08dd4
diff --git a/av1/encoder/palette.c b/av1/encoder/palette.c
index a76f4ef..7b653b2 100644
--- a/av1/encoder/palette.c
+++ b/av1/encoder/palette.c
@@ -329,7 +329,7 @@
     MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
     int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
     int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
-    uint8_t *tx_type_map) {
+    uint8_t *tx_type_map, bool *is_header_rd_based_term) {
   int centroids[PALETTE_MAX_SIZE];
   int n = start_n;
   int top_color_winner = end_n;
@@ -348,7 +348,10 @@
                  tx_type_map, &beat_best_palette_rd,
                  &do_header_rd_based_breakout);
     *last_n_searched = n;
-    if (do_header_rd_based_breakout) break;
+    if (do_header_rd_based_breakout) {
+      if (is_header_rd_based_term != NULL) *is_header_rd_based_term = true;
+      break;
+    }
     if (beat_best_palette_rd) {
       top_color_winner = n;
     } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) {
@@ -373,7 +376,7 @@
     int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
     int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
     uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
-    int data_points) {
+    int data_points, bool *is_header_rd_based_term) {
   int centroids[PALETTE_MAX_SIZE];
   const int max_itr = 50;
   int n = start_n;
@@ -397,7 +400,10 @@
                  tx_type_map, &beat_best_palette_rd,
                  &do_header_rd_based_breakout);
     *last_n_searched = n;
-    if (do_header_rd_based_breakout) break;
+    if (do_header_rd_based_breakout) {
+      if (is_header_rd_based_term != NULL) *is_header_rd_based_term = true;
+      break;
+    }
     if (beat_best_palette_rd) {
       top_color_winner = n;
     } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) {
@@ -522,6 +528,19 @@
       count_buf[top_colors[i]] = 0;
     }
 
+    // The following are the approaches used for header rdcost based gating
+    // for early termination for different values of prune_palette_search_level.
+    // 0: Pruning based on header rdcost for ascending order palette_size
+    // search.
+    // 1: When colors > PALETTE_MIN_SIZE, enabled only for coarse palette_size
+    // search and for finer search do_header_rd_based_gating parameter is
+    // explicitly passed as 'false'.
+    // 2: Enabled only for ascending order palette_size search and for
+    // descending order search do_header_rd_based_gating parameter is explicitly
+    // passed as 'false'.
+    const bool do_header_rd_based_gating =
+        cpi->sf.intra_sf.prune_luma_palette_size_search_level != 0;
+
     // TODO(huisu@google.com): Try to avoid duplicate computation in cases
     // where the dominant colors and the k-means results are similar.
     if ((cpi->sf.intra_sf.prune_palette_search_level == 1) &&
@@ -555,19 +574,13 @@
       const int min_n = start_n_lookup_table[max_n];
       const int step_size = step_size_lookup_table[max_n];
       assert(min_n >= PALETTE_MIN_SIZE);
-      // Header rdcost based gating for early termination is currently enabled
-      // only for coarse palette size search when prune_palette_search_level
-      // is 1 and colors > PALETTE_MIN_SIZE. For finer search, the
-      // do_header_rd_based_gating parameter is explicitly passed as 'false'.
-      const bool do_header_rd_based_gating =
-          cpi->sf.intra_sf.prune_luma_palette_size_search_level != 0;
-
       // Perform top color coarse palette search to find the winner candidate
       const int top_color_winner = perform_top_color_palette_search(
           cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1,
           step_size, do_header_rd_based_gating, &unused, color_cache, n_cache,
           best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
-          distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map);
+          distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+          NULL);
       // Evaluate neighbors for the winner color (if winner is found) in the
       // above coarse search for dominant colors
       if (top_color_winner <= max_n) {
@@ -581,7 +594,7 @@
             /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache,
             best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
             distortion, skippable, beat_best_rd, ctx, best_blk_skip,
-            tx_type_map);
+            tx_type_map, NULL);
       }
       // K-means clustering.
       // Perform k-means coarse palette search to find the winner candidate
@@ -590,7 +603,7 @@
           min_n, max_n + 1, step_size, do_header_rd_based_gating, &unused,
           color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
           rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
-          best_blk_skip, tx_type_map, color_map, rows * cols);
+          best_blk_skip, tx_type_map, color_map, rows * cols, NULL);
       // Evaluate neighbors for the winner color (if winner is found) in the
       // above coarse search for k-means
       if (k_means_winner <= max_n) {
@@ -604,21 +617,18 @@
             /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache,
             best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
             distortion, skippable, beat_best_rd, ctx, best_blk_skip,
-            tx_type_map, color_map, rows * cols);
+            tx_type_map, color_map, rows * cols, NULL);
       }
     } else if (cpi->sf.intra_sf.prune_palette_search_level == 0) {
       const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE),
                 min_n = PALETTE_MIN_SIZE;
-      // Perform gating based on header rdcost for
-      // prune_palette_search_level == 0.
-      const bool do_header_rd_based_gating =
-          cpi->sf.intra_sf.prune_luma_palette_size_search_level != 0;
       // Perform top color palette search in ascending order.
       perform_top_color_palette_search(
           cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1,
           1, do_header_rd_based_gating, &unused, color_cache, n_cache,
           best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
-          distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map);
+          distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+          NULL);
       // K-means clustering.
       if (colors == PALETTE_MIN_SIZE) {
         // Special case: These colors automatically become the centroids.
@@ -637,27 +647,29 @@
             min_n, max_n + 1, 1, do_header_rd_based_gating, &unused,
             color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
             rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
-            best_blk_skip, tx_type_map, color_map, rows * cols);
+            best_blk_skip, tx_type_map, color_map, rows * cols, NULL);
       }
     } else {
       const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE),
                 min_n = PALETTE_MIN_SIZE;
       // Perform top color palette search in ascending order
       int last_n_searched = min_n;
+      // Flag to terminate further palette_size search based on header rdcost.
+      bool is_header_rd_based_term = false;
       perform_top_color_palette_search(
           cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1,
-          1, /*do_header_rd_based_gating=*/false, &last_n_searched, color_cache,
-          n_cache, best_mbmi, best_palette_color_map, best_rd, rate,
-          rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
-          best_blk_skip, tx_type_map);
-      if (last_n_searched < max_n) {
+          1, do_header_rd_based_gating, &last_n_searched, color_cache, n_cache,
+          best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
+          distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+          &is_header_rd_based_term);
+      if (last_n_searched < max_n && !is_header_rd_based_term) {
         // Search in descending order until we get to the previous best
         perform_top_color_palette_search(
             cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, max_n,
             last_n_searched, -1, /*do_header_rd_based_gating=*/false, &unused,
             color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
             rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
-            best_blk_skip, tx_type_map);
+            best_blk_skip, tx_type_map, NULL);
       }
       // K-means clustering.
       if (colors == PALETTE_MIN_SIZE) {
@@ -673,14 +685,15 @@
       } else {
         // Perform k-means palette search in ascending order
         last_n_searched = min_n;
+        is_header_rd_based_term = false;
         perform_k_means_palette_search(
             cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
-            min_n, max_n + 1, 1, /*do_header_rd_based_gating=*/false,
-            &last_n_searched, color_cache, n_cache, best_mbmi,
-            best_palette_color_map, best_rd, rate, rate_tokenonly, distortion,
-            skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map,
-            rows * cols);
-        if (last_n_searched < max_n) {
+            min_n, max_n + 1, 1, do_header_rd_based_gating, &last_n_searched,
+            color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
+            rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+            best_blk_skip, tx_type_map, color_map, rows * cols,
+            &is_header_rd_based_term);
+        if (last_n_searched < max_n && !is_header_rd_based_term) {
           // Search in descending order until we get to the previous best
           perform_k_means_palette_search(
               cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
@@ -688,7 +701,7 @@
               &unused, color_cache, n_cache, best_mbmi, best_palette_color_map,
               best_rd, rate, rate_tokenonly, distortion, skippable,
               beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map,
-              rows * cols);
+              rows * cols, NULL);
         }
       }
     }
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 84ef1fc..e30b459 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -953,11 +953,14 @@
   // level 2.
   // 2: Terminate early for higher luma palette_size, if header rd cost of lower
   // palette_size is more than best_rd.
-  // For allintra encode, this sf reduces instruction count by 2.49%, 1.07%
-  // and 2.76% for speed 0, 1 and 2 on screen content set with coding
-  // performance change less than 0.01%. For AVIF image encode, this sf reduces
-  // instruction count by 1.94%, 1.13% and 1.29% for speed 0, 1 and 2 on a
-  // typical image dataset with coding performance change less than 0.01%.
+  // For allintra encode, this sf reduces instruction count by 2.49%, 1.07%,
+  // 2.76%, 2.30%, 1.84%, 2.69%, 2.04%, 2.05% and 1.44% for speed 0, 1, 2, 3, 4,
+  // 5, 6, 7 and 8 on screen content set with coding performance change less
+  // than 0.01% for speed <= 2 and less than 0.03% for speed >= 3. For AVIF
+  // image encode, this sf reduces instruction count by 1.94%, 1.13%, 1.29%,
+  // 0.93%, 0.89%, 1.03%, 1.07%, 1.20% and 0.18% for speed 0, 1, 2, 3, 4, 5, 6,
+  // 7 and 8 on a typical image dataset with coding performance change less than
+  // 0.01%.
   int prune_luma_palette_size_search_level;
 
   // Prune chroma intra modes based on luma intra mode winner.