Improve cpu-used 8 in all-intra mode

This change splits the `discount_color_cost` RDO decision into its own
speed feature. `discount_color_cost` has been found to cause excessive
file size inflation on natural content that benefits from palette mode,
resulting in a net BD-rate loss (mainly in the low to medium-high
quality range).

Disable `discount_color_cost` for all-intra mode, and keep the
current behavior for realtime mode.

Additionally, tweak cpu-used 8 speed features for a better
speed/quality trade-off:
- hybrid_intra_pickmode 1 -> 2
- prune_palette_search_nonrd 0 -> 1
These new settings help compensate for the speed loss from disabling
`discount_color_cost`, at the cost of only a minimal BD-rate loss.

Approximate BD-rate changes at cpu-used=8 on Daala's subset1, relative
to the previous configuration (`discount_color_cost` enabled and the
original speed feature settings); negative values are gains:
- SSIMULACRA2 60: -4.73%
- SSIMULACRA2 70: -3.34%
- SSIMULACRA2 80: -1.52%
- SSIMULACRA2 90: +0.04%

Before and after encoding example (subset1: Lufthansa - QP 20):

| cpu-used   | Size (bytes) | Time (ms) |
|------------|--------------|-----------|
| 7          | 233238       | 362       |
| 8 (before) | 359941       | 296       |
| 8 (after)  | 231500       | 244       |
| 9          | 250258       | 39        |

Bug: aomedia:421196988
Change-Id: I1ca4482e10089f5eab6134ab9b2fe705bfb8f712
diff --git a/av1/encoder/palette.c b/av1/encoder/palette.c
index 838bf90..79a2fa5 100644
--- a/av1/encoder/palette.c
+++ b/av1/encoder/palette.c
@@ -561,7 +561,7 @@
   const SequenceHeader *const seq_params = cpi->common.seq_params;
   const int is_hbd = seq_params->use_highbitdepth;
   const int bit_depth = seq_params->bit_depth;
-  const int discount_color_cost = cpi->sf.rt_sf.use_nonrd_pick_mode;
+  const int discount_color_cost = cpi->sf.rt_sf.discount_color_cost;
   int unused;
 
   int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index bfee784..609bb19 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -358,6 +358,7 @@
   sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1;
 
   sf->rt_sf.use_nonrd_pick_mode = 0;
+  sf->rt_sf.discount_color_cost = 0;
   sf->rt_sf.use_real_time_ref_set = 0;
 
   if (cpi->twopass_frame.fr_content_type == FC_GRAPHICS_ANIMATION ||
@@ -554,10 +555,11 @@
   }
 
   if (speed >= 8) {
-    sf->rt_sf.hybrid_intra_pickmode = 1;
+    sf->rt_sf.hybrid_intra_pickmode = 2;
     sf->rt_sf.use_nonrd_pick_mode = 1;
     sf->rt_sf.nonrd_check_partition_merge_mode = 1;
     sf->rt_sf.var_part_split_threshold_shift = 8;
+    sf->rt_sf.prune_palette_search_nonrd = 1;
     // Set mask for intra modes.
     for (int i = 0; i < BLOCK_SIZES; ++i)
       if (i >= BLOCK_32X32)
@@ -1061,6 +1063,7 @@
   sf->tpl_sf.search_method = NSTEP_8PT;
 
   sf->rt_sf.use_nonrd_pick_mode = 0;
+  sf->rt_sf.discount_color_cost = 0;
   sf->rt_sf.use_real_time_ref_set = 0;
 
   if (cpi->twopass_frame.fr_content_type == FC_GRAPHICS_ANIMATION ||
@@ -2001,6 +2004,7 @@
     sf->rt_sf.use_nonrd_altref_frame =
         (cpi->svc.number_spatial_layers > 1) ? 0 : 1;
     sf->rt_sf.use_nonrd_pick_mode = 1;
+    sf->rt_sf.discount_color_cost = 1;
     sf->rt_sf.nonrd_check_partition_merge_mode = 3;
     sf->rt_sf.skip_intra_pred = 1;
     sf->rt_sf.source_metrics_sb_nonrd = 1;
@@ -2369,6 +2373,7 @@
   rt_sf->mode_search_skip_flags = 0;
   rt_sf->nonrd_prune_ref_frame_search = 0;
   rt_sf->use_nonrd_pick_mode = 0;
+  rt_sf->discount_color_cost = 0;
   rt_sf->use_nonrd_altref_frame = 0;
   rt_sf->use_comp_ref_nonrd = 0;
   rt_sf->use_real_time_ref_set = 0;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 1b455d4..1c1831a 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -1633,6 +1633,11 @@
   // This flag controls the use of non-RD mode decision.
   int use_nonrd_pick_mode;
 
+  // Controls discounting of the color map cost during palette search. This
+  // saves about 5% of CPU and, at non-RD speeds, delivers better results
+  // across the rtc_screen set (on speed 10 overall BDRate growth is 13%).
+  int discount_color_cost;
+
   // Use ALTREF frame in non-RD mode decision.
   int use_nonrd_altref_frame;