Refactor intra_mode_search files

This CL factors palette mode functions out of intra_mode_search.c and
creates intra_mode_search_utils.h for some common rdcost estimation
functions.

BUG=aomedia:2712

Change-Id: I065cc811c6b3979569bda726cf3f233061255c5e
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 94d2102..73814e7 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -229,6 +229,7 @@
             "${AOM_ROOT}/av1/encoder/tx_search.h"
             "${AOM_ROOT}/av1/encoder/intra_mode_search.c"
             "${AOM_ROOT}/av1/encoder/intra_mode_search.h"
+            "${AOM_ROOT}/av1/encoder/intra_mode_search_utils.h"
             "${AOM_ROOT}/av1/encoder/wedge_utils.c"
             "${AOM_ROOT}/av1/encoder/var_based_part.c"
             "${AOM_ROOT}/av1/encoder/var_based_part.h"
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 6079799..2f14635 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -1403,15 +1403,23 @@
   // Special handling for chroma sub8x8.
   const int is_chroma_sub8_x = plane > 0 && plane_block_width < 4;
   const int is_chroma_sub8_y = plane > 0 && plane_block_height < 4;
-  if (width) *width = plane_block_width + 2 * is_chroma_sub8_x;
-  if (height) *height = plane_block_height + 2 * is_chroma_sub8_y;
+  if (width) {
+    *width = plane_block_width + 2 * is_chroma_sub8_x;
+    assert(*width >= 0);
+  }
+  if (height) {
+    *height = plane_block_height + 2 * is_chroma_sub8_y;
+    assert(*height >= 0);
+  }
   if (rows_within_bounds) {
     *rows_within_bounds =
         (block_rows >> pd->subsampling_y) + 2 * is_chroma_sub8_y;
+    assert(*rows_within_bounds >= 0);
   }
   if (cols_within_bounds) {
     *cols_within_bounds =
         (block_cols >> pd->subsampling_x) + 2 * is_chroma_sub8_x;
+    assert(*cols_within_bounds >= 0);
   }
 }
 
diff --git a/av1/encoder/context_tree.h b/av1/encoder/context_tree.h
index bef99ca..f243233 100644
--- a/av1/encoder/context_tree.h
+++ b/av1/encoder/context_tree.h
@@ -32,7 +32,7 @@
 } PC_TREE_SHARED_BUFFERS;
 
 // Structure to hold snapshot of coding context during the mode picking process
-typedef struct {
+typedef struct PICK_MODE_CONTEXT {
   MB_MODE_INFO mic;
   MB_MODE_INFO_EXT_FRAME mbmi_ext_best;
   uint8_t *color_index_map[2];
diff --git a/av1/encoder/intra_mode_search.c b/av1/encoder/intra_mode_search.c
index 0f59a7e..c3563b8 100644
--- a/av1/encoder/intra_mode_search.c
+++ b/av1/encoder/intra_mode_search.c
@@ -9,11 +9,11 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "av1/encoder/intra_mode_search.h"
-#include "av1/encoder/model_rd.h"
-#include "av1/encoder/palette.h"
-#include "av1/common/pred_common.h"
 #include "av1/common/reconintra.h"
+
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/palette.h"
 #include "av1/encoder/tx_search.h"
 
 /*!\cond */
@@ -31,243 +31,6 @@
 };
 /*!\endcond */
 
-#define BINS 32
-static const float intra_hog_model_bias[DIRECTIONAL_MODES] = {
-  0.450578f,  0.695518f,  -0.717944f, -0.639894f,
-  -0.602019f, -0.453454f, 0.055857f,  -0.465480f,
-};
-
-static const float intra_hog_model_weights[BINS * DIRECTIONAL_MODES] = {
-  -3.076402f, -3.757063f, -3.275266f, -3.180665f, -3.452105f, -3.216593f,
-  -2.871212f, -3.134296f, -1.822324f, -2.401411f, -1.541016f, -1.195322f,
-  -0.434156f, 0.322868f,  2.260546f,  3.368715f,  3.989290f,  3.308487f,
-  2.277893f,  0.923793f,  0.026412f,  -0.385174f, -0.718622f, -1.408867f,
-  -1.050558f, -2.323941f, -2.225827f, -2.585453f, -3.054283f, -2.875087f,
-  -2.985709f, -3.447155f, 3.758139f,  3.204353f,  2.170998f,  0.826587f,
-  -0.269665f, -0.702068f, -1.085776f, -2.175249f, -1.623180f, -2.975142f,
-  -2.779629f, -3.190799f, -3.521900f, -3.375480f, -3.319355f, -3.897389f,
-  -3.172334f, -3.594528f, -2.879132f, -2.547777f, -2.921023f, -2.281844f,
-  -1.818988f, -2.041771f, -0.618268f, -1.396458f, -0.567153f, -0.285868f,
-  -0.088058f, 0.753494f,  2.092413f,  3.215266f,  -3.300277f, -2.748658f,
-  -2.315784f, -2.423671f, -2.257283f, -2.269583f, -2.196660f, -2.301076f,
-  -2.646516f, -2.271319f, -2.254366f, -2.300102f, -2.217960f, -2.473300f,
-  -2.116866f, -2.528246f, -3.314712f, -1.701010f, -0.589040f, -0.088077f,
-  0.813112f,  1.702213f,  2.653045f,  3.351749f,  3.243554f,  3.199409f,
-  2.437856f,  1.468854f,  0.533039f,  -0.099065f, -0.622643f, -2.200732f,
-  -4.228861f, -2.875263f, -1.273956f, -0.433280f, 0.803771f,  1.975043f,
-  3.179528f,  3.939064f,  3.454379f,  3.689386f,  3.116411f,  1.970991f,
-  0.798406f,  -0.628514f, -1.252546f, -2.825176f, -4.090178f, -3.777448f,
-  -3.227314f, -3.479403f, -3.320569f, -3.159372f, -2.729202f, -2.722341f,
-  -3.054913f, -2.742923f, -2.612703f, -2.662632f, -2.907314f, -3.117794f,
-  -3.102660f, -3.970972f, -4.891357f, -3.935582f, -3.347758f, -2.721924f,
-  -2.219011f, -1.702391f, -0.866529f, -0.153743f, 0.107733f,  1.416882f,
-  2.572884f,  3.607755f,  3.974820f,  3.997783f,  2.970459f,  0.791687f,
-  -1.478921f, -1.228154f, -1.216955f, -1.765932f, -1.951003f, -1.985301f,
-  -1.975881f, -1.985593f, -2.422371f, -2.419978f, -2.531288f, -2.951853f,
-  -3.071380f, -3.277027f, -3.373539f, -4.462010f, -0.967888f, 0.805524f,
-  2.794130f,  3.685984f,  3.745195f,  3.252444f,  2.316108f,  1.399146f,
-  -0.136519f, -0.162811f, -1.004357f, -1.667911f, -1.964662f, -2.937579f,
-  -3.019533f, -3.942766f, -5.102767f, -3.882073f, -3.532027f, -3.451956f,
-  -2.944015f, -2.643064f, -2.529872f, -2.077290f, -2.809965f, -1.803734f,
-  -1.783593f, -1.662585f, -1.415484f, -1.392673f, -0.788794f, -1.204819f,
-  -1.998864f, -1.182102f, -0.892110f, -1.317415f, -1.359112f, -1.522867f,
-  -1.468552f, -1.779072f, -2.332959f, -2.160346f, -2.329387f, -2.631259f,
-  -2.744936f, -3.052494f, -2.787363f, -3.442548f, -4.245075f, -3.032172f,
-  -2.061609f, -1.768116f, -1.286072f, -0.706587f, -0.192413f, 0.386938f,
-  0.716997f,  1.481393f,  2.216702f,  2.737986f,  3.109809f,  3.226084f,
-  2.490098f,  -0.095827f, -3.864816f, -3.507248f, -3.128925f, -2.908251f,
-  -2.883836f, -2.881411f, -2.524377f, -2.624478f, -2.399573f, -2.367718f,
-  -1.918255f, -1.926277f, -1.694584f, -1.723790f, -0.966491f, -1.183115f,
-  -1.430687f, 0.872896f,  2.766550f,  3.610080f,  3.578041f,  3.334928f,
-  2.586680f,  1.895721f,  1.122195f,  0.488519f,  -0.140689f, -0.799076f,
-  -1.222860f, -1.502437f, -1.900969f, -3.206816f,
-};
-
-static void generate_hog(const uint8_t *src, int stride, int rows, int cols,
-                         float *hist) {
-  const float step = (float)PI / BINS;
-  float total = 0.1f;
-  src += stride;
-  for (int r = 1; r < rows - 1; ++r) {
-    for (int c = 1; c < cols - 1; ++c) {
-      const uint8_t *above = &src[c - stride];
-      const uint8_t *below = &src[c + stride];
-      const uint8_t *left = &src[c - 1];
-      const uint8_t *right = &src[c + 1];
-      // Calculate gradient using Sobel fitlers.
-      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
-                     (left[-stride] + 2 * left[0] + left[stride]);
-      const int dy = (below[-1] + 2 * below[0] + below[1]) -
-                     (above[-1] + 2 * above[0] + above[1]);
-      if (dx == 0 && dy == 0) continue;
-      const int temp = abs(dx) + abs(dy);
-      if (!temp) continue;
-      total += temp;
-      if (dx == 0) {
-        hist[0] += temp / 2;
-        hist[BINS - 1] += temp / 2;
-      } else {
-        const float angle = atanf(dy * 1.0f / dx);
-        int idx = (int)roundf(angle / step) + BINS / 2;
-        idx = AOMMIN(idx, BINS - 1);
-        idx = AOMMAX(idx, 0);
-        hist[idx] += temp;
-      }
-    }
-    src += stride;
-  }
-
-  for (int i = 0; i < BINS; ++i) hist[i] /= total;
-}
-
-static void generate_hog_hbd(const uint8_t *src8, int stride, int rows,
-                             int cols, float *hist) {
-  const float step = (float)PI / BINS;
-  float total = 0.1f;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  src += stride;
-  for (int r = 1; r < rows - 1; ++r) {
-    for (int c = 1; c < cols - 1; ++c) {
-      const uint16_t *above = &src[c - stride];
-      const uint16_t *below = &src[c + stride];
-      const uint16_t *left = &src[c - 1];
-      const uint16_t *right = &src[c + 1];
-      // Calculate gradient using Sobel fitlers.
-      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
-                     (left[-stride] + 2 * left[0] + left[stride]);
-      const int dy = (below[-1] + 2 * below[0] + below[1]) -
-                     (above[-1] + 2 * above[0] + above[1]);
-      if (dx == 0 && dy == 0) continue;
-      const int temp = abs(dx) + abs(dy);
-      if (!temp) continue;
-      total += temp;
-      if (dx == 0) {
-        hist[0] += temp / 2;
-        hist[BINS - 1] += temp / 2;
-      } else {
-        const float angle = atanf(dy * 1.0f / dx);
-        int idx = (int)roundf(angle / step) + BINS / 2;
-        idx = AOMMIN(idx, BINS - 1);
-        idx = AOMMAX(idx, 0);
-        hist[idx] += temp;
-      }
-    }
-    src += stride;
-  }
-
-  for (int i = 0; i < BINS; ++i) hist[i] /= total;
-}
-
-static void prune_intra_mode_with_hog(const MACROBLOCK *x, BLOCK_SIZE bsize,
-                                      float th,
-                                      uint8_t *directional_mode_skip_mask) {
-  aom_clear_system_state();
-
-  const int bh = block_size_high[bsize];
-  const int bw = block_size_wide[bsize];
-  const MACROBLOCKD *xd = &x->e_mbd;
-  const int rows =
-      (xd->mb_to_bottom_edge >= 0) ? bh : (xd->mb_to_bottom_edge >> 3) + bh;
-  const int cols =
-      (xd->mb_to_right_edge >= 0) ? bw : (xd->mb_to_right_edge >> 3) + bw;
-  const int src_stride = x->plane[0].src.stride;
-  const uint8_t *src = x->plane[0].src.buf;
-  float hist[BINS] = { 0.0f };
-  if (is_cur_buf_hbd(xd)) {
-    generate_hog_hbd(src, src_stride, rows, cols, hist);
-  } else {
-    generate_hog(src, src_stride, rows, cols, hist);
-  }
-
-  for (int i = 0; i < DIRECTIONAL_MODES; ++i) {
-    float this_score = intra_hog_model_bias[i];
-    const float *weights = &intra_hog_model_weights[i * BINS];
-    for (int j = 0; j < BINS; ++j) {
-      this_score += weights[j] * hist[j];
-    }
-    if (this_score < th) directional_mode_skip_mask[i + 1] = 1;
-  }
-
-  aom_clear_system_state();
-}
-
-#undef BINS
-
-// Makes a quick luma prediction and estimate the rdcost with a model without
-// going through the whole txfm/quantize/itxfm process.
-static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                               BLOCK_SIZE bsize, int mode_cost) {
-  const AV1_COMMON *cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  assert(!is_inter_block(mbmi));
-  RD_STATS this_rd_stats;
-  int row, col;
-  int64_t temp_sse, this_rd;
-  const ModeCosts *mode_costs = &x->mode_costs;
-  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
-  TX_SIZE tx_size =
-      tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type);
-  const int stepr = tx_size_high_unit[tx_size];
-  const int stepc = tx_size_wide_unit[tx_size];
-  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
-  const int max_blocks_high = max_block_high(xd, bsize, 0);
-  mbmi->tx_size = tx_size;
-  // Prediction.
-  for (row = 0; row < max_blocks_high; row += stepr) {
-    for (col = 0; col < max_blocks_wide; col += stepc) {
-      av1_predict_intra_block_facade(cm, xd, 0, col, row, tx_size);
-    }
-  }
-  // RD estimation.
-  model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model ? MODELRD_LEGACY
-                                                   : MODELRD_TYPE_INTRA](
-      cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate, &this_rd_stats.dist,
-      &this_rd_stats.skip_txfm, &temp_sse, NULL, NULL, NULL);
-  if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
-    mode_cost += mode_costs->angle_delta_cost[mbmi->mode - V_PRED]
-                                             [MAX_ANGLE_DELTA +
-                                              mbmi->angle_delta[PLANE_TYPE_Y]];
-  }
-  if (mbmi->mode == DC_PRED &&
-      av1_filter_intra_allowed_bsize(cm, mbmi->sb_type)) {
-    if (mbmi->filter_intra_mode_info.use_filter_intra) {
-      const int mode = mbmi->filter_intra_mode_info.filter_intra_mode;
-      mode_cost += mode_costs->filter_intra_cost[mbmi->sb_type][1] +
-                   mode_costs->filter_intra_mode_cost[mode];
-    } else {
-      mode_cost += mode_costs->filter_intra_cost[mbmi->sb_type][0];
-    }
-  }
-  this_rd =
-      RDCOST(x->rdmult, this_rd_stats.rate + mode_cost, this_rd_stats.dist);
-  return this_rd;
-}
-
-/*!\brief Estimate the luma rdcost of a given intra mode and try to prune it.
- *
- * \ingroup intra_mode_search
- * \callergraph
- * This function first make a quick luma prediction and estimate the rdcost with
- * a model without going through the txfm, then try to prune the current mode if
- * the new estimate y_rd > 1.5 * best_model_rd.
- *
- * \return Returns 1 if the given mode is prune; 0 otherwise.
- */
-static AOM_INLINE int model_intra_yrd_and_prune(const AV1_COMP *const cpi,
-                                                MACROBLOCK *x, BLOCK_SIZE bsize,
-                                                int mode_info_cost,
-                                                int64_t *best_model_rd) {
-  const int64_t this_model_rd = intra_model_yrd(cpi, x, bsize, mode_info_cost);
-  if (*best_model_rd != INT64_MAX &&
-      this_model_rd > *best_model_rd + (*best_model_rd >> 1)) {
-    return 1;
-  } else if (this_model_rd < *best_model_rd) {
-    *best_model_rd = this_model_rd;
-  }
-  return 0;
-}
-
 /*!\brief Calculate the rdcost of a given luma intra angle
  *
  * \ingroup intra_mode_search
@@ -321,124 +84,6 @@
   return this_rd;
 }
 
-static INLINE int write_uniform_cost(int n, int v) {
-  const int l = get_unsigned_bits(n);
-  const int m = (1 << l) - n;
-  if (l == 0) return 0;
-  if (v < m)
-    return av1_cost_literal(l - 1);
-  else
-    return av1_cost_literal(l);
-}
-
-// Return the rate cost for luma prediction mode info. of intra blocks.
-static int intra_mode_info_cost_y(const AV1_COMP *cpi, const MACROBLOCK *x,
-                                  const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
-                                  int mode_cost) {
-  int total_rate = mode_cost;
-  const ModeCosts *mode_costs = &x->mode_costs;
-  const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0;
-  const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra;
-  const int use_intrabc = mbmi->use_intrabc;
-  // Can only activate one mode.
-  assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc +
-          use_filter_intra) <= 1);
-  const int try_palette = av1_allow_palette(
-      cpi->common.features.allow_screen_content_tools, mbmi->sb_type);
-  if (try_palette && mbmi->mode == DC_PRED) {
-    const MACROBLOCKD *xd = &x->e_mbd;
-    const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
-    const int mode_ctx = av1_get_palette_mode_ctx(xd);
-    total_rate +=
-        mode_costs->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette];
-    if (use_palette) {
-      const uint8_t *const color_map = xd->plane[0].color_index_map;
-      int block_width, block_height, rows, cols;
-      av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
-                               &cols);
-      const int plt_size = mbmi->palette_mode_info.palette_size[0];
-      int palette_mode_cost =
-          mode_costs
-              ->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
-          write_uniform_cost(plt_size, color_map[0]);
-      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-      const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
-      palette_mode_cost +=
-          av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache,
-                                   n_cache, cpi->common.seq_params.bit_depth);
-      palette_mode_cost +=
-          av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP);
-      total_rate += palette_mode_cost;
-    }
-  }
-  if (av1_filter_intra_allowed(&cpi->common, mbmi)) {
-    total_rate +=
-        mode_costs->filter_intra_cost[mbmi->sb_type][use_filter_intra];
-    if (use_filter_intra) {
-      total_rate +=
-          mode_costs->filter_intra_mode_cost[mbmi->filter_intra_mode_info
-                                                 .filter_intra_mode];
-    }
-  }
-  if (av1_is_directional_mode(mbmi->mode)) {
-    if (av1_use_angle_delta(bsize)) {
-      total_rate +=
-          mode_costs->angle_delta_cost[mbmi->mode - V_PRED]
-                                      [MAX_ANGLE_DELTA +
-                                       mbmi->angle_delta[PLANE_TYPE_Y]];
-    }
-  }
-  if (av1_allow_intrabc(&cpi->common))
-    total_rate += mode_costs->intrabc_cost[use_intrabc];
-  return total_rate;
-}
-
-// Return the rate cost for chroma prediction mode info. of intra blocks.
-static int intra_mode_info_cost_uv(const AV1_COMP *cpi, const MACROBLOCK *x,
-                                   const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
-                                   int mode_cost) {
-  int total_rate = mode_cost;
-  const ModeCosts *mode_costs = &x->mode_costs;
-  const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0;
-  const UV_PREDICTION_MODE mode = mbmi->uv_mode;
-  // Can only activate one mode.
-  assert(((mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1);
-
-  const int try_palette = av1_allow_palette(
-      cpi->common.features.allow_screen_content_tools, mbmi->sb_type);
-  if (try_palette && mode == UV_DC_PRED) {
-    const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
-    total_rate +=
-        mode_costs->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette];
-    if (use_palette) {
-      const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
-      const int plt_size = pmi->palette_size[1];
-      const MACROBLOCKD *xd = &x->e_mbd;
-      const uint8_t *const color_map = xd->plane[1].color_index_map;
-      int palette_mode_cost =
-          mode_costs
-              ->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
-          write_uniform_cost(plt_size, color_map[0]);
-      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-      const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
-      palette_mode_cost += av1_palette_color_cost_uv(
-          pmi, color_cache, n_cache, cpi->common.seq_params.bit_depth);
-      palette_mode_cost +=
-          av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP);
-      total_rate += palette_mode_cost;
-    }
-  }
-  if (av1_is_directional_mode(get_uv_mode(mode))) {
-    if (av1_use_angle_delta(bsize)) {
-      total_rate +=
-          mode_costs->angle_delta_cost[mode - V_PRED]
-                                      [mbmi->angle_delta[PLANE_TYPE_UV] +
-                                       MAX_ANGLE_DELTA];
-    }
-  }
-  return total_rate;
-}
-
 /*!\brief Search for the best filter_intra mode when coding intra frame.
  *
  * \ingroup intra_mode_search
@@ -553,700 +198,6 @@
   return n;
 }
 
-// Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x
-// new_height'. Extra rows and columns are filled in by copying last valid
-// row/column.
-static AOM_INLINE void extend_palette_color_map(uint8_t *const color_map,
-                                                int orig_width, int orig_height,
-                                                int new_width, int new_height) {
-  int j;
-  assert(new_width >= orig_width);
-  assert(new_height >= orig_height);
-  if (new_width == orig_width && new_height == orig_height) return;
-
-  for (j = orig_height - 1; j >= 0; --j) {
-    memmove(color_map + j * new_width, color_map + j * orig_width, orig_width);
-    // Copy last column to extra columns.
-    memset(color_map + j * new_width + orig_width,
-           color_map[j * new_width + orig_width - 1], new_width - orig_width);
-  }
-  // Copy last row to extra rows.
-  for (j = orig_height; j < new_height; ++j) {
-    memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width,
-           new_width);
-  }
-}
-
-// Bias toward using colors in the cache.
-// TODO(huisu): Try other schemes to improve compression.
-static AOM_INLINE void optimize_palette_colors(uint16_t *color_cache,
-                                               int n_cache, int n_colors,
-                                               int stride, int *centroids) {
-  if (n_cache <= 0) return;
-  for (int i = 0; i < n_colors * stride; i += stride) {
-    int min_diff = abs(centroids[i] - (int)color_cache[0]);
-    int idx = 0;
-    for (int j = 1; j < n_cache; ++j) {
-      const int this_diff = abs(centroids[i] - color_cache[j]);
-      if (this_diff < min_diff) {
-        min_diff = this_diff;
-        idx = j;
-      }
-    }
-    if (min_diff <= 1) centroids[i] = color_cache[idx];
-  }
-}
-
-// Given the base colors as specified in centroids[], calculate the RD cost
-// of palette mode.
-static AOM_INLINE void palette_rd_y(
-    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
-    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n,
-    uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
-    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
-    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
-    int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip,
-    uint8_t *tx_type_map, int *beat_best_pallette_rd) {
-  optimize_palette_colors(color_cache, n_cache, n, 1, centroids);
-  const int num_unique_colors = av1_remove_duplicates(centroids, n);
-  if (num_unique_colors < PALETTE_MIN_SIZE) {
-    // Too few unique colors to create a palette. And DC_PRED will work
-    // well for that case anyway. So skip.
-    return;
-  }
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  if (cpi->common.seq_params.use_highbitdepth) {
-    for (int i = 0; i < num_unique_colors; ++i) {
-      pmi->palette_colors[i] = clip_pixel_highbd(
-          (int)centroids[i], cpi->common.seq_params.bit_depth);
-    }
-  } else {
-    for (int i = 0; i < num_unique_colors; ++i) {
-      pmi->palette_colors[i] = clip_pixel(centroids[i]);
-    }
-  }
-  pmi->palette_size[0] = num_unique_colors;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  uint8_t *const color_map = xd->plane[0].color_index_map;
-  int block_width, block_height, rows, cols;
-  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
-                           &cols);
-  av1_calc_indices(data, centroids, color_map, rows * cols, num_unique_colors,
-                   1);
-  extend_palette_color_map(color_map, cols, rows, block_width, block_height);
-
-  const int palette_mode_cost =
-      intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost);
-  if (model_intra_yrd_and_prune(cpi, x, bsize, palette_mode_cost,
-                                best_model_rd)) {
-    return;
-  }
-
-  RD_STATS tokenonly_rd_stats;
-  av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
-                                    *best_rd);
-  if (tokenonly_rd_stats.rate == INT_MAX) return;
-  int this_rate = tokenonly_rd_stats.rate + palette_mode_cost;
-  int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
-  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) {
-    tokenonly_rd_stats.rate -= tx_size_cost(x, bsize, mbmi->tx_size);
-  }
-  // Collect mode stats for multiwinner mode processing
-  const int txfm_search_done = 1;
-  store_winner_mode_stats(
-      &cpi->common, x, mbmi, NULL, NULL, NULL, THR_DC, color_map, bsize,
-      this_rd, cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
-      txfm_search_done);
-  if (this_rd < *best_rd) {
-    *best_rd = this_rd;
-    // Setting beat_best_rd flag because current mode rd is better than best_rd.
-    // This flag need to be updated only for palette evaluation in key frames
-    if (beat_best_rd) *beat_best_rd = 1;
-    memcpy(best_palette_color_map, color_map,
-           block_width * block_height * sizeof(color_map[0]));
-    *best_mbmi = *mbmi;
-    memcpy(blk_skip, x->txfm_search_info.blk_skip,
-           sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
-    av1_copy_array(tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
-    if (rate) *rate = this_rate;
-    if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
-    if (distortion) *distortion = tokenonly_rd_stats.dist;
-    if (skippable) *skippable = tokenonly_rd_stats.skip_txfm;
-    if (beat_best_pallette_rd) *beat_best_pallette_rd = 1;
-  }
-}
-
-static AOM_INLINE int perform_top_color_coarse_palette_search(
-    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
-    BLOCK_SIZE bsize, int dc_mode_cost, const int *data,
-    const int *const top_colors, int start_n, int end_n, int step_size,
-    uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
-    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
-    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
-    int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
-    uint8_t *tx_type_map) {
-  int centroids[PALETTE_MAX_SIZE];
-  int n = start_n;
-  int top_color_winner = end_n + 1;
-  while (1) {
-    int beat_best_pallette_rd = 0;
-    for (int i = 0; i < n; ++i) centroids[i] = top_colors[i];
-    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
-                 color_cache, n_cache, best_mbmi, best_palette_color_map,
-                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
-                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
-                 &beat_best_pallette_rd);
-    // Break if current palette colors is not winning
-    if (beat_best_pallette_rd) top_color_winner = n;
-    n += step_size;
-    if (n > end_n) break;
-  }
-  return top_color_winner;
-}
-
-static AOM_INLINE int perform_k_means_coarse_palette_search(
-    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
-    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int lb, int ub,
-    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
-    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
-    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
-    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
-    uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
-    int data_points) {
-  int centroids[PALETTE_MAX_SIZE];
-  const int max_itr = 50;
-  int n = start_n;
-  int k_means_winner = end_n + 1;
-  while (1) {
-    int beat_best_pallette_rd = 0;
-    for (int i = 0; i < n; ++i) {
-      centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
-    }
-    av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
-    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
-                 color_cache, n_cache, best_mbmi, best_palette_color_map,
-                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
-                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
-                 &beat_best_pallette_rd);
-    // Break if current palette colors is not winning
-    if (beat_best_pallette_rd) k_means_winner = n;
-    n += step_size;
-    if (n > end_n) break;
-  }
-  return k_means_winner;
-}
-
-// Perform palette search for top colors from minimum palette colors (/maximum)
-// with a step-size of 1 (/-1)
-static AOM_INLINE int perform_top_color_palette_search(
-    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
-    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *top_colors,
-    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
-    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
-    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
-    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
-    uint8_t *best_blk_skip, uint8_t *tx_type_map) {
-  int centroids[PALETTE_MAX_SIZE];
-  int n = start_n;
-  assert((step_size == -1) || (step_size == 1) || (step_size == 0) ||
-         (step_size == 2));
-  assert(IMPLIES(step_size == -1, start_n > end_n));
-  assert(IMPLIES(step_size == 1, start_n < end_n));
-  while (1) {
-    int beat_best_pallette_rd = 0;
-    for (int i = 0; i < n; ++i) centroids[i] = top_colors[i];
-    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
-                 color_cache, n_cache, best_mbmi, best_palette_color_map,
-                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
-                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
-                 &beat_best_pallette_rd);
-    // Break if current palette colors is not winning
-    if ((cpi->sf.intra_sf.prune_palette_search_level == 2) &&
-        !beat_best_pallette_rd)
-      return n;
-    n += step_size;
-    if (n == end_n) break;
-  }
-  return n;
-}
-// Perform k-means based palette search from minimum palette colors (/maximum)
-// with a step-size of 1 (/-1)
-static AOM_INLINE int perform_k_means_palette_search(
-    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
-    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int lb, int ub,
-    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
-    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
-    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
-    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
-    uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
-    int data_points) {
-  int centroids[PALETTE_MAX_SIZE];
-  const int max_itr = 50;
-  int n = start_n;
-  assert((step_size == -1) || (step_size == 1) || (step_size == 0) ||
-         (step_size == 2));
-  assert(IMPLIES(step_size == -1, start_n > end_n));
-  assert(IMPLIES(step_size == 1, start_n < end_n));
-  while (1) {
-    int beat_best_pallette_rd = 0;
-    for (int i = 0; i < n; ++i) {
-      centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
-    }
-    av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
-    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
-                 color_cache, n_cache, best_mbmi, best_palette_color_map,
-                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
-                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
-                 &beat_best_pallette_rd);
-    // Break if current palette colors is not winning
-    if ((cpi->sf.intra_sf.prune_palette_search_level == 2) &&
-        !beat_best_pallette_rd)
-      return n;
-    n += step_size;
-    if (n == end_n) break;
-  }
-  return n;
-}
-
-#define START_N_STAGE2(x)                         \
-  ((x == PALETTE_MIN_SIZE) ? PALETTE_MIN_SIZE + 1 \
-                           : AOMMAX(x - 1, PALETTE_MIN_SIZE));
-#define END_N_STAGE2(x, end_n) \
-  ((x == end_n) ? x - 1 : AOMMIN(x + 1, PALETTE_MAX_SIZE));
-
-static AOM_INLINE void update_start_end_stage_2(int *start_n_stage2,
-                                                int *end_n_stage2,
-                                                int *step_size_stage2,
-                                                int winner, int end_n) {
-  *start_n_stage2 = START_N_STAGE2(winner);
-  *end_n_stage2 = END_N_STAGE2(winner, end_n);
-  *step_size_stage2 = *end_n_stage2 - *start_n_stage2;
-}
-
-// Start index and step size below are chosen to evaluate unique
-// candidates in neighbor search, in case a winner candidate is found in
-// coarse search. Example,
-// 1) 8 colors (end_n = 8): 2,3,4,5,6,7,8. start_n is chosen as 2 and step
-// size is chosen as 3. Therefore, coarse search will evaluate 2, 5 and 8.
-// If winner is found at 5, then 4 and 6 are evaluated. Similarly, for 2
-// (3) and 8 (7).
-// 2) 7 colors (end_n = 7): 2,3,4,5,6,7. If start_n is chosen as 2 (same
-// as for 8 colors) then step size should also be 2, to cover all
-// candidates. Coarse search will evaluate 2, 4 and 6. If winner is either
-// 2 or 4, 3 will be evaluated. Instead, if start_n=3 and step_size=3,
-// coarse search will evaluate 3 and 6. For the winner, unique neighbors
-// (3: 2,4 or 6: 5,7) would be evaluated.
-
-// start index for coarse palette search for dominant colors and k-means
-static const uint8_t start_n_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
-                                                                    3, 3, 2,
-                                                                    3, 3, 2 };
-// step size for coarse palette search for dominant colors and k-means
-static const uint8_t step_size_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
-                                                                      3, 3, 3,
-                                                                      3, 3, 3 };
-
-/*!\brief Search for the best palette in the luma plane.
- *
- * \ingroup intra_mode_search
- * \callergraph
- * This function is used in both inter and intra frame coding.
- */
-static void rd_pick_palette_intra_sby(
-    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-    int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
-    int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly,
-    int64_t *distortion, int *skippable, int *beat_best_rd,
-    PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, uint8_t *tx_type_map) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  assert(!is_inter_block(mbmi));
-  assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
-                           bsize));
-
-  const int src_stride = x->plane[0].src.stride;
-  const uint8_t *const src = x->plane[0].src.buf;
-  int block_width, block_height, rows, cols;
-  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
-                           &cols);
-  const SequenceHeader *const seq_params = &cpi->common.seq_params;
-  const int is_hbd = seq_params->use_highbitdepth;
-  const int bit_depth = seq_params->bit_depth;
-  int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
-  int colors;
-  if (is_hbd) {
-    colors = av1_count_colors_highbd(src, src_stride, rows, cols, bit_depth,
-                                     count_buf);
-  } else {
-    colors = av1_count_colors(src, src_stride, rows, cols, count_buf);
-  }
-
-  uint8_t *const color_map = xd->plane[0].color_index_map;
-  if (colors > 1 && colors <= 64) {
-    int *const data = x->palette_buffer->kmeans_data_buf;
-    int centroids[PALETTE_MAX_SIZE];
-    int lb, ub;
-    if (is_hbd) {
-      int *data_pt = data;
-      const uint16_t *src_pt = CONVERT_TO_SHORTPTR(src);
-      lb = ub = src_pt[0];
-      for (int r = 0; r < rows; ++r) {
-        for (int c = 0; c < cols; ++c) {
-          const int val = src_pt[c];
-          data_pt[c] = val;
-          lb = AOMMIN(lb, val);
-          ub = AOMMAX(ub, val);
-        }
-        src_pt += src_stride;
-        data_pt += cols;
-      }
-    } else {
-      int *data_pt = data;
-      const uint8_t *src_pt = src;
-      lb = ub = src[0];
-      for (int r = 0; r < rows; ++r) {
-        for (int c = 0; c < cols; ++c) {
-          const int val = src_pt[c];
-          data_pt[c] = val;
-          lb = AOMMIN(lb, val);
-          ub = AOMMAX(ub, val);
-        }
-        src_pt += src_stride;
-        data_pt += cols;
-      }
-    }
-
-    mbmi->mode = DC_PRED;
-    mbmi->filter_intra_mode_info.use_filter_intra = 0;
-
-    uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-    const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
-
-    // Find the dominant colors, stored in top_colors[].
-    int top_colors[PALETTE_MAX_SIZE] = { 0 };
-    for (int i = 0; i < AOMMIN(colors, PALETTE_MAX_SIZE); ++i) {
-      int max_count = 0;
-      for (int j = 0; j < (1 << bit_depth); ++j) {
-        if (count_buf[j] > max_count) {
-          max_count = count_buf[j];
-          top_colors[i] = j;
-        }
-      }
-      assert(max_count > 0);
-      count_buf[top_colors[i]] = 0;
-    }
-
-    // Try the dominant colors directly.
-    // TODO(huisu@google.com): Try to avoid duplicate computation in cases
-    // where the dominant colors and the k-means results are similar.
-    if ((cpi->sf.intra_sf.prune_palette_search_level == 1) &&
-        (colors > PALETTE_MIN_SIZE)) {
-      const int end_n = AOMMIN(colors, PALETTE_MAX_SIZE);
-      assert(PALETTE_MAX_SIZE == 8);
-      assert(PALETTE_MIN_SIZE == 2);
-      // Choose the start index and step size for coarse search based on number
-      // of colors
-      const int start_n = start_n_lookup_table[end_n];
-      const int step_size = step_size_lookup_table[end_n];
-      // Perform top color coarse palette search to find the winner candidate
-      const int top_color_winner = perform_top_color_coarse_palette_search(
-          cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n, end_n,
-          step_size, color_cache, n_cache, best_mbmi, best_palette_color_map,
-          best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable,
-          beat_best_rd, ctx, best_blk_skip, tx_type_map);
-      // Evaluate neighbors for the winner color (if winner is found) in the
-      // above coarse search for dominant colors
-      if (top_color_winner <= end_n) {
-        int start_n_stage2, end_n_stage2, step_size_stage2;
-        update_start_end_stage_2(&start_n_stage2, &end_n_stage2,
-                                 &step_size_stage2, top_color_winner, end_n);
-        // perform finer search for the winner candidate
-        perform_top_color_palette_search(
-            cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n_stage2,
-            end_n_stage2 + step_size_stage2, step_size_stage2, color_cache,
-            n_cache, best_mbmi, best_palette_color_map, best_rd, best_model_rd,
-            rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
-            best_blk_skip, tx_type_map);
-      }
-      // K-means clustering.
-      // Perform k-means coarse palette search to find the winner candidate
-      const int k_means_winner = perform_k_means_coarse_palette_search(
-          cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n, end_n,
-          step_size, color_cache, n_cache, best_mbmi, best_palette_color_map,
-          best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable,
-          beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map,
-          rows * cols);
-      // Evaluate neighbors for the winner color (if winner is found) in the
-      // above coarse search for k-means
-      if (k_means_winner <= end_n) {
-        int start_n_stage2, end_n_stage2, step_size_stage2;
-        update_start_end_stage_2(&start_n_stage2, &end_n_stage2,
-                                 &step_size_stage2, k_means_winner, end_n);
-        // perform finer search for the winner candidate
-        perform_k_means_palette_search(
-            cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n_stage2,
-            end_n_stage2 + step_size_stage2, step_size_stage2, color_cache,
-            n_cache, best_mbmi, best_palette_color_map, best_rd, best_model_rd,
-            rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
-            best_blk_skip, tx_type_map, color_map, rows * cols);
-      }
-    } else {
-      const int start_n = AOMMIN(colors, PALETTE_MAX_SIZE),
-                end_n = PALETTE_MIN_SIZE;
-      // Perform top color palette search from start_n
-      const int top_color_winner = perform_top_color_palette_search(
-          cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n,
-          end_n - 1, -1, color_cache, n_cache, best_mbmi,
-          best_palette_color_map, best_rd, best_model_rd, rate, rate_tokenonly,
-          distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map);
-
-      if (top_color_winner > end_n) {
-        // Perform top color palette search in reverse order for the remaining
-        // colors
-        perform_top_color_palette_search(
-            cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, end_n,
-            top_color_winner, 1, color_cache, n_cache, best_mbmi,
-            best_palette_color_map, best_rd, best_model_rd, rate,
-            rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
-            best_blk_skip, tx_type_map);
-      }
-      // K-means clustering.
-      if (colors == PALETTE_MIN_SIZE) {
-        // Special case: These colors automatically become the centroids.
-        assert(colors == 2);
-        centroids[0] = lb;
-        centroids[1] = ub;
-        palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, colors,
-                     color_cache, n_cache, best_mbmi, best_palette_color_map,
-                     best_rd, best_model_rd, rate, rate_tokenonly, distortion,
-                     skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
-                     NULL);
-      } else {
-        // Perform k-means palette search from start_n
-        const int k_means_winner = perform_k_means_palette_search(
-            cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n, end_n - 1,
-            -1, color_cache, n_cache, best_mbmi, best_palette_color_map,
-            best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable,
-            beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map,
-            rows * cols);
-        if (k_means_winner > end_n) {
-          // Perform k-means palette search in reverse order for the remaining
-          // colors
-          perform_k_means_palette_search(
-              cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, end_n,
-              k_means_winner, 1, color_cache, n_cache, best_mbmi,
-              best_palette_color_map, best_rd, best_model_rd, rate,
-              rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
-              best_blk_skip, tx_type_map, color_map, rows * cols);
-        }
-      }
-    }
-  }
-
-  if (best_mbmi->palette_mode_info.palette_size[0] > 0) {
-    memcpy(color_map, best_palette_color_map,
-           block_width * block_height * sizeof(best_palette_color_map[0]));
-  }
-  *mbmi = *best_mbmi;
-}
-
-/*!\brief Search for the best palette in the chroma plane.
- *
- * \ingroup intra_mode_search
- * \callergraph
- * This function is used in both inter and intra frame coding.
- */
-static AOM_INLINE void rd_pick_palette_intra_sbuv(
-    const AV1_COMP *const cpi, MACROBLOCK *x, int dc_mode_cost,
-    uint8_t *best_palette_color_map, MB_MODE_INFO *const best_mbmi,
-    int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
-    int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  assert(!is_inter_block(mbmi));
-  assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
-                           mbmi->sb_type));
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const SequenceHeader *const seq_params = &cpi->common.seq_params;
-  int this_rate;
-  int64_t this_rd;
-  int colors_u, colors_v, colors;
-  const int src_stride = x->plane[1].src.stride;
-  const uint8_t *const src_u = x->plane[1].src.buf;
-  const uint8_t *const src_v = x->plane[2].src.buf;
-  uint8_t *const color_map = xd->plane[1].color_index_map;
-  RD_STATS tokenonly_rd_stats;
-  int plane_block_width, plane_block_height, rows, cols;
-  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
-                           &plane_block_height, &rows, &cols);
-
-  mbmi->uv_mode = UV_DC_PRED;
-
-  int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
-  if (seq_params->use_highbitdepth) {
-    colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols,
-                                       seq_params->bit_depth, count_buf);
-    colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols,
-                                       seq_params->bit_depth, count_buf);
-  } else {
-    colors_u = av1_count_colors(src_u, src_stride, rows, cols, count_buf);
-    colors_v = av1_count_colors(src_v, src_stride, rows, cols, count_buf);
-  }
-
-  uint16_t color_cache[2 * PALETTE_MAX_SIZE];
-  const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
-
-  colors = colors_u > colors_v ? colors_u : colors_v;
-  if (colors > 1 && colors <= 64) {
-    int r, c, n, i, j;
-    const int max_itr = 50;
-    int lb_u, ub_u, val_u;
-    int lb_v, ub_v, val_v;
-    int *const data = x->palette_buffer->kmeans_data_buf;
-    int centroids[2 * PALETTE_MAX_SIZE];
-
-    uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
-    uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
-    if (seq_params->use_highbitdepth) {
-      lb_u = src_u16[0];
-      ub_u = src_u16[0];
-      lb_v = src_v16[0];
-      ub_v = src_v16[0];
-    } else {
-      lb_u = src_u[0];
-      ub_u = src_u[0];
-      lb_v = src_v[0];
-      ub_v = src_v[0];
-    }
-
-    for (r = 0; r < rows; ++r) {
-      for (c = 0; c < cols; ++c) {
-        if (seq_params->use_highbitdepth) {
-          val_u = src_u16[r * src_stride + c];
-          val_v = src_v16[r * src_stride + c];
-          data[(r * cols + c) * 2] = val_u;
-          data[(r * cols + c) * 2 + 1] = val_v;
-        } else {
-          val_u = src_u[r * src_stride + c];
-          val_v = src_v[r * src_stride + c];
-          data[(r * cols + c) * 2] = val_u;
-          data[(r * cols + c) * 2 + 1] = val_v;
-        }
-        if (val_u < lb_u)
-          lb_u = val_u;
-        else if (val_u > ub_u)
-          ub_u = val_u;
-        if (val_v < lb_v)
-          lb_v = val_v;
-        else if (val_v > ub_v)
-          ub_v = val_v;
-      }
-    }
-
-    for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
-         --n) {
-      for (i = 0; i < n; ++i) {
-        centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
-        centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
-      }
-      av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
-      optimize_palette_colors(color_cache, n_cache, n, 2, centroids);
-      // Sort the U channel colors in ascending order.
-      for (i = 0; i < 2 * (n - 1); i += 2) {
-        int min_idx = i;
-        int min_val = centroids[i];
-        for (j = i + 2; j < 2 * n; j += 2)
-          if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
-        if (min_idx != i) {
-          int temp_u = centroids[i], temp_v = centroids[i + 1];
-          centroids[i] = centroids[min_idx];
-          centroids[i + 1] = centroids[min_idx + 1];
-          centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
-        }
-      }
-      av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
-      extend_palette_color_map(color_map, cols, rows, plane_block_width,
-                               plane_block_height);
-      pmi->palette_size[1] = n;
-      for (i = 1; i < 3; ++i) {
-        for (j = 0; j < n; ++j) {
-          if (seq_params->use_highbitdepth)
-            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
-                (int)centroids[j * 2 + i - 1], seq_params->bit_depth);
-          else
-            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
-                clip_pixel((int)centroids[j * 2 + i - 1]);
-        }
-      }
-
-      av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
-      if (tokenonly_rd_stats.rate == INT_MAX) continue;
-      this_rate = tokenonly_rd_stats.rate +
-                  intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
-      this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
-      if (this_rd < *best_rd) {
-        *best_rd = this_rd;
-        *best_mbmi = *mbmi;
-        memcpy(best_palette_color_map, color_map,
-               plane_block_width * plane_block_height *
-                   sizeof(best_palette_color_map[0]));
-        *rate = this_rate;
-        *distortion = tokenonly_rd_stats.dist;
-        *rate_tokenonly = tokenonly_rd_stats.rate;
-        *skippable = tokenonly_rd_stats.skip_txfm;
-      }
-    }
-  }
-  if (best_mbmi->palette_mode_info.palette_size[1] > 0) {
-    memcpy(color_map, best_palette_color_map,
-           plane_block_width * plane_block_height *
-               sizeof(best_palette_color_map[0]));
-  }
-}
-
-void av1_restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
-  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  int src_stride = x->plane[1].src.stride;
-  const uint8_t *const src_u = x->plane[1].src.buf;
-  const uint8_t *const src_v = x->plane[2].src.buf;
-  int *const data = x->palette_buffer->kmeans_data_buf;
-  int centroids[2 * PALETTE_MAX_SIZE];
-  uint8_t *const color_map = xd->plane[1].color_index_map;
-  int r, c;
-  const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
-  const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
-  int plane_block_width, plane_block_height, rows, cols;
-  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
-                           &plane_block_height, &rows, &cols);
-
-  for (r = 0; r < rows; ++r) {
-    for (c = 0; c < cols; ++c) {
-      if (cpi->common.seq_params.use_highbitdepth) {
-        data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
-        data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
-      } else {
-        data[(r * cols + c) * 2] = src_u[r * src_stride + c];
-        data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
-      }
-    }
-  }
-
-  for (r = 1; r < 3; ++r) {
-    for (c = 0; c < pmi->palette_size[1]; ++c) {
-      centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c];
-    }
-  }
-
-  av1_calc_indices(data, centroids, color_map, rows * cols,
-                   pmi->palette_size[1], 2);
-  extend_palette_color_map(color_map, cols, rows, plane_block_width,
-                           plane_block_height);
-}
-
 // Run RD calculation with given chroma intra prediction angle., and return
 // the RD cost. Update the best mode info. if the RD cost is the best so far.
 static int64_t pick_intra_angle_routine_sbuv(
@@ -1576,7 +527,7 @@
                         mbmi->sb_type);
   if (try_palette) {
     uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map;
-    rd_pick_palette_intra_sbuv(
+    av1_rd_pick_palette_intra_sbuv(
         cpi, x,
         mode_costs
             ->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_DC_PRED],
@@ -1623,7 +574,7 @@
   mbmi->ref_frame[1] = NONE_FRAME;
   RD_STATS rd_stats_y;
   av1_invalid_rd_stats(&rd_stats_y);
-  rd_pick_palette_intra_sby(
+  av1_rd_pick_palette_intra_sby(
       cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette,
       best_palette_color_map, &best_rd_palette, &best_model_rd_palette,
       &rd_stats_y.rate, NULL, &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL,
@@ -2182,7 +1133,7 @@
 
   // Searches palette
   if (try_palette) {
-    rd_pick_palette_intra_sby(
+    av1_rd_pick_palette_intra_sby(
         cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, best_palette_color_map,
         &best_rd, &best_model_rd, rate, rate_tokenonly, distortion, skippable,
         &beat_best_rd, ctx, ctx->blk_skip, ctx->tx_type_map);
diff --git a/av1/encoder/intra_mode_search.h b/av1/encoder/intra_mode_search.h
index a08c0a9..ab71401 100644
--- a/av1/encoder/intra_mode_search.h
+++ b/av1/encoder/intra_mode_search.h
@@ -9,6 +9,9 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+/*!\file
+ * \brief Declares high level functions to search through intra modes.
+ */
 #ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
 #define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
 
@@ -234,9 +237,6 @@
 int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
                             int bit_depth, int *val_count);
 
-// Resets palette color map for chroma channels.
-void av1_restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/intra_mode_search_utils.h b/av1/encoder/intra_mode_search_utils.h
new file mode 100644
index 0000000..131f3cf
--- /dev/null
+++ b/av1/encoder/intra_mode_search_utils.h
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Defines utility functions used in intra mode search.
+ *
+ * This includes rdcost estimations, histogram based pruning, etc.
+ */
+#ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
+#define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
+
+#include "av1/common/pred_common.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/palette.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+#define BINS 32
+static const float intra_hog_model_bias[DIRECTIONAL_MODES] = {
+  0.450578f,  0.695518f,  -0.717944f, -0.639894f,
+  -0.602019f, -0.453454f, 0.055857f,  -0.465480f,
+};
+
+static const float intra_hog_model_weights[BINS * DIRECTIONAL_MODES] = {
+  -3.076402f, -3.757063f, -3.275266f, -3.180665f, -3.452105f, -3.216593f,
+  -2.871212f, -3.134296f, -1.822324f, -2.401411f, -1.541016f, -1.195322f,
+  -0.434156f, 0.322868f,  2.260546f,  3.368715f,  3.989290f,  3.308487f,
+  2.277893f,  0.923793f,  0.026412f,  -0.385174f, -0.718622f, -1.408867f,
+  -1.050558f, -2.323941f, -2.225827f, -2.585453f, -3.054283f, -2.875087f,
+  -2.985709f, -3.447155f, 3.758139f,  3.204353f,  2.170998f,  0.826587f,
+  -0.269665f, -0.702068f, -1.085776f, -2.175249f, -1.623180f, -2.975142f,
+  -2.779629f, -3.190799f, -3.521900f, -3.375480f, -3.319355f, -3.897389f,
+  -3.172334f, -3.594528f, -2.879132f, -2.547777f, -2.921023f, -2.281844f,
+  -1.818988f, -2.041771f, -0.618268f, -1.396458f, -0.567153f, -0.285868f,
+  -0.088058f, 0.753494f,  2.092413f,  3.215266f,  -3.300277f, -2.748658f,
+  -2.315784f, -2.423671f, -2.257283f, -2.269583f, -2.196660f, -2.301076f,
+  -2.646516f, -2.271319f, -2.254366f, -2.300102f, -2.217960f, -2.473300f,
+  -2.116866f, -2.528246f, -3.314712f, -1.701010f, -0.589040f, -0.088077f,
+  0.813112f,  1.702213f,  2.653045f,  3.351749f,  3.243554f,  3.199409f,
+  2.437856f,  1.468854f,  0.533039f,  -0.099065f, -0.622643f, -2.200732f,
+  -4.228861f, -2.875263f, -1.273956f, -0.433280f, 0.803771f,  1.975043f,
+  3.179528f,  3.939064f,  3.454379f,  3.689386f,  3.116411f,  1.970991f,
+  0.798406f,  -0.628514f, -1.252546f, -2.825176f, -4.090178f, -3.777448f,
+  -3.227314f, -3.479403f, -3.320569f, -3.159372f, -2.729202f, -2.722341f,
+  -3.054913f, -2.742923f, -2.612703f, -2.662632f, -2.907314f, -3.117794f,
+  -3.102660f, -3.970972f, -4.891357f, -3.935582f, -3.347758f, -2.721924f,
+  -2.219011f, -1.702391f, -0.866529f, -0.153743f, 0.107733f,  1.416882f,
+  2.572884f,  3.607755f,  3.974820f,  3.997783f,  2.970459f,  0.791687f,
+  -1.478921f, -1.228154f, -1.216955f, -1.765932f, -1.951003f, -1.985301f,
+  -1.975881f, -1.985593f, -2.422371f, -2.419978f, -2.531288f, -2.951853f,
+  -3.071380f, -3.277027f, -3.373539f, -4.462010f, -0.967888f, 0.805524f,
+  2.794130f,  3.685984f,  3.745195f,  3.252444f,  2.316108f,  1.399146f,
+  -0.136519f, -0.162811f, -1.004357f, -1.667911f, -1.964662f, -2.937579f,
+  -3.019533f, -3.942766f, -5.102767f, -3.882073f, -3.532027f, -3.451956f,
+  -2.944015f, -2.643064f, -2.529872f, -2.077290f, -2.809965f, -1.803734f,
+  -1.783593f, -1.662585f, -1.415484f, -1.392673f, -0.788794f, -1.204819f,
+  -1.998864f, -1.182102f, -0.892110f, -1.317415f, -1.359112f, -1.522867f,
+  -1.468552f, -1.779072f, -2.332959f, -2.160346f, -2.329387f, -2.631259f,
+  -2.744936f, -3.052494f, -2.787363f, -3.442548f, -4.245075f, -3.032172f,
+  -2.061609f, -1.768116f, -1.286072f, -0.706587f, -0.192413f, 0.386938f,
+  0.716997f,  1.481393f,  2.216702f,  2.737986f,  3.109809f,  3.226084f,
+  2.490098f,  -0.095827f, -3.864816f, -3.507248f, -3.128925f, -2.908251f,
+  -2.883836f, -2.881411f, -2.524377f, -2.624478f, -2.399573f, -2.367718f,
+  -1.918255f, -1.926277f, -1.694584f, -1.723790f, -0.966491f, -1.183115f,
+  -1.430687f, 0.872896f,  2.766550f,  3.610080f,  3.578041f,  3.334928f,
+  2.586680f,  1.895721f,  1.122195f,  0.488519f,  -0.140689f, -0.799076f,
+  -1.222860f, -1.502437f, -1.900969f, -3.206816f,
+};
+
+static AOM_INLINE void generate_hog(const uint8_t *src, int stride, int rows,
+                                    int cols, float *hist) {
+  const float step = (float)PI / BINS;
+  float total = 0.1f;
+  src += stride;
+  for (int r = 1; r < rows - 1; ++r) {
+    for (int c = 1; c < cols - 1; ++c) {
+      const uint8_t *above = &src[c - stride];
+      const uint8_t *below = &src[c + stride];
+      const uint8_t *left = &src[c - 1];
+      const uint8_t *right = &src[c + 1];
+      // Calculate gradient using Sobel fitlers.
+      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+                     (left[-stride] + 2 * left[0] + left[stride]);
+      const int dy = (below[-1] + 2 * below[0] + below[1]) -
+                     (above[-1] + 2 * above[0] + above[1]);
+      if (dx == 0 && dy == 0) continue;
+      const int temp = abs(dx) + abs(dy);
+      if (!temp) continue;
+      total += temp;
+      if (dx == 0) {
+        hist[0] += temp / 2;
+        hist[BINS - 1] += temp / 2;
+      } else {
+        const float angle = atanf(dy * 1.0f / dx);
+        int idx = (int)roundf(angle / step) + BINS / 2;
+        idx = AOMMIN(idx, BINS - 1);
+        idx = AOMMAX(idx, 0);
+        hist[idx] += temp;
+      }
+    }
+    src += stride;
+  }
+
+  for (int i = 0; i < BINS; ++i) hist[i] /= total;
+}
+
+static AOM_INLINE void generate_hog_hbd(const uint8_t *src8, int stride,
+                                        int rows, int cols, float *hist) {
+  const float step = (float)PI / BINS;
+  float total = 0.1f;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  src += stride;
+  for (int r = 1; r < rows - 1; ++r) {
+    for (int c = 1; c < cols - 1; ++c) {
+      const uint16_t *above = &src[c - stride];
+      const uint16_t *below = &src[c + stride];
+      const uint16_t *left = &src[c - 1];
+      const uint16_t *right = &src[c + 1];
+      // Calculate gradient using Sobel fitlers.
+      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+                     (left[-stride] + 2 * left[0] + left[stride]);
+      const int dy = (below[-1] + 2 * below[0] + below[1]) -
+                     (above[-1] + 2 * above[0] + above[1]);
+      if (dx == 0 && dy == 0) continue;
+      const int temp = abs(dx) + abs(dy);
+      if (!temp) continue;
+      total += temp;
+      if (dx == 0) {
+        hist[0] += temp / 2;
+        hist[BINS - 1] += temp / 2;
+      } else {
+        const float angle = atanf(dy * 1.0f / dx);
+        int idx = (int)roundf(angle / step) + BINS / 2;
+        idx = AOMMIN(idx, BINS - 1);
+        idx = AOMMAX(idx, 0);
+        hist[idx] += temp;
+      }
+    }
+    src += stride;
+  }
+
+  for (int i = 0; i < BINS; ++i) hist[i] /= total;
+}
+
+static AOM_INLINE void prune_intra_mode_with_hog(
+    const MACROBLOCK *x, BLOCK_SIZE bsize, float th,
+    uint8_t *directional_mode_skip_mask) {
+  aom_clear_system_state();
+
+  const int bh = block_size_high[bsize];
+  const int bw = block_size_wide[bsize];
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const int rows =
+      (xd->mb_to_bottom_edge >= 0) ? bh : (xd->mb_to_bottom_edge >> 3) + bh;
+  const int cols =
+      (xd->mb_to_right_edge >= 0) ? bw : (xd->mb_to_right_edge >> 3) + bw;
+  const int src_stride = x->plane[0].src.stride;
+  const uint8_t *src = x->plane[0].src.buf;
+  float hist[BINS] = { 0.0f };
+  if (is_cur_buf_hbd(xd)) {
+    generate_hog_hbd(src, src_stride, rows, cols, hist);
+  } else {
+    generate_hog(src, src_stride, rows, cols, hist);
+  }
+
+  for (int i = 0; i < DIRECTIONAL_MODES; ++i) {
+    float this_score = intra_hog_model_bias[i];
+    const float *weights = &intra_hog_model_weights[i * BINS];
+    for (int j = 0; j < BINS; ++j) {
+      this_score += weights[j] * hist[j];
+    }
+    if (this_score < th) directional_mode_skip_mask[i + 1] = 1;
+  }
+
+  aom_clear_system_state();
+}
+#undef BINS
+
+// Returns the cost needed to send a uniformly distributed r.v.
+static AOM_INLINE int write_uniform_cost(int n, int v) {
+  const int l = get_unsigned_bits(n);
+  const int m = (1 << l) - n;
+  if (l == 0) return 0;
+  if (v < m)
+    return av1_cost_literal(l - 1);
+  else
+    return av1_cost_literal(l);
+}
+/*!\endcond */
+
+/*!\brief Returns the rate cost for luma prediction mode info of intra blocks.
+ *
+ * \callergraph
+ */
+static AOM_INLINE int intra_mode_info_cost_y(const AV1_COMP *cpi,
+                                             const MACROBLOCK *x,
+                                             const MB_MODE_INFO *mbmi,
+                                             BLOCK_SIZE bsize, int mode_cost) {
+  int total_rate = mode_cost;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0;
+  const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra;
+  const int use_intrabc = mbmi->use_intrabc;
+  // Can only activate one mode.
+  assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc +
+          use_filter_intra) <= 1);
+  const int try_palette = av1_allow_palette(
+      cpi->common.features.allow_screen_content_tools, mbmi->sb_type);
+  if (try_palette && mbmi->mode == DC_PRED) {
+    const MACROBLOCKD *xd = &x->e_mbd;
+    const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+    const int mode_ctx = av1_get_palette_mode_ctx(xd);
+    total_rate +=
+        mode_costs->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette];
+    if (use_palette) {
+      const uint8_t *const color_map = xd->plane[0].color_index_map;
+      int block_width, block_height, rows, cols;
+      av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                               &cols);
+      const int plt_size = mbmi->palette_mode_info.palette_size[0];
+      int palette_mode_cost =
+          mode_costs
+              ->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+          write_uniform_cost(plt_size, color_map[0]);
+      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+      const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+      palette_mode_cost +=
+          av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache,
+                                   n_cache, cpi->common.seq_params.bit_depth);
+      palette_mode_cost +=
+          av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP);
+      total_rate += palette_mode_cost;
+    }
+  }
+  if (av1_filter_intra_allowed(&cpi->common, mbmi)) {
+    total_rate +=
+        mode_costs->filter_intra_cost[mbmi->sb_type][use_filter_intra];
+    if (use_filter_intra) {
+      total_rate +=
+          mode_costs->filter_intra_mode_cost[mbmi->filter_intra_mode_info
+                                                 .filter_intra_mode];
+    }
+  }
+  if (av1_is_directional_mode(mbmi->mode)) {
+    if (av1_use_angle_delta(bsize)) {
+      total_rate +=
+          mode_costs->angle_delta_cost[mbmi->mode - V_PRED]
+                                      [MAX_ANGLE_DELTA +
+                                       mbmi->angle_delta[PLANE_TYPE_Y]];
+    }
+  }
+  if (av1_allow_intrabc(&cpi->common))
+    total_rate += mode_costs->intrabc_cost[use_intrabc];
+  return total_rate;
+}
+
+/*!\brief Return the rate cost for chroma prediction mode info of intra blocks.
+ *
+ * \callergraph
+ */
+static AOM_INLINE int intra_mode_info_cost_uv(const AV1_COMP *cpi,
+                                              const MACROBLOCK *x,
+                                              const MB_MODE_INFO *mbmi,
+                                              BLOCK_SIZE bsize, int mode_cost) {
+  int total_rate = mode_cost;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0;
+  const UV_PREDICTION_MODE mode = mbmi->uv_mode;
+  // Can only activate one mode.
+  assert(((mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1);
+
+  const int try_palette = av1_allow_palette(
+      cpi->common.features.allow_screen_content_tools, mbmi->sb_type);
+  if (try_palette && mode == UV_DC_PRED) {
+    const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
+    total_rate +=
+        mode_costs->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette];
+    if (use_palette) {
+      const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+      const int plt_size = pmi->palette_size[1];
+      const MACROBLOCKD *xd = &x->e_mbd;
+      const uint8_t *const color_map = xd->plane[1].color_index_map;
+      int palette_mode_cost =
+          mode_costs
+              ->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+          write_uniform_cost(plt_size, color_map[0]);
+      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+      const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+      palette_mode_cost += av1_palette_color_cost_uv(
+          pmi, color_cache, n_cache, cpi->common.seq_params.bit_depth);
+      palette_mode_cost +=
+          av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP);
+      total_rate += palette_mode_cost;
+    }
+  }
+  if (av1_is_directional_mode(get_uv_mode(mode))) {
+    if (av1_use_angle_delta(bsize)) {
+      total_rate +=
+          mode_costs->angle_delta_cost[mode - V_PRED]
+                                      [mbmi->angle_delta[PLANE_TYPE_UV] +
+                                       MAX_ANGLE_DELTA];
+    }
+  }
+  return total_rate;
+}
+
+/*!\cond */
+// Makes a quick luma prediction and estimate the rdcost with a model without
+// going through the whole txfm/quantize/itxfm process.
+static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                               BLOCK_SIZE bsize, int mode_cost) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  RD_STATS this_rd_stats;
+  int row, col;
+  int64_t temp_sse, this_rd;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+  TX_SIZE tx_size =
+      tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type);
+  const int stepr = tx_size_high_unit[tx_size];
+  const int stepc = tx_size_wide_unit[tx_size];
+  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+  const int max_blocks_high = max_block_high(xd, bsize, 0);
+  mbmi->tx_size = tx_size;
+  // Prediction.
+  for (row = 0; row < max_blocks_high; row += stepr) {
+    for (col = 0; col < max_blocks_wide; col += stepc) {
+      av1_predict_intra_block_facade(cm, xd, 0, col, row, tx_size);
+    }
+  }
+  // RD estimation.
+  model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model ? MODELRD_LEGACY
+                                                   : MODELRD_TYPE_INTRA](
+      cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate, &this_rd_stats.dist,
+      &this_rd_stats.skip_txfm, &temp_sse, NULL, NULL, NULL);
+  if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
+    mode_cost += mode_costs->angle_delta_cost[mbmi->mode - V_PRED]
+                                             [MAX_ANGLE_DELTA +
+                                              mbmi->angle_delta[PLANE_TYPE_Y]];
+  }
+  if (mbmi->mode == DC_PRED &&
+      av1_filter_intra_allowed_bsize(cm, mbmi->sb_type)) {
+    if (mbmi->filter_intra_mode_info.use_filter_intra) {
+      const int mode = mbmi->filter_intra_mode_info.filter_intra_mode;
+      mode_cost += mode_costs->filter_intra_cost[mbmi->sb_type][1] +
+                   mode_costs->filter_intra_mode_cost[mode];
+    } else {
+      mode_cost += mode_costs->filter_intra_cost[mbmi->sb_type][0];
+    }
+  }
+  this_rd =
+      RDCOST(x->rdmult, this_rd_stats.rate + mode_cost, this_rd_stats.dist);
+  return this_rd;
+}
+/*!\endcond */
+
+/*!\brief Estimate the luma rdcost of a given intra mode and try to prune it.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function first makes a quick luma prediction and estimates the rdcost
+ * with a model without going through the txfm, then try to prune the current
+ * mode if the new estimate y_rd > 1.5 * best_model_rd.
+ *
+ * \return Returns 1 if the given mode is prune; 0 otherwise.
+ */
+static AOM_INLINE int model_intra_yrd_and_prune(const AV1_COMP *const cpi,
+                                                MACROBLOCK *x, BLOCK_SIZE bsize,
+                                                int mode_info_cost,
+                                                int64_t *best_model_rd) {
+  const int64_t this_model_rd = intra_model_yrd(cpi, x, bsize, mode_info_cost);
+  if (*best_model_rd != INT64_MAX &&
+      this_model_rd > *best_model_rd + (*best_model_rd >> 1)) {
+    return 1;
+  } else if (this_model_rd < *best_model_rd) {
+    *best_model_rd = this_model_rd;
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
diff --git a/av1/encoder/palette.c b/av1/encoder/palette.c
index e61cd02..21506ba 100644
--- a/av1/encoder/palette.c
+++ b/av1/encoder/palette.c
@@ -12,9 +12,17 @@
 #include <math.h>
 #include <stdlib.h>
 
+#include "av1/common/pred_common.h"
+
+#include "av1/encoder/block.h"
 #include "av1/encoder/cost.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/intra_mode_search_utils.h"
 #include "av1/encoder/palette.h"
 #include "av1/encoder/random.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/tx_search.h"
 
 #define AV1_K_MEANS_DIM 1
 #include "av1/encoder/k_means_template.h"
@@ -152,3 +160,688 @@
   total_bits += 1 + AOMMIN(bits_using_delta, bits_using_raw);
   return av1_cost_literal(total_bits);
 }
+
+// Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x
+// new_height'. Extra rows and columns are filled in by copying last valid
+// row/column.
+static AOM_INLINE void extend_palette_color_map(uint8_t *const color_map,
+                                                int orig_width, int orig_height,
+                                                int new_width, int new_height) {
+  int j;
+  assert(new_width >= orig_width);
+  assert(new_height >= orig_height);
+  if (new_width == orig_width && new_height == orig_height) return;
+
+  for (j = orig_height - 1; j >= 0; --j) {
+    memmove(color_map + j * new_width, color_map + j * orig_width, orig_width);
+    // Copy last column to extra columns.
+    memset(color_map + j * new_width + orig_width,
+           color_map[j * new_width + orig_width - 1], new_width - orig_width);
+  }
+  // Copy last row to extra rows.
+  for (j = orig_height; j < new_height; ++j) {
+    memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width,
+           new_width);
+  }
+}
+
+// Bias toward using colors in the cache.
+// TODO(huisu): Try other schemes to improve compression.
+static AOM_INLINE void optimize_palette_colors(uint16_t *color_cache,
+                                               int n_cache, int n_colors,
+                                               int stride, int *centroids) {
+  if (n_cache <= 0) return;
+  for (int i = 0; i < n_colors * stride; i += stride) {
+    int min_diff = abs(centroids[i] - (int)color_cache[0]);
+    int idx = 0;
+    for (int j = 1; j < n_cache; ++j) {
+      const int this_diff = abs(centroids[i] - color_cache[j]);
+      if (this_diff < min_diff) {
+        min_diff = this_diff;
+        idx = j;
+      }
+    }
+    if (min_diff <= 1) centroids[i] = color_cache[idx];
+  }
+}
+
+// Given the base colors as specified in centroids[], calculate the RD cost
+// of palette mode.
+static AOM_INLINE void palette_rd_y(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n,
+    uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
+    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
+    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
+    int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip,
+    uint8_t *tx_type_map, int *beat_best_pallette_rd) {
+  optimize_palette_colors(color_cache, n_cache, n, 1, centroids);
+  const int num_unique_colors = av1_remove_duplicates(centroids, n);
+  if (num_unique_colors < PALETTE_MIN_SIZE) {
+    // Too few unique colors to create a palette. And DC_PRED will work
+    // well for that case anyway. So skip.
+    return;
+  }
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  if (cpi->common.seq_params.use_highbitdepth) {
+    for (int i = 0; i < num_unique_colors; ++i) {
+      pmi->palette_colors[i] = clip_pixel_highbd(
+          (int)centroids[i], cpi->common.seq_params.bit_depth);
+    }
+  } else {
+    for (int i = 0; i < num_unique_colors; ++i) {
+      pmi->palette_colors[i] = clip_pixel(centroids[i]);
+    }
+  }
+  pmi->palette_size[0] = num_unique_colors;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  uint8_t *const color_map = xd->plane[0].color_index_map;
+  int block_width, block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                           &cols);
+  av1_calc_indices(data, centroids, color_map, rows * cols, num_unique_colors,
+                   1);
+  extend_palette_color_map(color_map, cols, rows, block_width, block_height);
+
+  const int palette_mode_cost =
+      intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost);
+  if (model_intra_yrd_and_prune(cpi, x, bsize, palette_mode_cost,
+                                best_model_rd)) {
+    return;
+  }
+
+  RD_STATS tokenonly_rd_stats;
+  av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+                                    *best_rd);
+  if (tokenonly_rd_stats.rate == INT_MAX) return;
+  int this_rate = tokenonly_rd_stats.rate + palette_mode_cost;
+  int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) {
+    tokenonly_rd_stats.rate -= tx_size_cost(x, bsize, mbmi->tx_size);
+  }
+  // Collect mode stats for multiwinner mode processing
+  const int txfm_search_done = 1;
+  store_winner_mode_stats(
+      &cpi->common, x, mbmi, NULL, NULL, NULL, THR_DC, color_map, bsize,
+      this_rd, cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
+      txfm_search_done);
+  if (this_rd < *best_rd) {
+    *best_rd = this_rd;
+    // Setting beat_best_rd flag because current mode rd is better than best_rd.
+    // This flag need to be updated only for palette evaluation in key frames
+    if (beat_best_rd) *beat_best_rd = 1;
+    memcpy(best_palette_color_map, color_map,
+           block_width * block_height * sizeof(color_map[0]));
+    *best_mbmi = *mbmi;
+    memcpy(blk_skip, x->txfm_search_info.blk_skip,
+           sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+    av1_copy_array(tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+    if (rate) *rate = this_rate;
+    if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
+    if (distortion) *distortion = tokenonly_rd_stats.dist;
+    if (skippable) *skippable = tokenonly_rd_stats.skip_txfm;
+    if (beat_best_pallette_rd) *beat_best_pallette_rd = 1;
+  }
+}
+
+static AOM_INLINE int perform_top_color_coarse_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data,
+    const int *const top_colors, int start_n, int end_n, int step_size,
+    uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
+    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
+    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
+    int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
+    uint8_t *tx_type_map) {
+  int centroids[PALETTE_MAX_SIZE];
+  int n = start_n;
+  int top_color_winner = end_n + 1;
+  while (1) {
+    int beat_best_pallette_rd = 0;
+    for (int i = 0; i < n; ++i) centroids[i] = top_colors[i];
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, best_mbmi, best_palette_color_map,
+                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                 &beat_best_pallette_rd);
+    // Break if current palette colors is not winning
+    if (beat_best_pallette_rd) top_color_winner = n;
+    n += step_size;
+    if (n > end_n) break;
+  }
+  return top_color_winner;
+}
+
+static AOM_INLINE int perform_k_means_coarse_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int lb, int ub,
+    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
+    int data_points) {
+  int centroids[PALETTE_MAX_SIZE];
+  const int max_itr = 50;
+  int n = start_n;
+  int k_means_winner = end_n + 1;
+  while (1) {
+    int beat_best_pallette_rd = 0;
+    for (int i = 0; i < n; ++i) {
+      centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
+    }
+    av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, best_mbmi, best_palette_color_map,
+                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                 &beat_best_pallette_rd);
+    // Break if current palette colors is not winning
+    if (beat_best_pallette_rd) k_means_winner = n;
+    n += step_size;
+    if (n > end_n) break;
+  }
+  return k_means_winner;
+}
+
+// Perform palette search for top colors from minimum palette colors (/maximum)
+// with a step-size of 1 (/-1)
+static AOM_INLINE int perform_top_color_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *top_colors,
+    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map) {
+  int centroids[PALETTE_MAX_SIZE];
+  int n = start_n;
+  assert((step_size == -1) || (step_size == 1) || (step_size == 0) ||
+         (step_size == 2));
+  assert(IMPLIES(step_size == -1, start_n > end_n));
+  assert(IMPLIES(step_size == 1, start_n < end_n));
+  while (1) {
+    int beat_best_pallette_rd = 0;
+    for (int i = 0; i < n; ++i) centroids[i] = top_colors[i];
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, best_mbmi, best_palette_color_map,
+                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                 &beat_best_pallette_rd);
+    // Break if current palette colors is not winning
+    if ((cpi->sf.intra_sf.prune_palette_search_level == 2) &&
+        !beat_best_pallette_rd)
+      return n;
+    n += step_size;
+    if (n == end_n) break;
+  }
+  return n;
+}
+// Perform k-means based palette search from minimum palette colors (/maximum)
+// with a step-size of 1 (/-1)
+static AOM_INLINE int perform_k_means_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int lb, int ub,
+    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
+    int data_points) {
+  int centroids[PALETTE_MAX_SIZE];
+  const int max_itr = 50;
+  int n = start_n;
+  assert((step_size == -1) || (step_size == 1) || (step_size == 0) ||
+         (step_size == 2));
+  assert(IMPLIES(step_size == -1, start_n > end_n));
+  assert(IMPLIES(step_size == 1, start_n < end_n));
+  assert(start_n > 0);
+  while (1) {
+    int beat_best_pallette_rd = 0;
+    for (int i = 0; i < n; ++i) {
+      centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
+    }
+    av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, best_mbmi, best_palette_color_map,
+                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                 &beat_best_pallette_rd);
+    // Break if current palette colors is not winning
+    if ((cpi->sf.intra_sf.prune_palette_search_level == 2) &&
+        !beat_best_pallette_rd)
+      return n;
+    n += step_size;
+    if (n == end_n) break;
+  }
+  return n;
+}
+
+#define START_N_STAGE2(x)                         \
+  ((x == PALETTE_MIN_SIZE) ? PALETTE_MIN_SIZE + 1 \
+                           : AOMMAX(x - 1, PALETTE_MIN_SIZE));
+#define END_N_STAGE2(x, end_n) \
+  ((x == end_n) ? x - 1 : AOMMIN(x + 1, PALETTE_MAX_SIZE));
+
+static AOM_INLINE void update_start_end_stage_2(int *start_n_stage2,
+                                                int *end_n_stage2,
+                                                int *step_size_stage2,
+                                                int winner, int end_n) {
+  *start_n_stage2 = START_N_STAGE2(winner);
+  *end_n_stage2 = END_N_STAGE2(winner, end_n);
+  *step_size_stage2 = *end_n_stage2 - *start_n_stage2;
+}
+
+// Start index and step size below are chosen to evaluate unique
+// candidates in neighbor search, in case a winner candidate is found in
+// coarse search. Example,
+// 1) 8 colors (end_n = 8): 2,3,4,5,6,7,8. start_n is chosen as 2 and step
+// size is chosen as 3. Therefore, coarse search will evaluate 2, 5 and 8.
+// If winner is found at 5, then 4 and 6 are evaluated. Similarly, for 2
+// (3) and 8 (7).
+// 2) 7 colors (end_n = 7): 2,3,4,5,6,7. If start_n is chosen as 2 (same
+// as for 8 colors) then step size should also be 2, to cover all
+// candidates. Coarse search will evaluate 2, 4 and 6. If winner is either
+// 2 or 4, 3 will be evaluated. Instead, if start_n=3 and step_size=3,
+// coarse search will evaluate 3 and 6. For the winner, unique neighbors
+// (3: 2,4 or 6: 5,7) would be evaluated.
+
+// start index for coarse palette search for dominant colors and k-means
+static const uint8_t start_n_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
+                                                                    3, 3, 2,
+                                                                    3, 3, 2 };
+// step size for coarse palette search for dominant colors and k-means
+static const uint8_t step_size_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
+                                                                      3, 3, 3,
+                                                                      3, 3, 3 };
+
+void av1_rd_pick_palette_intra_sby(
+    const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int dc_mode_cost,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+                           bsize));
+
+  const int src_stride = x->plane[0].src.stride;
+  const uint8_t *const src = x->plane[0].src.buf;
+  int block_width, block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                           &cols);
+  const SequenceHeader *const seq_params = &cpi->common.seq_params;
+  const int is_hbd = seq_params->use_highbitdepth;
+  const int bit_depth = seq_params->bit_depth;
+  int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
+  int colors;
+  if (is_hbd) {
+    colors = av1_count_colors_highbd(src, src_stride, rows, cols, bit_depth,
+                                     count_buf);
+  } else {
+    colors = av1_count_colors(src, src_stride, rows, cols, count_buf);
+  }
+
+  uint8_t *const color_map = xd->plane[0].color_index_map;
+  if (colors > 1 && colors <= 64) {
+    int *const data = x->palette_buffer->kmeans_data_buf;
+    int centroids[PALETTE_MAX_SIZE];
+    int lb, ub;
+    if (is_hbd) {
+      int *data_pt = data;
+      const uint16_t *src_pt = CONVERT_TO_SHORTPTR(src);
+      lb = ub = src_pt[0];
+      for (int r = 0; r < rows; ++r) {
+        for (int c = 0; c < cols; ++c) {
+          const int val = src_pt[c];
+          data_pt[c] = val;
+          lb = AOMMIN(lb, val);
+          ub = AOMMAX(ub, val);
+        }
+        src_pt += src_stride;
+        data_pt += cols;
+      }
+    } else {
+      int *data_pt = data;
+      const uint8_t *src_pt = src;
+      lb = ub = src[0];
+      for (int r = 0; r < rows; ++r) {
+        for (int c = 0; c < cols; ++c) {
+          const int val = src_pt[c];
+          data_pt[c] = val;
+          lb = AOMMIN(lb, val);
+          ub = AOMMAX(ub, val);
+        }
+        src_pt += src_stride;
+        data_pt += cols;
+      }
+    }
+
+    mbmi->mode = DC_PRED;
+    mbmi->filter_intra_mode_info.use_filter_intra = 0;
+
+    uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+    const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+
+    // Find the dominant colors, stored in top_colors[].
+    int top_colors[PALETTE_MAX_SIZE] = { 0 };
+    for (int i = 0; i < AOMMIN(colors, PALETTE_MAX_SIZE); ++i) {
+      int max_count = 0;
+      for (int j = 0; j < (1 << bit_depth); ++j) {
+        if (count_buf[j] > max_count) {
+          max_count = count_buf[j];
+          top_colors[i] = j;
+        }
+      }
+      assert(max_count > 0);
+      count_buf[top_colors[i]] = 0;
+    }
+
+    // Try the dominant colors directly.
+    // TODO(huisu@google.com): Try to avoid duplicate computation in cases
+    // where the dominant colors and the k-means results are similar.
+    if ((cpi->sf.intra_sf.prune_palette_search_level == 1) &&
+        (colors > PALETTE_MIN_SIZE)) {
+      const int end_n = AOMMIN(colors, PALETTE_MAX_SIZE);
+      assert(PALETTE_MAX_SIZE == 8);
+      assert(PALETTE_MIN_SIZE == 2);
+      // Choose the start index and step size for coarse search based on number
+      // of colors
+      const int start_n = start_n_lookup_table[end_n];
+      const int step_size = step_size_lookup_table[end_n];
+      // Perform top color coarse palette search to find the winner candidate
+      const int top_color_winner = perform_top_color_coarse_palette_search(
+          cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n, end_n,
+          step_size, color_cache, n_cache, best_mbmi, best_palette_color_map,
+          best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable,
+          beat_best_rd, ctx, best_blk_skip, tx_type_map);
+      // Evaluate neighbors for the winner color (if winner is found) in the
+      // above coarse search for dominant colors
+      if (top_color_winner <= end_n) {
+        int start_n_stage2, end_n_stage2, step_size_stage2;
+        update_start_end_stage_2(&start_n_stage2, &end_n_stage2,
+                                 &step_size_stage2, top_color_winner, end_n);
+        // perform finer search for the winner candidate
+        perform_top_color_palette_search(
+            cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n_stage2,
+            end_n_stage2 + step_size_stage2, step_size_stage2, color_cache,
+            n_cache, best_mbmi, best_palette_color_map, best_rd, best_model_rd,
+            rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+            best_blk_skip, tx_type_map);
+      }
+      // K-means clustering.
+      // Perform k-means coarse palette search to find the winner candidate
+      const int k_means_winner = perform_k_means_coarse_palette_search(
+          cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n, end_n,
+          step_size, color_cache, n_cache, best_mbmi, best_palette_color_map,
+          best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable,
+          beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map,
+          rows * cols);
+      // Evaluate neighbors for the winner color (if winner is found) in the
+      // above coarse search for k-means
+      if (k_means_winner <= end_n) {
+        int start_n_stage2, end_n_stage2, step_size_stage2;
+        update_start_end_stage_2(&start_n_stage2, &end_n_stage2,
+                                 &step_size_stage2, k_means_winner, end_n);
+        // perform finer search for the winner candidate
+        perform_k_means_palette_search(
+            cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n_stage2,
+            end_n_stage2 + step_size_stage2, step_size_stage2, color_cache,
+            n_cache, best_mbmi, best_palette_color_map, best_rd, best_model_rd,
+            rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+            best_blk_skip, tx_type_map, color_map, rows * cols);
+      }
+    } else {
+      const int start_n = AOMMIN(colors, PALETTE_MAX_SIZE),
+                end_n = PALETTE_MIN_SIZE;
+      // Perform top color palette search from start_n
+      const int top_color_winner = perform_top_color_palette_search(
+          cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n,
+          end_n - 1, -1, color_cache, n_cache, best_mbmi,
+          best_palette_color_map, best_rd, best_model_rd, rate, rate_tokenonly,
+          distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map);
+
+      if (top_color_winner > end_n) {
+        // Perform top color palette search in reverse order for the remaining
+        // colors
+        perform_top_color_palette_search(
+            cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, end_n,
+            top_color_winner, 1, color_cache, n_cache, best_mbmi,
+            best_palette_color_map, best_rd, best_model_rd, rate,
+            rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+            best_blk_skip, tx_type_map);
+      }
+      // K-means clustering.
+      if (colors == PALETTE_MIN_SIZE) {
+        // Special case: These colors automatically become the centroids.
+        assert(colors == 2);
+        centroids[0] = lb;
+        centroids[1] = ub;
+        palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, colors,
+                     color_cache, n_cache, best_mbmi, best_palette_color_map,
+                     best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                     skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                     NULL);
+      } else {
+        // Perform k-means palette search from start_n
+        const int k_means_winner = perform_k_means_palette_search(
+            cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n, end_n - 1,
+            -1, color_cache, n_cache, best_mbmi, best_palette_color_map,
+            best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable,
+            beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map,
+            rows * cols);
+        if (k_means_winner > end_n) {
+          // Perform k-means palette search in reverse order for the remaining
+          // colors
+          perform_k_means_palette_search(
+              cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, end_n,
+              k_means_winner, 1, color_cache, n_cache, best_mbmi,
+              best_palette_color_map, best_rd, best_model_rd, rate,
+              rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+              best_blk_skip, tx_type_map, color_map, rows * cols);
+        }
+      }
+    }
+  }
+
+  if (best_mbmi->palette_mode_info.palette_size[0] > 0) {
+    memcpy(color_map, best_palette_color_map,
+           block_width * block_height * sizeof(best_palette_color_map[0]));
+  }
+  *mbmi = *best_mbmi;
+}
+
+void av1_rd_pick_palette_intra_sbuv(const AV1_COMP *cpi, MACROBLOCK *x,
+                                    int dc_mode_cost,
+                                    uint8_t *best_palette_color_map,
+                                    MB_MODE_INFO *const best_mbmi,
+                                    int64_t *best_rd, int *rate,
+                                    int *rate_tokenonly, int64_t *distortion,
+                                    int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+                           mbmi->sb_type));
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const SequenceHeader *const seq_params = &cpi->common.seq_params;
+  int this_rate;
+  int64_t this_rd;
+  int colors_u, colors_v, colors;
+  const int src_stride = x->plane[1].src.stride;
+  const uint8_t *const src_u = x->plane[1].src.buf;
+  const uint8_t *const src_v = x->plane[2].src.buf;
+  uint8_t *const color_map = xd->plane[1].color_index_map;
+  RD_STATS tokenonly_rd_stats;
+  int plane_block_width, plane_block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+                           &plane_block_height, &rows, &cols);
+
+  mbmi->uv_mode = UV_DC_PRED;
+
+  int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
+  if (seq_params->use_highbitdepth) {
+    colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols,
+                                       seq_params->bit_depth, count_buf);
+    colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols,
+                                       seq_params->bit_depth, count_buf);
+  } else {
+    colors_u = av1_count_colors(src_u, src_stride, rows, cols, count_buf);
+    colors_v = av1_count_colors(src_v, src_stride, rows, cols, count_buf);
+  }
+
+  uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+  const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+
+  colors = colors_u > colors_v ? colors_u : colors_v;
+  if (colors > 1 && colors <= 64) {
+    int r, c, n, i, j;
+    const int max_itr = 50;
+    int lb_u, ub_u, val_u;
+    int lb_v, ub_v, val_v;
+    int *const data = x->palette_buffer->kmeans_data_buf;
+    int centroids[2 * PALETTE_MAX_SIZE];
+
+    uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
+    uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
+    if (seq_params->use_highbitdepth) {
+      lb_u = src_u16[0];
+      ub_u = src_u16[0];
+      lb_v = src_v16[0];
+      ub_v = src_v16[0];
+    } else {
+      lb_u = src_u[0];
+      ub_u = src_u[0];
+      lb_v = src_v[0];
+      ub_v = src_v[0];
+    }
+
+    for (r = 0; r < rows; ++r) {
+      for (c = 0; c < cols; ++c) {
+        if (seq_params->use_highbitdepth) {
+          val_u = src_u16[r * src_stride + c];
+          val_v = src_v16[r * src_stride + c];
+          data[(r * cols + c) * 2] = val_u;
+          data[(r * cols + c) * 2 + 1] = val_v;
+        } else {
+          val_u = src_u[r * src_stride + c];
+          val_v = src_v[r * src_stride + c];
+          data[(r * cols + c) * 2] = val_u;
+          data[(r * cols + c) * 2 + 1] = val_v;
+        }
+        if (val_u < lb_u)
+          lb_u = val_u;
+        else if (val_u > ub_u)
+          ub_u = val_u;
+        if (val_v < lb_v)
+          lb_v = val_v;
+        else if (val_v > ub_v)
+          ub_v = val_v;
+      }
+    }
+
+    for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
+         --n) {
+      for (i = 0; i < n; ++i) {
+        centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
+        centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
+      }
+      av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
+      optimize_palette_colors(color_cache, n_cache, n, 2, centroids);
+      // Sort the U channel colors in ascending order.
+      for (i = 0; i < 2 * (n - 1); i += 2) {
+        int min_idx = i;
+        int min_val = centroids[i];
+        for (j = i + 2; j < 2 * n; j += 2)
+          if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
+        if (min_idx != i) {
+          int temp_u = centroids[i], temp_v = centroids[i + 1];
+          centroids[i] = centroids[min_idx];
+          centroids[i + 1] = centroids[min_idx + 1];
+          centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
+        }
+      }
+      av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
+      extend_palette_color_map(color_map, cols, rows, plane_block_width,
+                               plane_block_height);
+      pmi->palette_size[1] = n;
+      for (i = 1; i < 3; ++i) {
+        for (j = 0; j < n; ++j) {
+          if (seq_params->use_highbitdepth)
+            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
+                (int)centroids[j * 2 + i - 1], seq_params->bit_depth);
+          else
+            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
+                clip_pixel((int)centroids[j * 2 + i - 1]);
+        }
+      }
+
+      av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+      if (tokenonly_rd_stats.rate == INT_MAX) continue;
+      this_rate = tokenonly_rd_stats.rate +
+                  intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
+      this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+      if (this_rd < *best_rd) {
+        *best_rd = this_rd;
+        *best_mbmi = *mbmi;
+        memcpy(best_palette_color_map, color_map,
+               plane_block_width * plane_block_height *
+                   sizeof(best_palette_color_map[0]));
+        *rate = this_rate;
+        *distortion = tokenonly_rd_stats.dist;
+        *rate_tokenonly = tokenonly_rd_stats.rate;
+        *skippable = tokenonly_rd_stats.skip_txfm;
+      }
+    }
+  }
+  if (best_mbmi->palette_mode_info.palette_size[1] > 0) {
+    memcpy(color_map, best_palette_color_map,
+           plane_block_width * plane_block_height *
+               sizeof(best_palette_color_map[0]));
+  }
+}
+
+void av1_restore_uv_color_map(const AV1_COMP *cpi, MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  int src_stride = x->plane[1].src.stride;
+  const uint8_t *const src_u = x->plane[1].src.buf;
+  const uint8_t *const src_v = x->plane[2].src.buf;
+  int *const data = x->palette_buffer->kmeans_data_buf;
+  int centroids[2 * PALETTE_MAX_SIZE];
+  uint8_t *const color_map = xd->plane[1].color_index_map;
+  int r, c;
+  const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
+  const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
+  int plane_block_width, plane_block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+                           &plane_block_height, &rows, &cols);
+
+  for (r = 0; r < rows; ++r) {
+    for (c = 0; c < cols; ++c) {
+      if (cpi->common.seq_params.use_highbitdepth) {
+        data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
+        data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
+      } else {
+        data[(r * cols + c) * 2] = src_u[r * src_stride + c];
+        data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
+      }
+    }
+  }
+
+  for (r = 1; r < 3; ++r) {
+    for (c = 0; c < pmi->palette_size[1]; ++c) {
+      centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c];
+    }
+  }
+
+  av1_calc_indices(data, centroids, color_map, rows * cols,
+                   pmi->palette_size[1], 2);
+  extend_palette_color_map(color_map, cols, rows, plane_block_width,
+                           plane_block_height);
+}
diff --git a/av1/encoder/palette.h b/av1/encoder/palette.h
index 8f21b1d..49e682e 100644
--- a/av1/encoder/palette.h
+++ b/av1/encoder/palette.h
@@ -18,6 +18,10 @@
 extern "C" {
 #endif
 
+struct AV1_COMP;
+struct PICK_MODE_CONTEXT;
+struct macroblock;
+
 #define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim
 
 void AV1_K_MEANS_RENAME(av1_calc_indices, 1)(const int *data,
@@ -37,6 +41,8 @@
 // calculate the centroid 'indices' for the data points.
 static INLINE void av1_calc_indices(const int *data, const int *centroids,
                                     uint8_t *indices, int n, int k, int dim) {
+  assert(n > 0);
+  assert(k > 0);
   if (dim == 1) {
     AV1_K_MEANS_RENAME(av1_calc_indices, 1)(data, centroids, indices, n, k);
   } else if (dim == 2) {
@@ -53,6 +59,8 @@
 static INLINE void av1_k_means(const int *data, int *centroids,
                                uint8_t *indices, int n, int k, int dim,
                                int max_itr) {
+  assert(n > 0);
+  assert(k > 0);
   if (dim == 1) {
     AV1_K_MEANS_RENAME(av1_k_means, 1)(data, centroids, indices, n, k, max_itr);
   } else if (dim == 2) {
@@ -88,6 +96,38 @@
 int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
                               uint16_t *color_cache, int n_cache,
                               int bit_depth);
+
+/*!\brief Search for the best palette in the luma plane.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function is used in both inter and intra frame coding.
+ */
+void av1_rd_pick_palette_intra_sby(
+    const struct AV1_COMP *cpi, struct macroblock *x, BLOCK_SIZE bsize,
+    int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+    int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly,
+    int64_t *distortion, int *skippable, int *beat_best_rd,
+    struct PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
+    uint8_t *tx_type_map);
+
+/*!\brief Search for the best palette in the chroma plane.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function is used in both inter and intra frame coding.
+ */
+void av1_rd_pick_palette_intra_sbuv(const struct AV1_COMP *cpi,
+                                    struct macroblock *x, int dc_mode_cost,
+                                    uint8_t *best_palette_color_map,
+                                    MB_MODE_INFO *const best_mbmi,
+                                    int64_t *best_rd, int *rate,
+                                    int *rate_tokenonly, int64_t *distortion,
+                                    int *skippable);
+
+// Resets palette color map for chroma channels.
+void av1_restore_uv_color_map(const struct AV1_COMP *cpi, struct macroblock *x);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/docs.cmake b/docs.cmake
index bd339c9..41b448a 100644
--- a/docs.cmake
+++ b/docs.cmake
@@ -120,7 +120,10 @@
       "${AOM_ROOT}/av1/encoder/firstpass.h"
       "${AOM_ROOT}/av1/encoder/intra_mode_search.h"
       "${AOM_ROOT}/av1/encoder/intra_mode_search.c"
+      "${AOM_ROOT}/av1/encoder/intra_mode_search_utils.h"
       "${AOM_ROOT}/av1/encoder/lookahead.h"
+      "${AOM_ROOT}/av1/encoder/palette.h"
+      "${AOM_ROOT}/av1/encoder/palette.c"
       "${AOM_ROOT}/av1/encoder/picklpf.h"
       "${AOM_ROOT}/av1/encoder/rdopt.h"
       "${AOM_ROOT}/av1/encoder/rdopt.c"