Enable more intra mode search

In CWG-E171, we allow more intra mode searches for the encoder. This patch applies the same change to "research-alt-v1-anchor".

Note that, to match the changes to the intra mode search, a previous change that restructures the intra mode search is also included.

STATS_CHANGED
diff --git a/av1/common/enums.h b/av1/common/enums.h
index 1a37cbb..bbf109d 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -465,6 +465,15 @@
   UV_MODE_INVALID,  // For uv_mode in inter blocks
 } UENUM1BYTE(UV_PREDICTION_MODE);
 
+// Number of top model rd values kept to prune y modes in intra mode decision
+#define TOP_INTRA_MODEL_COUNT 6
+#define TOP_TX_PART_COUNT 4
+// Total number of luma intra prediction modes (including both directional and
+// non-directional modes):
+// 61 = (PAETH_PRED - DC_PRED + 1) + 6 * 8,
+// because each of the 8 directional modes has 6 additional delta angles.
+#define LUMA_MODE_COUNT 61
+
 enum {
   SIMPLE_TRANSLATION,
   OBMC_CAUSAL,    // 2-sided OBMC
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 5196c56..aa36867 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -1177,6 +1177,11 @@
   unsigned int source_variance;
   //! SSE of the current predictor.
   unsigned int pred_sse[REF_FRAMES];
+
+  /*! \brief Whether to prune current transform partition search. */
+  int prune_tx_partition;
+  /*! \brief Keep records of top rdcosts of transform partition search. */
+  int64_t top_tx_part_rd[TOP_TX_PART_COUNT];
   /**@}*/
 #if CONFIG_SCC_DETERMINATION
   /*!\brief Number of pixels in current thread that choose palette mode in the
diff --git a/av1/encoder/intra_mode_search.c b/av1/encoder/intra_mode_search.c
index f40fe96..e6b105c 100644
--- a/av1/encoder/intra_mode_search.c
+++ b/av1/encoder/intra_mode_search.c
@@ -16,80 +16,6 @@
 #include "av1/encoder/palette.h"
 #include "av1/encoder/tx_search.h"
 
-/*!\cond */
-static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = {
-  DC_PRED,       H_PRED,        V_PRED,    SMOOTH_PRED, PAETH_PRED,
-  SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED,   D157_PRED,
-  D67_PRED,      D113_PRED,     D45_PRED,
-};
-
-static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
-  UV_DC_PRED,     UV_CFL_PRED,   UV_H_PRED,        UV_V_PRED,
-  UV_SMOOTH_PRED, UV_PAETH_PRED, UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED,
-  UV_D135_PRED,   UV_D203_PRED,  UV_D157_PRED,     UV_D67_PRED,
-  UV_D113_PRED,   UV_D45_PRED,
-};
-/*!\endcond */
-
-/*!\brief Calculate the rdcost of a given luma intra angle
- *
- * \ingroup intra_mode_search
- * \callergraph
- * This function runs rd calculation for a given luma intra prediction angle.
- * This is used to select the best angle delta.
- *
- * \return Returns the rdcost of the angle and updates the mbmi if the
- * new rdcost is better.
- */
-static int64_t calc_rd_given_intra_angle(
-    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost,
-    int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate,
-    RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size,
-    int64_t *best_rd, int64_t *best_model_rd, uint8_t *best_tx_type_map,
-    uint8_t *best_blk_skip, int skip_model_rd) {
-  RD_STATS tokenonly_rd_stats;
-  int64_t this_rd;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  const int n4 = bsize_to_num_blk(bsize);
-  assert(!is_inter_block(mbmi));
-  mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta;
-  if (!skip_model_rd) {
-    if (model_intra_yrd_and_prune(cpi, x, bsize, mode_cost, best_model_rd)) {
-      return INT64_MAX;
-    }
-  }
-  av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
-                                    best_rd_in);
-  if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX;
-#if CONFIG_SDP
-  int this_rate =
-      mode_cost + tokenonly_rd_stats.rate +
-      x->mode_costs.angle_delta_cost[PLANE_TYPE_Y][mbmi->mode - V_PRED]
-                                    [max_angle_delta + angle_delta];
-#else
-  int this_rate =
-      mode_cost + tokenonly_rd_stats.rate +
-      x->mode_costs
-          .angle_delta_cost[mbmi->mode - V_PRED][max_angle_delta + angle_delta];
-#endif
-  this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
-
-  if (this_rd < *best_rd) {
-    memcpy(best_blk_skip, x->txfm_search_info.blk_skip,
-           sizeof(best_blk_skip[0]) * n4);
-    av1_copy_array(best_tx_type_map, xd->tx_type_map, n4);
-    *best_rd = this_rd;
-    *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_Y];
-    *best_tx_size = mbmi->tx_size;
-    *rate = this_rate;
-    rd_stats->rate = tokenonly_rd_stats.rate;
-    rd_stats->dist = tokenonly_rd_stats.dist;
-    rd_stats->skip_txfm = tokenonly_rd_stats.skip_txfm;
-  }
-  return this_rd;
-}
-
 /*!\brief Search for the best filter_intra mode when coding intra frame.
  *
  * \ingroup intra_mode_search
@@ -125,6 +51,8 @@
     if (model_intra_yrd_and_prune(cpi, x, bsize, mode_cost, best_model_rd)) {
       continue;
     }
+    x->prune_tx_partition = 0;
+
     av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
                                       *best_rd);
     if (tokenonly_rd_stats.rate == INT_MAX) continue;
@@ -225,6 +153,39 @@
   }
 }
 
+void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi) {
+  if (mode_idx < INTRA_MODE_END) {  // Base mode, used with a zero delta angle.
+    mbmi->mode = intra_rd_search_mode_order[mode_idx];
+    mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+  } else {  // Directional mode paired with one non-zero delta angle.
+    const int dir_idx = mode_idx - INTRA_MODE_END;
+    const int shifted = dir_idx % (MAX_ANGLE_DELTA * 2) - MAX_ANGLE_DELTA;
+    mbmi->mode = dir_idx / (MAX_ANGLE_DELTA * 2) + V_PRED;
+    mbmi->angle_delta[PLANE_TYPE_Y] = shifted + (shifted >= 0);  // Skips 0.
+  }
+}
+
+int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd,
+                       int64_t top_intra_model_rd[]) {
+  const double thresh_top = 1.00;  // Scale applied to the worst kept model rd.
+  const int last = TOP_INTRA_MODEL_COUNT - 1;
+  // Insert this_model_rd into the ascending top list at its first smaller slot.
+  int pos = 0;
+  while (pos <= last && this_model_rd >= top_intra_model_rd[pos]) ++pos;
+  if (pos <= last) {
+    for (int k = last; k > pos; --k) {
+      top_intra_model_rd[k] = top_intra_model_rd[k - 1];
+    }
+    top_intra_model_rd[pos] = this_model_rd;
+  }
+  // Once the list is full, prune any mode worse than its worst entry.
+  if (top_intra_model_rd[last] != INT64_MAX &&
+      this_model_rd > thresh_top * top_intra_model_rd[last])
+    return 1;
+  if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
+  return 0;
+}
+
 // Run RD calculation with given chroma intra prediction angle., and return
 // the RD cost. Update the best mode info. if the RD cost is the best so far.
 static int64_t pick_intra_angle_routine_sbuv(
@@ -713,6 +674,7 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   RD_STATS rd_stats;
+  x->prune_tx_partition = 0;
   // In order to improve txfm search avoid rd based breakouts during winner
   // mode evaluation. Hence passing ref_best_rd as a maximum value
   av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats, bsize, INT64_MAX);
@@ -749,81 +711,6 @@
   return 0;
 }
 
-/*!\brief Search for the best angle delta for luma prediction
- *
- * \ingroup intra_mode_search
- * \callergraph
- * Given a luma directional intra prediction mode, this function will try to
- * estimate the best delta_angle.
- *
- * \return Returns the new rdcost of the best intra angle.
- */
-static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                       int *rate, RD_STATS *rd_stats,
-                                       BLOCK_SIZE bsize, int mode_cost,
-                                       int64_t best_rd, int64_t *best_model_rd,
-                                       int skip_model_rd_for_zero_deg) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  assert(!is_inter_block(mbmi));
-
-  int best_angle_delta = 0;
-  int64_t rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
-  TX_SIZE best_tx_size = mbmi->tx_size;
-  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
-  uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
-
-  for (int i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
-
-  int first_try = 1;
-  for (int angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
-    for (int i = 0; i < 2; ++i) {
-      const int64_t best_rd_in =
-          (best_rd == INT64_MAX) ? INT64_MAX
-                                 : (best_rd + (best_rd >> (first_try ? 3 : 5)));
-      const int64_t this_rd = calc_rd_given_intra_angle(
-          cpi, x, bsize, mode_cost, best_rd_in, (1 - 2 * i) * angle_delta,
-          MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size,
-          &best_rd, best_model_rd, best_tx_type_map, best_blk_skip,
-          (skip_model_rd_for_zero_deg & !angle_delta));
-      rd_cost[2 * angle_delta + i] = this_rd;
-      if (first_try && this_rd == INT64_MAX) return best_rd;
-      first_try = 0;
-      if (angle_delta == 0) {
-        rd_cost[1] = this_rd;
-        break;
-      }
-    }
-  }
-
-  assert(best_rd != INT64_MAX);
-  for (int angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
-    for (int i = 0; i < 2; ++i) {
-      int skip_search = 0;
-      const int64_t rd_thresh = best_rd + (best_rd >> 5);
-      if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
-          rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
-        skip_search = 1;
-      if (!skip_search) {
-        calc_rd_given_intra_angle(
-            cpi, x, bsize, mode_cost, best_rd, (1 - 2 * i) * angle_delta,
-            MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size,
-            &best_rd, best_model_rd, best_tx_type_map, best_blk_skip, 0);
-      }
-    }
-  }
-
-  if (rd_stats->rate != INT_MAX) {
-    mbmi->tx_size = best_tx_size;
-    mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta;
-    const int n4 = bsize_to_num_blk(bsize);
-    memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
-           sizeof(best_blk_skip[0]) * n4);
-    av1_copy_array(xd->tx_type_map, best_tx_type_map, n4);
-  }
-  return best_rd;
-}
-
 /*!\brief Search for the best filter_intra mode when coding inter frame.
  *
  * \ingroup intra_mode_search
@@ -856,6 +743,7 @@
   for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED; fi_mode < FILTER_INTRA_MODES;
        ++fi_mode) {
     mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode;
+    x->prune_tx_partition = 0;
     av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y_fi, bsize, best_rd);
     if (rd_stats_y_fi.rate == INT_MAX) continue;
     const int this_rate_tmp =
@@ -897,7 +785,9 @@
                               BLOCK_SIZE bsize, unsigned int ref_frame_cost,
                               const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats,
                               RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
-                              int64_t best_rd, int64_t *best_intra_rd) {
+                              int64_t best_rd, int64_t *best_intra_rd,
+                              int64_t *best_model_rd,
+                              int64_t top_intra_model_rd[]) {
   const AV1_COMMON *cm = &cpi->common;
   const SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -936,18 +826,14 @@
       intra_search_state->dir_mode_skip_mask_ready = 1;
     }
     if (intra_search_state->directional_mode_skip_mask[mode]) return INT64_MAX;
-    av1_init_rd_stats(rd_stats_y);
-    rd_stats_y->rate = INT_MAX;
-    int64_t model_rd = INT64_MAX;
-    int rate_dummy;
-    rd_pick_intra_angle_sby(cpi, x, &rate_dummy, rd_stats_y, bsize, mode_cost,
-                            best_rd, &model_rd, 0);
-
-  } else {
-    av1_init_rd_stats(rd_stats_y);
-    mbmi->angle_delta[PLANE_TYPE_Y] = 0;
-    av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, best_rd);
   }
+  const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
+  int64_t this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost);
+  if (prune_intra_y_mode(this_model_rd, best_model_rd, top_intra_model_rd))
+    return INT64_MAX;
+  av1_init_rd_stats(rd_stats_y);
+  x->prune_tx_partition = 0;
+  av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, best_rd);
 
   // Pick filter intra modes.
   if (mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
@@ -1052,9 +938,6 @@
         rd_stats_uv->rate +
         intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost);
   }
-  if (mode != DC_PRED && mode != PAETH_PRED) {
-    rd_stats->rate += intra_cost_penalty;
-  }
 
   // Intra block is always coded as non-skip
   rd_stats->skip_txfm = 0;
@@ -1137,11 +1020,23 @@
   x->winner_mode_count = 0;
 
   // Searches the intra-modes except for intrabc, palette, and filter_intra.
-  for (int mode_idx = INTRA_MODE_START; mode_idx < INTRA_MODE_END; ++mode_idx) {
+  // The loop below walks a flat index over all LUMA_MODE_COUNT luma modes,
+  // after resetting the top model rd list and the tx partition pruning state.
+  int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT];
+  for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) {
+    top_intra_model_rd[i] = INT64_MAX;
+  }
+  x->prune_tx_partition = 1;
+  for (int i = 0; i < TOP_TX_PART_COUNT; i++) {
+    x->top_tx_part_rd[i] = INT64_MAX;
+  }
+  for (int mode_idx = INTRA_MODE_START; mode_idx < LUMA_MODE_COUNT;
+       ++mode_idx) {
+    set_y_mode_and_delta_angle(mode_idx, mbmi);
     RD_STATS this_rd_stats;
     int this_rate, this_rate_tokenonly, s;
     int64_t this_distortion, this_rd;
-    mbmi->mode = intra_rd_search_mode_order[mode_idx];
+    // mbmi->mode was already set by set_y_mode_and_delta_angle() above.
     if ((!cpi->oxcf.intra_mode_cfg.enable_smooth_intra ||
          cpi->sf.intra_sf.disable_smooth_intra) &&
         (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
@@ -1150,29 +1045,23 @@
     if (!cpi->oxcf.intra_mode_cfg.enable_paeth_intra &&
         mbmi->mode == PAETH_PRED)
       continue;
-    mbmi->angle_delta[PLANE_TYPE_Y] = 0;
-
-    if (model_intra_yrd_and_prune(cpi, x, bsize, bmode_costs[mbmi->mode],
-                                  &best_model_rd)) {
-      continue;
-    }
 
     is_directional_mode = av1_is_directional_mode(mbmi->mode);
     if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
-    if (is_directional_mode && av1_use_angle_delta(bsize) &&
-        cpi->oxcf.intra_mode_cfg.enable_angle_delta) {
-      // Searches through the best angle_delta if this option is available.
-      this_rd_stats.rate = INT_MAX;
-      rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize,
-                              bmode_costs[mbmi->mode], best_rd, &best_model_rd,
-                              1);
-    } else {
-      // Builds the actual prediction. The prediction from
-      // model_intra_yrd_and_prune was just an estimation that did not take into
-      // account the effect of txfm pipeline, so we need to redo it for real
-      // here.
-      av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
-    }
+    if (is_directional_mode && av1_use_angle_delta(bsize) == 0 &&
+        mbmi->angle_delta[PLANE_TYPE_Y] != 0)
+      continue;
+    const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
+    int64_t this_model_rd =
+        intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode]);
+    if (prune_intra_y_mode(this_model_rd, &best_model_rd, top_intra_model_rd))
+      continue;
+
+    // Builds the actual prediction. The prediction from intra_model_yrd was
+    // just an estimation that did not take into account the effect of the
+    // txfm pipeline, so we need to redo it for real
+    // here.
+    av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
     this_rate_tokenonly = this_rd_stats.rate;
     this_distortion = this_rd_stats.dist;
     s = this_rd_stats.skip_txfm;
diff --git a/av1/encoder/intra_mode_search.h b/av1/encoder/intra_mode_search.h
index 4e52f02..6bdc8aa 100644
--- a/av1/encoder/intra_mode_search.h
+++ b/av1/encoder/intra_mode_search.h
@@ -21,6 +21,21 @@
 extern "C" {
 #endif
 
+/*!\cond */
+static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = {
+  DC_PRED,       H_PRED,        V_PRED,    SMOOTH_PRED, PAETH_PRED,
+  SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED,   D157_PRED,
+  D67_PRED,      D113_PRED,     D45_PRED,
+};  // Luma order; indexed by mode_idx in set_y_mode_and_delta_angle().
+
+static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
+  UV_DC_PRED,     UV_CFL_PRED,   UV_H_PRED,        UV_V_PRED,
+  UV_SMOOTH_PRED, UV_PAETH_PRED, UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED,
+  UV_D135_PRED,   UV_D203_PRED,  UV_D157_PRED,     UV_D67_PRED,
+  UV_D113_PRED,   UV_D45_PRED,
+};  // NOTE(review): static tables in a header are copied per includer.
+/*!\endcond */
+
 /*! \brief Variables related to intra-mode search during inter frame coding.
  *
  * \ingroup intra_mode_search
@@ -121,7 +136,9 @@
                               BLOCK_SIZE bsize, unsigned int ref_frame_cost,
                               const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats,
                               RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
-                              int64_t best_rd, int64_t *best_intra_rd);
+                              int64_t best_rd, int64_t *best_intra_rd,
+                              int64_t *best_model_rd,
+                              int64_t top_intra_model_rd[]);
 
 /*!\brief Evaluate luma palette mode for inter frames.
  *
@@ -238,6 +255,27 @@
                              int cols, int bit_depth, int *val_count,
                              int *val_count_8bit, int *num_color_bins,
                              int *num_colors);
+/*! \brief Set the luma intra mode and delta angle for a given mode index.
+ * The total number of luma intra modes is LUMA_MODE_COUNT = 61.
+ * The first 13 indices select the modes from DC_PRED to PAETH_PRED with a zero
+ * delta angle, followed by the 8 directional modes, each of which has
+ * 6 = MAX_ANGLE_DELTA * 2 non-zero delta angles.
+ * \param[in]    mode_idx           Mode index in the intra mode decision
+ *                                  process.
+ * \param[in,out] mbmi              Pointer to structure holding
+ *                                  the mode info for the current macroblock.
+ */
+void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi);
+
+/*! \brief Prune a luma intra mode based on its model rd.
+ * \param[in]     this_model_rd      Model rd for the current mode.
+ * \param[in,out] best_model_rd      Best model rd seen for this block so far.
+ * \param[in,out] top_intra_model_rd Top model rds (ascending) seen for this
+ *                                   block so far.
+ * \return 1 if the mode should be pruned, 0 otherwise.
+ */
+int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd,
+                       int64_t top_intra_model_rd[]);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 00ec8d0..ad62706 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -4758,22 +4758,7 @@
       return 1;
   }
 
-  // Speed features to prune out INTRA frames
-  if (ref_frame == INTRA_FRAME) {
-    if ((!cpi->oxcf.intra_mode_cfg.enable_smooth_intra ||
-         sf->intra_sf.disable_smooth_intra) &&
-        (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
-         mbmi->mode == SMOOTH_V_PRED))
-      return 1;
-    if (!cpi->oxcf.intra_mode_cfg.enable_paeth_intra &&
-        mbmi->mode == PAETH_PRED)
-      return 1;
-
-    // Intra modes will be handled in another loop later.
-    assert(*args->intra_mode_num < INTRA_MODES);
-    args->intra_mode_idx_ls[(*args->intra_mode_num)++] = mode_enum;
-    return 1;
-  }
+  if (ref_frame == INTRA_FRAME) return 1;
 
   if (sf->inter_sf.prune_compound_using_single_ref && comp_pred) {
     // After we done with single reference modes, find the 2nd best RD
@@ -5106,6 +5091,9 @@
   int intra_mode_num = 0;
   int num_single_modes_processed = 0;
   int intra_mode_idx_ls[INTRA_MODES];
+  for (i = 0; i < INTRA_MODES; ++i) {
+    intra_mode_idx_ls[i] = i + THR_DC;
+  }
 
   // Temporary buffers used by handle_inter_mode().
   uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_pred_bufs[0]);
@@ -5398,13 +5386,37 @@
   }
 
   const unsigned int intra_ref_frame_cost = ref_costs_single[INTRA_FRAME];
-  for (int j = 0; j < intra_mode_num; ++j) {
+  int64_t best_model_rd = INT64_MAX;
+  int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT];
+  for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) {
+    top_intra_model_rd[i] = INT64_MAX;
+  }
+  for (int mode_idx = INTRA_MODE_START; mode_idx < LUMA_MODE_COUNT;
+       ++mode_idx) {
+    // mode_idx is a flat luma mode index; see set_y_mode_and_delta_angle().
     if (sf->intra_sf.skip_intra_in_interframe &&
         search_state.intra_search_state.skip_intra_modes)
       break;
-    const THR_MODES mode_enum = intra_mode_idx_ls[j];
-    const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
-    const PREDICTION_MODE this_mode = mode_def->mode;
+    set_y_mode_and_delta_angle(mode_idx, mbmi);
+    THR_MODES mode_enum = 0;
+    for (int i = 0; i < INTRA_MODE_END; ++i) {
+      if (mbmi->mode == av1_mode_defs[intra_mode_idx_ls[i]].mode) {
+        mode_enum = intra_mode_idx_ls[i];
+        break;
+      }
+    }
+    if ((!cpi->oxcf.intra_mode_cfg.enable_smooth_intra ||
+         cpi->sf.intra_sf.disable_smooth_intra) &&
+        (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
+         mbmi->mode == SMOOTH_V_PRED))
+      continue;
+    if (!cpi->oxcf.intra_mode_cfg.enable_paeth_intra &&
+        mbmi->mode == PAETH_PRED)
+      continue;
+    if (av1_is_directional_mode(mbmi->mode) &&
+        av1_use_angle_delta(bsize) == 0 && mbmi->angle_delta[PLANE_TYPE_Y] != 0)
+      continue;
+    const PREDICTION_MODE this_mode = mbmi->mode;
 
     assert(av1_mode_defs[mode_enum].ref_frame[0] == INTRA_FRAME);
     assert(av1_mode_defs[mode_enum].ref_frame[1] == NONE_FRAME);
@@ -5431,7 +5443,8 @@
     intra_rd_stats.rdcost = av1_handle_intra_mode(
         &search_state.intra_search_state, cpi, x, bsize, intra_ref_frame_cost,
         ctx, &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv,
-        search_state.best_rd, &search_state.best_intra_rd);
+        search_state.best_rd, &search_state.best_intra_rd, &best_model_rd,
+        top_intra_model_rd);
 
     // Collect mode stats for multiwinner mode processing
     const int txfm_search_done = 1;
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index 4de0ce6..8f3259f 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -3061,6 +3061,7 @@
   x->rd_model = FULL_TXFM_RD;
   int64_t rd[MAX_TX_DEPTH + 1] = { INT64_MAX, INT64_MAX, INT64_MAX };
   TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+  int iter = 0;
   for (int tx_size = start_tx, depth = init_depth; depth <= MAX_TX_DEPTH;
        depth++, tx_size = sub_tx_size_map[tx_size]) {
     if (!cpi->oxcf.txfm_cfg.enable_tx64 &&
@@ -3085,6 +3086,21 @@
         x->source_variance < 256) {
       if (rd[depth - 1] != INT64_MAX && rd[depth] > rd[depth - 1]) break;
     }
+    if (x->prune_tx_partition && iter == 0) {
+      for (int i = 0; i < TOP_TX_PART_COUNT; i++) {
+        if (rd[depth] < x->top_tx_part_rd[i]) {
+          for (int j = TOP_TX_PART_COUNT - 1; j > i; j--) {
+            x->top_tx_part_rd[j] = x->top_tx_part_rd[j - 1];
+          }
+          x->top_tx_part_rd[i] = rd[depth];
+          break;
+        }
+      }
+      if (x->top_tx_part_rd[TOP_TX_PART_COUNT - 1] != INT64_MAX &&
+          rd[depth] > x->top_tx_part_rd[TOP_TX_PART_COUNT - 1])
+        break;
+    }
+    ++iter;
   }
 
   if (rd_stats->rate != INT_MAX) {