Add a speed feature for intra tx type search

Add a speed feature to separate prediction mode and tx type search
for intra modes: search for the best intra prediction mode with a fixed
default tx type first, then choose the best tx type for the
selected mode.

Coding performance drop:
baseline
  lowres 0.10% midres 0.08% hdres 0.14%
with ext-tx
  lowres 0.14% midres 0.25% hdres 0.20%

Speed improvement is 20% for baseline and 17% for ext-tx.

It is turned on for speed >= 1.

Change-Id: Ia5e8d39e8a4e2e42c521bfde938f8b6a98ab24f9
diff --git a/vp10/encoder/block.h b/vp10/encoder/block.h
index 1ddc3d8..9412526 100644
--- a/vp10/encoder/block.h
+++ b/vp10/encoder/block.h
@@ -180,6 +180,9 @@
   // Strong color activity detection. Used in RTC coding mode to enhance
   // the visual quality at the boundary of moving color objects.
   uint8_t color_sensitivity[2];
+
+  // When set, intra modes use only the default tx type (no tx type search).
+  int use_default_intra_tx_type;
 };
 
 #ifdef __cplusplus
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index da3efe6..58a7042 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -780,7 +780,7 @@
   const int tx_set_1D[TX_TYPES_1D] = {0};
 #endif
 
-  switch (cpi->sf.tx_type_search) {
+  switch (cpi->sf.tx_type_search.prune_mode) {
     case NO_PRUNE:
       return 0;
       break;
@@ -1576,6 +1576,9 @@
   for (n = start_tx; n >= end_tx; --n) {
     if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, n))
       continue;
+    if (!is_inter && x->use_default_intra_tx_type &&
+        tx_type != get_default_tx_type(0, xd, 0, n))
+      continue;
     if (max_tx_size == TX_32X32 && n == TX_4X4)
       continue;
 #if CONFIG_EXT_TX
@@ -1583,7 +1586,7 @@
     if (is_inter) {
       if (!ext_tx_used_inter[ext_tx_set][tx_type])
         continue;
-      if (cpi->sf.tx_type_search > 0) {
+      if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
         if (!do_tx_type_search(tx_type, prune))
           continue;
       }
@@ -1598,7 +1601,7 @@
 #else  // CONFIG_EXT_TX
     if (n >= TX_32X32 && tx_type != DCT_DCT)
       continue;
-    if (is_inter && cpi->sf.tx_type_search > 0 &&
+    if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
         !do_tx_type_search(tx_type, prune))
         continue;
 #endif  // CONFIG_EXT_TX
@@ -1674,7 +1677,7 @@
   ext_tx_set = get_ext_tx_set(mbmi->tx_size, bs, is_inter);
 #endif  // CONFIG_EXT_TX
 
-  if (is_inter && cpi->sf.tx_type_search > 0)
+  if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
 #if CONFIG_EXT_TX
     prune = prune_tx_types(cpi, bs, x, xd, ext_tx_set);
 #else
@@ -1687,11 +1690,14 @@
       if (is_inter) {
         if (!ext_tx_used_inter[ext_tx_set][tx_type])
           continue;
-        if (cpi->sf.tx_type_search > 0) {
+        if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
           if (!do_tx_type_search(tx_type, prune))
             continue;
         }
       } else {
+        if (x->use_default_intra_tx_type &&
+            tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
+          continue;
         if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
           if (tx_type != intra_mode_to_tx_type_context[mbmi->mode])
             continue;
@@ -1740,6 +1746,9 @@
   if (mbmi->tx_size < TX_32X32 &&
       !xd->lossless[mbmi->segment_id]) {
     for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+      if (!is_inter && x->use_default_intra_tx_type &&
+          tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
+        continue;
       mbmi->tx_type = tx_type;
       txfm_rd_in_plane(x,
                        cpi,
@@ -1750,7 +1759,8 @@
         continue;
       if (is_inter) {
         r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
-        if (cpi->sf.tx_type_search > 0 && !do_tx_type_search(tx_type, prune))
+        if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
+            !do_tx_type_search(tx_type, prune))
             continue;
       } else {
         r += cpi->intra_tx_type_costs[mbmi->tx_size]
@@ -1817,7 +1827,7 @@
   TX_TYPE tx_type, best_tx_type = DCT_DCT;
   int prune = 0;
 
-  if (is_inter && cpi->sf.tx_type_search > 0)
+  if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
     // passing -1 in for tx_type indicates that all 1D
     // transforms should be considered for pruning
     prune = prune_tx_types(cpi, bs, x, xd, -1);
@@ -2835,7 +2845,7 @@
                                       int64_t *distortion, int *skippable,
                                       BLOCK_SIZE bsize,
                                       int64_t best_rd) {
-  PREDICTION_MODE mode;
+  uint8_t mode_idx;
   PREDICTION_MODE mode_selected = DC_PRED;
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mic = xd->mi[0];
@@ -2864,6 +2874,7 @@
   const MODE_INFO *left_mi = xd->left_mi;
   const PREDICTION_MODE A = vp10_above_block_mode(mic, above_mi, 0);
   const PREDICTION_MODE L = vp10_left_block_mode(mic, left_mi, 0);
+  const PREDICTION_MODE FINAL_MODE_SEARCH = TM_PRED + 1;
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
   bmode_costs = cpi->y_mode_costs[A][L];
 
@@ -2889,15 +2900,28 @@
   if (left_mi)
     palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
 
+  if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
+    x->use_default_intra_tx_type = 1;
+  else
+    x->use_default_intra_tx_type = 0;
+
   /* Y Search for intra prediction mode */
-  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
-    mic->mbmi.mode = mode;
+  for (mode_idx = DC_PRED; mode_idx <= FINAL_MODE_SEARCH; ++mode_idx) {
+    if (mode_idx == FINAL_MODE_SEARCH) {
+      if (x->use_default_intra_tx_type == 0)
+        break;
+      mic->mbmi.mode = mode_selected;
+      x->use_default_intra_tx_type = 0;
+    } else {
+      mic->mbmi.mode = mode_idx;
+    }
 #if CONFIG_EXT_INTRA
-    is_directional_mode = (mode != DC_PRED && mode != TM_PRED);
-    if (is_directional_mode && directional_mode_skip_mask[mode])
+    is_directional_mode =
+        (mic->mbmi.mode != DC_PRED && mic->mbmi.mode != TM_PRED);
+    if (is_directional_mode && directional_mode_skip_mask[mic->mbmi.mode])
       continue;
     if (is_directional_mode) {
-      rate_overhead = bmode_costs[mode] +
+      rate_overhead = bmode_costs[mic->mbmi.mode] +
           write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1, 0);
       this_rate_tokenonly = INT_MAX;
       this_rd =
@@ -2917,7 +2941,7 @@
     if (this_rate_tokenonly == INT_MAX)
       continue;
 
-    this_rate = this_rate_tokenonly + bmode_costs[mode];
+    this_rate = this_rate_tokenonly + bmode_costs[mic->mbmi.mode];
 
     if (!xd->lossless[xd->mi[0]->mbmi.segment_id]) {
       // super_block_yrd above includes the cost of the tx_size in the
@@ -2928,12 +2952,12 @@
           cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)]
                                                  [mic->mbmi.tx_size];
     }
-    if (cpi->common.allow_screen_content_tools && mode == DC_PRED)
+    if (cpi->common.allow_screen_content_tools && mic->mbmi.mode == DC_PRED)
       this_rate +=
           vp10_cost_bit(vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
                                                          [palette_ctx], 0);
 #if CONFIG_EXT_INTRA
-    if (mode == DC_PRED && ALLOW_FILTER_INTRA_MODES)
+    if (mic->mbmi.mode == DC_PRED && ALLOW_FILTER_INTRA_MODES)
       this_rate += vp10_cost_bit(cpi->common.fc->ext_intra_probs[0], 0);
     if (is_directional_mode) {
       int p_angle;
@@ -2950,7 +2974,7 @@
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
     if (this_rd < best_rd) {
-      mode_selected   = mode;
+      mode_selected   = mic->mbmi.mode;
       best_rd         = this_rd;
       best_tx         = mic->mbmi.tx_size;
 #if CONFIG_EXT_INTRA
@@ -3444,7 +3468,7 @@
   int ext_tx_set = get_ext_tx_set(max_tx_size, bsize, is_inter);
 #endif  // CONFIG_EXT_TX
 
-  if (is_inter && cpi->sf.tx_type_search > 0)
+  if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
 #if CONFIG_EXT_TX
     prune = prune_tx_types(cpi, bsize, x, xd, ext_tx_set);
 #else
@@ -3465,7 +3489,7 @@
     if (is_inter) {
       if (!ext_tx_used_inter[ext_tx_set][tx_type])
         continue;
-      if (cpi->sf.tx_type_search > 0) {
+      if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
         if (!do_tx_type_search(tx_type, prune))
           continue;
       }
@@ -3480,7 +3504,7 @@
 #else  // CONFIG_EXT_TX
     if (max_tx_size >= TX_32X32 && tx_type != DCT_DCT)
       continue;
-    if (is_inter && cpi->sf.tx_type_search > 0 &&
+    if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
         !do_tx_type_search(tx_type, prune))
       continue;
 #endif  // CONFIG_EXT_TX
@@ -8308,6 +8332,7 @@
   MB_MODE_INFO best_mbmode;
   int best_mode_skippable = 0;
   int midx, best_mode_index = -1;
+  const int FINAL_MODE_SEARCH = MAX_MODES;
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
   vpx_prob comp_mode_p;
   int64_t best_intra_rd = INT64_MAX;
@@ -8597,8 +8622,14 @@
     midx = end_pos;
   }
 
-  for (midx = 0; midx < MAX_MODES; ++midx) {
-    int mode_index = mode_map[midx];
+
+  if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
+    x->use_default_intra_tx_type = 1;
+  else
+    x->use_default_intra_tx_type = 0;
+
+  for (midx = 0; midx <= FINAL_MODE_SEARCH; ++midx) {
+    int mode_index;
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
     int disable_skip = 0;
@@ -8617,6 +8648,17 @@
     uint8_t ref_frame_type;
 #endif
 
+    if (midx == FINAL_MODE_SEARCH) {
+      if (!is_inter_mode(best_mbmode.mode) && best_mode_index >= 0 &&
+          x->use_default_intra_tx_type == 1) {
+        mode_index = best_mode_index;
+        x->use_default_intra_tx_type = 0;
+      } else {
+        break;
+      }
+    } else {
+      mode_index = mode_map[midx];
+    }
     this_mode = vp10_mode_order[mode_index].mode;
     ref_frame = vp10_mode_order[mode_index].ref_frame[0];
     second_ref_frame = vp10_mode_order[mode_index].ref_frame[1];
diff --git a/vp10/encoder/speed_features.c b/vp10/encoder/speed_features.c
index b766cae..f395fac 100644
--- a/vp10/encoder/speed_features.c
+++ b/vp10/encoder/speed_features.c
@@ -159,7 +159,8 @@
 
     sf->tx_size_search_breakout = 1;
     sf->partition_search_breakout_rate_thr = 80;
-    sf->tx_type_search = PRUNE_ONE;
+    sf->tx_type_search.prune_mode = PRUNE_ONE;
+    sf->tx_type_search.fast_intra_tx_type_search = 1;
     // Use transform domain distortion.
     // Note var-tx expt always uses pixel domain distortion.
     sf->use_transform_domain_distortion = 1;
@@ -183,7 +184,7 @@
     sf->allow_partition_search_skip = 1;
     sf->use_upsampled_references = 0;
 #if CONFIG_EXT_TX
-    sf->tx_type_search = PRUNE_TWO;
+    sf->tx_type_search.prune_mode = PRUNE_TWO;
 #endif
   }
 
@@ -496,7 +497,8 @@
   sf->alt_ref_search_fp = 0;
   sf->use_quant_fp = 0;
   sf->partition_search_type = SEARCH_PARTITION;
-  sf->tx_type_search = NO_PRUNE;
+  sf->tx_type_search.prune_mode = NO_PRUNE;
+  sf->tx_type_search.fast_intra_tx_type_search = 0;
   sf->less_rectangular_check = 0;
   sf->use_square_partition_only = 0;
   sf->auto_min_max_partition_size = NOT_IN_USE;
diff --git a/vp10/encoder/speed_features.h b/vp10/encoder/speed_features.h
index ca6adbe..db57434 100644
--- a/vp10/encoder/speed_features.h
+++ b/vp10/encoder/speed_features.h
@@ -190,6 +190,11 @@
   // eliminates two tx types in each direction
   PRUNE_TWO = 2,
 #endif
+} TX_TYPE_PRUNE_MODE;
+
+typedef struct {
+  TX_TYPE_PRUNE_MODE prune_mode;
+  int fast_intra_tx_type_search;
 } TX_TYPE_SEARCH;
 
 typedef enum {
@@ -310,6 +315,7 @@
   PARTITION_SEARCH_TYPE partition_search_type;
 
   TX_TYPE_SEARCH tx_type_search;
+
   // Used if partition_search_type = FIXED_SIZE_PARTITION
   BLOCK_SIZE always_this_block_size;