Use derived transform kernel in all intra speed 6 mode

Search only the transform kernels that are derived from the residual
statistics model in all intra speed 6 mode. The compression performance
change in speed 6 is less than 0.01%. Encoding speed is up by 5%.

BUG=aomedia:2959

STATS_CHANGED

Change-Id: I58c0c0e5625094d0be0c2451cf2c37d7d1516e06
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 0eb212c..331d33f 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -1020,6 +1020,28 @@
   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
 };
 
+// Each bit position in a mask corresponds to a transform type as defined
+// by the TX_TYPE enumeration in enums.h. Clearing a bit to 0 disables the
+// use of the corresponding transform type for that table entry.
+// The av1_derived_intra_tx_used_flag table is used when
+// use_reduced_intra_txset is set to 2, where one only searches
+// the transform types derived from residual statistics.
+static const uint16_t av1_derived_intra_tx_used_flag[INTRA_MODES] = {
+  0x0209,  // DC_PRED:       0000 0010 0000 1001
+  0x0403,  // V_PRED:        0000 0100 0000 0011
+  0x0805,  // H_PRED:        0000 1000 0000 0101
+  0x020F,  // D45_PRED:      0000 0010 0000 1111
+  0x0009,  // D135_PRED:     0000 0000 0000 1001
+  0x0009,  // D113_PRED:     0000 0000 0000 1001
+  0x0009,  // D157_PRED:     0000 0000 0000 1001
+  0x0805,  // D203_PRED:     0000 1000 0000 0101
+  0x0403,  // D67_PRED:      0000 0100 0000 0011
+  0x0205,  // SMOOTH_PRED:   0000 0010 0000 0101
+  0x0403,  // SMOOTH_V_PRED: 0000 0100 0000 0011
+  0x0805,  // SMOOTH_H_PRED: 0000 1000 0000 0101
+  0x0209,  // PAETH_PRED:    0000 0010 0000 1001
+};
+
 static const uint16_t av1_reduced_intra_tx_used_flag[INTRA_MODES] = {
   0x080F,  // DC_PRED:       0000 1000 0000 1111
   0x040F,  // V_PRED:        0000 0100 0000 1111
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 083b4de..d04f1ec 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -672,6 +672,7 @@
     // Use largest txfm block size.
     sf->tx_sf.intra_tx_size_search_init_depth_sqr = 2;
     sf->tx_sf.intra_tx_size_search_init_depth_rect = 2;
+    sf->tx_sf.tx_type_search.use_reduced_intra_txset = 2;
 
     sf->rd_sf.perform_coeff_opt = 6;
     sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 1cf3052..275cf02 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -226,7 +226,8 @@
   int fast_intra_tx_type_search;
   int fast_inter_tx_type_search;
 
-  // prune two least frequently chosen transforms for each intra mode
+  // Prune less likely transform types for each intra mode. Values range
+  // from 0 (off) to 2 (most aggressive), trading compression for speed.
   int use_reduced_intra_txset;
 
   // Use a skip flag prediction model to detect blocks with skip = 1 early
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index e7b4d5b..30aac0a 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -1995,10 +1995,14 @@
           ? fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode]
           : mbmi->mode;
   uint16_t ext_tx_used_flag =
-      cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset &&
+      cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset != 0 &&
               tx_set_type == EXT_TX_SET_DTT4_IDTX_1DDCT
           ? av1_reduced_intra_tx_used_flag[intra_dir]
           : av1_ext_tx_used_flag[tx_set_type];
+
+  if (cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset == 2)
+    ext_tx_used_flag &= av1_derived_intra_tx_used_flag[intra_dir];
+
   if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 ||
       ext_tx_used_flag == 0x0001 ||
       (is_inter && cpi->oxcf.txfm_cfg.use_inter_dct_only) ||