Add a speed feature to prune tx types based on estimated RD cost.

The speed feature is turned on for speeds 4 and 5.
At speed 4, the encoder is ~2% slower, with a BD-rate gain of ~0.2% in
overall PSNR.

BD-rate on cpu-used=4 (%):

         avgPSNR  overPSNR  ssim    VMAF
lowres   -0.202   -0.212    -0.190  -0.535
midres   -0.162   -0.181    -0.161  -0.437
ugc360p  -0.249   -0.225    -0.136  -0.329

Also fix a few implicit integer conversions (explicit uint16_t casts).

STATS_CHANGED

Change-Id: I2964dfb3cabe6f0090418f18cc9a46dd4dd578bf
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index b683e9e..9ef22d8 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -2353,6 +2353,10 @@
 }
 #endif  // CONFIG_COLLECT_RD_STATS >= 2
 #endif  // CONFIG_COLLECT_RD_STATS
+
+// Pruning factors for prune_txk_type and prune_txk_type_separ, by prune_mode.
+static const int prune_factors[5] = { 200, 200, 120, 80, 40 };  // scale 1000
+static const int mul_factors[5] = { 80, 80, 70, 50, 30 };       // scale 100
 // R-D costs are sorted in ascending order.
 static INLINE void sort_rd(int64_t rds[], int txk[], int len) {
   int i, j, k;
@@ -2422,10 +2426,10 @@
     num_cand++;
   }
 
-  if (num_cand == 0) return 0xFFFF;
+  if (num_cand == 0) return (uint16_t)0xFFFF;
 
   sort_rd(rds, txk_map, num_cand);
-  uint16_t prune = ~(1 << txk_map[0]);
+  uint16_t prune = (uint16_t)(~(1 << txk_map[0]));
 
   // 0 < prune_factor <= 1000 controls aggressiveness
   int64_t factor = 0;
@@ -2439,13 +2443,13 @@
   return prune;
 }
 
-int16_t prune_txk_type_separ(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
-                             int block, TX_SIZE tx_size, int blk_row,
-                             int blk_col, BLOCK_SIZE plane_bsize, int *txk_map,
-                             int16_t allowed_tx_mask, int prune_factor,
-                             const TXB_CTX *const txb_ctx,
-                             int reduced_tx_set_used, int64_t ref_best_rd,
-                             int num_sel) {
+uint16_t prune_txk_type_separ(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+                              int block, TX_SIZE tx_size, int blk_row,
+                              int blk_col, BLOCK_SIZE plane_bsize, int *txk_map,
+                              int16_t allowed_tx_mask, int prune_factor,
+                              const TXB_CTX *const txb_ctx,
+                              int reduced_tx_set_used, int64_t ref_best_rd,
+                              int num_sel) {
   const AV1_COMMON *cm = &cpi->common;
 
   int idx;
@@ -2506,7 +2510,7 @@
     if (rds_h[idx] > rds_h[0] * 1.2) skip_h[idx_h[idx]] = 1;
   }
 
-  if (skip_h[idx_h[0]]) return 0xFFFF;
+  if (skip_h[idx_h[0]]) return (uint16_t)0xFFFF;
 
   // evaluate vertical with the best horizontal chosen
   rds_v[0] = rds_h[0];
@@ -2557,7 +2561,7 @@
   }
   sort_rd(rds, txk_map, num_cand);
 
-  uint16_t prune = ~(1 << txk_map[0]);
+  uint16_t prune = (uint16_t)(~(1 << txk_map[0]));
   num_sel = AOMMIN(num_sel, num_cand);
 
   for (int i = 1; i < num_sel; i++) {
@@ -2744,18 +2748,37 @@
       }
       allowed_tx_mask &= (~prune);
     }
-
     for (i = 0; i < TX_TYPES; i++) {
       if (allowed_tx_mask & (1 << i)) num_allowed++;
     }
     assert(num_allowed > 0);
 
-    int allowed_tx_count = (x->prune_mode == PRUNE_2D_AGGRESSIVE) ? 1 : 5;
-    // !fast_tx_search && txk_end != txk_start && plane == 0
-    if (x->prune_mode >= PRUNE_2D_ACCURATE && is_inter &&
-        num_allowed > allowed_tx_count) {
-      prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type,
-                  x->prune_mode, txk_map, &allowed_tx_mask);
+    if (num_allowed > 2 && cpi->sf.tx_sf.tx_type_search.prune_tx_type_est_rd) {
+      int pf = prune_factors[x->prune_mode];
+      int mf = mul_factors[x->prune_mode];
+      if (num_allowed <= 7) {
+        const uint16_t prune = prune_txk_type(
+            cpi, x, plane, block, tx_size, blk_row, blk_col, plane_bsize,
+            txk_map, allowed_tx_mask, pf, txb_ctx, cm->reduced_tx_set_used);
+        allowed_tx_mask &= (~prune);
+      } else {
+        const int num_sel = (num_allowed * mf + 50) / 100;
+        const uint16_t prune = prune_txk_type_separ(
+            cpi, x, plane, block, tx_size, blk_row, blk_col, plane_bsize,
+            txk_map, allowed_tx_mask, pf, txb_ctx, cm->reduced_tx_set_used,
+            ref_best_rd, num_sel);
+
+        allowed_tx_mask &= (~prune);
+      }
+    } else {
+      assert(num_allowed > 0);
+      int allowed_tx_count = (x->prune_mode == PRUNE_2D_AGGRESSIVE) ? 1 : 5;
+      // !fast_tx_search && txk_end != txk_start && plane == 0
+      if (x->prune_mode >= PRUNE_2D_ACCURATE && is_inter &&
+          num_allowed > allowed_tx_count) {
+        prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type,
+                    x->prune_mode, txk_map, &allowed_tx_mask);
+      }
     }
   }
 
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 57f56ac..c99c727 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -494,6 +494,7 @@
     sf->tx_sf.tx_type_search.enable_winner_mode_tx_type_pruning = 1;
     sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
     sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_MORE;
+    sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1;
     // TODO(any): Experiment with enabling of this speed feature as hash state
     // is reset during winner mode processing
     sf->tx_sf.use_intra_txb_hash = 0;
@@ -987,6 +988,7 @@
   tx_sf->tx_type_search.fast_inter_tx_type_search = 0;
   tx_sf->tx_type_search.skip_tx_search = 0;
   tx_sf->tx_type_search.prune_tx_type_using_stats = 0;
+  tx_sf->tx_type_search.prune_tx_type_est_rd = 0;
   tx_sf->tx_type_search.enable_winner_mode_tx_type_pruning = 0;
   tx_sf->txb_split_cap = 1;
   tx_sf->adaptive_txb_search_level = 0;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index bb6784f..e2c7d5e 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -227,6 +227,8 @@
 
   // Prune tx type search using previous frame stats.
   int prune_tx_type_using_stats;
+  // Prune tx type search using estimated RD cost.
+  int prune_tx_type_est_rd;
 
   // Flag used to control the winner mode processing for tx type pruning for
   // inter blocks. It enables further tx type mode pruning based on ML model for