Enable adaptive txb search

Search the transform block size and kernel type adaptively,
conditioned on the relative rate-distortion cost target. When the
rate-distortion target is lower, the need for extensive transform
block partition and type search is relatively lower, which allows
the finer searches to be skipped.

Enable this speed feature at speed 1. Local tests on bus_cif and
city_cif, both at 1000 kbps and speed 1, showed that the encoder runs
40% - 50% faster.

bus_cif 30 frames: 1494s -> 1053s
city_cif 50 frames: 3210s -> 2328s

The compression performance changes are 0.048% for lowres and
0.091% for midres.

Change-Id: I9ac4b23d33488de0e4c3a089267aa08dab4ade5f
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 42c6fcf..1415ff5 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1869,7 +1869,7 @@
                                BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                                const ENTROPY_CONTEXT *a,
                                const ENTROPY_CONTEXT *l, int fast_tx_search,
-                               int use_fast_coef_costing,
+                               int use_fast_coef_costing, int64_t ref_best_rd,
                                RD_STATS *best_rd_stats) {
   const AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
@@ -1968,6 +1968,9 @@
       best_eob = x->plane[plane].eobs[block];
     }
 
+    if (cpi->sf.adaptive_txb_search)
+      if ((best_rd - (best_rd >> 2)) > ref_best_rd) break;
+
     // Skip transform type search when we found the block has been quantized to
     // all zero and at the same time, it has better rdcost than doing transform.
     if (cpi->sf.tx_type_search.skip_tx_search && !best_eob) break;
@@ -2042,7 +2045,8 @@
   }
 
   search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  a, l, 0, args->use_fast_coef_costing, &this_rd_stats);
+                  a, l, 0, args->use_fast_coef_costing, INT64_MAX,
+                  &this_rd_stats);
 
 #if CONFIG_CFL
   if (plane == AOM_PLANE_Y && xd->cfl.store_y && is_cfl_allowed(mbmi)) {
@@ -3467,7 +3471,8 @@
                        int blk_row, int blk_col, int plane, int block,
                        int plane_bsize, const ENTROPY_CONTEXT *a,
                        const ENTROPY_CONTEXT *l, RD_STATS *rd_stats,
-                       int fast_tx_search, TX_SIZE_RD_INFO *rd_info_array) {
+                       int fast_tx_search, int64_t ref_rdcost,
+                       TX_SIZE_RD_INFO *rd_info_array) {
   const struct macroblock_plane *const p = &x->plane[plane];
   TXB_CTX txb_ctx;
   get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
@@ -3494,7 +3499,7 @@
 
   RD_STATS this_rd_stats;
   search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  a, l, fast_tx_search, 0, &this_rd_stats);
+                  a, l, fast_tx_search, 0, ref_rdcost, &this_rd_stats);
 
   av1_merge_rd_stats(rd_stats, &this_rd_stats);
 
@@ -3566,7 +3571,7 @@
     mbmi->inter_tx_size[index] = tx_size;
     av1_tx_block_rd_b(
         cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize, pta, ptl,
-        rd_stats, fast_tx_search,
+        rd_stats, fast_tx_search, ref_best_rd,
         rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
     if (rd_stats->rate == INT_MAX) return;
 
@@ -3599,6 +3604,9 @@
     best_tx_type = mbmi->txk_type[txk_type_idx];
   }
 
+  if (cpi->sf.adaptive_txb_search)
+    if (this_rd > ref_best_rd) return;
+
   int tx_split_prune_flag = 0;
   if (cpi->sf.tx_type_search.prune_mode >= PRUNE_2D_ACCURATE)
     tx_split_prune_flag = ((x->tx_search_prune[0] >> TX_TYPES) & 1);
@@ -3938,7 +3946,7 @@
     rd_stats->zero_rate = zero_blk_rate;
     rd_stats->ref_rdcost = ref_best_rd;
     av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
-                      plane_bsize, ta, tl, rd_stats, fast, NULL);
+                      plane_bsize, ta, tl, rd_stats, fast, ref_best_rd, NULL);
     const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
     if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
             RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
@@ -4490,7 +4498,7 @@
     ENTROPY_CONTEXT *ta = above_ctx + blk_col;
     ENTROPY_CONTEXT *tl = left_ctx + blk_row;
     av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
-                      plane_bsize, ta, tl, rd_stats, fast, NULL);
+                      plane_bsize, ta, tl, rd_stats, fast, INT64_MAX, NULL);
     av1_set_txb_context(x, plane, block, tx_size, ta, tl);
   } else {
     const TX_SIZE sub_txs = sub_tx_size_map[1][tx_size];
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 7c3adb4..59eadeb 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -153,6 +153,7 @@
     sf->use_hash_based_trellis = 1;
 #endif  // CONFIG_HASH_ME
     sf->tx_type_search.skip_tx_search = 1;
+    sf->adaptive_txb_search = 1;
   }
 
   if (speed >= 2) {
@@ -467,6 +468,7 @@
   sf->drop_ref = 0;
   sf->skip_intra_in_interframe = 1;
   sf->txb_split_cap = 1;
+  sf->adaptive_txb_search = 0;
   sf->two_pass_partition_search = 0;
 
   for (i = 0; i < TX_SIZES; i++) {
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index c0d0cb7..b779cc2 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -353,6 +353,10 @@
   // is selected as all zero coefficients.
   int txb_split_cap;
 
+  // Shortcut the transform block partition and type search when the target
+  // rdcost is relatively lower.
+  int adaptive_txb_search;
+
   // Used if partition_search_type = FIXED_SIZE_PARTITION
   BLOCK_SIZE always_this_block_size;