Speed up av1_optimize_b

1. Move the call to get_txb_ctx out of av1_optimize_b
 and into its callers, outside of the RDO loops.
2. Remove the call to av1_get_tx_type; tx_type is now
 passed in by the caller.
3. The encoder is about 1.3% faster when encoding
20 frames of BasketballDrill_832x480_50.y4m, with no
coding loss (601278 ms -> 592634 ms).

a) gcc (Ubuntu 5.4.0-6ubuntu1~16.04.9) 5.4.0 20160609
b) CPU: Intel(R) Core(TM) i5-4590 CPU @ 3.30GHz
c) Config cmd
cmake ../ -DENABLE_CCACHE=1 -DCONFIG_LOWBITDEPTH=1
d) Test cmd:
./aomenc --cpu-used=1 --end-usage=vbr \
--target-bitrate=800 --limit=20

Change-Id: I755b337e29316f4ceed37c9b669aebb4ad2d5fac
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 69c8eda..cea8db6 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -97,25 +97,23 @@
 }
 
 int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
-                   int blk_row, int blk_col, int block, BLOCK_SIZE plane_bsize,
-                   TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
-                   const ENTROPY_CONTEXT *l, int fast_mode, int *rate_cost) {
+                   int block, TX_SIZE tx_size, TX_TYPE tx_type,
+                   const TXB_CTX *const txb_ctx, int fast_mode,
+                   int *rate_cost) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   struct macroblock_plane *const p = &mb->plane[plane];
   const int eob = p->eobs[block];
-  TXB_CTX txb_ctx;
-  get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
   const int segment_id = xd->mi[0]->segment_id;
 
   if (eob == 0 || !cpi->optimize_seg_arr[segment_id] ||
       xd->lossless[segment_id]) {
-    *rate_cost = av1_cost_skip_txb(mb, &txb_ctx, plane, tx_size);
+    *rate_cost = av1_cost_skip_txb(mb, txb_ctx, plane, tx_size);
     return eob;
   }
 
   (void)fast_mode;
-  return av1_optimize_txb_new(cpi, mb, plane, blk_row, blk_col, block, tx_size,
-                              &txb_ctx, rate_cost, cpi->oxcf.sharpness);
+  return av1_optimize_txb_new(cpi, mb, plane, block, tx_size, tx_type, txb_ctx,
+                              rate_cost, cpi->oxcf.sharpness);
 }
 
 typedef enum QUANT_FUNC {
@@ -234,8 +232,10 @@
     if (args->enable_optimize_b) {
       av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
                       tx_size, tx_type, AV1_XFORM_QUANT_FP);
-      av1_optimize_b(args->cpi, x, plane, blk_row, blk_col, block, plane_bsize,
-                     tx_size, a, l, 1, &dummy_rate_cost);
+      TXB_CTX txb_ctx;
+      get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+      av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1,
+                     &dummy_rate_cost);
     } else {
       av1_xform_quant(
           cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
@@ -526,8 +526,10 @@
     if (args->enable_optimize_b) {
       av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
                       tx_size, tx_type, AV1_XFORM_QUANT_FP);
-      av1_optimize_b(args->cpi, x, plane, blk_row, blk_col, block, plane_bsize,
-                     tx_size, a, l, 1, &dummy_rate_cost);
+      TXB_CTX txb_ctx;
+      get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+      av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1,
+                     &dummy_rate_cost);
     } else {
       av1_xform_quant(
           cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h
index 1be2ce0..673f87e 100644
--- a/av1/encoder/encodemb.h
+++ b/av1/encoder/encodemb.h
@@ -15,6 +15,7 @@
 #include "config/aom_config.h"
 
 #include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
 #include "av1/encoder/block.h"
 #include "av1/encoder/tokenize.h"
 #ifdef __cplusplus
@@ -53,9 +54,8 @@
                      AV1_XFORM_QUANT xform_quant_idx);
 
 int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
-                   int blk_row, int blk_col, int block, BLOCK_SIZE plane_bsize,
-                   TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
-                   const ENTROPY_CONTEXT *l, int fast_mode, int *rate_cost);
+                   int block, TX_SIZE tx_size, TX_TYPE tx_type,
+                   const TXB_CTX *const txb_ctx, int fast_mode, int *rate_cost);
 
 void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
                       int blk_col, int blk_row, TX_SIZE tx_size);
diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c
index 1aadeb1..4d4802b 100644
--- a/av1/encoder/encodetxb.c
+++ b/av1/encoder/encodetxb.c
@@ -280,7 +280,7 @@
                           const int is_eob, const TxbInfo *const txb_info,
                           const LV_MAP_COEFF_COST *const txb_costs,
                           const int coeff_ctx, const TX_CLASS tx_class) {
-  const TXB_CTX *txb_ctx = txb_info->txb_ctx;
+  const TXB_CTX *const txb_ctx = txb_info->txb_ctx;
   const int is_nz = (qc != 0);
   const tran_low_t abs_qc = abs(qc);
   int cost = 0;
@@ -1552,14 +1552,13 @@
 }
 
 int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
-                         int blk_row, int blk_col, int block, TX_SIZE tx_size,
-                         TXB_CTX *txb_ctx, int *rate_cost, int sharpness) {
+                         int block, TX_SIZE tx_size, TX_TYPE tx_type,
+                         const TXB_CTX *const txb_ctx, int *rate_cost,
+                         int sharpness) {
   const AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   const PLANE_TYPE plane_type = get_plane_type(plane);
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
-  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
-                                          tx_size, cm->reduced_tx_set_used);
   const TX_CLASS tx_class = tx_type_to_class[tx_type];
   const MB_MODE_INFO *mbmi = xd->mi[0];
   const struct macroblock_plane *p = &x->plane[plane];
diff --git a/av1/encoder/encodetxb.h b/av1/encoder/encodetxb.h
index 62a6017..aa847ad 100644
--- a/av1/encoder/encodetxb.h
+++ b/av1/encoder/encodetxb.h
@@ -78,8 +78,8 @@
 
 void hbt_destroy();
 int av1_optimize_txb_new(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
-                         int blk_row, int blk_col, int block, TX_SIZE tx_size,
-                         TXB_CTX *txb_ctx, int *rate_cost, int sharpness);
+                         int block, TX_SIZE tx_size, TX_TYPE tx_type,
+                         const TXB_CTX *txb_ctx, int *rate_cost, int sharpness);
 #ifdef __cplusplus
 }
 #endif
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 8045e3c..6f4fced 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -2516,8 +2516,7 @@
 static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
                                int block, int blk_row, int blk_col,
                                BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                               const ENTROPY_CONTEXT *a,
-                               const ENTROPY_CONTEXT *l,
+                               const TXB_CTX *const txb_ctx,
                                FAST_TX_SEARCH_MODE ftxs_mode,
                                int use_fast_coef_costing, int64_t ref_best_rd,
                                RD_STATS *best_rd_stats) {
@@ -2558,9 +2557,7 @@
         find_tx_size_rd_info(&x->txb_rd_record_intra, intra_hash);
     intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx];
 
-    TXB_CTX txb_ctx;
-    get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
-    cur_joint_ctx = (txb_ctx.dc_sign_ctx << 8) + txb_ctx.txb_skip_ctx;
+    cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
     if (intra_hash_idx > 0 &&
         intra_txb_rd_info->entropy_context == cur_joint_ctx &&
         x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) {
@@ -2679,8 +2676,6 @@
     block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
   block_sse *= 16;
 
-  TXB_CTX txb_ctx;
-  get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
   for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) {
     if (!allowed_tx_mask[tx_type]) continue;
     if (plane == 0) mbmi->txk_type[txk_type_idx] = tx_type;
@@ -2692,7 +2687,7 @@
           cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
           USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
       rate_cost = av1_cost_coeffs(cm, x, plane, blk_row, blk_col, block,
-                                  tx_size, &txb_ctx, use_fast_coef_costing);
+                                  tx_size, txb_ctx, use_fast_coef_costing);
     } else {
       av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
                       tx_size, tx_type, AV1_XFORM_QUANT_FP);
@@ -2702,15 +2697,15 @@
         dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
                              &this_rd_stats.sse);
         rate_cost = av1_cost_coeffs(cm, x, plane, blk_row, blk_col, block,
-                                    tx_size, &txb_ctx, use_fast_coef_costing);
+                                    tx_size, txb_ctx, use_fast_coef_costing);
         const int64_t rd_estimate =
             AOMMIN(RDCOST(x->rdmult, rate_cost, this_rd_stats.dist),
                    RDCOST(x->rdmult, 0, this_rd_stats.sse));
         if (rd_estimate - (rd_estimate >> 3) > AOMMIN(best_rd, ref_best_rd))
           continue;
       }
-      av1_optimize_b(cpi, x, plane, blk_row, blk_col, block, plane_bsize,
-                     tx_size, a, l, 1, &rate_cost);
+      av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx, 1,
+                     &rate_cost);
     }
     if (eobs_ptr[block] == 0) {
       // When eob is 0, pixel domain distortion is more efficient and accurate.
@@ -2809,8 +2804,8 @@
       } else {
         av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
                         tx_size, best_tx_type, AV1_XFORM_QUANT_FP);
-        av1_optimize_b(cpi, x, plane, blk_row, blk_col, block, plane_bsize,
-                       tx_size, a, l, 1, &rate_cost);
+        av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx, 1,
+                       &rate_cost);
       }
     }
 
@@ -2868,9 +2863,10 @@
     av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
     av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
   }
-
+  TXB_CTX txb_ctx;
+  get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
   search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  a, l, args->ftxs_mode, args->use_fast_coef_costing,
+                  &txb_ctx, args->ftxs_mode, args->use_fast_coef_costing,
                   args->best_rd - args->this_rd, &this_rd_stats);
 
   if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
@@ -4191,7 +4187,7 @@
 
   RD_STATS this_rd_stats;
   search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  a, l, ftxs_mode, 0, ref_rdcost, &this_rd_stats);
+                  &txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats);
 
   av1_merge_rd_stats(rd_stats, &this_rd_stats);
 
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index 5de6a53..1fa3d68 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -68,8 +68,8 @@
                      int bsh, int visible_w, int visible_h, int qindex);
 #endif
 
-static INLINE int av1_cost_skip_txb(MACROBLOCK *x, TXB_CTX *txb_ctx, int plane,
-                                    TX_SIZE tx_size) {
+static INLINE int av1_cost_skip_txb(MACROBLOCK *x, const TXB_CTX *const txb_ctx,
+                                    int plane, TX_SIZE tx_size) {
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
   const PLANE_TYPE plane_type = get_plane_type(plane);
   const LV_MAP_COEFF_COST *const coeff_costs =
@@ -79,7 +79,8 @@
 
 static INLINE int av1_cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x,
                                   int plane, int blk_row, int blk_col,
-                                  int block, TX_SIZE tx_size, TXB_CTX *txb_ctx,
+                                  int block, TX_SIZE tx_size,
+                                  const TXB_CTX *const txb_ctx,
                                   int use_fast_coef_costing) {
 #if TXCOEFF_COST_TIMER
   struct aom_usec_timer timer;