Refactor and speed up av1_optimize_txb()

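Add update_coeff_eob_facade() and update_coeff_simple_facade(), which
wrap the per-coefficient loops that were previously expanded inline by
the UPDATE_COEFF_EOB_CASE and UPDATE_COEFF_SIMPLE_CASE macros.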

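Refactor update_coeff_eob() and update_coeff_simple() to eliminate
redundant computations: get_dqv() is no longer called unconditionally
at function entry, but only after the early-return path in
update_coeff_simple() and inside the non-zero-coefficient branch in
update_coeff_eob().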

Encoder performance results averaged over all resolutions:

       Encoder Instruction
  CPU   Count Reduction (%)
   1        1.17
   2        1.03
   3        0.85
   4        0.50
   5        0.57
   6        0.46

This change is bit-exact for all presets.

Change-Id: Ib00ae2d15bbd06ac1d65b921171a4ab7b5035744
diff --git a/av1/encoder/txb_rdopt.c b/av1/encoder/txb_rdopt.c
index 6f0fdff..cf76ffd 100644
--- a/av1/encoder/txb_rdopt.c
+++ b/av1/encoder/txb_rdopt.c
@@ -80,7 +80,6 @@
     const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff,
     uint8_t *levels, int sharpness, const qm_val_t *iqmatrix,
     const qm_val_t *qmatrix) {
-  const int dqv = get_dqv(dequant, scan[si], iqmatrix);
   (void)eob;
   // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
   // and not the last (scan_idx != eob - 1)
@@ -104,6 +103,7 @@
       return;
     }
 
+    const int dqv = get_dqv(dequant, scan[si], iqmatrix);
     const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift, qmatrix, ci);
     const int64_t rd = RDCOST(rdmult, rate, dist);
 
@@ -135,7 +135,6 @@
     const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
     tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness,
     const qm_val_t *iqmatrix, const qm_val_t *qmatrix) {
-  const int dqv = get_dqv(dequant, scan[si], iqmatrix);
   assert(si != *eob - 1);
   const int ci = scan[si];
   const tran_low_t qc = qcoeff[ci];
@@ -144,6 +143,7 @@
   if (qc == 0) {
     *accu_rate += txb_costs->base_cost[coeff_ctx][0];
   } else {
+    const int dqv = get_dqv(dequant, scan[si], iqmatrix);
     int lower_level = 0;
     const tran_low_t abs_qc = abs(qc);
     const tran_low_t tqc = tcoeff[ci];
@@ -299,6 +299,36 @@
   return 0;
 }
 
+static AOM_FORCE_INLINE void update_coeff_eob_facade(
+    int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci,
+    int *si, TX_SIZE tx_size, TX_CLASS tx_class, int bhl, int width,
+    int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant,
+    const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs,
+    const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
+    tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness,
+    const qm_val_t *iqmatrix, const qm_val_t *qmatrix, int max_nz_num) {
+  for (; *si >= 0 && *nz_num <= max_nz_num; --*si) {
+    update_coeff_eob(accu_rate, accu_dist, eob, nz_num, nz_ci, *si, tx_size,
+                     tx_class, bhl, width, dc_sign_ctx, rdmult, shift, dequant,
+                     scan, txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff,
+                     levels, sharpness, iqmatrix, qmatrix);
+  }
+}
+
+static AOM_FORCE_INLINE void update_coeff_simple_facade(
+    int *accu_rate, int *si, int eob, TX_SIZE tx_size, TX_CLASS tx_class,
+    int bhl, int64_t rdmult, int shift, const int16_t *dequant,
+    const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs,
+    const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff,
+    uint8_t *levels, int sharpness, const qm_val_t *iqmatrix,
+    const qm_val_t *qmatrix) {
+  for (; *si >= 1; --*si) {
+    update_coeff_simple(accu_rate, *si, eob, tx_size, tx_class, bhl, rdmult,
+                        shift, dequant, scan, txb_costs, tcoeff, qcoeff,
+                        dqcoeff, levels, sharpness, iqmatrix, qmatrix);
+  }
+}
+
 int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
                      int block, TX_SIZE tx_size, TX_TYPE tx_type,
                      const TXB_CTX *const txb_ctx, int *rate_cost,
@@ -400,13 +430,11 @@
 
 #define UPDATE_COEFF_EOB_CASE(tx_class_literal)                            \
   case tx_class_literal:                                                   \
-    for (; si >= 0 && nz_num <= max_nz_num; --si) {                        \
-      update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si,   \
-                       tx_size, tx_class_literal, bhl, width,              \
-                       txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \
-                       txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff,  \
-                       levels, sharpness, iqmatrix, qmatrix);              \
-    }                                                                      \
+    update_coeff_eob_facade(                                               \
+        &accu_rate, &accu_dist, &eob, &nz_num, nz_ci, &si, tx_size,        \
+        tx_class_literal, bhl, width, txb_ctx->dc_sign_ctx, rdmult, shift, \
+        dequant, scan, txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff,  \
+        levels, sharpness, iqmatrix, qmatrix, max_nz_num);                 \
     break
   switch (tx_class) {
     UPDATE_COEFF_EOB_CASE(TX_CLASS_2D);
@@ -421,14 +449,12 @@
                 non_skip_cost, qcoeff, dqcoeff);
   }
 
-#define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal)                             \
-  case tx_class_literal:                                                       \
-    for (; si >= 1; --si) {                                                    \
-      update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bhl, \
-                          rdmult, shift, dequant, scan, txb_costs, tcoeff,     \
-                          qcoeff, dqcoeff, levels, sharpness, iqmatrix,        \
-                          qmatrix);                                            \
-    }                                                                          \
+#define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal)                            \
+  case tx_class_literal:                                                      \
+    update_coeff_simple_facade(&accu_rate, &si, eob, tx_size,                 \
+                               tx_class_literal, bhl, rdmult, shift, dequant, \
+                               scan, txb_costs, tcoeff, qcoeff, dqcoeff,      \
+                               levels, sharpness, iqmatrix, qmatrix);         \
     break
   switch (tx_class) {
     UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_2D);