hash_based_trellis speed feature

Add a speed feature that uses hash tables to reuse
previously found optimized coefficients in
av1_optimize_txb, skipping some expensive
optimize_txb calls.
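
The hook sits at the top of av1_optimize_txb; a simplified
excerpt of the check added in encodetxb.c below (CONFIG
guards omitted):

  if (eob > 0 && eob <= HBT_HASH_EOB && cpi->sf.use_hash_based_trellis) {
    // Reuse (or record) optimized qcoeffs via the hash table instead of
    // running the full trellis optimization for this block.
    return hash_based_trellis_mode(&txb_info, &txb_costs, &txb_eob_costs,
                                   p, block, fast_mode, txb_ctx);
  }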

Currently shows no significant quality degradation
or speed improvement, so the feature is off by
default. Requires hash_me, lv_map and lv_map_multi.
Adding it to the speed features required changing
AV1_COMMON *cm to AV1_COMP *cpi through a chain of
functions.

Variations that have been tried:
- Varying the maximum eob at which the feature
  activates: 16, 32, 64. 16 is currently used; 64
  has the best hit rate but a longer execution time.
- Varying the data hashed and the length of the
  hashes: the first hash is 16 bits and based on
  context data, while the second hash is 16 bits and
  based only on pre-optimized qcoeff values (see the
  sketch below).
- Softening the data used for the hashes: ideally
  this raises the number of hits without
  compromising quality too much.
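
As a reference for the table layout, a minimal standalone
sketch of the lookup scheme (hypothetical names, not the
patch's code): the table is indexed by the 16-bit context
hash, each bucket holds a small set of candidates matched
on the 16-bit qcoeff hash, hit counts decay geometrically
on every probe, and a miss overwrites the least-hit slot.

  #include <stdint.h>

  #define HBT_BUCKETS 65536  /* indexed by the 16-bit ctx hash   */
  #define HBT_SLOTS 16       /* candidates per bucket            */
  #define HBT_MAX_EOB 16     /* optimized coefficients per entry */

  typedef struct {
    uint32_t match;               /* 16-bit qcoeff hash       */
    double hits;                  /* decayed hit counter      */
    int32_t qcoeff[HBT_MAX_EOB];  /* stored optimized qcoeffs */
  } HbtSlot;

  static HbtSlot hbt_table[HBT_BUCKETS][HBT_SLOTS];

  /* Returns the matching slot on a hit, or NULL on a miss. On a miss,
   * *victim points at the least-hit slot, which the caller fills after
   * running the real trellis optimization. */
  static HbtSlot *hbt_lookup(uint16_t ctx_hash, uint32_t qc_hash,
                             HbtSlot **victim) {
    HbtSlot *bucket = hbt_table[ctx_hash];
    int lowest = 0;
    for (int i = 0; i < HBT_SLOTS; i++) {
      bucket[i].hits *= 31.0 / 32.0;  /* decay every slot on each probe */
      if (bucket[i].hits < bucket[lowest].hits) lowest = i;
    }
    for (int i = 0; i < HBT_SLOTS; i++) {
      if (bucket[i].hits > 0 && bucket[i].match == qc_hash) {
        bucket[i].hits += 1.0;  /* reward the hit */
        return &bucket[i];
      }
    }
    *victim = &bucket[lowest];  /* replace the least useful slot */
    return NULL;
  }

In the patch, hbt_hash_hit() and hbt_hash_miss() below play
the roles of the hit and miss paths, with the optimized
qcoeffs copied in scan order.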

Change-Id: I94f22be82f3a46637c0489d512f2e334a307575f
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index f6e9502..7b36e0f 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -4453,8 +4453,7 @@
 #endif  // CONFIG_CFL
     mbmi->skip = 1;
     for (int plane = 0; plane < num_planes; ++plane) {
-      av1_encode_intra_block_plane((AV1_COMMON *)cm, x, bsize, plane, 1, mi_row,
-                                   mi_col);
+      av1_encode_intra_block_plane(cpi, x, bsize, plane, 1, mi_row, mi_col);
     }
 #if CONFIG_CFL
     xd->cfl.store_y = 0;
@@ -4519,7 +4518,7 @@
     }
 #endif
 
-    av1_encode_sb((AV1_COMMON *)cm, x, bsize, mi_row, mi_col, dry_run);
+    av1_encode_sb(cpi, x, bsize, mi_row, mi_col, dry_run);
     if (mbmi->skip) mbmi->min_tx_size = mbmi->tx_size;
     av1_tokenize_sb_vartx(cpi, td, t, dry_run, mi_row, mi_col, bsize, rate,
                           tile_data->allow_update_cdf);
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index e0374a1..7b95902 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -419,8 +419,8 @@
 }
 #endif  // !CONFIG_LV_MAP
 
-int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int blk_row,
-                   int blk_col, int block, BLOCK_SIZE plane_bsize,
+int av1_optimize_b(const AV1_COMP *const cpi, MACROBLOCK *mb, int plane,
+                   int blk_row, int blk_col, int block, BLOCK_SIZE plane_bsize,
                    TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
                    const ENTROPY_CONTEXT *l, int fast_mode) {
   MACROBLOCKD *const xd = &mb->e_mbd;
@@ -434,12 +434,13 @@
   (void)blk_row;
   (void)blk_col;
   int ctx = get_entropy_context(tx_size, a, l);
+  const AV1_COMMON *const cm = &cpi->common;
   return optimize_b_greedy(cm, mb, plane, blk_row, blk_col, block, tx_size, ctx,
                            fast_mode);
 #else   // !CONFIG_LV_MAP
   TXB_CTX txb_ctx;
   get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
-  return av1_optimize_txb(cm, mb, plane, blk_row, blk_col, block, tx_size,
+  return av1_optimize_txb(cpi, mb, plane, blk_row, blk_col, block, tx_size,
                           &txb_ctx, fast_mode);
 #endif  // !CONFIG_LV_MAP
 }
@@ -556,7 +557,7 @@
   (void)mi_col;
   (void)dry_run;
   struct encode_b_args *const args = arg;
-  AV1_COMMON *cm = args->cm;
+  const AV1_COMMON *const cm = &args->cpi->common;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = &x->plane[plane];
@@ -587,8 +588,8 @@
     p->eobs[block] = 0;
   }
 
-  av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size, a,
-                 l, CONFIG_LV_MAP);
+  av1_optimize_b(args->cpi, x, plane, blk_row, blk_col, block, plane_bsize,
+                 tx_size, a, l, CONFIG_LV_MAP);
 
   av1_set_txb_context(x, plane, block, tx_size, a, l);
 
@@ -738,13 +739,13 @@
                                          encode_block_pass1, &args);
 }
 
-void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
-                   int mi_col, RUN_TYPE dry_run) {
+void av1_encode_sb(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                   int mi_row, int mi_col, RUN_TYPE dry_run) {
   (void)dry_run;
   MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx ctx;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  struct encode_b_args arg = { cm, x, &ctx, &mbmi->skip, NULL, NULL, 1 };
+  struct encode_b_args arg = { cpi, x, &ctx, &mbmi->skip, NULL, NULL, 1 };
   int plane;
 
   mbmi->skip = 1;
@@ -844,7 +845,7 @@
                             BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                             void *arg) {
   struct encode_b_args *const args = arg;
-  AV1_COMMON *cm = args->cm;
+  const AV1_COMMON *const cm = &args->cpi->common;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = &x->plane[plane];
@@ -885,8 +886,8 @@
   if (args->enable_optimize_b) {
     av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                     AV1_XFORM_QUANT_FP);
-    av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize, tx_size,
-                   a, l, CONFIG_LV_MAP);
+    av1_optimize_b(args->cpi, x, plane, blk_row, blk_col, block, plane_bsize,
+                   tx_size, a, l, CONFIG_LV_MAP);
 
 #if CONFIG_TXK_SEL
     if (plane == 0 && p->eobs[block] == 0) {
@@ -913,7 +914,7 @@
 #endif  // CONFIG_CFL
 }
 
-void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x,
+void av1_encode_intra_block_plane(const AV1_COMP *const cpi, MACROBLOCK *x,
                                   BLOCK_SIZE bsize, int plane,
                                   int enable_optimize_b, int mi_row,
                                   int mi_col) {
@@ -922,7 +923,7 @@
   ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE] = { 0 };
 
   struct encode_b_args arg = {
-    cm, x, NULL, &xd->mi[0]->mbmi.skip, ta, tl, enable_optimize_b
+    cpi, x, NULL, &xd->mi[0]->mbmi.skip, ta, tl, enable_optimize_b
   };
 
   if (!is_chroma_reference(mi_row, mi_col, bsize,
diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h
index cf7d3dd..2f7b109 100644
--- a/av1/encoder/encodemb.h
+++ b/av1/encoder/encodemb.h
@@ -26,7 +26,7 @@
 };
 
 struct encode_b_args {
-  AV1_COMMON *cm;
+  const struct AV1_COMP *cpi;
   MACROBLOCK *x;
   struct optimize_ctx *ctx;
   int8_t *skip;
@@ -43,15 +43,15 @@
   AV1_XFORM_QUANT_TYPES,
 } AV1_XFORM_QUANT;
 
-void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
-                   int mi_col, RUN_TYPE dry_run);
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                   int mi_row, int mi_col, RUN_TYPE dry_run);
 void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize);
 void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
                      int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
                      TX_SIZE tx_size, AV1_XFORM_QUANT xform_quant_idx);
 
-int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int blk_row,
-                   int blk_col, int block, BLOCK_SIZE plane_bsize,
+int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
+                   int blk_row, int blk_col, int block, BLOCK_SIZE plane_bsize,
                    TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
                    const ENTROPY_CONTEXT *l, int fast_mode);
 
@@ -66,7 +66,7 @@
 void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
                             BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
 
-void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x,
+void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
                                   BLOCK_SIZE bsize, int plane,
                                   int enable_optimize_b, int mi_row,
                                   int mi_col);
diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c
index 99cd71e..086cda6 100644
--- a/av1/encoder/encodetxb.c
+++ b/av1/encoder/encodetxb.c
@@ -18,11 +18,25 @@
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/cost.h"
 #include "av1/encoder/encodetxb.h"
+#include "av1/encoder/hash.h"
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/tokenize.h"
 
 #define TEST_OPTIMIZE_TXB 0
 
+static int hbt_hash_needs_init = 1;
+static CRC_CALCULATOR crc_calculator;
+static CRC_CALCULATOR crc_calculator2;
+static const int HBT_HASH_EOB = 16;  // also the length of opt_qcoeff[]
+
+typedef struct OptTxbQcoeff {
+  uint32_t hbt_hash_match;
+  double hits;
+  tran_low_t opt_qcoeff[16];
+} OptTxbQcoeff;
+
+OptTxbQcoeff hbt_hash_table[65536][16];
+
 typedef struct LevelDownStats {
   int update;
   tran_low_t low_qc;
@@ -291,6 +305,16 @@
   stats->update = 0;
   stats->rd_low = 0;
   stats->rd = 0;
+// TODO(mfo): explore if there's a better way to prevent compiler init
+// warnings
+#if CONFIG_LV_MAP_MULTI
+  stats->nz_rd = 0;
+#else
+  stats->nz_rate = 0;
+#endif
+  stats->dist_low = 0;
+  stats->rate_low = 0;
+  stats->low_qc = 0;
 
   const tran_low_t tqc = txb_info->tcoeff[coeff_idx];
   const int dqv = txb_info->dequant[coeff_idx != 0];
@@ -2196,9 +2220,215 @@
   { 17, 13 }, { 16, 10 },
 };
 
-int av1_optimize_txb(const AV1_COMMON *cm, MACROBLOCK *x, int plane,
+void hbt_hash_init() {
+  av1_crc_calculator_init(&crc_calculator, 16, 0x5D6DCB);   // ctx 16 bit hash
+  av1_crc_calculator_init(&crc_calculator2, 16, 0x5D6DCB);  // qc 16 bit hash
+  memset(hbt_hash_table, 0, sizeof(hbt_hash_table[0][0]) * 65536 * 16);
+  hbt_hash_needs_init = 0;
+}
+
+int hbt_hash_miss(int found_index, uint16_t hbt_hash_index,
+                  uint32_t hbt_hash_match, TxbInfo *txb_info,
+                  const LV_MAP_COEFF_COST *txb_costs,
+#if CONFIG_LV_MAP_MULTI
+                  const LV_MAP_EOB_COST *txb_eob_costs,
+#endif
+                  const struct macroblock_plane *p, int block, int fast_mode) {
+  const int16_t *scan = txb_info->scan_order->scan;
+
+  av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height,
+                      txb_info->levels);
+  // hash_based_trellis requires lv_map_multi, so CONFIG_LV_MAP_MULTI is set.
+  const int update = optimize_txb(txb_info, txb_costs,
+#if CONFIG_LV_MAP_MULTI
+                                  txb_eob_costs,
+#endif
+                                  NULL, 0, fast_mode);
+
+  if (update) {
+    // Overwrite old lowest entry
+    hbt_hash_table[hbt_hash_index][found_index].hbt_hash_match = hbt_hash_match;
+    hbt_hash_table[hbt_hash_index][found_index].hits = 1.0;
+    for (int i = 0; i < txb_info->eob; i++) {
+      hbt_hash_table[hbt_hash_index][found_index].opt_qcoeff[i] =
+          txb_info->qcoeff[scan[i]];
+    }
+    for (int i = txb_info->eob; i < HBT_HASH_EOB; i++) {
+      hbt_hash_table[hbt_hash_index][found_index].opt_qcoeff[i] = 0;
+    }
+
+    p->eobs[block] = txb_info->eob;
+    p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
+        txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
+  }
+  return txb_info->eob;
+}
+
+int hbt_hash_hit(uint16_t hbt_hash_index, int found_index, TxbInfo *txb_info,
+                 const struct macroblock_plane *p, int block) {
+  const int16_t *scan = txb_info->scan_order->scan;
+  int new_eob = 0;
+  int update = 0;
+
+  for (int i = 0; i < txb_info->eob; i++) {
+    if (txb_info->qcoeff[scan[i]] !=
+        hbt_hash_table[hbt_hash_index][found_index].opt_qcoeff[i]) {
+      txb_info->qcoeff[scan[i]] =
+          hbt_hash_table[hbt_hash_index][found_index].opt_qcoeff[i];
+      update = 1;
+      update_coeff(scan[i], txb_info->qcoeff[scan[i]], txb_info);
+    }
+
+    if (txb_info->qcoeff[scan[i]]) new_eob = i + 1;
+  }
+
+  if (update) {
+    txb_info->eob = new_eob;
+    p->eobs[block] = txb_info->eob;
+    p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
+        txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
+  }
+  return txb_info->eob;
+}
+
+int search_hbt_hash_match(uint16_t hbt_hash_index, uint32_t hbt_hash_match,
+                          TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
+#if CONFIG_LV_MAP_MULTI
+                          const LV_MAP_EOB_COST *txb_eob_costs,
+#endif
+                          const struct macroblock_plane *p, int block,
+                          int fast_mode) {
+  // Decay all hits and find the least-used entry (replacement candidate).
+  double lowest_hits = 1.0;
+  int lowest_index = 0;
+
+  for (int i = 0; i < 16; i++) {
+    hbt_hash_table[hbt_hash_index][i].hits *= 31.0;
+    hbt_hash_table[hbt_hash_index][i].hits /= 32.0;
+
+    if (hbt_hash_table[hbt_hash_index][i].hits < lowest_hits) {
+      lowest_hits = hbt_hash_table[hbt_hash_index][i].hits;
+      lowest_index = i;
+    }
+  }
+
+  // Search soft hash vector for qcoeff match
+  int found_index = -1;
+  for (int i = 0; i < 16; i++) {  // OptTxbQcoeff array has fixed size of 16.
+    if (hbt_hash_table[hbt_hash_index][i].hbt_hash_match == hbt_hash_match) {
+      found_index = i;
+      hbt_hash_table[hbt_hash_index][i].hits += 1.0;
+      break;  // Found a match and it's at found_index
+    }
+  }
+
+  if (found_index == -1) {  // Add new OptTxbQcoeff into array.
+    return hbt_hash_miss(lowest_index, hbt_hash_index, hbt_hash_match, txb_info,
+                         txb_costs,
+#if CONFIG_LV_MAP_MULTI
+                         txb_eob_costs,
+#endif
+                         p, block, fast_mode);
+  } else {  // Retrieve data from array.
+    return hbt_hash_hit(hbt_hash_index, found_index, txb_info, p, block);
+  }
+}
+
+int hash_based_trellis_mode(TxbInfo *txb_info,
+                            const LV_MAP_COEFF_COST *txb_costs,
+#if CONFIG_LV_MAP_MULTI
+                            const LV_MAP_EOB_COST *txb_eob_costs,
+#endif
+                            const struct macroblock_plane *p, int block,
+                            int fast_mode, TXB_CTX *txb_ctx) {
+  // Initialize hash table if needed.
+  if (hbt_hash_needs_init) {
+    hbt_hash_init();
+  }
+
+  //// Hash creation
+  // TODO(mfo): use exact length once input finalized
+  uint8_t txb_hash_data[256];
+  const int16_t *scan = txb_info->scan_order->scan;
+  uint8_t chunk = 0;
+
+  uint16_t ctx_hash = 0;
+  uint32_t qc_hash = 0;
+
+  int hash_data_index = 0;
+  for (int i = 0; i < txb_info->eob; i++) {
+    // Data softening: data from -3 -> 3 is left alone,
+    // while 'large' data is put into buckets of 16s
+    // Consider bucketing less than 16 down to 4 instead of 0
+    // if(txb_info->qcoeff[scan[i]] < 4 && txb_info->qcoeff[scan[i]] > -4)
+    chunk = (txb_info->qcoeff[scan[i]]) & 0xff;
+    /*else if(txb_info->qcoeff[scan[i]] < 16 && txb_info->qcoeff[scan[i]] > -16)
+      chunk = (txb_info->qcoeff[scan[i]]) & 0xfc; //
+    else
+      chunk = (txb_info->qcoeff[scan[i]]) & 0xf0; // greater than 16*/
+    txb_hash_data[hash_data_index++] = chunk;
+
+    chunk = ((txb_info->qcoeff[scan[i]]) & 0xff00) >> 8;
+    txb_hash_data[hash_data_index++] = chunk;
+  }
+  assert(hash_data_index <= 256);
+  // 16 bit
+  qc_hash = av1_get_crc_value(&crc_calculator2, txb_hash_data, hash_data_index);
+
+  hash_data_index = 0;
+  // tcoeff
+  for (int i = 0; i < txb_info->eob; i++) {
+    chunk = (txb_info->tcoeff[scan[i]] - txb_info->dqcoeff[scan[i]]) & 0xff;
+    txb_hash_data[hash_data_index++] = chunk;
+  }
+  // txb_ctx
+  chunk = txb_ctx->txb_skip_ctx & 0xff;
+  txb_hash_data[hash_data_index++] = chunk;
+  chunk = txb_ctx->dc_sign_ctx & 0xff;
+  txb_hash_data[hash_data_index++] = chunk;
+  // dequant
+  chunk = txb_info->dequant[0] & 0xff;
+  txb_hash_data[hash_data_index++] = chunk;
+  chunk = (txb_info->dequant[0] & 0xff00) >> 8;
+  txb_hash_data[hash_data_index++] = chunk;
+  chunk = txb_info->dequant[1] & 0xff;
+  txb_hash_data[hash_data_index++] = chunk;
+  chunk = (txb_info->dequant[1] & 0xff00) >> 8;
+  txb_hash_data[hash_data_index++] = chunk;
+  // txb_skip_cost
+  /*for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < TXB_SKIP_CONTEXTS; j++) {
+      chunk = (txb_costs->txb_skip_cost[j][i] & 0xff00) >> 8;
+      txb_hash_data[hash_data_index++] = chunk;
+    }
+  }
+  // base_eob_cost
+  for (int i = 1; i < 3; i++) {  // i = 0 are softened away
+    for (int j = 0; j < SIG_COEF_CONTEXTS_EOB; j++) {
+      chunk = (txb_costs->base_eob_cost[j][i] & 0xff00) >> 8;
+      txb_hash_data[hash_data_index++] = chunk;
+    }
+  }*/
+  assert(hash_data_index <= 256);
+  // Gives 16 bit hash for ctx
+  ctx_hash = av1_get_crc_value(&crc_calculator, txb_hash_data, hash_data_index);
+
+  uint16_t hbt_hash_index = ctx_hash;  // 16 bit ctx_hash: index to table
+  uint32_t hbt_hash_match = qc_hash;   // 16 bit qc_hash: matched in array
+  //// End hash creation
+
+  return search_hbt_hash_match(hbt_hash_index, hbt_hash_match, txb_info,
+                               txb_costs,
+#if CONFIG_LV_MAP_MULTI
+                               txb_eob_costs,
+#endif
+                               p, block, fast_mode);
+}
+
+int av1_optimize_txb(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
                      int blk_row, int blk_col, int block, TX_SIZE tx_size,
                      TXB_CTX *txb_ctx, int fast_mode) {
+  const AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   const PLANE_TYPE plane_type = get_plane_type(plane);
   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
@@ -2266,6 +2496,17 @@
     &cm->coeff_ctx_table
   };
 
+  // Hash based trellis (hbt) speed feature: avoid expensive optimize_txb calls
+  // by storing the optimized coefficients in a hash table.
+  // Currently disabled in speed_features.c
+  if (eob <= HBT_HASH_EOB && eob > 0 && cpi->sf.use_hash_based_trellis) {
+    return hash_based_trellis_mode(&txb_info, &txb_costs,
+#if CONFIG_LV_MAP_MULTI
+                                   &txb_eob_costs,
+#endif
+                                   p, block, fast_mode, txb_ctx);
+  }
+
   av1_txb_init_levels(qcoeff, width, height, levels);
 
   const int update = optimize_txb(&txb_info, &txb_costs,
@@ -2623,7 +2864,7 @@
     } else {
       av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
                       tx_size, AV1_XFORM_QUANT_FP);
-      av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize,
+      av1_optimize_b(cpi, x, plane, blk_row, blk_col, block, plane_bsize,
                      tx_size, a, l, 1);
     }
     av1_dist_block(cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size,
@@ -2662,7 +2903,7 @@
     } else {
       av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
                       tx_size, AV1_XFORM_QUANT_FP);
-      av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize,
+      av1_optimize_b(cpi, x, plane, blk_row, blk_col, block, plane_bsize,
                      tx_size, a, l, 1);
     }
 
diff --git a/av1/encoder/encodetxb.h b/av1/encoder/encodetxb.h
index 4d3a8ca..beddb49 100644
--- a/av1/encoder/encodetxb.h
+++ b/av1/encoder/encodetxb.h
@@ -107,9 +107,10 @@
                             const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l,
                             int use_fast_coef_costing, RD_STATS *rd_stats);
 #endif
-int av1_optimize_txb(const AV1_COMMON *cm, MACROBLOCK *x, int plane,
-                     int blk_row, int blk_col, int block, TX_SIZE tx_size,
-                     TXB_CTX *txb_ctx, int fast_mode);
+
+int av1_optimize_txb(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int blk_row,
+                     int blk_col, int block, TX_SIZE tx_size, TXB_CTX *txb_ctx,
+                     int fast_mode);
 #ifdef __cplusplus
 }
 #endif
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index 18ab9a7..3af8c7a 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -641,7 +641,7 @@
       xd->mi[0]->mbmi.mode = DC_PRED;
       xd->mi[0]->mbmi.tx_size =
           use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
-      av1_encode_intra_block_plane(cm, x, bsize, 0, 0, mb_row * 2, mb_col * 2);
+      av1_encode_intra_block_plane(cpi, x, bsize, 0, 0, mb_row * 2, mb_col * 2);
       this_error = aom_get_mb_ss(x->plane[0].src_diff);
 
       // Keep a record of blocks that have almost no intra error residual
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index b55c936..e1e1be3 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1929,7 +1929,7 @@
         disable_early_skip ||
 #endif
         RDCOST(x->rdmult, 0, tmp_dist) + args->this_rd < args->best_rd) {
-      av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize,
+      av1_optimize_b(cpi, x, plane, blk_row, blk_col, block, plane_bsize,
                      tx_size, a, l, CONFIG_LV_MAP);
     } else {
       args->exit_early = 1;
@@ -3581,7 +3581,7 @@
         disable_early_skip ||
 #endif
         RDCOST(x->rdmult, 0, tmp_dist) < rd_stats->ref_rdcost) {
-      av1_optimize_b(cm, x, plane, blk_row, blk_col, block, plane_bsize,
+      av1_optimize_b(cpi, x, plane, blk_row, blk_col, block, plane_bsize,
                      tx_size, a, l, fast);
     } else {
       rd_stats->rate += rd_stats->zero_rate;
@@ -8704,8 +8704,8 @@
       // during luma RDO, so we can store reconstructed luma values
       memcpy(x->blk_skip[0], ctx->blk_skip[0],
              sizeof(uint8_t) * ctx->num_4x4_blk);
-      av1_encode_intra_block_plane((AV1_COMMON *)cm, x, bsize, AOM_PLANE_Y, 1,
-                                   mi_row, mi_col);
+      av1_encode_intra_block_plane(cpi, x, bsize, AOM_PLANE_Y, 1, mi_row,
+                                   mi_col);
       xd->cfl.store_y = 0;
     }
 #endif  // CONFIG_CFL
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index e13819b..0e5e888 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -149,6 +149,10 @@
 #if CONFIG_DUAL_FILTER
     sf->use_fast_interpolation_filter_search = 1;
 #endif  // CONFIG_DUAL_FILTER
+#if 0   // CONFIG_HASH_ME && CONFIG_LV_MAP && CONFIG_LV_MAP_MULTI
+    // TODO(mfo): Activate feature once it gives positive results.
+    sf->use_hash_based_trellis = 1;
+#endif  // CONFIG_HASH_ME && CONFIG_LV_MAP && CONFIG_LV_MAP_MULTI
   }
 
   if (speed >= 2) {
@@ -515,6 +519,7 @@
   sf->use_transform_domain_distortion = 0;
   sf->gm_search_type = GM_FULL_SEARCH;
   sf->use_fast_interpolation_filter_search = 0;
+  sf->use_hash_based_trellis = 0;
 
   set_dev_sf(cpi, sf, oxcf->dev_sf);
 
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 9b7d3e6..66128a1 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -520,6 +520,10 @@
   // usually includes EIGHTTAP_REGULAR.
   int use_fast_interpolation_filter_search;
 
+  // Use a hash table to store previously computed optimized qcoeffs from
+  // expensive calls to optimize_txb.
+  int use_hash_based_trellis;
+
   // flag to drop some ref frames in compound motion search
   int drop_ref;
 } SPEED_FEATURES;