Rework hash map for txk-sel

txk-sel allows each transform block to select its own transform
kernel. This locality makes it possible to store the selected RD
cost, including the tx_type selection, per transform block size.
This reduces the required hash map size to 1/16 of what is needed
without txk-sel.

This commit reworks the hash map RD cost fetch for txk-sel. Tested
on red_kayak_480p at speed 1, enabling txk-sel makes encoding 12%
faster than the baseline with txk-sel disabled. Additionally enabling
the reduced hash map size makes speed 1 encoding another 10% faster.

Change-Id: I4a5d99d27e2a76b10e76c00a8178f692c95fdf13
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 3751c19..604e781 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -132,9 +132,13 @@
 
 typedef struct {
   int64_t dist;
+  int64_t sse;
   int rate;
   uint16_t eob;
 #if CONFIG_LV_MAP
+#if CONFIG_TXK_SEL
+  TX_TYPE tx_type;
+#endif
   uint16_t entropy_context;
   uint8_t txb_entropy_ctx;
 #else
@@ -147,7 +151,11 @@
 #define TX_SIZE_RD_RECORD_BUFFER_LEN 256
 typedef struct {
   uint32_t hash_vals[TX_SIZE_RD_RECORD_BUFFER_LEN];
+#if CONFIG_TXK_SEL
+  TX_SIZE_RD_INFO tx_rd_info[TX_SIZE_RD_RECORD_BUFFER_LEN];
+#else
   TX_SIZE_RD_INFO tx_rd_info[TX_SIZE_RD_RECORD_BUFFER_LEN][TX_TYPES];
+#endif
   int index_start;
   int num;
 } TX_SIZE_RD_RECORD;
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 6679d50..7f3d81d 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -3629,19 +3629,58 @@
                        int plane_bsize, const ENTROPY_CONTEXT *a,
                        const ENTROPY_CONTEXT *l, RD_STATS *rd_stats, int fast,
                        TX_SIZE_RD_INFO *rd_info_array) {
+  const struct macroblock_plane *const p = &x->plane[plane];
 #if CONFIG_TXK_SEL
-  (void)fast;
-  (void)rd_info_array;
+  TXB_CTX txb_ctx;
+  get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+  const uint16_t cur_joint_ctx =
+      (txb_ctx.dc_sign_ctx << 8) + txb_ctx.txb_skip_ctx;
+
+  // Look up RD and terminate early in case when we've already processed exactly
+  // the same residual with exactly the same entropy context.
+  if (rd_info_array != NULL && rd_info_array->valid &&
+      rd_info_array->entropy_context == cur_joint_ctx &&
+      rd_info_array->fast == fast) {
+    rd_stats->rate += rd_info_array->rate;
+    rd_stats->dist += rd_info_array->dist;
+    rd_stats->sse += rd_info_array->sse;
+    rd_stats->skip &= rd_info_array->eob == 0;
+    p->eobs[block] = rd_info_array->eob;
+    p->txb_entropy_ctx[block] = rd_info_array->txb_entropy_ctx;
+    if (plane == 0) {
+      x->e_mbd.mi[0]->mbmi.txk_type[(blk_row << MAX_MIB_SIZE_LOG2) + blk_col] =
+          rd_info_array->tx_type;
+    }
+    return;
+  }
 
   RD_STATS this_rd_stats;
   search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
                   a, l, 0, &this_rd_stats);
+
   av1_merge_rd_stats(rd_stats, &this_rd_stats);
+
+  // Save RD results for possible reuse in future.
+  if (rd_info_array != NULL) {
+    rd_info_array->valid = 1;
+    rd_info_array->entropy_context = cur_joint_ctx;
+    rd_info_array->fast = fast;
+    rd_info_array->rate = this_rd_stats.rate;
+    rd_info_array->dist = this_rd_stats.dist;
+    rd_info_array->sse = this_rd_stats.sse;
+    rd_info_array->eob = p->eobs[block];
+    rd_info_array->txb_entropy_ctx = p->txb_entropy_ctx[block];
+    if (plane == 0) {
+      rd_info_array->tx_type =
+          x->e_mbd.mi[0]
+              ->mbmi.txk_type[(blk_row << MAX_MIB_SIZE_LOG2) + blk_col];
+    }
+  }
+
   return;
 #else
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  const struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
 
   // This function is used only for inter
@@ -4611,11 +4650,18 @@
           const int rd_record_idx =
               row_in_sb * (MAX_MIB_SIZE >> (cur_tx_size + 1 - TX_8X8)) +
               col_in_sb;
+
           int idx = find_tx_size_rd_info(
               &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx], hash);
+#if CONFIG_TXK_SEL
+          dst_rd_info[cur_rd_info_idx].rd_info_array =
+              &rd_records_table[cur_tx_size - TX_8X8][rd_record_idx]
+                   .tx_rd_info[idx];
+#else
           dst_rd_info[cur_rd_info_idx].rd_info_array =
               rd_records_table[cur_tx_size - TX_8X8][rd_record_idx]
                   .tx_rd_info[idx];
+#endif
         }
 
         // Update the output quadtree RD info structure.