AV1 RT: Make TXB RD Records dynamically allocated

TXB RD Records take about 3.2MB and are part of the MACROBLOCK
structure. For RealTime multithreading this leads to a huge overhead
when copying the MACROBLOCK structure for each worker thread. Allocating
the records dynamically reduces this overhead: ~15% speed-up for
2-thread encoding of 360p content at speed 8.

Change-Id: I1b88fbf41bec67fe2654c88f41f3ef04ca2d9153
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index db042d9..3c40085 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -480,6 +480,38 @@
 #define MAX_NUM_64X64_TXBS ((MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4))
 /*!\endcond */
 
+/*! \brief Txfm hash records
+ *
+ * Hash records of the transform search results based on the residue. There
+ * are two main types here:
+ * - MB_RD_RECORD: records a whole *partition block*'s inter-mode txfm result.
+ *   Since this operates on the partition block level, this can give us a
+ *   whole txfm partition tree.
+ * - TXB_RD_RECORD: records a txfm search result within a transform block
+ *   itself. This operates on txb level only and only applies to square
+ *   txfms.
+ */
+typedef struct {
+  /*****************************************************************************
+   * \name TXB RD Record
+   ****************************************************************************/
+  /**@{*/
+  //! Txfm hash record for the whole coding block.
+  MB_RD_RECORD mb_rd_record;
+
+  //! Inter mode txfm hash record for TX_8X8 blocks.
+  TXB_RD_RECORD txb_rd_record_8X8[MAX_NUM_8X8_TXBS];
+  //! Inter mode txfm hash record for TX_16X16 blocks.
+  TXB_RD_RECORD txb_rd_record_16X16[MAX_NUM_16X16_TXBS];
+  //! Inter mode txfm hash record for TX_32X32 blocks.
+  TXB_RD_RECORD txb_rd_record_32X32[MAX_NUM_32X32_TXBS];
+  //! Inter mode txfm hash record for TX_64X64 blocks.
+  TXB_RD_RECORD txb_rd_record_64X64[MAX_NUM_64X64_TXBS];
+  //! Intra mode txfm hash record for square tx blocks.
+  TXB_RD_RECORD txb_rd_record_intra;
+  /**@}*/
+} TxbRdRecords;
+
 /*! \brief Stores various encoding/search decisions related to txfm search.
  *
  * This struct contains a cache of previous txfm results, and some buffers for
@@ -509,7 +541,8 @@
    */
   uint8_t tx_type_map_[MAX_MIB_SIZE * MAX_MIB_SIZE];
 
-  /** \name Txfm hash records
+  /*! \brief Txfm hash records
+   *
    * Hash records of the transform search results based on the residue. There
    * are two main types here:
    * - MB_RD_RECORD: records a whole *partition block*'s inter-mode txfm result.
@@ -519,21 +552,7 @@
    *   itself. This operates on txb level only and onlyt appplies to square
    *   txfms.
    */
-  /**@{*/
-  //! Txfm hash record for the whole coding block.
-  MB_RD_RECORD mb_rd_record;
-
-  //! Inter mode txfm hash record for TX_8X8 blocks.
-  TXB_RD_RECORD txb_rd_record_8X8[MAX_NUM_8X8_TXBS];
-  //! Inter mode txfm hash record for TX_16X16 blocks.
-  TXB_RD_RECORD txb_rd_record_16X16[MAX_NUM_16X16_TXBS];
-  //! Inter mode txfm hash record for TX_32X32 blocks.
-  TXB_RD_RECORD txb_rd_record_32X32[MAX_NUM_32X32_TXBS];
-  //! Inter mode txfm hash record for TX_64X64 blocks.
-  TXB_RD_RECORD txb_rd_record_64X64[MAX_NUM_64X64_TXBS];
-  //! Intra mode txfm hash record for square tx blocks.
-  TXB_RD_RECORD txb_rd_record_intra;
-  /**@}*/
+  TxbRdRecords *txb_rd_records;
 
   /*! \brief Number of txb splits.
    *
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 981dc95..f2981c5 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -996,8 +996,10 @@
   if (cpi->oxcf.intra_mode_cfg.enable_cfl_intra)
     cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params);
 
-  av1_crc32c_calculator_init(
-      &td->mb.txfm_search_info.mb_rd_record.crc_calculator);
+  if (td->mb.txfm_search_info.txb_rd_records != NULL) {
+    av1_crc32c_calculator_init(
+        &td->mb.txfm_search_info.txb_rd_records->mb_rd_record.crc_calculator);
+  }
 
   for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
        mi_row += cm->seq_params.mib_size) {
@@ -1022,6 +1024,10 @@
   if (cpi->allocated_tiles < tile_cols * tile_rows) av1_alloc_tile_data(cpi);
 
   av1_init_tile_data(cpi);
+  if (!cpi->sf.rt_sf.use_nonrd_pick_mode) {
+    cpi->td.mb.txfm_search_info.txb_rd_records =
+        (TxbRdRecords *)aom_malloc(sizeof(TxbRdRecords));
+  }
 
   for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
@@ -1036,6 +1042,11 @@
       cpi->deltaq_used |= cpi->td.deltaq_used;
     }
   }
+
+  if (cpi->td.mb.txfm_search_info.txb_rd_records) {
+    aom_free(cpi->td.mb.txfm_search_info.txb_rd_records);
+    cpi->td.mb.txfm_search_info.txb_rd_records = NULL;
+  }
 }
 
 // Set the relative distance of a reference frame w.r.t. current frame
@@ -1535,7 +1546,6 @@
       }
     }
   }
-
   if (hash_table_created) {
     av1_hash_table_destroy(&intrabc_hash_info->intrabc_hash_table);
   }
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 6f6ffde..2aa3dd0 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -483,8 +483,10 @@
                            &td->mb.e_mbd);
 
     cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params);
-    av1_crc32c_calculator_init(
-        &td->mb.txfm_search_info.mb_rd_record.crc_calculator);
+    if (td->mb.txfm_search_info.txb_rd_records != NULL) {
+      av1_crc32c_calculator_init(
+          &td->mb.txfm_search_info.txb_rd_records->mb_rd_record.crc_calculator);
+    }
 
     av1_encode_sb_row(cpi, td, tile_row, tile_col, current_mi_row);
 #if CONFIG_MULTITHREAD
@@ -773,6 +775,10 @@
     EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
     cpi->intrabc_used |= thread_data->td->intrabc_used;
     cpi->deltaq_used |= thread_data->td->deltaq_used;
+    if (thread_data->td->mb.txfm_search_info.txb_rd_records) {
+      aom_free(thread_data->td->mb.txfm_search_info.txb_rd_records);
+      thread_data->td->mb.txfm_search_info.txb_rd_records = NULL;
+    }
 
     // Accumulate counters.
     if (i > 0) {
@@ -825,6 +831,11 @@
         }
       }
     }
+    if (!cpi->sf.rt_sf.use_nonrd_pick_mode) {
+      thread_data->td->mb.txfm_search_info.txb_rd_records =
+          (TxbRdRecords *)aom_malloc(sizeof(TxbRdRecords));
+    }
+
     if (thread_data->td->counts != &cpi->counts) {
       memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts));
     }
@@ -868,6 +879,10 @@
     if (thread_data->td != &cpi->td) {
       thread_data->td->mb = cpi->td.mb;
     }
+    if (!cpi->sf.rt_sf.use_nonrd_pick_mode) {
+      thread_data->td->mb.txfm_search_info.txb_rd_records =
+          (TxbRdRecords *)aom_malloc(sizeof(TxbRdRecords));
+    }
   }
 }
 #endif
@@ -1148,6 +1163,15 @@
   fp_prepare_enc_workers(cpi, fp_enc_row_mt_worker_hook, num_workers);
   launch_workers(&cpi->mt_info, num_workers);
   sync_enc_workers(&cpi->mt_info, cm, num_workers);
+  // Release each worker's TXB RD records and clear the pointer so that a
+  // stale address can never be freed or dereferenced later.
+  for (int i = num_workers - 1; i >= 0; i--) {
+    EncWorkerData *const thread_data = &cpi->mt_info.tile_thr_data[i];
+    if (thread_data->td->mb.txfm_search_info.txb_rd_records) {
+      aom_free(thread_data->td->mb.txfm_search_info.txb_rd_records);
+      thread_data->td->mb.txfm_search_info.txb_rd_records = NULL;
+    }
+  }
 }
 
 void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
diff --git a/av1/encoder/rd.h b/av1/encoder/rd.h
index 3a567c1..5c7397c 100644
--- a/av1/encoder/rd.h
+++ b/av1/encoder/rd.h
@@ -320,33 +320,38 @@
 static INLINE void reset_hash_records(TxfmSearchInfo *const txfm_info,
                                       int use_inter_txb_hash) {
   int32_t record_idx;
-
+  if (!txfm_info->txb_rd_records) return;
   // Reset the state for use_inter_txb_hash
   if (use_inter_txb_hash) {
     for (record_idx = 0;
          record_idx < ((MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)); record_idx++)
-      txfm_info->txb_rd_record_8X8[record_idx].num =
-          txfm_info->txb_rd_record_8X8[record_idx].index_start = 0;
+      txfm_info->txb_rd_records->txb_rd_record_8X8[record_idx].num =
+          txfm_info->txb_rd_records->txb_rd_record_8X8[record_idx].index_start =
+              0;
     for (record_idx = 0;
          record_idx < ((MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2)); record_idx++)
-      txfm_info->txb_rd_record_16X16[record_idx].num =
-          txfm_info->txb_rd_record_16X16[record_idx].index_start = 0;
+      txfm_info->txb_rd_records->txb_rd_record_16X16[record_idx].num =
+          txfm_info->txb_rd_records->txb_rd_record_16X16[record_idx]
+              .index_start = 0;
     for (record_idx = 0;
          record_idx < ((MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3)); record_idx++)
-      txfm_info->txb_rd_record_32X32[record_idx].num =
-          txfm_info->txb_rd_record_32X32[record_idx].index_start = 0;
+      txfm_info->txb_rd_records->txb_rd_record_32X32[record_idx].num =
+          txfm_info->txb_rd_records->txb_rd_record_32X32[record_idx]
+              .index_start = 0;
     for (record_idx = 0;
          record_idx < ((MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4)); record_idx++)
-      txfm_info->txb_rd_record_64X64[record_idx].num =
-          txfm_info->txb_rd_record_64X64[record_idx].index_start = 0;
+      txfm_info->txb_rd_records->txb_rd_record_64X64[record_idx].num =
+          txfm_info->txb_rd_records->txb_rd_record_64X64[record_idx]
+              .index_start = 0;
   }
 
   // Reset the state for use_intra_txb_hash
-  txfm_info->txb_rd_record_intra.num =
-      txfm_info->txb_rd_record_intra.index_start = 0;
+  txfm_info->txb_rd_records->txb_rd_record_intra.num =
+      txfm_info->txb_rd_records->txb_rd_record_intra.index_start = 0;
 
   // Reset the state for use_mb_rd_hash
-  txfm_info->mb_rd_record.num = txfm_info->mb_rd_record.index_start = 0;
+  txfm_info->txb_rd_records->mb_rd_record.num =
+      txfm_info->txb_rd_records->mb_rd_record.index_start = 0;
 }
 
 void av1_setup_pred_block(const MACROBLOCKD *xd,
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index d26616f..e7b4d5b 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -263,10 +263,12 @@
 static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize,
                                    TXB_RD_INFO_NODE *dst_rd_info) {
   TxfmSearchInfo *txfm_info = &x->txfm_search_info;
-  TXB_RD_RECORD *rd_records_table[4] = { txfm_info->txb_rd_record_8X8,
-                                         txfm_info->txb_rd_record_16X16,
-                                         txfm_info->txb_rd_record_32X32,
-                                         txfm_info->txb_rd_record_64X64 };
+  TXB_RD_RECORD *rd_records_table[4] = {
+    txfm_info->txb_rd_records->txb_rd_record_8X8,
+    txfm_info->txb_rd_records->txb_rd_record_16X16,
+    txfm_info->txb_rd_records->txb_rd_record_32X32,
+    txfm_info->txb_rd_records->txb_rd_record_64X64
+  };
   const TX_SIZE max_square_tx_size = max_txsize_lookup[bsize];
   const int bw = block_size_wide[bsize];
   const int bh = block_size_high[bsize];
@@ -313,8 +315,8 @@
             cur_diff_row += diff_stride;
           }
           const int hash = av1_get_crc32c_value(
-              &txfm_info->mb_rd_record.crc_calculator, (uint8_t *)hash_data,
-              2 * cur_tx_bw * cur_tx_bh);
+              &txfm_info->txb_rd_records->mb_rd_record.crc_calculator,
+              (uint8_t *)hash_data, 2 * cur_tx_bw * cur_tx_bh);
           // Find corresponding RD info based on the hash value.
           const int record_idx =
               row_in_sb * (MAX_MIB_SIZE >> (tx_size_idx + 1)) + col_in_sb;
@@ -336,9 +338,9 @@
   const int rows = block_size_high[bsize];
   const int cols = block_size_wide[bsize];
   const int16_t *diff = x->plane[0].src_diff;
-  const uint32_t hash =
-      av1_get_crc32c_value(&x->txfm_search_info.mb_rd_record.crc_calculator,
-                           (uint8_t *)diff, 2 * rows * cols);
+  const uint32_t hash = av1_get_crc32c_value(
+      &x->txfm_search_info.txb_rd_records->mb_rd_record.crc_calculator,
+      (uint8_t *)diff, 2 * rows * cols);
   return (hash << 5) + bsize;
 }
 
@@ -1293,7 +1295,8 @@
     }
     hash_data = (uint8_t *)tmp_data;
   }
-  CRC32C *crc = &x->txfm_search_info.mb_rd_record.crc_calculator;
+  CRC32C *crc =
+      &x->txfm_search_info.txb_rd_records->mb_rd_record.crc_calculator;
   const uint32_t hash = av1_get_crc32c_value(crc, hash_data, 2 * txb_w * txb_h);
   return (hash << 5) + tx_size;
 }
@@ -1316,13 +1319,14 @@
          plane == 0 && tx_size_wide[tx_size] == tx_size_high[tx_size]);
   const uint32_t intra_hash =
       get_intra_txb_hash(x, plane, blk_row, blk_col, plane_bsize, tx_size);
-  const int intra_hash_idx =
-      find_tx_size_rd_info(&txfm_info->txb_rd_record_intra, intra_hash);
-  *intra_txb_rd_info =
-      &txfm_info->txb_rd_record_intra.tx_rd_info[intra_hash_idx];
+  const int intra_hash_idx = find_tx_size_rd_info(
+      &txfm_info->txb_rd_records->txb_rd_record_intra, intra_hash);
+  *intra_txb_rd_info = &txfm_info->txb_rd_records->txb_rd_record_intra
+                            .tx_rd_info[intra_hash_idx];
   *cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
   if ((*intra_txb_rd_info)->entropy_context == *cur_joint_ctx &&
-      txfm_info->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) {
+      txfm_info->txb_rd_records->txb_rd_record_intra.tx_rd_info[intra_hash_idx]
+          .valid) {
     xd->tx_type_map[tx_type_map_idx] = (*intra_txb_rd_info)->tx_type;
     const TX_TYPE ref_tx_type =
         av1_get_tx_type(xd, get_plane_type(plane), blk_row, blk_col, tx_size,
@@ -3703,7 +3707,7 @@
   const int n4 = bsize_to_num_blk(bsize);
   if (is_mb_rd_hash_enabled) {
     hash = get_block_residue_hash(x, bsize);
-    mb_rd_record = &x->txfm_search_info.mb_rd_record;
+    mb_rd_record = &x->txfm_search_info.txb_rd_records->mb_rd_record;
     const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
     if (match_index != -1) {
       MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index];
@@ -3785,7 +3789,7 @@
         (mi_col + mi_size_wide[bs] < xd->tile.mi_col_end);
     if (within_border) {
       hash = get_block_residue_hash(x, bs);
-      mb_rd_record = &x->txfm_search_info.mb_rd_record;
+      mb_rd_record = &x->txfm_search_info.txb_rd_records->mb_rd_record;
       const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
       if (match_index != -1) {
         MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index];