AV1 RT: Make TXB RD Records dynamically allocated
The TXB RD records take about 3.2MB and are part of the MACROBLOCK
structure. For real-time multithreading this leads to a huge overhead
when the MACROBLOCK structure is copied for each worker thread.
Allocating the records dynamically reduces this overhead, giving a ~15%
speed-up for 2-thread encoding of 360p content at speed 8.
Change-Id: I1b88fbf41bec67fe2654c88f41f3ef04ca2d9153
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 6f6ffde..2aa3dd0 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -483,8 +483,10 @@
&td->mb.e_mbd);
cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params);
- av1_crc32c_calculator_init(
- &td->mb.txfm_search_info.mb_rd_record.crc_calculator);
+ if (td->mb.txfm_search_info.txb_rd_records != NULL) {
+ av1_crc32c_calculator_init(
+ &td->mb.txfm_search_info.txb_rd_records->mb_rd_record.crc_calculator);
+ }
av1_encode_sb_row(cpi, td, tile_row, tile_col, current_mi_row);
#if CONFIG_MULTITHREAD
@@ -773,6 +775,10 @@
EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
cpi->intrabc_used |= thread_data->td->intrabc_used;
cpi->deltaq_used |= thread_data->td->deltaq_used;
+ if (thread_data->td->mb.txfm_search_info.txb_rd_records) {
+ aom_free(thread_data->td->mb.txfm_search_info.txb_rd_records);
+ thread_data->td->mb.txfm_search_info.txb_rd_records = NULL;
+ }
// Accumulate counters.
if (i > 0) {
@@ -825,6 +831,11 @@
}
}
}
+ if (!cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ thread_data->td->mb.txfm_search_info.txb_rd_records =
+ (TxbRdRecords *)aom_malloc(sizeof(TxbRdRecords));
+ }
+
if (thread_data->td->counts != &cpi->counts) {
memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts));
}
@@ -868,6 +879,10 @@
if (thread_data->td != &cpi->td) {
thread_data->td->mb = cpi->td.mb;
}
+ if (!cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ thread_data->td->mb.txfm_search_info.txb_rd_records =
+ (TxbRdRecords *)aom_malloc(sizeof(TxbRdRecords));
+ }
}
}
#endif
@@ -1148,6 +1163,12 @@
fp_prepare_enc_workers(cpi, fp_enc_row_mt_worker_hook, num_workers);
launch_workers(&cpi->mt_info, num_workers);
sync_enc_workers(&cpi->mt_info, cm, num_workers);
+ for (int i = num_workers - 1; i >= 0; i--) {
+ EncWorkerData *const thread_data = &cpi->mt_info.tile_thr_data[i];
+ if (thread_data->td->mb.txfm_search_info.txb_rd_records) {
+ aom_free(thread_data->td->mb.txfm_search_info.txb_rd_records);
+ }
+ }
}
void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,