Allocate workers for each module independently
This CL computes the number of worker threads for each multi-threaded
module independently of the other modules.
Change-Id: I758a03d1d21d6b188003ce3c42467d8bfc48ba54
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index d3d7a04..02afee5 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -2277,10 +2277,15 @@
num_workers = av1_fp_compute_num_enc_workers(cpi);
#endif
} else {
- num_workers = av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+ av1_compute_num_workers_for_mt(cpi);
+ num_workers = av1_get_max_num_workers(cpi);
}
- if ((num_workers > 1) && (cpi->mt_info.num_workers == 0))
+ if ((num_workers > 1) && (cpi->mt_info.num_workers == 0)) {
av1_create_workers(cpi, num_workers);
+ if (cpi->oxcf.pass != 1) {
+ av1_create_second_pass_workers(cpi, num_workers);
+ }
+ }
// Call for LAP stage
if (cpi_lap != NULL) {
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 44cefbb..4e7de26 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -1544,7 +1544,8 @@
av1_loop_filter_dealloc(&mt_info->lf_row_sync);
av1_cdef_mt_dealloc(&mt_info->cdef_sync);
#if !CONFIG_REALTIME_ONLY
- av1_loop_restoration_dealloc(&mt_info->lr_row_sync, mt_info->num_workers);
+ av1_loop_restoration_dealloc(&mt_info->lr_row_sync,
+ mt_info->num_mod_workers[MOD_LR]);
av1_gm_dealloc(&mt_info->gm_sync);
av1_tf_mt_dealloc(&mt_info->tf_sync);
#endif
@@ -2067,7 +2068,7 @@
#endif
if (use_restoration) {
MultiThreadInfo *const mt_info = &cpi->mt_info;
- const int num_workers = mt_info->num_workers;
+ const int num_workers = mt_info->num_mod_workers[MOD_LR];
av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 1);
av1_pick_filter_restoration(cpi->source, cpi);
if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
@@ -2099,7 +2100,7 @@
*/
static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
MultiThreadInfo *const mt_info = &cpi->mt_info;
- const int num_workers = mt_info->num_workers;
+ const int num_workers = mt_info->num_mod_workers[MOD_LPF];
const int num_planes = av1_num_planes(cm);
MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 95efd65..f7d9d6b 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -157,6 +157,18 @@
COST_UPD_OFF,
} COST_UPDATE_TYPE;
+typedef enum {
+ MOD_FP, // First pass
+ MOD_TF, // Temporal filtering
+ MOD_TPL, // TPL
+ MOD_GME, // Global motion estimation
+ MOD_ENC, // Encode stage
+ MOD_LPF, // Deblocking loop filter
+ MOD_CDEF_SEARCH, // CDEF search
+ MOD_LR, // Loop restoration filtering
+ NUM_MT_MODULES
+} MULTI_THREADED_MODULES;
+
/*!\endcond */
/*!
@@ -1345,14 +1357,21 @@
int num_workers;
/*!
- * Number of workers created for tpl and tile/row multi-threading of encoder.
+ * Number of workers used for different MT modules.
*/
- int num_enc_workers;
+ int num_mod_workers[NUM_MT_MODULES];
/*!
- * Number of workers created for first-pass multi-threading.
+ * Flag set to 1 once thread specific buffers have been allocated for
+ * tile/row based multi-threading of first pass stage.
*/
- int num_fp_workers;
+ int fp_mt_buf_init_done;
+
+ /*!
+ * Flag set to 1 once thread specific buffers have been allocated for
+ * tile/row based multi-threading of encode stage.
+ */
+ int enc_mt_buf_init_done;
/*!
* Synchronization object used to launch job in the worker thread.
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 43776f9..b71a40a 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -528,7 +528,7 @@
return 1;
}
-static AOM_INLINE void create_enc_workers(AV1_COMP *cpi, int num_workers) {
+void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers) {
AV1_COMMON *const cm = &cpi->common;
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
MultiThreadInfo *const mt_info = &cpi->mt_info;
@@ -568,14 +568,37 @@
AVxWorker *const worker = &mt_info->workers[i];
EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
- ++mt_info->num_enc_workers;
-
thread_data->cpi = cpi;
thread_data->thread_id = i;
// Set the starting tile for each thread.
thread_data->start = i;
if (i > 0) {
+ // alloc_obmc_buffers(&thread_data->td->obmc_buffer, cm);
+
+ // Create threads
+ if (!winterface->reset(worker))
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Tile encoder thread creation failed");
+ } else {
+ // Main thread acts as a worker and uses the thread data in cpi.
+ thread_data->td = &cpi->td;
+ }
+ winterface->sync(worker);
+ }
+}
+
+static AOM_INLINE void create_enc_workers(AV1_COMP *cpi, int num_workers) {
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+
+ assert(mt_info->workers != NULL);
+ assert(mt_info->tile_thr_data != NULL);
+
+ for (int i = num_workers - 1; i >= 0; i--) {
+ EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+
+ if (i > 0) {
// Set up sms_tree.
av1_setup_sms_tree(cpi, thread_data->td);
@@ -622,21 +645,13 @@
cm, thread_data->td->vt64x64,
aom_malloc(sizeof(*thread_data->td->vt64x64) * num_64x64_blocks));
}
-
- // Create threads
- if (!winterface->reset(worker))
- aom_internal_error(&cm->error, AOM_CODEC_ERROR,
- "Tile encoder thread creation failed");
- } else {
- // Main thread acts as a worker and uses the thread data in cpi.
- thread_data->td = &cpi->td;
}
if (cpi->oxcf.row_mt == 1)
CHECK_MEM_ERROR(
cm, thread_data->td->tctx,
(FRAME_CONTEXT *)aom_memalign(16, sizeof(*thread_data->td->tctx)));
- winterface->sync(worker);
}
+ mt_info->enc_mt_buf_init_done = 1;
}
void av1_create_workers(AV1_COMP *cpi, int num_workers) {
@@ -675,6 +690,10 @@
AV1_COMMON *const cm = &cpi->common;
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
MultiThreadInfo *const mt_info = &cpi->mt_info;
+ // For single-pass encode, threads are already created during call to
+ // av1_create_second_pass_workers(). Create threads only in the case of
+ // pass = 1.
+ const int create_workers = (mt_info->num_mod_workers[MOD_FP] == 0) ? 1 : 0;
assert(mt_info->workers != NULL);
assert(mt_info->tile_thr_data != NULL);
@@ -692,8 +711,6 @@
AVxWorker *const worker = &mt_info->workers[i];
EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
- ++mt_info->num_fp_workers;
-
thread_data->cpi = cpi;
thread_data->thread_id = i;
// Set the starting tile for each thread.
@@ -704,16 +721,22 @@
thread_data->td->firstpass_ctx =
av1_alloc_pmc(cm, BLOCK_16X16, &thread_data->td->shared_coeff_buf);
- // Create threads
- if (!winterface->reset(worker))
- aom_internal_error(&cm->error, AOM_CODEC_ERROR,
- "Tile encoder thread creation failed");
+ if (create_workers) {
+ // Create threads
+ if (!winterface->reset(worker))
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Tile encoder thread creation failed");
+ }
} else {
// Main thread acts as a worker and uses the thread data in cpi.
thread_data->td = &cpi->td;
}
- winterface->sync(worker);
+ if (create_workers) {
+ winterface->sync(worker);
+ ++mt_info->num_mod_workers[MOD_FP];
+ }
}
+ mt_info->fp_mt_buf_init_done = 1;
}
#endif
@@ -878,8 +901,17 @@
return AOMMIN(max_threads, tile_cols * tile_rows);
}
+// Returns the largest per-module worker count, capped at oxcf.max_threads.
+int av1_get_max_num_workers(AV1_COMP *cpi) {
+ int max_num_workers = 0;
+ for (int i = MOD_FP; i < NUM_MT_MODULES; i++)
+ max_num_workers = AOMMAX(cpi->mt_info.num_mod_workers[i], max_num_workers);
+ assert(max_num_workers >= 1);
+ return AOMMIN(max_num_workers, cpi->oxcf.max_threads);
+}
+
// Computes the number of workers for encoding stage (row/tile multi-threading)
-int av1_compute_num_enc_workers(AV1_COMP *cpi, int max_workers) {
+static AOM_INLINE int compute_num_enc_workers(AV1_COMP *cpi, int max_workers) {
if (max_workers <= 1) return 1;
if (cpi->oxcf.row_mt)
return compute_num_enc_row_mt_workers(&cpi->common, max_workers);
@@ -892,7 +924,7 @@
MultiThreadInfo *const mt_info = &cpi->mt_info;
const int tile_cols = cm->tiles.cols;
const int tile_rows = cm->tiles.rows;
- int num_workers = av1_compute_num_enc_workers(cpi, mt_info->num_workers);
+ int num_workers = mt_info->num_mod_workers[MOD_ENC];
assert(IMPLIES(cpi->tile_data == NULL,
cpi->allocated_tiles < tile_cols * tile_rows));
@@ -900,10 +932,10 @@
av1_init_tile_data(cpi);
// Only run once to create threads and allocate thread data.
- if (mt_info->num_enc_workers == 0) {
+ if (mt_info->enc_mt_buf_init_done == 0) {
create_enc_workers(cpi, num_workers);
} else {
- num_workers = AOMMIN(num_workers, mt_info->num_enc_workers);
+ num_workers = AOMMIN(num_workers, mt_info->num_workers);
}
prepare_enc_workers(cpi, enc_worker_hook, num_workers);
launch_workers(&cpi->mt_info, num_workers);
@@ -996,13 +1028,7 @@
const int tile_rows = cm->tiles.rows;
int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id;
int max_sb_rows = 0, max_sb_cols = 0;
-
- // TODO(ravi.chaudhary@ittiam.com): Currently the percentage of
- // post-processing stages in encoder is quiet low, so limiting the number of
- // threads to the theoretical limit in row-mt does not have much impact on
- // post-processing multi-threading stage. Need to revisit this when
- // post-processing time starts shooting up.
- int num_workers = av1_compute_num_enc_workers(cpi, mt_info->num_workers);
+ int num_workers = mt_info->num_mod_workers[MOD_ENC];
assert(IMPLIES(cpi->tile_data == NULL,
cpi->allocated_tiles < tile_cols * tile_rows));
@@ -1047,10 +1073,10 @@
}
// Only run once to create threads and allocate thread data.
- if (mt_info->num_enc_workers == 0) {
+ if (mt_info->enc_mt_buf_init_done == 0) {
create_enc_workers(cpi, num_workers);
} else {
- num_workers = AOMMIN(num_workers, mt_info->num_enc_workers);
+ num_workers = AOMMIN(num_workers, mt_info->num_workers);
}
assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows,
num_workers);
@@ -1084,12 +1110,12 @@
const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
max_mb_rows = fp_compute_max_mb_rows(cm, cpi->tile_data, fp_block_size);
- // TODO(ravi.chaudhary@ittiam.com): Currently the percentage of
- // post-processing stages in encoder is quiet low, so limiting the number of
- // threads to the theoretical limit in row-mt does not have much impact on
- // post-processing multi-threading stage. Need to revisit this when
- // post-processing time starts shooting up.
- num_workers = av1_fp_compute_num_enc_workers(cpi);
+ // For pass = 1, compute the number of workers needed. For single-pass
+ // encode (pass = 0), the number of workers has already been computed.
+ if (mt_info->num_mod_workers[MOD_FP] == 0)
+ num_workers = av1_fp_compute_num_enc_workers(cpi);
+ else
+ num_workers = mt_info->num_mod_workers[MOD_FP];
if (enc_row_mt->allocated_tile_cols != tile_cols ||
enc_row_mt->allocated_tile_rows != tile_rows ||
@@ -1117,7 +1143,8 @@
num_workers = AOMMIN(num_workers, mt_info->num_workers);
// Only run once to create threads and allocate thread data.
- if (mt_info->num_fp_workers == 0) fp_create_enc_workers(cpi, num_workers);
+ if (mt_info->fp_mt_buf_init_done == 0)
+ fp_create_enc_workers(cpi, num_workers);
assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows,
num_workers);
fp_prepare_enc_workers(cpi, fp_enc_row_mt_worker_hook, num_workers);
@@ -1291,16 +1318,13 @@
// Before encoding a frame, copy the thread data from cpi.
if (thread_data->td != &cpi->td) {
thread_data->td->mb = cpi->td.mb;
- thread_data->td->mb.obmc_buffer = thread_data->td->obmc_buffer;
+ // OBMC buffers are used only to init MS params and remain unused when
+ // called from tpl, hence set the buffers to defaults.
+ av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer);
}
}
}
-// Computes num_workers for tpl multi-threading.
-static AOM_INLINE int compute_num_tpl_workers(AV1_COMP *cpi) {
- return av1_compute_num_enc_workers(cpi, cpi->mt_info.num_workers);
-}
-
// Implements multi-threading for tpl.
void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) {
AV1_COMMON *cm = &cpi->common;
@@ -1309,12 +1333,8 @@
TplParams *tpl_data = &cpi->tpl_data;
AV1TplRowMultiThreadSync *tpl_sync = &tpl_data->tpl_mt_sync;
int mb_rows = mi_params->mb_rows;
- int num_workers = compute_num_tpl_workers(cpi);
-
- if (mt_info->num_enc_workers == 0)
- create_enc_workers(cpi, num_workers);
- else
- num_workers = AOMMIN(num_workers, mt_info->num_enc_workers);
+ int num_workers =
+ AOMMIN(mt_info->num_mod_workers[MOD_TPL], mt_info->num_workers);
if (mb_rows != tpl_sync->rows) {
av1_tpl_dealloc(tpl_sync);
@@ -1412,7 +1432,9 @@
// Before encoding a frame, copy the thread data from cpi.
if (thread_data->td != &cpi->td) {
thread_data->td->mb = cpi->td.mb;
- thread_data->td->mb.obmc_buffer = thread_data->td->obmc_buffer;
+ // OBMC buffers are used only to init MS params and remain unused when
+ // called from tf, hence set the buffers to defaults.
+ av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer);
tf_alloc_and_reset_data(&thread_data->td->tf_data, cpi->tf_ctx.num_pels,
is_highbitdepth);
}
@@ -1451,11 +1473,8 @@
MultiThreadInfo *mt_info = &cpi->mt_info;
const int is_highbitdepth = cpi->tf_ctx.is_highbitdepth;
- int num_workers = mt_info->num_workers;
- if (mt_info->num_enc_workers == 0)
- create_enc_workers(cpi, num_workers);
- else
- num_workers = AOMMIN(num_workers, mt_info->num_enc_workers);
+ int num_workers =
+ AOMMIN(mt_info->num_mod_workers[MOD_TF], mt_info->num_workers);
prepare_tf_workers(cpi, tf_worker_hook, num_workers, is_highbitdepth);
launch_workers(mt_info, num_workers);
@@ -1610,12 +1629,12 @@
static AOM_INLINE int compute_gm_workers(const AV1_COMP *cpi) {
int total_refs =
cpi->gm_info.num_ref_frames[0] + cpi->gm_info.num_ref_frames[1];
- int max_num_workers = cpi->mt_info.num_workers;
- int max_allowed_workers = cpi->sf.gm_sf.prune_ref_frame_for_gm_search
- ? AOMMIN(MAX_DIRECTIONS, max_num_workers)
- : max_num_workers;
-
- return (AOMMIN(total_refs, max_allowed_workers));
+ int num_gm_workers = cpi->sf.gm_sf.prune_ref_frame_for_gm_search
+ ? AOMMIN(MAX_DIRECTIONS, total_refs)
+ : total_refs;
+ num_gm_workers = AOMMIN(num_gm_workers, cpi->oxcf.max_threads);
+ assert(num_gm_workers <= cpi->mt_info.num_workers);
+ return (num_gm_workers);
}
// Frees the memory allocated for each worker in global motion multi-threading.
@@ -1783,7 +1802,7 @@
void av1_cdef_mse_calc_frame_mt(AV1_COMMON *cm, MultiThreadInfo *mt_info,
CdefSearchCtx *cdef_search_ctx) {
AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
- const int num_workers = mt_info->num_workers;
+ const int num_workers = mt_info->num_mod_workers[MOD_CDEF_SEARCH];
cdef_reset_job_info(cdef_sync);
prepare_cdef_workers(mt_info, cdef_search_ctx, cdef_filter_block_worker_hook,
@@ -1791,3 +1810,59 @@
launch_workers(mt_info, num_workers);
sync_enc_workers(mt_info, cm, num_workers);
}
+
+// Computes num_workers for temporal filter multi-threading.
+static AOM_INLINE int compute_num_tf_workers(AV1_COMP *cpi) {
+ return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for tpl multi-threading.
+static AOM_INLINE int compute_num_tpl_workers(AV1_COMP *cpi) {
+ return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for loop filter multi-threading.
+static AOM_INLINE int compute_num_lf_workers(AV1_COMP *cpi) {
+ return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for cdef multi-threading.
+static AOM_INLINE int compute_num_cdef_workers(AV1_COMP *cpi) {
+ return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for loop-restoration multi-threading.
+static AOM_INLINE int compute_num_lr_workers(AV1_COMP *cpi) {
+ return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+int compute_num_mod_workers(AV1_COMP *cpi, MULTI_THREADED_MODULES mod_name) {
+ int num_mod_workers = 0;
+ switch (mod_name) {
+ case MOD_FP:
+ if (cpi->oxcf.pass == 2)
+ num_mod_workers = 0;
+ else
+ num_mod_workers = compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+ break;
+ case MOD_TF: num_mod_workers = compute_num_tf_workers(cpi); break;
+ case MOD_TPL: num_mod_workers = compute_num_tpl_workers(cpi); break;
+ case MOD_GME: num_mod_workers = 1; break;
+ case MOD_ENC:
+ num_mod_workers = compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+ break;
+ case MOD_LPF: num_mod_workers = compute_num_lf_workers(cpi); break;
+ case MOD_CDEF_SEARCH:
+ num_mod_workers = compute_num_cdef_workers(cpi);
+ break;
+ case MOD_LR: num_mod_workers = compute_num_lr_workers(cpi); break;
+ default: assert(0); break;
+ }
+ return (num_mod_workers);
+}
+// Computes the number of workers for each MT module in the encoder.
+void av1_compute_num_workers_for_mt(AV1_COMP *cpi) {
+ for (int i = MOD_FP; i < NUM_MT_MODULES; i++)
+ cpi->mt_info.num_mod_workers[i] =
+ compute_num_mod_workers(cpi, (MULTI_THREADED_MODULES)i);
+}
diff --git a/av1/encoder/ethread.h b/av1/encoder/ethread.h
index e8b8ad3..55e7f7b 100644
--- a/av1/encoder/ethread.h
+++ b/av1/encoder/ethread.h
@@ -74,10 +74,14 @@
void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync);
-int av1_compute_num_enc_workers(AV1_COMP *cpi, int max_workers);
+void av1_compute_num_workers_for_mt(AV1_COMP *cpi);
+
+int av1_get_max_num_workers(AV1_COMP *cpi);
void av1_create_workers(AV1_COMP *cpi, int num_workers);
+void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers);
+
void av1_cdef_mse_calc_frame_mt(AV1_COMMON *cm, MultiThreadInfo *mt_info,
CdefSearchCtx *cdef_search_ctx);
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 745f75b..225263c 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -78,6 +78,13 @@
}
}
+void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer) {
+ obmc_buffer->wsrc = NULL;
+ obmc_buffer->mask = NULL;
+ obmc_buffer->above_pred = NULL;
+ obmc_buffer->left_pred = NULL;
+}
+
void av1_make_default_fullpel_ms_params(
FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
const MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv,
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index 8f6085f..94baed6 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -203,6 +203,8 @@
aom_sad_multi_d_fn_t sdx4df;
} FULLPEL_MOTION_SEARCH_PARAMS;
+void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer);
+
void av1_make_default_fullpel_ms_params(
FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
const MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv,
diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c
index e6446c7..9b3924f 100644
--- a/av1/encoder/picklpf.c
+++ b/av1/encoder/picklpf.c
@@ -50,7 +50,7 @@
AV1_COMP *const cpi, int filt_level,
int partial_frame, int plane, int dir) {
MultiThreadInfo *const mt_info = &cpi->mt_info;
- int num_workers = mt_info->num_workers;
+ int num_workers = mt_info->num_mod_workers[MOD_LPF];
AV1_COMMON *const cm = &cpi->common;
int64_t filt_err;