Multi-thread support for bitmask. Enable multi-threading when the loop filter bitmask is used. Similar to the existing multi-threading without bitmask, each row uses a thread, but a row is now a row of 64x64 blocks instead of the previous 128x128 blocks. Building the bitmask for the whole frame can only use one thread right now. Change-Id: Ibd753b677ecf2d04ac0dcccf6a06b5fcd97a7c23
diff --git a/av1/common/av1_loopfilter.h b/av1/common/av1_loopfilter.h index 4ea4e16..3129ce5 100644 --- a/av1/common/av1_loopfilter.h +++ b/av1/common/av1_loopfilter.h
@@ -175,6 +175,22 @@ int mi_row, int mi_col); int get_index_shift(int mi_col, int mi_row, int *index); +void av1_build_bitmask_vert_info( + struct AV1Common *const cm, const struct macroblockd_plane *const plane_ptr, + int plane); + +void av1_build_bitmask_horz_info( + struct AV1Common *const cm, const struct macroblockd_plane *const plane_ptr, + int plane); + +void av1_filter_block_plane_bitmask_vert( + struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col); + +void av1_filter_block_plane_bitmask_horz( + struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col); + extern const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL]; extern const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL];
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c index 8df4c9a..064cd67 100644 --- a/av1/common/thread_common.c +++ b/av1/common/thread_common.c
@@ -205,7 +205,11 @@ } static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start, - int stop, int plane_start, int plane_end) { + int stop, +#if LOOP_FILTER_BITMASK + int is_decoding, +#endif + int plane_start, int plane_end) { int mi_row, plane, dir; AV1LfMTInfo *lf_job_queue = lf_sync->job_queue; lf_sync->jobs_enqueued = 0; @@ -219,7 +223,16 @@ continue; else if (plane == 2 && !(cm->lf.filter_level_v)) continue; - for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { +#if LOOP_FILTER_BITMASK + int step = MAX_MIB_SIZE; + if (is_decoding) { + step = MI_SIZE_64X64; + } + for (mi_row = start; mi_row < stop; mi_row += step) +#else + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) +#endif + { lf_job_queue->mi_row = mi_row; lf_job_queue->plane = plane; lf_job_queue->dir = dir; @@ -312,15 +325,94 @@ return 1; } +#if LOOP_FILTER_BITMASK +static INLINE void thread_loop_filter_bitmask_rows( + const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm, + struct macroblockd_plane *planes, MACROBLOCKD *xd, + AV1LfSync *const lf_sync) { + const int sb_cols = + ALIGN_POWER_OF_TWO(cm->mi_cols, MIN_MIB_SIZE_LOG2) >> MIN_MIB_SIZE_LOG2; + int mi_row, mi_col, plane, dir; + int r, c; + (void)xd; + + while (1) { + AV1LfMTInfo *cur_job_info = get_lf_job_info(lf_sync); + + if (cur_job_info != NULL) { + mi_row = cur_job_info->mi_row; + plane = cur_job_info->plane; + dir = cur_job_info->dir; + r = mi_row >> MIN_MIB_SIZE_LOG2; + + if (dir == 0) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_SIZE_64X64) { + c = mi_col >> MIN_MIB_SIZE_LOG2; + + av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row, + mi_col, plane, plane + 1); + + av1_filter_block_plane_bitmask_vert(cm, &planes[plane], plane, mi_row, + mi_col); + sync_write(lf_sync, r, c, sb_cols, plane); + } + } else if (dir == 1) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_SIZE_64X64) { + c = mi_col >> MIN_MIB_SIZE_LOG2; + + // Wait for vertical edge filtering of the 
top-right block to be + // completed + sync_read(lf_sync, r, c, plane); + + // Wait for vertical edge filtering of the right block to be + // completed + sync_read(lf_sync, r + 1, c, plane); + + av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row, + mi_col, plane, plane + 1); + av1_filter_block_plane_bitmask_horz(cm, &planes[plane], plane, mi_row, + mi_col); + } + } + } else { + break; + } + } +} + +// Row-based multi-threaded loopfilter hook +static int loop_filter_bitmask_row_worker(void *arg1, void *arg2) { + AV1LfSync *const lf_sync = (AV1LfSync *)arg1; + LFWorkerData *const lf_data = (LFWorkerData *)arg2; + thread_loop_filter_bitmask_rows(lf_data->frame_buffer, lf_data->cm, + lf_data->planes, lf_data->xd, lf_sync); + return 1; +} +#endif // LOOP_FILTER_BITMASK + static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd, int start, int stop, int plane_start, int plane_end, +#if LOOP_FILTER_BITMASK + int is_decoding, +#endif AVxWorker *workers, int nworkers, AV1LfSync *lf_sync) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); +#if LOOP_FILTER_BITMASK + int sb_rows; + if (is_decoding) { + sb_rows = + ALIGN_POWER_OF_TWO(cm->mi_rows, MIN_MIB_SIZE_LOG2) >> MIN_MIB_SIZE_LOG2; + } else { + sb_rows = + ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2; + } +#else // Number of superblock rows and cols const int sb_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2; +#endif const int num_workers = nworkers; int i; @@ -336,14 +428,26 @@ sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows); } - enqueue_lf_jobs(lf_sync, cm, start, stop, plane_start, plane_end); + enqueue_lf_jobs(lf_sync, cm, start, stop, +#if LOOP_FILTER_BITMASK + is_decoding, +#endif + plane_start, plane_end); // Set up loopfilter thread data. 
for (i = 0; i < num_workers; ++i) { AVxWorker *const worker = &workers[i]; LFWorkerData *const lf_data = &lf_sync->lfdata[i]; +#if LOOP_FILTER_BITMASK + if (is_decoding) { + worker->hook = loop_filter_bitmask_row_worker; + } else { + worker->hook = loop_filter_row_worker; + } +#else worker->hook = loop_filter_row_worker; +#endif worker->data1 = lf_sync; worker->data2 = lf_data; @@ -366,8 +470,12 @@ void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd, int plane_start, int plane_end, - int partial_frame, AVxWorker *workers, - int num_workers, AV1LfSync *lf_sync) { + int partial_frame, +#if LOOP_FILTER_BITMASK + int is_decoding, +#endif + AVxWorker *workers, int num_workers, + AV1LfSync *lf_sync) { int start_mi_row, end_mi_row, mi_rows_to_filter; start_mi_row = 0; @@ -380,8 +488,37 @@ end_mi_row = start_mi_row + mi_rows_to_filter; av1_loop_filter_frame_init(cm, plane_start, plane_end); +#if LOOP_FILTER_BITMASK + if (is_decoding) { + cm->is_decoding = is_decoding; + // TODO(chengchen): currently use one thread to build bitmasks for the + // frame. Make it support multi-thread later. + for (int plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) + break; + else if (plane == 1 && !(cm->lf.filter_level_u)) + continue; + else if (plane == 2 && !(cm->lf.filter_level_v)) + continue; + + // TODO(chengchen): can we remove this? 
+ struct macroblockd_plane *pd = xd->plane; + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame, 0, 0, plane, + plane + 1); + + av1_build_bitmask_vert_info(cm, &pd[plane], plane); + av1_build_bitmask_horz_info(cm, &pd[plane], plane); + } + loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start, + plane_end, 1, workers, num_workers, lf_sync); + } else { + loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start, + plane_end, 0, workers, num_workers, lf_sync); + } +#else loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start, plane_end, workers, num_workers, lf_sync); +#endif } static INLINE void lr_sync_read(void *const lr_sync, int r, int c, int plane) {
diff --git a/av1/common/thread_common.h b/av1/common/thread_common.h index 23d61d7..e7dbb8b 100644 --- a/av1/common/thread_common.h +++ b/av1/common/thread_common.h
@@ -103,6 +103,9 @@ void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, struct macroblockd *mbd, int plane_start, int plane_end, int partial_frame, +#if LOOP_FILTER_BITMASK + int is_decoding, +#endif AVxWorker *workers, int num_workers, AV1LfSync *lf_sync); void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c index cdda29c..1950d36 100644 --- a/av1/decoder/decodeframe.c +++ b/av1/decoder/decodeframe.c
@@ -5549,9 +5549,12 @@ if (!cm->allow_intrabc && !cm->single_tile_decoding) { if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) { if (pbi->num_workers > 1) { - av1_loop_filter_frame_mt(get_frame_new_buffer(cm), cm, &pbi->mb, 0, - num_planes, 0, pbi->tile_workers, - pbi->num_workers, &pbi->lf_row_sync); + av1_loop_filter_frame_mt( + get_frame_new_buffer(cm), cm, &pbi->mb, 0, num_planes, 0, +#if LOOP_FILTER_BITMASK + 1, +#endif + pbi->tile_workers, pbi->num_workers, &pbi->lf_row_sync); } else { av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, #if LOOP_FILTER_BITMASK
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c index 487efd2..ab9d0bc 100644 --- a/av1/encoder/encoder.c +++ b/av1/encoder/encoder.c
@@ -4229,6 +4229,9 @@ if (lf->filter_level[0] || lf->filter_level[1]) { if (cpi->num_workers > 1) av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd, 0, num_planes, 0, +#if LOOP_FILTER_BITMASK + 0, +#endif cpi->workers, cpi->num_workers, &cpi->lf_row_sync); else
diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c index c99d0a2..5bd0e60 100644 --- a/av1/encoder/picklpf.c +++ b/av1/encoder/picklpf.c
@@ -71,8 +71,11 @@ // filter mask is compatible with multi-thread. if (cpi->num_workers > 1) av1_loop_filter_frame_mt(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane, - plane + 1, partial_frame, cpi->workers, - cpi->num_workers, &cpi->lf_row_sync); + plane + 1, partial_frame, +#if LOOP_FILTER_BITMASK + 0, +#endif + cpi->workers, cpi->num_workers, &cpi->lf_row_sync); else av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, #if LOOP_FILTER_BITMASK