rtc: Pipeline lpf after encode for threads > 1
In a multi-threaded encoding scenario, when no encode
job is available for a thread, that thread starts loop
filtering a superblock row.
Change-Id: I32140a5bcfddcd2bb9a9466fcbc22a3b4a7a58d0
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 1701a91..e30010f 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -24,6 +24,7 @@
#include "av1/encoder/global_motion.h"
#include "av1/encoder/global_motion_facade.h"
#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/picklpf.h"
#include "av1/encoder/rdopt.h"
#include "aom_dsp/aom_dsp_common.h"
#include "av1/encoder/temporal_filter.h"
@@ -221,6 +222,11 @@
}
}
+// Returns the number of superblock rows in the frame, computed as
+// CEIL_POWER_OF_TWO(mi_rows, mib_size_log2). 'cm' is only read here, so it
+// is taken as a pointer to const; callers holding a non-const AV1_COMMON
+// pointer can still pass it unchanged.
+static AOM_INLINE int get_sb_rows_in_frame(const AV1_COMMON *cm) {
+  return CEIL_POWER_OF_TWO(cm->mi_params.mi_rows,
+                           cm->seq_params->mib_size_log2);
+}
+
static void row_mt_mem_alloc(AV1_COMP *cpi, int max_rows, int max_cols,
int alloc_row_ctx) {
struct AV1Common *cm = &cpi->common;
@@ -247,10 +253,16 @@
}
}
}
+ const int sb_rows = get_sb_rows_in_frame(cm);
+ CHECK_MEM_ERROR(
+ cm, enc_row_mt->num_tile_cols_done,
+ aom_malloc(sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows));
+
enc_row_mt->allocated_tile_cols = tile_cols;
enc_row_mt->allocated_tile_rows = tile_rows;
enc_row_mt->allocated_rows = max_rows;
enc_row_mt->allocated_cols = max_cols - 1;
+ enc_row_mt->allocated_sb_rows = sb_rows;
}
void av1_row_mt_mem_dealloc(AV1_COMP *cpi) {
@@ -270,10 +282,12 @@
if (cpi->oxcf.algo_cfg.cdf_update_mode) aom_free(this_tile->row_ctx);
}
}
+ aom_free(enc_row_mt->num_tile_cols_done);
enc_row_mt->allocated_rows = 0;
enc_row_mt->allocated_cols = 0;
enc_row_mt->allocated_tile_cols = 0;
enc_row_mt->allocated_tile_rows = 0;
+ enc_row_mt->allocated_sb_rows = 0;
}
static AOM_INLINE void assign_tile_to_thread(int *thread_id_to_tile_id,
@@ -433,6 +447,40 @@
}
#endif
+// Runs loop-filter jobs on the calling encode worker until get_lf_job_info()
+// has no more jobs to hand out.
+//
+// With CONFIG_MULTITHREAD, each job first blocks on enc_row_mt->cond_ until
+// num_tile_cols_done[] has reached cm->tiles.cols for both the job's
+// superblock row and the row below it (clamped to the last row of the frame).
+// Without CONFIG_MULTITHREAD, 'enc_row_mt' and the superblock-row count are
+// not needed, hence the (void) casts.
+static void launch_loop_filter_rows(AV1_COMMON *cm, EncWorkerData *thread_data,
+                                    AV1EncRowMultiThreadInfo *enc_row_mt,
+                                    int mib_size_log2) {
+  AV1LfSync *const lf_sync = (AV1LfSync *)thread_data->lf_sync;
+  LFWorkerData *const lf_data = (LFWorkerData *)thread_data->lf_data;
+  const int num_sb_rows = get_sb_rows_in_frame(cm);
+  (void)enc_row_mt;
+  (void)num_sb_rows;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *row_mt_mutex = enc_row_mt->mutex_;
+#endif
+
+  for (AV1LfMTInfo *job = get_lf_job_info(lf_sync); job != NULL;
+       job = get_lf_job_info(lf_sync)) {
+#if CONFIG_MULTITHREAD
+    const int cur_sb_row = job->mi_row >> mib_size_log2;
+    const int next_sb_row = AOMMIN(num_sb_rows - 1, cur_sb_row + 1);
+
+    // Hold off filtering until encoding of the current and the next
+    // superblock row has completed in every tile column.
+    pthread_mutex_lock(row_mt_mutex);
+    while (enc_row_mt->num_tile_cols_done[cur_sb_row] < cm->tiles.cols ||
+           enc_row_mt->num_tile_cols_done[next_sb_row] < cm->tiles.cols) {
+      pthread_cond_wait(enc_row_mt->cond_, row_mt_mutex);
+    }
+    pthread_mutex_unlock(row_mt_mutex);
+#endif
+    av1_thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm,
+                                lf_data->planes, lf_data->xd, job->mi_row,
+                                job->plane, job->dir, job->lpf_opt_level,
+                                lf_sync, lf_data->params_buf, lf_data->tx_buf,
+                                mib_size_log2);
+  }
+}
+
static int enc_row_mt_worker_hook(void *arg1, void *unused) {
EncWorkerData *const thread_data = (EncWorkerData *)arg1;
AV1_COMP *const cpi = thread_data->cpi;
@@ -440,6 +488,7 @@
int thread_id = thread_data->thread_id;
AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
#if CONFIG_MULTITHREAD
pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
#endif
@@ -486,6 +535,7 @@
const int tile_row = tile_info->tile_row;
const int tile_col = tile_info->tile_col;
ThreadData *td = thread_data->td;
+ const int sb_row = current_mi_row >> mib_size_log2;
assert(current_mi_row != -1 && current_mi_row <= tile_info->mi_row_end);
@@ -516,11 +566,21 @@
#endif
this_tile->abs_sum_level += td->abs_sum_level;
row_mt_sync->num_threads_working--;
+ enc_row_mt->num_tile_cols_done[sb_row]++;
#if CONFIG_MULTITHREAD
+ pthread_cond_broadcast(enc_row_mt->cond_);
pthread_mutex_unlock(enc_row_mt_mutex_);
#endif
}
-
+ if (cpi->mt_info.pipeline_lpf_mt_with_enc &&
+ (cm->lf.filter_level[PLANE_TYPE_Y] ||
+ cm->lf.filter_level[PLANE_TYPE_UV])) {
+ // Loop-filter a superblock row if encoding of the current and next
+ // superblock row is complete.
+ // TODO(deepa.kg @ittiam.com) Evaluate encoder speed by interleaving
+ // encoding and loop filter stage.
+ launch_loop_filter_rows(cm, thread_data, enc_row_mt, mib_size_log2);
+ }
av1_free_pc_tree_recursive(thread_data->td->rt_pc_root, av1_num_planes(cm), 0,
0);
return 1;
@@ -612,6 +672,11 @@
aom_malloc(sizeof(*(enc_row_mt->mutex_))));
if (enc_row_mt->mutex_) pthread_mutex_init(enc_row_mt->mutex_, NULL);
}
+ if (enc_row_mt->cond_ == NULL) {
+ CHECK_MEM_ERROR(cm, enc_row_mt->cond_,
+ aom_malloc(sizeof(*(enc_row_mt->cond_))));
+ if (enc_row_mt->cond_) pthread_cond_init(enc_row_mt->cond_, NULL);
+ }
}
if (!is_first_pass) {
@@ -1245,10 +1310,16 @@
int num_workers) {
MultiThreadInfo *const mt_info = &cpi->mt_info;
AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
for (int i = num_workers - 1; i >= 0; i--) {
AVxWorker *const worker = &mt_info->workers[i];
EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+ // Initialize loopfilter data
+ thread_data->lf_sync = &mt_info->lf_row_sync;
+ thread_data->lf_data = &thread_data->lf_sync->lfdata[i];
+ loop_filter_data_reset(thread_data->lf_data, &cm->cur_frame->buf, cm, xd);
+
worker->hook = hook;
worker->data1 = thread_data;
worker->data2 = NULL;
@@ -1524,12 +1595,67 @@
}
#endif
+// Decides whether loop filtering is pipelined with row-MT encoding for the
+// current frame and, when it is, picks the filter level and sets up the
+// loop-filter jobs before encoding starts.
+//
+// Pipelining needs the filter level to be derived from the quantizer and
+// frame type (LPF_PICK_FROM_Q). It is turned off for LOOPFILTER_SELECTIVELY,
+// because there the filter level is decided by stats collected during the
+// encoding stage. It is also turned off for non-reference frames, for frames
+// with the intra block copy tool enabled, and when post-processing filters
+// are to be skipped.
+static void lpf_pipeline_mt_init(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  const int use_cdef = is_cdef_used(cm);
+  const int use_restoration = is_restoration_used(cm);
+  const int skip_postproc_filtering =
+      should_skip_postproc_filtering(cpi, use_cdef, use_restoration);
+
+  // TODO(deepa.kg @ittiam.com) Enable for rt speed 5, 6.
+  const int pipelining_ruled_out =
+      (cpi->oxcf.mode != REALTIME) || (cpi->oxcf.speed < 7) ||
+      (cpi->sf.lpf_sf.lpf_pick != LPF_PICK_FROM_Q) ||
+      (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_SELECTIVELY) ||
+      cpi->ppi->rtc_ref.non_reference_frame || cm->features.allow_intrabc ||
+      skip_postproc_filtering;
+  cpi->mt_info.pipeline_lpf_mt_with_enc = !pipelining_ruled_out;
+  if (!cpi->mt_info.pipeline_lpf_mt_with_enc) return;
+
+  set_postproc_filter_default_params(cm);
+  if (!is_loopfilter_used(cm)) return;
+
+  const LPF_PICK_METHOD method = cpi->sf.lpf_sf.lpf_pick;
+  assert(method == LPF_PICK_FROM_Q);
+  assert(cpi->oxcf.algo_cfg.loopfilter_control != LOOPFILTER_SELECTIVELY);
+  av1_pick_filter_level(cpi->source, cpi, method);
+
+  struct loopfilter *lf = &cm->lf;
+  const int plane_start = 0;
+  const int plane_end = av1_num_planes(cm);
+  int planes_to_lf[MAX_MB_PLANE];
+
+  // Nothing to set up when both filter levels are zero or no plane is
+  // selected for filtering.
+  if (!lf->filter_level[PLANE_TYPE_Y] && !lf->filter_level[PLANE_TYPE_UV])
+    return;
+  if (!check_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end))
+    return;
+
+  const int lpf_opt_level = get_lpf_opt_level(&cpi->sf);
+  assert(lpf_opt_level == 2);
+
+  av1_loop_filter_frame_init(cm, plane_start, plane_end);
+
+  // The encode workers run the loop-filter jobs, so both modules are
+  // expected to be configured with the same worker count.
+  const int num_lpf_workers = cpi->mt_info.num_mod_workers[MOD_LPF];
+  assert(cpi->mt_info.num_mod_workers[MOD_ENC] == num_lpf_workers);
+
+  // Queue jobs covering every mi row of the frame.
+  loop_filter_frame_mt_init(cm, 0, cm->mi_params.mi_rows, planes_to_lf,
+                            num_lpf_workers, &cpi->mt_info.lf_row_sync,
+                            lpf_opt_level, cm->seq_params->mib_size_log2);
+}
+
void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
MultiThreadInfo *const mt_info = &cpi->mt_info;
AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
const int tile_cols = cm->tiles.cols;
const int tile_rows = cm->tiles.rows;
+ const int sb_rows = get_sb_rows_in_frame(cm);
int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id;
int max_sb_rows = 0, max_sb_cols = 0;
int num_workers = mt_info->num_mod_workers[MOD_ENC];
@@ -1541,6 +1667,8 @@
av1_alloc_tile_data(cpi);
}
+ lpf_pipeline_mt_init(cpi);
+
av1_init_tile_data(cpi);
compute_max_sb_rows_cols(cpi, &max_sb_rows, &max_sb_cols);
@@ -1548,7 +1676,8 @@
if (enc_row_mt->allocated_tile_cols != tile_cols ||
enc_row_mt->allocated_tile_rows != tile_rows ||
enc_row_mt->allocated_rows != max_sb_rows ||
- enc_row_mt->allocated_cols != (max_sb_cols - 1)) {
+ enc_row_mt->allocated_cols != (max_sb_cols - 1) ||
+ enc_row_mt->allocated_sb_rows != sb_rows) {
av1_row_mt_mem_dealloc(cpi);
row_mt_mem_alloc(cpi, max_sb_rows, max_sb_cols,
cpi->oxcf.algo_cfg.cdf_update_mode);
@@ -1556,6 +1685,8 @@
memset(thread_id_to_tile_id, -1,
sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS);
+ memset(enc_row_mt->num_tile_cols_done, 0,
+ sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows);
for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
for (int tile_col = 0; tile_col < tile_cols; tile_col++) {