Implement job interleaving in LR multi-threading
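Jobs in loop restoration (LR) multi-threading are now interleaved to
reduce the time wasted waiting on the top-right sync. Jobs for
even-numbered unit rows are queued ahead of those for odd-numbered rows:
even rows only signal their progress (sync write), while odd rows only
wait (sync read) for their top-right and bottom-right neighbours.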
Change-Id: I460eea1b140c3b6ebf1102db616eb09cede47a35
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index a097457..58a5275 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -1244,9 +1244,10 @@
void av1_foreach_rest_unit_in_row(
RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
- int unit_idx0, int hunits_per_tile, int plane, void *priv, int32_t *tmpbuf,
- RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read,
- sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync) {
+ int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
+ void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
+ struct AV1LrSyncData *const lr_sync) {
const int tile_w = tile_rect->right - tile_rect->left;
const int ext_size = unit_size * 3 / 2;
int x0 = 0, j = 0;
@@ -1260,7 +1261,15 @@
const int unit_idx = unit_idx0 + row_number * hunits_per_tile + j;
+ // No sync for even numbered rows
+ // For odd numbered rows, Loop Restoration of current block requires the LR
+ // of top-right and bottom-right blocks to be completed
+
+ // top-right sync
on_sync_read(lr_sync, row_number, j, plane);
+ if ((row_number + 1) < vunits_per_tile)
+ // bottom-right sync
+ on_sync_read(lr_sync, row_number + 2, j, plane);
on_rest_unit(limits, tile_rect, unit_idx, priv, tmpbuf, rlbs);
@@ -1287,13 +1296,11 @@
(void)plane;
}
-static void foreach_rest_unit_in_tile(const AV1PixelRect *tile_rect,
- int tile_row, int tile_col, int tile_cols,
- int hunits_per_tile, int units_per_tile,
- int unit_size, int ss_y, int plane,
- rest_unit_visitor_t on_rest_unit,
- void *priv, int32_t *tmpbuf,
- RestorationLineBuffers *rlbs) {
+static void foreach_rest_unit_in_tile(
+ const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
+ int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size,
+ int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs) {
const int tile_h = tile_rect->bottom - tile_rect->top;
const int ext_size = unit_size * 3 / 2;
@@ -1314,10 +1321,10 @@
limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset);
if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset;
- av1_foreach_rest_unit_in_row(&limits, tile_rect, on_rest_unit, i, unit_size,
- unit_idx0, hunits_per_tile, plane, priv,
- tmpbuf, rlbs, av1_lr_sync_read_dummy,
- av1_lr_sync_write_dummy, NULL);
+ av1_foreach_rest_unit_in_row(
+ &limits, tile_rect, on_rest_unit, i, unit_size, unit_idx0,
+ hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs,
+ av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL);
y0 += h;
++i;
@@ -1335,9 +1342,9 @@
const RestorationInfo *rsi = &cm->rst_info[plane];
foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS,
- rsi->horz_units_per_tile, rsi->units_per_tile,
- rsi->restoration_unit_size, ss_y, plane,
- on_rest_unit, priv, tmpbuf, rlbs);
+ rsi->horz_units_per_tile, rsi->vert_units_per_tile,
+ rsi->units_per_tile, rsi->restoration_unit_size,
+ ss_y, plane, on_rest_unit, priv, tmpbuf, rlbs);
}
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
diff --git a/av1/common/restoration.h b/av1/common/restoration.h
index b66b0b6..0c40175 100644
--- a/av1/common/restoration.h
+++ b/av1/common/restoration.h
@@ -360,9 +360,10 @@
void av1_foreach_rest_unit_in_row(
RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
- int unit_idx0, int hunits_per_tile, int plane, void *priv, int32_t *tmpbuf,
- RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read,
- sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync);
+ int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
+ void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
+ struct AV1LrSyncData *const lr_sync);
AV1PixelRect av1_whole_frame_rect(const struct AV1Common *cm, int is_uv);
int av1_lr_count_units_in_tile(int unit_size, int tile_size);
void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane);
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c
index 41f9248..3fa998a 100644
--- a/av1/common/thread_common.c
+++ b/av1/common/thread_common.c
@@ -557,11 +557,20 @@
const int num_planes = av1_num_planes(cm);
AV1LrMTInfo *lr_job_queue = lr_sync->job_queue;
+ int32_t lr_job_counter[2], num_even_lr_jobs = 0;
lr_sync->jobs_enqueued = 0;
lr_sync->jobs_dequeued = 0;
for (int plane = 0; plane < num_planes; plane++) {
if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+ num_even_lr_jobs =
+ num_even_lr_jobs + ((ctxt[plane].rsi->vert_units_per_tile + 1) >> 1);
+ }
+ lr_job_counter[0] = 0;
+ lr_job_counter[1] = num_even_lr_jobs;
+
+ for (int plane = 0; plane < num_planes; plane++) {
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
const int is_uv = plane > 0;
const int ss_y = is_uv && cm->subsampling_y;
@@ -585,11 +594,33 @@
limits.v_start = AOMMAX(tile_rect.top, limits.v_start - voffset);
if (limits.v_end < tile_rect.bottom) limits.v_end -= voffset;
- lr_job_queue->lr_unit_row = i;
- lr_job_queue->plane = plane;
- lr_job_queue->v_start = limits.v_start;
- lr_job_queue->v_end = limits.v_end;
- lr_job_queue++;
+ assert(lr_job_counter[0] <= num_even_lr_jobs);
+
+ lr_job_queue[lr_job_counter[i & 1]].lr_unit_row = i;
+ lr_job_queue[lr_job_counter[i & 1]].plane = plane;
+ lr_job_queue[lr_job_counter[i & 1]].v_start = limits.v_start;
+ lr_job_queue[lr_job_counter[i & 1]].v_end = limits.v_end;
+ lr_job_queue[lr_job_counter[i & 1]].sync_mode = i & 1;
+ if ((i & 1) == 0) {
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_start =
+ limits.v_start + RESTORATION_BORDER;
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
+ limits.v_end - RESTORATION_BORDER;
+ if (i == 0) {
+ assert(limits.v_start == tile_rect.top);
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_start = tile_rect.top;
+ }
+ if (i == (ctxt[plane].rsi->vert_units_per_tile - 1)) {
+ assert(limits.v_end == tile_rect.bottom);
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_end = tile_rect.bottom;
+ }
+ } else {
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_start =
+ AOMMAX(limits.v_start - RESTORATION_BORDER, tile_rect.top);
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
+ AOMMIN(limits.v_end + RESTORATION_BORDER, tile_rect.bottom);
+ }
+ lr_job_counter[i & 1]++;
lr_sync->jobs_enqueued++;
y0 += h;
@@ -639,26 +670,32 @@
AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync);
if (cur_job_info != NULL) {
RestorationTileLimits limits;
+ sync_read_fn_t on_sync_read;
+ sync_write_fn_t on_sync_write;
limits.v_start = cur_job_info->v_start;
limits.v_end = cur_job_info->v_end;
lr_unit_row = cur_job_info->lr_unit_row;
plane = cur_job_info->plane;
const int unit_idx0 = tile_idx * ctxt[plane].rsi->units_per_tile;
- int copy_v_start = AOMMAX(limits.v_start - RESTORATION_BORDER, 0);
- int copy_v_end = AOMMAX(limits.v_end - RESTORATION_BORDER, 0);
- if (cur_job_info->lr_unit_row ==
- (ctxt[plane].rsi->vert_units_per_tile - 1))
- copy_v_end = limits.v_end;
+
+ // sync_mode == 1 implies only sync read is required in LR Multi-threading
+ // sync_mode == 0 implies only sync write is required.
+ on_sync_read =
+ cur_job_info->sync_mode == 1 ? lr_sync_read : av1_lr_sync_read_dummy;
+ on_sync_write = cur_job_info->sync_mode == 0 ? lr_sync_write
+ : av1_lr_sync_write_dummy;
av1_foreach_rest_unit_in_row(
&limits, &(ctxt[plane].tile_rect), lr_ctxt->on_rest_unit, lr_unit_row,
ctxt[plane].rsi->restoration_unit_size, unit_idx0,
- ctxt[plane].rsi->horz_units_per_tile, plane, &ctxt[plane],
- lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, lr_sync_read,
- lr_sync_write, lr_sync);
+ ctxt[plane].rsi->horz_units_per_tile,
+ ctxt[plane].rsi->vert_units_per_tile, plane, &ctxt[plane],
+ lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, on_sync_read,
+ on_sync_write, lr_sync);
copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, ctxt[plane].tile_rect.left,
- ctxt[plane].tile_rect.right, copy_v_start, copy_v_end);
+ ctxt[plane].tile_rect.right, cur_job_info->v_copy_start,
+ cur_job_info->v_copy_end);
} else {
break;
}
diff --git a/av1/common/thread_common.h b/av1/common/thread_common.h
index cf47562..4b0d5d2 100644
--- a/av1/common/thread_common.h
+++ b/av1/common/thread_common.h
@@ -59,6 +59,9 @@
int v_end;
int lr_unit_row;
int plane;
+ int sync_mode;
+ int v_copy_start;
+ int v_copy_end;
} AV1LrMTInfo;
typedef struct LoopRestorationWorkerData {