Implement job interleaving in LR multi-threading. Job interleaving has been added to loop restoration (LR) multi-threading to reduce the time wasted waiting on top-right sync. Change-Id: I460eea1b140c3b6ebf1102db616eb09cede47a35
diff --git a/av1/common/restoration.c b/av1/common/restoration.c index a097457..58a5275 100644 --- a/av1/common/restoration.c +++ b/av1/common/restoration.c
@@ -1244,9 +1244,10 @@ void av1_foreach_rest_unit_in_row( RestorationTileLimits *limits, const AV1PixelRect *tile_rect, rest_unit_visitor_t on_rest_unit, int row_number, int unit_size, - int unit_idx0, int hunits_per_tile, int plane, void *priv, int32_t *tmpbuf, - RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read, - sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync) { + int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane, + void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs, + sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write, + struct AV1LrSyncData *const lr_sync) { const int tile_w = tile_rect->right - tile_rect->left; const int ext_size = unit_size * 3 / 2; int x0 = 0, j = 0; @@ -1260,7 +1261,15 @@ const int unit_idx = unit_idx0 + row_number * hunits_per_tile + j; + // No sync for even numbered rows + // For odd numbered rows, Loop Restoration of current block requires the LR + // of top-right and bottom-right blocks to be completed + + // top-right sync on_sync_read(lr_sync, row_number, j, plane); + if ((row_number + 1) < vunits_per_tile) + // bottom-right sync + on_sync_read(lr_sync, row_number + 2, j, plane); on_rest_unit(limits, tile_rect, unit_idx, priv, tmpbuf, rlbs); @@ -1287,13 +1296,11 @@ (void)plane; } -static void foreach_rest_unit_in_tile(const AV1PixelRect *tile_rect, - int tile_row, int tile_col, int tile_cols, - int hunits_per_tile, int units_per_tile, - int unit_size, int ss_y, int plane, - rest_unit_visitor_t on_rest_unit, - void *priv, int32_t *tmpbuf, - RestorationLineBuffers *rlbs) { +static void foreach_rest_unit_in_tile( + const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols, + int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size, + int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv, + int32_t *tmpbuf, RestorationLineBuffers *rlbs) { const int tile_h = tile_rect->bottom - tile_rect->top; const int ext_size = unit_size * 3 / 2; 
@@ -1314,10 +1321,10 @@ limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset); if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset; - av1_foreach_rest_unit_in_row(&limits, tile_rect, on_rest_unit, i, unit_size, - unit_idx0, hunits_per_tile, plane, priv, - tmpbuf, rlbs, av1_lr_sync_read_dummy, - av1_lr_sync_write_dummy, NULL); + av1_foreach_rest_unit_in_row( + &limits, tile_rect, on_rest_unit, i, unit_size, unit_idx0, + hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs, + av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL); y0 += h; ++i; @@ -1335,9 +1342,9 @@ const RestorationInfo *rsi = &cm->rst_info[plane]; foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS, - rsi->horz_units_per_tile, rsi->units_per_tile, - rsi->restoration_unit_size, ss_y, plane, - on_rest_unit, priv, tmpbuf, rlbs); + rsi->horz_units_per_tile, rsi->vert_units_per_tile, + rsi->units_per_tile, rsi->restoration_unit_size, + ss_y, plane, on_rest_unit, priv, tmpbuf, rlbs); } int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
diff --git a/av1/common/restoration.h b/av1/common/restoration.h index b66b0b6..0c40175 100644 --- a/av1/common/restoration.h +++ b/av1/common/restoration.h
@@ -360,9 +360,10 @@ void av1_foreach_rest_unit_in_row( RestorationTileLimits *limits, const AV1PixelRect *tile_rect, rest_unit_visitor_t on_rest_unit, int row_number, int unit_size, - int unit_idx0, int hunits_per_tile, int plane, void *priv, int32_t *tmpbuf, - RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read, - sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync); + int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane, + void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs, + sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write, + struct AV1LrSyncData *const lr_sync); AV1PixelRect av1_whole_frame_rect(const struct AV1Common *cm, int is_uv); int av1_lr_count_units_in_tile(int unit_size, int tile_size); void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane);
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c index 41f9248..3fa998a 100644 --- a/av1/common/thread_common.c +++ b/av1/common/thread_common.c
@@ -557,11 +557,20 @@ const int num_planes = av1_num_planes(cm); AV1LrMTInfo *lr_job_queue = lr_sync->job_queue; + int32_t lr_job_counter[2], num_even_lr_jobs = 0; lr_sync->jobs_enqueued = 0; lr_sync->jobs_dequeued = 0; for (int plane = 0; plane < num_planes; plane++) { if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + num_even_lr_jobs = + num_even_lr_jobs + ((ctxt[plane].rsi->vert_units_per_tile + 1) >> 1); + } + lr_job_counter[0] = 0; + lr_job_counter[1] = num_even_lr_jobs; + + for (int plane = 0; plane < num_planes; plane++) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; const int is_uv = plane > 0; const int ss_y = is_uv && cm->subsampling_y; @@ -585,11 +594,33 @@ limits.v_start = AOMMAX(tile_rect.top, limits.v_start - voffset); if (limits.v_end < tile_rect.bottom) limits.v_end -= voffset; - lr_job_queue->lr_unit_row = i; - lr_job_queue->plane = plane; - lr_job_queue->v_start = limits.v_start; - lr_job_queue->v_end = limits.v_end; - lr_job_queue++; + assert(lr_job_counter[0] <= num_even_lr_jobs); + + lr_job_queue[lr_job_counter[i & 1]].lr_unit_row = i; + lr_job_queue[lr_job_counter[i & 1]].plane = plane; + lr_job_queue[lr_job_counter[i & 1]].v_start = limits.v_start; + lr_job_queue[lr_job_counter[i & 1]].v_end = limits.v_end; + lr_job_queue[lr_job_counter[i & 1]].sync_mode = i & 1; + if ((i & 1) == 0) { + lr_job_queue[lr_job_counter[i & 1]].v_copy_start = + limits.v_start + RESTORATION_BORDER; + lr_job_queue[lr_job_counter[i & 1]].v_copy_end = + limits.v_end - RESTORATION_BORDER; + if (i == 0) { + assert(limits.v_start == tile_rect.top); + lr_job_queue[lr_job_counter[i & 1]].v_copy_start = tile_rect.top; + } + if (i == (ctxt[plane].rsi->vert_units_per_tile - 1)) { + assert(limits.v_end == tile_rect.bottom); + lr_job_queue[lr_job_counter[i & 1]].v_copy_end = tile_rect.bottom; + } + } else { + lr_job_queue[lr_job_counter[i & 1]].v_copy_start = + AOMMAX(limits.v_start - RESTORATION_BORDER, tile_rect.top); + 
lr_job_queue[lr_job_counter[i & 1]].v_copy_end = + AOMMIN(limits.v_end + RESTORATION_BORDER, tile_rect.bottom); + } + lr_job_counter[i & 1]++; lr_sync->jobs_enqueued++; y0 += h; @@ -639,26 +670,32 @@ AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync); if (cur_job_info != NULL) { RestorationTileLimits limits; + sync_read_fn_t on_sync_read; + sync_write_fn_t on_sync_write; limits.v_start = cur_job_info->v_start; limits.v_end = cur_job_info->v_end; lr_unit_row = cur_job_info->lr_unit_row; plane = cur_job_info->plane; const int unit_idx0 = tile_idx * ctxt[plane].rsi->units_per_tile; - int copy_v_start = AOMMAX(limits.v_start - RESTORATION_BORDER, 0); - int copy_v_end = AOMMAX(limits.v_end - RESTORATION_BORDER, 0); - if (cur_job_info->lr_unit_row == - (ctxt[plane].rsi->vert_units_per_tile - 1)) - copy_v_end = limits.v_end; + + // sync_mode == 1 implies only sync read is required in LR Multi-threading + // sync_mode == 0 implies only sync write is required. + on_sync_read = + cur_job_info->sync_mode == 1 ? lr_sync_read : av1_lr_sync_read_dummy; + on_sync_write = cur_job_info->sync_mode == 0 ? lr_sync_write + : av1_lr_sync_write_dummy; av1_foreach_rest_unit_in_row( &limits, &(ctxt[plane].tile_rect), lr_ctxt->on_rest_unit, lr_unit_row, ctxt[plane].rsi->restoration_unit_size, unit_idx0, - ctxt[plane].rsi->horz_units_per_tile, plane, &ctxt[plane], - lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, lr_sync_read, - lr_sync_write, lr_sync); + ctxt[plane].rsi->horz_units_per_tile, + ctxt[plane].rsi->vert_units_per_tile, plane, &ctxt[plane], + lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, on_sync_read, + on_sync_write, lr_sync); copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, ctxt[plane].tile_rect.left, - ctxt[plane].tile_rect.right, copy_v_start, copy_v_end); + ctxt[plane].tile_rect.right, cur_job_info->v_copy_start, + cur_job_info->v_copy_end); } else { break; }
diff --git a/av1/common/thread_common.h b/av1/common/thread_common.h index cf47562..4b0d5d2 100644 --- a/av1/common/thread_common.h +++ b/av1/common/thread_common.h
@@ -59,6 +59,9 @@ int v_end; int lr_unit_row; int plane; + int sync_mode; + int v_copy_start; + int v_copy_end; } AV1LrMTInfo; typedef struct LoopRestorationWorkerData {