Implement jobs interleaving in LR multi-threading

Jobs in loop restoration (LR) multi-threading are now interleaved
to reduce the wastage caused by top-right sync.

Change-Id: I460eea1b140c3b6ebf1102db616eb09cede47a35
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index a097457..58a5275 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -1244,9 +1244,10 @@
 void av1_foreach_rest_unit_in_row(
     RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
     rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
-    int unit_idx0, int hunits_per_tile, int plane, void *priv, int32_t *tmpbuf,
-    RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read,
-    sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync) {
+    int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
+    void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+    sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
+    struct AV1LrSyncData *const lr_sync) {
   const int tile_w = tile_rect->right - tile_rect->left;
   const int ext_size = unit_size * 3 / 2;
   int x0 = 0, j = 0;
@@ -1260,7 +1261,15 @@
 
     const int unit_idx = unit_idx0 + row_number * hunits_per_tile + j;
 
+    // Even-numbered rows do not wait on any other row (no read sync).
+    // Odd-numbered rows must wait until loop restoration of the top-right and
+    // bottom-right units has completed.
+
+    // top-right sync
     on_sync_read(lr_sync, row_number, j, plane);
+    if ((row_number + 1) < vunits_per_tile)
+      // bottom-right sync
+      on_sync_read(lr_sync, row_number + 2, j, plane);
 
     on_rest_unit(limits, tile_rect, unit_idx, priv, tmpbuf, rlbs);
 
@@ -1287,13 +1296,11 @@
   (void)plane;
 }
 
-static void foreach_rest_unit_in_tile(const AV1PixelRect *tile_rect,
-                                      int tile_row, int tile_col, int tile_cols,
-                                      int hunits_per_tile, int units_per_tile,
-                                      int unit_size, int ss_y, int plane,
-                                      rest_unit_visitor_t on_rest_unit,
-                                      void *priv, int32_t *tmpbuf,
-                                      RestorationLineBuffers *rlbs) {
+static void foreach_rest_unit_in_tile(
+    const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
+    int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size,
+    int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv,
+    int32_t *tmpbuf, RestorationLineBuffers *rlbs) {
   const int tile_h = tile_rect->bottom - tile_rect->top;
   const int ext_size = unit_size * 3 / 2;
 
@@ -1314,10 +1321,10 @@
     limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset);
     if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset;
 
-    av1_foreach_rest_unit_in_row(&limits, tile_rect, on_rest_unit, i, unit_size,
-                                 unit_idx0, hunits_per_tile, plane, priv,
-                                 tmpbuf, rlbs, av1_lr_sync_read_dummy,
-                                 av1_lr_sync_write_dummy, NULL);
+    av1_foreach_rest_unit_in_row(
+        &limits, tile_rect, on_rest_unit, i, unit_size, unit_idx0,
+        hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs,
+        av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL);
 
     y0 += h;
     ++i;
@@ -1335,9 +1342,9 @@
   const RestorationInfo *rsi = &cm->rst_info[plane];
 
   foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS,
-                            rsi->horz_units_per_tile, rsi->units_per_tile,
-                            rsi->restoration_unit_size, ss_y, plane,
-                            on_rest_unit, priv, tmpbuf, rlbs);
+                            rsi->horz_units_per_tile, rsi->vert_units_per_tile,
+                            rsi->units_per_tile, rsi->restoration_unit_size,
+                            ss_y, plane, on_rest_unit, priv, tmpbuf, rlbs);
 }
 
 int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
diff --git a/av1/common/restoration.h b/av1/common/restoration.h
index b66b0b6..0c40175 100644
--- a/av1/common/restoration.h
+++ b/av1/common/restoration.h
@@ -360,9 +360,10 @@
 void av1_foreach_rest_unit_in_row(
     RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
     rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
-    int unit_idx0, int hunits_per_tile, int plane, void *priv, int32_t *tmpbuf,
-    RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read,
-    sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync);
+    int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
+    void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+    sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
+    struct AV1LrSyncData *const lr_sync);
 AV1PixelRect av1_whole_frame_rect(const struct AV1Common *cm, int is_uv);
 int av1_lr_count_units_in_tile(int unit_size, int tile_size);
 void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane);
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c
index 41f9248..3fa998a 100644
--- a/av1/common/thread_common.c
+++ b/av1/common/thread_common.c
@@ -557,11 +557,20 @@
 
   const int num_planes = av1_num_planes(cm);
   AV1LrMTInfo *lr_job_queue = lr_sync->job_queue;
+  int32_t lr_job_counter[2], num_even_lr_jobs = 0;
   lr_sync->jobs_enqueued = 0;
   lr_sync->jobs_dequeued = 0;
 
   for (int plane = 0; plane < num_planes; plane++) {
     if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+    num_even_lr_jobs =
+        num_even_lr_jobs + ((ctxt[plane].rsi->vert_units_per_tile + 1) >> 1);
+  }
+  lr_job_counter[0] = 0;
+  lr_job_counter[1] = num_even_lr_jobs;
+
+  for (int plane = 0; plane < num_planes; plane++) {
+    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
     const int is_uv = plane > 0;
     const int ss_y = is_uv && cm->subsampling_y;
 
@@ -585,11 +594,33 @@
       limits.v_start = AOMMAX(tile_rect.top, limits.v_start - voffset);
       if (limits.v_end < tile_rect.bottom) limits.v_end -= voffset;
 
-      lr_job_queue->lr_unit_row = i;
-      lr_job_queue->plane = plane;
-      lr_job_queue->v_start = limits.v_start;
-      lr_job_queue->v_end = limits.v_end;
-      lr_job_queue++;
+      assert(lr_job_counter[0] <= num_even_lr_jobs);
+
+      lr_job_queue[lr_job_counter[i & 1]].lr_unit_row = i;
+      lr_job_queue[lr_job_counter[i & 1]].plane = plane;
+      lr_job_queue[lr_job_counter[i & 1]].v_start = limits.v_start;
+      lr_job_queue[lr_job_counter[i & 1]].v_end = limits.v_end;
+      lr_job_queue[lr_job_counter[i & 1]].sync_mode = i & 1;
+      if ((i & 1) == 0) {
+        lr_job_queue[lr_job_counter[i & 1]].v_copy_start =
+            limits.v_start + RESTORATION_BORDER;
+        lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
+            limits.v_end - RESTORATION_BORDER;
+        if (i == 0) {
+          assert(limits.v_start == tile_rect.top);
+          lr_job_queue[lr_job_counter[i & 1]].v_copy_start = tile_rect.top;
+        }
+        if (i == (ctxt[plane].rsi->vert_units_per_tile - 1)) {
+          assert(limits.v_end == tile_rect.bottom);
+          lr_job_queue[lr_job_counter[i & 1]].v_copy_end = tile_rect.bottom;
+        }
+      } else {
+        lr_job_queue[lr_job_counter[i & 1]].v_copy_start =
+            AOMMAX(limits.v_start - RESTORATION_BORDER, tile_rect.top);
+        lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
+            AOMMIN(limits.v_end + RESTORATION_BORDER, tile_rect.bottom);
+      }
+      lr_job_counter[i & 1]++;
       lr_sync->jobs_enqueued++;
 
       y0 += h;
@@ -639,26 +670,32 @@
     AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync);
     if (cur_job_info != NULL) {
       RestorationTileLimits limits;
+      sync_read_fn_t on_sync_read;
+      sync_write_fn_t on_sync_write;
       limits.v_start = cur_job_info->v_start;
       limits.v_end = cur_job_info->v_end;
       lr_unit_row = cur_job_info->lr_unit_row;
       plane = cur_job_info->plane;
       const int unit_idx0 = tile_idx * ctxt[plane].rsi->units_per_tile;
-      int copy_v_start = AOMMAX(limits.v_start - RESTORATION_BORDER, 0);
-      int copy_v_end = AOMMAX(limits.v_end - RESTORATION_BORDER, 0);
-      if (cur_job_info->lr_unit_row ==
-          (ctxt[plane].rsi->vert_units_per_tile - 1))
-        copy_v_end = limits.v_end;
+
+      // sync_mode == 1 implies only sync read is required in LR Multi-threading
+      // sync_mode == 0 implies only sync write is required.
+      on_sync_read =
+          cur_job_info->sync_mode == 1 ? lr_sync_read : av1_lr_sync_read_dummy;
+      on_sync_write = cur_job_info->sync_mode == 0 ? lr_sync_write
+                                                   : av1_lr_sync_write_dummy;
 
       av1_foreach_rest_unit_in_row(
           &limits, &(ctxt[plane].tile_rect), lr_ctxt->on_rest_unit, lr_unit_row,
           ctxt[plane].rsi->restoration_unit_size, unit_idx0,
-          ctxt[plane].rsi->horz_units_per_tile, plane, &ctxt[plane],
-          lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, lr_sync_read,
-          lr_sync_write, lr_sync);
+          ctxt[plane].rsi->horz_units_per_tile,
+          ctxt[plane].rsi->vert_units_per_tile, plane, &ctxt[plane],
+          lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, on_sync_read,
+          on_sync_write, lr_sync);
 
       copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, ctxt[plane].tile_rect.left,
-                       ctxt[plane].tile_rect.right, copy_v_start, copy_v_end);
+                       ctxt[plane].tile_rect.right, cur_job_info->v_copy_start,
+                       cur_job_info->v_copy_end);
     } else {
       break;
     }
diff --git a/av1/common/thread_common.h b/av1/common/thread_common.h
index cf47562..4b0d5d2 100644
--- a/av1/common/thread_common.h
+++ b/av1/common/thread_common.h
@@ -59,6 +59,9 @@
   int v_end;
   int lr_unit_row;
   int plane;
+  int sync_mode;
+  int v_copy_start;
+  int v_copy_end;
 } AV1LrMTInfo;
 
 typedef struct LoopRestorationWorkerData {