Implement job interleaving in LR multi-threading

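Interleave the jobs in loop-restoration (LR) multi-threading to reduce
the time wasted on top-right sync. Jobs for even-indexed restoration
unit rows are placed at the front of the job queue and jobs for
odd-indexed rows after them. Even-row jobs only perform the sync write
and odd-row jobs only perform the sync read.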

Change-Id: I460eea1b140c3b6ebf1102db616eb09cede47a35
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c
index 41f9248..3fa998a 100644
--- a/av1/common/thread_common.c
+++ b/av1/common/thread_common.c
@@ -557,11 +557,20 @@
 
   const int num_planes = av1_num_planes(cm);
   AV1LrMTInfo *lr_job_queue = lr_sync->job_queue;
+  int32_t lr_job_counter[2], num_even_lr_jobs = 0;
   lr_sync->jobs_enqueued = 0;
   lr_sync->jobs_dequeued = 0;
 
   for (int plane = 0; plane < num_planes; plane++) {
     if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+    num_even_lr_jobs =
+        num_even_lr_jobs + ((ctxt[plane].rsi->vert_units_per_tile + 1) >> 1);
+  }
+  lr_job_counter[0] = 0;
+  lr_job_counter[1] = num_even_lr_jobs;
+
+  for (int plane = 0; plane < num_planes; plane++) {
+    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
     const int is_uv = plane > 0;
     const int ss_y = is_uv && cm->subsampling_y;
 
@@ -585,11 +594,33 @@
       limits.v_start = AOMMAX(tile_rect.top, limits.v_start - voffset);
       if (limits.v_end < tile_rect.bottom) limits.v_end -= voffset;
 
-      lr_job_queue->lr_unit_row = i;
-      lr_job_queue->plane = plane;
-      lr_job_queue->v_start = limits.v_start;
-      lr_job_queue->v_end = limits.v_end;
-      lr_job_queue++;
+      assert(lr_job_counter[0] <= num_even_lr_jobs);
+
+      lr_job_queue[lr_job_counter[i & 1]].lr_unit_row = i;
+      lr_job_queue[lr_job_counter[i & 1]].plane = plane;
+      lr_job_queue[lr_job_counter[i & 1]].v_start = limits.v_start;
+      lr_job_queue[lr_job_counter[i & 1]].v_end = limits.v_end;
+      lr_job_queue[lr_job_counter[i & 1]].sync_mode = i & 1;
+      if ((i & 1) == 0) {
+        lr_job_queue[lr_job_counter[i & 1]].v_copy_start =
+            limits.v_start + RESTORATION_BORDER;
+        lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
+            limits.v_end - RESTORATION_BORDER;
+        if (i == 0) {
+          assert(limits.v_start == tile_rect.top);
+          lr_job_queue[lr_job_counter[i & 1]].v_copy_start = tile_rect.top;
+        }
+        if (i == (ctxt[plane].rsi->vert_units_per_tile - 1)) {
+          assert(limits.v_end == tile_rect.bottom);
+          lr_job_queue[lr_job_counter[i & 1]].v_copy_end = tile_rect.bottom;
+        }
+      } else {
+        lr_job_queue[lr_job_counter[i & 1]].v_copy_start =
+            AOMMAX(limits.v_start - RESTORATION_BORDER, tile_rect.top);
+        lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
+            AOMMIN(limits.v_end + RESTORATION_BORDER, tile_rect.bottom);
+      }
+      lr_job_counter[i & 1]++;
       lr_sync->jobs_enqueued++;
 
       y0 += h;
@@ -639,26 +670,32 @@
     AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync);
     if (cur_job_info != NULL) {
       RestorationTileLimits limits;
+      sync_read_fn_t on_sync_read;
+      sync_write_fn_t on_sync_write;
       limits.v_start = cur_job_info->v_start;
       limits.v_end = cur_job_info->v_end;
       lr_unit_row = cur_job_info->lr_unit_row;
       plane = cur_job_info->plane;
       const int unit_idx0 = tile_idx * ctxt[plane].rsi->units_per_tile;
-      int copy_v_start = AOMMAX(limits.v_start - RESTORATION_BORDER, 0);
-      int copy_v_end = AOMMAX(limits.v_end - RESTORATION_BORDER, 0);
-      if (cur_job_info->lr_unit_row ==
-          (ctxt[plane].rsi->vert_units_per_tile - 1))
-        copy_v_end = limits.v_end;
+
+      // sync_mode == 1 implies only sync read is required in LR Multi-threading
+      // sync_mode == 0 implies only sync write is required.
+      on_sync_read =
+          cur_job_info->sync_mode == 1 ? lr_sync_read : av1_lr_sync_read_dummy;
+      on_sync_write = cur_job_info->sync_mode == 0 ? lr_sync_write
+                                                   : av1_lr_sync_write_dummy;
 
       av1_foreach_rest_unit_in_row(
           &limits, &(ctxt[plane].tile_rect), lr_ctxt->on_rest_unit, lr_unit_row,
           ctxt[plane].rsi->restoration_unit_size, unit_idx0,
-          ctxt[plane].rsi->horz_units_per_tile, plane, &ctxt[plane],
-          lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, lr_sync_read,
-          lr_sync_write, lr_sync);
+          ctxt[plane].rsi->horz_units_per_tile,
+          ctxt[plane].rsi->vert_units_per_tile, plane, &ctxt[plane],
+          lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, on_sync_read,
+          on_sync_write, lr_sync);
 
       copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, ctxt[plane].tile_rect.left,
-                       ctxt[plane].tile_rect.right, copy_v_start, copy_v_end);
+                       ctxt[plane].tile_rect.right, cur_job_info->v_copy_start,
+                       cur_job_info->v_copy_end);
     } else {
       break;
     }