Facilitate multi-threading in loop restoration

Partial-plane copy functions (aom_yv12_partial_copy_y/u/v) and sync
read/write hooks invoked around each restoration unit have been added
to facilitate multi-threading in loop restoration.

Change-Id: I4b45f9671c8b4e2add3038670096c32f1a4e032d
diff --git a/aom_scale/aom_scale_rtcd.pl b/aom_scale/aom_scale_rtcd.pl
index 6d1255a..c5990b1 100644
--- a/aom_scale/aom_scale_rtcd.pl
+++ b/aom_scale/aom_scale_rtcd.pl
@@ -36,6 +36,12 @@
 
 add_proto qw/void aom_yv12_copy_v/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc";
 
+add_proto qw/void aom_yv12_partial_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int hstart, int hend, int vstart, int vend";
+
+add_proto qw/void aom_yv12_partial_copy_u/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend";
+
+add_proto qw/void aom_yv12_partial_copy_v/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend";
+
 add_proto qw/void aom_extend_frame_borders/, "struct yv12_buffer_config *ybf, const int num_planes";
 specialize qw/aom_extend_frame_borders dspr2/;
 
diff --git a/aom_scale/generic/yv12extend.c b/aom_scale/generic/yv12extend.c
index 9d6eb76..ba18352 100644
--- a/aom_scale/generic/yv12extend.c
+++ b/aom_scale/generic/yv12extend.c
@@ -320,3 +320,92 @@
     dst += dst_bc->uv_stride;
   }
 }
+
+// Copies the Y-plane rectangle [hstart, hend) x [vstart, vend) from src to dst.
+void aom_yv12_partial_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
+                               YV12_BUFFER_CONFIG *dst_ybc, int hstart,
+                               int hend, int vstart, int vend) {
+  const int src_stride = src_ybc->y_stride;
+  const int dst_stride = dst_ybc->y_stride;
+  const int width = hend - hstart;
+  // Step both planes to the top-left sample of the rectangle.
+  const uint8_t *src_px = src_ybc->y_buffer + vstart * src_stride + hstart;
+  uint8_t *dst_px = dst_ybc->y_buffer + vstart * dst_stride + hstart;
+
+  if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // The source's flag selects 16-bit samples for both buffers.
+    const uint16_t *src_hi = CONVERT_TO_SHORTPTR(src_px);
+    uint16_t *dst_hi = CONVERT_TO_SHORTPTR(dst_px);
+    for (int y = vstart; y < vend; ++y) {
+      memcpy(dst_hi, src_hi, width * sizeof(uint16_t));
+      src_hi += src_stride;
+      dst_hi += dst_stride;
+    }
+    return;
+  }
+  for (int y = vstart; y < vend; ++y) {
+    memcpy(dst_px, src_px, width);
+    src_px += src_stride;
+    dst_px += dst_stride;
+  }
+}
+
+// Copies the U-plane rectangle [hstart, hend) x [vstart, vend) from src_bc
+// to dst_bc.
+void aom_yv12_partial_copy_u_c(const YV12_BUFFER_CONFIG *src_bc,
+                               YV12_BUFFER_CONFIG *dst_bc, int hstart, int hend,
+                               int vstart, int vend) {
+  const int src_stride = src_bc->uv_stride;
+  const int dst_stride = dst_bc->uv_stride;
+  const int width = hend - hstart;
+  // Step both planes to the top-left sample of the rectangle.
+  const uint8_t *src_px = src_bc->u_buffer + vstart * src_stride + hstart;
+  uint8_t *dst_px = dst_bc->u_buffer + vstart * dst_stride + hstart;
+
+  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // The source's flag selects 16-bit samples for both buffers.
+    const uint16_t *src_hi = CONVERT_TO_SHORTPTR(src_px);
+    uint16_t *dst_hi = CONVERT_TO_SHORTPTR(dst_px);
+    for (int y = vstart; y < vend; ++y) {
+      memcpy(dst_hi, src_hi, width * sizeof(uint16_t));
+      src_hi += src_stride;
+      dst_hi += dst_stride;
+    }
+    return;
+  }
+  for (int y = vstart; y < vend; ++y) {
+    memcpy(dst_px, src_px, width);
+    src_px += src_stride;
+    dst_px += dst_stride;
+  }
+}
+
+// Copies the V-plane rectangle [hstart, hend) x [vstart, vend) from src_bc
+// to dst_bc.
+void aom_yv12_partial_copy_v_c(const YV12_BUFFER_CONFIG *src_bc,
+                               YV12_BUFFER_CONFIG *dst_bc, int hstart, int hend,
+                               int vstart, int vend) {
+  const int src_stride = src_bc->uv_stride;
+  const int dst_stride = dst_bc->uv_stride;
+  const int width = hend - hstart;
+  // Step both planes to the top-left sample of the rectangle.
+  const uint8_t *src_px = src_bc->v_buffer + vstart * src_stride + hstart;
+  uint8_t *dst_px = dst_bc->v_buffer + vstart * dst_stride + hstart;
+
+  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // The source's flag selects 16-bit samples for both buffers.
+    const uint16_t *src_hi = CONVERT_TO_SHORTPTR(src_px);
+    uint16_t *dst_hi = CONVERT_TO_SHORTPTR(dst_px);
+    for (int y = vstart; y < vend; ++y) {
+      memcpy(dst_hi, src_hi, width * sizeof(uint16_t));
+      src_hi += src_stride;
+      dst_hi += dst_stride;
+    }
+    return;
+  }
+  for (int y = vstart; y < vend; ++y) {
+    memcpy(dst_px, src_px, width);
+    src_px += src_stride;
+    dst_px += dst_stride;
+  }
+}
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index 411d89e..2e37ac7 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -59,7 +59,7 @@
 // restoration unit can extend to up to 150% its normal width or height. The
 // max with 1 is to deal with tiles that are smaller than half of a restoration
 // unit.
-static int count_units_in_tile(int unit_size, int tile_size) {
+int av1_lr_count_units_in_tile(int unit_size, int tile_size) {
   return AOMMAX((tile_size + (unit_size >> 1)) / unit_size, 1);
 }
 
@@ -82,8 +82,8 @@
   // max with 1 is to deal with tiles that are smaller than half of a
   // restoration unit.
   const int unit_size = rsi->restoration_unit_size;
-  const int hpertile = count_units_in_tile(unit_size, max_tile_w);
-  const int vpertile = count_units_in_tile(unit_size, max_tile_h);
+  const int hpertile = av1_lr_count_units_in_tile(unit_size, max_tile_w);
+  const int vpertile = av1_lr_count_units_in_tile(unit_size, max_tile_h);
 
   rsi->units_per_tile = hpertile * vpertile;
   rsi->horz_units_per_tile = hpertile;
@@ -1189,20 +1189,24 @@
     lr_plane_ctxt->data_stride = frame->strides[is_uv];
     lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
     lr_plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv);
-    filter_frame_on_tile(0, 0, lr_plane_ctxt, cm);
+    filter_frame_on_tile(LR_TILE_ROW, LR_TILE_COL, lr_plane_ctxt, cm);
   }
 }
 
 void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
                                       AV1_COMMON *cm, int num_planes) {
-  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src,
-                           YV12_BUFFER_CONFIG *dst);
-  static const copy_fun copy_funs[3] = { aom_yv12_copy_y, aom_yv12_copy_u,
-                                         aom_yv12_copy_v };
+  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
+                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
+                           int vstart, int vend);
+  static const copy_fun copy_funs[3] = {
+    aom_yv12_partial_copy_y, aom_yv12_partial_copy_u, aom_yv12_partial_copy_v
+  };
 
   for (int plane = 0; plane < num_planes; ++plane) {
     if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
-    copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame);
+    AV1PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect;
+    copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, tile_rect.left,
+                     tile_rect.right, tile_rect.top, tile_rect.bottom);
   }
 }
 
@@ -1241,9 +1245,10 @@
                                   const AV1PixelRect *tile_rect,
                                   rest_unit_visitor_t on_rest_unit,
                                   int row_number, int unit_size, int unit_idx0,
-                                  int hunits_per_tile, void *priv,
-                                  int32_t *tmpbuf,
-                                  RestorationLineBuffers *rlbs) {
+                                  int hunits_per_tile, int plane, void *priv,
+                                  int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+                                  sync_read_fn_t on_sync_read,
+                                  sync_write_fn_t on_sync_write) {
   const int tile_w = tile_rect->right - tile_rect->left;
   const int ext_size = unit_size * 3 / 2;
   int x0 = 0, j = 0;
@@ -1257,16 +1262,37 @@
 
     const int unit_idx = unit_idx0 + row_number * hunits_per_tile + j;
 
+    on_sync_read(NULL, row_number, j, plane);
+
     on_rest_unit(limits, tile_rect, unit_idx, priv, tmpbuf, rlbs);
+
+    on_sync_write(NULL, row_number, j, hunits_per_tile, plane);
+
     x0 += w;
     ++j;
   }
 }
 
+void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
+  (void)lr_sync;  // No-op stub: every argument is deliberately discarded.
+  (void)r;
+  (void)c;
+  (void)plane;
+}
+
+void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
+                             const int sb_cols, int plane) {
+  (void)lr_sync;  // No-op stub: every argument is deliberately discarded.
+  (void)r;
+  (void)c;
+  (void)sb_cols;
+  (void)plane;
+}
+
 static void foreach_rest_unit_in_tile(const AV1PixelRect *tile_rect,
                                       int tile_row, int tile_col, int tile_cols,
                                       int hunits_per_tile, int units_per_tile,
-                                      int unit_size, int ss_y,
+                                      int unit_size, int ss_y, int plane,
                                       rest_unit_visitor_t on_rest_unit,
                                       void *priv, int32_t *tmpbuf,
                                       RestorationLineBuffers *rlbs) {
@@ -1291,8 +1317,9 @@
     if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset;
 
     av1_foreach_rest_unit_in_row(&limits, tile_rect, on_rest_unit, i, unit_size,
-                                 unit_idx0, hunits_per_tile, priv, tmpbuf,
-                                 rlbs);
+                                 unit_idx0, hunits_per_tile, plane, priv,
+                                 tmpbuf, rlbs, av1_lr_sync_read_dummy,
+                                 av1_lr_sync_write_dummy);
 
     y0 += h;
     ++i;
@@ -1309,9 +1336,10 @@
 
   const RestorationInfo *rsi = &cm->rst_info[plane];
 
-  foreach_rest_unit_in_tile(tile_rect, 0, 0, 1, rsi->horz_units_per_tile,
-                            rsi->units_per_tile, rsi->restoration_unit_size,
-                            ss_y, on_rest_unit, priv, tmpbuf, rlbs);
+  foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS,
+                            rsi->horz_units_per_tile, rsi->units_per_tile,
+                            rsi->restoration_unit_size, ss_y, plane,
+                            on_rest_unit, priv, tmpbuf, rlbs);
 }
 
 int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
@@ -1346,8 +1374,8 @@
 
   // Calculate the number of restoration units in this tile (which might be
   // strictly less than rsi->horz_units_per_tile and rsi->vert_units_per_tile)
-  const int horz_units = count_units_in_tile(size, tile_w);
-  const int vert_units = count_units_in_tile(size, tile_h);
+  const int horz_units = av1_lr_count_units_in_tile(size, tile_w);
+  const int vert_units = av1_lr_count_units_in_tile(size, tile_h);
 
   // The size of an MI-unit on this plane of the image
   const int ss_x = is_uv && cm->subsampling_x;
diff --git a/av1/common/restoration.h b/av1/common/restoration.h
index 48087c2..81a03e3 100644
--- a/av1/common/restoration.h
+++ b/av1/common/restoration.h
@@ -171,6 +171,10 @@
 #error "Wiener filter currently only works if WIENER_FILT_PREC_BITS == 7"
 #endif
 
+#define LR_TILE_ROW 0
+#define LR_TILE_COL 0
+#define LR_TILE_COLS 1
+
 typedef struct {
   int r[2];  // radii
   int s[2];  // sgr parameters for r[0] and r[1], based on GenSgrprojVtable()
@@ -317,6 +321,11 @@
 typedef void (*rest_tile_start_visitor_t)(int tile_row, int tile_col,
                                           void *priv);
 
+typedef void (*sync_read_fn_t)(void *const lr_sync, int r, int c, int plane);
+
+typedef void (*sync_write_fn_t)(void *const lr_sync, int r, int c,
+                                const int sb_cols, int plane);
+
 // Call on_rest_unit for each loop restoration unit in the plane.
 void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
                                     rest_unit_visitor_t on_rest_unit,
@@ -351,10 +360,15 @@
                                   const AV1PixelRect *tile_rect,
                                   rest_unit_visitor_t on_rest_unit,
                                   int row_number, int unit_size, int unit_idx0,
-                                  int hunits_per_tile, void *priv,
-                                  int32_t *tmpbuf,
-                                  RestorationLineBuffers *rlbs);
+                                  int hunits_per_tile, int plane, void *priv,
+                                  int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+                                  sync_read_fn_t on_sync_read,
+                                  sync_write_fn_t on_sync_write);
 AV1PixelRect av1_whole_frame_rect(const struct AV1Common *cm, int is_uv);
+int av1_lr_count_units_in_tile(int unit_size, int tile_size);
+void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane);
+void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
+                             const int sb_cols, int plane);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index d3a1b74..0dbc489 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -1243,7 +1243,7 @@
   };
 
   reset_rsc(rsc);
-  rsc_on_tile(0, 0, rsc);
+  rsc_on_tile(LR_TILE_ROW, LR_TILE_COL, rsc);
   av1_foreach_rest_unit_in_plane(rsc->cm, rsc->plane, funs[rtype], rsc,
                                  &rsc->tile_rect, rsc->cm->rst_tmpbuf, NULL);
   return RDCOST_DBL(rsc->x->rdmult, rsc->bits >> 4, rsc->sse);