Add striped_loop_restoration experiment

This experiment offsets the filter tile grid 8 pixels upwards.
Deblocked pixels (rather than CDEFed pixels) are used for the
2 lines above and below each filter processing unit. The 8 pixel
offset is the vertical offset produced by deblock/cdef. This way
loop restoration does not need additional line buffers in a
single pass hardware implementation.

Change-Id: I89e0831dc28413a5d3e02d7a426ce2885ab629d7
diff --git a/av1/common/alloccommon.c b/av1/common/alloccommon.c
index fe22667..20f3074 100644
--- a/av1/common/alloccommon.c
+++ b/av1/common/alloccommon.c
@@ -135,6 +135,33 @@
   aom_free(cm->rst_internal.tmpbuf);
   CHECK_MEM_ERROR(cm, cm->rst_internal.tmpbuf,
                   (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE));
+
+#if CONFIG_STRIPED_LOOP_RESTORATION
+  // Allocate internal storage for the loop restoration stripe boundary lines
+  for (p = 0; p < MAX_MB_PLANE; ++p) {
+    int w = p == 0 ? width : ROUND_POWER_OF_TWO(width, cm->subsampling_x);
+    int align_bits = 5;  // align for efficiency
+    // Each stored line is extended by RESTORATION_EXTRA_HORZ pixels on both
+    // sides, so the stride must cover the extension as well as the pixels.
+    int stride = ALIGN_POWER_OF_TWO(w + 2 * RESTORATION_EXTRA_HORZ, align_bits);
+    int num_stripes = (height + 63) / 64;
+    // Each buffer (above, below) holds 2 lines per processing stripe
+    int buf_size = num_stripes * 2 * stride;
+    uint8_t *above_buf, *below_buf;
+    aom_free(cm->rst_internal.stripe_boundary_above[p]);
+    aom_free(cm->rst_internal.stripe_boundary_below[p]);
+#if CONFIG_HIGHBITDEPTH
+    if (cm->use_highbitdepth) buf_size = buf_size * 2;  // 16-bit samples
+#endif
+    CHECK_MEM_ERROR(cm, above_buf,
+                    (uint8_t *)aom_memalign(1 << align_bits, buf_size));
+    CHECK_MEM_ERROR(cm, below_buf,
+                    (uint8_t *)aom_memalign(1 << align_bits, buf_size));
+    cm->rst_internal.stripe_boundary_above[p] = above_buf;
+    cm->rst_internal.stripe_boundary_below[p] = below_buf;
+    cm->rst_internal.stripe_boundary_stride[p] = stride;
+  }
+#endif  // CONFIG_STRIPED_LOOP_RESTORATION
 }
 
 void av1_free_restoration_buffers(AV1_COMMON *cm) {
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index c703660..613ad83 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -123,6 +123,111 @@
   }
 }
 
+#if CONFIG_STRIPED_LOOP_RESTORATION
+
+// Sets up a processing stripe by replacing the rows just outside it
+// (2 lines above and 2 lines below) with data from the above/below
+// boundary buffers. The original frame rows are first saved into
+// rst->tmp_save_above/below so restore_processing_stripe_boundary can
+// put them back once the stripe has been filtered. y0 is the first row to
+// filter, v_end the tile's end row, [h_start, h_end) its column range.
+// Returns the number of rows of the stripe to process, starting at y0.
+static int setup_processing_stripe_boundary(int y0, int v_end, int h_start,
+                                            int h_end, uint8_t *data,
+                                            int stride,
+                                            RestorationInternal *rst,
+                                            int use_highbd) {
+  int y, y_stripe_topmost, stripe_index, i;
+  int tile_offset = RESTORATION_TILE_OFFSET >> rst->subsampling_y;
+  int stripe_height = rst->rsi->procunit_height;
+  int comp = rst->component;
+  uint8_t *boundary_above_buf = rst->stripe_boundary_above[comp];
+  uint8_t *boundary_below_buf = rst->stripe_boundary_below[comp];
+  int boundary_stride = rst->stripe_boundary_stride[comp];
+  int x0 = h_start - RESTORATION_EXTRA_HORZ;
+  int x1 = h_end + RESTORATION_EXTRA_HORZ;
+
+  stripe_index = (y0 + tile_offset) / stripe_height;
+  y_stripe_topmost = stripe_index * stripe_height - tile_offset;
+  boundary_above_buf +=
+      ((stripe_index - 1) * 2 * boundary_stride + RESTORATION_EXTRA_HORZ)
+      << use_highbd;
+  boundary_below_buf +=
+      (stripe_index * 2 * boundary_stride + RESTORATION_EXTRA_HORZ)
+      << use_highbd;
+
+  // Set up the 2 lines above the stripe, if they lie in-frame just above y0
+  for (i = 0; i < 2; i++) {
+    y = y_stripe_topmost - 2 + i;
+    if (y >= 0 && y < y0 && y >= y0 - 2) {
+      uint8_t *p = data + ((y * stride + x0) << use_highbd);
+      uint8_t *new_data =
+          boundary_above_buf + ((i * boundary_stride + x0) << use_highbd);
+      // Offsets are counted in pixels and shifted by use_highbd to get
+      // bytes (callers pass 1 for 16-bit samples, 0 otherwise). The copy
+      // covers [x0, x1), i.e. the tile columns plus the horizontal margin.
+      // Save old pixels
+      memcpy(rst->tmp_save_above[i], p, (x1 - x0) << use_highbd);
+      // Replace width pixels from boundary_above_buf
+      memcpy(p, new_data, (x1 - x0) << use_highbd);
+    }
+  }
+  // setup the 2 lines below the stripe
+  for (i = 0; i < 2; i++) {
+    y = y_stripe_topmost + stripe_height + i;
+    if (y < v_end + 2) {
+      uint8_t *p = data + ((y * stride + x0) << use_highbd);
+      uint8_t *new_data =
+          boundary_below_buf + ((i * boundary_stride + x0) << use_highbd);
+      // NOTE(review): this accepts rows up to v_end + 1, whereas
+      // av1_loop_restoration_save_boundary_lines only stores rows inside
+      // the frame; confirm below-buffer rows read here are always filled.
+      // Save old pixels
+      memcpy(rst->tmp_save_below[i], p, (x1 - x0) << use_highbd);
+      // Replace width pixels from boundary_below_buf
+      memcpy(p, new_data, (x1 - x0) << use_highbd);
+    }
+  }
+  // Return actual stripe height
+  return AOMMIN(v_end, y_stripe_topmost + stripe_height) - y0;
+}
+
+// Undoes setup_processing_stripe_boundary: copies the frame rows saved in
+// rst->tmp_save_above/below back over the 2 lines above/below the stripe.
+static void restore_processing_stripe_boundary(int y0, int v_end, int h_start,
+                                               int h_end, uint8_t *data,
+                                               int stride,
+                                               RestorationInternal *rst,
+                                               int use_highbd) {
+  int y, y_stripe_topmost, i, stripe_index;
+  // Must be the same grid offset that setup_processing_stripe_boundary uses
+  int tile_offset = RESTORATION_TILE_OFFSET >> rst->subsampling_y;
+  int stripe_height = rst->rsi->procunit_height;
+  int x0 = h_start - RESTORATION_EXTRA_HORZ;
+  int x1 = h_end + RESTORATION_EXTRA_HORZ;
+  stripe_index = (y0 + tile_offset) / stripe_height;
+  y_stripe_topmost = stripe_index * stripe_height - tile_offset;
+
+  // restore the 2 lines above the stripe (same row test as in setup)
+  for (i = 0; i < 2; i++) {
+    y = y_stripe_topmost - 2 + i;
+    if (y >= 0 && y < y0 && y >= y0 - 2) {
+      uint8_t *p = data + ((y * stride + x0) << use_highbd);
+      memcpy(p, rst->tmp_save_above[i], (x1 - x0) << use_highbd);
+    }
+  }
+  // restore the 2 lines below the stripe
+  for (i = 0; i < 2; i++) {
+    y = y_stripe_topmost + stripe_height + i;
+    if (y < v_end + 2) {
+      uint8_t *p = data + ((y * stride + x0) << use_highbd);
+      memcpy(p, rst->tmp_save_below[i], (x1 - x0) << use_highbd);
+    }
+  }
+}
+
+#endif
+
 static void loop_copy_tile(uint8_t *data, int tile_idx, int width, int height,
                            int stride, RestorationInternal *rst, uint8_t *dst,
                            int dst_stride) {
@@ -130,7 +235,11 @@
   const int tile_height = rst->tile_height;
   RestorationTileLimits limits =
       av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_width,
+#if CONFIG_STRIPED_LOOP_RESTORATION
+                               tile_height, width, height, rst->subsampling_y);
+#else
                                tile_height, width, height);
+#endif
   for (int i = limits.v_start; i < limits.v_end; ++i)
     memcpy(dst + i * dst_stride + limits.h_start,
            data + i * stride + limits.h_start, limits.h_end - limits.h_start);
@@ -173,7 +282,11 @@
                                     RestorationInternal *rst, uint8_t *dst,
                                     int dst_stride) {
   const int procunit_width = rst->rsi->procunit_width;
+#if CONFIG_STRIPED_LOOP_RESTORATION
+  int procunit_height;
+#else
   const int procunit_height = rst->rsi->procunit_height;
+#endif
   const int tile_width = rst->tile_width;
   const int tile_height = rst->tile_height;
   if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
@@ -183,13 +296,25 @@
   InterpKernel vertical_topbot;
   RestorationTileLimits limits =
       av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_width,
+#if CONFIG_STRIPED_LOOP_RESTORATION
+                               tile_height, width, height, rst->subsampling_y);
+#else
                                tile_height, width, height);
+#endif
+
   // Convolve the whole tile (done in blocks here to match the requirements
   // of the vectorized convolve functions, but the result is equivalent)
-  for (int i = limits.v_start; i < limits.v_end; i += procunit_height)
+  for (int i = limits.v_start; i < limits.v_end; i += procunit_height) {
+#if CONFIG_STRIPED_LOOP_RESTORATION
+    int h = setup_processing_stripe_boundary(
+        i, limits.v_end, limits.h_start, limits.h_end, data, stride, rst, 0);
+    h = ALIGN_POWER_OF_TWO(h, 1);
+    procunit_height = h;
+#else
+    int h = AOMMIN(procunit_height, (limits.v_end - i + 15) & ~15);
+#endif
     for (int j = limits.h_start; j < limits.h_end; j += procunit_width) {
       int w = AOMMIN(procunit_width, (limits.h_end - j + 15) & ~15);
-      int h = AOMMIN(procunit_height, (limits.v_end - i + 15) & ~15);
       const uint8_t *data_p = data + i * stride + j;
       uint8_t *dst_p = dst + i * dst_stride + j;
       // Note h is at least 16
@@ -237,6 +362,11 @@
         dst_p += dst_stride;
       }
     }
+#if CONFIG_STRIPED_LOOP_RESTORATION
+    restore_processing_stripe_boundary(i, limits.v_end, limits.h_start,
+                                       limits.h_end, data, stride, rst, 0);
+#endif
+  }
 }
 
 static void loop_wiener_filter(uint8_t *data, int width, int height, int stride,
@@ -978,7 +1108,11 @@
                                      RestorationInternal *rst, uint8_t *dst,
                                      int dst_stride) {
   const int procunit_width = rst->rsi->procunit_width;
+#if CONFIG_STRIPED_LOOP_RESTORATION
+  int procunit_height;
+#else
   const int procunit_height = rst->rsi->procunit_height;
+#endif
   const int tile_width = rst->tile_width;
   const int tile_height = rst->tile_height;
   if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
@@ -987,17 +1121,32 @@
   }
   RestorationTileLimits limits =
       av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_width,
+#if CONFIG_STRIPED_LOOP_RESTORATION
+                               tile_height, width, height, rst->subsampling_y);
+#else
                                tile_height, width, height);
-  for (int i = limits.v_start; i < limits.v_end; i += procunit_height)
+#endif
+  for (int i = limits.v_start; i < limits.v_end; i += procunit_height) {
+#if CONFIG_STRIPED_LOOP_RESTORATION
+    int h = setup_processing_stripe_boundary(
+        i, limits.v_end, limits.h_start, limits.h_end, data, stride, rst, 0);
+    procunit_height = h;
+#else
+    int h = AOMMIN(procunit_height, limits.v_end - i);
+#endif
     for (int j = limits.h_start; j < limits.h_end; j += procunit_width) {
       int w = AOMMIN(procunit_width, limits.h_end - j);
-      int h = AOMMIN(procunit_height, limits.v_end - i);
       uint8_t *data_p = data + i * stride + j;
       uint8_t *dst_p = dst + i * dst_stride + j;
       apply_selfguided_restoration(
           data_p, w, h, stride, rst->rsi->sgrproj_info[tile_idx].ep,
           rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, dst_stride, rst->tmpbuf);
     }
+#if CONFIG_STRIPED_LOOP_RESTORATION
+    restore_processing_stripe_boundary(i, limits.v_end, limits.h_start,
+                                       limits.h_end, data, stride, rst, 0);
+#endif
+  }
 }
 
 static void loop_sgrproj_filter(uint8_t *data, int width, int height,
@@ -1061,7 +1210,11 @@
   const int tile_height = rst->tile_height;
   RestorationTileLimits limits =
       av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_width,
+#if CONFIG_STRIPED_LOOP_RESTORATION
+                               tile_height, width, height, rst->subsampling_y);
+#else
                                tile_height, width, height);
+#endif
   for (int i = limits.v_start; i < limits.v_end; ++i)
     memcpy(dst + i * dst_stride + limits.h_start,
            data + i * stride + limits.h_start,
@@ -1074,7 +1227,11 @@
                                            int bit_depth, uint16_t *dst,
                                            int dst_stride) {
   const int procunit_width = rst->rsi->procunit_width;
+#if CONFIG_STRIPED_LOOP_RESTORATION
+  int procunit_height;
+#else
   const int procunit_height = rst->rsi->procunit_height;
+#endif
   const int tile_width = rst->tile_width;
   const int tile_height = rst->tile_height;
 
@@ -1085,14 +1242,27 @@
   }
   RestorationTileLimits limits =
       av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_width,
+#if CONFIG_STRIPED_LOOP_RESTORATION
+                               tile_height, width, height, rst->subsampling_y);
+#else
                                tile_height, width, height);
+#endif
   InterpKernel vertical_topbot;
+
   // Convolve the whole tile (done in blocks here to match the requirements
   // of the vectorized convolve functions, but the result is equivalent)
-  for (int i = limits.v_start; i < limits.v_end; i += procunit_height)
+  for (int i = limits.v_start; i < limits.v_end; i += procunit_height) {
+#if CONFIG_STRIPED_LOOP_RESTORATION
+    int h = setup_processing_stripe_boundary(i, limits.v_end, limits.h_start,
+                                             limits.h_end, (uint8_t *)data,
+                                             stride, rst, 1);
+    h = ALIGN_POWER_OF_TWO(h, 1);
+    procunit_height = h;
+#else
+    int h = AOMMIN(procunit_height, (limits.v_end - i + 15) & ~15);
+#endif
     for (int j = limits.h_start; j < limits.h_end; j += procunit_width) {
       int w = AOMMIN(procunit_width, (limits.h_end - j + 15) & ~15);
-      int h = AOMMIN(procunit_height, (limits.v_end - i + 15) & ~15);
       const uint16_t *data_p = data + i * stride + j;
       uint16_t *dst_p = dst + i * dst_stride + j;
       // Note h is at least 16
@@ -1146,6 +1316,12 @@
         dst_p += dst_stride;
       }
     }
+#if CONFIG_STRIPED_LOOP_RESTORATION
+    restore_processing_stripe_boundary(i, limits.v_end, limits.h_start,
+                                       limits.h_end, (uint8_t *)data, stride,
+                                       rst, 1);
+#endif
+  }
 }
 
 static void loop_wiener_filter_highbd(uint8_t *data8, int width, int height,
@@ -1315,7 +1491,11 @@
                                             int bit_depth, uint16_t *dst,
                                             int dst_stride) {
   const int procunit_width = rst->rsi->procunit_width;
+#if CONFIG_STRIPED_LOOP_RESTORATION
+  int procunit_height;
+#else
   const int procunit_height = rst->rsi->procunit_height;
+#endif
   const int tile_width = rst->tile_width;
   const int tile_height = rst->tile_height;
 
@@ -1326,17 +1506,34 @@
   }
   RestorationTileLimits limits =
       av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_width,
+#if CONFIG_STRIPED_LOOP_RESTORATION
+                               tile_height, width, height, rst->subsampling_y);
+#else
                                tile_height, width, height);
-  for (int i = limits.v_start; i < limits.v_end; i += procunit_height)
+#endif
+  for (int i = limits.v_start; i < limits.v_end; i += procunit_height) {
+#if CONFIG_STRIPED_LOOP_RESTORATION
+    int h = setup_processing_stripe_boundary(i, limits.v_end, limits.h_start,
+                                             limits.h_end, (uint8_t *)data,
+                                             stride, rst, 1);
+    procunit_height = h;
+#else
+    int h = AOMMIN(procunit_height, limits.v_end - i);
+#endif
     for (int j = limits.h_start; j < limits.h_end; j += procunit_width) {
       int w = AOMMIN(procunit_width, limits.h_end - j);
-      int h = AOMMIN(procunit_height, limits.v_end - i);
       uint16_t *data_p = data + i * stride + j;
       uint16_t *dst_p = dst + i * dst_stride + j;
       apply_selfguided_restoration_highbd(
           data_p, w, h, stride, bit_depth, rst->rsi->sgrproj_info[tile_idx].ep,
           rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, dst_stride, rst->tmpbuf);
     }
+#if CONFIG_STRIPED_LOOP_RESTORATION
+    restore_processing_stripe_boundary(i, limits.v_end, limits.h_start,
+                                       limits.h_end, (uint8_t *)data, stride,
+                                       rst, 1);
+#endif
+  }
 }
 
 static void loop_sgrproj_filter_highbd(uint8_t *data8, int width, int height,
@@ -1409,7 +1606,6 @@
 
   yend = AOMMIN(yend, yheight);
   uvend = AOMMIN(uvend, uvheight);
-
   if (components_pattern == (1 << AOM_PLANE_Y)) {
     // Only y
     if (rsi[0].frame_restoration_type == RESTORE_NONE) {
@@ -1459,6 +1655,10 @@
           &cm->rst_internal.tile_width, &cm->rst_internal.tile_height,
           &cm->rst_internal.nhtiles, &cm->rst_internal.nvtiles);
       cm->rst_internal.rsi = &rsi[0];
+#if CONFIG_STRIPED_LOOP_RESTORATION
+      cm->rst_internal.component = AOM_PLANE_Y;
+      cm->rst_internal.subsampling_y = 0;
+#endif
       restore_func =
           restore_funcs[cm->rst_internal.rsi->frame_restoration_type];
 #if CONFIG_HIGHBITDEPTH
@@ -1486,6 +1686,10 @@
           &cm->rst_internal.tile_width, &cm->rst_internal.tile_height,
           &cm->rst_internal.nhtiles, &cm->rst_internal.nvtiles);
       cm->rst_internal.rsi = &rsi[AOM_PLANE_U];
+#if CONFIG_STRIPED_LOOP_RESTORATION
+      cm->rst_internal.component = AOM_PLANE_U;
+      cm->rst_internal.subsampling_y = cm->subsampling_y;
+#endif
       restore_func =
           restore_funcs[cm->rst_internal.rsi->frame_restoration_type];
 #if CONFIG_HIGHBITDEPTH
@@ -1513,6 +1717,10 @@
           &cm->rst_internal.tile_width, &cm->rst_internal.tile_height,
           &cm->rst_internal.nhtiles, &cm->rst_internal.nvtiles);
       cm->rst_internal.rsi = &rsi[AOM_PLANE_V];
+#if CONFIG_STRIPED_LOOP_RESTORATION
+      cm->rst_internal.component = AOM_PLANE_V;
+      cm->rst_internal.subsampling_y = cm->subsampling_y;
+#endif
       restore_func =
           restore_funcs[cm->rst_internal.rsi->frame_restoration_type];
 #if CONFIG_HIGHBITDEPTH
@@ -1629,3 +1837,93 @@
 
   return *rcol0 < *rcol1 && *rrow0 < *rrow1;
 }
+
+#if CONFIG_STRIPED_LOOP_RESTORATION
+
+// Pads a line of `width` samples by repeating its first sample `extend` times
+// to the left and its last sample `extend` times to the right.
+static void extend_line(uint8_t *buf, int width, int extend,
+                        int use_highbitdepth) {
+  int k;
+  if (!use_highbitdepth) {
+    const uint8_t first = buf[0];
+    for (k = 1; k <= extend; k++) buf[-k] = first;
+    const uint8_t last = buf[width - 1];
+    for (k = 0; k < extend; k++) buf[width + k] = last;
+  } else {
+    uint16_t *const px = (uint16_t *)buf;  // samples are 16 bits wide
+    const uint16_t first = px[0];
+    for (k = 1; k <= extend; k++) px[-k] = first;
+    const uint16_t last = px[width - 1];
+    for (k = 0; k < extend; k++) px[width + k] = last;
+  }
+}
+
+// Saves, for every 64-row luma stripe (scaled by subsampling_y for chroma), 4
+// frame rows around the stripe's lower edge for use by loop restoration: 2 go
+// to rst_internal.stripe_boundary_above and 2 to stripe_boundary_below.
+void av1_loop_restoration_save_boundary_lines(YV12_BUFFER_CONFIG *frame,
+                                              AV1_COMMON *cm) {
+  int p, boundary_stride;
+  int src_width, src_height, src_stride, stripe_height, stripe_offset, stripe_y,
+      yy;
+  uint8_t *src_buf, *boundary_below_buf, *boundary_above_buf;
+  int use_highbitdepth = 0;
+  for (p = 0; p < MAX_MB_PLANE; ++p) {
+    if (p == 0) {
+      src_buf = frame->y_buffer;
+      src_width = frame->y_crop_width;
+      src_height = frame->y_crop_height;
+      src_stride = frame->y_stride;
+      stripe_height = 64;
+      stripe_offset = 56 - 2;  // offset of first line to copy
+    } else {
+      src_buf = p == 1 ? frame->u_buffer : frame->v_buffer;
+      src_width = frame->uv_crop_width;
+      src_height = frame->uv_crop_height;
+      src_stride = frame->uv_stride;
+      stripe_height = 64 >> cm->subsampling_y;
+      stripe_offset = (56 >> cm->subsampling_y) - 2;
+    }
+    boundary_above_buf = cm->rst_internal.stripe_boundary_above[p];
+    boundary_below_buf = cm->rst_internal.stripe_boundary_below[p];
+    boundary_stride = cm->rst_internal.stripe_boundary_stride[p];
+#if CONFIG_HIGHBITDEPTH
+    use_highbitdepth = cm->use_highbitdepth;
+    if (use_highbitdepth) {
+      src_buf = (uint8_t *)CONVERT_TO_SHORTPTR(src_buf);
+    }
+#endif
+    src_buf += (stripe_offset * src_stride) << use_highbitdepth;
+    boundary_above_buf += RESTORATION_EXTRA_HORZ << use_highbitdepth;
+    boundary_below_buf += RESTORATION_EXTRA_HORZ << use_highbitdepth;
+    // Loop over stripes; rows at or beyond src_height are not saved
+    for (stripe_y = stripe_offset; stripe_y < src_height;
+         stripe_y += stripe_height) {
+      // Save the 2 lines above the next LR stripe (luma rows 64k-10, 64k-9)
+      for (yy = 0; yy < 2; yy++) {
+        if (stripe_y + yy < src_height) {
+          memcpy(boundary_above_buf, src_buf, src_width << use_highbitdepth);
+          extend_line(boundary_above_buf, src_width, RESTORATION_EXTRA_HORZ,
+                      use_highbitdepth);
+          src_buf += src_stride << use_highbitdepth;
+          boundary_above_buf += boundary_stride << use_highbitdepth;
+        }
+      }
+      // Save the 2 lines below the current LR stripe (luma rows 64k-8, 64k-7)
+      for (yy = 2; yy < 4; yy++) {
+        if (stripe_y + yy < src_height) {
+          memcpy(boundary_below_buf, src_buf, src_width << use_highbitdepth);
+          extend_line(boundary_below_buf, src_width, RESTORATION_EXTRA_HORZ,
+                      use_highbitdepth);
+          src_buf += src_stride << use_highbitdepth;
+          boundary_below_buf += boundary_stride << use_highbitdepth;
+        }
+      }
+      // jump to next stripe
+      src_buf += ((stripe_height - 4) * src_stride) << use_highbitdepth;
+    }
+  }
+}
+
+#endif  // CONFIG_STRIPED_LOOP_RESTORATION
diff --git a/av1/common/restoration.h b/av1/common/restoration.h
index 75fcefb..23a5387 100644
--- a/av1/common/restoration.h
+++ b/av1/common/restoration.h
@@ -26,10 +26,23 @@
 
 #define RESTORATION_PROC_UNIT_SIZE 64
 
+#if CONFIG_STRIPED_LOOP_RESTORATION
+// Filter tile grid offset upwards compared to the superblock grid
+#define RESTORATION_TILE_OFFSET 8
+#endif
+
+#if CONFIG_STRIPED_LOOP_RESTORATION
+#define SGRPROJ_BORDER_VERT 2  // Vertical border used for Sgr
+#else
 #define SGRPROJ_BORDER_VERT 1  // Vertical border used for Sgr
+#endif
 #define SGRPROJ_BORDER_HORZ 2  // Horizontal border used for Sgr
 
+#if CONFIG_STRIPED_LOOP_RESTORATION
+#define WIENER_BORDER_VERT 2  // Vertical border used for Wiener
+#else
 #define WIENER_BORDER_VERT 1  // Vertical border used for Wiener
+#endif
 #define WIENER_HALFWIN 3
 #define WIENER_BORDER_HORZ (WIENER_HALFWIN)  // Horizontal border for Wiener
 
@@ -48,6 +61,12 @@
 #define RESTORATION_BORDER_HORZ (WIENER_BORDER_HORZ)
 #endif  // SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT
 
+#if CONFIG_STRIPED_LOOP_RESTORATION
+// Additional pixels to the left and right in above/below buffers
+// It is RESTORATION_BORDER_HORZ rounded up to get nicer buffer alignment
+#define RESTORATION_EXTRA_HORZ 4
+#endif
+
 // Pad up to 20 more (may be much less is needed)
 #define RESTORATION_PADDING 20
 #define RESTORATION_PROC_UNIT_PELS                             \
@@ -57,9 +76,19 @@
     RESTORATION_PADDING))
 
 #define RESTORATION_TILESIZE_MAX 256
+#if CONFIG_STRIPED_LOOP_RESTORATION
+#define RESTORATION_TILEPELS_HORZ_MAX \
+  (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16)
+#define RESTORATION_TILEPELS_VERT_MAX                                \
+  ((RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT + \
+    RESTORATION_TILE_OFFSET))
+#define RESTORATION_TILEPELS_MAX \
+  (RESTORATION_TILEPELS_HORZ_MAX * RESTORATION_TILEPELS_VERT_MAX)
+#else
 #define RESTORATION_TILEPELS_MAX                                           \
   ((RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16) * \
    (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT))
+#endif
 
 // Two 32-bit buffers needed for the restored versions from two filters
 // TODO(debargha, rupert): Refactor to not need the large tilesize to be stored
@@ -195,6 +224,20 @@
   int tile_width, tile_height;
   int nhtiles, nvtiles;
   int32_t *tmpbuf;
+#if CONFIG_STRIPED_LOOP_RESTORATION
+  int component;      // plane being filtered (AOM_PLANE_Y/U/V)
+  int subsampling_y;  // vertical subsampling shift of that plane
+  uint8_t *stripe_boundary_above[MAX_MB_PLANE];
+  uint8_t *stripe_boundary_below[MAX_MB_PLANE];
+  int stripe_boundary_stride[MAX_MB_PLANE];
+  // Temporary buffers to save/restore 2 lines above/below the restoration
+  // stripe. Sized for the widest tile (RESTORATION_TILEPELS_HORZ_MAX allows
+  // 1.5x RESTORATION_TILESIZE_MAX) plus the filter margin left and right.
+  uint16_t tmp_save_above[2][RESTORATION_TILESIZE_MAX * 3 / 2 +
+                             2 * RESTORATION_EXTRA_HORZ];
+  uint16_t tmp_save_below[2][RESTORATION_TILESIZE_MAX * 3 / 2 +
+                             2 * RESTORATION_EXTRA_HORZ];
+#endif
 } RestorationInternal;
 
 static INLINE void set_default_sgrproj(SgrprojInfo *sgrproj_info) {
@@ -236,7 +279,12 @@
 
 static INLINE RestorationTileLimits
 av1_get_rest_tile_limits(int tile_idx, int nhtiles, int nvtiles, int tile_width,
-                         int tile_height, int im_width, int im_height) {
+                         int tile_height, int im_width,
+#if CONFIG_STRIPED_LOOP_RESTORATION
+                         int im_height, int subsampling_y) {
+#else
+                         int im_height) {
+#endif
   const int htile_idx = tile_idx % nhtiles;
   const int vtile_idx = tile_idx / nhtiles;
   RestorationTileLimits limits;
@@ -246,6 +294,13 @@
       (htile_idx < nhtiles - 1) ? limits.h_start + tile_width : im_width;
   limits.v_end =
       (vtile_idx < nvtiles - 1) ? limits.v_start + tile_height : im_height;
+#if CONFIG_STRIPED_LOOP_RESTORATION
+  // Offset the tile upwards to align with the restoration processing stripe
+  limits.v_start -= RESTORATION_TILE_OFFSET >> subsampling_y;
+  if (limits.v_start < 0) limits.v_start = 0;
+  if (limits.v_end < im_height)
+    limits.v_end -= RESTORATION_TILE_OFFSET >> subsampling_y;
+#endif
   return limits;
 }
 
@@ -284,6 +339,9 @@
                                        int mi_row, int mi_col, BLOCK_SIZE bsize,
                                        int *rcol0, int *rcol1, int *rrow0,
                                        int *rrow1, int *nhtiles);
+
+void av1_loop_restoration_save_boundary_lines(YV12_BUFFER_CONFIG *frame,
+                                              struct AV1Common *cm);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index b4259c7..3054618 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -86,6 +86,10 @@
 #include "av1/common/cfl.h"
 #endif
 
+#if CONFIG_STRIPED_LOOP_RESTORATION && !CONFIG_LOOP_RESTORATION
+#error "striped_loop_restoration requires loop_restoration"
+#endif
+
 #if CONFIG_LOOP_RESTORATION
 static void loop_restoration_read_sb_coeffs(const AV1_COMMON *const cm,
                                             MACROBLOCKD *xd,
@@ -5589,6 +5593,14 @@
                                cm->tile_rows * cm->tile_cols - 1);
   }
 
+#if CONFIG_STRIPED_LOOP_RESTORATION
+  if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+      cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+      cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
+    av1_loop_restoration_save_boundary_lines(&pbi->cur_buf->buf, cm);
+  }
+#endif
+
 #if CONFIG_CDEF
   if (!cm->skip_loop_filter && !cm->all_lossless) {
     av1_cdef_frame(&pbi->cur_buf->buf, cm, &pbi->mb);
@@ -5905,6 +5917,14 @@
     return;
   }
 
+#if CONFIG_STRIPED_LOOP_RESTORATION
+  if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+      cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+      cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
+    av1_loop_restoration_save_boundary_lines(&pbi->cur_buf->buf, cm);
+  }
+#endif
+
 #if CONFIG_CDEF
   if (!cm->skip_loop_filter && !cm->all_lossless) {
     av1_cdef_frame(&pbi->cur_buf->buf, cm, &pbi->mb);
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index db09c40..118377b 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -4525,6 +4525,10 @@
 #endif
   }
 
+#if CONFIG_STRIPED_LOOP_RESTORATION
+  av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm);
+#endif
+
 #if CONFIG_CDEF
   if (is_lossless_requested(&cpi->oxcf)) {
     cm->cdef_bits = 0;
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index 4be322b..bffc92c 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -150,7 +150,12 @@
   av1_loop_restoration_frame(cm->frame_to_show, cm, rsi, components_pattern,
                              partial_frame, dst_frame);
   RestorationTileLimits limits = av1_get_rest_tile_limits(
-      tile_idx, nhtiles, nvtiles, tile_width, tile_height, width, height);
+      tile_idx, nhtiles, nvtiles, tile_width, tile_height, width,
+#if CONFIG_STRIPED_LOOP_RESTORATION
+      height, components_pattern > 1 ? cm->subsampling_y : 0);
+#else
+      height);
+#endif
   filt_err = sse_restoration_tile(
       src, dst_frame, cm, limits.h_start, limits.h_end - limits.h_start,
       limits.v_start, limits.v_end - limits.v_start, components_pattern);
@@ -550,7 +555,12 @@
       const int rtile_idx = rtile_row * ctxt->nrtiles_x + rtile_col;
       RestorationTileLimits limits = av1_get_rest_tile_limits(
           rtile_idx, ctxt->nrtiles_x, ctxt->nrtiles_y, rtile_width,
-          rtile_height, ctxt->plane_width, ctxt->plane_height);
+          rtile_height, ctxt->plane_width,
+#if CONFIG_STRIPED_LOOP_RESTORATION
+          ctxt->plane_height, ctxt->plane > 0 ? cm->subsampling_y : 0);
+#else
+          ctxt->plane_height);
+#endif
       fun(ctxt, rtile_idx, &limits, arg);
     }
   }
@@ -1324,7 +1334,12 @@
   info->frame_restoration_type = RESTORE_NONE;
   for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
     RestorationTileLimits limits = av1_get_rest_tile_limits(
-        tile_idx, nhtiles, nvtiles, tile_width, tile_height, width, height);
+        tile_idx, nhtiles, nvtiles, tile_width, tile_height, width,
+#if CONFIG_STRIPED_LOOP_RESTORATION
+        height, plane != AOM_PLANE_Y ? cm->subsampling_y : 0);
+#else
+        height);
+#endif
     err = sse_restoration_tile(src, cm->frame_to_show, cm, limits.h_start,
                                limits.h_end - limits.h_start, limits.v_start,
                                limits.v_end - limits.v_start, 1 << plane);
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 8ba2048..3d8a004 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -196,6 +196,7 @@
 set(CONFIG_SEGMENT_ZEROMV 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_SMOOTH_HV 1 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_SPEED_REFS 0 CACHE NUMBER "AV1 experiment flag.")
+set(CONFIG_STRIPED_LOOP_RESTORATION 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_SUPERTX 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_TEMPMV_SIGNALING 1 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_TPL_MV 0 CACHE NUMBER "AV1 experiment flag.")
diff --git a/build/cmake/aom_configure.cmake b/build/cmake/aom_configure.cmake
index 38aa027..2d783dc 100644
--- a/build/cmake/aom_configure.cmake
+++ b/build/cmake/aom_configure.cmake
@@ -220,6 +220,12 @@
   endif()
 endif()
 
+if (CONFIG_STRIPED_LOOP_RESTORATION)
+  if (NOT CONFIG_LOOP_RESTORATION)
+    change_config_and_warn(CONFIG_LOOP_RESTORATION 1 CONFIG_STRIPED_LOOP_RESTORATION)
+  endif()
+endif()
+
 if (CONFIG_WARPED_MOTION)
   if (CONFIG_NCOBMC)
     change_config_and_warn(CONFIG_NCOBMC 0 CONFIG_WARPED_MOTION)
diff --git a/configure b/configure
index c59cd2c..a8c5f4f 100755
--- a/configure
+++ b/configure
@@ -276,6 +276,7 @@
     supertx
     ans
     loop_restoration
+    striped_loop_restoration
     ext_partition
     ext_partition_types
     ext_partition_types_ab
@@ -626,6 +627,17 @@
       log_echo "amvr requires hash_me"
       enable_feature hash_me
     fi
+
+    if enabled striped_loop_restoration && ! enabled loop_restoration ; then
+      log_echo "striped_loop_restoration requires loop_restoration"
+      log_echo "enable loop_restoration"
+      enable_feature loop_restoration
+    fi
+    if enabled striped_loop_restoration && enabled frame_superres ; then
+      log_echo "striped_loop_restoration not compatible with frame_superres"
+      log_echo "disabling striped_loop_restoration"
+      disable_feature striped_loop_restoration
+    fi
 }
 
 process_targets() {