Reduce/Eliminate line buffer for loop-restoration.

This patch constrains the vertical filtering of the top and bottom
rows of a Wiener filter processing unit so that it uses no more
border than what is set in the WIENER_BORDER_VERT macro.
This macro is currently set to 0 to eliminate the line buffer
completely, but it could be increased to 1 or 2 to use a limited
line buffer if coding efficiency suffers too much with zero line
buffering.
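
As a rough sketch (illustrative only, not the patch code), the
stepdown folds the taps that fall outside the available border into
the nearest usable tap:

    #include <stdint.h>

    /* Hedged sketch: `avail` is the number of border rows available
       beyond the current row; taps 0..2 of the 7-tap vertical kernel
       lie on the boundary side. Names are illustrative. */
    static void stepdown_sketch(int16_t k[7], int avail) {
      for (int t = 0; t + avail < 3; ++t) {
        k[t + 1] += k[t];  /* fold unavailable tap inward */
        k[t] = 0;
      }
    }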

Also, for the sgr filter, we add the option of using overlapping
windows horizontally and vertically to improve coding efficiency.
The vertical border used is set by the SGRPROJ_BORDER_VERT macro,
while the horizontal border is set by the SGRPROJ_BORDER_HORZ
macro, currently 2, the maximum needed. We do not recommend
reducing SGRPROJ_BORDER_HORZ below 2.
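
As a sketch (hypothetical helper, not part of the patch), the
overlap amounts to stepping each window's source pointer back by
the border before the box sums are computed:

    #include <stdint.h>

    #define SGRPROJ_BORDER_VERT 0  /* values as set by this patch */
    #define SGRPROJ_BORDER_HORZ 2

    static const uint8_t *sgr_window_origin(const uint8_t *unit,
                                            int stride) {
      /* Box sums then cover (width + 2 * SGRPROJ_BORDER_HORZ) x
         (height + 2 * SGRPROJ_BORDER_VERT) pixels per window. */
      return unit - SGRPROJ_BORDER_VERT * stride - SGRPROJ_BORDER_HORZ;
    }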

The overall line buffer requirement for LR is twice the maximum of
WIENER_BORDER_VERT and SGRPROJ_BORDER_VERT.
Currently both are set to 0, eliminating line buffers completely.
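
Expressed as a macro (hypothetical, not added by this patch; AOMMAX
is the existing helper from aom_dsp/aom_dsp_common.h):

    #define LR_LINE_BUFFER_ROWS \
      (2 * AOMMAX(WIENER_BORDER_VERT, SGRPROJ_BORDER_VERT))  /* 0 today */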

This patch also extends borders consistently before CDEF / LR.

Change-Id: Ie58a98c784a0db547627b9cfcf55f018c30e8e79
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index 2064330..989e0c9 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -146,15 +146,36 @@
            h_end - h_start);
 }
 
-// Convert 7-tap filter to 5-tap for top and bottom rows of a processing unit
-static void stepdown_wiener_kernel(const InterpKernel orig, InterpKernel vert) {
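+// Taper the vertical Wiener kernel near a processing unit boundary:
+// outer taps that would read past the available border (boundary_dist
+// rows) are folded into the nearest usable tap; the kernel is mirrored
+// for bottom rows (istop == 0).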
+static void stepdown_wiener_kernel(const InterpKernel orig, InterpKernel vert,
+                                   int boundary_dist, int istop) {
   memcpy(vert, orig, sizeof(InterpKernel));
-  int delta = vert[0] / 2;
-  vert[1] += delta;
-  vert[WIENER_WIN - 2] += delta;
-  vert[2] += vert[0] - delta;
-  vert[WIENER_WIN - 3] += vert[0] - delta;
-  vert[0] = vert[WIENER_WIN - 1] = 0;
+  switch (boundary_dist) {
+    case 0:
+      vert[WIENER_HALFWIN] += vert[2] + vert[1] + vert[0];
+      vert[2] = vert[1] = vert[0] = 0;
+      break;
+    case 1:
+      vert[2] += vert[1] + vert[0];
+      vert[1] = vert[0] = 0;
+      break;
+    case 2:
+      vert[1] += vert[0];
+      vert[0] = 0;
+      break;
+    default: break;
+  }
+  if (!istop) {
+    int tmp;
+    tmp = vert[0];
+    vert[0] = vert[WIENER_WIN - 1];
+    vert[WIENER_WIN - 1] = tmp;
+    tmp = vert[1];
+    vert[1] = vert[WIENER_WIN - 2];
+    vert[WIENER_WIN - 2] = tmp;
+    tmp = vert[2];
+    vert[2] = vert[WIENER_WIN - 3];
+    vert[WIENER_WIN - 3] = tmp;
+  }
 }
 
 static void loop_wiener_filter_tile(uint8_t *data, int tile_idx, int width,
@@ -173,8 +194,6 @@
     return;
   }
   InterpKernel vertical_topbot;
-  stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
-                         vertical_topbot);
   av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
                            tile_width, tile_height, width, height, 0, 0,
                            &h_start, &h_end, &v_start, &v_end);
@@ -186,42 +205,50 @@
       int h = AOMMIN(procunit_height, (v_end - i + 15) & ~15);
       const uint8_t *data_p = data + i * stride + j;
       uint8_t *dst_p = dst + i * dst_stride + j;
-// Use 5-tap vertical filtering for top and bottom rows in
-// processing unit
+      // Note h is at least 16
+      for (int b = 0; b < WIENER_HALFWIN - WIENER_BORDER_VERT; ++b) {
+        stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
+                               vertical_topbot, WIENER_BORDER_VERT + b, 1);
 #if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-      aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
-                                rst->rsi->wiener_info[tile_idx].hfilter, 16,
-                                vertical_topbot, 16, w, 1);
+        aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
+                                  rst->rsi->wiener_info[tile_idx].hfilter, 16,
+                                  vertical_topbot, 16, w, 1);
 #else
-      aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
-                            rst->rsi->wiener_info[tile_idx].hfilter, 16,
-                            vertical_topbot, 16, w, 1);
+        aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
+                              rst->rsi->wiener_info[tile_idx].hfilter, 16,
+                              vertical_topbot, 16, w, 1);
 #endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-      data_p += stride;
-      dst_p += dst_stride;
-// Note h is at least 16
+        data_p += stride;
+        dst_p += dst_stride;
+      }
 #if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
       aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
                                 rst->rsi->wiener_info[tile_idx].hfilter, 16,
                                 rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
-                                h - 2);
+                                h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
 #else
       aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
                             rst->rsi->wiener_info[tile_idx].hfilter, 16,
                             rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
-                            h - 2);
+                            h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
 #endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-      data_p += stride * (h - 2);
-      dst_p += dst_stride * (h - 2);
+      data_p += stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
+      dst_p += dst_stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
+      for (int b = WIENER_HALFWIN - WIENER_BORDER_VERT - 1; b >= 0; --b) {
+        stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
+                               vertical_topbot, WIENER_BORDER_VERT + b, 0);
 #if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-      aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
-                                rst->rsi->wiener_info[tile_idx].hfilter, 16,
-                                vertical_topbot, 16, w, 1);
+        aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
+                                  rst->rsi->wiener_info[tile_idx].hfilter, 16,
+                                  vertical_topbot, 16, w, 1);
 #else
-      aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
-                            rst->rsi->wiener_info[tile_idx].hfilter, 16,
-                            vertical_topbot, 16, w, 1);
+        aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
+                              rst->rsi->wiener_info[tile_idx].hfilter, 16,
+                              vertical_topbot, 16, w, 1);
 #endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
+        data_p += stride;
+        dst_p += dst_stride;
+      }
     }
 }
 
@@ -618,30 +645,40 @@
 };
 
 static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
-                                                int height, int stride,
+                                                int height, int dgd_stride,
+                                                int32_t *dst, int dst_stride,
                                                 int bit_depth, int r, int eps,
                                                 int32_t *tmpbuf) {
-  int32_t *A = tmpbuf;
-  int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
-  int8_t num[RESTORATION_TILEPELS_MAX];
-  int i, j;
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+  const int num_stride = width_ext;
   // Adjusting the stride of A and B here appears to avoid bad cache effects,
   // leading to a significant speed improvement.
   // We also align the stride to a multiple of 16 bytes, for consistency
   // with the SIMD version of this function.
-  int buf_stride = ((width + 3) & ~3) + 16;
+  int buf_stride = ((width_ext + 3) & ~3) + 16;
+
+  int32_t *A = tmpbuf;
+  int32_t *B = tmpbuf + SGRPROJ_OUTBUF_SIZE;
+  int8_t num_[RESTORATION_TILEPELS_MAX];
+  int8_t *num = num_ + SGRPROJ_BORDER_VERT * num_stride + SGRPROJ_BORDER_HORZ;
+  int i, j;
 
   // Don't filter tiles with dimensions < 5 on any axis
   if ((width < 5) || (height < 5)) return;
 
-  boxsum(dgd, width, height, stride, r, 0, B, buf_stride);
-  boxsum(dgd, width, height, stride, r, 1, A, buf_stride);
-  boxnum(width, height, r, num, width);
+  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
+         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
+  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
+         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
+  boxnum(width_ext, height_ext, r, num_, num_stride);
   assert(r <= 3);
+  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
       const int k = i * buf_stride + j;
-      const int n = num[i * width + j];
+      const int n = num[i * num_stride + j];
 
       // a < 2^16 * n < 2^22 regardless of bit depth
       uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
@@ -677,106 +714,115 @@
   j = 0;
   {
     const int k = i * buf_stride + j;
-    const int l = i * stride + j;
+    const int l = i * dgd_stride + j;
+    const int m = i * dst_stride + j;
     const int nb = 3;
     const int32_t a =
         3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] + A[k + buf_stride + 1];
     const int32_t b =
         3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] + B[k + buf_stride + 1];
     const int32_t v = a * dgd[l] + b;
-    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
   }
   i = 0;
   j = width - 1;
   {
     const int k = i * buf_stride + j;
-    const int l = i * stride + j;
+    const int l = i * dgd_stride + j;
+    const int m = i * dst_stride + j;
     const int nb = 3;
     const int32_t a =
         3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] + A[k + buf_stride - 1];
     const int32_t b =
         3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] + B[k + buf_stride - 1];
     const int32_t v = a * dgd[l] + b;
-    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
   }
   i = height - 1;
   j = 0;
   {
     const int k = i * buf_stride + j;
-    const int l = i * stride + j;
+    const int l = i * dgd_stride + j;
+    const int m = i * dst_stride + j;
     const int nb = 3;
     const int32_t a =
         3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] + A[k - buf_stride + 1];
     const int32_t b =
         3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] + B[k - buf_stride + 1];
     const int32_t v = a * dgd[l] + b;
-    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
   }
   i = height - 1;
   j = width - 1;
   {
     const int k = i * buf_stride + j;
-    const int l = i * stride + j;
+    const int l = i * dgd_stride + j;
+    const int m = i * dst_stride + j;
     const int nb = 3;
     const int32_t a =
         3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] + A[k - buf_stride - 1];
     const int32_t b =
         3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] + B[k - buf_stride - 1];
     const int32_t v = a * dgd[l] + b;
-    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
   }
   i = 0;
   for (j = 1; j < width - 1; ++j) {
     const int k = i * buf_stride + j;
-    const int l = i * stride + j;
+    const int l = i * dgd_stride + j;
+    const int m = i * dst_stride + j;
     const int nb = 3;
     const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
                       A[k + buf_stride - 1] + A[k + buf_stride + 1];
     const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] +
                       B[k + buf_stride - 1] + B[k + buf_stride + 1];
     const int32_t v = a * dgd[l] + b;
-    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
   }
   i = height - 1;
   for (j = 1; j < width - 1; ++j) {
     const int k = i * buf_stride + j;
-    const int l = i * stride + j;
+    const int l = i * dgd_stride + j;
+    const int m = i * dst_stride + j;
     const int nb = 3;
     const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
                       A[k - buf_stride - 1] + A[k - buf_stride + 1];
     const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] +
                       B[k - buf_stride - 1] + B[k - buf_stride + 1];
     const int32_t v = a * dgd[l] + b;
-    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
   }
   j = 0;
   for (i = 1; i < height - 1; ++i) {
     const int k = i * buf_stride + j;
-    const int l = i * stride + j;
+    const int l = i * dgd_stride + j;
+    const int m = i * dst_stride + j;
     const int nb = 3;
     const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
                       A[k + 1] + A[k - buf_stride + 1] + A[k + buf_stride + 1];
     const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
                       B[k + 1] + B[k - buf_stride + 1] + B[k + buf_stride + 1];
     const int32_t v = a * dgd[l] + b;
-    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
   }
   j = width - 1;
   for (i = 1; i < height - 1; ++i) {
     const int k = i * buf_stride + j;
-    const int l = i * stride + j;
+    const int l = i * dgd_stride + j;
+    const int m = i * dst_stride + j;
     const int nb = 3;
     const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
                       A[k - 1] + A[k - buf_stride - 1] + A[k + buf_stride - 1];
     const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
                       B[k - 1] + B[k - buf_stride - 1] + B[k + buf_stride - 1];
     const int32_t v = a * dgd[l] + b;
-    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
   }
   for (i = 1; i < height - 1; ++i) {
     for (j = 1; j < width - 1; ++j) {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
+      const int m = i * dst_stride + j;
       const int nb = 5;
       const int32_t a =
           (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
@@ -791,7 +837,7 @@
            B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
               3;
       const int32_t v = a * dgd[l] + b;
-      dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
     }
   }
 }
@@ -799,14 +845,18 @@
 void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height,
                                   int stride, int32_t *dst, int dst_stride,
                                   int r, int eps, int32_t *tmpbuf) {
+  const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
+  int32_t *dgd32 =
+      tmpbuf + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
   int i, j;
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      dst[i * dst_stride + j] = dgd[i * stride + j];
+  for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
+    for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
+      dgd32[i * dgd32_stride + j] = dgd[i * stride + j];
     }
   }
-  av1_selfguided_restoration_internal(dst, width, height, dst_stride, 8, r, eps,
-                                      tmpbuf);
+  av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst,
+                                      dst_stride, 8, r, eps,
+                                      tmpbuf + RESTORATION_TILEPELS_MAX);
 }
 
 void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride,
@@ -959,7 +1009,7 @@
       int h = AOMMIN(procunit_height, v_end - i);
       uint8_t *data_p = data + i * stride + j;
       uint8_t *dst_p = dst + i * dst_stride + j;
-      apply_selfguided_restoration(
+      apply_selfguided_restoration_c(
           data_p, w, h, stride, rst->rsi->sgrproj_info[tile_idx].ep,
           rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, dst_stride, rst->tmpbuf);
     }
@@ -969,6 +1019,7 @@
                                 int stride, RestorationInternal *rst,
                                 uint8_t *dst, int dst_stride) {
   int tile_idx;
+  extend_frame(data, width, height, stride);
   for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
     loop_sgrproj_filter_tile(data, tile_idx, width, height, stride, rst, dst,
                              dst_stride);
@@ -1052,8 +1103,6 @@
                            tile_width, tile_height, width, height, 0, 0,
                            &h_start, &h_end, &v_start, &v_end);
   InterpKernel vertical_topbot;
-  stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
-                         vertical_topbot);
   // Convolve the whole tile (done in blocks here to match the requirements
   // of the vectorized convolve functions, but the result is equivalent)
   for (i = v_start; i < v_end; i += procunit_height)
@@ -1062,46 +1111,56 @@
       int h = AOMMIN(procunit_height, (v_end - i + 15) & ~15);
       const uint16_t *data_p = data + i * stride + j;
       uint16_t *dst_p = dst + i * dst_stride + j;
-// if the filter is 7-tap do only horizontal filtering for top and
-// bottom rows.
+      // Note h is at least 16
+      for (int b = 0; b < WIENER_HALFWIN - WIENER_BORDER_VERT; ++b) {
+        stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
+                               vertical_topbot, WIENER_BORDER_VERT + b, 1);
 #if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-      aom_highbd_convolve8_add_src_hip(
-          CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
-          dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
-          vertical_topbot, 16, w, 1, bit_depth);
+        aom_highbd_convolve8_add_src_hip(
+            CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
+            dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
+            vertical_topbot, 16, w, 1, bit_depth);
 #else
-      aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride,
-                                   CONVERT_TO_BYTEPTR(dst_p), dst_stride,
-                                   rst->rsi->wiener_info[tile_idx].hfilter, 16,
-                                   vertical_topbot, 16, w, 1, bit_depth);
+        aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride,
+                                     CONVERT_TO_BYTEPTR(dst_p), dst_stride,
+                                     rst->rsi->wiener_info[tile_idx].hfilter,
+                                     16, vertical_topbot, 16, w, 1, bit_depth);
 #endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-      data_p += stride;
-      dst_p += dst_stride;
-// Note h is at least 16
+        data_p += stride;
+        dst_p += dst_stride;
+      }
 #if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
       aom_highbd_convolve8_add_src_hip(
           CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
           dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
-          rst->rsi->wiener_info[tile_idx].vfilter, 16, w, h - 2, bit_depth);
+          rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
+          h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2, bit_depth);
 #else
       aom_highbd_convolve8_add_src(
           CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
           dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
-          rst->rsi->wiener_info[tile_idx].vfilter, 16, w, h - 2, bit_depth);
+          rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
+          h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2, bit_depth);
 #endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-      data_p += stride * (h - 2);
-      dst_p += dst_stride * (h - 2);
+      data_p += stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
+      dst_p += dst_stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
+      for (int b = WIENER_HALFWIN - WIENER_BORDER_VERT - 1; b >= 0; --b) {
+        stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
+                               vertical_topbot, WIENER_BORDER_VERT + b, 0);
 #if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
-      aom_highbd_convolve8_add_src_hip(
-          CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
-          dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
-          vertical_topbot, 16, w, 1, bit_depth);
+        aom_highbd_convolve8_add_src_hip(
+            CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
+            dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
+            vertical_topbot, 16, w, 1, bit_depth);
 #else
-      aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride,
-                                   CONVERT_TO_BYTEPTR(dst_p), dst_stride,
-                                   rst->rsi->wiener_info[tile_idx].hfilter, 16,
-                                   vertical_topbot, 16, w, 1, bit_depth);
+        aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride,
+                                     CONVERT_TO_BYTEPTR(dst_p), dst_stride,
+                                     rst->rsi->wiener_info[tile_idx].hfilter,
+                                     16, vertical_topbot, 16, w, 1, bit_depth);
 #endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
+        data_p += stride;
+        dst_p += dst_stride;
+      }
     }
 }
 
@@ -1123,14 +1182,18 @@
                                          int stride, int32_t *dst,
                                          int dst_stride, int bit_depth, int r,
                                          int eps, int32_t *tmpbuf) {
+  const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
+  int32_t *dgd32 =
+      tmpbuf + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
   int i, j;
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      dst[i * dst_stride + j] = dgd[i * stride + j];
+  for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
+    for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
+      dgd32[i * dgd32_stride + j] = dgd[i * stride + j];
     }
   }
-  av1_selfguided_restoration_internal(dst, width, height, dst_stride, bit_depth,
-                                      r, eps, tmpbuf);
+  av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst,
+                                      dst_stride, bit_depth, r, eps,
+                                      tmpbuf + RESTORATION_TILEPELS_MAX);
 }
 
 void av1_highpass_filter_highbd_c(uint16_t *dgd, int width, int height,
@@ -1288,7 +1351,7 @@
       int h = AOMMIN(procunit_height, v_end - i);
       uint16_t *data_p = data + i * stride + j;
       uint16_t *dst_p = dst + i * dst_stride + j;
-      apply_selfguided_restoration_highbd(
+      apply_selfguided_restoration_highbd_c(
           data_p, w, h, stride, bit_depth, rst->rsi->sgrproj_info[tile_idx].ep,
           rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, dst_stride, rst->tmpbuf);
     }
@@ -1301,6 +1364,7 @@
   int tile_idx;
   uint16_t *data = CONVERT_TO_SHORTPTR(data8);
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  extend_frame_highbd(data, width, height, stride);
   for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
     loop_sgrproj_filter_tile_highbd(data, tile_idx, width, height, stride, rst,
                                     bit_depth, dst, dst_stride);
diff --git a/av1/common/restoration.h b/av1/common/restoration.h
index 9eee959..30ddc11 100644
--- a/av1/common/restoration.h
+++ b/av1/common/restoration.h
@@ -25,24 +25,42 @@
 #define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5))
 
 #define RESTORATION_PROC_UNIT_SIZE 64
+// Determines the line buffer requirement for LR. Should be set to the
+// max of SGRPROJ_BORDER_VERT and WIENER_BORDER_VERT
+#define RESTORATION_BORDER_VERT 0
+#define RESTORATION_BORDER_HORZ 3  // Do not change this
+
+// Pad up to 20 more (much less may actually be needed)
+#define RESTORATION_PADDING 20
+#define RESTORATION_PROC_UNIT_PELS                             \
+  ((RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_HORZ * 2 + \
+    RESTORATION_PADDING) *                                     \
+   (RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_VERT * 2 + \
+    RESTORATION_PADDING))
 
 #define RESTORATION_TILESIZE_MAX 256
-#define RESTORATION_TILEPELS_MAX \
-  (RESTORATION_TILESIZE_MAX * RESTORATION_TILESIZE_MAX * 9 / 4)
+#define RESTORATION_TILEPELS_MAX                                     \
+  (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ) * \
+      (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT)
 
 // 4 32-bit buffers needed for the filter:
 // 2 for the restored versions of the frame and
 // 2 for each restoration operation
-#define SGRPROJ_OUTBUF_SIZE \
-  ((RESTORATION_TILESIZE_MAX * 3 / 2) * (RESTORATION_TILESIZE_MAX * 3 / 2 + 16))
+#define SGRPROJ_OUTBUF_SIZE                                           \
+  ((RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT) * \
+   (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16))
 #define SGRPROJ_TMPBUF_SIZE                         \
   (RESTORATION_TILEPELS_MAX * 2 * sizeof(int32_t) + \
-   SGRPROJ_OUTBUF_SIZE * 2 * sizeof(int32_t))
+   SGRPROJ_OUTBUF_SIZE * 3 * sizeof(int32_t) + 2 * RESTORATION_PROC_UNIT_PELS)
+
 #define SGRPROJ_EXTBUF_SIZE (0)
 #define SGRPROJ_PARAMS_BITS 4
 #define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS)
 #define USE_HIGHPASS_IN_SGRPROJ 0
 
+#define SGRPROJ_BORDER_VERT 0  // Vertical border used for sgr
+#define SGRPROJ_BORDER_HORZ 2  // Horizontal border used for sgr
+
 // Precision bits for projection
 #define SGRPROJ_PRJ_BITS 7
 // Restoration precision bits generated higher than source before projection
@@ -74,6 +92,8 @@
 #define SGRPROJ_RECIP_BITS 12
 
 #define WIENER_HALFWIN 3
+#define WIENER_BORDER_HORZ (WIENER_HALFWIN)
+#define WIENER_BORDER_VERT 0
 #define WIENER_HALFWIN1 (WIENER_HALFWIN + 1)
 #define WIENER_WIN (2 * WIENER_HALFWIN + 1)
 #define WIENER_WIN2 ((WIENER_WIN) * (WIENER_WIN))
diff --git a/av1/common/x86/selfguided_sse4.c b/av1/common/x86/selfguided_sse4.c
index 123353e..d0716e6 100644
--- a/av1/common/x86/selfguided_sse4.c
+++ b/av1/common/x86/selfguided_sse4.c
@@ -663,8 +663,11 @@
 }
 
 void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
-                                       int stride, int32_t *dst, int dst_stride,
-                                       int r, int eps, int32_t *tmpbuf) {
+                                       int dgd_stride, int32_t *dst,
+                                       int dst_stride, int r, int eps,
+                                       int32_t *tmpbuf) {
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
   int32_t *A = tmpbuf;
   int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
   int i, j;
@@ -676,25 +679,31 @@
   // Don't filter tiles with dimensions < 5 on any axis
   if ((width < 5) || (height < 5)) return;
 
+  uint8_t *dgd0 = dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ;
   if (r == 1) {
-    selfguided_restoration_1_v(dgd, width, height, stride, A, B, buf_stride);
-    selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, 8);
+    selfguided_restoration_1_v(dgd0, width_ext, height_ext, dgd_stride, A, B,
+                               buf_stride);
+    selfguided_restoration_1_h(A, B, width_ext, height_ext, buf_stride, eps, 8);
   } else if (r == 2) {
-    selfguided_restoration_2_v(dgd, width, height, stride, A, B, buf_stride);
-    selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, 8);
+    selfguided_restoration_2_v(dgd0, width_ext, height_ext, dgd_stride, A, B,
+                               buf_stride);
+    selfguided_restoration_2_h(A, B, width_ext, height_ext, buf_stride, eps, 8);
   } else if (r == 3) {
-    selfguided_restoration_3_v(dgd, width, height, stride, A, B, buf_stride);
-    selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, 8);
+    selfguided_restoration_3_v(dgd0, width_ext, height_ext, dgd_stride, A, B,
+                               buf_stride);
+    selfguided_restoration_3_h(A, B, width_ext, height_ext, buf_stride, eps, 8);
   } else {
     assert(0);
   }
+  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
 
   {
     i = 0;
     j = 0;
     {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 3;
       const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] +
@@ -706,7 +715,7 @@
     }
     for (j = 1; j < width - 1; ++j) {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 3;
       const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
@@ -719,7 +728,7 @@
     j = width - 1;
     {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 3;
       const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] +
@@ -734,7 +743,7 @@
     j = 0;
     {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 3;
       const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
@@ -750,7 +759,7 @@
     // Vectorize the innermost loop
     for (j = 1; j < width - 1; j += 4) {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 5;
 
@@ -803,7 +812,7 @@
     // (typically have 2 such pixels, but may have anywhere between 0 and 3)
     for (; j < width - 1; ++j) {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 5;
       const int32_t a =
@@ -825,7 +834,7 @@
     j = width - 1;
     {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 3;
       const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
@@ -844,7 +853,7 @@
     j = 0;
     {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 3;
       const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] +
@@ -856,7 +865,7 @@
     }
     for (j = 1; j < width - 1; ++j) {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 3;
       const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
@@ -869,7 +878,7 @@
     j = width - 1;
     {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 3;
       const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] +
@@ -1363,10 +1372,12 @@
 }
 
 void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
-                                              int height, int stride,
+                                              int height, int dgd_stride,
                                               int32_t *dst, int dst_stride,
                                               int bit_depth, int r, int eps,
                                               int32_t *tmpbuf) {
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
   int32_t *A = tmpbuf;
   int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
   int i, j;
@@ -1378,28 +1389,34 @@
   // Don't filter tiles with dimensions < 5 on any axis
   if ((width < 5) || (height < 5)) return;
 
+  uint16_t *dgd0 = dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ;
   if (r == 1) {
-    highbd_selfguided_restoration_1_v(dgd, width, height, stride, A, B,
-                                      buf_stride);
-    selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, bit_depth);
+    highbd_selfguided_restoration_1_v(dgd0, width_ext, height_ext, dgd_stride,
+                                      A, B, buf_stride);
+    selfguided_restoration_1_h(A, B, width_ext, height_ext, buf_stride, eps,
+                               bit_depth);
   } else if (r == 2) {
-    highbd_selfguided_restoration_2_v(dgd, width, height, stride, A, B,
-                                      buf_stride);
-    selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, bit_depth);
+    highbd_selfguided_restoration_2_v(dgd0, width_ext, height_ext, dgd_stride,
+                                      A, B, buf_stride);
+    selfguided_restoration_2_h(A, B, width_ext, height_ext, buf_stride, eps,
+                               bit_depth);
   } else if (r == 3) {
-    highbd_selfguided_restoration_3_v(dgd, width, height, stride, A, B,
-                                      buf_stride);
-    selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, bit_depth);
+    highbd_selfguided_restoration_3_v(dgd0, width_ext, height_ext, dgd_stride,
+                                      A, B, buf_stride);
+    selfguided_restoration_3_h(A, B, width_ext, height_ext, buf_stride, eps,
+                               bit_depth);
   } else {
     assert(0);
   }
+  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
 
   {
     i = 0;
     j = 0;
     {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 3;
       const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] +
@@ -1411,7 +1428,7 @@
     }
     for (j = 1; j < width - 1; ++j) {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 3;
       const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
@@ -1424,7 +1441,7 @@
     j = width - 1;
     {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 3;
       const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] +
@@ -1439,7 +1456,7 @@
     j = 0;
     {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 3;
       const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
@@ -1455,7 +1472,7 @@
     // Vectorize the innermost loop
     for (j = 1; j < width - 1; j += 4) {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 5;
 
@@ -1508,7 +1525,7 @@
     // (typically have 2 such pixels, but may have anywhere between 0 and 3)
     for (; j < width - 1; ++j) {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 5;
       const int32_t a =
@@ -1530,7 +1547,7 @@
     j = width - 1;
     {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 3;
       const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
@@ -1549,7 +1566,7 @@
     j = 0;
     {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 3;
       const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] +
@@ -1561,7 +1578,7 @@
     }
     for (j = 1; j < width - 1; ++j) {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 3;
       const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
@@ -1574,7 +1591,7 @@
     j = width - 1;
     {
       const int k = i * buf_stride + j;
-      const int l = i * stride + j;
+      const int l = i * dgd_stride + j;
       const int m = i * dst_stride + j;
       const int nb = 3;
       const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] +
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 76cc882..dd18053 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -5358,6 +5358,7 @@
     *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
   }
 
+  aom_extend_frame_borders(new_fb);
 #if CONFIG_CDEF
   if (!cm->skip_loop_filter && !cm->all_lossless) {
     av1_cdef_frame(&pbi->cur_buf->buf, cm, &pbi->mb);
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index eff1fa3..7487e11 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -4183,6 +4183,8 @@
       av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
 #endif
   }
+  aom_extend_frame_borders(cm->frame_to_show);
+
 #if CONFIG_CDEF
   if (is_lossless_requested(&cpi->oxcf)) {
     cm->cdef_bits = 0;
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index 210120a..98fbfb5 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -637,6 +637,15 @@
   // Compute best Sgrproj filters for each rtile, one (encoder/decoder)
   // tile at a time.
   const AV1_COMMON *const cm = &cpi->common;
+#if CONFIG_HIGHBITDEPTH
+  if (cm->use_highbitdepth)
+    extend_frame_highbd(CONVERT_TO_SHORTPTR(ctxt.dgd_buffer), ctxt.plane_width,
+                        ctxt.plane_height, ctxt.dgd_stride);
+  else
+#endif
+    extend_frame(ctxt.dgd_buffer, ctxt.plane_width, ctxt.plane_height,
+                 ctxt.dgd_stride);
+
   for (int tile_row = 0; tile_row < cm->tile_rows; ++tile_row) {
     for (int tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
       SgrprojInfo ref_sgrproj_info;
diff --git a/test/selfguided_filter_test.cc b/test/selfguided_filter_test.cc
index eaf5520..63b5bac 100644
--- a/test/selfguided_filter_test.cc
+++ b/test/selfguided_filter_test.cc
@@ -40,18 +40,23 @@
 
  protected:
   void RunSpeedTest() {
-    const int w = 256, h = 256;
+    const int width = 256, height = 256, stride = 288, out_stride = 288;
     const int NUM_ITERS = 2000;
     int i, j;
 
-    uint8_t *input = (uint8_t *)aom_memalign(16, w * h * sizeof(uint8_t));
-    uint8_t *output = (uint8_t *)aom_memalign(16, w * h * sizeof(uint8_t));
+    uint8_t *input_ =
+        (uint8_t *)aom_memalign(16, stride * (height + 32) * sizeof(uint8_t));
+    uint8_t *output_ = (uint8_t *)aom_memalign(
+        16, out_stride * (height + 32) * sizeof(uint8_t));
     int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
+    uint8_t *input = input_ + stride * 16 + 16;
+    uint8_t *output = output_ + out_stride * 16 + 16;
 
     ACMRandom rnd(ACMRandom::DeterministicSeed());
 
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) input[i * w + j] = rnd.Rand16() & 0xFF;
+    for (i = -16; i < height + 16; ++i)
+      for (j = -16; j < width + 16; ++j)
+        input[i * stride + j] = rnd.Rand16() & 0xFF;
 
     int xqd[2] = {
       SGRPROJ_PRJ_MIN0 +
@@ -67,16 +72,17 @@
 
     std::clock_t start = std::clock();
     for (i = 0; i < NUM_ITERS; ++i) {
-      apply_selfguided_restoration(input, w, h, w, eps, xqd, output, w, tmpbuf);
+      apply_selfguided_restoration(input, width, height, stride, eps, xqd,
+                                   output, out_stride, tmpbuf);
     }
     std::clock_t end = std::clock();
     double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
 
-    printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, w, h,
-           elapsed, elapsed * 1000000. / NUM_ITERS);
+    printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, width,
+           height, elapsed, elapsed * 1000000. / NUM_ITERS);
 
-    aom_free(input);
-    aom_free(output);
+    aom_free(input_);
+    aom_free(output_);
     aom_free(tmpbuf);
   }
 
@@ -88,21 +94,26 @@
     const int NUM_ITERS = 81;
     int i, j, k;
 
-    uint8_t *input =
-        (uint8_t *)aom_memalign(16, stride * max_h * sizeof(uint8_t));
-    uint8_t *output =
-        (uint8_t *)aom_memalign(16, out_stride * max_h * sizeof(uint8_t));
-    uint8_t *output2 =
-        (uint8_t *)aom_memalign(16, out_stride * max_h * sizeof(uint8_t));
+    uint8_t *input_ =
+        (uint8_t *)aom_memalign(16, stride * (max_h + 32) * sizeof(uint8_t));
+    uint8_t *output_ = (uint8_t *)aom_memalign(
+        16, out_stride * (max_h + 32) * sizeof(uint8_t));
+    uint8_t *output2_ = (uint8_t *)aom_memalign(
+        16, out_stride * (max_h + 32) * sizeof(uint8_t));
     int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
 
+    uint8_t *input = input_ + stride * 16 + 16;
+    uint8_t *output = output_ + out_stride * 16 + 16;
+    uint8_t *output2 = output2_ + out_stride * 16 + 16;
+
     ACMRandom rnd(ACMRandom::DeterministicSeed());
 
     av1_loop_restoration_precal();
 
     for (i = 0; i < NUM_ITERS; ++i) {
-      for (j = 0; j < max_h; ++j)
-        for (k = 0; k < max_w; ++k) input[j * stride + k] = rnd.Rand16() & 0xFF;
+      for (j = -16; j < max_h + 16; ++j)
+        for (k = -16; k < max_w + 16; ++k)
+          input[j * stride + k] = rnd.Rand16() & 0xFF;
 
       int xqd[2] = {
         SGRPROJ_PRJ_MIN0 +
@@ -121,13 +132,14 @@
       apply_selfguided_restoration_c(input, test_w, test_h, stride, eps, xqd,
                                      output2, out_stride, tmpbuf);
       for (j = 0; j < test_h; ++j)
-        for (k = 0; k < test_w; ++k)
+        for (k = 0; k < test_w; ++k) {
           ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
+        }
     }
 
-    aom_free(input);
-    aom_free(output);
-    aom_free(output2);
+    aom_free(input_);
+    aom_free(output_);
+    aom_free(output2_);
     aom_free(tmpbuf);
   }
 };
@@ -155,20 +167,25 @@
 
  protected:
   void RunSpeedTest() {
-    const int w = 256, h = 256;
+    const int width = 256, height = 256, stride = 288, out_stride = 288;
     const int NUM_ITERS = 2000;
     int i, j;
     int bit_depth = GET_PARAM(0);
     int mask = (1 << bit_depth) - 1;
 
-    uint16_t *input = (uint16_t *)aom_memalign(16, w * h * sizeof(uint16_t));
-    uint16_t *output = (uint16_t *)aom_memalign(16, w * h * sizeof(uint16_t));
+    uint16_t *input_ =
+        (uint16_t *)aom_memalign(16, stride * (height + 32) * sizeof(uint16_t));
+    uint16_t *output_ = (uint16_t *)aom_memalign(
+        16, out_stride * (height + 32) * sizeof(uint16_t));
     int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
+    uint16_t *input = input_ + stride * 16 + 16;
+    uint16_t *output = output_ + out_stride * 16 + 16;
 
     ACMRandom rnd(ACMRandom::DeterministicSeed());
 
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) input[i * w + j] = rnd.Rand16() & mask;
+    for (i = -16; i < height + 16; ++i)
+      for (j = -16; j < width + 16; ++j)
+        input[i * stride + j] = rnd.Rand16() & mask;
 
     int xqd[2] = {
       SGRPROJ_PRJ_MIN0 +
@@ -184,17 +201,18 @@
 
     std::clock_t start = std::clock();
     for (i = 0; i < NUM_ITERS; ++i) {
-      apply_selfguided_restoration_highbd(input, w, h, w, bit_depth, eps, xqd,
-                                          output, w, tmpbuf);
+      apply_selfguided_restoration_highbd(input, width, height, stride,
+                                          bit_depth, eps, xqd, output,
+                                          out_stride, tmpbuf);
     }
     std::clock_t end = std::clock();
     double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
 
-    printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, w, h,
-           elapsed, elapsed * 1000000. / NUM_ITERS);
+    printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, width,
+           height, elapsed, elapsed * 1000000. / NUM_ITERS);
 
-    aom_free(input);
-    aom_free(output);
+    aom_free(input_);
+    aom_free(output_);
     aom_free(tmpbuf);
   }
 
@@ -208,21 +226,26 @@
     int bit_depth = GET_PARAM(0);
     int mask = (1 << bit_depth) - 1;
 
-    uint16_t *input =
-        (uint16_t *)aom_memalign(16, stride * max_h * sizeof(uint16_t));
-    uint16_t *output =
-        (uint16_t *)aom_memalign(16, out_stride * max_h * sizeof(uint16_t));
-    uint16_t *output2 =
-        (uint16_t *)aom_memalign(16, out_stride * max_h * sizeof(uint16_t));
+    uint16_t *input_ =
+        (uint16_t *)aom_memalign(16, stride * (max_h + 32) * sizeof(uint16_t));
+    uint16_t *output_ = (uint16_t *)aom_memalign(
+        16, out_stride * (max_h + 32) * sizeof(uint16_t));
+    uint16_t *output2_ = (uint16_t *)aom_memalign(
+        16, out_stride * (max_h + 32) * sizeof(uint16_t));
     int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
 
+    uint16_t *input = input_ + stride * 16 + 16;
+    uint16_t *output = output_ + out_stride * 16 + 16;
+    uint16_t *output2 = output2_ + out_stride * 16 + 16;
+
     ACMRandom rnd(ACMRandom::DeterministicSeed());
 
     av1_loop_restoration_precal();
 
     for (i = 0; i < NUM_ITERS; ++i) {
-      for (j = 0; j < max_h; ++j)
-        for (k = 0; k < max_w; ++k) input[j * stride + k] = rnd.Rand16() & mask;
+      for (j = -16; j < max_h + 16; ++j)
+        for (k = -16; k < max_w + 16; ++k)
+          input[j * stride + k] = rnd.Rand16() & mask;
 
       int xqd[2] = {
         SGRPROJ_PRJ_MIN0 +
@@ -247,9 +270,9 @@
           ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
     }
 
-    aom_free(input);
-    aom_free(output);
-    aom_free(output2);
+    aom_free(input_);
+    aom_free(output_);
+    aom_free(output2_);
     aom_free(tmpbuf);
   }
 };