Add guided projection filter to loop restoration

BDRATE:
lowres: -1.01% (up from -0.7%)
midres: -1.90% (up from -1.5%)
hdres:  -2.11% (up from about -1.7%)

Change-Id: I1fe04ec9ef90ccc4cc990e09cd45eea82c752e0c
diff --git a/av1/common/alloccommon.c b/av1/common/alloccommon.c
index 7c5b358..90f0ba5 100644
--- a/av1/common/alloccommon.c
+++ b/av1/common/alloccommon.c
@@ -89,6 +89,8 @@
   cm->rst_info.bilateral_info = NULL;
   aom_free(cm->rst_info.wiener_info);
   cm->rst_info.wiener_info = NULL;
+  aom_free(cm->rst_info.sgrproj_info);
+  cm->rst_info.sgrproj_info = NULL;
 }
 #endif  // CONFIG_LOOP_RESTORATION
 
diff --git a/av1/common/entropymode.c b/av1/common/entropymode.c
index b3789e2..405c983 100644
--- a/av1/common/entropymode.c
+++ b/av1/common/entropymode.c
@@ -951,13 +951,13 @@
                                           };
 
 #if CONFIG_LOOP_RESTORATION
-const aom_tree_index
-    av1_switchable_restore_tree[TREE_SIZE(RESTORE_SWITCHABLE_TYPES)] = {
-      -RESTORE_NONE, 2, -RESTORE_BILATERAL, -RESTORE_WIENER,
-    };
+const aom_tree_index av1_switchable_restore_tree[TREE_SIZE(
+    RESTORE_SWITCHABLE_TYPES)] = {
+  -RESTORE_NONE, 2, -RESTORE_SGRPROJ, 4, -RESTORE_BILATERAL, -RESTORE_WIENER,
+};
 
-static const aom_prob
-    default_switchable_restore_prob[RESTORE_SWITCHABLE_TYPES - 1] = { 32, 128 };
+static const aom_prob default_switchable_restore_prob[RESTORE_SWITCHABLE_TYPES -
+                                                      1] = { 32, 85, 128 };
 #endif  // CONFIG_LOOP_RESTORATION
 
 #if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_VAR_TX
diff --git a/av1/common/entropymode.h b/av1/common/entropymode.h
index 147b7e9..37fe2b5 100644
--- a/av1/common/entropymode.h
+++ b/av1/common/entropymode.h
@@ -343,6 +343,7 @@
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 
 #if CONFIG_LOOP_RESTORATION
+#define RESTORE_NONE_SGRPROJ_PROB 64
 #define RESTORE_NONE_BILATERAL_PROB 16
 #define RESTORE_NONE_WIENER_PROB 64
 extern const aom_tree_index
diff --git a/av1/common/enums.h b/av1/common/enums.h
index 6c0eb3d..ebf520a 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -463,10 +463,11 @@
 
 #if CONFIG_LOOP_RESTORATION
 typedef enum {
-  RESTORE_NONE,
-  RESTORE_BILATERAL,
-  RESTORE_WIENER,
-  RESTORE_SWITCHABLE,
+  RESTORE_NONE = 0,
+  RESTORE_SGRPROJ = 1,
+  RESTORE_BILATERAL = 2,
+  RESTORE_WIENER = 3,
+  RESTORE_SWITCHABLE = 4,
   RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE,
   RESTORE_TYPES,
 } RestorationType;
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index f1c4239..59ffab9 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -54,6 +54,12 @@
       { 56, 56, 48 }, { 56, 56, 56 }, { 56, 56, 64 }, { 64, 64, 48 },
     };
 
+const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
+  // r1, eps1, r2, eps2
+  { 2, 27, 1, 11 }, { 2, 31, 1, 12 }, { 2, 37, 1, 12 }, { 2, 44, 1, 12 },
+  { 2, 49, 1, 13 }, { 2, 54, 1, 14 }, { 2, 60, 1, 15 }, { 2, 68, 1, 15 },
+};
+
 typedef void (*restore_func_type)(uint8_t *data8, int width, int height,
                                   int stride, RestorationInternal *rst,
                                   uint8_t *tmpdata8, int tmpstride);
@@ -317,9 +323,9 @@
 static void loop_wiener_filter(uint8_t *data, int width, int height, int stride,
                                RestorationInternal *rst, uint8_t *tmpdata,
                                int tmpstride) {
-  int i, tile_idx;
+  int tile_idx;
+  int i;
   uint8_t *data_p, *tmpdata_p;
-
   // Initialize tmp buffer
   data_p = data;
   tmpdata_p = tmpdata;
@@ -334,11 +340,349 @@
   }
 }
 
+// Box filter: for each pixel of the width x height block in src, writes to dst
+// the sum of src (or of src squared when sqr is nonzero) over the
+// (2r+1) x (2r+1) window centred on that pixel, clipped to the block.
+// tmp is scratch space with its own stride. Rows and columns up to index 2r
+// are read unconditionally, so height and width must both exceed 2 * r.
+static void boxsum(int64_t *src, int width, int height, int src_stride, int r,
+                   int sqr, int64_t *dst, int dst_stride, int64_t *tmp,
+                   int tmp_stride) {
+  int i, j;
+  // Vertical pass: tmp accumulates running column sums of src (or src^2).
+  for (j = 0; j < width; ++j) tmp[j] = sqr ? src[j] * src[j] : src[j];
+  for (j = 0; j < width; ++j)
+    for (i = 1; i < height; ++i) {
+      const int64_t s = src[i * src_stride + j];
+      tmp[i * tmp_stride + j] =
+          tmp[(i - 1) * tmp_stride + j] + (sqr ? s * s : s);
+    }
+  // A window sum is the difference of two running sums, clipped at the edges.
+  for (i = 0; i <= r; ++i)
+    memcpy(&dst[i * dst_stride], &tmp[(i + r) * tmp_stride],
+           sizeof(*tmp) * width);
+  for (i = r + 1; i < height - r; ++i)
+    for (j = 0; j < width; ++j)
+      dst[i * dst_stride + j] =
+          tmp[(i + r) * tmp_stride + j] - tmp[(i - r - 1) * tmp_stride + j];
+  for (i = height - r; i < height; ++i)
+    for (j = 0; j < width; ++j)
+      dst[i * dst_stride + j] = tmp[(height - 1) * tmp_stride + j] -
+                                tmp[(i - r - 1) * tmp_stride + j];
+  // Horizontal pass: running row sums of the vertical result held in dst.
+  // dst is indexed with dst_stride throughout (it may differ from src_stride).
+  for (i = 0; i < height; ++i) tmp[i * tmp_stride] = dst[i * dst_stride];
+  for (i = 0; i < height; ++i)
+    for (j = 1; j < width; ++j)
+      tmp[i * tmp_stride + j] =
+          tmp[i * tmp_stride + j - 1] + dst[i * dst_stride + j];
+
+  for (j = 0; j <= r; ++j)
+    for (i = 0; i < height; ++i)
+      dst[i * dst_stride + j] = tmp[i * tmp_stride + j + r];
+  for (j = r + 1; j < width - r; ++j)
+    for (i = 0; i < height; ++i)
+      dst[i * dst_stride + j] =
+          tmp[i * tmp_stride + j + r] - tmp[i * tmp_stride + j - r - 1];
+  for (j = width - r; j < width; ++j)
+    for (i = 0; i < height; ++i)
+      dst[i * dst_stride + j] =
+          tmp[i * tmp_stride + width - 1] - tmp[i * tmp_stride + j - r - 1];
+}
+
+static void boxnum(int width, int height, int r, int8_t *num, int num_stride) {
+  int i, j;  // i: row, j: column; num gets one count per pixel, by region
+  for (i = 0; i <= r; ++i) {  // corners: top-left value mirrored to the others
+    for (j = 0; j <= r; ++j) {
+      num[i * num_stride + j] = (r + 1 + i) * (r + 1 + j);
+      num[i * num_stride + (width - 1 - j)] = num[i * num_stride + j];
+      num[(height - 1 - i) * num_stride + j] = num[i * num_stride + j];
+      num[(height - 1 - i) * num_stride + (width - 1 - j)] =
+          num[i * num_stride + j];
+    }
+  }
+  for (j = 0; j <= r; ++j) {  // left/right edge columns, rows between corners
+    const int val = (2 * r + 1) * (r + 1 + j);
+    for (i = r + 1; i < height - r; ++i) {
+      num[i * num_stride + j] = val;
+      num[i * num_stride + (width - 1 - j)] = val;
+    }
+  }
+  for (i = 0; i <= r; ++i) {  // top/bottom edge rows, columns between corners
+    const int val = (2 * r + 1) * (r + 1 + i);
+    for (j = r + 1; j < width - r; ++j) {
+      num[i * num_stride + j] = val;
+      num[(height - 1 - i) * num_stride + j] = val;
+    }
+  }
+  for (i = r + 1; i < height - r; ++i) {  // interior
+    for (j = r + 1; j < width - r; ++j) {
+      num[i * num_stride + j] = (2 * r + 1) * (2 * r + 1);  // int8_t: r <= 5
+    }
+  }
+}
+
+void decode_xq(int *xqd, int *xq) {  // reads xqd[0..1], writes xq[0..1]
+  xq[0] = -xqd[0];
+  xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];  // uses the new xq[0]
+}
+
+#define APPROXIMATE_SGR 1
+void av1_selfguided_restoration(int64_t *dgd, int width, int height, int stride,
+                                int bit_depth, int r, int eps, void *tmpbuf) {
+  int64_t *A = (int64_t *)tmpbuf;  // tmpbuf is carved into planes A, B and T
+  int64_t *B = A + RESTORATION_TILEPELS_MAX;
+  int64_t *T = B + RESTORATION_TILEPELS_MAX;  // scratch handed to boxsum
+  int8_t num[RESTORATION_TILEPELS_MAX];  // NOTE(review): large stack array
+  int i, j;
+  eps <<= 2 * (bit_depth - 8);  // scale by 4^(bit_depth - 8); needs depth >= 8
+  // B = box sums of dgd, A = box sums of dgd^2, num = per-pixel counts.
+  boxsum(dgd, width, height, stride, r, 0, B, width, T, width);
+  boxsum(dgd, width, height, stride, r, 1, A, width, T, width);
+  boxnum(width, height, r, num, width);
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      const int k = i * width + j;
+      const int n = num[k];
+      int64_t den;
+      A[k] = A[k] * n - B[k] * B[k];  // n * sum(x^2) - sum(x)^2
+      den = A[k] + n * n * eps;
+      A[k] = ((A[k] << SGRPROJ_SGR_BITS) + (den >> 1)) / den;  // rounded ratio
+      B[k] = ((SGRPROJ_SGR - A[k]) * B[k] + (n >> 1)) / n;  // uses updated A[k]
+    }
+  }
+#if APPROXIMATE_SGR  // fixed-weight 3x3 smoothing of A and B, sum = 1 << nb
+  i = 0;  // top-left corner
+  j = 0;
+  {
+    const int k = i * width + j;
+    const int l = i * stride + j;
+    const int nb = 3;
+    const int64_t a =
+        3 * A[k] + 2 * A[k + 1] + 2 * A[k + width] + A[k + width + 1];
+    const int64_t b =
+        3 * B[k] + 2 * B[k + 1] + 2 * B[k + width] + B[k + width + 1];
+    const int64_t v =
+        (((a * dgd[l] + b) << SGRPROJ_RST_BITS) + (1 << nb) / 2) >> nb;
+    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS);  // in-place output
+  }
+  i = 0;  // top-right corner
+  j = width - 1;
+  {
+    const int k = i * width + j;
+    const int l = i * stride + j;
+    const int nb = 3;
+    const int64_t a =
+        3 * A[k] + 2 * A[k - 1] + 2 * A[k + width] + A[k + width - 1];
+    const int64_t b =
+        3 * B[k] + 2 * B[k - 1] + 2 * B[k + width] + B[k + width - 1];
+    const int64_t v =
+        (((a * dgd[l] + b) << SGRPROJ_RST_BITS) + (1 << nb) / 2) >> nb;
+    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS);
+  }
+  i = height - 1;  // bottom-left corner
+  j = 0;
+  {
+    const int k = i * width + j;
+    const int l = i * stride + j;
+    const int nb = 3;
+    const int64_t a =
+        3 * A[k] + 2 * A[k + 1] + 2 * A[k - width] + A[k - width + 1];
+    const int64_t b =
+        3 * B[k] + 2 * B[k + 1] + 2 * B[k - width] + B[k - width + 1];
+    const int64_t v =
+        (((a * dgd[l] + b) << SGRPROJ_RST_BITS) + (1 << nb) / 2) >> nb;
+    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS);
+  }
+  i = height - 1;  // bottom-right corner
+  j = width - 1;
+  {
+    const int k = i * width + j;
+    const int l = i * stride + j;
+    const int nb = 3;
+    const int64_t a =
+        3 * A[k] + 2 * A[k - 1] + 2 * A[k - width] + A[k - width - 1];
+    const int64_t b =
+        3 * B[k] + 2 * B[k - 1] + 2 * B[k - width] + B[k - width - 1];
+    const int64_t v =
+        (((a * dgd[l] + b) << SGRPROJ_RST_BITS) + (1 << nb) / 2) >> nb;
+    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS);
+  }
+  i = 0;  // top row, excluding corners
+  for (j = 1; j < width - 1; ++j) {
+    const int k = i * width + j;
+    const int l = i * stride + j;
+    const int nb = 3;
+    const int64_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + width] +
+                      A[k + width - 1] + A[k + width + 1];
+    const int64_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + width] +
+                      B[k + width - 1] + B[k + width + 1];
+    const int64_t v =
+        (((a * dgd[l] + b) << SGRPROJ_RST_BITS) + (1 << nb) / 2) >> nb;
+    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS);
+  }
+  i = height - 1;  // bottom row, excluding corners
+  for (j = 1; j < width - 1; ++j) {
+    const int k = i * width + j;
+    const int l = i * stride + j;
+    const int nb = 3;
+    const int64_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - width] +
+                      A[k - width - 1] + A[k - width + 1];
+    const int64_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - width] +
+                      B[k - width - 1] + B[k - width + 1];
+    const int64_t v =
+        (((a * dgd[l] + b) << SGRPROJ_RST_BITS) + (1 << nb) / 2) >> nb;
+    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS);
+  }
+  j = 0;  // left column, excluding corners
+  for (i = 1; i < height - 1; ++i) {
+    const int k = i * width + j;
+    const int l = i * stride + j;
+    const int nb = 3;
+    const int64_t a = A[k] + 2 * (A[k - width] + A[k + width]) + A[k + 1] +
+                      A[k - width + 1] + A[k + width + 1];
+    const int64_t b = B[k] + 2 * (B[k - width] + B[k + width]) + B[k + 1] +
+                      B[k - width + 1] + B[k + width + 1];
+    const int64_t v =
+        (((a * dgd[l] + b) << SGRPROJ_RST_BITS) + (1 << nb) / 2) >> nb;
+    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS);
+  }
+  j = width - 1;  // right column, excluding corners
+  for (i = 1; i < height - 1; ++i) {
+    const int k = i * width + j;
+    const int l = i * stride + j;
+    const int nb = 3;
+    const int64_t a = A[k] + 2 * (A[k - width] + A[k + width]) + A[k - 1] +
+                      A[k - width - 1] + A[k + width - 1];
+    const int64_t b = B[k] + 2 * (B[k - width] + B[k + width]) + B[k - 1] +
+                      B[k - width - 1] + B[k + width - 1];
+    const int64_t v =
+        (((a * dgd[l] + b) << SGRPROJ_RST_BITS) + (1 << nb) / 2) >> nb;
+    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS);
+  }
+  for (i = 1; i < height - 1; ++i) {  // interior: full 3x3 neighbourhood
+    for (j = 1; j < width - 1; ++j) {
+      const int k = i * width + j;
+      const int l = i * stride + j;
+      const int nb = 5;  // weights 4 (centre and 4-neighbours) + 3 (diagonals)
+      const int64_t a =
+          (A[k] + A[k - 1] + A[k + 1] + A[k - width] + A[k + width]) * 4 +
+          (A[k - 1 - width] + A[k - 1 + width] + A[k + 1 - width] +
+           A[k + 1 + width]) *
+              3;
+      const int64_t b =
+          (B[k] + B[k - 1] + B[k + 1] + B[k - width] + B[k + width]) * 4 +
+          (B[k - 1 - width] + B[k - 1 + width] + B[k + 1 - width] +
+           B[k + 1 + width]) *
+              3;
+      const int64_t v =
+          (((a * dgd[l] + b) << SGRPROJ_RST_BITS) + (1 << nb) / 2) >> nb;
+      dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS);
+    }
+  }
+#else  // box-filter A and B with radius 1, then normalise by the counts
+  if (r > 1) boxnum(width, height, r = 1, num, width);
+  boxsum(A, width, height, width, r, 0, A, width, T, width);
+  boxsum(B, width, height, width, r, 0, B, width, T, width);
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      const int k = i * width + j;
+      const int l = i * stride + j;
+      const int n = num[k];
+      const int64_t v =
+          (((A[k] * dgd[l] + B[k]) << SGRPROJ_RST_BITS) + (n >> 1)) / n;
+      dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS);
+    }
+  }
+#endif  // APPROXIMATE_SGR
+}
+
+static void apply_selfguided_restoration(int64_t *dat, int width, int height,
+                                         int stride, int bit_depth, int eps,
+                                         int *xqd, void *tmpbuf) {
+  int xq[2];  // projection weights, filled by decode_xq(xqd, xq) below
+  int64_t *flt1 = (int64_t *)tmpbuf;  // tmpbuf layout: flt1, flt2, then scratch
+  int64_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
+  uint8_t *tmpbuf2 = (uint8_t *)(flt2 + RESTORATION_TILEPELS_MAX);
+  int i, j;
+  for (i = 0; i < height; ++i) {  // two packed (stride == width) copies of dat
+    for (j = 0; j < width; ++j) {
+      assert(i * width + j < RESTORATION_TILEPELS_MAX);
+      flt1[i * width + j] = dat[i * stride + j];
+      flt2[i * width + j] = dat[i * stride + j];
+    }
+  }
+  av1_selfguided_restoration(flt1, width, height, width, bit_depth,
+                             sgr_params[eps].r1, sgr_params[eps].e1, tmpbuf2);
+  av1_selfguided_restoration(flt2, width, height, width, bit_depth,
+                             sgr_params[eps].r2, sgr_params[eps].e2, tmpbuf2);
+  decode_xq(xqd, xq);  // note: eps above is an index into sgr_params
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      const int k = i * width + j;
+      const int l = i * stride + j;
+      const int64_t u = ((int64_t)dat[l] << SGRPROJ_RST_BITS);  // scaled source
+      const int64_t f1 = (int64_t)flt1[k] - u;  // filter 1 minus source
+      const int64_t f2 = (int64_t)flt2[k] - u;  // filter 2 minus source
+      const int64_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
+      const int16_t w =
+          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+      dat[l] = w;  // not clipped here; callers clip on write-back
+    }
+  }
+}
+
+static void loop_sgrproj_filter_tile(uint8_t *data, int tile_idx, int width,
+                                     int height, int stride,
+                                     RestorationInternal *rst, void *tmpbuf) {
+  const int tile_width = rst->tile_width >> rst->subsampling_x;
+  const int tile_height = rst->tile_height >> rst->subsampling_y;
+  int i, j;
+  int h_start, h_end, v_start, v_end;  // tile bounds within the plane
+  uint8_t *data_p;
+  int64_t dat[RESTORATION_TILEPELS_MAX];  // NOTE(review): > 1 MiB of stack
+  // Filters one restoration tile of an 8-bit plane in place; level 0 = skip.
+  if (rst->rsi->sgrproj_info[tile_idx].level == 0) return;
+  av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
+                           tile_width, tile_height, width, height, 0, 0,
+                           &h_start, &h_end, &v_start, &v_end);
+  data_p = data + h_start + v_start * stride;
+  for (i = 0; i < (v_end - v_start); ++i) {  // widen tile into packed dat
+    for (j = 0; j < (h_end - h_start); ++j) {
+      dat[i * (h_end - h_start) + j] = data_p[i * stride + j];
+    }
+  }
+  apply_selfguided_restoration(dat, h_end - h_start, v_end - v_start,
+                               h_end - h_start, 8,
+                               rst->rsi->sgrproj_info[tile_idx].ep,
+                               rst->rsi->sgrproj_info[tile_idx].xqd, tmpbuf);
+  for (i = 0; i < (v_end - v_start); ++i) {  // clip and write back
+    for (j = 0; j < (h_end - h_start); ++j) {
+      data_p[i * stride + j] = clip_pixel(dat[i * (h_end - h_start) + j]);
+    }
+  }
+}
+
+static void loop_sgrproj_filter(uint8_t *data, int width, int height,
+                                int stride, RestorationInternal *rst,
+                                uint8_t *tmpdata, int tmpstride) {
+  int tile_idx;
+  uint8_t *tmpbuf = aom_malloc(SGRPROJ_TMPBUF_SIZE);  // NOTE(review): unchecked
+  (void)tmpdata;  // unused; kept to match restore_func_type
+  (void)tmpstride;
+  for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {  // one shared tmpbuf
+    loop_sgrproj_filter_tile(data, tile_idx, width, height, stride, rst,
+                             tmpbuf);
+  }
+  aom_free(tmpbuf);
+}
+
 static void loop_switchable_filter(uint8_t *data, int width, int height,
                                    int stride, RestorationInternal *rst,
                                    uint8_t *tmpdata, int tmpstride) {
   int i, tile_idx;
   uint8_t *data_p, *tmpdata_p;
+  uint8_t *tmpbuf = aom_malloc(SGRPROJ_TMPBUF_SIZE);
 
   // Initialize tmp buffer
   data_p = data;
@@ -355,8 +699,12 @@
     } else if (rst->rsi->restoration_type[tile_idx] == RESTORE_WIENER) {
       loop_wiener_filter_tile(data, tile_idx, width, height, stride, rst,
                               tmpdata, tmpstride);
+    } else if (rst->rsi->restoration_type[tile_idx] == RESTORE_SGRPROJ) {
+      loop_sgrproj_filter_tile(data, tile_idx, width, height, stride, rst,
+                               tmpbuf);
     }
   }
+  aom_free(tmpbuf);
 }
 
 #if CONFIG_AOM_HIGHBITDEPTH
@@ -502,29 +850,69 @@
                                       int bit_depth) {
   uint16_t *data = CONVERT_TO_SHORTPTR(data8);
   uint16_t *tmpdata = CONVERT_TO_SHORTPTR(tmpdata8);
-  int i, tile_idx;
-  uint16_t *data_p, *tmpdata_p;
-
-  // Initialize tmp buffer
-  data_p = data;
-  tmpdata_p = tmpdata;
-  for (i = 0; i < height; ++i) {
-    memcpy(tmpdata_p, data_p, sizeof(*data_p) * width);
-    data_p += stride;
-    tmpdata_p += tmpstride;
-  }
+  int tile_idx;
   for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
     loop_wiener_filter_tile_highbd(data, tile_idx, width, height, stride, rst,
                                    tmpdata, tmpstride, bit_depth);
   }
 }
 
+static void loop_sgrproj_filter_tile_highbd(uint16_t *data, int tile_idx,
+                                            int width, int height, int stride,
+                                            RestorationInternal *rst,
+                                            int bit_depth, void *tmpbuf) {
+  const int tile_width = rst->tile_width >> rst->subsampling_x;
+  const int tile_height = rst->tile_height >> rst->subsampling_y;
+  int i, j;
+  int h_start, h_end, v_start, v_end;  // tile bounds within the plane
+  uint16_t *data_p;
+  int64_t dat[RESTORATION_TILEPELS_MAX];  // NOTE(review): > 1 MiB of stack
+  // Filters one restoration tile of a 16-bit plane in place; level 0 = skip.
+  if (rst->rsi->sgrproj_info[tile_idx].level == 0) return;
+  av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
+                           tile_width, tile_height, width, height, 0, 0,
+                           &h_start, &h_end, &v_start, &v_end);
+  data_p = data + h_start + v_start * stride;
+  for (i = 0; i < (v_end - v_start); ++i) {  // widen tile into packed dat
+    for (j = 0; j < (h_end - h_start); ++j) {
+      dat[i * (h_end - h_start) + j] = data_p[i * stride + j];
+    }
+  }
+  apply_selfguided_restoration(dat, h_end - h_start, v_end - v_start,
+                               h_end - h_start, bit_depth,
+                               rst->rsi->sgrproj_info[tile_idx].ep,
+                               rst->rsi->sgrproj_info[tile_idx].xqd, tmpbuf);
+  for (i = 0; i < (v_end - v_start); ++i) {  // clip to bit_depth, write back
+    for (j = 0; j < (h_end - h_start); ++j) {
+      data_p[i * stride + j] =
+          clip_pixel_highbd(dat[i * (h_end - h_start) + j], bit_depth);
+    }
+  }
+}
+
+static void loop_sgrproj_filter_highbd(uint8_t *data8, int width, int height,
+                                       int stride, RestorationInternal *rst,
+                                       uint8_t *tmpdata8, int tmpstride,
+                                       int bit_depth) {
+  int tile_idx;
+  uint16_t *data = CONVERT_TO_SHORTPTR(data8);
+  uint8_t *tmpbuf = aom_malloc(SGRPROJ_TMPBUF_SIZE);  // NOTE(review): unchecked
+  (void)tmpdata8;  // unused; kept to match restore_func_highbd_type
+  (void)tmpstride;
+  for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {  // one shared tmpbuf
+    loop_sgrproj_filter_tile_highbd(data, tile_idx, width, height, stride, rst,
+                                    bit_depth, tmpbuf);
+  }
+  aom_free(tmpbuf);
+}
+
 static void loop_switchable_filter_highbd(uint8_t *data8, int width, int height,
                                           int stride, RestorationInternal *rst,
                                           uint8_t *tmpdata8, int tmpstride,
                                           int bit_depth) {
   uint16_t *data = CONVERT_TO_SHORTPTR(data8);
   uint16_t *tmpdata = CONVERT_TO_SHORTPTR(tmpdata8);
+  uint8_t *tmpbuf = aom_malloc(SGRPROJ_TMPBUF_SIZE);
   int i, tile_idx;
   uint16_t *data_p, *tmpdata_p;
 
@@ -543,8 +931,12 @@
     } else if (rst->rsi->restoration_type[tile_idx] == RESTORE_WIENER) {
       loop_wiener_filter_tile_highbd(data, tile_idx, width, height, stride, rst,
                                      tmpdata, tmpstride, bit_depth);
+    } else if (rst->rsi->restoration_type[tile_idx] == RESTORE_SGRPROJ) {
+      loop_sgrproj_filter_tile_highbd(data, tile_idx, width, height, stride,
+                                      rst, bit_depth, tmpbuf);
     }
   }
+  aom_free(tmpbuf);
 }
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
@@ -558,19 +950,21 @@
   const int uvstart = ystart >> cm->subsampling_y;
   int yend = end_mi_row << MI_SIZE_LOG2;
   int uvend = yend >> cm->subsampling_y;
+  restore_func_type restore_funcs[RESTORE_TYPES] = { NULL, loop_sgrproj_filter,
+                                                     loop_bilateral_filter,
+                                                     loop_wiener_filter,
+                                                     loop_switchable_filter };
+#if CONFIG_AOM_HIGHBITDEPTH
+  restore_func_highbd_type restore_funcs_highbd[RESTORE_TYPES] = {
+    NULL, loop_sgrproj_filter_highbd, loop_bilateral_filter_highbd,
+    loop_wiener_filter_highbd, loop_switchable_filter_highbd
+  };
+#endif  // CONFIG_AOM_HIGHBITDEPTH
   restore_func_type restore_func =
-      cm->rst_internal.rsi->frame_restoration_type == RESTORE_BILATERAL
-          ? loop_bilateral_filter
-          : (cm->rst_internal.rsi->frame_restoration_type == RESTORE_WIENER
-                 ? loop_wiener_filter
-                 : loop_switchable_filter);
+      restore_funcs[cm->rst_internal.rsi->frame_restoration_type];
 #if CONFIG_AOM_HIGHBITDEPTH
   restore_func_highbd_type restore_func_highbd =
-      cm->rst_internal.rsi->frame_restoration_type == RESTORE_BILATERAL
-          ? loop_bilateral_filter_highbd
-          : (cm->rst_internal.rsi->frame_restoration_type == RESTORE_WIENER
-                 ? loop_wiener_filter_highbd
-                 : loop_switchable_filter_highbd);
+      restore_funcs_highbd[cm->rst_internal.rsi->frame_restoration_type];
 #endif  // CONFIG_AOM_HIGHBITDEPTH
   YV12_BUFFER_CONFIG tmp_buf;
 
diff --git a/av1/common/restoration.h b/av1/common/restoration.h
index a5150ad..08e9c52 100644
--- a/av1/common/restoration.h
+++ b/av1/common/restoration.h
@@ -20,14 +20,35 @@
 extern "C" {
 #endif
 
+#define RESTORATION_TILESIZE_SML 128
+#define RESTORATION_TILESIZE_BIG 256
+#define RESTORATION_TILEPELS_MAX \
+  (RESTORATION_TILESIZE_BIG * RESTORATION_TILESIZE_BIG * 9 / 4)
+#define SGRPROJ_TMPBUF_SIZE (RESTORATION_TILEPELS_MAX * 5 * 8)
+
+#define SGRPROJ_PARAMS_BITS 3
+#define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS)
+
+// Precision bits for projection
+#define SGRPROJ_PRJ_BITS 7
+// Restoration precision bits generated higher than source before projection
+#define SGRPROJ_RST_BITS 4
+// Internal precision bits for core selfguided_restoration
+#define SGRPROJ_SGR_BITS 8
+#define SGRPROJ_SGR (1 << SGRPROJ_SGR_BITS)
+
+#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) / 4)
+#define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1)
+#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
+#define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
+
+#define SGRPROJ_BITS (SGRPROJ_PRJ_BITS * 2 + SGRPROJ_PARAMS_BITS)
+
 #define BILATERAL_LEVEL_BITS_KF 4
 #define BILATERAL_LEVELS_KF (1 << BILATERAL_LEVEL_BITS_KF)
 #define BILATERAL_LEVEL_BITS 3
 #define BILATERAL_LEVELS (1 << BILATERAL_LEVEL_BITS)
-// #define DEF_BILATERAL_LEVEL     2
 
-#define RESTORATION_TILESIZE_SML 128
-#define RESTORATION_TILESIZE_BIG 256
 #define BILATERAL_SUBTILE_BITS 1
 #define BILATERAL_SUBTILES (1 << (2 * BILATERAL_SUBTILE_BITS))
 
@@ -65,12 +86,27 @@
 } WienerInfo;
 
 typedef struct {
+  int r1;
+  int e1;
+  int r2;
+  int e2;
+} sgr_params_type;
+
+typedef struct {
+  int level;
+  int ep;
+  int xqd[2];
+} SgrprojInfo;
+
+typedef struct {
   RestorationType frame_restoration_type;
   RestorationType *restoration_type;
   // Bilateral filter
   BilateralInfo *bilateral_info;
   // Wiener filter
   WienerInfo *wiener_info;
+  // Selfguided proj filter
+  SgrprojInfo *sgrproj_info;
 } RestorationInfo;
 
 typedef struct {
@@ -140,6 +176,11 @@
   }
 }
 
+extern const sgr_params_type sgr_params[SGRPROJ_PARAMS];
+
+void av1_selfguided_restoration(int64_t *dgd, int width, int height, int stride,
+                                int bit_depth, int r, int eps, void *tmpbuf);
+void decode_xq(int *xqd, int *xq);
 int av1_bilateral_level_bits(const struct AV1Common *const cm);
 void av1_loop_restoration_init(RestorationInternal *rst, RestorationInfo *rsi,
                                int kf, int width, int height);
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index df34c75..d0ac7c1 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -2204,14 +2204,60 @@
                                     struct aom_read_bit_buffer *rb) {
   RestorationInfo *rsi = &cm->rst_info;
   if (aom_rb_read_bit(rb)) {
-    rsi->frame_restoration_type =
-        aom_rb_read_bit(rb) ? RESTORE_WIENER : RESTORE_BILATERAL;
+    if (aom_rb_read_bit(rb))
+      rsi->frame_restoration_type =
+          (aom_rb_read_bit(rb) ? RESTORE_WIENER : RESTORE_BILATERAL);
+    else
+      rsi->frame_restoration_type = RESTORE_SGRPROJ;
   } else {
     rsi->frame_restoration_type =
         aom_rb_read_bit(rb) ? RESTORE_SWITCHABLE : RESTORE_NONE;
   }
 }
 
+static void read_wiener_filter(WienerInfo *wiener_info, aom_reader *rb) {
+  wiener_info->vfilter[0] =  // each tap: literal of TAPn_BITS plus TAPn_MINV
+      aom_read_literal(rb, WIENER_FILT_TAP0_BITS, ACCT_STR) +
+      WIENER_FILT_TAP0_MINV;
+  wiener_info->vfilter[1] =
+      aom_read_literal(rb, WIENER_FILT_TAP1_BITS, ACCT_STR) +
+      WIENER_FILT_TAP1_MINV;
+  wiener_info->vfilter[2] =
+      aom_read_literal(rb, WIENER_FILT_TAP2_BITS, ACCT_STR) +
+      WIENER_FILT_TAP2_MINV;
+  wiener_info->hfilter[0] =  // hfilter taps follow the three vfilter taps
+      aom_read_literal(rb, WIENER_FILT_TAP0_BITS, ACCT_STR) +
+      WIENER_FILT_TAP0_MINV;
+  wiener_info->hfilter[1] =
+      aom_read_literal(rb, WIENER_FILT_TAP1_BITS, ACCT_STR) +
+      WIENER_FILT_TAP1_MINV;
+  wiener_info->hfilter[2] =
+      aom_read_literal(rb, WIENER_FILT_TAP2_BITS, ACCT_STR) +
+      WIENER_FILT_TAP2_MINV;
+}
+
+static void read_sgrproj_filter(SgrprojInfo *sgrproj_info, aom_reader *rb) {
+  sgrproj_info->ep = aom_read_literal(rb, SGRPROJ_PARAMS_BITS, ACCT_STR);
+  sgrproj_info->xqd[0] =  // literal offset by the minimum; level is not set here
+      aom_read_literal(rb, SGRPROJ_PRJ_BITS, ACCT_STR) + SGRPROJ_PRJ_MIN0;
+  sgrproj_info->xqd[1] =
+      aom_read_literal(rb, SGRPROJ_PRJ_BITS, ACCT_STR) + SGRPROJ_PRJ_MIN1;
+}
+
+static void read_bilateral_filter(const AV1_COMMON *cm,
+                                  BilateralInfo *bilateral_info,
+                                  aom_reader *rb) {
+  int s;  // subtile index
+  for (s = 0; s < BILATERAL_SUBTILES; ++s) {
+    if (aom_read(rb, RESTORE_NONE_BILATERAL_PROB, ACCT_STR)) {  // level coded?
+      bilateral_info->level[s] =
+          aom_read_literal(rb, av1_bilateral_level_bits(cm), ACCT_STR);
+    } else {
+      bilateral_info->level[s] = -1;  // no level coded for this subtile
+    }
+  }
+}
+
 static void decode_restoration(AV1_COMMON *cm, aom_reader *rb) {
   int i;
   RestorationInfo *rsi = &cm->rst_info;
@@ -2227,45 +2273,26 @@
       rsi->wiener_info = (WienerInfo *)aom_realloc(
           rsi->wiener_info, sizeof(*rsi->wiener_info) * ntiles);
       assert(rsi->wiener_info != NULL);
+      rsi->sgrproj_info = (SgrprojInfo *)aom_realloc(
+          rsi->sgrproj_info, sizeof(*rsi->sgrproj_info) * ntiles);
+      assert(rsi->sgrproj_info != NULL);
       for (i = 0; i < ntiles; ++i) {
         rsi->restoration_type[i] =
             aom_read_tree(rb, av1_switchable_restore_tree,
                           cm->fc->switchable_restore_prob, ACCT_STR);
         if (rsi->restoration_type[i] == RESTORE_WIENER) {
           rsi->wiener_info[i].level = 1;
-          rsi->wiener_info[i].vfilter[0] =
-              aom_read_literal(rb, WIENER_FILT_TAP0_BITS, ACCT_STR) +
-              WIENER_FILT_TAP0_MINV;
-          rsi->wiener_info[i].vfilter[1] =
-              aom_read_literal(rb, WIENER_FILT_TAP1_BITS, ACCT_STR) +
-              WIENER_FILT_TAP1_MINV;
-          rsi->wiener_info[i].vfilter[2] =
-              aom_read_literal(rb, WIENER_FILT_TAP2_BITS, ACCT_STR) +
-              WIENER_FILT_TAP2_MINV;
-          rsi->wiener_info[i].hfilter[0] =
-              aom_read_literal(rb, WIENER_FILT_TAP0_BITS, ACCT_STR) +
-              WIENER_FILT_TAP0_MINV;
-          rsi->wiener_info[i].hfilter[1] =
-              aom_read_literal(rb, WIENER_FILT_TAP1_BITS, ACCT_STR) +
-              WIENER_FILT_TAP1_MINV;
-          rsi->wiener_info[i].hfilter[2] =
-              aom_read_literal(rb, WIENER_FILT_TAP2_BITS, ACCT_STR) +
-              WIENER_FILT_TAP2_MINV;
+          read_wiener_filter(&rsi->wiener_info[i], rb);
         } else if (rsi->restoration_type[i] == RESTORE_BILATERAL) {
-          int s;
-          for (s = 0; s < BILATERAL_SUBTILES; ++s) {
 #if BILATERAL_SUBTILES == 0
-            rsi->bilateral_info[i].level[s] =
-                aom_read_literal(rb, av1_bilateral_level_bits(cm), ACCT_STR);
+          rsi->bilateral_info[i].level[0] =
+              aom_read_literal(rb, av1_bilateral_level_bits(cm), ACCT_STR);
 #else
-            if (aom_read(rb, RESTORE_NONE_BILATERAL_PROB, ACCT_STR)) {
-              rsi->bilateral_info[i].level[s] =
-                  aom_read_literal(rb, av1_bilateral_level_bits(cm), ACCT_STR);
-            } else {
-              rsi->bilateral_info[i].level[s] = -1;
-            }
+          read_bilateral_filter(cm, &rsi->bilateral_info[i], rb);
 #endif
-          }
+        } else if (rsi->restoration_type[i] == RESTORE_SGRPROJ) {
+          rsi->sgrproj_info[i].level = 1;
+          read_sgrproj_filter(&rsi->sgrproj_info[i], rb);
         }
       }
     } else if (rsi->frame_restoration_type == RESTORE_WIENER) {
@@ -2274,50 +2301,37 @@
       assert(rsi->wiener_info != NULL);
       for (i = 0; i < ntiles; ++i) {
         if (aom_read(rb, RESTORE_NONE_WIENER_PROB, ACCT_STR)) {
-          rsi->wiener_info[i].level = 1;
           rsi->restoration_type[i] = RESTORE_WIENER;
-          rsi->wiener_info[i].vfilter[0] =
-              aom_read_literal(rb, WIENER_FILT_TAP0_BITS, ACCT_STR) +
-              WIENER_FILT_TAP0_MINV;
-          rsi->wiener_info[i].vfilter[1] =
-              aom_read_literal(rb, WIENER_FILT_TAP1_BITS, ACCT_STR) +
-              WIENER_FILT_TAP1_MINV;
-          rsi->wiener_info[i].vfilter[2] =
-              aom_read_literal(rb, WIENER_FILT_TAP2_BITS, ACCT_STR) +
-              WIENER_FILT_TAP2_MINV;
-          rsi->wiener_info[i].hfilter[0] =
-              aom_read_literal(rb, WIENER_FILT_TAP0_BITS, ACCT_STR) +
-              WIENER_FILT_TAP0_MINV;
-          rsi->wiener_info[i].hfilter[1] =
-              aom_read_literal(rb, WIENER_FILT_TAP1_BITS, ACCT_STR) +
-              WIENER_FILT_TAP1_MINV;
-          rsi->wiener_info[i].hfilter[2] =
-              aom_read_literal(rb, WIENER_FILT_TAP2_BITS, ACCT_STR) +
-              WIENER_FILT_TAP2_MINV;
+          rsi->wiener_info[i].level = 1;
+          read_wiener_filter(&rsi->wiener_info[i], rb);
         } else {
           rsi->wiener_info[i].level = 0;
           rsi->restoration_type[i] = RESTORE_NONE;
         }
       }
-    } else {
+    } else if (rsi->frame_restoration_type == RESTORE_BILATERAL) {
       rsi->bilateral_info = (BilateralInfo *)aom_realloc(
           rsi->bilateral_info, sizeof(*rsi->bilateral_info) * ntiles);
       assert(rsi->bilateral_info != NULL);
       for (i = 0; i < ntiles; ++i) {
-        int s;
         rsi->restoration_type[i] = RESTORE_BILATERAL;
-        for (s = 0; s < BILATERAL_SUBTILES; ++s) {
-          if (aom_read(rb, RESTORE_NONE_BILATERAL_PROB, ACCT_STR)) {
-            rsi->bilateral_info[i].level[s] =
-                aom_read_literal(rb, av1_bilateral_level_bits(cm), ACCT_STR);
-          } else {
-            rsi->bilateral_info[i].level[s] = -1;
-          }
+        read_bilateral_filter(cm, &rsi->bilateral_info[i], rb);
+      }
+    } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) {
+      rsi->sgrproj_info = (SgrprojInfo *)aom_realloc(
+          rsi->sgrproj_info, sizeof(*rsi->sgrproj_info) * ntiles);
+      assert(rsi->sgrproj_info != NULL);
+      for (i = 0; i < ntiles; ++i) {
+        if (aom_read(rb, RESTORE_NONE_SGRPROJ_PROB, ACCT_STR)) {
+          rsi->restoration_type[i] = RESTORE_SGRPROJ;
+          rsi->sgrproj_info[i].level = 1;
+          read_sgrproj_filter(&rsi->sgrproj_info[i], rb);
+        } else {
+          rsi->sgrproj_info[i].level = 0;
+          rsi->restoration_type[i] = RESTORE_NONE;
         }
       }
     }
-  } else {
-    rsi->frame_restoration_type = RESTORE_NONE;
   }
 }
 #endif  // CONFIG_LOOP_RESTORATION
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index d004195..e8bf18c 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -174,9 +174,9 @@
       an in-order traversal of the av1_switchable_interp_tree structure. */
   av1_indices_from_tree(av1_switchable_interp_ind, av1_switchable_interp_inv,
                         SWITCHABLE_FILTERS, av1_switchable_interp_tree);
-  /* This hack is necessary because the four TX_TYPES are not consecutive,
-      e.g., 0, 1, 2, 3, when doing an in-order traversal of the av1_ext_tx_tree
-      structure. */
+/* This hack is necessary because the four TX_TYPES are not consecutive,
+    e.g., 0, 1, 2, 3, when doing an in-order traversal of the av1_ext_tx_tree
+    structure. */
 #if !CONFIG_EXT_TX
   av1_indices_from_tree(av1_ext_tx_ind, av1_ext_tx_inv, TX_TYPES,
                         av1_ext_tx_tree);
@@ -2897,18 +2897,60 @@
       aom_wb_write_bit(wb, 0);
       aom_wb_write_bit(wb, 1);
       break;
+    case RESTORE_SGRPROJ:
+      aom_wb_write_bit(wb, 1);
+      aom_wb_write_bit(wb, 0);
+      break;
     case RESTORE_BILATERAL:
       aom_wb_write_bit(wb, 1);
+      aom_wb_write_bit(wb, 1);
       aom_wb_write_bit(wb, 0);
       break;
     case RESTORE_WIENER:
       aom_wb_write_bit(wb, 1);
       aom_wb_write_bit(wb, 1);
+      aom_wb_write_bit(wb, 1);
       break;
     default: assert(0);
   }
 }
 
+/* Codes one tile's Wiener filter: the three vertical taps first, then the
+ * three horizontal taps. Every tap is sent as a literal after subtracting
+ * the minimum for its tap position, using that position's bit width. */
+static void write_wiener_filter(WienerInfo *wiener_info, aom_writer *wb) {
+  const int minv[3] = { WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP1_MINV,
+                        WIENER_FILT_TAP2_MINV };
+  const int bits[3] = { WIENER_FILT_TAP0_BITS, WIENER_FILT_TAP1_BITS,
+                        WIENER_FILT_TAP2_BITS };
+  int k;
+  for (k = 0; k < 3; ++k)
+    aom_write_literal(wb, wiener_info->vfilter[k] - minv[k], bits[k]);
+  for (k = 0; k < 3; ++k)
+    aom_write_literal(wb, wiener_info->hfilter[k] - minv[k], bits[k]);
+}
+
+/* Codes one tile's self-guided filter: set index ep, then both xqd terms. */
+static void write_sgrproj_filter(SgrprojInfo *sgrproj_info, aom_writer *wb) {
+  const int *const coef = sgrproj_info->xqd;
+  aom_write_literal(wb, sgrproj_info->ep, SGRPROJ_PARAMS_BITS);
+  aom_write_literal(wb, coef[0] - SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_BITS);
+  aom_write_literal(wb, coef[1] - SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_BITS);
+}
+
+/* Per subtile: codes a flag (1 when level >= 0), then the level if flagged. */
+static void write_bilateral_filter(const AV1_COMMON *cm,
+                                   BilateralInfo *bilateral_info,
+                                   aom_writer *wb) {
+  int idx = 0;
+  while (idx < BILATERAL_SUBTILES) {
+    const int level = bilateral_info->level[idx++];
+    const int enabled = level >= 0;
+    aom_write(wb, enabled, RESTORE_NONE_BILATERAL_PROB);
+    if (enabled) aom_write_literal(wb, level, av1_bilateral_level_bits(cm));
+  }
+}
+
 static void encode_restoration(AV1_COMMON *cm, aom_writer *wb) {
   int i;
   RestorationInfo *rsi = &cm->rst_info;
@@ -2920,75 +2962,35 @@
             wb, av1_switchable_restore_tree, cm->fc->switchable_restore_prob,
             &switchable_restore_encodings[rsi->restoration_type[i]]);
         if (rsi->restoration_type[i] == RESTORE_BILATERAL) {
-          int s;
-          for (s = 0; s < BILATERAL_SUBTILES; ++s) {
 #if BILATERAL_SUBTILES == 0
-            aom_write_literal(wb, rsi->bilateral_info[i].level[s],
-                              av1_bilateral_level_bits(cm));
+          aom_write_literal(wb, rsi->bilateral_info[i].level[0],
+                            av1_bilateral_level_bits(cm));
 #else
-            aom_write(wb, rsi->bilateral_info[i].level[s] >= 0,
-                      RESTORE_NONE_BILATERAL_PROB);
-            if (rsi->bilateral_info[i].level[s] >= 0) {
-              aom_write_literal(wb, rsi->bilateral_info[i].level[s],
-                                av1_bilateral_level_bits(cm));
-            }
+          write_bilateral_filter(cm, &rsi->bilateral_info[i], wb);
 #endif
-          }
         } else if (rsi->restoration_type[i] == RESTORE_WIENER) {
-          aom_write_literal(
-              wb, rsi->wiener_info[i].vfilter[0] - WIENER_FILT_TAP0_MINV,
-              WIENER_FILT_TAP0_BITS);
-          aom_write_literal(
-              wb, rsi->wiener_info[i].vfilter[1] - WIENER_FILT_TAP1_MINV,
-              WIENER_FILT_TAP1_BITS);
-          aom_write_literal(
-              wb, rsi->wiener_info[i].vfilter[2] - WIENER_FILT_TAP2_MINV,
-              WIENER_FILT_TAP2_BITS);
-          aom_write_literal(
-              wb, rsi->wiener_info[i].hfilter[0] - WIENER_FILT_TAP0_MINV,
-              WIENER_FILT_TAP0_BITS);
-          aom_write_literal(
-              wb, rsi->wiener_info[i].hfilter[1] - WIENER_FILT_TAP1_MINV,
-              WIENER_FILT_TAP1_BITS);
-          aom_write_literal(
-              wb, rsi->wiener_info[i].hfilter[2] - WIENER_FILT_TAP2_MINV,
-              WIENER_FILT_TAP2_BITS);
+          write_wiener_filter(&rsi->wiener_info[i], wb);
+        } else if (rsi->restoration_type[i] == RESTORE_SGRPROJ) {
+          write_sgrproj_filter(&rsi->sgrproj_info[i], wb);
         }
       }
     } else if (rsi->frame_restoration_type == RESTORE_BILATERAL) {
       for (i = 0; i < cm->rst_internal.ntiles; ++i) {
-        int s;
-        for (s = 0; s < BILATERAL_SUBTILES; ++s) {
-          aom_write(wb, rsi->bilateral_info[i].level[s] >= 0,
-                    RESTORE_NONE_BILATERAL_PROB);
-          if (rsi->bilateral_info[i].level[s] >= 0) {
-            aom_write_literal(wb, rsi->bilateral_info[i].level[s],
-                              av1_bilateral_level_bits(cm));
-          }
-        }
+        write_bilateral_filter(cm, &rsi->bilateral_info[i], wb);
       }
     } else if (rsi->frame_restoration_type == RESTORE_WIENER) {
       for (i = 0; i < cm->rst_internal.ntiles; ++i) {
         aom_write(wb, rsi->wiener_info[i].level != 0, RESTORE_NONE_WIENER_PROB);
         if (rsi->wiener_info[i].level) {
-          aom_write_literal(
-              wb, rsi->wiener_info[i].vfilter[0] - WIENER_FILT_TAP0_MINV,
-              WIENER_FILT_TAP0_BITS);
-          aom_write_literal(
-              wb, rsi->wiener_info[i].vfilter[1] - WIENER_FILT_TAP1_MINV,
-              WIENER_FILT_TAP1_BITS);
-          aom_write_literal(
-              wb, rsi->wiener_info[i].vfilter[2] - WIENER_FILT_TAP2_MINV,
-              WIENER_FILT_TAP2_BITS);
-          aom_write_literal(
-              wb, rsi->wiener_info[i].hfilter[0] - WIENER_FILT_TAP0_MINV,
-              WIENER_FILT_TAP0_BITS);
-          aom_write_literal(
-              wb, rsi->wiener_info[i].hfilter[1] - WIENER_FILT_TAP1_MINV,
-              WIENER_FILT_TAP1_BITS);
-          aom_write_literal(
-              wb, rsi->wiener_info[i].hfilter[2] - WIENER_FILT_TAP2_MINV,
-              WIENER_FILT_TAP2_BITS);
+          write_wiener_filter(&rsi->wiener_info[i], wb);
+        }
+      }
+    } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) {
+      for (i = 0; i < cm->rst_internal.ntiles; ++i) {
+        aom_write(wb, rsi->sgrproj_info[i].level != 0,
+                  RESTORE_NONE_SGRPROJ_PROB);
+        if (rsi->sgrproj_info[i].level) {
+          write_sgrproj_filter(&rsi->sgrproj_info[i], wb);
         }
       }
     }
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index 62303b7..408c76d 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -33,7 +33,7 @@
                                       int partial_frame, RestorationInfo *info,
                                       double *best_tile_cost);
 
-const int frame_level_restore_bits[RESTORE_TYPES] = { 2, 2, 2, 2 };
+const int frame_level_restore_bits[RESTORE_TYPES] = { 2, 2, 3, 3, 2 };  // bit count per frame-level type; indexed by frame_restoration_type and shifted by AV1_PROB_COST_SHIFT when costed
 
 static int64_t sse_restoration_tile(const YV12_BUFFER_CONFIG *src,
                                     AV1_COMMON *const cm, int h_start,
@@ -100,6 +100,228 @@
   return filt_err;
 }
 
+/* Sum of squared errors between src and the fixed-point projection
+ * dgd + xq[0]*(flt1-dgd) + xq[1]*(flt2-dgd), with xq from decode_xq(xqd). */
+static int64_t get_pixel_proj_error(int64_t *src, int width, int height,
+                                    int src_stride, int64_t *dgd,
+                                    int dgd_stride, int64_t *flt1,
+                                    int flt1_stride, int64_t *flt2,
+                                    int flt2_stride, int *xqd) {
+  int64_t sse = 0;
+  int row, col, xq[2];
+  decode_xq(xqd, xq);
+  for (row = 0; row < height; ++row) {
+    const int64_t *const u = dgd + row * dgd_stride;
+    const int64_t *const f1 = flt1 + row * flt1_stride;
+    const int64_t *const f2 = flt2 + row * flt2_stride;
+    for (col = 0; col < width; ++col) {
+      const int64_t v = xq[0] * (f1[col] - u[col]) + xq[1] * (f2[col] - u[col]) +
+                        (u[col] << SGRPROJ_PRJ_BITS);
+      const int64_t e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) -
+          ROUND_POWER_OF_TWO(src[row * src_stride + col], SGRPROJ_RST_BITS);
+      sse += e * e;
+    }
+  }
+  return sse;
+}
+
+/* Least-squares fit of (src - dgd) onto the residuals (flt1 - dgd) and
+ * (flt2 - dgd). The two weights, scaled by 1 << SGRPROJ_PRJ_BITS and rounded,
+ * go to xq[]; a near-singular system leaves the defaults set up front. */
+static void get_proj_subspace(int64_t *src, int width, int height,
+                              int src_stride, int64_t *dgd, int dgd_stride,
+                              int64_t *flt1, int flt1_stride, int64_t *flt2,
+                              int flt2_stride, int *xq) {
+  const double scale = (double)(1 << SGRPROJ_PRJ_BITS);
+  const int size = width * height;
+  double h00 = 0, h01 = 0, h11 = 0;  // symmetric 2x2 correlation of f1, f2
+  double c0 = 0, c1 = 0;             // correlation of f1, f2 with s
+  double det;
+  int row, col;
+  // Fallback weights, kept if the system below cannot be solved.
+  xq[0] = -(1 << SGRPROJ_PRJ_BITS) / 4;
+  xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0];
+  for (row = 0; row < height; ++row) {
+    for (col = 0; col < width; ++col) {
+      const double u = (double)dgd[row * dgd_stride + col];
+      const double s = (double)src[row * src_stride + col] - u;
+      const double f1 = (double)flt1[row * flt1_stride + col] - u;
+      const double f2 = (double)flt2[row * flt2_stride + col] - u;
+      h00 += f1 * f1;
+      h11 += f2 * f2;
+      h01 += f1 * f2;
+      c0 += f1 * s;
+      c1 += f2 * s;
+    }
+  }
+  h00 /= size;
+  h01 /= size;
+  h11 /= size;
+  c0 /= size;
+  c1 /= size;
+  det = h00 * h11 - h01 * h01;
+  if (det < 1e-8) return;  // ill-posed, keep the fallback weights
+  xq[0] = (int)rint((h11 * c0 - h01 * c1) / det * scale);
+  xq[1] = (int)rint((h00 * c1 - h01 * c0) / det * scale);
+}
+
+/* Maps projection weights xq[] to the clamped xqd[] form that gets coded. */
+void encode_xq(int *xq, int *xqd) {
+  xqd[0] = clamp(-xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0);
+  xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) + xqd[0] - xq[1], SGRPROJ_PRJ_MIN1,
+                 SGRPROJ_PRJ_MAX1);
+}
+
+/* Tries every self-guided parameter set (0 .. SGRPROJ_PARAMS-1) on one tile
+ * and keeps the set whose projected output has the lowest squared error.
+ *   dat8, src8 : degraded and source pixels; read through
+ *                CONVERT_TO_SHORTPTR when bit_depth > 8
+ *   eps, xqd   : receive the winning set index and its encoded weights
+ *   tmpbuf     : caller scratch; its first 2 * RESTORATION_TILEPELS_MAX
+ *                int64_t hold flt1/flt2, the remainder goes to the filter
+ * srd/dgd are heap-allocated instead of automatic arrays: two int64_t
+ * buffers of RESTORATION_TILEPELS_MAX entries are presumably too big for it. */
+static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
+                                          int dat_stride, uint8_t *src8,
+                                          int src_stride, int bit_depth,
+                                          int *eps, int *xqd, void *tmpbuf) {
+  int64_t *flt1 = (int64_t *)tmpbuf;
+  int64_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
+  uint8_t *tmpbuf2 = (uint8_t *)(flt2 + RESTORATION_TILEPELS_MAX);
+  int64_t *srd = (int64_t *)aom_malloc(sizeof(*srd) * RESTORATION_TILEPELS_MAX);
+  int64_t *dgd = (int64_t *)aom_malloc(sizeof(*dgd) * RESTORATION_TILEPELS_MAX);
+  const int hbd = bit_depth > 8;
+  int i, j, ep, bestep = 0;
+  int64_t err, besterr = -1;
+  int exqd[2], bestxqd[2] = { 0, 0 };
+  assert(srd != NULL && dgd != NULL);
+  for (ep = 0; ep < SGRPROJ_PARAMS; ep++) {
+    int exq[2];
+    // flt1/flt2 are overwritten by the filter calls, so re-seed them per set.
+    for (i = 0; i < height; ++i) {
+      for (j = 0; j < width; ++j) {
+        const int k = i * width + j;
+        const int l = i * dat_stride + j;
+        const int m = i * src_stride + j;
+        const int64_t d = hbd ? (CONVERT_TO_SHORTPTR(dat8))[l] : dat8[l];
+        const int64_t s = hbd ? (CONVERT_TO_SHORTPTR(src8))[m] : src8[m];
+        flt1[k] = d;
+        flt2[k] = d;
+        dgd[k] = d << SGRPROJ_RST_BITS;
+        srd[k] = s << SGRPROJ_RST_BITS;
+      }
+    }
+    av1_selfguided_restoration(flt1, width, height, width, bit_depth,
+                               sgr_params[ep].r1, sgr_params[ep].e1, tmpbuf2);
+    av1_selfguided_restoration(flt2, width, height, width, bit_depth,
+                               sgr_params[ep].r2, sgr_params[ep].e2, tmpbuf2);
+    get_proj_subspace(srd, width, height, width, dgd, width, flt1, width, flt2,
+                      width, exq);
+    encode_xq(exq, exqd);
+    err = get_pixel_proj_error(srd, width, height, width, dgd, width, flt1,
+                               width, flt2, width, exqd);
+    if (besterr == -1 || err < besterr) {
+      bestep = ep;
+      besterr = err;
+      bestxqd[0] = exqd[0];
+      bestxqd[1] = exqd[1];
+    }
+  }
+  aom_free(srd);
+  aom_free(dgd);
+  *eps = bestep;
+  xqd[0] = bestxqd[0];
+  xqd[1] = bestxqd[1];
+}
+
+static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,  // Picks self-guided projection params tile by tile;
+                             int filter_level, int partial_frame,  // stores the choices in info->sgrproj_info[] and best_tile_cost[];
+                             RestorationInfo *info, double *best_tile_cost) {  // returns the RD cost of the whole-frame SGRPROJ trial.
+  SgrprojInfo *sgrproj_info = info->sgrproj_info;
+  double err, cost_norestore, cost_sgrproj;
+  int bits;
+  MACROBLOCK *x = &cpi->td.mb;
+  AV1_COMMON *const cm = &cpi->common;
+  const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show;
+  RestorationInfo rsi;
+  int tile_idx, tile_width, tile_height, nhtiles, nvtiles;
+  int h_start, h_end, v_start, v_end;
+  uint8_t *tmpbuf = aom_malloc(SGRPROJ_TMPBUF_SIZE);  // scratch handed to search_selfguided_restoration; NOTE(review): never NULL-checked
+  const int ntiles = av1_get_rest_ntiles(cm->width, cm->height, &tile_width,
+                                         &tile_height, &nhtiles, &nvtiles);
+  // Save the unfiltered recon in last_frame_uf, run the loop filter on frame_to_show, then snapshot the result in last_frame_db
+  aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
+  av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filter_level,
+                        1, partial_frame);
+  aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_db);
+
+  rsi.frame_restoration_type = RESTORE_SGRPROJ;  // only this field and sgrproj_info of the local rsi are initialised
+  rsi.sgrproj_info =
+      (SgrprojInfo *)aom_malloc(sizeof(*rsi.sgrproj_info) * ntiles);
+  assert(rsi.sgrproj_info != NULL);
+
+  for (tile_idx = 0; tile_idx < ntiles; ++tile_idx)
+    rsi.sgrproj_info[tile_idx].level = 0;
+  // Search each tile on its own: exactly one tile has level 1 in rsi during its trial
+  for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+    av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
+                             tile_height, cm->width, cm->height, 0, 0, &h_start,
+                             &h_end, &v_start, &v_end);
+    err = sse_restoration_tile(src, cm, h_start, h_end - h_start, v_start,
+                               v_end - v_start);
+    // Rate (a single "off" flag) and RD cost when this tile is left unrestored
+    bits = av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 0);
+    cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+    best_tile_cost[tile_idx] = DBL_MAX;  // stays DBL_MAX unless SGRPROJ beats "off" below
+    search_selfguided_restoration(
+        dgd->y_buffer + v_start * dgd->y_stride + h_start, h_end - h_start,
+        v_end - v_start, dgd->y_stride,
+        src->y_buffer + v_start * src->y_stride + h_start, src->y_stride,
+#if CONFIG_AOM_HIGHBITDEPTH
+        cm->bit_depth,
+#else
+        8,
+#endif  // CONFIG_AOM_HIGHBITDEPTH
+        &rsi.sgrproj_info[tile_idx].ep, rsi.sgrproj_info[tile_idx].xqd, tmpbuf);
+    rsi.sgrproj_info[tile_idx].level = 1;
+    err = try_restoration_tile(src, cpi, &rsi, partial_frame, tile_idx, 0, 0);
+    bits = SGRPROJ_BITS << AV1_PROB_COST_SHIFT;
+    bits += av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, 1);
+    cost_sgrproj = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+    if (cost_sgrproj >= cost_norestore) {  // restoring this tile does not pay for its bits
+      sgrproj_info[tile_idx].level = 0;
+    } else {
+      memcpy(&sgrproj_info[tile_idx], &rsi.sgrproj_info[tile_idx],
+             sizeof(sgrproj_info[tile_idx]));
+      bits = SGRPROJ_BITS << AV1_PROB_COST_SHIFT;  // re-costed with the switchable type cost instead of the on/off flag
+      best_tile_cost[tile_idx] = RDCOST_DBL(
+          x->rdmult, x->rddiv,
+          (bits + cpi->switchable_restore_cost[RESTORE_SGRPROJ]) >> 4, err);
+    }
+    rsi.sgrproj_info[tile_idx].level = 0;  // switch the tile back off before the next tile's trial
+  }
+  // Frame-level rate: type signalling, plus per tile the on/off flag and, when on, the parameter bits
+  bits = frame_level_restore_bits[rsi.frame_restoration_type]
+         << AV1_PROB_COST_SHIFT;
+  for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+    bits +=
+        av1_cost_bit(RESTORE_NONE_SGRPROJ_PROB, sgrproj_info[tile_idx].level);
+    memcpy(&rsi.sgrproj_info[tile_idx], &sgrproj_info[tile_idx],
+           sizeof(sgrproj_info[tile_idx]));  // rsi now carries the per-tile decisions for the whole-frame trial
+    if (sgrproj_info[tile_idx].level) {
+      bits += (SGRPROJ_BITS << AV1_PROB_COST_SHIFT);
+    }
+  }
+  err = try_restoration_frame(src, cpi, &rsi, partial_frame);
+  cost_sgrproj = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+
+  aom_free(rsi.sgrproj_info);
+  aom_free(tmpbuf);
+
+  aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);  // put back the unfiltered recon saved at the top
+  return cost_sgrproj;
+}
+
 static double search_bilateral(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
                                int filter_level, int partial_frame,
                                RestorationInfo *info, double *best_tile_cost) {
@@ -520,7 +742,7 @@
   RestorationInfo rsi;
   int64_t err;
   int bits;
-  double cost_wiener, cost_norestore_tile;
+  double cost_wiener, cost_norestore;
   MACROBLOCK *x = &cpi->td.mb;
   double M[RESTORATION_WIN2];
   double H[RESTORATION_WIN2 * RESTORATION_WIN2];
@@ -533,7 +755,7 @@
   double score;
   int tile_idx, tile_width, tile_height, nhtiles, nvtiles;
   int h_start, h_end, v_start, v_end;
-  int i, j;
+  int i;
 
   const int ntiles = av1_get_rest_ntiles(width, height, &tile_width,
                                          &tile_height, &nhtiles, &nvtiles);
@@ -552,7 +774,8 @@
   rsi.wiener_info = (WienerInfo *)aom_malloc(sizeof(*rsi.wiener_info) * ntiles);
   assert(rsi.wiener_info != NULL);
 
-  for (j = 0; j < ntiles; ++j) rsi.wiener_info[j].level = 0;
+  for (tile_idx = 0; tile_idx < ntiles; ++tile_idx)
+    rsi.wiener_info[tile_idx].level = 0;
 
   // Compute best Wiener filters for each tile
   for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
@@ -563,7 +786,7 @@
                                v_end - v_start);
     // #bits when a tile is not restored
     bits = av1_cost_bit(RESTORE_NONE_WIENER_PROB, 0);
-    cost_norestore_tile = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+    cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
     best_tile_cost[tile_idx] = DBL_MAX;
 
     av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
@@ -601,7 +824,7 @@
     bits = WIENER_FILT_BITS << AV1_PROB_COST_SHIFT;
     bits += av1_cost_bit(RESTORE_NONE_WIENER_PROB, 1);
     cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
-    if (cost_wiener >= cost_norestore_tile) {
+    if (cost_wiener >= cost_norestore) {
       wiener_info[tile_idx].level = 0;
     } else {
       wiener_info[tile_idx].level = 1;
@@ -632,6 +855,7 @@
   }
   err = try_restoration_frame(src, cpi, &rsi, partial_frame);
   cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+
   aom_free(rsi.wiener_info);
 
   aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
@@ -713,7 +937,7 @@
 void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
                                  LPF_PICK_METHOD method) {
   static search_restore_type search_restore_fun[RESTORE_SWITCHABLE_TYPES] = {
-    search_norestore, search_bilateral, search_wiener,
+    search_norestore, search_sgrproj, search_bilateral, search_wiener,
   };
   AV1_COMMON *const cm = &cpi->common;
   struct loopfilter *const lf = &cm->lf;
@@ -734,6 +958,9 @@
   cm->rst_info.wiener_info = (WienerInfo *)aom_realloc(
       cm->rst_info.wiener_info, sizeof(*cm->rst_info.wiener_info) * ntiles);
   assert(cm->rst_info.wiener_info != NULL);
+  cm->rst_info.sgrproj_info = (SgrprojInfo *)aom_realloc(
+      cm->rst_info.sgrproj_info, sizeof(*cm->rst_info.sgrproj_info) * ntiles);
+  assert(cm->rst_info.sgrproj_info != NULL);
 
   for (r = 0; r < RESTORE_SWITCHABLE_TYPES; r++)
     tile_cost[r] = (double *)aom_malloc(sizeof(*tile_cost[0]) * ntiles);
@@ -796,10 +1023,10 @@
   }
   cm->rst_info.frame_restoration_type = best_restore;
   /*
-  printf("Frame %d/%d frame_restore_type %d : %f %f %f %f\n",
+  printf("Frame %d/%d frame_restore_type %d : %f %f %f %f %f\n",
          cm->current_video_frame, cm->show_frame,
-         cm->rst_info.frame_restoration_type,
-         cost_restore[0], cost_restore[1], cost_restore[2], cost_restore[3]);
+         cm->rst_info.frame_restoration_type, cost_restore[0], cost_restore[1],
+         cost_restore[2], cost_restore[3], cost_restore[4]);
          */
   for (r = 0; r < RESTORE_SWITCHABLE_TYPES; r++) aom_free(tile_cost[r]);
 }