Don't use extra lines for r=2 guided filter

Changes the CONFIG_FAST_SGR=1 strategy so that the r=1 filter uses
no subsampling at all, while the r=2 filter still subsamples
vertically but, in the last stage, combines odd rows by filtering
horizontally only.

The coding efficiency loss seems quite minimal.
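
For reference, the nb shift in each branch of the filtering stage
below just normalizes that branch's kernel weight sum (2^nb), so all
branches emit output at the same fixed-point scale. A small
standalone check of those sums (illustrative only, not part of the
patch):

  /* Sanity check: each nb equals log2 of the branch's kernel sum. */
  #include <assert.h>

  int main(void) {
    /* Even rows (both r = 1 and r = 2): vertical + horizontal taps. */
    const int even_sum = 2 * 6 + 4 * 5;                 /* 32 = 1 << 5, nb = 5 */
    /* r = 1, odd row, not last: current row plus rows two apart. */
    const int odd_r1_sum = 16 + 2 * 14 + 2 * 4 + 4 * 3; /* 64 = 1 << 6, nb = 6 */
    /* r = 1, last (odd) row: only the row two above is available. */
    const int last_r1_sum = 18 + 2 * 16 + 6 + 2 * 4;    /* 64 = 1 << 6, nb = 6 */
    /* r = 2, odd row: horizontal-only [5 6 5] combine added here. */
    const int odd_r2_sum = 6 + 2 * 5;                   /* 16 = 1 << 4, nb = 4 */
    assert(even_sum == 1 << 5);
    assert(odd_r1_sum == 1 << 6);
    assert(last_r1_sum == 1 << 6);
    assert(odd_r2_sum == 1 << 4);
    return 0;
  }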

Change-Id: I5644ac400b387c37a2d278db7f6ad3ac0a6b5e93
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index 1bc25ef..b49e248 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -1024,62 +1024,98 @@
     }
   }
   // Use the A[] and B[] arrays to calculate the filtered image
-  for (i = 0; i < height; ++i) {
-    if (!(i & 1)) {  // even row
-      for (j = 0; j < width; ++j) {
-        const int k = i * buf_stride + j;
-        const int l = i * dgd_stride + j;
-        const int m = i * dst_stride + j;
-        const int nb = 5;
-        const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
-                          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
-                           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
-                              5;
-        const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
-                          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
-                           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
-                              5;
-        const int32_t v = a * dgd[l] + b;
-        dst[m] =
-            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+  if (r == 1) {
+    for (i = 0; i < height; ++i) {
+      if (!(i & 1)) {  // even row
+        for (j = 0; j < width; ++j) {
+          const int k = i * buf_stride + j;
+          const int l = i * dgd_stride + j;
+          const int m = i * dst_stride + j;
+          const int nb = 5;
+          const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
+                            (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
+                             A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
+                                5;
+          const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
+                            (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
+                             B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
+                                5;
+          const int32_t v = a * dgd[l] + b;
+          dst[m] =
+              ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+        }
+      } else if (i != height - 1) {  // odd row and not last
+        for (j = 0; j < width; ++j) {
+          const int k = i * buf_stride + j;
+          const int l = i * dgd_stride + j;
+          const int m = i * dst_stride + j;
+          const int nb = 6;
+          const int buf_stride2 = 2 * buf_stride;
+          const int32_t a = A[k] * 16 + (A[k - 1] + A[k + 1]) * 14 +
+                            (A[k - buf_stride2] + A[k + buf_stride2]) * 4 +
+                            (A[k - 1 - buf_stride2] + A[k - 1 + buf_stride2] +
+                             A[k + 1 - buf_stride2] + A[k + 1 + buf_stride2]) *
+                                3;
+          const int32_t b = B[k] * 16 + (B[k - 1] + B[k + 1]) * 14 +
+                            (B[k - buf_stride2] + B[k + buf_stride2]) * 4 +
+                            (B[k - 1 - buf_stride2] + B[k - 1 + buf_stride2] +
+                             B[k + 1 - buf_stride2] + B[k + 1 + buf_stride2]) *
+                                3;
+          const int32_t v = a * dgd[l] + b;
+          dst[m] =
+              ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+        }
+      } else {  // odd row and last
+        for (j = 0; j < width; ++j) {
+          const int k = i * buf_stride + j;
+          const int l = i * dgd_stride + j;
+          const int m = i * dst_stride + j;
+          const int nb = 6;
+          const int buf_stride2 = 2 * buf_stride;
+          const int32_t a =
+              A[k] * 18 + (A[k - 1] + A[k + 1]) * 16 + A[k - buf_stride2] * 6 +
+              (A[k - 1 - buf_stride2] + A[k + 1 - buf_stride2]) * 4;
+          const int32_t b =
+              B[k] * 18 + (B[k - 1] + B[k + 1]) * 16 + B[k - buf_stride2] * 6 +
+              (B[k - 1 - buf_stride2] + B[k + 1 - buf_stride2]) * 4;
+          const int32_t v = a * dgd[l] + b;
+          dst[m] =
+              ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+        }
       }
-    } else if (i != height - 1) {  // odd row and not last
-      for (j = 0; j < width; ++j) {
-        const int k = i * buf_stride + j;
-        const int l = i * dgd_stride + j;
-        const int m = i * dst_stride + j;
-        const int nb = 6;
-        const int buf_stride2 = 2 * buf_stride;
-        const int32_t a = A[k] * 16 + (A[k - 1] + A[k + 1]) * 14 +
-                          (A[k - buf_stride2] + A[k + buf_stride2]) * 4 +
-                          (A[k - 1 - buf_stride2] + A[k - 1 + buf_stride2] +
-                           A[k + 1 - buf_stride2] + A[k + 1 + buf_stride2]) *
-                              3;
-        const int32_t b = B[k] * 16 + (B[k - 1] + B[k + 1]) * 14 +
-                          (B[k - buf_stride2] + B[k + buf_stride2]) * 4 +
-                          (B[k - 1 - buf_stride2] + B[k - 1 + buf_stride2] +
-                           B[k + 1 - buf_stride2] + B[k + 1 + buf_stride2]) *
-                              3;
-        const int32_t v = a * dgd[l] + b;
-        dst[m] =
-            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
-      }
-    } else {  // odd row and last
-      for (j = 0; j < width; ++j) {
-        const int k = i * buf_stride + j;
-        const int l = i * dgd_stride + j;
-        const int m = i * dst_stride + j;
-        const int nb = 6;
-        const int buf_stride2 = 2 * buf_stride;
-        const int32_t a = A[k] * 18 + (A[k - 1] + A[k + 1]) * 16 +
-                          A[k - buf_stride2] * 6 +
-                          (A[k - 1 - buf_stride2] + A[k + 1 - buf_stride2]) * 4;
-        const int32_t b = B[k] * 18 + (B[k - 1] + B[k + 1]) * 16 +
-                          B[k - buf_stride2] * 6 +
-                          (B[k - 1 - buf_stride2] + B[k + 1 - buf_stride2]) * 4;
-        const int32_t v = a * dgd[l] + b;
-        dst[m] =
-            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+    }
+  } else {  // r = 2
+    for (i = 0; i < height; ++i) {
+      if (!(i & 1)) {  // even row
+        for (j = 0; j < width; ++j) {
+          const int k = i * buf_stride + j;
+          const int l = i * dgd_stride + j;
+          const int m = i * dst_stride + j;
+          const int nb = 5;
+          const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
+                            (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
+                             A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
+                                5;
+          const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
+                            (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
+                             B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
+                                5;
+          const int32_t v = a * dgd[l] + b;
+          dst[m] =
+              ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+        }
+      } else {  // odd row
+        for (j = 0; j < width; ++j) {
+          const int k = i * buf_stride + j;
+          const int l = i * dgd_stride + j;
+          const int m = i * dst_stride + j;
+          const int nb = 4;
+          const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
+          const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
+          const int32_t v = a * dgd[l] + b;
+          dst[m] =
+              ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+        }
       }
     }
   }
@@ -1242,9 +1278,9 @@
   av1_selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
                                            flt1, flt_stride, bit_depth,
                                            params->r1, params->e1);
-  av1_selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
-                                           flt2, flt_stride, bit_depth,
-                                           params->r2, params->e2);
+  av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt2,
+                                      flt_stride, bit_depth, params->r2,
+                                      params->e2);
 #else
   av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
                                       flt_stride, bit_depth, params->r1,
@@ -1300,7 +1336,7 @@
 
   for (int j = 0; j < stripe_width; j += procunit_width) {
     int w = AOMMIN(procunit_width, stripe_width - j);
-#if CONFIG_FAST_SGR == 2
+#if CONFIG_FAST_SGR
     apply_selfguided_restoration_c(src + j, w, stripe_height, src_stride,
                                    rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
                                    dst + j, dst_stride, tmpbuf, bit_depth, 0);
@@ -1308,7 +1344,7 @@
     apply_selfguided_restoration(src + j, w, stripe_height, src_stride,
                                  rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
                                  dst + j, dst_stride, tmpbuf, bit_depth, 0);
-#endif  // CONFIG_FAST_SGR == 2
+#endif  // CONFIG_FAST_SGR
   }
 }
 
@@ -1344,7 +1380,7 @@
                                          int32_t *tmpbuf, int bit_depth) {
   for (int j = 0; j < stripe_width; j += procunit_width) {
     int w = AOMMIN(procunit_width, stripe_width - j);
-#if CONFIG_FAST_SGR == 2
+#if CONFIG_FAST_SGR
     apply_selfguided_restoration_c(src8 + j, w, stripe_height, src_stride,
                                    rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
                                    dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
@@ -1352,7 +1388,7 @@
     apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride,
                                  rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
                                  dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
-#endif  // CONFIG_FAST_SGR == 2
+#endif  // CONFIG_FAST_SGR
   }
 }
 
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index e8b40c0..2e18bb6 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -353,7 +353,7 @@
                              int width, int height, int dat_stride,
                              int use_highbd, int bit_depth, int32_t *flt1,
                              int32_t *flt2, int flt_stride) {
-#if CONFIG_FAST_SGR == 2
+#if CONFIG_FAST_SGR
   av1_selfguided_restoration_c(dat8, width, height, dat_stride, flt1, flt2,
                                flt_stride, params, bit_depth, use_highbd);
 #else