Expand the parameter set for sgrproj restoration

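A slight improvement for lowres and midres.

Also adds boxsum3(), a box-sum specialization for r = 3, and boxsumr(),
a generic fallback used for any other r.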

Change-Id: I377ba41034e1d70320e0c694d90a058e7809b129
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index 50a1eec..c4997b1 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -33,8 +33,8 @@
 
 const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
   // r1, eps1, r2, eps2
-  { 2, 27, 1, 11 }, { 2, 31, 1, 12 }, { 2, 37, 1, 12 }, { 2, 44, 1, 12 },
-  { 2, 49, 1, 13 }, { 2, 54, 1, 14 }, { 2, 60, 1, 15 }, { 2, 68, 1, 15 },
+  { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 }, { 2, 55, 1, 14 },
+  { 2, 65, 1, 15 }, { 3, 50, 2, 25 }, { 3, 60, 2, 35 }, { 3, 70, 2, 45 },
 };
 
 typedef void (*restore_func_type)(uint8_t *data8, int width, int height,
@@ -191,7 +191,7 @@
 
 /* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
    over the input. The window is of size (2r + 1)x(2r + 1), and we
-   only ever have r = 1 or r = 2. So we specialise to these two sizes.
+   specialize to r = 1, 2, 3. A default function is used for r > 3.
 
    Each loop follows the same format: We keep a window's worth of input
    in individual variables and select data out of that as appropriate.
@@ -348,16 +348,161 @@
   }
 }
 
+// r = 3 box sum: 7x7 windows clipped at the edges. Needs width, height >= 7.
+static void boxsum3(int32_t *src, int width, int height, int src_stride,
+                    int sqr, int32_t *dst, int dst_stride) {
+  int row, col, t0, t1, t2, t3, t4, t5, t6;
+  if (sqr) {  // Pass 1: 7-row sums of src^2 down each column, into dst.
+    for (col = 0; col < width; ++col) {
+      const int32_t *in = src + col;
+      int32_t *out = dst + col;
+      t0 = in[0] * in[0];
+      t1 = in[1 * src_stride] * in[1 * src_stride];
+      t2 = in[2 * src_stride] * in[2 * src_stride];
+      t3 = in[3 * src_stride] * in[3 * src_stride];
+      t4 = in[4 * src_stride] * in[4 * src_stride];
+      t5 = in[5 * src_stride] * in[5 * src_stride];
+      t6 = in[6 * src_stride] * in[6 * src_stride];
+      out[0] = t0 + t1 + t2 + t3;  // top rows: window clipped above
+      out[dst_stride] = t0 + t1 + t2 + t3 + t4;
+      out[2 * dst_stride] = t0 + t1 + t2 + t3 + t4 + t5;
+      for (row = 3; row < height - 4; ++row) {
+        out[row * dst_stride] = t0 + t1 + t2 + t3 + t4 + t5 + t6;
+        t0 = t1;  // slide the window down by one row
+        t1 = t2;
+        t2 = t3;
+        t3 = t4;
+        t4 = t5;
+        t5 = t6;
+        t6 = in[(row + 4) * src_stride] * in[(row + 4) * src_stride];
+      }
+      out[row * dst_stride] = t0 + t1 + t2 + t3 + t4 + t5 + t6;
+      out[(row + 1) * dst_stride] = t1 + t2 + t3 + t4 + t5 + t6;
+      out[(row + 2) * dst_stride] = t2 + t3 + t4 + t5 + t6;
+      out[(row + 3) * dst_stride] = t3 + t4 + t5 + t6;  // bottom row
+    }
+  } else {  // Pass 1: 7-row sums of src down each column, into dst.
+    for (col = 0; col < width; ++col) {
+      const int32_t *in = src + col;
+      int32_t *out = dst + col;
+      t0 = in[0];
+      t1 = in[1 * src_stride];
+      t2 = in[2 * src_stride];
+      t3 = in[3 * src_stride];
+      t4 = in[4 * src_stride];
+      t5 = in[5 * src_stride];
+      t6 = in[6 * src_stride];
+      out[0] = t0 + t1 + t2 + t3;  // top rows: window clipped above
+      out[dst_stride] = t0 + t1 + t2 + t3 + t4;
+      out[2 * dst_stride] = t0 + t1 + t2 + t3 + t4 + t5;
+      for (row = 3; row < height - 4; ++row) {
+        out[row * dst_stride] = t0 + t1 + t2 + t3 + t4 + t5 + t6;
+        t0 = t1;  // slide the window down by one row
+        t1 = t2;
+        t2 = t3;
+        t3 = t4;
+        t4 = t5;
+        t5 = t6;
+        t6 = in[(row + 4) * src_stride];
+      }
+      out[row * dst_stride] = t0 + t1 + t2 + t3 + t4 + t5 + t6;
+      out[(row + 1) * dst_stride] = t1 + t2 + t3 + t4 + t5 + t6;
+      out[(row + 2) * dst_stride] = t2 + t3 + t4 + t5 + t6;
+      out[(row + 3) * dst_stride] = t3 + t4 + t5 + t6;  // bottom row
+    }
+  }
+  // Pass 2: 7-tap sums along each row of dst, in place.
+  for (row = 0; row < height; ++row) {
+    int32_t *line = dst + row * dst_stride;
+    t0 = line[0];
+    t1 = line[1];
+    t2 = line[2];
+    t3 = line[3];
+    t4 = line[4];
+    t5 = line[5];
+    t6 = line[6];
+    line[0] = t0 + t1 + t2 + t3;  // left columns: window clipped
+    line[1] = t0 + t1 + t2 + t3 + t4;
+    line[2] = t0 + t1 + t2 + t3 + t4 + t5;
+    for (col = 3; col < width - 4; ++col) {
+      line[col] = t0 + t1 + t2 + t3 + t4 + t5 + t6;
+      t0 = t1;  // slide the window right; t6 is read before it is overwritten
+      t1 = t2;
+      t2 = t3;
+      t3 = t4;
+      t4 = t5;
+      t5 = t6;
+      t6 = line[col + 4];
+    }
+    line[col] = t0 + t1 + t2 + t3 + t4 + t5 + t6;
+    line[col + 1] = t1 + t2 + t3 + t4 + t5 + t6;
+    line[col + 2] = t2 + t3 + t4 + t5 + t6;
+    line[col + 3] = t3 + t4 + t5 + t6;  // right-most column
+  }
+}
+
+// Generic box sum for any r, built from column then row prefix sums; the
+// (2r + 1)x(2r + 1) window is clipped at the frame edges. Sums squares if
+// sqr != 0. Assumes width, height >= 2r + 1. NOTE(review): the result of
+// aom_malloc() is not checked. To be removed after experiments are done.
+static void boxsumr(int32_t *src, int width, int height, int src_stride, int r,
+                    int sqr, int32_t *dst, int dst_stride) {
+  int32_t *tmp = aom_malloc(sizeof(*tmp) * width * height);
+  int tmp_stride = width;
+  int i, j;
+  // tmp[i][j] = sum over rows 0..i of column j of src (or of src^2).
+  for (j = 0; j < width; ++j) tmp[j] = sqr ? src[j] * src[j] : src[j];
+  for (j = 0; j < width; ++j)
+    for (i = 1; i < height; ++i) {
+      const int32_t v = src[i * src_stride + j];
+      tmp[i * tmp_stride + j] =
+          tmp[(i - 1) * tmp_stride + j] + (sqr ? v * v : v);
+    }
+  // Vertical window sums = differences of prefix sums (clipped at edges).
+  for (i = 0; i <= r; ++i)
+    memcpy(&dst[i * dst_stride], &tmp[(i + r) * tmp_stride],
+           sizeof(*tmp) * width);
+  for (i = r + 1; i < height - r; ++i)
+    for (j = 0; j < width; ++j)
+      dst[i * dst_stride + j] =
+          tmp[(i + r) * tmp_stride + j] - tmp[(i - r - 1) * tmp_stride + j];
+  for (i = height - r; i < height; ++i)
+    for (j = 0; j < width; ++j)
+      dst[i * dst_stride + j] = tmp[(height - 1) * tmp_stride + j] -
+                                tmp[(i - r - 1) * tmp_stride + j];
+
+  // Row-wise prefix sums of the vertical sums (read via dst_stride) into tmp.
+  for (i = 0; i < height; ++i) tmp[i * tmp_stride] = dst[i * dst_stride];
+  for (i = 0; i < height; ++i)
+    for (j = 1; j < width; ++j)
+      tmp[i * tmp_stride + j] =
+          tmp[i * tmp_stride + j - 1] + dst[i * dst_stride + j];
+
+  // Horizontal window sums, again clipped at the left/right edges.
+  for (j = 0; j <= r; ++j)
+    for (i = 0; i < height; ++i)
+      dst[i * dst_stride + j] = tmp[i * tmp_stride + j + r];
+  for (j = r + 1; j < width - r; ++j)
+    for (i = 0; i < height; ++i)
+      dst[i * dst_stride + j] =
+          tmp[i * tmp_stride + j + r] - tmp[i * tmp_stride + j - r - 1];
+  for (j = width - r; j < width; ++j)
+    for (i = 0; i < height; ++i)
+      dst[i * dst_stride + j] =
+          tmp[i * tmp_stride + width - 1] - tmp[i * tmp_stride + j - r - 1];
+  aom_free(tmp);
+}
+
 static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                    int sqr, int32_t *dst, int dst_stride) {
   if (r == 1)
     boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
   else if (r == 2)
     boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
-  else {
-    assert(0 && "boxsum no longer supports r > 2");
-    return;
-  }
+  else if (r == 3)  // unrolled kernel for the 7x7 window
+    boxsum3(src, width, height, src_stride, sqr, dst, dst_stride);
+  else  // any other r: generic fallback
+    boxsumr(src, width, height, src_stride, r, sqr, dst, dst_stride);
 }
 
 static void boxnum(int width, int height, int r, int8_t *num, int num_stride) {
@@ -415,7 +560,7 @@
   boxnum(width, height, r, num, width);
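-  // The following loop is optimized assuming r <= 2. If we allow
-  // r > 2, then the loop will need modifying.
+  // The following loop is optimized assuming r <= 3. If we allow
+  // r > 3, then the loop will need modifying.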
-  assert(r <= 2);
+  assert(r <= 3);
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
       const int k = i * width + j;