Integerized SGR subspace projection

I have developed an integer-only implementation of the subspace
projection in the self-guided restoration filter encoder.  This is
mainly to serve as a more useful reference for hardware encoder
implementation.  This patch includes highbd and lowbd codepaths.

This change can be disabled by setting the build-time experiment flag
CONFIG_INTEGERIZE_SGR to 0

STATS_CHANGED:
No significant change in performance observed.  8-bit AWCY:
https://arewecompressedyet.com/?job=lowbd_base%402018-10-02T18%3A41%3A06.713Z&job=lowbd_test%402018-10-02T18%3A41%3A25.611Z
10-bit AWCY:
https://arewecompressedyet.com/?job=highbd_base%402018-10-02T18%3A41%3A36.601Z&job=highbd_test%402018-10-02T18%3A41%3A46.608Z

I have tested 12-bit videos locally with no significant change.

Change-Id: I98dc548dc17383a8bb42c5d00f0c06889c625d43
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index 0141aeb..4faa849 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -401,12 +401,112 @@
   return err;
 }
 
+#if CONFIG_INTEGERIZE_SGR
+static int64_t signed_rounded_divide(int64_t dividend, int64_t divisor) {
+  if (dividend < 0)
+    return (dividend - divisor / 2) / divisor;
+  else
+    return (dividend + divisor / 2) / divisor;
+}
+#endif  // CONFIG_INTEGERIZE_SGR
+
 static void get_proj_subspace(const uint8_t *src8, int width, int height,
                               int src_stride, const uint8_t *dat8,
                               int dat_stride, int use_highbitdepth,
                               int32_t *flt0, int flt0_stride, int32_t *flt1,
                               int flt1_stride, int *xq,
                               const sgr_params_type *params) {
+#if CONFIG_INTEGERIZE_SGR
+  int i, j;
+  int64_t H[2][2] = { { 0, 0 }, { 0, 0 } };
+  int64_t C[2] = { 0, 0 };
+  const int size = width * height;
+
+  // Default values to be returned if the problem becomes ill-posed
+  xq[0] = 0;
+  xq[1] = 0;
+
+  if (!use_highbitdepth) {
+    const uint8_t *src = src8;
+    const uint8_t *dat = dat8;
+    for (i = 0; i < height; ++i) {
+      for (j = 0; j < width; ++j) {
+        const int32_t u =
+            (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+        const int32_t s =
+            (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+        const int32_t f1 =
+            (params->r[0] > 0) ? (int32_t)flt0[i * flt0_stride + j] - u : 0;
+        const int32_t f2 =
+            (params->r[1] > 0) ? (int32_t)flt1[i * flt1_stride + j] - u : 0;
+        H[0][0] += (int64_t)f1 * f1;
+        H[1][1] += (int64_t)f2 * f2;
+        H[0][1] += (int64_t)f1 * f2;
+        C[0] += (int64_t)f1 * s;
+        C[1] += (int64_t)f2 * s;
+      }
+    }
+  } else {
+    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+    const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+    for (i = 0; i < height; ++i) {
+      for (j = 0; j < width; ++j) {
+        const int32_t u =
+            (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+        const int32_t s =
+            (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+        const int32_t f1 =
+            (params->r[0] > 0) ? (int32_t)flt0[i * flt0_stride + j] - u : 0;
+        const int32_t f2 =
+            (params->r[1] > 0) ? (int32_t)flt1[i * flt1_stride + j] - u : 0;
+        H[0][0] += (int64_t)f1 * f1;
+        H[1][1] += (int64_t)f2 * f2;
+        H[0][1] += (int64_t)f1 * f2;
+        C[0] += (int64_t)f1 * s;
+        C[1] += (int64_t)f2 * s;
+      }
+    }
+  }
+  H[0][0] /= size;
+  H[0][1] /= size;
+  H[1][1] /= size;
+  H[1][0] = H[0][1];
+  C[0] /= size;
+  C[1] /= size;
+  if (params->r[0] == 0) {
+    // H matrix is now only the scalar H[1][1]
+    // C vector is now only the scalar C[1]
+    const int64_t Det = H[1][1];
+    if (Det == 0) return;  // ill-posed, return default values
+    xq[0] = 0;
+    xq[1] = (int)signed_rounded_divide(C[1] * (1 << SGRPROJ_PRJ_BITS), Det);
+  } else if (params->r[1] == 0) {
+    // H matrix is now only the scalar H[0][0]
+    // C vector is now only the scalar C[0]
+    const int64_t Det = H[0][0];
+    if (Det == 0) return;  // ill-posed, return default values
+    xq[0] = (int)signed_rounded_divide(C[0] * (1 << SGRPROJ_PRJ_BITS), Det);
+    xq[1] = 0;
+  } else {
+    const int64_t Det = H[0][0] * H[1][1] - H[0][1] * H[1][0];
+    if (Det == 0) return;  // ill-posed, return default values
+
+    // If scaling up dividend would overflow, instead scale down the divisor
+    const int64_t div1 = H[1][1] * C[0] - H[0][1] * C[1];
+    if ((div1 > 0 && INT64_MAX / (1 << SGRPROJ_PRJ_BITS) < div1) ||
+        (div1 < 0 && INT64_MIN / (1 << SGRPROJ_PRJ_BITS) > div1))
+      xq[0] = (int)signed_rounded_divide(div1, Det / (1 << SGRPROJ_PRJ_BITS));
+    else
+      xq[0] = (int)signed_rounded_divide(div1 * (1 << SGRPROJ_PRJ_BITS), Det);
+
+    const int64_t div2 = H[0][0] * C[1] - H[1][0] * C[0];
+    if ((div2 > 0 && INT64_MAX / (1 << SGRPROJ_PRJ_BITS) < div2) ||
+        (div2 < 0 && INT64_MIN / (1 << SGRPROJ_PRJ_BITS) > div2))
+      xq[1] = (int)signed_rounded_divide(div2, Det / (1 << SGRPROJ_PRJ_BITS));
+    else
+      xq[1] = (int)signed_rounded_divide(div2 * (1 << SGRPROJ_PRJ_BITS), Det);
+  }
+#else   // CONFIG_INTEGERIZE_SGR
   int i, j;
   double H[2][2] = { { 0, 0 }, { 0, 0 } };
   double C[2] = { 0, 0 };
@@ -493,6 +593,7 @@
     xq[0] = (int)rint(x[0] * (1 << SGRPROJ_PRJ_BITS));
     xq[1] = (int)rint(x[1] * (1 << SGRPROJ_PRJ_BITS));
   }
+#endif  // CONFIG_INTEGERIZE_SGR
 }
 
 void encode_xq(int *xq, int *xqd, const sgr_params_type *params) {
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index a07438c..a33b133 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -137,6 +137,7 @@
                    "AV1 experiment flag.")
 set_aom_config_var(CONFIG_SHARP_SETTINGS 0 NUMBER
                    "Use sharper encoding settings")
+set_aom_config_var(CONFIG_INTEGERIZE_SGR 1 NUMBER "AV1 experiment flag.")
 
 #
 # Variables in this section control optional features of the build system.