Rearrange self-guided filter for vectorization

By rearranging the order of operations, we can ensure that all
intermediate values fit into 32 bits. This will help when we
vectorize the self-guided filter.

Results are within the noise range.
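
As a rough illustration (not part of the patch), the standalone C sketch
below only sanity-checks the worst-case arithmetic under the same
assumptions the new comments rely on: MAX_RADIUS = 3 (so n <= 49),
SGRPROJ_MTABLE_BITS = 20, SGRPROJ_RECIP_BITS = 12, bit depths up to 12
and eps >= 4. It does not run the filter itself.

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void) {
    /* Worst cases under the assumptions above. */
    const uint64_t n = 49;         /* (2 * MAX_RADIUS + 1)^2 with MAX_RADIUS = 3 */
    const uint64_t eps = 4;        /* smallest eps for which p * s stays below 2^32 */
    const uint64_t bit_depth = 12; /* highest supported bit depth */
    const uint64_t limit = 1ULL << 32;

    /* After renormalizing to 8 bits, a < 2^16 * n and b < 2^8 * n, so each
     * term of p = a * n - b * b is below 2^16 * n^2 < 2^28. */
    assert((1ULL << 16) * n * n < limit);

    /* Popoviciu's inequality bounds the variance of 8-bit samples by 2^14,
     * so p < 2^14 * n^2; with s = round(2^20 / (n^2 * eps)) this gives
     * p * s < 2^34 / eps, which stays within 32 bits once eps >= 4. */
    const uint64_t s = ((1ULL << 20) + n * n * eps / 2) / (n * n * eps);
    assert((1ULL << 14) * n * n * s <= limit);

    /* (SGRPROJ_SGR - A[k]) < 2^8, B[k] < 2^bit_depth * n and
     * one_by_x[n - 1] = round(2^12 / n), so their product is below
     * 2^(20 + bit_depth) <= 2^32. */
    assert((1ULL << (20 + bit_depth)) <= limit);

    printf("all intermediate bounds fit in 32 bits\n");
    return 0;
  }
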

Change-Id: Ic0c73613882bd103c4e8e57a0155b3132672ae04
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index 609ee07..adf0c2c 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -122,8 +122,8 @@
 #define MAX_RADIUS 3  // Only 1, 2, 3 allowed
 #define MAX_EPS 80    // Max value of eps
 #define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1))
-#define SGRPROJ_MTABLE_BITS 30
-#define SGRPROJ_RECIP_BITS 16
+#define SGRPROJ_MTABLE_BITS 20
+#define SGRPROJ_RECIP_BITS 12
 
 // TODO(debargha): This table can be substantially reduced since only a few
 // values are actually used.
@@ -604,11 +604,10 @@
 };
 
 static const uint16_t one_by_x[MAX_NELEM] = {
-  65535, 32768, 21845, 16384, 13107, 10923, 9362, 8192, 7282, 6554,
-  5958,  5461,  5041,  4681,  4369,  4096,  3855, 3641, 3449, 3277,
-  3121,  2979,  2849,  2731,  2621,  2521,  2427, 2341, 2260, 2185,
-  2114,  2048,  1986,  1928,  1872,  1820,  1771, 1725, 1680, 1638,
-  1598,  1560,  1524,  1489,  1456,  1425,  1394, 1365, 1337,
+  4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
+  293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164, 158,
+  152,  146,  141,  137,  132, 128, 124, 120, 117, 114, 111, 108, 105,
+  102,  100,  98,   95,   93,  91,  89,  87,  85,  84
 };
 #endif  // APPROXIMATE_SGR
 
@@ -619,7 +618,6 @@
   int32_t *B = A + RESTORATION_TILEPELS_MAX;
   int8_t num[RESTORATION_TILEPELS_MAX];
   int i, j;
-  eps <<= 2 * (bit_depth - 8);
 
   // Don't filter tiles with dimensions < 5 on any axis
   if ((width < 5) || (height < 5)) return;
@@ -632,30 +630,37 @@
     for (j = 0; j < width; ++j) {
       const int k = i * width + j;
       const int n = num[k];
-      // Assuming that we only allow up to 12-bit depth and r <= 2,
-      // we calculate p = n^2 * Var(n-pixel block of original image)
-      // (where n = 2 * r + 1 <= 5).
-      //
-      // There is an inequality which gives a bound on the variance:
-      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
-      // In this case, since each pixel is in the range [0, 2^12),
-      // the variance is at most 1/4 * (2^12)^2 = 2^22.
-      // Then p <= 25^2 * 2^22 < 2^32, and also q <= p + 25^2 * 68 < 2^32.
-      //
-      // The point of all this is to guarantee that q < 2^32, so that
-      // platforms with a 64-bit by 32-bit divide unit (eg, x86)
-      // can do the division by q more efficiently.
-      const uint32_t p = (uint32_t)((uint64_t)A[k] * n - (uint64_t)B[k] * B[k]);
 #if APPROXIMATE_SGR
-      const int s = sgrproj_mtable[eps - 1][n - 1];
-      const int z =
-          ((int64_t)s * (int64_t)p + (1 << (SGRPROJ_MTABLE_BITS - 1))) >>
-          SGRPROJ_MTABLE_BITS;
-      A[k] = x_by_xplus1[AOMMIN(z, 255)];
-      B[k] = ((SGRPROJ_SGR - A[k]) * (int64_t)B[k] * (int64_t)one_by_x[n - 1] +
-              (1 << (SGRPROJ_RECIP_BITS - 1))) >>
-             SGRPROJ_RECIP_BITS;
+      // a < 2^16 * n < 2^22 regardless of bit depth
+      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
+      // b < 2^8 * n < 2^14 regardless of bit depth
+      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
+
+      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
+      // and p itself satisfies p < 2^14 * n^2 < 2^26.
+      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
+      // This is an artefact of rounding, and can only happen if all pixels
+      // are (almost) identical, so in this case we saturate to p=0.
+      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
+      uint32_t s = sgrproj_mtable[eps - 1][n - 1];
+
+      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
+      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
+      // (this holds even after accounting for the rounding in s)
+      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
+
+      A[k] = x_by_xplus1[AOMMIN(z, 255)];  // < 2^8
+
+      // SGRPROJ_SGR - A[k] < 2^8, B[k] < 2^(bit_depth) * n,
+      // one_by_x[n - 1] = round(2^12 / n)
+      // => the product here is < 2^(20 + bit_depth) <= 2^32,
+      // and B[k] is set to a value < 2^(8 + bit depth)
+      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
+                                             (uint32_t)B[k] *
+                                             (uint32_t)one_by_x[n - 1],
+                                         SGRPROJ_RECIP_BITS);
 #else
+      const uint32_t p = (uint32_t)((uint64_t)A[k] * n - (uint64_t)B[k] * B[k]);
       const uint32_t q = (uint32_t)(p + n * n * eps);
       assert((uint64_t)A[k] * n - (uint64_t)B[k] * B[k] < (25 * 25U << 22));
       A[k] = (int32_t)(((uint64_t)p << SGRPROJ_SGR_BITS) + (q >> 1)) / q;