Rearrange self-guided filter for vectorization
By rearranging the order of operations, we can ensure that all
intermediate values fit into 32 bits. This will help when we
vectorize the self-guided filter.
Results in the noise range.
Change-Id: Ic0c73613882bd103c4e8e57a0155b3132672ae04
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index 609ee07..adf0c2c 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -122,8 +122,8 @@
#define MAX_RADIUS 3 // Only 1, 2, 3 allowed
#define MAX_EPS 80 // Max value of eps
#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1))
-#define SGRPROJ_MTABLE_BITS 30
-#define SGRPROJ_RECIP_BITS 16
+#define SGRPROJ_MTABLE_BITS 20
+#define SGRPROJ_RECIP_BITS 12
// TODO(debargha): This table can be substantially reduced since only a few
// values are actually used.
@@ -604,11 +604,10 @@
};
static const uint16_t one_by_x[MAX_NELEM] = {
- 65535, 32768, 21845, 16384, 13107, 10923, 9362, 8192, 7282, 6554,
- 5958, 5461, 5041, 4681, 4369, 4096, 3855, 3641, 3449, 3277,
- 3121, 2979, 2849, 2731, 2621, 2521, 2427, 2341, 2260, 2185,
- 2114, 2048, 1986, 1928, 1872, 1820, 1771, 1725, 1680, 1638,
- 1598, 1560, 1524, 1489, 1456, 1425, 1394, 1365, 1337,
+ 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
+ 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, 158,
+ 152, 146, 141, 137, 132, 128, 124, 120, 117, 114, 111, 108, 105,
+ 102, 100, 98, 95, 93, 91, 89, 87, 85, 84
};
#endif // APPROXIMATE_SGR
@@ -619,7 +618,6 @@
int32_t *B = A + RESTORATION_TILEPELS_MAX;
int8_t num[RESTORATION_TILEPELS_MAX];
int i, j;
- eps <<= 2 * (bit_depth - 8);
// Don't filter tiles with dimensions < 5 on any axis
if ((width < 5) || (height < 5)) return;
@@ -632,30 +630,37 @@
for (j = 0; j < width; ++j) {
const int k = i * width + j;
const int n = num[k];
- // Assuming that we only allow up to 12-bit depth and r <= 2,
- // we calculate p = n^2 * Var(n-pixel block of original image)
- // (where n = 2 * r + 1 <= 5).
- //
- // There is an inequality which gives a bound on the variance:
- // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
- // In this case, since each pixel is in the range [0, 2^12),
- // the variance is at most 1/4 * (2^12)^2 = 2^22.
- // Then p <= 25^2 * 2^22 < 2^32, and also q <= p + 25^2 * 68 < 2^32.
- //
- // The point of all this is to guarantee that q < 2^32, so that
- // platforms with a 64-bit by 32-bit divide unit (eg, x86)
- // can do the division by q more efficiently.
- const uint32_t p = (uint32_t)((uint64_t)A[k] * n - (uint64_t)B[k] * B[k]);
#if APPROXIMATE_SGR
- const int s = sgrproj_mtable[eps - 1][n - 1];
- const int z =
- ((int64_t)s * (int64_t)p + (1 << (SGRPROJ_MTABLE_BITS - 1))) >>
- SGRPROJ_MTABLE_BITS;
- A[k] = x_by_xplus1[AOMMIN(z, 255)];
- B[k] = ((SGRPROJ_SGR - A[k]) * (int64_t)B[k] * (int64_t)one_by_x[n - 1] +
- (1 << (SGRPROJ_RECIP_BITS - 1))) >>
- SGRPROJ_RECIP_BITS;
+ // a < 2^16 * n < 2^22 regardless of bit depth
+ uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
+ // b < 2^8 * n < 2^14 regardless of bit depth
+ uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
+
+ // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
+ // and p itself satisfies p < 2^14 * n^2 < 2^26.
+ // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
+ // This is an artefact of rounding, and can only happen if all pixels
+ // are (almost) identical, so in this case we saturate to p=0.
+ uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
+ uint32_t s = sgrproj_mtable[eps - 1][n - 1];
+
+ // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
+ // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
+ // (this holds even after accounting for the rounding in s)
+ const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
+
+ A[k] = x_by_xplus1[AOMMIN(z, 255)]; // < 2^8
+
+ // SGRPROJ_SGR - A[k] < 2^8, B[k] < 2^(bit_depth) * n,
+ // one_by_x[n - 1] = round(2^12 / n)
+ // => the product here is < 2^(20 + bit_depth) <= 2^32,
+ // and B[k] is set to a value < 2^(8 + bit depth)
+ B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
+ (uint32_t)B[k] *
+ (uint32_t)one_by_x[n - 1],
+ SGRPROJ_RECIP_BITS);
#else
+ const uint32_t p = (uint32_t)((uint64_t)A[k] * n - (uint64_t)B[k] * B[k]);
const uint32_t q = (uint32_t)(p + n * n * eps);
assert((uint64_t)A[k] * n - (uint64_t)B[k] * B[k] < (25 * 25U << 22));
A[k] = (int32_t)(((uint64_t)p << SGRPROJ_SGR_BITS) + (q >> 1)) / q;