Replace division in self-guided filter
Replaces division with multiplication in self-guided
filter.
The guided filter requires computation of:
n^2.s^2/(n^2.s^2 + n^2.e).
This is now implemented by computation of n^2.s^2/n^2.e followed
by using a lookup table for the function f(x) = x/(x+1).
To compute n^2.s^2/n^2.e, we use an integer multiplication based
implementation which becomes feasible since n^2.e can only
take a few values and their corresponding multipliers can be
pre-computed.
There is also another divison by n, that is also integerized.
Change-Id: Id7b81bbafead0b8f04a1853ec69b9dec423bb66a
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index 80619a7..609ee07 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -116,7 +116,34 @@
}
#endif // USE_DOMAINTXFMRF
+#define APPROXIMATE_SGR 1
+
+#if APPROXIMATE_SGR
+#define MAX_RADIUS 3 // Only 1, 2, 3 allowed
+#define MAX_EPS 80 // Max value of eps
+#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1))
+#define SGRPROJ_MTABLE_BITS 30
+#define SGRPROJ_RECIP_BITS 16
+
+// TODO(debargha): This table can be substantially reduced since only a few
+// values are actually used.
+static int sgrproj_mtable[MAX_EPS][MAX_NELEM];
+
+static void GenSgrprojVtable() {
+ int e, n;
+ for (e = 1; e <= MAX_EPS; ++e)
+ for (n = 1; n <= MAX_NELEM; ++n) {
+ const int n2e = n * n * e;
+ sgrproj_mtable[e - 1][n - 1] =
+ (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
+ }
+}
+#endif // APPROXIMATE_SGR
+
void av1_loop_restoration_precal() {
+#if APPROXIMATE_SGR
+ GenSgrprojVtable();
+#endif // APPROXIMATE_SGR
#if USE_DOMAINTXFMRF
GenDomainTxfmRFVtable();
#endif // USE_DOMAINTXFMRF
@@ -554,7 +581,37 @@
xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
}
-#define APPROXIMATE_SGR 1
+#if APPROXIMATE_SGR
+static const uint16_t x_by_xplus1[256] = {
+ 0, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
+ 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
+ 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
+ 250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
+ 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
+ 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
+ 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 256,
+};
+
+static const uint16_t one_by_x[MAX_NELEM] = {
+ 65535, 32768, 21845, 16384, 13107, 10923, 9362, 8192, 7282, 6554,
+ 5958, 5461, 5041, 4681, 4369, 4096, 3855, 3641, 3449, 3277,
+ 3121, 2979, 2849, 2731, 2621, 2521, 2427, 2341, 2260, 2185,
+ 2114, 2048, 1986, 1928, 1872, 1820, 1771, 1725, 1680, 1638,
+ 1598, 1560, 1524, 1489, 1456, 1425, 1394, 1365, 1337,
+};
+#endif // APPROXIMATE_SGR
+
void av1_selfguided_restoration(int32_t *dgd, int width, int height, int stride,
int bit_depth, int r, int eps,
int32_t *tmpbuf) {
@@ -570,8 +627,6 @@
boxsum(dgd, width, height, stride, r, 0, B, width);
boxsum(dgd, width, height, stride, r, 1, A, width);
boxnum(width, height, r, num, width);
- // The following loop is optimized assuming r <= 2. If we allow
- // r > 2, then the loop will need modifying.
assert(r <= 3);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
@@ -591,10 +646,21 @@
// platforms with a 64-bit by 32-bit divide unit (eg, x86)
// can do the division by q more efficiently.
const uint32_t p = (uint32_t)((uint64_t)A[k] * n - (uint64_t)B[k] * B[k]);
+#if APPROXIMATE_SGR
+ const int s = sgrproj_mtable[eps - 1][n - 1];
+ const int z =
+ ((int64_t)s * (int64_t)p + (1 << (SGRPROJ_MTABLE_BITS - 1))) >>
+ SGRPROJ_MTABLE_BITS;
+ A[k] = x_by_xplus1[AOMMIN(z, 255)];
+ B[k] = ((SGRPROJ_SGR - A[k]) * (int64_t)B[k] * (int64_t)one_by_x[n - 1] +
+ (1 << (SGRPROJ_RECIP_BITS - 1))) >>
+ SGRPROJ_RECIP_BITS;
+#else
const uint32_t q = (uint32_t)(p + n * n * eps);
assert((uint64_t)A[k] * n - (uint64_t)B[k] * B[k] < (25 * 25U << 22));
A[k] = (int32_t)(((uint64_t)p << SGRPROJ_SGR_BITS) + (q >> 1)) / q;
B[k] = ((SGRPROJ_SGR - A[k]) * B[k] + (n >> 1)) / n;
+#endif // APPROXIMATE_SGR
}
}
#if APPROXIMATE_SGR