Add an alt FAST_SGR that subsamples horizontally

Adds a horizontally subsampled version of FAST_SGR that would not require an extra line buffer. This code is enabled by CONFIG_FAST_SGR=2

Change-Id: Icce07155eb085652cb298bae8d5f01d3136ce867
diff --git a/av1/common/restoration.c b/av1/common/restoration.c index 23a2a7e..2c40808 100644 --- a/av1/common/restoration.c +++ b/av1/common/restoration.c
@@ -772,7 +772,164 @@ 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, }; -#if CONFIG_FAST_SGR +#if CONFIG_FAST_SGR == 2 +static void av1_selfguided_restoration_fast2_internal( + int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst, + int dst_stride, int bit_depth, int r, int eps) { + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 16 bytes, for consistency + // with the SIMD version of this function. + int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *A = A_; + int32_t *B = B_; + int i, j; + + assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); + assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && + "Need SGRPROJ_BORDER_* >= r+1"); + + boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, + width_ext, height_ext, dgd_stride, r, 0, B, buf_stride); + boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, + width_ext, height_ext, dgd_stride, r, 1, A, buf_stride); + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie, + // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[]. 
+ for (i = -1; i < height + 1; ++i) { + for (j = -1; j < width + 1; j += 2) { + const int k = i * buf_stride + j; + const int n = (2 * r + 1) * (2 * r + 1); + + // a < 2^16 * n < 2^22 regardless of bit depth + uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8)); + // b < 2^8 * n < 2^14 regardless of bit depth + uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8); + + // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28, + // and p itself satisfies p < 2^14 * n^2 < 2^26. + // This bound on p is due to: + // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances + // + // Note: Sometimes, in high bit depth, we can end up with a*n < b*b. + // This is an artefact of rounding, and can only happen if all pixels + // are (almost) identical, so in this case we saturate to p=0. + uint32_t p = (a * n < b * b) ? 0 : a * n - b * b; + + // Note: If MAX_RADIUS <= 2, then this 's' is a function only of + // r and eps. Further, this is the only place we use 'eps', so we could + // pre-calculate 's' for each parameter set and store that in place of + // 'eps'. + uint32_t s = sgrproj_mtable[eps - 1][n - 1]; + + // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32 + // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12 + // (this holds even after accounting for the rounding in s) + const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS); + + // Note: We have to be quite careful about the value of A[k]. + // This is used as a blend factor between individual pixel values and the + // local mean. So it logically has a range of [0, 256], including both + // endpoints. + // + // This is a pain for hardware, as we'd like something which can be stored + // in exactly 8 bits. + // Further, in the calculation of B[k] below, if z == 0 and r == 2, + // then A[k] "should be" 0. But then we can end up setting B[k] to a value + // slightly above 2^(8 + bit depth), due to rounding in the value of + // one_by_x[25-1]. 
+ // + // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0. + // This fixes the above issues (256 - A[k] fits in a uint8, and we can't + // overflow), without significantly affecting the final result: z == 0 + // implies that the image is essentially "flat", so the local mean and + // individual pixel values are very similar. + // + // Note that saturating on the other side, i.e. requiring A[k] <= 255, + // would be a bad idea, as that corresponds to the case where the image + // is very variable, when we want to preserve the local pixel value as + // much as possible. + A[k] = x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256] + + // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n, + // one_by_x[n - 1] = round(2^12 / n) + // => the product here is < 2^(20 + bit_depth) <= 2^32, + // and B[k] is set to a value < 2^(8 + bit depth) + // This holds even with the rounding in one_by_x and in the overall + // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8. 
+ B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) * + (uint32_t)B[k] * + (uint32_t)one_by_x[n - 1], + SGRPROJ_RECIP_BITS); + } + } + // Use the A[] and B[] arrays to calculate the filtered image + for (i = 0; i < height; ++i) { + const int width2 = width + (width & 1); + for (j = 0; j < width2; j += 2) { + { // even col + const int k = i * buf_stride + j; + const int l = i * dgd_stride + j; + const int m = i * dst_stride + j; + const int nb = 5; + const int32_t a = (A[k - 1] + A[k + 1]) * 6 + + (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] + + A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) * + 5; + const int32_t b = (B[k - 1] + B[k + 1]) * 6 + + (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] + + B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) * + 5; + const int32_t v = a * dgd[l] + b; + dst[m] = + ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + if (j + 1 < width - 1) { // odd col and not last + const int k = i * buf_stride + j + 1; + const int l = i * dgd_stride + j + 1; + const int m = i * dst_stride + j + 1; + const int nb = 6; + const int32_t a = A[k] * 16 + + (A[k - buf_stride] + A[k + buf_stride]) * 14 + + (A[k - 2] + A[k + 2]) * 4 + + (A[k - 2 - buf_stride] + A[k - 2 + buf_stride] + + A[k + 2 - buf_stride] + A[k + 2 + buf_stride]) * + 3; + const int32_t b = B[k] * 16 + + (B[k - buf_stride] + B[k + buf_stride]) * 14 + + (B[k - 2] + B[k + 2]) * 4 + + (B[k - 2 - buf_stride] + B[k - 2 + buf_stride] + + B[k + 2 - buf_stride] + B[k + 2 + buf_stride]) * + 3; + const int32_t v = a * dgd[l] + b; + dst[m] = + ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } else if (j + 1 < width) { // odd col and last + const int k = i * buf_stride + j + 1; + const int l = i * dgd_stride + j + 1; + const int m = i * dst_stride + j + 1; + const int nb = 6; + const int32_t a = + A[k] * 18 + (A[k - buf_stride] + A[k + buf_stride]) * 16 + + A[k - 2] * 6 + (A[k - 2 - buf_stride] + A[k - 2 + buf_stride]) * 4; + const int32_t b = + 
B[k] * 18 + (B[k - buf_stride] + B[k + buf_stride]) * 16 + + B[k - 2] * 6 + (B[k - 2 - buf_stride] + B[k - 2 + buf_stride]) * 4; + const int32_t v = a * dgd[l] + b; + dst[m] = + ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } + } +} + +#elif CONFIG_FAST_SGR == 1 + static void av1_selfguided_restoration_fast_internal( int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps) { @@ -1076,7 +1233,14 @@ } } -#if CONFIG_FAST_SGR +#if CONFIG_FAST_SGR == 2 + av1_selfguided_restoration_fast2_internal(dgd32, width, height, dgd32_stride, + flt1, flt_stride, bit_depth, + params->r1, params->e1); + av1_selfguided_restoration_fast2_internal(dgd32, width, height, dgd32_stride, + flt2, flt_stride, bit_depth, + params->r2, params->e2); +#elif CONFIG_FAST_SGR == 1 av1_selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride, flt1, flt_stride, bit_depth, params->r1, params->e1); @@ -1138,9 +1302,15 @@ for (int j = 0; j < stripe_width; j += procunit_width) { int w = AOMMIN(procunit_width, stripe_width - j); +#if CONFIG_FAST_SGR == 2 + apply_selfguided_restoration_c(src + j, w, stripe_height, src_stride, + rui->sgrproj_info.ep, rui->sgrproj_info.xqd, + dst + j, dst_stride, tmpbuf, bit_depth, 0); +#else apply_selfguided_restoration(src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep, rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth, 0); +#endif // CONFIG_FAST_SGR == 2 } } @@ -1176,9 +1346,15 @@ int32_t *tmpbuf, int bit_depth) { for (int j = 0; j < stripe_width; j += procunit_width) { int w = AOMMIN(procunit_width, stripe_width - j); +#if CONFIG_FAST_SGR == 2 + apply_selfguided_restoration_c(src8 + j, w, stripe_height, src_stride, + rui->sgrproj_info.ep, rui->sgrproj_info.xqd, + dst8 + j, dst_stride, tmpbuf, bit_depth, 1); +#else apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep, rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, 
bit_depth, 1); +#endif // CONFIG_FAST_SGR == 2 } }
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c index a6a07b8..c4a5121 100644 --- a/av1/encoder/pickrst.c +++ b/av1/encoder/pickrst.c
@@ -367,7 +367,7 @@ int width, int height, int dat_stride, int use_highbd, int bit_depth, int32_t *flt1, int32_t *flt2, int flt_stride) { -#if CONFIG_FAST_SGR +#if CONFIG_FAST_SGR == 2 av1_selfguided_restoration_c(dat8, width, height, dat_stride, flt1, flt2, flt_stride, params, bit_depth, use_highbd); #else