Reduce/eliminate the line buffer for loop restoration.
For the Wiener filter, this patch constrains the vertical filtering of
the top and bottom rows of a processing unit to use no more border than
the WIENER_BORDER_VERT macro allows.
The macro is currently set to 0 to eliminate the line buffer completely,
but it can be raised to 1 or 2 to use a limited line buffer if coding
efficiency suffers too much without one.
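As a rough illustration, the stepdown is equivalent to folding the
out-of-range taps into the nearest usable one (a minimal sketch; the
fold_taps name is hypothetical, and it assumes <string.h> plus the
codebase's InterpKernel and WIENER_HALFWIN definitions):

  // Fold the taps that would read beyond the allowed border into the
  // nearest in-bounds tap; the kernel's total weight is preserved.
  static void fold_taps(const InterpKernel orig, InterpKernel vert,
                        int boundary_dist) {
    memcpy(vert, orig, sizeof(InterpKernel));
    for (int t = 0; t < WIENER_HALFWIN - boundary_dist; ++t) {
      vert[WIENER_HALFWIN - boundary_dist] += vert[t];
      vert[t] = 0;
    }
  }

This matches the switch in stepdown_wiener_kernel below for the top
rows; for the bottom rows the same folding is applied to the vertically
mirrored kernel (the istop == 0 swap).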
Also, for the sgr filter, this patch adds the option of using windows
that overlap horizontally and vertically to improve coding efficiency.
The vertical border is set by the SGRPROJ_BORDER_VERT macro; the
horizontal border is set by the SGRPROJ_BORDER_HORZ macro, currently 2,
the maximum needed. We do not recommend lowering SGRPROJ_BORDER_HORZ
below 2.
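Conceptually, the overlap just grows each sgr window by the configured
borders before the box sums are taken, as in this excerpt from the
restoration.c change below:

  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
  // Box sums start from the top-left corner of the extended window.
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);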
The overall line buffer requirement for LR is twice the max of
WIENER_BORDER_VERT and SGRPROJ_BORDER_VERT. Currently both are set to 0,
eliminating line buffers completely.
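For example, raising WIENER_BORDER_VERT to 2 while keeping
SGRPROJ_BORDER_VERT at 0 would require 2 * max(2, 0) = 4 rows of line
buffer.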
This patch also extends frame borders consistently before CDEF / LR.
Change-Id: Ie58a98c784a0db547627b9cfcf55f018c30e8e79
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index 2064330..989e0c9 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -146,15 +146,36 @@
h_end - h_start);
}
-// Convert 7-tap filter to 5-tap for top and bottom rows of a processing unit
-static void stepdown_wiener_kernel(const InterpKernel orig, InterpKernel vert) {
+static void stepdown_wiener_kernel(const InterpKernel orig, InterpKernel vert,
+ int boundary_dist, int istop) {
memcpy(vert, orig, sizeof(InterpKernel));
- int delta = vert[0] / 2;
- vert[1] += delta;
- vert[WIENER_WIN - 2] += delta;
- vert[2] += vert[0] - delta;
- vert[WIENER_WIN - 3] += vert[0] - delta;
- vert[0] = vert[WIENER_WIN - 1] = 0;
+ switch (boundary_dist) {
+ case 0:
+ vert[WIENER_HALFWIN] += vert[2] + vert[1] + vert[0];
+ vert[2] = vert[1] = vert[0] = 0;
+ break;
+ case 1:
+ vert[2] += vert[1] + vert[0];
+ vert[1] = vert[0] = 0;
+ break;
+ case 2:
+ vert[1] += vert[0];
+ vert[0] = 0;
+ break;
+ default: break;
+ }
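+  // Bottom rows use the vertically mirrored kernel: swap taps end-for-end.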
+ if (!istop) {
+ int tmp;
+ tmp = vert[0];
+ vert[0] = vert[WIENER_WIN - 1];
+ vert[WIENER_WIN - 1] = tmp;
+ tmp = vert[1];
+ vert[1] = vert[WIENER_WIN - 2];
+ vert[WIENER_WIN - 2] = tmp;
+ tmp = vert[2];
+ vert[2] = vert[WIENER_WIN - 3];
+ vert[WIENER_WIN - 3] = tmp;
+ }
}
static void loop_wiener_filter_tile(uint8_t *data, int tile_idx, int width,
@@ -173,8 +194,6 @@
return;
}
InterpKernel vertical_topbot;
- stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
- vertical_topbot);
av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
tile_width, tile_height, width, height, 0, 0,
&h_start, &h_end, &v_start, &v_end);
@@ -186,42 +205,50 @@
int h = AOMMIN(procunit_height, (v_end - i + 15) & ~15);
const uint8_t *data_p = data + i * stride + j;
uint8_t *dst_p = dst + i * dst_stride + j;
-// Use 5-tap vertical filtering for top and bottom rows in
-// processing unit
+ // Note h is at least 16
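+  // Filter the top (WIENER_HALFWIN - WIENER_BORDER_VERT) rows one at a
+  // time, each with a kernel stepped down for its distance to the border.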
+ for (int b = 0; b < WIENER_HALFWIN - WIENER_BORDER_VERT; ++b) {
+ stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
+ vertical_topbot, WIENER_BORDER_VERT + b, 1);
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
- aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
- rst->rsi->wiener_info[tile_idx].hfilter, 16,
- vertical_topbot, 16, w, 1);
+ aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
+ rst->rsi->wiener_info[tile_idx].hfilter, 16,
+ vertical_topbot, 16, w, 1);
#else
- aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
- rst->rsi->wiener_info[tile_idx].hfilter, 16,
- vertical_topbot, 16, w, 1);
+ aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
+ rst->rsi->wiener_info[tile_idx].hfilter, 16,
+ vertical_topbot, 16, w, 1);
#endif // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
- data_p += stride;
- dst_p += dst_stride;
-// Note h is at least 16
+ data_p += stride;
+ dst_p += dst_stride;
+ }
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
rst->rsi->wiener_info[tile_idx].hfilter, 16,
rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
- h - 2);
+ h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
#else
aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
rst->rsi->wiener_info[tile_idx].hfilter, 16,
rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
- h - 2);
+ h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
#endif // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
- data_p += stride * (h - 2);
- dst_p += dst_stride * (h - 2);
+ data_p += stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
+ dst_p += dst_stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
+ for (int b = WIENER_HALFWIN - WIENER_BORDER_VERT - 1; b >= 0; --b) {
+ stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
+ vertical_topbot, WIENER_BORDER_VERT + b, 0);
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
- aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
- rst->rsi->wiener_info[tile_idx].hfilter, 16,
- vertical_topbot, 16, w, 1);
+ aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
+ rst->rsi->wiener_info[tile_idx].hfilter, 16,
+ vertical_topbot, 16, w, 1);
#else
- aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
- rst->rsi->wiener_info[tile_idx].hfilter, 16,
- vertical_topbot, 16, w, 1);
+ aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
+ rst->rsi->wiener_info[tile_idx].hfilter, 16,
+ vertical_topbot, 16, w, 1);
#endif // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
+ data_p += stride;
+ dst_p += dst_stride;
+ }
}
}
@@ -618,30 +645,40 @@
};
static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
- int height, int stride,
+ int height, int dgd_stride,
+ int32_t *dst, int dst_stride,
int bit_depth, int r, int eps,
int32_t *tmpbuf) {
- int32_t *A = tmpbuf;
- int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
- int8_t num[RESTORATION_TILEPELS_MAX];
- int i, j;
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+ const int num_stride = width_ext;
// Adjusting the stride of A and B here appears to avoid bad cache effects,
// leading to a significant speed improvement.
// We also align the stride to a multiple of 16 bytes, for consistency
// with the SIMD version of this function.
- int buf_stride = ((width + 3) & ~3) + 16;
+ int buf_stride = ((width_ext + 3) & ~3) + 16;
+
+ int32_t *A = tmpbuf;
+ int32_t *B = tmpbuf + SGRPROJ_OUTBUF_SIZE;
+ int8_t num_[RESTORATION_TILEPELS_MAX];
+ int8_t *num = num_ + SGRPROJ_BORDER_VERT * num_stride + SGRPROJ_BORDER_HORZ;
+ int i, j;
// Don't filter tiles with dimensions < 5 on any axis
if ((width < 5) || (height < 5)) return;
- boxsum(dgd, width, height, stride, r, 0, B, buf_stride);
- boxsum(dgd, width, height, stride, r, 1, A, buf_stride);
- boxnum(width, height, r, num, width);
+ boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
+ width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
+ boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
+ width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
+ boxnum(width_ext, height_ext, r, num_, num_stride);
assert(r <= 3);
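+  // Point A and B at the interior of the extended window so the (i, j)
+  // indexing below matches dgd and dst.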
+ A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
const int k = i * buf_stride + j;
- const int n = num[i * width + j];
+ const int n = num[i * num_stride + j];
// a < 2^16 * n < 2^22 regardless of bit depth
uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
@@ -677,106 +714,115 @@
j = 0;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
+ const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a =
3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] + A[k + buf_stride + 1];
const int32_t b =
3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] + B[k + buf_stride + 1];
const int32_t v = a * dgd[l] + b;
- dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
i = 0;
j = width - 1;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
+ const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a =
3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] + A[k + buf_stride - 1];
const int32_t b =
3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] + B[k + buf_stride - 1];
const int32_t v = a * dgd[l] + b;
- dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
i = height - 1;
j = 0;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
+ const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a =
3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] + A[k - buf_stride + 1];
const int32_t b =
3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] + B[k - buf_stride + 1];
const int32_t v = a * dgd[l] + b;
- dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
i = height - 1;
j = width - 1;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
+ const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a =
3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] + A[k - buf_stride - 1];
const int32_t b =
3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] + B[k - buf_stride - 1];
const int32_t v = a * dgd[l] + b;
- dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
i = 0;
for (j = 1; j < width - 1; ++j) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
+ const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
A[k + buf_stride - 1] + A[k + buf_stride + 1];
const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] +
B[k + buf_stride - 1] + B[k + buf_stride + 1];
const int32_t v = a * dgd[l] + b;
- dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
i = height - 1;
for (j = 1; j < width - 1; ++j) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
+ const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
A[k - buf_stride - 1] + A[k - buf_stride + 1];
const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] +
B[k - buf_stride - 1] + B[k - buf_stride + 1];
const int32_t v = a * dgd[l] + b;
- dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
j = 0;
for (i = 1; i < height - 1; ++i) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
+ const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
A[k + 1] + A[k - buf_stride + 1] + A[k + buf_stride + 1];
const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
B[k + 1] + B[k - buf_stride + 1] + B[k + buf_stride + 1];
const int32_t v = a * dgd[l] + b;
- dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
j = width - 1;
for (i = 1; i < height - 1; ++i) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
+ const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
A[k - 1] + A[k - buf_stride - 1] + A[k + buf_stride - 1];
const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
B[k - 1] + B[k - buf_stride - 1] + B[k + buf_stride - 1];
const int32_t v = a * dgd[l] + b;
- dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
for (i = 1; i < height - 1; ++i) {
for (j = 1; j < width - 1; ++j) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
+ const int m = i * dst_stride + j;
const int nb = 5;
const int32_t a =
(A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
@@ -791,7 +837,7 @@
B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
3;
const int32_t v = a * dgd[l] + b;
- dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
}
}
@@ -799,14 +845,18 @@
void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height,
int stride, int32_t *dst, int dst_stride,
int r, int eps, int32_t *tmpbuf) {
+ const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
+ int32_t *dgd32 =
+ tmpbuf + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
int i, j;
- for (i = 0; i < height; ++i) {
- for (j = 0; j < width; ++j) {
- dst[i * dst_stride + j] = dgd[i * stride + j];
+ for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
+ for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
+ dgd32[i * dgd32_stride + j] = dgd[i * stride + j];
}
}
- av1_selfguided_restoration_internal(dst, width, height, dst_stride, 8, r, eps,
- tmpbuf);
+ av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst,
+ dst_stride, 8, r, eps,
+ tmpbuf + RESTORATION_TILEPELS_MAX);
}
void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride,
@@ -959,7 +1009,7 @@
int h = AOMMIN(procunit_height, v_end - i);
uint8_t *data_p = data + i * stride + j;
uint8_t *dst_p = dst + i * dst_stride + j;
- apply_selfguided_restoration(
+ apply_selfguided_restoration_c(
data_p, w, h, stride, rst->rsi->sgrproj_info[tile_idx].ep,
rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, dst_stride, rst->tmpbuf);
}
@@ -969,6 +1019,7 @@
int stride, RestorationInternal *rst,
uint8_t *dst, int dst_stride) {
int tile_idx;
+ extend_frame(data, width, height, stride);
for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
loop_sgrproj_filter_tile(data, tile_idx, width, height, stride, rst, dst,
dst_stride);
@@ -1052,8 +1103,6 @@
tile_width, tile_height, width, height, 0, 0,
&h_start, &h_end, &v_start, &v_end);
InterpKernel vertical_topbot;
- stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
- vertical_topbot);
// Convolve the whole tile (done in blocks here to match the requirements
// of the vectorized convolve functions, but the result is equivalent)
for (i = v_start; i < v_end; i += procunit_height)
@@ -1062,46 +1111,56 @@
int h = AOMMIN(procunit_height, (v_end - i + 15) & ~15);
const uint16_t *data_p = data + i * stride + j;
uint16_t *dst_p = dst + i * dst_stride + j;
-// if the filter is 7-tap do only horizontal filtering for top and
-// bottom rows.
+ // Note h is at least 16
+ for (int b = 0; b < WIENER_HALFWIN - WIENER_BORDER_VERT; ++b) {
+ stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
+ vertical_topbot, WIENER_BORDER_VERT + b, 1);
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
- aom_highbd_convolve8_add_src_hip(
- CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
- dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
- vertical_topbot, 16, w, 1, bit_depth);
+ aom_highbd_convolve8_add_src_hip(
+ CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
+ dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
+ vertical_topbot, 16, w, 1, bit_depth);
#else
- aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride,
- CONVERT_TO_BYTEPTR(dst_p), dst_stride,
- rst->rsi->wiener_info[tile_idx].hfilter, 16,
- vertical_topbot, 16, w, 1, bit_depth);
+ aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride,
+ CONVERT_TO_BYTEPTR(dst_p), dst_stride,
+ rst->rsi->wiener_info[tile_idx].hfilter,
+ 16, vertical_topbot, 16, w, 1, bit_depth);
#endif // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
- data_p += stride;
- dst_p += dst_stride;
-// Note h is at least 16
+ data_p += stride;
+ dst_p += dst_stride;
+ }
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
aom_highbd_convolve8_add_src_hip(
CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
- rst->rsi->wiener_info[tile_idx].vfilter, 16, w, h - 2, bit_depth);
+ rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
+ h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2, bit_depth);
#else
aom_highbd_convolve8_add_src(
CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
- rst->rsi->wiener_info[tile_idx].vfilter, 16, w, h - 2, bit_depth);
+ rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
+ h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2, bit_depth);
#endif // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
- data_p += stride * (h - 2);
- dst_p += dst_stride * (h - 2);
+ data_p += stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
+ dst_p += dst_stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
+ for (int b = WIENER_HALFWIN - WIENER_BORDER_VERT - 1; b >= 0; --b) {
+ stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
+ vertical_topbot, WIENER_BORDER_VERT + b, 0);
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
- aom_highbd_convolve8_add_src_hip(
- CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
- dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
- vertical_topbot, 16, w, 1, bit_depth);
+ aom_highbd_convolve8_add_src_hip(
+ CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
+ dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
+ vertical_topbot, 16, w, 1, bit_depth);
#else
- aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride,
- CONVERT_TO_BYTEPTR(dst_p), dst_stride,
- rst->rsi->wiener_info[tile_idx].hfilter, 16,
- vertical_topbot, 16, w, 1, bit_depth);
+ aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride,
+ CONVERT_TO_BYTEPTR(dst_p), dst_stride,
+ rst->rsi->wiener_info[tile_idx].hfilter,
+ 16, vertical_topbot, 16, w, 1, bit_depth);
#endif // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
+ data_p += stride;
+ dst_p += dst_stride;
+ }
}
}
@@ -1123,14 +1182,18 @@
int stride, int32_t *dst,
int dst_stride, int bit_depth, int r,
int eps, int32_t *tmpbuf) {
+ const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
+ int32_t *dgd32 =
+ tmpbuf + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
int i, j;
- for (i = 0; i < height; ++i) {
- for (j = 0; j < width; ++j) {
- dst[i * dst_stride + j] = dgd[i * stride + j];
+ for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
+ for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
+ dgd32[i * dgd32_stride + j] = dgd[i * stride + j];
}
}
- av1_selfguided_restoration_internal(dst, width, height, dst_stride, bit_depth,
- r, eps, tmpbuf);
+ av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst,
+ dst_stride, bit_depth, r, eps,
+ tmpbuf + RESTORATION_TILEPELS_MAX);
}
void av1_highpass_filter_highbd_c(uint16_t *dgd, int width, int height,
@@ -1288,7 +1351,7 @@
int h = AOMMIN(procunit_height, v_end - i);
uint16_t *data_p = data + i * stride + j;
uint16_t *dst_p = dst + i * dst_stride + j;
- apply_selfguided_restoration_highbd(
+ apply_selfguided_restoration_highbd_c(
data_p, w, h, stride, bit_depth, rst->rsi->sgrproj_info[tile_idx].ep,
rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, dst_stride, rst->tmpbuf);
}
@@ -1301,6 +1364,7 @@
int tile_idx;
uint16_t *data = CONVERT_TO_SHORTPTR(data8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ extend_frame_highbd(data, width, height, stride);
for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
loop_sgrproj_filter_tile_highbd(data, tile_idx, width, height, stride, rst,
bit_depth, dst, dst_stride);
diff --git a/av1/common/restoration.h b/av1/common/restoration.h
index 9eee959..30ddc11 100644
--- a/av1/common/restoration.h
+++ b/av1/common/restoration.h
@@ -25,24 +25,42 @@
#define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5))
#define RESTORATION_PROC_UNIT_SIZE 64
+// Determines the line buffer requirement for LR. Should be set to the max
+// of SGRPROJ_BORDER_VERT and WIENER_BORDER_VERT.
+#define RESTORATION_BORDER_VERT 0
+#define RESTORATION_BORDER_HORZ 3 // Do not change this
+
+// Pad up to 20 more (much less may be needed)
+#define RESTORATION_PADDING 20
+#define RESTORATION_PROC_UNIT_PELS \
+ ((RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_HORZ * 2 + \
+ RESTORATION_PADDING) * \
+ (RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_VERT * 2 + \
+ RESTORATION_PADDING))
#define RESTORATION_TILESIZE_MAX 256
-#define RESTORATION_TILEPELS_MAX \
- (RESTORATION_TILESIZE_MAX * RESTORATION_TILESIZE_MAX * 9 / 4)
+#define RESTORATION_TILEPELS_MAX \
+ (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ) * \
+ (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT)
// 4 32-bit buffers needed for the filter:
// 2 for the restored versions of the frame and
// 2 for each restoration operation
-#define SGRPROJ_OUTBUF_SIZE \
- ((RESTORATION_TILESIZE_MAX * 3 / 2) * (RESTORATION_TILESIZE_MAX * 3 / 2 + 16))
+#define SGRPROJ_OUTBUF_SIZE \
+ ((RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT) * \
+ (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16))
#define SGRPROJ_TMPBUF_SIZE \
(RESTORATION_TILEPELS_MAX * 2 * sizeof(int32_t) + \
- SGRPROJ_OUTBUF_SIZE * 2 * sizeof(int32_t))
+ SGRPROJ_OUTBUF_SIZE * 3 * sizeof(int32_t) + 2 * RESTORATION_PROC_UNIT_PELS)
+
#define SGRPROJ_EXTBUF_SIZE (0)
#define SGRPROJ_PARAMS_BITS 4
#define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS)
#define USE_HIGHPASS_IN_SGRPROJ 0
+#define SGRPROJ_BORDER_VERT 0 // Vertical border used for sgr
+#define SGRPROJ_BORDER_HORZ 2 // Horizontal border used for sgr
+
// Precision bits for projection
#define SGRPROJ_PRJ_BITS 7
// Restoration precision bits generated higher than source before projection
@@ -74,6 +92,8 @@
#define SGRPROJ_RECIP_BITS 12
#define WIENER_HALFWIN 3
+#define WIENER_BORDER_HORZ (WIENER_HALFWIN)
+#define WIENER_BORDER_VERT 0
#define WIENER_HALFWIN1 (WIENER_HALFWIN + 1)
#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
#define WIENER_WIN2 ((WIENER_WIN) * (WIENER_WIN))
diff --git a/av1/common/x86/selfguided_sse4.c b/av1/common/x86/selfguided_sse4.c
index 123353e..d0716e6 100644
--- a/av1/common/x86/selfguided_sse4.c
+++ b/av1/common/x86/selfguided_sse4.c
@@ -663,8 +663,11 @@
}
void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
- int stride, int32_t *dst, int dst_stride,
- int r, int eps, int32_t *tmpbuf) {
+ int dgd_stride, int32_t *dst,
+ int dst_stride, int r, int eps,
+ int32_t *tmpbuf) {
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
int32_t *A = tmpbuf;
int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
int i, j;
@@ -676,25 +679,31 @@
// Don't filter tiles with dimensions < 5 on any axis
if ((width < 5) || (height < 5)) return;
+ uint8_t *dgd0 = dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ;
if (r == 1) {
- selfguided_restoration_1_v(dgd, width, height, stride, A, B, buf_stride);
- selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, 8);
+ selfguided_restoration_1_v(dgd0, width_ext, height_ext, dgd_stride, A, B,
+ buf_stride);
+ selfguided_restoration_1_h(A, B, width_ext, height_ext, buf_stride, eps, 8);
} else if (r == 2) {
- selfguided_restoration_2_v(dgd, width, height, stride, A, B, buf_stride);
- selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, 8);
+ selfguided_restoration_2_v(dgd0, width_ext, height_ext, dgd_stride, A, B,
+ buf_stride);
+ selfguided_restoration_2_h(A, B, width_ext, height_ext, buf_stride, eps, 8);
} else if (r == 3) {
- selfguided_restoration_3_v(dgd, width, height, stride, A, B, buf_stride);
- selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, 8);
+ selfguided_restoration_3_v(dgd0, width_ext, height_ext, dgd_stride, A, B,
+ buf_stride);
+ selfguided_restoration_3_h(A, B, width_ext, height_ext, buf_stride, eps, 8);
} else {
assert(0);
}
+ A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
{
i = 0;
j = 0;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] +
@@ -706,7 +715,7 @@
}
for (j = 1; j < width - 1; ++j) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
@@ -719,7 +728,7 @@
j = width - 1;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] +
@@ -734,7 +743,7 @@
j = 0;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
@@ -750,7 +759,7 @@
// Vectorize the innermost loop
for (j = 1; j < width - 1; j += 4) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 5;
@@ -803,7 +812,7 @@
// (typically have 2 such pixels, but may have anywhere between 0 and 3)
for (; j < width - 1; ++j) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 5;
const int32_t a =
@@ -825,7 +834,7 @@
j = width - 1;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
@@ -844,7 +853,7 @@
j = 0;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] +
@@ -856,7 +865,7 @@
}
for (j = 1; j < width - 1; ++j) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
@@ -869,7 +878,7 @@
j = width - 1;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] +
@@ -1363,10 +1372,12 @@
}
void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
- int height, int stride,
+ int height, int dgd_stride,
int32_t *dst, int dst_stride,
int bit_depth, int r, int eps,
int32_t *tmpbuf) {
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
int32_t *A = tmpbuf;
int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
int i, j;
@@ -1378,28 +1389,34 @@
// Don't filter tiles with dimensions < 5 on any axis
if ((width < 5) || (height < 5)) return;
+ uint16_t *dgd0 = dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ;
if (r == 1) {
- highbd_selfguided_restoration_1_v(dgd, width, height, stride, A, B,
- buf_stride);
- selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, bit_depth);
+ highbd_selfguided_restoration_1_v(dgd0, width_ext, height_ext, dgd_stride,
+ A, B, buf_stride);
+ selfguided_restoration_1_h(A, B, width_ext, height_ext, buf_stride, eps,
+ bit_depth);
} else if (r == 2) {
- highbd_selfguided_restoration_2_v(dgd, width, height, stride, A, B,
- buf_stride);
- selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, bit_depth);
+ highbd_selfguided_restoration_2_v(dgd0, width_ext, height_ext, dgd_stride,
+ A, B, buf_stride);
+ selfguided_restoration_2_h(A, B, width_ext, height_ext, buf_stride, eps,
+ bit_depth);
} else if (r == 3) {
- highbd_selfguided_restoration_3_v(dgd, width, height, stride, A, B,
- buf_stride);
- selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, bit_depth);
+ highbd_selfguided_restoration_3_v(dgd0, width_ext, height_ext, dgd_stride,
+ A, B, buf_stride);
+ selfguided_restoration_3_h(A, B, width_ext, height_ext, buf_stride, eps,
+ bit_depth);
} else {
assert(0);
}
+ A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
{
i = 0;
j = 0;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] +
@@ -1411,7 +1428,7 @@
}
for (j = 1; j < width - 1; ++j) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
@@ -1424,7 +1441,7 @@
j = width - 1;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] +
@@ -1439,7 +1456,7 @@
j = 0;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
@@ -1455,7 +1472,7 @@
// Vectorize the innermost loop
for (j = 1; j < width - 1; j += 4) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 5;
@@ -1508,7 +1525,7 @@
// (typically have 2 such pixels, but may have anywhere between 0 and 3)
for (; j < width - 1; ++j) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 5;
const int32_t a =
@@ -1530,7 +1547,7 @@
j = width - 1;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
@@ -1549,7 +1566,7 @@
j = 0;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] +
@@ -1561,7 +1578,7 @@
}
for (j = 1; j < width - 1; ++j) {
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
@@ -1574,7 +1591,7 @@
j = width - 1;
{
const int k = i * buf_stride + j;
- const int l = i * stride + j;
+ const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] +
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 76cc882..dd18053 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -5358,6 +5358,7 @@
*p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
}
+ aom_extend_frame_borders(new_fb);
#if CONFIG_CDEF
if (!cm->skip_loop_filter && !cm->all_lossless) {
av1_cdef_frame(&pbi->cur_buf->buf, cm, &pbi->mb);
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index eff1fa3..7487e11 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -4183,6 +4183,8 @@
av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
#endif
}
+ aom_extend_frame_borders(cm->frame_to_show);
+
#if CONFIG_CDEF
if (is_lossless_requested(&cpi->oxcf)) {
cm->cdef_bits = 0;
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index 210120a..98fbfb5 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -637,6 +637,15 @@
// Compute best Sgrproj filters for each rtile, one (encoder/decoder)
// tile at a time.
const AV1_COMMON *const cm = &cpi->common;
+#if CONFIG_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ extend_frame_highbd(CONVERT_TO_SHORTPTR(ctxt.dgd_buffer), ctxt.plane_width,
+ ctxt.plane_height, ctxt.dgd_stride);
+ else
+#endif
+ extend_frame(ctxt.dgd_buffer, ctxt.plane_width, ctxt.plane_height,
+ ctxt.dgd_stride);
+
for (int tile_row = 0; tile_row < cm->tile_rows; ++tile_row) {
for (int tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
SgrprojInfo ref_sgrproj_info;
diff --git a/test/selfguided_filter_test.cc b/test/selfguided_filter_test.cc
index eaf5520..63b5bac 100644
--- a/test/selfguided_filter_test.cc
+++ b/test/selfguided_filter_test.cc
@@ -40,18 +40,23 @@
protected:
void RunSpeedTest() {
- const int w = 256, h = 256;
+ const int width = 256, height = 256, stride = 288, out_stride = 288;
const int NUM_ITERS = 2000;
int i, j;
- uint8_t *input = (uint8_t *)aom_memalign(16, w * h * sizeof(uint8_t));
- uint8_t *output = (uint8_t *)aom_memalign(16, w * h * sizeof(uint8_t));
+ uint8_t *input_ =
+ (uint8_t *)aom_memalign(16, stride * (height + 32) * sizeof(uint8_t));
+ uint8_t *output_ = (uint8_t *)aom_memalign(
+ 16, out_stride * (height + 32) * sizeof(uint8_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
+ uint8_t *input = input_ + stride * 16 + 16;
+ uint8_t *output = output_ + out_stride * 16 + 16;
ACMRandom rnd(ACMRandom::DeterministicSeed());
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) input[i * w + j] = rnd.Rand16() & 0xFF;
+ for (i = -16; i < height + 16; ++i)
+ for (j = -16; j < width + 16; ++j)
+ input[i * stride + j] = rnd.Rand16() & 0xFF;
int xqd[2] = {
SGRPROJ_PRJ_MIN0 +
@@ -67,16 +72,17 @@
std::clock_t start = std::clock();
for (i = 0; i < NUM_ITERS; ++i) {
- apply_selfguided_restoration(input, w, h, w, eps, xqd, output, w, tmpbuf);
+ apply_selfguided_restoration(input, width, height, stride, eps, xqd,
+ output, out_stride, tmpbuf);
}
std::clock_t end = std::clock();
double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
- printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, w, h,
- elapsed, elapsed * 1000000. / NUM_ITERS);
+ printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, width,
+ height, elapsed, elapsed * 1000000. / NUM_ITERS);
- aom_free(input);
- aom_free(output);
+ aom_free(input_);
+ aom_free(output_);
aom_free(tmpbuf);
}
@@ -88,21 +94,26 @@
const int NUM_ITERS = 81;
int i, j, k;
- uint8_t *input =
- (uint8_t *)aom_memalign(16, stride * max_h * sizeof(uint8_t));
- uint8_t *output =
- (uint8_t *)aom_memalign(16, out_stride * max_h * sizeof(uint8_t));
- uint8_t *output2 =
- (uint8_t *)aom_memalign(16, out_stride * max_h * sizeof(uint8_t));
+ uint8_t *input_ =
+ (uint8_t *)aom_memalign(16, stride * (max_h + 32) * sizeof(uint8_t));
+ uint8_t *output_ = (uint8_t *)aom_memalign(
+ 16, out_stride * (max_h + 32) * sizeof(uint8_t));
+ uint8_t *output2_ = (uint8_t *)aom_memalign(
+ 16, out_stride * (max_h + 32) * sizeof(uint8_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
+ uint8_t *input = input_ + stride * 16 + 16;
+ uint8_t *output = output_ + out_stride * 16 + 16;
+ uint8_t *output2 = output2_ + out_stride * 16 + 16;
+
ACMRandom rnd(ACMRandom::DeterministicSeed());
av1_loop_restoration_precal();
for (i = 0; i < NUM_ITERS; ++i) {
- for (j = 0; j < max_h; ++j)
- for (k = 0; k < max_w; ++k) input[j * stride + k] = rnd.Rand16() & 0xFF;
+ for (j = -16; j < max_h + 16; ++j)
+ for (k = -16; k < max_w + 16; ++k)
+ input[j * stride + k] = rnd.Rand16() & 0xFF;
int xqd[2] = {
SGRPROJ_PRJ_MIN0 +
@@ -121,13 +132,14 @@
apply_selfguided_restoration_c(input, test_w, test_h, stride, eps, xqd,
output2, out_stride, tmpbuf);
for (j = 0; j < test_h; ++j)
- for (k = 0; k < test_w; ++k)
+ for (k = 0; k < test_w; ++k) {
ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
+ }
}
- aom_free(input);
- aom_free(output);
- aom_free(output2);
+ aom_free(input_);
+ aom_free(output_);
+ aom_free(output2_);
aom_free(tmpbuf);
}
};
@@ -155,20 +167,25 @@
protected:
void RunSpeedTest() {
- const int w = 256, h = 256;
+ const int width = 256, height = 256, stride = 288, out_stride = 288;
const int NUM_ITERS = 2000;
int i, j;
int bit_depth = GET_PARAM(0);
int mask = (1 << bit_depth) - 1;
- uint16_t *input = (uint16_t *)aom_memalign(16, w * h * sizeof(uint16_t));
- uint16_t *output = (uint16_t *)aom_memalign(16, w * h * sizeof(uint16_t));
+ uint16_t *input_ =
+ (uint16_t *)aom_memalign(16, stride * (height + 32) * sizeof(uint16_t));
+ uint16_t *output_ = (uint16_t *)aom_memalign(
+ 16, out_stride * (height + 32) * sizeof(uint16_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
+ uint16_t *input = input_ + stride * 16 + 16;
+ uint16_t *output = output_ + out_stride * 16 + 16;
ACMRandom rnd(ACMRandom::DeterministicSeed());
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) input[i * w + j] = rnd.Rand16() & mask;
+ for (i = -16; i < height + 16; ++i)
+ for (j = -16; j < width + 16; ++j)
+ input[i * stride + j] = rnd.Rand16() & mask;
int xqd[2] = {
SGRPROJ_PRJ_MIN0 +
@@ -184,17 +201,18 @@
std::clock_t start = std::clock();
for (i = 0; i < NUM_ITERS; ++i) {
- apply_selfguided_restoration_highbd(input, w, h, w, bit_depth, eps, xqd,
- output, w, tmpbuf);
+ apply_selfguided_restoration_highbd(input, width, height, stride,
+ bit_depth, eps, xqd, output,
+ out_stride, tmpbuf);
}
std::clock_t end = std::clock();
double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
- printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, w, h,
- elapsed, elapsed * 1000000. / NUM_ITERS);
+ printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, width,
+ height, elapsed, elapsed * 1000000. / NUM_ITERS);
- aom_free(input);
- aom_free(output);
+ aom_free(input_);
+ aom_free(output_);
aom_free(tmpbuf);
}
@@ -208,21 +226,26 @@
int bit_depth = GET_PARAM(0);
int mask = (1 << bit_depth) - 1;
- uint16_t *input =
- (uint16_t *)aom_memalign(16, stride * max_h * sizeof(uint16_t));
- uint16_t *output =
- (uint16_t *)aom_memalign(16, out_stride * max_h * sizeof(uint16_t));
- uint16_t *output2 =
- (uint16_t *)aom_memalign(16, out_stride * max_h * sizeof(uint16_t));
+ uint16_t *input_ =
+ (uint16_t *)aom_memalign(16, stride * (max_h + 32) * sizeof(uint16_t));
+ uint16_t *output_ = (uint16_t *)aom_memalign(
+ 16, out_stride * (max_h + 32) * sizeof(uint16_t));
+ uint16_t *output2_ = (uint16_t *)aom_memalign(
+ 16, out_stride * (max_h + 32) * sizeof(uint16_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
+ uint16_t *input = input_ + stride * 16 + 16;
+ uint16_t *output = output_ + out_stride * 16 + 16;
+ uint16_t *output2 = output2_ + out_stride * 16 + 16;
+
ACMRandom rnd(ACMRandom::DeterministicSeed());
av1_loop_restoration_precal();
for (i = 0; i < NUM_ITERS; ++i) {
- for (j = 0; j < max_h; ++j)
- for (k = 0; k < max_w; ++k) input[j * stride + k] = rnd.Rand16() & mask;
+ for (j = -16; j < max_h + 16; ++j)
+ for (k = -16; k < max_w + 16; ++k)
+ input[j * stride + k] = rnd.Rand16() & mask;
int xqd[2] = {
SGRPROJ_PRJ_MIN0 +
@@ -247,9 +270,9 @@
ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
}
- aom_free(input);
- aom_free(output);
- aom_free(output2);
+ aom_free(input_);
+ aom_free(output_);
+ aom_free(output2_);
aom_free(tmpbuf);
}
};