Refactoring/simplification of buffers used for sgr
Includes miscellaneous cleanups, test fixes, and code reorganization
for loop-restoration components.
Change-Id: I5b2e6419234d945e6f4344b22636119b50df4054
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index d70e0f5..eb90586 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -628,7 +628,7 @@
add_proto qw/void apply_selfguided_restoration/, "uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf";
specialize qw/apply_selfguided_restoration sse4_1/;
- add_proto qw/void av1_selfguided_restoration/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps, int32_t *tmpbuf";
+ add_proto qw/void av1_selfguided_restoration/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps";
specialize qw/av1_selfguided_restoration sse4_1/;
add_proto qw/void av1_highpass_filter/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps";
@@ -638,7 +638,7 @@
add_proto qw/void apply_selfguided_restoration_highbd/, "uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf";
specialize qw/apply_selfguided_restoration_highbd sse4_1/;
- add_proto qw/void av1_selfguided_restoration_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps, int32_t *tmpbuf";
+ add_proto qw/void av1_selfguided_restoration_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps";
specialize qw/av1_selfguided_restoration_highbd sse4_1/;
add_proto qw/void av1_highpass_filter_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps";
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index 989e0c9..b2d6340 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -22,14 +22,6 @@
#include "aom_ports/mem.h"
-#define USE_SIMPLER_SGR 1
-
-#define MAX_RADIUS 3 // Only 1, 2, 3 allowed
-#define MAX_EPS 80 // Max value of eps
-#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1))
-#define SGRPROJ_MTABLE_BITS 20
-#define SGRPROJ_RECIP_BITS 12
-
const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
#if USE_HIGHPASS_IN_SGRPROJ
// corner, edge, r2, eps2
@@ -39,7 +31,7 @@
{ -3, 4, 1, 5 }, { -3, 4, 1, 6 }, { -3, 4, 1, 7 }, { -3, 4, 1, 8 }
#else
// r1, eps1, r2, eps2
-#if USE_SIMPLER_SGR
+#if MAX_RADIUS == 2
{ 2, 12, 1, 4 }, { 2, 15, 1, 6 }, { 2, 18, 1, 8 }, { 2, 20, 1, 9 },
{ 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 },
{ 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 2, 30, 1, 2 },
@@ -49,7 +41,7 @@
{ 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 },
{ 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 3, 30, 1, 10 },
{ 3, 50, 1, 12 }, { 3, 50, 2, 25 }, { 3, 60, 2, 35 }, { 3, 70, 2, 45 },
-#endif // USE_SIMPLER_SGR
+#endif // MAX_RADIUS == 2
#endif
};
@@ -112,21 +104,22 @@
rst->keyframe = kf;
}
-void extend_frame(uint8_t *data, int width, int height, int stride) {
+void extend_frame(uint8_t *data, int width, int height, int stride,
+ int border_horz, int border_vert) {
uint8_t *data_p;
int i;
for (i = 0; i < height; ++i) {
data_p = data + i * stride;
- memset(data_p - WIENER_HALFWIN, data_p[0], WIENER_HALFWIN);
- memset(data_p + width, data_p[width - 1], WIENER_HALFWIN);
+ memset(data_p - border_horz, data_p[0], border_horz);
+ memset(data_p + width, data_p[width - 1], border_horz);
}
- data_p = data - WIENER_HALFWIN;
- for (i = -WIENER_HALFWIN; i < 0; ++i) {
- memcpy(data_p + i * stride, data_p, width + 2 * WIENER_HALFWIN);
+ data_p = data - border_horz;
+ for (i = -border_vert; i < 0; ++i) {
+ memcpy(data_p + i * stride, data_p, width + 2 * border_horz);
}
- for (i = height; i < height + WIENER_HALFWIN; ++i) {
+ for (i = height; i < height + border_vert; ++i) {
memcpy(data_p + i * stride, data_p + (height - 1) * stride,
- width + 2 * WIENER_HALFWIN);
+ width + 2 * border_horz);
}
}
@@ -256,7 +249,8 @@
RestorationInternal *rst, uint8_t *dst,
int dst_stride) {
int tile_idx;
- extend_frame(data, width, height, stride);
+ extend_frame(data, width, height, stride, WIENER_BORDER_HORZ,
+ WIENER_BORDER_VERT);
for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
loop_wiener_filter_tile(data, tile_idx, width, height, stride, rst, dst,
dst_stride);
@@ -639,16 +633,17 @@
const int32_t one_by_x[MAX_NELEM] = {
4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
- 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, 158,
- 152, 146, 141, 137, 132, 128, 124, 120, 117, 114, 111, 108, 105,
- 102, 100, 98, 95, 93, 91, 89, 87, 85, 84
+ 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164,
+#if MAX_RADIUS > 2
+ 158, 152, 146, 141, 137, 132, 128, 124, 120, 117, 114, 111, 108,
+ 105, 102, 100, 98, 95, 93, 91, 89, 87, 85, 84
+#endif // MAX_RADIUS > 2
};
static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
int height, int dgd_stride,
int32_t *dst, int dst_stride,
- int bit_depth, int r, int eps,
- int32_t *tmpbuf) {
+ int bit_depth, int r, int eps) {
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
const int num_stride = width_ext;
@@ -657,10 +652,11 @@
// We also align the stride to a multiple of 16 bytes, for consistency
// with the SIMD version of this function.
int buf_stride = ((width_ext + 3) & ~3) + 16;
-
- int32_t *A = tmpbuf;
- int32_t *B = tmpbuf + SGRPROJ_OUTBUF_SIZE;
- int8_t num_[RESTORATION_TILEPELS_MAX];
+ int32_t A_[RESTORATION_PROC_UNIT_PELS];
+ int32_t B_[RESTORATION_PROC_UNIT_PELS];
+ int32_t *A = A_;
+ int32_t *B = B_;
+ int8_t num_[RESTORATION_PROC_UNIT_PELS];
int8_t *num = num_ + SGRPROJ_BORDER_VERT * num_stride + SGRPROJ_BORDER_HORZ;
int i, j;
@@ -844,10 +840,11 @@
void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height,
int stride, int32_t *dst, int dst_stride,
- int r, int eps, int32_t *tmpbuf) {
+ int r, int eps) {
+ int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
int32_t *dgd32 =
- tmpbuf + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
+ dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
int i, j;
for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
@@ -855,8 +852,7 @@
}
}
av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst,
- dst_stride, 8, r, eps,
- tmpbuf + RESTORATION_TILEPELS_MAX);
+ dst_stride, 8, r, eps);
}
void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride,
@@ -955,7 +951,6 @@
int xq[2];
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
- int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
#if USE_HIGHPASS_IN_SGRPROJ
@@ -963,10 +958,10 @@
sgr_params[eps].corner, sgr_params[eps].edge);
#else
av1_selfguided_restoration_c(dat, width, height, stride, flt1, width,
- sgr_params[eps].r1, sgr_params[eps].e1, tmpbuf2);
+ sgr_params[eps].r1, sgr_params[eps].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_c(dat, width, height, stride, flt2, width,
- sgr_params[eps].r2, sgr_params[eps].e2, tmpbuf2);
+ sgr_params[eps].r2, sgr_params[eps].e2);
decode_xq(xqd, xq);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
@@ -1009,7 +1004,7 @@
int h = AOMMIN(procunit_height, v_end - i);
uint8_t *data_p = data + i * stride + j;
uint8_t *dst_p = dst + i * dst_stride + j;
- apply_selfguided_restoration_c(
+ apply_selfguided_restoration(
data_p, w, h, stride, rst->rsi->sgrproj_info[tile_idx].ep,
rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, dst_stride, rst->tmpbuf);
}
@@ -1019,7 +1014,8 @@
int stride, RestorationInternal *rst,
uint8_t *dst, int dst_stride) {
int tile_idx;
- extend_frame(data, width, height, stride);
+ extend_frame(data, width, height, stride, SGRPROJ_BORDER_HORZ,
+ SGRPROJ_BORDER_VERT);
for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
loop_sgrproj_filter_tile(data, tile_idx, width, height, stride, rst, dst,
dst_stride);
@@ -1030,7 +1026,8 @@
int stride, RestorationInternal *rst,
uint8_t *dst, int dst_stride) {
int tile_idx;
- extend_frame(data, width, height, stride);
+ extend_frame(data, width, height, stride, RESTORATION_BORDER_HORZ,
+ RESTORATION_BORDER_VERT);
for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
loop_copy_tile(data, tile_idx, 0, 0, width, height, stride, rst, dst,
@@ -1046,23 +1043,23 @@
}
#if CONFIG_HIGHBITDEPTH
-void extend_frame_highbd(uint16_t *data, int width, int height, int stride) {
+void extend_frame_highbd(uint16_t *data, int width, int height, int stride,
+ int border_horz, int border_vert) {
uint16_t *data_p;
int i, j;
for (i = 0; i < height; ++i) {
data_p = data + i * stride;
- for (j = -WIENER_HALFWIN; j < 0; ++j) data_p[j] = data_p[0];
- for (j = width; j < width + WIENER_HALFWIN; ++j)
- data_p[j] = data_p[width - 1];
+ for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0];
+ for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1];
}
- data_p = data - WIENER_HALFWIN;
- for (i = -WIENER_HALFWIN; i < 0; ++i) {
+ data_p = data - border_horz;
+ for (i = -border_vert; i < 0; ++i) {
memcpy(data_p + i * stride, data_p,
- (width + 2 * WIENER_HALFWIN) * sizeof(uint16_t));
+ (width + 2 * border_horz) * sizeof(uint16_t));
}
- for (i = height; i < height + WIENER_HALFWIN; ++i) {
+ for (i = height; i < height + border_vert; ++i) {
memcpy(data_p + i * stride, data_p + (height - 1) * stride,
- (width + 2 * WIENER_HALFWIN) * sizeof(uint16_t));
+ (width + 2 * border_horz) * sizeof(uint16_t));
}
}
@@ -1171,7 +1168,8 @@
uint16_t *data = CONVERT_TO_SHORTPTR(data8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
int tile_idx;
- extend_frame_highbd(data, width, height, stride);
+ extend_frame_highbd(data, width, height, stride, WIENER_BORDER_HORZ,
+ WIENER_BORDER_VERT);
for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
loop_wiener_filter_tile_highbd(data, tile_idx, width, height, stride, rst,
bit_depth, dst, dst_stride);
@@ -1181,10 +1179,11 @@
void av1_selfguided_restoration_highbd_c(uint16_t *dgd, int width, int height,
int stride, int32_t *dst,
int dst_stride, int bit_depth, int r,
- int eps, int32_t *tmpbuf) {
+ int eps) {
+ int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
int32_t *dgd32 =
- tmpbuf + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
+ dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
int i, j;
for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
@@ -1192,8 +1191,7 @@
}
}
av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst,
- dst_stride, bit_depth, r, eps,
- tmpbuf + RESTORATION_TILEPELS_MAX);
+ dst_stride, bit_depth, r, eps);
}
void av1_highpass_filter_highbd_c(uint16_t *dgd, int width, int height,
@@ -1294,7 +1292,6 @@
int xq[2];
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
- int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
#if USE_HIGHPASS_IN_SGRPROJ
@@ -1303,11 +1300,11 @@
#else
av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt1, width,
bit_depth, sgr_params[eps].r1,
- sgr_params[eps].e1, tmpbuf2);
+ sgr_params[eps].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt2, width,
bit_depth, sgr_params[eps].r2,
- sgr_params[eps].e2, tmpbuf2);
+ sgr_params[eps].e2);
decode_xq(xqd, xq);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
@@ -1351,7 +1348,7 @@
int h = AOMMIN(procunit_height, v_end - i);
uint16_t *data_p = data + i * stride + j;
uint16_t *dst_p = dst + i * dst_stride + j;
- apply_selfguided_restoration_highbd_c(
+ apply_selfguided_restoration_highbd(
data_p, w, h, stride, bit_depth, rst->rsi->sgrproj_info[tile_idx].ep,
rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, dst_stride, rst->tmpbuf);
}
@@ -1364,7 +1361,8 @@
int tile_idx;
uint16_t *data = CONVERT_TO_SHORTPTR(data8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- extend_frame_highbd(data, width, height, stride);
+ extend_frame_highbd(data, width, height, stride, SGRPROJ_BORDER_HORZ,
+ SGRPROJ_BORDER_VERT);
for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
loop_sgrproj_filter_tile_highbd(data, tile_idx, width, height, stride, rst,
bit_depth, dst, dst_stride);
@@ -1378,7 +1376,8 @@
uint16_t *data = CONVERT_TO_SHORTPTR(data8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
int tile_idx;
- extend_frame_highbd(data, width, height, stride);
+ extend_frame_highbd(data, width, height, stride, RESTORATION_BORDER_HORZ,
+ RESTORATION_BORDER_VERT);
for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
loop_copy_tile_highbd(data, tile_idx, 0, 0, width, height, stride, rst,
diff --git a/av1/common/restoration.h b/av1/common/restoration.h
index 30ddc11..efeac48 100644
--- a/av1/common/restoration.h
+++ b/av1/common/restoration.h
@@ -25,10 +25,28 @@
#define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5))
#define RESTORATION_PROC_UNIT_SIZE 64
-// Determines line buffer requirement for LR. Should be set at the max
-// of SGRPROJ_BORDER_VERT and WIENER_BORDER_VERT
-#define RESTORATION_BORDER_VERT 0
-#define RESTORATION_BORDER_HORZ 3 // Do not change this
+
+#define SGRPROJ_BORDER_VERT 0 // Vertical border used for Sgr
+#define SGRPROJ_BORDER_HORZ 2 // Horizontal border used for Sgr
+
+#define WIENER_BORDER_VERT 0 // Vertical border used for Wiener
+#define WIENER_HALFWIN 3
+#define WIENER_BORDER_HORZ (WIENER_HALFWIN) // Horizontal border for Wiener
+
+// RESTORATION_BORDER_VERT determines line buffer requirement for LR.
+// Should be set at the max of SGRPROJ_BORDER_VERT and WIENER_BORDER_VERT.
+// Note the line buffer needed is twice the value of this macro.
+#if SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT
+#define RESTORATION_BORDER_VERT (SGRPROJ_BORDER_VERT)
+#else
+#define RESTORATION_BORDER_VERT (WIENER_BORDER_VERT)
+#endif // SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT
+
+#if SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ
+#define RESTORATION_BORDER_HORZ (SGRPROJ_BORDER_HORZ)
+#else
+#define RESTORATION_BORDER_HORZ (WIENER_BORDER_HORZ)
+#endif // SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ
// Pad up to 20 more (may be much less is needed)
#define RESTORATION_PADDING 20
@@ -39,28 +57,20 @@
RESTORATION_PADDING))
#define RESTORATION_TILESIZE_MAX 256
-#define RESTORATION_TILEPELS_MAX \
- (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ) * \
- (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT)
+#define RESTORATION_TILEPELS_MAX \
+ ((RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16) * \
+ (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT))
-// 4 32-bit buffers needed for the filter:
-// 2 for the restored versions of the frame and
-// 2 for each restoration operation
-#define SGRPROJ_OUTBUF_SIZE \
- ((RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT) * \
- (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16))
-#define SGRPROJ_TMPBUF_SIZE \
- (RESTORATION_TILEPELS_MAX * 2 * sizeof(int32_t) + \
- SGRPROJ_OUTBUF_SIZE * 3 * sizeof(int32_t) + 2 * RESTORATION_PROC_UNIT_PELS)
+// Two 32-bit buffers needed for the restored versions from two filters
+// TODO(debargha, rupert): Refactor to not need the large tilesize to be stored
+// on the decoder side.
+#define SGRPROJ_TMPBUF_SIZE (RESTORATION_TILEPELS_MAX * 2 * sizeof(int32_t))
#define SGRPROJ_EXTBUF_SIZE (0)
#define SGRPROJ_PARAMS_BITS 4
#define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS)
#define USE_HIGHPASS_IN_SGRPROJ 0
-#define SGRPROJ_BORDER_VERT 0 // Vertical border used for sgr
-#define SGRPROJ_BORDER_HORZ 2 // Horizontal border used for sgr
-
// Precision bits for projection
#define SGRPROJ_PRJ_BITS 7
// Restoration precision bits generated higher than source before projection
@@ -85,15 +95,12 @@
#define SGRPROJ_BITS (SGRPROJ_PRJ_BITS * 2 + SGRPROJ_PARAMS_BITS)
-#define MAX_RADIUS 3 // Only 1, 2, 3 allowed
+#define MAX_RADIUS 2 // Only 1, 2, 3 allowed
#define MAX_EPS 80 // Max value of eps
#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1))
#define SGRPROJ_MTABLE_BITS 20
#define SGRPROJ_RECIP_BITS 12
-#define WIENER_HALFWIN 3
-#define WIENER_BORDER_HORZ (WIENER_HALFWIN)
-#define WIENER_BORDER_VERT 0
#define WIENER_HALFWIN1 (WIENER_HALFWIN + 1)
#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
#define WIENER_WIN2 ((WIENER_WIN) * (WIENER_WIN))
@@ -268,9 +275,11 @@
int height);
void av1_free_restoration_struct(RestorationInfo *rst_info);
-void extend_frame(uint8_t *data, int width, int height, int stride);
+void extend_frame(uint8_t *data, int width, int height, int stride,
+ int border_horz, int border_vert);
#if CONFIG_HIGHBITDEPTH
-void extend_frame_highbd(uint16_t *data, int width, int height, int stride);
+void extend_frame_highbd(uint16_t *data, int width, int height, int stride,
+ int border_horz, int border_vert);
#endif // CONFIG_HIGHBITDEPTH
void decode_xq(int *xqd, int *xq);
void av1_loop_restoration_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
diff --git a/av1/common/x86/selfguided_sse4.c b/av1/common/x86/selfguided_sse4.c
index d0716e6..4006b85 100644
--- a/av1/common/x86/selfguided_sse4.c
+++ b/av1/common/x86/selfguided_sse4.c
@@ -664,17 +664,18 @@
void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
int dgd_stride, int32_t *dst,
- int dst_stride, int r, int eps,
- int32_t *tmpbuf) {
+ int dst_stride, int r, int eps) {
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
- int32_t *A = tmpbuf;
- int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
+ int32_t A_[RESTORATION_PROC_UNIT_PELS];
+ int32_t B_[RESTORATION_PROC_UNIT_PELS];
+ int32_t *A = A_;
+ int32_t *B = B_;
int i, j;
// Adjusting the stride of A and B here appears to avoid bad cache effects,
// leading to a significant speed improvement.
// We also align the stride to a multiple of 16 bytes for efficiency.
- int buf_stride = ((width + 3) & ~3) + 16;
+ int buf_stride = ((width_ext + 3) & ~3) + 16;
// Don't filter tiles with dimensions < 5 on any axis
if ((width < 5) || (height < 5)) return;
@@ -1059,7 +1060,6 @@
int xq[2];
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
- int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
#if USE_HIGHPASS_IN_SGRPROJ
@@ -1067,12 +1067,10 @@
sgr_params[eps].corner, sgr_params[eps].edge);
#else
av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt1, width,
- sgr_params[eps].r1, sgr_params[eps].e1,
- tmpbuf2);
+ sgr_params[eps].r1, sgr_params[eps].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt2, width,
- sgr_params[eps].r2, sgr_params[eps].e2,
- tmpbuf2);
+ sgr_params[eps].r2, sgr_params[eps].e2);
decode_xq(xqd, xq);
__m128i xq0 = _mm_set1_epi32(xq[0]);
@@ -1374,17 +1372,18 @@
void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
int height, int dgd_stride,
int32_t *dst, int dst_stride,
- int bit_depth, int r, int eps,
- int32_t *tmpbuf) {
+ int bit_depth, int r, int eps) {
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
- int32_t *A = tmpbuf;
- int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
+ int32_t A_[RESTORATION_PROC_UNIT_PELS];
+ int32_t B_[RESTORATION_PROC_UNIT_PELS];
+ int32_t *A = A_;
+ int32_t *B = B_;
int i, j;
// Adjusting the stride of A and B here appears to avoid bad cache effects,
// leading to a significant speed improvement.
// We also align the stride to a multiple of 16 bytes for efficiency.
- int buf_stride = ((width + 3) & ~3) + 16;
+ int buf_stride = ((width_ext + 3) & ~3) + 16;
// Don't filter tiles with dimensions < 5 on any axis
if ((width < 5) || (height < 5)) return;
@@ -1741,7 +1740,6 @@
int xq[2];
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
- int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
#if USE_HIGHPASS_IN_SGRPROJ
@@ -1751,11 +1749,11 @@
#else
av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt1,
width, bit_depth, sgr_params[eps].r1,
- sgr_params[eps].e1, tmpbuf2);
+ sgr_params[eps].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt2,
width, bit_depth, sgr_params[eps].r2,
- sgr_params[eps].e2, tmpbuf2);
+ sgr_params[eps].e2);
decode_xq(xqd, xq);
__m128i xq0 = _mm_set1_epi32(xq[0]);
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index 98fbfb5..5650a48 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -29,9 +29,9 @@
#include "av1/encoder/av1_quantize.h"
#include "av1/encoder/encoder.h"
+#include "av1/encoder/mathutils.h"
#include "av1/encoder/picklpf.h"
#include "av1/encoder/pickrst.h"
-#include "av1/encoder/mathutils.h"
// When set to RESTORE_WIENER or RESTORE_SGRPROJ only those are allowed.
// When set to RESTORE_TYPES we allow switchable.
@@ -354,7 +354,6 @@
int32_t *rstbuf) {
int32_t *flt1 = rstbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
- int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int ep, bestep = 0;
int64_t err, besterr = -1;
int exqd[2], bestxqd[2] = { 0, 0 };
@@ -387,11 +386,11 @@
#else
av1_selfguided_restoration_highbd(
dat_p, w, h, dat_stride, flt1_p, flt1_stride, bit_depth,
- sgr_params[ep].r1, sgr_params[ep].e1, tmpbuf2);
+ sgr_params[ep].r1, sgr_params[ep].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_highbd(
dat_p, w, h, dat_stride, flt2_p, flt2_stride, bit_depth,
- sgr_params[ep].r2, sgr_params[ep].e2, tmpbuf2);
+ sgr_params[ep].r2, sgr_params[ep].e2);
}
} else {
#endif
@@ -407,12 +406,11 @@
sgr_params[ep].corner, sgr_params[ep].edge);
#else
av1_selfguided_restoration(dat_p, w, h, dat_stride, flt1_p, flt1_stride,
- sgr_params[ep].r1, sgr_params[ep].e1,
- tmpbuf2);
+ sgr_params[ep].r1, sgr_params[ep].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration(dat_p, w, h, dat_stride, flt2_p,
flt2_stride, sgr_params[ep].r2,
- sgr_params[ep].e2, tmpbuf2);
+ sgr_params[ep].e2);
}
#if CONFIG_HIGHBITDEPTH
}
@@ -640,11 +638,12 @@
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
extend_frame_highbd(CONVERT_TO_SHORTPTR(ctxt.dgd_buffer), ctxt.plane_width,
- ctxt.plane_height, ctxt.dgd_stride);
+ ctxt.plane_height, ctxt.dgd_stride, SGRPROJ_BORDER_HORZ,
+ SGRPROJ_BORDER_VERT);
else
#endif
extend_frame(ctxt.dgd_buffer, ctxt.plane_width, ctxt.plane_height,
- ctxt.dgd_stride);
+ ctxt.dgd_stride, SGRPROJ_BORDER_HORZ, SGRPROJ_BORDER_VERT);
for (int tile_row = 0; tile_row < cm->tile_rows; ++tile_row) {
for (int tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
@@ -1242,14 +1241,17 @@
AV1_COMMON *const cm = &cpi->common;
// Construct a (WIENER_HALFWIN)-pixel border around the frame
+// Note: this border is used to gather stats even though the actual filter
+// may use less border on the top/bottom of a processing unit.
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
extend_frame_highbd(CONVERT_TO_SHORTPTR(ctxt.dgd_buffer), ctxt.plane_width,
- ctxt.plane_height, ctxt.dgd_stride);
+ ctxt.plane_height, ctxt.dgd_stride, WIENER_HALFWIN,
+ WIENER_HALFWIN);
else
#endif
extend_frame(ctxt.dgd_buffer, ctxt.plane_width, ctxt.plane_height,
- ctxt.dgd_stride);
+ ctxt.dgd_stride, WIENER_HALFWIN, WIENER_HALFWIN);
// Compute best Wiener filters for each rtile, one (encoder/decoder)
// tile at a time.
diff --git a/test/selfguided_filter_test.cc b/test/selfguided_filter_test.cc
index 63b5bac..55ce1d5 100644
--- a/test/selfguided_filter_test.cc
+++ b/test/selfguided_filter_test.cc
@@ -40,9 +40,11 @@
protected:
void RunSpeedTest() {
+ const int pu_width = RESTORATION_PROC_UNIT_SIZE;
+ const int pu_height = RESTORATION_PROC_UNIT_SIZE;
const int width = 256, height = 256, stride = 288, out_stride = 288;
const int NUM_ITERS = 2000;
- int i, j;
+ int i, j, k;
uint8_t *input_ =
(uint8_t *)aom_memalign(16, stride * (height + 32) * sizeof(uint8_t));
@@ -72,8 +74,15 @@
std::clock_t start = std::clock();
for (i = 0; i < NUM_ITERS; ++i) {
- apply_selfguided_restoration(input, width, height, stride, eps, xqd,
- output, out_stride, tmpbuf);
+ for (k = 0; k < height; k += pu_height)
+ for (j = 0; j < width; j += pu_width) {
+ int w = AOMMIN(pu_width, width - j);
+ int h = AOMMIN(pu_height, height - k);
+ uint8_t *input_p = input + k * stride + j;
+ uint8_t *output_p = output + k * out_stride + j;
+ apply_selfguided_restoration(input_p, w, h, stride, eps, xqd,
+ output_p, out_stride, tmpbuf);
+ }
}
std::clock_t end = std::clock();
double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
@@ -87,6 +96,8 @@
}
void RunCorrectnessTest() {
+ const int pu_width = RESTORATION_PROC_UNIT_SIZE;
+ const int pu_height = RESTORATION_PROC_UNIT_SIZE;
// Set the maximum width/height to test here. We actually test a small
// range of sizes *up to* this size, so that we can check, eg.,
// the behaviour on tiles which are not a multiple of 4 wide.
@@ -127,10 +138,24 @@
int test_w = max_w - (i / 9);
int test_h = max_h - (i % 9);
+ for (k = 0; k < test_h; k += pu_height)
+ for (j = 0; j < test_w; j += pu_width) {
+ int w = AOMMIN(pu_width, test_w - j);
+ int h = AOMMIN(pu_height, test_h - k);
+ uint8_t *input_p = input + k * stride + j;
+ uint8_t *output_p = output + k * out_stride + j;
+ uint8_t *output2_p = output2 + k * out_stride + j;
+ apply_selfguided_restoration(input_p, w, h, stride, eps, xqd,
+ output_p, out_stride, tmpbuf);
+ apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
+ output2_p, out_stride, tmpbuf);
+ }
+ /*
apply_selfguided_restoration(input, test_w, test_h, stride, eps, xqd,
output, out_stride, tmpbuf);
apply_selfguided_restoration_c(input, test_w, test_h, stride, eps, xqd,
output2, out_stride, tmpbuf);
+ */
for (j = 0; j < test_h; ++j)
for (k = 0; k < test_w; ++k) {
ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
@@ -167,9 +192,11 @@
protected:
void RunSpeedTest() {
+ const int pu_width = RESTORATION_PROC_UNIT_SIZE;
+ const int pu_height = RESTORATION_PROC_UNIT_SIZE;
const int width = 256, height = 256, stride = 288, out_stride = 288;
const int NUM_ITERS = 2000;
- int i, j;
+ int i, j, k;
int bit_depth = GET_PARAM(0);
int mask = (1 << bit_depth) - 1;
@@ -201,9 +228,16 @@
std::clock_t start = std::clock();
for (i = 0; i < NUM_ITERS; ++i) {
- apply_selfguided_restoration_highbd(input, width, height, stride,
- bit_depth, eps, xqd, output,
- out_stride, tmpbuf);
+ for (k = 0; k < height; k += pu_height)
+ for (j = 0; j < width; j += pu_width) {
+ int w = AOMMIN(pu_width, width - j);
+ int h = AOMMIN(pu_height, height - k);
+ uint16_t *input_p = input + k * stride + j;
+ uint16_t *output_p = output + k * out_stride + j;
+ apply_selfguided_restoration_highbd(input_p, w, h, stride, bit_depth,
+ eps, xqd, output_p, out_stride,
+ tmpbuf);
+ }
}
std::clock_t end = std::clock();
double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
@@ -217,6 +251,8 @@
}
void RunCorrectnessTest() {
+ const int pu_width = RESTORATION_PROC_UNIT_SIZE;
+ const int pu_height = RESTORATION_PROC_UNIT_SIZE;
// Set the maximum width/height to test here. We actually test a small
// range of sizes *up to* this size, so that we can check, eg.,
// the behaviour on tiles which are not a multiple of 4 wide.
@@ -259,12 +295,29 @@
int test_w = max_w - (i / 9);
int test_h = max_h - (i % 9);
+ for (k = 0; k < test_h; k += pu_height)
+ for (j = 0; j < test_w; j += pu_width) {
+ int w = AOMMIN(pu_width, test_w - j);
+ int h = AOMMIN(pu_height, test_h - k);
+ uint16_t *input_p = input + k * stride + j;
+ uint16_t *output_p = output + k * out_stride + j;
+ uint16_t *output2_p = output2 + k * out_stride + j;
+ apply_selfguided_restoration_highbd(input_p, w, h, stride, bit_depth,
+ eps, xqd, output_p, out_stride,
+ tmpbuf);
+ apply_selfguided_restoration_highbd_c(input_p, w, h, stride,
+ bit_depth, eps, xqd, output2_p,
+ out_stride, tmpbuf);
+ }
+
+ /*
apply_selfguided_restoration_highbd(input, test_w, test_h, stride,
bit_depth, eps, xqd, output,
out_stride, tmpbuf);
apply_selfguided_restoration_highbd_c(input, test_w, test_h, stride,
bit_depth, eps, xqd, output2,
out_stride, tmpbuf);
+ */
for (j = 0; j < test_h; ++j)
for (k = 0; k < test_w; ++k)
ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);