Refactoring/simplification of buffers used for sgr. Includes miscellaneous cleanups, test fixes, and code reorganization for loop-restoration components. Change-Id: I5b2e6419234d945e6f4344b22636119b50df4054
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index d70e0f5..eb90586 100755 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -628,7 +628,7 @@ add_proto qw/void apply_selfguided_restoration/, "uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf"; specialize qw/apply_selfguided_restoration sse4_1/; - add_proto qw/void av1_selfguided_restoration/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps, int32_t *tmpbuf"; + add_proto qw/void av1_selfguided_restoration/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps"; specialize qw/av1_selfguided_restoration sse4_1/; add_proto qw/void av1_highpass_filter/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps"; @@ -638,7 +638,7 @@ add_proto qw/void apply_selfguided_restoration_highbd/, "uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf"; specialize qw/apply_selfguided_restoration_highbd sse4_1/; - add_proto qw/void av1_selfguided_restoration_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps, int32_t *tmpbuf"; + add_proto qw/void av1_selfguided_restoration_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps"; specialize qw/av1_selfguided_restoration_highbd sse4_1/; add_proto qw/void av1_highpass_filter_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps";
diff --git a/av1/common/restoration.c b/av1/common/restoration.c index 989e0c9..b2d6340 100644 --- a/av1/common/restoration.c +++ b/av1/common/restoration.c
@@ -22,14 +22,6 @@ #include "aom_ports/mem.h" -#define USE_SIMPLER_SGR 1 - -#define MAX_RADIUS 3 // Only 1, 2, 3 allowed -#define MAX_EPS 80 // Max value of eps -#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1)) -#define SGRPROJ_MTABLE_BITS 20 -#define SGRPROJ_RECIP_BITS 12 - const sgr_params_type sgr_params[SGRPROJ_PARAMS] = { #if USE_HIGHPASS_IN_SGRPROJ // corner, edge, r2, eps2 @@ -39,7 +31,7 @@ { -3, 4, 1, 5 }, { -3, 4, 1, 6 }, { -3, 4, 1, 7 }, { -3, 4, 1, 8 } #else // r1, eps1, r2, eps2 -#if USE_SIMPLER_SGR +#if MAX_RADIUS == 2 { 2, 12, 1, 4 }, { 2, 15, 1, 6 }, { 2, 18, 1, 8 }, { 2, 20, 1, 9 }, { 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 }, { 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 2, 30, 1, 2 }, @@ -49,7 +41,7 @@ { 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 }, { 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 3, 30, 1, 10 }, { 3, 50, 1, 12 }, { 3, 50, 2, 25 }, { 3, 60, 2, 35 }, { 3, 70, 2, 45 }, -#endif // USE_SIMPLER_SGR +#endif // MAX_RADIUS == 2 #endif }; @@ -112,21 +104,22 @@ rst->keyframe = kf; } -void extend_frame(uint8_t *data, int width, int height, int stride) { +void extend_frame(uint8_t *data, int width, int height, int stride, + int border_horz, int border_vert) { uint8_t *data_p; int i; for (i = 0; i < height; ++i) { data_p = data + i * stride; - memset(data_p - WIENER_HALFWIN, data_p[0], WIENER_HALFWIN); - memset(data_p + width, data_p[width - 1], WIENER_HALFWIN); + memset(data_p - border_horz, data_p[0], border_horz); + memset(data_p + width, data_p[width - 1], border_horz); } - data_p = data - WIENER_HALFWIN; - for (i = -WIENER_HALFWIN; i < 0; ++i) { - memcpy(data_p + i * stride, data_p, width + 2 * WIENER_HALFWIN); + data_p = data - border_horz; + for (i = -border_vert; i < 0; ++i) { + memcpy(data_p + i * stride, data_p, width + 2 * border_horz); } - for (i = height; i < height + WIENER_HALFWIN; ++i) { + for (i = height; i < height + border_vert; ++i) { 
memcpy(data_p + i * stride, data_p + (height - 1) * stride, - width + 2 * WIENER_HALFWIN); + width + 2 * border_horz); } } @@ -256,7 +249,8 @@ RestorationInternal *rst, uint8_t *dst, int dst_stride) { int tile_idx; - extend_frame(data, width, height, stride); + extend_frame(data, width, height, stride, WIENER_BORDER_HORZ, + WIENER_BORDER_VERT); for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) { loop_wiener_filter_tile(data, tile_idx, width, height, stride, rst, dst, dst_stride); @@ -639,16 +633,17 @@ const int32_t one_by_x[MAX_NELEM] = { 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315, - 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, 158, - 152, 146, 141, 137, 132, 128, 124, 120, 117, 114, 111, 108, 105, - 102, 100, 98, 95, 93, 91, 89, 87, 85, 84 + 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, +#if MAX_RADIUS > 2 + 158, 152, 146, 141, 137, 132, 128, 124, 120, 117, 114, 111, 108, + 105, 102, 100, 98, 95, 93, 91, 89, 87, 85, 84 +#endif // MAX_RADIUS > 2 }; static void av1_selfguided_restoration_internal(int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst, int dst_stride, - int bit_depth, int r, int eps, - int32_t *tmpbuf) { + int bit_depth, int r, int eps) { const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; const int num_stride = width_ext; @@ -657,10 +652,11 @@ // We also align the stride to a multiple of 16 bytes, for consistency // with the SIMD version of this function. 
int buf_stride = ((width_ext + 3) & ~3) + 16; - - int32_t *A = tmpbuf; - int32_t *B = tmpbuf + SGRPROJ_OUTBUF_SIZE; - int8_t num_[RESTORATION_TILEPELS_MAX]; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *A = A_; + int32_t *B = B_; + int8_t num_[RESTORATION_PROC_UNIT_PELS]; int8_t *num = num_ + SGRPROJ_BORDER_VERT * num_stride + SGRPROJ_BORDER_HORZ; int i, j; @@ -844,10 +840,11 @@ void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, - int r, int eps, int32_t *tmpbuf) { + int r, int eps) { + int32_t dgd32_[RESTORATION_PROC_UNIT_PELS]; const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ; int32_t *dgd32 = - tmpbuf + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; + dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; int i, j; for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) { for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) { @@ -855,8 +852,7 @@ } } av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst, - dst_stride, 8, r, eps, - tmpbuf + RESTORATION_TILEPELS_MAX); + dst_stride, 8, r, eps); } void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride, @@ -955,7 +951,6 @@ int xq[2]; int32_t *flt1 = tmpbuf; int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX; - int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX; int i, j; assert(width * height <= RESTORATION_TILEPELS_MAX); #if USE_HIGHPASS_IN_SGRPROJ @@ -963,10 +958,10 @@ sgr_params[eps].corner, sgr_params[eps].edge); #else av1_selfguided_restoration_c(dat, width, height, stride, flt1, width, - sgr_params[eps].r1, sgr_params[eps].e1, tmpbuf2); + sgr_params[eps].r1, sgr_params[eps].e1); #endif // USE_HIGHPASS_IN_SGRPROJ av1_selfguided_restoration_c(dat, width, height, stride, flt2, width, - sgr_params[eps].r2, sgr_params[eps].e2, tmpbuf2); + sgr_params[eps].r2, sgr_params[eps].e2); decode_xq(xqd, xq); for (i = 0; 
i < height; ++i) { for (j = 0; j < width; ++j) { @@ -1009,7 +1004,7 @@ int h = AOMMIN(procunit_height, v_end - i); uint8_t *data_p = data + i * stride + j; uint8_t *dst_p = dst + i * dst_stride + j; - apply_selfguided_restoration_c( + apply_selfguided_restoration( data_p, w, h, stride, rst->rsi->sgrproj_info[tile_idx].ep, rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, dst_stride, rst->tmpbuf); } @@ -1019,7 +1014,8 @@ int stride, RestorationInternal *rst, uint8_t *dst, int dst_stride) { int tile_idx; - extend_frame(data, width, height, stride); + extend_frame(data, width, height, stride, SGRPROJ_BORDER_HORZ, + SGRPROJ_BORDER_VERT); for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) { loop_sgrproj_filter_tile(data, tile_idx, width, height, stride, rst, dst, dst_stride); @@ -1030,7 +1026,8 @@ int stride, RestorationInternal *rst, uint8_t *dst, int dst_stride) { int tile_idx; - extend_frame(data, width, height, stride); + extend_frame(data, width, height, stride, RESTORATION_BORDER_HORZ, + RESTORATION_BORDER_VERT); for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) { if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) { loop_copy_tile(data, tile_idx, 0, 0, width, height, stride, rst, dst, @@ -1046,23 +1043,23 @@ } #if CONFIG_HIGHBITDEPTH -void extend_frame_highbd(uint16_t *data, int width, int height, int stride) { +void extend_frame_highbd(uint16_t *data, int width, int height, int stride, + int border_horz, int border_vert) { uint16_t *data_p; int i, j; for (i = 0; i < height; ++i) { data_p = data + i * stride; - for (j = -WIENER_HALFWIN; j < 0; ++j) data_p[j] = data_p[0]; - for (j = width; j < width + WIENER_HALFWIN; ++j) - data_p[j] = data_p[width - 1]; + for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0]; + for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1]; } - data_p = data - WIENER_HALFWIN; - for (i = -WIENER_HALFWIN; i < 0; ++i) { + data_p = data - border_horz; + for (i = -border_vert; i < 0; ++i) { memcpy(data_p + i 
* stride, data_p, - (width + 2 * WIENER_HALFWIN) * sizeof(uint16_t)); + (width + 2 * border_horz) * sizeof(uint16_t)); } - for (i = height; i < height + WIENER_HALFWIN; ++i) { + for (i = height; i < height + border_vert; ++i) { memcpy(data_p + i * stride, data_p + (height - 1) * stride, - (width + 2 * WIENER_HALFWIN) * sizeof(uint16_t)); + (width + 2 * border_horz) * sizeof(uint16_t)); } } @@ -1171,7 +1168,8 @@ uint16_t *data = CONVERT_TO_SHORTPTR(data8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); int tile_idx; - extend_frame_highbd(data, width, height, stride); + extend_frame_highbd(data, width, height, stride, WIENER_BORDER_HORZ, + WIENER_BORDER_VERT); for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) { loop_wiener_filter_tile_highbd(data, tile_idx, width, height, stride, rst, bit_depth, dst, dst_stride); @@ -1181,10 +1179,11 @@ void av1_selfguided_restoration_highbd_c(uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, - int eps, int32_t *tmpbuf) { + int eps) { + int32_t dgd32_[RESTORATION_PROC_UNIT_PELS]; const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ; int32_t *dgd32 = - tmpbuf + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; + dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; int i, j; for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) { for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) { @@ -1192,8 +1191,7 @@ } } av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst, - dst_stride, bit_depth, r, eps, - tmpbuf + RESTORATION_TILEPELS_MAX); + dst_stride, bit_depth, r, eps); } void av1_highpass_filter_highbd_c(uint16_t *dgd, int width, int height, @@ -1294,7 +1292,6 @@ int xq[2]; int32_t *flt1 = tmpbuf; int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX; - int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX; int i, j; assert(width * height <= RESTORATION_TILEPELS_MAX); #if USE_HIGHPASS_IN_SGRPROJ @@ -1303,11 +1300,11 
@@ #else av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt1, width, bit_depth, sgr_params[eps].r1, - sgr_params[eps].e1, tmpbuf2); + sgr_params[eps].e1); #endif // USE_HIGHPASS_IN_SGRPROJ av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt2, width, bit_depth, sgr_params[eps].r2, - sgr_params[eps].e2, tmpbuf2); + sgr_params[eps].e2); decode_xq(xqd, xq); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { @@ -1351,7 +1348,7 @@ int h = AOMMIN(procunit_height, v_end - i); uint16_t *data_p = data + i * stride + j; uint16_t *dst_p = dst + i * dst_stride + j; - apply_selfguided_restoration_highbd_c( + apply_selfguided_restoration_highbd( data_p, w, h, stride, bit_depth, rst->rsi->sgrproj_info[tile_idx].ep, rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, dst_stride, rst->tmpbuf); } @@ -1364,7 +1361,8 @@ int tile_idx; uint16_t *data = CONVERT_TO_SHORTPTR(data8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - extend_frame_highbd(data, width, height, stride); + extend_frame_highbd(data, width, height, stride, SGRPROJ_BORDER_HORZ, + SGRPROJ_BORDER_VERT); for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) { loop_sgrproj_filter_tile_highbd(data, tile_idx, width, height, stride, rst, bit_depth, dst, dst_stride); @@ -1378,7 +1376,8 @@ uint16_t *data = CONVERT_TO_SHORTPTR(data8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); int tile_idx; - extend_frame_highbd(data, width, height, stride); + extend_frame_highbd(data, width, height, stride, RESTORATION_BORDER_HORZ, + RESTORATION_BORDER_VERT); for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) { if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) { loop_copy_tile_highbd(data, tile_idx, 0, 0, width, height, stride, rst,
diff --git a/av1/common/restoration.h b/av1/common/restoration.h index 30ddc11..efeac48 100644 --- a/av1/common/restoration.h +++ b/av1/common/restoration.h
@@ -25,10 +25,28 @@ #define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5)) #define RESTORATION_PROC_UNIT_SIZE 64 -// Determines line buffer requirement for LR. Should be set at the max -// of SGRPROJ_BORDER_VERT and WIENER_BORDER_VERT -#define RESTORATION_BORDER_VERT 0 -#define RESTORATION_BORDER_HORZ 3 // Do not change this + +#define SGRPROJ_BORDER_VERT 0 // Vertical border used for Sgr +#define SGRPROJ_BORDER_HORZ 2 // Horizontal border used for Sgr + +#define WIENER_BORDER_VERT 0 // Vertical border used for Wiener +#define WIENER_HALFWIN 3 +#define WIENER_BORDER_HORZ (WIENER_HALFWIN) // Horizontal border for Wiener + +// RESTORATION_BORDER_VERT determines line buffer requirement for LR. +// Should be set at the max of SGRPROJ_BORDER_VERT and WIENER_BORDER_VERT. +// Note the line buffer needed is twice the value of this macro. +#if SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT +#define RESTORATION_BORDER_VERT (SGRPROJ_BORDER_VERT) +#else +#define RESTORATION_BORDER_VERT (WIENER_BORDER_VERT) +#endif // SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT + +#if SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ +#define RESTORATION_BORDER_HORZ (SGRPROJ_BORDER_HORZ) +#else +#define RESTORATION_BORDER_HORZ (WIENER_BORDER_HORZ) +#endif // SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT // Pad up to 20 more (may be much less is needed) #define RESTORATION_PADDING 20 @@ -39,28 +57,20 @@ RESTORATION_PADDING)) #define RESTORATION_TILESIZE_MAX 256 -#define RESTORATION_TILEPELS_MAX \ - (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ) * \ - (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT) +#define RESTORATION_TILEPELS_MAX \ + ((RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16) * \ + (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT)) -// 4 32-bit buffers needed for the filter: -// 2 for the restored versions of the frame and -// 2 for each restoration operation -#define SGRPROJ_OUTBUF_SIZE \ - ((RESTORATION_TILESIZE_MAX * 3 / 2 + 2 
* RESTORATION_BORDER_VERT) * \ - (RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16)) -#define SGRPROJ_TMPBUF_SIZE \ - (RESTORATION_TILEPELS_MAX * 2 * sizeof(int32_t) + \ - SGRPROJ_OUTBUF_SIZE * 3 * sizeof(int32_t) + 2 * RESTORATION_PROC_UNIT_PELS) +// Two 32-bit buffers needed for the restored versions from two filters +// TODO(debargha, rupert): Refactor to not need the large tilesize to be stored +// on the decoder side. +#define SGRPROJ_TMPBUF_SIZE (RESTORATION_TILEPELS_MAX * 2 * sizeof(int32_t)) #define SGRPROJ_EXTBUF_SIZE (0) #define SGRPROJ_PARAMS_BITS 4 #define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS) #define USE_HIGHPASS_IN_SGRPROJ 0 -#define SGRPROJ_BORDER_VERT 0 // Vertical border used for sgr -#define SGRPROJ_BORDER_HORZ 2 // Horizontal border used for sgr - // Precision bits for projection #define SGRPROJ_PRJ_BITS 7 // Restoration precision bits generated higher than source before projection @@ -85,15 +95,12 @@ #define SGRPROJ_BITS (SGRPROJ_PRJ_BITS * 2 + SGRPROJ_PARAMS_BITS) -#define MAX_RADIUS 3 // Only 1, 2, 3 allowed +#define MAX_RADIUS 2 // Only 1, 2, 3 allowed #define MAX_EPS 80 // Max value of eps #define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1)) #define SGRPROJ_MTABLE_BITS 20 #define SGRPROJ_RECIP_BITS 12 -#define WIENER_HALFWIN 3 -#define WIENER_BORDER_HORZ (WIENER_HALFWIN) -#define WIENER_BORDER_VERT 0 #define WIENER_HALFWIN1 (WIENER_HALFWIN + 1) #define WIENER_WIN (2 * WIENER_HALFWIN + 1) #define WIENER_WIN2 ((WIENER_WIN) * (WIENER_WIN)) @@ -268,9 +275,11 @@ int height); void av1_free_restoration_struct(RestorationInfo *rst_info); -void extend_frame(uint8_t *data, int width, int height, int stride); +void extend_frame(uint8_t *data, int width, int height, int stride, + int border_horz, int border_vert); #if CONFIG_HIGHBITDEPTH -void extend_frame_highbd(uint16_t *data, int width, int height, int stride); +void extend_frame_highbd(uint16_t *data, int width, int height, int stride, + int border_horz, int 
border_vert); #endif // CONFIG_HIGHBITDEPTH void decode_xq(int *xqd, int *xq); void av1_loop_restoration_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
diff --git a/av1/common/x86/selfguided_sse4.c b/av1/common/x86/selfguided_sse4.c index d0716e6..4006b85 100644 --- a/av1/common/x86/selfguided_sse4.c +++ b/av1/common/x86/selfguided_sse4.c
@@ -664,17 +664,18 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height, int dgd_stride, int32_t *dst, - int dst_stride, int r, int eps, - int32_t *tmpbuf) { + int dst_stride, int r, int eps) { const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; - int32_t *A = tmpbuf; - int32_t *B = A + SGRPROJ_OUTBUF_SIZE; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *A = A_; + int32_t *B = B_; int i, j; // Adjusting the stride of A and B here appears to avoid bad cache effects, // leading to a significant speed improvement. // We also align the stride to a multiple of 16 bytes for efficiency. - int buf_stride = ((width + 3) & ~3) + 16; + int buf_stride = ((width_ext + 3) & ~3) + 16; // Don't filter tiles with dimensions < 5 on any axis if ((width < 5) || (height < 5)) return; @@ -1059,7 +1060,6 @@ int xq[2]; int32_t *flt1 = tmpbuf; int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX; - int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX; int i, j; assert(width * height <= RESTORATION_TILEPELS_MAX); #if USE_HIGHPASS_IN_SGRPROJ @@ -1067,12 +1067,10 @@ sgr_params[eps].corner, sgr_params[eps].edge); #else av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt1, width, - sgr_params[eps].r1, sgr_params[eps].e1, - tmpbuf2); + sgr_params[eps].r1, sgr_params[eps].e1); #endif // USE_HIGHPASS_IN_SGRPROJ av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt2, width, - sgr_params[eps].r2, sgr_params[eps].e2, - tmpbuf2); + sgr_params[eps].r2, sgr_params[eps].e2); decode_xq(xqd, xq); __m128i xq0 = _mm_set1_epi32(xq[0]); @@ -1374,17 +1372,18 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width, int height, int dgd_stride, int32_t *dst, int dst_stride, - int bit_depth, int r, int eps, - int32_t *tmpbuf) { + int bit_depth, int r, int eps) { const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 
* SGRPROJ_BORDER_VERT; - int32_t *A = tmpbuf; - int32_t *B = A + SGRPROJ_OUTBUF_SIZE; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *A = A_; + int32_t *B = B_; int i, j; // Adjusting the stride of A and B here appears to avoid bad cache effects, // leading to a significant speed improvement. // We also align the stride to a multiple of 16 bytes for efficiency. - int buf_stride = ((width + 3) & ~3) + 16; + int buf_stride = ((width_ext + 3) & ~3) + 16; // Don't filter tiles with dimensions < 5 on any axis if ((width < 5) || (height < 5)) return; @@ -1741,7 +1740,6 @@ int xq[2]; int32_t *flt1 = tmpbuf; int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX; - int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX; int i, j; assert(width * height <= RESTORATION_TILEPELS_MAX); #if USE_HIGHPASS_IN_SGRPROJ @@ -1751,11 +1749,11 @@ #else av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt1, width, bit_depth, sgr_params[eps].r1, - sgr_params[eps].e1, tmpbuf2); + sgr_params[eps].e1); #endif // USE_HIGHPASS_IN_SGRPROJ av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt2, width, bit_depth, sgr_params[eps].r2, - sgr_params[eps].e2, tmpbuf2); + sgr_params[eps].e2); decode_xq(xqd, xq); __m128i xq0 = _mm_set1_epi32(xq[0]);
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c index 98fbfb5..5650a48 100644 --- a/av1/encoder/pickrst.c +++ b/av1/encoder/pickrst.c
@@ -29,9 +29,9 @@ #include "av1/encoder/av1_quantize.h" #include "av1/encoder/encoder.h" +#include "av1/encoder/mathutils.h" #include "av1/encoder/picklpf.h" #include "av1/encoder/pickrst.h" -#include "av1/encoder/mathutils.h" // When set to RESTORE_WIENER or RESTORE_SGRPROJ only those are allowed. // When set to RESTORE_TYPES we allow switchable. @@ -354,7 +354,6 @@ int32_t *rstbuf) { int32_t *flt1 = rstbuf; int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX; - int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX; int ep, bestep = 0; int64_t err, besterr = -1; int exqd[2], bestxqd[2] = { 0, 0 }; @@ -387,11 +386,11 @@ #else av1_selfguided_restoration_highbd( dat_p, w, h, dat_stride, flt1_p, flt1_stride, bit_depth, - sgr_params[ep].r1, sgr_params[ep].e1, tmpbuf2); + sgr_params[ep].r1, sgr_params[ep].e1); #endif // USE_HIGHPASS_IN_SGRPROJ av1_selfguided_restoration_highbd( dat_p, w, h, dat_stride, flt2_p, flt2_stride, bit_depth, - sgr_params[ep].r2, sgr_params[ep].e2, tmpbuf2); + sgr_params[ep].r2, sgr_params[ep].e2); } } else { #endif @@ -407,12 +406,11 @@ sgr_params[ep].corner, sgr_params[ep].edge); #else av1_selfguided_restoration(dat_p, w, h, dat_stride, flt1_p, flt1_stride, - sgr_params[ep].r1, sgr_params[ep].e1, - tmpbuf2); + sgr_params[ep].r1, sgr_params[ep].e1); #endif // USE_HIGHPASS_IN_SGRPROJ av1_selfguided_restoration(dat_p, w, h, dat_stride, flt2_p, flt2_stride, sgr_params[ep].r2, - sgr_params[ep].e2, tmpbuf2); + sgr_params[ep].e2); } #if CONFIG_HIGHBITDEPTH } @@ -640,11 +638,12 @@ #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) extend_frame_highbd(CONVERT_TO_SHORTPTR(ctxt.dgd_buffer), ctxt.plane_width, - ctxt.plane_height, ctxt.dgd_stride); + ctxt.plane_height, ctxt.dgd_stride, SGRPROJ_BORDER_HORZ, + SGRPROJ_BORDER_VERT); else #endif extend_frame(ctxt.dgd_buffer, ctxt.plane_width, ctxt.plane_height, - ctxt.dgd_stride); + ctxt.dgd_stride, SGRPROJ_BORDER_HORZ, SGRPROJ_BORDER_VERT); for (int tile_row = 0; tile_row < cm->tile_rows; ++tile_row) { for (int 
tile_col = 0; tile_col < cm->tile_cols; ++tile_col) { @@ -1242,14 +1241,17 @@ AV1_COMMON *const cm = &cpi->common; // Construct a (WIENER_HALFWIN)-pixel border around the frame +// Note use this border to gather stats even though the actual filter +// may use less border on the top/bottom of a processing unit. #if CONFIG_HIGHBITDEPTH if (cm->use_highbitdepth) extend_frame_highbd(CONVERT_TO_SHORTPTR(ctxt.dgd_buffer), ctxt.plane_width, - ctxt.plane_height, ctxt.dgd_stride); + ctxt.plane_height, ctxt.dgd_stride, WIENER_HALFWIN, + WIENER_HALFWIN); else #endif extend_frame(ctxt.dgd_buffer, ctxt.plane_width, ctxt.plane_height, - ctxt.dgd_stride); + ctxt.dgd_stride, WIENER_HALFWIN, WIENER_HALFWIN); // Compute best Wiener filters for each rtile, one (encoder/decoder) // tile at a time.
diff --git a/test/selfguided_filter_test.cc b/test/selfguided_filter_test.cc index 63b5bac..55ce1d5 100644 --- a/test/selfguided_filter_test.cc +++ b/test/selfguided_filter_test.cc
@@ -40,9 +40,11 @@ protected: void RunSpeedTest() { + const int pu_width = RESTORATION_PROC_UNIT_SIZE; + const int pu_height = RESTORATION_PROC_UNIT_SIZE; const int width = 256, height = 256, stride = 288, out_stride = 288; const int NUM_ITERS = 2000; - int i, j; + int i, j, k; uint8_t *input_ = (uint8_t *)aom_memalign(16, stride * (height + 32) * sizeof(uint8_t)); @@ -72,8 +74,15 @@ std::clock_t start = std::clock(); for (i = 0; i < NUM_ITERS; ++i) { - apply_selfguided_restoration(input, width, height, stride, eps, xqd, - output, out_stride, tmpbuf); + for (k = 0; k < height; k += pu_height) + for (j = 0; j < width; j += pu_width) { + int w = AOMMIN(pu_width, width - j); + int h = AOMMIN(pu_height, height - k); + uint8_t *input_p = input + k * stride + j; + uint8_t *output_p = output + k * out_stride + j; + apply_selfguided_restoration(input_p, w, h, stride, eps, xqd, + output_p, out_stride, tmpbuf); + } } std::clock_t end = std::clock(); double elapsed = ((end - start) / (double)CLOCKS_PER_SEC); @@ -87,6 +96,8 @@ } void RunCorrectnessTest() { + const int pu_width = RESTORATION_PROC_UNIT_SIZE; + const int pu_height = RESTORATION_PROC_UNIT_SIZE; // Set the maximum width/height to test here. We actually test a small // range of sizes *up to* this size, so that we can check, eg., // the behaviour on tiles which are not a multiple of 4 wide. 
@@ -127,10 +138,24 @@ int test_w = max_w - (i / 9); int test_h = max_h - (i % 9); + for (k = 0; k < test_h; k += pu_height) + for (j = 0; j < test_w; j += pu_width) { + int w = AOMMIN(pu_width, test_w - j); + int h = AOMMIN(pu_height, test_h - k); + uint8_t *input_p = input + k * stride + j; + uint8_t *output_p = output + k * out_stride + j; + uint8_t *output2_p = output2 + k * out_stride + j; + apply_selfguided_restoration(input_p, w, h, stride, eps, xqd, + output_p, out_stride, tmpbuf); + apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd, + output2_p, out_stride, tmpbuf); + } + /* apply_selfguided_restoration(input, test_w, test_h, stride, eps, xqd, output, out_stride, tmpbuf); apply_selfguided_restoration_c(input, test_w, test_h, stride, eps, xqd, output2, out_stride, tmpbuf); + */ for (j = 0; j < test_h; ++j) for (k = 0; k < test_w; ++k) { ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]); @@ -167,9 +192,11 @@ protected: void RunSpeedTest() { + const int pu_width = RESTORATION_PROC_UNIT_SIZE; + const int pu_height = RESTORATION_PROC_UNIT_SIZE; const int width = 256, height = 256, stride = 288, out_stride = 288; const int NUM_ITERS = 2000; - int i, j; + int i, j, k; int bit_depth = GET_PARAM(0); int mask = (1 << bit_depth) - 1; @@ -201,9 +228,16 @@ std::clock_t start = std::clock(); for (i = 0; i < NUM_ITERS; ++i) { - apply_selfguided_restoration_highbd(input, width, height, stride, - bit_depth, eps, xqd, output, - out_stride, tmpbuf); + for (k = 0; k < height; k += pu_height) + for (j = 0; j < width; j += pu_width) { + int w = AOMMIN(pu_width, width - j); + int h = AOMMIN(pu_height, height - k); + uint16_t *input_p = input + k * stride + j; + uint16_t *output_p = output + k * out_stride + j; + apply_selfguided_restoration_highbd(input_p, w, h, stride, bit_depth, + eps, xqd, output_p, out_stride, + tmpbuf); + } } std::clock_t end = std::clock(); double elapsed = ((end - start) / (double)CLOCKS_PER_SEC); @@ -217,6 +251,8 @@ } void 
RunCorrectnessTest() { + const int pu_width = RESTORATION_PROC_UNIT_SIZE; + const int pu_height = RESTORATION_PROC_UNIT_SIZE; // Set the maximum width/height to test here. We actually test a small // range of sizes *up to* this size, so that we can check, eg., // the behaviour on tiles which are not a multiple of 4 wide. @@ -259,12 +295,29 @@ int test_w = max_w - (i / 9); int test_h = max_h - (i % 9); + for (k = 0; k < test_h; k += pu_height) + for (j = 0; j < test_w; j += pu_width) { + int w = AOMMIN(pu_width, test_w - j); + int h = AOMMIN(pu_height, test_h - k); + uint16_t *input_p = input + k * stride + j; + uint16_t *output_p = output + k * out_stride + j; + uint16_t *output2_p = output2 + k * out_stride + j; + apply_selfguided_restoration_highbd(input_p, w, h, stride, bit_depth, + eps, xqd, output_p, out_stride, + tmpbuf); + apply_selfguided_restoration_highbd_c(input_p, w, h, stride, + bit_depth, eps, xqd, output2_p, + out_stride, tmpbuf); + } + + /* apply_selfguided_restoration_highbd(input, test_w, test_h, stride, bit_depth, eps, xqd, output, out_stride, tmpbuf); apply_selfguided_restoration_highbd_c(input, test_w, test_h, stride, bit_depth, eps, xqd, output2, out_stride, tmpbuf); + */ for (j = 0; j < test_h; ++j) for (k = 0; k < test_w; ++k) ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);