Add new convolve variant for loop-restoration The convolve filters generated by loop_wiener_filter_tile are not compatible with some existing convolve implementations (they can have coefficients >128, sums of (certain subsets of) coefficients >128, etc.) So we implement a new variant, which takes a filter with 128 subtracted from its central element and which adds an extra copy of the source just before clipping to a pixel (reinstating the 128 we subtracted). This should be easy to adapt from the existing convolve functions, and this patch includes SSE2 highbd and SSSE3 lowbd implementations. Change-Id: I0abf4c2915f0665c49d88fe450dbc77b783f69e1
diff --git a/aom_dsp/aom_convolve.c b/aom_dsp/aom_convolve.c index a079d1b..addfc31 100644 --- a/aom_dsp/aom_convolve.c +++ b/aom_dsp/aom_convolve.c
@@ -332,6 +332,122 @@ filter_y, y_step_q4, w, h); } +#if CONFIG_LOOP_RESTORATION +static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) + + src_x[SUBPEL_TAPS / 2 - 1]); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) + + src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, int x0_q4, + int x_step_q4, const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, int w, int h) { + uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; + int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + + assert(y_step_q4 <= 32); + 
assert(x_step_q4 <= 32); + + convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, + temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w, + intermediate_height); + convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, + dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h); +} + +void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + (void)filter_y; + (void)y_step_q4; + + convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h); +} + +void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + + convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h); +} + +void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, filters_y, y0_q4, y_step_q4, w, h); +} +#endif // CONFIG_LOOP_RESTORATION + #if CONFIG_AOM_HIGHBITDEPTH static void highbd_convolve_horiz(const 
uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, @@ -597,4 +713,142 @@ dst += dst_stride; } } -#endif + +#if CONFIG_LOOP_RESTORATION +static void highbd_convolve_add_src_horiz(const uint8_t *src8, + ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, int w, + int h, int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel_highbd( + ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1], + bd); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void highbd_convolve_add_src_vert(const uint8_t *src8, + ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, int h, + int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) + + src_y[(SUBPEL_TAPS / 2 - 1) * src_stride], + bd); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + int x0_q4, int 
x_step_q4, + const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, int w, int h, + int bd) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; + int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + highbd_convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, CONVERT_TO_BYTEPTR(temp), + MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w, + intermediate_height, bd); + highbd_convolve_add_src_vert( + CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_add_src_horiz_c( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; + (void)y_step_q4; + + highbd_convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, + x0_q4, x_step_q4, w, h, bd); +} + +void 
aom_highbd_convolve8_add_src_vert_c(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + (void)filter_x; + (void)x_step_q4; + + highbd_convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, + y0_q4, y_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + highbd_convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd); +} +#endif // CONFIG_LOOP_RESTORATION +#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index de150e6..11c8109 100644 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -734,6 +734,16 @@ specialize qw/aom_convolve8_avg_vert sse2 ssse3/; specialize qw/aom_scaled_2d ssse3/; +if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") { + add_proto qw/void aom_convolve8_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; + add_proto qw/void aom_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; + add_proto qw/void aom_convolve8_add_src_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; + + specialize qw/aom_convolve8_add_src ssse3/; + specialize qw/aom_convolve8_add_src_horiz ssse3/; + specialize qw/aom_convolve8_add_src_vert ssse3/; +} # CONFIG_LOOP_RESTORATION + # TODO(any): These need to be extended to up to 128x128 block sizes if (!(aom_config("CONFIG_AV1") eq "yes" && aom_config("CONFIG_EXT_PARTITION") eq "yes")) { specialize qw/aom_convolve_copy neon dspr2 msa/; @@ -770,6 +780,16 @@ add_proto qw/void aom_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; specialize qw/aom_highbd_convolve8_avg_vert/, "$sse2_x86_64"; + + if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") { + add_proto qw/void aom_highbd_convolve8_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void aom_highbd_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int 
x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void aom_highbd_convolve8_add_src_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + + specialize qw/aom_highbd_convolve8_add_src sse2/; + specialize qw/aom_highbd_convolve8_add_src_horiz sse2/; + specialize qw/aom_highbd_convolve8_add_src_vert sse2/; + } # CONFIG_LOOP_RESTORATION } # CONFIG_AOM_HIGHBITDEPTH #
diff --git a/aom_dsp/x86/aom_asm_stubs.c b/aom_dsp/x86/aom_asm_stubs.c index 1b71a9f..97a7df4 100644 --- a/aom_dsp/x86/aom_asm_stubs.c +++ b/aom_dsp/x86/aom_asm_stubs.c
@@ -159,5 +159,24 @@ // int w, int h, int bd); HIGH_FUN_CONV_2D(, sse2); HIGH_FUN_CONV_2D(avg_, sse2); + +#if CONFIG_LOOP_RESTORATION +// The SSE2 highbd convolve functions can deal with coefficients up to 32767. +// So redirect highbd_convolve8_add_src to regular highbd_convolve8. +void aom_highbd_convolve8_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + ((int16_t *)filter_x)[3] += 128; + ((int16_t *)filter_y)[3] += 128; + aom_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h, bd); + ((int16_t *)filter_x)[3] -= 128; + ((int16_t *)filter_y)[3] -= 128; +} +#endif // CONFIG_LOOP_RESTORATION #endif // CONFIG_AOM_HIGHBITDEPTH && ARCH_X86_64 #endif // HAVE_SSE2
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c index 04a06e5..be37738 100644 --- a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c +++ b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -291,6 +291,14 @@ filter8_1dfunction aom_filter_block1d8_h8_avg_ssse3; filter8_1dfunction aom_filter_block1d4_v8_avg_ssse3; filter8_1dfunction aom_filter_block1d4_h8_avg_ssse3; +#if CONFIG_LOOP_RESTORATION +filter8_1dfunction aom_filter_block1d16_v8_add_src_ssse3; +filter8_1dfunction aom_filter_block1d16_h8_add_src_ssse3; +filter8_1dfunction aom_filter_block1d8_v8_add_src_ssse3; +filter8_1dfunction aom_filter_block1d8_h8_add_src_ssse3; +filter8_1dfunction aom_filter_block1d4_v8_add_src_ssse3; +filter8_1dfunction aom_filter_block1d4_h8_add_src_ssse3; +#endif filter8_1dfunction aom_filter_block1d16_v2_ssse3; filter8_1dfunction aom_filter_block1d16_h2_ssse3; @@ -331,6 +339,13 @@ FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, ssse3); +#if CONFIG_LOOP_RESTORATION +FUN_CONV_1D_NO_BILINEAR(add_src_horiz, x_step_q4, filter_x, h, src, add_src_, + ssse3); +FUN_CONV_1D_NO_BILINEAR(add_src_vert, y_step_q4, filter_y, v, + src - src_stride * 3, add_src_, ssse3); +#endif + #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ out2, out3, out4, out5, out6, out7) \ { \ @@ -900,3 +915,6 @@ // int w, int h); FUN_CONV_2D(, ssse3); FUN_CONV_2D(avg_, ssse3); +#if CONFIG_LOOP_RESTORATION +FUN_CONV_2D_NO_BILINEAR(add_src_, add_src_, ssse3); +#endif
diff --git a/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/aom_dsp/x86/aom_subpixel_8t_ssse3.asm index 91febbc..357f374 100644 --- a/aom_dsp/x86/aom_subpixel_8t_ssse3.asm +++ b/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
@@ -15,6 +15,7 @@ SECTION_RODATA pw_64: times 8 dw 64 +even_byte_mask: times 8 dw 0x00ff ; %define USE_PMULHRSW ; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss @@ -142,6 +143,14 @@ paddsw m0, m1 paddsw m0, krd psraw m0, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + movu m5, [srcq + sstrideq] + punpckldq m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 1 + punpcklbw m4, m3 + paddsw m0, m4 +%endif packuswb m0, m0 psrldq m1, m0, 4 @@ -178,6 +187,12 @@ paddsw m0, m1 paddsw m0, krd psraw m0, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + punpcklbw m4, m3 + paddsw m0, m4 +%endif packuswb m0, m0 %ifidn %1, h8_avg movd m4, [dstq] @@ -235,6 +250,15 @@ paddsw m6, m0 paddsw m6, krd psraw m6, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + movu m5, [srcq + sstrideq] + punpcklbw m4, m3 + punpcklbw m5, m3 + paddsw m1, m4 + paddsw m6, m5 +%endif packuswb m1, m6 %ifidn %1, h8_avg pavgb m1, m2 @@ -269,6 +293,12 @@ paddsw m1, m4 paddsw m1, krd psraw m1, 7 +%ifidn %1, h8_add_src + pxor m6, m6 + movu m5, [srcq] + punpcklbw m5, m6 + paddsw m1, m5 +%endif packuswb m1, m1 %ifidn %1, h8_avg movh m0, [dstq] @@ -315,6 +345,14 @@ paddsw m4, krd psraw m0, 7 psraw m4, 7 +%ifidn %1, h8_add_src + movu m5, [srcq] + mova m7, m5 + pand m5, [even_byte_mask] + psrlw m7, 8 + paddsw m0, m5 + paddsw m4, m7 +%endif packuswb m0, m0 packuswb m4, m4 punpcklbw m0, m4 @@ -337,6 +375,12 @@ SUBPIX_HFILTER4 h8 SUBPIX_HFILTER4 h8_avg +%if CONFIG_LOOP_RESTORATION +SUBPIX_HFILTER16 h8_add_src +SUBPIX_HFILTER8 h8_add_src +SUBPIX_HFILTER4 h8_add_src +%endif + ;------------------------------------------------------------------------------- ; TODO(Linfeng): Detect cpu type and choose the code with better performance. 
@@ -413,12 +457,23 @@ paddsw m0, krd psraw m0, 7 paddsw m1, m5 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [srcq] + punpcklbw m4, m6 + paddsw m0, m4 +%endif packuswb m0, m0 paddsw m3, m7 paddsw m1, m3 paddsw m1, krd psraw m1, 7 +%ifidn %1, v8_add_src + movu m4, [src1q] + punpcklbw m4, m6 + paddsw m1, m4 +%endif lea srcq, [srcq + sstrideq * 2 ] lea src1q, [src1q + sstrideq * 2] packuswb m1, m1 @@ -462,6 +517,12 @@ paddsw m0, m2 paddsw m0, krd psraw m0, 7 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [srcq] + punpcklbw m4, m6 + paddsw m0, m4 +%endif packuswb m0, m0 %ifidn %1, v8_avg movx m1, [dstq] @@ -643,6 +704,15 @@ paddsw m3, m7 paddsw m3, krd psraw m3, 7 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down + mova m5, m4 + punpcklbw m4, m6 + punpckhbw m5, m6 + paddsw m0, m4 + paddsw m3, m5 +%endif packuswb m0, m3 add srcq, sstrideq @@ -804,3 +874,10 @@ SUBPIX_VFILTER v8_avg, 8 SUBPIX_VFILTER v8, 4 SUBPIX_VFILTER v8_avg, 4 + +%if (ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON) && \ + CONFIG_LOOP_RESTORATION +SUBPIX_VFILTER16 v8_add_src +SUBPIX_VFILTER v8_add_src, 8 +SUBPIX_VFILTER v8_add_src, 4 +%endif
diff --git a/aom_dsp/x86/convolve.h b/aom_dsp/x86/convolve.h index 09e4de8..ca2488d 100644 --- a/aom_dsp/x86/convolve.h +++ b/aom_dsp/x86/convolve.h
@@ -31,11 +31,7 @@ (void)x_step_q4; \ (void)filter_y; \ (void)y_step_q4; \ - if (filter[3] >= 128) { \ - aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h); \ - return; \ - } \ + assert((-128 <= filter[3]) && (filter[3] <= 127)); \ assert(step_q4 == 16); \ if (filter[0] | filter[1] | filter[2]) { \ while (w >= 16) { \ @@ -93,11 +89,8 @@ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ const int16_t *filter_y, int y_step_q4, int w, int h) { \ - if (filter_x[3] >= 128 || filter_y[3] >= 128) { \ - aom_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h); \ - return; \ - } \ + assert((-128 <= filter_x[3]) && (filter_x[3] <= 127)); \ + assert((-128 <= filter_y[3]) && (filter_y[3] <= 127)); \ assert(w <= MAX_SB_SIZE); \ assert(h <= MAX_SB_SIZE); \ assert(x_step_q4 == 16); \ @@ -122,8 +115,71 @@ } \ } -#if CONFIG_AOM_HIGHBITDEPTH +#if CONFIG_LOOP_RESTORATION +// convolve_add_src is only used by the Wiener filter, which will never +// end up calling the bilinear functions (it uses a symmetric filter, so +// the possible numbers of taps are 1,3,5,7) +#define FUN_CONV_1D_NO_BILINEAR(name, step_q4, filter, dir, src_start, avg, \ + opt) \ + void aom_convolve8_##name##_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h) { \ + (void)filter_x; \ + (void)x_step_q4; \ + (void)filter_y; \ + (void)y_step_q4; \ + assert((-128 <= filter[3]) && (filter[3] <= 127)); \ + assert(step_q4 == 16); \ + while (w >= 16) { \ + aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, 
h, filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + if (w) { \ + aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \ + x_step_q4, filter_y, y_step_q4, w, h); \ + } \ + } +#define FUN_CONV_2D_NO_BILINEAR(type, htype, opt) \ + void aom_convolve8_##type##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h) { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \ + assert((-128 <= filter_x[3]) && (filter_x[3] <= 127)); \ + assert((-128 <= filter_y[3]) && (filter_y[3] <= 127)); \ + assert(w <= MAX_SB_SIZE); \ + assert(h <= MAX_SB_SIZE); \ + assert(x_step_q4 == 16); \ + assert(y_step_q4 == 16); \ + aom_convolve8_##htype##horiz_##opt( \ + src - 3 * src_stride, src_stride, fdata2, MAX_SB_SIZE, filter_x, \ + x_step_q4, filter_y, y_step_q4, w, h + 7); \ + aom_convolve8_##type##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \ + dst, dst_stride, filter_x, x_step_q4, \ + filter_y, y_step_q4, w, h); \ + } +#endif + +#if CONFIG_AOM_HIGHBITDEPTH typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, const ptrdiff_t src_pitch, uint16_t *output_ptr,
diff --git a/av1/common/restoration.c b/av1/common/restoration.c index 6573380..7bc2930 100644 --- a/av1/common/restoration.c +++ b/av1/common/restoration.c
@@ -208,6 +208,8 @@ } hkernel[WIENER_WIN] = 0; vkernel[WIENER_WIN] = 0; + hkernel[3] -= 128; + vkernel[3] -= 128; av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles, tile_width, tile_height, width, height, 0, 0, &h_start, &h_end, &v_start, &v_end); @@ -219,8 +221,8 @@ int h = AOMMIN(MAX_SB_SIZE, (v_end - i + 15) & ~15); const uint8_t *data_p = data + i * stride + j; uint8_t *dst_p = dst + i * dst_stride + j; - aom_convolve8(data_p, stride, dst_p, dst_stride, hkernel, 16, vkernel, 16, - w, h); + aom_convolve8_add_src(data_p, stride, dst_p, dst_stride, hkernel, 16, + vkernel, 16, w, h); } } @@ -779,6 +781,8 @@ } hkernel[WIENER_WIN] = 0; vkernel[WIENER_WIN] = 0; + hkernel[3] -= 128; + vkernel[3] -= 128; av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles, tile_width, tile_height, width, height, 0, 0, &h_start, &h_end, &v_start, &v_end); @@ -790,9 +794,9 @@ int h = AOMMIN(MAX_SB_SIZE, (v_end - i + 15) & ~15); const uint16_t *data_p = data + i * stride + j; uint16_t *dst_p = dst + i * dst_stride + j; - aom_highbd_convolve8_c(CONVERT_TO_BYTEPTR(data_p), stride, - CONVERT_TO_BYTEPTR(dst_p), dst_stride, hkernel, 16, - vkernel, 16, w, h, bit_depth); + aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride, + CONVERT_TO_BYTEPTR(dst_p), dst_stride, + hkernel, 16, vkernel, 16, w, h, bit_depth); } }