CLPF: Add quality-dependent damping in the constrain function. PSNR YCbCr: -0.17% -0.03% -0.40%; APSNR YCbCr: -0.17% -0.02% -0.39%; PSNRHVS: -0.06%; SSIM: -0.17%; MSSSIM: -0.07%; CIEDE2000: -0.12%. Change-Id: I69a4b6a4e18c22c3930069396540a6fee45cb30d
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index b64de8e..eafdb2b 100644 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -854,8 +854,8 @@ if (aom_config("CONFIG_CDEF") eq "yes") { if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, unsigned int bd"; - add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd"; - add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd"; + add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd, unsigned int dmp"; + add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd, unsigned int dmp"; # VS compiling for 32 bit targets does not support vector types in # structs as arguments, which makes the v256 type of the intrinsics # hard to support, so optimizations for this target are disabled. 
@@ -866,8 +866,8 @@ } } add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, unsigned int bd"; - add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd"; - add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd"; + add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int dmp"; + add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int dmp"; # VS compiling for 32 bit targets does not support vector types in # structs as arguments, which makes the v256 type of the intrinsics # hard to support, so optimizations for this target are disabled.
diff --git a/av1/common/clpf.c b/av1/common/clpf.c index 8dfe5c0..7d9933d 100644 --- a/av1/common/clpf.c +++ b/av1/common/clpf.c
@@ -16,25 +16,25 @@ int sign(int i) { return i < 0 ? -1 : 1; } -int constrain(int x, int s, unsigned int bitdepth) { +int constrain(int x, int s, unsigned int damping) { return sign(x) * - AOMMAX(0, abs(x) - AOMMAX(0, abs(x) - s + (abs(x) >> (bitdepth - 3 - - get_msb(s))))); + AOMMAX(0, abs(x) - AOMMAX(0, abs(x) - s + + (abs(x) >> (damping - get_msb(s))))); } int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G, - int H, int s, unsigned int bd) { - int delta = 1 * constrain(A - X, s, bd) + 3 * constrain(B - X, s, bd) + - 1 * constrain(C - X, s, bd) + 3 * constrain(D - X, s, bd) + - 3 * constrain(E - X, s, bd) + 1 * constrain(F - X, s, bd) + - 3 * constrain(G - X, s, bd) + 1 * constrain(H - X, s, bd); + int H, int s, unsigned int dmp) { + int delta = 1 * constrain(A - X, s, dmp) + 3 * constrain(B - X, s, dmp) + + 1 * constrain(C - X, s, dmp) + 3 * constrain(D - X, s, dmp) + + 3 * constrain(E - X, s, dmp) + 1 * constrain(F - X, s, dmp) + + 3 * constrain(G - X, s, dmp) + 1 * constrain(H - X, s, dmp); return (8 + delta - (delta < 0)) >> 4; } void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, - unsigned int bitdepth) { + unsigned int damping) { int x, y; const int xmin = x0 - !(bt & TILE_LEFT_BOUNDARY) * 2; const int ymin = y0 - !(bt & TILE_ABOVE_BOUNDARY) * 2; @@ -53,7 +53,7 @@ const int G = src[AOMMIN(ymax, y + 1) * sstride + x]; const int H = src[AOMMIN(ymax, y + 2) * sstride + x]; const int delta = - av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, bitdepth); + av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, damping); dst[y * dstride + x] = X + delta; } } @@ -64,7 +64,7 @@ void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, - unsigned int bitdepth) { + unsigned int damping) { int x, y; const int xmin = x0 - !(bt & 
TILE_LEFT_BOUNDARY) * 2; const int ymin = y0 - !(bt & TILE_ABOVE_BOUNDARY) * 2; @@ -83,7 +83,7 @@ const int G = src[AOMMIN(ymax, y + 1) * sstride + x]; const int H = src[AOMMIN(ymax, y + 2) * sstride + x]; const int delta = - av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, bitdepth); + av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, damping); dst[y * dstride + x] = X + delta; } } @@ -91,14 +91,13 @@ #endif // Return number of filtered blocks -void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame, - const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm, - int enable_fb_flag, unsigned int strength, - unsigned int fb_size_log2, int plane, - int (*decision)(int, int, const YV12_BUFFER_CONFIG *, - const YV12_BUFFER_CONFIG *, - const AV1_COMMON *cm, int, int, int, - unsigned int, unsigned int, int8_t *)) { +void av1_clpf_frame( + const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *org, + AV1_COMMON *cm, int enable_fb_flag, unsigned int strength, + unsigned int fb_size_log2, int plane, + int (*decision)(int, int, const YV12_BUFFER_CONFIG *, + const YV12_BUFFER_CONFIG *, const AV1_COMMON *cm, int, int, + int, unsigned int, unsigned int, int8_t *, int)) { /* Constrained low-pass filter (CLPF) */ int c, k, l, m, n; const int subx = plane != AOM_PLANE_Y && frame->subsampling_x; @@ -124,6 +123,11 @@ ? (plane == AOM_PLANE_U ? frame->u_buffer : frame->v_buffer) : frame->y_buffer; uint8_t *dst_buffer; + // Damping is the filter cut-off log2 point for the constrain function. + // For instance, if the damping is 5, neighbour differences above 32 will + // be ignored and half of the strength will be applied for a difference of 16. 
+ int damping = + cm->bit_depth - 5 - (plane != AOM_PLANE_Y) + (cm->base_qindex >> 6); // Make buffer space for in-place filtering #if CONFIG_AOM_HIGHBITDEPTH @@ -169,7 +173,8 @@ decision(k, l, frame, org, cm, bs, w / bs, h / bs, strength, fb_size_log2, cm->clpf_blocks + yoff / MIN_FB_SIZE * cm->clpf_stride + - xoff / MIN_FB_SIZE))) { + xoff / MIN_FB_SIZE, + plane))) { // Iterate over all smaller blocks inside the filter block for (m = 0; m < ((h + bs - 1) >> bslog); m++) { for (n = 0; n < ((w + bs - 1) >> bslog); n++) { @@ -260,16 +265,16 @@ aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer), CONVERT_TO_SHORTPTR(dst_buffer), sstride, dstride, xpos, ypos, sizex, sizey, strength, - boundary_type, cm->bit_depth); + boundary_type, damping); } else { aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos, ypos, sizex, sizey, strength, boundary_type, - cm->bit_depth); + damping); } #else aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos, ypos, sizex, sizey, strength, boundary_type, - cm->bit_depth); + damping); #endif } }
diff --git a/av1/common/clpf.h b/av1/common/clpf.h index 1642fb3..b50b7a6 100644 --- a/av1/common/clpf.h +++ b/av1/common/clpf.h
@@ -19,7 +19,7 @@ #define MIN_FB_SIZE (1 << MIN_FB_SIZE_LOG2) int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G, - int H, int b, unsigned int bd); + int H, int b, unsigned int dmp); void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm, int enable_fb_flag, unsigned int strength, @@ -27,6 +27,6 @@ int (*decision)(int, int, const YV12_BUFFER_CONFIG *, const YV12_BUFFER_CONFIG *, const AV1_COMMON *cm, int, int, int, - unsigned int, unsigned int, int8_t *)); + unsigned int, unsigned int, int8_t *, int)); #endif
diff --git a/av1/common/clpf_simd.h b/av1/common/clpf_simd.h index 636d2a1..b36553f 100644 --- a/av1/common/clpf_simd.h +++ b/av1/common/clpf_simd.h
@@ -17,7 +17,8 @@ // Process blocks of width 8, two lines at a time, 8 bit. static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizey, - BOUNDARY_TYPE bt, unsigned int strength) { + BOUNDARY_TYPE bt, unsigned int strength, + unsigned int dmp) { const int bottom = bt & TILE_BOTTOM_BOUNDARY ? sizey - 2 : -1; const int right = !(bt & TILE_RIGHT_BOUNDARY); const int left = !(bt & TILE_LEFT_BOUNDARY); @@ -68,7 +69,7 @@ f = v128_shuffle_8(o, v128_load_aligned(f_shuff)); } - o = calc_delta(o, a, b, c, d, e, f, g, h, strength); + o = calc_delta(o, a, b, c, d, e, f, g, h, strength, dmp); v64_store_aligned(dst, v128_high_v64(o)); v64_store_aligned(dst + dstride, v128_low_v64(o)); src += sstride * 2; @@ -79,7 +80,7 @@ // As above, but with no clipping tests static void clpf_block8_noclip(const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizey, - unsigned int strength) { + unsigned int strength, unsigned int dmp) { int y; dst += x0 + y0 * dstride; @@ -102,8 +103,8 @@ v64_load_unaligned(src + 1 + sstride)); const v128 f = v128_from_v64(v64_load_unaligned(src + 2), v64_load_unaligned(src + 2 + sstride)); - const v128 o = - calc_delta(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, strength); + const v128 o = calc_delta(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, + strength, dmp); v64_store_aligned(dst, v128_high_v64(o)); v64_store_aligned(dst + dstride, v128_low_v64(o)); @@ -115,7 +116,8 @@ // Process blocks of width 4, four lines at a time, 8 bit. static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizey, - BOUNDARY_TYPE bt, unsigned int strength) { + BOUNDARY_TYPE bt, unsigned int strength, + unsigned int dmp) { const int right = !(bt & TILE_RIGHT_BOUNDARY); const int bottom = bt & TILE_BOTTOM_BOUNDARY ? 
sizey - 4 : -1; const int left = !(bt & TILE_LEFT_BOUNDARY); @@ -178,7 +180,7 @@ f = v128_shuffle_8(o, v128_load_aligned(f_shuff)); } - o = calc_delta(o, a, b, c, d, e, f, g, h, strength); + o = calc_delta(o, a, b, c, d, e, f, g, h, strength, dmp); u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12))); u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8))); u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4))); @@ -192,7 +194,7 @@ // As above, but with no clipping tests static void clpf_block4_noclip(const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizey, - unsigned int strength) { + unsigned int strength, unsigned int dmp) { int y; dst += x0 + y0 * dstride; @@ -229,7 +231,7 @@ u32_load_unaligned(src + 3 * sstride + 2)); const v128 o = calc_delta(v128_from_32(l2, l3, l4, l5), a, b, c, d, e, f, g, - h, strength); + h, strength, dmp); u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12))); u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8))); @@ -244,34 +246,34 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, - BOUNDARY_TYPE bt, unsigned int bd) { + BOUNDARY_TYPE bt, unsigned int dmp) { if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) { // Fallback to C for odd sizes: // * block widths not 4 or 8 // * block heights not a multiple of 4 if the block width is 4 aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, strength, - bt, bd); + bt, dmp); } else { if (bt) (sizex == 4 ? clpf_block4 : clpf_block8)(src, dst, sstride, dstride, x0, - y0, sizey, bt, strength); + y0, sizey, bt, strength, dmp); else (sizex == 4 ? 
clpf_block4_noclip : clpf_block8_noclip)( - src, dst, sstride, dstride, x0, y0, sizey, strength); + src, dst, sstride, dstride, x0, y0, sizey, strength, dmp); } } #if CONFIG_AOM_HIGHBITDEPTH // sign(a - b) * max(0, abs(a - b) - max(0, abs(a - b) - -// strength + (abs(a - b) >> (bd - 3 - log2(s))))) +// strength + (abs(a - b) >> (dmp - log2(s))))) SIMD_INLINE v128 constrain_hbd(v128 a, v128 b, unsigned int strength, - unsigned int bd) { + unsigned int dmp) { const v128 diff = v128_sub_16(v128_max_s16(a, b), v128_min_s16(a, b)); const v128 sign = v128_cmpeq_16(v128_min_s16(a, b), a); // -(a <= b) const v128 zero = v128_zero(); const v128 s = v128_max_s16( zero, v128_sub_16(v128_dup_16(strength), - v128_shr_u16(diff, bd - 3 - get_msb(strength)))); + v128_shr_u16(diff, dmp - get_msb(strength)))); return v128_sub_16( v128_xor(sign, v128_max_s16( @@ -280,20 +282,21 @@ sign); } -// delta = 1/16 * constrain(a, x, s, bd) + 3/16 * constrain(b, x, s, bd) + -// 1/16 * constrain(c, x, s, bd) + 3/16 * constrain(d, x, s, bd) + -// 3/16 * constrain(e, x, s, bd) + 1/16 * constrain(f, x, s, bd) + -// 3/16 * constrain(g, x, s, bd) + 1/16 * constrain(h, x, s, bd) +// delta = 1/16 * constrain(a, x, s, dmp) + 3/16 * constrain(b, x, s, dmp) + +// 1/16 * constrain(c, x, s, dmp) + 3/16 * constrain(d, x, s, dmp) + +// 3/16 * constrain(e, x, s, dmp) + 1/16 * constrain(f, x, s, dmp) + +// 3/16 * constrain(g, x, s, dmp) + 1/16 * constrain(h, x, s, dmp) SIMD_INLINE v128 calc_delta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e, v128 f, v128 g, v128 h, unsigned int s, - unsigned int bd) { + unsigned int dmp) { const v128 bdeg = v128_add_16( - v128_add_16(constrain_hbd(b, x, s, bd), constrain_hbd(d, x, s, bd)), - v128_add_16(constrain_hbd(e, x, s, bd), constrain_hbd(g, x, s, bd))); + v128_add_16(constrain_hbd(b, x, s, dmp), constrain_hbd(d, x, s, dmp)), + v128_add_16(constrain_hbd(e, x, s, dmp), constrain_hbd(g, x, s, dmp))); const v128 delta = v128_add_16( v128_add_16( - 
v128_add_16(constrain_hbd(a, x, s, bd), constrain_hbd(c, x, s, bd)), - v128_add_16(constrain_hbd(f, x, s, bd), constrain_hbd(h, x, s, bd))), + v128_add_16(constrain_hbd(a, x, s, dmp), constrain_hbd(c, x, s, dmp)), + v128_add_16(constrain_hbd(f, x, s, dmp), + constrain_hbd(h, x, s, dmp))), v128_add_16(v128_add_16(bdeg, bdeg), bdeg)); return v128_add_16( x, @@ -305,23 +308,23 @@ static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e, v128 f, v128 g, v128 h, uint16_t *dst, - unsigned int s, unsigned int bd, int dstride) { - o = calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, bd); + unsigned int s, unsigned int dmp, int dstride) { + o = calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp); v64_store_aligned(dst, v128_high_v64(o)); v64_store_aligned(dst + dstride, v128_low_v64(o)); } static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e, v128 f, v128 g, v128 h, uint16_t *dst, - unsigned int s, unsigned int bd) { - v128_store_aligned(dst, calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, bd)); + unsigned int s, unsigned int dmp) { + v128_store_aligned(dst, calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp)); } // Process blocks of width 4, two lines at time. SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizey, unsigned int strength, - BOUNDARY_TYPE bt, unsigned int bd) { + BOUNDARY_TYPE bt, unsigned int dmp) { const int right = !(bt & TILE_RIGHT_BOUNDARY); const int bottom = bt & TILE_BOTTOM_BOUNDARY ? 
sizey - 2 : -1; const int left = !(bt & TILE_LEFT_BOUNDARY); @@ -372,7 +375,7 @@ e = v128_shuffle_8(o, v128_load_aligned(e_shuff)); f = v128_shuffle_8(o, v128_load_aligned(f_shuff)); } - calc_delta_hbd4(o, a, b, c, d, e, f, g, h, dst, strength, bd, dstride); + calc_delta_hbd4(o, a, b, c, d, e, f, g, h, dst, strength, dmp, dstride); src += sstride * 2; dst += dstride * 2; } @@ -383,7 +386,7 @@ int sstride, int dstride, int x0, int y0, int sizey, unsigned int strength, - unsigned int bd) { + unsigned int dmp) { int y; dst += x0 + y0 * dstride; @@ -408,7 +411,7 @@ v64_load_unaligned(src + 2 + sstride)); calc_delta_hbd4(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, dst, - strength, bd, dstride); + strength, dmp, dstride); src += sstride * 2; dst += dstride * 2; } @@ -418,7 +421,7 @@ SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizey, unsigned int strength, BOUNDARY_TYPE bt, - unsigned int bd) { + unsigned int dmp) { const int right = !(bt & TILE_RIGHT_BOUNDARY); const int left = !(bt & TILE_LEFT_BOUNDARY); const int ymin = -!(bt & TILE_ABOVE_BOUNDARY) * 2; @@ -463,7 +466,7 @@ e = v128_shuffle_8(o, v128_load_aligned(e_shuff)); f = v128_shuffle_8(o, v128_load_aligned(f_shuff)); } - calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, bd); + calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, dmp); dst += dstride; } } @@ -472,7 +475,7 @@ SIMD_INLINE void clpf_block_hbd_noclip(const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizey, unsigned int strength, - unsigned int bd) { + unsigned int dmp) { int y; dst += x0 + y0 * dstride; @@ -489,7 +492,7 @@ const v128 e = v128_load_unaligned(src + 1); const v128 f = v128_load_unaligned(src + 2); - calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, bd); + calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, dmp); src += sstride; dst += dstride; } @@ -498,20 +501,20 @@ void SIMD_FUNC(aom_clpf_block_hbd)(const 
uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, - BOUNDARY_TYPE bt, unsigned int bd) { + BOUNDARY_TYPE bt, unsigned int dmp) { if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) { // Fallback to C for odd sizes: // * block width not 4 or 8 // * block heights not a multiple of 2 if the block width is 4 aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, - strength, bt, bd); + strength, bt, dmp); } else { if (bt) (sizex == 4 ? clpf_block_hbd4 : clpf_block_hbd)( - src, dst, sstride, dstride, x0, y0, sizey, strength, bt, bd); + src, dst, sstride, dstride, x0, y0, sizey, strength, bt, dmp); else (sizex == 4 ? clpf_block_hbd4_noclip : clpf_block_hbd_noclip)( - src, dst, sstride, dstride, x0, y0, sizey, strength, bd); + src, dst, sstride, dstride, x0, y0, sizey, strength, dmp); } } #endif
diff --git a/av1/common/clpf_simd_kernel.h b/av1/common/clpf_simd_kernel.h index 5412746..92ca340 100644 --- a/av1/common/clpf_simd_kernel.h +++ b/av1/common/clpf_simd_kernel.h
@@ -16,11 +16,12 @@ // sign(a - b) * max(0, abs(a - b) - max(0, abs(a - b) - // strength + (abs(a - b) >> (5 - log2(s))))) -SIMD_INLINE v128 constrain(v128 a, v128 b, unsigned int strength) { +SIMD_INLINE v128 constrain(v128 a, v128 b, unsigned int strength, + unsigned int damping) { const v128 diff = v128_sub_8(v128_max_u8(a, b), v128_min_u8(a, b)); const v128 sign = v128_cmpeq_8(v128_min_u8(a, b), a); // -(a <= b) const v128 s = v128_ssub_u8(v128_dup_8(strength), - v128_shr_u8(diff, 5 - get_msb(strength))); + v128_shr_u8(diff, damping - get_msb(strength))); return v128_sub_8(v128_xor(sign, v128_ssub_u8(diff, v128_ssub_u8(diff, s))), sign); } @@ -30,14 +31,15 @@ // 3/16 * constrain(e, x, s) + 1/16 * constrain(f, x, s) + // 3/16 * constrain(g, x, s) + 1/16 * constrain(h, x, s) SIMD_INLINE v128 calc_delta(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e, - v128 f, v128 g, v128 h, unsigned int s) { + v128 f, v128 g, v128 h, unsigned int s, + unsigned int dmp) { const v128 bdeg = - v128_add_8(v128_add_8(constrain(b, x, s), constrain(d, x, s)), - v128_add_8(constrain(e, x, s), constrain(g, x, s))); - const v128 delta = - v128_add_8(v128_add_8(v128_add_8(constrain(a, x, s), constrain(c, x, s)), - v128_add_8(constrain(f, x, s), constrain(h, x, s))), - v128_add_8(v128_add_8(bdeg, bdeg), bdeg)); + v128_add_8(v128_add_8(constrain(b, x, s, dmp), constrain(d, x, s, dmp)), + v128_add_8(constrain(e, x, s, dmp), constrain(g, x, s, dmp))); + const v128 delta = v128_add_8( + v128_add_8(v128_add_8(constrain(a, x, s, dmp), constrain(c, x, s, dmp)), + v128_add_8(constrain(f, x, s, dmp), constrain(h, x, s, dmp))), + v128_add_8(v128_add_8(bdeg, bdeg), bdeg)); return v128_add_8( x, v128_shr_s8( v128_add_8(v128_dup_8(8),
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c index 880ee83..57849cb 100644 --- a/av1/decoder/decodeframe.c +++ b/av1/decoder/decodeframe.c
@@ -2746,7 +2746,8 @@ UNUSED const YV12_BUFFER_CONFIG *org, UNUSED const AV1_COMMON *cm, UNUSED int block_size, UNUSED int w, UNUSED int h, UNUSED unsigned int strength, - UNUSED unsigned int fb_size_log2, int8_t *bit) { + UNUSED unsigned int fb_size_log2, int8_t *bit, + UNUSED int plane) { return *bit; }
diff --git a/av1/encoder/clpf_rdo.c b/av1/encoder/clpf_rdo.c index 4ea3989..3ef67cc 100644 --- a/av1/encoder/clpf_rdo.c +++ b/av1/encoder/clpf_rdo.c
@@ -19,7 +19,7 @@ void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, - unsigned int bd) { + unsigned int dmp) { int x, y; for (y = y0; y < y0 + size; y++) { for (x = x0; x < x0 + size; x++) { @@ -34,7 +34,7 @@ const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x]; const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x]; const int delta = - av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, bd); + av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, dmp); const int Y = X + delta; *sum0 += (O - X) * (O - X); *sum1 += (O - Y) * (O - Y); @@ -45,7 +45,7 @@ void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, - unsigned int bd) { + unsigned int dmp) { int x, y; for (y = y0; y < y0 + size; y++) { @@ -60,9 +60,9 @@ const int F = rec[y * rstride + AOMMIN(width - 1, x + 2)]; const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x]; const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x]; - const int delta1 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 1, bd); - const int delta2 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 2, bd); - const int delta3 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 4, bd); + const int delta1 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 1, dmp); + const int delta2 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 2, dmp); + const int delta3 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 4, dmp); const int F1 = X + delta1; const int F2 = X + delta2; const int F3 = X + delta3; @@ -79,7 +79,8 @@ void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, - unsigned int strength, int size, unsigned int bd) { + unsigned int strength, int size, unsigned int bd, + unsigned int dmp) { const int shift = bd - 8; int x, y; for (y 
= y0; y < y0 + size; y++) { @@ -95,7 +96,7 @@ const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift; const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x] >> shift; const int delta = av1_clpf_sample(X, A, B, C, D, E, F, G, H, - strength >> shift, bd - shift); + strength >> shift, dmp - shift); const int Y = X + delta; *sum0 += (O - X) * (O - X); *sum1 += (O - Y) * (O - Y); @@ -107,7 +108,7 @@ void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, - unsigned int bd) { + unsigned int bd, unsigned int dmp) { const int shift = bd - 8; int x, y; @@ -124,11 +125,11 @@ const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift; const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x] >> shift; const int delta1 = - av1_clpf_sample(X, A, B, C, D, E, F, G, H, 1, bd - shift); + av1_clpf_sample(X, A, B, C, D, E, F, G, H, 1, dmp - shift); const int delta2 = - av1_clpf_sample(X, A, B, C, D, E, F, G, H, 2, bd - shift); + av1_clpf_sample(X, A, B, C, D, E, F, G, H, 2, dmp - shift); const int delta3 = - av1_clpf_sample(X, A, B, C, D, E, F, G, H, 4, bd - shift); + av1_clpf_sample(X, A, B, C, D, E, F, G, H, 4, dmp - shift); const int F1 = X + delta1; const int F2 = X + delta2; const int F3 = X + delta3; @@ -144,8 +145,10 @@ int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, int block_size, int w, int h, unsigned int strength, - unsigned int fb_size_log2, int8_t *res) { + unsigned int fb_size_log2, int8_t *res, int plane) { int m, n, sum0 = 0, sum1 = 0; + int damping = + cm->bit_depth - 5 - (plane != AOM_PLANE_Y) + (cm->base_qindex >> 6); for (m = 0; m < h; m++) { for (n = 0; n < w; n++) { @@ -160,18 +163,18 @@ CONVERT_TO_SHORTPTR(org->y_buffer), rec->y_stride, org->y_stride, xpos, ypos, rec->y_crop_width, rec->y_crop_height, &sum0, &sum1, strength, - block_size, cm->bit_depth); + 
block_size, cm->bit_depth, damping); } else { aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride, org->y_stride, xpos, ypos, rec->y_crop_width, rec->y_crop_height, &sum0, &sum1, strength, - block_size, cm->bit_depth); + block_size, damping); } #else aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride, org->y_stride, xpos, ypos, rec->y_crop_width, rec->y_crop_height, &sum0, &sum1, strength, block_size, - cm->bit_depth); + damping); #endif } } @@ -214,6 +217,9 @@ plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height; int rec_stride = plane != AOM_PLANE_Y ? rec->uv_stride : rec->y_stride; int org_stride = plane != AOM_PLANE_Y ? org->uv_stride : org->y_stride; + int damping = + cm->bit_depth - 5 - (plane != AOM_PLANE_Y) + (cm->base_qindex >> 6); + sum[0] = sum[1] = sum[2] = sum[3] = sum[4] = sum[5] = sum[6] = sum[7] = 0; if (plane == AOM_PLANE_Y && fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) { @@ -270,19 +276,19 @@ ->mbmi.skip; #if CONFIG_AOM_HIGHBITDEPTH if (cm->use_highbitdepth) { - aom_clpf_detect_multi_hbd(CONVERT_TO_SHORTPTR(rec_buffer), - CONVERT_TO_SHORTPTR(org_buffer), rec_stride, - org_stride, xpos, ypos, rec_width, rec_height, - sum + skip, block_size, cm->bit_depth); + aom_clpf_detect_multi_hbd( + CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer), + rec_stride, org_stride, xpos, ypos, rec_width, rec_height, + sum + skip, block_size, cm->bit_depth, damping); } else { aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride, xpos, ypos, rec_width, rec_height, sum + skip, - block_size, cm->bit_depth); + block_size, damping); } #else aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride, xpos, ypos, rec_width, rec_height, sum + skip, - block_size, cm->bit_depth); + block_size, damping); #endif filtered |= !skip; }
diff --git a/av1/encoder/clpf_rdo.h b/av1/encoder/clpf_rdo.h index 586eed0..f92f7d2 100644 --- a/av1/encoder/clpf_rdo.h +++ b/av1/encoder/clpf_rdo.h
@@ -17,7 +17,7 @@ int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, int block_size, int w, int h, unsigned int strength, - unsigned int fb_size_log2, int8_t *res); + unsigned int fb_size_log2, int8_t *res, int plane); void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
diff --git a/av1/encoder/clpf_rdo_simd.h b/av1/encoder/clpf_rdo_simd.h index 177359f..5f61997 100644 --- a/av1/encoder/clpf_rdo_simd.h +++ b/av1/encoder/clpf_rdo_simd.h
@@ -69,7 +69,7 @@ int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, - unsigned int bd) { + unsigned int dmp) { const int bottom = height - 2 - y0; const int right = width - 8 - x0; ssd128_internal ssd0 = v128_ssd_u8_init(); @@ -78,7 +78,7 @@ if (size != 8) { // Fallback to plain C aom_clpf_detect_c(rec, org, rstride, ostride, x0, y0, width, height, sum0, - sum1, strength, size, bd); + sum1, strength, size, dmp); return; } @@ -90,8 +90,8 @@ read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r, &a, &b, &c, &d, &e, &f, &g, &h); ssd0 = v128_ssd_u8(ssd0, o, r); - ssd1 = - v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h, strength)); + ssd1 = v128_ssd_u8(ssd1, o, + calc_delta(r, a, b, c, d, e, f, g, h, strength, dmp)); rec += rstride * 2; org += ostride * 2; } @@ -102,17 +102,17 @@ SIMD_INLINE void calc_delta_multi(v128 r, v128 o, v128 a, v128 b, v128 c, v128 d, v128 e, v128 f, v128 g, v128 h, ssd128_internal *ssd1, ssd128_internal *ssd2, - ssd128_internal *ssd3) { - *ssd1 = v128_ssd_u8(*ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h, 1)); - *ssd2 = v128_ssd_u8(*ssd2, o, calc_delta(r, a, b, c, d, e, f, g, h, 2)); - *ssd3 = v128_ssd_u8(*ssd3, o, calc_delta(r, a, b, c, d, e, f, g, h, 4)); + ssd128_internal *ssd3, unsigned int dmp) { + *ssd1 = v128_ssd_u8(*ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h, 1, dmp)); + *ssd2 = v128_ssd_u8(*ssd2, o, calc_delta(r, a, b, c, d, e, f, g, h, 2, dmp)); + *ssd3 = v128_ssd_u8(*ssd3, o, calc_delta(r, a, b, c, d, e, f, g, h, 4, dmp)); } // Test multiple filter strengths at once. 
void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, - unsigned int bd) { + unsigned int dmp) { const int bottom = height - 2 - y0; const int right = width - 8 - x0; ssd128_internal ssd0 = v128_ssd_u8_init(); @@ -123,7 +123,7 @@ if (size != 8) { // Fallback to plain C aom_clpf_detect_multi_c(rec, org, rstride, ostride, x0, y0, width, height, - sum, size, bd); + sum, size, dmp); return; } @@ -135,7 +135,7 @@ read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r, &a, &b, &c, &d, &e, &f, &g, &h); ssd0 = v128_ssd_u8(ssd0, o, r); - calc_delta_multi(r, o, a, b, c, d, e, f, g, h, &ssd1, &ssd2, &ssd3); + calc_delta_multi(r, o, a, b, c, d, e, f, g, h, &ssd1, &ssd2, &ssd3, dmp); rec += 2 * rstride; org += 2 * ostride; } @@ -190,7 +190,8 @@ int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, - unsigned int bitdepth) { + unsigned int bitdepth, + unsigned int damping) { const int shift = bitdepth - 8; const int bottom = height - 2 - y0; const int right = width - 8 - x0; @@ -200,7 +201,7 @@ if (size != 8) { // Fallback to plain C aom_clpf_detect_hbd_c(rec, org, rstride, ostride, x0, y0, width, height, - sum0, sum1, strength, size, bitdepth); + sum0, sum1, strength, size, bitdepth, damping); return; } @@ -212,8 +213,8 @@ read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r, &a, &b, &c, &d, &e, &f, &g, &h, shift); ssd0 = v128_ssd_u8(ssd0, o, r); - ssd1 = v128_ssd_u8( - ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h, strength >> shift)); + ssd1 = v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h, + strength >> shift, damping)); rec += rstride * 2; org += ostride * 2; } @@ -225,7 +226,8 @@ const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, - int size, unsigned int bitdepth) { + int size, unsigned int 
bitdepth, + unsigned int damping) { const int bottom = height - 2 - y0; const int right = width - 8 - x0; ssd128_internal ssd0 = v128_ssd_u8_init(); @@ -236,7 +238,7 @@ if (size != 8) { // Fallback to plain C aom_clpf_detect_multi_hbd_c(rec, org, rstride, ostride, x0, y0, width, - height, sum, size, bitdepth); + height, sum, size, bitdepth, damping); return; } @@ -248,7 +250,8 @@ read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r, &a, &b, &c, &d, &e, &f, &g, &h, bitdepth - 8); ssd0 = v128_ssd_u8(ssd0, o, r); - calc_delta_multi(r, o, a, b, c, d, e, f, g, h, &ssd1, &ssd2, &ssd3); + calc_delta_multi(r, o, a, b, c, d, e, f, g, h, &ssd1, &ssd2, &ssd3, + damping); rec += rstride * 2; org += ostride * 2; }