CDEF: Add damping to dering

high-latency, cpu-used=0:
   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.1650 |  0.2545 |  0.2977 |  -0.0423 | -0.0947 | -0.0725 |    -0.0365

low-latency, cpu-used=0:
   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.4006 |  0.0501 | -0.0108 |  -0.1790 | -0.1660 | -0.1992 |    -0.2135

low-latency, cpu-used=4:
   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.5508 | -0.2445 | -0.2762 |  -0.1981 | -0.2878 | -0.2228 |    -0.3733

Change-Id: Ia20df28c8bbb6182215b02016053af33bd498145
diff --git a/av1/av1.cmake b/av1/av1.cmake index dcf7fd1..9c8849e 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake
@@ -227,6 +227,7 @@ "${AOM_ROOT}/av1/common/clpf.c" "${AOM_ROOT}/av1/common/clpf.h" "${AOM_ROOT}/av1/common/clpf_simd.h" + "${AOM_ROOT}/av1/common/cdef_simd.h" "${AOM_ROOT}/av1/common/cdef.c" "${AOM_ROOT}/av1/common/cdef.h" "${AOM_ROOT}/av1/common/od_dering.c"
diff --git a/av1/av1_common.mk b/av1/av1_common.mk index ddd8f7a..edccbbe 100644 --- a/av1/av1_common.mk +++ b/av1/av1_common.mk
@@ -89,6 +89,7 @@ AV1_COMMON_SRCS-yes += common/clpf.c AV1_COMMON_SRCS-yes += common/clpf.h AV1_COMMON_SRCS-yes += common/clpf_simd.h +AV1_COMMON_SRCS-yes += common/cdef_simd.h AV1_COMMON_SRCS-$(HAVE_SSE2) += common/clpf_sse2.c AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/clpf_ssse3.c AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/clpf_sse4.c
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 310e604..b600c66 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -626,8 +626,8 @@ add_proto qw/void aom_clpf_block/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd"; add_proto qw/void aom_clpf_hblock/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd"; add_proto qw/int od_dir_find8/, "const od_dering_in *img, int stride, int32_t *var, int coeff_shift"; - add_proto qw/void od_filter_dering_direction_4x4/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir"; - add_proto qw/void od_filter_dering_direction_8x8/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir"; + add_proto qw/void od_filter_dering_direction_4x4/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping"; + add_proto qw/void od_filter_dering_direction_8x8/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping"; add_proto qw/void copy_8x8_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride"; add_proto qw/void copy_4x4_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
diff --git a/av1/common/cdef.h b/av1/common/cdef.h index 5f48ca0..c0e7dfc 100644 --- a/av1/common/cdef.h +++ b/av1/common/cdef.h
@@ -23,6 +23,24 @@
 #include "av1/common/onyxc_int.h"
 #include "./od_dering.h"
 
+/* Returns -1 when i is negative and 1 otherwise (including i == 0). */
+static INLINE int sign(int i) { return i < 0 ? -1 : 1; }
+
+/* Limits a sample difference before it is accumulated into a filter sum:
+ *   sign(diff) * min(|diff|, max(0, threshold - (|diff| >> shift)))
+ * with shift = damping - get_msb(threshold).  A zero threshold yields 0.
+ * The shift is computed in signed arithmetic and clamped at zero: damping is
+ * unsigned, so a damping smaller than get_msb(threshold) would otherwise not
+ * produce a valid shift count (undefined behaviour). */
+static INLINE int constrain(int diff, int threshold, unsigned int damping) {
+  int shift;
+  int mag;
+  if (!threshold) return 0;
+  shift = AOMMAX(0, (int)damping - get_msb(threshold));
+  mag = abs(diff);
+  return sign(diff) * AOMMIN(mag, AOMMAX(0, threshold - (mag >> shift)));
+}
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/av1/common/cdef_simd.h b/av1/common/cdef_simd.h new file mode 100644 index 0000000..2649099 --- /dev/null +++ b/av1/common/cdef_simd.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_COMMON_CDEF_SIMD_H_
+#define AV1_COMMON_CDEF_SIMD_H_
+
+#include "aom_dsp/aom_simd.h"
+
+// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
+SIMD_INLINE v128 constrain16(v128 a, v128 b, unsigned int threshold,
+                             unsigned int adjdamp) {  // adjdamp: shift count
+  v128 diff = v128_sub_16(a, b);               // a - b
+  const v128 sign = v128_shr_n_s16(diff, 15);  // presumably 0 or -1 per lane
+  diff = v128_abs_s16(diff);                   // abs(a - b)
+  const v128 s =  // threshold - (abs >> adjdamp); ssub presumably floors at 0
+      v128_ssub_u16(v128_dup_16(threshold), v128_shr_u16(diff, adjdamp));
+  return v128_xor(v128_add_16(sign, v128_min_s16(diff, s)), sign);  // +/- min
+}
+
+#endif  // AV1_COMMON_CDEF_SIMD_H_
diff --git a/av1/common/clpf.c b/av1/common/clpf.c index d3fd906..3637dee 100644 --- a/av1/common/clpf.c +++ b/av1/common/clpf.c
@@ -9,18 +9,12 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "av1/common/clpf.h" +#include "./clpf.h" #include "./av1_rtcd.h" +#include "./cdef.h" #include "aom/aom_image.h" #include "aom_dsp/aom_dsp_common.h" -static int sign(int i) { return i < 0 ? -1 : 1; } - -static int constrain(int x, int s, unsigned int damping) { - return sign(x) * - AOMMIN(abs(x), AOMMAX(0, s - (abs(x) >> (damping - get_msb(s))))); -} - int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G, int H, int s, unsigned int dmp) { int delta = 1 * constrain(A - X, s, dmp) + 3 * constrain(B - X, s, dmp) +
diff --git a/av1/common/clpf_simd.h b/av1/common/clpf_simd.h index 97fe87c..a615b5e 100644 --- a/av1/common/clpf_simd.h +++ b/av1/common/clpf_simd.h
@@ -10,8 +10,9 @@ */ #include "./av1_rtcd.h" -#include "aom_ports/mem.h" +#include "./cdef_simd.h" #include "aom_ports/bitops.h" +#include "aom_ports/mem.h" // sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp))) SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength, @@ -242,17 +243,6 @@ } } -// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp))) -SIMD_INLINE v128 constrain_hbd(v128 a, v128 b, unsigned int strength, - unsigned int adjdamp) { - v128 diff = v128_sub_16(a, b); - const v128 sign = v128_shr_n_s16(diff, 15); - diff = v128_abs_s16(diff); - const v128 s = - v128_ssub_u16(v128_dup_16(strength), v128_shr_u16(diff, adjdamp)); - return v128_xor(v128_add_16(sign, v128_min_s16(diff, s)), sign); -} - // delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) + // 1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) + // 3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) + @@ -261,13 +251,12 @@ v128 f, v128 g, v128 h, unsigned int s, unsigned int dmp) { const v128 bdeg = v128_add_16( - v128_add_16(constrain_hbd(b, x, s, dmp), constrain_hbd(d, x, s, dmp)), - v128_add_16(constrain_hbd(e, x, s, dmp), constrain_hbd(g, x, s, dmp))); + v128_add_16(constrain16(b, x, s, dmp), constrain16(d, x, s, dmp)), + v128_add_16(constrain16(e, x, s, dmp), constrain16(g, x, s, dmp))); const v128 delta = v128_add_16( v128_add_16( - v128_add_16(constrain_hbd(a, x, s, dmp), constrain_hbd(c, x, s, dmp)), - v128_add_16(constrain_hbd(f, x, s, dmp), - constrain_hbd(h, x, s, dmp))), + v128_add_16(constrain16(a, x, s, dmp), constrain16(c, x, s, dmp)), + v128_add_16(constrain16(f, x, s, dmp), constrain16(h, x, s, dmp))), v128_add_16(v128_add_16(bdeg, bdeg), bdeg)); return v128_add_16( x, @@ -297,9 +286,9 @@ SIMD_INLINE v128 calc_hdelta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, unsigned int s, unsigned int dmp) { const v128 bc = - v128_add_16(constrain_hbd(b, x, s, dmp), constrain_hbd(c, x, s, dmp)); + 
v128_add_16(constrain16(b, x, s, dmp), constrain16(c, x, s, dmp)); const v128 delta = v128_add_16( - v128_add_16(constrain_hbd(a, x, s, dmp), constrain_hbd(d, x, s, dmp)), + v128_add_16(constrain16(a, x, s, dmp), constrain16(d, x, s, dmp)), v128_add_16(v128_add_16(bc, bc), bc)); return v128_add_16( x,
diff --git a/av1/common/od_dering.c b/av1/common/od_dering.c index 646a3e9..ebbc524 100644 --- a/av1/common/od_dering.c +++ b/av1/common/od_dering.c
@@ -115,7 +115,7 @@ /* Smooth in the direction detected. */ void od_filter_dering_direction_8x8_c(uint16_t *y, int ystride, const uint16_t *in, int threshold, - int dir) { + int dir, int damping) { int i; int j; int k; @@ -134,8 +134,8 @@ xx; p1 = in[i * OD_FILT_BSTRIDE + j - OD_DIRECTION_OFFSETS_TABLE[dir][k]] - xx; - if (abs(p0) < threshold) sum += taps[k] * p0; - if (abs(p1) < threshold) sum += taps[k] * p1; + sum += taps[k] * constrain(p0, threshold, damping); + sum += taps[k] * constrain(p1, threshold, damping); } sum = (sum + 8) >> 4; yy = xx + sum; @@ -147,7 +147,7 @@ /* Smooth in the direction detected. */ void od_filter_dering_direction_4x4_c(uint16_t *y, int ystride, const uint16_t *in, int threshold, - int dir) { + int dir, int damping) { int i; int j; int k; @@ -166,8 +166,8 @@ xx; p1 = in[i * OD_FILT_BSTRIDE + j - OD_DIRECTION_OFFSETS_TABLE[dir][k]] - xx; - if (abs(p0) < threshold) sum += taps[k] * p0; - if (abs(p1) < threshold) sum += taps[k] * p1; + sum += taps[k] * constrain(p0, threshold, damping); + sum += taps[k] * constrain(p1, threshold, damping); } sum = (sum + 8) >> 4; yy = xx + sum; @@ -298,6 +298,7 @@ od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES] = { od_filter_dering_direction_4x4, od_filter_dering_direction_8x8 }; + clpf_damping += coeff_shift; bsize = OD_DERING_SIZE_LOG2 - xdec; if (!skip_dering) { if (pli == 0) { @@ -325,7 +326,7 @@ (filter_dering_direction[bsize - OD_LOG_BSIZE0])( &y[bi << 2 * bsize], 1 << bsize, &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], - od_adjust_thresh(threshold, var[by][bx]), dir[by][bx]); + od_adjust_thresh(threshold, var[by][bx]), dir[by][bx], 6); } } else { for (bi = 0; bi < dering_count; bi++) { @@ -334,7 +335,7 @@ (filter_dering_direction[bsize - OD_LOG_BSIZE0])( &y[bi << 2 * bsize], 1 << bsize, &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold, - dir[by][bx]); + dir[by][bx], threshold == 0 ? 0 : get_msb(threshold) + 1); } } } @@ -356,14 +357,14 @@ dst ? 
(uint16_t *)dst + py * dstride + px : &y[bi << 2 * bsize], in + py * OD_FILT_BSTRIDE + px, dst && hbd ? dstride : 1 << bsize, OD_FILT_BSTRIDE, 1 << bsize, 1 << bsize, - clpf_strength << coeff_shift, clpf_damping + coeff_shift); + clpf_strength << coeff_shift, clpf_damping); } else { // Do clpf and write the result to an 8 bit destination (!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block : aom_clpf_hblock)( dst + py * dstride + px, in + py * OD_FILT_BSTRIDE + px, dstride, OD_FILT_BSTRIDE, 1 << bsize, 1 << bsize, - clpf_strength << coeff_shift, clpf_damping + coeff_shift); + clpf_strength << coeff_shift, clpf_damping); } } } else {
diff --git a/av1/common/od_dering.h b/av1/common/od_dering.h index 3db7150..f294dc8 100644 --- a/av1/common/od_dering.h +++ b/av1/common/od_dering.h
@@ -41,7 +41,8 @@ typedef void (*od_filter_dering_direction_func)(uint16_t *y, int ystride, const uint16_t *in, - int threshold, int dir); + int threshold, int dir, + int damping); void copy_dering_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src, dering_list *dlist, int dering_count, int bsize);
diff --git a/av1/common/od_dering_simd.h b/av1/common/od_dering_simd.h index 266cb3c..fc1981f 100644 --- a/av1/common/od_dering_simd.h +++ b/av1/common/od_dering_simd.h
@@ -10,6 +10,7 @@ */ #include "./av1_rtcd.h" +#include "./cdef_simd.h" #include "./od_dering.h" /* partial A is a 16-bit vector of the form: @@ -210,141 +211,109 @@ return best_dir; } -static INLINE v128 od_cmplt_abs_epi16(v128 in, v128 threshold) { - return v128_cmplt_s16(v128_abs_s16(in), threshold); -} - void SIMD_FUNC(od_filter_dering_direction_4x4)(uint16_t *y, int ystride, const uint16_t *in, - int threshold, int dir) { + int threshold, int dir, + int damping) { int i; - v128 sum; - v128 p; - v128 cmp; - v128 row; - v128 res; - v128 tmp; - v128 thresh; - int off1, off2; - off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0]; - off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1]; - thresh = v128_dup_16(threshold); + v128 p0, p1, sum, row, res; + int o1 = OD_DIRECTION_OFFSETS_TABLE[dir][0]; + int o2 = OD_DIRECTION_OFFSETS_TABLE[dir][1]; + + if (threshold) damping -= get_msb(threshold); for (i = 0; i < 4; i += 2) { sum = v128_zero(); - row = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE]), - v64_load_aligned(&in[i * OD_FILT_BSTRIDE])); + row = v128_from_v64(v64_load_aligned(&in[i * OD_FILT_BSTRIDE]), + v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE])); - /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/ - tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE + off1]), - v64_load_aligned(&in[i * OD_FILT_BSTRIDE + off1])); - p = v128_sub_16(tmp, row); - /*if (abs(p) < thresh) sum += taps[k]*p*/ - cmp = od_cmplt_abs_epi16(p, thresh); - p = v128_shl_n_16(p, 2); - p = v128_and(p, cmp); - sum = v128_add_16(sum, p); - /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/ - tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE - off1]), - v64_load_aligned(&in[i * OD_FILT_BSTRIDE - off1])); - p = v128_sub_16(tmp, row); - /*if (abs(p) < thresh) sum += taps[k]*p1*/ - cmp = od_cmplt_abs_epi16(p, thresh); - p = v128_shl_n_16(p, 2); - p = v128_and(p, cmp); - sum = v128_add_16(sum, p); + // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping) + p0 = 
v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE + o1]), + v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE + o1])); + p0 = constrain16(p0, row, threshold, damping); - /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/ - tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE + off2]), - v64_load_aligned(&in[i * OD_FILT_BSTRIDE + off2])); - p = v128_sub_16(tmp, row); - /*if (abs(p) < thresh) sum += taps[k]*p*/ - cmp = od_cmplt_abs_epi16(p, thresh); - p = v128_and(p, cmp); - sum = v128_add_16(sum, p); - /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/ - tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE - off2]), - v64_load_aligned(&in[i * OD_FILT_BSTRIDE - off2])); - p = v128_sub_16(tmp, row); - /*if (abs(p) < thresh) sum += taps[k]*p1*/ - cmp = od_cmplt_abs_epi16(p, thresh); - p = v128_and(p, cmp); - sum = v128_add_16(sum, p); + // p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping) + p1 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE - o1]), + v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE - o1])); + p1 = constrain16(p1, row, threshold, damping); - /*res = row + ((sum + 8) >> 4)*/ + // sum += 4 * (p0 + p1) + sum = v128_add_16(sum, v128_shl_n_16(v128_add_16(p0, p1), 2)); + + // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping) + p0 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE + o2]), + v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE + o2])); + p0 = constrain16(p0, row, threshold, damping); + + // p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping) + p1 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE - o2]), + v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE - o2])); + p1 = constrain16(p1, row, threshold, damping); + + // sum += 1 * (p0 + p1) + sum = v128_add_16(sum, v128_add_16(p0, p1)); + + // res = row + ((sum + 8) >> 4) res = v128_add_16(sum, v128_dup_16(8)); res = v128_shr_n_s16(res, 4); res = v128_add_16(row, res); - 
v64_store_aligned(&y[i * ystride], v128_low_v64(res)); - v64_store_aligned(&y[(i + 1) * ystride], v128_high_v64(res)); + v64_store_aligned(&y[i * ystride], v128_high_v64(res)); + v64_store_aligned(&y[(i + 1) * ystride], v128_low_v64(res)); } } void SIMD_FUNC(od_filter_dering_direction_8x8)(uint16_t *y, int ystride, const uint16_t *in, - int threshold, int dir) { + int threshold, int dir, + int damping) { int i; - v128 sum; - v128 p0, p1; - v128 cmp; - v128 row; - v128 res; - v128 thresh; - int off1, off2, off3; - off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0]; - off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1]; - off3 = OD_DIRECTION_OFFSETS_TABLE[dir][2]; - thresh = v128_dup_16(threshold); + v128 sum, p0, p1, row, res; + int o1 = OD_DIRECTION_OFFSETS_TABLE[dir][0]; + int o2 = OD_DIRECTION_OFFSETS_TABLE[dir][1]; + int o3 = OD_DIRECTION_OFFSETS_TABLE[dir][2]; + + if (threshold) damping -= get_msb(threshold); for (i = 0; i < 8; i++) { sum = v128_zero(); row = v128_load_aligned(&in[i * OD_FILT_BSTRIDE]); - /*p0 = in[i*OD_FILT_BSTRIDE + offset] - row*/ - p0 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off1]), row); - /*p0 = abs(p0) < thresh ? p0 : 0*/ - cmp = od_cmplt_abs_epi16(p0, thresh); - p0 = v128_and(p0, cmp); + // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping) + p0 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + o1]); + p0 = constrain16(p0, row, threshold, damping); - /*p1 = in[i*OD_FILT_BSTRIDE - offset] - row*/ - p1 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off1]), row); - /*p1 = abs(p1) < thresh ? 
p1 : 0*/ - cmp = od_cmplt_abs_epi16(p1, thresh); - p1 = v128_and(p1, cmp); - /*sum += 3*(p0 + p1)*/ + // p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping) + p1 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - o1]); + p1 = constrain16(p1, row, threshold, damping); + + // sum += 3 * (p0 + p1) p0 = v128_add_16(p0, p1); p0 = v128_add_16(p0, v128_shl_n_16(p0, 1)); sum = v128_add_16(sum, p0); - /*p0 = in[i*OD_FILT_BSTRIDE + offset] - row*/ - p0 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off2]), row); - /*p0 = abs(p0) < thresh ? p0 : 0*/ - cmp = od_cmplt_abs_epi16(p0, thresh); - p0 = v128_and(p0, cmp); + // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping) + p0 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + o2]); + p0 = constrain16(p0, row, threshold, damping); - /*p1 = in[i*OD_FILT_BSTRIDE - offset] - row*/ - p1 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off2]), row); - /*p1 = abs(p1) < thresh ? p1 : 0*/ - cmp = od_cmplt_abs_epi16(p1, thresh); - p1 = v128_and(p1, cmp); - /* sum += 2*(p0 + p1)*/ + // p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping) + p1 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - o2]); + p1 = constrain16(p1, row, threshold, damping); + + // sum += 2 * (p0 + p1) p0 = v128_shl_n_16(v128_add_16(p0, p1), 1); sum = v128_add_16(sum, p0); - /*p0 = in[i*OD_FILT_BSTRIDE + offset] - row*/ - p0 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off3]), row); - /*p0 = abs(p0) < thresh ? p0 : 0*/ - cmp = od_cmplt_abs_epi16(p0, thresh); - p0 = v128_and(p0, cmp); + // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping) + p0 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + o3]); + p0 = constrain16(p0, row, threshold, damping); - /*p1 = in[i*OD_FILT_BSTRIDE - offset] - row*/ - p1 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off3]), row); - /*p1 = abs(p1) < thresh ? 
p1 : 0*/ - cmp = od_cmplt_abs_epi16(p1, thresh); - p1 = v128_and(p1, cmp); - /*sum += (p0 + p1)*/ + // p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping) + p1 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - o3]); + p1 = constrain16(p1, row, threshold, damping); + + // sum += (p0 + p1) p0 = v128_add_16(p0, p1); sum = v128_add_16(sum, p0); - /*res = row + ((sum + 8) >> 4)*/ + // res = row + ((sum + 8) >> 4) res = v128_add_16(sum, v128_dup_16(8)); res = v128_shr_n_s16(res, 4); res = v128_add_16(row, res);
diff --git a/test/dering_test.cc b/test/dering_test.cc index 9b225ef..195a60f 100644 --- a/test/dering_test.cc +++ b/test/dering_test.cc
@@ -27,10 +27,9 @@ namespace { -typedef void (*dering_dir_t)(uint16_t *y, int ystride, const uint16_t *in, - int threshold, int dir); - -typedef std::tr1::tuple<dering_dir_t, dering_dir_t, int> dering_dir_param_t; +typedef std::tr1::tuple<od_filter_dering_direction_func, + od_filter_dering_direction_func, int> + dering_dir_param_t; class CDEFDeringDirTest : public ::testing::TestWithParam<dering_dir_param_t> { public: @@ -45,18 +44,15 @@ protected: int bsize; - dering_dir_t dering; - dering_dir_t ref_dering; + od_filter_dering_direction_func dering; + od_filter_dering_direction_func ref_dering; }; typedef CDEFDeringDirTest CDEFDeringSpeedTest; void test_dering(int bsize, int iterations, - void (*dering)(uint16_t *y, int ystride, const uint16_t *in, - int threshold, int dir), - void (*ref_dering)(uint16_t *y, int ystride, - const uint16_t *in, int threshold, - int dir)) { + od_filter_dering_direction_func dering, + od_filter_dering_direction_func ref_dering) { const int size = 8; const int ysize = size + 2 * OD_FILT_VBORDER; ACMRandom rnd(ACMRandom::DeterministicSeed()); @@ -67,60 +63,65 @@ memset(d, 0, sizeof(d)); int error = 0, threshold = 0, dir; - int boundary, depth, bits, level, count, errdepth = 0, errthreshold = 0, - errboundary = 0; + int boundary, damping, depth, bits, level, count, + errdepth = 0, errthreshold = 0, errboundary = 0, errdamping = 0; unsigned int pos = 0; for (boundary = 0; boundary < 16; boundary++) { for (depth = 8; depth <= 12; depth += 2) { - for (count = 0; count < iterations; count++) { - for (level = 0; level < (1 << depth) && !error; - level += (1 + 4 * !!boundary) << (depth - 8)) { - for (bits = 1; bits <= depth && !error; bits++) { - for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++) - s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0, - (1 << depth) - 1); - if (boundary) { - if (boundary & 1) { // Left - for (int i = 0; i < ysize; i++) - for (int j = 0; j < OD_FILT_HBORDER; j++) - s[i * OD_FILT_BSTRIDE + j] = 
OD_DERING_VERY_LARGE; + for (damping = 5 + depth - 8; damping < 7 + depth - 8; damping++) { + for (count = 0; count < iterations; count++) { + for (level = 0; level < (1 << depth) && !error; + level += (1 + 4 * !!boundary) << (depth - 8)) { + for (bits = 1; bits <= depth && !error; bits++) { + for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++) + s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0, + (1 << depth) - 1); + if (boundary) { + if (boundary & 1) { // Left + for (int i = 0; i < ysize; i++) + for (int j = 0; j < OD_FILT_HBORDER; j++) + s[i * OD_FILT_BSTRIDE + j] = OD_DERING_VERY_LARGE; + } + if (boundary & 2) { // Right + for (int i = 0; i < ysize; i++) + for (int j = OD_FILT_HBORDER + size; j < OD_FILT_BSTRIDE; + j++) + s[i * OD_FILT_BSTRIDE + j] = OD_DERING_VERY_LARGE; + } + if (boundary & 4) { // Above + for (int i = 0; i < OD_FILT_VBORDER; i++) + for (int j = 0; j < OD_FILT_BSTRIDE; j++) + s[i * OD_FILT_BSTRIDE + j] = OD_DERING_VERY_LARGE; + } + if (boundary & 8) { // Below + for (int i = OD_FILT_VBORDER + size; i < ysize; i++) + for (int j = 0; j < OD_FILT_BSTRIDE; j++) + s[i * OD_FILT_BSTRIDE + j] = OD_DERING_VERY_LARGE; + } } - if (boundary & 2) { // Right - for (int i = 0; i < ysize; i++) - for (int j = OD_FILT_HBORDER + size; j < OD_FILT_BSTRIDE; j++) - s[i * OD_FILT_BSTRIDE + j] = OD_DERING_VERY_LARGE; - } - if (boundary & 4) { // Above - for (int i = 0; i < OD_FILT_VBORDER; i++) - for (int j = 0; j < OD_FILT_BSTRIDE; j++) - s[i * OD_FILT_BSTRIDE + j] = OD_DERING_VERY_LARGE; - } - if (boundary & 8) { // Below - for (int i = OD_FILT_VBORDER + size; i < ysize; i++) - for (int j = 0; j < OD_FILT_BSTRIDE; j++) - s[i * OD_FILT_BSTRIDE + j] = OD_DERING_VERY_LARGE; - } - } - for (dir = 0; dir < 8; dir++) { - for (threshold = 0; threshold < 64 << (depth - 8) && !error; - threshold += !error << (depth - 8)) { - ref_dering(ref_d, size, s + OD_FILT_HBORDER + - OD_FILT_VBORDER * OD_FILT_BSTRIDE, - threshold, dir); - // If dering and ref_dering 
are the same, we're just testing - // speed - if (dering != ref_dering) - ASM_REGISTER_STATE_CHECK(dering( - d, size, - s + OD_FILT_HBORDER + OD_FILT_VBORDER * OD_FILT_BSTRIDE, - threshold, dir)); - if (ref_dering != dering) { - for (pos = 0; pos < sizeof(d) / sizeof(*d) && !error; pos++) { - error = ref_d[pos] != d[pos]; - errdepth = depth; - errthreshold = threshold; - errboundary = boundary; + for (dir = 0; dir < 8; dir++) { + for (threshold = 0; threshold < 64 << (depth - 8) && !error; + threshold += (1 + 4 * !!boundary) << (depth - 8)) { + ref_dering(ref_d, size, s + OD_FILT_HBORDER + + OD_FILT_VBORDER * OD_FILT_BSTRIDE, + threshold, dir, damping); + // If dering and ref_dering are the same, we're just testing + // speed + if (dering != ref_dering) + ASM_REGISTER_STATE_CHECK(dering( + d, size, + s + OD_FILT_HBORDER + OD_FILT_VBORDER * OD_FILT_BSTRIDE, + threshold, dir, damping)); + if (ref_dering != dering) { + for (pos = 0; pos < sizeof(d) / sizeof(*d) && !error; + pos++) { + error = ref_d[pos] != d[pos]; + errdepth = depth; + errthreshold = threshold; + errboundary = boundary; + errdamping = damping; + } } } } @@ -138,6 +139,7 @@ << " (" << (int16_t)ref_d[pos] << " : " << (int16_t)d[pos] << ") " << std::endl << "threshold: " << errthreshold << std::endl + << "damping: " << errdamping << std::endl << "depth: " << errdepth << std::endl << "size: " << bsize << std::endl << "boundary: " << errboundary << std::endl @@ -145,12 +147,8 @@ } void test_dering_speed(int bsize, int iterations, - void (*dering)(uint16_t *y, int ystride, - const uint16_t *in, int threshold, - int dir), - void (*ref_dering)(uint16_t *y, int ystride, - const uint16_t *in, int threshold, - int dir)) { + od_filter_dering_direction_func dering, + od_filter_dering_direction_func ref_dering) { aom_usec_timer ref_timer; aom_usec_timer timer;