Add experiment CONFIG_CDEF_SINGLEPASS: Make CDEF single pass
Low latency, cpu-used=0:
PSNR | PSNR Cb | PSNR Cr | PSNR HVS | SSIM | MS SSIM | CIEDE 2000
-0.3162 | -0.6719 | -0.6535 | 0.0089 | -0.3890 | -0.1515 | -0.6682
High latency, cpu-used=0:
PSNR | PSNR Cb | PSNR Cr | PSNR HVS | SSIM | MS SSIM | CIEDE 2000
-0.0293 | -0.3556 | -0.5505 | 0.0684 | -0.0862 | 0.0513 | -0.2765
Low latency, cpu-used=4:
PSNR | PSNR Cb | PSNR Cr | PSNR HVS | SSIM | MS SSIM | CIEDE 2000
-0.2248 | -0.7764 | -0.6630 | -0.2109 | -0.3240 | -0.2532 | -0.6980
High latency, cpu-used=4:
PSNR | PSNR Cb | PSNR Cr | PSNR HVS | SSIM | MS SSIM | CIEDE 2000
-0.1118 | -0.5841 | -0.7406 | -0.0463 | -0.2442 | -0.1064 | -0.4187
Change-Id: I9ca8399c8f45489541a66f535fb3d771eb1d59ab
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 46e55bb..e7d7e10 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -236,13 +236,18 @@
if (CONFIG_CDEF)
set(AOM_AV1_COMMON_SOURCES
${AOM_AV1_COMMON_SOURCES}
- "${AOM_ROOT}/av1/common/clpf.c"
- "${AOM_ROOT}/av1/common/clpf_simd.h"
"${AOM_ROOT}/av1/common/cdef.c"
"${AOM_ROOT}/av1/common/cdef.h"
"${AOM_ROOT}/av1/common/cdef_block.c"
"${AOM_ROOT}/av1/common/cdef_block.h"
"${AOM_ROOT}/av1/common/cdef_block_simd.h")
+ # The CLPF implementation is only used by the two-pass CDEF; the
+ # single-pass experiment replaces it (cdef_block_simd.h stays listed
+ # unconditionally because cdef_block_avx2.c and the other SIMD
+ # translation units include it in both configurations).
+ if (NOT CONFIG_CDEF_SINGLEPASS)
+ set(AOM_AV1_COMMON_SOURCES
+ ${AOM_AV1_COMMON_SOURCES}
+ "${AOM_ROOT}/av1/common/clpf.c"
+ "${AOM_ROOT}/av1/common/clpf_simd.h")
+ endif ()
set(AOM_AV1_ENCODER_SOURCES
${AOM_AV1_ENCODER_SOURCES}
@@ -250,22 +252,37 @@
set(AOM_AV1_COMMON_INTRIN_SSE2
${AOM_AV1_COMMON_INTRIN_SSE2}
- "${AOM_ROOT}/av1/common/clpf_sse2.c"
"${AOM_ROOT}/av1/common/cdef_block_sse2.c")

set(AOM_AV1_COMMON_INTRIN_SSSE3
${AOM_AV1_COMMON_INTRIN_SSSE3}
- "${AOM_ROOT}/av1/common/clpf_ssse3.c"
"${AOM_ROOT}/av1/common/cdef_block_ssse3.c")

set(AOM_AV1_COMMON_INTRIN_SSE4_1
${AOM_AV1_COMMON_INTRIN_SSE4_1}
- "${AOM_ROOT}/av1/common/clpf_sse4.c"
"${AOM_ROOT}/av1/common/cdef_block_sse4.c")

set(AOM_AV1_COMMON_INTRIN_NEON
${AOM_AV1_COMMON_INTRIN_NEON}
- "${AOM_ROOT}/av1/common/clpf_neon.c"
"${AOM_ROOT}/av1/common/cdef_block_neon.c")

+ # AVX2 kernels exist only for the single-pass filter (matches
+ # av1_common.mk, which gates cdef_block_avx2.c on
+ # CONFIG_CDEF_SINGLEPASS); the legacy CLPF SIMD files are only
+ # needed by the two-pass filter.
+ if (CONFIG_CDEF_SINGLEPASS)
+ set(AOM_AV1_COMMON_INTRIN_AVX2
+ ${AOM_AV1_COMMON_INTRIN_AVX2}
+ "${AOM_ROOT}/av1/common/cdef_block_avx2.c")
+ else ()
+ set(AOM_AV1_COMMON_INTRIN_SSE2
+ ${AOM_AV1_COMMON_INTRIN_SSE2}
+ "${AOM_ROOT}/av1/common/clpf_sse2.c")
+ set(AOM_AV1_COMMON_INTRIN_SSSE3
+ ${AOM_AV1_COMMON_INTRIN_SSSE3}
+ "${AOM_ROOT}/av1/common/clpf_ssse3.c")
+ set(AOM_AV1_COMMON_INTRIN_SSE4_1
+ ${AOM_AV1_COMMON_INTRIN_SSE4_1}
+ "${AOM_ROOT}/av1/common/clpf_sse4.c")
+ set(AOM_AV1_COMMON_INTRIN_NEON
+ ${AOM_AV1_COMMON_INTRIN_NEON}
+ "${AOM_ROOT}/av1/common/clpf_neon.c")
+ endif ()
endif ()
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 5411229..d5d40af 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -90,12 +90,16 @@
AV1_COMMON_SRCS-yes += common/warped_motion.c
endif
ifeq ($(CONFIG_CDEF),yes)
+ifeq ($(CONFIG_CDEF_SINGLEPASS),yes)
+AV1_COMMON_SRCS-$(HAVE_AVX2) += common/cdef_block_avx2.c
+else
AV1_COMMON_SRCS-yes += common/clpf.c
AV1_COMMON_SRCS-yes += common/clpf_simd.h
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/clpf_sse2.c
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/clpf_ssse3.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/clpf_sse4.c
AV1_COMMON_SRCS-$(HAVE_NEON) += common/clpf_neon.c
+endif
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/cdef_block_sse2.c
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/cdef_block_ssse3.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/cdef_block_sse4.c
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index edc07f8..dc6a844 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -520,18 +520,22 @@
# Deringing Functions
if (aom_config("CONFIG_CDEF") eq "yes") {
- add_proto qw/void aom_clpf_block_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
- add_proto qw/void aom_clpf_hblock_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
- add_proto qw/void aom_clpf_block/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
- add_proto qw/void aom_clpf_hblock/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
add_proto qw/int cdef_find_dir/, "const uint16_t *img, int stride, int32_t *var, int coeff_shift";
- add_proto qw/void cdef_direction_4x4/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping";
- add_proto qw/void cdef_direction_8x8/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping";
+ if (aom_config("CONFIG_CDEF_SINGLEPASS") ne "yes") {
+ add_proto qw/void aom_clpf_block_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
+ add_proto qw/void aom_clpf_hblock_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
+ add_proto qw/void aom_clpf_block/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
+ add_proto qw/void aom_clpf_hblock/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
+ add_proto qw/void cdef_direction_4x4/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping";
+ add_proto qw/void cdef_direction_8x8/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping";
+ add_proto qw/void copy_8x8_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
+ add_proto qw/void copy_4x4_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
+ add_proto qw/void copy_8x8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride";
+ add_proto qw/void copy_4x4_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride";
+ } else {
+ add_proto qw/void cdef_filter_block/, "uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max";
+ }
- add_proto qw/void copy_8x8_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
- add_proto qw/void copy_4x4_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
- add_proto qw/void copy_8x8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride";
- add_proto qw/void copy_4x4_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride";
add_proto qw/void copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h";
add_proto qw/void copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h";
@@ -539,20 +543,27 @@
# structs as arguments, which makes the v256 type of the intrinsics
# hard to support, so optimizations for this target are disabled.
if ($opts{config} !~ /libs-x86-win32-vs.*/) {
- specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
- specialize qw/aom_clpf_hblock_hbd sse2 ssse3 sse4_1 neon/;
- specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
- specialize qw/aom_clpf_hblock sse2 ssse3 sse4_1 neon/;
- specialize qw/cdef_find_dir sse2 ssse3 sse4_1 neon/;
- specialize qw/cdef_direction_4x4 sse2 ssse3 sse4_1 neon/;
- specialize qw/cdef_direction_8x8 sse2 ssse3 sse4_1 neon/;
+ if (aom_config("CONFIG_CDEF_SINGLEPASS") eq "yes") {
+ specialize qw/cdef_find_dir sse2 ssse3 sse4_1 avx2 neon/;
+ specialize qw/cdef_filter_block sse2 ssse3 sse4_1 avx2 neon/;
+ specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
+ specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
+ } else {
+ specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
+ specialize qw/aom_clpf_hblock_hbd sse2 ssse3 sse4_1 neon/;
+ specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
+ specialize qw/aom_clpf_hblock sse2 ssse3 sse4_1 neon/;
+ specialize qw/cdef_find_dir sse2 ssse3 sse4_1 neon/;
+ specialize qw/cdef_direction_4x4 sse2 ssse3 sse4_1 neon/;
+ specialize qw/cdef_direction_8x8 sse2 ssse3 sse4_1 neon/;
- specialize qw/copy_8x8_16bit_to_8bit sse2 ssse3 sse4_1 neon/;
- specialize qw/copy_4x4_16bit_to_8bit sse2 ssse3 sse4_1 neon/;
- specialize qw/copy_8x8_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
- specialize qw/copy_4x4_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
- specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 neon/;
- specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
+ specialize qw/copy_8x8_16bit_to_8bit sse2 ssse3 sse4_1 neon/;
+ specialize qw/copy_4x4_16bit_to_8bit sse2 ssse3 sse4_1 neon/;
+ specialize qw/copy_8x8_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
+ specialize qw/copy_4x4_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
+ specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 neon/;
+ specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
+ }
}
}
diff --git a/av1/common/cdef.c b/av1/common/cdef.c
index 8bb3874..8417fea 100644
--- a/av1/common/cdef.c
+++ b/av1/common/cdef.c
@@ -260,14 +260,21 @@
uv_sec_strength == 0) ||
(cdef_count = sb_compute_cdef_list(
cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist,
- get_filter_skip(level) || get_filter_skip(uv_level))) == 0) {
+#if CONFIG_CDEF_SINGLEPASS
+ (level & 1) || (uv_level & 1))) == 0)
+#else
+ get_filter_skip(level) || get_filter_skip(uv_level))) == 0)
+#endif
+ {
cdef_left = 0;
continue;
}
curr_row_cdef[fbc] = 1;
for (pli = 0; pli < nplanes; pli++) {
+#if !CONFIG_CDEF_SINGLEPASS
uint16_t dst[CDEF_BLOCKSIZE * CDEF_BLOCKSIZE];
+#endif
int coffset;
int rend, cend;
int pri_damping = cm->cdef_pri_damping;
@@ -386,15 +393,28 @@
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth) {
cdef_filter_fb(
+#if CONFIG_CDEF_SINGLEPASS
+ NULL,
+ &CONVERT_TO_SHORTPTR(
+#else
(uint8_t *)&CONVERT_TO_SHORTPTR(
+#endif
xd->plane[pli]
.dst.buf)[xd->plane[pli].dst.stride *
(MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
(fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
+#if CONFIG_CDEF_SINGLEPASS
+ xd->plane[pli].dst.stride,
+#else
xd->plane[pli].dst.stride, dst,
+#endif
&src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
+#if CONFIG_CDEF_SINGLEPASS
+ sec_strength, pri_damping, sec_damping, coeff_shift);
+#else
sec_strength, sec_damping, pri_damping, coeff_shift, 0, 1);
+#endif
} else {
#endif
cdef_filter_fb(
@@ -402,10 +422,18 @@
.dst.buf[xd->plane[pli].dst.stride *
(MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
(fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
+#if CONFIG_CDEF_SINGLEPASS
+ NULL, xd->plane[pli].dst.stride,
+#else
xd->plane[pli].dst.stride, dst,
+#endif
&src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
+#if CONFIG_CDEF_SINGLEPASS
+ sec_strength, pri_damping, sec_damping, coeff_shift);
+#else
sec_strength, sec_damping, pri_damping, coeff_shift, 0, 0);
+#endif
#if CONFIG_HIGHBITDEPTH
}
diff --git a/av1/common/cdef_block.c b/av1/common/cdef_block.c
index 3fe836a..874644a 100644
--- a/av1/common/cdef_block.c
+++ b/av1/common/cdef_block.c
@@ -21,6 +21,7 @@
#include "./cdef.h"
/* Generated from gen_filter_tables.c. */
+#if !CONFIG_CDEF_SINGLEPASS || CDEF_FULL
const int cdef_directions[8][3] = {
{ -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2, -3 * CDEF_BSTRIDE + 3 },
{ 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2, -1 * CDEF_BSTRIDE + 3 },
@@ -31,6 +32,18 @@
{ 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0, 3 * CDEF_BSTRIDE + 0 },
{ 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1, 3 * CDEF_BSTRIDE - 1 }
};
+#else
+const int cdef_directions[8][2] = {
+ { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 },
+ { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 },
+ { 0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2 },
+ { 0 * CDEF_BSTRIDE + 1, 1 * CDEF_BSTRIDE + 2 },
+ { 1 * CDEF_BSTRIDE + 1, 2 * CDEF_BSTRIDE + 2 },
+ { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 1 },
+ { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 },
+ { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 }
+};
+#endif
/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
The search minimizes the weighted variance along all the lines in a
@@ -110,6 +123,94 @@
return best_dir;
}
+#if CONFIG_CDEF_SINGLEPASS
+#if CDEF_FULL
+const int cdef_pri_taps[2][3] = { { 3, 2, 1 }, { 2, 2, 2 } };
+const int cdef_sec_taps[2][2] = { { 3, 1 }, { 3, 1 } };
+#else
+const int cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };
+const int cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } };
+#endif
+
+/* Smooth in the direction detected. */
+#if CDEF_CAP
+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride,
+ const uint16_t *in, int pri_strength, int sec_strength,
+ int dir, int pri_damping, int sec_damping, int bsize,
+ UNUSED int max_unused)
+#else
+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride,
+ const uint16_t *in, int pri_strength, int sec_strength,
+ int dir, int pri_damping, int sec_damping, int bsize,
+ int max)
+#endif
+{
+ int i, j, k;
+ const int s = CDEF_BSTRIDE;
+ const int *pri_taps = cdef_pri_taps[pri_strength & 1];
+ const int *sec_taps = cdef_sec_taps[pri_strength & 1];
+ for (i = 0; i < 4 << (bsize == BLOCK_8X8); i++) {
+ for (j = 0; j < 4 << (bsize == BLOCK_8X8); j++) {
+ int16_t sum = 0;
+ int16_t y;
+ int16_t x = in[i * s + j];
+#if CDEF_CAP
+ int max = x;
+ int min = x;
+#endif
+#if CDEF_FULL
+ for (k = 0; k < 3; k++)
+#else
+ for (k = 0; k < 2; k++)
+#endif
+ {
+ int16_t p0 = in[i * s + j + cdef_directions[dir][k]];
+ int16_t p1 = in[i * s + j - cdef_directions[dir][k]];
+ sum += pri_taps[k] * constrain(p0 - x, pri_strength, pri_damping);
+ sum += pri_taps[k] * constrain(p1 - x, pri_strength, pri_damping);
+#if CDEF_CAP
+ if (p0 != CDEF_VERY_LARGE) max = AOMMAX(p0, max);
+ if (p1 != CDEF_VERY_LARGE) max = AOMMAX(p1, max);
+ min = AOMMIN(p0, min);
+ min = AOMMIN(p1, min);
+#endif
+#if CDEF_FULL
+ if (k == 2) continue;
+#endif
+ int16_t s0 = in[i * s + j + cdef_directions[(dir + 2) & 7][k]];
+ int16_t s1 = in[i * s + j - cdef_directions[(dir + 2) & 7][k]];
+ int16_t s2 = in[i * s + j + cdef_directions[(dir + 6) & 7][k]];
+ int16_t s3 = in[i * s + j - cdef_directions[(dir + 6) & 7][k]];
+#if CDEF_CAP
+ if (s0 != CDEF_VERY_LARGE) max = AOMMAX(s0, max);
+ if (s1 != CDEF_VERY_LARGE) max = AOMMAX(s1, max);
+ if (s2 != CDEF_VERY_LARGE) max = AOMMAX(s2, max);
+ if (s3 != CDEF_VERY_LARGE) max = AOMMAX(s3, max);
+ min = AOMMIN(s0, min);
+ min = AOMMIN(s1, min);
+ min = AOMMIN(s2, min);
+ min = AOMMIN(s3, min);
+#endif
+ sum += sec_taps[k] * constrain(s0 - x, sec_strength, sec_damping);
+ sum += sec_taps[k] * constrain(s1 - x, sec_strength, sec_damping);
+ sum += sec_taps[k] * constrain(s2 - x, sec_strength, sec_damping);
+ sum += sec_taps[k] * constrain(s3 - x, sec_strength, sec_damping);
+ }
+#if CDEF_CAP
+ y = clamp((int16_t)x + ((8 + sum - (sum < 0)) >> 4), min, max);
+#else
+ y = clamp((int16_t)x + ((8 + sum - (sum < 0)) >> 4), 0, max);
+#endif
+ if (dst8)
+ dst8[i * dstride + j] = (uint8_t)y;
+ else
+ dst16[i * dstride + j] = (uint16_t)y;
+ }
+ }
+}
+
+#else
+
/* Smooth in the direction detected. */
void cdef_direction_8x8_c(uint16_t *y, int ystride, const uint16_t *in,
int threshold, int dir, int damping) {
@@ -167,6 +268,7 @@
}
}
}
+#endif
/* Compute the primary filter strength for an 8x8 block based on the
directional variance difference. A high variance difference means
@@ -180,6 +282,7 @@
return var ? (strength * (4 + i) + 8) >> 4 : 0;
}
+#if !CONFIG_CDEF_SINGLEPASS
void copy_8x8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src,
int sstride) {
int i, j;
@@ -303,25 +406,56 @@
cdef_list *dlist, int cdef_count, int level,
int sec_strength, int sec_damping, int pri_damping,
int coeff_shift, int skip_dering, int hbd) {
+#else
+
+void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
+ int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
+ int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
+ cdef_list *dlist, int cdef_count, int level,
+ int sec_strength, int pri_damping, int sec_damping,
+ int coeff_shift) {
+#endif
int bi;
int bx;
int by;
int bsize, bsizex, bsizey;
+#if CONFIG_CDEF_SINGLEPASS
+ int pri_strength = (level >> 1) << coeff_shift;
+ int filter_skip = level & 1;
+ if (!pri_strength && !sec_strength && filter_skip) {
+ pri_strength = 19 << coeff_shift;
+ sec_strength = 7 << coeff_shift;
+ }
+#else
int threshold = (level >> 1) << coeff_shift;
int filter_skip = get_filter_skip(level);
if (level == 1) threshold = 31 << coeff_shift;
cdef_direction_func cdef_direction[] = { cdef_direction_4x4,
cdef_direction_8x8 };
+#endif
sec_damping += coeff_shift - (pli != AOM_PLANE_Y);
pri_damping += coeff_shift - (pli != AOM_PLANE_Y);
bsize =
ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8);
bsizex = 3 - xdec;
bsizey = 3 - ydec;
-
- if (!skip_dering) {
+#if CONFIG_CDEF_SINGLEPASS
+ if (dirinit && pri_strength == 0 && sec_strength == 0)
+#else
+ if (!skip_dering)
+#endif
+ {
+#if CONFIG_CDEF_SINGLEPASS
+ // If we're here, both primary and secondary strengths are 0, and
+ // we still haven't written anything to y[] yet, so we just copy
+ // the input to y[]. This is necessary only for av1_cdef_search()
+ // and only av1_cdef_search() sets dirinit.
+ for (bi = 0; bi < cdef_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+#else
if (pli == 0) {
if (!dirinit || !*dirinit) {
for (bi = 0; bi < cdef_count; bi++) {
@@ -394,12 +528,56 @@
for (bi = 0; bi < cdef_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
+#endif
int iy, ix;
// TODO(stemidts/jmvalin): SIMD optimisations
for (iy = 0; iy < 1 << bsizey; iy++)
for (ix = 0; ix < 1 << bsizex; ix++)
+#if CONFIG_CDEF_SINGLEPASS
+ dst16[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] =
+#else
y[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] =
+#endif
in[((by << bsizey) + iy) * CDEF_BSTRIDE + (bx << bsizex) + ix];
}
+#if CONFIG_CDEF_SINGLEPASS
+ return;
+#endif
}
+
+#if CONFIG_CDEF_SINGLEPASS
+ if (pli == 0) {
+ if (!dirinit || !*dirinit) {
+ for (bi = 0; bi < cdef_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ dir[by][bx] = cdef_find_dir(&in[8 * by * CDEF_BSTRIDE + 8 * bx],
+ CDEF_BSTRIDE, &var[by][bx], coeff_shift);
+ }
+ if (dirinit) *dirinit = 1;
+ }
+ }
+
+ assert(bsize == BLOCK_8X8 || bsize == BLOCK_4X4);
+ for (bi = 0; bi < cdef_count; bi++) {
+ int t = !filter_skip && dlist[bi].skip ? 0 : pri_strength;
+ int s = !filter_skip && dlist[bi].skip ? 0 : sec_strength;
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ if (dst8)
+ cdef_filter_block(
+ &dst8[(by << bsizey) * dstride + (bx << bsizex)], NULL, dstride,
+ &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
+ (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0,
+ pri_damping, sec_damping, bsize, (256 << coeff_shift) - 1);
+ else
+ cdef_filter_block(
+ NULL, &dst16[dirinit ? bi << (bsizex + bsizey)
+ : (by << bsizey) * dstride + (bx << bsizex)],
+ dirinit ? 1 << bsizex : dstride,
+ &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
+ (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0,
+ pri_damping, sec_damping, bsize, (256 << coeff_shift) - 1);
+ }
+#endif
}
diff --git a/av1/common/cdef_block.h b/av1/common/cdef_block.h
index 3891e2b..bf277fa 100644
--- a/av1/common/cdef_block.h
+++ b/av1/common/cdef_block.h
@@ -17,6 +17,9 @@
#define CDEF_BLOCKSIZE 64
#define CDEF_BLOCKSIZE_LOG2 6
#define CDEF_NBLOCKS (CDEF_BLOCKSIZE / 8)
+#if CONFIG_CDEF_SINGLEPASS
+#define CDEF_SB_SHIFT (MAX_SB_SIZE_LOG2 - CDEF_BLOCKSIZE_LOG2)
+#endif
/* We need to buffer three vertical lines. */
#define CDEF_VBORDER (3)
@@ -28,7 +31,24 @@
#define CDEF_VERY_LARGE (30000)
#define CDEF_INBUF_SIZE (CDEF_BSTRIDE * (CDEF_BLOCKSIZE + 2 * CDEF_VBORDER))
+#if CONFIG_CDEF_SINGLEPASS
+// Filter configuration
+#define CDEF_CAP 1 // 1 = Cap change to largest diff
+#define CDEF_FULL 0 // 1 = 7x7 filter, 0 = 5x5 filter
+
+#if CDEF_FULL
+extern const int cdef_pri_taps[2][3];
+extern const int cdef_sec_taps[2][2];
extern const int cdef_directions[8][3];
+#else
+extern const int cdef_pri_taps[2][2];
+extern const int cdef_sec_taps[2][2];
+extern const int cdef_directions[8][2];
+#endif
+
+#else // CONFIG_CDEF_SINGLEPASS
+extern const int cdef_directions[8][3];
+#endif
typedef struct {
uint8_t by;
@@ -36,12 +56,30 @@
uint8_t skip;
} cdef_list;
+#if CONFIG_CDEF_SINGLEPASS
+typedef void (*cdef_filter_block_func)(uint8_t *dst8, uint16_t *dst16,
+ int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength,
+ int dir, int pri_damping,
+ int sec_damping, int bsize, int max);
+void copy_cdef_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
+ cdef_list *dlist, int cdef_count, int bsize);
+#else
typedef void (*cdef_direction_func)(uint16_t *y, int ystride,
const uint16_t *in, int threshold, int dir,
int damping);
int get_filter_skip(int level);
+#endif
+#if CONFIG_CDEF_SINGLEPASS
+void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
+ int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
+ int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
+ cdef_list *dlist, int cdef_count, int level,
+ int sec_strength, int pri_damping, int sec_damping,
+ int coeff_shift);
+#else
void cdef_filter_fb(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in,
int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
@@ -49,3 +87,4 @@
int sec_strength, int sec_damping, int pri_damping,
int coeff_shift, int skip_dering, int hbd);
#endif
+#endif
diff --git a/av1/common/cdef_block_avx2.c b/av1/common/cdef_block_avx2.c
new file mode 100644
index 0000000..5e48045
--- /dev/null
+++ b/av1/common/cdef_block_avx2.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_avx2
+#include "./cdef_block_simd.h"
diff --git a/av1/common/cdef_block_simd.h b/av1/common/cdef_block_simd.h
index 358d919..b53b88b 100644
--- a/av1/common/cdef_block_simd.h
+++ b/av1/common/cdef_block_simd.h
@@ -221,6 +221,815 @@
return v128_xor(v128_add_16(sign, v128_min_s16(diff, s)), sign);
}
+#if CONFIG_CDEF_SINGLEPASS
+// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
+SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
+ unsigned int adjdamp) {
+ const v256 diff16 = v256_sub_16(a, b);
+ v128 diff = v128_pack_s16_s8(v256_high_v128(diff16), v256_low_v128(diff16));
+ const v128 sign = v128_cmplt_s8(diff, v128_zero());
+ diff = v128_abs_s8(diff);
+ return v128_xor(
+ v128_add_8(sign,
+ v128_min_u8(diff, v128_ssub_u8(v128_dup_8(strength),
+ v128_shr_u8(diff, adjdamp)))),
+ sign);
+}
+
+#if CDEF_CAP
+void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir,
+ int pri_damping, int sec_damping,
+ UNUSED int max_unused)
+#else
+void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir,
+ int pri_damping, int sec_damping,
+ int max)
+#endif
+{
+ v128 p0, p1, p2, p3;
+ v256 sum, row, tap, res;
+#if CDEF_CAP
+ v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
+#endif
+ int po1 = cdef_directions[dir][0];
+ int po2 = cdef_directions[dir][1];
+#if CDEF_FULL
+ int po3 = cdef_directions[dir][2];
+#endif
+ int s1o1 = cdef_directions[(dir + 2) & 7][0];
+ int s1o2 = cdef_directions[(dir + 2) & 7][1];
+ int s2o1 = cdef_directions[(dir + 6) & 7][0];
+ int s2o2 = cdef_directions[(dir + 6) & 7][1];
+
+ const int *pri_taps = cdef_pri_taps[pri_strength & 1];
+ const int *sec_taps = cdef_sec_taps[pri_strength & 1];
+
+ if (pri_strength) pri_damping -= get_msb(pri_strength);
+ if (sec_strength) sec_damping -= get_msb(sec_strength);
+
+ sum = v256_zero();
+ row = v256_from_v64(v64_load_aligned(&in[0 * CDEF_BSTRIDE]),
+ v64_load_aligned(&in[1 * CDEF_BSTRIDE]),
+ v64_load_aligned(&in[2 * CDEF_BSTRIDE]),
+ v64_load_aligned(&in[3 * CDEF_BSTRIDE]));
+#if CDEF_CAP
+ max = min = row;
+#endif
+
+ if (pri_strength) {
+ // Primary near taps
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po1]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po1]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po1]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po1]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p0 = constrain(tap, row, pri_strength, pri_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po1]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po1]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po1]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po1]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p1 = constrain(tap, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[0] * (p0 + p1)
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]),
+ v256_from_v128(v128_ziphi_8(p0, p1),
+ v128_ziplo_8(p0, p1))));
+
+ // Primary far taps
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po2]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po2]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po2]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po2]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p0 = constrain(tap, row, pri_strength, pri_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po2]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po2]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po2]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po2]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p1 = constrain(tap, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[1] * (p0 + p1)
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]),
+ v256_from_v128(v128_ziphi_8(p0, p1),
+ v128_ziplo_8(p0, p1))));
+
+#if CDEF_FULL
+ // Primary extra taps
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po3]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po3]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po3]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po3]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p0 = constrain(tap, row, pri_strength, pri_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po3]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po3]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po3]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po3]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p1 = constrain(tap, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[2] * (p0 + p1)
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[2]),
+ v256_from_v128(v128_ziphi_8(p0, p1),
+ v128_ziplo_8(p0, p1))));
+#endif
+ }
+
+ if (sec_strength) {
+ // Secondary near taps
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o1]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o1]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o1]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o1]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p0 = constrain(tap, row, sec_strength, sec_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o1]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o1]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o1]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o1]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p1 = constrain(tap, row, sec_strength, sec_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o1]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o1]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o1]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o1]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p2 = constrain(tap, row, sec_strength, sec_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o1]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o1]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o1]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o1]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p3 = constrain(tap, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+ p0 = v128_add_8(p0, p1);
+ p2 = v128_add_8(p2, p3);
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]),
+ v256_from_v128(v128_ziphi_8(p0, p2),
+ v128_ziplo_8(p0, p2))));
+
+ // Secondary far taps
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o2]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o2]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o2]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o2]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p0 = constrain(tap, row, sec_strength, sec_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o2]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o2]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o2]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o2]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p1 = constrain(tap, row, sec_strength, sec_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o2]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o2]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o2]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o2]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p2 = constrain(tap, row, sec_strength, sec_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o2]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o2]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o2]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o2]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p3 = constrain(tap, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+ p0 = v128_add_8(p0, p1);
+ p2 = v128_add_8(p2, p3);
+
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]),
+ v256_from_v128(v128_ziphi_8(p0, p2),
+ v128_ziplo_8(p0, p2))));
+ }
+
+ // res = row + ((sum - (sum < 0) + 8) >> 4)
+ sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
+ res = v256_add_16(sum, v256_dup_16(8));
+ res = v256_shr_n_s16(res, 4);
+ res = v256_add_16(row, res);
+#if CDEF_CAP
+ res = v256_min_s16(v256_max_s16(res, min), max);
+#else
+ res = v256_min_s16(v256_max_s16(res, v256_zero()), v256_dup_16(max));
+#endif
+ res = v256_pack_s16_u8(res, res);
+
+ p0 = v256_low_v128(res);
+ u32_store_aligned(&dst[0 * dstride], v64_high_u32(v128_high_v64(p0)));
+ u32_store_aligned(&dst[1 * dstride], v64_low_u32(v128_high_v64(p0)));
+ u32_store_aligned(&dst[2 * dstride], v64_high_u32(v128_low_v64(p0)));
+ u32_store_aligned(&dst[3 * dstride], v64_low_u32(v128_low_v64(p0)));
+}
+
+#if CDEF_CAP
+void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir,
+ int pri_damping, int sec_damping,
+ UNUSED int max_unused)
+#else
+void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir,
+ int pri_damping, int sec_damping,
+ int max)
+#endif
+{
+ int i;
+ v128 p0, p1, p2, p3;
+ v256 sum, row, res, tap;
+#if CDEF_CAP
+ v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
+#endif
+ int po1 = cdef_directions[dir][0];
+ int po2 = cdef_directions[dir][1];
+#if CDEF_FULL
+ int po3 = cdef_directions[dir][2];
+#endif
+ int s1o1 = cdef_directions[(dir + 2) & 7][0];
+ int s1o2 = cdef_directions[(dir + 2) & 7][1];
+ int s2o1 = cdef_directions[(dir + 6) & 7][0];
+ int s2o2 = cdef_directions[(dir + 6) & 7][1];
+
+ const int *pri_taps = cdef_pri_taps[pri_strength & 1];
+ const int *sec_taps = cdef_sec_taps[pri_strength & 1];
+
+ if (pri_strength) pri_damping -= get_msb(pri_strength);
+ if (sec_strength) sec_damping -= get_msb(sec_strength);
+ for (i = 0; i < 8; i += 2) {
+ sum = v256_zero();
+ row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
+ v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
+
+#if CDEF_CAP
+ max = min = row;
+#endif
+ // Primary near taps
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p0 = constrain(tap, row, pri_strength, pri_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p1 = constrain(tap, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[0] * (p0 + p1)
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]),
+ v256_from_v128(v128_ziphi_8(p0, p1),
+ v128_ziplo_8(p0, p1))));
+
+ // Primary far taps
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p0 = constrain(tap, row, pri_strength, pri_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p1 = constrain(tap, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[1] * (p0 + p1)
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]),
+ v256_from_v128(v128_ziphi_8(p0, p1),
+ v128_ziplo_8(p0, p1))));
+
+#if CDEF_FULL
+ // Primary extra taps
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po3]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po3]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p0 = constrain(tap, row, pri_strength, pri_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po3]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po3]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p1 = constrain(tap, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[2] * (p0 + p1)
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[2]),
+ v256_from_v128(v128_ziphi_8(p0, p1),
+ v128_ziplo_8(p0, p1))));
+#endif
+
+ // Secondary near taps
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p0 = constrain(tap, row, sec_strength, sec_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p1 = constrain(tap, row, sec_strength, sec_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p2 = constrain(tap, row, sec_strength, sec_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p3 = constrain(tap, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+ p0 = v128_add_8(p0, p1);
+ p2 = v128_add_8(p2, p3);
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]),
+ v256_from_v128(v128_ziphi_8(p0, p2),
+ v128_ziplo_8(p0, p2))));
+
+ // Secondary far taps
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p0 = constrain(tap, row, sec_strength, sec_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p1 = constrain(tap, row, sec_strength, sec_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p2 = constrain(tap, row, sec_strength, sec_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
+#if CDEF_CAP
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+#endif
+ p3 = constrain(tap, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+ p0 = v128_add_8(p0, p1);
+ p2 = v128_add_8(p2, p3);
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]),
+ v256_from_v128(v128_ziphi_8(p0, p2),
+ v128_ziplo_8(p0, p2))));
+
+ // res = row + ((sum - (sum < 0) + 8) >> 4)
+ sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
+ res = v256_add_16(sum, v256_dup_16(8));
+ res = v256_shr_n_s16(res, 4);
+ res = v256_add_16(row, res);
+#if CDEF_CAP
+ res = v256_min_s16(v256_max_s16(res, min), max);
+#else
+ res = v256_min_s16(v256_max_s16(res, v256_zero()), v256_dup_16(max));
+#endif
+ res = v256_pack_s16_u8(res, res);
+
+ p0 = v256_low_v128(res);
+ v64_store_aligned(&dst[i * dstride], v128_high_v64(p0));
+ v64_store_aligned(&dst[(i + 1) * dstride], v128_low_v64(p0));
+ }
+}
+
+#if CDEF_CAP
+void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir,
+ int pri_damping, int sec_damping,
+ UNUSED int max_unused)
+#else
+void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir,
+ int pri_damping, int sec_damping,
+ int max)
+#endif
+{
+ int i;
+ v128 p0, p1, p2, p3, sum, row, res;
+#if CDEF_CAP
+ v128 max, min, large = v128_dup_16(CDEF_VERY_LARGE);
+#endif
+ int po1 = cdef_directions[dir][0];
+ int po2 = cdef_directions[dir][1];
+#if CDEF_FULL
+ int po3 = cdef_directions[dir][2];
+#endif
+ int s1o1 = cdef_directions[(dir + 2) & 7][0];
+ int s1o2 = cdef_directions[(dir + 2) & 7][1];
+ int s2o1 = cdef_directions[(dir + 6) & 7][0];
+ int s2o2 = cdef_directions[(dir + 6) & 7][1];
+
+ const int *pri_taps = cdef_pri_taps[pri_strength & 1];
+ const int *sec_taps = cdef_sec_taps[pri_strength & 1];
+
+ if (pri_strength) pri_damping -= get_msb(pri_strength);
+ if (sec_strength) sec_damping -= get_msb(sec_strength);
+ for (i = 0; i < 4; i += 2) {
+ sum = v128_zero();
+ row = v128_from_v64(v64_load_aligned(&in[i * CDEF_BSTRIDE]),
+ v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
+#if CDEF_CAP
+ min = max = row;
+#endif
+
+ // Primary near taps
+ p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
+ p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
+#if CDEF_CAP
+ max =
+ v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+ v128_andn(p1, v128_cmpeq_16(p1, large)));
+ min = v128_min_s16(v128_min_s16(min, p0), p1);
+#endif
+ p0 = constrain16(p0, row, pri_strength, pri_damping);
+ p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[0] * (p0 + p1)
+ sum = v128_add_16(
+ sum, v128_mullo_s16(v128_dup_16(pri_taps[0]), v128_add_16(p0, p1)));
+
+ // Primary far taps
+ p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
+ p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
+#if CDEF_CAP
+ max =
+ v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+ v128_andn(p1, v128_cmpeq_16(p1, large)));
+ min = v128_min_s16(v128_min_s16(min, p0), p1);
+#endif
+ p0 = constrain16(p0, row, pri_strength, pri_damping);
+ p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[1] * (p0 + p1)
+ sum = v128_add_16(
+ sum, v128_mullo_s16(v128_dup_16(pri_taps[1]), v128_add_16(p0, p1)));
+
+#if CDEF_FULL
+ // Primary extra taps
+ p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po3]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po3]));
+ p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po3]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po3]));
+#if CDEF_CAP
+ max =
+ v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+ v128_andn(p1, v128_cmpeq_16(p1, large)));
+ min = v128_min_s16(v128_min_s16(min, p0), p1);
+#endif
+ p0 = constrain16(p0, row, pri_strength, pri_damping);
+ p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[2] * (p0 + p1)
+ sum = v128_add_16(
+ sum, v128_mullo_s16(v128_dup_16(pri_taps[2]), v128_add_16(p0, p1)));
+#endif
+
+ // Secondary near taps
+ p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
+ p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
+ p2 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
+ p3 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
+#if CDEF_CAP
+ max =
+ v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+ v128_andn(p1, v128_cmpeq_16(p1, large)));
+ max =
+ v128_max_s16(v128_max_s16(max, v128_andn(p2, v128_cmpeq_16(p2, large))),
+ v128_andn(p3, v128_cmpeq_16(p3, large)));
+ min = v128_min_s16(
+ v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3);
+#endif
+ p0 = constrain16(p0, row, sec_strength, sec_damping);
+ p1 = constrain16(p1, row, sec_strength, sec_damping);
+ p2 = constrain16(p2, row, sec_strength, sec_damping);
+ p3 = constrain16(p3, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+ sum = v128_add_16(sum, v128_mullo_s16(v128_dup_16(sec_taps[0]),
+ v128_add_16(v128_add_16(p0, p1),
+ v128_add_16(p2, p3))));
+
+ // Secondary far taps
+ p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
+ p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
+ p2 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
+ p3 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
+#if CDEF_CAP
+ max =
+ v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+ v128_andn(p1, v128_cmpeq_16(p1, large)));
+ max =
+ v128_max_s16(v128_max_s16(max, v128_andn(p2, v128_cmpeq_16(p2, large))),
+ v128_andn(p3, v128_cmpeq_16(p3, large)));
+ min = v128_min_s16(
+ v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3);
+#endif
+ p0 = constrain16(p0, row, sec_strength, sec_damping);
+ p1 = constrain16(p1, row, sec_strength, sec_damping);
+ p2 = constrain16(p2, row, sec_strength, sec_damping);
+ p3 = constrain16(p3, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+ sum = v128_add_16(sum, v128_mullo_s16(v128_dup_16(sec_taps[1]),
+ v128_add_16(v128_add_16(p0, p1),
+ v128_add_16(p2, p3))));
+
+ // res = row + ((sum - (sum < 0) + 8) >> 4)
+ sum = v128_add_16(sum, v128_cmplt_s16(sum, v128_zero()));
+ res = v128_add_16(sum, v128_dup_16(8));
+ res = v128_shr_n_s16(res, 4);
+ res = v128_add_16(row, res);
+#if CDEF_CAP
+ res = v128_min_s16(v128_max_s16(res, min), max);
+#else
+ res = v128_min_s16(v128_max_s16(res, v128_zero()), v128_dup_16(max));
+#endif
+ v64_store_aligned(&dst[i * dstride], v128_high_v64(res));
+ v64_store_aligned(&dst[(i + 1) * dstride], v128_low_v64(res));
+ }
+}
+
+#if CDEF_CAP
+void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir,
+ int pri_damping, int sec_damping,
+ UNUSED int max_unused)
+#else
+void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir,
+ int pri_damping, int sec_damping,
+ int max)
+#endif
+{
+ int i;
+ v128 sum, p0, p1, p2, p3, row, res;
+#if CDEF_CAP
+ v128 max, min, large = v128_dup_16(CDEF_VERY_LARGE);
+#endif
+ int po1 = cdef_directions[dir][0];
+ int po2 = cdef_directions[dir][1];
+#if CDEF_FULL
+ int po3 = cdef_directions[dir][2];
+#endif
+ int s1o1 = cdef_directions[(dir + 2) & 7][0];
+ int s1o2 = cdef_directions[(dir + 2) & 7][1];
+ int s2o1 = cdef_directions[(dir + 6) & 7][0];
+ int s2o2 = cdef_directions[(dir + 6) & 7][1];
+
+ const int *pri_taps = cdef_pri_taps[pri_strength & 1];
+ const int *sec_taps = cdef_sec_taps[pri_strength & 1];
+
+ if (pri_strength) pri_damping -= get_msb(pri_strength);
+ if (sec_strength) sec_damping -= get_msb(sec_strength);
+
+ for (i = 0; i < 8; i++) {
+ sum = v128_zero();
+ row = v128_load_aligned(&in[i * CDEF_BSTRIDE]);
+
+#if CDEF_CAP
+ min = max = row;
+#endif
+ // Primary near taps
+ p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]);
+ p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]);
+#if CDEF_CAP
+ max =
+ v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+ v128_andn(p1, v128_cmpeq_16(p1, large)));
+ min = v128_min_s16(v128_min_s16(min, p0), p1);
+#endif
+ p0 = constrain16(p0, row, pri_strength, pri_damping);
+ p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[0] * (p0 + p1)
+ sum = v128_add_16(
+ sum, v128_mullo_s16(v128_dup_16(pri_taps[0]), v128_add_16(p0, p1)));
+
+ // Primary far taps
+ p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]);
+ p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]);
+#if CDEF_CAP
+ max =
+ v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+ v128_andn(p1, v128_cmpeq_16(p1, large)));
+ min = v128_min_s16(v128_min_s16(min, p0), p1);
+#endif
+ p0 = constrain16(p0, row, pri_strength, pri_damping);
+ p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[1] * (p0 + p1)
+ sum = v128_add_16(
+ sum, v128_mullo_s16(v128_dup_16(pri_taps[1]), v128_add_16(p0, p1)));
+
+#if CDEF_FULL
+ // Primary extra taps
+ p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + po3]);
+ p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - po3]);
+#if CDEF_CAP
+ max =
+ v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+ v128_andn(p1, v128_cmpeq_16(p1, large)));
+ min = v128_min_s16(v128_min_s16(min, p0), p1);
+#endif
+ p0 = constrain16(p0, row, pri_strength, pri_damping);
+ p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[2] * (p0 + p1)
+ sum = v128_add_16(
+ sum, v128_mullo_s16(v128_dup_16(pri_taps[2]), v128_add_16(p0, p1)));
+#endif
+
+ // Secondary near taps
+ p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]);
+ p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]);
+ p2 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]);
+ p3 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]);
+#if CDEF_CAP
+ max =
+ v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+ v128_andn(p1, v128_cmpeq_16(p1, large)));
+ max =
+ v128_max_s16(v128_max_s16(max, v128_andn(p2, v128_cmpeq_16(p2, large))),
+ v128_andn(p3, v128_cmpeq_16(p3, large)));
+ min = v128_min_s16(
+ v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3);
+#endif
+ p0 = constrain16(p0, row, sec_strength, sec_damping);
+ p1 = constrain16(p1, row, sec_strength, sec_damping);
+ p2 = constrain16(p2, row, sec_strength, sec_damping);
+ p3 = constrain16(p3, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+ sum = v128_add_16(sum, v128_mullo_s16(v128_dup_16(sec_taps[0]),
+ v128_add_16(v128_add_16(p0, p1),
+ v128_add_16(p2, p3))));
+
+ // Secondary far taps
+ p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]);
+ p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]);
+ p2 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]);
+ p3 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]);
+#if CDEF_CAP
+ max =
+ v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+ v128_andn(p1, v128_cmpeq_16(p1, large)));
+ max =
+ v128_max_s16(v128_max_s16(max, v128_andn(p2, v128_cmpeq_16(p2, large))),
+ v128_andn(p3, v128_cmpeq_16(p3, large)));
+ min = v128_min_s16(
+ v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3);
+#endif
+ p0 = constrain16(p0, row, sec_strength, sec_damping);
+ p1 = constrain16(p1, row, sec_strength, sec_damping);
+ p2 = constrain16(p2, row, sec_strength, sec_damping);
+ p3 = constrain16(p3, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+ sum = v128_add_16(sum, v128_mullo_s16(v128_dup_16(sec_taps[1]),
+ v128_add_16(v128_add_16(p0, p1),
+ v128_add_16(p2, p3))));
+
+ // res = row + ((sum - (sum < 0) + 8) >> 4)
+ sum = v128_add_16(sum, v128_cmplt_s16(sum, v128_zero()));
+ res = v128_add_16(sum, v128_dup_16(8));
+ res = v128_shr_n_s16(res, 4);
+ res = v128_add_16(row, res);
+#if CDEF_CAP
+ res = v128_min_s16(v128_max_s16(res, min), max);
+#else
+ res = v128_min_s16(v128_max_s16(res, v128_zero()), v128_dup_16(max));
+#endif
+ v128_store_unaligned(&dst[i * dstride], res);
+ }
+}
+
+void SIMD_FUNC(cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir, int pri_damping,
+ int sec_damping, int bsize, int max) {
+ if (dst8)
+ (bsize == BLOCK_8X8 ? SIMD_FUNC(cdef_filter_block_8x8_8)
+ : SIMD_FUNC(cdef_filter_block_4x4_8))(
+ dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max);
+ else
+ (bsize == BLOCK_8X8 ? SIMD_FUNC(cdef_filter_block_8x8_16)
+ : SIMD_FUNC(cdef_filter_block_4x4_16))(
+ dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max);
+}
+
+#else
+
void SIMD_FUNC(cdef_direction_4x4)(uint16_t *y, int ystride, const uint16_t *in,
int threshold, int dir, int damping) {
int i;
@@ -364,6 +1173,7 @@
v64_store_unaligned(&dst[i * dstride], row);
}
}
+#endif
void SIMD_FUNC(copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride,
const uint8_t *src, int sstride, int v,
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 2d2f113..fb7550f 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -3011,8 +3011,12 @@
#if CONFIG_CDEF
static void setup_cdef(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
int i;
+#if CONFIG_CDEF_SINGLEPASS
+ cm->cdef_pri_damping = cm->cdef_sec_damping = aom_rb_read_literal(rb, 2) + 3;
+#else
cm->cdef_pri_damping = aom_rb_read_literal(rb, 1) + 5;
cm->cdef_sec_damping = aom_rb_read_literal(rb, 2) + 3;
+#endif
cm->cdef_bits = aom_rb_read_literal(rb, 2);
cm->nb_cdef_strengths = 1 << cm->cdef_bits;
for (i = 0; i < cm->nb_cdef_strengths; i++) {
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 4a6fbd8..c86ef3a 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -3454,8 +3454,13 @@
#if CONFIG_CDEF
static void encode_cdef(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
int i;
+#if CONFIG_CDEF_SINGLEPASS
+ aom_wb_write_literal(wb, cm->cdef_pri_damping - 3, 2);
+ assert(cm->cdef_pri_damping == cm->cdef_sec_damping);
+#else
aom_wb_write_literal(wb, cm->cdef_pri_damping - 5, 1);
aom_wb_write_literal(wb, cm->cdef_sec_damping - 3, 2);
+#endif
aom_wb_write_literal(wb, cm->cdef_bits, 2);
for (i = 0; i < cm->nb_cdef_strengths; i++) {
aom_wb_write_literal(wb, cm->cdef_strengths[i], CDEF_STRENGTH_BITS);
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index da2370b..f3f7799 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -4206,7 +4206,7 @@
cm->cdef_strengths[0] = 0;
cm->nb_cdef_strengths = 1;
} else {
- // Find cm->dering_level, cm->clpf_strength_u and cm->clpf_strength_v
+ // Find CDEF parameters
av1_cdef_search(cm->frame_to_show, cpi->source, cm, xd,
cpi->oxcf.speed > 0);
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index 443e9e5..accc97e 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -68,11 +68,16 @@
uint64_t (**mse)[TOTAL_STRENGTHS], int sb_count,
int fast) {
uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS];
+#if !CONFIG_CDEF_SINGLEPASS
const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
+#endif
int i, j;
uint64_t best_tot_mse = (uint64_t)1 << 63;
int best_id0 = 0;
int best_id1 = 0;
+#if CONFIG_CDEF_SINGLEPASS
+ const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
+#endif
memset(tot_mse, 0, sizeof(tot_mse));
for (i = 0; i < sb_count; i++) {
int gi;
@@ -305,7 +310,11 @@
int *sb_index = aom_malloc(nvfb * nhfb * sizeof(*sb_index));
int *selected_strength = aom_malloc(nvfb * nhfb * sizeof(*sb_index));
uint64_t(*mse[2])[TOTAL_STRENGTHS];
+#if CONFIG_CDEF_SINGLEPASS
+ int pri_damping = 3 + (cm->base_qindex >> 6);
+#else
int pri_damping = 6;
+#endif
int sec_damping = 3 + (cm->base_qindex >> 6);
int i;
int nb_strengths;
@@ -414,6 +423,17 @@
int xsize = (nhb << mi_wide_l2[pli]) +
CDEF_HBORDER * (fbc != nhfb - 1) + xoff;
sec_strength = gi % CDEF_SEC_STRENGTHS;
+#if CONFIG_CDEF_SINGLEPASS
+ copy_sb16_16(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
+ src[pli],
+ (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff,
+ (fbc * MI_SIZE_64X64 << mi_wide_l2[pli]) - xoff,
+ stride[pli], ysize, xsize);
+ cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in, xdec[pli], ydec[pli],
+ dir, &dirinit, var, pli, dlist, cdef_count, threshold,
+ sec_strength + (sec_strength == 3), pri_damping,
+ sec_damping, coeff_shift);
+#else
if (sec_strength == 0)
copy_sb16_16(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
src[pli],
@@ -425,6 +445,7 @@
pli, dlist, cdef_count, threshold,
sec_strength + (sec_strength == 3), sec_damping,
pri_damping, coeff_shift, sec_strength != 0, 1);
+#endif
curr_mse = compute_cdef_dist(
ref_coeff[pli] +
(fbr * MI_SIZE_64X64 << mi_high_l2[pli]) * stride[pli] +
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index aacb1ac..4d45a57 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -114,6 +114,7 @@
set(CONFIG_BGSPRITE 0 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_CB4X4 1 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_CDEF 1 CACHE NUMBER "AV1 experiment flag.")
+set(CONFIG_CDEF_SINGLEPASS 0 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_CFL 0 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_CHROMA_2X2 0 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_CHROMA_SUB8X8 1 CACHE NUMBER "AV1 experiment flag.")
diff --git a/configure b/configure
index ce82e73..2f2f130 100755
--- a/configure
+++ b/configure
@@ -244,6 +244,7 @@
EXPERIMENT_LIST="
fp_mb_stats
cdef
+ cdef_singlepass
var_tx
rect_tx
rect_tx_ext
@@ -561,6 +562,7 @@
enabled altref2 && enable_feature flex_refs
enabled rect_tx_ext && enable_feature rect_tx
enabled cfl && enable_feature smooth_hv
+ enabled cdef_singlepass && enable_feature cdef
if ! enabled delta_q && enabled ext_delta_q; then
log_echo "ext_delta_q requires delta_q, so disabling ext_delta_q"
diff --git a/test/cdef_test.cc b/test/cdef_test.cc
new file mode 100644
index 0000000..d2ab692
--- /dev/null
+++ b/test/cdef_test.cc
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+*/
+
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/common/cdef_block.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+typedef std::tr1::tuple<cdef_filter_block_func, cdef_filter_block_func, int>
+ cdef_dir_param_t;
+
+class CDEFBlockTest : public ::testing::TestWithParam<cdef_dir_param_t> {
+ public:
+ virtual ~CDEFBlockTest() {}
+ virtual void SetUp() {
+ cdef = GET_PARAM(0);
+ ref_cdef = GET_PARAM(1);
+ bsize = GET_PARAM(2);
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ int bsize;
+ cdef_filter_block_func cdef;
+ cdef_filter_block_func ref_cdef;
+};
+
+typedef CDEFBlockTest CDEFSpeedTest;
+
+void test_cdef(int bsize, int iterations, cdef_filter_block_func cdef,
+ cdef_filter_block_func ref_cdef) {
+ const int size = 8;
+ const int ysize = size + 2 * CDEF_VBORDER;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint16_t, s[ysize * CDEF_BSTRIDE]);
+ DECLARE_ALIGNED(16, static uint16_t, d[size * size]);
+ DECLARE_ALIGNED(16, static uint16_t, ref_d[size * size]);
+ memset(ref_d, 0, sizeof(ref_d));
+ memset(d, 0, sizeof(d));
+
+ int error = 0, pristrength = 0, secstrength, dir;
+ int boundary, pridamping, secdamping, depth, bits, level, count,
+ errdepth = 0, errpristrength = 0, errsecstrength = 0, errboundary = 0,
+ errpridamping = 0, errsecdamping = 0;
+ unsigned int pos = 0;
+
+ for (boundary = 0; boundary < 16; boundary++) {
+ for (depth = 8; depth <= 12; depth += 2) {
+ for (pridamping = 3 + depth - 8;
+ pridamping < 7 - 3 * !!boundary + depth - 8; pridamping++) {
+ for (secdamping = 3 + depth - 8;
+ secdamping < 7 - 3 * !!boundary + depth - 8; secdamping++) {
+ for (count = 0; count < iterations; count++) {
+ for (level = 0; level < (1 << depth) && !error;
+ level += (2 + 6 * !!boundary) << (depth - 8)) {
+ for (bits = 1; bits <= depth && !error;
+ bits += 1 + 3 * !!boundary) {
+ for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++)
+ s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
+ (1 << depth) - 1);
+ if (boundary) {
+ if (boundary & 1) { // Left
+ for (int i = 0; i < ysize; i++)
+ for (int j = 0; j < CDEF_HBORDER; j++)
+ s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+ }
+ if (boundary & 2) { // Right
+ for (int i = 0; i < ysize; i++)
+ for (int j = CDEF_HBORDER + size; j < CDEF_BSTRIDE; j++)
+ s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+ }
+ if (boundary & 4) { // Above
+ for (int i = 0; i < CDEF_VBORDER; i++)
+ for (int j = 0; j < CDEF_BSTRIDE; j++)
+ s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+ }
+ if (boundary & 8) { // Below
+ for (int i = CDEF_VBORDER + size; i < ysize; i++)
+ for (int j = 0; j < CDEF_BSTRIDE; j++)
+ s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+ }
+ }
+ for (dir = 0; dir < 8; dir++) {
+ for (pristrength = 0;
+ pristrength <= 19 << (depth - 8) && !error;
+ pristrength += (1 + 4 * !!boundary) << (depth - 8)) {
+ if (pristrength == 16) pristrength = 19;
+ for (secstrength = 0;
+ secstrength <= 4 << (depth - 8) && !error;
+ secstrength += 1 << (depth - 8)) {
+ if (secstrength == 3 << (depth - 8)) continue;
+ ref_cdef(depth == 8 ? (uint8_t *)ref_d : 0, ref_d, size,
+ s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
+ pristrength, secstrength, dir, pridamping,
+ secdamping, bsize, (1 << depth) - 1);
+ // If cdef and ref_cdef are the same, we're just testing
+ // speed
+ if (cdef != ref_cdef)
+ ASM_REGISTER_STATE_CHECK(
+ cdef(depth == 8 ? (uint8_t *)d : 0, d, size,
+ s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
+ pristrength, secstrength, dir, pridamping,
+ secdamping, bsize, (1 << depth) - 1));
+ if (ref_cdef != cdef) {
+ for (pos = 0;
+ pos<sizeof(d) / sizeof(*d)>> (depth == 8) &&
+ !error;
+ pos++) {
+ error = ref_d[pos] != d[pos];
+ errdepth = depth;
+ errpristrength = pristrength;
+ errsecstrength = secstrength;
+ errboundary = boundary;
+ errpridamping = pridamping;
+ errsecdamping = secdamping;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ pos--;
+ EXPECT_EQ(0, error) << "Error: CDEFBlockTest, SIMD and C mismatch."
+ << std::endl
+ << "First error at " << pos % size << "," << pos / size
+ << " (" << (int16_t)ref_d[pos] << " : " << (int16_t)d[pos]
+ << ") " << std::endl
+ << "pristrength: " << errpristrength << std::endl
+ << "pridamping: " << errpridamping << std::endl
+ << "secstrength: " << errsecstrength << std::endl
+ << "secdamping: " << errsecdamping << std::endl
+ << "depth: " << errdepth << std::endl
+ << "size: " << bsize << std::endl
+ << "boundary: " << errboundary << std::endl
+ << std::endl;
+}
+
+void test_cdef_speed(int bsize, int iterations, cdef_filter_block_func cdef,
+ cdef_filter_block_func ref_cdef) {
+ aom_usec_timer ref_timer;
+ aom_usec_timer timer;
+
+ aom_usec_timer_start(&ref_timer);
+ test_cdef(bsize, iterations, ref_cdef, ref_cdef);
+ aom_usec_timer_mark(&ref_timer);
+ int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
+
+ aom_usec_timer_start(&timer);
+ test_cdef(bsize, iterations, cdef, cdef);
+ aom_usec_timer_mark(&timer);
+ int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
+
+#if 0
+ std::cout << "[ ] C time = " << ref_elapsed_time / 1000
+ << " ms, SIMD time = " << elapsed_time / 1000 << " ms" << std::endl;
+#endif
+
+ EXPECT_GT(ref_elapsed_time, elapsed_time)
+ << "Error: CDEFSpeedTest, SIMD slower than C." << std::endl
+ << "C time: " << ref_elapsed_time << " us" << std::endl
+ << "SIMD time: " << elapsed_time << " us" << std::endl;
+}
+
+typedef int (*find_dir_t)(const uint16_t *img, int stride, int32_t *var,
+ int coeff_shift);
+
+typedef std::tr1::tuple<find_dir_t, find_dir_t> find_dir_param_t;
+
+class CDEFFindDirTest : public ::testing::TestWithParam<find_dir_param_t> {
+ public:
+ virtual ~CDEFFindDirTest() {}
+ virtual void SetUp() {
+ finddir = GET_PARAM(0);
+ ref_finddir = GET_PARAM(1);
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ find_dir_t finddir;
+ find_dir_t ref_finddir;
+};
+
+typedef CDEFFindDirTest CDEFFindDirSpeedTest;
+
+void test_finddir(int (*finddir)(const uint16_t *img, int stride, int32_t *var,
+ int coeff_shift),
+ int (*ref_finddir)(const uint16_t *img, int stride,
+ int32_t *var, int coeff_shift)) {
+ const int size = 8;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint16_t, s[size * size]);
+
+ int error = 0;
+ int depth, bits, level, count, errdepth = 0;
+ int ref_res = 0, res = 0;
+ int32_t ref_var = 0, var = 0;
+
+ for (depth = 8; depth <= 12 && !error; depth += 2) {
+ for (count = 0; count < 512 && !error; count++) {
+ for (level = 0; level < (1 << depth) && !error;
+ level += 1 << (depth - 8)) {
+ for (bits = 1; bits <= depth && !error; bits++) {
+ for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++)
+ s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
+ (1 << depth) - 1);
+ for (int c = 0; c < 1 + 9 * (finddir == ref_finddir); c++)
+ ref_res = ref_finddir(s, size, &ref_var, depth - 8);
+ if (finddir != ref_finddir)
+ ASM_REGISTER_STATE_CHECK(res = finddir(s, size, &var, depth - 8));
+ if (ref_finddir != finddir) {
+ if (res != ref_res || var != ref_var) error = 1;
+ errdepth = depth;
+ }
+ }
+ }
+ }
+ }
+
+ EXPECT_EQ(0, error) << "Error: CDEFFindDirTest, SIMD and C mismatch."
+ << std::endl
+ << "return: " << res << " : " << ref_res << std::endl
+ << "var: " << var << " : " << ref_var << std::endl
+ << "depth: " << errdepth << std::endl
+ << std::endl;
+}
+
+void test_finddir_speed(int (*finddir)(const uint16_t *img, int stride,
+ int32_t *var, int coeff_shift),
+ int (*ref_finddir)(const uint16_t *img, int stride,
+ int32_t *var, int coeff_shift)) {
+ aom_usec_timer ref_timer;
+ aom_usec_timer timer;
+
+ aom_usec_timer_start(&ref_timer);
+ test_finddir(ref_finddir, ref_finddir);
+ aom_usec_timer_mark(&ref_timer);
+ int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
+
+ aom_usec_timer_start(&timer);
+ test_finddir(finddir, finddir);
+ aom_usec_timer_mark(&timer);
+ int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
+
+#if 0
+ std::cout << "[ ] C time = " << ref_elapsed_time / 1000
+ << " ms, SIMD time = " << elapsed_time / 1000 << " ms" << std::endl;
+#endif
+
+ EXPECT_GT(ref_elapsed_time, elapsed_time)
+ << "Error: CDEFFindDirSpeedTest, SIMD slower than C." << std::endl
+ << "C time: " << ref_elapsed_time << " us" << std::endl
+ << "SIMD time: " << elapsed_time << " us" << std::endl;
+}
+
+TEST_P(CDEFBlockTest, TestSIMDNoMismatch) {
+ test_cdef(bsize, 1, cdef, ref_cdef);
+}
+
+TEST_P(CDEFSpeedTest, DISABLED_TestSpeed) {
+ test_cdef_speed(bsize, 4, cdef, ref_cdef);
+}
+
+TEST_P(CDEFFindDirTest, TestSIMDNoMismatch) {
+ test_finddir(finddir, ref_finddir);
+}
+
+TEST_P(CDEFFindDirSpeedTest, DISABLED_TestSpeed) {
+ test_finddir_speed(finddir, ref_finddir);
+}
+
+using std::tr1::make_tuple;
+
+// VS compiling for 32 bit targets does not support vector types in
+// structs as arguments, which makes the v256 type of the intrinsics
+// hard to support, so optimizations for this target are disabled.
+#if defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, CDEFBlockTest,
+ ::testing::Values(
+ make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_4X4),
+ make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(SSE2, CDEFFindDirTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_sse2,
+ &cdef_find_dir_c)));
+#endif
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, CDEFBlockTest,
+ ::testing::Values(
+ make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_4X4),
+ make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(SSSE3, CDEFFindDirTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
+ &cdef_find_dir_c)));
+#endif
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, CDEFBlockTest,
+ ::testing::Values(make_tuple(&cdef_filter_block_sse4_1,
+ &cdef_filter_block_c, BLOCK_4X4),
+ make_tuple(&cdef_filter_block_sse4_1,
+ &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFFindDirTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
+ &cdef_find_dir_c)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, CDEFBlockTest,
+ ::testing::Values(
+ make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_4X4),
+ make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(AVX2, CDEFFindDirTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_avx2,
+ &cdef_find_dir_c)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+ NEON, CDEFBlockTest,
+ ::testing::Values(
+ make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_4X4),
+ make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(NEON, CDEFFindDirTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_neon,
+ &cdef_find_dir_c)));
+#endif
+
+// Test speed for all supported architectures
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, CDEFSpeedTest,
+ ::testing::Values(
+ make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_4X4),
+ make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(SSE2, CDEFFindDirSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_sse2,
+ &cdef_find_dir_c)));
+#endif
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, CDEFSpeedTest,
+ ::testing::Values(
+ make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_4X4),
+ make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(SSSE3, CDEFFindDirSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
+ &cdef_find_dir_c)));
+#endif
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, CDEFSpeedTest,
+ ::testing::Values(make_tuple(&cdef_filter_block_sse4_1,
+ &cdef_filter_block_c, BLOCK_4X4),
+ make_tuple(&cdef_filter_block_sse4_1,
+ &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFFindDirSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
+ &cdef_find_dir_c)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, CDEFSpeedTest,
+ ::testing::Values(
+ make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_4X4),
+ make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(AVX2, CDEFFindDirSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_avx2,
+ &cdef_find_dir_c)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+ NEON, CDEFSpeedTest,
+ ::testing::Values(
+ make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_4X4),
+ make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(NEON, CDEFFindDirSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_neon,
+ &cdef_find_dir_c)));
+#endif
+
+#endif  // defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)
+} // namespace
diff --git a/test/test.cmake b/test/test.cmake
index 43b975c..b035184 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -137,10 +137,16 @@
"${AOM_ROOT}/test/simd_cmp_impl.h")
if (CONFIG_CDEF)
- set(AOM_UNIT_TEST_COMMON_SOURCES
- ${AOM_UNIT_TEST_COMMON_SOURCES}
- "${AOM_ROOT}/test/clpf_test.cc"
- "${AOM_ROOT}/test/dering_test.cc")
+ if (CONFIG_CDEF_SINGLEPASS)
+ set(AOM_UNIT_TEST_COMMON_SOURCES
+ ${AOM_UNIT_TEST_COMMON_SOURCES}
+ "${AOM_ROOT}/test/cdef_test.cc")
+ else ()
+ set(AOM_UNIT_TEST_COMMON_SOURCES
+ ${AOM_UNIT_TEST_COMMON_SOURCES}
+ "${AOM_ROOT}/test/clpf_test.cc"
+ "${AOM_ROOT}/test/dering_test.cc")
+ endif ()
endif ()
# Omit 4-tap filter intra predictor test-- currently a 3-tap filter is in
diff --git a/test/test.mk b/test/test.mk
index 2f99ab7..9f60d57 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -135,8 +135,12 @@
LIBAOM_TEST_SRCS-$(CONFIG_ADAPT_SCAN) += scan_test.cc
LIBAOM_TEST_SRCS-yes += convolve_test.cc
LIBAOM_TEST_SRCS-yes += lpf_8_test.cc
+ifeq ($(CONFIG_CDEF_SINGLEPASS),yes)
+LIBAOM_TEST_SRCS-$(CONFIG_CDEF) += cdef_test.cc
+else
LIBAOM_TEST_SRCS-$(CONFIG_CDEF) += dering_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_CDEF) += clpf_test.cc
+endif
LIBAOM_TEST_SRCS-yes += simd_cmp_impl.h
LIBAOM_TEST_SRCS-$(HAVE_SSE2) += simd_cmp_sse2.cc
LIBAOM_TEST_SRCS-$(HAVE_SSSE3) += simd_cmp_ssse3.cc