Add SIMD support for CDEF dering for sse2/ssse3 and neon

The SSE4.1-only od_dering implementation is replaced by a generic
od_dering_simd.h written with the aom_simd v128 intrinsics and
instantiated for SSE2, SSSE3, SSE4.1 and NEON. The CDEF prototypes and
their specializations move from aom_dsp_rtcd_defs.pl to
av1_rtcd_defs.pl, and the encoder clpf_rdo SIMD sources are removed.

Change-Id: Ibaaed850ddceba9c3db542eaf4a1c623ce6b412b
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 856e393..079f45e 100644
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -851,22 +851,6 @@
add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
-if (aom_config("CONFIG_CDEF") eq "yes") {
- add_proto qw/void aom_clpf_block_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
- add_proto qw/void aom_clpf_hblock_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
- add_proto qw/void aom_clpf_block/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
- add_proto qw/void aom_clpf_hblock/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
- # VS compiling for 32 bit targets does not support vector types in
- # structs as arguments, which makes the v256 type of the intrinsics
- # hard to support, so optimizations for this target are disabled.
- if ($opts{config} !~ /libs-x86-win32-vs.*/) {
- specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
- specialize qw/aom_clpf_hblock_hbd sse2 ssse3 sse4_1 neon/;
- specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
- specialize qw/aom_clpf_hblock sse2 ssse3 sse4_1 neon/;
- }
-}
-
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
specialize qw/aom_highbd_lpf_vertical_16 sse2/;
diff --git a/av1/av1.cmake b/av1/av1.cmake
index f922d9d..06178b7 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -231,47 +231,32 @@
"${AOM_ROOT}/av1/common/cdef.c"
"${AOM_ROOT}/av1/common/cdef.h"
"${AOM_ROOT}/av1/common/od_dering.c"
- "${AOM_ROOT}/av1/common/od_dering.h")
+ "${AOM_ROOT}/av1/common/od_dering.h"
+ "${AOM_ROOT}/av1/common/od_dering_simd.h")
set(AOM_AV1_ENCODER_SOURCES
${AOM_AV1_ENCODER_SOURCES}
- "${AOM_ROOT}/av1/encoder/clpf_rdo.c"
- "${AOM_ROOT}/av1/encoder/clpf_rdo.h"
"${AOM_ROOT}/av1/encoder/pickcdef.c")
set(AOM_AV1_COMMON_SSE2_INTRIN
${AOM_AV1_COMMON_SSE2_INTRIN}
- "${AOM_ROOT}/av1/common/clpf_sse2.c")
+ "${AOM_ROOT}/av1/common/clpf_sse2.c"
+ "${AOM_ROOT}/av1/common/od_dering_sse2.c")
set(AOM_AV1_COMMON_SSSE3_INTRIN
${AOM_AV1_COMMON_SSSE3_INTRIN}
- "${AOM_ROOT}/av1/common/clpf_ssse3.c")
+ "${AOM_ROOT}/av1/common/clpf_ssse3.c"
+ "${AOM_ROOT}/av1/common/od_dering_ssse3.c")
set(AOM_AV1_COMMON_SSE4_1_INTRIN
${AOM_AV1_COMMON_SSE4_1_INTRIN}
- "${AOM_ROOT}/av1/common/clpf_sse4.c")
+ "${AOM_ROOT}/av1/common/clpf_sse4.c"
+ "${AOM_ROOT}/av1/common/od_dering_sse4.c")
set(AOM_AV1_COMMON_NEON_INTRIN
${AOM_AV1_COMMON_NEON_INTRIN}
- "${AOM_ROOT}/av1/common/clpf_neon.c")
-
- set(AOM_AV1_ENCODER_SSE2_INTRIN
- ${AOM_AV1_ENCODER_SSE2_INTRIN}
- "${AOM_ROOT}/av1/encoder/clpf_rdo_sse2.c")
-
- set(AOM_AV1_ENCODER_SSSE3_INTRIN
- ${AOM_AV1_ENCODER_SSSE3_INTRIN}
- "${AOM_ROOT}/av1/encoder/clpf_rdo_ssse3.c")
-
- set(AOM_AV1_ENCODER_SSE4_1_INTRIN
- ${AOM_AV1_ENCODER_SSE4_1_INTRIN}
- "${AOM_ROOT}/av1/encoder/clpf_rdo_sse4.c"
- "${AOM_ROOT}/av1/common/x86/od_dering_sse4.c"
- "${AOM_ROOT}/av1/common/x86/od_dering_sse4.h")
-
- set(AOM_AV1_ENCODER_NEON_INTRIN
- ${AOM_AV1_ENCODER_NEON_INTRIN}
- "${AOM_ROOT}/av1/encoder/clpf_rdo_neon.c")
+ "${AOM_ROOT}/av1/common/clpf_neon.c"
+ "${AOM_ROOT}/av1/common/od_dering_neon.c")
endif ()
if (CONFIG_EXT_INTER)
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 2d94845..fc311da 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -94,10 +94,13 @@
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/clpf_ssse3.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/clpf_sse4.c
AV1_COMMON_SRCS-$(HAVE_NEON) += common/clpf_neon.c
+AV1_COMMON_SRCS-$(HAVE_SSE2) += common/od_dering_sse2.c
+AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/od_dering_ssse3.c
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/od_dering_sse4.c
+AV1_COMMON_SRCS-$(HAVE_NEON) += common/od_dering_neon.c
AV1_COMMON_SRCS-yes += common/od_dering.c
AV1_COMMON_SRCS-yes += common/od_dering.h
-AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.c
-AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.h
+AV1_COMMON_SRCS-yes += common/od_dering_simd.h
AV1_COMMON_SRCS-yes += common/cdef.c
AV1_COMMON_SRCS-yes += common/cdef.h
endif
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index aa604c8..e8a3341 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -753,14 +753,25 @@
# Deringing Functions
if (aom_config("CONFIG_CDEF") eq "yes") {
+ add_proto qw/void aom_clpf_block_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
+ add_proto qw/void aom_clpf_hblock_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
+ add_proto qw/void aom_clpf_block/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
+ add_proto qw/void aom_clpf_hblock/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
add_proto qw/int od_dir_find8/, "const od_dering_in *img, int stride, int32_t *var, int coeff_shift";
- specialize qw/od_dir_find8 sse4_1/;
-
add_proto qw/int od_filter_dering_direction_4x4/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir";
- specialize qw/od_filter_dering_direction_4x4 sse4_1/;
-
add_proto qw/int od_filter_dering_direction_8x8/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir";
- specialize qw/od_filter_dering_direction_8x8 sse4_1/;
+ # VS compiling for 32 bit targets does not support vector types in
+ # structs as arguments, which makes the v256 type of the intrinsics
+ # hard to support, so optimizations for this target are disabled.
+ if ($opts{config} !~ /libs-x86-win32-vs.*/) {
+ specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
+ specialize qw/aom_clpf_hblock_hbd sse2 ssse3 sse4_1 neon/;
+ specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
+ specialize qw/aom_clpf_hblock sse2 ssse3 sse4_1 neon/;
+ specialize qw/od_dir_find8 sse2 ssse3 sse4_1 neon/;
+ specialize qw/od_filter_dering_direction_4x4 sse2 ssse3 sse4_1 neon/;
+ specialize qw/od_filter_dering_direction_8x8 sse2 ssse3 sse4_1 neon/;
+ }
}
# PVQ Functions
diff --git a/av1/common/clpf.c b/av1/common/clpf.c
index 252226b..abd54fb 100644
--- a/av1/common/clpf.c
+++ b/av1/common/clpf.c
@@ -10,7 +10,7 @@
*/
#include "av1/common/clpf.h"
-#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
#include "aom/aom_image.h"
#include "aom_dsp/aom_dsp_common.h"
diff --git a/av1/common/clpf_simd.h b/av1/common/clpf_simd.h
index 0ae35d8..4a0b220 100644
--- a/av1/common/clpf_simd.h
+++ b/av1/common/clpf_simd.h
@@ -9,7 +9,7 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
#include "aom_ports/mem.h"
#include "aom_ports/bitops.h"
#include "av1/common/clpf_simd_kernel.h"
diff --git a/av1/common/od_dering.h b/av1/common/od_dering.h
index 062614f..7d173a0 100644
--- a/av1/common/od_dering.h
+++ b/av1/common/od_dering.h
@@ -56,10 +56,4 @@
dering_list *dlist, int dering_count, int level,
int clpf_strength, int clpf_damping, int coeff_shift,
int skip_dering, int hbd);
-int od_filter_dering_direction_4x4_c(uint16_t *y, int ystride,
- const uint16_t *in, int threshold,
- int dir);
-int od_filter_dering_direction_8x8_c(uint16_t *y, int ystride,
- const uint16_t *in, int threshold,
- int dir);
#endif
diff --git a/av1/common/x86/od_dering_sse4.h b/av1/common/od_dering_neon.c
similarity index 76%
rename from av1/common/x86/od_dering_sse4.h
rename to av1/common/od_dering_neon.c
index 950ec5f..9944105 100644
--- a/av1/common/x86/od_dering_sse4.h
+++ b/av1/common/od_dering_neon.c
@@ -8,7 +8,7 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include "av1/common/od_dering.h"
-#ifndef AOM_COMMON_OD_DERING_X86_SSE4_H_
-#define AOM_COMMON_OD_DERING_X86_SSE4_H_
-#endif // AOM_COMMON_OD_DERING_X86_SSE4_H_
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_neon
+#include "./od_dering_simd.h"
diff --git a/av1/common/od_dering_simd.h b/av1/common/od_dering_simd.h
new file mode 100644
index 0000000..6cba080
--- /dev/null
+++ b/av1/common/od_dering_simd.h
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "./od_dering.h"
+
+/* partial A is a 16-bit vector of the form:
+ [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
+ [0 y1 y2 y3 y4 y5 y6 y7].
+ This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
+   (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
+ and const2. */
+static INLINE v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1,
+ v128 const2) {
+ v128 tmp;
+ /* Reverse partial B. */
+ partialb = v128_shuffle_8(
+ partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c));
+ /* Interleave the x and y values of identical indices and pair x8 with 0. */
+ tmp = partiala;
+ partiala = v128_ziplo_16(partialb, partiala);
+ partialb = v128_ziphi_16(partialb, tmp);
+ /* Square and add the corresponding x and y values. */
+ partiala = v128_madd_s16(partiala, partiala);
+ partialb = v128_madd_s16(partialb, partialb);
+ /* Multiply by constant. */
+ partiala = v128_mullo_s32(partiala, const1);
+ partialb = v128_mullo_s32(partialb, const2);
+ /* Sum all results. */
+ partiala = v128_add_32(partiala, partialb);
+ return partiala;
+}
+
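+/* Transposes four vectors of four 32-bit sums and adds them, so that lane i
+   of the result is the horizontal sum of the original xi. */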
+static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) {
+ v128 t0, t1, t2, t3;
+ t0 = v128_ziplo_32(x1, x0);
+ t1 = v128_ziplo_32(x3, x2);
+ t2 = v128_ziphi_32(x1, x0);
+ t3 = v128_ziphi_32(x3, x2);
+ x0 = v128_ziplo_64(t1, t0);
+ x1 = v128_ziphi_64(t1, t0);
+ x2 = v128_ziplo_64(t3, t2);
+ x3 = v128_ziphi_64(t3, t2);
+ return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3));
+}
+
+/* Computes cost for directions 4, 5, 6 and 7. We can call this function again
+ to compute the remaining directions. */
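+/* Each direction keeps its partial sums as an "a"/"b" register pair holding
+   up to 15 16-bit sums; the byte shifts below align every input line with
+   the lanes it contributes to for that direction. */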
+static INLINE void compute_directions(v128 lines[8], int32_t tmp_cost1[4]) {
+ v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
+ v128 partial6;
+ v128 tmp;
+ /* Partial sums for lines 0 and 1. */
+ partial4a = v128_shl_n_byte(lines[0], 14);
+ partial4b = v128_shr_n_byte(lines[0], 2);
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4));
+ tmp = v128_add_16(lines[0], lines[1]);
+ partial5a = v128_shl_n_byte(tmp, 10);
+ partial5b = v128_shr_n_byte(tmp, 6);
+ partial7a = v128_shl_n_byte(tmp, 4);
+ partial7b = v128_shr_n_byte(tmp, 12);
+ partial6 = tmp;
+
+ /* Partial sums for lines 2 and 3. */
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6));
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8));
+ tmp = v128_add_16(lines[2], lines[3]);
+ partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8));
+ partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8));
+ partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6));
+ partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10));
+ partial6 = v128_add_16(partial6, tmp);
+
+ /* Partial sums for lines 4 and 5. */
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10));
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12));
+ tmp = v128_add_16(lines[4], lines[5]);
+ partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6));
+ partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10));
+ partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8));
+ partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8));
+ partial6 = v128_add_16(partial6, tmp);
+
+ /* Partial sums for lines 6 and 7. */
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14));
+ partial4a = v128_add_16(partial4a, lines[7]);
+ tmp = v128_add_16(lines[6], lines[7]);
+ partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4));
+ partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12));
+ partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10));
+ partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6));
+ partial6 = v128_add_16(partial6, tmp);
+
+ /* Compute costs in terms of partial sums. */
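+  /* The constants are 840/N (840 = LCM of 1..8), where N is the number of
+     pixels contributing to each partial sum, so every squared sum is
+     normalized by its pixel count on a common scale. */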
+ partial4a =
+ fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840),
+ v128_from_32(105, 120, 140, 168));
+ partial7a =
+ fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0),
+ v128_from_32(105, 105, 105, 140));
+ partial5a =
+ fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0),
+ v128_from_32(105, 105, 105, 140));
+ partial6 = v128_madd_s16(partial6, partial6);
+ partial6 = v128_mullo_s32(partial6, v128_dup_32(105));
+
+ partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
+ v128_store_unaligned(tmp_cost1, partial4a);
+}
+
+/* transpose and reverse the order of the lines -- equivalent to a 90-degree
+ counter-clockwise rotation of the pixels. */
+static INLINE void array_reverse_transpose_8x8(v128 *in, v128 *res) {
+ const v128 tr0_0 = v128_ziplo_16(in[1], in[0]);
+ const v128 tr0_1 = v128_ziplo_16(in[3], in[2]);
+ const v128 tr0_2 = v128_ziphi_16(in[1], in[0]);
+ const v128 tr0_3 = v128_ziphi_16(in[3], in[2]);
+ const v128 tr0_4 = v128_ziplo_16(in[5], in[4]);
+ const v128 tr0_5 = v128_ziplo_16(in[7], in[6]);
+ const v128 tr0_6 = v128_ziphi_16(in[5], in[4]);
+ const v128 tr0_7 = v128_ziphi_16(in[7], in[6]);
+
+ const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0);
+ const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4);
+ const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0);
+ const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4);
+ const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2);
+ const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6);
+ const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2);
+ const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6);
+
+ res[7] = v128_ziplo_64(tr1_1, tr1_0);
+ res[6] = v128_ziphi_64(tr1_1, tr1_0);
+ res[5] = v128_ziplo_64(tr1_3, tr1_2);
+ res[4] = v128_ziphi_64(tr1_3, tr1_2);
+ res[3] = v128_ziplo_64(tr1_5, tr1_4);
+ res[2] = v128_ziphi_64(tr1_5, tr1_4);
+ res[1] = v128_ziplo_64(tr1_7, tr1_6);
+ res[0] = v128_ziphi_64(tr1_7, tr1_6);
+}
+
+int SIMD_FUNC(od_dir_find8)(const od_dering_in *img, int stride, int32_t *var,
+ int coeff_shift) {
+ int i;
+ int32_t cost[8];
+ int32_t best_cost = 0;
+ int best_dir = 0;
+ v128 lines[8];
+ for (i = 0; i < 8; i++) {
+ lines[i] = v128_load_unaligned(&img[i * stride]);
+ lines[i] =
+ v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128));
+ }
+
+ /* Compute "mostly vertical" directions. */
+ compute_directions(lines, cost + 4);
+
+ array_reverse_transpose_8x8(lines, lines);
+
+ /* Compute "mostly horizontal" directions. */
+ compute_directions(lines, cost);
+
+ for (i = 0; i < 8; i++) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ best_dir = i;
+ }
+ }
+
+ /* Difference between the optimal variance and the variance along the
+ orthogonal direction. Again, the sum(x^2) terms cancel out. */
+ *var = best_cost - cost[(best_dir + 4) & 7];
+ /* We'd normally divide by 840, but dividing by 1024 is close enough
+ for what we're going to do with this. */
+ *var >>= 10;
+ return best_dir;
+}
+
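+/* Returns a per-lane all-ones mask where |in| < threshold, used below to
+   discard taps whose difference exceeds the threshold. */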
+static INLINE v128 od_cmplt_abs_epi16(v128 in, v128 threshold) {
+ return v128_cmplt_s16(v128_abs_s16(in), threshold);
+}
+
+int SIMD_FUNC(od_filter_dering_direction_4x4)(uint16_t *y, int ystride,
+ const uint16_t *in, int threshold,
+ int dir) {
+ int i;
+ v128 sum;
+ v128 p;
+ v128 cmp;
+ v128 row;
+ v128 res;
+ v128 tmp;
+ v128 thresh;
+ v128 total_abs;
+ int off1, off2;
+ off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
+ off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
+ total_abs = v128_zero();
+ thresh = v128_dup_16(threshold);
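+  /* The filter taps for off1 and off2 are 4 and 1, applied below as a shift
+     left by 2 and a pass-through. */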
+ for (i = 0; i < 4; i += 2) {
+ sum = v128_zero();
+ row = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE]),
+ v64_load_aligned(&in[i * OD_FILT_BSTRIDE]));
+
+ /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
+ tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE + off1]),
+ v64_load_aligned(&in[i * OD_FILT_BSTRIDE + off1]));
+ p = v128_sub_16(tmp, row);
+ /*if (abs(p) < thresh) sum += taps[k]*p*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = v128_shl_n_16(p, 2);
+ p = v128_and(p, cmp);
+ sum = v128_add_16(sum, p);
+ /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
+ tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE - off1]),
+ v64_load_aligned(&in[i * OD_FILT_BSTRIDE - off1]));
+ p = v128_sub_16(tmp, row);
+ /*if (abs(p) < thresh) sum += taps[k]*p1*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = v128_shl_n_16(p, 2);
+ p = v128_and(p, cmp);
+ sum = v128_add_16(sum, p);
+
+ /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
+ tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE + off2]),
+ v64_load_aligned(&in[i * OD_FILT_BSTRIDE + off2]));
+ p = v128_sub_16(tmp, row);
+ /*if (abs(p) < thresh) sum += taps[k]*p*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = v128_and(p, cmp);
+ sum = v128_add_16(sum, p);
+ /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
+ tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE - off2]),
+ v64_load_aligned(&in[i * OD_FILT_BSTRIDE - off2]));
+ p = v128_sub_16(tmp, row);
+ /*if (abs(p) < thresh) sum += taps[k]*p1*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = v128_and(p, cmp);
+ sum = v128_add_16(sum, p);
+
+ /*res = row + ((sum + 8) >> 4)*/
+ res = v128_add_16(sum, v128_dup_16(8));
+ res = v128_shr_n_s16(res, 4);
+ total_abs = v128_add_16(total_abs, v128_abs_s16(res));
+ res = v128_add_16(row, res);
+ v64_store_aligned(&y[i * ystride], v128_low_v64(res));
+ v64_store_aligned(&y[(i + 1) * ystride], v128_high_v64(res));
+ }
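+  /* v128_dotp_s16 with an all-ones vector is a horizontal sum of the
+     per-pixel |delta| values accumulated above. */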
+ return (v128_dotp_s16(total_abs, v128_dup_16(1)) + 2) >> 2;
+}
+
+int SIMD_FUNC(od_filter_dering_direction_8x8)(uint16_t *y, int ystride,
+ const uint16_t *in, int threshold,
+ int dir) {
+ int i;
+ v128 sum;
+ v128 p;
+ v128 cmp;
+ v128 row;
+ v128 res;
+ v128 thresh;
+ v128 total_abs;
+ int off1, off2, off3;
+ off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
+ off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
+ off3 = OD_DIRECTION_OFFSETS_TABLE[dir][2];
+ total_abs = v128_zero();
+ thresh = v128_dup_16(threshold);
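+  /* The filter taps for off1, off2 and off3 are 3, 2 and 1, applied below as
+     p + (p << 1), p << 1 and p. */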
+ for (i = 0; i < 8; i++) {
+ sum = v128_zero();
+ row = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE]);
+
+ /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
+ p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off1]), row);
+ /*if (abs(p) < thresh) sum += taps[k]*p*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = v128_add_16(p, v128_shl_n_16(p, 1));
+ p = v128_and(p, cmp);
+ sum = v128_add_16(sum, p);
+
+ /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
+ p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off1]), row);
+ /*if (abs(p) < thresh) sum += taps[k]*p1*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = v128_add_16(p, v128_shl_n_16(p, 1));
+ p = v128_and(p, cmp);
+ sum = v128_add_16(sum, p);
+
+ /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
+ p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off2]), row);
+ /*if (abs(p) < thresh) sum += taps[k]*p*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = v128_shl_n_16(p, 1);
+ p = v128_and(p, cmp);
+ sum = v128_add_16(sum, p);
+
+ /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
+ p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off2]), row);
+ /*if (abs(p) < thresh) sum += taps[k]*p1*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = v128_shl_n_16(p, 1);
+ p = v128_and(p, cmp);
+ sum = v128_add_16(sum, p);
+
+ /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
+ p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off3]), row);
+ /*if (abs(p) < thresh) sum += taps[k]*p*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = v128_and(p, cmp);
+ sum = v128_add_16(sum, p);
+
+ /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
+ p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off3]), row);
+ /*if (abs(p) < thresh) sum += taps[k]*p1*/
+ cmp = od_cmplt_abs_epi16(p, thresh);
+ p = v128_and(p, cmp);
+ sum = v128_add_16(sum, p);
+
+ /*res = row + ((sum + 8) >> 4)*/
+ res = v128_add_16(sum, v128_dup_16(8));
+ res = v128_shr_n_s16(res, 4);
+ total_abs = v128_add_16(total_abs, v128_abs_s16(res));
+ res = v128_add_16(row, res);
+ v128_store_unaligned(&y[i * ystride], res);
+ }
+ return (v128_dotp_s16(total_abs, v128_dup_16(1)) + 8) >> 4;
+}
diff --git a/av1/common/x86/od_dering_sse4.h b/av1/common/od_dering_sse2.c
similarity index 76%
copy from av1/common/x86/od_dering_sse4.h
copy to av1/common/od_dering_sse2.c
index 950ec5f..8a2a62f 100644
--- a/av1/common/x86/od_dering_sse4.h
+++ b/av1/common/od_dering_sse2.c
@@ -8,7 +8,7 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include "av1/common/od_dering.h"
-#ifndef AOM_COMMON_OD_DERING_X86_SSE4_H_
-#define AOM_COMMON_OD_DERING_X86_SSE4_H_
-#endif // AOM_COMMON_OD_DERING_X86_SSE4_H_
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_sse2
+#include "./od_dering_simd.h"
diff --git a/av1/common/x86/od_dering_sse4.h b/av1/common/od_dering_sse4.c
similarity index 76%
copy from av1/common/x86/od_dering_sse4.h
copy to av1/common/od_dering_sse4.c
index 950ec5f..0769db9 100644
--- a/av1/common/x86/od_dering_sse4.h
+++ b/av1/common/od_dering_sse4.c
@@ -8,7 +8,7 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include "av1/common/od_dering.h"
-#ifndef AOM_COMMON_OD_DERING_X86_SSE4_H_
-#define AOM_COMMON_OD_DERING_X86_SSE4_H_
-#endif // AOM_COMMON_OD_DERING_X86_SSE4_H_
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_sse4_1
+#include "./od_dering_simd.h"
diff --git a/av1/common/x86/od_dering_sse4.h b/av1/common/od_dering_ssse3.c
similarity index 76%
copy from av1/common/x86/od_dering_sse4.h
copy to av1/common/od_dering_ssse3.c
index 950ec5f..99df62b 100644
--- a/av1/common/x86/od_dering_sse4.h
+++ b/av1/common/od_dering_ssse3.c
@@ -8,7 +8,7 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include "av1/common/od_dering.h"
-#ifndef AOM_COMMON_OD_DERING_X86_SSE4_H_
-#define AOM_COMMON_OD_DERING_X86_SSE4_H_
-#endif // AOM_COMMON_OD_DERING_X86_SSE4_H_
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_ssse3
+#include "./od_dering_simd.h"
diff --git a/av1/common/x86/od_dering_sse4.c b/av1/common/x86/od_dering_sse4.c
deleted file mode 100644
index 58b601f..0000000
--- a/av1/common/x86/od_dering_sse4.c
+++ /dev/null
@@ -1,387 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <smmintrin.h>
-#include <emmintrin.h>
-#include <tmmintrin.h>
-
-#include "./av1_rtcd.h"
-#include "av1/common/x86/od_dering_sse4.h"
-
-/* partial A is a 16-bit vector of the form:
- [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
- [0 y1 y2 y3 y4 y5 y6 y7].
- This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
- (x7^2+y2^7)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
- and const2. */
-static INLINE __m128i fold_mul_and_sum(__m128i partiala, __m128i partialb,
- __m128i const1, __m128i const2) {
- __m128i tmp;
- /* Reverse partial B. */
- partialb = _mm_shuffle_epi8(
- partialb,
- _mm_set_epi8(15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12));
- /* Interleave the x and y values of identical indices and pair x8 with 0. */
- tmp = partiala;
- partiala = _mm_unpacklo_epi16(partiala, partialb);
- partialb = _mm_unpackhi_epi16(tmp, partialb);
- /* Square and add the corresponding x and y values. */
- partiala = _mm_madd_epi16(partiala, partiala);
- partialb = _mm_madd_epi16(partialb, partialb);
- /* Multiply by constant. */
- partiala = _mm_mullo_epi32(partiala, const1);
- partialb = _mm_mullo_epi32(partialb, const2);
- /* Sum all results. */
- partiala = _mm_add_epi32(partiala, partialb);
- return partiala;
-}
-
-static INLINE __m128i hsum4(__m128i x0, __m128i x1, __m128i x2, __m128i x3) {
- __m128i t0, t1, t2, t3;
- t0 = _mm_unpacklo_epi32(x0, x1);
- t1 = _mm_unpacklo_epi32(x2, x3);
- t2 = _mm_unpackhi_epi32(x0, x1);
- t3 = _mm_unpackhi_epi32(x2, x3);
- x0 = _mm_unpacklo_epi64(t0, t1);
- x1 = _mm_unpackhi_epi64(t0, t1);
- x2 = _mm_unpacklo_epi64(t2, t3);
- x3 = _mm_unpackhi_epi64(t2, t3);
- return _mm_add_epi32(_mm_add_epi32(x0, x1), _mm_add_epi32(x2, x3));
-}
-
-/* Horizontal sum of 8x16-bit unsigned values. */
-static INLINE int32_t hsum_epi16(__m128i a) {
- a = _mm_madd_epi16(a, _mm_set1_epi16(1));
- a = _mm_hadd_epi32(a, a);
- a = _mm_hadd_epi32(a, a);
- return _mm_cvtsi128_si32(a);
-}
-
-/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
- to compute the remaining directions. */
-static INLINE __m128i compute_directions(__m128i lines[8],
- int32_t tmp_cost1[4]) {
- __m128i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
- __m128i partial6;
- __m128i tmp;
- /* Partial sums for lines 0 and 1. */
- partial4a = _mm_slli_si128(lines[0], 14);
- partial4b = _mm_srli_si128(lines[0], 2);
- partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[1], 12));
- partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[1], 4));
- tmp = _mm_add_epi16(lines[0], lines[1]);
- partial5a = _mm_slli_si128(tmp, 10);
- partial5b = _mm_srli_si128(tmp, 6);
- partial7a = _mm_slli_si128(tmp, 4);
- partial7b = _mm_srli_si128(tmp, 12);
- partial6 = tmp;
-
- /* Partial sums for lines 2 and 3. */
- partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[2], 10));
- partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[2], 6));
- partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[3], 8));
- partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[3], 8));
- tmp = _mm_add_epi16(lines[2], lines[3]);
- partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 8));
- partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 8));
- partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 6));
- partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 10));
- partial6 = _mm_add_epi16(partial6, tmp);
-
- /* Partial sums for lines 4 and 5. */
- partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[4], 6));
- partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[4], 10));
- partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[5], 4));
- partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[5], 12));
- tmp = _mm_add_epi16(lines[4], lines[5]);
- partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 6));
- partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 10));
- partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 8));
- partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 8));
- partial6 = _mm_add_epi16(partial6, tmp);
-
- /* Partial sums for lines 6 and 7. */
- partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[6], 2));
- partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[6], 14));
- partial4a = _mm_add_epi16(partial4a, lines[7]);
- tmp = _mm_add_epi16(lines[6], lines[7]);
- partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 4));
- partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 12));
- partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 10));
- partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 6));
- partial6 = _mm_add_epi16(partial6, tmp);
-
- /* Compute costs in terms of partial sums. */
- partial4a =
- fold_mul_and_sum(partial4a, partial4b, _mm_set_epi32(210, 280, 420, 840),
- _mm_set_epi32(105, 120, 140, 168));
- partial7a =
- fold_mul_and_sum(partial7a, partial7b, _mm_set_epi32(210, 420, 0, 0),
- _mm_set_epi32(105, 105, 105, 140));
- partial5a =
- fold_mul_and_sum(partial5a, partial5b, _mm_set_epi32(210, 420, 0, 0),
- _mm_set_epi32(105, 105, 105, 140));
- partial6 = _mm_madd_epi16(partial6, partial6);
- partial6 = _mm_mullo_epi32(partial6, _mm_set1_epi32(105));
-
- partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
- _mm_storeu_si128((__m128i *)tmp_cost1, partial4a);
- return partial4a;
-}
-
-/* transpose and reverse the order of the lines -- equivalent to a 90-degree
- counter-clockwise rotation of the pixels. */
-static INLINE void array_reverse_transpose_8x8(__m128i *in, __m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
- const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
-
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-
- res[7] = _mm_unpacklo_epi64(tr1_0, tr1_1);
- res[6] = _mm_unpackhi_epi64(tr1_0, tr1_1);
- res[5] = _mm_unpacklo_epi64(tr1_2, tr1_3);
- res[4] = _mm_unpackhi_epi64(tr1_2, tr1_3);
- res[3] = _mm_unpacklo_epi64(tr1_4, tr1_5);
- res[2] = _mm_unpackhi_epi64(tr1_4, tr1_5);
- res[1] = _mm_unpacklo_epi64(tr1_6, tr1_7);
- res[0] = _mm_unpackhi_epi64(tr1_6, tr1_7);
-}
-
-int od_dir_find8_sse4_1(const od_dering_in *img, int stride, int32_t *var,
- int coeff_shift) {
- int i;
- int32_t cost[8];
- int32_t best_cost = 0;
- int best_dir = 0;
- __m128i lines[8];
- __m128i dir03, dir47;
- __m128i max;
- for (i = 0; i < 8; i++) {
- lines[i] = _mm_loadu_si128((__m128i *)&img[i * stride]);
- lines[i] = _mm_sub_epi16(_mm_srai_epi16(lines[i], coeff_shift),
- _mm_set1_epi16(128));
- }
-
- /* Compute "mostly vertical" directions. */
- dir47 = compute_directions(lines, cost + 4);
-
- array_reverse_transpose_8x8(lines, lines);
-
- /* Compute "mostly horizontal" directions. */
- dir03 = compute_directions(lines, cost);
-
-#if 1
- max = _mm_max_epi32(dir03, dir47);
- max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(1, 0, 3, 2)));
- max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(2, 3, 0, 1)));
- dir03 = _mm_and_si128(_mm_cmpeq_epi32(max, dir03),
- _mm_setr_epi32(-1, -2, -3, -4));
- dir47 = _mm_and_si128(_mm_cmpeq_epi32(max, dir47),
- _mm_setr_epi32(-5, -6, -7, -8));
- dir03 = _mm_max_epu32(dir03, dir47);
- dir03 = _mm_max_epu32(dir03, _mm_unpackhi_epi64(dir03, dir03));
- dir03 =
- _mm_max_epu32(dir03, _mm_shufflelo_epi16(dir03, _MM_SHUFFLE(1, 0, 3, 2)));
- dir03 = _mm_xor_si128(dir03, _mm_set1_epi32(0xFFFFFFFF));
-
- best_dir = _mm_cvtsi128_si32(dir03);
- best_cost = _mm_cvtsi128_si32(max);
-#else
- for (i = 0; i < 8; i++) {
- if (cost[i] > best_cost) {
- best_cost = cost[i];
- best_dir = i;
- }
- }
-#endif
- /* Difference between the optimal variance and the variance along the
- orthogonal direction. Again, the sum(x^2) terms cancel out. */
- *var = best_cost - cost[(best_dir + 4) & 7];
- /* We'd normally divide by 840, but dividing by 1024 is close enough
- for what we're going to do with this. */
- *var >>= 10;
- return best_dir;
-}
-
-static INLINE __m128i od_cmplt_abs_epi16(__m128i in, __m128i threshold) {
- return _mm_cmplt_epi16(_mm_abs_epi16(in), threshold);
-}
-
-int od_filter_dering_direction_4x4_sse4_1(uint16_t *y, int ystride,
- const uint16_t *in, int threshold,
- int dir) {
- int i;
- __m128i sum;
- __m128i p;
- __m128i cmp;
- __m128i row;
- __m128i res;
- __m128i tmp;
- __m128i thresh;
- __m128i total_abs;
- int off1, off2;
- off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
- off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
- total_abs = _mm_setzero_si128();
- thresh = _mm_set1_epi16(threshold);
- for (i = 0; i < 4; i += 2) {
- sum = _mm_set1_epi16(0);
- row = _mm_unpacklo_epi64(
- _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE]),
- _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE]));
-
- /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
- tmp = _mm_unpacklo_epi64(
- _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + off1]),
- _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + off1]));
- p = _mm_sub_epi16(tmp, row);
- /*if (abs(p) < thresh) sum += taps[k]*p*/
- cmp = od_cmplt_abs_epi16(p, thresh);
- p = _mm_slli_epi16(p, 2);
- p = _mm_and_si128(p, cmp);
- sum = _mm_add_epi16(sum, p);
- /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
- tmp = _mm_unpacklo_epi64(
- _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - off1]),
- _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - off1]));
- p = _mm_sub_epi16(tmp, row);
- /*if (abs(p) < thresh) sum += taps[k]*p1*/
- cmp = od_cmplt_abs_epi16(p, thresh);
- p = _mm_slli_epi16(p, 2);
- p = _mm_and_si128(p, cmp);
- sum = _mm_add_epi16(sum, p);
-
- /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
- tmp = _mm_unpacklo_epi64(
- _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + off2]),
- _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + off2]));
- p = _mm_sub_epi16(tmp, row);
- /*if (abs(p) < thresh) sum += taps[k]*p*/
- cmp = od_cmplt_abs_epi16(p, thresh);
- p = _mm_and_si128(p, cmp);
- sum = _mm_add_epi16(sum, p);
- /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
- tmp = _mm_unpacklo_epi64(
- _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - off2]),
- _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - off2]));
- p = _mm_sub_epi16(tmp, row);
- /*if (abs(p) < thresh) sum += taps[k]*p1*/
- cmp = od_cmplt_abs_epi16(p, thresh);
- p = _mm_and_si128(p, cmp);
- sum = _mm_add_epi16(sum, p);
-
- /*res = row + ((sum + 8) >> 4)*/
- res = _mm_add_epi16(sum, _mm_set1_epi16(8));
- res = _mm_srai_epi16(res, 4);
- total_abs = _mm_add_epi16(total_abs, _mm_abs_epi16(res));
- res = _mm_add_epi16(row, res);
- _mm_storel_epi64((__m128i *)&y[i * ystride], res);
- _mm_storel_epi64((__m128i *)&y[(i + 1) * ystride],
- _mm_unpackhi_epi64(res, res));
- }
- return (hsum_epi16(total_abs) + 2) >> 2;
-}
-
-int od_filter_dering_direction_8x8_sse4_1(uint16_t *y, int ystride,
- const uint16_t *in, int threshold,
- int dir) {
- int i;
- __m128i sum;
- __m128i p;
- __m128i cmp;
- __m128i row;
- __m128i res;
- __m128i thresh;
- __m128i total_abs;
- int off1, off2, off3;
- off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
- off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
- off3 = OD_DIRECTION_OFFSETS_TABLE[dir][2];
- total_abs = _mm_setzero_si128();
- thresh = _mm_set1_epi16(threshold);
- for (i = 0; i < 8; i++) {
- sum = _mm_set1_epi16(0);
- row = _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE]);
-
- /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
- p = _mm_sub_epi16(
- _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off1]), row);
- /*if (abs(p) < thresh) sum += taps[k]*p*/
- cmp = od_cmplt_abs_epi16(p, thresh);
- p = _mm_add_epi16(p, _mm_slli_epi16(p, 1));
- p = _mm_and_si128(p, cmp);
- sum = _mm_add_epi16(sum, p);
-
- /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
- p = _mm_sub_epi16(
- _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off1]), row);
- /*if (abs(p) < thresh) sum += taps[k]*p1*/
- cmp = od_cmplt_abs_epi16(p, thresh);
- p = _mm_add_epi16(p, _mm_slli_epi16(p, 1));
- p = _mm_and_si128(p, cmp);
- sum = _mm_add_epi16(sum, p);
-
- /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
- p = _mm_sub_epi16(
- _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off2]), row);
- /*if (abs(p) < thresh) sum += taps[k]*p*/
- cmp = od_cmplt_abs_epi16(p, thresh);
- p = _mm_slli_epi16(p, 1);
- p = _mm_and_si128(p, cmp);
- sum = _mm_add_epi16(sum, p);
-
- /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
- p = _mm_sub_epi16(
- _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off2]), row);
- /*if (abs(p) < thresh) sum += taps[k]*p1*/
- cmp = od_cmplt_abs_epi16(p, thresh);
- p = _mm_slli_epi16(p, 1);
- p = _mm_and_si128(p, cmp);
- sum = _mm_add_epi16(sum, p);
-
- /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
- p = _mm_sub_epi16(
- _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off3]), row);
- /*if (abs(p) < thresh) sum += taps[k]*p*/
- cmp = od_cmplt_abs_epi16(p, thresh);
- p = _mm_and_si128(p, cmp);
- sum = _mm_add_epi16(sum, p);
-
- /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
- p = _mm_sub_epi16(
- _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off3]), row);
- /*if (abs(p) < thresh) sum += taps[k]*p1*/
- cmp = od_cmplt_abs_epi16(p, thresh);
- p = _mm_and_si128(p, cmp);
- sum = _mm_add_epi16(sum, p);
-
- /*res = row + ((sum + 8) >> 4)*/
- res = _mm_add_epi16(sum, _mm_set1_epi16(8));
- res = _mm_srai_epi16(res, 4);
- total_abs = _mm_add_epi16(total_abs, _mm_abs_epi16(res));
- res = _mm_add_epi16(row, res);
- _mm_storeu_si128((__m128i *)&y[i * ystride], res);
- }
- return (hsum_epi16(total_abs) + 8) >> 4;
-}
diff --git a/av1/encoder/clpf_rdo_simd.h b/av1/encoder/clpf_rdo_simd.h
deleted file mode 100644
index 5f61997..0000000
--- a/av1/encoder/clpf_rdo_simd.h
+++ /dev/null
@@ -1,263 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/aom_simd.h"
-#include "aom_ports/mem.h"
-#include "aom_ports/bitops.h"
-#include "av1/common/clpf_simd_kernel.h"
-
-SIMD_INLINE void clip_sides(v128 *c, v128 *d, v128 *e, v128 *f, int left,
- int right) {
- DECLARE_ALIGNED(16, static const uint64_t,
- c_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
- DECLARE_ALIGNED(16, static const uint64_t,
- d_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
- DECLARE_ALIGNED(16, static const uint64_t,
- e_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
- DECLARE_ALIGNED(16, static const uint64_t,
- f_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };
-
- if (!left) { // Left clipping
- *c = v128_shuffle_8(*c, v128_load_aligned(c_shuff));
- *d = v128_shuffle_8(*d, v128_load_aligned(d_shuff));
- }
- if (!right) { // Right clipping
- *e = v128_shuffle_8(*e, v128_load_aligned(e_shuff));
- *f = v128_shuffle_8(*f, v128_load_aligned(f_shuff));
- }
-}
-
-SIMD_INLINE void read_two_lines(const uint8_t *rec, const uint8_t *org,
- int rstride, int ostride, int x0, int y0,
- int bottom, int right, int y, v128 *o, v128 *r,
- v128 *a, v128 *b, v128 *c, v128 *d, v128 *e,
- v128 *f, v128 *g, v128 *h) {
- const v64 k1 = v64_load_aligned(org);
- const v64 k2 = v64_load_aligned(org + ostride);
- const v64 l1 = v64_load_aligned(rec);
- const v64 l2 = v64_load_aligned(rec + rstride);
- const v64 l3 = v64_load_aligned(rec - (y != -y0) * rstride);
- const v64 l4 = v64_load_aligned(rec + ((y != bottom) + 1) * rstride);
- *o = v128_from_v64(k1, k2);
- *r = v128_from_v64(l1, l2);
- *a = v128_from_v64(v64_load_aligned(rec - 2 * (y != -y0) * rstride), l3);
- *b = v128_from_v64(l3, l1);
- *g = v128_from_v64(l2, l4);
- *h = v128_from_v64(l4,
- v64_load_aligned(rec + (2 * (y != bottom) + 1) * rstride));
- *c = v128_from_v64(v64_load_unaligned(rec - 2 * !!x0),
- v64_load_unaligned(rec - 2 * !!x0 + rstride));
- *d = v128_from_v64(v64_load_unaligned(rec - !!x0),
- v64_load_unaligned(rec - !!x0 + rstride));
- *e = v128_from_v64(v64_load_unaligned(rec + !!right),
- v64_load_unaligned(rec + !!right + rstride));
- *f = v128_from_v64(v64_load_unaligned(rec + 2 * !!right),
- v64_load_unaligned(rec + 2 * !!right + rstride));
- clip_sides(c, d, e, f, x0, right);
-}
-
-void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org,
- int rstride, int ostride, int x0, int y0,
- int width, int height, int *sum0, int *sum1,
- unsigned int strength, int size,
- unsigned int dmp) {
- const int bottom = height - 2 - y0;
- const int right = width - 8 - x0;
- ssd128_internal ssd0 = v128_ssd_u8_init();
- ssd128_internal ssd1 = v128_ssd_u8_init();
- int y;
-
- if (size != 8) { // Fallback to plain C
- aom_clpf_detect_c(rec, org, rstride, ostride, x0, y0, width, height, sum0,
- sum1, strength, size, dmp);
- return;
- }
-
- rec += x0 + y0 * rstride;
- org += x0 + y0 * ostride;
-
- for (y = 0; y < 8; y += 2) {
- v128 a, b, c, d, e, f, g, h, o, r;
- read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
- &a, &b, &c, &d, &e, &f, &g, &h);
- ssd0 = v128_ssd_u8(ssd0, o, r);
- ssd1 = v128_ssd_u8(ssd1, o,
- calc_delta(r, a, b, c, d, e, f, g, h, strength, dmp));
- rec += rstride * 2;
- org += ostride * 2;
- }
- *sum0 += v128_ssd_u8_sum(ssd0);
- *sum1 += v128_ssd_u8_sum(ssd1);
-}
-
-SIMD_INLINE void calc_delta_multi(v128 r, v128 o, v128 a, v128 b, v128 c,
- v128 d, v128 e, v128 f, v128 g, v128 h,
- ssd128_internal *ssd1, ssd128_internal *ssd2,
- ssd128_internal *ssd3, unsigned int dmp) {
- *ssd1 = v128_ssd_u8(*ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h, 1, dmp));
- *ssd2 = v128_ssd_u8(*ssd2, o, calc_delta(r, a, b, c, d, e, f, g, h, 2, dmp));
- *ssd3 = v128_ssd_u8(*ssd3, o, calc_delta(r, a, b, c, d, e, f, g, h, 4, dmp));
-}
-
-// Test multiple filter strengths at once.
-void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
- int rstride, int ostride, int x0, int y0,
- int width, int height, int *sum, int size,
- unsigned int dmp) {
- const int bottom = height - 2 - y0;
- const int right = width - 8 - x0;
- ssd128_internal ssd0 = v128_ssd_u8_init();
- ssd128_internal ssd1 = v128_ssd_u8_init();
- ssd128_internal ssd2 = v128_ssd_u8_init();
- ssd128_internal ssd3 = v128_ssd_u8_init();
- int y;
-
- if (size != 8) { // Fallback to plain C
- aom_clpf_detect_multi_c(rec, org, rstride, ostride, x0, y0, width, height,
- sum, size, dmp);
- return;
- }
-
- rec += x0 + y0 * rstride;
- org += x0 + y0 * ostride;
-
- for (y = 0; y < 8; y += 2) {
- v128 a, b, c, d, e, f, g, h, o, r;
- read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
- &a, &b, &c, &d, &e, &f, &g, &h);
- ssd0 = v128_ssd_u8(ssd0, o, r);
- calc_delta_multi(r, o, a, b, c, d, e, f, g, h, &ssd1, &ssd2, &ssd3, dmp);
- rec += 2 * rstride;
- org += 2 * ostride;
- }
- sum[0] += v128_ssd_u8_sum(ssd0);
- sum[1] += v128_ssd_u8_sum(ssd1);
- sum[2] += v128_ssd_u8_sum(ssd2);
- sum[3] += v128_ssd_u8_sum(ssd3);
-}
-
-#if CONFIG_AOM_HIGHBITDEPTH
-SIMD_INLINE void read_two_lines_hbd(const uint16_t *rec, const uint16_t *org,
- int rstride, int ostride, int x0, int y0,
- int bottom, int right, int y, v128 *o,
- v128 *r, v128 *a, v128 *b, v128 *c, v128 *d,
- v128 *e, v128 *f, v128 *g, v128 *h,
- int shift) {
- const v128 k1 = v128_shr_u16(v128_load_aligned(org), shift);
- const v128 k2 = v128_shr_u16(v128_load_aligned(org + ostride), shift);
- const v128 l1 = v128_shr_u16(v128_load_aligned(rec), shift);
- const v128 l2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
- const v128 l3 =
- v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), shift);
- const v128 l4 = v128_shr_u16(
- v128_load_aligned(rec + ((y != bottom) + 1) * rstride), shift);
- *o = v128_unziplo_8(k1, k2);
- *r = v128_unziplo_8(l1, l2);
- *a = v128_unziplo_8(
- v128_shr_u16(v128_load_aligned(rec - 2 * (y != -y0) * rstride), shift),
- l3);
- *b = v128_unziplo_8(l3, l1);
- *g = v128_unziplo_8(l2, l4);
- *h = v128_unziplo_8(
- l4,
- v128_shr_u16(v128_load_unaligned(rec + (2 * (y != bottom) + 1) * rstride),
- shift));
- *c = v128_unziplo_8(
- v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0), shift),
- v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0 + rstride), shift));
- *d = v128_unziplo_8(
- v128_shr_u16(v128_load_unaligned(rec - !!x0), shift),
- v128_shr_u16(v128_load_unaligned(rec - !!x0 + rstride), shift));
- *e = v128_unziplo_8(
- v128_shr_u16(v128_load_unaligned(rec + !!right), shift),
- v128_shr_u16(v128_load_unaligned(rec + !!right + rstride), shift));
- *f = v128_unziplo_8(
- v128_shr_u16(v128_load_unaligned(rec + 2 * !!right), shift),
- v128_shr_u16(v128_load_unaligned(rec + 2 * !!right + rstride), shift));
- clip_sides(c, d, e, f, x0, right);
-}
-
-void SIMD_FUNC(aom_clpf_detect_hbd)(const uint16_t *rec, const uint16_t *org,
- int rstride, int ostride, int x0, int y0,
- int width, int height, int *sum0, int *sum1,
- unsigned int strength, int size,
- unsigned int bitdepth,
- unsigned int damping) {
- const int shift = bitdepth - 8;
- const int bottom = height - 2 - y0;
- const int right = width - 8 - x0;
- ssd128_internal ssd0 = v128_ssd_u8_init();
- ssd128_internal ssd1 = v128_ssd_u8_init();
- int y;
-
- if (size != 8) { // Fallback to plain C
- aom_clpf_detect_hbd_c(rec, org, rstride, ostride, x0, y0, width, height,
- sum0, sum1, strength, size, bitdepth, damping);
- return;
- }
-
- rec += x0 + y0 * rstride;
- org += x0 + y0 * ostride;
-
- for (y = 0; y < 8; y += 2) {
- v128 a, b, c, d, e, f, g, h, o, r;
- read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
- &r, &a, &b, &c, &d, &e, &f, &g, &h, shift);
- ssd0 = v128_ssd_u8(ssd0, o, r);
- ssd1 = v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h,
- strength >> shift, damping));
- rec += rstride * 2;
- org += ostride * 2;
- }
- *sum0 += v128_ssd_u8_sum(ssd0);
- *sum1 += v128_ssd_u8_sum(ssd1);
-}
-
-void SIMD_FUNC(aom_clpf_detect_multi_hbd)(const uint16_t *rec,
- const uint16_t *org, int rstride,
- int ostride, int x0, int y0,
- int width, int height, int *sum,
- int size, unsigned int bitdepth,
- unsigned int damping) {
- const int bottom = height - 2 - y0;
- const int right = width - 8 - x0;
- ssd128_internal ssd0 = v128_ssd_u8_init();
- ssd128_internal ssd1 = v128_ssd_u8_init();
- ssd128_internal ssd2 = v128_ssd_u8_init();
- ssd128_internal ssd3 = v128_ssd_u8_init();
- int y;
-
- if (size != 8) { // Fallback to plain C
- aom_clpf_detect_multi_hbd_c(rec, org, rstride, ostride, x0, y0, width,
- height, sum, size, bitdepth, damping);
- return;
- }
-
- rec += x0 + y0 * rstride;
- org += x0 + y0 * ostride;
-
- for (y = 0; y < 8; y += 2) {
- v128 a, b, c, d, e, f, g, h, o, r;
- read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
- &r, &a, &b, &c, &d, &e, &f, &g, &h, bitdepth - 8);
- ssd0 = v128_ssd_u8(ssd0, o, r);
- calc_delta_multi(r, o, a, b, c, d, e, f, g, h, &ssd1, &ssd2, &ssd3,
- damping);
- rec += rstride * 2;
- org += ostride * 2;
- }
- sum[0] += v128_ssd_u8_sum(ssd0);
- sum[1] += v128_ssd_u8_sum(ssd1);
- sum[2] += v128_ssd_u8_sum(ssd2);
- sum[3] += v128_ssd_u8_sum(ssd3);
-}
-#endif
diff --git a/test/clpf_test.cc b/test/clpf_test.cc
index a8a1ed6..d2335e2 100644
--- a/test/clpf_test.cc
+++ b/test/clpf_test.cc
@@ -15,7 +15,7 @@
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
#include "aom_ports/aom_timer.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"