Remove the compile time flag for CONFIG_NEW_DF
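This commit removes the CONFIG_NEW_DF compile-time flag: the code previously guarded by CONFIG_NEW_DF is now always compiled, and the legacy !CONFIG_NEW_DF high-bitdepth loop-filter functions (C, SSE2, and AVX2) are deleted.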
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 4951eea..2926e31 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -73,7 +73,6 @@
"${AOM_ROOT}/aom_dsp/x86/fft_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
- "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/intrapred_x86.h"
"${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h"
@@ -108,7 +107,6 @@
"${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h"
"${AOM_ROOT}/aom_dsp/x86/fft_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/avg_intrin_avx2.c"
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 6f5f471..3423edb 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -166,58 +166,9 @@
#
# Loopfilter
#
-if (aom_config("CONFIG_NEW_DF") eq "yes") {
- add_proto qw/void aom_highbd_lpf_horizontal_generic/, "uint16_t *s, int pitch, int filt_width, const uint16_t *q_thresh, const uint16_t *side_thresh, int bd";
- add_proto qw/void aom_highbd_lpf_vertical_generic/, "uint16_t *s, int pitch, int filt_width, const uint16_t *q_thresh, const uint16_t *side_thresh, int bd";
-} else {
-add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_14 sse2/;
+add_proto qw/void aom_highbd_lpf_horizontal_generic/, "uint16_t *s, int pitch, int filt_width, const uint16_t *q_thresh, const uint16_t *side_thresh, int bd";
+add_proto qw/void aom_highbd_lpf_vertical_generic/, "uint16_t *s, int pitch, int filt_width, const uint16_t *q_thresh, const uint16_t *side_thresh, int bd";
-add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_14_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_8 sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_6 sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_6_dual sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_4 sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_14 sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,int bd";
-specialize qw/aom_highbd_lpf_horizontal_14_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_6 sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_horizontal_6_dual sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_8 sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_4 sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/;
-}
#
# Encoder functions.
diff --git a/aom_dsp/loopfilter.c b/aom_dsp/loopfilter.c
index eb14d7d..6092ca3 100644
--- a/aom_dsp/loopfilter.c
+++ b/aom_dsp/loopfilter.c
@@ -18,8 +18,6 @@
#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"
-#if CONFIG_NEW_DF
-
#define DF_SPARSE 1
#define DF_FILT26 1
#define DF_8_THRESH 3
@@ -281,498 +279,3 @@
s += pitch;
}
}
-
-#else // !CONFIG_NEW_DF
-
-static INLINE int16_t signed_char_clamp_high(int t, int bd) {
- switch (bd) {
- case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
- case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
- case 8:
- default: return (int16_t)clamp(t, -128, 128 - 1);
- }
-}
-
-// Should we apply any filter at all: 11111111 yes, 00000000 no ?
-static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
- uint16_t p1, uint16_t p0, uint16_t q0,
- uint16_t q1, int bd) {
- int8_t mask = 0;
- int16_t limit16 = (uint16_t)limit << (bd - 8);
- int16_t blimit16 = (uint16_t)blimit << (bd - 8);
- mask |= (abs(p1 - p0) > limit16) * -1;
- mask |= (abs(q1 - q0) > limit16) * -1;
- mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
- return ~mask;
-}
-
-// Should we apply any filter at all: 11111111 yes, 00000000 no ?
-static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
- uint16_t p3, uint16_t p2, uint16_t p1,
- uint16_t p0, uint16_t q0, uint16_t q1,
- uint16_t q2, uint16_t q3, int bd) {
- int8_t mask = 0;
- int16_t limit16 = (uint16_t)limit << (bd - 8);
- int16_t blimit16 = (uint16_t)blimit << (bd - 8);
- mask |= (abs(p3 - p2) > limit16) * -1;
- mask |= (abs(p2 - p1) > limit16) * -1;
- mask |= (abs(p1 - p0) > limit16) * -1;
- mask |= (abs(q1 - q0) > limit16) * -1;
- mask |= (abs(q2 - q1) > limit16) * -1;
- mask |= (abs(q3 - q2) > limit16) * -1;
- mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
- return ~mask;
-}
-
-static INLINE int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit,
- uint16_t p2, uint16_t p1,
- uint16_t p0, uint16_t q0,
- uint16_t q1, uint16_t q2,
- int bd) {
- int8_t mask = 0;
- int16_t limit16 = (uint16_t)limit << (bd - 8);
- int16_t blimit16 = (uint16_t)blimit << (bd - 8);
- mask |= (abs(p2 - p1) > limit16) * -1;
- mask |= (abs(p1 - p0) > limit16) * -1;
- mask |= (abs(q1 - q0) > limit16) * -1;
- mask |= (abs(q2 - q1) > limit16) * -1;
- mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
- return ~mask;
-}
-
-static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2,
- uint16_t p1, uint16_t p0,
- uint16_t q0, uint16_t q1,
- uint16_t q2, int bd) {
- int8_t mask = 0;
- int16_t thresh16 = (uint16_t)thresh << (bd - 8);
- mask |= (abs(p1 - p0) > thresh16) * -1;
- mask |= (abs(q1 - q0) > thresh16) * -1;
- mask |= (abs(p2 - p0) > thresh16) * -1;
- mask |= (abs(q2 - q0) > thresh16) * -1;
- return ~mask;
-}
-
-static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
- uint16_t p1, uint16_t p0, uint16_t q0,
- uint16_t q1, uint16_t q2, uint16_t q3,
- int bd) {
- int8_t mask = 0;
- int16_t thresh16 = (uint16_t)thresh << (bd - 8);
- mask |= (abs(p1 - p0) > thresh16) * -1;
- mask |= (abs(q1 - q0) > thresh16) * -1;
- mask |= (abs(p2 - p0) > thresh16) * -1;
- mask |= (abs(q2 - q0) > thresh16) * -1;
- mask |= (abs(p3 - p0) > thresh16) * -1;
- mask |= (abs(q3 - q0) > thresh16) * -1;
- return ~mask;
-}
-
-// Is there high edge variance internal edge:
-// 11111111_11111111 yes, 00000000_00000000 no ?
-static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
- uint16_t q0, uint16_t q1, int bd) {
- int16_t hev = 0;
- int16_t thresh16 = (uint16_t)thresh << (bd - 8);
- hev |= (abs(p1 - p0) > thresh16) * -1;
- hev |= (abs(q1 - q0) > thresh16) * -1;
- return hev;
-}
-
-static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
- uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
- int bd) {
- int16_t filter1, filter2;
- // ^0x80 equivalent to subtracting 0x80 from the values to turn them
- // into -128 to +127 instead of 0 to 255.
- int shift = bd - 8;
- const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
- const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
- const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
- const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
- const int16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
-
- // Add outer taps if we have high edge variance.
- int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
-
- // Inner taps.
- filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
-
- // Save bottom 3 bits so that we round one side +4 and the other +3
- // if it equals 4 we'll set to adjust by -1 to account for the fact
- // we'd round 3 the other way.
- filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
- filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
-
- *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
- *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
-
- // Outer tap adjustments.
- filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
-
- *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
- *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
-}
-
-void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh, int bd) {
- int i;
- int count = 4;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < count; ++i) {
- const uint16_t p1 = s[-2 * p];
- const uint16_t p0 = s[-p];
- const uint16_t q0 = s[0 * p];
- const uint16_t q1 = s[1 * p];
- const int8_t mask =
- highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
- highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
- ++s;
- }
-}
-
-void aom_highbd_lpf_horizontal_4_dual_c(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
- aom_highbd_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- int i;
- int count = 4;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < count; ++i) {
- const uint16_t p1 = s[-2], p0 = s[-1];
- const uint16_t q0 = s[0], q1 = s[1];
- const int8_t mask =
- highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
- highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
- s += pitch;
- }
-}
-
-void aom_highbd_lpf_vertical_4_dual_c(
- uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
- aom_highbd_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
- bd);
-}
-
-static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat,
- uint16_t *op2, uint16_t *op1, uint16_t *op0,
- uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
- int bd) {
- if (flat && mask) {
- const uint16_t p2 = *op2, p1 = *op1, p0 = *op0;
- const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2;
-
- // 5-tap filter [1, 2, 2, 2, 1]
- *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3);
- *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3);
- *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3);
- *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3);
- } else {
- highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
- }
-}
-
-static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
- uint16_t *op3, uint16_t *op2, uint16_t *op1,
- uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
- uint16_t *oq2, uint16_t *oq3, int bd) {
- if (flat && mask) {
- const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
- const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
-
- // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
- *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
- *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
- *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
- *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
- *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
- *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
- } else {
- highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
- }
-}
-
-void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- int i;
- int count = 4;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < count; ++i) {
- const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
- const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
-
- const int8_t mask =
- highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
- const int8_t flat =
- highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
- highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p,
- s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
- ++s;
- }
-}
-
-void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- int i;
- int count = 4;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < count; ++i) {
- const uint16_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
- const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p];
-
- const int8_t mask =
- highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd);
- const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
- highbd_filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s,
- s + 1 * p, s + 2 * p, bd);
- ++s;
- }
-}
-
-void aom_highbd_lpf_horizontal_6_dual_c(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0, bd);
- aom_highbd_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_horizontal_8_dual_c(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
- aom_highbd_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- int i;
- int count = 4;
-
- for (i = 0; i < count; ++i) {
- const uint16_t p2 = s[-3], p1 = s[-2], p0 = s[-1];
- const uint16_t q0 = s[0], q1 = s[1], q2 = s[2];
- const int8_t mask =
- highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd);
- const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
- highbd_filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2,
- bd);
- s += pitch;
- }
-}
-
-void aom_highbd_lpf_vertical_6_dual_c(
- uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0, bd);
- aom_highbd_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
- bd);
-}
-
-void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- int i;
- int count = 4;
-
- for (i = 0; i < count; ++i) {
- const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
- const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
- const int8_t mask =
- highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
- const int8_t flat =
- highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
- highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1,
- s + 2, s + 3, bd);
- s += pitch;
- }
-}
-
-void aom_highbd_lpf_vertical_8_dual_c(
- uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
- aom_highbd_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
- bd);
-}
-
-static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat,
- int8_t flat2, uint16_t *op6, uint16_t *op5,
- uint16_t *op4, uint16_t *op3, uint16_t *op2,
- uint16_t *op1, uint16_t *op0, uint16_t *oq0,
- uint16_t *oq1, uint16_t *oq2, uint16_t *oq3,
- uint16_t *oq4, uint16_t *oq5, uint16_t *oq6,
- int bd) {
- if (flat2 && flat && mask) {
- const uint16_t p6 = *op6;
- const uint16_t p5 = *op5;
- const uint16_t p4 = *op4;
- const uint16_t p3 = *op3;
- const uint16_t p2 = *op2;
- const uint16_t p1 = *op1;
- const uint16_t p0 = *op0;
- const uint16_t q0 = *oq0;
- const uint16_t q1 = *oq1;
- const uint16_t q2 = *oq2;
- const uint16_t q3 = *oq3;
- const uint16_t q4 = *oq4;
- const uint16_t q5 = *oq5;
- const uint16_t q6 = *oq6;
-
- // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]
- *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0,
- 4);
- *op4 = ROUND_POWER_OF_TWO(
- p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4);
- *op3 = ROUND_POWER_OF_TWO(
- p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4);
- *op2 = ROUND_POWER_OF_TWO(
- p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3,
- 4);
- *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 +
- q0 + q1 + q2 + q3 + q4,
- 4);
- *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
- q0 * 2 + q1 + q2 + q3 + q4 + q5,
- 4);
- *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
- q1 * 2 + q2 + q3 + q4 + q5 + q6,
- 4);
- *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
- q2 * 2 + q3 + q4 + q5 + q6 * 2,
- 4);
- *oq2 = ROUND_POWER_OF_TWO(
- p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3,
- 4);
- *oq3 = ROUND_POWER_OF_TWO(
- p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4);
- *oq4 = ROUND_POWER_OF_TWO(
- p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4);
- *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7,
- 4);
- } else {
- highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
- bd);
- }
-}
-
-static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh, int count,
- int bd) {
- int i;
- int step = 4;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < step * count; ++i) {
- const uint16_t p3 = s[-4 * p];
- const uint16_t p2 = s[-3 * p];
- const uint16_t p1 = s[-2 * p];
- const uint16_t p0 = s[-p];
- const uint16_t q0 = s[0 * p];
- const uint16_t q1 = s[1 * p];
- const uint16_t q2 = s[2 * p];
- const uint16_t q3 = s[3 * p];
- const int8_t mask =
- highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
- const int8_t flat =
- highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-
- const int8_t flat2 =
- highbd_flat_mask4(1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p],
- s[5 * p], s[6 * p], bd);
-
- highbd_filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
- s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
- s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, bd);
- ++s;
- }
-}
-
-void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh, int bd) {
- highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd);
-}
-
-void aom_highbd_lpf_horizontal_14_dual_c(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- highbd_mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1, bd);
- highbd_mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1, bd);
-}
-
-static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh, int count,
- int bd) {
- int i;
-
- for (i = 0; i < count; ++i) {
- const uint16_t p3 = s[-4];
- const uint16_t p2 = s[-3];
- const uint16_t p1 = s[-2];
- const uint16_t p0 = s[-1];
- const uint16_t q0 = s[0];
- const uint16_t q1 = s[1];
- const uint16_t q2 = s[2];
- const uint16_t q3 = s[3];
- const int8_t mask =
- highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
- const int8_t flat =
- highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
- const int8_t flat2 =
- highbd_flat_mask4(1, s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], bd);
-
- highbd_filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4,
- s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5,
- s + 6, bd);
- s += p;
- }
-}
-
-void aom_highbd_lpf_vertical_14_c(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd);
-}
-
-void aom_highbd_lpf_vertical_14_dual_c(
- uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- highbd_mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4, bd);
- highbd_mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
- 4, bd);
-}
-
-#endif // !CONFIG_NEW_DF
diff --git a/aom_dsp/x86/highbd_loopfilter_avx2.c b/aom_dsp/x86/highbd_loopfilter_avx2.c
deleted file mode 100644
index ffc60f1..0000000
--- a/aom_dsp/x86/highbd_loopfilter_avx2.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/common_avx2.h"
-#include "aom_dsp/x86/lpf_common_sse2.h"
-#include "aom/aom_integer.h"
-#if !CONFIG_NEW_DF
-void aom_highbd_lpf_horizontal_14_dual_avx2(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0,
- blimit1, limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_14_dual_avx2(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
- limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_horizontal_4_dual_avx2(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
- limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_horizontal_8_dual_avx2(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
- limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_4_dual_avx2(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
- limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_8_dual_avx2(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
- limit1, thresh1, bd);
-}
-#endif // !CONFIG_NEW_DF
diff --git a/aom_dsp/x86/highbd_loopfilter_sse2.c b/aom_dsp/x86/highbd_loopfilter_sse2.c
deleted file mode 100644
index 12a5066..0000000
--- a/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ /dev/null
@@ -1,1700 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <emmintrin.h> // SSE2
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/lpf_common_sse2.h"
-#if !CONFIG_NEW_DF
-static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max,
- __m128i *pixel) {
- *pixel = _mm_min_epi16(*pixel, *max);
- *pixel = _mm_max_epi16(*pixel, *min);
-}
-
-static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) {
- return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
-}
-
-static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
- const uint8_t *t, int bd, __m128i *blt,
- __m128i *lt, __m128i *thr, __m128i *t80_out) {
- const int shift = bd - 8;
- const __m128i zero = _mm_setzero_si128();
-
- __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero);
- *blt = _mm_slli_epi16(x, shift);
-
- x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero);
- *lt = _mm_slli_epi16(x, shift);
-
- x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
- *thr = _mm_slli_epi16(x, shift);
-
- *t80_out = _mm_set1_epi16(1 << (bd - 1));
-}
-
-static INLINE void get_limit_dual(
- const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0,
- const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1,
- int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out,
- __m128i *t80_out) {
- const int shift = bd - 8;
- const __m128i zero = _mm_setzero_si128();
-
- __m128i x0 =
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero);
- __m128i x1 =
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero);
- x0 = _mm_unpacklo_epi64(x0, x1);
- *blt_out = _mm_slli_epi16(x0, shift);
-
- x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero);
- x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero);
- x0 = _mm_unpacklo_epi64(x0, x1);
- *lt_out = _mm_slli_epi16(x0, shift);
-
- x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero);
- x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh1), zero);
- x0 = _mm_unpacklo_epi64(x0, x1);
- *thr_out = _mm_slli_epi16(x0, shift);
-
- *t80_out = _mm_set1_epi16(1 << (bd - 1));
-}
-
-static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
- __m128i *p, __m128i *q) {
- int i;
- for (i = 0; i < size; i++) {
- p[i] = _mm_loadu_si128((__m128i *)(s - (i + 1) * pitch));
- q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch));
- }
-}
-
-static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q,
- const __m128i *l, const __m128i *bl,
- __m128i *mask) {
- __m128i abs_p0q0 = abs_diff16(p[0], q[0]);
- __m128i abs_p1q1 = abs_diff16(p[1], q[1]);
- abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-
- const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
- const __m128i ffff = _mm_set1_epi16((short)0xFFFF);
-
- __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
- max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
- max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
-
- int i;
- for (i = 1; i < 4; ++i) {
- max = _mm_max_epi16(max, abs_diff16(p[i], p[i - 1]));
- max = _mm_max_epi16(max, abs_diff16(q[i], q[i - 1]));
- }
- max = _mm_subs_epu16(max, *l);
- *mask = _mm_cmpeq_epi16(max, zero); // return ~mask
-}
-
-static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x,
- __m128i *p1p0, __m128i *q1q0,
- __m128i *abs_p1p0, __m128i *l,
- __m128i *bl, __m128i *t,
- __m128i *hev, __m128i *mask) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
- const __m128i ffff = _mm_set1_epi16((short)0xFFFF);
- __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0;
- __m128i max, max01, h;
-
- *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]);
- *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]);
-
- abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0);
- abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1);
- abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
-
- abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8);
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // divide by 2
-
- max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
- max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
- // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
- // So taking maximums continues to work:
- max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
-
- *abs_p1p0 = abs_diff16(pq[0], pq[1]);
- abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8);
- max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0);
- // mask |= (abs(*p1 - *p0) > limit) * -1;
- // mask |= (abs(*q1 - *q0) > limit) * -1;
- h = _mm_subs_epu16(max01, *t);
-
- *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
- // replicate for the further "merged variables" usage
- *hev = _mm_unpacklo_epi64(*hev, *hev);
-
- max = _mm_max_epi16(max, max01);
- int i;
- for (i = 2; i < x; ++i) {
- max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1]));
- }
- max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
-
- max = _mm_subs_epu16(max, *l);
- *mask = _mm_cmpeq_epi16(max, zero); // ~mask
-}
-
-static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq,
- int start, int end, __m128i *flat) {
- int i;
- __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]),
- abs_diff16(pq[start + 1], pq[0]));
-
- for (i = start + 2; i < end; ++i) {
- max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0]));
- }
- max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
-
- __m128i ft;
- ft = _mm_subs_epu16(max, *th);
-
- const __m128i zero = _mm_setzero_si128();
- *flat = _mm_cmpeq_epi16(ft, zero);
-}
-
-static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p,
- const __m128i *q, int start, int end,
- __m128i *flat) {
- int i;
- __m128i max =
- _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0]));
-
- for (i = start + 1; i < end; ++i) {
- max = _mm_max_epi16(max, abs_diff16(p[i], p[0]));
- max = _mm_max_epi16(max, abs_diff16(q[i], q[0]));
- }
-
- __m128i ft;
- ft = _mm_subs_epu16(max, *th);
-
- const __m128i zero = _mm_setzero_si128();
- *flat = _mm_cmpeq_epi16(ft, zero);
-}
-
-static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat,
- __m128i *flat2, int bd) {
- // check the distance 1,2,3 against 0
- __m128i th = _mm_set1_epi16(1);
- th = _mm_slli_epi16(th, bd - 8);
- flat_mask_internal(&th, pq, 1, 4, flat);
- flat_mask_internal(&th, pq, 4, 7, flat2);
-}
-
-static INLINE void highbd_flat_mask4_dual_sse2(const __m128i *p,
- const __m128i *q, __m128i *flat,
- __m128i *flat2, int bd) {
- // check the distance 1,2,3 against 0
- __m128i th = _mm_set1_epi16(1);
- th = _mm_slli_epi16(th, bd - 8);
- flat_mask_internal_dual(&th, p, q, 1, 4, flat);
- flat_mask_internal_dual(&th, p, q, 4, 7, flat2);
-}
-
-static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0,
- __m128i *hev, __m128i *mask,
- __m128i *qs1qs0,
- __m128i *ps1ps0, __m128i *t80,
- int bd) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
- const __m128i pmax =
- _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
- const __m128i pmin = _mm_subs_epi16(zero, *t80);
-
- const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4);
- __m128i ps1ps0_work, qs1qs0_work, work;
- __m128i filt, filter2filter1, filter2filt, filter1filt;
-
- ps1ps0_work = _mm_subs_epi16(*p1p0, *t80);
- qs1qs0_work = _mm_subs_epi16(*q1q0, *t80);
-
- work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work);
- pixel_clamp(&pmin, &pmax, &work);
- filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
-
- filt = _mm_subs_epi16(filt, work);
- filt = _mm_subs_epi16(filt, work);
- filt = _mm_subs_epi16(filt, work);
- // (aom_filter + 3 * (qs0 - ps0)) & mask
- pixel_clamp(&pmin, &pmax, &filt);
- filt = _mm_and_si128(filt, *mask);
- filt = _mm_unpacklo_epi64(filt, filt);
-
- filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */
- pixel_clamp(&pmin, &pmax, &filter2filter1);
- filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */
-
- filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1);
-
- // filt >> 1
- filt = _mm_adds_epi16(filt, one);
- filt = _mm_srai_epi16(filt, 1);
- filt = _mm_andnot_si128(*hev, filt);
-
- filter2filt = _mm_unpackhi_epi64(filter2filter1, filt);
- filter1filt = _mm_unpacklo_epi64(filter2filter1, filt);
-
- qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt);
- ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt);
-
- pixel_clamp(&pmin, &pmax, &qs1qs0_work);
- pixel_clamp(&pmin, &pmax, &ps1ps0_work);
-
- *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80);
- *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80);
-}
-
-static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps,
- __m128i *qs, const __m128i *mask,
- const __m128i *th, int bd,
- __m128i *t80) {
- __m128i ps0 = _mm_subs_epi16(p[0], *t80);
- __m128i ps1 = _mm_subs_epi16(p[1], *t80);
- __m128i qs0 = _mm_subs_epi16(q[0], *t80);
- __m128i qs1 = _mm_subs_epi16(q[1], *t80);
- const __m128i one = _mm_set1_epi16(1);
- const __m128i pmax =
- _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
-
- const __m128i zero = _mm_setzero_si128();
- const __m128i pmin = _mm_subs_epi16(zero, *t80);
- __m128i filter = _mm_subs_epi16(ps1, qs1);
- pixel_clamp(&pmin, &pmax, &filter);
-
- // hev_filter
- __m128i hev;
- const __m128i abs_p1p0 = abs_diff16(p[1], p[0]);
- const __m128i abs_q1q0 = abs_diff16(q[1], q[0]);
- __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0);
- h = _mm_subs_epu16(h, *th);
- const __m128i ffff = _mm_cmpeq_epi16(h, h);
- hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
-
- filter = _mm_and_si128(filter, hev);
-
- const __m128i x = _mm_subs_epi16(qs0, ps0);
- filter = _mm_adds_epi16(filter, x);
- filter = _mm_adds_epi16(filter, x);
- filter = _mm_adds_epi16(filter, x);
- pixel_clamp(&pmin, &pmax, &filter);
- filter = _mm_and_si128(filter, *mask);
- const __m128i t3 = _mm_set1_epi16(3);
- const __m128i t4 = _mm_set1_epi16(4);
- __m128i filter1 = _mm_adds_epi16(filter, t4);
- __m128i filter2 = _mm_adds_epi16(filter, t3);
- pixel_clamp(&pmin, &pmax, &filter1);
- pixel_clamp(&pmin, &pmax, &filter2);
- filter1 = _mm_srai_epi16(filter1, 3);
- filter2 = _mm_srai_epi16(filter2, 3);
- qs0 = _mm_subs_epi16(qs0, filter1);
- pixel_clamp(&pmin, &pmax, &qs0);
- ps0 = _mm_adds_epi16(ps0, filter2);
- pixel_clamp(&pmin, &pmax, &ps0);
- qs[0] = _mm_adds_epi16(qs0, *t80);
- ps[0] = _mm_adds_epi16(ps0, *t80);
- filter = _mm_adds_epi16(filter1, one);
- filter = _mm_srai_epi16(filter, 1);
- filter = _mm_andnot_si128(hev, filter);
- qs1 = _mm_subs_epi16(qs1, filter);
- pixel_clamp(&pmin, &pmax, &qs1);
- ps1 = _mm_adds_epi16(ps1, filter);
- pixel_clamp(&pmin, &pmax, &ps1);
- qs[1] = _mm_adds_epi16(qs1, *t80);
- ps[1] = _mm_adds_epi16(ps1, *t80);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
- __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt,
- const unsigned char *lt, const unsigned char *thr, int bd) {
- int i;
- const __m128i zero = _mm_setzero_si128();
- __m128i blimit, limit, thresh;
- __m128i t80;
- get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
-
- for (i = 0; i < 7; i++) {
- pq[i] = _mm_unpacklo_epi64(p[i], q[i]);
- }
- __m128i mask, hevhev;
- __m128i p1p0, q1q0, abs_p1p0;
-
- highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
- &thresh, &hevhev, &mask);
-
- __m128i ps0ps1, qs0qs1;
- // filter4
- highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd);
-
- __m128i flat, flat2;
- highbd_flat_mask4_sse2(pq, &flat, &flat2, bd);
-
- flat = _mm_and_si128(flat, mask);
- flat2 = _mm_and_si128(flat2, flat);
-
- // replicate for the further "merged variables" usage
- flat = _mm_unpacklo_epi64(flat, flat);
- flat2 = _mm_unpacklo_epi64(flat2, flat2);
-
- // flat and wide flat calculations
-
- // if flat ==0 then flat2 is zero as well and we don't need any calc below
- // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
- __m128i flat_p[3], flat_q[3], flat_pq[3];
- __m128i flat2_p[6], flat2_q[6];
- __m128i flat2_pq[6];
- __m128i sum_p6, sum_p3;
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i four = _mm_set1_epi16(4);
-
- __m128i work0, work0_0, work0_1, sum_p_0;
- __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3]));
- __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1]));
- sum_p = _mm_add_epi16(sum_p, sum_lp);
-
- __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
- __m128i sum_q = _mm_srli_si128(sum_p, 8);
-
- sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
- sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
-
- flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0]));
- flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0]));
-
- sum_p6 = _mm_add_epi16(pq[6], pq[6]);
- sum_p3 = _mm_add_epi16(pq[3], pq[3]);
-
- sum_q = _mm_sub_epi16(sum_p_0, pq[5]);
- sum_p = _mm_sub_epi16(sum_p_0, q[5]);
-
- work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
- work0_1 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
-
- sum_lq = _mm_sub_epi16(sum_lp, pq[2]);
- sum_lp = _mm_sub_epi16(sum_lp, q[2]);
-
- work0 = _mm_add_epi16(sum_p3, pq[1]);
- flat_p[1] = _mm_add_epi16(sum_lp, work0);
- flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
-
- flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
- flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
-
- sum_lp = _mm_sub_epi16(sum_lp, q[1]);
- sum_lq = _mm_sub_epi16(sum_lq, pq[1]);
-
- sum_p3 = _mm_add_epi16(sum_p3, pq[3]);
- work0 = _mm_add_epi16(sum_p3, pq[2]);
-
- flat_p[2] = _mm_add_epi16(sum_lp, work0);
- flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
- flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
-
- int flat2_mask =
- (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
- if (flat2_mask) {
- flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0]));
- flat2_q[0] = _mm_add_epi16(
- sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0]));
-
- flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
- flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
-
- flat2_pq[0] =
- _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
- flat2_pq[1] =
- _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
-
- sum_p = _mm_sub_epi16(sum_p, q[4]);
- sum_q = _mm_sub_epi16(sum_q, pq[4]);
-
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
- flat2_p[2] = _mm_add_epi16(sum_p, work0);
- flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[2] =
- _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
-
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- sum_p = _mm_sub_epi16(sum_p, q[3]);
- sum_q = _mm_sub_epi16(sum_q, pq[3]);
-
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
- flat2_p[3] = _mm_add_epi16(sum_p, work0);
- flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[3] =
- _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
-
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- sum_p = _mm_sub_epi16(sum_p, q[2]);
- sum_q = _mm_sub_epi16(sum_q, pq[2]);
-
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
- flat2_p[4] = _mm_add_epi16(sum_p, work0);
- flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[4] =
- _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
-
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- sum_p = _mm_sub_epi16(sum_p, q[1]);
- sum_q = _mm_sub_epi16(sum_q, pq[1]);
-
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
- flat2_p[5] = _mm_add_epi16(sum_p, work0);
- flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[5] =
- _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
- } // flat2
- // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // highbd_filter8
- pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
- pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
-
- for (i = 0; i < 3; i++) {
- pq[i] = _mm_andnot_si128(flat, pq[i]);
- flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
- pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
- }
-
- // wide flat
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- if (flat2_mask) {
- for (i = 0; i < 6; i++) {
- pq[i] = _mm_andnot_si128(flat2, pq[i]);
- flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
- pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values
- }
- }
- } else {
- pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
- pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
- }
-}
-
-void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh, int bd) {
- __m128i p[7], q[7], pq[7];
- int i;
-
- for (i = 0; i < 7; i++) {
- p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch));
- q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch));
- }
-
- highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
-
- for (i = 0; i < 6; i++) {
- _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]);
- _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8));
- }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
- __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0,
- const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1,
- const uint8_t *thr1, int bd) {
- __m128i blimit, limit, thresh, t80;
- const __m128i zero = _mm_setzero_si128();
-
- get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh,
- &t80);
- __m128i mask;
- highbd_filter_mask_dual(p, q, &limit, &blimit, &mask);
- __m128i flat, flat2;
- highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd);
-
- flat = _mm_and_si128(flat, mask);
- flat2 = _mm_and_si128(flat2, flat);
- __m128i ps[2], qs[2];
- highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80);
- // flat and wide flat calculations
-
- // if flat ==0 then flat2 is zero as well and we don't need any calc below
- // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
- __m128i flat_p[3], flat_q[3];
- __m128i flat2_p[6], flat2_q[6];
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i four = _mm_set1_epi16(4);
- __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
- __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3]));
- __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
- sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp);
- __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
- sum_q = _mm_add_epi16(sum_q, sum_lq);
- sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q));
- sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
- flat_p[0] =
- _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3);
- flat_q[0] =
- _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3);
- __m128i sum_p6 = _mm_add_epi16(p[6], p[6]);
- __m128i sum_q6 = _mm_add_epi16(q[6], q[6]);
- __m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
- __m128i sum_q3 = _mm_add_epi16(q[3], q[3]);
-
- sum_q = _mm_sub_epi16(sum_p_0, p[5]);
- __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]);
-
- sum_lq = _mm_sub_epi16(sum_lp, p[2]);
- sum_lp = _mm_sub_epi16(sum_lp, q[2]);
- flat_p[1] =
- _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3);
- flat_q[1] =
- _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3);
-
- sum_lp = _mm_sub_epi16(sum_lp, q[1]);
- sum_lq = _mm_sub_epi16(sum_lq, p[1]);
- sum_p3 = _mm_add_epi16(sum_p3, p[3]);
- sum_q3 = _mm_add_epi16(sum_q3, q[3]);
- flat_p[2] =
- _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3);
- flat_q[2] =
- _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3);
-
- int flat2_mask =
- (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
- if (flat2_mask) {
- flat2_p[0] = _mm_srli_epi16(
- _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
- _mm_add_epi16(p[1], q[0]))),
- 4);
- flat2_q[0] = _mm_srli_epi16(
- _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
- _mm_add_epi16(p[0], q[1]))),
- 4);
-
- flat2_p[1] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
- 4);
- flat2_q[1] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
- 4);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p = _mm_sub_epi16(sum_p, q[4]);
- sum_q = _mm_sub_epi16(sum_q, p[4]);
- flat2_p[2] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
- 4);
- flat2_q[2] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
- 4);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p = _mm_sub_epi16(sum_p, q[3]);
- sum_q = _mm_sub_epi16(sum_q, p[3]);
- flat2_p[3] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
- 4);
- flat2_q[3] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
- 4);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p = _mm_sub_epi16(sum_p, q[2]);
- sum_q = _mm_sub_epi16(sum_q, p[2]);
- flat2_p[4] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
- 4);
- flat2_q[4] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
- 4);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p = _mm_sub_epi16(sum_p, q[1]);
- sum_q = _mm_sub_epi16(sum_q, p[1]);
- flat2_p[5] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
- 4);
- flat2_q[5] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
- 4);
- }
- // highbd_filter8
- int i;
- for (i = 0; i < 2; i++) {
- ps[i] = _mm_andnot_si128(flat, ps[i]);
- flat_p[i] = _mm_and_si128(flat, flat_p[i]);
- p[i] = _mm_or_si128(ps[i], flat_p[i]);
- qs[i] = _mm_andnot_si128(flat, qs[i]);
- flat_q[i] = _mm_and_si128(flat, flat_q[i]);
- q[i] = _mm_or_si128(qs[i], flat_q[i]);
- }
- p[2] = _mm_andnot_si128(flat, p[2]);
- // p2 remains unchanged if !(flat && mask)
- flat_p[2] = _mm_and_si128(flat, flat_p[2]);
- // when (flat && mask)
- p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values
- q[2] = _mm_andnot_si128(flat, q[2]);
- flat_q[2] = _mm_and_si128(flat, flat_q[2]);
- q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values
-
- for (i = 0; i < 2; i++) {
- ps[i] = _mm_andnot_si128(flat, ps[i]);
- flat_p[i] = _mm_and_si128(flat, flat_p[i]);
- p[i] = _mm_or_si128(ps[i], flat_p[i]);
- qs[i] = _mm_andnot_si128(flat, qs[i]);
- flat_q[i] = _mm_and_si128(flat, flat_q[i]);
- q[i] = _mm_or_si128(qs[i], flat_q[i]);
- }
- // highbd_filter16
- if (flat2_mask) {
- for (i = 0; i < 6; i++) {
- // p[i] remains unchanged if !(flat2 && flat && mask)
- p[i] = _mm_andnot_si128(flat2, p[i]);
- flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
- // get values for when (flat2 && flat && mask)
- p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values
- q[i] = _mm_andnot_si128(flat2, q[i]);
- flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
- q[i] = _mm_or_si128(q[i], flat2_q[i]);
- }
- }
- } else {
- p[0] = ps[0];
- q[0] = qs[0];
- p[1] = ps[1];
- q[1] = qs[1];
- }
-}
-
-void aom_highbd_lpf_horizontal_14_dual_sse2(
- uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0,
- const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
- const uint8_t *_thresh1, int bd) {
- __m128i p[7], q[7];
- int i;
- load_highbd_pixel(s, 7, pitch, p, q);
-
- highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1,
- _limit1, _thresh1, bd);
-
- for (i = 0; i < 6; i++) {
- _mm_storeu_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
- _mm_storeu_si128((__m128i *)(s + i * pitch), q[i]);
- }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
- __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
- __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit,
- const uint8_t *_limit, const uint8_t *_thresh, int bd) {
- __m128i blimit, limit, thresh;
- __m128i mask, hev, flat;
- __m128i pq[3];
- __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0;
- __m128i flat_p1p0, flat_q0q1;
-
- pq[0] = _mm_unpacklo_epi64(*p0, *q0);
- pq[1] = _mm_unpacklo_epi64(*p1, *q1);
- pq[2] = _mm_unpacklo_epi64(*p2, *q2);
-
- const __m128i zero = _mm_setzero_si128();
- const __m128i four = _mm_set1_epi16(4);
- __m128i t80;
- const __m128i one = _mm_set1_epi16(0x1);
-
- get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
-
- highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
- &thresh, &hev, &mask);
-
- // lp filter
- highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
-
- // flat_mask
- flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
- flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
-
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-
- flat = _mm_cmpeq_epi16(flat, zero);
- flat = _mm_and_si128(flat, mask);
- // replicate for the further "merged variables" usage
- flat = _mm_unpacklo_epi64(flat, flat);
-
- // 5 tap filter
- // need it only if flat !=0
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
- __m128i workp_a, workp_b, workp_c;
- __m128i pq0x2_pq1, pq1_pq2;
-
- // op1
- pq0x2_pq1 =
- _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]); // p0 *2 + p1
- pq1_pq2 = _mm_add_epi16(pq[1], pq[2]); // p1 + p2
- workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
- pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4
-
- workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0);
- workp_b =
- _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
-
- // op0
- workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1
- workp_a = _mm_add_epi16(workp_a,
- workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
- workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
- flat_p1p0 = _mm_srli_epi16(workp_b, 3);
-
- // oq0
- workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]),
- pq[1]); // p0 * 2 + p1 + q0 * 2 + q1 + 4
- workp_b = _mm_srli_si128(pq1_pq2, 8);
- workp_a = _mm_add_epi16(
- workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
- // workp_shft0 = _mm_srli_epi16(workp_a, 3);
-
- // oq1
- workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]),
- pq[0]); // p0 + q0 * 2 + q1 * 2 + q2 + 4
- workp_b = _mm_add_epi16(*q2, *q2);
- workp_b =
- _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
-
- workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
- flat_q0q1 = _mm_srli_epi16(workp_a, 3);
-
- qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
- q1q0 = _mm_and_si128(flat, flat_q0q1);
- *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
- ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
- p1p0 = _mm_and_si128(flat, flat_p1p0);
- *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
- }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
- __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
- __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0,
- const unsigned char *_thresh0, const unsigned char *_blimit1,
- const unsigned char *_limit1, const unsigned char *_thresh1, int bd) {
- const __m128i zero = _mm_setzero_si128();
- __m128i blimit0, limit0, thresh0;
- __m128i t80;
- __m128i mask, flat, work;
- __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1;
- __m128i op1, op0, oq0, oq1;
- const __m128i four = _mm_set1_epi16(4);
- const __m128i one = _mm_set1_epi16(0x1);
- const __m128i ffff = _mm_cmpeq_epi16(one, one);
-
- get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
- &blimit0, &limit0, &thresh0, &t80);
-
- abs_p2p1 = abs_diff16(*p2, *p1);
- abs_p1p0 = abs_diff16(*p1, *p0);
- abs_q1q0 = abs_diff16(*q1, *q0);
- abs_q2q1 = abs_diff16(*q2, *q1);
-
- abs_p0q0 = abs_diff16(*p0, *q0);
- abs_p1q1 = abs_diff16(*p1, *q1);
-
- abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
- mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
- mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
- // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
- // So taking maximums continues to work:
- mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
-
- mask = _mm_max_epi16(abs_q2q1, mask);
- work = _mm_max_epi16(abs_p1p0, abs_q1q0);
- mask = _mm_max_epi16(work, mask);
- mask = _mm_max_epi16(mask, abs_p2p1);
- mask = _mm_subs_epu16(mask, limit0);
- mask = _mm_cmpeq_epi16(mask, zero);
-
- // lp filter
- __m128i ps[2], qs[2], p[2], q[2];
- {
- p[0] = *p0;
- p[1] = *p1;
- q[0] = *q0;
- q[1] = *q1;
- // filter_mask and hev_mask
- highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
- }
-
- // flat_mask
- flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0));
- flat = _mm_max_epi16(flat, work);
-
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-
- flat = _mm_cmpeq_epi16(flat, zero);
- flat = _mm_and_si128(flat, mask); // flat & mask
-
- // 5 tap filter
- // need it only if flat !=0
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
- __m128i workp_a, workp_b, workp_shft0, workp_shft1;
-
- // op1
- workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
- _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2
- workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
- *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4
-
- workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
- workp_shft0 = _mm_add_epi16(
- workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
- op1 = _mm_srli_epi16(workp_shft0, 3);
-
- // op0
- workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1
- workp_a =
- _mm_add_epi16(workp_a,
- workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
- op0 = _mm_srli_epi16(workp_a, 3);
-
- // oq0
- workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
- *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4
- workp_b = _mm_add_epi16(*q1, *q2);
- workp_shft0 = _mm_add_epi16(
- workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4
- oq0 = _mm_srli_epi16(workp_shft0, 3);
-
- // oq1
- workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
- *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4
- workp_b = _mm_add_epi16(*q2, *q2);
- workp_shft1 = _mm_add_epi16(
- workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
- oq1 = _mm_srli_epi16(workp_shft1, 3);
-
- qs[0] = _mm_andnot_si128(flat, qs[0]);
- oq0 = _mm_and_si128(flat, oq0);
- *q0 = _mm_or_si128(qs[0], oq0);
-
- qs[1] = _mm_andnot_si128(flat, qs[1]);
- oq1 = _mm_and_si128(flat, oq1);
- *q1 = _mm_or_si128(qs[1], oq1);
-
- ps[0] = _mm_andnot_si128(flat, ps[0]);
- op0 = _mm_and_si128(flat, op0);
- *p0 = _mm_or_si128(ps[0], op0);
-
- ps[1] = _mm_andnot_si128(flat, ps[1]);
- op1 = _mm_and_si128(flat, op1);
- *p1 = _mm_or_si128(ps[1], op1);
- } else {
- *q0 = qs[0];
- *q1 = qs[1];
- *p0 = ps[0];
- *p1 = ps[1];
- }
-}
-
-void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p,
- const uint8_t *_blimit,
- const uint8_t *_limit,
- const uint8_t *_thresh, int bd) {
- __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out;
-
- p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
- p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
- p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
- q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
- q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
- q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-
- highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out,
- _blimit, _limit, _thresh, bd);
-
- _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8));
- _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out);
- _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out);
- _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8));
-}
-
-void aom_highbd_lpf_horizontal_6_dual_sse2(
- uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
- const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
- const uint8_t *_thresh1, int bd) {
- __m128i p2, p1, p0, q0, q1, q2;
-
- p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
- p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
- p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
- q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
- q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
- q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-
- highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
- _limit0, _thresh0, _blimit1, _limit1,
- _thresh1, bd);
-
- _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
- _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
- _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
- _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
- __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
- __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
- const unsigned char *_blimit, const unsigned char *_limit,
- const unsigned char *_thresh, int bd) {
- const __m128i zero = _mm_setzero_si128();
- __m128i blimit, limit, thresh;
- __m128i mask, hev, flat;
- __m128i pq[4];
- __m128i p1p0, q1q0, ps1ps0, qs1qs0;
- __m128i work_a, opq2, flat_p1p0, flat_q0q1;
-
- pq[0] = _mm_unpacklo_epi64(*p0, *q0);
- pq[1] = _mm_unpacklo_epi64(*p1, *q1);
- pq[2] = _mm_unpacklo_epi64(*p2, *q2);
- pq[3] = _mm_unpacklo_epi64(*p3, *q3);
-
- __m128i abs_p1p0;
-
- const __m128i four = _mm_set1_epi16(4);
- __m128i t80;
- const __m128i one = _mm_set1_epi16(0x1);
-
- get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
-
- highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
- &thresh, &hev, &mask);
-
- // lp filter
- highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
-
- // flat_mask4
- flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
- flat = _mm_max_epi16(abs_p1p0, flat);
- flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
-
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-
- flat = _mm_cmpeq_epi16(flat, zero);
- flat = _mm_and_si128(flat, mask);
- // replicate for the further "merged variables" usage
- flat = _mm_unpacklo_epi64(flat, flat);
-
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
- __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1;
- // Added before shift for rounding part of ROUND_POWER_OF_TWO
-
- // o*p2
- workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
- workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
- workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
- workp_c = _mm_add_epi16(workp_a, workp_c);
-
- // o*p1
- workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
- workp_shft0 = _mm_add_epi16(workp_a, workp_b);
-
- // o*p0
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
- workp_shft1 = _mm_add_epi16(workp_a, workp_b);
-
- flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3);
-
- // oq0
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
- workp_shft0 = _mm_add_epi16(workp_a, workp_b);
-
- // oq1
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
- workp_shft1 = _mm_add_epi16(workp_a, workp_b);
-
- flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
-
- // oq2
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
- workp_a = _mm_add_epi16(workp_a, workp_b);
- opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3);
-
- qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
- q1q0 = _mm_and_si128(flat, flat_q0q1);
- *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
- ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
- p1p0 = _mm_and_si128(flat, flat_p1p0);
- *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
-
- work_a = _mm_andnot_si128(flat, pq[2]);
- *p2 = _mm_and_si128(flat, opq2);
- *p2 = _mm_or_si128(work_a, *p2);
- *q2 = _mm_srli_si128(*p2, 8);
- }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
- __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
- __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0,
- const unsigned char *_limit0, const unsigned char *_thresh0,
- const unsigned char *_blimit1, const unsigned char *_limit1,
- const unsigned char *_thresh1, int bd) {
- __m128i blimit0, limit0, thresh0;
- __m128i t80;
- __m128i mask, flat;
- __m128i work_a, op2, oq2, op1, op0, oq0, oq1;
- __m128i abs_p1q1, abs_p0q0, work0, work1, work2;
-
- const __m128i zero = _mm_setzero_si128();
- const __m128i four = _mm_set1_epi16(4);
- const __m128i one = _mm_set1_epi16(0x1);
- const __m128i ffff = _mm_cmpeq_epi16(one, one);
-
- get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
- &blimit0, &limit0, &thresh0, &t80);
-
- abs_p0q0 = abs_diff16(*p0, *q0);
- abs_p1q1 = abs_diff16(*p1, *q1);
-
- abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
- mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
- mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
- // mask |= (abs(*p0 - q0) * 2 + abs(*p1 - q1) / 2 > blimit) * -1;
-
- // So taking maximums continues to work:
- mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
-
- work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1));
- work1 =
- _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0)); // tbu 4 flat
- work0 = _mm_max_epi16(work0, work1);
- work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3));
- work2 = _mm_max_epi16(work2, work0);
- mask = _mm_max_epi16(work2, mask);
-
- mask = _mm_subs_epu16(mask, limit0);
- mask = _mm_cmpeq_epi16(mask, zero);
-
- // lp filter
- __m128i ps[2], qs[2], p[2], q[2];
- {
- p[0] = *p0;
- p[1] = *p1;
- q[0] = *q0;
- q[1] = *q1;
- // filter_mask and hev_mask
- highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
- }
-
- flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0));
- flat = _mm_max_epi16(work1, flat);
- work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0));
- flat = _mm_max_epi16(work0, flat);
-
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
- flat = _mm_cmpeq_epi16(flat, zero);
- flat = _mm_and_si128(flat, mask); // flat & mask
-
- // filter8 need it only if flat !=0
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
- __m128i workp_a, workp_b;
- // Added before shift for rounding part of ROUND_POWER_OF_TWO
-
- // o*p2
- workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
- workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
- workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
- op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- // o*p1
- workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
- op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- // o*p0
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
- op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- // oq0
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
- oq0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- // oq1
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
- oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- // oq2
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
- oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- qs[0] = _mm_andnot_si128(flat, qs[0]);
- oq0 = _mm_and_si128(flat, oq0);
- *q0 = _mm_or_si128(qs[0], oq0);
-
- qs[1] = _mm_andnot_si128(flat, qs[1]);
- oq1 = _mm_and_si128(flat, oq1);
- *q1 = _mm_or_si128(qs[1], oq1);
-
- ps[0] = _mm_andnot_si128(flat, ps[0]);
- op0 = _mm_and_si128(flat, op0);
- *p0 = _mm_or_si128(ps[0], op0);
-
- ps[1] = _mm_andnot_si128(flat, ps[1]);
- op1 = _mm_and_si128(flat, op1);
- *p1 = _mm_or_si128(ps[1], op1);
-
- work_a = _mm_andnot_si128(flat, *q2);
- *q2 = _mm_and_si128(flat, oq2);
- *q2 = _mm_or_si128(work_a, *q2);
-
- work_a = _mm_andnot_si128(flat, *p2);
- *p2 = _mm_and_si128(flat, op2);
- *p2 = _mm_or_si128(work_a, *p2);
- } else {
- *q0 = qs[0];
- *q1 = qs[1];
- *p0 = ps[0];
- *p1 = ps[1];
- }
-}
-
-void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
- const uint8_t *_blimit,
- const uint8_t *_limit,
- const uint8_t *_thresh, int bd) {
- __m128i p2, p1, p0, q0, q1, q2, p3, q3;
- __m128i q1q0, p1p0;
-
- p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
- q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
- p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
- q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
- p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
- q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
- p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
- q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
-
- highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0,
- &p1p0, _blimit, _limit, _thresh, bd);
-
- _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
- _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
- _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
- _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
- _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
- _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
-}
-
-void aom_highbd_lpf_horizontal_8_dual_sse2(
- uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
- const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
- const uint8_t *_thresh1, int bd) {
- __m128i p2, p1, p0, q0, q1, q2, p3, q3;
-
- p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
- q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
- p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
- q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
- p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
- q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
- p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
- q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
-
- highbd_lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0,
- _blimit0, _limit0, _thresh0, _blimit1,
- _limit1, _thresh1, bd);
-
- _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
- _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
- _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
- _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
- _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
- _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2(
- __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out,
- __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit,
- const uint8_t *_thresh, int bd) {
- __m128i blimit, limit, thresh;
- __m128i mask, hev;
- __m128i p1p0, q1q0;
- __m128i pq[2];
-
- __m128i abs_p1p0;
-
- __m128i t80;
- get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
-
- pq[0] = _mm_unpacklo_epi64(*p0, *q0);
- pq[1] = _mm_unpacklo_epi64(*p1, *q1);
-
- highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
- &thresh, &hev, &mask);
-
- highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2(
- __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps,
- __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0,
- const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
- const uint8_t *_thresh1, int bd) {
- __m128i blimit0, limit0, thresh0;
- __m128i mask, flat;
- __m128i p[2], q[2];
-
- const __m128i zero = _mm_setzero_si128();
- __m128i abs_p0q0 = abs_diff16(*q0, *p0);
- __m128i abs_p1q1 = abs_diff16(*q1, *p1);
-
- __m128i abs_p1p0 = abs_diff16(*p1, *p0);
- __m128i abs_q1q0 = abs_diff16(*q1, *q0);
-
- const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
- const __m128i one = _mm_set1_epi16(1);
-
- __m128i t80;
-
- get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
- &blimit0, &limit0, &thresh0, &t80);
-
- // filter_mask and hev_mask
- flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-
- abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-
- mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
- mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
- // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
- // So taking maximums continues to work:
- mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
- mask = _mm_max_epi16(flat, mask);
-
- mask = _mm_subs_epu16(mask, limit0);
- mask = _mm_cmpeq_epi16(mask, zero);
-
- p[0] = *p0;
- p[1] = *p1;
- q[0] = *q0;
- q[1] = *q1;
-
- highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
-}
-
-void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
- const uint8_t *_blimit,
- const uint8_t *_limit,
- const uint8_t *_thresh, int bd) {
- __m128i p1p0, q1q0;
- __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
- __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
- __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
- __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-
- highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit,
- _thresh, bd);
-
- _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
- _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
- _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
- _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
-}
-
-void aom_highbd_lpf_horizontal_4_dual_sse2(
- uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
- const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
- const uint8_t *_thresh1, int bd) {
- __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
- __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
- __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
- __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
- __m128i ps[2], qs[2];
-
- highbd_lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, ps, qs, _blimit0, _limit0,
- _thresh0, _blimit1, _limit1, _thresh1, bd);
-
- _mm_storeu_si128((__m128i *)(s - 2 * p), ps[1]);
- _mm_storeu_si128((__m128i *)(s - 1 * p), ps[0]);
- _mm_storeu_si128((__m128i *)(s + 0 * p), qs[0]);
- _mm_storeu_si128((__m128i *)(s + 1 * p), qs[1]);
-}
-
-void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- __m128i x0, x1, x2, x3, d0, d1, d2, d3;
- __m128i p1p0, q1q0;
- __m128i p1, q1;
-
- x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
- x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
- x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
- x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
-
- highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3);
-
- highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit,
- thresh, bd);
-
- p1 = _mm_srli_si128(p1p0, 8);
- q1 = _mm_srli_si128(q1q0, 8);
-
- // transpose from 8x4 to 4x8
- highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
-
- _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
- _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
- _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
- _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
-}
-
-void aom_highbd_lpf_vertical_4_dual_sse2(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- __m128i x0, x1, x2, x3, x4, x5, x6, x7;
- __m128i d0, d1, d2, d3, d4, d5, d6, d7;
- __m128i ps[2], qs[2];
-
- x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
- x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
- x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
- x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
- x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p));
- x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p));
- x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p));
- x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p));
-
- highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
- &d2, &d3);
-
- highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0,
- thresh0, blimit1, limit1, thresh1, bd);
-
- highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2,
- &d3, &d4, &d5, &d6, &d7);
-
- _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
- _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
- _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
- _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
- _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
- _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
- _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
- _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
-}
-
-void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- __m128i d0, d1, d2, d3, d4, d5, d6, d7;
- __m128i x3, x2, x1, x0, p0, q0;
- __m128i p1p0, q1q0;
-
- x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
- x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
- x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
- x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
-
- highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5,
- &d6, &d7);
-
- highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit,
- limit, thresh, bd);
-
- p0 = _mm_srli_si128(p1p0, 8);
- q0 = _mm_srli_si128(q1q0, 8);
-
- highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
-
- _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
- _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
- _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
- _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
-}
-
-void aom_highbd_lpf_vertical_6_dual_sse2(
- uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
- const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
- const uint8_t *_thresh1, int bd) {
- __m128i d0, d1, d2, d3, d4, d5, d6, d7;
- __m128i x0, x1, x2, x3, x4, x5, x6, x7;
- __m128i p0, q0, p1, q1, p2, q2;
-
- x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
- x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
- x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
- x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
- x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p));
- x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p));
- x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p));
- x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p));
-
- highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1,
- &p0, &q0, &q1, &q2, &d6, &d7);
-
- highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
- _limit0, _thresh0, _blimit1, _limit1,
- _thresh1, bd);
-
- highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5,
- &d6, &d7);
-
- _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
- _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
- _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
- _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
- _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
- _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
- _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
- _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
-}
-
-void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- __m128i d0, d1, d2, d3, d4, d5, d6, d7;
- __m128i p2, p1, p0, p3, q0;
- __m128i q1q0, p1p0;
-
- p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p));
- p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p));
- p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p));
- p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p));
-
- highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5,
- &d6, &d7);
-
- // Loop filtering
- highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0,
- &p1p0, blimit, limit, thresh, bd);
-
- p0 = _mm_srli_si128(p1p0, 8);
- q0 = _mm_srli_si128(q1q0, 8);
-
- highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0,
- &d1, &d2, &d3);
-
- _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0);
- _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1);
- _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2);
- _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3);
-}
-
-void aom_highbd_lpf_vertical_8_dual_sse2(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- __m128i x0, x1, x2, x3, x4, x5, x6, x7;
- __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-
- x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p));
- x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p));
- x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p));
- x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p));
- x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p));
- x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p));
- x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p));
- x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p));
-
- highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
- &d2, &d3, &d4, &d5, &d6, &d7);
-
- highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4,
- blimit0, limit0, thresh0, blimit1, limit1,
- thresh1, bd);
-
- highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1,
- &x2, &x3, &x4, &x5, &x6, &x7);
-
- _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0);
- _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1);
- _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2);
- _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3);
- _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4);
- _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5);
- _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6);
- _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7);
-}
-
-void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh, int bd) {
- __m128i q[7], p[7], pq[7];
- __m128i p6, p5, p4, p3;
- __m128i p6_2, p5_2, p4_2, p3_2;
- __m128i d0, d1, d2, d3;
- __m128i d0_2, d1_2, d2_2, d3_2, d7_2;
-
- p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
- p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
- p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
- p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
-
- highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4],
- &p[3], &p[2], &p[1], &p[0]);
-
- p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
- p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
- p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
- p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
-
- highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2],
- &q[3], &q[4], &q[5], &q[6], &d7_2);
-
- highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
-
- highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2],
- &pq[1], &pq[0], &d0, &d1, &d2, &d3);
-
- q[0] = _mm_srli_si128(pq[0], 8);
- q[1] = _mm_srli_si128(pq[1], 8);
- q[2] = _mm_srli_si128(pq[2], 8);
- q[3] = _mm_srli_si128(pq[3], 8);
- q[4] = _mm_srli_si128(pq[4], 8);
- q[5] = _mm_srli_si128(pq[5], 8);
-
- highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6],
- &d7_2, &d0_2, &d1_2, &d2_2, &d3_2);
-
- _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0);
- _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2);
-
- _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1);
- _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2);
-
- _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2);
- _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2);
-
- _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3);
- _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2);
-}
-
-void aom_highbd_lpf_vertical_14_dual_sse2(
- uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- __m128i q[7], p[7];
- __m128i p6, p5, p4, p3, p2, p1, p0, q0;
- __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2;
- __m128i d0, d7;
- __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out;
-
- p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
- p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
- p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
- p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
- p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch));
- p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch));
- p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch));
- q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch));
-
- highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6],
- &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]);
-
- p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
- p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
- p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
- p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
- p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
- p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
- p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
- q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
-
- highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2,
- &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5],
- &q[6], &d7);
-
- highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1,
- limit1, thresh1, bd);
-
- highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0],
- &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
- &d6_out, &d7_out);
-
- _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out);
- _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out);
- _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out);
- _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out);
- _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out);
- _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out);
- _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out);
- _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out);
-
- highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7,
- &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
- &d6_out, &d7_out);
-
- _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out);
- _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out);
- _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out);
- _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out);
- _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out);
- _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out);
- _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out);
- _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out);
-}
-#endif // !CONFIG_NEW_DF
diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c
index 9ada1e6..4e2b038 100644
--- a/av1/common/av1_loopfilter.c
+++ b/av1/common/av1_loopfilter.c
@@ -23,7 +23,6 @@
#include "av1/common/reconinter.h"
#include "av1/common/seg_common.h"
-#if CONFIG_NEW_DF || CONFIG_PEF
#define DF_MVS 0
#if DF_MVS
#define DF_MV_THRESH 8
@@ -57,7 +56,6 @@
1460, 1470, 1480, 1489, 1499, 1509, 1519, 1529, 1539, 1549, 1559, 1569, 1579,
1589, 1599, 1608, 1618, 1628, 1638, 1648, 1658, 1668, 1678
};
-#endif // CONFIG_NEW_DF || CONFIG_PEF
static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_MB_PLANE][2] = {
{ SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H },
@@ -65,11 +63,6 @@
{ SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V }
};
-#if !CONFIG_NEW_DF
-static const int delta_lf_id_lut[MAX_MB_PLANE][2] = { { 0, 1 },
- { 2, 2 },
- { 3, 3 } };
-#endif
static const int mode_lf_lut[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES
1, 0, 1, // INTER_SINGLE_MODES (GLOBALMV == 0)
@@ -97,7 +90,6 @@
#endif // CONFIG_OPTFLOW_REFINEMENT
};
-#if CONFIG_NEW_DF || CONFIG_PEF
// Function obtains q_threshold from the quantization index.
int df_quant_from_qindex(int q_index, int bit_depth) {
int qstep = ROUND_POWER_OF_TWO(av1_ac_quant_QTX(q_index, 0, bit_depth),
@@ -120,32 +112,7 @@
return side_threshold;
}
-#endif // CONFIG_NEW_DF || CONFIG_PEF
-#if !CONFIG_NEW_DF
-static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
- int lvl;
-
- // For each possible value for the loop filter fill out limits
- for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) {
- // Set loop filter parameters that control sharpness.
- int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4));
-
- if (sharpness_lvl > 0) {
- if (block_inside_limit > (9 - sharpness_lvl))
- block_inside_limit = (9 - sharpness_lvl);
- }
-
- if (block_inside_limit < 1) block_inside_limit = 1;
-
- memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
- memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
- SIMD_WIDTH);
- }
-}
-#endif // !CONFIG_NEW_DF
-
-#if CONFIG_NEW_DF
uint16_t av1_get_filter_q(const loop_filter_info_n *lfi_n, const int dir_idx,
int plane, const MB_MODE_INFO *mbmi) {
const int segment_id = mbmi->segment_id;
@@ -162,80 +129,19 @@
mbmi->ref_frame[0])][mode_lf_lut[mbmi->mode]];
}
-#else
-uint8_t av1_get_filter_level(const AV1_COMMON *cm,
- const loop_filter_info_n *lfi_n, const int dir_idx,
- int plane, const MB_MODE_INFO *mbmi) {
- const int segment_id = mbmi->segment_id;
- if (cm->delta_q_info.delta_lf_present_flag) {
- int8_t delta_lf;
- if (cm->delta_q_info.delta_lf_multi) {
- const int delta_lf_idx = delta_lf_id_lut[plane][dir_idx];
- delta_lf = mbmi->delta_lf[delta_lf_idx];
- } else {
- delta_lf = mbmi->delta_lf_from_base;
- }
- int base_level;
- if (plane == 0)
- base_level = cm->lf.filter_level[dir_idx];
- else if (plane == 1)
- base_level = cm->lf.filter_level_u;
- else
- base_level = cm->lf.filter_level_v;
- int lvl_seg = clamp(delta_lf + base_level, 0, MAX_LOOP_FILTER);
- assert(plane >= 0 && plane <= 2);
- const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir_idx];
- if (segfeature_active(&cm->seg, segment_id, seg_lf_feature_id)) {
- const int data = get_segdata(&cm->seg, segment_id, seg_lf_feature_id);
- lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
- }
-
- if (cm->lf.mode_ref_delta_enabled) {
- const int scale = 1 << (lvl_seg >> 5);
- lvl_seg +=
- cm->lf.ref_deltas[COMPACT_INDEX0_NRS(mbmi->ref_frame[0])] * scale;
- if (is_inter_ref_frame(mbmi->ref_frame[0]))
- lvl_seg += cm->lf.mode_deltas[mode_lf_lut[mbmi->mode]] * scale;
- lvl_seg = clamp(lvl_seg, 0, MAX_LOOP_FILTER);
- }
- return lvl_seg;
- } else {
- return lfi_n->lvl[plane][segment_id][dir_idx][COMPACT_INDEX0_NRS(
- mbmi->ref_frame[0])][mode_lf_lut[mbmi->mode]];
- }
-}
-#endif // CONFIG_NEW_DF
-
void av1_loop_filter_init(AV1_COMMON *cm) {
assert(MB_MODE_COUNT == NELEMENTS(mode_lf_lut));
struct loopfilter *lf = &cm->lf;
-#if !CONFIG_NEW_DF
- loop_filter_info_n *lfi = &cm->lf_info;
- int lvl;
-#endif // !CONFIG_NEW_DF
lf->combine_vert_horz_lf = 1;
-#if !CONFIG_NEW_DF
- // init limits for given sharpness
- update_sharpness(lfi, lf->sharpness_level);
-
- // init hev threshold const vectors
- for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
- memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
-#endif
}
-#if CONFIG_NEW_DF
// Update the loop filter for the current frame.
// This should be called before loop_filter_rows(),
// av1_loop_filter_frame() calls this function directly.
void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
int plane_end) {
-#if CONFIG_NEW_DF
int q_ind[MAX_MB_PLANE], q_ind_r[MAX_MB_PLANE], side_ind[MAX_MB_PLANE],
side_ind_r[MAX_MB_PLANE];
-#else
- int filt_lvl[MAX_MB_PLANE], filt_lvl_r[MAX_MB_PLANE];
-#endif // CONFIG_NEW_DF
int plane;
int seg_id;
// n_shift is the multiplier for lf_deltas
@@ -245,7 +151,6 @@
struct loopfilter *const lf = &cm->lf;
const struct segmentation *const seg = &cm->seg;
-#if CONFIG_NEW_DF
#if DF_DUAL
q_ind[0] =
cm->quant_params.base_qindex + cm->lf.delta_q_luma[0] * DF_DELTA_SCALE;
@@ -286,23 +191,10 @@
cm->lf.delta_q_v * DF_DELTA_SCALE;
side_ind_r[2] = cm->quant_params.base_qindex + cm->quant_params.v_ac_delta_q +
cm->lf.delta_side_v * DF_DELTA_SCALE;
-#else
- // update sharpness limits
- update_sharpness(lfi, lf->sharpness_level);
-
- filt_lvl[0] = cm->lf.filter_level[0];
- filt_lvl[1] = cm->lf.filter_level_u;
- filt_lvl[2] = cm->lf.filter_level_v;
-
- filt_lvl_r[0] = cm->lf.filter_level[1];
- filt_lvl_r[1] = cm->lf.filter_level_u;
- filt_lvl_r[2] = cm->lf.filter_level_v;
-#endif // CONFIG_NEW_DF
assert(plane_start >= AOM_PLANE_Y);
assert(plane_end <= MAX_MB_PLANE);
-#if CONFIG_NEW_DF
for (plane = plane_start; plane < plane_end; plane++) {
if (plane == 0 && !cm->lf.filter_level[0] && !cm->lf.filter_level[1])
break;
@@ -310,36 +202,19 @@
continue;
else if (plane == 2 && !cm->lf.filter_level_v)
continue;
-#else
- for (plane = plane_start; plane < plane_end; plane++) {
- if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0])
- break;
- else if (plane == 1 && !filt_lvl[1])
- continue;
- else if (plane == 2 && !filt_lvl[2])
- continue;
-#endif // CONFIG_NEW_DF
+
for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
for (int dir = 0; dir < 2; ++dir) {
-#if CONFIG_NEW_DF
int q_ind_seg = (dir == 0) ? q_ind[plane] : q_ind_r[plane];
int side_ind_seg = (dir == 0) ? side_ind[plane] : side_ind_r[plane];
-
-#else
- int lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane];
-#endif // CONFIG_NEW_DF
const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir];
if (segfeature_active(seg, seg_id, seg_lf_feature_id)) {
const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id);
-#if CONFIG_NEW_DF
// TODO(Andrey): add separate offsets to segments for q and side
// thresholds // add clamp
q_ind_seg += data;
side_ind_seg += data;
-#else
- lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
-#endif // CONFIG_NEW_DF
}
if (!lf->mode_ref_delta_enabled) {
@@ -350,7 +225,6 @@
// we could get rid of this if we assume that deltas are set to
// zero when not in use; encoder always uses deltas
-#if CONFIG_NEW_DF
int ref, mode;
lfi->q_thr[plane][seg_id][dir][INTRA_FRAME_INDEX][0] = q_thr_seg;
lfi->side_thr[plane][seg_id][dir][INTRA_FRAME_INDEX][0] =
@@ -369,12 +243,7 @@
side_thr_seg;
}
#endif // CONFIG_TIP
-#else
- memset(lfi->lvl[plane][seg_id][dir], lvl_seg,
- sizeof(lfi->lvl[plane][seg_id][dir]));
-#endif // CONFIG_NEW_DF
} else {
-#if CONFIG_NEW_DF
// we could get rid of this if we assume that deltas are set to
// zero when not in use; encoder always uses deltas
const int scale = 4;
@@ -415,118 +284,11 @@
cm->seq_params.bit_depth);
}
#endif // CONFIG_TIP
-#else
- int ref, mode;
- const int scale = 1 << (lvl_seg >> 5);
- const int intra_lvl =
- lvl_seg + lf->ref_deltas[INTRA_FRAME_INDEX] * scale;
- lfi->lvl[plane][seg_id][dir][INTRA_FRAME_INDEX][0] =
- clamp(intra_lvl, 0, MAX_LOOP_FILTER);
- for (ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) {
- for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
- const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale +
- lf->mode_deltas[mode] * scale;
- lfi->lvl[plane][seg_id][dir][ref][mode] =
- clamp(inter_lvl, 0, MAX_LOOP_FILTER);
- }
- }
-#if CONFIG_TIP
- for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
- const int inter_lvl = lvl_seg +
- lf->ref_deltas[TIP_FRAME_INDEX] * scale +
- lf->mode_deltas[mode] * scale;
- lfi->lvl[plane][seg_id][dir][TIP_FRAME_INDEX][mode] =
- clamp(inter_lvl, 0, MAX_LOOP_FILTER);
- }
-#endif // CONFIG_TIP
-#endif // CONFIG_NEW_DF
}
}
}
}
}
-#else
-// Update the loop filter for the current frame.
-// This should be called before loop_filter_rows(),
-// av1_loop_filter_frame() calls this function directly.
-void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
- int plane_end) {
- int filt_lvl[MAX_MB_PLANE], filt_lvl_r[MAX_MB_PLANE];
- int plane;
- int seg_id;
- // n_shift is the multiplier for lf_deltas
- // the multiplier is 1 for when filter_lvl is between 0 and 31;
- // 2 when filter_lvl is between 32 and 63
- loop_filter_info_n *const lfi = &cm->lf_info;
- struct loopfilter *const lf = &cm->lf;
- const struct segmentation *const seg = &cm->seg;
-
- // update sharpness limits
- update_sharpness(lfi, lf->sharpness_level);
-
- filt_lvl[0] = cm->lf.filter_level[0];
- filt_lvl[1] = cm->lf.filter_level_u;
- filt_lvl[2] = cm->lf.filter_level_v;
-
- filt_lvl_r[0] = cm->lf.filter_level[1];
- filt_lvl_r[1] = cm->lf.filter_level_u;
- filt_lvl_r[2] = cm->lf.filter_level_v;
-
- assert(plane_start >= AOM_PLANE_Y);
- assert(plane_end <= MAX_MB_PLANE);
-
- for (plane = plane_start; plane < plane_end; plane++) {
- if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0])
- break;
- else if (plane == 1 && !filt_lvl[1])
- continue;
- else if (plane == 2 && !filt_lvl[2])
- continue;
-
- for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
- for (int dir = 0; dir < 2; ++dir) {
- int lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane];
- const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir];
- if (segfeature_active(seg, seg_id, seg_lf_feature_id)) {
- const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id);
- lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
- }
-
- if (!lf->mode_ref_delta_enabled) {
- // we could get rid of this if we assume that deltas are set to
- // zero when not in use; encoder always uses deltas
- memset(lfi->lvl[plane][seg_id][dir], lvl_seg,
- sizeof(lfi->lvl[plane][seg_id][dir]));
- } else {
- int ref, mode;
- const int scale = 1 << (lvl_seg >> 5);
- const int intra_lvl =
- lvl_seg + lf->ref_deltas[INTRA_FRAME_INDEX] * scale;
- lfi->lvl[plane][seg_id][dir][INTRA_FRAME_INDEX][0] =
- clamp(intra_lvl, 0, MAX_LOOP_FILTER);
- for (ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) {
- for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
- const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale +
- lf->mode_deltas[mode] * scale;
- lfi->lvl[plane][seg_id][dir][ref][mode] =
- clamp(inter_lvl, 0, MAX_LOOP_FILTER);
- }
- }
-#if CONFIG_TIP
- for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
- const int inter_lvl = lvl_seg +
- lf->ref_deltas[TIP_FRAME_INDEX] * scale +
- lf->mode_deltas[mode] * scale;
- lfi->lvl[plane][seg_id][dir][TIP_FRAME_INDEX][mode] =
- clamp(inter_lvl, 0, MAX_LOOP_FILTER);
- }
-#endif // CONFIG_TIP
- }
- }
- }
- }
-}
-#endif // CONFIG_NEW_DF
static TX_SIZE get_transform_size(const MACROBLOCKD *const xd,
const MB_MODE_INFO *const mbmi,
@@ -584,10 +346,8 @@
const uint8_t *lim;
const uint8_t *mblim;
const uint8_t *hev_thr;
-#if CONFIG_NEW_DF
uint16_t q_threshold;
uint16_t side_threshold;
-#endif // CONFIG_NEW_DF
} AV1_DEBLOCKING_PARAMETERS;
// Return TX_SIZE from get_transform_size(), so it is plane and direction
@@ -646,21 +406,13 @@
// prepare outer edge parameters. deblock the edge if it's an edge of a TU
{
-#if CONFIG_NEW_DF
const uint32_t curr_q =
av1_get_filter_q(&cm->lf_info, edge_dir, plane, mbmi);
const uint32_t curr_side =
av1_get_filter_side(&cm->lf_info, edge_dir, plane, mbmi);
-#else
- const uint32_t curr_level =
- av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi);
-#endif // CONFIG_NEW_DF
const int curr_skipped =
mbmi->skip_txfm[plane_type] && is_inter_block(mbmi, tree_type);
-#if !CONFIG_NEW_DF
- uint32_t level = curr_level;
-#endif // !CONFIG_NEW_DF
if (coord) {
{
const MB_MODE_INFO *const mi_prev = *(mi - mode_step);
@@ -673,15 +425,11 @@
const TX_SIZE pv_ts =
get_transform_size(xd, mi_prev, edge_dir, pv_row, pv_col, plane,
tree_type, plane_ptr);
-#if CONFIG_NEW_DF
const uint32_t pv_q =
av1_get_filter_q(&cm->lf_info, edge_dir, plane, mi_prev);
const uint32_t pv_side =
av1_get_filter_side(&cm->lf_info, edge_dir, plane, mi_prev);
-#else
- const uint32_t pv_lvl =
- av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev);
-#endif // CONFIG_NEW_DF
+
const int pv_skip_txfm = mi_prev->skip_txfm[plane_type] &&
is_inter_block(mi_prev, tree_type);
const BLOCK_SIZE bsize = get_mb_plane_block_size_from_tree_type(
@@ -699,7 +447,6 @@
const int32_t pu_edge = !(coord & prediction_masks);
// if the current and the previous blocks are skipped,
// deblock the edge if the edge belongs to a PU's edge only.
-#if CONFIG_NEW_DF
#if DF_REDUCED_SB_EDGE
const BLOCK_SIZE superblock_size = get_plane_block_size(
cm->seq_params.sb_size, plane_ptr->subsampling_x,
@@ -770,19 +517,13 @@
}
}
#endif // DF_MVS
-#endif // CONFIG_NEW_DF
-#if CONFIG_NEW_DF
if (((curr_q && curr_side) || (pv_q && pv_side)) &&
-#else
- if ((curr_level || pv_lvl) &&
-#endif
-#if CONFIG_NEW_DF && DF_MVS
+#if DF_MVS
(!pv_skip_txfm || !curr_skipped || diff_mvs)) {
#else
(!pv_skip_txfm || !curr_skipped || pu_edge)) {
#endif
-#if CONFIG_NEW_DF
TX_SIZE clipped_ts = ts;
if (!plane) {
if (((VERT_EDGE == edge_dir) && (width < x + 16)) ||
@@ -798,9 +539,6 @@
}
}
const TX_SIZE min_ts = AOMMIN(clipped_ts, pv_ts);
-#else
- const TX_SIZE min_ts = AOMMIN(ts, pv_ts);
-#endif // CONFIG_NEW_DF
if (TX_4X4 >= min_ts) {
params->filter_length = 4;
} else if (TX_8X8 == min_ts) {
@@ -853,28 +591,13 @@
}
#endif // DF_FILT26
-#if CONFIG_NEW_DF
// update the level if the current block is skipped,
// but the previous one is not
params->q_threshold = (curr_q) ? (curr_q) : (pv_q);
params->side_threshold = (curr_side) ? (curr_side) : (pv_side);
-#else
- // update the level if the current block is skipped,
- // but the previous one is not
- level = (curr_level) ? (curr_level) : (pv_lvl);
-#endif // CONFIG_NEW_DF
}
}
}
-#if !CONFIG_NEW_DF
- // prepare common parameters
- if (params->filter_length) {
- const loop_filter_thresh *const limits = cm->lf_info.lfthr + level;
- params->lim = limits->lim;
- params->mblim = limits->mblim;
- params->hev_thr = limits->hev_thr;
- }
-#endif // !CONFIG_NEW_DF
}
}
return ts;
@@ -884,9 +607,7 @@
const MACROBLOCKD *const xd, const int plane,
const MACROBLOCKD_PLANE *const plane_ptr,
const uint32_t mi_row, const uint32_t mi_col) {
-#if CONFIG_NEW_DF
if (!plane && !cm->lf.filter_level[0]) return;
-#endif
const uint32_t scale_horz = plane_ptr->subsampling_x;
const uint32_t scale_vert = plane_ptr->subsampling_y;
uint16_t *const dst_ptr = plane_ptr->dst.buf;
@@ -916,39 +637,12 @@
}
const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
-#if CONFIG_NEW_DF
-
if (params.filter_length) {
aom_highbd_lpf_vertical_generic_c(p, dst_stride, params.filter_length,
&params.q_threshold,
&params.side_threshold, bit_depth);
}
-#else
- switch (params.filter_length) {
- // apply 4-tap filtering
- case 4:
- aom_highbd_lpf_vertical_4(p, dst_stride, params.mblim, params.lim,
- params.hev_thr, bit_depth);
- break;
- case 6: // apply 6-tap filter for chroma plane only
- assert(plane != 0);
- aom_highbd_lpf_vertical_6(p, dst_stride, params.mblim, params.lim,
- params.hev_thr, bit_depth);
- break;
- // apply 8-tap filtering
- case 8:
- aom_highbd_lpf_vertical_8(p, dst_stride, params.mblim, params.lim,
- params.hev_thr, bit_depth);
- break;
- // apply 14-tap filtering
- case 14:
- aom_highbd_lpf_vertical_14(p, dst_stride, params.mblim, params.lim,
- params.hev_thr, bit_depth);
- break;
- // no filtering
- default: break;
- }
-#endif // !CONFIG_NEW_DF
+
// advance the destination pointer
advance_units = tx_size_wide_unit[tx_size];
x += advance_units;
@@ -961,9 +655,7 @@
const MACROBLOCKD *const xd, const int plane,
const MACROBLOCKD_PLANE *const plane_ptr,
const uint32_t mi_row, const uint32_t mi_col) {
-#if CONFIG_NEW_DF
if (!plane && !cm->lf.filter_level[1]) return;
-#endif
const uint32_t scale_horz = plane_ptr->subsampling_x;
const uint32_t scale_vert = plane_ptr->subsampling_y;
uint16_t *const dst_ptr = plane_ptr->dst.buf;
@@ -993,41 +685,12 @@
}
const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
-#if CONFIG_NEW_DF
if (params.filter_length) {
aom_highbd_lpf_horizontal_generic_c(p, dst_stride, params.filter_length,
&params.q_threshold,
&params.side_threshold, bit_depth);
}
-#else
- switch (params.filter_length) {
- // apply 4-tap filtering
- case 4:
- aom_highbd_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim,
- params.hev_thr, bit_depth);
- break;
- // apply 6-tap filtering
- case 6:
- assert(plane != 0);
- aom_highbd_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim,
- params.hev_thr, bit_depth);
- break;
- // apply 8-tap filtering
- case 8:
- aom_highbd_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim,
- params.hev_thr, bit_depth);
- break;
- // apply 14-tap filtering
- case 14:
- aom_highbd_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim,
- params.hev_thr, bit_depth);
- break;
- // no filtering
- default: break;
- }
-#endif //! CONFIG_NEW_DF
-
// advance the destination pointer
advance_units = tx_size_high_unit[tx_size];
y += advance_units;
diff --git a/av1/common/av1_loopfilter.h b/av1/common/av1_loopfilter.h
index 094bbb4..e01f1f7 100644
--- a/av1/common/av1_loopfilter.h
+++ b/av1/common/av1_loopfilter.h
@@ -23,8 +23,6 @@
extern "C" {
#endif
-#if CONFIG_NEW_DF
-
#define MAX_DF_OFFSETS 64
#define ZERO_DF_OFFSET 32
@@ -42,10 +40,6 @@
#define DF_CHROMA_WIDE 1
#define DF_REDUCED_SB_EDGE 1
-#else
-#define DF_FILT26 0
-#define DF_CHROMA_WIDE 0
-#endif // CONFIG_NEW_DF
#define MAX_LOOP_FILTER 63
#define MAX_SHARPNESS 7
@@ -110,7 +104,6 @@
int filter_level_u;
int filter_level_v;
-#if CONFIG_NEW_DF
#if DF_DUAL
int delta_q_luma[2];
int delta_side_luma[2];
@@ -122,9 +115,7 @@
int delta_side_u;
int delta_q_v;
int delta_side_v;
-#else
- int sharpness_level;
-#endif // CONFIG_NEW_DF
+
uint8_t mode_ref_delta_enabled;
uint8_t mode_ref_delta_update;
@@ -152,16 +143,10 @@
} loop_filter_thresh;
typedef struct {
-#if CONFIG_NEW_DF
uint16_t q_thr[MAX_MB_PLANE][MAX_SEGMENTS][2][SINGLE_REF_FRAMES]
[MAX_MODE_LF_DELTAS];
uint16_t side_thr[MAX_MB_PLANE][MAX_SEGMENTS][2][SINGLE_REF_FRAMES]
[MAX_MODE_LF_DELTAS];
-#else
- loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1];
- uint8_t lvl[MAX_MB_PLANE][MAX_SEGMENTS][2][SINGLE_REF_FRAMES]
- [MAX_MODE_LF_DELTAS];
-#endif
} loop_filter_info_n;
typedef struct LoopFilterWorkerData {
@@ -208,16 +193,9 @@
const MACROBLOCKD *const xd, const int plane,
const MACROBLOCKD_PLANE *const plane_ptr,
const uint32_t mi_row, const uint32_t mi_col);
-#if !CONFIG_NEW_DF
-uint8_t av1_get_filter_level(const struct AV1Common *cm,
- const loop_filter_info_n *lfi_n, const int dir_idx,
- int plane, const MB_MODE_INFO *mbmi);
-#endif
-#if CONFIG_NEW_DF || CONFIG_PEF
int df_quant_from_qindex(int q_index, int bit_depth);
int df_side_from_qindex(int q_index, int bit_depth);
-#endif // CONFIG_NEW_DF || CONFIG_PEF
#if CONFIG_LPF_MASK
void av1_filter_block_plane_ver(struct AV1Common *const cm,
struct macroblockd_plane *const plane_ptr,
diff --git a/av1/common/entropymode.c b/av1/common/entropymode.c
index 6d3ecec..b4f0ad1 100644
--- a/av1/common/entropymode.c
+++ b/av1/common/entropymode.c
@@ -3245,16 +3245,8 @@
}
static void set_default_lf_deltas(struct loopfilter *lf) {
-#if !CONFIG_NEW_DF // was DF_REF_DELTAS
- lf->mode_ref_delta_enabled = 1;
- lf->mode_ref_delta_update = 1;
-
- av1_set_default_ref_deltas(lf->ref_deltas);
- av1_set_default_mode_deltas(lf->mode_deltas);
-#else
lf->mode_ref_delta_enabled = 0;
lf->mode_ref_delta_update = 0;
-#endif
}
void av1_setup_frame_contexts(AV1_COMMON *cm) {
diff --git a/av1/common/reconintra.h b/av1/common/reconintra.h
index d193249..e11e0b2 100644
--- a/av1/common/reconintra.h
+++ b/av1/common/reconintra.h
@@ -23,12 +23,8 @@
extern "C" {
#endif
-#if CONFIG_NEW_DF
#define DF_RESTRICT_ORIP 1
#define ORIP_BLOCK_SIZE 32
-#else
-#define DF_RESTRICT_ORIP 0
-#endif
#if CONFIG_AIMC
/*! \brief set the luma intra mode and delta angles for a given mode index.
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 3920205..c3c0d35 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -3125,7 +3125,6 @@
1) == 0);
#endif // CONFIG_LR_FLEX_SYNTAX
}
-#if CONFIG_NEW_DF
static AOM_INLINE void setup_loopfilter(AV1_COMMON *cm,
struct aom_read_bit_buffer *rb) {
const int num_planes = av1_num_planes(cm);
@@ -3281,60 +3280,6 @@
lf->mode_ref_delta_update = 0;
lf->mode_ref_delta_enabled = 0;
}
-#else
-static AOM_INLINE void setup_loopfilter(AV1_COMMON *cm,
- struct aom_read_bit_buffer *rb) {
- const int num_planes = av1_num_planes(cm);
- struct loopfilter *lf = &cm->lf;
-
- if (is_global_intrabc_allowed(cm) || cm->features.coded_lossless) {
- // write default deltas to frame buffer
- av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
- av1_set_default_mode_deltas(cm->cur_frame->mode_deltas);
- return;
- }
- assert(!cm->features.coded_lossless);
- if (cm->prev_frame) {
- // write deltas to frame buffer
- memcpy(lf->ref_deltas, cm->prev_frame->ref_deltas, SINGLE_REF_FRAMES);
- memcpy(lf->mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS);
- } else {
- av1_set_default_ref_deltas(lf->ref_deltas);
- av1_set_default_mode_deltas(lf->mode_deltas);
- }
- lf->filter_level[0] = aom_rb_read_literal(rb, 6);
- lf->filter_level[1] = aom_rb_read_literal(rb, 6);
- if (num_planes > 1) {
- if (lf->filter_level[0] || lf->filter_level[1]) {
- lf->filter_level_u = aom_rb_read_literal(rb, 6);
- lf->filter_level_v = aom_rb_read_literal(rb, 6);
- }
- }
- lf->sharpness_level = aom_rb_read_literal(rb, 3);
-
- // Read in loop filter deltas applied at the MB level based on mode or ref
- // frame.
- lf->mode_ref_delta_update = 0;
-
- lf->mode_ref_delta_enabled = aom_rb_read_bit(rb);
- if (lf->mode_ref_delta_enabled) {
- lf->mode_ref_delta_update = aom_rb_read_bit(rb);
- if (lf->mode_ref_delta_update) {
- for (int i = 0; i < SINGLE_REF_FRAMES; i++)
- if (aom_rb_read_bit(rb))
- lf->ref_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6);
-
- for (int i = 0; i < MAX_MODE_LF_DELTAS; i++)
- if (aom_rb_read_bit(rb))
- lf->mode_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6);
- }
- }
-
- // write deltas to frame buffer
- memcpy(cm->cur_frame->ref_deltas, lf->ref_deltas, SINGLE_REF_FRAMES);
- memcpy(cm->cur_frame->mode_deltas, lf->mode_deltas, MAX_MODE_LF_DELTAS);
-}
-#endif // CONFIG_NEW_DF
static AOM_INLINE void setup_cdef(AV1_COMMON *cm,
struct aom_read_bit_buffer *rb) {
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 110f436..4bbcaaf 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -3659,38 +3659,7 @@
#endif // CONFIG_PC_WIENER
}
}
-#if !CONFIG_NEW_DF
-// Only write out the ref delta section if any of the elements
-// will signal a delta.
-static bool is_mode_ref_delta_meaningful(AV1_COMMON *cm) {
- struct loopfilter *lf = &cm->lf;
- if (!lf->mode_ref_delta_update) {
- return 0;
- }
- const RefCntBuffer *buf = get_primary_ref_frame_buf(cm);
- int8_t last_ref_deltas[SINGLE_REF_FRAMES];
- int8_t last_mode_deltas[MAX_MODE_LF_DELTAS];
- if (buf == NULL) {
- av1_set_default_ref_deltas(last_ref_deltas);
- av1_set_default_mode_deltas(last_mode_deltas);
- } else {
- memcpy(last_ref_deltas, buf->ref_deltas, SINGLE_REF_FRAMES);
- memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS);
- }
- for (int i = 0; i < SINGLE_REF_FRAMES; i++) {
- if (lf->ref_deltas[i] != last_ref_deltas[i]) {
- return true;
- }
- }
- for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) {
- if (lf->mode_deltas[i] != last_mode_deltas[i]) {
- return true;
- }
- }
- return false;
-}
-#endif // !CONFIG_NEW_DF
-#if CONFIG_NEW_DF
+
static AOM_INLINE void encode_loopfilter(AV1_COMMON *cm,
struct aom_write_bit_buffer *wb) {
assert(!cm->features.coded_lossless);
@@ -3805,60 +3774,6 @@
#endif // DF_TWO_PARAM
}
}
-#else
-static AOM_INLINE void encode_loopfilter(AV1_COMMON *cm,
- struct aom_write_bit_buffer *wb) {
- assert(!cm->features.coded_lossless);
- if (is_global_intrabc_allowed(cm)) return;
- const int num_planes = av1_num_planes(cm);
- struct loopfilter *lf = &cm->lf;
-
- // Encode the loop filter level and type
- aom_wb_write_literal(wb, lf->filter_level[0], 6);
- aom_wb_write_literal(wb, lf->filter_level[1], 6);
- if (num_planes > 1) {
- if (lf->filter_level[0] || lf->filter_level[1]) {
- aom_wb_write_literal(wb, lf->filter_level_u, 6);
- aom_wb_write_literal(wb, lf->filter_level_v, 6);
- }
- }
- aom_wb_write_literal(wb, lf->sharpness_level, 3);
-
- aom_wb_write_bit(wb, lf->mode_ref_delta_enabled);
-
- // Write out loop filter deltas applied at the MB level based on mode or
- // ref frame (if they are enabled), only if there is information to write.
- int meaningful = is_mode_ref_delta_meaningful(cm);
- aom_wb_write_bit(wb, meaningful);
- if (!meaningful) {
- return;
- }
-
- const RefCntBuffer *buf = get_primary_ref_frame_buf(cm);
- int8_t last_ref_deltas[SINGLE_REF_FRAMES];
- int8_t last_mode_deltas[MAX_MODE_LF_DELTAS];
- if (buf == NULL) {
- av1_set_default_ref_deltas(last_ref_deltas);
- av1_set_default_mode_deltas(last_mode_deltas);
- } else {
- memcpy(last_ref_deltas, buf->ref_deltas, SINGLE_REF_FRAMES);
- memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS);
- }
-
- for (int i = 0; i < SINGLE_REF_FRAMES; i++) {
- const int delta = lf->ref_deltas[i];
- const int changed = delta != last_ref_deltas[i];
- aom_wb_write_bit(wb, changed);
- if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6);
- }
- for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) {
- const int delta = lf->mode_deltas[i];
- const int changed = delta != last_mode_deltas[i];
- aom_wb_write_bit(wb, changed);
- if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6);
- }
-}
-#endif // CONFIG_NEW_DF
static AOM_INLINE void encode_cdef(const AV1_COMMON *cm,
struct aom_write_bit_buffer *wb) {
diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c
index ed6c168..113bf17 100644
--- a/av1/encoder/picklpf.c
+++ b/av1/encoder/picklpf.c
@@ -28,10 +28,8 @@
#include "av1/encoder/encoder.h"
#include "av1/encoder/picklpf.h"
-#if CONFIG_NEW_DF
#include <float.h>
#define CHROMA_LAMBDA_MULT 6
-#endif // CONFIG_NEW_DF
static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc,
YV12_BUFFER_CONFIG *dst_bc, int plane) {
@@ -42,33 +40,16 @@
default: assert(plane >= 0 && plane <= 2); break;
}
}
-#if !CONFIG_NEW_DF
-int av1_get_max_filter_level(const AV1_COMP *cpi) {
- (void)cpi;
- return MAX_LOOP_FILTER;
-}
-
-#endif
static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
- AV1_COMP *const cpi,
-#if CONFIG_NEW_DF
- int q_offset, int side_offset,
-#else
- int filt_level,
-#endif
- int partial_frame, int plane, int dir) {
+ AV1_COMP *const cpi, int q_offset,
+ int side_offset, int partial_frame, int plane,
+ int dir) {
MultiThreadInfo *const mt_info = &cpi->mt_info;
int num_workers = mt_info->num_workers;
AV1_COMMON *const cm = &cpi->common;
int64_t filt_err;
assert(plane >= 0 && plane <= 2);
-#if !CONFIG_NEW_DF
- int filter_level[2] = { filt_level, filt_level };
- if (plane == 0 && dir == 0) filter_level[1] = cm->lf.filter_level[1];
- if (plane == 0 && dir == 1) filter_level[0] = cm->lf.filter_level[0];
-#endif // !CONFIG_NEW_DF
-#if CONFIG_NEW_DF
// set base filters for use of av1_get_filter_level when in DELTA_LF mode
switch (plane) {
case 0:
@@ -98,17 +79,7 @@
cm->lf.delta_side_v = side_offset;
break;
}
-#else
- // set base filters for use of av1_get_filter_level when in DELTA_LF mode
- switch (plane) {
- case 0:
- cm->lf.filter_level[0] = filter_level[0];
- cm->lf.filter_level[1] = filter_level[1];
- break;
- case 1: cm->lf.filter_level_u = filter_level[0]; break;
- case 2: cm->lf.filter_level_v = filter_level[0]; break;
- }
-#endif // CONFIG_NEW_DF
+
// TODO(any): please enable multi-thread and remove the flag when loop
// filter mask is compatible with multi-thread.
if (num_workers > 1)
@@ -134,7 +105,6 @@
return filt_err;
}
-#if CONFIG_NEW_DF
static int search_filter_offsets(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
int partial_frame,
const int *last_frame_offsets,
@@ -341,112 +311,7 @@
return best_cost < start_cost ? offset_best : offsets[off_ind];
}
-#else
-static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
- int partial_frame,
- const int *last_frame_filter_level,
- double *best_cost_ret, int plane, int dir) {
- const AV1_COMMON *const cm = &cpi->common;
- const int min_filter_level = 0;
- const int max_filter_level = av1_get_max_filter_level(cpi);
- int filt_direction = 0;
- int64_t best_err;
- int filt_best;
- MACROBLOCK *x = &cpi->td.mb;
- // Start the search at the previous frame filter level unless it is now out of
- // range.
- int lvl;
- switch (plane) {
- case 0:
- switch (dir) {
- case 2:
- lvl = (last_frame_filter_level[0] + last_frame_filter_level[1] + 1) >>
- 1;
- break;
- case 0:
- case 1: lvl = last_frame_filter_level[dir]; break;
- default: assert(dir >= 0 && dir <= 2); return 0;
- }
- break;
- case 1: lvl = last_frame_filter_level[2]; break;
- case 2: lvl = last_frame_filter_level[3]; break;
- default: assert(plane >= 0 && plane <= 2); return 0;
- }
- int filt_mid = clamp(lvl, min_filter_level, max_filter_level);
- int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
- // Sum squared error at each filter level
- int64_t ss_err[MAX_LOOP_FILTER + 1];
-
- // Set each entry to -1
- memset(ss_err, 0xFF, sizeof(ss_err));
- yv12_copy_plane(&cm->cur_frame->buf, &cpi->last_frame_uf, plane);
- best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane, dir);
- filt_best = filt_mid;
- ss_err[filt_mid] = best_err;
-
- while (filter_step > 0) {
- const int filt_high = AOMMIN(filt_mid + filter_step, max_filter_level);
- const int filt_low = AOMMAX(filt_mid - filter_step, min_filter_level);
-
- // Bias against raising loop filter in favor of lowering it.
- int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
-
- // yx, bias less for large block size
- if (cm->features.tx_mode != ONLY_4X4) bias >>= 1;
-
- if (filt_direction <= 0 && filt_low != filt_mid) {
- // Get Low filter error score
- if (ss_err[filt_low] < 0) {
- ss_err[filt_low] =
- try_filter_frame(sd, cpi, filt_low, partial_frame, plane, dir);
- }
- // If value is close to the best so far then bias towards a lower loop
- // filter value.
- if (ss_err[filt_low] < (best_err + bias)) {
- // Was it actually better than the previous best?
- if (ss_err[filt_low] < best_err) {
- best_err = ss_err[filt_low];
- }
- filt_best = filt_low;
- }
- }
-
- // Now look at filt_high
- if (filt_direction >= 0 && filt_high != filt_mid) {
- if (ss_err[filt_high] < 0) {
- ss_err[filt_high] =
- try_filter_frame(sd, cpi, filt_high, partial_frame, plane, dir);
- }
- // If value is significantly better than previous best, bias added against
- // raising filter value
- if (ss_err[filt_high] < (best_err - bias)) {
- best_err = ss_err[filt_high];
- filt_best = filt_high;
- }
- }
-
- // Half the step distance if the best filter value was the same as last time
- if (filt_best == filt_mid) {
- filter_step /= 2;
- filt_direction = 0;
- } else {
- filt_direction = (filt_best < filt_mid) ? -1 : 1;
- filt_mid = filt_best;
- }
- }
-
- // Update best error
- best_err = ss_err[filt_best];
-
- if (best_cost_ret)
- *best_cost_ret = RDCOST_DBL_WITH_NATIVE_BD_DIST(
- x->rdmult, 0, (best_err << 4), cm->seq_params.bit_depth);
- return filt_best;
-}
-#endif // CONFIG_NEW_DF
-
-#if CONFIG_NEW_DF
void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
LPF_PICK_METHOD method) {
AV1_COMMON *const cm = &cpi->common;
@@ -694,99 +559,3 @@
#endif // DF_DUAL
}
}
-#else
-void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
- LPF_PICK_METHOD method) {
- AV1_COMMON *const cm = &cpi->common;
- const int num_planes = av1_num_planes(cm);
- struct loopfilter *const lf = &cm->lf;
- (void)sd;
-
- lf->sharpness_level = 0;
- cpi->td.mb.rdmult = cpi->rd.RDMULT;
-
- if (method == LPF_PICK_MINIMAL_LPF) {
- lf->filter_level[0] = 0;
- lf->filter_level[1] = 0;
- } else if (method >= LPF_PICK_FROM_Q) {
- const int min_filter_level = 0;
- const int max_filter_level = av1_get_max_filter_level(cpi);
-
- const int q =
- ROUND_POWER_OF_TWO(av1_ac_quant_QTX(cm->quant_params.base_qindex, 0,
- cm->seq_params.bit_depth),
- QUANT_TABLE_BITS);
-
- // based on tests result for rtc test set
- // 0.04590 boosted or 0.02295 non-booseted in 18-bit fixed point
- const int strength_boost_q_treshold = 700;
- const int inter_frame_multiplier =
- q > strength_boost_q_treshold ? 12034 : 6017;
- // These values were determined by linear fitting the result of the
- // searched level for 8 bit depth:
- // Keyframes: filt_guess = q * 0.06699 - 1.60817
- // Other frames: filt_guess = q * inter_frame_multiplier + 2.48225
- //
- // And high bit depth separately:
- // filt_guess = q * 0.316206 + 3.87252
- int filt_guess;
- switch (cm->seq_params.bit_depth) {
- case AOM_BITS_8:
- filt_guess =
- (cm->current_frame.frame_type == KEY_FRAME)
- ? ROUND_POWER_OF_TWO(q * 17563 - 421574, 18)
- : ROUND_POWER_OF_TWO(q * inter_frame_multiplier + 650707, 18);
- break;
- case AOM_BITS_10:
- filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
- break;
- case AOM_BITS_12:
- filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
- break;
- default:
- assert(0 &&
- "bit_depth should be AOM_BITS_8, AOM_BITS_10 "
- "or AOM_BITS_12");
- return;
- }
- if (cm->seq_params.bit_depth != AOM_BITS_8 &&
- cm->current_frame.frame_type == KEY_FRAME)
- filt_guess -= 4;
- // TODO(chengchen): retrain the model for Y, U, V filter levels
- lf->filter_level[0] = clamp(filt_guess, min_filter_level, max_filter_level);
- lf->filter_level[1] = clamp(filt_guess, min_filter_level, max_filter_level);
- lf->filter_level_u = clamp(filt_guess, min_filter_level, max_filter_level);
- lf->filter_level_v = clamp(filt_guess, min_filter_level, max_filter_level);
- } else {
- // TODO(anyone): What are good initial levels for keyframes?
- int last_frame_filter_level[4] = { 0 };
- if (!frame_is_intra_only(cm)) {
- last_frame_filter_level[0] = lf->filter_level[0];
- last_frame_filter_level[1] = lf->filter_level[1];
- last_frame_filter_level[2] = lf->filter_level_u;
- last_frame_filter_level[3] = lf->filter_level_v;
- }
-
- lf->filter_level[0] = lf->filter_level[1] =
- search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
- last_frame_filter_level, NULL, 0, 2);
- if (method != LPF_PICK_FROM_FULL_IMAGE_NON_DUAL) {
- lf->filter_level[0] =
- search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
- last_frame_filter_level, NULL, 0, 0);
- lf->filter_level[1] =
- search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
- last_frame_filter_level, NULL, 0, 1);
- }
-
- if (num_planes > 1) {
- lf->filter_level_u =
- search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
- last_frame_filter_level, NULL, 1, 0);
- lf->filter_level_v =
- search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
- last_frame_filter_level, NULL, 2, 0);
- }
- }
-}
-#endif // !CONFIG_NEW_DF
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index f87d0b2..fd5021b 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -231,8 +231,6 @@
set_aom_config_var(
CONFIG_SKIP_MODE_DRL_WITH_REF_IDX 1
"AV2 experiment flag to enable DRL with ref_MV_idx for skip mode.")
-set_aom_config_var(CONFIG_NEW_DF 1
- "AV2 experiment flag on new deblocking filter.")
set_aom_config_var(CONFIG_TIP 1 "Enable temporal interpolated prediction (TIP)")
set_aom_config_var(CONFIG_OPTFLOW_ON_TIP 1
"Enable optical flow refinement on top of TIP")
diff --git a/test/lpf_test.cc b/test/lpf_test.cc
deleted file mode 100644
index 4d75c3d..0000000
--- a/test/lpf_test.cc
+++ /dev/null
@@ -1,572 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <cmath>
-#include <cstdlib>
-#include <string>
-#include <tuple>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "config/aom_config.h"
-
-#if !CONFIG_NEW_DF
-#include "config/aom_dsp_rtcd.h"
-
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "av1/common/av1_loopfilter.h"
-#include "av1/common/entropy.h"
-#include "aom/aom_integer.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-// Horizontally and Vertically need 32x32: 8 Coeffs preceeding filtered section
-// 16 Coefs within filtered section
-// 8 Coeffs following filtered section
-const int kNumCoeffs = 1024;
-
-const int number_of_iterations = 10000;
-
-const int kSpeedTestNum = 500000;
-
-#define LOOP_PARAM \
- int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh
-#define DUAL_LOOP_PARAM \
- int p, const uint8_t *blimit0, const uint8_t *limit0, \
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, \
- const uint8_t *thresh1
-
-typedef void (*loop_op_t)(uint8_t *s, LOOP_PARAM);
-typedef void (*dual_loop_op_t)(uint8_t *s, DUAL_LOOP_PARAM);
-typedef void (*hbdloop_op_t)(uint16_t *s, LOOP_PARAM, int bd);
-typedef void (*hbddual_loop_op_t)(uint16_t *s, DUAL_LOOP_PARAM, int bd);
-
-typedef std::tuple<hbdloop_op_t, hbdloop_op_t, int> hbdloop_param_t;
-typedef std::tuple<hbddual_loop_op_t, hbddual_loop_op_t, int>
- hbddual_loop_param_t;
-typedef std::tuple<loop_op_t, loop_op_t, int> loop_param_t;
-typedef std::tuple<dual_loop_op_t, dual_loop_op_t, int> dual_loop_param_t;
-
-template <typename Pixel_t, int PIXEL_WIDTH_t>
-void InitInput(Pixel_t *s, Pixel_t *ref_s, ACMRandom *rnd, const uint8_t limit,
- const int mask, const int32_t p, const int i) {
- uint16_t tmp_s[kNumCoeffs];
-
- for (int j = 0; j < kNumCoeffs;) {
- const uint8_t val = rnd->Rand8();
- if (val & 0x80) { // 50% chance to choose a new value.
- tmp_s[j] = rnd->Rand16();
- j++;
- } else { // 50% chance to repeat previous value in row X times.
- int k = 0;
- while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
- if (j < 1) {
- tmp_s[j] = rnd->Rand16();
- } else if (val & 0x20) { // Increment by a value within the limit.
- tmp_s[j] = static_cast<uint16_t>(tmp_s[j - 1] + (limit - 1));
- } else { // Decrement by a value within the limit.
- tmp_s[j] = static_cast<uint16_t>(tmp_s[j - 1] - (limit - 1));
- }
- j++;
- }
- }
- }
-
- for (int j = 0; j < kNumCoeffs;) {
- const uint8_t val = rnd->Rand8();
- if (val & 0x80) {
- j++;
- } else { // 50% chance to repeat previous value in column X times.
- int k = 0;
- while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
- if (j < 1) {
- tmp_s[j] = rnd->Rand16();
- } else if (val & 0x20) { // Increment by a value within the limit.
- tmp_s[(j % 32) * 32 + j / 32] = static_cast<uint16_t>(
- tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] + (limit - 1));
- } else { // Decrement by a value within the limit.
- tmp_s[(j % 32) * 32 + j / 32] = static_cast<uint16_t>(
- tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] - (limit - 1));
- }
- j++;
- }
- }
- }
-
- for (int j = 0; j < kNumCoeffs; j++) {
- if (i % 2) {
- s[j] = tmp_s[j] & mask;
- } else {
- s[j] = tmp_s[p * (j % p) + j / p] & mask;
- }
- ref_s[j] = s[j];
- }
-}
-
-uint8_t GetOuterThresh(ACMRandom *rnd) {
- return static_cast<uint8_t>(rnd->PseudoUniform(3 * MAX_LOOP_FILTER + 5));
-}
-
-uint8_t GetInnerThresh(ACMRandom *rnd) {
- return static_cast<uint8_t>(rnd->PseudoUniform(MAX_LOOP_FILTER + 1));
-}
-
-uint8_t GetHevThresh(ACMRandom *rnd) {
- return static_cast<uint8_t>(rnd->PseudoUniform(MAX_LOOP_FILTER + 1) >> 4);
-}
-
-template <typename func_type_t, typename params_t>
-class LoopTestParam : public ::testing::TestWithParam<params_t> {
- public:
- virtual ~LoopTestParam() {}
- virtual void SetUp() {
- loopfilter_op_ = std::get<0>(this->GetParam());
- ref_loopfilter_op_ = std::get<1>(this->GetParam());
- bit_depth_ = std::get<2>(this->GetParam());
- mask_ = (1 << bit_depth_) - 1;
- }
-
- virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
- int bit_depth_;
- int mask_;
- func_type_t loopfilter_op_;
- func_type_t ref_loopfilter_op_;
-};
-
-void call_filter(uint16_t *s, LOOP_PARAM, int bd, hbdloop_op_t op) {
- op(s, p, blimit, limit, thresh, bd);
-}
-void call_dualfilter(uint16_t *s, DUAL_LOOP_PARAM, int bd,
- hbddual_loop_op_t op) {
- op(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd);
-}
-
-typedef LoopTestParam<hbdloop_op_t, hbdloop_param_t> Loop8Test6Param_hbd;
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param_hbd);
-typedef LoopTestParam<hbddual_loop_op_t, hbddual_loop_param_t>
- Loop8Test9Param_hbd;
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test9Param_hbd);
-
-#define OPCHECK(a, b) \
- ACMRandom rnd(ACMRandom::DeterministicSeed()); \
- const int count_test_block = number_of_iterations; \
- const int32_t p = kNumCoeffs / 32; \
- DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \
- DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]); \
- int err_count_total = 0; \
- int first_failure = -1; \
- for (int i = 0; i < count_test_block; ++i) { \
- int err_count = 0; \
- uint8_t tmp = GetOuterThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetInnerThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetHevThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- InitInput<a, b>(s, ref_s, &rnd, *limit, mask_, p, i); \
- call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_, \
- ref_loopfilter_op_); \
- ASM_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit, \
- thresh, bit_depth_, loopfilter_op_)); \
- for (int j = 0; j < kNumCoeffs; ++j) { \
- err_count += ref_s[j] != s[j]; \
- } \
- if (err_count && !err_count_total) { \
- first_failure = i; \
- } \
- err_count_total += err_count; \
- } \
- EXPECT_EQ(0, err_count_total) \
- << "Error: Loop8Test6Param, C output doesn't match SIMD " \
- "loopfilter output. " \
- << "First failed at test case " << first_failure;
-
-TEST_P(Loop8Test6Param_hbd, OperationCheck) { OPCHECK(uint16_t, 16); }
-
-#define VALCHECK(a, b) \
- ACMRandom rnd(ACMRandom::DeterministicSeed()); \
- const int count_test_block = number_of_iterations; \
- DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \
- DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]); \
- int err_count_total = 0; \
- int first_failure = -1; \
- for (int i = 0; i < count_test_block; ++i) { \
- int err_count = 0; \
- uint8_t tmp = GetOuterThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetInnerThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetHevThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- int32_t p = kNumCoeffs / 32; \
- for (int j = 0; j < kNumCoeffs; ++j) { \
- s[j] = rnd.Rand16() & mask_; \
- ref_s[j] = s[j]; \
- } \
- call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_, \
- ref_loopfilter_op_); \
- ASM_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit, \
- thresh, bit_depth_, loopfilter_op_)); \
- for (int j = 0; j < kNumCoeffs; ++j) { \
- err_count += ref_s[j] != s[j]; \
- } \
- if (err_count && !err_count_total) { \
- first_failure = i; \
- } \
- err_count_total += err_count; \
- } \
- EXPECT_EQ(0, err_count_total) \
- << "Error: Loop8Test6Param, C output doesn't match SIMD " \
- "loopfilter output. " \
- << "First failed at test case " << first_failure;
-
-TEST_P(Loop8Test6Param_hbd, ValueCheck) { VALCHECK(uint16_t, 16); }
-
-#define SPEEDCHECK(a, b) \
- ACMRandom rnd(ACMRandom::DeterministicSeed()); \
- const int count_test_block = kSpeedTestNum; \
- const int32_t bd = bit_depth_; \
- DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \
- uint8_t tmp = GetOuterThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetInnerThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetHevThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- int32_t p = kNumCoeffs / 32; \
- for (int j = 0; j < kNumCoeffs; ++j) { \
- s[j] = rnd.Rand16() & mask_; \
- } \
- for (int i = 0; i < count_test_block; ++i) { \
- call_filter(s + 8 + p * 8, p, blimit, limit, thresh, bd, loopfilter_op_); \
- }
-
-TEST_P(Loop8Test6Param_hbd, DISABLED_Speed) { SPEEDCHECK(uint16_t, 16); }
-
-#define OPCHECKd(a, b) \
- ACMRandom rnd(ACMRandom::DeterministicSeed()); \
- const int count_test_block = number_of_iterations; \
- DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \
- DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]); \
- int err_count_total = 0; \
- int first_failure = -1; \
- for (int i = 0; i < count_test_block; ++i) { \
- int err_count = 0; \
- uint8_t tmp = GetOuterThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetInnerThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetHevThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetOuterThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetInnerThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetHevThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- int32_t p = kNumCoeffs / 32; \
- const uint8_t limit = *limit0 < *limit1 ? *limit0 : *limit1; \
- InitInput<a, b>(s, ref_s, &rnd, limit, mask_, p, i); \
- call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \
- limit1, thresh1, bit_depth_, ref_loopfilter_op_); \
- ASM_REGISTER_STATE_CHECK( \
- call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \
- limit1, thresh1, bit_depth_, loopfilter_op_)); \
- for (int j = 0; j < kNumCoeffs; ++j) { \
- err_count += ref_s[j] != s[j]; \
- } \
- if (err_count && !err_count_total) { \
- first_failure = i; \
- } \
- err_count_total += err_count; \
- } \
- EXPECT_EQ(0, err_count_total) \
- << "Error: Loop8Test9Param, C output doesn't match SIMD " \
- "loopfilter output. " \
- << "First failed at test case " << first_failure;
-
-TEST_P(Loop8Test9Param_hbd, OperationCheck) { OPCHECKd(uint16_t, 16); }
-
-#define VALCHECKd(a, b) \
- ACMRandom rnd(ACMRandom::DeterministicSeed()); \
- const int count_test_block = number_of_iterations; \
- DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \
- DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]); \
- int err_count_total = 0; \
- int first_failure = -1; \
- for (int i = 0; i < count_test_block; ++i) { \
- int err_count = 0; \
- uint8_t tmp = GetOuterThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetInnerThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetHevThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetOuterThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetInnerThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetHevThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- int32_t p = kNumCoeffs / 32; \
- for (int j = 0; j < kNumCoeffs; ++j) { \
- s[j] = rnd.Rand16() & mask_; \
- ref_s[j] = s[j]; \
- } \
- call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \
- limit1, thresh1, bit_depth_, ref_loopfilter_op_); \
- ASM_REGISTER_STATE_CHECK( \
- call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \
- limit1, thresh1, bit_depth_, loopfilter_op_)); \
- for (int j = 0; j < kNumCoeffs; ++j) { \
- err_count += ref_s[j] != s[j]; \
- } \
- if (err_count && !err_count_total) { \
- first_failure = i; \
- } \
- err_count_total += err_count; \
- } \
- EXPECT_EQ(0, err_count_total) \
- << "Error: Loop8Test9Param, C output doesn't match SIMD " \
- "loopfilter output. " \
- << "First failed at test case " << first_failure;
-
-TEST_P(Loop8Test9Param_hbd, ValueCheck) { VALCHECKd(uint16_t, 16); }
-
-#define SPEEDCHECKd(a, b) \
- ACMRandom rnd(ACMRandom::DeterministicSeed()); \
- const int count_test_block = kSpeedTestNum; \
- DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \
- uint8_t tmp = GetOuterThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetInnerThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetHevThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetOuterThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetInnerThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- tmp = GetHevThresh(&rnd); \
- DECLARE_ALIGNED(16, const uint8_t, \
- thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
- tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
- int32_t p = kNumCoeffs / 32; \
- for (int j = 0; j < kNumCoeffs; ++j) { \
- s[j] = rnd.Rand16() & mask_; \
- } \
- for (int i = 0; i < count_test_block; ++i) { \
- call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \
- limit1, thresh1, bit_depth_, loopfilter_op_); \
- }
-
-TEST_P(Loop8Test9Param_hbd, DISABLED_Speed) { SPEEDCHECKd(uint16_t, 16); }
-
-using std::make_tuple;
-
-#if HAVE_SSE2
-const hbdloop_param_t kHbdLoop8Test6[] = {
- make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
- 8),
- make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 8),
- make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c,
- 8),
- make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
- 8),
- make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
- &aom_highbd_lpf_horizontal_14_c, 8),
- make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 8),
- make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 8),
-
- make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c,
- 8),
- make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
- 10),
- make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 10),
- make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c,
- 10),
- make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
- 10),
- make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
- &aom_highbd_lpf_horizontal_14_c, 10),
- make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 10),
- make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 10),
- make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c,
- 10),
- make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
- 12),
- make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 12),
- make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c,
- 12),
- make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
- 12),
- make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
- &aom_highbd_lpf_horizontal_14_c, 12),
- make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c,
- 12),
- make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 12),
- make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 12)
-};
-
-INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test6Param_hbd,
- ::testing::ValuesIn(kHbdLoop8Test6));
-
-#endif // HAVE_SSE2
-
-#if HAVE_SSE2
-const hbddual_loop_param_t kHbdLoop8Test9[] = {
- make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
- &aom_highbd_lpf_horizontal_4_dual_c, 8),
- make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2,
- &aom_highbd_lpf_horizontal_6_dual_c, 8),
- make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
- &aom_highbd_lpf_horizontal_8_dual_c, 8),
- make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
- &aom_highbd_lpf_horizontal_14_dual_c, 8),
- make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
- &aom_highbd_lpf_vertical_4_dual_c, 8),
- make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2,
- &aom_highbd_lpf_vertical_6_dual_c, 8),
- make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
- &aom_highbd_lpf_vertical_8_dual_c, 8),
- make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
- &aom_highbd_lpf_vertical_14_dual_c, 8),
- make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
- &aom_highbd_lpf_horizontal_4_dual_c, 10),
- make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2,
- &aom_highbd_lpf_horizontal_6_dual_c, 10),
- make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
- &aom_highbd_lpf_horizontal_8_dual_c, 10),
- make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
- &aom_highbd_lpf_horizontal_14_dual_c, 10),
- make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
- &aom_highbd_lpf_vertical_4_dual_c, 10),
- make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2,
- &aom_highbd_lpf_vertical_6_dual_c, 10),
- make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
- &aom_highbd_lpf_vertical_8_dual_c, 10),
- make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
- &aom_highbd_lpf_vertical_14_dual_c, 10),
- make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
- &aom_highbd_lpf_horizontal_4_dual_c, 12),
- make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2,
- &aom_highbd_lpf_horizontal_6_dual_c, 12),
- make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
- &aom_highbd_lpf_horizontal_8_dual_c, 12),
- make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
- &aom_highbd_lpf_horizontal_14_dual_c, 12),
- make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
- &aom_highbd_lpf_vertical_4_dual_c, 12),
- make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2,
- &aom_highbd_lpf_vertical_6_dual_c, 12),
- make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
- &aom_highbd_lpf_vertical_8_dual_c, 12),
- make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
- &aom_highbd_lpf_vertical_14_dual_c, 12),
-};
-
-INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test9Param_hbd,
- ::testing::ValuesIn(kHbdLoop8Test9));
-
-#endif // HAVE_SSE2
-
-#if HAVE_AVX2
-const hbddual_loop_param_t kHbdLoop8Test9Avx2[] = {
- make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
- &aom_highbd_lpf_horizontal_4_dual_c, 8),
- make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
- &aom_highbd_lpf_horizontal_4_dual_c, 10),
- make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
- &aom_highbd_lpf_horizontal_4_dual_c, 12),
- make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
- &aom_highbd_lpf_horizontal_8_dual_c, 8),
- make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
- &aom_highbd_lpf_horizontal_8_dual_c, 10),
- make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
- &aom_highbd_lpf_horizontal_8_dual_c, 12),
- make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
- &aom_highbd_lpf_vertical_4_dual_c, 8),
- make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
- &aom_highbd_lpf_vertical_4_dual_c, 10),
- make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
- &aom_highbd_lpf_vertical_4_dual_c, 12),
- make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
- &aom_highbd_lpf_vertical_8_dual_c, 8),
- make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
- &aom_highbd_lpf_vertical_8_dual_c, 10),
- make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
- &aom_highbd_lpf_vertical_8_dual_c, 12),
-};
-
-INSTANTIATE_TEST_SUITE_P(AVX2, Loop8Test9Param_hbd,
- ::testing::ValuesIn(kHbdLoop8Test9Avx2));
-#endif
-} // namespace
-#endif // !CONFIG_NEW_DF
diff --git a/test/test.cmake b/test/test.cmake
index 1c6be67..a402306 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -95,7 +95,6 @@
"${AOM_ROOT}/test/hiprec_convolve_test_util.h"
"${AOM_ROOT}/test/intrabc_test.cc"
"${AOM_ROOT}/test/intrapred_test.cc"
- "${AOM_ROOT}/test/lpf_test.cc"
"${AOM_ROOT}/test/opt_flow_test.cc"
"${AOM_ROOT}/test/scan_test.cc"
"${AOM_ROOT}/test/selfguided_filter_test.cc"