Remove the CONFIG_NEW_DF compile-time flag

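This commit removes the CONFIG_NEW_DF compile-time flag. The code that
was guarded by CONFIG_NEW_DF is now compiled unconditionally, and the
legacy !CONFIG_NEW_DF high-bitdepth loop-filter code (the C reference
functions and their SSE2/AVX2 specializations) is deleted.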
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 4951eea..2926e31 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -73,7 +73,6 @@
   "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c"
   "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c"
   "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
-  "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c"
   "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c"
   "${AOM_ROOT}/aom_dsp/x86/intrapred_x86.h"
   "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h"
@@ -108,7 +107,6 @@
   "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h"
   "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c"
   "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
-  "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c"
   "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c"
   "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c"
   "${AOM_ROOT}/aom_dsp/x86/avg_intrin_avx2.c"
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 6f5f471..3423edb 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -166,58 +166,9 @@
 #
 # Loopfilter
 #
-if (aom_config("CONFIG_NEW_DF") eq "yes") {
-      add_proto qw/void aom_highbd_lpf_horizontal_generic/, "uint16_t *s, int pitch, int filt_width, const uint16_t *q_thresh, const uint16_t *side_thresh, int bd";
-      add_proto qw/void aom_highbd_lpf_vertical_generic/, "uint16_t *s, int pitch, int filt_width, const uint16_t *q_thresh, const uint16_t *side_thresh, int bd";
-} else {
-add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_14 sse2/;
+add_proto qw/void aom_highbd_lpf_horizontal_generic/, "uint16_t *s, int pitch, int filt_width, const uint16_t *q_thresh, const uint16_t *side_thresh, int bd";
+add_proto qw/void aom_highbd_lpf_vertical_generic/, "uint16_t *s, int pitch, int filt_width, const uint16_t *q_thresh, const uint16_t *side_thresh, int bd";
 
-add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_14_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_8 sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_6 sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_6_dual sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_4 sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_14 sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,int bd";
-specialize qw/aom_highbd_lpf_horizontal_14_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_6 sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_horizontal_6_dual sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_8 sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_4 sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/;
-}
 
 #
 # Encoder functions.
diff --git a/aom_dsp/loopfilter.c b/aom_dsp/loopfilter.c
index eb14d7d..6092ca3 100644
--- a/aom_dsp/loopfilter.c
+++ b/aom_dsp/loopfilter.c
@@ -18,8 +18,6 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/mem.h"
 
-#if CONFIG_NEW_DF
-
 #define DF_SPARSE 1
 #define DF_FILT26 1
 #define DF_8_THRESH 3
@@ -281,498 +279,3 @@
     s += pitch;
   }
 }
-
-#else  // !CONFIG_NEW_DF
-
-static INLINE int16_t signed_char_clamp_high(int t, int bd) {
-  switch (bd) {
-    case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
-    case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
-    case 8:
-    default: return (int16_t)clamp(t, -128, 128 - 1);
-  }
-}
-
-// Should we apply any filter at all: 11111111 yes, 00000000 no ?
-static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
-                                         uint16_t p1, uint16_t p0, uint16_t q0,
-                                         uint16_t q1, int bd) {
-  int8_t mask = 0;
-  int16_t limit16 = (uint16_t)limit << (bd - 8);
-  int16_t blimit16 = (uint16_t)blimit << (bd - 8);
-  mask |= (abs(p1 - p0) > limit16) * -1;
-  mask |= (abs(q1 - q0) > limit16) * -1;
-  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
-  return ~mask;
-}
-
-// Should we apply any filter at all: 11111111 yes, 00000000 no ?
-static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
-                                        uint16_t p3, uint16_t p2, uint16_t p1,
-                                        uint16_t p0, uint16_t q0, uint16_t q1,
-                                        uint16_t q2, uint16_t q3, int bd) {
-  int8_t mask = 0;
-  int16_t limit16 = (uint16_t)limit << (bd - 8);
-  int16_t blimit16 = (uint16_t)blimit << (bd - 8);
-  mask |= (abs(p3 - p2) > limit16) * -1;
-  mask |= (abs(p2 - p1) > limit16) * -1;
-  mask |= (abs(p1 - p0) > limit16) * -1;
-  mask |= (abs(q1 - q0) > limit16) * -1;
-  mask |= (abs(q2 - q1) > limit16) * -1;
-  mask |= (abs(q3 - q2) > limit16) * -1;
-  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
-  return ~mask;
-}
-
-static INLINE int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit,
-                                                uint16_t p2, uint16_t p1,
-                                                uint16_t p0, uint16_t q0,
-                                                uint16_t q1, uint16_t q2,
-                                                int bd) {
-  int8_t mask = 0;
-  int16_t limit16 = (uint16_t)limit << (bd - 8);
-  int16_t blimit16 = (uint16_t)blimit << (bd - 8);
-  mask |= (abs(p2 - p1) > limit16) * -1;
-  mask |= (abs(p1 - p0) > limit16) * -1;
-  mask |= (abs(q1 - q0) > limit16) * -1;
-  mask |= (abs(q2 - q1) > limit16) * -1;
-  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
-  return ~mask;
-}
-
-static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2,
-                                              uint16_t p1, uint16_t p0,
-                                              uint16_t q0, uint16_t q1,
-                                              uint16_t q2, int bd) {
-  int8_t mask = 0;
-  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
-  mask |= (abs(p1 - p0) > thresh16) * -1;
-  mask |= (abs(q1 - q0) > thresh16) * -1;
-  mask |= (abs(p2 - p0) > thresh16) * -1;
-  mask |= (abs(q2 - q0) > thresh16) * -1;
-  return ~mask;
-}
-
-static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
-                                       uint16_t p1, uint16_t p0, uint16_t q0,
-                                       uint16_t q1, uint16_t q2, uint16_t q3,
-                                       int bd) {
-  int8_t mask = 0;
-  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
-  mask |= (abs(p1 - p0) > thresh16) * -1;
-  mask |= (abs(q1 - q0) > thresh16) * -1;
-  mask |= (abs(p2 - p0) > thresh16) * -1;
-  mask |= (abs(q2 - q0) > thresh16) * -1;
-  mask |= (abs(p3 - p0) > thresh16) * -1;
-  mask |= (abs(q3 - q0) > thresh16) * -1;
-  return ~mask;
-}
-
-// Is there high edge variance internal edge:
-// 11111111_11111111 yes, 00000000_00000000 no ?
-static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
-                                      uint16_t q0, uint16_t q1, int bd) {
-  int16_t hev = 0;
-  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
-  hev |= (abs(p1 - p0) > thresh16) * -1;
-  hev |= (abs(q1 - q0) > thresh16) * -1;
-  return hev;
-}
-
-static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
-                                  uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
-                                  int bd) {
-  int16_t filter1, filter2;
-  // ^0x80 equivalent to subtracting 0x80 from the values to turn them
-  // into -128 to +127 instead of 0 to 255.
-  int shift = bd - 8;
-  const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
-  const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
-  const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
-  const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
-  const int16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
-
-  // Add outer taps if we have high edge variance.
-  int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
-
-  // Inner taps.
-  filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
-
-  // Save bottom 3 bits so that we round one side +4 and the other +3
-  // if it equals 4 we'll set to adjust by -1 to account for the fact
-  // we'd round 3 the other way.
-  filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
-  filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
-
-  *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
-  *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
-
-  // Outer tap adjustments.
-  filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
-
-  *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
-  *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
-}
-
-void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
-                                   const uint8_t *blimit, const uint8_t *limit,
-                                   const uint8_t *thresh, int bd) {
-  int i;
-  int count = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < count; ++i) {
-    const uint16_t p1 = s[-2 * p];
-    const uint16_t p0 = s[-p];
-    const uint16_t q0 = s[0 * p];
-    const uint16_t q1 = s[1 * p];
-    const int8_t mask =
-        highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
-    highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
-    ++s;
-  }
-}
-
-void aom_highbd_lpf_horizontal_4_dual_c(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
-                                 const uint8_t *limit, const uint8_t *thresh,
-                                 int bd) {
-  int i;
-  int count = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < count; ++i) {
-    const uint16_t p1 = s[-2], p0 = s[-1];
-    const uint16_t q0 = s[0], q1 = s[1];
-    const int8_t mask =
-        highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
-    highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
-    s += pitch;
-  }
-}
-
-void aom_highbd_lpf_vertical_4_dual_c(
-    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
-                              bd);
-}
-
-static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat,
-                                  uint16_t *op2, uint16_t *op1, uint16_t *op0,
-                                  uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
-                                  int bd) {
-  if (flat && mask) {
-    const uint16_t p2 = *op2, p1 = *op1, p0 = *op0;
-    const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2;
-
-    // 5-tap filter [1, 2, 2, 2, 1]
-    *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3);
-    *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3);
-    *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3);
-    *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3);
-  } else {
-    highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
-  }
-}
-
-static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
-                                  uint16_t *op3, uint16_t *op2, uint16_t *op1,
-                                  uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
-                                  uint16_t *oq2, uint16_t *oq3, int bd) {
-  if (flat && mask) {
-    const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
-    const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
-
-    // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
-    *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
-    *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
-    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
-    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
-    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
-    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
-  } else {
-    highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
-  }
-}
-
-void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
-                                   const uint8_t *limit, const uint8_t *thresh,
-                                   int bd) {
-  int i;
-  int count = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < count; ++i) {
-    const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
-
-    const int8_t mask =
-        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    const int8_t flat =
-        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p,
-                   s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
-    ++s;
-  }
-}
-
-void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit,
-                                   const uint8_t *limit, const uint8_t *thresh,
-                                   int bd) {
-  int i;
-  int count = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < count; ++i) {
-    const uint16_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p];
-
-    const int8_t mask =
-        highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd);
-    const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
-    highbd_filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s,
-                   s + 1 * p, s + 2 * p, bd);
-    ++s;
-  }
-}
-
-void aom_highbd_lpf_horizontal_6_dual_c(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_horizontal_8_dual_c(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit,
-                                 const uint8_t *limit, const uint8_t *thresh,
-                                 int bd) {
-  int i;
-  int count = 4;
-
-  for (i = 0; i < count; ++i) {
-    const uint16_t p2 = s[-3], p1 = s[-2], p0 = s[-1];
-    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2];
-    const int8_t mask =
-        highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd);
-    const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
-    highbd_filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2,
-                   bd);
-    s += pitch;
-  }
-}
-
-void aom_highbd_lpf_vertical_6_dual_c(
-    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
-                              bd);
-}
-
-void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
-                                 const uint8_t *limit, const uint8_t *thresh,
-                                 int bd) {
-  int i;
-  int count = 4;
-
-  for (i = 0; i < count; ++i) {
-    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
-    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
-    const int8_t mask =
-        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    const int8_t flat =
-        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1,
-                   s + 2, s + 3, bd);
-    s += pitch;
-  }
-}
-
-void aom_highbd_lpf_vertical_8_dual_c(
-    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
-                              bd);
-}
-
-static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat,
-                                   int8_t flat2, uint16_t *op6, uint16_t *op5,
-                                   uint16_t *op4, uint16_t *op3, uint16_t *op2,
-                                   uint16_t *op1, uint16_t *op0, uint16_t *oq0,
-                                   uint16_t *oq1, uint16_t *oq2, uint16_t *oq3,
-                                   uint16_t *oq4, uint16_t *oq5, uint16_t *oq6,
-                                   int bd) {
-  if (flat2 && flat && mask) {
-    const uint16_t p6 = *op6;
-    const uint16_t p5 = *op5;
-    const uint16_t p4 = *op4;
-    const uint16_t p3 = *op3;
-    const uint16_t p2 = *op2;
-    const uint16_t p1 = *op1;
-    const uint16_t p0 = *op0;
-    const uint16_t q0 = *oq0;
-    const uint16_t q1 = *oq1;
-    const uint16_t q2 = *oq2;
-    const uint16_t q3 = *oq3;
-    const uint16_t q4 = *oq4;
-    const uint16_t q5 = *oq5;
-    const uint16_t q6 = *oq6;
-
-    // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]
-    *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0,
-                              4);
-    *op4 = ROUND_POWER_OF_TWO(
-        p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4);
-    *op3 = ROUND_POWER_OF_TWO(
-        p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4);
-    *op2 = ROUND_POWER_OF_TWO(
-        p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3,
-        4);
-    *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 +
-                                  q0 + q1 + q2 + q3 + q4,
-                              4);
-    *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
-                                  q0 * 2 + q1 + q2 + q3 + q4 + q5,
-                              4);
-    *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
-                                  q1 * 2 + q2 + q3 + q4 + q5 + q6,
-                              4);
-    *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
-                                  q2 * 2 + q3 + q4 + q5 + q6 * 2,
-                              4);
-    *oq2 = ROUND_POWER_OF_TWO(
-        p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3,
-        4);
-    *oq3 = ROUND_POWER_OF_TWO(
-        p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4);
-    *oq4 = ROUND_POWER_OF_TWO(
-        p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4);
-    *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7,
-                              4);
-  } else {
-    highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
-                   bd);
-  }
-}
-
-static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
-                                            const uint8_t *blimit,
-                                            const uint8_t *limit,
-                                            const uint8_t *thresh, int count,
-                                            int bd) {
-  int i;
-  int step = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < step * count; ++i) {
-    const uint16_t p3 = s[-4 * p];
-    const uint16_t p2 = s[-3 * p];
-    const uint16_t p1 = s[-2 * p];
-    const uint16_t p0 = s[-p];
-    const uint16_t q0 = s[0 * p];
-    const uint16_t q1 = s[1 * p];
-    const uint16_t q2 = s[2 * p];
-    const uint16_t q3 = s[3 * p];
-    const int8_t mask =
-        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    const int8_t flat =
-        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-
-    const int8_t flat2 =
-        highbd_flat_mask4(1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p],
-                          s[5 * p], s[6 * p], bd);
-
-    highbd_filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
-                    s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
-                    s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, bd);
-    ++s;
-  }
-}
-
-void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch,
-                                    const uint8_t *blimit, const uint8_t *limit,
-                                    const uint8_t *thresh, int bd) {
-  highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd);
-}
-
-void aom_highbd_lpf_horizontal_14_dual_c(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  highbd_mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1, bd);
-  highbd_mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1, bd);
-}
-
-static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
-                                          const uint8_t *blimit,
-                                          const uint8_t *limit,
-                                          const uint8_t *thresh, int count,
-                                          int bd) {
-  int i;
-
-  for (i = 0; i < count; ++i) {
-    const uint16_t p3 = s[-4];
-    const uint16_t p2 = s[-3];
-    const uint16_t p1 = s[-2];
-    const uint16_t p0 = s[-1];
-    const uint16_t q0 = s[0];
-    const uint16_t q1 = s[1];
-    const uint16_t q2 = s[2];
-    const uint16_t q3 = s[3];
-    const int8_t mask =
-        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    const int8_t flat =
-        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    const int8_t flat2 =
-        highbd_flat_mask4(1, s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], bd);
-
-    highbd_filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4,
-                    s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5,
-                    s + 6, bd);
-    s += p;
-  }
-}
-
-void aom_highbd_lpf_vertical_14_c(uint16_t *s, int p, const uint8_t *blimit,
-                                  const uint8_t *limit, const uint8_t *thresh,
-                                  int bd) {
-  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd);
-}
-
-void aom_highbd_lpf_vertical_14_dual_c(
-    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4, bd);
-  highbd_mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
-                                4, bd);
-}
-
-#endif  // !CONFIG_NEW_DF
diff --git a/aom_dsp/x86/highbd_loopfilter_avx2.c b/aom_dsp/x86/highbd_loopfilter_avx2.c
deleted file mode 100644
index ffc60f1..0000000
--- a/aom_dsp/x86/highbd_loopfilter_avx2.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/common_avx2.h"
-#include "aom_dsp/x86/lpf_common_sse2.h"
-#include "aom/aom_integer.h"
-#if !CONFIG_NEW_DF
-void aom_highbd_lpf_horizontal_14_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0,
-                                         blimit1, limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_14_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
-                                       limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_horizontal_4_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
-                                        limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_horizontal_8_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
-                                        limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_4_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
-                                      limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_8_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
-                                      limit1, thresh1, bd);
-}
-#endif  // !CONFIG_NEW_DF
diff --git a/aom_dsp/x86/highbd_loopfilter_sse2.c b/aom_dsp/x86/highbd_loopfilter_sse2.c
deleted file mode 100644
index 12a5066..0000000
--- a/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ /dev/null
@@ -1,1700 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/lpf_common_sse2.h"
-#if !CONFIG_NEW_DF
-static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max,
-                                         __m128i *pixel) {
-  *pixel = _mm_min_epi16(*pixel, *max);
-  *pixel = _mm_max_epi16(*pixel, *min);
-}
-
-static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) {
-  return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
-}
-
-static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
-                             const uint8_t *t, int bd, __m128i *blt,
-                             __m128i *lt, __m128i *thr, __m128i *t80_out) {
-  const int shift = bd - 8;
-  const __m128i zero = _mm_setzero_si128();
-
-  __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero);
-  *blt = _mm_slli_epi16(x, shift);
-
-  x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero);
-  *lt = _mm_slli_epi16(x, shift);
-
-  x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
-  *thr = _mm_slli_epi16(x, shift);
-
-  *t80_out = _mm_set1_epi16(1 << (bd - 1));
-}
-
-static INLINE void get_limit_dual(
-    const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0,
-    const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1,
-    int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out,
-    __m128i *t80_out) {
-  const int shift = bd - 8;
-  const __m128i zero = _mm_setzero_si128();
-
-  __m128i x0 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero);
-  __m128i x1 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero);
-  x0 = _mm_unpacklo_epi64(x0, x1);
-  *blt_out = _mm_slli_epi16(x0, shift);
-
-  x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero);
-  x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero);
-  x0 = _mm_unpacklo_epi64(x0, x1);
-  *lt_out = _mm_slli_epi16(x0, shift);
-
-  x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero);
-  x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh1), zero);
-  x0 = _mm_unpacklo_epi64(x0, x1);
-  *thr_out = _mm_slli_epi16(x0, shift);
-
-  *t80_out = _mm_set1_epi16(1 << (bd - 1));
-}
-
-static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
-                                     __m128i *p, __m128i *q) {
-  int i;
-  for (i = 0; i < size; i++) {
-    p[i] = _mm_loadu_si128((__m128i *)(s - (i + 1) * pitch));
-    q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch));
-  }
-}
-
-static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q,
-                                           const __m128i *l, const __m128i *bl,
-                                           __m128i *mask) {
-  __m128i abs_p0q0 = abs_diff16(p[0], q[0]);
-  __m128i abs_p1q1 = abs_diff16(p[1], q[1]);
-  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i ffff = _mm_set1_epi16((short)0xFFFF);
-
-  __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
-  max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
-  max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
-
-  int i;
-  for (i = 1; i < 4; ++i) {
-    max = _mm_max_epi16(max, abs_diff16(p[i], p[i - 1]));
-    max = _mm_max_epi16(max, abs_diff16(q[i], q[i - 1]));
-  }
-  max = _mm_subs_epu16(max, *l);
-  *mask = _mm_cmpeq_epi16(max, zero);  // return ~mask
-}
-
-static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x,
-                                                 __m128i *p1p0, __m128i *q1q0,
-                                                 __m128i *abs_p1p0, __m128i *l,
-                                                 __m128i *bl, __m128i *t,
-                                                 __m128i *hev, __m128i *mask) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i ffff = _mm_set1_epi16((short)0xFFFF);
-  __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0;
-  __m128i max, max01, h;
-
-  *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]);
-  *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]);
-
-  abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0);
-  abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1);
-  abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
-
-  abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8);
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);  // divide by 2
-
-  max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
-  max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
-  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
-  // So taking maximums continues to work:
-  max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
-
-  *abs_p1p0 = abs_diff16(pq[0], pq[1]);
-  abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8);
-  max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0);
-  // mask |= (abs(*p1 - *p0) > limit) * -1;
-  // mask |= (abs(*q1 - *q0) > limit) * -1;
-  h = _mm_subs_epu16(max01, *t);
-
-  *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
-  // replicate for the further "merged variables" usage
-  *hev = _mm_unpacklo_epi64(*hev, *hev);
-
-  max = _mm_max_epi16(max, max01);
-  int i;
-  for (i = 2; i < x; ++i) {
-    max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1]));
-  }
-  max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
-
-  max = _mm_subs_epu16(max, *l);
-  *mask = _mm_cmpeq_epi16(max, zero);  //  ~mask
-}
-
-static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq,
-                                      int start, int end, __m128i *flat) {
-  int i;
-  __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]),
-                              abs_diff16(pq[start + 1], pq[0]));
-
-  for (i = start + 2; i < end; ++i) {
-    max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0]));
-  }
-  max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
-
-  __m128i ft;
-  ft = _mm_subs_epu16(max, *th);
-
-  const __m128i zero = _mm_setzero_si128();
-  *flat = _mm_cmpeq_epi16(ft, zero);
-}
-
-static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p,
-                                           const __m128i *q, int start, int end,
-                                           __m128i *flat) {
-  int i;
-  __m128i max =
-      _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0]));
-
-  for (i = start + 1; i < end; ++i) {
-    max = _mm_max_epi16(max, abs_diff16(p[i], p[0]));
-    max = _mm_max_epi16(max, abs_diff16(q[i], q[0]));
-  }
-
-  __m128i ft;
-  ft = _mm_subs_epu16(max, *th);
-
-  const __m128i zero = _mm_setzero_si128();
-  *flat = _mm_cmpeq_epi16(ft, zero);
-}
-
-static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat,
-                                          __m128i *flat2, int bd) {
-  // check the distance 1,2,3 against 0
-  __m128i th = _mm_set1_epi16(1);
-  th = _mm_slli_epi16(th, bd - 8);
-  flat_mask_internal(&th, pq, 1, 4, flat);
-  flat_mask_internal(&th, pq, 4, 7, flat2);
-}
-
-static INLINE void highbd_flat_mask4_dual_sse2(const __m128i *p,
-                                               const __m128i *q, __m128i *flat,
-                                               __m128i *flat2, int bd) {
-  // check the distance 1,2,3 against 0
-  __m128i th = _mm_set1_epi16(1);
-  th = _mm_slli_epi16(th, bd - 8);
-  flat_mask_internal_dual(&th, p, q, 1, 4, flat);
-  flat_mask_internal_dual(&th, p, q, 4, 7, flat2);
-}
-
-static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0,
-                                                 __m128i *hev, __m128i *mask,
-                                                 __m128i *qs1qs0,
-                                                 __m128i *ps1ps0, __m128i *t80,
-                                                 int bd) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i pmax =
-      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
-  const __m128i pmin = _mm_subs_epi16(zero, *t80);
-
-  const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4);
-  __m128i ps1ps0_work, qs1qs0_work, work;
-  __m128i filt, filter2filter1, filter2filt, filter1filt;
-
-  ps1ps0_work = _mm_subs_epi16(*p1p0, *t80);
-  qs1qs0_work = _mm_subs_epi16(*q1q0, *t80);
-
-  work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work);
-  pixel_clamp(&pmin, &pmax, &work);
-  filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
-
-  filt = _mm_subs_epi16(filt, work);
-  filt = _mm_subs_epi16(filt, work);
-  filt = _mm_subs_epi16(filt, work);
-  // (aom_filter + 3 * (qs0 - ps0)) & mask
-  pixel_clamp(&pmin, &pmax, &filt);
-  filt = _mm_and_si128(filt, *mask);
-  filt = _mm_unpacklo_epi64(filt, filt);
-
-  filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */
-  pixel_clamp(&pmin, &pmax, &filter2filter1);
-  filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */
-
-  filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1);
-
-  // filt >> 1
-  filt = _mm_adds_epi16(filt, one);
-  filt = _mm_srai_epi16(filt, 1);
-  filt = _mm_andnot_si128(*hev, filt);
-
-  filter2filt = _mm_unpackhi_epi64(filter2filter1, filt);
-  filter1filt = _mm_unpacklo_epi64(filter2filter1, filt);
-
-  qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt);
-  ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt);
-
-  pixel_clamp(&pmin, &pmax, &qs1qs0_work);
-  pixel_clamp(&pmin, &pmax, &ps1ps0_work);
-
-  *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80);
-  *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80);
-}
-
-static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps,
-                                            __m128i *qs, const __m128i *mask,
-                                            const __m128i *th, int bd,
-                                            __m128i *t80) {
-  __m128i ps0 = _mm_subs_epi16(p[0], *t80);
-  __m128i ps1 = _mm_subs_epi16(p[1], *t80);
-  __m128i qs0 = _mm_subs_epi16(q[0], *t80);
-  __m128i qs1 = _mm_subs_epi16(q[1], *t80);
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i pmax =
-      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
-
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i pmin = _mm_subs_epi16(zero, *t80);
-  __m128i filter = _mm_subs_epi16(ps1, qs1);
-  pixel_clamp(&pmin, &pmax, &filter);
-
-  // hev_filter
-  __m128i hev;
-  const __m128i abs_p1p0 = abs_diff16(p[1], p[0]);
-  const __m128i abs_q1q0 = abs_diff16(q[1], q[0]);
-  __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  h = _mm_subs_epu16(h, *th);
-  const __m128i ffff = _mm_cmpeq_epi16(h, h);
-  hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
-
-  filter = _mm_and_si128(filter, hev);
-
-  const __m128i x = _mm_subs_epi16(qs0, ps0);
-  filter = _mm_adds_epi16(filter, x);
-  filter = _mm_adds_epi16(filter, x);
-  filter = _mm_adds_epi16(filter, x);
-  pixel_clamp(&pmin, &pmax, &filter);
-  filter = _mm_and_si128(filter, *mask);
-  const __m128i t3 = _mm_set1_epi16(3);
-  const __m128i t4 = _mm_set1_epi16(4);
-  __m128i filter1 = _mm_adds_epi16(filter, t4);
-  __m128i filter2 = _mm_adds_epi16(filter, t3);
-  pixel_clamp(&pmin, &pmax, &filter1);
-  pixel_clamp(&pmin, &pmax, &filter2);
-  filter1 = _mm_srai_epi16(filter1, 3);
-  filter2 = _mm_srai_epi16(filter2, 3);
-  qs0 = _mm_subs_epi16(qs0, filter1);
-  pixel_clamp(&pmin, &pmax, &qs0);
-  ps0 = _mm_adds_epi16(ps0, filter2);
-  pixel_clamp(&pmin, &pmax, &ps0);
-  qs[0] = _mm_adds_epi16(qs0, *t80);
-  ps[0] = _mm_adds_epi16(ps0, *t80);
-  filter = _mm_adds_epi16(filter1, one);
-  filter = _mm_srai_epi16(filter, 1);
-  filter = _mm_andnot_si128(hev, filter);
-  qs1 = _mm_subs_epi16(qs1, filter);
-  pixel_clamp(&pmin, &pmax, &qs1);
-  ps1 = _mm_adds_epi16(ps1, filter);
-  pixel_clamp(&pmin, &pmax, &ps1);
-  qs[1] = _mm_adds_epi16(qs1, *t80);
-  ps[1] = _mm_adds_epi16(ps1, *t80);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
-    __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt,
-    const unsigned char *lt, const unsigned char *thr, int bd) {
-  int i;
-  const __m128i zero = _mm_setzero_si128();
-  __m128i blimit, limit, thresh;
-  __m128i t80;
-  get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
-
-  for (i = 0; i < 7; i++) {
-    pq[i] = _mm_unpacklo_epi64(p[i], q[i]);
-  }
-  __m128i mask, hevhev;
-  __m128i p1p0, q1q0, abs_p1p0;
-
-  highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
-                                &thresh, &hevhev, &mask);
-
-  __m128i ps0ps1, qs0qs1;
-  // filter4
-  highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd);
-
-  __m128i flat, flat2;
-  highbd_flat_mask4_sse2(pq, &flat, &flat2, bd);
-
-  flat = _mm_and_si128(flat, mask);
-  flat2 = _mm_and_si128(flat2, flat);
-
-  // replicate for the further "merged variables" usage
-  flat = _mm_unpacklo_epi64(flat, flat);
-  flat2 = _mm_unpacklo_epi64(flat2, flat2);
-
-  // flat and wide flat calculations
-
-  // if flat ==0 then flat2 is zero as well and we don't need any calc below
-  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
-    __m128i flat_p[3], flat_q[3], flat_pq[3];
-    __m128i flat2_p[6], flat2_q[6];
-    __m128i flat2_pq[6];
-    __m128i sum_p6, sum_p3;
-    const __m128i eight = _mm_set1_epi16(8);
-    const __m128i four = _mm_set1_epi16(4);
-
-    __m128i work0, work0_0, work0_1, sum_p_0;
-    __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3]));
-    __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1]));
-    sum_p = _mm_add_epi16(sum_p, sum_lp);
-
-    __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
-    __m128i sum_q = _mm_srli_si128(sum_p, 8);
-
-    sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
-    sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
-
-    flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0]));
-    flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0]));
-
-    sum_p6 = _mm_add_epi16(pq[6], pq[6]);
-    sum_p3 = _mm_add_epi16(pq[3], pq[3]);
-
-    sum_q = _mm_sub_epi16(sum_p_0, pq[5]);
-    sum_p = _mm_sub_epi16(sum_p_0, q[5]);
-
-    work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
-    work0_1 = _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
-
-    sum_lq = _mm_sub_epi16(sum_lp, pq[2]);
-    sum_lp = _mm_sub_epi16(sum_lp, q[2]);
-
-    work0 = _mm_add_epi16(sum_p3, pq[1]);
-    flat_p[1] = _mm_add_epi16(sum_lp, work0);
-    flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
-
-    flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
-    flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
-
-    sum_lp = _mm_sub_epi16(sum_lp, q[1]);
-    sum_lq = _mm_sub_epi16(sum_lq, pq[1]);
-
-    sum_p3 = _mm_add_epi16(sum_p3, pq[3]);
-    work0 = _mm_add_epi16(sum_p3, pq[2]);
-
-    flat_p[2] = _mm_add_epi16(sum_lp, work0);
-    flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
-    flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
-
-    int flat2_mask =
-        (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
-    if (flat2_mask) {
-      flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0]));
-      flat2_q[0] = _mm_add_epi16(
-          sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0]));
-
-      flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
-      flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
-
-      flat2_pq[0] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
-      flat2_pq[1] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
-
-      sum_p = _mm_sub_epi16(sum_p, q[4]);
-      sum_q = _mm_sub_epi16(sum_q, pq[4]);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
-      work0 = _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
-      flat2_p[2] = _mm_add_epi16(sum_p, work0);
-      flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[2] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[3]);
-      sum_q = _mm_sub_epi16(sum_q, pq[3]);
-
-      work0 = _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
-      flat2_p[3] = _mm_add_epi16(sum_p, work0);
-      flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[3] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[2]);
-      sum_q = _mm_sub_epi16(sum_q, pq[2]);
-
-      work0 = _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
-      flat2_p[4] = _mm_add_epi16(sum_p, work0);
-      flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[4] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[1]);
-      sum_q = _mm_sub_epi16(sum_q, pq[1]);
-
-      work0 = _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
-      flat2_p[5] = _mm_add_epi16(sum_p, work0);
-      flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[5] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
-    }  // flat2
-       // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // highbd_filter8
-    pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
-    pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
-
-    for (i = 0; i < 3; i++) {
-      pq[i] = _mm_andnot_si128(flat, pq[i]);
-      flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
-      pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
-    }
-
-    // wide flat
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    if (flat2_mask) {
-      for (i = 0; i < 6; i++) {
-        pq[i] = _mm_andnot_si128(flat2, pq[i]);
-        flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
-        pq[i] = _mm_or_si128(pq[i], flat2_pq[i]);  // full list of pq values
-      }
-    }
-  } else {
-    pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
-    pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
-  }
-}
-
-void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch,
-                                       const uint8_t *blimit,
-                                       const uint8_t *limit,
-                                       const uint8_t *thresh, int bd) {
-  __m128i p[7], q[7], pq[7];
-  int i;
-
-  for (i = 0; i < 7; i++) {
-    p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch));
-    q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch));
-  }
-
-  highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
-
-  for (i = 0; i < 6; i++) {
-    _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]);
-    _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8));
-  }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
-    __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0,
-    const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1,
-    const uint8_t *thr1, int bd) {
-  __m128i blimit, limit, thresh, t80;
-  const __m128i zero = _mm_setzero_si128();
-
-  get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh,
-                 &t80);
-  __m128i mask;
-  highbd_filter_mask_dual(p, q, &limit, &blimit, &mask);
-  __m128i flat, flat2;
-  highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd);
-
-  flat = _mm_and_si128(flat, mask);
-  flat2 = _mm_and_si128(flat2, flat);
-  __m128i ps[2], qs[2];
-  highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80);
-  // flat and wide flat calculations
-
-  // if flat ==0 then flat2 is zero as well and we don't need any calc below
-  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
-    __m128i flat_p[3], flat_q[3];
-    __m128i flat2_p[6], flat2_q[6];
-    const __m128i eight = _mm_set1_epi16(8);
-    const __m128i four = _mm_set1_epi16(4);
-    __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
-    __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3]));
-    __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
-    sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp);
-    __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
-    sum_q = _mm_add_epi16(sum_q, sum_lq);
-    sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q));
-    sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
-    flat_p[0] =
-        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3);
-    flat_q[0] =
-        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3);
-    __m128i sum_p6 = _mm_add_epi16(p[6], p[6]);
-    __m128i sum_q6 = _mm_add_epi16(q[6], q[6]);
-    __m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
-    __m128i sum_q3 = _mm_add_epi16(q[3], q[3]);
-
-    sum_q = _mm_sub_epi16(sum_p_0, p[5]);
-    __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]);
-
-    sum_lq = _mm_sub_epi16(sum_lp, p[2]);
-    sum_lp = _mm_sub_epi16(sum_lp, q[2]);
-    flat_p[1] =
-        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3);
-    flat_q[1] =
-        _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3);
-
-    sum_lp = _mm_sub_epi16(sum_lp, q[1]);
-    sum_lq = _mm_sub_epi16(sum_lq, p[1]);
-    sum_p3 = _mm_add_epi16(sum_p3, p[3]);
-    sum_q3 = _mm_add_epi16(sum_q3, q[3]);
-    flat_p[2] =
-        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3);
-    flat_q[2] =
-        _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3);
-
-    int flat2_mask =
-        (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
-    if (flat2_mask) {
-      flat2_p[0] = _mm_srli_epi16(
-          _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
-                                               _mm_add_epi16(p[1], q[0]))),
-          4);
-      flat2_q[0] = _mm_srli_epi16(
-          _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
-                                               _mm_add_epi16(p[0], q[1]))),
-          4);
-
-      flat2_p[1] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
-          4);
-      flat2_q[1] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
-          4);
-      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
-      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[4]);
-      sum_q = _mm_sub_epi16(sum_q, p[4]);
-      flat2_p[2] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
-          4);
-      flat2_q[2] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
-          4);
-      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
-      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[3]);
-      sum_q = _mm_sub_epi16(sum_q, p[3]);
-      flat2_p[3] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
-          4);
-      flat2_q[3] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
-          4);
-      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
-      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[2]);
-      sum_q = _mm_sub_epi16(sum_q, p[2]);
-      flat2_p[4] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
-          4);
-      flat2_q[4] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
-          4);
-      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
-      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[1]);
-      sum_q = _mm_sub_epi16(sum_q, p[1]);
-      flat2_p[5] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
-          4);
-      flat2_q[5] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
-          4);
-    }
-    // highbd_filter8
-    int i;
-    for (i = 0; i < 2; i++) {
-      ps[i] = _mm_andnot_si128(flat, ps[i]);
-      flat_p[i] = _mm_and_si128(flat, flat_p[i]);
-      p[i] = _mm_or_si128(ps[i], flat_p[i]);
-      qs[i] = _mm_andnot_si128(flat, qs[i]);
-      flat_q[i] = _mm_and_si128(flat, flat_q[i]);
-      q[i] = _mm_or_si128(qs[i], flat_q[i]);
-    }
-    p[2] = _mm_andnot_si128(flat, p[2]);
-    //  p2 remains unchanged if !(flat && mask)
-    flat_p[2] = _mm_and_si128(flat, flat_p[2]);
-    //  when (flat && mask)
-    p[2] = _mm_or_si128(p[2], flat_p[2]);  // full list of p2 values
-    q[2] = _mm_andnot_si128(flat, q[2]);
-    flat_q[2] = _mm_and_si128(flat, flat_q[2]);
-    q[2] = _mm_or_si128(q[2], flat_q[2]);  // full list of q2 values
-
-    for (i = 0; i < 2; i++) {
-      ps[i] = _mm_andnot_si128(flat, ps[i]);
-      flat_p[i] = _mm_and_si128(flat, flat_p[i]);
-      p[i] = _mm_or_si128(ps[i], flat_p[i]);
-      qs[i] = _mm_andnot_si128(flat, qs[i]);
-      flat_q[i] = _mm_and_si128(flat, flat_q[i]);
-      q[i] = _mm_or_si128(qs[i], flat_q[i]);
-    }
-    // highbd_filter16
-    if (flat2_mask) {
-      for (i = 0; i < 6; i++) {
-        //  p[i] remains unchanged if !(flat2 && flat && mask)
-        p[i] = _mm_andnot_si128(flat2, p[i]);
-        flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
-        //  get values for when (flat2 && flat && mask)
-        p[i] = _mm_or_si128(p[i], flat2_p[i]);  // full list of p values
-        q[i] = _mm_andnot_si128(flat2, q[i]);
-        flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
-        q[i] = _mm_or_si128(q[i], flat2_q[i]);
-      }
-    }
-  } else {
-    p[0] = ps[0];
-    q[0] = qs[0];
-    p[1] = ps[1];
-    q[1] = qs[1];
-  }
-}
-
-void aom_highbd_lpf_horizontal_14_dual_sse2(
-    uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m128i p[7], q[7];
-  int i;
-  load_highbd_pixel(s, 7, pitch, p, q);
-
-  highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1,
-                                   _limit1, _thresh1, bd);
-
-  for (i = 0; i < 6; i++) {
-    _mm_storeu_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
-    _mm_storeu_si128((__m128i *)(s + i * pitch), q[i]);
-  }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
-    __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
-    __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit,
-    const uint8_t *_limit, const uint8_t *_thresh, int bd) {
-  __m128i blimit, limit, thresh;
-  __m128i mask, hev, flat;
-  __m128i pq[3];
-  __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0;
-  __m128i flat_p1p0, flat_q0q1;
-
-  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
-  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
-  pq[2] = _mm_unpacklo_epi64(*p2, *q2);
-
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i four = _mm_set1_epi16(4);
-  __m128i t80;
-  const __m128i one = _mm_set1_epi16(0x1);
-
-  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
-
-  highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
-                                &thresh, &hev, &mask);
-
-  // lp filter
-  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
-
-  // flat_mask
-  flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
-  flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
-
-  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-
-  flat = _mm_cmpeq_epi16(flat, zero);
-  flat = _mm_and_si128(flat, mask);
-  // replicate for the further "merged variables" usage
-  flat = _mm_unpacklo_epi64(flat, flat);
-
-  // 5 tap filter
-  // need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
-    __m128i workp_a, workp_b, workp_c;
-    __m128i pq0x2_pq1, pq1_pq2;
-
-    // op1
-    pq0x2_pq1 =
-        _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]);  // p0 *2 + p1
-    pq1_pq2 = _mm_add_epi16(pq[1], pq[2]);                  // p1 + p2
-    workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
-                            pq1_pq2);  // p2 + p0 * 2 + p1 * 2 + 4
-
-    workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0);
-    workp_b =
-        _mm_add_epi16(workp_a, workp_b);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
-
-    // op0
-    workp_c = _mm_srli_si128(pq0x2_pq1, 8);  // q0 * 2 + q1
-    workp_a = _mm_add_epi16(workp_a,
-                            workp_c);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
-    workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
-    flat_p1p0 = _mm_srli_epi16(workp_b, 3);
-
-    // oq0
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]),
-                            pq[1]);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
-    workp_b = _mm_srli_si128(pq1_pq2, 8);
-    workp_a = _mm_add_epi16(
-        workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
-    // workp_shft0 = _mm_srli_epi16(workp_a, 3);
-
-    // oq1
-    workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]),
-                            pq[0]);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
-    workp_b = _mm_add_epi16(*q2, *q2);
-    workp_b =
-        _mm_add_epi16(workp_c, workp_b);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
-
-    workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
-    flat_q0q1 = _mm_srli_epi16(workp_a, 3);
-
-    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
-    q1q0 = _mm_and_si128(flat, flat_q0q1);
-    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
-    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
-    p1p0 = _mm_and_si128(flat, flat_p1p0);
-    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
-  }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
-    __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
-    __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0,
-    const unsigned char *_thresh0, const unsigned char *_blimit1,
-    const unsigned char *_limit1, const unsigned char *_thresh1, int bd) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i blimit0, limit0, thresh0;
-  __m128i t80;
-  __m128i mask, flat, work;
-  __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1;
-  __m128i op1, op0, oq0, oq1;
-  const __m128i four = _mm_set1_epi16(4);
-  const __m128i one = _mm_set1_epi16(0x1);
-  const __m128i ffff = _mm_cmpeq_epi16(one, one);
-
-  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
-                 &blimit0, &limit0, &thresh0, &t80);
-
-  abs_p2p1 = abs_diff16(*p2, *p1);
-  abs_p1p0 = abs_diff16(*p1, *p0);
-  abs_q1q0 = abs_diff16(*q1, *q0);
-  abs_q2q1 = abs_diff16(*q2, *q1);
-
-  abs_p0q0 = abs_diff16(*p0, *q0);
-  abs_p1q1 = abs_diff16(*p1, *q1);
-
-  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
-  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
-  // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
-
-  mask = _mm_max_epi16(abs_q2q1, mask);
-  work = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  mask = _mm_max_epi16(work, mask);
-  mask = _mm_max_epi16(mask, abs_p2p1);
-  mask = _mm_subs_epu16(mask, limit0);
-  mask = _mm_cmpeq_epi16(mask, zero);
-
-  // lp filter
-  __m128i ps[2], qs[2], p[2], q[2];
-  {
-    p[0] = *p0;
-    p[1] = *p1;
-    q[0] = *q0;
-    q[1] = *q1;
-    // filter_mask and hev_mask
-    highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
-  }
-
-  // flat_mask
-  flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0));
-  flat = _mm_max_epi16(flat, work);
-
-  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-
-  flat = _mm_cmpeq_epi16(flat, zero);
-  flat = _mm_and_si128(flat, mask);  // flat & mask
-
-  // 5 tap filter
-  // need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
-    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
-
-    // op1
-    workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
-                            _mm_add_epi16(*p1, *p1));  // *p0 *2 + *p1 * 2
-    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
-                            *p2);  // *p2 + *p0 * 2 + *p1 * 2 + 4
-
-    workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
-    workp_shft0 = _mm_add_epi16(
-        workp_a, workp_b);  // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
-    op1 = _mm_srli_epi16(workp_shft0, 3);
-
-    // op0
-    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1);  // *q0 * 2 + *q1
-    workp_a =
-        _mm_add_epi16(workp_a,
-                      workp_b);  // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
-    op0 = _mm_srli_epi16(workp_a, 3);
-
-    // oq0
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
-                            *p1);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 + 4
-    workp_b = _mm_add_epi16(*q1, *q2);
-    workp_shft0 = _mm_add_epi16(
-        workp_a, workp_b);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 * 2 + *q2 + 4
-    oq0 = _mm_srli_epi16(workp_shft0, 3);
-
-    // oq1
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
-                            *p0);  // *p0   + *q0 * 2 + *q1 * 2 + *q2 + 4
-    workp_b = _mm_add_epi16(*q2, *q2);
-    workp_shft1 = _mm_add_epi16(
-        workp_a, workp_b);  // *p0  + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
-    oq1 = _mm_srli_epi16(workp_shft1, 3);
-
-    qs[0] = _mm_andnot_si128(flat, qs[0]);
-    oq0 = _mm_and_si128(flat, oq0);
-    *q0 = _mm_or_si128(qs[0], oq0);
-
-    qs[1] = _mm_andnot_si128(flat, qs[1]);
-    oq1 = _mm_and_si128(flat, oq1);
-    *q1 = _mm_or_si128(qs[1], oq1);
-
-    ps[0] = _mm_andnot_si128(flat, ps[0]);
-    op0 = _mm_and_si128(flat, op0);
-    *p0 = _mm_or_si128(ps[0], op0);
-
-    ps[1] = _mm_andnot_si128(flat, ps[1]);
-    op1 = _mm_and_si128(flat, op1);
-    *p1 = _mm_or_si128(ps[1], op1);
-  } else {
-    *q0 = qs[0];
-    *q1 = qs[1];
-    *p0 = ps[0];
-    *p1 = ps[1];
-  }
-}
-
-void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
-  __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out;
-
-  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
-  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-
-  highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out,
-                             _blimit, _limit, _thresh, bd);
-
-  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8));
-  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out);
-  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8));
-}
-
-void aom_highbd_lpf_horizontal_6_dual_sse2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m128i p2, p1, p0, q0, q1, q2;
-
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-
-  highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
-                                  _limit0, _thresh0, _blimit1, _limit1,
-                                  _thresh1, bd);
-
-  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
-    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
-    __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
-    const unsigned char *_blimit, const unsigned char *_limit,
-    const unsigned char *_thresh, int bd) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i blimit, limit, thresh;
-  __m128i mask, hev, flat;
-  __m128i pq[4];
-  __m128i p1p0, q1q0, ps1ps0, qs1qs0;
-  __m128i work_a, opq2, flat_p1p0, flat_q0q1;
-
-  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
-  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
-  pq[2] = _mm_unpacklo_epi64(*p2, *q2);
-  pq[3] = _mm_unpacklo_epi64(*p3, *q3);
-
-  __m128i abs_p1p0;
-
-  const __m128i four = _mm_set1_epi16(4);
-  __m128i t80;
-  const __m128i one = _mm_set1_epi16(0x1);
-
-  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
-
-  highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
-                                &thresh, &hev, &mask);
-
-  // lp filter
-  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
-
-  // flat_mask4
-  flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
-  flat = _mm_max_epi16(abs_p1p0, flat);
-  flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
-
-  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-
-  flat = _mm_cmpeq_epi16(flat, zero);
-  flat = _mm_and_si128(flat, mask);
-  // replicate for the further "merged variables" usage
-  flat = _mm_unpacklo_epi64(flat, flat);
-
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
-    __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1;
-    // Added before shift for rounding part of ROUND_POWER_OF_TWO
-
-    // o*p2
-    workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
-    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
-    workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
-    workp_c = _mm_add_epi16(workp_a, workp_c);
-
-    // o*p1
-    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
-    workp_shft0 = _mm_add_epi16(workp_a, workp_b);
-
-    // o*p0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
-    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
-
-    flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3);
-
-    // oq0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
-    workp_shft0 = _mm_add_epi16(workp_a, workp_b);
-
-    // oq1
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
-    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
-
-    flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
-
-    // oq2
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
-    workp_a = _mm_add_epi16(workp_a, workp_b);
-    opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3);
-
-    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
-    q1q0 = _mm_and_si128(flat, flat_q0q1);
-    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
-    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
-    p1p0 = _mm_and_si128(flat, flat_p1p0);
-    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
-
-    work_a = _mm_andnot_si128(flat, pq[2]);
-    *p2 = _mm_and_si128(flat, opq2);
-    *p2 = _mm_or_si128(work_a, *p2);
-    *q2 = _mm_srli_si128(*p2, 8);
-  }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
-    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
-    __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0,
-    const unsigned char *_limit0, const unsigned char *_thresh0,
-    const unsigned char *_blimit1, const unsigned char *_limit1,
-    const unsigned char *_thresh1, int bd) {
-  __m128i blimit0, limit0, thresh0;
-  __m128i t80;
-  __m128i mask, flat;
-  __m128i work_a, op2, oq2, op1, op0, oq0, oq1;
-  __m128i abs_p1q1, abs_p0q0, work0, work1, work2;
-
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i four = _mm_set1_epi16(4);
-  const __m128i one = _mm_set1_epi16(0x1);
-  const __m128i ffff = _mm_cmpeq_epi16(one, one);
-
-  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
-                 &blimit0, &limit0, &thresh0, &t80);
-
-  abs_p0q0 = abs_diff16(*p0, *q0);
-  abs_p1q1 = abs_diff16(*p1, *q1);
-
-  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
-  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(*p0 - q0) * 2 + abs(*p1 - q1) / 2  > blimit) * -1;
-
-  // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
-
-  work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1));
-  work1 =
-      _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0));  // tbu 4 flat
-  work0 = _mm_max_epi16(work0, work1);
-  work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3));
-  work2 = _mm_max_epi16(work2, work0);
-  mask = _mm_max_epi16(work2, mask);
-
-  mask = _mm_subs_epu16(mask, limit0);
-  mask = _mm_cmpeq_epi16(mask, zero);
-
-  // lp filter
-  __m128i ps[2], qs[2], p[2], q[2];
-  {
-    p[0] = *p0;
-    p[1] = *p1;
-    q[0] = *q0;
-    q[1] = *q1;
-    // filter_mask and hev_mask
-    highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
-  }
-
-  flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0));
-  flat = _mm_max_epi16(work1, flat);
-  work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0));
-  flat = _mm_max_epi16(work0, flat);
-
-  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-  flat = _mm_cmpeq_epi16(flat, zero);
-  flat = _mm_and_si128(flat, mask);  // flat & mask
-
-  // filter8 need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
-    __m128i workp_a, workp_b;
-    // Added before shift for rounding part of ROUND_POWER_OF_TWO
-
-    // o*p2
-    workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
-    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
-    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
-    op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // o*p1
-    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
-    op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // o*p0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
-    op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // oq0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
-    oq0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // oq1
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
-    oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // oq2
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
-    oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    qs[0] = _mm_andnot_si128(flat, qs[0]);
-    oq0 = _mm_and_si128(flat, oq0);
-    *q0 = _mm_or_si128(qs[0], oq0);
-
-    qs[1] = _mm_andnot_si128(flat, qs[1]);
-    oq1 = _mm_and_si128(flat, oq1);
-    *q1 = _mm_or_si128(qs[1], oq1);
-
-    ps[0] = _mm_andnot_si128(flat, ps[0]);
-    op0 = _mm_and_si128(flat, op0);
-    *p0 = _mm_or_si128(ps[0], op0);
-
-    ps[1] = _mm_andnot_si128(flat, ps[1]);
-    op1 = _mm_and_si128(flat, op1);
-    *p1 = _mm_or_si128(ps[1], op1);
-
-    work_a = _mm_andnot_si128(flat, *q2);
-    *q2 = _mm_and_si128(flat, oq2);
-    *q2 = _mm_or_si128(work_a, *q2);
-
-    work_a = _mm_andnot_si128(flat, *p2);
-    *p2 = _mm_and_si128(flat, op2);
-    *p2 = _mm_or_si128(work_a, *p2);
-  } else {
-    *q0 = qs[0];
-    *q1 = qs[1];
-    *p0 = ps[0];
-    *p1 = ps[1];
-  }
-}
-
-void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
-  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
-  __m128i q1q0, p1p0;
-
-  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
-  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
-  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
-
-  highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0,
-                             &p1p0, _blimit, _limit, _thresh, bd);
-
-  _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
-  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
-  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
-  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
-  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
-}
-
-void aom_highbd_lpf_horizontal_8_dual_sse2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
-
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
-
-  highbd_lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0,
-                                  _blimit0, _limit0, _thresh0, _blimit1,
-                                  _limit1, _thresh1, bd);
-
-  _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
-  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-  _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2(
-    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out,
-    __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit,
-    const uint8_t *_thresh, int bd) {
-  __m128i blimit, limit, thresh;
-  __m128i mask, hev;
-  __m128i p1p0, q1q0;
-  __m128i pq[2];
-
-  __m128i abs_p1p0;
-
-  __m128i t80;
-  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
-
-  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
-  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
-
-  highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
-                                &thresh, &hev, &mask);
-
-  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2(
-    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps,
-    __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m128i blimit0, limit0, thresh0;
-  __m128i mask, flat;
-  __m128i p[2], q[2];
-
-  const __m128i zero = _mm_setzero_si128();
-  __m128i abs_p0q0 = abs_diff16(*q0, *p0);
-  __m128i abs_p1q1 = abs_diff16(*q1, *p1);
-
-  __m128i abs_p1p0 = abs_diff16(*p1, *p0);
-  __m128i abs_q1q0 = abs_diff16(*q1, *q0);
-
-  const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
-  const __m128i one = _mm_set1_epi16(1);
-
-  __m128i t80;
-
-  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
-                 &blimit0, &limit0, &thresh0, &t80);
-
-  // filter_mask and hev_mask
-  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-
-  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
-  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
-  // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
-  mask = _mm_max_epi16(flat, mask);
-
-  mask = _mm_subs_epu16(mask, limit0);
-  mask = _mm_cmpeq_epi16(mask, zero);
-
-  p[0] = *p0;
-  p[1] = *p1;
-  q[0] = *q0;
-  q[1] = *q1;
-
-  highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
-}
-
-void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
-  __m128i p1p0, q1q0;
-  __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
-  __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-
-  highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit,
-                             _thresh, bd);
-
-  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
-  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
-  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
-}
-
-void aom_highbd_lpf_horizontal_4_dual_sse2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  __m128i ps[2], qs[2];
-
-  highbd_lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, ps, qs, _blimit0, _limit0,
-                                  _thresh0, _blimit1, _limit1, _thresh1, bd);
-
-  _mm_storeu_si128((__m128i *)(s - 2 * p), ps[1]);
-  _mm_storeu_si128((__m128i *)(s - 1 * p), ps[0]);
-  _mm_storeu_si128((__m128i *)(s + 0 * p), qs[0]);
-  _mm_storeu_si128((__m128i *)(s + 1 * p), qs[1]);
-}
-
-void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
-  __m128i x0, x1, x2, x3, d0, d1, d2, d3;
-  __m128i p1p0, q1q0;
-  __m128i p1, q1;
-
-  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
-  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
-  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
-
-  highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3);
-
-  highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit,
-                             thresh, bd);
-
-  p1 = _mm_srli_si128(p1p0, 8);
-  q1 = _mm_srli_si128(q1q0, 8);
-
-  // transpose from 8x4 to 4x8
-  highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
-
-  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
-  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
-  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
-  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
-}
-
-void aom_highbd_lpf_vertical_4_dual_sse2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i ps[2], qs[2];
-
-  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
-  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
-  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
-  x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p));
-  x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p));
-  x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p));
-  x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p));
-
-  highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
-                               &d2, &d3);
-
-  highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0,
-                                  thresh0, blimit1, limit1, thresh1, bd);
-
-  highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2,
-                               &d3, &d4, &d5, &d6, &d7);
-
-  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
-  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
-  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
-  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
-  _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
-  _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
-  _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
-  _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
-}
-
-void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i x3, x2, x1, x0, p0, q0;
-  __m128i p1p0, q1q0;
-
-  x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
-  x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
-  x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
-  x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
-
-  highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5,
-                               &d6, &d7);
-
-  highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit,
-                             limit, thresh, bd);
-
-  p0 = _mm_srli_si128(p1p0, 8);
-  q0 = _mm_srli_si128(q1q0, 8);
-
-  highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
-
-  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
-  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
-  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
-  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
-}
-
-void aom_highbd_lpf_vertical_6_dual_sse2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i p0, q0, p1, q1, p2, q2;
-
-  x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
-  x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
-  x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
-  x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
-  x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p));
-  x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p));
-  x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p));
-  x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p));
-
-  highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1,
-                           &p0, &q0, &q1, &q2, &d6, &d7);
-
-  highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
-                                  _limit0, _thresh0, _blimit1, _limit1,
-                                  _thresh1, bd);
-
-  highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5,
-                               &d6, &d7);
-
-  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
-  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
-  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
-  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
-  _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
-  _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
-  _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
-  _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
-}
-
-void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i p2, p1, p0, p3, q0;
-  __m128i q1q0, p1p0;
-
-  p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p));
-  p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p));
-  p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p));
-
-  highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5,
-                               &d6, &d7);
-
-  // Loop filtering
-  highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0,
-                             &p1p0, blimit, limit, thresh, bd);
-
-  p0 = _mm_srli_si128(p1p0, 8);
-  q0 = _mm_srli_si128(q1q0, 8);
-
-  highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0,
-                               &d1, &d2, &d3);
-
-  _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0);
-  _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1);
-  _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2);
-  _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3);
-}
-
-void aom_highbd_lpf_vertical_8_dual_sse2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-
-  x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p));
-  x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p));
-  x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p));
-  x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p));
-  x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p));
-  x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p));
-  x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p));
-  x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p));
-
-  highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
-                           &d2, &d3, &d4, &d5, &d6, &d7);
-
-  highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4,
-                                  blimit0, limit0, thresh0, blimit1, limit1,
-                                  thresh1, bd);
-
-  highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1,
-                           &x2, &x3, &x4, &x5, &x6, &x7);
-
-  _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0);
-  _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1);
-  _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2);
-  _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3);
-  _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4);
-  _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5);
-  _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6);
-  _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7);
-}
-
-void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch,
-                                     const uint8_t *blimit,
-                                     const uint8_t *limit,
-                                     const uint8_t *thresh, int bd) {
-  __m128i q[7], p[7], pq[7];
-  __m128i p6, p5, p4, p3;
-  __m128i p6_2, p5_2, p4_2, p3_2;
-  __m128i d0, d1, d2, d3;
-  __m128i d0_2, d1_2, d2_2, d3_2, d7_2;
-
-  p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
-  p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
-  p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
-  p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
-
-  highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4],
-                               &p[3], &p[2], &p[1], &p[0]);
-
-  p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
-  p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
-  p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
-  p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
-
-  highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2],
-                               &q[3], &q[4], &q[5], &q[6], &d7_2);
-
-  highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
-
-  highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2],
-                               &pq[1], &pq[0], &d0, &d1, &d2, &d3);
-
-  q[0] = _mm_srli_si128(pq[0], 8);
-  q[1] = _mm_srli_si128(pq[1], 8);
-  q[2] = _mm_srli_si128(pq[2], 8);
-  q[3] = _mm_srli_si128(pq[3], 8);
-  q[4] = _mm_srli_si128(pq[4], 8);
-  q[5] = _mm_srli_si128(pq[5], 8);
-
-  highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6],
-                               &d7_2, &d0_2, &d1_2, &d2_2, &d3_2);
-
-  _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0);
-  _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2);
-
-  _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1);
-  _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2);
-
-  _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2);
-  _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2);
-
-  _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3);
-  _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2);
-}
-
-void aom_highbd_lpf_vertical_14_dual_sse2(
-    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  __m128i q[7], p[7];
-  __m128i p6, p5, p4, p3, p2, p1, p0, q0;
-  __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2;
-  __m128i d0, d7;
-  __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out;
-
-  p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
-  p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
-  p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
-  p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
-  p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch));
-  p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch));
-  p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch));
-  q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch));
-
-  highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6],
-                           &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]);
-
-  p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
-  p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
-  p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
-  p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
-  p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
-  p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
-  p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
-  q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
-
-  highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2,
-                           &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5],
-                           &q[6], &d7);
-
-  highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1,
-                                   limit1, thresh1, bd);
-
-  highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0],
-                           &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
-                           &d6_out, &d7_out);
-
-  _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out);
-
-  highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7,
-                           &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
-                           &d6_out, &d7_out);
-
-  _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out);
-  _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out);
-  _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out);
-  _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out);
-  _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out);
-  _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out);
-  _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out);
-  _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out);
-}
-#endif  // !CONFIG_NEW_DF
diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c
index 9ada1e6..4e2b038 100644
--- a/av1/common/av1_loopfilter.c
+++ b/av1/common/av1_loopfilter.c
@@ -23,7 +23,6 @@
 #include "av1/common/reconinter.h"
 #include "av1/common/seg_common.h"
 
-#if CONFIG_NEW_DF || CONFIG_PEF
 #define DF_MVS 0
 #if DF_MVS
 #define DF_MV_THRESH 8
@@ -57,7 +56,6 @@
   1460, 1470, 1480, 1489, 1499, 1509, 1519, 1529, 1539, 1549, 1559, 1569, 1579,
   1589, 1599, 1608, 1618, 1628, 1638, 1648, 1658, 1668, 1678
 };
-#endif  // CONFIG_NEW_DF || CONFIG_PEF
 
 static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_MB_PLANE][2] = {
   { SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H },
@@ -65,11 +63,6 @@
   { SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V }
 };
 
-#if !CONFIG_NEW_DF
-static const int delta_lf_id_lut[MAX_MB_PLANE][2] = { { 0, 1 },
-                                                      { 2, 2 },
-                                                      { 3, 3 } };
-#endif
 static const int mode_lf_lut[] = {
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // INTRA_MODES
   1, 0, 1,                                // INTER_SINGLE_MODES (GLOBALMV == 0)
@@ -97,7 +90,6 @@
 #endif  // CONFIG_OPTFLOW_REFINEMENT
 };
 
-#if CONFIG_NEW_DF || CONFIG_PEF
 // Function obtains q_threshold from the quantization index.
 int df_quant_from_qindex(int q_index, int bit_depth) {
   int qstep = ROUND_POWER_OF_TWO(av1_ac_quant_QTX(q_index, 0, bit_depth),
@@ -120,32 +112,7 @@
 
   return side_threshold;
 }
-#endif  // CONFIG_NEW_DF || CONFIG_PEF
 
-#if !CONFIG_NEW_DF
-static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
-  int lvl;
-
-  // For each possible value for the loop filter fill out limits
-  for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) {
-    // Set loop filter parameters that control sharpness.
-    int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4));
-
-    if (sharpness_lvl > 0) {
-      if (block_inside_limit > (9 - sharpness_lvl))
-        block_inside_limit = (9 - sharpness_lvl);
-    }
-
-    if (block_inside_limit < 1) block_inside_limit = 1;
-
-    memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
-    memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
-           SIMD_WIDTH);
-  }
-}
-#endif  // !CONFIG_NEW_DF
-
-#if CONFIG_NEW_DF
 uint16_t av1_get_filter_q(const loop_filter_info_n *lfi_n, const int dir_idx,
                           int plane, const MB_MODE_INFO *mbmi) {
   const int segment_id = mbmi->segment_id;
@@ -162,80 +129,19 @@
       mbmi->ref_frame[0])][mode_lf_lut[mbmi->mode]];
 }
 
-#else
-uint8_t av1_get_filter_level(const AV1_COMMON *cm,
-                             const loop_filter_info_n *lfi_n, const int dir_idx,
-                             int plane, const MB_MODE_INFO *mbmi) {
-  const int segment_id = mbmi->segment_id;
-  if (cm->delta_q_info.delta_lf_present_flag) {
-    int8_t delta_lf;
-    if (cm->delta_q_info.delta_lf_multi) {
-      const int delta_lf_idx = delta_lf_id_lut[plane][dir_idx];
-      delta_lf = mbmi->delta_lf[delta_lf_idx];
-    } else {
-      delta_lf = mbmi->delta_lf_from_base;
-    }
-    int base_level;
-    if (plane == 0)
-      base_level = cm->lf.filter_level[dir_idx];
-    else if (plane == 1)
-      base_level = cm->lf.filter_level_u;
-    else
-      base_level = cm->lf.filter_level_v;
-    int lvl_seg = clamp(delta_lf + base_level, 0, MAX_LOOP_FILTER);
-    assert(plane >= 0 && plane <= 2);
-    const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir_idx];
-    if (segfeature_active(&cm->seg, segment_id, seg_lf_feature_id)) {
-      const int data = get_segdata(&cm->seg, segment_id, seg_lf_feature_id);
-      lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
-    }
-
-    if (cm->lf.mode_ref_delta_enabled) {
-      const int scale = 1 << (lvl_seg >> 5);
-      lvl_seg +=
-          cm->lf.ref_deltas[COMPACT_INDEX0_NRS(mbmi->ref_frame[0])] * scale;
-      if (is_inter_ref_frame(mbmi->ref_frame[0]))
-        lvl_seg += cm->lf.mode_deltas[mode_lf_lut[mbmi->mode]] * scale;
-      lvl_seg = clamp(lvl_seg, 0, MAX_LOOP_FILTER);
-    }
-    return lvl_seg;
-  } else {
-    return lfi_n->lvl[plane][segment_id][dir_idx][COMPACT_INDEX0_NRS(
-        mbmi->ref_frame[0])][mode_lf_lut[mbmi->mode]];
-  }
-}
-#endif  // CONFIG_NEW_DF
-
 void av1_loop_filter_init(AV1_COMMON *cm) {
   assert(MB_MODE_COUNT == NELEMENTS(mode_lf_lut));
   struct loopfilter *lf = &cm->lf;
-#if !CONFIG_NEW_DF
-  loop_filter_info_n *lfi = &cm->lf_info;
-  int lvl;
-#endif  // !CONFIG_NEW_DF
 
   lf->combine_vert_horz_lf = 1;
-#if !CONFIG_NEW_DF
-  // init limits for given sharpness
-  update_sharpness(lfi, lf->sharpness_level);
-
-  // init hev threshold const vectors
-  for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
-    memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
-#endif
 }
-#if CONFIG_NEW_DF
 // Update the loop filter for the current frame.
 // This should be called before loop_filter_rows(),
 // av1_loop_filter_frame() calls this function directly.
 void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
                                 int plane_end) {
-#if CONFIG_NEW_DF
   int q_ind[MAX_MB_PLANE], q_ind_r[MAX_MB_PLANE], side_ind[MAX_MB_PLANE],
       side_ind_r[MAX_MB_PLANE];
-#else
-  int filt_lvl[MAX_MB_PLANE], filt_lvl_r[MAX_MB_PLANE];
-#endif  // CONFIG_NEW_DF
   int plane;
   int seg_id;
   // n_shift is the multiplier for lf_deltas
@@ -245,7 +151,6 @@
   struct loopfilter *const lf = &cm->lf;
   const struct segmentation *const seg = &cm->seg;
 
-#if CONFIG_NEW_DF
 #if DF_DUAL
   q_ind[0] =
       cm->quant_params.base_qindex + cm->lf.delta_q_luma[0] * DF_DELTA_SCALE;
@@ -286,23 +191,10 @@
                cm->lf.delta_q_v * DF_DELTA_SCALE;
   side_ind_r[2] = cm->quant_params.base_qindex + cm->quant_params.v_ac_delta_q +
                   cm->lf.delta_side_v * DF_DELTA_SCALE;
-#else
-  // update sharpness limits
-  update_sharpness(lfi, lf->sharpness_level);
-
-  filt_lvl[0] = cm->lf.filter_level[0];
-  filt_lvl[1] = cm->lf.filter_level_u;
-  filt_lvl[2] = cm->lf.filter_level_v;
-
-  filt_lvl_r[0] = cm->lf.filter_level[1];
-  filt_lvl_r[1] = cm->lf.filter_level_u;
-  filt_lvl_r[2] = cm->lf.filter_level_v;
-#endif  // CONFIG_NEW_DF
 
   assert(plane_start >= AOM_PLANE_Y);
   assert(plane_end <= MAX_MB_PLANE);
 
-#if CONFIG_NEW_DF
   for (plane = plane_start; plane < plane_end; plane++) {
     if (plane == 0 && !cm->lf.filter_level[0] && !cm->lf.filter_level[1])
       break;
@@ -310,36 +202,19 @@
       continue;
     else if (plane == 2 && !cm->lf.filter_level_v)
       continue;
-#else
-  for (plane = plane_start; plane < plane_end; plane++) {
-    if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0])
-      break;
-    else if (plane == 1 && !filt_lvl[1])
-      continue;
-    else if (plane == 2 && !filt_lvl[2])
-      continue;
-#endif  // CONFIG_NEW_DF
+
     for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
       for (int dir = 0; dir < 2; ++dir) {
-#if CONFIG_NEW_DF
         int q_ind_seg = (dir == 0) ? q_ind[plane] : q_ind_r[plane];
         int side_ind_seg = (dir == 0) ? side_ind[plane] : side_ind_r[plane];
-
-#else
-        int lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane];
-#endif  // CONFIG_NEW_DF
         const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir];
 
         if (segfeature_active(seg, seg_id, seg_lf_feature_id)) {
           const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id);
-#if CONFIG_NEW_DF
           // TODO(Andrey): add separate offsets to segments for q and side
           // thresholds // add clamp
           q_ind_seg += data;
           side_ind_seg += data;
-#else
-          lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
-#endif  // CONFIG_NEW_DF
         }
 
         if (!lf->mode_ref_delta_enabled) {
@@ -350,7 +225,6 @@
 
           // we could get rid of this if we assume that deltas are set to
           // zero when not in use; encoder always uses deltas
-#if CONFIG_NEW_DF
           int ref, mode;
           lfi->q_thr[plane][seg_id][dir][INTRA_FRAME_INDEX][0] = q_thr_seg;
           lfi->side_thr[plane][seg_id][dir][INTRA_FRAME_INDEX][0] =
@@ -369,12 +243,7 @@
                 side_thr_seg;
           }
 #endif  // CONFIG_TIP
-#else
-          memset(lfi->lvl[plane][seg_id][dir], lvl_seg,
-                 sizeof(lfi->lvl[plane][seg_id][dir]));
-#endif  // CONFIG_NEW_DF
         } else {
-#if CONFIG_NEW_DF
           // we could get rid of this if we assume that deltas are set to
           // zero when not in use; encoder always uses deltas
           const int scale = 4;
@@ -415,118 +284,11 @@
                                     cm->seq_params.bit_depth);
           }
 #endif  // CONFIG_TIP
-#else
-          int ref, mode;
-          const int scale = 1 << (lvl_seg >> 5);
-          const int intra_lvl =
-              lvl_seg + lf->ref_deltas[INTRA_FRAME_INDEX] * scale;
-          lfi->lvl[plane][seg_id][dir][INTRA_FRAME_INDEX][0] =
-              clamp(intra_lvl, 0, MAX_LOOP_FILTER);
-          for (ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) {
-            for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
-              const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale +
-                                    lf->mode_deltas[mode] * scale;
-              lfi->lvl[plane][seg_id][dir][ref][mode] =
-                  clamp(inter_lvl, 0, MAX_LOOP_FILTER);
-            }
-          }
-#if CONFIG_TIP
-          for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
-            const int inter_lvl = lvl_seg +
-                                  lf->ref_deltas[TIP_FRAME_INDEX] * scale +
-                                  lf->mode_deltas[mode] * scale;
-            lfi->lvl[plane][seg_id][dir][TIP_FRAME_INDEX][mode] =
-                clamp(inter_lvl, 0, MAX_LOOP_FILTER);
-          }
-#endif  // CONFIG_TIP
-#endif  // CONFIG_NEW_DF
         }
       }
     }
   }
 }
-#else
-// Update the loop filter for the current frame.
-// This should be called before loop_filter_rows(),
-// av1_loop_filter_frame() calls this function directly.
-void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
-                                int plane_end) {
-  int filt_lvl[MAX_MB_PLANE], filt_lvl_r[MAX_MB_PLANE];
-  int plane;
-  int seg_id;
-  // n_shift is the multiplier for lf_deltas
-  // the multiplier is 1 for when filter_lvl is between 0 and 31;
-  // 2 when filter_lvl is between 32 and 63
-  loop_filter_info_n *const lfi = &cm->lf_info;
-  struct loopfilter *const lf = &cm->lf;
-  const struct segmentation *const seg = &cm->seg;
-
-  // update sharpness limits
-  update_sharpness(lfi, lf->sharpness_level);
-
-  filt_lvl[0] = cm->lf.filter_level[0];
-  filt_lvl[1] = cm->lf.filter_level_u;
-  filt_lvl[2] = cm->lf.filter_level_v;
-
-  filt_lvl_r[0] = cm->lf.filter_level[1];
-  filt_lvl_r[1] = cm->lf.filter_level_u;
-  filt_lvl_r[2] = cm->lf.filter_level_v;
-
-  assert(plane_start >= AOM_PLANE_Y);
-  assert(plane_end <= MAX_MB_PLANE);
-
-  for (plane = plane_start; plane < plane_end; plane++) {
-    if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0])
-      break;
-    else if (plane == 1 && !filt_lvl[1])
-      continue;
-    else if (plane == 2 && !filt_lvl[2])
-      continue;
-
-    for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
-      for (int dir = 0; dir < 2; ++dir) {
-        int lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane];
-        const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir];
-        if (segfeature_active(seg, seg_id, seg_lf_feature_id)) {
-          const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id);
-          lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
-        }
-
-        if (!lf->mode_ref_delta_enabled) {
-          // we could get rid of this if we assume that deltas are set to
-          // zero when not in use; encoder always uses deltas
-          memset(lfi->lvl[plane][seg_id][dir], lvl_seg,
-                 sizeof(lfi->lvl[plane][seg_id][dir]));
-        } else {
-          int ref, mode;
-          const int scale = 1 << (lvl_seg >> 5);
-          const int intra_lvl =
-              lvl_seg + lf->ref_deltas[INTRA_FRAME_INDEX] * scale;
-          lfi->lvl[plane][seg_id][dir][INTRA_FRAME_INDEX][0] =
-              clamp(intra_lvl, 0, MAX_LOOP_FILTER);
-          for (ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) {
-            for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
-              const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale +
-                                    lf->mode_deltas[mode] * scale;
-              lfi->lvl[plane][seg_id][dir][ref][mode] =
-                  clamp(inter_lvl, 0, MAX_LOOP_FILTER);
-            }
-          }
-#if CONFIG_TIP
-          for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
-            const int inter_lvl = lvl_seg +
-                                  lf->ref_deltas[TIP_FRAME_INDEX] * scale +
-                                  lf->mode_deltas[mode] * scale;
-            lfi->lvl[plane][seg_id][dir][TIP_FRAME_INDEX][mode] =
-                clamp(inter_lvl, 0, MAX_LOOP_FILTER);
-          }
-#endif  // CONFIG_TIP
-        }
-      }
-    }
-  }
-}
-#endif  // CONFIG_NEW_DF
 
 static TX_SIZE get_transform_size(const MACROBLOCKD *const xd,
                                   const MB_MODE_INFO *const mbmi,
@@ -584,10 +346,8 @@
   const uint8_t *lim;
   const uint8_t *mblim;
   const uint8_t *hev_thr;
-#if CONFIG_NEW_DF
   uint16_t q_threshold;
   uint16_t side_threshold;
-#endif  // CONFIG_NEW_DF
 } AV1_DEBLOCKING_PARAMETERS;
 
 // Return TX_SIZE from get_transform_size(), so it is plane and direction
@@ -646,21 +406,13 @@
 
     // prepare outer edge parameters. deblock the edge if it's an edge of a TU
     {
-#if CONFIG_NEW_DF
       const uint32_t curr_q =
           av1_get_filter_q(&cm->lf_info, edge_dir, plane, mbmi);
       const uint32_t curr_side =
           av1_get_filter_side(&cm->lf_info, edge_dir, plane, mbmi);
-#else
-      const uint32_t curr_level =
-          av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi);
-#endif  // CONFIG_NEW_DF
 
       const int curr_skipped =
           mbmi->skip_txfm[plane_type] && is_inter_block(mbmi, tree_type);
-#if !CONFIG_NEW_DF
-      uint32_t level = curr_level;
-#endif  // !CONFIG_NEW_DF
       if (coord) {
         {
           const MB_MODE_INFO *const mi_prev = *(mi - mode_step);
@@ -673,15 +425,11 @@
           const TX_SIZE pv_ts =
               get_transform_size(xd, mi_prev, edge_dir, pv_row, pv_col, plane,
                                  tree_type, plane_ptr);
-#if CONFIG_NEW_DF
           const uint32_t pv_q =
               av1_get_filter_q(&cm->lf_info, edge_dir, plane, mi_prev);
           const uint32_t pv_side =
               av1_get_filter_side(&cm->lf_info, edge_dir, plane, mi_prev);
-#else
-          const uint32_t pv_lvl =
-              av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev);
-#endif  // CONFIG_NEW_DF
+
           const int pv_skip_txfm = mi_prev->skip_txfm[plane_type] &&
                                    is_inter_block(mi_prev, tree_type);
           const BLOCK_SIZE bsize = get_mb_plane_block_size_from_tree_type(
@@ -699,7 +447,6 @@
           const int32_t pu_edge = !(coord & prediction_masks);
           // if the current and the previous blocks are skipped,
           // deblock the edge if the edge belongs to a PU's edge only.
-#if CONFIG_NEW_DF
 #if DF_REDUCED_SB_EDGE
           const BLOCK_SIZE superblock_size = get_plane_block_size(
               cm->seq_params.sb_size, plane_ptr->subsampling_x,
@@ -770,19 +517,13 @@
             }
           }
 #endif  // DF_MVS
-#endif  // CONFIG_NEW_DF
 
-#if CONFIG_NEW_DF
           if (((curr_q && curr_side) || (pv_q && pv_side)) &&
-#else
-          if ((curr_level || pv_lvl) &&
-#endif
-#if CONFIG_NEW_DF && DF_MVS
+#if DF_MVS
               (!pv_skip_txfm || !curr_skipped || diff_mvs)) {
 #else
               (!pv_skip_txfm || !curr_skipped || pu_edge)) {
 #endif
-#if CONFIG_NEW_DF
             TX_SIZE clipped_ts = ts;
             if (!plane) {
               if (((VERT_EDGE == edge_dir) && (width < x + 16)) ||
@@ -798,9 +539,6 @@
               }
             }
             const TX_SIZE min_ts = AOMMIN(clipped_ts, pv_ts);
-#else
-            const TX_SIZE min_ts = AOMMIN(ts, pv_ts);
-#endif  // CONFIG_NEW_DF
             if (TX_4X4 >= min_ts) {
               params->filter_length = 4;
             } else if (TX_8X8 == min_ts) {
@@ -853,28 +591,13 @@
             }
 #endif  // DF_FILT26
 
-#if CONFIG_NEW_DF
             // update the level if the current block is skipped,
             // but the previous one is not
             params->q_threshold = (curr_q) ? (curr_q) : (pv_q);
             params->side_threshold = (curr_side) ? (curr_side) : (pv_side);
-#else
-            // update the level if the current block is skipped,
-            // but the previous one is not
-            level = (curr_level) ? (curr_level) : (pv_lvl);
-#endif  // CONFIG_NEW_DF
           }
         }
       }
-#if !CONFIG_NEW_DF
-      // prepare common parameters
-      if (params->filter_length) {
-        const loop_filter_thresh *const limits = cm->lf_info.lfthr + level;
-        params->lim = limits->lim;
-        params->mblim = limits->mblim;
-        params->hev_thr = limits->hev_thr;
-      }
-#endif  // !CONFIG_NEW_DF
     }
   }
   return ts;
@@ -884,9 +607,7 @@
                                  const MACROBLOCKD *const xd, const int plane,
                                  const MACROBLOCKD_PLANE *const plane_ptr,
                                  const uint32_t mi_row, const uint32_t mi_col) {
-#if CONFIG_NEW_DF
   if (!plane && !cm->lf.filter_level[0]) return;
-#endif
   const uint32_t scale_horz = plane_ptr->subsampling_x;
   const uint32_t scale_vert = plane_ptr->subsampling_y;
   uint16_t *const dst_ptr = plane_ptr->dst.buf;
@@ -916,39 +637,12 @@
       }
 
       const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
-#if CONFIG_NEW_DF
-
       if (params.filter_length) {
         aom_highbd_lpf_vertical_generic_c(p, dst_stride, params.filter_length,
                                           &params.q_threshold,
                                           &params.side_threshold, bit_depth);
       }
-#else
-      switch (params.filter_length) {
-        // apply 4-tap filtering
-        case 4:
-          aom_highbd_lpf_vertical_4(p, dst_stride, params.mblim, params.lim,
-                                    params.hev_thr, bit_depth);
-          break;
-        case 6:  // apply 6-tap filter for chroma plane only
-          assert(plane != 0);
-          aom_highbd_lpf_vertical_6(p, dst_stride, params.mblim, params.lim,
-                                    params.hev_thr, bit_depth);
-          break;
-        // apply 8-tap filtering
-        case 8:
-          aom_highbd_lpf_vertical_8(p, dst_stride, params.mblim, params.lim,
-                                    params.hev_thr, bit_depth);
-          break;
-        // apply 14-tap filtering
-        case 14:
-          aom_highbd_lpf_vertical_14(p, dst_stride, params.mblim, params.lim,
-                                     params.hev_thr, bit_depth);
-          break;
-        // no filtering
-        default: break;
-      }
-#endif  // !CONFIG_NEW_DF
+
       // advance the destination pointer
       advance_units = tx_size_wide_unit[tx_size];
       x += advance_units;
@@ -961,9 +655,7 @@
                                  const MACROBLOCKD *const xd, const int plane,
                                  const MACROBLOCKD_PLANE *const plane_ptr,
                                  const uint32_t mi_row, const uint32_t mi_col) {
-#if CONFIG_NEW_DF
   if (!plane && !cm->lf.filter_level[1]) return;
-#endif
   const uint32_t scale_horz = plane_ptr->subsampling_x;
   const uint32_t scale_vert = plane_ptr->subsampling_y;
   uint16_t *const dst_ptr = plane_ptr->dst.buf;
@@ -993,41 +685,12 @@
       }
       const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
 
-#if CONFIG_NEW_DF
       if (params.filter_length) {
         aom_highbd_lpf_horizontal_generic_c(p, dst_stride, params.filter_length,
                                             &params.q_threshold,
                                             &params.side_threshold, bit_depth);
       }
 
-#else
-      switch (params.filter_length) {
-        // apply 4-tap filtering
-        case 4:
-          aom_highbd_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim,
-                                      params.hev_thr, bit_depth);
-          break;
-        // apply 6-tap filtering
-        case 6:
-          assert(plane != 0);
-          aom_highbd_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim,
-                                      params.hev_thr, bit_depth);
-          break;
-        // apply 8-tap filtering
-        case 8:
-          aom_highbd_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim,
-                                      params.hev_thr, bit_depth);
-          break;
-        // apply 14-tap filtering
-        case 14:
-          aom_highbd_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim,
-                                       params.hev_thr, bit_depth);
-          break;
-        // no filtering
-        default: break;
-      }
-#endif  //! CONFIG_NEW_DF
-
       // advance the destination pointer
       advance_units = tx_size_high_unit[tx_size];
       y += advance_units;
diff --git a/av1/common/av1_loopfilter.h b/av1/common/av1_loopfilter.h
index 094bbb4..e01f1f7 100644
--- a/av1/common/av1_loopfilter.h
+++ b/av1/common/av1_loopfilter.h
@@ -23,8 +23,6 @@
 extern "C" {
 #endif
 
-#if CONFIG_NEW_DF
-
 #define MAX_DF_OFFSETS 64
 #define ZERO_DF_OFFSET 32
 
@@ -42,10 +40,6 @@
 #define DF_CHROMA_WIDE 1
 
 #define DF_REDUCED_SB_EDGE 1
-#else
-#define DF_FILT26 0
-#define DF_CHROMA_WIDE 0
-#endif  // CONFIG_NEW_DF
 
 #define MAX_LOOP_FILTER 63
 #define MAX_SHARPNESS 7
@@ -110,7 +104,6 @@
   int filter_level_u;
   int filter_level_v;
 
-#if CONFIG_NEW_DF
 #if DF_DUAL
   int delta_q_luma[2];
   int delta_side_luma[2];
@@ -122,9 +115,7 @@
   int delta_side_u;
   int delta_q_v;
   int delta_side_v;
-#else
-  int sharpness_level;
-#endif  // CONFIG_NEW_DF
+
   uint8_t mode_ref_delta_enabled;
   uint8_t mode_ref_delta_update;
 
@@ -152,16 +143,10 @@
 } loop_filter_thresh;
 
 typedef struct {
-#if CONFIG_NEW_DF
   uint16_t q_thr[MAX_MB_PLANE][MAX_SEGMENTS][2][SINGLE_REF_FRAMES]
                 [MAX_MODE_LF_DELTAS];
   uint16_t side_thr[MAX_MB_PLANE][MAX_SEGMENTS][2][SINGLE_REF_FRAMES]
                    [MAX_MODE_LF_DELTAS];
-#else
-  loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1];
-  uint8_t lvl[MAX_MB_PLANE][MAX_SEGMENTS][2][SINGLE_REF_FRAMES]
-             [MAX_MODE_LF_DELTAS];
-#endif
 } loop_filter_info_n;
 
 typedef struct LoopFilterWorkerData {
@@ -208,16 +193,9 @@
                                  const MACROBLOCKD *const xd, const int plane,
                                  const MACROBLOCKD_PLANE *const plane_ptr,
                                  const uint32_t mi_row, const uint32_t mi_col);
-#if !CONFIG_NEW_DF
-uint8_t av1_get_filter_level(const struct AV1Common *cm,
-                             const loop_filter_info_n *lfi_n, const int dir_idx,
-                             int plane, const MB_MODE_INFO *mbmi);
-#endif
-#if CONFIG_NEW_DF || CONFIG_PEF
 int df_quant_from_qindex(int q_index, int bit_depth);
 
 int df_side_from_qindex(int q_index, int bit_depth);
-#endif  // CONFIG_NEW_DF || CONFIG_PEF
 #if CONFIG_LPF_MASK
 void av1_filter_block_plane_ver(struct AV1Common *const cm,
                                 struct macroblockd_plane *const plane_ptr,
diff --git a/av1/common/entropymode.c b/av1/common/entropymode.c
index 6d3ecec..b4f0ad1 100644
--- a/av1/common/entropymode.c
+++ b/av1/common/entropymode.c
@@ -3245,16 +3245,8 @@
 }
 
 static void set_default_lf_deltas(struct loopfilter *lf) {
-#if !CONFIG_NEW_DF  // was DF_REF_DELTAS
-  lf->mode_ref_delta_enabled = 1;
-  lf->mode_ref_delta_update = 1;
-
-  av1_set_default_ref_deltas(lf->ref_deltas);
-  av1_set_default_mode_deltas(lf->mode_deltas);
-#else
   lf->mode_ref_delta_enabled = 0;
   lf->mode_ref_delta_update = 0;
-#endif
 }
 
 void av1_setup_frame_contexts(AV1_COMMON *cm) {
diff --git a/av1/common/reconintra.h b/av1/common/reconintra.h
index d193249..e11e0b2 100644
--- a/av1/common/reconintra.h
+++ b/av1/common/reconintra.h
@@ -23,12 +23,8 @@
 extern "C" {
 #endif
 
-#if CONFIG_NEW_DF
 #define DF_RESTRICT_ORIP 1
 #define ORIP_BLOCK_SIZE 32
-#else
-#define DF_RESTRICT_ORIP 0
-#endif
 
 #if CONFIG_AIMC
 /*! \brief set the luma intra mode and delta angles for a given mode index.
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 3920205..c3c0d35 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -3125,7 +3125,6 @@
           1) == 0);
 #endif  // CONFIG_LR_FLEX_SYNTAX
 }
-#if CONFIG_NEW_DF
 static AOM_INLINE void setup_loopfilter(AV1_COMMON *cm,
                                         struct aom_read_bit_buffer *rb) {
   const int num_planes = av1_num_planes(cm);
@@ -3281,60 +3280,6 @@
   lf->mode_ref_delta_update = 0;
   lf->mode_ref_delta_enabled = 0;
 }
-#else
-static AOM_INLINE void setup_loopfilter(AV1_COMMON *cm,
-                                        struct aom_read_bit_buffer *rb) {
-  const int num_planes = av1_num_planes(cm);
-  struct loopfilter *lf = &cm->lf;
-
-  if (is_global_intrabc_allowed(cm) || cm->features.coded_lossless) {
-    // write default deltas to frame buffer
-    av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
-    av1_set_default_mode_deltas(cm->cur_frame->mode_deltas);
-    return;
-  }
-  assert(!cm->features.coded_lossless);
-  if (cm->prev_frame) {
-    // write deltas to frame buffer
-    memcpy(lf->ref_deltas, cm->prev_frame->ref_deltas, SINGLE_REF_FRAMES);
-    memcpy(lf->mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS);
-  } else {
-    av1_set_default_ref_deltas(lf->ref_deltas);
-    av1_set_default_mode_deltas(lf->mode_deltas);
-  }
-  lf->filter_level[0] = aom_rb_read_literal(rb, 6);
-  lf->filter_level[1] = aom_rb_read_literal(rb, 6);
-  if (num_planes > 1) {
-    if (lf->filter_level[0] || lf->filter_level[1]) {
-      lf->filter_level_u = aom_rb_read_literal(rb, 6);
-      lf->filter_level_v = aom_rb_read_literal(rb, 6);
-    }
-  }
-  lf->sharpness_level = aom_rb_read_literal(rb, 3);
-
-  // Read in loop filter deltas applied at the MB level based on mode or ref
-  // frame.
-  lf->mode_ref_delta_update = 0;
-
-  lf->mode_ref_delta_enabled = aom_rb_read_bit(rb);
-  if (lf->mode_ref_delta_enabled) {
-    lf->mode_ref_delta_update = aom_rb_read_bit(rb);
-    if (lf->mode_ref_delta_update) {
-      for (int i = 0; i < SINGLE_REF_FRAMES; i++)
-        if (aom_rb_read_bit(rb))
-          lf->ref_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6);
-
-      for (int i = 0; i < MAX_MODE_LF_DELTAS; i++)
-        if (aom_rb_read_bit(rb))
-          lf->mode_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6);
-    }
-  }
-
-  // write deltas to frame buffer
-  memcpy(cm->cur_frame->ref_deltas, lf->ref_deltas, SINGLE_REF_FRAMES);
-  memcpy(cm->cur_frame->mode_deltas, lf->mode_deltas, MAX_MODE_LF_DELTAS);
-}
-#endif  // CONFIG_NEW_DF
 
 static AOM_INLINE void setup_cdef(AV1_COMMON *cm,
                                   struct aom_read_bit_buffer *rb) {
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 110f436..4bbcaaf 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -3659,38 +3659,7 @@
 #endif  // CONFIG_PC_WIENER
   }
 }
-#if !CONFIG_NEW_DF
-// Only write out the ref delta section if any of the elements
-// will signal a delta.
-static bool is_mode_ref_delta_meaningful(AV1_COMMON *cm) {
-  struct loopfilter *lf = &cm->lf;
-  if (!lf->mode_ref_delta_update) {
-    return 0;
-  }
-  const RefCntBuffer *buf = get_primary_ref_frame_buf(cm);
-  int8_t last_ref_deltas[SINGLE_REF_FRAMES];
-  int8_t last_mode_deltas[MAX_MODE_LF_DELTAS];
-  if (buf == NULL) {
-    av1_set_default_ref_deltas(last_ref_deltas);
-    av1_set_default_mode_deltas(last_mode_deltas);
-  } else {
-    memcpy(last_ref_deltas, buf->ref_deltas, SINGLE_REF_FRAMES);
-    memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS);
-  }
-  for (int i = 0; i < SINGLE_REF_FRAMES; i++) {
-    if (lf->ref_deltas[i] != last_ref_deltas[i]) {
-      return true;
-    }
-  }
-  for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) {
-    if (lf->mode_deltas[i] != last_mode_deltas[i]) {
-      return true;
-    }
-  }
-  return false;
-}
-#endif  // !CONFIG_NEW_DF
-#if CONFIG_NEW_DF
+
 static AOM_INLINE void encode_loopfilter(AV1_COMMON *cm,
                                          struct aom_write_bit_buffer *wb) {
   assert(!cm->features.coded_lossless);
@@ -3805,60 +3774,6 @@
 #endif  // DF_TWO_PARAM
   }
 }
-#else
-static AOM_INLINE void encode_loopfilter(AV1_COMMON *cm,
-                                         struct aom_write_bit_buffer *wb) {
-  assert(!cm->features.coded_lossless);
-  if (is_global_intrabc_allowed(cm)) return;
-  const int num_planes = av1_num_planes(cm);
-  struct loopfilter *lf = &cm->lf;
-
-  // Encode the loop filter level and type
-  aom_wb_write_literal(wb, lf->filter_level[0], 6);
-  aom_wb_write_literal(wb, lf->filter_level[1], 6);
-  if (num_planes > 1) {
-    if (lf->filter_level[0] || lf->filter_level[1]) {
-      aom_wb_write_literal(wb, lf->filter_level_u, 6);
-      aom_wb_write_literal(wb, lf->filter_level_v, 6);
-    }
-  }
-  aom_wb_write_literal(wb, lf->sharpness_level, 3);
-
-  aom_wb_write_bit(wb, lf->mode_ref_delta_enabled);
-
-  // Write out loop filter deltas applied at the MB level based on mode or
-  // ref frame (if they are enabled), only if there is information to write.
-  int meaningful = is_mode_ref_delta_meaningful(cm);
-  aom_wb_write_bit(wb, meaningful);
-  if (!meaningful) {
-    return;
-  }
-
-  const RefCntBuffer *buf = get_primary_ref_frame_buf(cm);
-  int8_t last_ref_deltas[SINGLE_REF_FRAMES];
-  int8_t last_mode_deltas[MAX_MODE_LF_DELTAS];
-  if (buf == NULL) {
-    av1_set_default_ref_deltas(last_ref_deltas);
-    av1_set_default_mode_deltas(last_mode_deltas);
-  } else {
-    memcpy(last_ref_deltas, buf->ref_deltas, SINGLE_REF_FRAMES);
-    memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS);
-  }
-
-  for (int i = 0; i < SINGLE_REF_FRAMES; i++) {
-    const int delta = lf->ref_deltas[i];
-    const int changed = delta != last_ref_deltas[i];
-    aom_wb_write_bit(wb, changed);
-    if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6);
-  }
-  for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) {
-    const int delta = lf->mode_deltas[i];
-    const int changed = delta != last_mode_deltas[i];
-    aom_wb_write_bit(wb, changed);
-    if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6);
-  }
-}
-#endif  // CONFIG_NEW_DF
 
 static AOM_INLINE void encode_cdef(const AV1_COMMON *cm,
                                    struct aom_write_bit_buffer *wb) {
diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c
index ed6c168..113bf17 100644
--- a/av1/encoder/picklpf.c
+++ b/av1/encoder/picklpf.c
@@ -28,10 +28,8 @@
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/picklpf.h"
 
-#if CONFIG_NEW_DF
 #include <float.h>
 #define CHROMA_LAMBDA_MULT 6
-#endif  // CONFIG_NEW_DF
 
 static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc,
                             YV12_BUFFER_CONFIG *dst_bc, int plane) {
@@ -42,33 +40,16 @@
     default: assert(plane >= 0 && plane <= 2); break;
   }
 }
-#if !CONFIG_NEW_DF
-int av1_get_max_filter_level(const AV1_COMP *cpi) {
-  (void)cpi;
-  return MAX_LOOP_FILTER;
-}
-
-#endif
 static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
-                                AV1_COMP *const cpi,
-#if CONFIG_NEW_DF
-                                int q_offset, int side_offset,
-#else
-                                int filt_level,
-#endif
-                                int partial_frame, int plane, int dir) {
+                                AV1_COMP *const cpi, int q_offset,
+                                int side_offset, int partial_frame, int plane,
+                                int dir) {
   MultiThreadInfo *const mt_info = &cpi->mt_info;
   int num_workers = mt_info->num_workers;
   AV1_COMMON *const cm = &cpi->common;
   int64_t filt_err;
 
   assert(plane >= 0 && plane <= 2);
-#if !CONFIG_NEW_DF
-  int filter_level[2] = { filt_level, filt_level };
-  if (plane == 0 && dir == 0) filter_level[1] = cm->lf.filter_level[1];
-  if (plane == 0 && dir == 1) filter_level[0] = cm->lf.filter_level[0];
-#endif  // !CONFIG_NEW_DF
-#if CONFIG_NEW_DF
   // set base filters for use of av1_get_filter_level when in DELTA_LF mode
   switch (plane) {
     case 0:
@@ -98,17 +79,7 @@
       cm->lf.delta_side_v = side_offset;
       break;
   }
-#else
-  // set base filters for use of av1_get_filter_level when in DELTA_LF mode
-  switch (plane) {
-    case 0:
-      cm->lf.filter_level[0] = filter_level[0];
-      cm->lf.filter_level[1] = filter_level[1];
-      break;
-    case 1: cm->lf.filter_level_u = filter_level[0]; break;
-    case 2: cm->lf.filter_level_v = filter_level[0]; break;
-  }
-#endif  // CONFIG_NEW_DF
+
   // TODO(any): please enable multi-thread and remove the flag when loop
   // filter mask is compatible with multi-thread.
   if (num_workers > 1)
@@ -134,7 +105,6 @@
   return filt_err;
 }
 
-#if CONFIG_NEW_DF
 static int search_filter_offsets(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
                                  int partial_frame,
                                  const int *last_frame_offsets,
@@ -341,112 +311,7 @@
 
   return best_cost < start_cost ? offset_best : offsets[off_ind];
 }
-#else
-static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
-                               int partial_frame,
-                               const int *last_frame_filter_level,
-                               double *best_cost_ret, int plane, int dir) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int min_filter_level = 0;
-  const int max_filter_level = av1_get_max_filter_level(cpi);
-  int filt_direction = 0;
-  int64_t best_err;
-  int filt_best;
-  MACROBLOCK *x = &cpi->td.mb;
 
-  // Start the search at the previous frame filter level unless it is now out of
-  // range.
-  int lvl;
-  switch (plane) {
-    case 0:
-      switch (dir) {
-        case 2:
-          lvl = (last_frame_filter_level[0] + last_frame_filter_level[1] + 1) >>
-                1;
-          break;
-        case 0:
-        case 1: lvl = last_frame_filter_level[dir]; break;
-        default: assert(dir >= 0 && dir <= 2); return 0;
-      }
-      break;
-    case 1: lvl = last_frame_filter_level[2]; break;
-    case 2: lvl = last_frame_filter_level[3]; break;
-    default: assert(plane >= 0 && plane <= 2); return 0;
-  }
-  int filt_mid = clamp(lvl, min_filter_level, max_filter_level);
-  int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
-  // Sum squared error at each filter level
-  int64_t ss_err[MAX_LOOP_FILTER + 1];
-
-  // Set each entry to -1
-  memset(ss_err, 0xFF, sizeof(ss_err));
-  yv12_copy_plane(&cm->cur_frame->buf, &cpi->last_frame_uf, plane);
-  best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane, dir);
-  filt_best = filt_mid;
-  ss_err[filt_mid] = best_err;
-
-  while (filter_step > 0) {
-    const int filt_high = AOMMIN(filt_mid + filter_step, max_filter_level);
-    const int filt_low = AOMMAX(filt_mid - filter_step, min_filter_level);
-
-    // Bias against raising loop filter in favor of lowering it.
-    int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
-
-    // yx, bias less for large block size
-    if (cm->features.tx_mode != ONLY_4X4) bias >>= 1;
-
-    if (filt_direction <= 0 && filt_low != filt_mid) {
-      // Get Low filter error score
-      if (ss_err[filt_low] < 0) {
-        ss_err[filt_low] =
-            try_filter_frame(sd, cpi, filt_low, partial_frame, plane, dir);
-      }
-      // If value is close to the best so far then bias towards a lower loop
-      // filter value.
-      if (ss_err[filt_low] < (best_err + bias)) {
-        // Was it actually better than the previous best?
-        if (ss_err[filt_low] < best_err) {
-          best_err = ss_err[filt_low];
-        }
-        filt_best = filt_low;
-      }
-    }
-
-    // Now look at filt_high
-    if (filt_direction >= 0 && filt_high != filt_mid) {
-      if (ss_err[filt_high] < 0) {
-        ss_err[filt_high] =
-            try_filter_frame(sd, cpi, filt_high, partial_frame, plane, dir);
-      }
-      // If value is significantly better than previous best, bias added against
-      // raising filter value
-      if (ss_err[filt_high] < (best_err - bias)) {
-        best_err = ss_err[filt_high];
-        filt_best = filt_high;
-      }
-    }
-
-    // Half the step distance if the best filter value was the same as last time
-    if (filt_best == filt_mid) {
-      filter_step /= 2;
-      filt_direction = 0;
-    } else {
-      filt_direction = (filt_best < filt_mid) ? -1 : 1;
-      filt_mid = filt_best;
-    }
-  }
-
-  // Update best error
-  best_err = ss_err[filt_best];
-
-  if (best_cost_ret)
-    *best_cost_ret = RDCOST_DBL_WITH_NATIVE_BD_DIST(
-        x->rdmult, 0, (best_err << 4), cm->seq_params.bit_depth);
-  return filt_best;
-}
-#endif  // CONFIG_NEW_DF
-
-#if CONFIG_NEW_DF
 void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
                            LPF_PICK_METHOD method) {
   AV1_COMMON *const cm = &cpi->common;
@@ -694,99 +559,3 @@
 #endif  // DF_DUAL
   }
 }
-#else
-void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
-                           LPF_PICK_METHOD method) {
-  AV1_COMMON *const cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-  struct loopfilter *const lf = &cm->lf;
-  (void)sd;
-
-  lf->sharpness_level = 0;
-  cpi->td.mb.rdmult = cpi->rd.RDMULT;
-
-  if (method == LPF_PICK_MINIMAL_LPF) {
-    lf->filter_level[0] = 0;
-    lf->filter_level[1] = 0;
-  } else if (method >= LPF_PICK_FROM_Q) {
-    const int min_filter_level = 0;
-    const int max_filter_level = av1_get_max_filter_level(cpi);
-
-    const int q =
-        ROUND_POWER_OF_TWO(av1_ac_quant_QTX(cm->quant_params.base_qindex, 0,
-                                            cm->seq_params.bit_depth),
-                           QUANT_TABLE_BITS);
-
-    // based on tests result for rtc test set
-    // 0.04590 boosted or 0.02295 non-booseted in 18-bit fixed point
-    const int strength_boost_q_treshold = 700;
-    const int inter_frame_multiplier =
-        q > strength_boost_q_treshold ? 12034 : 6017;
-    // These values were determined by linear fitting the result of the
-    // searched level for 8 bit depth:
-    // Keyframes: filt_guess = q * 0.06699 - 1.60817
-    // Other frames: filt_guess = q * inter_frame_multiplier + 2.48225
-    //
-    // And high bit depth separately:
-    // filt_guess = q * 0.316206 + 3.87252
-    int filt_guess;
-    switch (cm->seq_params.bit_depth) {
-      case AOM_BITS_8:
-        filt_guess =
-            (cm->current_frame.frame_type == KEY_FRAME)
-                ? ROUND_POWER_OF_TWO(q * 17563 - 421574, 18)
-                : ROUND_POWER_OF_TWO(q * inter_frame_multiplier + 650707, 18);
-        break;
-      case AOM_BITS_10:
-        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
-        break;
-      case AOM_BITS_12:
-        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
-        break;
-      default:
-        assert(0 &&
-               "bit_depth should be AOM_BITS_8, AOM_BITS_10 "
-               "or AOM_BITS_12");
-        return;
-    }
-    if (cm->seq_params.bit_depth != AOM_BITS_8 &&
-        cm->current_frame.frame_type == KEY_FRAME)
-      filt_guess -= 4;
-    // TODO(chengchen): retrain the model for Y, U, V filter levels
-    lf->filter_level[0] = clamp(filt_guess, min_filter_level, max_filter_level);
-    lf->filter_level[1] = clamp(filt_guess, min_filter_level, max_filter_level);
-    lf->filter_level_u = clamp(filt_guess, min_filter_level, max_filter_level);
-    lf->filter_level_v = clamp(filt_guess, min_filter_level, max_filter_level);
-  } else {
-    // TODO(anyone): What are good initial levels for keyframes?
-    int last_frame_filter_level[4] = { 0 };
-    if (!frame_is_intra_only(cm)) {
-      last_frame_filter_level[0] = lf->filter_level[0];
-      last_frame_filter_level[1] = lf->filter_level[1];
-      last_frame_filter_level[2] = lf->filter_level_u;
-      last_frame_filter_level[3] = lf->filter_level_v;
-    }
-
-    lf->filter_level[0] = lf->filter_level[1] =
-        search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
-                            last_frame_filter_level, NULL, 0, 2);
-    if (method != LPF_PICK_FROM_FULL_IMAGE_NON_DUAL) {
-      lf->filter_level[0] =
-          search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
-                              last_frame_filter_level, NULL, 0, 0);
-      lf->filter_level[1] =
-          search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
-                              last_frame_filter_level, NULL, 0, 1);
-    }
-
-    if (num_planes > 1) {
-      lf->filter_level_u =
-          search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
-                              last_frame_filter_level, NULL, 1, 0);
-      lf->filter_level_v =
-          search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
-                              last_frame_filter_level, NULL, 2, 0);
-    }
-  }
-}
-#endif  // !CONFIG_NEW_DF
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index f87d0b2..fd5021b 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -231,8 +231,6 @@
 set_aom_config_var(
   CONFIG_SKIP_MODE_DRL_WITH_REF_IDX 1
   "AV2 experiment flag to enable DRL with ref_MV_idx for skip mode.")
-set_aom_config_var(CONFIG_NEW_DF 1
-                   "AV2 experiment flag on new deblocking filter.")
 set_aom_config_var(CONFIG_TIP 1 "Enable temporal interpolated prediction (TIP)")
 set_aom_config_var(CONFIG_OPTFLOW_ON_TIP 1
                    "Enable optical flow refinement on top of TIP")
diff --git a/test/lpf_test.cc b/test/lpf_test.cc
deleted file mode 100644
index 4d75c3d..0000000
--- a/test/lpf_test.cc
+++ /dev/null
@@ -1,572 +0,0 @@
-/*
- * Copyright (c) 2021, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 3-Clause Clear License
- * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
- * License was not distributed with this source code in the LICENSE file, you
- * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
- * Alliance for Open Media Patent License 1.0 was not distributed with this
- * source code in the PATENTS file, you can obtain it at
- * aomedia.org/license/patent-license/.
- */
-
-#include <cmath>
-#include <cstdlib>
-#include <string>
-#include <tuple>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "config/aom_config.h"
-
-#if !CONFIG_NEW_DF
-#include "config/aom_dsp_rtcd.h"
-
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "av1/common/av1_loopfilter.h"
-#include "av1/common/entropy.h"
-#include "aom/aom_integer.h"
-
-using libaom_test::ACMRandom;
-
-namespace {
-// Horizontally and Vertically need 32x32: 8  Coeffs preceeding filtered section
-//                                         16 Coefs within filtered section
-//                                         8  Coeffs following filtered section
-const int kNumCoeffs = 1024;
-
-const int number_of_iterations = 10000;
-
-const int kSpeedTestNum = 500000;
-
-#define LOOP_PARAM \
-  int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh
-#define DUAL_LOOP_PARAM                                                      \
-  int p, const uint8_t *blimit0, const uint8_t *limit0,                      \
-      const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, \
-      const uint8_t *thresh1
-
-typedef void (*loop_op_t)(uint8_t *s, LOOP_PARAM);
-typedef void (*dual_loop_op_t)(uint8_t *s, DUAL_LOOP_PARAM);
-typedef void (*hbdloop_op_t)(uint16_t *s, LOOP_PARAM, int bd);
-typedef void (*hbddual_loop_op_t)(uint16_t *s, DUAL_LOOP_PARAM, int bd);
-
-typedef std::tuple<hbdloop_op_t, hbdloop_op_t, int> hbdloop_param_t;
-typedef std::tuple<hbddual_loop_op_t, hbddual_loop_op_t, int>
-    hbddual_loop_param_t;
-typedef std::tuple<loop_op_t, loop_op_t, int> loop_param_t;
-typedef std::tuple<dual_loop_op_t, dual_loop_op_t, int> dual_loop_param_t;
-
-template <typename Pixel_t, int PIXEL_WIDTH_t>
-void InitInput(Pixel_t *s, Pixel_t *ref_s, ACMRandom *rnd, const uint8_t limit,
-               const int mask, const int32_t p, const int i) {
-  uint16_t tmp_s[kNumCoeffs];
-
-  for (int j = 0; j < kNumCoeffs;) {
-    const uint8_t val = rnd->Rand8();
-    if (val & 0x80) {  // 50% chance to choose a new value.
-      tmp_s[j] = rnd->Rand16();
-      j++;
-    } else {  // 50% chance to repeat previous value in row X times.
-      int k = 0;
-      while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
-        if (j < 1) {
-          tmp_s[j] = rnd->Rand16();
-        } else if (val & 0x20) {  // Increment by a value within the limit.
-          tmp_s[j] = static_cast<uint16_t>(tmp_s[j - 1] + (limit - 1));
-        } else {  // Decrement by a value within the limit.
-          tmp_s[j] = static_cast<uint16_t>(tmp_s[j - 1] - (limit - 1));
-        }
-        j++;
-      }
-    }
-  }
-
-  for (int j = 0; j < kNumCoeffs;) {
-    const uint8_t val = rnd->Rand8();
-    if (val & 0x80) {
-      j++;
-    } else {  // 50% chance to repeat previous value in column X times.
-      int k = 0;
-      while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
-        if (j < 1) {
-          tmp_s[j] = rnd->Rand16();
-        } else if (val & 0x20) {  // Increment by a value within the limit.
-          tmp_s[(j % 32) * 32 + j / 32] = static_cast<uint16_t>(
-              tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] + (limit - 1));
-        } else {  // Decrement by a value within the limit.
-          tmp_s[(j % 32) * 32 + j / 32] = static_cast<uint16_t>(
-              tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] - (limit - 1));
-        }
-        j++;
-      }
-    }
-  }
-
-  for (int j = 0; j < kNumCoeffs; j++) {
-    if (i % 2) {
-      s[j] = tmp_s[j] & mask;
-    } else {
-      s[j] = tmp_s[p * (j % p) + j / p] & mask;
-    }
-    ref_s[j] = s[j];
-  }
-}
-
-uint8_t GetOuterThresh(ACMRandom *rnd) {
-  return static_cast<uint8_t>(rnd->PseudoUniform(3 * MAX_LOOP_FILTER + 5));
-}
-
-uint8_t GetInnerThresh(ACMRandom *rnd) {
-  return static_cast<uint8_t>(rnd->PseudoUniform(MAX_LOOP_FILTER + 1));
-}
-
-uint8_t GetHevThresh(ACMRandom *rnd) {
-  return static_cast<uint8_t>(rnd->PseudoUniform(MAX_LOOP_FILTER + 1) >> 4);
-}
-
-template <typename func_type_t, typename params_t>
-class LoopTestParam : public ::testing::TestWithParam<params_t> {
- public:
-  virtual ~LoopTestParam() {}
-  virtual void SetUp() {
-    loopfilter_op_ = std::get<0>(this->GetParam());
-    ref_loopfilter_op_ = std::get<1>(this->GetParam());
-    bit_depth_ = std::get<2>(this->GetParam());
-    mask_ = (1 << bit_depth_) - 1;
-  }
-
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  int bit_depth_;
-  int mask_;
-  func_type_t loopfilter_op_;
-  func_type_t ref_loopfilter_op_;
-};
-
-void call_filter(uint16_t *s, LOOP_PARAM, int bd, hbdloop_op_t op) {
-  op(s, p, blimit, limit, thresh, bd);
-}
-void call_dualfilter(uint16_t *s, DUAL_LOOP_PARAM, int bd,
-                     hbddual_loop_op_t op) {
-  op(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd);
-}
-
-typedef LoopTestParam<hbdloop_op_t, hbdloop_param_t> Loop8Test6Param_hbd;
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param_hbd);
-typedef LoopTestParam<hbddual_loop_op_t, hbddual_loop_param_t>
-    Loop8Test9Param_hbd;
-GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test9Param_hbd);
-
-#define OPCHECK(a, b)                                                          \
-  ACMRandom rnd(ACMRandom::DeterministicSeed());                               \
-  const int count_test_block = number_of_iterations;                           \
-  const int32_t p = kNumCoeffs / 32;                                           \
-  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                        \
-  DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]);                                    \
-  int err_count_total = 0;                                                     \
-  int first_failure = -1;                                                      \
-  for (int i = 0; i < count_test_block; ++i) {                                 \
-    int err_count = 0;                                                         \
-    uint8_t tmp = GetOuterThresh(&rnd);                                        \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
-    tmp = GetInnerThresh(&rnd);                                                \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,     \
-                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };   \
-    tmp = GetHevThresh(&rnd);                                                  \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
-    InitInput<a, b>(s, ref_s, &rnd, *limit, mask_, p, i);                      \
-    call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_,       \
-                ref_loopfilter_op_);                                           \
-    ASM_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit,      \
-                                         thresh, bit_depth_, loopfilter_op_)); \
-    for (int j = 0; j < kNumCoeffs; ++j) {                                     \
-      err_count += ref_s[j] != s[j];                                           \
-    }                                                                          \
-    if (err_count && !err_count_total) {                                       \
-      first_failure = i;                                                       \
-    }                                                                          \
-    err_count_total += err_count;                                              \
-  }                                                                            \
-  EXPECT_EQ(0, err_count_total)                                                \
-      << "Error: Loop8Test6Param, C output doesn't match SIMD "                \
-         "loopfilter output. "                                                 \
-      << "First failed at test case " << first_failure;
-
-TEST_P(Loop8Test6Param_hbd, OperationCheck) { OPCHECK(uint16_t, 16); }
-
-#define VALCHECK(a, b)                                                         \
-  ACMRandom rnd(ACMRandom::DeterministicSeed());                               \
-  const int count_test_block = number_of_iterations;                           \
-  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                        \
-  DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]);                                    \
-  int err_count_total = 0;                                                     \
-  int first_failure = -1;                                                      \
-  for (int i = 0; i < count_test_block; ++i) {                                 \
-    int err_count = 0;                                                         \
-    uint8_t tmp = GetOuterThresh(&rnd);                                        \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
-    tmp = GetInnerThresh(&rnd);                                                \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,     \
-                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };   \
-    tmp = GetHevThresh(&rnd);                                                  \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
-    int32_t p = kNumCoeffs / 32;                                               \
-    for (int j = 0; j < kNumCoeffs; ++j) {                                     \
-      s[j] = rnd.Rand16() & mask_;                                             \
-      ref_s[j] = s[j];                                                         \
-    }                                                                          \
-    call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_,       \
-                ref_loopfilter_op_);                                           \
-    ASM_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit,      \
-                                         thresh, bit_depth_, loopfilter_op_)); \
-    for (int j = 0; j < kNumCoeffs; ++j) {                                     \
-      err_count += ref_s[j] != s[j];                                           \
-    }                                                                          \
-    if (err_count && !err_count_total) {                                       \
-      first_failure = i;                                                       \
-    }                                                                          \
-    err_count_total += err_count;                                              \
-  }                                                                            \
-  EXPECT_EQ(0, err_count_total)                                                \
-      << "Error: Loop8Test6Param, C output doesn't match SIMD "                \
-         "loopfilter output. "                                                 \
-      << "First failed at test case " << first_failure;
-
-TEST_P(Loop8Test6Param_hbd, ValueCheck) { VALCHECK(uint16_t, 16); }
-
-#define SPEEDCHECK(a, b)                                                      \
-  ACMRandom rnd(ACMRandom::DeterministicSeed());                              \
-  const int count_test_block = kSpeedTestNum;                                 \
-  const int32_t bd = bit_depth_;                                              \
-  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                       \
-  uint8_t tmp = GetOuterThresh(&rnd);                                         \
-  DECLARE_ALIGNED(16, const uint8_t,                                          \
-                  blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,     \
-                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };   \
-  tmp = GetInnerThresh(&rnd);                                                 \
-  DECLARE_ALIGNED(16, const uint8_t,                                          \
-                  limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,      \
-                                 tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };    \
-  tmp = GetHevThresh(&rnd);                                                   \
-  DECLARE_ALIGNED(16, const uint8_t,                                          \
-                  thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,     \
-                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };   \
-  int32_t p = kNumCoeffs / 32;                                                \
-  for (int j = 0; j < kNumCoeffs; ++j) {                                      \
-    s[j] = rnd.Rand16() & mask_;                                              \
-  }                                                                           \
-  for (int i = 0; i < count_test_block; ++i) {                                \
-    call_filter(s + 8 + p * 8, p, blimit, limit, thresh, bd, loopfilter_op_); \
-  }
-
-TEST_P(Loop8Test6Param_hbd, DISABLED_Speed) { SPEEDCHECK(uint16_t, 16); }
-
-#define OPCHECKd(a, b)                                                         \
-  ACMRandom rnd(ACMRandom::DeterministicSeed());                               \
-  const int count_test_block = number_of_iterations;                           \
-  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                        \
-  DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]);                                    \
-  int err_count_total = 0;                                                     \
-  int first_failure = -1;                                                      \
-  for (int i = 0; i < count_test_block; ++i) {                                 \
-    int err_count = 0;                                                         \
-    uint8_t tmp = GetOuterThresh(&rnd);                                        \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-    tmp = GetInnerThresh(&rnd);                                                \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
-    tmp = GetHevThresh(&rnd);                                                  \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-    tmp = GetOuterThresh(&rnd);                                                \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-    tmp = GetInnerThresh(&rnd);                                                \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
-    tmp = GetHevThresh(&rnd);                                                  \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-    int32_t p = kNumCoeffs / 32;                                               \
-    const uint8_t limit = *limit0 < *limit1 ? *limit0 : *limit1;               \
-    InitInput<a, b>(s, ref_s, &rnd, limit, mask_, p, i);                       \
-    call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,   \
-                    limit1, thresh1, bit_depth_, ref_loopfilter_op_);          \
-    ASM_REGISTER_STATE_CHECK(                                                  \
-        call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,   \
-                        limit1, thresh1, bit_depth_, loopfilter_op_));         \
-    for (int j = 0; j < kNumCoeffs; ++j) {                                     \
-      err_count += ref_s[j] != s[j];                                           \
-    }                                                                          \
-    if (err_count && !err_count_total) {                                       \
-      first_failure = i;                                                       \
-    }                                                                          \
-    err_count_total += err_count;                                              \
-  }                                                                            \
-  EXPECT_EQ(0, err_count_total)                                                \
-      << "Error: Loop8Test9Param, C output doesn't match SIMD "                \
-         "loopfilter output. "                                                 \
-      << "First failed at test case " << first_failure;
-
-TEST_P(Loop8Test9Param_hbd, OperationCheck) { OPCHECKd(uint16_t, 16); }
-
-#define VALCHECKd(a, b)                                                        \
-  ACMRandom rnd(ACMRandom::DeterministicSeed());                               \
-  const int count_test_block = number_of_iterations;                           \
-  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                        \
-  DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]);                                    \
-  int err_count_total = 0;                                                     \
-  int first_failure = -1;                                                      \
-  for (int i = 0; i < count_test_block; ++i) {                                 \
-    int err_count = 0;                                                         \
-    uint8_t tmp = GetOuterThresh(&rnd);                                        \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-    tmp = GetInnerThresh(&rnd);                                                \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
-    tmp = GetHevThresh(&rnd);                                                  \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-    tmp = GetOuterThresh(&rnd);                                                \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-    tmp = GetInnerThresh(&rnd);                                                \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
-                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
-    tmp = GetHevThresh(&rnd);                                                  \
-    DECLARE_ALIGNED(16, const uint8_t,                                         \
-                    thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-    int32_t p = kNumCoeffs / 32;                                               \
-    for (int j = 0; j < kNumCoeffs; ++j) {                                     \
-      s[j] = rnd.Rand16() & mask_;                                             \
-      ref_s[j] = s[j];                                                         \
-    }                                                                          \
-    call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,   \
-                    limit1, thresh1, bit_depth_, ref_loopfilter_op_);          \
-    ASM_REGISTER_STATE_CHECK(                                                  \
-        call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,   \
-                        limit1, thresh1, bit_depth_, loopfilter_op_));         \
-    for (int j = 0; j < kNumCoeffs; ++j) {                                     \
-      err_count += ref_s[j] != s[j];                                           \
-    }                                                                          \
-    if (err_count && !err_count_total) {                                       \
-      first_failure = i;                                                       \
-    }                                                                          \
-    err_count_total += err_count;                                              \
-  }                                                                            \
-  EXPECT_EQ(0, err_count_total)                                                \
-      << "Error: Loop8Test9Param, C output doesn't match SIMD "                \
-         "loopfilter output. "                                                 \
-      << "First failed at test case " << first_failure;
-
-TEST_P(Loop8Test9Param_hbd, ValueCheck) { VALCHECKd(uint16_t, 16); }
-
-#define SPEEDCHECKd(a, b)                                                    \
-  ACMRandom rnd(ACMRandom::DeterministicSeed());                             \
-  const int count_test_block = kSpeedTestNum;                                \
-  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                      \
-  uint8_t tmp = GetOuterThresh(&rnd);                                        \
-  DECLARE_ALIGNED(16, const uint8_t,                                         \
-                  blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-  tmp = GetInnerThresh(&rnd);                                                \
-  DECLARE_ALIGNED(16, const uint8_t,                                         \
-                  limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
-                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
-  tmp = GetHevThresh(&rnd);                                                  \
-  DECLARE_ALIGNED(16, const uint8_t,                                         \
-                  thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-  tmp = GetOuterThresh(&rnd);                                                \
-  DECLARE_ALIGNED(16, const uint8_t,                                         \
-                  blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-  tmp = GetInnerThresh(&rnd);                                                \
-  DECLARE_ALIGNED(16, const uint8_t,                                         \
-                  limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,    \
-                                  tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };  \
-  tmp = GetHevThresh(&rnd);                                                  \
-  DECLARE_ALIGNED(16, const uint8_t,                                         \
-                  thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,   \
-                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
-  int32_t p = kNumCoeffs / 32;                                               \
-  for (int j = 0; j < kNumCoeffs; ++j) {                                     \
-    s[j] = rnd.Rand16() & mask_;                                             \
-  }                                                                          \
-  for (int i = 0; i < count_test_block; ++i) {                               \
-    call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,     \
-                    limit1, thresh1, bit_depth_, loopfilter_op_);            \
-  }
-
-TEST_P(Loop8Test9Param_hbd, DISABLED_Speed) { SPEEDCHECKd(uint16_t, 16); }
-
-using std::make_tuple;
-
-#if HAVE_SSE2
-const hbdloop_param_t kHbdLoop8Test6[] = {
-  make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
-             8),
-  make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 8),
-  make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c,
-             8),
-  make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
-             8),
-  make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
-             &aom_highbd_lpf_horizontal_14_c, 8),
-  make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 8),
-  make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 8),
-
-  make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c,
-             8),
-  make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
-             10),
-  make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 10),
-  make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c,
-             10),
-  make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
-             10),
-  make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
-             &aom_highbd_lpf_horizontal_14_c, 10),
-  make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 10),
-  make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 10),
-  make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c,
-             10),
-  make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
-             12),
-  make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 12),
-  make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c,
-             12),
-  make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
-             12),
-  make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
-             &aom_highbd_lpf_horizontal_14_c, 12),
-  make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c,
-             12),
-  make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 12),
-  make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 12)
-};
-
-INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test6Param_hbd,
-                         ::testing::ValuesIn(kHbdLoop8Test6));
-
-#endif  // HAVE_SSE2
-
-#if HAVE_SSE2
-const hbddual_loop_param_t kHbdLoop8Test9[] = {
-  make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
-             &aom_highbd_lpf_horizontal_4_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2,
-             &aom_highbd_lpf_horizontal_6_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
-             &aom_highbd_lpf_horizontal_8_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
-             &aom_highbd_lpf_horizontal_14_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
-             &aom_highbd_lpf_vertical_4_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2,
-             &aom_highbd_lpf_vertical_6_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
-             &aom_highbd_lpf_vertical_8_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
-             &aom_highbd_lpf_vertical_14_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
-             &aom_highbd_lpf_horizontal_4_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2,
-             &aom_highbd_lpf_horizontal_6_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
-             &aom_highbd_lpf_horizontal_8_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
-             &aom_highbd_lpf_horizontal_14_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
-             &aom_highbd_lpf_vertical_4_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2,
-             &aom_highbd_lpf_vertical_6_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
-             &aom_highbd_lpf_vertical_8_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
-             &aom_highbd_lpf_vertical_14_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
-             &aom_highbd_lpf_horizontal_4_dual_c, 12),
-  make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2,
-             &aom_highbd_lpf_horizontal_6_dual_c, 12),
-  make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
-             &aom_highbd_lpf_horizontal_8_dual_c, 12),
-  make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
-             &aom_highbd_lpf_horizontal_14_dual_c, 12),
-  make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
-             &aom_highbd_lpf_vertical_4_dual_c, 12),
-  make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2,
-             &aom_highbd_lpf_vertical_6_dual_c, 12),
-  make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
-             &aom_highbd_lpf_vertical_8_dual_c, 12),
-  make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
-             &aom_highbd_lpf_vertical_14_dual_c, 12),
-};
-
-INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test9Param_hbd,
-                         ::testing::ValuesIn(kHbdLoop8Test9));
-
-#endif  // HAVE_SSE2
-
-#if HAVE_AVX2
-const hbddual_loop_param_t kHbdLoop8Test9Avx2[] = {
-  make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
-             &aom_highbd_lpf_horizontal_4_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
-             &aom_highbd_lpf_horizontal_4_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
-             &aom_highbd_lpf_horizontal_4_dual_c, 12),
-  make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
-             &aom_highbd_lpf_horizontal_8_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
-             &aom_highbd_lpf_horizontal_8_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
-             &aom_highbd_lpf_horizontal_8_dual_c, 12),
-  make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
-             &aom_highbd_lpf_vertical_4_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
-             &aom_highbd_lpf_vertical_4_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
-             &aom_highbd_lpf_vertical_4_dual_c, 12),
-  make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
-             &aom_highbd_lpf_vertical_8_dual_c, 8),
-  make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
-             &aom_highbd_lpf_vertical_8_dual_c, 10),
-  make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
-             &aom_highbd_lpf_vertical_8_dual_c, 12),
-};
-
-INSTANTIATE_TEST_SUITE_P(AVX2, Loop8Test9Param_hbd,
-                         ::testing::ValuesIn(kHbdLoop8Test9Avx2));
-#endif
-}  // namespace
-#endif  // !CONFIG_NEW_DF
diff --git a/test/test.cmake b/test/test.cmake
index 1c6be67..a402306 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -95,7 +95,6 @@
     "${AOM_ROOT}/test/hiprec_convolve_test_util.h"
     "${AOM_ROOT}/test/intrabc_test.cc"
     "${AOM_ROOT}/test/intrapred_test.cc"
-    "${AOM_ROOT}/test/lpf_test.cc"
     "${AOM_ROOT}/test/opt_flow_test.cc"
     "${AOM_ROOT}/test/scan_test.cc"
     "${AOM_ROOT}/test/selfguided_filter_test.cc"