Add experiment CONFIG_CDEF_SINGLEPASS: Make CDEF single pass

Low latency, cpu-used=0:
   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.3162 | -0.6719 | -0.6535 |   0.0089 | -0.3890 | -0.1515 |    -0.6682

High latency, cpu-used=0:
   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.0293 | -0.3556 | -0.5505 |   0.0684 | -0.0862 |  0.0513 |    -0.2765

Low latency, cpu-used=4:
   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.2248 | -0.7764 | -0.6630 |  -0.2109 | -0.3240 | -0.2532 |    -0.6980

High latency, cpu-used=4:
   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.1118 | -0.5841 | -0.7406 |  -0.0463 | -0.2442 | -0.1064 |    -0.4187

Change-Id: I9ca8399c8f45489541a66f535fb3d771eb1d59ab
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 46e55bb..e7d7e10 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -236,13 +236,17 @@
 if (CONFIG_CDEF)
   set(AOM_AV1_COMMON_SOURCES
       ${AOM_AV1_COMMON_SOURCES}
-      "${AOM_ROOT}/av1/common/clpf.c"
-      "${AOM_ROOT}/av1/common/clpf_simd.h"
       "${AOM_ROOT}/av1/common/cdef.c"
       "${AOM_ROOT}/av1/common/cdef.h"
       "${AOM_ROOT}/av1/common/cdef_block.c"
-      "${AOM_ROOT}/av1/common/cdef_block.h"
-      "${AOM_ROOT}/av1/common/cdef_block_simd.h")
+      "${AOM_ROOT}/av1/common/cdef_block.h")
+
+  if (NOT CONFIG_CDEF_SINGLEPASS)
+    list(APPEND AOM_AV1_COMMON_SOURCES
+         "${AOM_ROOT}/av1/common/clpf.c"
+         "${AOM_ROOT}/av1/common/clpf_simd.h"
+         "${AOM_ROOT}/av1/common/cdef_block_simd.h")
+  endif ()
 
   set(AOM_AV1_ENCODER_SOURCES
       ${AOM_AV1_ENCODER_SOURCES}
@@ -250,22 +254,33 @@
 
   set(AOM_AV1_COMMON_INTRIN_SSE2
       ${AOM_AV1_COMMON_INTRIN_SSE2}
-      "${AOM_ROOT}/av1/common/clpf_sse2.c"
       "${AOM_ROOT}/av1/common/cdef_block_sse2.c")
 
   set(AOM_AV1_COMMON_INTRIN_SSSE3
       ${AOM_AV1_COMMON_INTRIN_SSSE3}
-      "${AOM_ROOT}/av1/common/clpf_ssse3.c"
       "${AOM_ROOT}/av1/common/cdef_block_ssse3.c")
 
   set(AOM_AV1_COMMON_INTRIN_SSE4_1
       ${AOM_AV1_COMMON_INTRIN_SSE4_1}
-      "${AOM_ROOT}/av1/common/clpf_sse4.c"
       "${AOM_ROOT}/av1/common/cdef_block_sse4.c")
 
+  set(AOM_AV1_COMMON_INTRIN_AVX2
+      ${AOM_AV1_COMMON_INTRIN_AVX2}
+      "${AOM_ROOT}/av1/common/cdef_block_avx2.c")
+
   set(AOM_AV1_COMMON_INTRIN_NEON
       ${AOM_AV1_COMMON_INTRIN_NEON}
-      "${AOM_ROOT}/av1/common/clpf_neon.c"
       "${AOM_ROOT}/av1/common/cdef_block_neon.c")
+
+  if (NOT CONFIG_CDEF_SINGLEPASS)
+    list(APPEND AOM_AV1_COMMON_INTRIN_SSE2
+         "${AOM_ROOT}/av1/common/clpf_sse2.c")
+    list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3
+         "${AOM_ROOT}/av1/common/clpf_ssse3.c")
+    list(APPEND AOM_AV1_COMMON_INTRIN_SSE4_1
+         "${AOM_ROOT}/av1/common/clpf_sse4.c")
+    list(APPEND AOM_AV1_COMMON_INTRIN_NEON
+         "${AOM_ROOT}/av1/common/clpf_neon.c")
+  endif ()
 endif ()
 
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 5411229..d5d40af 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -90,12 +90,16 @@
 AV1_COMMON_SRCS-yes += common/warped_motion.c
 endif
 ifeq ($(CONFIG_CDEF),yes)
+ifeq ($(CONFIG_CDEF_SINGLEPASS),yes)
+AV1_COMMON_SRCS-$(HAVE_AVX2) += common/cdef_block_avx2.c
+else
 AV1_COMMON_SRCS-yes += common/clpf.c
 AV1_COMMON_SRCS-yes += common/clpf_simd.h
 AV1_COMMON_SRCS-$(HAVE_SSE2) += common/clpf_sse2.c
 AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/clpf_ssse3.c
 AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/clpf_sse4.c
 AV1_COMMON_SRCS-$(HAVE_NEON) += common/clpf_neon.c
+endif
 AV1_COMMON_SRCS-$(HAVE_SSE2) += common/cdef_block_sse2.c
 AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/cdef_block_ssse3.c
 AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/cdef_block_sse4.c
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index edc07f8..dc6a844 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -520,18 +520,22 @@
 # Deringing Functions
 
 if (aom_config("CONFIG_CDEF") eq "yes") {
-  add_proto qw/void aom_clpf_block_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
-  add_proto qw/void aom_clpf_hblock_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
-  add_proto qw/void aom_clpf_block/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
-  add_proto qw/void aom_clpf_hblock/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
   add_proto qw/int cdef_find_dir/, "const uint16_t *img, int stride, int32_t *var, int coeff_shift";
-  add_proto qw/void cdef_direction_4x4/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping";
-  add_proto qw/void cdef_direction_8x8/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping";
+  if (aom_config("CONFIG_CDEF_SINGLEPASS") ne "yes") {
+    add_proto qw/void aom_clpf_block_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
+    add_proto qw/void aom_clpf_hblock_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
+    add_proto qw/void aom_clpf_block/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
+    add_proto qw/void aom_clpf_hblock/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
+    add_proto qw/void cdef_direction_4x4/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping";
+    add_proto qw/void cdef_direction_8x8/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping";
+    add_proto qw/void copy_8x8_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
+    add_proto qw/void copy_4x4_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
+    add_proto qw/void copy_8x8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride";
+    add_proto qw/void copy_4x4_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride";
+  } else {
+    add_proto qw/void cdef_filter_block/, "uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max";
+  }
 
-  add_proto qw/void copy_8x8_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
-  add_proto qw/void copy_4x4_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
-  add_proto qw/void copy_8x8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride";
-  add_proto qw/void copy_4x4_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride";
   add_proto qw/void copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h";
   add_proto qw/void copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h";
 
@@ -539,20 +543,27 @@
   # structs as arguments, which makes the v256 type of the intrinsics
   # hard to support, so optimizations for this target are disabled.
   if ($opts{config} !~ /libs-x86-win32-vs.*/) {
-    specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
-    specialize qw/aom_clpf_hblock_hbd sse2 ssse3 sse4_1 neon/;
-    specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
-    specialize qw/aom_clpf_hblock sse2 ssse3 sse4_1 neon/;
-    specialize qw/cdef_find_dir sse2 ssse3 sse4_1 neon/;
-    specialize qw/cdef_direction_4x4 sse2 ssse3 sse4_1 neon/;
-    specialize qw/cdef_direction_8x8 sse2 ssse3 sse4_1 neon/;
+    if (aom_config("CONFIG_CDEF_SINGLEPASS") eq "yes") {
+      specialize qw/cdef_find_dir sse2 ssse3 sse4_1 avx2 neon/;
+      specialize qw/cdef_filter_block sse2 ssse3 sse4_1 avx2 neon/;
+      specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
+      specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
+    } else {
+      specialize qw/cdef_find_dir sse2 ssse3 sse4_1 neon/;
+      specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
+      specialize qw/aom_clpf_hblock_hbd sse2 ssse3 sse4_1 neon/;
+      specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
+      specialize qw/aom_clpf_hblock sse2 ssse3 sse4_1 neon/;
+      specialize qw/cdef_direction_4x4 sse2 ssse3 sse4_1 neon/;
+      specialize qw/cdef_direction_8x8 sse2 ssse3 sse4_1 neon/;
 
-    specialize qw/copy_8x8_16bit_to_8bit sse2 ssse3 sse4_1 neon/;
-    specialize qw/copy_4x4_16bit_to_8bit sse2 ssse3 sse4_1 neon/;
-    specialize qw/copy_8x8_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
-    specialize qw/copy_4x4_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
-    specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 neon/;
-    specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
+      specialize qw/copy_8x8_16bit_to_8bit sse2 ssse3 sse4_1 neon/;
+      specialize qw/copy_4x4_16bit_to_8bit sse2 ssse3 sse4_1 neon/;
+      specialize qw/copy_8x8_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
+      specialize qw/copy_4x4_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
+      specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 neon/;
+      specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
+    }
   }
 }
 
diff --git a/av1/common/cdef.c b/av1/common/cdef.c
index 8bb3874..8417fea 100644
--- a/av1/common/cdef.c
+++ b/av1/common/cdef.c
@@ -260,14 +260,21 @@
            uv_sec_strength == 0) ||
           (cdef_count = sb_compute_cdef_list(
                cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist,
-               get_filter_skip(level) || get_filter_skip(uv_level))) == 0) {
+#if CONFIG_CDEF_SINGLEPASS
+               (level & 1) || (uv_level & 1))) == 0)
+#else
+               get_filter_skip(level) || get_filter_skip(uv_level))) == 0)
+#endif
+      {
         cdef_left = 0;
         continue;
       }
 
       curr_row_cdef[fbc] = 1;
       for (pli = 0; pli < nplanes; pli++) {
+#if !CONFIG_CDEF_SINGLEPASS
         uint16_t dst[CDEF_BLOCKSIZE * CDEF_BLOCKSIZE];
+#endif
         int coffset;
         int rend, cend;
         int pri_damping = cm->cdef_pri_damping;
@@ -386,15 +393,28 @@
 #if CONFIG_HIGHBITDEPTH
         if (cm->use_highbitdepth) {
           cdef_filter_fb(
+#if CONFIG_CDEF_SINGLEPASS
+              NULL,
+              &CONVERT_TO_SHORTPTR(
+#else
               (uint8_t *)&CONVERT_TO_SHORTPTR(
+#endif
                   xd->plane[pli]
                       .dst.buf)[xd->plane[pli].dst.stride *
                                     (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
                                 (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
+#if CONFIG_CDEF_SINGLEPASS
+              xd->plane[pli].dst.stride,
+#else
               xd->plane[pli].dst.stride, dst,
+#endif
               &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
               ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
+#if CONFIG_CDEF_SINGLEPASS
+              sec_strength, pri_damping, sec_damping, coeff_shift);
+#else
               sec_strength, sec_damping, pri_damping, coeff_shift, 0, 1);
+#endif
         } else {
 #endif
           cdef_filter_fb(
@@ -402,10 +422,18 @@
                    .dst.buf[xd->plane[pli].dst.stride *
                                 (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
                             (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
+#if CONFIG_CDEF_SINGLEPASS
+              NULL, xd->plane[pli].dst.stride,
+#else
               xd->plane[pli].dst.stride, dst,
+#endif
               &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
               ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
+#if CONFIG_CDEF_SINGLEPASS
+              sec_strength, pri_damping, sec_damping, coeff_shift);
+#else
               sec_strength, sec_damping, pri_damping, coeff_shift, 0, 0);
+#endif
 
 #if CONFIG_HIGHBITDEPTH
         }
diff --git a/av1/common/cdef_block.c b/av1/common/cdef_block.c
index 3fe836a..874644a 100644
--- a/av1/common/cdef_block.c
+++ b/av1/common/cdef_block.c
@@ -21,6 +21,7 @@
 #include "./cdef.h"
 
 /* Generated from gen_filter_tables.c. */
+#if !CONFIG_CDEF_SINGLEPASS || CDEF_FULL
 const int cdef_directions[8][3] = {
   { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2, -3 * CDEF_BSTRIDE + 3 },
   { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2, -1 * CDEF_BSTRIDE + 3 },
@@ -31,6 +32,18 @@
   { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0, 3 * CDEF_BSTRIDE + 0 },
   { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1, 3 * CDEF_BSTRIDE - 1 }
 };
+#else
+const int cdef_directions[8][2] = {
+  { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 },
+  { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 },
+  { 0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2 },
+  { 0 * CDEF_BSTRIDE + 1, 1 * CDEF_BSTRIDE + 2 },
+  { 1 * CDEF_BSTRIDE + 1, 2 * CDEF_BSTRIDE + 2 },
+  { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 1 },
+  { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 },
+  { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 }
+};
+#endif
 
 /* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
    The search minimizes the weighted variance along all the lines in a
@@ -110,6 +123,94 @@
   return best_dir;
 }
 
+#if CONFIG_CDEF_SINGLEPASS
+#if CDEF_FULL
+const int cdef_pri_taps[2][3] = { { 3, 2, 1 }, { 2, 2, 2 } };
+const int cdef_sec_taps[2][2] = { { 3, 1 }, { 3, 1 } };
+#else
+const int cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };
+const int cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } };
+#endif
+
+/* Smooth in the direction detected. */
+#if CDEF_CAP
+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride,
+                         const uint16_t *in, int pri_strength, int sec_strength,
+                         int dir, int pri_damping, int sec_damping, int bsize,
+                         UNUSED int max_unused)
+#else
+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride,
+                         const uint16_t *in, int pri_strength, int sec_strength,
+                         int dir, int pri_damping, int sec_damping, int bsize,
+                         int max)
+#endif
+{
+  int i, j, k;
+  const int s = CDEF_BSTRIDE;
+  const int *pri_taps = cdef_pri_taps[pri_strength & 1];
+  const int *sec_taps = cdef_sec_taps[pri_strength & 1];
+  for (i = 0; i < 4 << (bsize == BLOCK_8X8); i++) {
+    for (j = 0; j < 4 << (bsize == BLOCK_8X8); j++) {
+      int16_t sum = 0;
+      int16_t y;
+      int16_t x = in[i * s + j];
+#if CDEF_CAP
+      int max = x;
+      int min = x;
+#endif
+#if CDEF_FULL
+      for (k = 0; k < 3; k++)
+#else
+      for (k = 0; k < 2; k++)
+#endif
+      {
+        int16_t p0 = in[i * s + j + cdef_directions[dir][k]];
+        int16_t p1 = in[i * s + j - cdef_directions[dir][k]];
+        sum += pri_taps[k] * constrain(p0 - x, pri_strength, pri_damping);
+        sum += pri_taps[k] * constrain(p1 - x, pri_strength, pri_damping);
+#if CDEF_CAP
+        if (p0 != CDEF_VERY_LARGE) max = AOMMAX(p0, max);
+        if (p1 != CDEF_VERY_LARGE) max = AOMMAX(p1, max);
+        min = AOMMIN(p0, min);
+        min = AOMMIN(p1, min);
+#endif
+#if CDEF_FULL
+        if (k == 2) continue;
+#endif
+        int16_t s0 = in[i * s + j + cdef_directions[(dir + 2) & 7][k]];
+        int16_t s1 = in[i * s + j - cdef_directions[(dir + 2) & 7][k]];
+        int16_t s2 = in[i * s + j + cdef_directions[(dir + 6) & 7][k]];
+        int16_t s3 = in[i * s + j - cdef_directions[(dir + 6) & 7][k]];
+#if CDEF_CAP
+        if (s0 != CDEF_VERY_LARGE) max = AOMMAX(s0, max);
+        if (s1 != CDEF_VERY_LARGE) max = AOMMAX(s1, max);
+        if (s2 != CDEF_VERY_LARGE) max = AOMMAX(s2, max);
+        if (s3 != CDEF_VERY_LARGE) max = AOMMAX(s3, max);
+        min = AOMMIN(s0, min);
+        min = AOMMIN(s1, min);
+        min = AOMMIN(s2, min);
+        min = AOMMIN(s3, min);
+#endif
+        sum += sec_taps[k] * constrain(s0 - x, sec_strength, sec_damping);
+        sum += sec_taps[k] * constrain(s1 - x, sec_strength, sec_damping);
+        sum += sec_taps[k] * constrain(s2 - x, sec_strength, sec_damping);
+        sum += sec_taps[k] * constrain(s3 - x, sec_strength, sec_damping);
+      }
+#if CDEF_CAP
+      y = clamp((int16_t)x + ((8 + sum - (sum < 0)) >> 4), min, max);
+#else
+      y = clamp((int16_t)x + ((8 + sum - (sum < 0)) >> 4), 0, max);
+#endif
+      if (dst8)
+        dst8[i * dstride + j] = (uint8_t)y;
+      else
+        dst16[i * dstride + j] = (uint16_t)y;
+    }
+  }
+}
+
+#else
+
 /* Smooth in the direction detected. */
 void cdef_direction_8x8_c(uint16_t *y, int ystride, const uint16_t *in,
                           int threshold, int dir, int damping) {
@@ -167,6 +268,7 @@
     }
   }
 }
+#endif
 
 /* Compute the primary filter strength for an 8x8 block based on the
    directional variance difference. A high variance difference means
@@ -180,6 +282,7 @@
   return var ? (strength * (4 + i) + 8) >> 4 : 0;
 }
 
+#if !CONFIG_CDEF_SINGLEPASS
 void copy_8x8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src,
                                int sstride) {
   int i, j;
@@ -303,25 +406,56 @@
                     cdef_list *dlist, int cdef_count, int level,
                     int sec_strength, int sec_damping, int pri_damping,
                     int coeff_shift, int skip_dering, int hbd) {
+#else
+
+void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
+                    int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
+                    int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
+                    cdef_list *dlist, int cdef_count, int level,
+                    int sec_strength, int pri_damping, int sec_damping,
+                    int coeff_shift) {
+#endif
   int bi;
   int bx;
   int by;
   int bsize, bsizex, bsizey;
 
+#if CONFIG_CDEF_SINGLEPASS
+  int pri_strength = (level >> 1) << coeff_shift;
+  int filter_skip = level & 1;
+  if (!pri_strength && !sec_strength && filter_skip) {
+    pri_strength = 19 << coeff_shift;
+    sec_strength = 7 << coeff_shift;
+  }
+#else
   int threshold = (level >> 1) << coeff_shift;
   int filter_skip = get_filter_skip(level);
   if (level == 1) threshold = 31 << coeff_shift;
 
   cdef_direction_func cdef_direction[] = { cdef_direction_4x4,
                                            cdef_direction_8x8 };
+#endif
   sec_damping += coeff_shift - (pli != AOM_PLANE_Y);
   pri_damping += coeff_shift - (pli != AOM_PLANE_Y);
   bsize =
       ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8);
   bsizex = 3 - xdec;
   bsizey = 3 - ydec;
-
-  if (!skip_dering) {
+#if CONFIG_CDEF_SINGLEPASS
+  if (dirinit && pri_strength == 0 && sec_strength == 0)
+#else
+  if (!skip_dering)
+#endif
+  {
+#if CONFIG_CDEF_SINGLEPASS
+    // If we're here, both primary and secondary strengths are 0, and
+    // we still haven't written anything to y[] yet, so we just copy
+    // the input to y[]. This is necessary only for av1_cdef_search()
+    // and only av1_cdef_search() sets dirinit.
+    for (bi = 0; bi < cdef_count; bi++) {
+      by = dlist[bi].by;
+      bx = dlist[bi].bx;
+#else
     if (pli == 0) {
       if (!dirinit || !*dirinit) {
         for (bi = 0; bi < cdef_count; bi++) {
@@ -394,12 +528,56 @@
     for (bi = 0; bi < cdef_count; bi++) {
       by = dlist[bi].by;
       bx = dlist[bi].bx;
+#endif
       int iy, ix;
       // TODO(stemidts/jmvalin): SIMD optimisations
       for (iy = 0; iy < 1 << bsizey; iy++)
         for (ix = 0; ix < 1 << bsizex; ix++)
+#if CONFIG_CDEF_SINGLEPASS
+          dst16[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] =
+#else
           y[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] =
+#endif
               in[((by << bsizey) + iy) * CDEF_BSTRIDE + (bx << bsizex) + ix];
     }
+#if CONFIG_CDEF_SINGLEPASS
+    return;
+#endif
   }
+
+#if CONFIG_CDEF_SINGLEPASS
+  if (pli == 0) {
+    if (!dirinit || !*dirinit) {
+      for (bi = 0; bi < cdef_count; bi++) {
+        by = dlist[bi].by;
+        bx = dlist[bi].bx;
+        dir[by][bx] = cdef_find_dir(&in[8 * by * CDEF_BSTRIDE + 8 * bx],
+                                    CDEF_BSTRIDE, &var[by][bx], coeff_shift);
+      }
+      if (dirinit) *dirinit = 1;
+    }
+  }
+
+  assert(bsize == BLOCK_8X8 || bsize == BLOCK_4X4);
+  for (bi = 0; bi < cdef_count; bi++) {
+    int t = !filter_skip && dlist[bi].skip ? 0 : pri_strength;
+    int s = !filter_skip && dlist[bi].skip ? 0 : sec_strength;
+    by = dlist[bi].by;
+    bx = dlist[bi].bx;
+    if (dst8)
+      cdef_filter_block(
+          &dst8[(by << bsizey) * dstride + (bx << bsizex)], NULL, dstride,
+          &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
+          (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0,
+          pri_damping, sec_damping, bsize, (256 << coeff_shift) - 1);
+    else
+      cdef_filter_block(
+          NULL, &dst16[dirinit ? bi << (bsizex + bsizey)
+                               : (by << bsizey) * dstride + (bx << bsizex)],
+          dirinit ? 1 << bsizex : dstride,
+          &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
+          (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0,
+          pri_damping, sec_damping, bsize, (256 << coeff_shift) - 1);
+  }
+#endif
 }
diff --git a/av1/common/cdef_block.h b/av1/common/cdef_block.h
index 3891e2b..bf277fa 100644
--- a/av1/common/cdef_block.h
+++ b/av1/common/cdef_block.h
@@ -17,6 +17,9 @@
 #define CDEF_BLOCKSIZE 64
 #define CDEF_BLOCKSIZE_LOG2 6
 #define CDEF_NBLOCKS (CDEF_BLOCKSIZE / 8)
+#if CONFIG_CDEF_SINGLEPASS
+#define CDEF_SB_SHIFT (MAX_SB_SIZE_LOG2 - CDEF_BLOCKSIZE_LOG2)
+#endif
 
 /* We need to buffer three vertical lines. */
 #define CDEF_VBORDER (3)
@@ -28,7 +31,24 @@
 #define CDEF_VERY_LARGE (30000)
 #define CDEF_INBUF_SIZE (CDEF_BSTRIDE * (CDEF_BLOCKSIZE + 2 * CDEF_VBORDER))
 
+#if CONFIG_CDEF_SINGLEPASS
+// Filter configuration
+#define CDEF_CAP 1   // 1 = Cap change to largest diff
+#define CDEF_FULL 0  // 1 = 7x7 filter, 0 = 5x5 filter
+
+#if CDEF_FULL
+extern const int cdef_pri_taps[2][3];
+extern const int cdef_sec_taps[2][2];
 extern const int cdef_directions[8][3];
+#else
+extern const int cdef_pri_taps[2][2];
+extern const int cdef_sec_taps[2][2];
+extern const int cdef_directions[8][2];
+#endif
+
+#else  // CONFIG_CDEF_SINGLEPASS
+extern const int cdef_directions[8][3];
+#endif
 
 typedef struct {
   uint8_t by;
@@ -36,12 +56,30 @@
   uint8_t skip;
 } cdef_list;
 
+#if CONFIG_CDEF_SINGLEPASS
+typedef void (*cdef_filter_block_func)(uint8_t *dst8, uint16_t *dst16,
+                                       int dstride, const uint16_t *in,
+                                       int pri_strength, int sec_strength,
+                                       int dir, int pri_damping,
+                                       int sec_damping, int bsize, int max);
+void copy_cdef_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
+                              cdef_list *dlist, int cdef_count, int bsize);
+#else
 typedef void (*cdef_direction_func)(uint16_t *y, int ystride,
                                     const uint16_t *in, int threshold, int dir,
                                     int damping);
 
 int get_filter_skip(int level);
+#endif
 
+#if CONFIG_CDEF_SINGLEPASS
+void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
+                    int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
+                    int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
+                    cdef_list *dlist, int cdef_count, int level,
+                    int sec_strength, int pri_damping, int sec_damping,
+                    int coeff_shift);
+#else
 void cdef_filter_fb(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in,
                     int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
                     int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
@@ -49,3 +87,4 @@
                     int sec_strength, int sec_damping, int pri_damping,
                     int coeff_shift, int skip_dering, int hbd);
 #endif
+#endif
diff --git a/av1/common/cdef_block_avx2.c b/av1/common/cdef_block_avx2.c
new file mode 100644
index 0000000..5e48045
--- /dev/null
+++ b/av1/common/cdef_block_avx2.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_avx2
+#include "./cdef_block_simd.h"
diff --git a/av1/common/cdef_block_simd.h b/av1/common/cdef_block_simd.h
index 358d919..b53b88b 100644
--- a/av1/common/cdef_block_simd.h
+++ b/av1/common/cdef_block_simd.h
@@ -221,6 +221,815 @@
   return v128_xor(v128_add_16(sign, v128_min_s16(diff, s)), sign);
 }
 
+#if CONFIG_CDEF_SINGLEPASS
+// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
+SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
+                           unsigned int adjdamp) {
+  const v256 diff16 = v256_sub_16(a, b);
+  v128 diff = v128_pack_s16_s8(v256_high_v128(diff16), v256_low_v128(diff16));
+  const v128 sign = v128_cmplt_s8(diff, v128_zero());
+  diff = v128_abs_s8(diff);
+  return v128_xor(
+      v128_add_8(sign,
+                 v128_min_u8(diff, v128_ssub_u8(v128_dup_8(strength),
+                                                v128_shr_u8(diff, adjdamp)))),
+      sign);
+}
+
+#if CDEF_CAP
+void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
+                                        const uint16_t *in, int pri_strength,
+                                        int sec_strength, int dir,
+                                        int pri_damping, int sec_damping,
+                                        UNUSED int max_unused)
+#else
+void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
+                                        const uint16_t *in, int pri_strength,
+                                        int sec_strength, int dir,
+                                        int pri_damping, int sec_damping,
+                                        int max)
+#endif
+{
+  v128 p0, p1, p2, p3;
+  v256 sum, row, tap, res;
+#if CDEF_CAP
+  v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
+#endif
+  int po1 = cdef_directions[dir][0];
+  int po2 = cdef_directions[dir][1];
+#if CDEF_FULL
+  int po3 = cdef_directions[dir][2];
+#endif
+  int s1o1 = cdef_directions[(dir + 2) & 7][0];
+  int s1o2 = cdef_directions[(dir + 2) & 7][1];
+  int s2o1 = cdef_directions[(dir + 6) & 7][0];
+  int s2o2 = cdef_directions[(dir + 6) & 7][1];
+
+  const int *pri_taps = cdef_pri_taps[pri_strength & 1];
+  const int *sec_taps = cdef_sec_taps[pri_strength & 1];
+
+  if (pri_strength) pri_damping -= get_msb(pri_strength);
+  if (sec_strength) sec_damping -= get_msb(sec_strength);
+
+  sum = v256_zero();
+  row = v256_from_v64(v64_load_aligned(&in[0 * CDEF_BSTRIDE]),
+                      v64_load_aligned(&in[1 * CDEF_BSTRIDE]),
+                      v64_load_aligned(&in[2 * CDEF_BSTRIDE]),
+                      v64_load_aligned(&in[3 * CDEF_BSTRIDE]));
+#if CDEF_CAP
+  max = min = row;
+#endif
+
+  if (pri_strength) {
+    // Primary near taps
+    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po1]),
+                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po1]),
+                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po1]),
+                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po1]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p0 = constrain(tap, row, pri_strength, pri_damping);
+    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po1]),
+                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po1]),
+                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po1]),
+                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po1]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p1 = constrain(tap, row, pri_strength, pri_damping);
+
+    // sum += pri_taps[0] * (p0 + p1)
+    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]),
+                                         v256_from_v128(v128_ziphi_8(p0, p1),
+                                                        v128_ziplo_8(p0, p1))));
+
+    // Primary far taps
+    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po2]),
+                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po2]),
+                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po2]),
+                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po2]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p0 = constrain(tap, row, pri_strength, pri_damping);
+    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po2]),
+                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po2]),
+                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po2]),
+                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po2]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p1 = constrain(tap, row, pri_strength, pri_damping);
+
+    // sum += pri_taps[1] * (p0 + p1)
+    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]),
+                                         v256_from_v128(v128_ziphi_8(p0, p1),
+                                                        v128_ziplo_8(p0, p1))));
+
+#if CDEF_FULL
+    // Primary extra taps
+    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po3]),
+                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po3]),
+                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po3]),
+                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po3]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p0 = constrain(tap, row, pri_strength, pri_damping);
+    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po3]),
+                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po3]),
+                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po3]),
+                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po3]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p1 = constrain(tap, row, pri_strength, pri_damping);
+
+    // sum += pri_taps[2] * (p0 + p1)
+    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[2]),
+                                         v256_from_v128(v128_ziphi_8(p0, p1),
+                                                        v128_ziplo_8(p0, p1))));
+#endif
+  }
+
+  if (sec_strength) {
+    // Secondary near taps
+    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o1]),
+                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o1]),
+                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o1]),
+                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o1]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p0 = constrain(tap, row, sec_strength, sec_damping);
+    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o1]),
+                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o1]),
+                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o1]),
+                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o1]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p1 = constrain(tap, row, sec_strength, sec_damping);
+    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o1]),
+                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o1]),
+                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o1]),
+                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o1]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p2 = constrain(tap, row, sec_strength, sec_damping);
+    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o1]),
+                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o1]),
+                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o1]),
+                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o1]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p3 = constrain(tap, row, sec_strength, sec_damping);
+
+    // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+    p0 = v128_add_8(p0, p1);
+    p2 = v128_add_8(p2, p3);
+    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]),
+                                         v256_from_v128(v128_ziphi_8(p0, p2),
+                                                        v128_ziplo_8(p0, p2))));
+
+    // Secondary far taps
+    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o2]),
+                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o2]),
+                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o2]),
+                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o2]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p0 = constrain(tap, row, sec_strength, sec_damping);
+    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o2]),
+                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o2]),
+                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o2]),
+                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o2]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p1 = constrain(tap, row, sec_strength, sec_damping);
+    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o2]),
+                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o2]),
+                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o2]),
+                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o2]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p2 = constrain(tap, row, sec_strength, sec_damping);
+    tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o2]),
+                        v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o2]),
+                        v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o2]),
+                        v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o2]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p3 = constrain(tap, row, sec_strength, sec_damping);
+
+    // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+    p0 = v128_add_8(p0, p1);
+    p2 = v128_add_8(p2, p3);
+
+    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]),
+                                         v256_from_v128(v128_ziphi_8(p0, p2),
+                                                        v128_ziplo_8(p0, p2))));
+  }
+
+  // res = row + ((sum - (sum < 0) + 8) >> 4)
+  sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
+  res = v256_add_16(sum, v256_dup_16(8));
+  res = v256_shr_n_s16(res, 4);
+  res = v256_add_16(row, res);
+#if CDEF_CAP
+  res = v256_min_s16(v256_max_s16(res, min), max);
+#else
+  res = v256_min_s16(v256_max_s16(res, v256_zero()), v256_dup_16(max));
+#endif
+  res = v256_pack_s16_u8(res, res);
+
+  p0 = v256_low_v128(res);
+  u32_store_aligned(&dst[0 * dstride], v64_high_u32(v128_high_v64(p0)));
+  u32_store_aligned(&dst[1 * dstride], v64_low_u32(v128_high_v64(p0)));
+  u32_store_aligned(&dst[2 * dstride], v64_high_u32(v128_low_v64(p0)));
+  u32_store_aligned(&dst[3 * dstride], v64_low_u32(v128_low_v64(p0)));
+}
+
+#if CDEF_CAP
+void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
+                                        const uint16_t *in, int pri_strength,
+                                        int sec_strength, int dir,
+                                        int pri_damping, int sec_damping,
+                                        UNUSED int max_unused)
+#else
+void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
+                                        const uint16_t *in, int pri_strength,
+                                        int sec_strength, int dir,
+                                        int pri_damping, int sec_damping,
+                                        int max)
+#endif
+{
+  int i;
+  v128 p0, p1, p2, p3;
+  v256 sum, row, res, tap;
+#if CDEF_CAP
+  v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
+#endif
+  int po1 = cdef_directions[dir][0];
+  int po2 = cdef_directions[dir][1];
+#if CDEF_FULL
+  int po3 = cdef_directions[dir][2];
+#endif
+  int s1o1 = cdef_directions[(dir + 2) & 7][0];
+  int s1o2 = cdef_directions[(dir + 2) & 7][1];
+  int s2o1 = cdef_directions[(dir + 6) & 7][0];
+  int s2o2 = cdef_directions[(dir + 6) & 7][1];
+
+  const int *pri_taps = cdef_pri_taps[pri_strength & 1];
+  const int *sec_taps = cdef_sec_taps[pri_strength & 1];
+
+  if (pri_strength) pri_damping -= get_msb(pri_strength);
+  if (sec_strength) sec_damping -= get_msb(sec_strength);
+  for (i = 0; i < 8; i += 2) {
+    sum = v256_zero();
+    row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
+                         v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
+
+#if CDEF_CAP
+    max = min = row;
+#endif
+    // Primary near taps
+    tap =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p0 = constrain(tap, row, pri_strength, pri_damping);
+    tap =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p1 = constrain(tap, row, pri_strength, pri_damping);
+
+    // sum += pri_taps[0] * (p0 + p1)
+    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]),
+                                         v256_from_v128(v128_ziphi_8(p0, p1),
+                                                        v128_ziplo_8(p0, p1))));
+
+    // Primary far taps
+    tap =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p0 = constrain(tap, row, pri_strength, pri_damping);
+    tap =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p1 = constrain(tap, row, pri_strength, pri_damping);
+
+    // sum += pri_taps[1] * (p0 + p1)
+    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]),
+                                         v256_from_v128(v128_ziphi_8(p0, p1),
+                                                        v128_ziplo_8(p0, p1))));
+
+#if CDEF_FULL
+    // Primary extra taps
+    tap =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po3]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po3]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p0 = constrain(tap, row, pri_strength, pri_damping);
+    tap =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po3]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po3]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p1 = constrain(tap, row, pri_strength, pri_damping);
+
+    // sum += pri_taps[2] * (p0 + p1)
+    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[2]),
+                                         v256_from_v128(v128_ziphi_8(p0, p1),
+                                                        v128_ziplo_8(p0, p1))));
+#endif
+
+    // Secondary near taps
+    tap =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p0 = constrain(tap, row, sec_strength, sec_damping);
+    tap =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p1 = constrain(tap, row, sec_strength, sec_damping);
+    tap =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p2 = constrain(tap, row, sec_strength, sec_damping);
+    tap =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p3 = constrain(tap, row, sec_strength, sec_damping);
+
+    // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+    p0 = v128_add_8(p0, p1);
+    p2 = v128_add_8(p2, p3);
+    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]),
+                                         v256_from_v128(v128_ziphi_8(p0, p2),
+                                                        v128_ziplo_8(p0, p2))));
+
+    // Secondary far taps
+    tap =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p0 = constrain(tap, row, sec_strength, sec_damping);
+    tap =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p1 = constrain(tap, row, sec_strength, sec_damping);
+    tap =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p2 = constrain(tap, row, sec_strength, sec_damping);
+    tap =
+        v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
+                       v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
+#if CDEF_CAP
+    max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+    min = v256_min_s16(min, tap);
+#endif
+    p3 = constrain(tap, row, sec_strength, sec_damping);
+
+    // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+    p0 = v128_add_8(p0, p1);
+    p2 = v128_add_8(p2, p3);
+    sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]),
+                                         v256_from_v128(v128_ziphi_8(p0, p2),
+                                                        v128_ziplo_8(p0, p2))));
+
+    // res = row + ((sum - (sum < 0) + 8) >> 4)
+    sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
+    res = v256_add_16(sum, v256_dup_16(8));
+    res = v256_shr_n_s16(res, 4);
+    res = v256_add_16(row, res);
+#if CDEF_CAP
+    res = v256_min_s16(v256_max_s16(res, min), max);
+#else
+    res = v256_min_s16(v256_max_s16(res, v256_zero()), v256_dup_16(max));
+#endif
+    res = v256_pack_s16_u8(res, res);
+
+    p0 = v256_low_v128(res);
+    v64_store_aligned(&dst[i * dstride], v128_high_v64(p0));
+    v64_store_aligned(&dst[(i + 1) * dstride], v128_low_v64(p0));
+  }
+}
+
+#if CDEF_CAP
+void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride,
+                                         const uint16_t *in, int pri_strength,
+                                         int sec_strength, int dir,
+                                         int pri_damping, int sec_damping,
+                                         UNUSED int max_unused)
+#else
+void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride,
+                                         const uint16_t *in, int pri_strength,
+                                         int sec_strength, int dir,
+                                         int pri_damping, int sec_damping,
+                                         int max)
+#endif
+{
+  int i;
+  v128 p0, p1, p2, p3, sum, row, res;
+#if CDEF_CAP
+  v128 max, min, large = v128_dup_16(CDEF_VERY_LARGE);
+#endif
+  int po1 = cdef_directions[dir][0];
+  int po2 = cdef_directions[dir][1];
+#if CDEF_FULL
+  int po3 = cdef_directions[dir][2];
+#endif
+  int s1o1 = cdef_directions[(dir + 2) & 7][0];
+  int s1o2 = cdef_directions[(dir + 2) & 7][1];
+  int s2o1 = cdef_directions[(dir + 6) & 7][0];
+  int s2o2 = cdef_directions[(dir + 6) & 7][1];
+
+  const int *pri_taps = cdef_pri_taps[pri_strength & 1];
+  const int *sec_taps = cdef_sec_taps[pri_strength & 1];
+
+  if (pri_strength) pri_damping -= get_msb(pri_strength);
+  if (sec_strength) sec_damping -= get_msb(sec_strength);
+  for (i = 0; i < 4; i += 2) {
+    sum = v128_zero();
+    row = v128_from_v64(v64_load_aligned(&in[i * CDEF_BSTRIDE]),
+                        v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
+#if CDEF_CAP
+    min = max = row;
+#endif
+
+    // Primary near taps
+    p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
+    p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
+#if CDEF_CAP
+    max =
+        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+                     v128_andn(p1, v128_cmpeq_16(p1, large)));
+    min = v128_min_s16(v128_min_s16(min, p0), p1);
+#endif
+    p0 = constrain16(p0, row, pri_strength, pri_damping);
+    p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+    // sum += pri_taps[0] * (p0 + p1)
+    sum = v128_add_16(
+        sum, v128_mullo_s16(v128_dup_16(pri_taps[0]), v128_add_16(p0, p1)));
+
+    // Primary far taps
+    p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
+    p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
+#if CDEF_CAP
+    max =
+        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+                     v128_andn(p1, v128_cmpeq_16(p1, large)));
+    min = v128_min_s16(v128_min_s16(min, p0), p1);
+#endif
+    p0 = constrain16(p0, row, pri_strength, pri_damping);
+    p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+    // sum += pri_taps[1] * (p0 + p1)
+    sum = v128_add_16(
+        sum, v128_mullo_s16(v128_dup_16(pri_taps[1]), v128_add_16(p0, p1)));
+
+#if CDEF_FULL
+    // Primary extra taps
+    p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po3]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po3]));
+    p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po3]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po3]));
+#if CDEF_CAP
+    max =
+        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+                     v128_andn(p1, v128_cmpeq_16(p1, large)));
+    min = v128_min_s16(v128_min_s16(min, p0), p1);
+#endif
+    p0 = constrain16(p0, row, pri_strength, pri_damping);
+    p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+    // sum += pri_taps[2] * (p0 + p1)
+    sum = v128_add_16(
+        sum, v128_mullo_s16(v128_dup_16(pri_taps[2]), v128_add_16(p0, p1)));
+#endif
+
+    // Secondary near taps
+    p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
+    p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
+    p2 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
+    p3 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
+#if CDEF_CAP
+    max =
+        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+                     v128_andn(p1, v128_cmpeq_16(p1, large)));
+    max =
+        v128_max_s16(v128_max_s16(max, v128_andn(p2, v128_cmpeq_16(p2, large))),
+                     v128_andn(p3, v128_cmpeq_16(p3, large)));
+    min = v128_min_s16(
+        v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3);
+#endif
+    p0 = constrain16(p0, row, sec_strength, sec_damping);
+    p1 = constrain16(p1, row, sec_strength, sec_damping);
+    p2 = constrain16(p2, row, sec_strength, sec_damping);
+    p3 = constrain16(p3, row, sec_strength, sec_damping);
+
+    // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+    sum = v128_add_16(sum, v128_mullo_s16(v128_dup_16(sec_taps[0]),
+                                          v128_add_16(v128_add_16(p0, p1),
+                                                      v128_add_16(p2, p3))));
+
+    // Secondary far taps
+    p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
+    p1 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
+    p2 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
+    p3 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
+                       v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
+#if CDEF_CAP
+    max =
+        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+                     v128_andn(p1, v128_cmpeq_16(p1, large)));
+    max =
+        v128_max_s16(v128_max_s16(max, v128_andn(p2, v128_cmpeq_16(p2, large))),
+                     v128_andn(p3, v128_cmpeq_16(p3, large)));
+    min = v128_min_s16(
+        v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3);
+#endif
+    p0 = constrain16(p0, row, sec_strength, sec_damping);
+    p1 = constrain16(p1, row, sec_strength, sec_damping);
+    p2 = constrain16(p2, row, sec_strength, sec_damping);
+    p3 = constrain16(p3, row, sec_strength, sec_damping);
+
+    // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+    sum = v128_add_16(sum, v128_mullo_s16(v128_dup_16(sec_taps[1]),
+                                          v128_add_16(v128_add_16(p0, p1),
+                                                      v128_add_16(p2, p3))));
+
+    // res = row + ((sum - (sum < 0) + 8) >> 4)
+    sum = v128_add_16(sum, v128_cmplt_s16(sum, v128_zero()));
+    res = v128_add_16(sum, v128_dup_16(8));
+    res = v128_shr_n_s16(res, 4);
+    res = v128_add_16(row, res);
+#if CDEF_CAP
+    res = v128_min_s16(v128_max_s16(res, min), max);
+#else
+    res = v128_min_s16(v128_max_s16(res, v128_zero()), v128_dup_16(max));
+#endif
+    v64_store_aligned(&dst[i * dstride], v128_high_v64(res));
+    v64_store_aligned(&dst[(i + 1) * dstride], v128_low_v64(res));
+  }
+}
+
+#if CDEF_CAP
+void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride,
+                                         const uint16_t *in, int pri_strength,
+                                         int sec_strength, int dir,
+                                         int pri_damping, int sec_damping,
+                                         UNUSED int max_unused)
+#else
+void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride,
+                                         const uint16_t *in, int pri_strength,
+                                         int sec_strength, int dir,
+                                         int pri_damping, int sec_damping,
+                                         int max)
+#endif
+{
+  int i;
+  v128 sum, p0, p1, p2, p3, row, res;
+#if CDEF_CAP
+  v128 max, min, large = v128_dup_16(CDEF_VERY_LARGE);
+#endif
+  int po1 = cdef_directions[dir][0];
+  int po2 = cdef_directions[dir][1];
+#if CDEF_FULL
+  int po3 = cdef_directions[dir][2];
+#endif
+  int s1o1 = cdef_directions[(dir + 2) & 7][0];
+  int s1o2 = cdef_directions[(dir + 2) & 7][1];
+  int s2o1 = cdef_directions[(dir + 6) & 7][0];
+  int s2o2 = cdef_directions[(dir + 6) & 7][1];
+
+  const int *pri_taps = cdef_pri_taps[pri_strength & 1];
+  const int *sec_taps = cdef_sec_taps[pri_strength & 1];
+
+  if (pri_strength) pri_damping -= get_msb(pri_strength);
+  if (sec_strength) sec_damping -= get_msb(sec_strength);
+
+  for (i = 0; i < 8; i++) {
+    sum = v128_zero();
+    row = v128_load_aligned(&in[i * CDEF_BSTRIDE]);
+
+#if CDEF_CAP
+    min = max = row;
+#endif
+    // Primary near taps
+    p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]);
+    p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]);
+#if CDEF_CAP
+    max =
+        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+                     v128_andn(p1, v128_cmpeq_16(p1, large)));
+    min = v128_min_s16(v128_min_s16(min, p0), p1);
+#endif
+    p0 = constrain16(p0, row, pri_strength, pri_damping);
+    p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+    // sum += pri_taps[0] * (p0 + p1)
+    sum = v128_add_16(
+        sum, v128_mullo_s16(v128_dup_16(pri_taps[0]), v128_add_16(p0, p1)));
+
+    // Primary far taps
+    p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]);
+    p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]);
+#if CDEF_CAP
+    max =
+        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+                     v128_andn(p1, v128_cmpeq_16(p1, large)));
+    min = v128_min_s16(v128_min_s16(min, p0), p1);
+#endif
+    p0 = constrain16(p0, row, pri_strength, pri_damping);
+    p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+    // sum += pri_taps[1] * (p0 + p1)
+    sum = v128_add_16(
+        sum, v128_mullo_s16(v128_dup_16(pri_taps[1]), v128_add_16(p0, p1)));
+
+#if CDEF_FULL
+    // Primary extra taps
+    p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + po3]);
+    p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - po3]);
+#if CDEF_CAP
+    max =
+        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+                     v128_andn(p1, v128_cmpeq_16(p1, large)));
+    min = v128_min_s16(v128_min_s16(min, p0), p1);
+#endif
+    p0 = constrain16(p0, row, pri_strength, pri_damping);
+    p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+    // sum += pri_taps[2] * (p0 + p1)
+    sum = v128_add_16(
+        sum, v128_mullo_s16(v128_dup_16(pri_taps[2]), v128_add_16(p0, p1)));
+#endif
+
+    // Secondary near taps
+    p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]);
+    p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]);
+    p2 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]);
+    p3 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]);
+#if CDEF_CAP
+    max =
+        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+                     v128_andn(p1, v128_cmpeq_16(p1, large)));
+    max =
+        v128_max_s16(v128_max_s16(max, v128_andn(p2, v128_cmpeq_16(p2, large))),
+                     v128_andn(p3, v128_cmpeq_16(p3, large)));
+    min = v128_min_s16(
+        v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3);
+#endif
+    p0 = constrain16(p0, row, sec_strength, sec_damping);
+    p1 = constrain16(p1, row, sec_strength, sec_damping);
+    p2 = constrain16(p2, row, sec_strength, sec_damping);
+    p3 = constrain16(p3, row, sec_strength, sec_damping);
+
+    // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+    sum = v128_add_16(sum, v128_mullo_s16(v128_dup_16(sec_taps[0]),
+                                          v128_add_16(v128_add_16(p0, p1),
+                                                      v128_add_16(p2, p3))));
+
+    // Secondary far taps
+    p0 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]);
+    p1 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]);
+    p2 = v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]);
+    p3 = v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]);
+#if CDEF_CAP
+    max =
+        v128_max_s16(v128_max_s16(max, v128_andn(p0, v128_cmpeq_16(p0, large))),
+                     v128_andn(p1, v128_cmpeq_16(p1, large)));
+    max =
+        v128_max_s16(v128_max_s16(max, v128_andn(p2, v128_cmpeq_16(p2, large))),
+                     v128_andn(p3, v128_cmpeq_16(p3, large)));
+    min = v128_min_s16(
+        v128_min_s16(v128_min_s16(v128_min_s16(min, p0), p1), p2), p3);
+#endif
+    p0 = constrain16(p0, row, sec_strength, sec_damping);
+    p1 = constrain16(p1, row, sec_strength, sec_damping);
+    p2 = constrain16(p2, row, sec_strength, sec_damping);
+    p3 = constrain16(p3, row, sec_strength, sec_damping);
+
+    // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+    sum = v128_add_16(sum, v128_mullo_s16(v128_dup_16(sec_taps[1]),
+                                          v128_add_16(v128_add_16(p0, p1),
+                                                      v128_add_16(p2, p3))));
+
+    // res = row + ((sum - (sum < 0) + 8) >> 4)
+    sum = v128_add_16(sum, v128_cmplt_s16(sum, v128_zero()));
+    res = v128_add_16(sum, v128_dup_16(8));
+    res = v128_shr_n_s16(res, 4);
+    res = v128_add_16(row, res);
+#if CDEF_CAP
+    res = v128_min_s16(v128_max_s16(res, min), max);
+#else
+    res = v128_min_s16(v128_max_s16(res, v128_zero()), v128_dup_16(max));
+#endif
+    v128_store_unaligned(&dst[i * dstride], res);
+  }
+}
+
+void SIMD_FUNC(cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride,
+                                  const uint16_t *in, int pri_strength,
+                                  int sec_strength, int dir, int pri_damping,
+                                  int sec_damping, int bsize, int max) {
+  if (dst8)
+    (bsize == BLOCK_8X8 ? SIMD_FUNC(cdef_filter_block_8x8_8)
+                        : SIMD_FUNC(cdef_filter_block_4x4_8))(
+        dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+        sec_damping, max);
+  else
+    (bsize == BLOCK_8X8 ? SIMD_FUNC(cdef_filter_block_8x8_16)
+                        : SIMD_FUNC(cdef_filter_block_4x4_16))(
+        dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+        sec_damping, max);
+}
+
+#else
+
 void SIMD_FUNC(cdef_direction_4x4)(uint16_t *y, int ystride, const uint16_t *in,
                                    int threshold, int dir, int damping) {
   int i;
@@ -364,6 +1173,7 @@
     v64_store_unaligned(&dst[i * dstride], row);
   }
 }
+#endif
 
 void SIMD_FUNC(copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride,
                                          const uint8_t *src, int sstride, int v,
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 2d2f113..fb7550f 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -3011,8 +3011,12 @@
 #if CONFIG_CDEF
 static void setup_cdef(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
   int i;
+#if CONFIG_CDEF_SINGLEPASS
+  cm->cdef_pri_damping = cm->cdef_sec_damping = aom_rb_read_literal(rb, 2) + 3;
+#else
   cm->cdef_pri_damping = aom_rb_read_literal(rb, 1) + 5;
   cm->cdef_sec_damping = aom_rb_read_literal(rb, 2) + 3;
+#endif
   cm->cdef_bits = aom_rb_read_literal(rb, 2);
   cm->nb_cdef_strengths = 1 << cm->cdef_bits;
   for (i = 0; i < cm->nb_cdef_strengths; i++) {
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 4a6fbd8..c86ef3a 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -3454,8 +3454,13 @@
 #if CONFIG_CDEF
 static void encode_cdef(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
   int i;
+#if CONFIG_CDEF_SINGLEPASS
+  aom_wb_write_literal(wb, cm->cdef_pri_damping - 3, 2);
+  assert(cm->cdef_pri_damping == cm->cdef_sec_damping);
+#else
   aom_wb_write_literal(wb, cm->cdef_pri_damping - 5, 1);
   aom_wb_write_literal(wb, cm->cdef_sec_damping - 3, 2);
+#endif
   aom_wb_write_literal(wb, cm->cdef_bits, 2);
   for (i = 0; i < cm->nb_cdef_strengths; i++) {
     aom_wb_write_literal(wb, cm->cdef_strengths[i], CDEF_STRENGTH_BITS);
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index da2370b..f3f7799 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -4206,7 +4206,7 @@
     cm->cdef_strengths[0] = 0;
     cm->nb_cdef_strengths = 1;
   } else {
-    // Find cm->dering_level, cm->clpf_strength_u and cm->clpf_strength_v
+    // Find CDEF parameters
     av1_cdef_search(cm->frame_to_show, cpi->source, cm, xd,
                     cpi->oxcf.speed > 0);
 
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index 443e9e5..accc97e 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -68,11 +68,16 @@
                                 uint64_t (**mse)[TOTAL_STRENGTHS], int sb_count,
                                 int fast) {
   uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS];
+#if !CONFIG_CDEF_SINGLEPASS
   const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
+#endif
   int i, j;
   uint64_t best_tot_mse = (uint64_t)1 << 63;
   int best_id0 = 0;
   int best_id1 = 0;
+#if CONFIG_CDEF_SINGLEPASS
+  const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
+#endif
   memset(tot_mse, 0, sizeof(tot_mse));
   for (i = 0; i < sb_count; i++) {
     int gi;
@@ -305,7 +310,11 @@
   int *sb_index = aom_malloc(nvfb * nhfb * sizeof(*sb_index));
   int *selected_strength = aom_malloc(nvfb * nhfb * sizeof(*sb_index));
   uint64_t(*mse[2])[TOTAL_STRENGTHS];
+#if CONFIG_CDEF_SINGLEPASS
+  int pri_damping = 3 + (cm->base_qindex >> 6);
+#else
   int pri_damping = 6;
+#endif
   int sec_damping = 3 + (cm->base_qindex >> 6);
   int i;
   int nb_strengths;
@@ -414,6 +423,17 @@
           int xsize = (nhb << mi_wide_l2[pli]) +
                       CDEF_HBORDER * (fbc != nhfb - 1) + xoff;
           sec_strength = gi % CDEF_SEC_STRENGTHS;
+#if CONFIG_CDEF_SINGLEPASS
+          copy_sb16_16(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
+                       src[pli],
+                       (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff,
+                       (fbc * MI_SIZE_64X64 << mi_wide_l2[pli]) - xoff,
+                       stride[pli], ysize, xsize);
+          cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in, xdec[pli], ydec[pli],
+                         dir, &dirinit, var, pli, dlist, cdef_count, threshold,
+                         sec_strength + (sec_strength == 3), pri_damping,
+                         sec_damping, coeff_shift);
+#else
           if (sec_strength == 0)
             copy_sb16_16(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
                          src[pli],
@@ -425,6 +445,7 @@
                          pli, dlist, cdef_count, threshold,
                          sec_strength + (sec_strength == 3), sec_damping,
                          pri_damping, coeff_shift, sec_strength != 0, 1);
+#endif
           curr_mse = compute_cdef_dist(
               ref_coeff[pli] +
                   (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) * stride[pli] +
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index aacb1ac..4d45a57 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -114,6 +114,7 @@
 set(CONFIG_BGSPRITE 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_CB4X4 1 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_CDEF 1 CACHE NUMBER "AV1 experiment flag.")
+set(CONFIG_CDEF_SINGLEPASS 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_CFL 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_CHROMA_2X2 0 CACHE NUMBER "AV1 experiment flag.")
 set(CONFIG_CHROMA_SUB8X8 1 CACHE NUMBER "AV1 experiment flag.")
diff --git a/configure b/configure
index ce82e73..2f2f130 100755
--- a/configure
+++ b/configure
@@ -244,6 +244,7 @@
 EXPERIMENT_LIST="
     fp_mb_stats
     cdef
+    cdef_singlepass
     var_tx
     rect_tx
     rect_tx_ext
@@ -561,6 +562,7 @@
     enabled altref2 && enable_feature flex_refs
     enabled rect_tx_ext && enable_feature rect_tx
     enabled cfl && enable_feature smooth_hv
+    enabled cdef_singlepass && enable_feature cdef
 
     if ! enabled delta_q && enabled ext_delta_q; then
       log_echo "ext_delta_q requires delta_q, so disabling ext_delta_q"
diff --git a/test/cdef_test.cc b/test/cdef_test.cc
new file mode 100644
index 0000000..d2ab692
--- /dev/null
+++ b/test/cdef_test.cc
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/common/cdef_block.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+typedef std::tr1::tuple<cdef_filter_block_func, cdef_filter_block_func, int>
+    cdef_dir_param_t;
+
+class CDEFBlockTest : public ::testing::TestWithParam<cdef_dir_param_t> {
+ public:
+  virtual ~CDEFBlockTest() {}
+  virtual void SetUp() {
+    cdef = GET_PARAM(0);
+    ref_cdef = GET_PARAM(1);
+    bsize = GET_PARAM(2);
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  int bsize;
+  cdef_filter_block_func cdef;
+  cdef_filter_block_func ref_cdef;
+};
+
+typedef CDEFBlockTest CDEFSpeedTest;
+
+void test_cdef(int bsize, int iterations, cdef_filter_block_func cdef,
+               cdef_filter_block_func ref_cdef) {
+  const int size = 8;
+  const int ysize = size + 2 * CDEF_VBORDER;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint16_t, s[ysize * CDEF_BSTRIDE]);
+  DECLARE_ALIGNED(16, static uint16_t, d[size * size]);
+  DECLARE_ALIGNED(16, static uint16_t, ref_d[size * size]);
+  memset(ref_d, 0, sizeof(ref_d));
+  memset(d, 0, sizeof(d));
+
+  int error = 0, pristrength = 0, secstrength, dir;
+  int boundary, pridamping, secdamping, depth, bits, level, count,
+      errdepth = 0, errpristrength = 0, errsecstrength = 0, errboundary = 0,
+      errpridamping = 0, errsecdamping = 0;
+  unsigned int pos = 0;
+
+  for (boundary = 0; boundary < 16; boundary++) {
+    for (depth = 8; depth <= 12; depth += 2) {
+      for (pridamping = 3 + depth - 8;
+           pridamping < 7 - 3 * !!boundary + depth - 8; pridamping++) {
+        for (secdamping = 3 + depth - 8;
+             secdamping < 7 - 3 * !!boundary + depth - 8; secdamping++) {
+          for (count = 0; count < iterations; count++) {
+            for (level = 0; level < (1 << depth) && !error;
+                 level += (2 + 6 * !!boundary) << (depth - 8)) {
+              for (bits = 1; bits <= depth && !error;
+                   bits += 1 + 3 * !!boundary) {
+                for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++)
+                  s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
+                               (1 << depth) - 1);
+                if (boundary) {
+                  if (boundary & 1) {  // Left
+                    for (int i = 0; i < ysize; i++)
+                      for (int j = 0; j < CDEF_HBORDER; j++)
+                        s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+                  }
+                  if (boundary & 2) {  // Right
+                    for (int i = 0; i < ysize; i++)
+                      for (int j = CDEF_HBORDER + size; j < CDEF_BSTRIDE; j++)
+                        s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+                  }
+                  if (boundary & 4) {  // Above
+                    for (int i = 0; i < CDEF_VBORDER; i++)
+                      for (int j = 0; j < CDEF_BSTRIDE; j++)
+                        s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+                  }
+                  if (boundary & 8) {  // Below
+                    for (int i = CDEF_VBORDER + size; i < ysize; i++)
+                      for (int j = 0; j < CDEF_BSTRIDE; j++)
+                        s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+                  }
+                }
+                for (dir = 0; dir < 8; dir++) {
+                  for (pristrength = 0;
+                       pristrength <= 19 << (depth - 8) && !error;
+                       pristrength += (1 + 4 * !!boundary) << (depth - 8)) {
+                    if (pristrength == 16) pristrength = 19;
+                    for (secstrength = 0;
+                         secstrength <= 4 << (depth - 8) && !error;
+                         secstrength += 1 << (depth - 8)) {
+                      if (secstrength == 3 << (depth - 8)) continue;
+                      ref_cdef(depth == 8 ? (uint8_t *)ref_d : 0, ref_d, size,
+                               s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
+                               pristrength, secstrength, dir, pridamping,
+                               secdamping, bsize, (1 << depth) - 1);
+                      // If cdef and ref_cdef are the same, we're just testing
+                      // speed
+                      if (cdef != ref_cdef)
+                        ASM_REGISTER_STATE_CHECK(
+                            cdef(depth == 8 ? (uint8_t *)d : 0, d, size,
+                                 s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
+                                 pristrength, secstrength, dir, pridamping,
+                                 secdamping, bsize, (1 << depth) - 1));
+                      if (ref_cdef != cdef) {
+                        for (pos = 0;
+                             pos<sizeof(d) / sizeof(*d)>> (depth == 8) &&
+                             !error;
+                             pos++) {
+                          error = ref_d[pos] != d[pos];
+                          errdepth = depth;
+                          errpristrength = pristrength;
+                          errsecstrength = secstrength;
+                          errboundary = boundary;
+                          errpridamping = pridamping;
+                          errsecdamping = secdamping;
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  pos--;
+  EXPECT_EQ(0, error) << "Error: CDEFBlockTest, SIMD and C mismatch."
+                      << std::endl
+                      << "First error at " << pos % size << "," << pos / size
+                      << " (" << (int16_t)ref_d[pos] << " : " << (int16_t)d[pos]
+                      << ") " << std::endl
+                      << "pristrength: " << errpristrength << std::endl
+                      << "pridamping: " << errpridamping << std::endl
+                      << "secstrength: " << errsecstrength << std::endl
+                      << "secdamping: " << errsecdamping << std::endl
+                      << "depth: " << errdepth << std::endl
+                      << "size: " << bsize << std::endl
+                      << "boundary: " << errboundary << std::endl
+                      << std::endl;
+}
+
+void test_cdef_speed(int bsize, int iterations, cdef_filter_block_func cdef,
+                     cdef_filter_block_func ref_cdef) {
+  aom_usec_timer ref_timer;
+  aom_usec_timer timer;
+
+  aom_usec_timer_start(&ref_timer);
+  test_cdef(bsize, iterations, ref_cdef, ref_cdef);
+  aom_usec_timer_mark(&ref_timer);
+  int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
+
+  aom_usec_timer_start(&timer);
+  test_cdef(bsize, iterations, cdef, cdef);
+  aom_usec_timer_mark(&timer);
+  int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
+
+#if 0
+  std::cout << "[          ] C time = " << ref_elapsed_time / 1000
+            << " ms, SIMD time = " << elapsed_time / 1000 << " ms" << std::endl;
+#endif
+
+  EXPECT_GT(ref_elapsed_time, elapsed_time)
+      << "Error: CDEFSpeedTest, SIMD slower than C." << std::endl
+      << "C time: " << ref_elapsed_time << " us" << std::endl
+      << "SIMD time: " << elapsed_time << " us" << std::endl;
+}
+
+typedef int (*find_dir_t)(const uint16_t *img, int stride, int32_t *var,
+                          int coeff_shift);
+
+typedef std::tr1::tuple<find_dir_t, find_dir_t> find_dir_param_t;
+
+class CDEFFindDirTest : public ::testing::TestWithParam<find_dir_param_t> {
+ public:
+  virtual ~CDEFFindDirTest() {}
+  virtual void SetUp() {
+    finddir = GET_PARAM(0);
+    ref_finddir = GET_PARAM(1);
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  find_dir_t finddir;
+  find_dir_t ref_finddir;
+};
+
+typedef CDEFFindDirTest CDEFFindDirSpeedTest;
+
+void test_finddir(int (*finddir)(const uint16_t *img, int stride, int32_t *var,
+                                 int coeff_shift),
+                  int (*ref_finddir)(const uint16_t *img, int stride,
+                                     int32_t *var, int coeff_shift)) {
+  const int size = 8;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint16_t, s[size * size]);
+
+  int error = 0;
+  int depth, bits, level, count, errdepth = 0;
+  int ref_res = 0, res = 0;
+  int32_t ref_var = 0, var = 0;
+
+  for (depth = 8; depth <= 12 && !error; depth += 2) {
+    for (count = 0; count < 512 && !error; count++) {
+      for (level = 0; level < (1 << depth) && !error;
+           level += 1 << (depth - 8)) {
+        for (bits = 1; bits <= depth && !error; bits++) {
+          for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++)
+            s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
+                         (1 << depth) - 1);
+          for (int c = 0; c < 1 + 9 * (finddir == ref_finddir); c++)
+            ref_res = ref_finddir(s, size, &ref_var, depth - 8);
+          if (finddir != ref_finddir)
+            ASM_REGISTER_STATE_CHECK(res = finddir(s, size, &var, depth - 8));
+          if (ref_finddir != finddir) {
+            if (res != ref_res || var != ref_var) error = 1;
+            errdepth = depth;
+          }
+        }
+      }
+    }
+  }
+
+  EXPECT_EQ(0, error) << "Error: CDEFFindDirTest, SIMD and C mismatch."
+                      << std::endl
+                      << "return: " << res << " : " << ref_res << std::endl
+                      << "var: " << var << " : " << ref_var << std::endl
+                      << "depth: " << errdepth << std::endl
+                      << std::endl;
+}
+
+void test_finddir_speed(int (*finddir)(const uint16_t *img, int stride,
+                                       int32_t *var, int coeff_shift),
+                        int (*ref_finddir)(const uint16_t *img, int stride,
+                                           int32_t *var, int coeff_shift)) {
+  aom_usec_timer ref_timer;
+  aom_usec_timer timer;
+
+  aom_usec_timer_start(&ref_timer);
+  test_finddir(ref_finddir, ref_finddir);
+  aom_usec_timer_mark(&ref_timer);
+  int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
+
+  aom_usec_timer_start(&timer);
+  test_finddir(finddir, finddir);
+  aom_usec_timer_mark(&timer);
+  int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
+
+#if 0
+  std::cout << "[          ] C time = " << ref_elapsed_time / 1000
+            << " ms, SIMD time = " << elapsed_time / 1000 << " ms" << std::endl;
+#endif
+
+  EXPECT_GT(ref_elapsed_time, elapsed_time)
+      << "Error: CDEFFindDirSpeedTest, SIMD slower than C." << std::endl
+      << "C time: " << ref_elapsed_time << " us" << std::endl
+      << "SIMD time: " << elapsed_time << " us" << std::endl;
+}
+
+TEST_P(CDEFBlockTest, TestSIMDNoMismatch) {
+  test_cdef(bsize, 1, cdef, ref_cdef);
+}
+
+TEST_P(CDEFSpeedTest, DISABLED_TestSpeed) {
+  test_cdef_speed(bsize, 4, cdef, ref_cdef);
+}
+
+TEST_P(CDEFFindDirTest, TestSIMDNoMismatch) {
+  test_finddir(finddir, ref_finddir);
+}
+
+TEST_P(CDEFFindDirSpeedTest, DISABLED_TestSpeed) {
+  test_finddir_speed(finddir, ref_finddir);
+}
+
+using std::tr1::make_tuple;
+
+// VS compiling for 32 bit targets does not support vector types in
+// structs as arguments, which makes the v256 type of the intrinsics
+// hard to support, so optimizations for this target are disabled.
+#if defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, CDEFBlockTest,
+    ::testing::Values(
+        make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_4X4),
+        make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(SSE2, CDEFFindDirTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_sse2,
+                                                     &cdef_find_dir_c)));
+#endif
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, CDEFBlockTest,
+    ::testing::Values(
+        make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_4X4),
+        make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(SSSE3, CDEFFindDirTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
+                                                     &cdef_find_dir_c)));
+#endif
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, CDEFBlockTest,
+    ::testing::Values(make_tuple(&cdef_filter_block_sse4_1,
+                                 &cdef_filter_block_c, BLOCK_4X4),
+                      make_tuple(&cdef_filter_block_sse4_1,
+                                 &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFFindDirTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
+                                                     &cdef_find_dir_c)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, CDEFBlockTest,
+    ::testing::Values(
+        make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_4X4),
+        make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(AVX2, CDEFFindDirTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_avx2,
+                                                     &cdef_find_dir_c)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, CDEFBlockTest,
+    ::testing::Values(
+        make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_4X4),
+        make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(NEON, CDEFFindDirTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_neon,
+                                                     &cdef_find_dir_c)));
+#endif
+
+// Test speed for all supported architectures
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, CDEFSpeedTest,
+    ::testing::Values(
+        make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_4X4),
+        make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(SSE2, CDEFFindDirSpeedTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_sse2,
+                                                     &cdef_find_dir_c)));
+#endif
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, CDEFSpeedTest,
+    ::testing::Values(
+        make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_4X4),
+        make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(SSSE3, CDEFFindDirSpeedTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
+                                                     &cdef_find_dir_c)));
+#endif
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, CDEFSpeedTest,
+    ::testing::Values(make_tuple(&cdef_filter_block_sse4_1,
+                                 &cdef_filter_block_c, BLOCK_4X4),
+                      make_tuple(&cdef_filter_block_sse4_1,
+                                 &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFFindDirSpeedTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
+                                                     &cdef_find_dir_c)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, CDEFSpeedTest,
+    ::testing::Values(
+        make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_4X4),
+        make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(AVX2, CDEFFindDirSpeedTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_avx2,
+                                                     &cdef_find_dir_c)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, CDEFSpeedTest,
+    ::testing::Values(
+        make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_4X4),
+        make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_8X8)));
+INSTANTIATE_TEST_CASE_P(NEON, CDEFFindDirSpeedTest,
+                        ::testing::Values(make_tuple(&cdef_find_dir_neon,
+                                                     &cdef_find_dir_c)));
+#endif
+
+#endif  // defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)
+}  // namespace
diff --git a/test/test.cmake b/test/test.cmake
index 43b975c..b035184 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -137,10 +137,16 @@
         "${AOM_ROOT}/test/simd_cmp_impl.h")
 
     if (CONFIG_CDEF)
-      set(AOM_UNIT_TEST_COMMON_SOURCES
-          ${AOM_UNIT_TEST_COMMON_SOURCES}
-          "${AOM_ROOT}/test/clpf_test.cc"
-          "${AOM_ROOT}/test/dering_test.cc")
+      if (CONFIG_CDEF_SINGLEPASS)
+        set(AOM_UNIT_TEST_COMMON_SOURCES
+            ${AOM_UNIT_TEST_COMMON_SOURCES}
+            "${AOM_ROOT}/test/cdef_test.cc")
+      else ()
+        set(AOM_UNIT_TEST_COMMON_SOURCES
+            ${AOM_UNIT_TEST_COMMON_SOURCES}
+            "${AOM_ROOT}/test/clpf_test.cc"
+            "${AOM_ROOT}/test/dering_test.cc")
+      endif ()
     endif ()
 
     # Omit 4-tap filter intra predictor test-- currently a 3-tap filter is in
diff --git a/test/test.mk b/test/test.mk
index 2f99ab7..9f60d57 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -135,8 +135,12 @@
 LIBAOM_TEST_SRCS-$(CONFIG_ADAPT_SCAN)  += scan_test.cc
 LIBAOM_TEST_SRCS-yes                   += convolve_test.cc
 LIBAOM_TEST_SRCS-yes                   += lpf_8_test.cc
+ifeq ($(CONFIG_CDEF_SINGLEPASS),yes)
+LIBAOM_TEST_SRCS-$(CONFIG_CDEF)        += cdef_test.cc
+else
 LIBAOM_TEST_SRCS-$(CONFIG_CDEF)        += dering_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_CDEF)        += clpf_test.cc
+endif
 LIBAOM_TEST_SRCS-yes                   += simd_cmp_impl.h
 LIBAOM_TEST_SRCS-$(HAVE_SSE2)          += simd_cmp_sse2.cc
 LIBAOM_TEST_SRCS-$(HAVE_SSSE3)         += simd_cmp_ssse3.cc