Do real chroma RDO search for CDEF

Chroma now has a list of strengths too, with the superblock signalling
shared between luma and chroma.

low-latency, cpu=4:

   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
-0.0114 | -1.4626 | -1.4745 |  -0.0423 | 0.0430 | -0.0001 |    -0.7416

Change-Id: I389c77f1d80020f810e45f8502c656ad9d397c8c
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 6203336..b31a32c 100644
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -854,31 +854,14 @@
 if (aom_config("CONFIG_CDEF") eq "yes") {
   add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, unsigned int bd";
   add_proto qw/void aom_clpf_hblock_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, unsigned int bd";
-  if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd, unsigned int dmp";
-    add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd, unsigned int dmp";
-    # VS compiling for 32 bit targets does not support vector types in
-    # structs as arguments, which makes the v256 type of the intrinsics
-    # hard to support, so optimizations for this target are disabled.
-    if ($opts{config} !~ /libs-x86-win32-vs.*/) {
-      specialize qw/aom_clpf_detect_hbd sse2 ssse3 sse4_1 neon/;
-      specialize qw/aom_clpf_detect_multi_hbd sse2 ssse3 sse4_1 neon/;
-    }
-  }
-  if ($opts{config} !~ /libs-x86-win32-vs.*/) {
-    specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
-    specialize qw/aom_clpf_hblock_hbd sse2 ssse3 sse4_1 neon/;
-  }
   add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, unsigned int bd";
-  add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int dmp";
-  add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int dmp";
   # VS compiling for 32 bit targets does not support vector types in
   # structs as arguments, which makes the v256 type of the intrinsics
   # hard to support, so optimizations for this target are disabled.
   if ($opts{config} !~ /libs-x86-win32-vs.*/) {
+    specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
+    specialize qw/aom_clpf_hblock_hbd sse2 ssse3 sse4_1 neon/;
     specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
-    specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/;
-    specialize qw/aom_clpf_detect_multi sse2 ssse3 sse4_1 neon/;
   }
 }
 
diff --git a/av1/av1_cx.mk b/av1/av1_cx.mk
index 9b624a2..15b2581 100644
--- a/av1/av1_cx.mk
+++ b/av1/av1_cx.mk
@@ -110,13 +110,6 @@
 AV1_CX_SRCS-yes += encoder/mbgraph.h
 ifeq ($(CONFIG_CDEF),yes)
 AV1_CX_SRCS-yes += encoder/pickcdef.c
-AV1_CX_SRCS-yes += encoder/clpf_rdo.c
-AV1_CX_SRCS-yes += encoder/clpf_rdo.h
-AV1_CX_SRCS-yes += encoder/clpf_rdo_simd.h
-AV1_CX_SRCS-$(HAVE_SSE2) += encoder/clpf_rdo_sse2.c
-AV1_CX_SRCS-$(HAVE_SSSE3) += encoder/clpf_rdo_ssse3.c
-AV1_CX_SRCS-$(HAVE_SSE4_1) += encoder/clpf_rdo_sse4.c
-AV1_CX_SRCS-$(HAVE_NEON) += encoder/clpf_rdo_neon.c
 endif
 ifeq ($(CONFIG_PVQ),yes)
 # PVQ from daala
diff --git a/av1/common/cdef.c b/av1/common/cdef.c
index 707f8603..e2f5b42 100644
--- a/av1/common/cdef.c
+++ b/av1/common/cdef.c
@@ -143,8 +143,8 @@
 #endif
 }
 
-void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd,
-                    int clpf_strength_u, int clpf_strength_v) {
+void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+                    MACROBLOCKD *xd) {
   int r, c;
   int sbr, sbc;
   int nhsb, nvsb;
@@ -162,11 +162,9 @@
   int dering_left;
   int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
   int nplanes = 3;
-  int *lev;
   int chroma_dering =
       xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
       xd->plane[2].subsampling_x == xd->plane[2].subsampling_y;
-  lev = cm->cdef_strengths;
   nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
   nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
   av1_setup_dst_planes(xd->plane, frame, 0, 0);
@@ -193,6 +191,7 @@
     dering_left = 1;
     for (sbc = 0; sbc < nhsb; sbc++) {
       int level, clpf_strength;
+      int uv_level, uv_clpf_strength;
       int nhb, nvb;
       int cstart = 0;
 #if 0  // TODO(stemidts/jmvalin): Handle tile borders correctly
@@ -205,18 +204,34 @@
       nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
       nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
       level = dering_level_table
-          [lev[cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
-                                   MAX_MIB_SIZE * sbc]
-                   ->mbmi.cdef_strength] /
+          [cm->cdef_strengths[cm->mi_grid_visible[MAX_MIB_SIZE * sbr *
+                                                      cm->mi_stride +
+                                                  MAX_MIB_SIZE * sbc]
+                                  ->mbmi.cdef_strength] /
            CLPF_STRENGTHS];
       clpf_strength =
-          lev[cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
-                                  MAX_MIB_SIZE * sbc]
-                  ->mbmi.cdef_strength] %
+          cm->cdef_strengths[cm->mi_grid_visible[MAX_MIB_SIZE * sbr *
+                                                     cm->mi_stride +
+                                                 MAX_MIB_SIZE * sbc]
+                                 ->mbmi.cdef_strength] %
           CLPF_STRENGTHS;
       clpf_strength += clpf_strength == 3;
+      uv_level = dering_level_table
+          [cm->cdef_uv_strengths[cm->mi_grid_visible[MAX_MIB_SIZE * sbr *
+                                                         cm->mi_stride +
+                                                     MAX_MIB_SIZE * sbc]
+                                     ->mbmi.cdef_strength] /
+           CLPF_STRENGTHS];
+      uv_clpf_strength =
+          cm->cdef_uv_strengths[cm->mi_grid_visible[MAX_MIB_SIZE * sbr *
+                                                        cm->mi_stride +
+                                                    MAX_MIB_SIZE * sbc]
+                                    ->mbmi.cdef_strength] %
+          CLPF_STRENGTHS;
+      uv_clpf_strength += uv_clpf_strength == 3;
       curr_row_dering[sbc] = 0;
-      if ((level == 0 && clpf_strength == 0) ||
+      if ((level == 0 && clpf_strength == 0 && uv_level == 0 &&
+           uv_clpf_strength == 0) ||
           (dering_count = sb_compute_dering_list(
                cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE, dlist)) == 0) {
         dering_left = 0;
@@ -232,9 +247,11 @@
         int clpf_damping = 3 - (pli != AOM_PLANE_Y) + (cm->base_qindex >> 6);
 
         if (pli) {
-          if (!chroma_dering) level = 0;
-          clpf_strength = pli == 1 ? clpf_strength_u : clpf_strength_v;
-          clpf_strength += clpf_strength == 3;
+          if (chroma_dering)
+            level = uv_level;
+          else
+            level = 0;
+          clpf_strength = uv_clpf_strength;
         }
         if (sbc == nhsb - 1)
           cend = (nhb << bsize[pli]);
@@ -359,12 +376,7 @@
                     coffset, xd->plane[pli].dst.stride, OD_FILT_VBORDER,
                     (nhb << bsize[pli]));
 
-        /* FIXME: This is a temporary hack that uses more conservative
-           deringing for chroma. */
-        if (pli)
-          threshold = (level * 5 + 4) >> 3 << coeff_shift;
-        else
-          threshold = level << coeff_shift;
+        threshold = level << coeff_shift;
         if (threshold == 0 && clpf_strength == 0) continue;
         od_dering(dst,
                   &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER],
diff --git a/av1/common/cdef.h b/av1/common/cdef.h
index d3c33f2..e1944cf 100644
--- a/av1/common/cdef.h
+++ b/av1/common/cdef.h
@@ -33,8 +33,7 @@
 int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col);
 int sb_compute_dering_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
                            dering_list *dlist);
-void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd,
-                    int clpf_strength_u, int clpf_strength_v);
+void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd);
 
 void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
                      AV1_COMMON *cm, MACROBLOCKD *xd);
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index 17ac451..64f9a6e 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -404,9 +404,8 @@
 #if CONFIG_CDEF
   int nb_cdef_strengths;
   int cdef_strengths[CDEF_MAX_STRENGTHS];
+  int cdef_uv_strengths[CDEF_MAX_STRENGTHS];
   int cdef_bits;
-  int clpf_strength_u;
-  int clpf_strength_v;
 #endif
 
 #if CONFIG_DELTA_Q
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 990a516..a5abb90 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -2672,9 +2672,8 @@
   cm->nb_cdef_strengths = 1 << cm->cdef_bits;
   for (i = 0; i < cm->nb_cdef_strengths; i++) {
     cm->cdef_strengths[i] = aom_rb_read_literal(rb, CDEF_STRENGTH_BITS);
+    cm->cdef_uv_strengths[i] = aom_rb_read_literal(rb, CDEF_STRENGTH_BITS);
   }
-  cm->clpf_strength_u = aom_rb_read_literal(rb, 2);
-  cm->clpf_strength_v = aom_rb_read_literal(rb, 2);
 }
 #endif  // CONFIG_CDEF
 
@@ -4948,8 +4947,7 @@
 
 #if CONFIG_CDEF
   if (!cm->skip_loop_filter) {
-    av1_cdef_frame(&pbi->cur_buf->buf, cm, &pbi->mb, cm->clpf_strength_u,
-                   cm->clpf_strength_v);
+    av1_cdef_frame(&pbi->cur_buf->buf, cm, &pbi->mb);
   }
 #endif  // CONFIG_CDEF
 
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index fdc7ac8..9252b00 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -3496,9 +3496,8 @@
   aom_wb_write_literal(wb, cm->cdef_bits, 2);
   for (i = 0; i < cm->nb_cdef_strengths; i++) {
     aom_wb_write_literal(wb, cm->cdef_strengths[i], CDEF_STRENGTH_BITS);
+    aom_wb_write_literal(wb, cm->cdef_uv_strengths[i], CDEF_STRENGTH_BITS);
   }
-  aom_wb_write_literal(wb, cm->clpf_strength_u, 2);
-  aom_wb_write_literal(wb, cm->clpf_strength_v, 2);
 }
 #endif
 
diff --git a/av1/encoder/clpf_rdo.c b/av1/encoder/clpf_rdo.c
deleted file mode 100644
index 0173681..0000000
--- a/av1/encoder/clpf_rdo.c
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "av1/common/clpf.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom/aom_image.h"
-#include "aom/aom_integer.h"
-#include "av1/common/quant_common.h"
-
-// Calculate the error of a filtered and unfiltered block
-void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride,
-                       int ostride, int x0, int y0, int width, int height,
-                       int *sum0, int *sum1, unsigned int strength, int size,
-                       unsigned int dmp) {
-  int x, y;
-  for (y = y0; y < y0 + size; y++) {
-    for (x = x0; x < x0 + size; x++) {
-      const int O = org[y * ostride + x];
-      const int X = rec[y * rstride + x];
-      const int A = rec[AOMMAX(0, y - 2) * rstride + x];
-      const int B = rec[AOMMAX(0, y - 1) * rstride + x];
-      const int C = rec[y * rstride + AOMMAX(0, x - 2)];
-      const int D = rec[y * rstride + AOMMAX(0, x - 1)];
-      const int E = rec[y * rstride + AOMMIN(width - 1, x + 1)];
-      const int F = rec[y * rstride + AOMMIN(width - 1, x + 2)];
-      const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x];
-      const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x];
-      const int delta =
-          av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, dmp);
-      const int Y = X + delta;
-      *sum0 += (O - X) * (O - X);
-      *sum1 += (O - Y) * (O - Y);
-    }
-  }
-}
-
-void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org,
-                             int rstride, int ostride, int x0, int y0,
-                             int width, int height, int *sum, int size,
-                             unsigned int dmp) {
-  int x, y;
-
-  for (y = y0; y < y0 + size; y++) {
-    for (x = x0; x < x0 + size; x++) {
-      const int O = org[y * ostride + x];
-      const int X = rec[y * rstride + x];
-      const int A = rec[AOMMAX(0, y - 2) * rstride + x];
-      const int B = rec[AOMMAX(0, y - 1) * rstride + x];
-      const int C = rec[y * rstride + AOMMAX(0, x - 2)];
-      const int D = rec[y * rstride + AOMMAX(0, x - 1)];
-      const int E = rec[y * rstride + AOMMIN(width - 1, x + 1)];
-      const int F = rec[y * rstride + AOMMIN(width - 1, x + 2)];
-      const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x];
-      const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x];
-      const int delta1 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 1, dmp);
-      const int delta2 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 2, dmp);
-      const int delta3 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 4, dmp);
-      const int F1 = X + delta1;
-      const int F2 = X + delta2;
-      const int F3 = X + delta3;
-      sum[0] += (O - X) * (O - X);
-      sum[1] += (O - F1) * (O - F1);
-      sum[2] += (O - F2) * (O - F2);
-      sum[3] += (O - F3) * (O - F3);
-    }
-  }
-}
-
-#if CONFIG_AOM_HIGHBITDEPTH
-// Identical to aom_clpf_detect_c() apart from "rec" and "org".
-void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org,
-                           int rstride, int ostride, int x0, int y0, int width,
-                           int height, int *sum0, int *sum1,
-                           unsigned int strength, int size, unsigned int bd,
-                           unsigned int dmp) {
-  const int shift = bd - 8;
-  int x, y;
-  for (y = y0; y < y0 + size; y++) {
-    for (x = x0; x < x0 + size; x++) {
-      const int O = org[y * ostride + x] >> shift;
-      const int X = rec[y * rstride + x] >> shift;
-      const int A = rec[AOMMAX(0, y - 2) * rstride + x] >> shift;
-      const int B = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
-      const int C = rec[y * rstride + AOMMAX(0, x - 2)] >> shift;
-      const int D = rec[y * rstride + AOMMAX(0, x - 1)] >> shift;
-      const int E = rec[y * rstride + AOMMIN(width - 1, x + 1)] >> shift;
-      const int F = rec[y * rstride + AOMMIN(width - 1, x + 2)] >> shift;
-      const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift;
-      const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x] >> shift;
-      const int delta = av1_clpf_sample(X, A, B, C, D, E, F, G, H,
-                                        strength >> shift, dmp - shift);
-      const int Y = X + delta;
-      *sum0 += (O - X) * (O - X);
-      *sum1 += (O - Y) * (O - Y);
-    }
-  }
-}
-
-// aom_clpf_detect_multi_c() apart from "rec" and "org".
-void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org,
-                                 int rstride, int ostride, int x0, int y0,
-                                 int width, int height, int *sum, int size,
-                                 unsigned int bd, unsigned int dmp) {
-  const int shift = bd - 8;
-  int x, y;
-
-  for (y = y0; y < y0 + size; y++) {
-    for (x = x0; x < x0 + size; x++) {
-      int O = org[y * ostride + x] >> shift;
-      int X = rec[y * rstride + x] >> shift;
-      const int A = rec[AOMMAX(0, y - 2) * rstride + x] >> shift;
-      const int B = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
-      const int C = rec[y * rstride + AOMMAX(0, x - 2)] >> shift;
-      const int D = rec[y * rstride + AOMMAX(0, x - 1)] >> shift;
-      const int E = rec[y * rstride + AOMMIN(width - 1, x + 1)] >> shift;
-      const int F = rec[y * rstride + AOMMIN(width - 1, x + 2)] >> shift;
-      const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift;
-      const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x] >> shift;
-      const int delta1 =
-          av1_clpf_sample(X, A, B, C, D, E, F, G, H, 1, dmp - shift);
-      const int delta2 =
-          av1_clpf_sample(X, A, B, C, D, E, F, G, H, 2, dmp - shift);
-      const int delta3 =
-          av1_clpf_sample(X, A, B, C, D, E, F, G, H, 4, dmp - shift);
-      const int F1 = X + delta1;
-      const int F2 = X + delta2;
-      const int F3 = X + delta3;
-      sum[0] += (O - X) * (O - X);
-      sum[1] += (O - F1) * (O - F1);
-      sum[2] += (O - F2) * (O - F2);
-      sum[3] += (O - F3) * (O - F3);
-    }
-  }
-}
-#endif
-
-// Calculate the square error of all filter settings.  Result:
-// res[0][0]   : unfiltered
-// res[0][1-3] : strength=1,2,4, no signals
-static void clpf_rdo(const YV12_BUFFER_CONFIG *rec,
-                     const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
-                     unsigned int block_size, int w, int h, uint64_t res[4],
-                     int plane) {
-  int m, n;
-  int sum[4];
-  const int subx = plane != AOM_PLANE_Y && rec->subsampling_x;
-  const int suby = plane != AOM_PLANE_Y && rec->subsampling_y;
-  uint8_t *rec_buffer =
-      plane != AOM_PLANE_Y
-          ? (plane == AOM_PLANE_U ? rec->u_buffer : rec->v_buffer)
-          : rec->y_buffer;
-  uint8_t *org_buffer =
-      plane != AOM_PLANE_Y
-          ? (plane == AOM_PLANE_U ? org->u_buffer : org->v_buffer)
-          : org->y_buffer;
-  int rec_width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
-  int rec_height =
-      plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
-  int rec_stride = plane != AOM_PLANE_Y ? rec->uv_stride : rec->y_stride;
-  int org_stride = plane != AOM_PLANE_Y ? org->uv_stride : org->y_stride;
-  int damping =
-      cm->bit_depth - 5 - (plane != AOM_PLANE_Y) + (cm->base_qindex >> 6);
-
-  sum[0] = sum[1] = sum[2] = sum[3] = 0;
-
-  for (m = 0; m < h; m++) {
-    for (n = 0; n < w; n++) {
-      int xpos = n * block_size;
-      int ypos = m * block_size;
-      if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
-                               (xpos << subx) / MI_SIZE]
-               ->mbmi.skip) {
-#if CONFIG_AOM_HIGHBITDEPTH
-        if (cm->use_highbitdepth) {
-          aom_clpf_detect_multi_hbd(
-              CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer),
-              rec_stride, org_stride, xpos, ypos, rec_width, rec_height, sum,
-              block_size, cm->bit_depth, damping);
-        } else {
-          aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
-                                xpos, ypos, rec_width, rec_height, sum,
-                                block_size, damping);
-        }
-#else
-        aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
-                              xpos, ypos, rec_width, rec_height, sum,
-                              block_size, damping);
-#endif
-      }
-    }
-  }
-
-  res[0] += sum[0];
-  res[1] += sum[1];
-  res[2] += sum[2];
-  res[3] += sum[3];
-}
-
-void av1_clpf_test_plane(const YV12_BUFFER_CONFIG *rec,
-                         const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
-                         int *best_strength, int plane) {
-  int i;
-  uint64_t best, sums[4];
-  int width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
-  int height = plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
-  const int bs = MI_SIZE;
-  const int bslog = get_msb(bs);
-
-  memset(sums, 0, sizeof(sums));
-
-  clpf_rdo(rec, org, cm, bs, width >> bslog, height >> bslog, sums, plane);
-
-  // Add a favourable bias for conservative strengths
-  for (i = 0; i < 4; i++) sums[i] -= sums[i] >> (7 + i);
-
-  // Tag the strength to the error
-  for (i = 0; i < 4; i++) sums[i] = (sums[i] << 2) + i;
-
-  // Identify the strength with the smallest error
-  best = (uint64_t)1 << 63;
-  for (i = 0; i < 4; i++)
-    if (sums[i] < best) best = sums[i];
-  *best_strength = best & 3 ? 1 << ((best - 1) & 3) : 0;
-}
diff --git a/av1/encoder/clpf_rdo.h b/av1/encoder/clpf_rdo.h
deleted file mode 100644
index e137378..0000000
--- a/av1/encoder/clpf_rdo.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AV1_ENCODER_CLPF_H_
-#define AV1_ENCODER_CLPF_H_
-
-#include "av1/common/reconinter.h"
-
-void av1_clpf_test_plane(const YV12_BUFFER_CONFIG *rec,
-                         const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
-                         int *best_strength, int plane);
-
-#endif
diff --git a/av1/encoder/clpf_rdo_neon.c b/av1/encoder/clpf_rdo_neon.c
deleted file mode 100644
index 02053c5..0000000
--- a/av1/encoder/clpf_rdo_neon.c
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/aom_simd.h"
-#define SIMD_FUNC(name) name##_neon
-#include "./clpf_rdo_simd.h"
diff --git a/av1/encoder/clpf_rdo_sse2.c b/av1/encoder/clpf_rdo_sse2.c
deleted file mode 100644
index 99847c0..0000000
--- a/av1/encoder/clpf_rdo_sse2.c
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/aom_simd.h"
-#define SIMD_FUNC(name) name##_sse2
-#include "./clpf_rdo_simd.h"
diff --git a/av1/encoder/clpf_rdo_sse4.c b/av1/encoder/clpf_rdo_sse4.c
deleted file mode 100644
index 049f537..0000000
--- a/av1/encoder/clpf_rdo_sse4.c
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/aom_simd.h"
-#define SIMD_FUNC(name) name##_sse4_1
-#include "./clpf_rdo_simd.h"
diff --git a/av1/encoder/clpf_rdo_ssse3.c b/av1/encoder/clpf_rdo_ssse3.c
deleted file mode 100644
index 35b23b2..0000000
--- a/av1/encoder/clpf_rdo_ssse3.c
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/aom_simd.h"
-#define SIMD_FUNC(name) name##_ssse3
-#include "./clpf_rdo_simd.h"
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 757d589..963d7f5 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -19,7 +19,6 @@
 #if CONFIG_CDEF
 #include "av1/common/cdef.h"
 #include "av1/common/clpf.h"
-#include "av1/encoder/clpf_rdo.h"
 #endif  // CONFIG_CDEF
 #include "av1/common/filter.h"
 #include "av1/common/idct.h"
@@ -3522,7 +3521,6 @@
   }
 #if CONFIG_CDEF
   if (is_lossless_requested(&cpi->oxcf)) {
-    cm->clpf_strength_u = cm->clpf_strength_v = 0;
     cm->cdef_bits = 0;
     cm->cdef_strengths[0] = 0;
     cm->nb_cdef_strengths = 1;
@@ -3531,12 +3529,7 @@
     av1_cdef_search(cm->frame_to_show, cpi->Source, cm, xd);
 
     // Apply the filter
-    av1_cdef_frame(cm->frame_to_show, cm, xd, cm->clpf_strength_u,
-                   cm->clpf_strength_v);
-
-    // Pack the clpf chroma strengths into two bits each
-    cm->clpf_strength_u -= cm->clpf_strength_u == 4;
-    cm->clpf_strength_v -= cm->clpf_strength_v == 4;
+    av1_cdef_frame(cm->frame_to_show, cm, xd);
   }
 #endif
 #if CONFIG_LOOP_RESTORATION
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index cb8a500..96320d3 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -17,7 +17,6 @@
 #include "av1/common/cdef.h"
 #include "av1/common/onyxc_int.h"
 #include "av1/common/reconinter.h"
-#include "av1/encoder/clpf_rdo.h"
 #include "av1/encoder/encoder.h"
 
 #define TOTAL_STRENGTHS (DERING_STRENGTHS * CLPF_STRENGTHS)
@@ -79,12 +78,12 @@
 }
 
 static double compute_dist(uint16_t *x, int xstride, uint16_t *y, int ystride,
-                           int nhb, int nvb, int coeff_shift) {
+                           int nhb, int nvb, int coeff_shift, int bsize) {
   int i, j;
   double sum;
   sum = 0;
-  for (i = 0; i < nvb << 3; i++) {
-    for (j = 0; j < nhb << 3; j++) {
+  for (i = 0; i < nvb << bsize; i++) {
+    for (j = 0; j < nhb << bsize; j++) {
       double tmp;
       tmp = x[i * xstride + j] - y[i * ystride + j];
       sum += tmp * tmp;
@@ -97,11 +96,11 @@
                      AV1_COMMON *cm, MACROBLOCKD *xd) {
   int r, c;
   int sbr, sbc;
-  uint16_t *src;
-  uint16_t *ref_coeff;
+  uint16_t *src[3];
+  uint16_t *ref_coeff[3];
   dering_list dlist[MAX_MIB_SIZE * MAX_MIB_SIZE];
   int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
-  int stride;
+  int stride[3];
   int bsize[3];
   int dec[3];
   int pli;
@@ -114,8 +113,8 @@
   int nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
   int nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
   int *sb_index = aom_malloc(nvsb * nhsb * sizeof(*sb_index));
-  uint64_t(*mse)[DERING_STRENGTHS * CLPF_STRENGTHS] =
-      aom_malloc(sizeof(*mse) * nvsb * nhsb);
+  int *selected_strength = aom_malloc(nvsb * nhsb * sizeof(*sb_index));
+  uint64_t(*mse[3])[TOTAL_STRENGTHS];
   int clpf_damping = 3 + (cm->base_qindex >> 6);
   int i;
   int best_lev[CDEF_MAX_STRENGTHS];
@@ -123,35 +122,56 @@
   int nb_strength_bits;
   int quantizer;
   double lambda;
+  int nplanes = 3;
+  int chroma_dering =
+      xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
+      xd->plane[2].subsampling_x == xd->plane[2].subsampling_y;
   quantizer =
       av1_ac_quant(cm->base_qindex, 0, cm->bit_depth) >> (cm->bit_depth - 8);
   lambda = .12 * quantizer * quantizer / 256.;
 
-  src = aom_memalign(32, sizeof(*src) * cm->mi_rows * cm->mi_cols * 64);
-  ref_coeff =
-      aom_memalign(32, sizeof(*ref_coeff) * cm->mi_rows * cm->mi_cols * 64);
   av1_setup_dst_planes(xd->plane, frame, 0, 0);
-  for (pli = 0; pli < 3; pli++) {
+  for (pli = 0; pli < nplanes; pli++) {
+    uint8_t *ref_buffer;
+    int ref_stride;
+    switch (pli) {
+      case 0:
+        ref_buffer = ref->y_buffer;
+        ref_stride = ref->y_stride;
+        break;
+      case 1:
+        ref_buffer = ref->u_buffer;
+        ref_stride = ref->uv_stride;
+        break;
+      case 2:
+        ref_buffer = ref->v_buffer;
+        ref_stride = ref->uv_stride;
+        break;
+    }
+    mse[pli] = aom_malloc(sizeof(**mse) * nvsb * nhsb);
+    src[pli] = aom_memalign(32, sizeof(*src) * cm->mi_rows * cm->mi_cols * 64);
+    ref_coeff[pli] =
+        aom_memalign(32, sizeof(*ref_coeff) * cm->mi_rows * cm->mi_cols * 64);
     dec[pli] = xd->plane[pli].subsampling_x;
     bsize[pli] = OD_DERING_SIZE_LOG2 - dec[pli];
-  }
-  stride = cm->mi_cols << bsize[0];
-  for (r = 0; r < cm->mi_rows << bsize[0]; ++r) {
-    for (c = 0; c < cm->mi_cols << bsize[0]; ++c) {
+    stride[pli] = cm->mi_cols << 3;
+    for (r = 0; r < cm->mi_rows << bsize[pli]; ++r) {
+      for (c = 0; c < cm->mi_cols << bsize[pli]; ++c) {
 #if CONFIG_AOM_HIGHBITDEPTH
-      if (cm->use_highbitdepth) {
-        src[r * stride + c] = CONVERT_TO_SHORTPTR(
-            xd->plane[0].dst.buf)[r * xd->plane[0].dst.stride + c];
-        ref_coeff[r * stride + c] =
-            CONVERT_TO_SHORTPTR(ref->y_buffer)[r * ref->y_stride + c];
-      } else {
+        if (cm->use_highbitdepth) {
+          src[pli][r * stride[pli] + c] = CONVERT_TO_SHORTPTR(
+              xd->plane[pli].dst.buf)[r * xd->plane[pli].dst.stride + c];
+          ref_coeff[pli][r * stride[pli] + c] =
+              CONVERT_TO_SHORTPTR(ref_buffer)[r * ref_stride + c];
+        } else {
 #endif
-        src[r * stride + c] =
-            xd->plane[0].dst.buf[r * xd->plane[0].dst.stride + c];
-        ref_coeff[r * stride + c] = ref->y_buffer[r * ref->y_stride + c];
+          src[pli][r * stride[pli] + c] =
+              xd->plane[pli].dst.buf[r * xd->plane[pli].dst.stride + c];
+          ref_coeff[pli][r * stride[pli] + c] = ref_buffer[r * ref_stride + c];
 #if CONFIG_AOM_HIGHBITDEPTH
+        }
+#endif
       }
-#endif
     }
   }
   sb_count = 0;
@@ -175,44 +195,49 @@
         int j;
         level = dering_level_table[gi / CLPF_STRENGTHS];
         threshold = level << coeff_shift;
-        for (r = 0; r < nvb << bsize[0]; r++) {
-          for (c = 0; c < nhb << bsize[0]; c++) {
-            dst[(r * MAX_MIB_SIZE << bsize[0]) + c] =
-                src[((sbr * MAX_MIB_SIZE << bsize[0]) + r) * stride +
-                    (sbc * MAX_MIB_SIZE << bsize[0]) + c];
+        for (pli = 0; pli < nplanes; pli++) {
+          if (pli > 0 && !chroma_dering) threshold = 0;
+          for (r = 0; r < nvb << bsize[pli]; r++) {
+            for (c = 0; c < nhb << bsize[pli]; c++) {
+              dst[(r * MAX_MIB_SIZE << bsize[pli]) + c] =
+                  src[pli]
+                     [((sbr * MAX_MIB_SIZE << bsize[pli]) + r) * stride[pli] +
+                      (sbc * MAX_MIB_SIZE << bsize[pli]) + c];
+            }
           }
-        }
-        in = inbuf + OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER;
-        /* We avoid filtering the pixels for which some of the pixels to average
-           are outside the frame. We could change the filter instead, but it
-           would
-           add special cases for any future vectorization. */
-        for (i = 0; i < OD_DERING_INBUF_SIZE; i++)
-          inbuf[i] = OD_DERING_VERY_LARGE;
-        for (i = -OD_FILT_VBORDER * (sbr != 0);
-             i < (nvb << bsize[0]) + OD_FILT_VBORDER * (sbr != nvsb - 1); i++) {
-          for (j = -OD_FILT_HBORDER * (sbc != 0);
-               j < (nhb << bsize[0]) + OD_FILT_HBORDER * (sbc != nhsb - 1);
-               j++) {
-            uint16_t *x;
-            x = &src[(sbr * stride * MAX_MIB_SIZE << bsize[0]) +
-                     (sbc * MAX_MIB_SIZE << bsize[0])];
-            in[i * OD_FILT_BSTRIDE + j] = x[i * stride + j];
+          in = inbuf + OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER;
+          /* We avoid filtering the pixels for which some of the pixels to
+             average are outside the frame. We could change the filter
+             instead, but it would add special cases for any future
+             vectorization. */
+          for (i = 0; i < OD_DERING_INBUF_SIZE; i++)
+            inbuf[i] = OD_DERING_VERY_LARGE;
+          for (i = -OD_FILT_VBORDER * (sbr != 0);
+               i < (nvb << bsize[pli]) + OD_FILT_VBORDER * (sbr != nvsb - 1);
+               i++) {
+            for (j = -OD_FILT_HBORDER * (sbc != 0);
+                 j < (nhb << bsize[pli]) + OD_FILT_HBORDER * (sbc != nhsb - 1);
+                 j++) {
+              uint16_t *x;
+              x = &src[pli][(sbr * stride[pli] * MAX_MIB_SIZE << bsize[pli]) +
+                            (sbc * MAX_MIB_SIZE << bsize[pli])];
+              in[i * OD_FILT_BSTRIDE + j] = x[i * stride[pli] + j];
+            }
           }
+          clpf_strength = gi % CLPF_STRENGTHS;
+          od_dering(tmp_dst, in, dec[pli], dir, pli, dlist, dering_count,
+                    threshold, clpf_strength + (clpf_strength == 3),
+                    clpf_damping, coeff_shift);
+          copy_dering_16bit_to_16bit(dst, MAX_MIB_SIZE << bsize[pli], tmp_dst,
+                                     dlist, dering_count, bsize[pli]);
+          mse[pli][sb_count][gi] = (int)compute_dist(
+              dst, MAX_MIB_SIZE << bsize[pli],
+              &ref_coeff[pli][(sbr * stride[pli] * MAX_MIB_SIZE << bsize[pli]) +
+                              (sbc * MAX_MIB_SIZE << bsize[pli])],
+              stride[pli], nhb, nvb, coeff_shift, bsize[pli]);
+          sb_index[sb_count] =
+              MAX_MIB_SIZE * sbr * cm->mi_stride + MAX_MIB_SIZE * sbc;
         }
-        clpf_strength = gi % CLPF_STRENGTHS;
-        od_dering(tmp_dst, in, 0, dir, 0, dlist, dering_count, threshold,
-                  clpf_strength + (clpf_strength == 3), clpf_damping,
-                  coeff_shift);
-        copy_dering_16bit_to_16bit(dst, MAX_MIB_SIZE << bsize[0], tmp_dst,
-                                   dlist, dering_count, bsize[0]);
-        mse[sb_count][gi] = (int)compute_dist(
-            dst, MAX_MIB_SIZE << bsize[0],
-            &ref_coeff[(sbr * stride * MAX_MIB_SIZE << bsize[0]) +
-                       (sbc * MAX_MIB_SIZE << bsize[0])],
-            stride, nhb, nvb, coeff_shift);
-        sb_index[sb_count] =
-            MAX_MIB_SIZE * sbr * cm->mi_stride + MAX_MIB_SIZE * sbc;
       }
       sb_count++;
     }
@@ -222,7 +247,7 @@
   /* Search for different number of signalling bits. */
   for (i = 0; i <= 3; i++) {
     nb_strengths = 1 << i;
-    tot_mse = joint_strength_search(best_lev, nb_strengths, mse, sb_count);
+    tot_mse = joint_strength_search(best_lev, nb_strengths, mse[0], sb_count);
     /* Count superblock signalling cost. */
     tot_mse += (uint64_t)(sb_count * lambda * i);
     /* Count header signalling cost. */
@@ -243,21 +268,44 @@
     uint64_t best_mse = (uint64_t)1 << 63;
     best_gi = 0;
     for (gi = 0; gi < cm->nb_cdef_strengths; gi++) {
-      if (mse[i][best_lev[gi]] < best_mse) {
+      if (mse[0][i][best_lev[gi]] < best_mse) {
         best_gi = gi;
-        best_mse = mse[i][best_lev[gi]];
+        best_mse = mse[0][i][best_lev[gi]];
       }
     }
+    selected_strength[i] = best_gi;
     cm->mi_grid_visible[sb_index[i]]->mbmi.cdef_strength = best_gi;
   }
-
-  aom_free(src);
-  aom_free(ref_coeff);
-  aom_free(mse);
+  int str;
+  /* For each strength option we picked in luma, find the optimal chroma
+     strength. */
+  if (nplanes >= 3) {
+    for (str = 0; str < cm->nb_cdef_strengths; str++) {
+      int gi;
+      int best_gi = 0;
+      best_tot_mse = (uint64_t)1 << 63;
+      for (gi = 0; gi < TOTAL_STRENGTHS; gi++) {
+        tot_mse = 0;
+        for (i = 0; i < sb_count; i++) {
+          if (selected_strength[i] == str) {
+            tot_mse += mse[1][i][gi] + mse[2][i][gi];
+          }
+        }
+        if (tot_mse < best_tot_mse) {
+          best_gi = gi;
+          best_tot_mse = tot_mse;
+        }
+      }
+      cm->cdef_uv_strengths[str] = best_gi;
+    }
+  } else {
+    for (str = 0; str < nb_strengths; str++) selected_strength[str] = 0;
+  }
+  for (pli = 0; pli < nplanes; pli++) {
+    aom_free(src[pli]);
+    aom_free(ref_coeff[pli]);
+    aom_free(mse[pli]);
+  }
   aom_free(sb_index);
-
-  av1_clpf_test_plane(cm->frame_to_show, ref, cm, &cm->clpf_strength_u,
-                      AOM_PLANE_U);
-  av1_clpf_test_plane(cm->frame_to_show, ref, cm, &cm->clpf_strength_v,
-                      AOM_PLANE_V);
+  aom_free(selected_strength);
 }