Merge dering/clpf rdo and filtering

* Dering and clpf were merged into a single pass.
* 32x32 and 128x128 filter block sizes for clpf were removed.
* RDO for dering and clpf was merged and improved:
  - A strength of "0" is no longer required in the strength selection
  - Dering strength can now be signalled with 0, 1 or 2 bits per block

              LL    HL
PSNR:       -0.04 -0.01
PSNR HVS:   -0.27 -0.18
SSIM:       -0.15 +0.01
CIEDE 2000: -0.11 -0.03
APSNR:      -0.03 -0.00
MS SSIM:    -0.18 -0.11

Change-Id: I9f002a16ad218eab6007f90f1f176232443495f0
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 7ff1e16..587134f 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -24,8 +24,8 @@
 #endif  // CONFIG_BITSTREAM_DEBUG
 
 #if CONFIG_CDEF
+#include "av1/common/cdef.h"
 #include "av1/common/clpf.h"
-#include "av1/common/dering.h"
 #endif  // CONFIG_CDEF
 #include "av1/common/entropy.h"
 #include "av1/common/entropymode.h"
@@ -2735,95 +2735,29 @@
 #if CONFIG_CDEF
 #if CONFIG_EXT_PARTITION
   if (cm->sb_size == BLOCK_128X128 && bsize == BLOCK_128X128 &&
-      cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) {
+      !sb_all_skip(cm, mi_row, mi_col)) {
     aom_write_literal(
         w,
         cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain,
-        DERING_REFINEMENT_BITS);
+        cm->dering_bits);
+    aom_write_literal(w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]
+                             ->mbmi.clpf_strength,
+                      cm->clpf_bits);
   } else if (cm->sb_size == BLOCK_64X64 && bsize == BLOCK_64X64 &&
 #else
   if (bsize == BLOCK_64X64 &&
 #endif  // CONFIG_EXT_PARTITION
-             cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) {
-    aom_write_literal(
-        w,
-        cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain,
-        DERING_REFINEMENT_BITS);
+             !sb_all_skip(cm, mi_row, mi_col)) {
+    if (cm->dering_bits)
+      aom_write_literal(w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]
+                               ->mbmi.dering_gain,
+                        cm->dering_bits);
+    if (cm->clpf_bits)
+      aom_write_literal(w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]
+                               ->mbmi.clpf_strength,
+                        cm->clpf_bits);
   }
 #endif
-
-#if CONFIG_CDEF
-#if CONFIG_EXT_PARTITION
-  if (cm->sb_size == BLOCK_128X128 && bsize == BLOCK_128X128 &&
-      cm->clpf_blocks && cm->clpf_strength_y && cm->clpf_size != CLPF_NOSIZE) {
-    const int tl = mi_row * MI_SIZE / MIN_FB_SIZE * cm->clpf_stride +
-                   mi_col * MI_SIZE / MIN_FB_SIZE;
-    if (cm->clpf_size == CLPF_128X128 && cm->clpf_blocks[tl] != CLPF_NOFLAG) {
-      aom_write_literal(w, cm->clpf_blocks[tl], 1);
-    } else if (cm->clpf_size == CLPF_64X64) {
-      const int tr = tl + 2;
-      const int bl = tl + 2 * cm->clpf_stride;
-      const int br = tr + 2 * cm->clpf_stride;
-
-      // Up to four bits per SB.
-      if (cm->clpf_blocks[tl] != CLPF_NOFLAG)
-        aom_write_literal(w, cm->clpf_blocks[tl], 1);
-
-      if (mi_col + MI_SIZE < cm->mi_cols && cm->clpf_blocks[tr] != CLPF_NOFLAG)
-        aom_write_literal(w, cm->clpf_blocks[tr], 1);
-
-      if (mi_row + MI_SIZE < cm->mi_rows && cm->clpf_blocks[bl] != CLPF_NOFLAG)
-        aom_write_literal(w, cm->clpf_blocks[bl], 1);
-
-      if (mi_row + MI_SIZE < cm->mi_rows && mi_col + MI_SIZE < cm->mi_cols &&
-          cm->clpf_blocks[br] != CLPF_NOFLAG)
-        aom_write_literal(w, cm->clpf_blocks[br], 1);
-    } else if (cm->clpf_size == CLPF_32X32) {
-      int i, j;
-      const int size = 32 / MI_SIZE;
-      // Up to sixteen bits per SB.
-      for (i = 0; i < 4; ++i)
-        for (j = 0; j < 4; ++j) {
-          const int index = tl + i * cm->clpf_stride + j;
-          if (mi_row + i * size < cm->mi_rows &&
-              mi_col + j * size < cm->mi_cols &&
-              cm->clpf_blocks[index] != CLPF_NOFLAG)
-            aom_write_literal(w, cm->clpf_blocks[index], 1);
-        }
-    }
-  } else if (cm->sb_size == BLOCK_64X64 && bsize == BLOCK_64X64 &&
-#else
-  if (bsize == BLOCK_64X64 &&
-#endif  // CONFIG_EXT_PARTITION
-             cm->clpf_blocks && cm->clpf_strength_y &&
-             cm->clpf_size != CLPF_NOSIZE) {
-    const int tl = mi_row * MI_SIZE / MIN_FB_SIZE * cm->clpf_stride +
-                   mi_col * MI_SIZE / MIN_FB_SIZE;
-    const int tr = tl + 1;
-    const int bl = tl + cm->clpf_stride;
-    const int br = tr + cm->clpf_stride;
-
-    // Up to four bits per SB.
-    // When clpf_size indicates a size larger than the SB size
-    // (CLPF_128X128), one bit for every fourth SB will be transmitted
-    // regardless of skip blocks.
-    if (cm->clpf_blocks[tl] != CLPF_NOFLAG)
-      aom_write_literal(w, cm->clpf_blocks[tl], 1);
-
-    if (mi_col + MI_SIZE / 2 < cm->mi_cols &&
-        cm->clpf_blocks[tr] != CLPF_NOFLAG)
-      aom_write_literal(w, cm->clpf_blocks[tr], 1);
-
-    if (mi_row + MI_SIZE / 2 < cm->mi_rows &&
-        cm->clpf_blocks[bl] != CLPF_NOFLAG)
-      aom_write_literal(w, cm->clpf_blocks[bl], 1);
-
-    if (mi_row + MI_SIZE / 2 < cm->mi_rows &&
-        mi_col + MI_SIZE / 2 < cm->mi_cols &&
-        cm->clpf_blocks[br] != CLPF_NOFLAG)
-      aom_write_literal(w, cm->clpf_blocks[br], 1);
-  }
-#endif  // CONFIG_CDEF
 }
 
 static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
@@ -3522,22 +3456,13 @@
 }
 
 #if CONFIG_CDEF
-static void encode_clpf(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
-  aom_wb_write_literal(wb, cm->clpf_strength_y, 2);
+static void encode_cdef(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
+  aom_wb_write_literal(wb, cm->dering_level, DERING_LEVEL_BITS);
   aom_wb_write_literal(wb, cm->clpf_strength_u, 2);
   aom_wb_write_literal(wb, cm->clpf_strength_v, 2);
-  if (cm->clpf_strength_y) {
-    aom_wb_write_literal(wb, cm->clpf_size, 2);
-  }
 }
 #endif
 
-#if CONFIG_CDEF
-static void encode_dering(int level, struct aom_write_bit_buffer *wb) {
-  aom_wb_write_literal(wb, level, DERING_LEVEL_BITS);
-}
-#endif  // CONFIG_CDEF
-
 static void write_delta_q(struct aom_write_bit_buffer *wb, int delta_q) {
   if (delta_q != 0) {
     aom_wb_write_bit(wb, 1);
@@ -4481,8 +4406,7 @@
 
   encode_loopfilter(cm, wb);
 #if CONFIG_CDEF
-  encode_dering(cm->dering_level, wb);
-  encode_clpf(cm, wb);
+  encode_cdef(cm, wb);
 #endif
 #if CONFIG_LOOP_RESTORATION
   encode_restoration_mode(cm, wb);
diff --git a/av1/encoder/clpf_rdo.c b/av1/encoder/clpf_rdo.c
index 3ef67cc..0173681 100644
--- a/av1/encoder/clpf_rdo.c
+++ b/av1/encoder/clpf_rdo.c
@@ -142,68 +142,17 @@
 }
 #endif
 
-int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
-                      const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
-                      int block_size, int w, int h, unsigned int strength,
-                      unsigned int fb_size_log2, int8_t *res, int plane) {
-  int m, n, sum0 = 0, sum1 = 0;
-  int damping =
-      cm->bit_depth - 5 - (plane != AOM_PLANE_Y) + (cm->base_qindex >> 6);
-
-  for (m = 0; m < h; m++) {
-    for (n = 0; n < w; n++) {
-      int xpos = (l << fb_size_log2) + n * block_size;
-      int ypos = (k << fb_size_log2) + m * block_size;
-      if (fb_size_log2 == MAX_FB_SIZE_LOG2 ||
-          !cm->mi_grid_visible[ypos / MI_SIZE * cm->mi_stride + xpos / MI_SIZE]
-               ->mbmi.skip) {
-#if CONFIG_AOM_HIGHBITDEPTH
-        if (cm->use_highbitdepth) {
-          aom_clpf_detect_hbd(CONVERT_TO_SHORTPTR(rec->y_buffer),
-                              CONVERT_TO_SHORTPTR(org->y_buffer), rec->y_stride,
-                              org->y_stride, xpos, ypos, rec->y_crop_width,
-                              rec->y_crop_height, &sum0, &sum1, strength,
-                              block_size, cm->bit_depth, damping);
-        } else {
-          aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
-                          org->y_stride, xpos, ypos, rec->y_crop_width,
-                          rec->y_crop_height, &sum0, &sum1, strength,
-                          block_size, damping);
-        }
-#else
-        aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
-                        org->y_stride, xpos, ypos, rec->y_crop_width,
-                        rec->y_crop_height, &sum0, &sum1, strength, block_size,
-                        damping);
-#endif
-      }
-    }
-  }
-  *res = sum1 < sum0;
-  return *res;
-}
-
 // Calculate the square error of all filter settings.  Result:
 // res[0][0]   : unfiltered
 // res[0][1-3] : strength=1,2,4, no signals
-// (Only for luma:)
-// res[1][0]   : (bit count, fb size = 128)
-// res[1][1-3] : strength=1,2,4, fb size = 128
-// res[1][4]   : unfiltered, including skip
-// res[1][5-7] : strength=1,2,4, including skip, fb_size = 128
-// res[2][0]   : (bit count, fb size = 64)
-// res[2][1-3] : strength=1,2,4, fb size = 64
-// res[3][0]   : (bit count, fb size = 32)
-// res[3][1-3] : strength=1,2,4, fb size = 32
-static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
-                    const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
-                    unsigned int block_size, unsigned int fb_size_log2, int w,
-                    int h, int64_t res[4][8], int plane) {
-  int c, m, n, filtered = 0;
-  int sum[8];
+static void clpf_rdo(const YV12_BUFFER_CONFIG *rec,
+                     const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
+                     unsigned int block_size, int w, int h, uint64_t res[4],
+                     int plane) {
+  int m, n;
+  int sum[4];
   const int subx = plane != AOM_PLANE_Y && rec->subsampling_x;
   const int suby = plane != AOM_PLANE_Y && rec->subsampling_y;
-  int bslog = get_msb(block_size);
   uint8_t *rec_buffer =
       plane != AOM_PLANE_Y
           ? (plane == AOM_PLANE_U ? rec->u_buffer : rec->v_buffer)
@@ -220,166 +169,64 @@
   int damping =
       cm->bit_depth - 5 - (plane != AOM_PLANE_Y) + (cm->base_qindex >> 6);
 
-  sum[0] = sum[1] = sum[2] = sum[3] = sum[4] = sum[5] = sum[6] = sum[7] = 0;
-  if (plane == AOM_PLANE_Y &&
-      fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) {
-    int w1, h1, w2, h2, i, sum1, sum2, sum3, oldfiltered;
-
-    filtered = fb_size_log2-- == MAX_FB_SIZE_LOG2;
-    w1 = AOMMIN(1 << (fb_size_log2 - bslog), w);
-    h1 = AOMMIN(1 << (fb_size_log2 - bslog), h);
-    w2 = AOMMIN(w - (1 << (fb_size_log2 - bslog)), w >> 1);
-    h2 = AOMMIN(h - (1 << (fb_size_log2 - bslog)), h >> 1);
-    i = get_msb(MAX_FB_SIZE) - fb_size_log2;
-    sum1 = (int)res[i][1];
-    sum2 = (int)res[i][2];
-    sum3 = (int)res[i][3];
-    oldfiltered = (int)res[i][0];
-    res[i][0] = 0;
-
-    filtered |= clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1,
-                         res, plane);
-    if (1 << (fb_size_log2 - bslog) < w)
-      filtered |= clpf_rdo(y, x + (1 << fb_size_log2), rec, org, cm, block_size,
-                           fb_size_log2, w2, h1, res, plane);
-    if (1 << (fb_size_log2 - bslog) < h) {
-      filtered |= clpf_rdo(y + (1 << fb_size_log2), x, rec, org, cm, block_size,
-                           fb_size_log2, w1, h2, res, plane);
-      filtered |=
-          clpf_rdo(y + (1 << fb_size_log2), x + (1 << fb_size_log2), rec, org,
-                   cm, block_size, fb_size_log2, w2, h2, res, plane);
-    }
-
-    // Correct sums for unfiltered blocks
-    res[i][1] = AOMMIN(sum1 + res[i][0], res[i][1]);
-    res[i][2] = AOMMIN(sum2 + res[i][0], res[i][2]);
-    res[i][3] = AOMMIN(sum3 + res[i][0], res[i][3]);
-    if (i == 1) {
-      res[i][5] = AOMMIN(sum1 + res[i][4], res[i][5]);
-      res[i][6] = AOMMIN(sum2 + res[i][4], res[i][6]);
-      res[i][7] = AOMMIN(sum3 + res[i][4], res[i][7]);
-    }
-
-    res[i][0] = oldfiltered + filtered;  // Number of signal bits
-
-    return filtered;
-  }
+  sum[0] = sum[1] = sum[2] = sum[3] = 0;
 
   for (m = 0; m < h; m++) {
     for (n = 0; n < w; n++) {
-      int xpos = x + n * block_size;
-      int ypos = y + m * block_size;
-      int skip =  // Filtered skip blocks stored only for fb_size == 128
-          4 *
-          !!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
-                                (xpos << subx) / MI_SIZE]
-                ->mbmi.skip;
+      int xpos = n * block_size;
+      int ypos = m * block_size;
+      if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
+                               (xpos << subx) / MI_SIZE]
+               ->mbmi.skip) {
 #if CONFIG_AOM_HIGHBITDEPTH
-      if (cm->use_highbitdepth) {
-        aom_clpf_detect_multi_hbd(
-            CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer),
-            rec_stride, org_stride, xpos, ypos, rec_width, rec_height,
-            sum + skip, block_size, cm->bit_depth, damping);
-      } else {
-        aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
-                              xpos, ypos, rec_width, rec_height, sum + skip,
-                              block_size, damping);
-      }
+        if (cm->use_highbitdepth) {
+          aom_clpf_detect_multi_hbd(
+              CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer),
+              rec_stride, org_stride, xpos, ypos, rec_width, rec_height, sum,
+              block_size, cm->bit_depth, damping);
+        } else {
+          aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
+                                xpos, ypos, rec_width, rec_height, sum,
+                                block_size, damping);
+        }
 #else
-      aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
-                            xpos, ypos, rec_width, rec_height, sum + skip,
-                            block_size, damping);
+        aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
+                              xpos, ypos, rec_width, rec_height, sum,
+                              block_size, damping);
 #endif
-      filtered |= !skip;
+      }
     }
   }
 
-  for (c = 0; c < (plane == AOM_PLANE_Y ? 4 : 1); c++) {
-    res[c][0] += sum[0];
-    res[c][1] += sum[1];
-    res[c][2] += sum[2];
-    res[c][3] += sum[3];
-    if (c != 1) continue;
-    // Only needed when fb_size == 128
-    res[c][4] += sum[4];
-    res[c][5] += sum[5];
-    res[c][6] += sum[6];
-    res[c][7] += sum[7];
-  }
-  return filtered;
+  res[0] += sum[0];
+  res[1] += sum[1];
+  res[2] += sum[2];
+  res[3] += sum[3];
 }
 
-void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
+void av1_clpf_test_plane(const YV12_BUFFER_CONFIG *rec,
                          const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
-                         int *best_strength, int *best_bs, int plane) {
-  int c, j, k, l;
-  int64_t best, sums[4][8];
+                         int *best_strength, int plane) {
+  int i;
+  uint64_t best, sums[4];
   int width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
   int height = plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
   const int bs = MI_SIZE;
   const int bslog = get_msb(bs);
-  int fb_size_log2 = get_msb(MAX_FB_SIZE);
-  int num_fb_ver = (height + (1 << fb_size_log2) - bs) >> fb_size_log2;
-  int num_fb_hor = (width + (1 << fb_size_log2) - bs) >> fb_size_log2;
 
   memset(sums, 0, sizeof(sums));
 
-  if (plane != AOM_PLANE_Y)
-    // Use a block size of MI_SIZE regardless of the subsampling.  This
-    // This is accurate enough to determine the best strength and
-    // we don't need to add SIMD optimisations for 4x4 blocks.
-    clpf_rdo(0, 0, rec, org, cm, bs, fb_size_log2, width >> bslog,
-             height >> bslog, sums, plane);
-  else
-    for (k = 0; k < num_fb_ver; k++) {
-      for (l = 0; l < num_fb_hor; l++) {
-        // Calculate the block size after frame border clipping
-        int h =
-            AOMMIN(height, (k + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
-        int w =
-            AOMMIN(width, (l + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
-        h += !h << fb_size_log2;
-        w += !w << fb_size_log2;
-        clpf_rdo(k << fb_size_log2, l << fb_size_log2, rec, org, cm, MI_SIZE,
-                 fb_size_log2, w >> bslog, h >> bslog, sums, plane);
-      }
-    }
+  clpf_rdo(rec, org, cm, bs, width >> bslog, height >> bslog, sums, plane);
 
-  // For fb_size == 128 skip blocks are included in the result.
-  if (plane == AOM_PLANE_Y) {
-    sums[1][1] += sums[1][5] - sums[1][4];
-    sums[1][2] += sums[1][6] - sums[1][4];
-    sums[1][3] += sums[1][7] - sums[1][4];
-  } else {  // Slightly favour unfiltered chroma
-    sums[0][0] -= sums[0][0] >> 7;
-  }
+  // Add a favourable bias for conservative strengths
+  for (i = 0; i < 4; i++) sums[i] -= sums[i] >> (7 + i);
 
-  for (j = 0; j < 4; j++) {
-    static const double lambda_square[] = {
-      // exp(x / 8.5)
-      1.0000, 1.1248, 1.2653, 1.4232, 1.6009, 1.8008, 2.0256, 2.2785,
-      2.5630, 2.8830, 3.2429, 3.6478, 4.1032, 4.6155, 5.1917, 5.8399,
-      6.5689, 7.3891, 8.3116, 9.3492, 10.516, 11.829, 13.306, 14.967,
-      16.836, 18.938, 21.302, 23.962, 26.953, 30.318, 34.103, 38.361,
-      43.151, 48.538, 54.598, 61.414, 69.082, 77.706, 87.408, 98.320,
-      110.59, 124.40, 139.93, 157.40, 177.05, 199.16, 224.02, 251.99,
-      283.45, 318.84, 358.65, 403.42, 453.79, 510.45, 574.17, 645.86,
-      726.49, 817.19, 919.22, 1033.9, 1163.0, 1308.2, 1471.6, 1655.3
-    };
+  // Tag the strength to the error
+  for (i = 0; i < 4; i++) sums[i] = (sums[i] << 2) + i;
 
-    // Estimate the bit costs and adjust the square errors
-    double lambda =
-        lambda_square[av1_get_qindex(&cm->seg, 0, cm->base_qindex) >> 2];
-    int i, cost = (int)((lambda * (sums[j][0] + 6 + 2 * (j > 0)) + 0.5));
-    for (i = 0; i < 4; i++)
-      sums[j][i] = ((sums[j][i] + (i && j) * cost) << 4) + j * 4 + i;
-  }
-
-  best = (int64_t)1 << 62;
-  for (c = 0; c < (plane == AOM_PLANE_Y ? 4 : 1); c++)
-    for (j = 0; j < 4; j++)
-      if ((!c || j) && sums[c][j] < best) best = sums[c][j];
-  best &= 15;
-  if (best_bs) *best_bs = (best > 3) * (5 + (best < 12) + (best < 8));
-  *best_strength = best ? 1 << ((best - 1) & 3) : 0;
+  // Identify the strength with the smallest error
+  best = (uint64_t)1 << 63;
+  for (i = 0; i < 4; i++)
+    if (sums[i] < best) best = sums[i];
+  *best_strength = best & 3 ? 1 << ((best - 1) & 3) : 0;
 }
diff --git a/av1/encoder/clpf_rdo.h b/av1/encoder/clpf_rdo.h
index f92f7d2..e137378 100644
--- a/av1/encoder/clpf_rdo.h
+++ b/av1/encoder/clpf_rdo.h
@@ -14,13 +14,8 @@
 
 #include "av1/common/reconinter.h"
 
-int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
-                      const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
-                      int block_size, int w, int h, unsigned int strength,
-                      unsigned int fb_size_log2, int8_t *res, int plane);
-
-void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
+void av1_clpf_test_plane(const YV12_BUFFER_CONFIG *rec,
                          const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
-                         int *best_strength, int *best_bs, int plane);
+                         int *best_strength, int plane);
 
 #endif
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index aca6e0b..7bc9710 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -17,10 +17,9 @@
 
 #include "av1/common/alloccommon.h"
 #if CONFIG_CDEF
-#include "aom/aom_image.h"
+#include "av1/common/cdef.h"
 #include "av1/common/clpf.h"
 #include "av1/encoder/clpf_rdo.h"
-#include "av1/common/dering.h"
 #endif  // CONFIG_CDEF
 #include "av1/common/filter.h"
 #include "av1/common/idct.h"
@@ -3526,57 +3525,18 @@
   }
 #if CONFIG_CDEF
   if (is_lossless_requested(&cpi->oxcf)) {
-    cm->dering_level = 0;
+    cm->dering_level = cm->clpf_strength_u = cm->clpf_strength_v = 0;
   } else {
-    cm->dering_level =
-        av1_dering_search(cm->frame_to_show, cpi->Source, cm, xd);
-    av1_dering_frame(cm->frame_to_show, cm, xd, cm->dering_level);
-  }
-  cm->clpf_strength_y = cm->clpf_strength_u = cm->clpf_strength_v = 0;
-  cm->clpf_size = CLPF_64X64;
+    // Find cm->dering_level, cm->clpf_strength_u and cm->clpf_strength_v
+    av1_cdef_search(cm->frame_to_show, cpi->Source, cm, xd);
 
-  // Allocate buffer to hold the status of all filter blocks:
-  // 1 = On, 0 = off, -1 = implicitly off
-  {
-    int size;
-    cm->clpf_stride = ((cm->frame_to_show->y_crop_width + MIN_FB_SIZE - 1) &
-                       ~(MIN_FB_SIZE - 1)) >>
-                      MIN_FB_SIZE_LOG2;
-    size = cm->clpf_stride *
-               ((cm->frame_to_show->y_crop_height + MIN_FB_SIZE - 1) &
-                ~(MIN_FB_SIZE - 1)) >>
-           MIN_FB_SIZE_LOG2;
-    CHECK_MEM_ERROR(cm, cm->clpf_blocks, aom_malloc(size));
-    memset(cm->clpf_blocks, CLPF_NOFLAG, size);
-  }
+    // Apply the filter
+    av1_cdef_frame(cm->frame_to_show, cm, xd, cm->dering_level,
+                   cm->clpf_strength_u, cm->clpf_strength_v);
 
-  if (!is_lossless_requested(&cpi->oxcf)) {
-    const YV12_BUFFER_CONFIG *const frame = cm->frame_to_show;
-
-    // Find the best strength and block size for the entire frame
-    int fb_size_log2, strength_y, strength_u, strength_v;
-    av1_clpf_test_frame(frame, cpi->Source, cm, &strength_y, &fb_size_log2,
-                        AOM_PLANE_Y);
-    av1_clpf_test_frame(frame, cpi->Source, cm, &strength_u, 0, AOM_PLANE_U);
-    av1_clpf_test_frame(frame, cpi->Source, cm, &strength_v, 0, AOM_PLANE_V);
-
-    if (strength_y) {
-      // Apply the filter using the chosen strength
-      cm->clpf_strength_y = strength_y - (strength_y == 4);
-      cm->clpf_size =
-          fb_size_log2 ? fb_size_log2 - MAX_FB_SIZE_LOG2 + 3 : CLPF_NOSIZE;
-      av1_clpf_frame(frame, cpi->Source, cm, cm->clpf_size != CLPF_NOSIZE,
-                     strength_y, 4 + cm->clpf_size, AOM_PLANE_Y,
-                     av1_clpf_decision);
-    }
-    if (strength_u) {
-      cm->clpf_strength_u = strength_u - (strength_u == 4);
-      av1_clpf_frame(frame, NULL, cm, 0, strength_u, 4, AOM_PLANE_U, NULL);
-    }
-    if (strength_v) {
-      cm->clpf_strength_v = strength_v - (strength_v == 4);
-      av1_clpf_frame(frame, NULL, cm, 0, strength_v, 4, AOM_PLANE_V, NULL);
-    }
+    // Pack the clpf chroma strengths into two bits each
+    cm->clpf_strength_u -= cm->clpf_strength_u == 4;
+    cm->clpf_strength_v -= cm->clpf_strength_v == 4;
   }
 #endif
 #if CONFIG_LOOP_RESTORATION
@@ -4980,11 +4940,6 @@
   if (cm->show_frame) dump_filtered_recon_frames(cpi);
 #endif  // DUMP_RECON_FRAMES
 
-#if CONFIG_CDEF
-  aom_free(cm->clpf_blocks);
-  cm->clpf_blocks = 0;
-#endif
-
   if (cm->seg.update_map) update_reference_segmentation_map(cpi);
 
   if (frame_is_intra_only(cm) == 0) {
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
new file mode 100644
index 0000000..4ff308e
--- /dev/null
+++ b/av1/encoder/pickcdef.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include "./aom_scale_rtcd.h"
+#include "aom/aom_integer.h"
+#include "av1/common/cdef.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/clpf_rdo.h"
+#include "av1/encoder/encoder.h"
+
+static double compute_dist(uint16_t *x, int xstride, uint16_t *y, int ystride,
+                           int nhb, int nvb, int coeff_shift) {
+  int i, j;
+  double sum;
+  sum = 0;
+  for (i = 0; i < nvb << 3; i++) {
+    for (j = 0; j < nhb << 3; j++) {
+      double tmp;
+      tmp = x[i * xstride + j] - y[i * ystride + j];
+      sum += tmp * tmp;
+    }
+  }
+  return sum / (double)(1 << 2 * coeff_shift);
+}
+
+void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
+                     AV1_COMMON *cm, MACROBLOCKD *xd) {
+  int r, c;
+  int sbr, sbc;
+  uint16_t *src;
+  uint16_t *ref_coeff;
+  dering_list dlist[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
+  int stride;
+  int bsize[3];
+  int dec[3];
+  int pli;
+  int level;
+  int dering_count;
+  int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
+  uint64_t best_tot_mse = 0;
+  int sb_count;
+  int nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
+  int nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
+  int *sb_index = aom_malloc(nvsb * nhsb * sizeof(*sb_index));
+  uint64_t(*mse)[DERING_STRENGTHS][CLPF_STRENGTHS] =
+      aom_malloc(sizeof(*mse) * nvsb * nhsb);
+  int clpf_damping = 3 + (cm->base_qindex >> 6);
+  int i;
+  int lev[DERING_REFINEMENT_LEVELS];
+  int best_lev[DERING_REFINEMENT_LEVELS];
+  int str[CLPF_REFINEMENT_LEVELS];
+  int best_str[CLPF_REFINEMENT_LEVELS];
+  double lambda = exp(cm->base_qindex / 36.0);
+  static int log2[] = { 0, 1, 2, 2 };
+
+  src = aom_memalign(32, sizeof(*src) * cm->mi_rows * cm->mi_cols * 64);
+  ref_coeff =
+      aom_memalign(32, sizeof(*ref_coeff) * cm->mi_rows * cm->mi_cols * 64);
+  av1_setup_dst_planes(xd->plane, frame, 0, 0);
+  for (pli = 0; pli < 3; pli++) {
+    dec[pli] = xd->plane[pli].subsampling_x;
+    bsize[pli] = OD_DERING_SIZE_LOG2 - dec[pli];
+  }
+  stride = cm->mi_cols << bsize[0];
+  for (r = 0; r < cm->mi_rows << bsize[0]; ++r) {
+    for (c = 0; c < cm->mi_cols << bsize[0]; ++c) {
+#if CONFIG_AOM_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
+        src[r * stride + c] = CONVERT_TO_SHORTPTR(
+            xd->plane[0].dst.buf)[r * xd->plane[0].dst.stride + c];
+        ref_coeff[r * stride + c] =
+            CONVERT_TO_SHORTPTR(ref->y_buffer)[r * ref->y_stride + c];
+      } else {
+#endif
+        src[r * stride + c] =
+            xd->plane[0].dst.buf[r * xd->plane[0].dst.stride + c];
+        ref_coeff[r * stride + c] = ref->y_buffer[r * ref->y_stride + c];
+#if CONFIG_AOM_HIGHBITDEPTH
+      }
+#endif
+    }
+  }
+  sb_count = 0;
+  for (sbr = 0; sbr < nvsb; sbr++) {
+    for (sbc = 0; sbc < nhsb; sbc++) {
+      int nvb, nhb;
+      int gi;
+      DECLARE_ALIGNED(32, uint16_t, dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]);
+      DECLARE_ALIGNED(32, uint16_t,
+                      tmp_dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]);
+      nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
+      nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
+      dering_count = sb_compute_dering_list(cm, sbr * MAX_MIB_SIZE,
+                                            sbc * MAX_MIB_SIZE, dlist);
+      if (dering_count == 0) continue;
+      for (gi = 0; gi < DERING_STRENGTHS; gi++) {
+        int threshold;
+        DECLARE_ALIGNED(32, uint16_t, inbuf[OD_DERING_INBUF_SIZE]);
+        uint16_t *in;
+        int j;
+        level = dering_level_table[gi];
+        threshold = level << coeff_shift;
+        for (r = 0; r < nvb << bsize[0]; r++) {
+          for (c = 0; c < nhb << bsize[0]; c++) {
+            dst[(r * MAX_MIB_SIZE << bsize[0]) + c] =
+                src[((sbr * MAX_MIB_SIZE << bsize[0]) + r) * stride +
+                    (sbc * MAX_MIB_SIZE << bsize[0]) + c];
+          }
+        }
+        in = inbuf + OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER;
+        /* We avoid filtering the pixels for which some of the pixels to
+           average are outside the frame. We could change the filter
+           instead, but it would add special cases for any future
+           vectorization. */
+        for (i = 0; i < OD_DERING_INBUF_SIZE; i++)
+          inbuf[i] = OD_DERING_VERY_LARGE;
+        for (i = -OD_FILT_VBORDER * (sbr != 0);
+             i < (nvb << bsize[0]) + OD_FILT_VBORDER * (sbr != nvsb - 1); i++) {
+          for (j = -OD_FILT_HBORDER * (sbc != 0);
+               j < (nhb << bsize[0]) + OD_FILT_HBORDER * (sbc != nhsb - 1);
+               j++) {
+            uint16_t *x;
+            x = &src[(sbr * stride * MAX_MIB_SIZE << bsize[0]) +
+                     (sbc * MAX_MIB_SIZE << bsize[0])];
+            in[i * OD_FILT_BSTRIDE + j] = x[i * stride + j];
+          }
+        }
+        for (i = 0; i < CLPF_STRENGTHS; i++) {
+          od_dering(tmp_dst, in, 0, dir, 0, dlist, dering_count, threshold,
+                    i + (i == 3), clpf_damping, coeff_shift, 0);
+          copy_dering_16bit_to_16bit(dst, MAX_MIB_SIZE << bsize[0], tmp_dst,
+                                     dlist, dering_count, bsize[0]);
+          mse[sb_count][gi][i] = (int)compute_dist(
+              dst, MAX_MIB_SIZE << bsize[0],
+              &ref_coeff[(sbr * stride * MAX_MIB_SIZE << bsize[0]) +
+                         (sbc * MAX_MIB_SIZE << bsize[0])],
+              stride, nhb, nvb, coeff_shift);
+        }
+        sb_index[sb_count] =
+            MAX_MIB_SIZE * sbr * cm->mi_stride + MAX_MIB_SIZE * sbc;
+      }
+      sb_count++;
+    }
+  }
+  best_tot_mse = (uint64_t)1 << 63;
+
+  int l0;
+  for (l0 = 0; l0 < DERING_STRENGTHS; l0++) {
+    int l1;
+    lev[0] = l0;
+    for (l1 = l0; l1 < DERING_STRENGTHS; l1++) {
+      int l2;
+      lev[1] = l1;
+      for (l2 = l1; l2 < DERING_STRENGTHS; l2++) {
+        int l3;
+        lev[2] = l2;
+        for (l3 = l2; l3 < DERING_STRENGTHS; l3++) {
+          int cs0;
+          lev[3] = l3;
+          for (cs0 = 0; cs0 < CLPF_STRENGTHS; cs0++) {
+            int cs1;
+            str[0] = cs0;
+            for (cs1 = cs0; cs1 < CLPF_STRENGTHS; cs1++) {
+              uint64_t tot_mse = 0;
+              str[1] = cs1;
+              for (i = 0; i < sb_count; i++) {
+                int gi;
+                int cs;
+                uint64_t best_mse = (uint64_t)1 << 63;
+                for (gi = 0; gi < DERING_REFINEMENT_LEVELS; gi++) {
+                  for (cs = 0; cs < CLPF_REFINEMENT_LEVELS; cs++) {
+                    if (mse[i][lev[gi]][str[cs]] < best_mse) {
+                      best_mse = mse[i][lev[gi]][str[cs]];
+                    }
+                  }
+                }
+                tot_mse += best_mse;
+              }
+
+              // Add the bit cost
+              int dering_diffs = 0, clpf_diffs = 0;
+              for (i = 1; i < DERING_REFINEMENT_LEVELS; i++)
+                dering_diffs += lev[i] != lev[i - 1];
+              for (i = 1; i < CLPF_REFINEMENT_LEVELS; i++)
+                clpf_diffs += str[i] != str[i - 1];
+              tot_mse += (uint64_t)(sb_count * lambda *
+                                    (log2[dering_diffs] + log2[clpf_diffs]));
+
+              if (tot_mse < best_tot_mse) {
+                for (i = 0; i < DERING_REFINEMENT_LEVELS; i++)
+                  best_lev[i] = lev[i];
+                for (i = 0; i < CLPF_REFINEMENT_LEVELS; i++)
+                  best_str[i] = str[i];
+                best_tot_mse = tot_mse;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  for (i = 0; i < DERING_REFINEMENT_LEVELS; i++) lev[i] = best_lev[i];
+  for (i = 0; i < CLPF_REFINEMENT_LEVELS; i++) str[i] = best_str[i];
+
+  id_to_levels(lev, str, levels_to_id(lev, str));  // Pack tables
+  cdef_get_bits(lev, str, &cm->dering_bits, &cm->clpf_bits);
+
+  for (i = 0; i < sb_count; i++) {
+    int gi, cs;
+    int best_gi, best_clpf;
+    uint64_t best_mse = (uint64_t)1 << 63;
+    best_gi = best_clpf = 0;
+    for (gi = 0; gi < (1 << cm->dering_bits); gi++) {
+      for (cs = 0; cs < (1 << cm->clpf_bits); cs++) {
+        if (mse[i][lev[gi]][str[cs]] < best_mse) {
+          best_gi = gi;
+          best_clpf = cs;
+          best_mse = mse[i][lev[gi]][str[cs]];
+        }
+      }
+    }
+    cm->mi_grid_visible[sb_index[i]]->mbmi.dering_gain = best_gi;
+    cm->mi_grid_visible[sb_index[i]]->mbmi.clpf_strength = best_clpf;
+  }
+
+  aom_free(src);
+  aom_free(ref_coeff);
+  aom_free(mse);
+  aom_free(sb_index);
+
+  av1_clpf_test_plane(cm->frame_to_show, ref, cm, &cm->clpf_strength_u,
+                      AOM_PLANE_U);
+  av1_clpf_test_plane(cm->frame_to_show, ref, cm, &cm->clpf_strength_v,
+                      AOM_PLANE_V);
+  cm->dering_level = levels_to_id(best_lev, best_str);
+}
diff --git a/av1/encoder/pickdering.c b/av1/encoder/pickdering.c
deleted file mode 100644
index dce7686..0000000
--- a/av1/encoder/pickdering.c
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <string.h>
-#include <math.h>
-
-#include "./aom_scale_rtcd.h"
-#include "av1/common/dering.h"
-#include "av1/common/onyxc_int.h"
-#include "av1/common/reconinter.h"
-#include "av1/encoder/encoder.h"
-#include "aom/aom_integer.h"
-
-static double compute_dist(int16_t *x, int xstride, int16_t *y, int ystride,
-                           int nhb, int nvb, int coeff_shift) {
-  int i, j;
-  double sum;
-  sum = 0;
-  for (i = 0; i < nvb << 3; i++) {
-    for (j = 0; j < nhb << 3; j++) {
-      double tmp;
-      tmp = x[i * xstride + j] - y[i * ystride + j];
-      sum += tmp * tmp;
-    }
-  }
-  return sum / (double)(1 << 2 * coeff_shift);
-}
-
-int av1_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
-                      AV1_COMMON *cm, MACROBLOCKD *xd) {
-  int r, c;
-  int sbr, sbc;
-  int nhsb, nvsb;
-  int16_t *src;
-  int16_t *ref_coeff;
-  dering_list dlist[MAX_MIB_SIZE * MAX_MIB_SIZE];
-  int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
-  int stride;
-  int bsize[3];
-  int dec[3];
-  int pli;
-  int level;
-  int best_level;
-  int dering_count;
-  int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
-  src = aom_malloc(sizeof(*src) * cm->mi_rows * cm->mi_cols * 64);
-  ref_coeff = aom_malloc(sizeof(*ref_coeff) * cm->mi_rows * cm->mi_cols * 64);
-  av1_setup_dst_planes(xd->plane, frame, 0, 0);
-  for (pli = 0; pli < 3; pli++) {
-    dec[pli] = xd->plane[pli].subsampling_x;
-    bsize[pli] = OD_DERING_SIZE_LOG2 - dec[pli];
-  }
-  stride = cm->mi_cols << bsize[0];
-  for (r = 0; r < cm->mi_rows << bsize[0]; ++r) {
-    for (c = 0; c < cm->mi_cols << bsize[0]; ++c) {
-#if CONFIG_AOM_HIGHBITDEPTH
-      if (cm->use_highbitdepth) {
-        src[r * stride + c] = CONVERT_TO_SHORTPTR(
-            xd->plane[0].dst.buf)[r * xd->plane[0].dst.stride + c];
-        ref_coeff[r * stride + c] =
-            CONVERT_TO_SHORTPTR(ref->y_buffer)[r * ref->y_stride + c];
-      } else {
-#endif
-        src[r * stride + c] =
-            xd->plane[0].dst.buf[r * xd->plane[0].dst.stride + c];
-        ref_coeff[r * stride + c] = ref->y_buffer[r * ref->y_stride + c];
-#if CONFIG_AOM_HIGHBITDEPTH
-      }
-#endif
-    }
-  }
-  nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
-  nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
-  /* Pick a base threshold based on the quantizer. The threshold will then be
-     adjusted on a 64x64 basis. We use a threshold of the form T = a*Q^b,
-     where a and b are derived empirically trying to optimize rate-distortion
-     at different quantizer settings. */
-  best_level = AOMMIN(
-      MAX_DERING_LEVEL - 1,
-      (int)floor(.5 +
-                 .45 * pow(av1_ac_quant(cm->base_qindex, 0, cm->bit_depth) >>
-                               (cm->bit_depth - 8),
-                           0.6)));
-  for (sbr = 0; sbr < nvsb; sbr++) {
-    for (sbc = 0; sbc < nhsb; sbc++) {
-      int nvb, nhb;
-      int gi;
-      int best_gi;
-      int32_t best_mse = INT32_MAX;
-      int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
-      int16_t tmp_dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
-      nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
-      nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
-      dering_count = sb_compute_dering_list(cm, sbr * MAX_MIB_SIZE,
-                                            sbc * MAX_MIB_SIZE, dlist);
-      if (dering_count == 0) continue;
-      best_gi = 0;
-      for (gi = 0; gi < DERING_REFINEMENT_LEVELS; gi++) {
-        int cur_mse;
-        int threshold;
-        int16_t inbuf[OD_DERING_INBUF_SIZE];
-        int16_t *in;
-        int i, j;
-        level = compute_level_from_index(best_level, gi);
-        threshold = level << coeff_shift;
-        for (r = 0; r < nvb << bsize[0]; r++) {
-          for (c = 0; c < nhb << bsize[0]; c++) {
-            dst[(r * MAX_MIB_SIZE << bsize[0]) + c] =
-                src[((sbr * MAX_MIB_SIZE << bsize[0]) + r) * stride +
-                    (sbc * MAX_MIB_SIZE << bsize[0]) + c];
-          }
-        }
-        in = inbuf + OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER;
-        /* We avoid filtering the pixels for which some of the pixels to average
-           are outside the frame. We could change the filter instead, but it
-           would
-           add special cases for any future vectorization. */
-        for (i = 0; i < OD_DERING_INBUF_SIZE; i++)
-          inbuf[i] = OD_DERING_VERY_LARGE;
-        for (i = -OD_FILT_VBORDER * (sbr != 0);
-             i < (nvb << bsize[0]) + OD_FILT_VBORDER * (sbr != nvsb - 1); i++) {
-          for (j = -OD_FILT_HBORDER * (sbc != 0);
-               j < (nhb << bsize[0]) + OD_FILT_HBORDER * (sbc != nhsb - 1);
-               j++) {
-            int16_t *x;
-            x = &src[(sbr * stride * MAX_MIB_SIZE << bsize[0]) +
-                     (sbc * MAX_MIB_SIZE << bsize[0])];
-            in[i * OD_FILT_BSTRIDE + j] = x[i * stride + j];
-          }
-        }
-        od_dering(tmp_dst, in, 0, dir, 0, dlist, dering_count, threshold,
-                  coeff_shift);
-        copy_dering_16bit_to_16bit(dst, MAX_MIB_SIZE << bsize[0], tmp_dst,
-                                   dlist, dering_count, bsize[0]);
-        cur_mse = (int)compute_dist(
-            dst, MAX_MIB_SIZE << bsize[0],
-            &ref_coeff[(sbr * stride * MAX_MIB_SIZE << bsize[0]) +
-                       (sbc * MAX_MIB_SIZE << bsize[0])],
-            stride, nhb, nvb, coeff_shift);
-        if (cur_mse < best_mse) {
-          best_gi = gi;
-          best_mse = cur_mse;
-        }
-      }
-      cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
-                          MAX_MIB_SIZE * sbc]
-          ->mbmi.dering_gain = best_gi;
-    }
-  }
-  aom_free(src);
-  aom_free(ref_coeff);
-  return best_level;
-}