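Remove redundant dering passes in CDEF search

od_dering() gains a skip_dering argument. When it is nonzero, the
direction search and the dering filter are skipped and only the CLPF
stage runs.

In the encoder search (pickcdef.c) the source is now copied into `in`
and the dering filter is run only when clpf_strength == 0; the result
in tmp_dst is then copied back into `in`. Calls with a nonzero
clpf_strength pass skip_dering = 1 and apply CLPF to that buffer.

cdef.c passes skip_dering = 0. Inside od_dering() the copy of the
dering output back into `in` before CLPF is now also skipped when
threshold is 0.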

Change-Id: I3b932544426cb9aeb63ab4e1f516a4d8195d0702
diff --git a/av1/common/cdef.c b/av1/common/cdef.c
index 71ce202..c578eb9 100644
--- a/av1/common/cdef.c
+++ b/av1/common/cdef.c
@@ -382,7 +382,7 @@
         od_dering(dst,
                   &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER],
                   dec[pli], dir, NULL, var, pli, dlist, dering_count, threshold,
-                  clpf_strength, clpf_damping, coeff_shift);
+                  clpf_strength, clpf_damping, coeff_shift, 0);
 #if CONFIG_AOM_HIGHBITDEPTH
         if (cm->use_highbitdepth) {
           copy_dering_16bit_to_16bit(
diff --git a/av1/common/od_dering.c b/av1/common/od_dering.c
index 849c9b1..65ea4f2 100644
--- a/av1/common/od_dering.c
+++ b/av1/common/od_dering.c
@@ -243,7 +243,8 @@
                int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int *dirinit,
                int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                dering_list *dlist, int dering_count, int threshold,
-               int clpf_strength, int clpf_damping, int coeff_shift) {
+               int clpf_strength, int clpf_damping, int coeff_shift,
+               int skip_dering) {
   int bi;
   int bx;
   int by;
@@ -252,45 +253,49 @@
     od_filter_dering_direction_4x4, od_filter_dering_direction_8x8
   };
   bsize = OD_DERING_SIZE_LOG2 - xdec;
-  if (pli == 0) {
-    if (!dirinit || !*dirinit) {
+  if (!skip_dering) {
+    if (pli == 0) {
+      if (!dirinit || !*dirinit) {
+        for (bi = 0; bi < dering_count; bi++) {
+          by = dlist[bi].by;
+          bx = dlist[bi].bx;
+          dir[by][bx] =
+              od_dir_find8(&in[8 * by * OD_FILT_BSTRIDE + 8 * bx],
+                           OD_FILT_BSTRIDE, &var[by][bx], coeff_shift);
+        }
+        if (dirinit) *dirinit = 1;
+      }
       for (bi = 0; bi < dering_count; bi++) {
         by = dlist[bi].by;
         bx = dlist[bi].bx;
-        dir[by][bx] = od_dir_find8(&in[8 * by * OD_FILT_BSTRIDE + 8 * bx],
-                                   OD_FILT_BSTRIDE, &var[by][bx], coeff_shift);
+        /* Deringing orthogonal to the direction uses a tighter threshold
+           because we want to be conservative. We've presumably already
+           achieved some deringing, so the amount of change is expected
+           to be low. Also, since we might be filtering across an edge, we
+           want to make sure not to blur it. That being said, we might want
+           to be a little bit more aggressive on pure horizontal/vertical
+           since the ringing there tends to be directional, so it doesn't
+           get removed by the directional filtering. */
+        (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
+            &y[bi << 2 * bsize], 1 << bsize,
+            &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
+            od_adjust_thresh(threshold, var[by][bx]), dir[by][bx]);
       }
-      if (dirinit) *dirinit = 1;
-    }
-    for (bi = 0; bi < dering_count; bi++) {
-      by = dlist[bi].by;
-      bx = dlist[bi].bx;
-      /* Deringing orthogonal to the direction uses a tighter threshold
-         because we want to be conservative. We've presumably already
-         achieved some deringing, so the amount of change is expected
-         to be low. Also, since we might be filtering across an edge, we
-         want to make sure not to blur it. That being said, we might want
-         to be a little bit more aggressive on pure horizontal/vertical
-         since the ringing there tends to be directional, so it doesn't
-         get removed by the directional filtering. */
-      (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
-          &y[bi << 2 * bsize], 1 << bsize,
-          &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
-          od_adjust_thresh(threshold, var[by][bx]), dir[by][bx]);
-    }
-  } else {
-    for (bi = 0; bi < dering_count; bi++) {
-      by = dlist[bi].by;
-      bx = dlist[bi].bx;
-      (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
-          &y[bi << 2 * bsize], 1 << bsize,
-          &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold,
-          dir[by][bx]);
+    } else {
+      for (bi = 0; bi < dering_count; bi++) {
+        by = dlist[bi].by;
+        bx = dlist[bi].bx;
+        (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
+            &y[bi << 2 * bsize], 1 << bsize,
+            &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold,
+            dir[by][bx]);
+      }
     }
   }
   if (!clpf_strength) return;
-  copy_dering_16bit_to_16bit(in, OD_FILT_BSTRIDE, y, dlist, dering_count,
-                             bsize);
+  if (threshold && !skip_dering)
+    copy_dering_16bit_to_16bit(in, OD_FILT_BSTRIDE, y, dlist, dering_count,
+                               bsize);
   for (bi = 0; bi < dering_count; bi++) {
     by = dlist[bi].by;
     bx = dlist[bi].bx;
diff --git a/av1/common/od_dering.h b/av1/common/od_dering.h
index 60d17bc..a3efec2 100644
--- a/av1/common/od_dering.h
+++ b/av1/common/od_dering.h
@@ -50,7 +50,8 @@
                int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int *dirinit,
                int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                dering_list *dlist, int dering_count, int threshold,
-               int clpf_strength, int clpf_damping, int coeff_shift);
+               int clpf_strength, int clpf_damping, int coeff_shift,
+               int skip_dering);
 int od_filter_dering_direction_4x4_c(uint16_t *y, int ystride,
                                      const uint16_t *in, int threshold,
                                      int dir);
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index 9a9813c..bcb82b7 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -257,15 +257,20 @@
               (nvb << bsize[pli]) + OD_FILT_VBORDER * (sbr != nvsb - 1) + yoff;
           int xsize =
               (nhb << bsize[pli]) + OD_FILT_HBORDER * (sbc != nhsb - 1) + xoff;
-          copy_sb16_16(&in[(-yoff * OD_FILT_BSTRIDE - xoff)], OD_FILT_BSTRIDE,
-                       src[pli], (sbr * MAX_MIB_SIZE << bsize[pli]) - yoff,
-                       (sbc * MAX_MIB_SIZE << bsize[pli]) - xoff, stride[pli],
-                       ysize, xsize);
           clpf_strength = gi % CLPF_STRENGTHS;
+          if (clpf_strength == 0)
+            copy_sb16_16(&in[(-yoff * OD_FILT_BSTRIDE - xoff)], OD_FILT_BSTRIDE,
+                         src[pli], (sbr * MAX_MIB_SIZE << bsize[pli]) - yoff,
+                         (sbc * MAX_MIB_SIZE << bsize[pli]) - xoff, stride[pli],
+                         ysize, xsize);
           od_dering(tmp_dst, in, dec[pli], dir, &dirinit, var, pli, dlist,
                     dering_count, threshold,
                     clpf_strength + (clpf_strength == 3), clpf_damping,
-                    coeff_shift);
+                    coeff_shift, clpf_strength != 0);
+          if (clpf_strength == 0) {
+            copy_dering_16bit_to_16bit(in, OD_FILT_BSTRIDE, tmp_dst, dlist,
+                                       dering_count, bsize[pli]);
+          }
           mse[pli][sb_count][gi] = compute_dering_mse(
               ref_coeff[pli] +
                   (sbr * MAX_MIB_SIZE << bsize[pli]) * stride[pli] +