SSIM-like contrast term for CDEF distortion function

high-latency, cpu=0:

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
0.0378 |  0.1946 |  0.1385 |  -0.1159 | -0.2058 | -0.2085 |     0.1353

low-latency, cpu=0:

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
0.2388 |  0.2234 |  0.3290 |   0.0623 | -0.1716 | -0.1704 |     0.2542

low-latency, cpu=4:

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
0.4089 |  0.3477 |  0.6132 |   0.1729 | -0.1905 | -0.1610 |     0.5522

Change-Id: I35b8596667d82a127847b209416ad83e3b839a9a
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index 9113b08..f224ea1 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -90,6 +90,35 @@
   }
 }
 
+static INLINE uint64_t dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
+                                      int sstride, int coeff_shift) {
+  uint64_t svar = 0;
+  uint64_t dvar = 0;
+  uint64_t sum_s = 0;
+  uint64_t sum_d = 0;
+  uint64_t sum_s2 = 0;
+  uint64_t sum_d2 = 0;
+  uint64_t sum_sd = 0;
+  int i, j;
+  for (i = 0; i < 8; i++) {
+    for (j = 0; j < 8; j++) {
+      sum_s += src[i * sstride + j];
+      sum_d += dst[i * dstride + j];
+      sum_s2 += src[i * sstride + j] * src[i * sstride + j];
+      sum_d2 += dst[i * dstride + j] * dst[i * dstride + j];
+      sum_sd += src[i * sstride + j] * dst[i * dstride + j];
+    }
+  }
+  /* Compute the variance -- the calculation cannot go negative. */
+  svar = sum_s2 - ((sum_s * sum_s + 32) >> 6);
+  dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6);
+  return (uint64_t)floor(
+      .5 +
+      (sum_d2 + sum_s2 - 2 * sum_sd) * .5 *
+          (svar + dvar + (400 << 2 * coeff_shift)) /
+          (sqrt((20000 << 4 * coeff_shift) + svar * (double)dvar)));
+}
+
 static INLINE uint64_t mse_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
                                      int sstride) {
   uint64_t sum = 0;
@@ -117,17 +146,22 @@
 }
 
 /* Compute MSE only on the blocks we filtered. */
-uint64_t compute_dering_mse(uint16_t *dst, int dstride, uint16_t *src,
-                            dering_list *dlist, int dering_count,
-                            BLOCK_SIZE bsize, int coeff_shift) {
+uint64_t compute_dering_dist(uint16_t *dst, int dstride, uint16_t *src,
+                             dering_list *dlist, int dering_count,
+                             BLOCK_SIZE bsize, int coeff_shift, int pli) {
   uint64_t sum = 0;
   int bi, bx, by;
   if (bsize == BLOCK_8X8) {
     for (bi = 0; bi < dering_count; bi++) {
       by = dlist[bi].by;
       bx = dlist[bi].bx;
-      sum += mse_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
-                           &src[bi << (2 * 3)], 8);
+      if (pli == 0) {
+        sum += dist_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
+                              &src[bi << (2 * 3)], 8, coeff_shift);
+      } else {
+        sum += mse_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
+                             &src[bi << (2 * 3)], 8);
+      }
     }
   } else {
     for (bi = 0; bi < dering_count; bi++) {
@@ -278,12 +312,12 @@
                     dering_count, threshold,
                     clpf_strength + (clpf_strength == 3), clpf_damping,
                     coeff_shift, clpf_strength != 0, 1);
-          mse[pli][sb_count][gi] = compute_dering_mse(
+          mse[pli][sb_count][gi] = compute_dering_dist(
               ref_coeff[pli] +
                   (sbr * MAX_MIB_SIZE << mi_high_l2[pli]) * stride[pli] +
                   (sbc * MAX_MIB_SIZE << mi_wide_l2[pli]),
               stride[pli], tmp_dst, dlist, dering_count, bsize[pli],
-              coeff_shift);
+              coeff_shift, pli);
           sb_index[sb_count] =
               MAX_MIB_SIZE * sbr * cm->mi_stride + MAX_MIB_SIZE * sbc;
         }