Jointly optimizing deringing and clpf

We now signal joint deringing/CLPF strengths (a frame-level table plus a
per-superblock index into it) and use a greedy algorithm for the search.

low-latency, cpu-used=4:

ll4-cdef@2017-03-22T03:42:10.815Z -> ll4-cdef-newsearch-var-header-newlambda-refine4@2017-03-22T15:56:46.471Z

   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.0792 |  0.3551 |  0.4393 |  -0.0108 | -0.1338 | -0.0141 |     0.1452

Change-Id: I619ae1c7c7d7ec04fe993cabc5773b07c3f5b201
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 3bbce0f..fdc7ac8 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -2785,14 +2785,10 @@
   if (bsize == BLOCK_64X64 &&
 #endif  // CONFIG_EXT_PARTITION
              !sb_all_skip(cm, mi_row, mi_col)) {
-    if (cm->dering_bits)
+    if (cm->cdef_bits != 0)
       aom_write_literal(w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]
-                               ->mbmi.dering_gain,
-                        cm->dering_bits);
-    if (cm->clpf_bits)
-      aom_write_literal(w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]
-                               ->mbmi.clpf_strength,
-                        cm->clpf_bits);
+                               ->mbmi.cdef_strength,
+                        cm->cdef_bits);
   }
 #endif
 }
@@ -3496,7 +3492,11 @@
 
 #if CONFIG_CDEF
 static void encode_cdef(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
-  aom_wb_write_literal(wb, cm->dering_level, DERING_LEVEL_BITS);
+  int i;
+  aom_wb_write_literal(wb, cm->cdef_bits, 2);  /* bits per block index */
+  for (i = 0; i < cm->nb_cdef_strengths; i++) {  /* strength table */
+    aom_wb_write_literal(wb, cm->cdef_strengths[i], CDEF_STRENGTH_BITS);
+  }
   aom_wb_write_literal(wb, cm->clpf_strength_u, 2);
   aom_wb_write_literal(wb, cm->clpf_strength_v, 2);
 }
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index f3157c4..757d589 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -3522,14 +3522,17 @@
   }
 #if CONFIG_CDEF
   if (is_lossless_requested(&cpi->oxcf)) {
-    cm->dering_level = cm->clpf_strength_u = cm->clpf_strength_v = 0;
+    cm->clpf_strength_u = cm->clpf_strength_v = 0;
+    cm->cdef_bits = 0;
+    cm->cdef_strengths[0] = 0;
+    cm->nb_cdef_strengths = 1;
   } else {
     // Find cm->dering_level, cm->clpf_strength_u and cm->clpf_strength_v
     av1_cdef_search(cm->frame_to_show, cpi->Source, cm, xd);
 
     // Apply the filter
-    av1_cdef_frame(cm->frame_to_show, cm, xd, cm->dering_level,
-                   cm->clpf_strength_u, cm->clpf_strength_v);
+    av1_cdef_frame(cm->frame_to_show, cm, xd, cm->clpf_strength_u,
+                   cm->clpf_strength_v);
 
     // Pack the clpf chroma strengths into two bits each
     cm->clpf_strength_u -= cm->clpf_strength_u == 4;
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index 5523a9c..241d27a 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -20,6 +20,64 @@
 #include "av1/encoder/clpf_rdo.h"
 #include "av1/encoder/encoder.h"
 
+#define TOTAL_STRENGTHS (DERING_STRENGTHS * CLPF_STRENGTHS)
+
+/* Finds the strength id that, added to the nb_strengths ids already in lev,
+   minimizes the summed per-row minimum mse; stores it in lev[nb_strengths]. */
+static uint64_t search_one(int *lev, int nb_strengths,
+                           uint64_t mse[][TOTAL_STRENGTHS], int sb_count) {
+  uint64_t tot_mse[TOTAL_STRENGTHS];
+  int i, j;
+  uint64_t best_tot_mse = (uint64_t)1 << 63;  /* "no total yet" sentinel */
+  int best_id = 0;
+  memset(tot_mse, 0, sizeof(tot_mse));
+  for (i = 0; i < sb_count; i++) {
+    int gi;
+    uint64_t best_mse = (uint64_t)1 << 63;
+    /* Lowest mse this row reaches with the already selected options. */
+    for (gi = 0; gi < nb_strengths; gi++) {
+      if (mse[i][lev[gi]] < best_mse) {
+        best_mse = mse[i][lev[gi]];
+      }
+    }
+    /* Accumulate the row's best mse if candidate j were added as well. */
+    for (j = 0; j < TOTAL_STRENGTHS; j++) {
+      uint64_t best = best_mse;
+      if (mse[i][j] < best) best = mse[i][j];
+      tot_mse[j] += best;
+    }
+  }
+  for (j = 0; j < TOTAL_STRENGTHS; j++) {
+    if (tot_mse[j] < best_tot_mse) {  /* strict <: lowest id wins ties */
+      best_tot_mse = tot_mse[j];
+      best_id = j;
+    }
+  }
+  lev[nb_strengths] = best_id;  /* append the winning candidate */
+  return best_tot_mse;  /* total mse with the new option included */
+}
+
+/* Greedily picks nb_strengths ids into best_lev; returns the last total mse. */
+static uint64_t joint_strength_search(int *best_lev, int nb_strengths,
+                                      uint64_t mse[][TOTAL_STRENGTHS],
+                                      int sb_count) {
+  uint64_t best_tot_mse;
+  int i;
+  best_tot_mse = (uint64_t)1 << 63;
+  /* Greedy search: add one strength option at a time. */
+  for (i = 0; i < nb_strengths; i++) {
+    best_tot_mse = search_one(best_lev, i, mse, sb_count);
+  }
+  /* Refinement: 4 * nb_strengths times, drop the oldest option (best_lev[0]),
+     shift the rest down and re-search the last slot given the others. */
+  for (i = 0; i < 4 * nb_strengths; i++) {
+    int j;
+    for (j = 0; j < nb_strengths - 1; j++) best_lev[j] = best_lev[j + 1];
+    best_tot_mse = search_one(best_lev, nb_strengths - 1, mse, sb_count);
+  }
+  return best_tot_mse;
+}
+
 static double compute_dist(uint16_t *x, int xstride, uint16_t *y, int ystride,
                            int nhb, int nvb, int coeff_shift) {
   int i, j;
@@ -50,21 +108,24 @@
   int level;
   int dering_count;
   int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
-  uint64_t best_tot_mse = 0;
+  uint64_t best_tot_mse = (uint64_t)1 << 63;
+  uint64_t tot_mse;
   int sb_count;
   int nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
   int nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
   int *sb_index = aom_malloc(nvsb * nhsb * sizeof(*sb_index));
-  uint64_t(*mse)[DERING_STRENGTHS][CLPF_STRENGTHS] =
+  uint64_t(*mse)[DERING_STRENGTHS * CLPF_STRENGTHS] =
       aom_malloc(sizeof(*mse) * nvsb * nhsb);
   int clpf_damping = 3 + (cm->base_qindex >> 6);
   int i;
-  int lev[DERING_REFINEMENT_LEVELS];
-  int best_lev[DERING_REFINEMENT_LEVELS];
-  int str[CLPF_REFINEMENT_LEVELS];
-  int best_str[CLPF_REFINEMENT_LEVELS];
-  double lambda = exp(cm->base_qindex / 36.0);
-  static int log2[] = { 0, 1, 2, 2 };
+  int best_lev[CDEF_MAX_STRENGTHS];
+  int nb_strengths;
+  int nb_strength_bits;
+  int quantizer;
+  double lambda;
+  quantizer =
+      av1_ac_quant(cm->base_qindex, 0, cm->bit_depth) >> (cm->bit_depth - 8);
+  lambda = .12 * quantizer * quantizer / 256.;
 
   src = aom_memalign(32, sizeof(*src) * cm->mi_rows * cm->mi_cols * 64);
   ref_coeff =
@@ -143,7 +204,7 @@
                     i + (i == 3), clpf_damping, coeff_shift);
           copy_dering_16bit_to_16bit(dst, MAX_MIB_SIZE << bsize[0], tmp_dst,
                                      dlist, dering_count, bsize[0]);
-          mse[sb_count][gi][i] = (int)compute_dist(
+          mse[sb_count][gi * CLPF_STRENGTHS + i] = (int)compute_dist(
               dst, MAX_MIB_SIZE << bsize[0],
               &ref_coeff[(sbr * stride * MAX_MIB_SIZE << bsize[0]) +
                          (sbc * MAX_MIB_SIZE << bsize[0])],
@@ -155,85 +216,38 @@
       sb_count++;
     }
   }
-  best_tot_mse = (uint64_t)1 << 63;
 
-  int l0;
-  for (l0 = 0; l0 < DERING_STRENGTHS; l0++) {
-    int l1;
-    lev[0] = l0;
-    for (l1 = l0; l1 < DERING_STRENGTHS; l1++) {
-      int l2;
-      lev[1] = l1;
-      for (l2 = l1; l2 < DERING_STRENGTHS; l2++) {
-        int l3;
-        lev[2] = l2;
-        for (l3 = l2; l3 < DERING_STRENGTHS; l3++) {
-          int cs0;
-          lev[3] = l3;
-          for (cs0 = 0; cs0 < CLPF_STRENGTHS; cs0++) {
-            int cs1;
-            str[0] = cs0;
-            for (cs1 = cs0; cs1 < CLPF_STRENGTHS; cs1++) {
-              uint64_t tot_mse = 0;
-              str[1] = cs1;
-              for (i = 0; i < sb_count; i++) {
-                int gi;
-                int cs;
-                uint64_t best_mse = (uint64_t)1 << 63;
-                for (gi = 0; gi < DERING_REFINEMENT_LEVELS; gi++) {
-                  for (cs = 0; cs < CLPF_REFINEMENT_LEVELS; cs++) {
-                    if (mse[i][lev[gi]][str[cs]] < best_mse) {
-                      best_mse = mse[i][lev[gi]][str[cs]];
-                    }
-                  }
-                }
-                tot_mse += best_mse;
-              }
-
-              // Add the bit cost
-              int dering_diffs = 0, clpf_diffs = 0;
-              for (i = 1; i < DERING_REFINEMENT_LEVELS; i++)
-                dering_diffs += lev[i] != lev[i - 1];
-              for (i = 1; i < CLPF_REFINEMENT_LEVELS; i++)
-                clpf_diffs += str[i] != str[i - 1];
-              tot_mse += (uint64_t)(sb_count * lambda *
-                                    (log2[dering_diffs] + log2[clpf_diffs]));
-
-              if (tot_mse < best_tot_mse) {
-                for (i = 0; i < DERING_REFINEMENT_LEVELS; i++)
-                  best_lev[i] = lev[i];
-                for (i = 0; i < CLPF_REFINEMENT_LEVELS; i++)
-                  best_str[i] = str[i];
-                best_tot_mse = tot_mse;
-              }
-            }
-          }
-        }
-      }
+  nb_strength_bits = 0;
+  /* Try 0..3 signalling bits. NOTE(review): best_lev is left from i == 3. */
+  for (i = 0; i <= 3; i++) {
+    nb_strengths = 1 << i;
+    tot_mse = joint_strength_search(best_lev, nb_strengths, mse, sb_count);
+    /* Count superblock signalling cost. */
+    tot_mse += (uint64_t)(sb_count * lambda * i);
+    /* Count header signalling cost. */
+    tot_mse += (uint64_t)(nb_strengths * lambda * CDEF_STRENGTH_BITS);
+    if (tot_mse < best_tot_mse) {
+      best_tot_mse = tot_mse;
+      nb_strength_bits = i;
     }
   }
-  for (i = 0; i < DERING_REFINEMENT_LEVELS; i++) lev[i] = best_lev[i];
-  for (i = 0; i < CLPF_REFINEMENT_LEVELS; i++) str[i] = best_str[i];
+  nb_strengths = 1 << nb_strength_bits;
 
-  id_to_levels(lev, str, levels_to_id(lev, str));  // Pack tables
-  cdef_get_bits(lev, str, &cm->dering_bits, &cm->clpf_bits);
-
+  cm->cdef_bits = nb_strength_bits;
+  cm->nb_cdef_strengths = nb_strengths;
+  for (i = 0; i < nb_strengths; i++) cm->cdef_strengths[i] = best_lev[i];
   for (i = 0; i < sb_count; i++) {
-    int gi, cs;
-    int best_gi, best_clpf;
+    int gi;
+    int best_gi;
     uint64_t best_mse = (uint64_t)1 << 63;
-    best_gi = best_clpf = 0;
-    for (gi = 0; gi < (1 << cm->dering_bits); gi++) {
-      for (cs = 0; cs < (1 << cm->clpf_bits); cs++) {
-        if (mse[i][lev[gi]][str[cs]] < best_mse) {
-          best_gi = gi;
-          best_clpf = cs;
-          best_mse = mse[i][lev[gi]][str[cs]];
-        }
+    best_gi = 0;
+    for (gi = 0; gi < cm->nb_cdef_strengths; gi++) {
+      if (mse[i][best_lev[gi]] < best_mse) {
+        best_gi = gi;
+        best_mse = mse[i][best_lev[gi]];
       }
     }
-    cm->mi_grid_visible[sb_index[i]]->mbmi.dering_gain = best_gi;
-    cm->mi_grid_visible[sb_index[i]]->mbmi.clpf_strength = best_clpf;
+    cm->mi_grid_visible[sb_index[i]]->mbmi.cdef_strength = best_gi;
   }
 
   aom_free(src);
@@ -245,5 +259,4 @@
                       AOM_PLANE_U);
   av1_clpf_test_plane(cm->frame_to_show, ref, cm, &cm->clpf_strength_v,
                       AOM_PLANE_V);
-  cm->dering_level = levels_to_id(best_lev, best_str);
 }