[optimize-b] Use a greedy search method

The greedy search method improves the BD-rate over the baseline by
more than 0.2% for the lowres test set. A larger gain of 0.55% is observed for the hdres test set.

[2017.04.06] Cleanup to remove redundant computation. On a local linux
machine, the greedy method is now faster than the trellis method in
encoding the first 100 frames of foreman_cif. However, the BD-rate gain
seems to have become smaller due to recent changes to the codebase.
[2017.04.06-2] Style changes to meet the requirements.
Removed "greedy-optimize-b" from configure.
[2017.04.10] Move the changes under the macro USE_GREEDY_OPTIMIZE_B
[2017.04.11] Adjust rdmult to accommodate CpuSpeedTest
[2017.04.12] Set USE_GREEDY_OPTIMIZE_B to 0 at the request of debargha@.
[2017.04.13] Move greedy implementation of optimize_b into a separate
function with the same name (selected by USE_GREEDY_OPTIMIZE_B, default
is 0)

Change-Id: Ic15534f8d696d5f02e8f2e3e9f08b52e41e9efd2
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 4e0002f..4199380 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -106,16 +106,6 @@
                  pd->dst.buf, pd->dst.stride);
 }
 
-typedef struct av1_token_state {
-  int64_t error;
-  int rate;
-  int16_t next;
-  int16_t token;
-  tran_low_t qc;
-  tran_low_t dqc;
-  uint8_t best_index;
-} av1_token_state;
-
 // These numbers are empirically obtained.
 static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
 #if CONFIG_EC_ADAPT
@@ -142,6 +132,375 @@
 #endif
 }
 
+#define USE_GREEDY_OPTIMIZE_B 0
+
+#if USE_GREEDY_OPTIMIZE_B
+
+typedef struct av1_token_state {
+  int16_t token;
+  tran_low_t qc;
+  tran_low_t dqc;
+} av1_token_state;
+
+int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block,
+                   TX_SIZE tx_size, int ctx) {
+#if !CONFIG_PVQ
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  struct macroblock_plane *const p = &mb->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int ref = is_inter_block(&xd->mi[0]->mbmi);
+  av1_token_state tokens[MAX_TX_SQUARE + 1][2];
+  uint8_t token_cache[MAX_TX_SQUARE];
+  const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  const int eob = p->eobs[block];
+  const PLANE_TYPE plane_type = pd->plane_type;
+  const int16_t *const dequant_ptr = pd->dequant;
+  const uint8_t *const band_translate = get_band_translate(tx_size);
+  TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+  const SCAN_ORDER *const scan_order =
+      get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
+  const int16_t *const scan = scan_order->scan;
+  const int16_t *const nb = scan_order->neighbors;
+  int dqv;
+  const int shift = get_tx_scale(tx_size);
+#if CONFIG_AOM_QM
+  int seg_id = xd->mi[0]->mbmi.segment_id;
+  const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][!ref][tx_size];
+#endif
+#if CONFIG_NEW_QUANT
+  int dq = get_dq_profile_from_ctx(mb->qindex, ctx, ref, plane_type);
+  const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq];
+#elif !CONFIG_AOM_QM
+  const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift };
+#endif  // CONFIG_NEW_QUANT
+  int sz = 0;
+  const int64_t rddiv = mb->rddiv;
+  int64_t rd_cost0, rd_cost1;
+  int16_t t0, t1;
+  int i, final_eob;
+#if CONFIG_HIGHBITDEPTH
+  const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd);
+#else
+  const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, 8);
+#endif
+  unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+      mb->token_costs[txsize_sqr_map[tx_size]][plane_type][ref];
+  const int default_eob = tx_size_2d[tx_size];
+
+  assert((mb->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
+
+  assert((!plane_type && !plane) || (plane_type && plane));
+  assert(eob <= default_eob);
+
+  int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1;
+/* CpuSpeedTest uses "--min-q=0 --max-q=0" and expects 100dB psnr.
+ * This creates a conflict with the search for a better EOB position.
+ * The line below makes sure the EOB search is disabled in this corner case.
+ */
+#if !CONFIG_NEW_QUANT && !CONFIG_AOM_QM
+  if (dq_step[1] <= 4) {
+    rdmult = 1;
+  }
+#endif
+
+  int64_t rate0, rate1;
+  for (i = 0; i < eob; i++) {
+    const int rc = scan[i];
+    int x = qcoeff[rc];
+    t0 = av1_get_token(x);
+
+    tokens[i][0].qc = x;
+    tokens[i][0].token = t0;
+    tokens[i][0].dqc = dqcoeff[rc];
+
+    token_cache[rc] = av1_pt_energy_class[t0];
+  }
+  tokens[eob][0].token = EOB_TOKEN;
+  tokens[eob][0].qc = 0;
+  tokens[eob][0].dqc = 0;
+  tokens[eob][1] = tokens[eob][0];
+
+  unsigned int(*token_costs_ptr)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+      token_costs;
+
+  final_eob = 0;
+
+  int64_t eob_cost0, eob_cost1;
+
+  const int ctx0 = ctx;
+  /* Record the r-d cost */
+  int64_t accu_rate = 0;
+  int64_t accu_error = 0;
+
+  rate0 = get_token_bit_costs(*(token_costs_ptr + band_translate[0]), 0, ctx0,
+                              EOB_TOKEN);
+  int64_t best_block_rd_cost = RDCOST(rdmult, rddiv, rate0, accu_error);
+
+  // int64_t best_block_rd_cost_all0 = best_block_rd_cost;
+
+  int x_prev = 1;
+
+  for (i = 0; i < eob; i++) {
+    const int rc = scan[i];
+    int x = qcoeff[rc];
+    sz = -(x < 0);
+
+    int band_cur = band_translate[i];
+    int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i);
+    int token_tree_sel_cur = (x_prev == 0);
+
+    if (x == 0) {
+      // no need to search when x == 0
+      rate0 =
+          get_token_bit_costs(*(token_costs_ptr + band_cur), token_tree_sel_cur,
+                              ctx_cur, tokens[i][0].token);
+      accu_rate += rate0;
+      x_prev = 0;
+      // accu_error does not change when x==0
+    } else {
+      /*  Computing distortion
+       */
+      // compute the distortion for the first candidate
+      // and the distortion for quantizing to 0.
+      int dx0 = (-coeff[rc]) * (1 << shift);
+#if CONFIG_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        dx0 >>= xd->bd - 8;
+      }
+#endif
+      int64_t d0 = (int64_t)dx0 * dx0;
+
+      int x_a = x - 2 * sz - 1;
+      int64_t d2, d2_a;
+
+      int dx;
+
+#if CONFIG_AOM_QM
+      int iwt = iqmatrix[rc];
+      dqv = dequant_ptr[rc != 0];
+      dqv = ((iwt * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+#else
+      dqv = dequant_ptr[rc != 0];
+#endif
+
+      dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
+#if CONFIG_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        dx >>= xd->bd - 8;
+      }
+#endif  // CONFIG_HIGHBITDEPTH
+      d2 = (int64_t)dx * dx;
+
+      /* compute the distortion for the second candidate
+       * x_a = x - 2 * sz + 1;
+       */
+      if (x_a != 0) {
+#if CONFIG_NEW_QUANT
+        dx = av1_dequant_coeff_nuq(x, dqv, dequant_val[band_translate[i]]) -
+             (coeff[rc] << shift);
+#if CONFIG_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          dx >>= xd->bd - 8;
+        }
+#endif  // CONFIG_HIGHBITDEPTH
+#else   // CONFIG_NEW_QUANT
+#if CONFIG_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          dx -= ((dqv >> (xd->bd - 8)) + sz) ^ sz;
+        } else {
+          dx -= (dqv + sz) ^ sz;
+        }
+#else
+        dx -= (dqv + sz) ^ sz;
+#endif  // CONFIG_HIGHBITDEPTH
+#endif  // CONFIG_NEW_QUANT
+        d2_a = (int64_t)dx * dx;
+      } else {
+        d2_a = d0;
+      }
+      /*  Computing rates and r-d cost
+       */
+
+      int best_x, best_eob_x;
+      int64_t base_bits, next_bits0, next_bits1;
+      int64_t next_eob_bits0, next_eob_bits1;
+
+      // rate cost of x
+      base_bits = av1_get_token_cost(x, &t0, cat6_bits);
+      rate0 = base_bits + get_token_bit_costs(*(token_costs_ptr + band_cur),
+                                              token_tree_sel_cur, ctx_cur, t0);
+
+      base_bits = av1_get_token_cost(x_a, &t1, cat6_bits);
+      rate1 = base_bits + get_token_bit_costs(*(token_costs_ptr + band_cur),
+                                              token_tree_sel_cur, ctx_cur, t1);
+
+      next_bits0 = 0;
+      next_bits1 = 0;
+      next_eob_bits0 = 0;
+      next_eob_bits1 = 0;
+
+      if (i < default_eob - 1) {
+        int ctx_next, token_tree_sel_next;
+        int band_next = band_translate[i + 1];
+
+        token_cache[rc] = av1_pt_energy_class[t0];
+        ctx_next = get_coef_context(nb, token_cache, i + 1);
+        token_tree_sel_next = (x == 0);
+
+        next_bits0 = get_token_bit_costs(*(token_costs_ptr + band_next),
+                                         token_tree_sel_next, ctx_next,
+                                         tokens[i + 1][0].token);
+        next_eob_bits0 =
+            get_token_bit_costs(*(token_costs_ptr + band_next),
+                                token_tree_sel_next, ctx_next, EOB_TOKEN);
+
+        token_cache[rc] = av1_pt_energy_class[t1];
+        ctx_next = get_coef_context(nb, token_cache, i + 1);
+        token_tree_sel_next = (x_a == 0);
+
+        next_bits1 = get_token_bit_costs(*(token_costs_ptr + band_next),
+                                         token_tree_sel_next, ctx_next,
+                                         tokens[i + 1][0].token);
+
+        if (x_a != 0) {
+          next_eob_bits1 =
+              get_token_bit_costs(*(token_costs_ptr + band_next),
+                                  token_tree_sel_next, ctx_next, EOB_TOKEN);
+        }
+      }
+
+      rd_cost0 = RDCOST(rdmult, rddiv, (rate0 + next_bits0), d2);
+      rd_cost1 = RDCOST(rdmult, rddiv, (rate1 + next_bits1), d2_a);
+
+      best_x = (rd_cost1 < rd_cost0);
+
+      eob_cost0 = RDCOST(rdmult, rddiv, (accu_rate + rate0 + next_eob_bits0),
+                         (accu_error + d2 - d0));
+      eob_cost1 = eob_cost0;
+      if (x_a != 0) {
+        eob_cost1 = RDCOST(rdmult, rddiv, (accu_rate + rate1 + next_eob_bits1),
+                           (accu_error + d2_a - d0));
+        best_eob_x = (eob_cost1 < eob_cost0);
+      } else {
+        best_eob_x = 0;
+      }
+
+      int dqc, dqc_a = 0;
+
+      dqc = dqcoeff[rc];
+      if (best_x + best_eob_x) {
+        if (x_a != 0) {
+#if CONFIG_NEW_QUANT
+          dqc_a = av1_dequant_abscoeff_nuq(abs(x_a), dqv,
+                                           dequant_val[band_translate[i]]);
+          dqc_a = shift ? ROUND_POWER_OF_TWO(dqc_a, shift) : dqc_a;
+          if (sz) dqc_a = -dqc_a;
+#else
+// The 32x32 transform coefficient uses half quantization step size.
+// Account for the rounding difference in the dequantized coefficient
+// value when the quantization index is dropped from an even number
+// to an odd number.
+
+#if CONFIG_AOM_QM
+          tran_low_t offset = dqv >> shift;
+#else
+          tran_low_t offset = dq_step[rc != 0];
+#endif
+          if (shift & x_a) offset += (dqv & 0x01);
+
+          if (sz == 0)
+            dqc_a = dqcoeff[rc] - offset;
+          else
+            dqc_a = dqcoeff[rc] + offset;
+#endif  // CONFIG_NEW_QUANT
+        } else {
+          dqc_a = 0;
+        }  // if (x_a != 0)
+      }
+
+      // record the better quantized value
+      if (best_x) {
+        qcoeff[rc] = x_a;
+        dqcoeff[rc] = dqc_a;
+
+        accu_rate += rate1;
+        accu_error += d2_a - d0;
+        assert(d2_a <= d0);
+
+        token_cache[rc] = av1_pt_energy_class[t1];
+      } else {
+        accu_rate += rate0;
+        accu_error += d2 - d0;
+        assert(d2 <= d0);
+
+        token_cache[rc] = av1_pt_energy_class[t0];
+      }
+
+      x_prev = qcoeff[rc];
+
+      // determine whether to move the eob position to i+1
+      int64_t best_eob_cost_i = eob_cost0;
+
+      tokens[i][1].token = t0;
+      tokens[i][1].qc = x;
+      tokens[i][1].dqc = dqc;
+
+      if ((x_a != 0) && (best_eob_x)) {
+        best_eob_cost_i = eob_cost1;
+
+        tokens[i][1].token = t1;
+        tokens[i][1].qc = x_a;
+        tokens[i][1].dqc = dqc_a;
+      }
+
+      if (best_eob_cost_i < best_block_rd_cost) {
+        best_block_rd_cost = best_eob_cost_i;
+        final_eob = i + 1;
+      }
+    }  // if (x==0)
+  }    // for (i)
+
+  assert(final_eob <= eob);
+  if (final_eob > 0) {
+    assert(tokens[final_eob - 1][1].qc != 0);
+    i = final_eob - 1;
+    int rc = scan[i];
+    qcoeff[rc] = tokens[i][1].qc;
+    dqcoeff[rc] = tokens[i][1].dqc;
+  }
+
+  for (i = final_eob; i < eob; i++) {
+    int rc = scan[i];
+    qcoeff[rc] = 0;
+    dqcoeff[rc] = 0;
+  }
+
+  mb->plane[plane].eobs[block] = final_eob;
+  return final_eob;
+
+#else   // !CONFIG_PVQ
+  (void)cm;
+  (void)tx_size;
+  (void)ctx;
+  struct macroblock_plane *const p = &mb->plane[plane];
+  return p->eobs[block];
+#endif  // !CONFIG_PVQ
+}
+
+#else  // USE_GREEDY_OPTIMIZE_B
+
+typedef struct av1_token_state {
+  int64_t error;
+  int rate;
+  int16_t next;
+  int16_t token;
+  tran_low_t qc;
+  tran_low_t dqc;
+  uint8_t best_index;
+} av1_token_state;
+
 int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block,
                    TX_SIZE tx_size, int ctx) {
 #if !CONFIG_PVQ
@@ -494,6 +853,8 @@
 #endif  // !CONFIG_PVQ
 }
 
+#endif  // USE_GREEDY_OPTIMIZE_B
+
 #if !CONFIG_PVQ
 #if CONFIG_HIGHBITDEPTH
 typedef enum QUANT_FUNC {