Update MV CDF stats per superblock

Add a control (AV1E_SET_MV_COST_UPD_FREQ) and an aomenc parameter
(--mv-cost-upd-freq) that set how often MV costs are updated.
By default MV costs are updated at the superblock level, which
improves BD-rate by 0.1% to 0.15% (i.e. -0.1% to -0.15%).
The slowdown, measured by instruction count, is ~1%.

STATS_CHANGED

Change-Id: I549aace428724cb6f50d3d33c3e19c23b77283d3
diff --git a/aom/aomcx.h b/aom/aomcx.h
index 946bc82..1724ea9 100644
--- a/aom/aomcx.h
+++ b/aom/aomcx.h
@@ -1126,6 +1126,14 @@
    */
   AV1E_SET_MODE_COST_UPD_FREQ,
 
+  /*!\brief Control to set frequency of the cost updates for motion vectors
+   * Possible values are:
+   * 0: Update at SB level (default)
+   * 1: Update at SB row level in tile
+   * 2: Update at tile level
+   */
+  AV1E_SET_MV_COST_UPD_FREQ,
+
   /*!\brief Control to set bit mask that specifies which tier each of the 32
    * possible operating points conforms to.
    * Bit value 0: Main Tier; 1: High Tier.
@@ -1608,6 +1616,9 @@
 AOM_CTRL_USE_TYPE(AV1E_SET_MODE_COST_UPD_FREQ, unsigned int)
 #define AOM_CTRL_AV1E_SET_MODE_COST_UPD_FREQ
 
+AOM_CTRL_USE_TYPE(AV1E_SET_MV_COST_UPD_FREQ, unsigned int)
+#define AOM_CTRL_AV1E_SET_MV_COST_UPD_FREQ
+
 AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_SEQ_LEVEL_IDX, int)
 #define AOM_CTRL_AV1E_SET_TARGET_SEQ_LEVEL_IDX
 
diff --git a/apps/aomenc.c b/apps/aomenc.c
index ef3bd17..7b01af2 100644
--- a/apps/aomenc.c
+++ b/apps/aomenc.c
@@ -591,6 +591,10 @@
     ARG_DEF(NULL, "mode-cost-upd-freq", 1,
             "Update freq for mode costs"
             "0: SB, 1: SB Row per Tile, 2: Tile");
+static const arg_def_t mv_cost_upd_freq =
+    ARG_DEF(NULL, "mv-cost-upd-freq", 1,
+            "Update freq for mv costs. "
+            "0: SB, 1: SB Row per Tile, 2: Tile");
 #if CONFIG_DIST_8X8
 static const arg_def_t enable_dist_8x8 =
     ARG_DEF(NULL, "enable-dist-8x8", 1,
@@ -857,6 +861,7 @@
                                        &quant_b_adapt,
                                        &coeff_cost_upd_freq,
                                        &mode_cost_upd_freq,
+                                       &mv_cost_upd_freq,
 #if CONFIG_DIST_8X8
                                        &enable_dist_8x8,
 #endif
@@ -961,6 +966,7 @@
                                         AV1E_SET_QUANT_B_ADAPT,
                                         AV1E_SET_COEFF_COST_UPD_FREQ,
                                         AV1E_SET_MODE_COST_UPD_FREQ,
+                                        AV1E_SET_MV_COST_UPD_FREQ,
 #if CONFIG_DIST_8X8
                                         AV1E_SET_ENABLE_DIST_8X8,
 #endif
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index b79899a..2477a83 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -148,6 +148,7 @@
   unsigned int min_cr;
   COST_UPDATE_TYPE coeff_cost_upd_freq;
   COST_UPDATE_TYPE mode_cost_upd_freq;
+  COST_UPDATE_TYPE mv_cost_upd_freq;
 };
 
 static struct av1_extracfg default_extra_cfg = {
@@ -271,6 +272,7 @@
   0,            // min_cr
   COST_UPD_SB,  // coeff_cost_upd_freq
   COST_UPD_SB,  // mode_cost_upd_freq
+  COST_UPD_SB,  // mv_cost_upd_freq
 };
 
 struct aom_codec_alg_priv {
@@ -502,6 +504,7 @@
   RANGE_CHECK_HI(extra_cfg, disable_trellis_quant, 3);
   RANGE_CHECK(extra_cfg, coeff_cost_upd_freq, 0, 2);
   RANGE_CHECK(extra_cfg, mode_cost_upd_freq, 0, 2);
+  RANGE_CHECK(extra_cfg, mv_cost_upd_freq, 0, 2);
 
   RANGE_CHECK(extra_cfg, min_partition_size, 4, 128);
   RANGE_CHECK(extra_cfg, max_partition_size, 4, 128);
@@ -673,6 +676,7 @@
   oxcf->quant_b_adapt = extra_cfg->quant_b_adapt;
   oxcf->coeff_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->coeff_cost_upd_freq;
   oxcf->mode_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->mode_cost_upd_freq;
+  oxcf->mv_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->mv_cost_upd_freq;
 #if CONFIG_DIST_8X8
   oxcf->using_dist_8x8 = extra_cfg->enable_dist_8x8;
   if (extra_cfg->tuning == AOM_TUNE_CDEF_DIST ||
@@ -1540,6 +1544,13 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_mv_cost_upd_freq(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.mv_cost_upd_freq = CAST(AV1E_SET_MV_COST_UPD_FREQ, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_film_grain_test_vector(
     aom_codec_alg_priv_t *ctx, va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -2404,6 +2415,7 @@
   { AV1E_SET_QUANT_B_ADAPT, ctrl_set_quant_b_adapt },
   { AV1E_SET_COEFF_COST_UPD_FREQ, ctrl_set_coeff_cost_upd_freq },
   { AV1E_SET_MODE_COST_UPD_FREQ, ctrl_set_mode_cost_upd_freq },
+  { AV1E_SET_MV_COST_UPD_FREQ, ctrl_set_mv_cost_upd_freq },
   { AV1E_SET_DELTAQ_MODE, ctrl_set_deltaq_mode },
   { AV1E_SET_DELTALF_MODE, ctrl_set_deltalf_mode },
   { AV1E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost },
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index f8d479a..26e3dae 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -276,6 +276,8 @@
   int best_pred_mv_sad;
 
   int nmv_vec_cost[MV_JOINTS];
+  int nmv_costs[2][MV_VALS];
+  int nmv_costs_hp[2][MV_VALS];
   int *nmvcost[2];
   int *nmvcost_hp[2];
   int **mv_cost_stack;
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index d3d6a00..ca9cb37 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -1402,6 +1402,16 @@
         }
       }
     }
+    if (new_mv) {
+      const int allow_hp = cm->cur_frame_force_integer_mv
+                               ? MV_SUBPEL_NONE
+                               : cm->allow_high_precision_mv;
+      for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+        const int_mv ref_mv = av1_get_ref_mv(x, ref);
+        av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
+                            allow_hp);
+      }
+    }
   }
 }
 
@@ -4057,6 +4067,20 @@
       default: assert(0);
     }
 
+    switch (cpi->oxcf.mv_cost_upd_freq) {
+      case COST_UPD_TILE:  // Tile level
+        if (mi_row != tile_info->mi_row_start) break;
+        AOM_FALLTHROUGH_INTENDED;
+      case COST_UPD_SBROW:  // SB row level in tile
+        if (mi_col != tile_info->mi_col_start) break;
+        AOM_FALLTHROUGH_INTENDED;
+      case COST_UPD_SB:  // SB level
+        av1_fill_mv_costs(xd->tile_ctx, cm->cur_frame_force_integer_mv,
+                          cm->allow_high_precision_mv, x);
+        break;
+      default: assert(0);
+    }
+
     x->mb_rd_record.num = x->mb_rd_record.index_start = 0;
     x->color_sensitivity[0] = 0;
     x->color_sensitivity[1] = 0;
diff --git a/av1/encoder/encodemv.c b/av1/encoder/encodemv.c
index 7a7668a..43b7237 100644
--- a/av1/encoder/encodemv.c
+++ b/av1/encoder/encodemv.c
@@ -38,6 +38,60 @@
   return c;
 }
 
+static void update_mv_component_stats(int comp, nmv_component *mvcomp,
+                                      MvSubpelPrecision precision) {
+  assert(comp != 0);
+  int offset;
+  const int sign = comp < 0;
+  const int mag = sign ? -comp : comp;
+  const int mv_class = get_mv_class(mag - 1, &offset);
+  const int d = offset >> 3;         // int mv data
+  const int fr = (offset >> 1) & 3;  // fractional mv data
+  const int hp = offset & 1;         // high precision mv data
+
+  // Sign
+  update_cdf(mvcomp->sign_cdf, sign, 2);
+
+  // Class
+  update_cdf(mvcomp->classes_cdf, mv_class, MV_CLASSES);
+
+  // Integer bits
+  if (mv_class == MV_CLASS_0) {
+    update_cdf(mvcomp->class0_cdf, d, CLASS0_SIZE);
+  } else {
+    const int n = mv_class + CLASS0_BITS - 1;  // number of bits
+    for (int i = 0; i < n; ++i)
+      update_cdf(mvcomp->bits_cdf[i], (d >> i) & 1, 2);
+  }
+  // Fractional bits
+  if (precision > MV_SUBPEL_NONE) {
+    aom_cdf_prob *fp_cdf =
+        mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf;
+    update_cdf(fp_cdf, fr, MV_FP_SIZE);
+  }
+
+  // High precision bit
+  if (precision > MV_SUBPEL_LOW_PRECISION) {
+    aom_cdf_prob *hp_cdf =
+        mv_class == MV_CLASS_0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf;
+    update_cdf(hp_cdf, hp, 2);
+  }
+}
+
+void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx,
+                         MvSubpelPrecision precision) {
+  const MV diff = { mv->row - ref->row, mv->col - ref->col };
+  const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
+
+  update_cdf(mvctx->joints_cdf, j, MV_JOINTS);
+
+  if (mv_joint_vertical(j))
+    update_mv_component_stats(diff.row, &mvctx->comps[0], precision);
+
+  if (mv_joint_horizontal(j))
+    update_mv_component_stats(diff.col, &mvctx->comps[1], precision);
+}
+
 static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp,
                                 MvSubpelPrecision precision) {
   assert(comp != 0);
diff --git a/av1/encoder/encodemv.h b/av1/encoder/encodemv.h
index 37ff547..ebd4d2c 100644
--- a/av1/encoder/encodemv.h
+++ b/av1/encoder/encodemv.h
@@ -21,6 +21,9 @@
 void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
                    nmv_context *mvctx, int usehp);
 
+void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx,
+                         MvSubpelPrecision precision);
+
 void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
                               const nmv_context *mvctx,
                               MvSubpelPrecision precision);
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index e54295e..938adf2 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -435,13 +435,17 @@
 
 static void set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv,
                                   int cur_frame_force_integer_mv) {
-  MACROBLOCK *const mb = &cpi->td.mb;
+  MACROBLOCK *const x = &cpi->td.mb;
   cpi->common.allow_high_precision_mv =
       allow_high_precision_mv && cur_frame_force_integer_mv == 0;
   const int copy_hp =
       cpi->common.allow_high_precision_mv && cur_frame_force_integer_mv == 0;
-  int *(*src)[2] = copy_hp ? &mb->nmvcost_hp : &mb->nmvcost;
-  mb->mv_cost_stack = *src;
+  x->nmvcost[0] = &x->nmv_costs[0][MV_MAX];
+  x->nmvcost[1] = &x->nmv_costs[1][MV_MAX];
+  x->nmvcost_hp[0] = &x->nmv_costs_hp[0][MV_MAX];
+  x->nmvcost_hp[1] = &x->nmv_costs_hp[1][MV_MAX];
+  int *(*src)[2] = copy_hp ? &x->nmvcost_hp : &x->nmvcost;
+  x->mv_cost_stack = *src;
 }
 
 static BLOCK_SIZE select_sb_size(const AV1_COMP *const cpi) {
@@ -774,8 +778,8 @@
   // intended for use in a re-code loop in av1_compress_frame where the
   // quantizer value is adjusted between loop iterations.
   av1_copy(cc->nmv_vec_cost, cpi->td.mb.nmv_vec_cost);
-  av1_copy(cc->nmv_costs, cpi->nmv_costs);
-  av1_copy(cc->nmv_costs_hp, cpi->nmv_costs_hp);
+  av1_copy(cc->nmv_costs, cpi->td.mb.nmv_costs);
+  av1_copy(cc->nmv_costs_hp, cpi->td.mb.nmv_costs_hp);
 
   cc->fc = *cm->fc;
 }
@@ -787,8 +791,8 @@
   // Restore key state variables to the snapshot state stored in the
   // previous call to av1_save_coding_context.
   av1_copy(cpi->td.mb.nmv_vec_cost, cc->nmv_vec_cost);
-  av1_copy(cpi->nmv_costs, cc->nmv_costs);
-  av1_copy(cpi->nmv_costs_hp, cc->nmv_costs_hp);
+  av1_copy(cpi->td.mb.nmv_costs, cc->nmv_costs);
+  av1_copy(cpi->td.mb.nmv_costs_hp, cc->nmv_costs_hp);
 
   *cm->fc = cc->fc;
 }
@@ -2860,9 +2864,6 @@
   cpi->last_show_frame_buf = NULL;
   realloc_segmentation_maps(cpi);
 
-  memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs));
-  memset(cpi->nmv_costs_hp, 0, sizeof(cpi->nmv_costs_hp));
-
   for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]));
        i++) {
     CHECK_MEM_ERROR(
@@ -2915,11 +2916,6 @@
 
   cpi->first_time_stamp_ever = INT64_MAX;
 
-  cpi->td.mb.nmvcost[0] = &cpi->nmv_costs[0][MV_MAX];
-  cpi->td.mb.nmvcost[1] = &cpi->nmv_costs[1][MV_MAX];
-  cpi->td.mb.nmvcost_hp[0] = &cpi->nmv_costs_hp[0][MV_MAX];
-  cpi->td.mb.nmvcost_hp[1] = &cpi->nmv_costs_hp[1][MV_MAX];
-
 #ifdef OUTPUT_YUV_SKINMAP
   yuv_skinmap_file = fopen("skinmap.yuv", "ab");
 #endif
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 857758d..d4775ed 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -415,6 +415,7 @@
   int quant_b_adapt;
   COST_UPDATE_TYPE coeff_cost_upd_freq;
   COST_UPDATE_TYPE mode_cost_upd_freq;
+  COST_UPDATE_TYPE mv_cost_upd_freq;
   int border_in_pixels;
   AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
   // Bit mask to specify which tier each of the 32 possible operating points
@@ -835,9 +836,6 @@
   int gmtype_cost[TRANS_TYPES];
   int gmparams_cost[REF_FRAMES];
 
-  int nmv_costs[2][MV_VALS];
-  int nmv_costs_hp[2][MV_VALS];
-
   int64_t last_time_stamp_seen;
   int64_t last_end_time_stamp_seen;
   int64_t first_time_stamp_ever;
diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index d70f46c..f859d09 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c
@@ -580,15 +580,21 @@
   }
 }
 
-void av1_initialize_cost_tables(const AV1_COMMON *const cm, MACROBLOCK *x) {
-  if (cm->cur_frame_force_integer_mv) {
-    av1_build_nmv_cost_table(x->nmv_vec_cost, x->nmvcost, &cm->fc->nmvc,
+void av1_fill_mv_costs(const FRAME_CONTEXT *fc, int integer_mv, int usehp,
+                       MACROBLOCK *x) {
+  x->nmvcost[0] = &x->nmv_costs[0][MV_MAX];
+  x->nmvcost[1] = &x->nmv_costs[1][MV_MAX];
+  x->nmvcost_hp[0] = &x->nmv_costs_hp[0][MV_MAX];
+  x->nmvcost_hp[1] = &x->nmv_costs_hp[1][MV_MAX];
+  if (integer_mv) {
+    av1_build_nmv_cost_table(x->nmv_vec_cost, x->nmvcost, &fc->nmvc,
                              MV_SUBPEL_NONE);
+    x->mv_cost_stack = x->nmvcost;  // array decays to int **; no cast needed
   } else {
+    int *(*src)[2] = usehp ? &x->nmvcost_hp : &x->nmvcost;
+    x->mv_cost_stack = *src;
     av1_build_nmv_cost_table(
-        x->nmv_vec_cost,
-        cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost, &cm->fc->nmvc,
-        cm->allow_high_precision_mv);
+        x->nmv_vec_cost, usehp ? x->nmvcost_hp : x->nmvcost, &fc->nmvc, usehp);
   }
 }
 
@@ -607,7 +613,8 @@
 
   if (!cpi->sf.use_nonrd_pick_mode || frame_is_intra_only(cm) ||
       (cm->current_frame.frame_number & 0x07) == 1)
-    av1_initialize_cost_tables(cm, x);
+    av1_fill_mv_costs(cm->fc, cm->cur_frame_force_integer_mv,
+                      cm->allow_high_precision_mv, x);
 
   if (frame_is_intra_only(cm) && cm->allow_screen_content_tools &&
       cpi->oxcf.pass != 1) {
diff --git a/av1/encoder/rd.h b/av1/encoder/rd.h
index 93d1d8b..276c473 100644
--- a/av1/encoder/rd.h
+++ b/av1/encoder/rd.h
@@ -202,8 +202,6 @@
 
 void av1_initialize_rd_consts(struct AV1_COMP *cpi);
 
-void av1_initialize_cost_tables(const AV1_COMMON *const cm, MACROBLOCK *x);
-
 void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x,
                               int qindex);
 
@@ -295,6 +293,9 @@
 void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
                           const int num_planes);
 
+void av1_fill_mv_costs(const FRAME_CONTEXT *const fc, int integer_mv, int usehp,
+                       MACROBLOCK *x);
+
 int av1_get_adaptive_rdmult(const struct AV1_COMP *cpi, double beta);
 
 int av1_get_deltaq_offset(const struct AV1_COMP *cpi, int qindex, double beta);
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index f6a1b65..7b70699 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -1559,7 +1559,8 @@
   rdmult = av1_compute_rd_mult_based_on_qindex(cpi, ARNR_FILT_QINDEX);
   set_error_per_bit(&cpi->td.mb, rdmult);
   av1_initialize_me_consts(cpi, &cpi->td.mb, ARNR_FILT_QINDEX);
-  av1_initialize_cost_tables(&cpi->common, &cpi->td.mb);
+  av1_fill_mv_costs(cpi->common.fc, cpi->common.cur_frame_force_integer_mv,
+                    cpi->common.allow_high_precision_mv, &cpi->td.mb);
 
   temporal_filter_iterate_c(cpi, frames, frames_to_blur,
                             frames_to_blur_backward, strength, sigma,