Prune obmc evaluation

Prune OBMC evaluation based on stats gathered from previously encoded
frames. This feature is now turned on at speed 4.

Ran Borg test at speed 4.
      avg_psnr: ovr_psnr: ssim:   avg_speedup(whole set)
hdres:  0.051    0.050    0.027     2.6%
midres: 0.137    0.114    0.154     3.4%

STATS_CHANGED

Change-Id: I6677986d8e153933d34cb10c0654dcf763f522a6
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 77d5a3d..9bcc3c7 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -1643,6 +1643,24 @@
     if (tile_data->allow_update_cdf) {
       update_stats(&cpi->common, td, mi_row, mi_col);
     }
+
+    // Gather OBMC usage counts, used later to update cpi->obmc_probs.
+    if (cpi->sf.prune_obmc_using_stats) {
+      const int inter_block = is_inter_block(mbmi);
+      const int seg_ref_active =
+          segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+      if (!seg_ref_active && inter_block) {
+        const MOTION_MODE motion_allowed =
+            cm->switchable_motion_mode
+                ? motion_mode_allowed(xd->global_motion, xd, mbmi,
+                                      cm->allow_warped_motion)
+                : SIMPLE_TRANSLATION;
+        if (mbmi->ref_frame[1] != INTRA_FRAME &&
+            motion_allowed == OBMC_CAUSAL) {
+          td->rd_counts.obmc_used[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
+        }
+      }
+    }
   }
   // TODO(Ravi/Remya): Move this copy function to a better logical place
   copy_winner_ref_mode_from_mbmi_ext(x);
@@ -4999,6 +5017,7 @@
   av1_zero(*td->counts);
   av1_zero(rdc->comp_pred_diff);
   av1_zero(rdc->tx_type_used);
+  av1_zero(rdc->obmc_used);
 
   // Reset the flag.
   cpi->intrabc_used = 0;
@@ -5297,6 +5316,19 @@
       }
     }
   }
+
+  if (cpi->sf.prune_obmc_using_stats) {
+    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+
+    for (i = 0; i < BLOCK_SIZES_ALL; i++) {
+      int sum = 0;
+      for (int j = 0; j < 2; j++) sum += cpi->td.rd_counts.obmc_used[i][j];
+
+      int new_prob = sum ? 128 * cpi->td.rd_counts.obmc_used[i][1] / sum : 0;
+      cpi->obmc_probs[update_type][i] =
+          (cpi->obmc_probs[update_type][i] + new_prob) >> 1;
+    }
+  }
 }
 
 #define CHECK_PRECOMPUTED_REF_FRAME_MAP 0
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 191b0b7..a94467d 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -232,6 +232,20 @@
     { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }
 };
 
+const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL] = {
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0,  0,  0,  106, 90, 90, 97, 67, 59, 70, 28,
+    30, 38, 16, 16,  16, 0,  0,  44, 50, 26, 25 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0,  0,  0,  98, 93, 97, 68, 82, 85, 33, 30,
+    33, 16, 16, 16, 16, 0,  0,  43, 37, 26, 16 },
+  { 0,  0,  0,  91, 80, 76, 78, 55, 49, 24, 16,
+    16, 16, 16, 16, 16, 0,  0,  29, 45, 16, 38 },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { 0,  0,  0,  103, 89, 89, 89, 62, 63, 76, 34,
+    35, 32, 19, 16,  16, 0,  0,  49, 55, 29, 19 }
+};
+
 static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) {
   switch (mode) {
     case NORMAL:
@@ -5032,6 +5046,12 @@
     }
   }
 
+  if (cpi->sf.prune_obmc_using_stats &&
+      cm->current_frame.frame_type == KEY_FRAME) {
+    av1_copy(cpi->obmc_probs, default_obmc_probs);
+    cpi->obmc_probs_thresh = 16;
+  }
+
   // Loop variables
   int loop_count = 0;
   int loop_at_this_size = 0;
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 44b765c..3781139 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -609,6 +609,7 @@
   int compound_ref_used_flag;
   int skip_mode_used_flag;
   int tx_type_used[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES];
+  int obmc_used[BLOCK_SIZES_ALL][2];
 } RD_COUNTS;
 
 typedef struct ThreadData {
@@ -984,6 +985,8 @@
   int64_t vbp_threshold_copy;
   BLOCK_SIZE vbp_bsize_min;
 
+  int obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL];
+  int obmc_probs_thresh;
   int tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES];
   int tx_type_probs_thresh[FRAME_UPDATE_TYPES];
 
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 6424d63..5741f89 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -35,6 +35,12 @@
             td_t->rd_counts.tx_type_used[i][j][k];
     }
   }
+
+  for (int i = 0; i < BLOCK_SIZES_ALL; i++) {
+    for (int j = 0; j < 2; j++) {
+      td->rd_counts.obmc_used[i][j] += td_t->rd_counts.obmc_used[i][j];
+    }
+  }
 }
 
 static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) {
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 16a9a32..8c456a4 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -9747,7 +9747,12 @@
       assert(mbmi->ref_frame[1] != INTRA_FRAME);
     }
 
-    if ((cpi->oxcf.enable_obmc == 0 || cpi->sf.use_fast_nonrd_pick_mode) &&
+    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+    const int prune_obmc =
+        cpi->sf.prune_obmc_using_stats &&
+        (cpi->obmc_probs[update_type][bsize] < cpi->obmc_probs_thresh);
+    if ((cpi->oxcf.enable_obmc == 0 || cpi->sf.use_fast_nonrd_pick_mode ||
+         prune_obmc) &&
         mbmi->motion_mode == OBMC_CAUSAL)
       continue;
 
@@ -11903,8 +11908,11 @@
   }
 
   av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
-
-  if (cpi->oxcf.enable_obmc) {
+  const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+  const int prune_obmc =
+      cpi->sf.prune_obmc_using_stats &&
+      (cpi->obmc_probs[update_type][bsize] < cpi->obmc_probs_thresh);
+  if (cpi->oxcf.enable_obmc && !prune_obmc) {
     if (check_num_overlappable_neighbors(mbmi) &&
         is_motion_variation_allowed_bsize(bsize)) {
       int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 7addf4d..22f355f 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -443,6 +443,7 @@
     // sf->tx_domain_dist_level = 2;
     sf->tx_domain_dist_thres_level = 2;
     sf->simple_motion_search_prune_agg = 2;
+    sf->prune_obmc_using_stats = 1;
   }
 }
 
@@ -903,6 +904,7 @@
   sf->prune_comp_type_by_model_rd = 0;
   sf->disable_smooth_intra = 0;
   sf->perform_best_rd_based_gating_for_chroma = 0;
+  sf->prune_obmc_using_stats = 0;
 
   if (oxcf->mode == GOOD)
     set_good_speed_features_framesize_independent(cpi, sf, speed);
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 2478d97..7cb63c6 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -790,6 +790,9 @@
   // frame.
   int adaptive_overlay_encoding;
 
+  // Prune obmc search using previous frame stats.
+  int prune_obmc_using_stats;
+
   // Use ALTREF frame in non-RD mode decision.
   int use_nonrd_altref_frame;