Add skip_repeated_full_newmv speed feature

This CL introduces a new speed feature that prunes the current
ref_mv in NEW_MV mode after a full_pixel search if we have already
encountered another ref_mv in the drl such that:
 1. The other ref_mv yielded the same fullpel_mv during the
    SIMPLE_TRANSLATION search process as the current ref_mv.
 2. The rate needed to encode the current fullpel_mv (including the drl
    cost) is no smaller than that for the other ref_mv.

This is enabled at speed 3 and above.

Performance on midres:
  SPD_SET | AVG_PSNR | OVR_PSNR |   SSIM  |  SPD  |
     3    |  +0.150% |  +0.157% | +0.084% | +3.8% |
     4    |  +0.129% |  +0.133% | +0.114% | +3.6% |
     5    |  +0.120% |  +0.128% | +0.135% | +3.1% |

STATS_CHANGED

Change-Id: Ic29d553437cba15fdac66ef9ed304df19ad59df2
diff --git a/av1/encoder/motion_search_facade.c b/av1/encoder/motion_search_facade.c
index 3a0fd2e..870a308 100644
--- a/av1/encoder/motion_search_facade.c
+++ b/av1/encoder/motion_search_facade.c
@@ -20,7 +20,7 @@
 
 void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
                               BLOCK_SIZE bsize, int ref_idx, int *rate_mv,
-                              int search_range) {
+                              int search_range, inter_mode_info *mode_info) {
   MACROBLOCKD *xd = &x->e_mbd;
   const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
@@ -150,9 +150,50 @@
 
   x->mv_limits = tmp_mv_limits;
 
+  // Terminate search with the current ref_idx if we have already encountered
+  // another ref_mv in the drl such that:
+  //  1. The other drl has the same fullpel_mv during the SIMPLE_TRANSLATION
+  //     search process as the current fullpel_mv.
+  //  2. The rate needed to encode the current fullpel_mv is larger than that
+  //     for the other ref_mv.
+  if (cpi->sf.inter_sf.skip_repeated_full_newmv &&
+      mbmi->motion_mode == SIMPLE_TRANSLATION &&
+      x->best_mv.as_int != INVALID_MV) {
+    int_mv this_mv;
+    this_mv.as_mv = get_mv_from_fullmv(&x->best_mv.as_fullmv);
+    const int ref_mv_idx = mbmi->ref_mv_idx;
+    const int this_mv_rate =
+        av1_mv_bit_cost(&this_mv.as_mv, &ref_mv, x->nmv_vec_cost,
+                        x->mv_cost_stack, MV_COST_WEIGHT);
+    mode_info[ref_mv_idx].full_search_mv.as_int = this_mv.as_int;
+    mode_info[ref_mv_idx].full_mv_rate = this_mv_rate;
+
+    for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) {
+      // Check if the motion search result is the same as a previous result
+      if (this_mv.as_int == mode_info[prev_ref_idx].full_search_mv.as_int) {
+        // Compare the rate cost
+        const int prev_rate_cost = mode_info[prev_ref_idx].full_mv_rate +
+                                   mode_info[prev_ref_idx].drl_cost;
+        const int this_rate_cost =
+            this_mv_rate + mode_info[ref_mv_idx].drl_cost;
+
+        if (prev_rate_cost <= this_rate_cost) {
+          // If the current rate_cost is worse than the previous rate_cost, then
+          // we terminate the search. Since av1_single_motion_search is only
+          // called by handle_new_mv in SIMPLE_TRANSLATION mode, we set the
+          // best_mv to INVALID mv to signal that we wish to terminate search
+          // for the current mode.
+          x->best_mv.as_int = INVALID_MV;
+          return;
+        }
+      }
+    }
+  }
+
   if (cpi->common.cur_frame_force_integer_mv) {
     convert_fullmv_to_mv(&x->best_mv);
   }
+
   const int use_fractional_mv =
       bestsme < INT_MAX && cpi->common.cur_frame_force_integer_mv == 0;
   if (use_fractional_mv) {
diff --git a/av1/encoder/motion_search_facade.h b/av1/encoder/motion_search_facade.h
index 960df34..bf5d635 100644
--- a/av1/encoder/motion_search_facade.h
+++ b/av1/encoder/motion_search_facade.h
@@ -18,9 +18,20 @@
 extern "C" {
 #endif
 
+typedef struct {
+  int64_t rd;
+  int drl_cost;  // Added to full_mv_rate when comparing drl candidates.
+
+  int rate_mv;
+  int_mv mv;
+
+  int_mv full_search_mv;  // Full-pel search result as an MV; INVALID_MV if unset.
+  int full_mv_rate;       // Bit cost of full_search_mv relative to the ref_mv.
+} inter_mode_info;
+
 void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
                               BLOCK_SIZE bsize, int ref_idx, int *rate_mv,
-                              int search_range);
+                              int search_range, inter_mode_info *mode_info);
 
 void av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
                              BLOCK_SIZE bsize, int_mv *cur_mv,
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 083e348..0c170d0 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1055,8 +1055,8 @@
 
 static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
                             const BLOCK_SIZE bsize, int_mv *cur_mv,
-                            int *const rate_mv,
-                            HandleInterModeArgs *const args) {
+                            int *const rate_mv, HandleInterModeArgs *const args,
+                            inter_mode_info *mode_info) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   const int is_comp_pred = has_second_ref(mbmi);
@@ -1165,7 +1165,8 @@
       }
     }
 
-    av1_single_motion_search(cpi, x, bsize, ref_idx, rate_mv, search_range);
+    av1_single_motion_search(cpi, x, bsize, ref_idx, rate_mv, search_range,
+                             mode_info);
     if (x->best_mv.as_int == INVALID_MV) return INT64_MAX;
 
     args->single_newmv[ref_mv_idx][refs[0]] = x->best_mv;
@@ -1358,7 +1359,7 @@
       const uint32_t cur_mv = mbmi->mv[0].as_int;
       assert(!is_comp_pred);
       if (have_newmv_in_inter_mode(this_mode)) {
-        av1_single_motion_search(cpi, x, bsize, 0, &tmp_rate_mv, INT_MAX);
+        av1_single_motion_search(cpi, x, bsize, 0, &tmp_rate_mv, INT_MAX, NULL);
         mbmi->mv[0].as_int = x->best_mv.as_int;
         tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
       }
@@ -1890,13 +1891,6 @@
   return false;
 }
 
-typedef struct {
-  int64_t rd;
-  int drl_cost;
-  int rate_mv;
-  int_mv mv;
-} inter_mode_info;
-
 // Compute the estimated RD cost for the motion vector with simple translation.
 static int64_t simple_translation_pred_rd(
     AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
@@ -2129,6 +2123,7 @@
   const int base_rate =
       args->ref_frame_cost + args->single_comp_cost + ref_mv_cost;
   for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
+    mode_info[ref_mv_idx].full_search_mv.as_int = INVALID_MV;
     mode_info[ref_mv_idx].mv.as_int = INVALID_MV;
     mode_info[ref_mv_idx].rd = INT64_MAX;
     if (!mask_check_bit(idx_mask, ref_mv_idx)) {
@@ -2175,7 +2170,8 @@
         cur_mv[0] = args->single_newmv[ref_mv_idx][ref0];
         rate_mv = args->single_newmv_rate[ref_mv_idx][ref0];
       } else {
-        newmv_ret_val = handle_newmv(cpi, x, bsize, cur_mv, &rate_mv, args);
+        newmv_ret_val =
+            handle_newmv(cpi, x, bsize, cur_mv, &rate_mv, args, mode_info);
       }
 #if CONFIG_COLLECT_COMPONENT_TIMING
       end_timing(cpi, handle_newmv_time);
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 2adb00e..d5c9bc9 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -445,6 +445,7 @@
     sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
     sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS;
     sf->mv_sf.search_method = DIAMOND;
+
     sf->inter_sf.disable_sb_level_mv_cost_upd = 1;
     // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine
     // it with cpi->sf.disable_wedge_search_var_thresh.
@@ -456,6 +457,7 @@
     sf->inter_sf.prune_motion_mode_level = boosted ? 2 : 3;
     sf->inter_sf.selective_ref_frame = 4;
     sf->inter_sf.skip_repeated_ref_mv = 1;
+    sf->inter_sf.skip_repeated_full_newmv = 1;
     if (cpi->oxcf.enable_smooth_interintra)
       sf->inter_sf.disable_smooth_interintra = boosted ? 0 : 1;
     sf->inter_sf.reuse_compound_type_decision = 1;
@@ -966,6 +968,7 @@
   inter_sf->prune_comp_search_by_single_result = 0;
   inter_sf->skip_repeated_ref_mv = 0;
   inter_sf->skip_repeated_newmv = 0;
+  inter_sf->skip_repeated_full_newmv = 0;
   inter_sf->prune_single_motion_modes_by_simple_trans = 0;
   inter_sf->inter_mode_rd_model_estimation = 0;
   inter_sf->prune_compound_using_single_ref = 0;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 3facef7..bb5cb6d 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -523,6 +523,14 @@
   // flag to skip NEWMV mode in drl if the motion search result is the same
   int skip_repeated_newmv;
 
+  // Skip the current ref_mv in NEW_MV mode if we have already encountered
+  // another ref_mv in the drl such that:
+  //  1. The other drl has the same fullpel_mv during the SIMPLE_TRANSLATION
+  //     search process as the current fullpel_mv.
+  //  2. The rate needed to encode the current fullpel_mv is larger than that
+  //     for the other ref_mv.
+  int skip_repeated_full_newmv;
+
   // This speed feature checks duplicate ref MVs among NEARESTMV, NEARMV,
   // GLOBALMV and skips NEARMV or GLOBALMV (in order) if a duplicate is found
   // TODO(any): Instead of skipping repeated ref mv, use the recalculated