Preload wedge mask index to bypass certain motion search

Allow the encoder to re-use the wedge index and sign found by the
initial motion search, and bypass the mask index search in the
subsequent reference MV searches for modes that contain a NEWMV
component. Local tests show about a 20% speed-up at speed 1 over
150 frames. The coding performance loss is:

        overall PSNR   SSIM
lowres   0.050%       0.125%
midres2  0.083%       0.178%

STATS_CHANGED

Change-Id: I47328ffee24297e38f41c38926defe6a80337043
diff --git a/av1/encoder/compound_type.c b/av1/encoder/compound_type.c
index 659b432..8fc29f3 100644
--- a/av1/encoder/compound_type.c
+++ b/av1/encoder/compound_type.c
@@ -1224,7 +1224,8 @@
 static int comp_type_rd_threshold_div[3] = { 3, 16, 16 };
 
 int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
-                         BLOCK_SIZE bsize, int_mv *cur_mv, int mode_search_mask,
+                         HandleInterModeArgs *args, BLOCK_SIZE bsize,
+                         int_mv *cur_mv, int mode_search_mask,
                          int masked_compound_used, const BUFFER_SET *orig_dst,
                          const BUFFER_SET *tmp_dst,
                          const CompoundTypeRdBuffers *buffers, int *rate_mv,
@@ -1353,7 +1354,12 @@
       int_mv tmp_mv[2] = { mbmi->mv[0], mbmi->mv[1] };
       int best_rate_mv = *rate_mv;
       const int wedge_mask_size = get_wedge_types_lookup(bsize);
-      for (int wedge_mask = 0; wedge_mask < wedge_mask_size; ++wedge_mask) {
+
+      int need_mask_search =
+          args->wedge_index == -1 || !have_newmv_in_inter_mode(this_mode);
+
+      for (int wedge_mask = 0; wedge_mask < wedge_mask_size && need_mask_search;
+           ++wedge_mask) {
         for (int wedge_sign = 0; wedge_sign < 2; ++wedge_sign) {
           tmp_rate_mv = *rate_mv;
           mbmi->interinter_comp.wedge_index = wedge_mask;
@@ -1390,6 +1396,24 @@
         }
       }
 
+      if (need_mask_search) {
+        args->wedge_index = best_mask_index;
+        args->wedge_sign = best_wedge_sign;
+      } else {
+        mbmi->interinter_comp.wedge_index = args->wedge_index;
+        mbmi->interinter_comp.wedge_sign = args->wedge_sign;
+        rs2 = masked_type_cost[cur_type];
+        rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+        tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+                                                            bsize, this_mode);
+
+        best_mask_index = args->wedge_index;
+        best_wedge_sign = args->wedge_sign;
+        tmp_mv[0] = mbmi->mv[0];
+        tmp_mv[1] = mbmi->mv[1];
+        best_rate_mv = tmp_rate_mv;
+      }
+
       mbmi->interinter_comp.wedge_index = best_mask_index;
       mbmi->interinter_comp.wedge_sign = best_wedge_sign;
       mbmi->mv[0] = tmp_mv[0];
diff --git a/av1/encoder/compound_type.h b/av1/encoder/compound_type.h
index bad0ba8..a028a35 100644
--- a/av1/encoder/compound_type.h
+++ b/av1/encoder/compound_type.h
@@ -36,7 +36,8 @@
                                 const BUFFER_SET *orig_dst);
 
 int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
-                         BLOCK_SIZE bsize, int_mv *cur_mv, int mode_search_mask,
+                         HandleInterModeArgs *args, BLOCK_SIZE bsize,
+                         int_mv *cur_mv, int mode_search_mask,
                          int masked_compound_used, const BUFFER_SET *orig_dst,
                          const BUFFER_SET *tmp_dst,
                          const CompoundTypeRdBuffers *buffers, int *rate_mv,
diff --git a/av1/encoder/interp_search.h b/av1/encoder/interp_search.h
index 8f4fcd7..be57872 100644
--- a/av1/encoder/interp_search.h
+++ b/av1/encoder/interp_search.h
@@ -123,6 +123,14 @@
    * Index of the last set of saved stats in the interp_filter_stats array.
    */
   int interp_filter_stats_idx;
+  /*!
+   * Estimated wedge index; -1 means no index has been saved yet.
+   */
+  int wedge_index;
+  /*!
+   * Estimated wedge sign.
+   */
+  int wedge_sign;
 } HandleInterModeArgs;
 
 /*!\cond */
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 1a6c42e..c81f213 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -2554,9 +2554,9 @@
   // (for example, the mask parameters if it is a masked mode) and compute
   // the RD
   *compmode_interinter_cost = av1_compound_type_rd(
-      cpi, x, bsize, cur_mv, mode_search_mask, masked_compound_used, orig_dst,
-      tmp_dst, rd_buffers, rate_mv, &best_rd_compound, rd_stats, ref_best_rd,
-      skip_rd[1], &is_luma_interp_done, rd_thresh);
+      cpi, x, args, bsize, cur_mv, mode_search_mask, masked_compound_used,
+      orig_dst, tmp_dst, rd_buffers, rate_mv, &best_rd_compound, rd_stats,
+      ref_best_rd, skip_rd[1], &is_luma_interp_done, rd_thresh);
   if (ref_best_rd < INT64_MAX &&
       (best_rd_compound >> comp_type_rd_shift) * comp_type_rd_scale >
           ref_best_rd) {
@@ -2766,6 +2766,9 @@
     save_mv[i][1].as_int = INVALID_MV;
   }
 
+  args->wedge_index = -1;
+  args->wedge_sign = -1;
+
   // Main loop of this function. This will  iterate over all of the ref mvs
   // in the dynamic reference list and do the following:
   //    1.) Get the current MV. Create newmv MV if necessary
@@ -5346,6 +5349,8 @@
                                0,
                                interintra_modes,
                                { { { 0 }, { { 0 } }, { 0 }, 0, 0, 0, 0 } },
+                               0,
+                               0,
                                0 };
   // Indicates the appropriate number of simple translation winner modes for
   // exhaustive motion mode evaluation