Add speed feature to skip repeat interp search

Add a speed feature to skip repeated interpolation
filter searches (turned on for speed >= 1).

1. Save the search result, together with the current mv
and ref_frame, at the end of each search for a block.
2. At the beginning of each search, check the saved
results; if one of them has the same mv and ref_frame,
that result can be reused.
3. Clear the saved results at the beginning of
av1_rd_pick_inter_mode_sb, so a search result is only
reused within the same block.

4. The encoder is about 2.5% faster when encoding
15 frames of BasketballDrill_832x480_50.y4m, with no
coding loss (486668 ms -> 474152 ms).

a) gcc (Ubuntu 5.4.0-6ubuntu1~16.04.9) 5.4.0 20160609
b) CPU: Intel(R) Core(TM) i5-4590 CPU @ 3.30GHz
c) Config cmd:
cmake ../ -DENABLE_CCACHE=1 -DCONFIG_LOWBITDEPTH=1
d) Test cmd:
./aomenc --cpu-used=1 --end-usage=vbr \
--target-bitrate=800 --limit=15

Change-Id: I333c02736c8f4a280ff710fed5e5ec053ad5d9aa
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index d372890..bbd5734 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -166,6 +166,13 @@
   int sample_counts;                // Number of samples collected.
 } FIRST_PARTITION_PASS_STATS;
 
+#define MAX_INTERP_FILTER_STATS 64
+typedef struct {
+  InterpFilters filters;
+  MV mv[2];
+  int8_t ref_frames[2];
+} INTERPOLATION_FILTER_STATS;
+
 typedef struct macroblock MACROBLOCK;
 struct macroblock {
   struct macroblock_plane plane[MAX_MB_PLANE];
@@ -183,6 +190,10 @@
   FIRST_PARTITION_PASS_STATS
   first_partition_pass_stats[FIRST_PARTITION_PASS_STATS_TABLES];
 
+  // [comp_idx][saved stat_idx]
+  INTERPOLATION_FILTER_STATS interp_filter_stats[2][MAX_INTERP_FILTER_STATS];
+  int interp_filter_stats_idx[2];
+
   // Activate constrained coding block partition search range.
   int use_cb_search_range;
 
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 08adfec..24fc462 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -7453,6 +7453,50 @@
   return 0;
 }
 
+// Check whether a saved result matches the mv and ref_frame of this search.
+static INLINE int is_interp_filter_match(const INTERPOLATION_FILTER_STATS *st,
+                                         MB_MODE_INFO *const mi) {
+  for (int i = 0; i < 2; ++i) {
+    const MV *mv = &mi->mv[i].as_mv;
+    if ((st->ref_frames[i] != mi->ref_frame[i]) || (st->mv[i].row != mv->row) ||
+        (st->mv[i].col != mv->col)) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+static INLINE int find_interp_filter_in_stats(MACROBLOCK *x,
+                                              MB_MODE_INFO *const mbmi) {
+  const int comp_idx = mbmi->compound_idx;
+  const int offset = x->interp_filter_stats_idx[comp_idx];
+  for (int j = 0; j < offset; ++j) {
+    const INTERPOLATION_FILTER_STATS *st = &x->interp_filter_stats[comp_idx][j];
+    if (is_interp_filter_match(st, mbmi)) {
+      mbmi->interp_filters = st->filters;
+      return j;
+    }
+  }
+  return -1;  // no match result found
+}
+
+static INLINE void save_interp_filter_search_stat(MACROBLOCK *x,
+                                                  MB_MODE_INFO *const mbmi) {
+  const int comp_idx = mbmi->compound_idx;
+  const int offset = x->interp_filter_stats_idx[comp_idx];
+  if (offset < MAX_INTERP_FILTER_STATS) {
+    const MV mv0 = mbmi->mv[0].as_mv;
+    const MV mv1 = mbmi->mv[1].as_mv;
+    INTERPOLATION_FILTER_STATS stat = {
+      mbmi->interp_filters,
+      { mv0, mv1 },
+      { mbmi->ref_frame[0], mbmi->ref_frame[1] },
+    };
+    x->interp_filter_stats[comp_idx][offset] = stat;
+    x->interp_filter_stats_idx[comp_idx]++;
+  }
+}
+
 static int64_t interpolation_filter_search(
     MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
     int mi_row, int mi_col, const BUFFER_SET *const tmp_dst,
@@ -7463,24 +7507,30 @@
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int need_search =
+      av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd);
   int i, tmp_rate;
   int64_t tmp_dist;
 
   (void)single_filter;
-
+  int match_found = -1;
   const InterpFilter assign_filter = cm->interp_filter;
-  set_default_interp_filters(mbmi, assign_filter);
-
+  if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) {
+    match_found = find_interp_filter_in_stats(x, mbmi);
+  }
+  if (!need_search || match_found == -1) {
+    set_default_interp_filters(mbmi, assign_filter);
+  }
   *switchable_rate = av1_get_switchable_rate(cm, x, xd);
   av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
   model_rd_for_sb(cpi, bsize, x, xd, 0, num_planes - 1, &tmp_rate, &tmp_dist,
                   skip_txfm_sb, skip_sse_sb, NULL, NULL, NULL);
   *rd = RDCOST(x->rdmult, *switchable_rate + tmp_rate, tmp_dist);
 
-  if (assign_filter != SWITCHABLE) {
+  if (assign_filter != SWITCHABLE || match_found != -1) {
     return 0;
   }
-  if (!av1_is_interp_needed(xd) || !av1_is_interp_search_needed(xd)) {
+  if (!need_search) {
     assert(mbmi->interp_filters ==
            av1_broadcast_interp_filter(EIGHTTAP_REGULAR));
     return 0;
@@ -7523,6 +7573,11 @@
     }
   }
   swap_dst_buf(xd, dst_bufs, num_planes);
+  // save search results
+  if (cpi->sf.skip_repeat_interpolation_filter_search) {
+    assert(match_found == -1);
+    save_interp_filter_search_stat(x, mbmi);
+  }
   return 0;
 }
 
@@ -9367,6 +9422,10 @@
     x->use_default_inter_tx_type = 1;
   else
     x->use_default_inter_tx_type = 0;
+  if (cpi->sf.skip_repeat_interpolation_filter_search) {
+    x->interp_filter_stats_idx[0] = 0;
+    x->interp_filter_stats_idx[1] = 0;
+  }
 }
 
 static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x,
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 6893d85..36fbc35 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -165,6 +165,7 @@
     sf->mode_pruning_based_on_two_pass_partition_search = 1;
     sf->prune_ext_partition_types_search_level = 2;
     sf->use_fast_interpolation_filter_search = 1;
+    sf->skip_repeat_interpolation_filter_search = 1;
     sf->tx_type_search.skip_tx_search = 1;
     sf->tx_type_search.ml_tx_split_thresh = 40;
     sf->model_based_prune_tx_search_level = 0;
@@ -490,6 +491,7 @@
   sf->use_transform_domain_distortion = 0;
   sf->gm_search_type = GM_FULL_SEARCH;
   sf->use_fast_interpolation_filter_search = 0;
+  sf->skip_repeat_interpolation_filter_search = 0;
   sf->use_hash_based_trellis = 0;
 
   // Set decoder side speed feature to use less dual sgr modes
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index ecfd15b..ee9a4823e 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -566,6 +566,11 @@
   // usually includes EIGHTTAP_REGULAR.
   int use_fast_interpolation_filter_search;
 
+  // Save the results of interpolation_filter_search for a block.
+  // Before each search, check the mv and ref_frames; if they are the same as
+  // those of a previously saved result, the search can be skipped.
+  int skip_repeat_interpolation_filter_search;
+
   // Use a hash table to store previously computed optimized qcoeffs from
   // expensive calls to optimize_txb.
   int use_hash_based_trellis;