Add a speed feature to limit ref frame search

Limit the ref frame candidates to those picked during the initial
partition search (the two_pass_partition_search speed feature).

Compression quality impact is neutral.

Encoding speed improvement on top of speed 1 (30 frames):

                  QP=20    QP=40
akiyo_cif:         15%      22%
cheer_cif:          2%       1%
city_cif:          10%      14%
coastguard_cif:     8%      16%
container_cif:    6.5%    17.5%
crew_cif:         8.5%    11.5%
AVERAGE:          8.4%    13.8%

Enabled for speed 1 and above.

STATS_CHANGED

Change-Id: I5f43fb7c4e9932240f8d163bad663e8249952fa3
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 9ae6366..643861f 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -154,6 +154,10 @@
   // cost in the first pass search.
   int cb_partition_scan;
 
+  // If 0, do not allow corresponding ref frame during RD search.
+  uint8_t ref0_candidate_mask[REF_FRAMES + 1];  // The last entry is a counter.
+  uint8_t ref1_candidate_mask[REF_FRAMES];
+
   // Activate constrained coding block partition search range.
   int use_cb_search_range;
 
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index ecba645..f403792 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -3536,11 +3536,17 @@
 
       reset_partition(pc_root, cm->seq_params.sb_size);
       x->use_cb_search_range = 0;
+      memset(x->ref0_candidate_mask, 1, sizeof(x->ref0_candidate_mask));
+      memset(x->ref1_candidate_mask, 1, sizeof(x->ref1_candidate_mask));
       if (cpi->sf.two_pass_partition_search &&
           mi_row + mi_size_high[cm->seq_params.sb_size] < cm->mi_rows &&
           mi_col + mi_size_wide[cm->seq_params.sb_size] < cm->mi_cols &&
           cm->frame_type != KEY_FRAME) {
         x->cb_partition_scan = 1;
+        if (sf->mode_pruning_based_on_two_pass_partition_search) {
+          av1_zero(x->ref0_candidate_mask);
+          av1_zero(x->ref1_candidate_mask);
+        }
         rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row, mi_col,
                               cm->seq_params.sb_size, &dummy_rdc, INT64_MAX,
                               pc_root, NULL);
@@ -3573,6 +3579,22 @@
         }
 
         x->use_cb_search_range = 1;
+
+        if (sf->mode_pruning_based_on_two_pass_partition_search) {
+          if (x->ref0_candidate_mask[REF_FRAMES] < 16) {
+            // If there are not enough samples recorded, make all available.
+            memset(x->ref0_candidate_mask, 1, sizeof(x->ref0_candidate_mask));
+            memset(x->ref1_candidate_mask, 1, sizeof(x->ref1_candidate_mask));
+          } else if (sf->selective_ref_frame < 2) {
+            // ALTREF2_FRAME and BWDREF_FRAME may be skipped during the initial
+            // partition scan, so we don't eliminate them.
+            x->ref0_candidate_mask[ALTREF2_FRAME] = 1;
+            x->ref1_candidate_mask[ALTREF2_FRAME] = 1;
+            x->ref0_candidate_mask[BWDREF_FRAME] = 1;
+            x->ref1_candidate_mask[BWDREF_FRAME] = 1;
+          }
+        }
+
         rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
                           cm->seq_params.sb_size, &dummy_rdc, INT64_MAX,
                           pc_root, NULL);
@@ -4632,6 +4654,15 @@
   const int mi_height = mi_size_high[bsize];
   const int is_inter = is_inter_block(mbmi);
 
+  if (cpi->sf.mode_pruning_based_on_two_pass_partition_search &&
+      x->cb_partition_scan) {
+    // Increase the counter of data samples.
+    ++x->ref0_candidate_mask[REF_FRAMES];
+    // Record that ref_frame[0] and ref_frame[1] are picked.
+    x->ref0_candidate_mask[mbmi->ref_frame[0]] = 1;
+    if (mbmi->ref_frame[1] >= 0) x->ref1_candidate_mask[mbmi->ref_frame[1]] = 1;
+  }
+
   if (!is_inter) {
     xd->cfl.is_chroma_reference = is_chroma_reference(
         mi_row, mi_col, bsize, cm->subsampling_x, cm->subsampling_y);
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 47ba46c..db66b37 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -8993,6 +8993,13 @@
   const MV_REFERENCE_FRAME *ref_frame = av1_mode_order[mode_index].ref_frame;
   const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode;
 
+  if (cpi->sf.mode_pruning_based_on_two_pass_partition_search &&
+      !x->cb_partition_scan) {
+    if (!x->ref0_candidate_mask[ref_frame[0]] ||
+        (ref_frame[1] >= 0 && !x->ref1_candidate_mask[ref_frame[1]]))
+      return 1;
+  }
+
   if (ref_frame[0] > INTRA_FRAME && ref_frame[1] == INTRA_FRAME) {
     // Mode must by compatible
     if (!is_interintra_allowed_mode(this_mode)) return 1;
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index a3cb7a0..c573ecb 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -148,6 +148,7 @@
     sf->tx_size_search_init_depth_sqr = 1;
     sf->tx_size_search_lgr_block = 1;
     sf->two_pass_partition_search = 1;
+    sf->mode_pruning_based_on_two_pass_partition_search = 1;
     sf->prune_ext_partition_types_search = 1;
     sf->use_fast_interpolation_filter_search = 1;
     sf->tx_type_search.skip_tx_search = 1;
@@ -445,6 +446,7 @@
   sf->txb_split_cap = 1;
   sf->adaptive_txb_search = 0;
   sf->two_pass_partition_search = 0;
+  sf->mode_pruning_based_on_two_pass_partition_search = 0;
   sf->use_intra_txb_hash = 0;
   sf->use_inter_txb_hash = 1;
   sf->use_mb_rd_hash = 1;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index d6edab9..4e38709 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -383,6 +383,10 @@
   // 2-pass coding block partition search
   int two_pass_partition_search;
 
+  // Use the mode decisions made in the initial partition search to prune mode
+  // candidates, e.g. ref frames.
+  int mode_pruning_based_on_two_pass_partition_search;
+
   // Skip rectangular partition test when partition type none gives better
   // rd than partition type split.
   int less_rectangular_check;