Use more localized stats for ref frame candidate pruning

For the mode_pruning_based_on_two_pass_partition_search speed feature,
collect mode decision stats for every 32x32 block in the first pass of
partition search, and use the stats to prune modes in the
corresponding regions.

Currently the pruning is only applied to reference frames; the pruning
strategy is to skip reference frames that have not been selected in the
first partition search pass.

This only affects speed >= 1.

Compression loss on lowres and midres(30 frames) is about 0.1%.
Encoding speed gains on some cif sequences with QP = 30:
cheer_cif:        6.4%
city_cif:         9.4%
coastguard_cif:  16.3%
container_cif:    9.0%
crew_cif:        11.3%
average:         10.5%

Change-Id: I31211f16cbe639d2230babc21870d7ac54ca9a89
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 48356a0..98b4df6 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -141,6 +141,31 @@
   struct tx_size_rd_info_node *children[4];
 } TXB_RD_INFO_NODE;
 
+// Region size for mode decision sampling in the first pass of partition
+// search (two_pass_partition_search speed feature), in units of mi size (4).
+// Used by the mode_pruning_based_on_two_pass_partition_search speed feature.
+#define FIRST_PARTITION_PASS_SAMPLE_REGION 8
+#define FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2 3
+#define FIRST_PARTITION_PASS_STATS_TABLES                      \
+  ((MAX_MIB_SIZE >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2) * \
+   (MAX_MIB_SIZE >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2))
+#define FIRST_PARTITION_PASS_STATS_STRIDE \
+  (MAX_MIB_SIZE_LOG2 - FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2)
+
+static INLINE int av1_first_partition_pass_stats_index(int mi_row, int mi_col) {
+  // Row-major table index of the sample region holding (mi_row, mi_col).
+  const int shift = FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2;
+  const int region_row = (mi_row & MAX_MIB_MASK) >> shift;
+  const int region_col = (mi_col & MAX_MIB_MASK) >> shift;
+  return (region_row << FIRST_PARTITION_PASS_STATS_STRIDE) + region_col;
+}
+
+typedef struct {
+  uint8_t ref0_counts[REF_FRAMES];  // Times each frame was ref_frame[0].
+  uint8_t ref1_counts[REF_FRAMES];  // Times each frame was ref_frame[1].
+  int sample_counts;                // Number of samples collected.
+} FIRST_PARTITION_PASS_STATS;
+
 typedef struct macroblock MACROBLOCK;
 struct macroblock {
   struct macroblock_plane plane[MAX_MB_PLANE];
@@ -155,9 +180,8 @@
   // cost in the first pass search.
   int cb_partition_scan;
 
-  // If 0, do not allow corresponding ref frame during RD search.
-  uint8_t ref0_candidate_mask[REF_FRAMES + 1];  // The last entry is a counter.
-  uint8_t ref1_candidate_mask[REF_FRAMES];
+  FIRST_PARTITION_PASS_STATS
+  first_partition_pass_stats[FIRST_PARTITION_PASS_STATS_TABLES];
 
   // Activate constrained coding block partition search range.
   int use_cb_search_range;
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 28e0162..17a42a4 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -3428,6 +3428,20 @@
   }
 }
 
+// Set every counter in every stats table to its maximum value.
+static void init_first_partition_pass_stats_tables(
+    FIRST_PARTITION_PASS_STATS *stats) {
+  for (int idx = 0; idx < FIRST_PARTITION_PASS_STATS_TABLES; ++idx, ++stats) {
+    stats->sample_counts = INT_MAX;
+    memset(stats->ref0_counts, 0xff, sizeof(stats->ref0_counts));
+    memset(stats->ref1_counts, 0xff, sizeof(stats->ref1_counts));
+  }
+}
+
+// Minimum number of first-pass samples a region needs before its stats are
+// used by the mode_pruning_based_on_two_pass_partition_search feature.
+#define FIRST_PARTITION_PASS_MIN_SAMPLES 16
+
 static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
                              TileDataEnc *tile_data, int mi_row,
                              TOKENEXTRA **tp) {
@@ -3582,17 +3596,15 @@
 
       reset_partition(pc_root, cm->seq_params.sb_size);
       x->use_cb_search_range = 0;
-      memset(x->ref0_candidate_mask, 1, sizeof(x->ref0_candidate_mask));
-      memset(x->ref1_candidate_mask, 1, sizeof(x->ref1_candidate_mask));
+      init_first_partition_pass_stats_tables(x->first_partition_pass_stats);
       if (cpi->sf.two_pass_partition_search &&
           mi_row + mi_size_high[cm->seq_params.sb_size] < cm->mi_rows &&
           mi_col + mi_size_wide[cm->seq_params.sb_size] < cm->mi_cols &&
           cm->frame_type != KEY_FRAME) {
         x->cb_partition_scan = 1;
-        if (sf->mode_pruning_based_on_two_pass_partition_search) {
-          av1_zero(x->ref0_candidate_mask);
-          av1_zero(x->ref1_candidate_mask);
-        }
+        // Reset the stats tables.
+        if (sf->mode_pruning_based_on_two_pass_partition_search)
+          av1_zero(x->first_partition_pass_stats);
         rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row, mi_col,
                               cm->seq_params.sb_size, &dummy_rdc, INT64_MAX,
                               pc_root, NULL);
@@ -3627,17 +3639,21 @@
         x->use_cb_search_range = 1;
 
         if (sf->mode_pruning_based_on_two_pass_partition_search) {
-          if (x->ref0_candidate_mask[REF_FRAMES] < 16) {
-            // If there are not enough samples recorded, make all available.
-            memset(x->ref0_candidate_mask, 1, sizeof(x->ref0_candidate_mask));
-            memset(x->ref1_candidate_mask, 1, sizeof(x->ref1_candidate_mask));
-          } else if (sf->selective_ref_frame < 2) {
-            // ALTREF2_FRAME and BWDREF_FRAME may be skipped during the initial
-            // partition scan, so we don't eliminate them.
-            x->ref0_candidate_mask[ALTREF2_FRAME] = 1;
-            x->ref1_candidate_mask[ALTREF2_FRAME] = 1;
-            x->ref0_candidate_mask[BWDREF_FRAME] = 1;
-            x->ref1_candidate_mask[BWDREF_FRAME] = 1;
+          for (i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
+            FIRST_PARTITION_PASS_STATS *const stat =
+                &x->first_partition_pass_stats[i];
+            if (stat->sample_counts < FIRST_PARTITION_PASS_MIN_SAMPLES) {
+              // If there are not enough samples collected, make all available.
+              memset(stat->ref0_counts, 0xff, sizeof(stat->ref0_counts));
+              memset(stat->ref1_counts, 0xff, sizeof(stat->ref1_counts));
+            } else if (sf->selective_ref_frame < 2) {
+              // ALTREF2_FRAME and BWDREF_FRAME may be skipped during the
+              // initial partition scan, so we don't eliminate them.
+              stat->ref0_counts[ALTREF2_FRAME] = 0xff;
+              stat->ref1_counts[ALTREF2_FRAME] = 0xff;
+              stat->ref0_counts[BWDREF_FRAME] = 0xff;
+              stat->ref1_counts[BWDREF_FRAME] = 0xff;
+            }
           }
         }
 
@@ -4705,11 +4721,23 @@
 
   if (cpi->sf.mode_pruning_based_on_two_pass_partition_search &&
       x->cb_partition_scan) {
-    // Increase the counter of data samples.
-    ++x->ref0_candidate_mask[REF_FRAMES];
-    // Record that ref_frame[0] and ref_frame[1] are picked.
-    x->ref0_candidate_mask[mbmi->ref_frame[0]] = 1;
-    if (mbmi->ref_frame[1] >= 0) x->ref1_candidate_mask[mbmi->ref_frame[1]] = 1;
+    for (int row = mi_row; row < mi_row + mi_height;
+         row += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+      for (int col = mi_col; col < mi_col + mi_width;
+           col += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+        const int index = av1_first_partition_pass_stats_index(row, col);
+        FIRST_PARTITION_PASS_STATS *const stats =
+            &x->first_partition_pass_stats[index];
+        // One more sample for each region visited (rows: height, cols: width).
+        ++stats->sample_counts;
+        // Bump the ref_frame[0]/[1] counters, capping each at 255 (no wrap).
+        if (stats->ref0_counts[mbmi->ref_frame[0]] < 255)
+          ++stats->ref0_counts[mbmi->ref_frame[0]];
+        if (mbmi->ref_frame[1] >= 0 &&
+            stats->ref1_counts[mbmi->ref_frame[1]] < 255)
+          ++stats->ref1_counts[mbmi->ref_frame[1]];
+      }
+    }
   }
 
   if (!is_inter) {
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 5be29e7..2c634d5 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -9069,7 +9069,8 @@
 static int inter_mode_search_order_independent_skip(const AV1_COMP *cpi,
                                                     const MACROBLOCK *x,
                                                     BLOCK_SIZE bsize,
-                                                    int mode_index) {
+                                                    int mode_index, int mi_row,
+                                                    int mi_col) {
   const SPEED_FEATURES *const sf = &cpi->sf;
   const AV1_COMMON *const cm = &cpi->common;
   const struct segmentation *const seg = &cm->seg;
@@ -9081,9 +9082,26 @@
 
   if (cpi->sf.mode_pruning_based_on_two_pass_partition_search &&
       !x->cb_partition_scan) {
-    if (!x->ref0_candidate_mask[ref_frame[0]] ||
-        (ref_frame[1] >= 0 && !x->ref1_candidate_mask[ref_frame[1]]))
-      return 1;
+    const int mi_width = mi_size_wide[bsize];
+    const int mi_height = mi_size_high[bsize];
+    int found = 0;
+    // Keep the mode if any sample region covered by the block (height rows by
+    // width columns) picked these ref frames in the first partition pass.
+    for (int row = mi_row; row < mi_row + mi_height && !found;
+         row += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+      for (int col = mi_col; col < mi_col + mi_width && !found;
+           col += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+        const int index = av1_first_partition_pass_stats_index(row, col);
+        const FIRST_PARTITION_PASS_STATS *const stats =
+            &x->first_partition_pass_stats[index];
+        if (stats->ref0_counts[ref_frame[0]] &&
+            (ref_frame[1] < 0 || stats->ref1_counts[ref_frame[1]])) {
+          found = 1;
+          break;
+        }
+      }
+    }
+    if (!found) return 1;
   }
 
   if (ref_frame[0] > INTRA_FRAME && ref_frame[1] == INTRA_FRAME) {
@@ -9257,7 +9275,8 @@
     x->skip = 0;
     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
 
-    if (inter_mode_search_order_independent_skip(cpi, x, bsize, mode_index))
+    if (inter_mode_search_order_independent_skip(cpi, x, bsize, mode_index,
+                                                 mi_row, mi_col))
       continue;
 
     if (ref_frame == INTRA_FRAME) {