Enable 2-pass coding block partition search

Obtain the most likely partition range from a first-pass,
square-block-based partition search. Use this constrained partition
search range for the full rate-distortion optimization search in the
second pass.

Tested on pedestrian 1080p at 2000 kbps, this change makes encoding
40% faster at speed 0 and 30% faster at speed 1. The average coding
performance loss is around 0.15%.

Change-Id: Ifc83d48e6413d1b887e68cd1962084e018a2258f
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 28fa1a7..3751c19 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -169,8 +169,14 @@
   // to select transform kernel.
   int rd_model;
 
+  // Indicates whether the encoder is running the first-pass partition search.
+  // If so, certain speed features are applied to reduce the overhead cost of
+  // the first-pass search.
   int cb_partition_scan;
 
+  // Set to 1 to apply the constrained coding block partition search range.
+  int use_cb_search_range;
+
   // Also save RD info on the TX size search level for square TX sizes.
   TX_SIZE_RD_RECORD
   tx_size_rd_record_8X8[(MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)];
diff --git a/av1/encoder/context_tree.h b/av1/encoder/context_tree.h
index a14e1f2..c99a9c3 100644
--- a/av1/encoder/context_tree.h
+++ b/av1/encoder/context_tree.h
@@ -23,6 +23,17 @@
 struct AV1Common;
 struct ThreadData;
 
+typedef enum {
+  // Search all the partition types in this plane.
+  SEARCH_FULL_PLANE = 0,
+  // Only search the PARTITION_NONE coding block (no split, horz or vert).
+  NONE_PARTITION_PLANE = 1,
+  // Search all the partition types in this plane except split.
+  SEARCH_SAME_PLANE = 2,
+  // Skip the partition search on this plane and go directly to split.
+  SPLIT_PLANE = 3,
+} CB_TREE_SEARCH;
+
 // Structure to hold snapshot of coding context during the mode picking process
 typedef struct {
   MODE_INFO mic;
@@ -80,6 +91,7 @@
   PICK_MODE_CONTEXT horizontal4[4];
   PICK_MODE_CONTEXT vertical4[4];
 #endif
+  CB_TREE_SEARCH cb_search_range;
   struct PC_TREE *split[4];
 } PC_TREE;
 
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 0a9c189..015bf0c 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -2418,6 +2418,17 @@
 }
 #endif  // CONFIG_DIST_8X8
 
+static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
+  pc_tree->partitioning = PARTITION_NONE;
+  pc_tree->cb_search_range = SEARCH_FULL_PLANE;  // Search all types.
+  // Blocks of at least 8x8 also reset each of their four split children.
+  if (bsize >= BLOCK_8X8) {
+    BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
+    for (int idx = 0; idx < 4; ++idx)
+      reset_partition(pc_tree->split[idx], subsize);
+  }
+}
+
 static void rd_pick_sqr_partition(const AV1_COMP *const cpi, ThreadData *td,
                                   TileDataEnc *tile_data, TOKENEXTRA **tp,
                                   int mi_row, int mi_col, BLOCK_SIZE bsize,
@@ -2433,7 +2444,7 @@
   PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
   int tmp_partition_cost[PARTITION_TYPES];
   BLOCK_SIZE subsize;
-  RD_STATS this_rdc, sum_rdc, best_rdc;
+  RD_STATS this_rdc, sum_rdc, best_rdc, pn_rdc;
   const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8);
   int do_square_split = bsize_at_least_8x8;
   const int pl = bsize_at_least_8x8
@@ -2508,6 +2519,8 @@
 
   // PARTITION_NONE
   if (partition_none_allowed) {
+    if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
+
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
 #if CONFIG_EXT_PARTITION_TYPES
                      PARTITION_NONE,
@@ -2536,6 +2549,8 @@
         best_rdc = this_rdc;
         if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
 
+        pc_tree->cb_search_range = SEARCH_FULL_PLANE;
+
         // If all y, u, v transform blocks in this partition are skippable, and
         // the dist & rate are within the thresholds, the partition search is
         // terminated for current branch of the partition search tree.
@@ -2556,6 +2571,7 @@
   if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
 
   int64_t temp_best_rdcost = best_rdc.rdcost;
+  pn_rdc = best_rdc;
 
 #if CONFIG_DIST_8X8
   uint8_t *src_plane_8x8[MAX_MB_PLANE], *dst_plane_8x8[MAX_MB_PLANE];
@@ -2623,6 +2639,26 @@
       }
     }
 
+    int has_split = 0;
+    if (pc_tree->partitioning == PARTITION_SPLIT) {
+      for (int cb_idx = 0; cb_idx <= AOMMIN(idx, 3); ++cb_idx) {
+        if (pc_tree->split[cb_idx]->partitioning == PARTITION_SPLIT)
+          ++has_split;
+      }
+
+      if (has_split >= 3 || sum_rdc.rdcost < (pn_rdc.rdcost >> 1)) {
+        pc_tree->cb_search_range = SPLIT_PLANE;
+      }
+    }
+
+    if (pc_tree->partitioning == PARTITION_NONE) {
+      pc_tree->cb_search_range = SEARCH_SAME_PLANE;
+      if (pn_rdc.dist <= sum_rdc.dist)
+        pc_tree->cb_search_range = NONE_PARTITION_PLANE;
+    }
+
+    if (pn_rdc.rate == INT_MAX) pc_tree->cb_search_range = NONE_PARTITION_PLANE;
+
     restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
   }  // if (do_split)
 
@@ -2796,6 +2832,24 @@
     partition_vert_allowed &= !has_cols;
   }
 
+  if (x->use_cb_search_range && cpi->sf.auto_min_max_partition_size == 0) {
+    if (pc_tree->cb_search_range == SPLIT_PLANE) {
+      partition_none_allowed = 0;
+      partition_horz_allowed = 0;
+      partition_vert_allowed = 0;
+    }
+
+    if (pc_tree->cb_search_range == SEARCH_SAME_PLANE) {
+      do_square_split = 0;
+    }
+
+    if (pc_tree->cb_search_range == NONE_PARTITION_PLANE) {
+      do_square_split = 0;
+      partition_horz_allowed = 0;
+      partition_vert_allowed = 0;
+    }
+  }
+
   xd->above_txfm_context =
       cm->above_txfm_context + (mi_col << TX_UNIT_WIDE_LOG2);
   xd->left_txfm_context = xd->left_txfm_context_buffer +
@@ -3657,9 +3711,12 @@
                                 &x->min_partition_size, &x->max_partition_size);
       }
 
+      reset_partition(pc_root, cm->sb_size);
+      x->use_cb_search_range = 0;
       if (cpi->sf.two_pass_partition_search &&
           mi_row + mi_size_high[cm->sb_size] < cm->mi_rows &&
-          mi_col + mi_size_wide[cm->sb_size] < cm->mi_cols) {
+          mi_col + mi_size_wide[cm->sb_size] < cm->mi_cols &&
+          cm->frame_type != KEY_FRAME) {
         x->cb_partition_scan = 1;
         rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row, mi_col,
                               cm->sb_size, &dummy_rdc, INT64_MAX, pc_root,
@@ -3693,6 +3750,7 @@
           }
         }
 
+        x->use_cb_search_range = 1;
         rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, cm->sb_size,
                           &dummy_rdc, INT64_MAX, pc_root, NULL);
       } else {
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 0764608..ef04729 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -9708,7 +9708,7 @@
 
 #if CONFIG_FRAME_MARKER
     if (sf->selective_ref_frame) {
-      if (sf->selective_ref_frame == 2) {
+      if (sf->selective_ref_frame == 2 || x->cb_partition_scan) {
         if (mbmi->ref_frame[0] == ALTREF2_FRAME ||
             mbmi->ref_frame[1] == ALTREF2_FRAME)
           if (cm->cur_frame->alt2_frame_offset < cm->frame_offset) continue;