Prune HORZ4/VERT4 based on HORZ/VERT of split

This patch adds a speed feature prune_4_partition_using_split_info
to prune HORZ4/VERT4 partitions based on HORZ/VERT winner
info from split partitions.

This speed feature is enabled for cpu-used >= 3, except when screen content tools are allowed.

            Encode Time             Quality loss
cpu-used     Reduction    avg.psnr    ovr.psnr    ssim
   3           2.348%      0.0583%     0.0653%     0.0711%
   4           0.772%      0.0383%     0.0311%     0.0221%

Change-Id: I03e55a7461727cb2de8e2aaa4309bea172cb0736
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 687d688..c1bbb7c 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -2584,6 +2584,12 @@
   }
 }
 
+// Structure to keep win flags for HORZ and VERT partition evaluations
+typedef struct {
+  bool horz_win;
+  bool vert_win;
+} RD_RECT_PART_WIN_INFO;
+
 // TODO(jinging,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previous rate-distortion optimization
 // results, for encoding speed-up.
@@ -2593,7 +2599,8 @@
                               BLOCK_SIZE max_sq_part, BLOCK_SIZE min_sq_part,
                               RD_STATS *rd_cost, RD_STATS best_rdc,
                               PC_TREE *pc_tree, int64_t *none_rd,
-                              SB_MULTI_PASS_MODE multi_pass_mode) {
+                              SB_MULTI_PASS_MODE multi_pass_mode,
+                              RD_RECT_PART_WIN_INFO *rect_part_win_info) {
   const AV1_COMMON *const cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
   TileInfo *const tile_info = &tile_data->tile_info;
@@ -2626,6 +2633,10 @@
   int horz_ctx_is_ready = 0;
   int vert_ctx_is_ready = 0;
   BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+  // Initialise HORZ and VERT win flags as true for all split partitions
+  RD_RECT_PART_WIN_INFO split_part_rect_win[4] = {
+    { true, true }, { true, true }, { true, true }, { true, true }
+  };
 
   bool found_best_partition = false;
   if (best_rdc.rdcost < 0) {
@@ -2977,7 +2988,8 @@
       if (!rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
                              mi_col + x_idx, subsize, max_sq_part, min_sq_part,
                              &this_rdc, best_remain_rdcost, pc_tree->split[idx],
-                             p_split_rd, multi_pass_mode)) {
+                             p_split_rd, multi_pass_mode,
+                             &split_part_rect_win[idx])) {
         av1_invalid_rd_stats(&sum_rdc);
         break;
       }
@@ -3128,6 +3140,11 @@
         found_best_partition = true;
         pc_tree->partitioning = PARTITION_HORZ;
       }
+    } else {
+      // Update HORZ win flag
+      if (rect_part_win_info != NULL) {
+        rect_part_win_info->horz_win = false;
+      }
     }
 
     restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
@@ -3210,6 +3227,11 @@
       best_rdc = sum_rdc;
       found_best_partition = true;
       pc_tree->partitioning = PARTITION_VERT;
+    } else {
+      // Update VERT win flag
+      if (rect_part_win_info != NULL) {
+        rect_part_win_info->vert_win = false;
+      }
     }
 
     restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
@@ -3539,6 +3561,27 @@
     partition_vert4_allowed = 0;
   }
 
+  if (cpi->sf.part_sf.prune_4_partition_using_split_info &&
+      (partition_horz4_allowed || partition_vert4_allowed)) {
+    // Count of child blocks in which HORZ or VERT partition has won
+    int num_child_horz_win = 0, num_child_vert_win = 0;
+    for (int idx = 0; idx < 4; idx++) {
+      num_child_horz_win += (split_part_rect_win[idx].horz_win) ? 1 : 0;
+      num_child_vert_win += (split_part_rect_win[idx].vert_win) ? 1 : 0;
+    }
+
+    // Prune HORZ4/VERT4 partitions based on number of HORZ/VERT winners of
+    // split partitions.
+    // Conservative pruning for high quantizers
+    const int num_win_thresh = 3 * (MAXQ - x->qindex) / MAXQ + 1;
+    if (num_child_horz_win < num_win_thresh) {
+      partition_horz4_allowed = 0;
+    }
+    if (num_child_vert_win < num_win_thresh) {
+      partition_vert4_allowed = 0;
+    }
+  }
+
   // PARTITION_HORZ_4
   assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_horz4_allowed));
   if (!terminate_partition_search && partition_horz4_allowed && has_rows &&
@@ -4578,14 +4621,14 @@
     if (num_passes == 1) {
       rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
                         max_sq_size, min_sq_size, &dummy_rdc, dummy_rdc,
-                        pc_root, NULL, SB_SINGLE_PASS);
+                        pc_root, NULL, SB_SINGLE_PASS, NULL);
     } else {
       // First pass
       SB_FIRST_PASS_STATS sb_fp_stats;
       backup_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col);
       rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
                         max_sq_size, min_sq_size, &dummy_rdc, dummy_rdc,
-                        pc_root, NULL, SB_DRY_PASS);
+                        pc_root, NULL, SB_DRY_PASS, NULL);
 
       // Second pass
       init_encode_rd_sb(cpi, td, tile_data, pc_root, &dummy_rdc, mi_row, mi_col,
@@ -4597,7 +4640,7 @@
 
       rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
                         max_sq_size, min_sq_size, &dummy_rdc, dummy_rdc,
-                        pc_root, NULL, SB_WET_PASS);
+                        pc_root, NULL, SB_WET_PASS, NULL);
     }
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index d5c9bc9..6b0e1a6 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -437,6 +437,8 @@
 
     sf->part_sf.less_rectangular_check_level = 2;
     sf->part_sf.simple_motion_search_prune_agg = 1;
+    sf->part_sf.prune_4_partition_using_split_info =
+        cm->allow_screen_content_tools ? 0 : 1;
 
     // adaptive_motion_search breaks encoder multi-thread tests.
     // The values in x->pred_mv[] differ for single and multi-thread cases.
@@ -931,6 +933,7 @@
   part_sf->simple_motion_search_early_term_none = 0;
   part_sf->intra_cnn_split = 0;
   part_sf->ext_partition_eval_thresh = BLOCK_8X8;
+  part_sf->prune_4_partition_using_split_info = 0;
 }
 
 static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) {
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index bb5cb6d..d610383 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -428,6 +428,9 @@
 
   // Disable extended partition search for lower block sizes.
   int ext_partition_eval_thresh;
+
+  // Prune 1:4 partition search based on winner info from split partitions
+  int prune_4_partition_using_split_info;
 } PARTITION_SPEED_FEATURES;
 
 typedef struct MV_SPEED_FEATURES {