AV1 RT: Don't check leaf split if merged block can be skipped

~20% speed-up at speed 7 on lowres, with 0.5% overall BDRate degradation and
1.1% max BDRate degradation. On midres the max degradation is 4%, so this is
turned off for 480p and higher.

Change-Id: I1d97ca9ece8842f71d334d028d2b0b1b00ccda04
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 9f1e0c7..a46d228 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -2316,12 +2316,12 @@
       }
       break;
     case PARTITION_SPLIT:
-      if (cpi->sf.rt_sf.nonrd_check_partition_merge &&
+      if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode &&
           is_leaf_split_partition(cm, mi_row, mi_col, bsize) &&
           !frame_is_intra_only(cm)) {
         RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
         RD_STATS split_rdc, none_rdc;
-        av1_init_rd_stats(&split_rdc);
+        av1_invalid_rd_stats(&split_rdc);
         av1_invalid_rd_stats(&none_rdc);
         save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
         xd->above_txfm_context =
@@ -2335,32 +2335,36 @@
         none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
         none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
         restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+        if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode != 2 ||
+            none_rdc.skip != 1 || pc_tree->none.mic.mode == NEWMV) {
+          av1_init_rd_stats(&split_rdc);
+          for (int i = 0; i < 4; i++) {
+            RD_STATS block_rdc;
+            av1_invalid_rd_stats(&block_rdc);
+            int x_idx = (i & 1) * hbs;
+            int y_idx = (i >> 1) * hbs;
+            if ((mi_row + y_idx >= cm->mi_rows) ||
+                (mi_col + x_idx >= cm->mi_cols))
+              continue;
+            xd->above_txfm_context =
+                cm->above_txfm_context[tile_info->tile_row] + mi_col + x_idx;
+            xd->left_txfm_context = xd->left_txfm_context_buffer +
+                                    ((mi_row + y_idx) & MAX_MIB_MASK);
+            pc_tree->split[i]->partitioning = PARTITION_NONE;
+            pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+                          &block_rdc, PARTITION_NONE, subsize,
+                          &pc_tree->split[i]->none, invalid_rd,
+                          PICK_MODE_NONRD);
+            split_rdc.rate += block_rdc.rate;
+            split_rdc.dist += block_rdc.dist;
 
-        for (int i = 0; i < 4; i++) {
-          RD_STATS block_rdc;
-          av1_invalid_rd_stats(&block_rdc);
-          int x_idx = (i & 1) * hbs;
-          int y_idx = (i >> 1) * hbs;
-          if ((mi_row + y_idx >= cm->mi_rows) ||
-              (mi_col + x_idx >= cm->mi_cols))
-            continue;
-          xd->above_txfm_context =
-              cm->above_txfm_context[tile_info->tile_row] + mi_col + x_idx;
-          xd->left_txfm_context =
-              xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK);
-          pc_tree->split[i]->partitioning = PARTITION_NONE;
-          pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
-                        &block_rdc, PARTITION_NONE, subsize,
-                        &pc_tree->split[i]->none, invalid_rd, PICK_MODE_NONRD);
-          split_rdc.rate += block_rdc.rate;
-          split_rdc.dist += block_rdc.dist;
-
-          encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1,
-                   subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
+            encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1,
+                     subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
+          }
+          restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+          split_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT];
+          split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
         }
-        restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
-        split_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT];
-        split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
         if (none_rdc.rdcost < split_rdc.rdcost) {
           mib[0]->sb_type = bsize;
           pc_tree->partitioning = PARTITION_NONE;
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 963796c..6230c17 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -257,6 +257,9 @@
     }
   }
   if (!is_480p_or_larger) {
+    if (speed == 7) {
+      sf->rt_sf.nonrd_check_partition_merge_mode = 2;
+    }
     if (speed >= 8) {
       sf->mv_sf.subpel_search_method = SUBPEL_TREE;
 
@@ -835,7 +838,7 @@
     sf->rt_sf.use_comp_ref_nonrd = 0;
     sf->rt_sf.use_nonrd_altref_frame = 1;
     sf->rt_sf.use_nonrd_pick_mode = 1;
-    sf->rt_sf.nonrd_check_partition_merge = 1;
+    sf->rt_sf.nonrd_check_partition_merge_mode = 1;
     sf->rt_sf.nonrd_check_partition_split = 0;
     sf->rt_sf.hybrid_intra_pickmode = 1;
   }
@@ -847,7 +850,7 @@
     sf->rt_sf.nonrd_use_blockyrd_interp_filter = 0;
     sf->rt_sf.use_nonrd_altref_frame = 0;
     sf->rt_sf.nonrd_reduce_golden_mode_search = 1;
-    sf->rt_sf.nonrd_check_partition_merge = 0;
+    sf->rt_sf.nonrd_check_partition_merge_mode = 0;
     sf->rt_sf.nonrd_check_partition_split = 0;
 
 // TODO(kyslov) Enable when better model is available
@@ -1086,7 +1089,7 @@
   rt_sf->force_tx_search_off = 0;
   rt_sf->num_inter_modes_for_tx_search = INT_MAX;
   rt_sf->use_simple_rd_model = 0;
-  rt_sf->nonrd_check_partition_merge = 0;
+  rt_sf->nonrd_check_partition_merge_mode = 0;
   rt_sf->nonrd_check_partition_split = 0;
 }
 
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 5ae0308..abac1a8 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -864,9 +864,13 @@
   // Perform coarse ME before calculating variance in variance-based partition
   int estimate_motion_for_var_based_partition;
 
-  // For nonrd_use_partition: perform extra check of leaf partition split and
-  // merge
-  int nonrd_check_partition_merge;
+  // For nonrd_use_partition: mode of extra check of leaf partition
+  // 0 - don't check merge
+  // 1 - always check merge
+  // 2 - check merge; skip the final split check if the merged block is skipped
+  int nonrd_check_partition_merge_mode;
+
+  // For nonrd_use_partition: check of leaf partition extra split
   int nonrd_check_partition_split;
 
   // Implements various heuristics to skip searching modes