AV1 RT: Implement nonRD partition merge

Change-Id: Id1be0aa99d967c598d97bcd5bfec1633f860e352
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 05e6055..d4961d4 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -2120,6 +2120,24 @@
   x->rdmult = orig_rdmult;
 }
 
+static int is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize) {
+  const int bs = mi_size_wide[bsize];
+  const int hbs = bs / 2;
+  assert(bsize >= BLOCK_8X8);
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+  for (int i = 0; i < 4; i++) {
+    int x_idx = (i & 1) * hbs;
+    int y_idx = (i >> 1) * hbs;
+    if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+      return 0;
+    if (get_partition(cm, mi_row + y_idx, mi_col + x_idx, subsize) !=
+        PARTITION_NONE)
+      return 0;
+  }
+  return 1;
+}
+
 static AOM_INLINE void nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
                                            TileDataEnc *tile_data,
                                            MB_MODE_INFO **mib, TOKENEXTRA **tp,
@@ -2139,6 +2157,10 @@
                            : PARTITION_NONE;
   const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
   assert(subsize <= BLOCK_LARGEST);
+  const int pl = (bsize >= BLOCK_8X8)
+                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
+                     : 0;
+
   RD_STATS dummy_cost;
   av1_invalid_rd_stats(&dummy_cost);
   RD_STATS invalid_rd;
@@ -2200,15 +2222,92 @@
       }
       break;
     case PARTITION_SPLIT:
-      for (int i = 0; i < 4; i++) {
-        int x_idx = (i & 1) * hbs;
-        int y_idx = (i >> 1) * hbs;
-        int jj = i >> 1, ii = i & 0x01;
-        if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
-          continue;
-        nonrd_use_partition(
-            cpi, td, tile_data, mib + jj * hbs * cm->mi_stride + ii * hbs, tp,
-            mi_row + y_idx, mi_col + x_idx, subsize, pc_tree->split[i]);
+      if (cpi->sf.nonrd_merge_partition &&
+          is_leaf_split_partition(cm, mi_row, mi_col, bsize) &&
+          !frame_is_intra_only(cm)) {
+        RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+        RD_STATS split_rdc, none_rdc;
+        av1_init_rd_stats(&split_rdc);
+        av1_invalid_rd_stats(&none_rdc);
+        save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+        xd->above_txfm_context =
+            cm->above_txfm_context[tile_info->tile_row] + mi_col;
+        xd->left_txfm_context =
+            xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+        pc_tree->partitioning = PARTITION_NONE;
+        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
+                      PARTITION_NONE, bsize, &pc_tree->none, invalid_rd,
+                      sf->use_fast_nonrd_pick_mode ? PICK_MODE_FAST_NONRD
+                                                   : PICK_MODE_NONRD);
+        none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
+        none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+        restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+        for (int i = 0; i < 4; i++) {
+          RD_STATS block_rdc;
+          av1_invalid_rd_stats(&block_rdc);
+          int x_idx = (i & 1) * hbs;
+          int y_idx = (i >> 1) * hbs;
+          if ((mi_row + y_idx >= cm->mi_rows) ||
+              (mi_col + x_idx >= cm->mi_cols))
+            continue;
+          xd->above_txfm_context =
+              cm->above_txfm_context[tile_info->tile_row] + mi_col + x_idx;
+          xd->left_txfm_context =
+              xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK);
+          pc_tree->split[i]->partitioning = PARTITION_NONE;
+          pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+                        &block_rdc, PARTITION_NONE, subsize,
+                        &pc_tree->split[i]->none, invalid_rd,
+                        sf->use_fast_nonrd_pick_mode ? PICK_MODE_FAST_NONRD
+                                                     : PICK_MODE_NONRD);
+          split_rdc.rate += block_rdc.rate;
+          split_rdc.dist += block_rdc.dist;
+
+          encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1,
+                   subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
+        }
+        restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+        split_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT];
+        split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
+        if (none_rdc.rdcost < split_rdc.rdcost) {
+          set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row,
+                                         mi_col, bsize);
+          mib[0]->sb_type = bsize;
+          pc_tree->partitioning = PARTITION_NONE;
+          encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
+                   &pc_tree->none, NULL);
+        } else {
+          set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row,
+                                         mi_col, bsize);
+          mib[0]->sb_type = subsize;
+          pc_tree->partitioning = PARTITION_SPLIT;
+          for (int i = 0; i < 4; i++) {
+            int x_idx = (i & 1) * hbs;
+            int y_idx = (i >> 1) * hbs;
+            if ((mi_row + y_idx >= cm->mi_rows) ||
+                (mi_col + x_idx >= cm->mi_cols))
+              continue;
+
+            set_offsets_without_segment_id(cpi, &tile_data->tile_info, x,
+                                           mi_row + y_idx, mi_col + x_idx,
+                                           subsize);
+            encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 0,
+                     subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
+          }
+        }
+      } else {
+        for (int i = 0; i < 4; i++) {
+          int x_idx = (i & 1) * hbs;
+          int y_idx = (i >> 1) * hbs;
+          int jj = i >> 1, ii = i & 0x01;
+          if ((mi_row + y_idx >= cm->mi_rows) ||
+              (mi_col + x_idx >= cm->mi_cols))
+            continue;
+          nonrd_use_partition(
+              cpi, td, tile_data, mib + jj * hbs * cm->mi_stride + ii * hbs, tp,
+              mi_row + y_idx, mi_col + x_idx, subsize, pc_tree->split[i]);
+        }
       }
       break;
     case PARTITION_VERT_A:
@@ -2220,8 +2319,6 @@
       assert(0 && "Cannot handle extended partition types");
     default: assert(0); break;
   }
-  if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
-    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 }
 
 #if !CONFIG_REALTIME_ONLY
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index 244a455..3c58d50 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -930,6 +930,7 @@
   ctx->best_mode_index = mode_index;
 #endif  // CONFIG_INTERNAL_STATS
   ctx->mic = *xd->mi[0];
+  ctx->skippable = x->skip;
   ctx->mbmi_ext = *x->mbmi_ext;
   ctx->comp_pred_diff = 0;
   ctx->hybrid_pred_diff = 0;
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index c690a8f..5eb8993 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -683,6 +683,10 @@
   }
   if (speed >= 8) {
     sf->use_fast_nonrd_pick_mode = 1;
+    // Consider moving nonrd_merge_partition to speed 7
+#if 0
+    sf->nonrd_merge_partition = 1;
+#endif
     sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
     sf->tx_size_search_level = 1;
     sf->estimate_motion_for_var_based_partition = 0;
@@ -911,6 +915,7 @@
   sf->disable_smooth_intra = 0;
   sf->perform_best_rd_based_gating_for_chroma = 0;
   sf->prune_obmc_prob_thresh = 0;
+  sf->nonrd_merge_partition = 0;
 
   if (oxcf->mode == GOOD)
     set_good_speed_features_framesize_independent(cpi, sf, speed);
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 96eaf5f..1345ebf 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -808,6 +808,8 @@
 
   // Number of best inter modes to search transform. INT_MAX - search all.
   int num_inter_modes_for_tx_search;
+
+  int nonrd_merge_partition;
 } SPEED_FEATURES;
 
 struct AV1_COMP;