Speed up of ext-partition types
Search the new horz/vert a/b/4 partitions only if the best so far
is either oriented along the same direction or split/none, or if
the rd costs obtained from the previous partition searches indicate
there is potential in searching these partitions.
This brings about 25-30% speedup at less than 0.1% drop as seen on
lowres 30 frames.
Change-Id: I6c6c347e06c34ee0ca17479aeeb4075a66dc7e2c
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index b7da15a..58f1165 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -2481,7 +2481,7 @@
TileDataEnc *tile_data, TOKENEXTRA **tp,
int mi_row, int mi_col, BLOCK_SIZE bsize,
RD_STATS *rd_cost, int64_t best_rd,
- PC_TREE *pc_tree) {
+ PC_TREE *pc_tree, int64_t *none_rd) {
const AV1_COMMON *const cm = &cpi->common;
TileInfo *const tile_info = &tile_data->tile_info;
MACROBLOCK *const x = &td->mb;
@@ -2512,6 +2512,11 @@
pl >= 0 ? x->partition_cost[pl] : x->partition_cost[0];
int do_rectangular_split = 1;
+#if CONFIG_EXT_PARTITION_TYPES
+ int64_t split_rd[4] = { 0, 0, 0, 0 };
+ int64_t horz_rd[4] = { 0, 0 };
+ int64_t vert_rd[4] = { 0, 0 };
+#endif // CONFIG_EXT_PARTITION_TYPES
#if CONFIG_EXT_PARTITION_TYPES && !CONFIG_EXT_PARTITION_TYPES_AB
BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
#endif
@@ -2525,6 +2530,8 @@
BLOCK_SIZE min_size = x->min_partition_size;
BLOCK_SIZE max_size = x->max_partition_size;
+ if (none_rd) *none_rd = 0;
+
#if CONFIG_FP_MB_STATS
unsigned int src_diff_var = UINT_MAX;
int none_complexity = 0;
@@ -2679,6 +2686,7 @@
PARTITION_NONE,
#endif
bsize, ctx_none, best_rdc.rdcost);
+ if (none_rd) *none_rd = this_rdc.rdcost;
if (this_rdc.rate != INT_MAX) {
if (bsize_at_least_8x8) {
const int pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
@@ -2786,9 +2794,14 @@
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
pc_tree->split[idx]->index = idx;
+#if CONFIG_EXT_PARTITION_TYPES
+ int64_t *p_split_rd = &split_rd[idx];
+#else
+ int64_t *p_split_rd = NULL;
+#endif // CONFIG_EXT_PARTITION_TYPES
rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
subsize, &this_rdc, temp_best_rdcost - sum_rdc.rdcost,
- pc_tree->split[idx]);
+ pc_tree->split[idx], p_split_rd);
if (this_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
@@ -2846,6 +2859,9 @@
PARTITION_HORZ,
#endif
subsize, &pc_tree->horizontal[0], best_rdc.rdcost);
+#if CONFIG_EXT_PARTITION_TYPES
+ horz_rd[0] = sum_rdc.rdcost;
+#endif // CONFIG_EXT_PARTITION_TYPES
if (sum_rdc.rdcost < temp_best_rdcost && !force_horz_split) {
PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0];
@@ -2866,6 +2882,9 @@
#endif
subsize, &pc_tree->horizontal[1],
best_rdc.rdcost - sum_rdc.rdcost);
+#if CONFIG_EXT_PARTITION_TYPES
+ horz_rd[1] = this_rdc.rdcost;
+#endif // CONFIG_EXT_PARTITION_TYPES
#if CONFIG_DIST_8X8
if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
@@ -2925,6 +2944,9 @@
PARTITION_VERT,
#endif
subsize, &pc_tree->vertical[0], best_rdc.rdcost);
+#if CONFIG_EXT_PARTITION_TYPES
+ vert_rd[0] = sum_rdc.rdcost;
+#endif // CONFIG_EXT_PARTITION_TYPES
const int64_t vert_max_rdcost = best_rdc.rdcost;
if (sum_rdc.rdcost < vert_max_rdcost && !force_vert_split) {
update_state(cpi, tile_data, td, &pc_tree->vertical[0], mi_row, mi_col,
@@ -2945,6 +2967,9 @@
#endif
subsize, &pc_tree->vertical[1],
best_rdc.rdcost - sum_rdc.rdcost);
+#if CONFIG_EXT_PARTITION_TYPES
+ vert_rd[1] = this_rdc.rdcost;
+#endif // CONFIG_EXT_PARTITION_TYPES
#if CONFIG_DIST_8X8
if (x->using_dist_8x8 && this_rdc.rate != INT_MAX && bsize == BLOCK_8X8) {
@@ -2998,105 +3023,157 @@
// subsample to 16x2, which doesn't have an enum. Also, there's no BLOCK_8X2
// or BLOCK_2X8, so we can't do 4:1 or 1:4 partitions for BLOCK_16X16 if there
// is any subsampling.
- const int horz4_partition_allowed =
- ext_partition_allowed && partition_horz_allowed;
- const int vert4_partition_allowed =
- ext_partition_allowed && partition_vert_allowed;
+ int horz4_partition_allowed = ext_partition_allowed && partition_horz_allowed;
+ int vert4_partition_allowed = ext_partition_allowed && partition_vert_allowed;
#if CONFIG_EXT_PARTITION_TYPES_AB
// The alternative AB partitions are allowed iff the corresponding 4:1
// partitions are allowed.
- const int horzab_partition_allowed = horz4_partition_allowed;
- const int vertab_partition_allowed = vert4_partition_allowed;
+ int horzab_partition_allowed = horz4_partition_allowed;
+ int vertab_partition_allowed = vert4_partition_allowed;
#else
// The standard AB partitions are allowed whenever ext-partition-types are
// allowed
- const int horzab_partition_allowed = ext_partition_allowed;
- const int vertab_partition_allowed = ext_partition_allowed;
-#endif
+ int horzab_partition_allowed = ext_partition_allowed;
+ int vertab_partition_allowed = ext_partition_allowed;
- // PARTITION_HORZ_A
- if (partition_horz_allowed && horzab_partition_allowed) {
+ if (cpi->sf.prune_ext_partition_types_search) {
+ horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ }
+ int horza_partition_allowed = horzab_partition_allowed;
+ int horzb_partition_allowed = horzab_partition_allowed;
+ if (cpi->sf.prune_ext_partition_types_search) {
+ const int64_t horz_a_rd = (horz_rd[1] < INT64_MAX ? horz_rd[1] : 0) +
+ (split_rd[0] < INT64_MAX ? split_rd[0] : 0) +
+ (split_rd[1] < INT64_MAX ? split_rd[1] : 0);
+ const int64_t horz_b_rd = (horz_rd[0] < INT64_MAX ? horz_rd[0] : 0) +
+ (split_rd[2] < INT64_MAX ? split_rd[2] : 0) +
+ (split_rd[3] < INT64_MAX ? split_rd[3] : 0);
+ horza_partition_allowed &= (horz_a_rd / 16 * 15 < best_rdc.rdcost);
+ horzb_partition_allowed &= (horz_b_rd / 16 * 15 < best_rdc.rdcost);
+ }
+#endif // CONFIG_EXT_PARTITION_TYPES_AB
+
+// PARTITION_HORZ_A
#if CONFIG_EXT_PARTITION_TYPES_AB
+ if (partition_horz_allowed && horzab_partition_allowed) {
rd_test_partition3(
cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontala,
ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_A, mi_row, mi_col,
get_subsize(bsize, PARTITION_HORZ_4), mi_row + mi_step / 2, mi_col,
get_subsize(bsize, PARTITION_HORZ_4), mi_row + mi_step, mi_col,
get_subsize(bsize, PARTITION_HORZ));
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+ }
#else
+ if (partition_horz_allowed && horza_partition_allowed) {
subsize = get_subsize(bsize, PARTITION_HORZ_A);
rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize,
PARTITION_HORZ_A, mi_row, mi_col, bsize2, mi_row,
mi_col + mi_step, bsize2, mi_row + mi_step, mi_col,
subsize);
-#endif
restore_context(x, &x_ctx, mi_row, mi_col, bsize);
}
- // PARTITION_HORZ_B
- if (partition_horz_allowed && horzab_partition_allowed) {
+#endif
+// PARTITION_HORZ_B
#if CONFIG_EXT_PARTITION_TYPES_AB
+ if (partition_horz_allowed && horzab_partition_allowed) {
rd_test_partition3(
cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontalb,
ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_B, mi_row, mi_col,
get_subsize(bsize, PARTITION_HORZ), mi_row + mi_step, mi_col,
get_subsize(bsize, PARTITION_HORZ_4), mi_row + 3 * mi_step / 2, mi_col,
get_subsize(bsize, PARTITION_HORZ_4));
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+ }
+ (void)vert_rd;
+ (void)horz_rd;
+ (void)split_rd;
#else
+ if (partition_horz_allowed && horzb_partition_allowed) {
subsize = get_subsize(bsize, PARTITION_HORZ_B);
rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize,
PARTITION_HORZ_B, mi_row, mi_col, subsize,
mi_row + mi_step, mi_col, bsize2, mi_row + mi_step,
mi_col + mi_step, bsize2);
-#endif
restore_context(x, &x_ctx, mi_row, mi_col, bsize);
}
- // PARTITION_VERT_A
- if (partition_vert_allowed && vertab_partition_allowed) {
+
+ int verta_partition_allowed = vertab_partition_allowed;
+ int vertb_partition_allowed = vertab_partition_allowed;
+ if (cpi->sf.prune_ext_partition_types_search) {
+ const int64_t vert_a_rd = (vert_rd[1] < INT64_MAX ? vert_rd[1] : 0) +
+ (split_rd[0] < INT64_MAX ? split_rd[0] : 0) +
+ (split_rd[2] < INT64_MAX ? split_rd[2] : 0);
+ const int64_t vert_b_rd = (vert_rd[0] < INT64_MAX ? vert_rd[0] : 0) +
+ (split_rd[1] < INT64_MAX ? split_rd[1] : 0) +
+ (split_rd[3] < INT64_MAX ? split_rd[3] : 0);
+ verta_partition_allowed &= (vert_a_rd / 16 * 15 < best_rdc.rdcost);
+ vertb_partition_allowed &= (vert_b_rd / 16 * 15 < best_rdc.rdcost);
+ }
+#endif // CONFIG_EXT_PARTITION_TYPES_AB
+
+// PARTITION_VERT_A
#if CONFIG_EXT_PARTITION_TYPES_AB
+ if (partition_vert_allowed && vertab_partition_allowed) {
rd_test_partition3(
cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticala,
ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_A, mi_row, mi_col,
get_subsize(bsize, PARTITION_VERT_4), mi_row, mi_col + mi_step / 2,
get_subsize(bsize, PARTITION_VERT_4), mi_row, mi_col + mi_step,
get_subsize(bsize, PARTITION_VERT));
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+ }
#else
+ if (partition_vert_allowed && verta_partition_allowed) {
subsize = get_subsize(bsize, PARTITION_VERT_A);
rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
pc_tree->verticala, ctx_none, mi_row, mi_col, bsize,
PARTITION_VERT_A, mi_row, mi_col, bsize2,
mi_row + mi_step, mi_col, bsize2, mi_row,
mi_col + mi_step, subsize);
-#endif
restore_context(x, &x_ctx, mi_row, mi_col, bsize);
}
- // PARTITION_VERT_B
- if (partition_vert_allowed && vertab_partition_allowed) {
+#endif
+// PARTITION_VERT_B
#if CONFIG_EXT_PARTITION_TYPES_AB
+ if (partition_vert_allowed && vertab_partition_allowed) {
rd_test_partition3(
cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticalb,
ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_B, mi_row, mi_col,
get_subsize(bsize, PARTITION_VERT), mi_row, mi_col + mi_step,
get_subsize(bsize, PARTITION_VERT_4), mi_row, mi_col + 3 * mi_step / 2,
get_subsize(bsize, PARTITION_VERT_4));
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+ }
#else
+ if (partition_vert_allowed && vertb_partition_allowed) {
subsize = get_subsize(bsize, PARTITION_VERT_B);
rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize,
PARTITION_VERT_B, mi_row, mi_col, subsize, mi_row,
mi_col + mi_step, bsize2, mi_row + mi_step,
mi_col + mi_step, bsize2);
-#endif
restore_context(x, &x_ctx, mi_row, mi_col, bsize);
}
+#endif
// PARTITION_HORZ_4
// TODO(david.barker): For this and PARTITION_VERT_4,
// * Add support for BLOCK_16X16 once we support 2x8 and 8x2 blocks for the
// chroma plane
- // * Add support for supertx
+ if (cpi->sf.prune_ext_partition_types_search) {
+ horz4_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+ pc_tree->partitioning == PARTITION_HORZ_A ||
+ pc_tree->partitioning == PARTITION_HORZ_B ||
+ pc_tree->partitioning == PARTITION_NONE ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ }
if (horz4_partition_allowed && !force_horz_split &&
(do_rectangular_split || av1_active_h_edge(cpi, mi_row, mi_step))) {
const int quarter_step = mi_size_high[bsize] / 4;
@@ -3130,6 +3207,13 @@
restore_context(x, &x_ctx, mi_row, mi_col, bsize);
}
// PARTITION_VERT_4
+ if (cpi->sf.prune_ext_partition_types_search) {
+ vert4_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+ pc_tree->partitioning == PARTITION_VERT_A ||
+ pc_tree->partitioning == PARTITION_VERT_B ||
+ pc_tree->partitioning == PARTITION_NONE ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ }
if (vert4_partition_allowed && !force_vert_split &&
(do_rectangular_split || av1_active_v_edge(cpi, mi_row, mi_step))) {
const int quarter_step = mi_size_wide[bsize] / 4;
@@ -3368,7 +3452,7 @@
&x->min_partition_size, &x->max_partition_size);
}
rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, cm->sb_size,
- &dummy_rdc, INT64_MAX, pc_root);
+ &dummy_rdc, INT64_MAX, pc_root, NULL);
}
#if CONFIG_LPF_SB
if (USE_LOOP_FILTER_SUPERBLOCK) {
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index d05ced4..da38f4c 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -138,6 +138,9 @@
sf->selective_ref_frame = 1;
sf->tx_size_search_init_depth_rect = 1;
sf->tx_size_search_init_depth_sqr = 1;
+#if CONFIG_EXT_PARTITION_TYPES
+ sf->prune_ext_partition_types_search = 1;
+#endif // CONFIG_EXT_PARTITION_TYPES
}
if (speed >= 2) {
@@ -439,6 +442,9 @@
sf->partition_search_breakout_dist_thr = 0;
sf->partition_search_breakout_rate_thr = 0;
sf->simple_model_rd_from_var = 0;
+#if CONFIG_EXT_PARTITION_TYPES
+ sf->prune_ext_partition_types_search = 0;
+#endif // CONFIG_EXT_PARTITION_TYPES
// Set this at the appropriate speed levels
sf->use_transform_domain_distortion = 0;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index e11635e..fc3c8e0 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -338,6 +338,11 @@
// Has two levels for now: 1 and 2, where 2 is more aggressive than 1.
int selective_ref_frame;
+#if CONFIG_EXT_PARTITION_TYPES
+ // Conditionally prune extended partition types search
+ int prune_ext_partition_types_search;
+#endif // CONFIG_EXT_PARTITION_TYPES
+
// Skip rectangular partition test when partition type none gives better
// rd than partition type split.
int less_rectangular_check;