Optimize PC_TREE for lower resolution contents
For lower-resolution content (width or height <= 480), sb_size is
set to 64x64 instead of 128x128. The PC_TREE for these resolutions
is now sized for the 64x64 superblock (341 nodes instead of 1365).
This reduces the memory footprint while keeping encoding time
similar.
Resolution   Tile   Memory reduction
                    Single thread   Multi thread
640x360      2x1    ~7%             ~17% (2 threads)
832x480      2x1    ~5%             ~11% (2 threads)
Memory measuring command:
$ command time -v ./aomenc ...
Change-Id: I217f6add0eadfcc24e46aac27ec2e7651f9e83f5
diff --git a/av1/encoder/context_tree.c b/av1/encoder/context_tree.c
index 5544ce9..9b5b1cb 100644
--- a/av1/encoder/context_tree.c
+++ b/av1/encoder/context_tree.c
@@ -131,6 +131,16 @@
free_mode_context(&tree->vertical[1], num_planes);
}
+// Returns the number of pc_tree nodes to allocate or free: 1 in the stat
+// generation stage, otherwise 341 (or 1365 when the superblock is 128x128).
+static AOM_INLINE int get_pc_tree_nodes(const int is_sb_size_128,
+ int stat_generation_stage) {
+ const int tree_nodes_inc = is_sb_size_128 ? 1024 : 0;
+ const int tree_nodes =
+ stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
+ return tree_nodes;
+}
+
// This function sets up a tree of contexts such that at each square
// partition level. There are contexts for none, horizontal, vertical, and
// split. Along with a block_size value and a selected block_size which
@@ -138,9 +148,9 @@
void av1_setup_pc_tree(AV1_COMP *const cpi, ThreadData *td) {
AV1_COMMON *const cm = &cpi->common;
int i, j, stat_generation_stage = is_stat_generation_stage(cpi);
- const int tree_nodes_inc = 1024;
+ const int is_sb_size_128 = cm->seq_params.sb_size == BLOCK_128X128;
const int tree_nodes =
- stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
+ get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
int pc_tree_index = 0;
PC_TREE *this_pc;
PC_TREE_SHARED_BUFFERS shared_bufs;
@@ -166,7 +176,7 @@
}
if (!stat_generation_stage) {
- const int leaf_factor = 4;
+ const int leaf_factor = is_sb_size_128 ? 4 : 1;
const int leaf_nodes = 256 * leaf_factor;
// Sets up all the leaf nodes in the tree.
@@ -199,30 +209,20 @@
tree->block_size = square[square_index];
}
- // Set up the root node for the largest superblock size
- i = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2;
- td->pc_root[i] = &td->pc_tree[tree_nodes - 1];
+ // Set up the root node for the applicable superblock size
+ td->pc_root = &td->pc_tree[tree_nodes - 1];
#if CONFIG_INTERNAL_STATS
- td->pc_root[i]->none.best_mode_index = THR_INVALID;
+ td->pc_root->none.best_mode_index = THR_INVALID;
#endif // CONFIG_INTERNAL_STATS
- if (!stat_generation_stage) {
- // Set up the root nodes for the rest of the possible superblock sizes
- while (--i >= 0) {
- td->pc_root[i] = td->pc_root[i + 1]->split[0];
-#if CONFIG_INTERNAL_STATS
- td->pc_root[i]->none.best_mode_index = THR_INVALID;
-#endif // CONFIG_INTERNAL_STATS
- }
- }
}
void av1_free_pc_tree(const AV1_COMP *const cpi, ThreadData *td,
- const int num_planes) {
+ const int num_planes, BLOCK_SIZE sb_size) {
int stat_generation_stage = is_stat_generation_stage(cpi);
if (td->pc_tree != NULL) {
- const int tree_nodes_inc = 1024;
+ const int is_sb_size_128 = sb_size == BLOCK_128X128;
const int tree_nodes =
- stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
+ get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
for (int i = 0; i < tree_nodes; ++i) {
free_tree_contexts(&td->pc_tree[i], num_planes);
}
diff --git a/av1/encoder/context_tree.h b/av1/encoder/context_tree.h
index f4e930b..a399794 100644
--- a/av1/encoder/context_tree.h
+++ b/av1/encoder/context_tree.h
@@ -86,7 +86,7 @@
void av1_setup_pc_tree(struct AV1_COMP *const cpi, struct ThreadData *td);
void av1_free_pc_tree(const struct AV1_COMP *const cpi, struct ThreadData *td,
- const int num_planes);
+ const int num_planes, BLOCK_SIZE sb_size);
void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
PICK_MODE_CONTEXT *src_ctx);
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 3240c54..a0f7775 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -4916,7 +4916,7 @@
x->color_sensitivity[1] = 0;
x->content_state_sb = 0;
- PC_TREE *const pc_root = td->pc_root[mib_size_log2 - MIN_MIB_SIZE_LOG2];
+ PC_TREE *const pc_root = td->pc_root;
pc_root->index = 0;
xd->cur_frame_force_integer_mv = cm->features.cur_frame_force_integer_mv;
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index d894fa4..bdd8158 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -873,7 +873,7 @@
aom_free(cpi->tplist[0][0]);
cpi->tplist[0][0] = NULL;
- av1_free_pc_tree(cpi, &cpi->td, num_planes);
+ av1_free_pc_tree(cpi, &cpi->td, num_planes, cm->seq_params.sb_size);
aom_free(cpi->td.mb.palette_buffer);
av1_release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer);
@@ -2925,7 +2925,7 @@
if (cm->width > cpi->initial_width || cm->height > cpi->initial_height ||
seq_params->sb_size != sb_size) {
av1_free_context_buffers(cm);
- av1_free_pc_tree(cpi, &cpi->td, num_planes);
+ av1_free_pc_tree(cpi, &cpi->td, num_planes, (BLOCK_SIZE)sb_size);
alloc_compressor_data(cpi);
realloc_segmentation_maps(cpi);
cpi->initial_width = cpi->initial_height = 0;
@@ -3616,7 +3616,8 @@
}
aom_free(thread_data->td->mask_buf);
aom_free(thread_data->td->counts);
- av1_free_pc_tree(cpi, thread_data->td, num_planes);
+ av1_free_pc_tree(cpi, thread_data->td, num_planes,
+ cm->seq_params.sb_size);
aom_free(thread_data->td->mbmi_ext);
aom_free(thread_data->td);
}
@@ -4332,7 +4333,7 @@
if (cpi->initial_width && cpi->initial_height &&
(cm->width > cpi->initial_width || cm->height > cpi->initial_height)) {
av1_free_context_buffers(cm);
- av1_free_pc_tree(cpi, &cpi->td, num_planes);
+ av1_free_pc_tree(cpi, &cpi->td, num_planes, cm->seq_params.sb_size);
alloc_compressor_data(cpi);
realloc_segmentation_maps(cpi);
cpi->initial_width = cpi->initial_height = 0;
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index c12d48b..0a23299 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -685,7 +685,7 @@
RD_COUNTS rd_counts;
FRAME_COUNTS *counts;
PC_TREE *pc_tree;
- PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
+ PC_TREE *pc_root;
tran_low_t *tree_coeff_buf[MAX_MB_PLANE];
tran_low_t *tree_qcoeff_buf[MAX_MB_PLANE];
tran_low_t *tree_dqcoeff_buf[MAX_MB_PLANE];
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index be5e102..a61f6fd 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -860,8 +860,7 @@
const SequenceHeader *const seq_params = &cm->seq_params;
const int num_planes = av1_num_planes(cm);
MACROBLOCKD *const xd = &x->e_mbd;
- const PICK_MODE_CONTEXT *ctx =
- &cpi->td.pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2]->none;
+ const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
MV last_mv = kZeroMv;
const int qindex = find_fp_qindex(seq_params->bit_depth);
// First pass coding proceeds in raster scan order with unit size of 16x16.