Optimize allocation of PC_Tree
Variable partition sizes are not used in the stats generation
stage (firstpass/LAP); however, contexts are still allocated
for all partition sizes. Removed the unneeded tree node
allocations in PC_Tree to reduce the memory footprint in
single-pass AOMQ.
Resolution    Memory reduction (single thread)
1920x1080     ~3.25%
Memory measuring command:
$ command time -v ./aomenc ...
Change-Id: I87ead636dfb728fcce730d18d0da335feba5caec
diff --git a/av1/encoder/context_tree.c b/av1/encoder/context_tree.c
index ab245d5..78ccd4f 100644
--- a/av1/encoder/context_tree.c
+++ b/av1/encoder/context_tree.c
@@ -135,12 +135,12 @@
// partition level. There are contexts for none, horizontal, vertical, and
// split. Along with a block_size value and a selected block_size which
// represents the state of our search.
-void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) {
- int i, j;
+void av1_setup_pc_tree(AV1_COMP *const cpi, ThreadData *td) {
+ AV1_COMMON *const cm = &cpi->common;
+ int i, j, stat_generation_stage = is_stat_generation_stage(cpi);
const int tree_nodes_inc = 1024;
- const int leaf_factor = 4;
- const int leaf_nodes = 256 * leaf_factor;
- const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
+ const int tree_nodes =
+ stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
int pc_tree_index = 0;
PC_TREE *this_pc;
PC_TREE_SHARED_BUFFERS shared_bufs;
@@ -165,24 +165,38 @@
shared_bufs.dqcoeff_buf[i] = td->tree_dqcoeff_buf[i];
}
- // Sets up all the leaf nodes in the tree.
- for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
- PC_TREE *const tree = &td->pc_tree[pc_tree_index];
- tree->block_size = square[0];
- alloc_tree_contexts(cm, tree, 16, 1, &shared_bufs);
- }
+ if (!stat_generation_stage) {
+ const int leaf_factor = 4;
+ const int leaf_nodes = 256 * leaf_factor;
- // Each node has 4 leaf nodes, fill each block_size level of the tree
- // from leafs to the root.
- for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
- for (i = 0; i < nodes; ++i) {
+ // Sets up all the leaf nodes in the tree.
+ for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
PC_TREE *const tree = &td->pc_tree[pc_tree_index];
- alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 0, &shared_bufs);
- tree->block_size = square[square_index];
- for (j = 0; j < 4; j++) tree->split[j] = this_pc++;
- ++pc_tree_index;
+ tree->block_size = square[0];
+ alloc_tree_contexts(cm, tree, 16, 1, &shared_bufs);
}
- ++square_index;
+
+ // Each node has 4 leaf nodes, fill each block_size level of the tree
+ // from leafs to the root.
+ for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+ for (i = 0; i < nodes; ++i) {
+ PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+ alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 0,
+ &shared_bufs);
+ tree->block_size = square[square_index];
+ for (j = 0; j < 4; j++) tree->split[j] = this_pc++;
+ ++pc_tree_index;
+ }
+ ++square_index;
+ }
+ } else {
+ // Allocation for firstpass/LAP stage
+ // TODO(Mufaddal): refactor square_index to use a common block_size macro
+ // from firstpass.c
+ PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+ square_index = 2;
+ alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 1, &shared_bufs);
+ tree->block_size = square[square_index];
}
// Set up the root node for the largest superblock size
@@ -200,10 +214,13 @@
}
}
-void av1_free_pc_tree(ThreadData *td, const int num_planes) {
+void av1_free_pc_tree(const AV1_COMP *const cpi, ThreadData *td,
+ const int num_planes) {
+ int stat_generation_stage = is_stat_generation_stage(cpi);
if (td->pc_tree != NULL) {
const int tree_nodes_inc = 1024;
- const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
+ const int tree_nodes =
+ stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
for (int i = 0; i < tree_nodes; ++i) {
free_tree_contexts(&td->pc_tree[i], num_planes);
}
diff --git a/av1/encoder/context_tree.h b/av1/encoder/context_tree.h
index d5b4e83..f4e930b 100644
--- a/av1/encoder/context_tree.h
+++ b/av1/encoder/context_tree.h
@@ -84,8 +84,9 @@
int sms_rect_valid;
} PC_TREE;
-void av1_setup_pc_tree(struct AV1Common *cm, struct ThreadData *td);
-void av1_free_pc_tree(struct ThreadData *td, const int num_planes);
+void av1_setup_pc_tree(struct AV1_COMP *const cpi, struct ThreadData *td);
+void av1_free_pc_tree(const struct AV1_COMP *const cpi, struct ThreadData *td,
+ const int num_planes);
void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
PICK_MODE_CONTEXT *src_ctx);
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index aeb310c..0b620d0 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -867,7 +867,7 @@
aom_free(cpi->tplist[0][0]);
cpi->tplist[0][0] = NULL;
- av1_free_pc_tree(&cpi->td, num_planes);
+ av1_free_pc_tree(cpi, &cpi->td, num_planes);
aom_free(cpi->td.mb.palette_buffer);
av1_release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer);
@@ -1124,7 +1124,7 @@
aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS,
sizeof(*cpi->tplist[0][0])));
- av1_setup_pc_tree(&cpi->common, &cpi->td);
+ av1_setup_pc_tree(cpi, &cpi->td);
}
void av1_new_framerate(AV1_COMP *cpi, double framerate) {
@@ -2914,7 +2914,7 @@
if (cm->width > cpi->initial_width || cm->height > cpi->initial_height ||
seq_params->sb_size != sb_size) {
av1_free_context_buffers(cm);
- av1_free_pc_tree(&cpi->td, num_planes);
+ av1_free_pc_tree(cpi, &cpi->td, num_planes);
alloc_compressor_data(cpi);
realloc_segmentation_maps(cpi);
cpi->initial_width = cpi->initial_height = 0;
@@ -3032,9 +3032,9 @@
cpi->common.buffer_pool = pool;
- init_config(cpi, oxcf);
cpi->lap_enabled = num_lap_buffers > 0;
cpi->compressor_stage = stage;
+ init_config(cpi, oxcf);
if (cpi->compressor_stage == LAP_STAGE) {
cpi->oxcf.lag_in_frames = lap_lag_in_frames;
}
@@ -3593,7 +3593,7 @@
}
aom_free(thread_data->td->mask_buf);
aom_free(thread_data->td->counts);
- av1_free_pc_tree(thread_data->td, num_planes);
+ av1_free_pc_tree(cpi, thread_data->td, num_planes);
aom_free(thread_data->td->mbmi_ext);
aom_free(thread_data->td);
}
@@ -4317,7 +4317,7 @@
if (cpi->initial_width && cpi->initial_height &&
(cm->width > cpi->initial_width || cm->height > cpi->initial_height)) {
av1_free_context_buffers(cm);
- av1_free_pc_tree(&cpi->td, num_planes);
+ av1_free_pc_tree(cpi, &cpi->td, num_planes);
alloc_compressor_data(cpi);
realloc_segmentation_maps(cpi);
cpi->initial_width = cpi->initial_height = 0;
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 82220ff..f13153a 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -425,7 +425,7 @@
// Set up pc_tree.
thread_data->td->pc_tree = NULL;
- av1_setup_pc_tree(cm, thread_data->td);
+ av1_setup_pc_tree(cpi, thread_data->td);
CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
(uint8_t *)aom_memalign(