Optimize allocation of PC_Tree

Variable partition sizes are not used in the stats generation stage
(firstpass/LAP); however, memory was still allocated for the contexts
of all partition sizes. Removed the unneeded tree node allocations in
PC_Tree to reduce the memory footprint in single-pass AOMQ.

Resolution   Memory (single thread)
1920x1080    ~3.25%

Memory measuring command:
$ command time -v ./aomenc ...

Change-Id: I87ead636dfb728fcce730d18d0da335feba5caec
diff --git a/av1/encoder/context_tree.c b/av1/encoder/context_tree.c
index ab245d5..78ccd4f 100644
--- a/av1/encoder/context_tree.c
+++ b/av1/encoder/context_tree.c
@@ -135,12 +135,12 @@
 // partition level. There are contexts for none, horizontal, vertical, and
 // split.  Along with a block_size value and a selected block_size which
 // represents the state of our search.
-void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) {
-  int i, j;
+void av1_setup_pc_tree(AV1_COMP *const cpi, ThreadData *td) {
+  AV1_COMMON *const cm = &cpi->common;
+  int i, j, stat_generation_stage = is_stat_generation_stage(cpi);
   const int tree_nodes_inc = 1024;
-  const int leaf_factor = 4;
-  const int leaf_nodes = 256 * leaf_factor;
-  const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
+  const int tree_nodes =
+      stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
   int pc_tree_index = 0;
   PC_TREE *this_pc;
   PC_TREE_SHARED_BUFFERS shared_bufs;
@@ -165,24 +165,38 @@
     shared_bufs.dqcoeff_buf[i] = td->tree_dqcoeff_buf[i];
   }
 
-  // Sets up all the leaf nodes in the tree.
-  for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
-    PC_TREE *const tree = &td->pc_tree[pc_tree_index];
-    tree->block_size = square[0];
-    alloc_tree_contexts(cm, tree, 16, 1, &shared_bufs);
-  }
+  if (!stat_generation_stage) {
+    const int leaf_factor = 4;
+    const int leaf_nodes = 256 * leaf_factor;
 
-  // Each node has 4 leaf nodes, fill each block_size level of the tree
-  // from leafs to the root.
-  for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
-    for (i = 0; i < nodes; ++i) {
+    // Sets up all the leaf nodes in the tree.
+    for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
       PC_TREE *const tree = &td->pc_tree[pc_tree_index];
-      alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 0, &shared_bufs);
-      tree->block_size = square[square_index];
-      for (j = 0; j < 4; j++) tree->split[j] = this_pc++;
-      ++pc_tree_index;
+      tree->block_size = square[0];
+      alloc_tree_contexts(cm, tree, 16, 1, &shared_bufs);
     }
-    ++square_index;
+
+    // Each node has 4 leaf nodes, fill each block_size level of the tree
+    // from leafs to the root.
+    for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+      for (i = 0; i < nodes; ++i) {
+        PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+        alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 0,
+                            &shared_bufs);
+        tree->block_size = square[square_index];
+        for (j = 0; j < 4; j++) tree->split[j] = this_pc++;
+        ++pc_tree_index;
+      }
+      ++square_index;
+    }
+  } else {
+    // Allocation for firstpass/LAP stage
+    // TODO(Mufaddal): refactor square_index to use a common block_size macro
+    // from firstpass.c
+    PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+    square_index = 2;
+    alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 1, &shared_bufs);
+    tree->block_size = square[square_index];
   }
 
   // Set up the root node for the largest superblock size
@@ -200,10 +214,13 @@
   }
 }
 
-void av1_free_pc_tree(ThreadData *td, const int num_planes) {
+void av1_free_pc_tree(const AV1_COMP *const cpi, ThreadData *td,
+                      const int num_planes) {
+  int stat_generation_stage = is_stat_generation_stage(cpi);
   if (td->pc_tree != NULL) {
     const int tree_nodes_inc = 1024;
-    const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
+    const int tree_nodes =
+        stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
     for (int i = 0; i < tree_nodes; ++i) {
       free_tree_contexts(&td->pc_tree[i], num_planes);
     }
diff --git a/av1/encoder/context_tree.h b/av1/encoder/context_tree.h
index d5b4e83..f4e930b 100644
--- a/av1/encoder/context_tree.h
+++ b/av1/encoder/context_tree.h
@@ -84,8 +84,9 @@
   int sms_rect_valid;
 } PC_TREE;
 
-void av1_setup_pc_tree(struct AV1Common *cm, struct ThreadData *td);
-void av1_free_pc_tree(struct ThreadData *td, const int num_planes);
+void av1_setup_pc_tree(struct AV1_COMP *const cpi, struct ThreadData *td);
+void av1_free_pc_tree(const struct AV1_COMP *const cpi, struct ThreadData *td,
+                      const int num_planes);
 void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
                            PICK_MODE_CONTEXT *src_ctx);
 
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index aeb310c..0b620d0 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -867,7 +867,7 @@
   aom_free(cpi->tplist[0][0]);
   cpi->tplist[0][0] = NULL;
 
-  av1_free_pc_tree(&cpi->td, num_planes);
+  av1_free_pc_tree(cpi, &cpi->td, num_planes);
 
   aom_free(cpi->td.mb.palette_buffer);
   av1_release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer);
@@ -1124,7 +1124,7 @@
                   aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS,
                              sizeof(*cpi->tplist[0][0])));
 
-  av1_setup_pc_tree(&cpi->common, &cpi->td);
+  av1_setup_pc_tree(cpi, &cpi->td);
 }
 
 void av1_new_framerate(AV1_COMP *cpi, double framerate) {
@@ -2914,7 +2914,7 @@
     if (cm->width > cpi->initial_width || cm->height > cpi->initial_height ||
         seq_params->sb_size != sb_size) {
       av1_free_context_buffers(cm);
-      av1_free_pc_tree(&cpi->td, num_planes);
+      av1_free_pc_tree(cpi, &cpi->td, num_planes);
       alloc_compressor_data(cpi);
       realloc_segmentation_maps(cpi);
       cpi->initial_width = cpi->initial_height = 0;
@@ -3032,9 +3032,9 @@
 
   cpi->common.buffer_pool = pool;
 
-  init_config(cpi, oxcf);
   cpi->lap_enabled = num_lap_buffers > 0;
   cpi->compressor_stage = stage;
+  init_config(cpi, oxcf);
   if (cpi->compressor_stage == LAP_STAGE) {
     cpi->oxcf.lag_in_frames = lap_lag_in_frames;
   }
@@ -3593,7 +3593,7 @@
       }
       aom_free(thread_data->td->mask_buf);
       aom_free(thread_data->td->counts);
-      av1_free_pc_tree(thread_data->td, num_planes);
+      av1_free_pc_tree(cpi, thread_data->td, num_planes);
       aom_free(thread_data->td->mbmi_ext);
       aom_free(thread_data->td);
     }
@@ -4317,7 +4317,7 @@
   if (cpi->initial_width && cpi->initial_height &&
       (cm->width > cpi->initial_width || cm->height > cpi->initial_height)) {
     av1_free_context_buffers(cm);
-    av1_free_pc_tree(&cpi->td, num_planes);
+    av1_free_pc_tree(cpi, &cpi->td, num_planes);
     alloc_compressor_data(cpi);
     realloc_segmentation_maps(cpi);
     cpi->initial_width = cpi->initial_height = 0;
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 82220ff..f13153a 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -425,7 +425,7 @@
 
       // Set up pc_tree.
       thread_data->td->pc_tree = NULL;
-      av1_setup_pc_tree(cm, thread_data->td);
+      av1_setup_pc_tree(cpi, thread_data->td);
 
       CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
                       (uint8_t *)aom_memalign(