Optimize PC_TREE for lower resolution contents

For lower-resolution content (width or height <= 480),
sb_size is set to 64x64 instead of 128x128. The PC_TREE for
these resolutions is now allocated to match that superblock
size instead of always being sized for 128x128.

A reduction in memory footprint is observed, with similar
encoding time.

Resolution    Tile     Memory reduction
                       Single   Multi
                       Thread   Thread
640x360       2x1      ~7%      ~17% (2 threads)
832x480       2x1      ~5%      ~11% (2 threads)

Memory measuring command:
$ command time -v ./aomenc ...

Change-Id: I217f6add0eadfcc24e46aac27ec2e7651f9e83f5
diff --git a/av1/encoder/context_tree.c b/av1/encoder/context_tree.c
index 5544ce9..9b5b1cb 100644
--- a/av1/encoder/context_tree.c
+++ b/av1/encoder/context_tree.c
@@ -131,6 +131,16 @@
   free_mode_context(&tree->vertical[1], num_planes);
 }
 
+// Returns the pc_tree node count to allocate or free: 1 in the stat generation
+// stage, else 341 nodes for a 64x64 superblock or 1365 for 128x128.
+static AOM_INLINE int get_pc_tree_nodes(const int is_sb_size_128,
+                                        int stat_generation_stage) {
+  const int tree_nodes_inc = is_sb_size_128 ? 1024 : 0;  // 128x128 leaf level
+  const int tree_nodes =
+      stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
+  return tree_nodes;
+}
+
 // This function sets up a tree of contexts such that at each square
 // partition level. There are contexts for none, horizontal, vertical, and
 // split.  Along with a block_size value and a selected block_size which
@@ -138,9 +148,9 @@
 void av1_setup_pc_tree(AV1_COMP *const cpi, ThreadData *td) {
   AV1_COMMON *const cm = &cpi->common;
   int i, j, stat_generation_stage = is_stat_generation_stage(cpi);
-  const int tree_nodes_inc = 1024;
+  const int is_sb_size_128 = cm->seq_params.sb_size == BLOCK_128X128;
   const int tree_nodes =
-      stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
+      get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
   int pc_tree_index = 0;
   PC_TREE *this_pc;
   PC_TREE_SHARED_BUFFERS shared_bufs;
@@ -166,7 +176,7 @@
   }
 
   if (!stat_generation_stage) {
-    const int leaf_factor = 4;
+    const int leaf_factor = is_sb_size_128 ? 4 : 1;
     const int leaf_nodes = 256 * leaf_factor;
 
     // Sets up all the leaf nodes in the tree.
@@ -199,30 +209,20 @@
     tree->block_size = square[square_index];
   }
 
-  // Set up the root node for the largest superblock size
-  i = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2;
-  td->pc_root[i] = &td->pc_tree[tree_nodes - 1];
+  // Set up the root node for the applicable superblock size
+  td->pc_root = &td->pc_tree[tree_nodes - 1];
 #if CONFIG_INTERNAL_STATS
-  td->pc_root[i]->none.best_mode_index = THR_INVALID;
+  td->pc_root->none.best_mode_index = THR_INVALID;
 #endif  // CONFIG_INTERNAL_STATS
-  if (!stat_generation_stage) {
-    // Set up the root nodes for the rest of the possible superblock sizes
-    while (--i >= 0) {
-      td->pc_root[i] = td->pc_root[i + 1]->split[0];
-#if CONFIG_INTERNAL_STATS
-      td->pc_root[i]->none.best_mode_index = THR_INVALID;
-#endif  // CONFIG_INTERNAL_STATS
-    }
-  }
 }
 
 void av1_free_pc_tree(const AV1_COMP *const cpi, ThreadData *td,
-                      const int num_planes) {
+                      const int num_planes, BLOCK_SIZE sb_size) {
   int stat_generation_stage = is_stat_generation_stage(cpi);
   if (td->pc_tree != NULL) {
-    const int tree_nodes_inc = 1024;
+    const int is_sb_size_128 = sb_size == BLOCK_128X128;
     const int tree_nodes =
-        stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
+        get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
     for (int i = 0; i < tree_nodes; ++i) {
       free_tree_contexts(&td->pc_tree[i], num_planes);
     }
diff --git a/av1/encoder/context_tree.h b/av1/encoder/context_tree.h
index f4e930b..a399794 100644
--- a/av1/encoder/context_tree.h
+++ b/av1/encoder/context_tree.h
@@ -86,7 +86,7 @@
 
 void av1_setup_pc_tree(struct AV1_COMP *const cpi, struct ThreadData *td);
 void av1_free_pc_tree(const struct AV1_COMP *const cpi, struct ThreadData *td,
-                      const int num_planes);
+                      const int num_planes, BLOCK_SIZE sb_size);
 void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
                            PICK_MODE_CONTEXT *src_ctx);
 
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 3240c54..a0f7775 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -4916,7 +4916,7 @@
     x->color_sensitivity[1] = 0;
     x->content_state_sb = 0;
 
-    PC_TREE *const pc_root = td->pc_root[mib_size_log2 - MIN_MIB_SIZE_LOG2];
+    PC_TREE *const pc_root = td->pc_root;
     pc_root->index = 0;
 
     xd->cur_frame_force_integer_mv = cm->features.cur_frame_force_integer_mv;
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index d894fa4..bdd8158 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -873,7 +873,7 @@
   aom_free(cpi->tplist[0][0]);
   cpi->tplist[0][0] = NULL;
 
-  av1_free_pc_tree(cpi, &cpi->td, num_planes);
+  av1_free_pc_tree(cpi, &cpi->td, num_planes, cm->seq_params.sb_size);
 
   aom_free(cpi->td.mb.palette_buffer);
   av1_release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer);
@@ -2925,7 +2925,7 @@
     if (cm->width > cpi->initial_width || cm->height > cpi->initial_height ||
         seq_params->sb_size != sb_size) {
       av1_free_context_buffers(cm);
-      av1_free_pc_tree(cpi, &cpi->td, num_planes);
+      av1_free_pc_tree(cpi, &cpi->td, num_planes, (BLOCK_SIZE)sb_size);
       alloc_compressor_data(cpi);
       realloc_segmentation_maps(cpi);
       cpi->initial_width = cpi->initial_height = 0;
@@ -3616,7 +3616,8 @@
       }
       aom_free(thread_data->td->mask_buf);
       aom_free(thread_data->td->counts);
-      av1_free_pc_tree(cpi, thread_data->td, num_planes);
+      av1_free_pc_tree(cpi, thread_data->td, num_planes,
+                       cm->seq_params.sb_size);
       aom_free(thread_data->td->mbmi_ext);
       aom_free(thread_data->td);
     }
@@ -4332,7 +4333,7 @@
   if (cpi->initial_width && cpi->initial_height &&
       (cm->width > cpi->initial_width || cm->height > cpi->initial_height)) {
     av1_free_context_buffers(cm);
-    av1_free_pc_tree(cpi, &cpi->td, num_planes);
+    av1_free_pc_tree(cpi, &cpi->td, num_planes, cm->seq_params.sb_size);
     alloc_compressor_data(cpi);
     realloc_segmentation_maps(cpi);
     cpi->initial_width = cpi->initial_height = 0;
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index c12d48b..0a23299 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -685,7 +685,7 @@
   RD_COUNTS rd_counts;
   FRAME_COUNTS *counts;
   PC_TREE *pc_tree;
-  PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
+  PC_TREE *pc_root;
   tran_low_t *tree_coeff_buf[MAX_MB_PLANE];
   tran_low_t *tree_qcoeff_buf[MAX_MB_PLANE];
   tran_low_t *tree_dqcoeff_buf[MAX_MB_PLANE];
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index be5e102..a61f6fd 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -860,8 +860,7 @@
   const SequenceHeader *const seq_params = &cm->seq_params;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
-  const PICK_MODE_CONTEXT *ctx =
-      &cpi->td.pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2]->none;
+  const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
   MV last_mv = kZeroMv;
   const int qindex = find_fp_qindex(seq_params->bit_depth);
   // First pass coding proceeds in raster scan order with unit size of 16x16.