Reduce memory usage in PICK_MODE_CONTEXT

coeff, qcoeff, and dqcoeff in PICK_MODE_CONTEXT are temporary buffers, so it's not necessary to allocate a separate buffer for every partition. Sharing one set of per-plane buffers across all partition contexts saves ~26MB per thread.

Change-Id: I4b5395439196a44eda302dfb141d420467cbb2fa
diff --git a/av1/encoder/context_tree.c b/av1/encoder/context_tree.c
index 57f59f3..40df6c1 100644
--- a/av1/encoder/context_tree.c
+++ b/av1/encoder/context_tree.c
@@ -16,8 +16,15 @@
   BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128,
 };
 
+typedef struct {
+  tran_low_t *coeff_buf[MAX_MB_PLANE];
+  tran_low_t *qcoeff_buf[MAX_MB_PLANE];
+  tran_low_t *dqcoeff_buf[MAX_MB_PLANE];
+} PC_TREE_SHARED_BUFFERS;
+
 static void alloc_mode_context(AV1_COMMON *cm, int num_pix,
-                               PICK_MODE_CONTEXT *ctx) {
+                               PICK_MODE_CONTEXT *ctx,
+                               PC_TREE_SHARED_BUFFERS *shared_bufs) {
   const int num_planes = av1_num_planes(cm);
   int i;
   const int num_blk = num_pix / 16;
@@ -25,12 +32,9 @@
 
   CHECK_MEM_ERROR(cm, ctx->blk_skip, aom_calloc(num_blk, sizeof(uint8_t)));
   for (i = 0; i < num_planes; ++i) {
-    CHECK_MEM_ERROR(cm, ctx->coeff[i],
-                    aom_memalign(32, num_pix * sizeof(*ctx->coeff[i])));
-    CHECK_MEM_ERROR(cm, ctx->qcoeff[i],
-                    aom_memalign(32, num_pix * sizeof(*ctx->qcoeff[i])));
-    CHECK_MEM_ERROR(cm, ctx->dqcoeff[i],
-                    aom_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i])));
+    ctx->coeff[i] = shared_bufs->coeff_buf[i];
+    ctx->qcoeff[i] = shared_bufs->qcoeff_buf[i];
+    ctx->dqcoeff[i] = shared_bufs->dqcoeff_buf[i];
     CHECK_MEM_ERROR(cm, ctx->eobs[i],
                     aom_memalign(32, num_blk * sizeof(*ctx->eobs[i])));
     CHECK_MEM_ERROR(
@@ -52,11 +56,8 @@
   aom_free(ctx->blk_skip);
   ctx->blk_skip = 0;
   for (i = 0; i < num_planes; ++i) {
-    aom_free(ctx->coeff[i]);
     ctx->coeff[i] = 0;
-    aom_free(ctx->qcoeff[i]);
     ctx->qcoeff[i] = 0;
-    aom_free(ctx->dqcoeff[i]);
     ctx->dqcoeff[i] = 0;
     aom_free(ctx->eobs[i]);
     ctx->eobs[i] = 0;
@@ -71,36 +72,37 @@
 }
 
 static void alloc_tree_contexts(AV1_COMMON *cm, PC_TREE *tree, int num_pix,
-                                int is_leaf) {
-  alloc_mode_context(cm, num_pix, &tree->none);
+                                int is_leaf,
+                                PC_TREE_SHARED_BUFFERS *shared_bufs) {
+  alloc_mode_context(cm, num_pix, &tree->none, shared_bufs);
 
   if (is_leaf) return;
 
-  alloc_mode_context(cm, num_pix / 2, &tree->horizontal[0]);
-  alloc_mode_context(cm, num_pix / 2, &tree->vertical[0]);
+  alloc_mode_context(cm, num_pix / 2, &tree->horizontal[0], shared_bufs);
+  alloc_mode_context(cm, num_pix / 2, &tree->vertical[0], shared_bufs);
 
-  alloc_mode_context(cm, num_pix / 2, &tree->horizontal[1]);
-  alloc_mode_context(cm, num_pix / 2, &tree->vertical[1]);
+  alloc_mode_context(cm, num_pix / 2, &tree->horizontal[1], shared_bufs);
+  alloc_mode_context(cm, num_pix / 2, &tree->vertical[1], shared_bufs);
 
-  alloc_mode_context(cm, num_pix / 4, &tree->horizontala[0]);
-  alloc_mode_context(cm, num_pix / 4, &tree->horizontala[1]);
-  alloc_mode_context(cm, num_pix / 2, &tree->horizontala[2]);
+  alloc_mode_context(cm, num_pix / 4, &tree->horizontala[0], shared_bufs);
+  alloc_mode_context(cm, num_pix / 4, &tree->horizontala[1], shared_bufs);
+  alloc_mode_context(cm, num_pix / 2, &tree->horizontala[2], shared_bufs);
 
-  alloc_mode_context(cm, num_pix / 2, &tree->horizontalb[0]);
-  alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[1]);
-  alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[2]);
+  alloc_mode_context(cm, num_pix / 2, &tree->horizontalb[0], shared_bufs);
+  alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[1], shared_bufs);
+  alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[2], shared_bufs);
 
-  alloc_mode_context(cm, num_pix / 4, &tree->verticala[0]);
-  alloc_mode_context(cm, num_pix / 4, &tree->verticala[1]);
-  alloc_mode_context(cm, num_pix / 2, &tree->verticala[2]);
+  alloc_mode_context(cm, num_pix / 4, &tree->verticala[0], shared_bufs);
+  alloc_mode_context(cm, num_pix / 4, &tree->verticala[1], shared_bufs);
+  alloc_mode_context(cm, num_pix / 2, &tree->verticala[2], shared_bufs);
 
-  alloc_mode_context(cm, num_pix / 2, &tree->verticalb[0]);
-  alloc_mode_context(cm, num_pix / 4, &tree->verticalb[1]);
-  alloc_mode_context(cm, num_pix / 4, &tree->verticalb[2]);
+  alloc_mode_context(cm, num_pix / 2, &tree->verticalb[0], shared_bufs);
+  alloc_mode_context(cm, num_pix / 4, &tree->verticalb[1], shared_bufs);
+  alloc_mode_context(cm, num_pix / 4, &tree->verticalb[2], shared_bufs);
 
   for (int i = 0; i < 4; ++i) {
-    alloc_mode_context(cm, num_pix / 4, &tree->horizontal4[i]);
-    alloc_mode_context(cm, num_pix / 4, &tree->vertical4[i]);
+    alloc_mode_context(cm, num_pix / 4, &tree->horizontal4[i], shared_bufs);
+    alloc_mode_context(cm, num_pix / 4, &tree->vertical4[i], shared_bufs);
   }
 }
 
@@ -135,6 +137,7 @@
   const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
   int pc_tree_index = 0;
   PC_TREE *this_pc;
+  PC_TREE_SHARED_BUFFERS shared_bufs;
   int square_index = 1;
   int nodes;
 
@@ -143,11 +146,24 @@
                   aom_calloc(tree_nodes, sizeof(*td->pc_tree)));
   this_pc = &td->pc_tree[0];
 
+  for (i = 0; i < 3; i++) {
+    const int max_num_pix = MAX_SB_SIZE * MAX_SB_SIZE;
+    CHECK_MEM_ERROR(cm, td->tree_coeff_buf[i],
+                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+    CHECK_MEM_ERROR(cm, td->tree_qcoeff_buf[i],
+                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+    CHECK_MEM_ERROR(cm, td->tree_dqcoeff_buf[i],
+                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+    shared_bufs.coeff_buf[i] = td->tree_coeff_buf[i];
+    shared_bufs.qcoeff_buf[i] = td->tree_qcoeff_buf[i];
+    shared_bufs.dqcoeff_buf[i] = td->tree_dqcoeff_buf[i];
+  }
+
   // Sets up all the leaf nodes in the tree.
   for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
     PC_TREE *const tree = &td->pc_tree[pc_tree_index];
     tree->block_size = square[0];
-    alloc_tree_contexts(cm, tree, 16, 1);
+    alloc_tree_contexts(cm, tree, 16, 1, &shared_bufs);
   }
 
   // Each node has 4 leaf nodes, fill each block_size level of the tree
@@ -155,7 +171,7 @@
   for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
     for (i = 0; i < nodes; ++i) {
       PC_TREE *const tree = &td->pc_tree[pc_tree_index];
-      alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 0);
+      alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 0, &shared_bufs);
       tree->block_size = square[square_index];
       for (j = 0; j < 4; j++) tree->split[j] = this_pc++;
       ++pc_tree_index;
@@ -181,6 +197,14 @@
     for (int i = 0; i < tree_nodes; ++i) {
       free_tree_contexts(&td->pc_tree[i], num_planes);
     }
+    for (int i = 0; i < 3; ++i) {
+      aom_free(td->tree_coeff_buf[i]);
+      aom_free(td->tree_qcoeff_buf[i]);
+      aom_free(td->tree_dqcoeff_buf[i]);
+      td->tree_coeff_buf[i] = NULL;
+      td->tree_qcoeff_buf[i] = NULL;
+      td->tree_dqcoeff_buf[i] = NULL;
+    }
     aom_free(td->pc_tree);
     td->pc_tree = NULL;
   }
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 14ce6c1..016daac 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -532,6 +532,9 @@
   FRAME_COUNTS *counts;
   PC_TREE *pc_tree;
   PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
+  tran_low_t *tree_coeff_buf[MAX_MB_PLANE];
+  tran_low_t *tree_qcoeff_buf[MAX_MB_PLANE];
+  tran_low_t *tree_dqcoeff_buf[MAX_MB_PLANE];
 #if CONFIG_COLLECT_INTER_MODE_RD_STATS
   InterModesInfo *inter_modes_info;
 #endif