Resolve extremely large stack alloc in rdopt

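Move the large temporary buffers in rdopt from on-stack arrays to
heap-allocated buffers owned by the encoder and by each worker
thread. This resolves the excessive stack usage when ext-partition,
motion-var, and high bit-depth are all enabled, so the configure
workaround that disabled highbitdepth with ext-partition is removed.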

BUG=aomedia:415

Change-Id: I85b77bbc6429093fcb0152176d9e237087d6bbd8
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 47bf4f7..e16479e 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -147,6 +147,8 @@
 #if CONFIG_MOTION_VAR
   int32_t *wsrc_buf;
   int32_t *mask_buf;
+  uint8_t *above_pred_buf;
+  uint8_t *left_pred_buf;
 #endif  // CONFIG_MOTION_VAR
 
 #if CONFIG_PALETTE
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index af89c0b..4782ce2 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -453,6 +453,20 @@
   aom_free(cpi->active_map.map);
   cpi->active_map.map = NULL;
 
+#if CONFIG_MOTION_VAR
+  aom_free(cpi->td.mb.above_pred_buf);
+  cpi->td.mb.above_pred_buf = NULL;
+
+  aom_free(cpi->td.mb.left_pred_buf);
+  cpi->td.mb.left_pred_buf = NULL;
+
+  aom_free(cpi->td.mb.wsrc_buf);
+  cpi->td.mb.wsrc_buf = NULL;
+
+  aom_free(cpi->td.mb.mask_buf);
+  cpi->td.mb.mask_buf = NULL;
+#endif
+
   // Free up-sampled reference buffers.
   for (i = 0; i < (REF_FRAMES + 1); i++)
     aom_free_frame_buffer(&cpi->upsampled_ref_bufs[i].buf);
@@ -2239,6 +2253,31 @@
   }
 #endif
 
+#if CONFIG_MOTION_VAR
+#if CONFIG_HIGHBITDEPTH
+  int buf_scaler = 2;
+#else
+  int buf_scaler = 1;
+#endif
+  CHECK_MEM_ERROR(
+      cm, cpi->td.mb.above_pred_buf,
+      (uint8_t *)aom_memalign(16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
+                                      sizeof(*cpi->td.mb.above_pred_buf)));
+  CHECK_MEM_ERROR(
+      cm, cpi->td.mb.left_pred_buf,
+      (uint8_t *)aom_memalign(16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
+                                      sizeof(*cpi->td.mb.left_pred_buf)));
+
+  CHECK_MEM_ERROR(cm, cpi->td.mb.wsrc_buf,
+                  (int32_t *)aom_memalign(
+                      16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.wsrc_buf)));
+
+  CHECK_MEM_ERROR(cm, cpi->td.mb.mask_buf,
+                  (int32_t *)aom_memalign(
+                      16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf)));
+
+#endif
+
   init_upsampled_ref_frame_bufs(cpi);
 
   av1_set_speed_features_framesize_independent(cpi);
@@ -2530,6 +2569,12 @@
       if (cpi->common.allow_screen_content_tools)
         aom_free(thread_data->td->palette_buffer);
 #endif  // CONFIG_PALETTE
+#if CONFIG_MOTION_VAR
+      aom_free(thread_data->td->above_pred_buf);
+      aom_free(thread_data->td->left_pred_buf);
+      aom_free(thread_data->td->wsrc_buf);
+      aom_free(thread_data->td->mask_buf);
+#endif  // CONFIG_MOTION_VAR
       aom_free(thread_data->td->counts);
       av1_free_pc_tree(thread_data->td);
       aom_free(thread_data->td);
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 626df3f..ee1257c 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -321,6 +321,12 @@
   PICK_MODE_CONTEXT *leaf_tree;
   PC_TREE *pc_tree;
   PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
+#if CONFIG_MOTION_VAR
+  int32_t *wsrc_buf;
+  int32_t *mask_buf;
+  uint8_t *above_pred_buf;
+  uint8_t *left_pred_buf;
+#endif
 
 #if CONFIG_PALETTE
   PALETTE_BUFFER *palette_buffer;
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index df4981f..7af5f78 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -93,6 +93,29 @@
         thread_data->td->pc_tree = NULL;
         av1_setup_pc_tree(cm, thread_data->td);
 
+#if CONFIG_MOTION_VAR
+#if CONFIG_HIGHBITDEPTH
+        int buf_scaler = 2;
+#else
+        int buf_scaler = 1;
+#endif
+        CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
+                        (uint8_t *)aom_memalign(
+                            16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
+                                    sizeof(*thread_data->td->above_pred_buf)));
+        CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf,
+                        (uint8_t *)aom_memalign(
+                            16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
+                                    sizeof(*thread_data->td->left_pred_buf)));
+        CHECK_MEM_ERROR(
+            cm, thread_data->td->wsrc_buf,
+            (int32_t *)aom_memalign(
+                16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf)));
+        CHECK_MEM_ERROR(
+            cm, thread_data->td->mask_buf,
+            (int32_t *)aom_memalign(
+                16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf)));
+#endif
         // Allocate frame counters in thread data.
         CHECK_MEM_ERROR(cm, thread_data->td->counts,
                         aom_calloc(1, sizeof(*thread_data->td->counts)));
@@ -132,6 +155,12 @@
     if (thread_data->td != &cpi->td) {
       thread_data->td->mb = cpi->td.mb;
       thread_data->td->rd_counts = cpi->td.rd_counts;
+#if CONFIG_MOTION_VAR
+      thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf;
+      thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf;
+      thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf;
+      thread_data->td->mb.mask_buf = thread_data->td->mask_buf;
+#endif
     }
     if (thread_data->td->counts != &cpi->common.counts) {
       memcpy(thread_data->td->counts, &cpi->common.counts,
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index b23acce..57c8bc8 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -9948,15 +9948,6 @@
   const MODE_INFO *left_mi = xd->left_mi;
 #endif  // CONFIG_PALETTE
 #if CONFIG_MOTION_VAR
-#if CONFIG_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
-#else
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]);
-#endif  // CONFIG_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, int32_t, weighted_src_buf[MAX_SB_SQUARE]);
-  DECLARE_ALIGNED(16, int32_t, mask2d_buf[MAX_SB_SQUARE]);
   int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
   int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
   int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
@@ -9965,22 +9956,24 @@
 #if CONFIG_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     int len = sizeof(uint16_t);
-    args.above_pred_buf[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
-    args.above_pred_buf[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
+    args.above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
+    args.above_pred_buf[1] =
+        CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len);
     args.above_pred_buf[2] =
-        CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_SB_SQUARE * len);
-    args.left_pred_buf[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
-    args.left_pred_buf[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
+        CONVERT_TO_BYTEPTR(x->above_pred_buf + 2 * MAX_SB_SQUARE * len);
+    args.left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf);
+    args.left_pred_buf[1] =
+        CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len);
     args.left_pred_buf[2] =
-        CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_SB_SQUARE * len);
+        CONVERT_TO_BYTEPTR(x->left_pred_buf + 2 * MAX_SB_SQUARE * len);
   } else {
 #endif  // CONFIG_HIGHBITDEPTH
-    args.above_pred_buf[0] = tmp_buf1;
-    args.above_pred_buf[1] = tmp_buf1 + MAX_SB_SQUARE;
-    args.above_pred_buf[2] = tmp_buf1 + 2 * MAX_SB_SQUARE;
-    args.left_pred_buf[0] = tmp_buf2;
-    args.left_pred_buf[1] = tmp_buf2 + MAX_SB_SQUARE;
-    args.left_pred_buf[2] = tmp_buf2 + 2 * MAX_SB_SQUARE;
+    args.above_pred_buf[0] = x->above_pred_buf;
+    args.above_pred_buf[1] = x->above_pred_buf + MAX_SB_SQUARE;
+    args.above_pred_buf[2] = x->above_pred_buf + 2 * MAX_SB_SQUARE;
+    args.left_pred_buf[0] = x->left_pred_buf;
+    args.left_pred_buf[1] = x->left_pred_buf + MAX_SB_SQUARE;
+    args.left_pred_buf[2] = x->left_pred_buf + 2 * MAX_SB_SQUARE;
 #if CONFIG_HIGHBITDEPTH
   }
 #endif  // CONFIG_HIGHBITDEPTH
@@ -10088,8 +10081,6 @@
                                        dst_height2, args.left_pred_stride);
     av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
                          mi_col);
-    x->mask_buf = mask2d_buf;
-    x->wsrc_buf = weighted_src_buf;
     calc_target_weighted_pred(cm, x, xd, mi_row, mi_col, args.above_pred_buf[0],
                               args.above_pred_stride[0], args.left_pred_buf[0],
                               args.left_pred_stride[0]);
diff --git a/configure b/configure
index bd4a982..969f8f3 100755
--- a/configure
+++ b/configure
@@ -507,9 +507,6 @@
     soft_enable palette_throughput
     soft_enable tempmv_signaling
 
-    # Workaround features currently incompatible with highbitdepth
-    enabled ext_partition && disable_feature highbitdepth
-
     # Fix up experiment dependencies
     enabled pvq && enable_feature ec_adapt
     enabled pvq && disable_feature chroma_sub8x8