Resolve extremely large stack alloc in rdopt
Move the large buffers in rdopt from on-stack array declarations to
heap memory allocated once for the encoder and for each worker
thread. This resolves the extremely large stack usage that occurs
when ext-partition, motion-var, and high bit-depth are all turned
on.
BUG=aomedia:415
Change-Id: I85b77bbc6429093fcb0152176d9e237087d6bbd8
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 47bf4f7..e16479e 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -147,6 +147,8 @@
#if CONFIG_MOTION_VAR
int32_t *wsrc_buf;
int32_t *mask_buf;
+ uint8_t *above_pred_buf;
+ uint8_t *left_pred_buf;
#endif // CONFIG_MOTION_VAR
#if CONFIG_PALETTE
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index af89c0b..4782ce2 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -453,6 +453,20 @@
aom_free(cpi->active_map.map);
cpi->active_map.map = NULL;
+#if CONFIG_MOTION_VAR
+ aom_free(cpi->td.mb.above_pred_buf);
+ cpi->td.mb.above_pred_buf = NULL;
+
+ aom_free(cpi->td.mb.left_pred_buf);
+ cpi->td.mb.left_pred_buf = NULL;
+
+ aom_free(cpi->td.mb.wsrc_buf);
+ cpi->td.mb.wsrc_buf = NULL;
+
+ aom_free(cpi->td.mb.mask_buf);
+ cpi->td.mb.mask_buf = NULL;
+#endif
+
// Free up-sampled reference buffers.
for (i = 0; i < (REF_FRAMES + 1); i++)
aom_free_frame_buffer(&cpi->upsampled_ref_bufs[i].buf);
@@ -2239,6 +2253,31 @@
}
#endif
+#if CONFIG_MOTION_VAR
+#if CONFIG_HIGHBITDEPTH
+ int buf_scaler = 2;
+#else
+ int buf_scaler = 1;
+#endif
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.above_pred_buf,
+ (uint8_t *)aom_memalign(16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*cpi->td.mb.above_pred_buf)));
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.left_pred_buf,
+ (uint8_t *)aom_memalign(16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*cpi->td.mb.left_pred_buf)));
+
+ CHECK_MEM_ERROR(cm, cpi->td.mb.wsrc_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.wsrc_buf)));
+
+ CHECK_MEM_ERROR(cm, cpi->td.mb.mask_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf)));
+
+#endif
+
init_upsampled_ref_frame_bufs(cpi);
av1_set_speed_features_framesize_independent(cpi);
@@ -2530,6 +2569,12 @@
if (cpi->common.allow_screen_content_tools)
aom_free(thread_data->td->palette_buffer);
#endif // CONFIG_PALETTE
+#if CONFIG_MOTION_VAR
+ aom_free(thread_data->td->above_pred_buf);
+ aom_free(thread_data->td->left_pred_buf);
+ aom_free(thread_data->td->wsrc_buf);
+ aom_free(thread_data->td->mask_buf);
+#endif // CONFIG_MOTION_VAR
aom_free(thread_data->td->counts);
av1_free_pc_tree(thread_data->td);
aom_free(thread_data->td);
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 626df3f..ee1257c 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -321,6 +321,12 @@
PICK_MODE_CONTEXT *leaf_tree;
PC_TREE *pc_tree;
PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
+#if CONFIG_MOTION_VAR
+ int32_t *wsrc_buf;
+ int32_t *mask_buf;
+ uint8_t *above_pred_buf;
+ uint8_t *left_pred_buf;
+#endif
#if CONFIG_PALETTE
PALETTE_BUFFER *palette_buffer;
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index df4981f..7af5f78 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -93,6 +93,29 @@
thread_data->td->pc_tree = NULL;
av1_setup_pc_tree(cm, thread_data->td);
+#if CONFIG_MOTION_VAR
+#if CONFIG_HIGHBITDEPTH
+ int buf_scaler = 2;
+#else
+ int buf_scaler = 1;
+#endif
+ CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
+ (uint8_t *)aom_memalign(
+ 16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*thread_data->td->above_pred_buf)));
+ CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf,
+ (uint8_t *)aom_memalign(
+ 16, buf_scaler * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*thread_data->td->left_pred_buf)));
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->wsrc_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf)));
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->mask_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf)));
+#endif
// Allocate frame counters in thread data.
CHECK_MEM_ERROR(cm, thread_data->td->counts,
aom_calloc(1, sizeof(*thread_data->td->counts)));
@@ -132,6 +155,12 @@
if (thread_data->td != &cpi->td) {
thread_data->td->mb = cpi->td.mb;
thread_data->td->rd_counts = cpi->td.rd_counts;
+#if CONFIG_MOTION_VAR
+ thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf;
+ thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf;
+ thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf;
+ thread_data->td->mb.mask_buf = thread_data->td->mask_buf;
+#endif
}
if (thread_data->td->counts != &cpi->common.counts) {
memcpy(thread_data->td->counts, &cpi->common.counts,
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index b23acce..57c8bc8 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -9948,15 +9948,6 @@
const MODE_INFO *left_mi = xd->left_mi;
#endif // CONFIG_PALETTE
#if CONFIG_MOTION_VAR
-#if CONFIG_HIGHBITDEPTH
- DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
- DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
-#else
- DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]);
- DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]);
-#endif // CONFIG_HIGHBITDEPTH
- DECLARE_ALIGNED(16, int32_t, weighted_src_buf[MAX_SB_SQUARE]);
- DECLARE_ALIGNED(16, int32_t, mask2d_buf[MAX_SB_SQUARE]);
int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
@@ -9965,22 +9956,24 @@
#if CONFIG_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
int len = sizeof(uint16_t);
- args.above_pred_buf[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
- args.above_pred_buf[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
+ args.above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
+ args.above_pred_buf[1] =
+ CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len);
args.above_pred_buf[2] =
- CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_SB_SQUARE * len);
- args.left_pred_buf[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
- args.left_pred_buf[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
+ CONVERT_TO_BYTEPTR(x->above_pred_buf + 2 * MAX_SB_SQUARE * len);
+ args.left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf);
+ args.left_pred_buf[1] =
+ CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len);
args.left_pred_buf[2] =
- CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_SB_SQUARE * len);
+ CONVERT_TO_BYTEPTR(x->left_pred_buf + 2 * MAX_SB_SQUARE * len);
} else {
#endif // CONFIG_HIGHBITDEPTH
- args.above_pred_buf[0] = tmp_buf1;
- args.above_pred_buf[1] = tmp_buf1 + MAX_SB_SQUARE;
- args.above_pred_buf[2] = tmp_buf1 + 2 * MAX_SB_SQUARE;
- args.left_pred_buf[0] = tmp_buf2;
- args.left_pred_buf[1] = tmp_buf2 + MAX_SB_SQUARE;
- args.left_pred_buf[2] = tmp_buf2 + 2 * MAX_SB_SQUARE;
+ args.above_pred_buf[0] = x->above_pred_buf;
+ args.above_pred_buf[1] = x->above_pred_buf + MAX_SB_SQUARE;
+ args.above_pred_buf[2] = x->above_pred_buf + 2 * MAX_SB_SQUARE;
+ args.left_pred_buf[0] = x->left_pred_buf;
+ args.left_pred_buf[1] = x->left_pred_buf + MAX_SB_SQUARE;
+ args.left_pred_buf[2] = x->left_pred_buf + 2 * MAX_SB_SQUARE;
#if CONFIG_HIGHBITDEPTH
}
#endif // CONFIG_HIGHBITDEPTH
@@ -10088,8 +10081,6 @@
dst_height2, args.left_pred_stride);
av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
mi_col);
- x->mask_buf = mask2d_buf;
- x->wsrc_buf = weighted_src_buf;
calc_target_weighted_pred(cm, x, xd, mi_row, mi_col, args.above_pred_buf[0],
args.above_pred_stride[0], args.left_pred_buf[0],
args.left_pred_stride[0]);
diff --git a/configure b/configure
index bd4a982..969f8f3 100755
--- a/configure
+++ b/configure
@@ -507,9 +507,6 @@
soft_enable palette_throughput
soft_enable tempmv_signaling
- # Workaround features currently incompatible with highbitdepth
- enabled ext_partition && disable_feature highbitdepth
-
# Fix up experiment dependencies
enabled pvq && enable_feature ec_adapt
enabled pvq && disable_feature chroma_sub8x8