Move allocation of pred buffer in upsampled_pref_error()
This patch moves the pred buffer used by 'upsampled_pref_error()' from a stack array to a buffer allocated once with aom_memalign(), whose pointer is kept in the MACROBLOCK structure and read through 'xd->tmp_upsample_pred'. This improves the function's performance with the GCC compiler due to better stack memory management.
This is a bit-exact change and gives a 1% - 3% encoder time reduction with the GCC compiler (a higher speedup is seen for low QPs).
Performance impact on the Clang compiler is negligible.
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index f82109f..5337f00 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -2644,6 +2644,12 @@
* 'cpi->tile_thr_data[t].td->mb.tmp_pred_bufs'.
*/
uint16_t *tmp_obmc_bufs[2];
+
+ /*!
+ * Temporary buffer used for upsampled prediction.
+ */
+ uint16_t *tmp_upsample_pred;
+
/*!
* Enable IST for current coding block.
*/
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index cfa29b7..7204a93 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -1591,6 +1591,11 @@
* prediction.
*/
uint16_t *tmp_pred_bufs[2];
+
+ /*!
+ * Buffer used for upsampled prediction.
+ */
+ uint16_t *upsample_pred;
/**@}*/
/*****************************************************************************
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 00d2417..542921b 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -834,6 +834,13 @@
x->e_mbd.tmp_conv_dst = x->tmp_conv_dst;
}
+ if (x->upsample_pred == NULL) {
+ CHECK_MEM_ERROR(
+ cm, x->upsample_pred,
+ aom_memalign(16, MAX_SB_SQUARE * sizeof(*x->upsample_pred)));
+ x->e_mbd.tmp_upsample_pred = x->upsample_pred;
+ }
+
// Temporary buffers used during the DMVR and OPFL processing.
if (x->opfl_vxy_bufs == NULL) {
CHECK_MEM_ERROR(
@@ -1346,6 +1353,7 @@
if (t == 0) continue;
aom_free(thread_data->td->palette_buffer);
aom_free(thread_data->td->tmp_conv_dst);
+ aom_free(thread_data->td->upsample_pred);
// Temporary buffers used during the DMVR and OPFL processing.
aom_free(thread_data->td->opfl_vxy_bufs);
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index aae391e..a8481bd 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -1968,6 +1968,8 @@
// during the OPFL/DMVR.
uint16_t *opfl_dst_bufs;
uint16_t *tmp_pred_bufs[2];
+ // Buffer used for upsampled prediction.
+ uint16_t *upsample_pred;
int intrabc_used;
int deltaq_used;
FRAME_CONTEXT *tctx;
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
index f563303..c3fbbe8 100644
--- a/av1/encoder/encoder_alloc.h
+++ b/av1/encoder/encoder_alloc.h
@@ -255,6 +255,7 @@
aom_free(cpi->td.mb.palette_buffer);
release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer);
aom_free(cpi->td.mb.tmp_conv_dst);
+ aom_free(cpi->td.mb.upsample_pred);
// Temporary buffers used during the DMVR and OPFL processing.
aom_free(cpi->td.mb.opfl_vxy_bufs);
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 6d052b6..4541889 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -599,6 +599,11 @@
aom_memalign(
32, MAX_SB_SQUARE * 2 * sizeof(*thread_data->td->opfl_dst_bufs)));
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->upsample_pred,
+ aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*thread_data->td->upsample_pred)));
+
for (int j = 0; j < 2; ++j) {
CHECK_MEM_ERROR(
cm, thread_data->td->tmp_pred_bufs[j],
@@ -819,6 +824,7 @@
thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
thread_data->td->mb.comp_rd_buffer = thread_data->td->comp_rd_buffer;
thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+ thread_data->td->mb.upsample_pred = thread_data->td->upsample_pred;
// Temporary buffers used during the DMVR and OPFL processing.
thread_data->td->mb.opfl_vxy_bufs = thread_data->td->opfl_vxy_bufs;
thread_data->td->mb.opfl_gxy_bufs = thread_data->td->opfl_gxy_bufs;
@@ -830,6 +836,8 @@
}
thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+ thread_data->td->mb.e_mbd.tmp_upsample_pred =
+ thread_data->td->mb.upsample_pred;
// Temporary buffers used during the DMVR and OPFL processing.
thread_data->td->mb.e_mbd.opfl_vxy_bufs =
thread_data->td->mb.opfl_vxy_bufs;
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 307da01..7a493d8 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -3347,6 +3347,7 @@
const uint16_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
const int src_stride = ms_buffers->src->stride;
const int ref_stride = ms_buffers->ref->stride;
+ uint16_t *pred = xd->tmp_upsample_pred;
const uint16_t *second_pred = ms_buffers->second_pred;
const uint8_t *mask = ms_buffers->mask;
const int mask_stride = ms_buffers->mask_stride;
@@ -3363,7 +3364,6 @@
const int is_scaled_ref = ms_buffers->src->width == ms_buffers->ref->width &&
ms_buffers->src->height == ms_buffers->ref->height;
- DECLARE_ALIGNED(16, uint16_t, pred[MAX_SB_SQUARE]);
if (second_pred != NULL) {
if (mask) {
aom_highbd_comp_mask_upsampled_pred(