Speed up handling of local buffers in build_inter_predictors_8x8_and_bigger()
This MR avoids the memset corresponding to the MV offsets (i.e., vx/vy buffers), as their population and utilization happen only for the required sub-blocks. Also, the temporary buffers used to store MV offsets, gradient information and OPFL prediction data are moved from stack to heap memory.
The encode time reduction:
* For the GCC compiler, 1.247% overall (1.776% for QP 110 and 135).
* For the Clang compiler, <= 0.5%.
Decode time reductions of ~18% and ~4% are seen for the GCC and Clang compilers, respectively, across all test-sets.
No stats change.
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 835d34a..6e9ca76 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -2618,6 +2618,19 @@
*/
CONV_BUF_TYPE *tmp_conv_dst;
/*!
+ * Temporary buffers used to store the OPFL MV offsets.
+ */
+ int *opfl_vxy_bufs;
+ /*!
+ * Temporary buffers used to store the OPFL gradient information.
+ */
+ int16_t *opfl_gxy_bufs;
+ /*!
+ * Temporary buffers used to store intermediate prediction data calculated
+ * during the OPFL/DMVR.
+ */
+ uint16_t *opfl_dst_bufs;
+ /*!
* Temporary buffers used to build OBMC prediction by above (index 0) and left
* (index 1) predictors respectively.
* tmp_obmc_bufs[i][p * MAX_SB_SQUARE] is the buffer used for plane 'p'.
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index a648bad..d162c59 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -2129,7 +2129,13 @@
suw = ROUND_POWER_OF_TWO_SIGNED_64(suw, redbit);
svw = ROUND_POWER_OF_TWO_SIGNED_64(svw, redbit);
const int64_t det = su2 * sv2 - suv * suv;
- if (det <= 0) return;
+ if (det <= 0) {
+ *vx0 = 0;
+ *vy0 = 0;
+ *vx1 = 0;
+ *vy1 = 0;
+ return;
+ }
int64_t sol[2] = { sv2 * suw - suv * svw, su2 * svw - suv * suw };
@@ -2138,7 +2144,13 @@
*vy0 = (int)-sol[1];
#else
const int64_t det = su2 * sv2 - suv * suv;
- if (det <= 0) return;
+ if (det <= 0) {
+ *vx0 = 0;
+ *vy0 = 0;
+ *vx1 = 0;
+ *vy1 = 0;
+ return;
+ }
const int64_t det_x = (suv * svw - sv2 * suw) * (1 << bits);
const int64_t det_y = (suv * suw - su2 * svw) * (1 << bits);
@@ -4564,9 +4576,9 @@
int subblk_start_x, int subblk_start_y,
#endif // CONFIG_AFFINE_REFINEMENT || CONFIG_REFINED_MVS_IN_TMVP
int pu_width, int pu_height, uint16_t *dst0_16_refinemv,
- uint16_t *dst1_16_refinemv, int16_t *opt_gx0, int16_t *opt_gx1,
- int row_start, int col_start, MV *sb_refined_mv, MV *chroma_refined_mv,
- int build_for_refine_mv_only, ReferenceArea ref_area[2]
+ uint16_t *dst1_16_refinemv, int row_start, int col_start, MV *sb_refined_mv,
+ MV *chroma_refined_mv, int build_for_refine_mv_only,
+ ReferenceArea ref_area[2]
#if CONFIG_TIP_REF_PRED_MERGING
,
int_mv *mv_refined
@@ -4700,15 +4712,6 @@
cm->features.opfl_refine_type == REFINE_ALL));
#endif // CONFIG_TIP_REF_PRED_MERGING
- // Arrays to hold optical flow offsets.
- int vx0[N_OF_OFFSETS] = { 0 };
- int vx1[N_OF_OFFSETS] = { 0 };
- int vy0[N_OF_OFFSETS] = { 0 };
- int vy1[N_OF_OFFSETS] = { 0 };
-
- // Pointers to gradient and dst buffers
- int16_t *gx0, *gy0, *gx1, *gy1;
- uint16_t *dst0 = NULL, *dst1 = NULL;
#if CONFIG_TIP_REF_PRED_MERGING
int use_4x4 = tip_ref_frame ? 0 : 1;
#endif
@@ -4739,11 +4742,17 @@
#endif // CONFIG_AFFINE_REFINEMENT
if (use_optflow_refinement && plane == 0) {
- // Allocate gradient and dst buffers
- gx0 = &opt_gx0[0];
- gx1 = &opt_gx1[0];
- gy0 = gx0 + (REFINEMV_SUBBLOCK_WIDTH * REFINEMV_SUBBLOCK_HEIGHT);
- gy1 = gx1 + (REFINEMV_SUBBLOCK_WIDTH * REFINEMV_SUBBLOCK_HEIGHT);
+ // Pointers to hold optical flow MV offsets.
+ int *vx0 = xd->opfl_vxy_bufs;
+ int *vx1 = xd->opfl_vxy_bufs + (N_OF_OFFSETS * 1);
+ int *vy0 = xd->opfl_vxy_bufs + (N_OF_OFFSETS * 2);
+ int *vy1 = xd->opfl_vxy_bufs + (N_OF_OFFSETS * 3);
+
+ // Pointers to hold gradient and dst buffers.
+ int16_t *gx0 = xd->opfl_gxy_bufs;
+ int16_t *gx1 = xd->opfl_gxy_bufs + (MAX_SB_SQUARE * 1);
+ int16_t *gy0 = xd->opfl_gxy_bufs + (MAX_SB_SQUARE * 2);
+ int16_t *gy1 = xd->opfl_gxy_bufs + (MAX_SB_SQUARE * 3);
// Initialize refined mv
const MV mv0 = best_mv_ref[0];
@@ -4757,9 +4766,8 @@
#endif
// Refine MV using optical flow. The final output MV will be in 1/16
// precision.
- dst0 = &dst0_16_refinemv[0];
- dst1 = &dst1_16_refinemv[0];
-
+ uint16_t *dst0 = xd->opfl_dst_bufs;
+ uint16_t *dst1 = xd->opfl_dst_bufs + MAX_SB_SQUARE;
#if CONFIG_TIP_REF_PRED_MERGING
if (tip_ref_frame) {
use_optflow_refinement = !skip_opfl_refine_with_tip(
@@ -5074,12 +5082,6 @@
uint16_t
dst1_16_refinemv[REFINEMV_SUBBLOCK_WIDTH * REFINEMV_SUBBLOCK_HEIGHT];
#endif // CONFIG_SUBBLK_REF_EXT
- DECLARE_ALIGNED(
- 32, int16_t,
- opt_gx0[2 * REFINEMV_SUBBLOCK_WIDTH * REFINEMV_SUBBLOCK_HEIGHT]);
- DECLARE_ALIGNED(
- 32, int16_t,
- opt_gx1[2 * REFINEMV_SUBBLOCK_WIDTH * REFINEMV_SUBBLOCK_HEIGHT]);
ReferenceArea ref_area[2];
#if !CONFIG_SUBBLK_PAD
@@ -5157,9 +5159,9 @@
#if CONFIG_AFFINE_REFINEMENT || CONFIG_REFINED_MVS_IN_TMVP
w, h,
#endif // CONFIG_AFFINE_REFINEMENT || CONFIG_REFINED_MVS_IN_TMVP
- pu_width, pu_height, dst0_16_refinemv, dst1_16_refinemv, opt_gx0,
- opt_gx1, row_start, col_start, plane == 0 ? luma_refined_mv : NULL,
- chroma_refined_mv, build_for_refine_mv_only, ref_area, mv_refined);
+ pu_width, pu_height, dst0_16_refinemv, dst1_16_refinemv, row_start,
+ col_start, plane == 0 ? luma_refined_mv : NULL, chroma_refined_mv,
+ build_for_refine_mv_only, ref_area, mv_refined);
if (plane == 0 && !tip_ref_frame) {
#else
@@ -5173,9 +5175,9 @@
#if CONFIG_AFFINE_REFINEMENT || CONFIG_REFINED_MVS_IN_TMVP
w, h,
#endif // CONFIG_AFFINE_REFINEMENT || CONFIG_REFINED_MVS_IN_TMVP
- bw, bh, dst0_16_refinemv, dst1_16_refinemv, opt_gx0, opt_gx1,
- row_start, col_start, plane == 0 ? luma_refined_mv : NULL,
- chroma_refined_mv, build_for_refine_mv_only, ref_area);
+ bw, bh, dst0_16_refinemv, dst1_16_refinemv, row_start, col_start,
+ plane == 0 ? luma_refined_mv : NULL, chroma_refined_mv,
+ build_for_refine_mv_only, ref_area);
if (plane == 0) {
#endif // CONFIG_TIP_REF_PRED_MERGING
@@ -5300,15 +5302,15 @@
#endif // CONFIG_AFFINE_REFINEMENT_SB
#endif // CONFIG_AFFINE_REFINEMENT
- // Arrays to hold optical flow offsets.
- int vx0[N_OF_OFFSETS] = { 0 };
- int vx1[N_OF_OFFSETS] = { 0 };
- int vy0[N_OF_OFFSETS] = { 0 };
- int vy1[N_OF_OFFSETS] = { 0 };
-
// Pointers to gradient and dst buffers
if (use_optflow_refinement && plane == 0) {
+ // Pointers to hold optical flow MV offsets.
+ int *vx0 = xd->opfl_vxy_bufs;
+ int *vx1 = xd->opfl_vxy_bufs + (N_OF_OFFSETS * 1);
+ int *vy0 = xd->opfl_vxy_bufs + (N_OF_OFFSETS * 2);
+ int *vy1 = xd->opfl_vxy_bufs + (N_OF_OFFSETS * 3);
+
#if CONFIG_AFFINE_REFINEMENT
assert(mi->comp_refine_type > COMP_REFINE_NONE);
assert(IMPLIES(mi->comp_refine_type >= COMP_AFFINE_REFINE_START,
@@ -5327,13 +5329,10 @@
#endif // CONFIG_OPTFLOW_ON_TIP
);
const int n_blocks = (bw / n) * (bh / n);
- int16_t *gx0, *gy0, *gx1, *gy1;
- DECLARE_ALIGNED(32, int16_t, g0_buf[2 * MAX_SB_SQUARE]);
- DECLARE_ALIGNED(32, int16_t, g1_buf[2 * MAX_SB_SQUARE]);
- gx0 = g0_buf;
- gx1 = g1_buf;
- gy0 = g0_buf + MAX_SB_SQUARE;
- gy1 = g1_buf + MAX_SB_SQUARE;
+ int16_t *gx0 = xd->opfl_gxy_bufs;
+ int16_t *gx1 = xd->opfl_gxy_bufs + (MAX_SB_SQUARE * 1);
+ int16_t *gy0 = xd->opfl_gxy_bufs + (MAX_SB_SQUARE * 2);
+ int16_t *gy1 = xd->opfl_gxy_bufs + (MAX_SB_SQUARE * 3);
// Initialize refined mv
#if CONFIG_REFINEMV
@@ -5351,7 +5350,9 @@
#endif
// Refine MV using optical flow. The final output MV will be in 1/16
// precision.
- uint16_t dst0[MAX_SB_SQUARE], dst1[MAX_SB_SQUARE];
+ uint16_t *dst0 = xd->opfl_dst_bufs;
+ uint16_t *dst1 = xd->opfl_dst_bufs + MAX_SB_SQUARE;
+
#if CONFIG_TIP_REF_PRED_MERGING
if (tip_ref_frame) {
use_optflow_refinement = !skip_opfl_refine_with_tip(
@@ -5387,12 +5388,23 @@
#endif // CONFIG_REFINEMV
);
#if CONFIG_AFFINE_REFINEMENT || CONFIG_REFINED_MVS_IN_TMVP
- for (int mvi = 0; mvi < N_OF_OFFSETS; mvi++) {
+ for (int mvi = 0; mvi < n_blocks; mvi++) {
xd->mv_delta[mvi].mv[0].as_mv.row = vy0[mvi];
xd->mv_delta[mvi].mv[0].as_mv.col = vx0[mvi];
xd->mv_delta[mvi].mv[1].as_mv.row = vy1[mvi];
xd->mv_delta[mvi].mv[1].as_mv.col = vx1[mvi];
}
+
+  // TODO(any): This reset loop is required as the MV delta offsets of optical
+  // flow refinement stored in the 'mv_delta' buffer are accessed beyond
+  // n_blocks when 'AFFINE_CHROMA_REFINE_METHOD' is enabled. Recheck if access
+  // beyond n_blocks of the 'mv_delta' buffer is valid.
+ for (int mvi = n_blocks; mvi < N_OF_OFFSETS; mvi++) {
+ xd->mv_delta[mvi].mv[0].as_mv.row = 0;
+ xd->mv_delta[mvi].mv[0].as_mv.col = 0;
+ xd->mv_delta[mvi].mv[1].as_mv.row = 0;
+ xd->mv_delta[mvi].mv[1].as_mv.col = 0;
+ }
#endif // CONFIG_AFFINE_REFINEMENT || CONFIG_REFINED_MVS_IN_TMVP
#if CONFIG_AFFINE_REFINEMENT_SB
memcpy(xd->wm_params_sb, wms, 2 * NUM_AFFINE_PARAMS * sizeof(wms[0]));
diff --git a/av1/common/x86/optflow_refine_sse4.c b/av1/common/x86/optflow_refine_sse4.c
index 4c1a689..c517e40 100644
--- a/av1/common/x86/optflow_refine_sse4.c
+++ b/av1/common/x86/optflow_refine_sse4.c
@@ -423,7 +423,13 @@
suw = ROUND_POWER_OF_TWO_SIGNED_64(suw, redbit);
svw = ROUND_POWER_OF_TWO_SIGNED_64(svw, redbit);
const int64_t det = su2 * sv2 - suv * suv;
- if (det <= 0) return;
+ if (det <= 0) {
+ *vx0 = 0;
+ *vy0 = 0;
+ *vx1 = 0;
+ *vy1 = 0;
+ return;
+ }
int64_t sol[2] = { sv2 * suw - suv * svw, su2 * svw - suv * suw };
@@ -432,7 +438,13 @@
*vy0 = (int)-sol[1];
#else
const int64_t det = su2 * sv2 - suv * suv;
- if (det <= 0) return;
+ if (det <= 0) {
+ *vx0 = 0;
+ *vy0 = 0;
+ *vx1 = 0;
+ *vy1 = 0;
+ return;
+ }
const int64_t det_x = (suv * svw - sv2 * suw) * (1 << bits);
const int64_t det_y = (suv * suw - su2 * svw) * (1 << bits);
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index c54b4c6..1aced61 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -5235,6 +5235,11 @@
td->dcb.mc_buf[0] = td->mc_buf[0];
td->dcb.mc_buf[1] = td->mc_buf[1];
td->dcb.xd.tmp_conv_dst = td->tmp_conv_dst;
+
+ // Temporary buffers used during the DMVR and OPFL processing.
+ td->dcb.xd.opfl_vxy_bufs = td->opfl_vxy_bufs;
+ td->dcb.xd.opfl_gxy_bufs = td->opfl_gxy_bufs;
+ td->dcb.xd.opfl_dst_bufs = td->opfl_dst_bufs;
for (int j = 0; j < 2; ++j) {
td->dcb.xd.tmp_obmc_bufs[j] = td->tmp_obmc_bufs[j];
}
@@ -5816,6 +5821,35 @@
}
}
+// Free-up the temporary buffers created for DMVR and OPFL processing.
+void av1_free_opfl_tmp_bufs(ThreadData *thread_data) {
+ aom_free(thread_data->opfl_vxy_bufs);
+ thread_data->opfl_vxy_bufs = NULL;
+
+ aom_free(thread_data->opfl_gxy_bufs);
+ thread_data->opfl_gxy_bufs = NULL;
+
+ aom_free(thread_data->opfl_dst_bufs);
+ thread_data->opfl_dst_bufs = NULL;
+}
+
+// Allocate memory for temporary buffers used during the DMVR and OPFL
+// processing.
+static AOM_INLINE void allocate_opfl_tmp_bufs(AV1_COMMON *const cm,
+ ThreadData *thread_data) {
+ CHECK_MEM_ERROR(
+ cm, thread_data->opfl_vxy_bufs,
+ aom_memalign(32, N_OF_OFFSETS * 4 * sizeof(*thread_data->opfl_vxy_bufs)));
+
+ CHECK_MEM_ERROR(cm, thread_data->opfl_gxy_bufs,
+ aom_memalign(32, MAX_SB_SQUARE * 4 *
+ sizeof(*thread_data->opfl_gxy_bufs)));
+
+ CHECK_MEM_ERROR(cm, thread_data->opfl_dst_bufs,
+ aom_memalign(32, MAX_SB_SQUARE * 2 *
+ sizeof(*thread_data->opfl_dst_bufs)));
+}
+
static AOM_INLINE void allocate_mc_tmp_buf(AV1_COMMON *const cm,
ThreadData *thread_data,
int buf_size) {
@@ -5857,6 +5891,11 @@
thread_data->td->dcb.mc_buf[0] = thread_data->td->mc_buf[0];
thread_data->td->dcb.mc_buf[1] = thread_data->td->mc_buf[1];
thread_data->td->dcb.xd.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+ // Temporary buffers used during the DMVR and OPFL processing.
+ thread_data->td->dcb.xd.opfl_vxy_bufs = thread_data->td->opfl_vxy_bufs;
+ thread_data->td->dcb.xd.opfl_gxy_bufs = thread_data->td->opfl_gxy_bufs;
+ thread_data->td->dcb.xd.opfl_dst_bufs = thread_data->td->opfl_dst_bufs;
+
for (int j = 0; j < 2; ++j) {
thread_data->td->dcb.xd.tmp_obmc_bufs[j] =
thread_data->td->tmp_obmc_bufs[j];
@@ -5954,7 +5993,10 @@
DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
if (thread_data->td->mc_buf_size != buf_size) {
av1_free_mc_tmp_buf(thread_data->td);
+ av1_free_opfl_tmp_bufs(thread_data->td);
+
allocate_mc_tmp_buf(cm, thread_data->td, buf_size);
+ allocate_opfl_tmp_bufs(cm, thread_data->td);
}
}
}
@@ -8537,7 +8579,10 @@
const int buf_size = MC_TEMP_BUF_PELS << 1;
if (pbi->td.mc_buf_size != buf_size) {
av1_free_mc_tmp_buf(&pbi->td);
+ av1_free_opfl_tmp_bufs(&pbi->td);
+
allocate_mc_tmp_buf(cm, &pbi->td, buf_size);
+ allocate_opfl_tmp_bufs(cm, &pbi->td);
}
}
diff --git a/av1/decoder/decodeframe.h b/av1/decoder/decodeframe.h
index b01807c..8ef0422 100644
--- a/av1/decoder/decodeframe.h
+++ b/av1/decoder/decodeframe.h
@@ -94,6 +94,7 @@
const uint8_t *data_end);
void av1_free_mc_tmp_buf(struct ThreadData *thread_data);
+void av1_free_opfl_tmp_bufs(struct ThreadData *thread_data);
void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm);
diff --git a/av1/decoder/decoder.c b/av1/decoder/decoder.c
index c230bbf..1189b76 100644
--- a/av1/decoder/decoder.c
+++ b/av1/decoder/decoder.c
@@ -289,6 +289,7 @@
for (int worker_idx = 0; worker_idx < pbi->max_threads - 1; worker_idx++) {
DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
av1_free_mc_tmp_buf(thread_data->td);
+ av1_free_opfl_tmp_bufs(thread_data->td);
aom_free(thread_data->td);
}
aom_free(pbi->thread_data);
@@ -331,6 +332,7 @@
aom_accounting_clear(&pbi->accounting);
#endif
av1_free_mc_tmp_buf(&pbi->td);
+ av1_free_opfl_tmp_bufs(&pbi->td);
aom_img_metadata_array_free(pbi->metadata);
#if DEBUG_EXTQUANT
diff --git a/av1/decoder/decoder.h b/av1/decoder/decoder.h
index c9c3637..b100eb2 100644
--- a/av1/decoder/decoder.h
+++ b/av1/decoder/decoder.h
@@ -133,6 +133,13 @@
int32_t mc_buf_size;
CONV_BUF_TYPE *tmp_conv_dst;
+ // Temporary buffers used to store the OPFL MV offsets.
+ int *opfl_vxy_bufs;
+ // Temporary buffers used to store the OPFL gradient information.
+ int16_t *opfl_gxy_bufs;
+ // Temporary buffers used to store intermediate prediction data calculated
+ // during the OPFL/DMVR.
+ uint16_t *opfl_dst_bufs;
uint16_t *tmp_obmc_bufs[2];
decode_block_visitor_fn_t read_coeffs_tx_intra_block_visit;
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 4165df3..d8f0cec 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -1559,6 +1559,13 @@
//! Buffer to store convolution during averaging process in compound mode.
CONV_BUF_TYPE *tmp_conv_dst;
+ //! Temporary buffers used to store the OPFL MV offsets.
+ int *opfl_vxy_bufs;
+ //! Temporary buffers used to store the OPFL gradient information.
+ int16_t *opfl_gxy_bufs;
+ //! Temporary buffers used to store intermediate prediction data calculated
+ //! during the OPFL/DMVR.
+ uint16_t *opfl_dst_bufs;
/*! \brief Temporary buffer to hold prediction.
*
* Points to a buffer that is used to hold temporary prediction results. This
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 35bc024..00d2417 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -833,6 +833,27 @@
aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*x->tmp_conv_dst)));
x->e_mbd.tmp_conv_dst = x->tmp_conv_dst;
}
+
+ // Temporary buffers used during the DMVR and OPFL processing.
+ if (x->opfl_vxy_bufs == NULL) {
+ CHECK_MEM_ERROR(
+ cm, x->opfl_vxy_bufs,
+ aom_memalign(32, N_OF_OFFSETS * 4 * sizeof(*x->opfl_vxy_bufs)));
+ x->e_mbd.opfl_vxy_bufs = x->opfl_vxy_bufs;
+ }
+ if (x->opfl_gxy_bufs == NULL) {
+ CHECK_MEM_ERROR(
+ cm, x->opfl_gxy_bufs,
+ aom_memalign(32, MAX_SB_SQUARE * 4 * sizeof(*x->opfl_gxy_bufs)));
+ x->e_mbd.opfl_gxy_bufs = x->opfl_gxy_bufs;
+ }
+ if (x->opfl_dst_bufs == NULL) {
+ CHECK_MEM_ERROR(
+ cm, x->opfl_dst_bufs,
+ aom_memalign(32, MAX_SB_SQUARE * 2 * sizeof(*x->opfl_dst_bufs)));
+ x->e_mbd.opfl_dst_bufs = x->opfl_dst_bufs;
+ }
+
for (int i = 0; i < 2; ++i) {
if (x->tmp_pred_bufs[i] == NULL) {
CHECK_MEM_ERROR(cm, x->tmp_pred_bufs[i],
@@ -1325,6 +1346,11 @@
if (t == 0) continue;
aom_free(thread_data->td->palette_buffer);
aom_free(thread_data->td->tmp_conv_dst);
+
+ // Temporary buffers used during the DMVR and OPFL processing.
+ aom_free(thread_data->td->opfl_vxy_bufs);
+ aom_free(thread_data->td->opfl_gxy_bufs);
+ aom_free(thread_data->td->opfl_dst_bufs);
release_compound_type_rd_buffers(&thread_data->td->comp_rd_buffer);
for (int j = 0; j < 2; ++j) {
aom_free(thread_data->td->tmp_pred_bufs[j]);
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index b5f78aa..aae391e 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -1960,6 +1960,13 @@
PALETTE_BUFFER *palette_buffer;
CompoundTypeRdBuffers comp_rd_buffer;
CONV_BUF_TYPE *tmp_conv_dst;
+ // Temporary buffers used to store the OPFL MV offsets.
+ int *opfl_vxy_bufs;
+ // Temporary buffers used to store the OPFL gradient information.
+ int16_t *opfl_gxy_bufs;
+ // Temporary buffers used to store intermediate prediction data calculated
+ // during the OPFL/DMVR.
+ uint16_t *opfl_dst_bufs;
uint16_t *tmp_pred_bufs[2];
int intrabc_used;
int deltaq_used;
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
index 4b875ce..f563303 100644
--- a/av1/encoder/encoder_alloc.h
+++ b/av1/encoder/encoder_alloc.h
@@ -255,6 +255,11 @@
aom_free(cpi->td.mb.palette_buffer);
release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer);
aom_free(cpi->td.mb.tmp_conv_dst);
+
+ // Temporary buffers used during the DMVR and OPFL processing.
+ aom_free(cpi->td.mb.opfl_vxy_bufs);
+ aom_free(cpi->td.mb.opfl_gxy_bufs);
+ aom_free(cpi->td.mb.opfl_dst_bufs);
for (int j = 0; j < 2; ++j) {
aom_free(cpi->td.mb.tmp_pred_bufs[j]);
}
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 589eb38..6d052b6 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -585,6 +585,20 @@
cm, thread_data->td->tmp_conv_dst,
aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
sizeof(*thread_data->td->tmp_conv_dst)));
+ // Temporary buffers used during the DMVR and OPFL processing.
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->opfl_vxy_bufs,
+ aom_memalign(
+ 32, N_OF_OFFSETS * 4 * sizeof(*thread_data->td->opfl_vxy_bufs)));
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->opfl_gxy_bufs,
+ aom_memalign(
+ 32, MAX_SB_SQUARE * 4 * sizeof(*thread_data->td->opfl_gxy_bufs)));
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->opfl_dst_bufs,
+ aom_memalign(
+ 32, MAX_SB_SQUARE * 2 * sizeof(*thread_data->td->opfl_dst_bufs)));
+
for (int j = 0; j < 2; ++j) {
CHECK_MEM_ERROR(
cm, thread_data->td->tmp_pred_bufs[j],
@@ -805,12 +819,25 @@
thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
thread_data->td->mb.comp_rd_buffer = thread_data->td->comp_rd_buffer;
thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+ // Temporary buffers used during the DMVR and OPFL processing.
+ thread_data->td->mb.opfl_vxy_bufs = thread_data->td->opfl_vxy_bufs;
+ thread_data->td->mb.opfl_gxy_bufs = thread_data->td->opfl_gxy_bufs;
+ thread_data->td->mb.opfl_dst_bufs = thread_data->td->opfl_dst_bufs;
+
for (int j = 0; j < 2; ++j) {
thread_data->td->mb.tmp_pred_bufs[j] =
thread_data->td->tmp_pred_bufs[j];
}
thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+ // Temporary buffers used during the DMVR and OPFL processing.
+ thread_data->td->mb.e_mbd.opfl_vxy_bufs =
+ thread_data->td->mb.opfl_vxy_bufs;
+ thread_data->td->mb.e_mbd.opfl_gxy_bufs =
+ thread_data->td->mb.opfl_gxy_bufs;
+ thread_data->td->mb.e_mbd.opfl_dst_bufs =
+ thread_data->td->mb.opfl_dst_bufs;
+
for (int j = 0; j < 2; ++j) {
thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] =
thread_data->td->mb.tmp_pred_bufs[j];