Speed up handling of local buffers in build_inter_predictors_8x8_and_bigger()

This MR avoids the memset corresponding to the MV offsets (i.e., the vx/vy buffers), as their population and use happen only for the required sub-blocks. Also, the temporary buffers used to store MV offsets, gradient information and OPFL prediction data are moved from the stack to heap memory.

The encode time reduction:

* For the GCC compiler, 1.247% overall and 1.776% for QP 110 & 135.
* For CLANG compiler, \<= 0.5%.

Decode time reductions of \~18% and \~4% are seen for the GCC and CLANG compilers, respectively, across all test-sets.

No stats change.
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 835d34a..6e9ca76 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -2618,6 +2618,19 @@
    */
   CONV_BUF_TYPE *tmp_conv_dst;
   /*!
+   * Temporary buffers used to store the OPFL MV offsets.
+   */
+  int *opfl_vxy_bufs;
+  /*!
+   * Temporary buffers used to store the OPFL gradient information.
+   */
+  int16_t *opfl_gxy_bufs;
+  /*!
+   * Temporary buffers used to store intermediate prediction data calculated
+   * during the OPFL/DMVR.
+   */
+  uint16_t *opfl_dst_bufs;
+  /*!
    * Temporary buffers used to build OBMC prediction by above (index 0) and left
    * (index 1) predictors respectively.
    * tmp_obmc_bufs[i][p * MAX_SB_SQUARE] is the buffer used for plane 'p'.
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index a648bad..d162c59 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -2129,7 +2129,13 @@
   suw = ROUND_POWER_OF_TWO_SIGNED_64(suw, redbit);
   svw = ROUND_POWER_OF_TWO_SIGNED_64(svw, redbit);
   const int64_t det = su2 * sv2 - suv * suv;
-  if (det <= 0) return;
+  if (det <= 0) {
+    *vx0 = 0;
+    *vy0 = 0;
+    *vx1 = 0;
+    *vy1 = 0;
+    return;
+  }
 
   int64_t sol[2] = { sv2 * suw - suv * svw, su2 * svw - suv * suw };
 
@@ -2138,7 +2144,13 @@
   *vy0 = (int)-sol[1];
 #else
   const int64_t det = su2 * sv2 - suv * suv;
-  if (det <= 0) return;
+  if (det <= 0) {
+    *vx0 = 0;
+    *vy0 = 0;
+    *vx1 = 0;
+    *vy1 = 0;
+    return;
+  }
   const int64_t det_x = (suv * svw - sv2 * suw) * (1 << bits);
   const int64_t det_y = (suv * suw - su2 * svw) * (1 << bits);
 
@@ -4564,9 +4576,9 @@
     int subblk_start_x, int subblk_start_y,
 #endif  // CONFIG_AFFINE_REFINEMENT || CONFIG_REFINED_MVS_IN_TMVP
     int pu_width, int pu_height, uint16_t *dst0_16_refinemv,
-    uint16_t *dst1_16_refinemv, int16_t *opt_gx0, int16_t *opt_gx1,
-    int row_start, int col_start, MV *sb_refined_mv, MV *chroma_refined_mv,
-    int build_for_refine_mv_only, ReferenceArea ref_area[2]
+    uint16_t *dst1_16_refinemv, int row_start, int col_start, MV *sb_refined_mv,
+    MV *chroma_refined_mv, int build_for_refine_mv_only,
+    ReferenceArea ref_area[2]
 #if CONFIG_TIP_REF_PRED_MERGING
     ,
     int_mv *mv_refined
@@ -4700,15 +4712,6 @@
                  cm->features.opfl_refine_type == REFINE_ALL));
 #endif  // CONFIG_TIP_REF_PRED_MERGING
 
-  // Arrays to hold optical flow offsets.
-  int vx0[N_OF_OFFSETS] = { 0 };
-  int vx1[N_OF_OFFSETS] = { 0 };
-  int vy0[N_OF_OFFSETS] = { 0 };
-  int vy1[N_OF_OFFSETS] = { 0 };
-
-  // Pointers to gradient and dst buffers
-  int16_t *gx0, *gy0, *gx1, *gy1;
-  uint16_t *dst0 = NULL, *dst1 = NULL;
 #if CONFIG_TIP_REF_PRED_MERGING
   int use_4x4 = tip_ref_frame ? 0 : 1;
 #endif
@@ -4739,11 +4742,17 @@
 #endif  // CONFIG_AFFINE_REFINEMENT
 
   if (use_optflow_refinement && plane == 0) {
-    // Allocate gradient and dst buffers
-    gx0 = &opt_gx0[0];
-    gx1 = &opt_gx1[0];
-    gy0 = gx0 + (REFINEMV_SUBBLOCK_WIDTH * REFINEMV_SUBBLOCK_HEIGHT);
-    gy1 = gx1 + (REFINEMV_SUBBLOCK_WIDTH * REFINEMV_SUBBLOCK_HEIGHT);
+    // Pointers to hold optical flow MV offsets.
+    int *vx0 = xd->opfl_vxy_bufs;
+    int *vx1 = xd->opfl_vxy_bufs + (N_OF_OFFSETS * 1);
+    int *vy0 = xd->opfl_vxy_bufs + (N_OF_OFFSETS * 2);
+    int *vy1 = xd->opfl_vxy_bufs + (N_OF_OFFSETS * 3);
+
+    // Pointers to hold gradient and dst buffers.
+    int16_t *gx0 = xd->opfl_gxy_bufs;
+    int16_t *gx1 = xd->opfl_gxy_bufs + (MAX_SB_SQUARE * 1);
+    int16_t *gy0 = xd->opfl_gxy_bufs + (MAX_SB_SQUARE * 2);
+    int16_t *gy1 = xd->opfl_gxy_bufs + (MAX_SB_SQUARE * 3);
 
     // Initialize refined mv
     const MV mv0 = best_mv_ref[0];
@@ -4757,9 +4766,8 @@
 #endif
     // Refine MV using optical flow. The final output MV will be in 1/16
     // precision.
-    dst0 = &dst0_16_refinemv[0];
-    dst1 = &dst1_16_refinemv[0];
-
+    uint16_t *dst0 = xd->opfl_dst_bufs;
+    uint16_t *dst1 = xd->opfl_dst_bufs + MAX_SB_SQUARE;
 #if CONFIG_TIP_REF_PRED_MERGING
     if (tip_ref_frame) {
       use_optflow_refinement = !skip_opfl_refine_with_tip(
@@ -5074,12 +5082,6 @@
       uint16_t
           dst1_16_refinemv[REFINEMV_SUBBLOCK_WIDTH * REFINEMV_SUBBLOCK_HEIGHT];
 #endif  // CONFIG_SUBBLK_REF_EXT
-    DECLARE_ALIGNED(
-        32, int16_t,
-        opt_gx0[2 * REFINEMV_SUBBLOCK_WIDTH * REFINEMV_SUBBLOCK_HEIGHT]);
-    DECLARE_ALIGNED(
-        32, int16_t,
-        opt_gx1[2 * REFINEMV_SUBBLOCK_WIDTH * REFINEMV_SUBBLOCK_HEIGHT]);
 
     ReferenceArea ref_area[2];
 #if !CONFIG_SUBBLK_PAD
@@ -5157,9 +5159,9 @@
 #if CONFIG_AFFINE_REFINEMENT || CONFIG_REFINED_MVS_IN_TMVP
             w, h,
 #endif  // CONFIG_AFFINE_REFINEMENT || CONFIG_REFINED_MVS_IN_TMVP
-            pu_width, pu_height, dst0_16_refinemv, dst1_16_refinemv, opt_gx0,
-            opt_gx1, row_start, col_start, plane == 0 ? luma_refined_mv : NULL,
-            chroma_refined_mv, build_for_refine_mv_only, ref_area, mv_refined);
+            pu_width, pu_height, dst0_16_refinemv, dst1_16_refinemv, row_start,
+            col_start, plane == 0 ? luma_refined_mv : NULL, chroma_refined_mv,
+            build_for_refine_mv_only, ref_area, mv_refined);
 
         if (plane == 0 && !tip_ref_frame) {
 #else
@@ -5173,9 +5175,9 @@
 #if CONFIG_AFFINE_REFINEMENT || CONFIG_REFINED_MVS_IN_TMVP
               w, h,
 #endif  // CONFIG_AFFINE_REFINEMENT || CONFIG_REFINED_MVS_IN_TMVP
-              bw, bh, dst0_16_refinemv, dst1_16_refinemv, opt_gx0, opt_gx1,
-              row_start, col_start, plane == 0 ? luma_refined_mv : NULL,
-              chroma_refined_mv, build_for_refine_mv_only, ref_area);
+              bw, bh, dst0_16_refinemv, dst1_16_refinemv, row_start, col_start,
+              plane == 0 ? luma_refined_mv : NULL, chroma_refined_mv,
+              build_for_refine_mv_only, ref_area);
 
           if (plane == 0) {
 #endif  // CONFIG_TIP_REF_PRED_MERGING
@@ -5300,15 +5302,15 @@
 #endif  // CONFIG_AFFINE_REFINEMENT_SB
 #endif  // CONFIG_AFFINE_REFINEMENT
 
-  // Arrays to hold optical flow offsets.
-  int vx0[N_OF_OFFSETS] = { 0 };
-  int vx1[N_OF_OFFSETS] = { 0 };
-  int vy0[N_OF_OFFSETS] = { 0 };
-  int vy1[N_OF_OFFSETS] = { 0 };
-
   // Pointers to gradient and dst buffers
 
   if (use_optflow_refinement && plane == 0) {
+    // Pointers to hold optical flow MV offsets.
+    int *vx0 = xd->opfl_vxy_bufs;
+    int *vx1 = xd->opfl_vxy_bufs + (N_OF_OFFSETS * 1);
+    int *vy0 = xd->opfl_vxy_bufs + (N_OF_OFFSETS * 2);
+    int *vy1 = xd->opfl_vxy_bufs + (N_OF_OFFSETS * 3);
+
 #if CONFIG_AFFINE_REFINEMENT
     assert(mi->comp_refine_type > COMP_REFINE_NONE);
     assert(IMPLIES(mi->comp_refine_type >= COMP_AFFINE_REFINE_START,
@@ -5327,13 +5329,10 @@
 #endif  // CONFIG_OPTFLOW_ON_TIP
     );
     const int n_blocks = (bw / n) * (bh / n);
-    int16_t *gx0, *gy0, *gx1, *gy1;
-    DECLARE_ALIGNED(32, int16_t, g0_buf[2 * MAX_SB_SQUARE]);
-    DECLARE_ALIGNED(32, int16_t, g1_buf[2 * MAX_SB_SQUARE]);
-    gx0 = g0_buf;
-    gx1 = g1_buf;
-    gy0 = g0_buf + MAX_SB_SQUARE;
-    gy1 = g1_buf + MAX_SB_SQUARE;
+    int16_t *gx0 = xd->opfl_gxy_bufs;
+    int16_t *gx1 = xd->opfl_gxy_bufs + (MAX_SB_SQUARE * 1);
+    int16_t *gy0 = xd->opfl_gxy_bufs + (MAX_SB_SQUARE * 2);
+    int16_t *gy1 = xd->opfl_gxy_bufs + (MAX_SB_SQUARE * 3);
 
     // Initialize refined mv
 #if CONFIG_REFINEMV
@@ -5351,7 +5350,9 @@
 #endif
     // Refine MV using optical flow. The final output MV will be in 1/16
     // precision.
-    uint16_t dst0[MAX_SB_SQUARE], dst1[MAX_SB_SQUARE];
+    uint16_t *dst0 = xd->opfl_dst_bufs;
+    uint16_t *dst1 = xd->opfl_dst_bufs + MAX_SB_SQUARE;
+
 #if CONFIG_TIP_REF_PRED_MERGING
     if (tip_ref_frame) {
       use_optflow_refinement = !skip_opfl_refine_with_tip(
@@ -5387,12 +5388,23 @@
 #endif  // CONFIG_REFINEMV
       );
 #if CONFIG_AFFINE_REFINEMENT || CONFIG_REFINED_MVS_IN_TMVP
-      for (int mvi = 0; mvi < N_OF_OFFSETS; mvi++) {
+      for (int mvi = 0; mvi < n_blocks; mvi++) {
         xd->mv_delta[mvi].mv[0].as_mv.row = vy0[mvi];
         xd->mv_delta[mvi].mv[0].as_mv.col = vx0[mvi];
         xd->mv_delta[mvi].mv[1].as_mv.row = vy1[mvi];
         xd->mv_delta[mvi].mv[1].as_mv.col = vx1[mvi];
       }
+
+      // TODO(any): This reset is required because the MV delta offsets of
+      // optical flow refinement stored in the 'mv_delta' buffer are accessed
+      // beyond n_blocks when 'AFFINE_CHROMA_REFINE_METHOD' is enabled. Recheck
+      // whether access beyond n_blocks of the 'mv_delta' buffer is valid.
+      for (int mvi = n_blocks; mvi < N_OF_OFFSETS; mvi++) {
+        xd->mv_delta[mvi].mv[0].as_mv.row = 0;
+        xd->mv_delta[mvi].mv[0].as_mv.col = 0;
+        xd->mv_delta[mvi].mv[1].as_mv.row = 0;
+        xd->mv_delta[mvi].mv[1].as_mv.col = 0;
+      }
 #endif  // CONFIG_AFFINE_REFINEMENT || CONFIG_REFINED_MVS_IN_TMVP
 #if CONFIG_AFFINE_REFINEMENT_SB
       memcpy(xd->wm_params_sb, wms, 2 * NUM_AFFINE_PARAMS * sizeof(wms[0]));
diff --git a/av1/common/x86/optflow_refine_sse4.c b/av1/common/x86/optflow_refine_sse4.c
index 4c1a689..c517e40 100644
--- a/av1/common/x86/optflow_refine_sse4.c
+++ b/av1/common/x86/optflow_refine_sse4.c
@@ -423,7 +423,13 @@
   suw = ROUND_POWER_OF_TWO_SIGNED_64(suw, redbit);
   svw = ROUND_POWER_OF_TWO_SIGNED_64(svw, redbit);
   const int64_t det = su2 * sv2 - suv * suv;
-  if (det <= 0) return;
+  if (det <= 0) {
+    *vx0 = 0;
+    *vy0 = 0;
+    *vx1 = 0;
+    *vy1 = 0;
+    return;
+  }
 
   int64_t sol[2] = { sv2 * suw - suv * svw, su2 * svw - suv * suw };
 
@@ -432,7 +438,13 @@
   *vy0 = (int)-sol[1];
 #else
   const int64_t det = su2 * sv2 - suv * suv;
-  if (det <= 0) return;
+  if (det <= 0) {
+    *vx0 = 0;
+    *vy0 = 0;
+    *vx1 = 0;
+    *vy1 = 0;
+    return;
+  }
   const int64_t det_x = (suv * svw - sv2 * suw) * (1 << bits);
   const int64_t det_y = (suv * suw - su2 * svw) * (1 << bits);
 
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index c54b4c6..1aced61 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -5235,6 +5235,11 @@
   td->dcb.mc_buf[0] = td->mc_buf[0];
   td->dcb.mc_buf[1] = td->mc_buf[1];
   td->dcb.xd.tmp_conv_dst = td->tmp_conv_dst;
+
+  // Temporary buffers used during the DMVR and OPFL processing.
+  td->dcb.xd.opfl_vxy_bufs = td->opfl_vxy_bufs;
+  td->dcb.xd.opfl_gxy_bufs = td->opfl_gxy_bufs;
+  td->dcb.xd.opfl_dst_bufs = td->opfl_dst_bufs;
   for (int j = 0; j < 2; ++j) {
     td->dcb.xd.tmp_obmc_bufs[j] = td->tmp_obmc_bufs[j];
   }
@@ -5816,6 +5821,35 @@
   }
 }
 
+// Free-up the temporary buffers created for DMVR and OPFL processing.
+void av1_free_opfl_tmp_bufs(ThreadData *thread_data) {
+  aom_free(thread_data->opfl_vxy_bufs);
+  thread_data->opfl_vxy_bufs = NULL;
+
+  aom_free(thread_data->opfl_gxy_bufs);
+  thread_data->opfl_gxy_bufs = NULL;
+
+  aom_free(thread_data->opfl_dst_bufs);
+  thread_data->opfl_dst_bufs = NULL;
+}
+
+// Allocate memory for temporary buffers used during the DMVR and OPFL
+// processing.
+static AOM_INLINE void allocate_opfl_tmp_bufs(AV1_COMMON *const cm,
+                                              ThreadData *thread_data) {
+  CHECK_MEM_ERROR(
+      cm, thread_data->opfl_vxy_bufs,
+      aom_memalign(32, N_OF_OFFSETS * 4 * sizeof(*thread_data->opfl_vxy_bufs)));
+
+  CHECK_MEM_ERROR(cm, thread_data->opfl_gxy_bufs,
+                  aom_memalign(32, MAX_SB_SQUARE * 4 *
+                                       sizeof(*thread_data->opfl_gxy_bufs)));
+
+  CHECK_MEM_ERROR(cm, thread_data->opfl_dst_bufs,
+                  aom_memalign(32, MAX_SB_SQUARE * 2 *
+                                       sizeof(*thread_data->opfl_dst_bufs)));
+}
+
 static AOM_INLINE void allocate_mc_tmp_buf(AV1_COMMON *const cm,
                                            ThreadData *thread_data,
                                            int buf_size) {
@@ -5857,6 +5891,11 @@
     thread_data->td->dcb.mc_buf[0] = thread_data->td->mc_buf[0];
     thread_data->td->dcb.mc_buf[1] = thread_data->td->mc_buf[1];
     thread_data->td->dcb.xd.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+    // Temporary buffers used during the DMVR and OPFL processing.
+    thread_data->td->dcb.xd.opfl_vxy_bufs = thread_data->td->opfl_vxy_bufs;
+    thread_data->td->dcb.xd.opfl_gxy_bufs = thread_data->td->opfl_gxy_bufs;
+    thread_data->td->dcb.xd.opfl_dst_bufs = thread_data->td->opfl_dst_bufs;
+
     for (int j = 0; j < 2; ++j) {
       thread_data->td->dcb.xd.tmp_obmc_bufs[j] =
           thread_data->td->tmp_obmc_bufs[j];
@@ -5954,7 +5993,10 @@
     DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
     if (thread_data->td->mc_buf_size != buf_size) {
       av1_free_mc_tmp_buf(thread_data->td);
+      av1_free_opfl_tmp_bufs(thread_data->td);
+
       allocate_mc_tmp_buf(cm, thread_data->td, buf_size);
+      allocate_opfl_tmp_bufs(cm, thread_data->td);
     }
   }
 }
@@ -8537,7 +8579,10 @@
   const int buf_size = MC_TEMP_BUF_PELS << 1;
   if (pbi->td.mc_buf_size != buf_size) {
     av1_free_mc_tmp_buf(&pbi->td);
+    av1_free_opfl_tmp_bufs(&pbi->td);
+
     allocate_mc_tmp_buf(cm, &pbi->td, buf_size);
+    allocate_opfl_tmp_bufs(cm, &pbi->td);
   }
 }
 
diff --git a/av1/decoder/decodeframe.h b/av1/decoder/decodeframe.h
index b01807c..8ef0422 100644
--- a/av1/decoder/decodeframe.h
+++ b/av1/decoder/decodeframe.h
@@ -94,6 +94,7 @@
     const uint8_t *data_end);
 
 void av1_free_mc_tmp_buf(struct ThreadData *thread_data);
+void av1_free_opfl_tmp_bufs(struct ThreadData *thread_data);
 
 void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm);
 
diff --git a/av1/decoder/decoder.c b/av1/decoder/decoder.c
index c230bbf..1189b76 100644
--- a/av1/decoder/decoder.c
+++ b/av1/decoder/decoder.c
@@ -289,6 +289,7 @@
     for (int worker_idx = 0; worker_idx < pbi->max_threads - 1; worker_idx++) {
       DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
       av1_free_mc_tmp_buf(thread_data->td);
+      av1_free_opfl_tmp_bufs(thread_data->td);
       aom_free(thread_data->td);
     }
     aom_free(pbi->thread_data);
@@ -331,6 +332,7 @@
   aom_accounting_clear(&pbi->accounting);
 #endif
   av1_free_mc_tmp_buf(&pbi->td);
+  av1_free_opfl_tmp_bufs(&pbi->td);
   aom_img_metadata_array_free(pbi->metadata);
 
 #if DEBUG_EXTQUANT
diff --git a/av1/decoder/decoder.h b/av1/decoder/decoder.h
index c9c3637..b100eb2 100644
--- a/av1/decoder/decoder.h
+++ b/av1/decoder/decoder.h
@@ -133,6 +133,13 @@
   int32_t mc_buf_size;
 
   CONV_BUF_TYPE *tmp_conv_dst;
+  // Temporary buffers used to store the OPFL MV offsets.
+  int *opfl_vxy_bufs;
+  // Temporary buffers used to store the OPFL gradient information.
+  int16_t *opfl_gxy_bufs;
+  // Temporary buffers used to store intermediate prediction data calculated
+  // during the OPFL/DMVR.
+  uint16_t *opfl_dst_bufs;
   uint16_t *tmp_obmc_bufs[2];
 
   decode_block_visitor_fn_t read_coeffs_tx_intra_block_visit;
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 4165df3..d8f0cec 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -1559,6 +1559,13 @@
   //! Buffer to store convolution during averaging process in compound mode.
   CONV_BUF_TYPE *tmp_conv_dst;
 
+  //! Temporary buffers used to store the OPFL MV offsets.
+  int *opfl_vxy_bufs;
+  //! Temporary buffers used to store the OPFL gradient information.
+  int16_t *opfl_gxy_bufs;
+  //! Temporary buffers used to store intermediate prediction data calculated
+  //! during the OPFL/DMVR.
+  uint16_t *opfl_dst_bufs;
   /*! \brief Temporary buffer to hold prediction.
    *
    * Points to a buffer that is used to hold temporary prediction results. This
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 35bc024..00d2417 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -833,6 +833,27 @@
         aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*x->tmp_conv_dst)));
     x->e_mbd.tmp_conv_dst = x->tmp_conv_dst;
   }
+
+  // Temporary buffers used during the DMVR and OPFL processing.
+  if (x->opfl_vxy_bufs == NULL) {
+    CHECK_MEM_ERROR(
+        cm, x->opfl_vxy_bufs,
+        aom_memalign(32, N_OF_OFFSETS * 4 * sizeof(*x->opfl_vxy_bufs)));
+    x->e_mbd.opfl_vxy_bufs = x->opfl_vxy_bufs;
+  }
+  if (x->opfl_gxy_bufs == NULL) {
+    CHECK_MEM_ERROR(
+        cm, x->opfl_gxy_bufs,
+        aom_memalign(32, MAX_SB_SQUARE * 4 * sizeof(*x->opfl_gxy_bufs)));
+    x->e_mbd.opfl_gxy_bufs = x->opfl_gxy_bufs;
+  }
+  if (x->opfl_dst_bufs == NULL) {
+    CHECK_MEM_ERROR(
+        cm, x->opfl_dst_bufs,
+        aom_memalign(32, MAX_SB_SQUARE * 2 * sizeof(*x->opfl_dst_bufs)));
+    x->e_mbd.opfl_dst_bufs = x->opfl_dst_bufs;
+  }
+
   for (int i = 0; i < 2; ++i) {
     if (x->tmp_pred_bufs[i] == NULL) {
       CHECK_MEM_ERROR(cm, x->tmp_pred_bufs[i],
@@ -1325,6 +1346,11 @@
     if (t == 0) continue;
     aom_free(thread_data->td->palette_buffer);
     aom_free(thread_data->td->tmp_conv_dst);
+
+    // Temporary buffers used during the DMVR and OPFL processing.
+    aom_free(thread_data->td->opfl_vxy_bufs);
+    aom_free(thread_data->td->opfl_gxy_bufs);
+    aom_free(thread_data->td->opfl_dst_bufs);
     release_compound_type_rd_buffers(&thread_data->td->comp_rd_buffer);
     for (int j = 0; j < 2; ++j) {
       aom_free(thread_data->td->tmp_pred_bufs[j]);
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index b5f78aa..aae391e 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -1960,6 +1960,13 @@
   PALETTE_BUFFER *palette_buffer;
   CompoundTypeRdBuffers comp_rd_buffer;
   CONV_BUF_TYPE *tmp_conv_dst;
+  // Temporary buffers used to store the OPFL MV offsets.
+  int *opfl_vxy_bufs;
+  // Temporary buffers used to store the OPFL gradient information.
+  int16_t *opfl_gxy_bufs;
+  // Temporary buffers used to store intermediate prediction data calculated
+  // during the OPFL/DMVR.
+  uint16_t *opfl_dst_bufs;
   uint16_t *tmp_pred_bufs[2];
   int intrabc_used;
   int deltaq_used;
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
index 4b875ce..f563303 100644
--- a/av1/encoder/encoder_alloc.h
+++ b/av1/encoder/encoder_alloc.h
@@ -255,6 +255,11 @@
   aom_free(cpi->td.mb.palette_buffer);
   release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer);
   aom_free(cpi->td.mb.tmp_conv_dst);
+
+  // Temporary buffers used during the DMVR and OPFL processing.
+  aom_free(cpi->td.mb.opfl_vxy_bufs);
+  aom_free(cpi->td.mb.opfl_gxy_bufs);
+  aom_free(cpi->td.mb.opfl_dst_bufs);
   for (int j = 0; j < 2; ++j) {
     aom_free(cpi->td.mb.tmp_pred_bufs[j]);
   }
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 589eb38..6d052b6 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -585,6 +585,20 @@
           cm, thread_data->td->tmp_conv_dst,
           aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
                                sizeof(*thread_data->td->tmp_conv_dst)));
+      // Temporary buffers used during the DMVR and OPFL processing.
+      CHECK_MEM_ERROR(
+          cm, thread_data->td->opfl_vxy_bufs,
+          aom_memalign(
+              32, N_OF_OFFSETS * 4 * sizeof(*thread_data->td->opfl_vxy_bufs)));
+      CHECK_MEM_ERROR(
+          cm, thread_data->td->opfl_gxy_bufs,
+          aom_memalign(
+              32, MAX_SB_SQUARE * 4 * sizeof(*thread_data->td->opfl_gxy_bufs)));
+      CHECK_MEM_ERROR(
+          cm, thread_data->td->opfl_dst_bufs,
+          aom_memalign(
+              32, MAX_SB_SQUARE * 2 * sizeof(*thread_data->td->opfl_dst_bufs)));
+
       for (int j = 0; j < 2; ++j) {
         CHECK_MEM_ERROR(
             cm, thread_data->td->tmp_pred_bufs[j],
@@ -805,12 +819,25 @@
       thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
       thread_data->td->mb.comp_rd_buffer = thread_data->td->comp_rd_buffer;
       thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+      // Temporary buffers used during the DMVR and OPFL processing.
+      thread_data->td->mb.opfl_vxy_bufs = thread_data->td->opfl_vxy_bufs;
+      thread_data->td->mb.opfl_gxy_bufs = thread_data->td->opfl_gxy_bufs;
+      thread_data->td->mb.opfl_dst_bufs = thread_data->td->opfl_dst_bufs;
+
       for (int j = 0; j < 2; ++j) {
         thread_data->td->mb.tmp_pred_bufs[j] =
             thread_data->td->tmp_pred_bufs[j];
       }
 
       thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+      // Temporary buffers used during the DMVR and OPFL processing.
+      thread_data->td->mb.e_mbd.opfl_vxy_bufs =
+          thread_data->td->mb.opfl_vxy_bufs;
+      thread_data->td->mb.e_mbd.opfl_gxy_bufs =
+          thread_data->td->mb.opfl_gxy_bufs;
+      thread_data->td->mb.e_mbd.opfl_dst_bufs =
+          thread_data->td->mb.opfl_dst_bufs;
+
       for (int j = 0; j < 2; ++j) {
         thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] =
             thread_data->td->mb.tmp_pred_bufs[j];