Move allocation of pred buffer in upsampled_pref_error()

This patch moves the pred buffer used by 'upsampled_pref_error()' from stack memory to a heap allocation referenced from the MACROBLOCK structure. This improves the function's performance with the GCC compiler due to better stack memory management.

This is a bit-exact change and gives a 1% - 3% encoder time reduction with the GCC compiler (a higher speedup is seen for low QPs).

The performance impact with the Clang compiler is negligible.
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index f82109f..5337f00 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -2644,6 +2644,12 @@
    * 'cpi->tile_thr_data[t].td->mb.tmp_pred_bufs'.
    */
   uint16_t *tmp_obmc_bufs[2];
+
+  /*!
+   * Temporary buffer used for upsampled prediction.
+   */
+  uint16_t *tmp_upsample_pred;
+
   /*!
    * Enable IST for current coding block.
    */
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index cfa29b7..7204a93 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -1591,6 +1591,11 @@
    *   prediction.
    */
   uint16_t *tmp_pred_bufs[2];
+
+  /*!
+   * Buffer used for upsampled prediction.
+   */
+  uint16_t *upsample_pred;
   /**@}*/
 
   /*****************************************************************************
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 00d2417..542921b 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -834,6 +834,13 @@
     x->e_mbd.tmp_conv_dst = x->tmp_conv_dst;
   }
 
+  if (x->upsample_pred == NULL) {
+    CHECK_MEM_ERROR(
+        cm, x->upsample_pred,
+        aom_memalign(16, MAX_SB_SQUARE * sizeof(*x->upsample_pred)));
+    x->e_mbd.tmp_upsample_pred = x->upsample_pred;
+  }
+
   // Temporary buffers used during the DMVR and OPFL processing.
   if (x->opfl_vxy_bufs == NULL) {
     CHECK_MEM_ERROR(
@@ -1346,6 +1353,7 @@
     if (t == 0) continue;
     aom_free(thread_data->td->palette_buffer);
     aom_free(thread_data->td->tmp_conv_dst);
+    aom_free(thread_data->td->upsample_pred);
 
     // Temporary buffers used during the DMVR and OPFL processing.
     aom_free(thread_data->td->opfl_vxy_bufs);
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index aae391e..a8481bd 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -1968,6 +1968,8 @@
   // during the OPFL/DMVR.
   uint16_t *opfl_dst_bufs;
   uint16_t *tmp_pred_bufs[2];
+  // Buffer used for upsampled prediction.
+  uint16_t *upsample_pred;
   int intrabc_used;
   int deltaq_used;
   FRAME_CONTEXT *tctx;
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
index f563303..c3fbbe8 100644
--- a/av1/encoder/encoder_alloc.h
+++ b/av1/encoder/encoder_alloc.h
@@ -255,6 +255,7 @@
   aom_free(cpi->td.mb.palette_buffer);
   release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer);
   aom_free(cpi->td.mb.tmp_conv_dst);
+  aom_free(cpi->td.mb.upsample_pred);
 
   // Temporary buffers used during the DMVR and OPFL processing.
   aom_free(cpi->td.mb.opfl_vxy_bufs);
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 6d052b6..4541889 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -599,6 +599,11 @@
           aom_memalign(
               32, MAX_SB_SQUARE * 2 * sizeof(*thread_data->td->opfl_dst_bufs)));
 
+      CHECK_MEM_ERROR(
+          cm, thread_data->td->upsample_pred,
+          aom_memalign(
+              16, MAX_SB_SQUARE * sizeof(*thread_data->td->upsample_pred)));
+
       for (int j = 0; j < 2; ++j) {
         CHECK_MEM_ERROR(
             cm, thread_data->td->tmp_pred_bufs[j],
@@ -819,6 +824,7 @@
       thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
       thread_data->td->mb.comp_rd_buffer = thread_data->td->comp_rd_buffer;
       thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+      thread_data->td->mb.upsample_pred = thread_data->td->upsample_pred;
       // Temporary buffers used during the DMVR and OPFL processing.
       thread_data->td->mb.opfl_vxy_bufs = thread_data->td->opfl_vxy_bufs;
       thread_data->td->mb.opfl_gxy_bufs = thread_data->td->opfl_gxy_bufs;
@@ -830,6 +836,8 @@
       }
 
       thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+      thread_data->td->mb.e_mbd.tmp_upsample_pred =
+          thread_data->td->mb.upsample_pred;
       // Temporary buffers used during the DMVR and OPFL processing.
       thread_data->td->mb.e_mbd.opfl_vxy_bufs =
           thread_data->td->mb.opfl_vxy_bufs;
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 307da01..7a493d8 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -3347,6 +3347,7 @@
   const uint16_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
   const int src_stride = ms_buffers->src->stride;
   const int ref_stride = ms_buffers->ref->stride;
+  uint16_t *pred = xd->tmp_upsample_pred;
   const uint16_t *second_pred = ms_buffers->second_pred;
   const uint8_t *mask = ms_buffers->mask;
   const int mask_stride = ms_buffers->mask_stride;
@@ -3363,7 +3364,6 @@
   const int is_scaled_ref = ms_buffers->src->width == ms_buffers->ref->width &&
                             ms_buffers->src->height == ms_buffers->ref->height;
 
-  DECLARE_ALIGNED(16, uint16_t, pred[MAX_SB_SQUARE]);
   if (second_pred != NULL) {
     if (mask) {
       aom_highbd_comp_mask_upsampled_pred(