Allocate scaled source buffers on the fly

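The scaled source buffers are no longer allocated unconditionally up
front. They are now allocated/reallocated on the fly, based on the
target dimensions, only when scaling is actually required.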

For AVIF image encode with speed = 9,

             Heap memory reduction (%)
Resolution   threads=1    threads=4
640x360        4.49         2.71
768x512        4.26         2.89
832x480        4.49         3.12
1280x720       4.51         3.59

For threads=4, an average encode time reduction of ~1.12% is
observed for 360p-720p resolutions.

Heap memory reduction was measured using the following command:
$ valgrind --tool=massif ./avifenc ...

Change-Id: I7faf035207ffa9dfab5d57ff8db4ae70b8534d6c
diff --git a/aom_scale/generic/yv12config.c b/aom_scale/generic/yv12config.c
index 6ddc82f..dedfc02 100644
--- a/aom_scale/generic/yv12config.c
+++ b/aom_scale/generic/yv12config.c
@@ -185,7 +185,7 @@
    * between planes, which would break the semantics of things like
    * aom_img_set_rect(). */
   if (border & 0x1f) return AOM_CODEC_MEM_ERROR;
-  *y_stride = ((aligned_width + 2 * border) + 31) & ~31;
+  *y_stride = aom_calc_y_stride(aligned_width, border);
   *yplane_size =
       (aligned_height + 2 * border) * (uint64_t)(*y_stride) + byte_alignment;
 
diff --git a/aom_scale/yv12config.h b/aom_scale/yv12config.h
index c0e0361..31af69c 100644
--- a/aom_scale/yv12config.h
+++ b/aom_scale/yv12config.h
@@ -167,6 +167,18 @@
 int aom_copy_metadata_to_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                                       const aom_metadata_array_t *arr);
 
+/*!\brief Calculate the stride required for the image.
+ *
+ * Calculates the y stride for an image: the aligned width plus a border on
+ * each side, rounded up to a multiple of 32. Returns the y stride value.
+ *
+ * \param[in]    aligned_width       Aligned width of the image
+ * \param[in]    border              Border in pixels
+ */
+static AOM_INLINE int aom_calc_y_stride(int aligned_width, int border) {
+  return ((aligned_width + 2 * border) + 31) & ~31;
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/av1/common/resize.c b/av1/common/resize.c
index 112a08a..a3c3c0e 100644
--- a/av1/common/resize.c
+++ b/av1/common/resize.c
@@ -1338,21 +1338,34 @@
   aom_extend_frame_borders(dst, num_planes);
 }
 
-YV12_BUFFER_CONFIG *av1_scale_if_required(
+YV12_BUFFER_CONFIG *av1_realloc_and_scale_if_required(
     AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
     const InterpFilter filter, const int phase, const bool use_optimized_scaler,
-    const bool for_psnr) {
+    const bool for_psnr, const int border_in_pixels,
+    const bool alloc_y_buffer_8bit) {
   // If scaling is performed for the sole purpose of calculating PSNR, then our
   // target dimensions are superres upscaled width/height. Otherwise our target
   // dimensions are coded width/height.
-  const bool scaling_required =
-      for_psnr ? (cm->superres_upscaled_width != unscaled->y_crop_width ||
-                  cm->superres_upscaled_height != unscaled->y_crop_height)
-               : (cm->width != unscaled->y_crop_width ||
-                  cm->height != unscaled->y_crop_height);
+  const int scaled_width = for_psnr ? cm->superres_upscaled_width : cm->width;
+  const int scaled_height =
+      for_psnr ? cm->superres_upscaled_height : cm->height;
+  const bool scaling_required = (scaled_width != unscaled->y_crop_width) ||
+                                (scaled_height != unscaled->y_crop_height);
 
   if (scaling_required) {
     const int num_planes = av1_num_planes(cm);
+    const SequenceHeader *seq_params = cm->seq_params;
+
+    // Reallocate the frame buffer based on the target dimensions when scaling
+    // is required.
+    if (aom_realloc_frame_buffer(
+            scaled, scaled_width, scaled_height, seq_params->subsampling_x,
+            seq_params->subsampling_y, seq_params->use_highbitdepth,
+            border_in_pixels, cm->features.byte_alignment, NULL, NULL, NULL,
+            alloc_y_buffer_8bit))
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate scaled buffer");
+
 #if CONFIG_AV1_HIGHBITDEPTH
     if (use_optimized_scaler && cm->seq_params->bit_depth == AOM_BITS_8) {
       av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes);
diff --git a/av1/common/resize.h b/av1/common/resize.h
index b08de80..75abe62 100644
--- a/av1/common/resize.h
+++ b/av1/common/resize.h
@@ -71,10 +71,11 @@
                                             const YV12_BUFFER_CONFIG *src,
                                             YV12_BUFFER_CONFIG *dst);
 
-YV12_BUFFER_CONFIG *av1_scale_if_required(
+YV12_BUFFER_CONFIG *av1_realloc_and_scale_if_required(
     AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
     const InterpFilter filter, const int phase, const bool use_optimized_scaler,
-    const bool for_psnr);
+    const bool for_psnr, const int border_in_pixels,
+    const bool alloc_y_buffer_8bit);
 
 void av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
                                               YV12_BUFFER_CONFIG *dst, int bd,
diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c
index 6dda945..547c71e 100644
--- a/av1/encoder/encode_strategy.c
+++ b/av1/encoder/encode_strategy.c
@@ -1067,9 +1067,10 @@
 
   // Set frame_input source to true source for psnr calculation.
   if (apply_filtering && is_psnr_calc_enabled(cpi)) {
-    cpi->source =
-        av1_scale_if_required(cm, source_buffer, &cpi->scaled_source,
-                              cm->features.interp_filter, 0, false, true);
+    cpi->source = av1_realloc_and_scale_if_required(
+        cm, source_buffer, &cpi->scaled_source, cm->features.interp_filter, 0,
+        false, true, cpi->oxcf.border_in_pixels,
+        cpi->oxcf.tool_cfg.enable_global_motion);
     cpi->unscaled_source = source_buffer;
   }
 #if CONFIG_COLLECT_COMPONENT_TIMING
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index d6a60db..7c4d1e5 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -1878,14 +1878,16 @@
 static void init_motion_estimation(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
   MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params;
-  const int y_stride = cpi->scaled_source.y_stride;
+  const int aligned_width = (cm->width + 7) & ~7;
+  const int y_stride =
+      aom_calc_y_stride(aligned_width, cpi->oxcf.border_in_pixels);
   const int y_stride_src = ((cpi->oxcf.frm_dim_cfg.width != cm->width ||
                              cpi->oxcf.frm_dim_cfg.height != cm->height) ||
                             av1_superres_scaled(cm))
                                ? y_stride
                                : cpi->ppi->lookahead->buf->img.y_stride;
-  int fpf_y_stride = cm->cur_frame != NULL ? cm->cur_frame->buf.y_stride
-                                           : cpi->scaled_source.y_stride;
+  int fpf_y_stride =
+      cm->cur_frame != NULL ? cm->cur_frame->buf.y_stride : y_stride;
 
   // Update if search_site_cfg is uninitialized or the current frame has a new
   // stride
@@ -2331,8 +2333,10 @@
   }
 #endif
 
-  cpi->source = av1_scale_if_required(cm, unscaled, &cpi->scaled_source,
-                                      filter_scaler, phase_scaler, true, false);
+  cpi->source = av1_realloc_and_scale_if_required(
+      cm, unscaled, &cpi->scaled_source, filter_scaler, phase_scaler, true,
+      false, cpi->oxcf.border_in_pixels,
+      cpi->oxcf.tool_cfg.enable_global_motion);
   if (frame_is_intra_only(cm) || resize_pending != 0) {
     memset(cpi->consec_zero_mv, 0,
            ((cm->mi_params.mi_rows * cm->mi_params.mi_cols) >> 2) *
@@ -2340,9 +2344,10 @@
   }
 
   if (cpi->unscaled_last_source != NULL) {
-    cpi->last_source = av1_scale_if_required(
+    cpi->last_source = av1_realloc_and_scale_if_required(
         cm, cpi->unscaled_last_source, &cpi->scaled_last_source, filter_scaler,
-        phase_scaler, true, false);
+        phase_scaler, true, false, cpi->oxcf.border_in_pixels,
+        cpi->oxcf.tool_cfg.enable_global_motion);
   }
 
   if (cpi->sf.rt_sf.use_temporal_noise_estimate) {
@@ -2567,9 +2572,10 @@
         gm_info->search_done = 0;
       }
     }
-    cpi->source =
-        av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source,
-                              EIGHTTAP_REGULAR, 0, false, false);
+    cpi->source = av1_realloc_and_scale_if_required(
+        cm, cpi->unscaled_source, &cpi->scaled_source, EIGHTTAP_REGULAR, 0,
+        false, false, cpi->oxcf.border_in_pixels,
+        cpi->oxcf.tool_cfg.enable_global_motion);
 
 #if CONFIG_TUNE_BUTTERAUGLI
     if (oxcf->tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
@@ -2586,9 +2592,10 @@
 #endif
 
     if (cpi->unscaled_last_source != NULL) {
-      cpi->last_source = av1_scale_if_required(
+      cpi->last_source = av1_realloc_and_scale_if_required(
           cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
-          EIGHTTAP_REGULAR, 0, false, false);
+          EIGHTTAP_REGULAR, 0, false, false, cpi->oxcf.border_in_pixels,
+          cpi->oxcf.tool_cfg.enable_global_motion);
     }
 
 #if CONFIG_FRAME_PARALLEL_ENCODE
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
index 0d43a98..8f5c3a8 100644
--- a/av1/encoder/encoder_alloc.h
+++ b/av1/encoder/encoder_alloc.h
@@ -372,31 +372,6 @@
       aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                          "Failed to allocate trial restored frame buffer");
   }
-
-  if (aom_realloc_frame_buffer(
-          &cpi->scaled_source, cm->width, cm->height, seq_params->subsampling_x,
-          seq_params->subsampling_y, seq_params->use_highbitdepth,
-          cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL,
-          cpi->oxcf.tool_cfg.enable_global_motion))
-    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
-                       "Failed to allocate scaled source buffer");
-
-  // The frame buffer cpi->scaled_last_source is used to hold the previous
-  // source frame information. As the previous source frame buffer allocation in
-  // the lookahead queue is avoided for all-intra frame encoding,
-  // cpi->unscaled_last_source will be NULL in such cases. As
-  // cpi->unscaled_last_source is NULL, cpi->scaled_last_source will not be used
-  // for all-intra frame encoding. Hence, the buffer is allocated conditionally.
-  if (cpi->oxcf.kf_cfg.key_freq_max > 0) {
-    if (aom_realloc_frame_buffer(
-            &cpi->scaled_last_source, cm->width, cm->height,
-            seq_params->subsampling_x, seq_params->subsampling_y,
-            seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
-            byte_alignment, NULL, NULL, NULL,
-            cpi->oxcf.tool_cfg.enable_global_motion))
-      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
-                         "Failed to allocate scaled last source buffer");
-  }
 }
 
 static AOM_INLINE YV12_BUFFER_CONFIG *realloc_and_scale_source(
diff --git a/av1/encoder/encoder_utils.c b/av1/encoder/encoder_utils.c
index 66cd272..3892e86 100644
--- a/av1/encoder/encoder_utils.c
+++ b/av1/encoder/encoder_utils.c
@@ -979,13 +979,15 @@
 
   // Setup necessary params for encoding, including frame source, etc.
 
-  cpi->source =
-      av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source,
-                            cm->features.interp_filter, 0, false, false);
+  cpi->source = av1_realloc_and_scale_if_required(
+      cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter,
+      0, false, false, cpi->oxcf.border_in_pixels,
+      cpi->oxcf.tool_cfg.enable_global_motion);
   if (cpi->unscaled_last_source != NULL) {
-    cpi->last_source = av1_scale_if_required(
+    cpi->last_source = av1_realloc_and_scale_if_required(
         cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
-        cm->features.interp_filter, 0, false, false);
+        cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels,
+        cpi->oxcf.tool_cfg.enable_global_motion);
   }
 
   av1_setup_frame(cpi);