Use source buffer from last gop for tpl

This should benefit tpl data for ext RC.

It also slightly helps libaom's built-in RC. Test results for q mode, speed 1, lag-in-frames of 48, and 4 tiles:

                       PSNR1411    Vmaf-neg    uvq-1.5
hdres2:      mean:     -0.264%     -0.185%     -0.622%
             median:   -0.309%     -0.141%     -0.609%
shorts_720p: mean:     -0.098%     -0.092%     13.157% (non-monotonic outliers)
             median:   -0.092%     -0.114%     -0.436%

STATS_CHANGED

Change-Id: I92c8c9e98598351e6d647f8460a1e99acbd3058d
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index ed4cdb4..aca5706 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -1750,6 +1750,8 @@
   for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
     aom_free(tpl_data->tpl_stats_pool[frame]);
     aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]);
+    aom_free_frame_buffer(&tpl_data->prev_gop_arf_src);
+    tpl_data->prev_gop_arf_disp_order = -1;
     tpl_data->tpl_stats_pool[frame] = NULL;
   }
 
@@ -4543,6 +4545,46 @@
       cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index],
       current_frame->display_order_hint, cpi->ppi->gf_group.max_layer_depth);
 
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  // Check if this is the last frame in the gop. If so, make a copy of the
+  // source for TPL.
+  if (gf_group->update_type[cpi->gf_frame_index] != OVERLAY_UPDATE &&
+      gf_group->update_type[cpi->gf_frame_index] != INTNL_OVERLAY_UPDATE) {
+    int is_last = 1;
+    for (int i = 0; i < gf_group->size; ++i) {
+      if (gf_group->display_idx[i] >
+          (int64_t)current_frame->display_order_hint) {
+        is_last = 0;
+        break;
+      }
+    }
+    if (is_last) {
+      cpi->ppi->tpl_data.prev_gop_arf_disp_order = -1;
+      const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+      int ret = aom_realloc_frame_buffer(
+          &cpi->ppi->tpl_data.prev_gop_arf_src, oxcf->frm_dim_cfg.width,
+          oxcf->frm_dim_cfg.height, cm->seq_params->subsampling_x,
+          cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth,
+          cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
+          NULL, cpi->alloc_pyramid, 0);
+      if (ret)
+        aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate tpl prev_gop_arf_src buf.");
+
+      // Resized source/reference frames are not currently supported.
+      if (cpi->source->y_width == cpi->ppi->tpl_data.prev_gop_arf_src.y_width &&
+          cpi->source->y_height ==
+              cpi->ppi->tpl_data.prev_gop_arf_src.y_height) {
+        // Copy the content from source to this buffer for next gop.
+        aom_yv12_copy_frame(cpi->source, &cpi->ppi->tpl_data.prev_gop_arf_src,
+                            av1_num_planes(cm));
+
+        cpi->ppi->tpl_data.prev_gop_arf_disp_order =
+            current_frame->display_order_hint;
+      }
+    }
+  }
+
   if (is_stat_generation_stage(cpi)) {
 #if !CONFIG_REALTIME_ONLY
     if (cpi->oxcf.q_cfg.use_fixed_qp_offsets)
diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c
index 9cd5225..3c57d48 100644
--- a/av1/encoder/pass2_strategy.c
+++ b/av1/encoder/pass2_strategy.c
@@ -3837,6 +3837,8 @@
     frame_params->frame_type = KEY_FRAME;
     find_next_key_frame(cpi, &this_frame);
     this_frame = this_frame_copy;
+    // Mark prev gop arf source as unusable
+    cpi->ppi->tpl_data.prev_gop_arf_disp_order = -1;
   }
 
   if (rc->frames_to_fwd_kf <= 0)
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index fca14fd..b299930 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -200,6 +200,16 @@
       aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
                          "Failed to allocate frame buffer");
   }
+
+  if (aom_alloc_frame_buffer(
+          &tpl_data->prev_gop_arf_src, width, height, seq_params->subsampling_x,
+          seq_params->subsampling_y, seq_params->use_highbitdepth,
+          tpl_data->border_in_pixels, byte_alignment, false,
+          alloc_y_plane_only))
+    aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate prev gop arf buffer");
+
+  tpl_data->prev_gop_arf_disp_order = -1;
 }
 
 static inline int32_t tpl_get_satd_cost(BitDepthInfo bd_info, int16_t *src_diff,
@@ -1591,7 +1601,12 @@
       tpl_data->tpl_frame[-i - 1].rec_picture = NULL;
       tpl_data->tpl_frame[-i - 1].frame_display_index = 0;
     } else {
-      tpl_data->tpl_frame[-i - 1].gf_picture = &cm->ref_frame_map[i]->buf;
+      if (cm->ref_frame_map[i]->display_order_hint ==
+          tpl_data->prev_gop_arf_disp_order) {
+        tpl_data->tpl_frame[-i - 1].gf_picture = &tpl_data->prev_gop_arf_src;
+      } else {
+        tpl_data->tpl_frame[-i - 1].gf_picture = &cm->ref_frame_map[i]->buf;
+      }
       tpl_data->tpl_frame[-i - 1].rec_picture = &cm->ref_frame_map[i]->buf;
       tpl_data->tpl_frame[-i - 1].frame_display_index =
           cm->ref_frame_map[i]->display_order_hint;
diff --git a/av1/encoder/tpl_model.h b/av1/encoder/tpl_model.h
index 264bc09..8fba0cf 100644
--- a/av1/encoder/tpl_model.h
+++ b/av1/encoder/tpl_model.h
@@ -237,6 +237,16 @@
   const YV12_BUFFER_CONFIG *ref_frame[INTER_REFS_PER_FRAME];
 
   /*!
+   * Copy of the source frame of the previous GOP's last frame, kept for TPL.
+   */
+  YV12_BUFFER_CONFIG prev_gop_arf_src;
+
+  /*!
+   * Display order of the previous GOP's last frame, or -1 if the copy is unusable.
+   */
+  int64_t prev_gop_arf_disp_order;
+
+  /*!
    * Parameters related to synchronization for top-right dependency in row based
    * multi-threading of tpl
    */