Optimize memory in tpl frame buffer

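Reduce the border (padding) of the tpl frame buffers from
AOM_ENC_NO_SCALE_BORDER (160 pixels) to the new AOM_ENC_TPL_FRAME_BORDER
(32 pixels). The value is stored in the new border_in_pixels field of
TplParams, and the tpl motion vector row/column limits are now derived
from that field instead of cpi->oxcf.border_in_pixels.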

Resolution    Tile    Memory reduction
                      Single   Multi
                      thread   thread
640x360       2x1      ~13%     ~13% (2 threads)
832x480       2x1      ~11%     ~11% (2 threads)
1280x720      2x2      ~10%      ~9% (4 threads)
1920x1080     4x2       ~7%      ~7% (8 threads)

Command used to measure memory usage:
$ command time -v ./aomenc ...

Change-Id: Ib51808f7d84f88d440db0f7fd8f3033abae7ce65
diff --git a/aom_scale/yv12config.h b/aom_scale/yv12config.h
index b183e8a..b40edec 100644
--- a/aom_scale/yv12config.h
+++ b/aom_scale/yv12config.h
@@ -27,6 +27,7 @@
 #define AOM_INTERP_EXTEND 4
 #define AOM_BORDER_IN_PIXELS 288
 #define AOM_ENC_NO_SCALE_BORDER 160
+#define AOM_ENC_TPL_FRAME_BORDER 32
 #define AOM_DEC_BORDER_IN_PIXELS 64
 
 typedef struct yv12_buffer_config {
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
index 57331e0..18b8c7b 100644
--- a/av1/encoder/encoder_alloc.h
+++ b/av1/encoder/encoder_alloc.h
@@ -113,6 +113,7 @@
   set_tpl_stats_block_size(cm->width, cm->height,
                            &tpl_data->tpl_stats_block_mis_log2);
   const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+  tpl_data->border_in_pixels = AOM_ENC_TPL_FRAME_BORDER;
 
   for (int frame = 0; frame < MAX_LENGTH_TPL_FRAME_STATS; ++frame) {
     const int mi_cols =
@@ -138,7 +139,7 @@
     if (aom_alloc_frame_buffer(
             &tpl_data->tpl_rec_pool[frame], cm->width, cm->height,
             cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
-            cm->seq_params.use_highbitdepth, AOM_ENC_NO_SCALE_BORDER,
+            cm->seq_params.use_highbitdepth, tpl_data->border_in_pixels,
             cm->features.byte_alignment))
       aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                          "Failed to allocate frame buffer");
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index 84975b2..8777f67 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -846,7 +846,7 @@
 
     // Motion estimation column boundary
     av1_set_mv_col_limits(mi_params, &x->mv_limits, mi_col, mi_width,
-                          cpi->oxcf.border_in_pixels);
+                          tpl_data->border_in_pixels);
     xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE);
     xd->mb_to_right_edge =
         GET_MV_SUBPEL(mi_params->mi_cols - mi_width - mi_col);
@@ -873,7 +873,7 @@
   for (int mi_row = 0; mi_row < mi_params->mi_rows; mi_row += mi_height) {
     // Motion estimation row boundary
     av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height,
-                          cpi->oxcf.border_in_pixels);
+                          cpi->tpl_data.border_in_pixels);
     xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
     xd->mb_to_bottom_edge =
         GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
diff --git a/av1/encoder/tpl_model.h b/av1/encoder/tpl_model.h
index cbbf11e..ff3d6ad 100644
--- a/av1/encoder/tpl_model.h
+++ b/av1/encoder/tpl_model.h
@@ -175,6 +175,11 @@
    * multi-threading of tpl
    */
   AV1TplRowMultiThreadSync tpl_mt_sync;
+
+  /*!
+   * Border size, in pixels, of the tpl frame buffers.
+   */
+  int border_in_pixels;
 } TplParams;
 
 int av1_tpl_setup_stats(struct AV1_COMP *cpi, int gop_eval,