Add tpl_bsize_1d to TplParams

Added tpl_bsize_1d to TplParams and made the block size used in tpl
motion estimation configurable. Replaced all uses of the hard-coded
16x16 tpl block size. This CL does not cause any bitstream change.

Change-Id: I80521783c81820b4aaf72ce1c83356fc62546fe5
diff --git a/aom_scale/yv12config.h b/aom_scale/yv12config.h
index dfd4e51..ea92c92 100644
--- a/aom_scale/yv12config.h
+++ b/aom_scale/yv12config.h
@@ -29,7 +29,6 @@
 #define AOM_INTERP_EXTEND 4
 #define AOM_BORDER_IN_PIXELS 288
 #define AOM_ENC_NO_SCALE_BORDER 160
-#define AOM_ENC_TPL_FRAME_BORDER 32
 #define AOM_DEC_BORDER_IN_PIXELS 64
 
 /*!\endcond */
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 7283058..76fbaac 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -31,12 +31,10 @@
 extern "C" {
 #endif
 
-//! Linear dimension of a tpl block
-#define MC_FLOW_BSIZE_1D 16
-//! Number of pixels in a tpl block
-#define MC_FLOW_NUM_PELS (MC_FLOW_BSIZE_1D * MC_FLOW_BSIZE_1D)
-//! Number of tpl block in a super block
-#define MAX_MC_FLOW_BLK_IN_SB (MAX_SB_SIZE / MC_FLOW_BSIZE_1D)
+//! Minimum linear dimension of a tpl block
+#define MIN_TPL_BSIZE_1D 16
+//! Maximum number of tpl blocks along one dimension of a super block
+#define MAX_TPL_BLK_IN_SB (MAX_SB_SIZE / MIN_TPL_BSIZE_1D)
 //! Number of intra winner modes kept
 #define MAX_WINNER_MODE_COUNT_INTRA 3
 //! Number of inter winner modes kept
@@ -61,19 +59,18 @@
   /*****************************************************************************
    * \name TPL Info
    *
-   * Information gathered from tpl_model at MC_FLOW_BSIZE_1D precision for the
+   * Information gathered from tpl_model at tpl block precision for the
    * superblock to speed up the encoding process..
    ****************************************************************************/
   /**@{*/
   //! Number of TPL blocks in this superblock.
   int tpl_data_count;
   //! TPL's estimate of inter cost for each tpl block.
-  int64_t tpl_inter_cost[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB];
+  int64_t tpl_inter_cost[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB];
   //! TPL's estimate of tpl cost for each tpl block.
-  int64_t tpl_intra_cost[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB];
+  int64_t tpl_intra_cost[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB];
   //! Motion vectors found by TPL model for each tpl block.
-  int_mv tpl_mv[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB]
-               [INTER_REFS_PER_FRAME];
+  int_mv tpl_mv[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB][INTER_REFS_PER_FRAME];
   //! TPL's stride for the arrays in this struct.
   int tpl_stride;
   /**@}*/
@@ -1139,16 +1136,6 @@
 #undef SINGLE_REF_MODES
 
 /*!\cond */
-
-static INLINE int tpl_blocks_in_sb(BLOCK_SIZE bsize) {
-  switch (bsize) {
-    case BLOCK_64X64: return 16;
-    case BLOCK_128X128: return 64;
-    default: assert(0);
-  }
-  return -1;
-}
-
 static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
   static const char LUT[BLOCK_SIZES_ALL] = {
     0,  // BLOCK_4X4
diff --git a/av1/encoder/encodeframe_utils.c b/av1/encoder/encodeframe_utils.c
index 695084f..e367245 100644
--- a/av1/encoder/encodeframe_utils.c
+++ b/av1/encoder/encodeframe_utils.c
@@ -818,7 +818,7 @@
   // TPL store unit size is not the same as the motion estimation unit size.
   // Here always use motion estimation size to avoid getting repetitive inter/
   // intra cost.
-  const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+  const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d);
   const int step = mi_size_wide[tpl_bsize];
   assert(mi_size_wide[tpl_bsize] == mi_size_high[tpl_bsize]);
 
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
index 18b8c7b..4713b8b 100644
--- a/av1/encoder/encoder_alloc.h
+++ b/av1/encoder/encoder_alloc.h
@@ -100,20 +100,26 @@
 }
 
 static AOM_INLINE void set_tpl_stats_block_size(int width, int height,
-                                                uint8_t *block_mis_log2) {
+                                                uint8_t *block_mis_log2,
+                                                uint8_t *tpl_bsize_1d) {
   const int is_720p_or_larger = AOMMIN(width, height) >= 720;
 
   // 0: 4x4, 1: 8x8, 2: 16x16
   *block_mis_log2 = is_720p_or_larger ? 2 : 1;
+  // Block size used in tpl motion estimation
+  *tpl_bsize_1d = 16;
+  assert(*tpl_bsize_1d >= 16);
 }
 
 static AOM_INLINE void setup_tpl_buffers(AV1_COMMON *const cm,
                                          TplParams *const tpl_data) {
   CommonModeInfoParams *const mi_params = &cm->mi_params;
   set_tpl_stats_block_size(cm->width, cm->height,
-                           &tpl_data->tpl_stats_block_mis_log2);
+                           &tpl_data->tpl_stats_block_mis_log2,
+                           &tpl_data->tpl_bsize_1d);
   const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
-  tpl_data->border_in_pixels = AOM_ENC_TPL_FRAME_BORDER;
+  tpl_data->border_in_pixels =
+      ALIGN_POWER_OF_TWO(tpl_data->tpl_bsize_1d + 2 * AOM_INTERP_EXTEND, 5);
 
   for (int frame = 0; frame < MAX_LENGTH_TPL_FRAME_STATS; ++frame) {
     const int mi_cols =
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 4eaa0ad..9209159 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -1176,7 +1176,7 @@
   MACROBLOCK *x = &thread_data->td->mb;
   MACROBLOCKD *xd = &x->e_mbd;
   CommonModeInfoParams *mi_params = &cm->mi_params;
-  BLOCK_SIZE bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+  BLOCK_SIZE bsize = convert_length_to_bsize(cpi->tpl_data.tpl_bsize_1d);
   TX_SIZE tx_size = max_txsize_lookup[bsize];
   int mi_height = mi_size_high[bsize];
   int num_active_workers = cpi->tpl_data.tpl_mt_sync.num_threads_working;
diff --git a/av1/encoder/motion_search_facade.c b/av1/encoder/motion_search_facade.c
index 9dbca7e..dcb12e5 100644
--- a/av1/encoder/motion_search_facade.c
+++ b/av1/encoder/motion_search_facade.c
@@ -96,9 +96,8 @@
     start_mv = get_fullmv_from_mv(&ref_mv);
 
   // cand stores start_mv and all possible MVs in a SB.
-  cand_mv_t cand[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB + 1] = {
-    { { 0, 0 }, 0 }
-  };
+  cand_mv_t cand[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB + 1] = { { { 0, 0 },
+                                                                  0 } };
   cand[0].fmv = start_mv;
   int cnt = 1;
   int total_weight = 0;
@@ -107,7 +106,8 @@
       mbmi->motion_mode == SIMPLE_TRANSLATION) {
     SuperBlockEnc *sb_enc = &x->sb_enc;
     if (sb_enc->tpl_data_count) {
-      const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+      const BLOCK_SIZE tpl_bsize =
+          convert_length_to_bsize(cpi->tpl_data.tpl_bsize_1d);
       const int tplw = mi_size_wide[tpl_bsize];
       const int tplh = mi_size_high[tpl_bsize];
       const int nw = mi_size_wide[bsize] / tplw;
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index ebe8a44..909bd00 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -4983,18 +4983,21 @@
       (AOMMIN(cm->width, cm->height) > 480 && cpi->speed <= 1) ? 0 : 1;
   if (do_pruning && sf->intra_sf.skip_intra_in_interframe) {
     // Only consider full SB.
-    int len = tpl_blocks_in_sb(cm->seq_params.sb_size);
+    const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+    const int tpl_bsize_1d = cpi->tpl_data.tpl_bsize_1d;
+    const int len = (block_size_wide[sb_size] / tpl_bsize_1d) *
+                    (block_size_high[sb_size] / tpl_bsize_1d);
     SuperBlockEnc *sb_enc = &x->sb_enc;
     if (sb_enc->tpl_data_count == len) {
-      const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+      const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_bsize_1d);
       const int tpl_stride = sb_enc->tpl_stride;
       const int tplw = mi_size_wide[tpl_bsize];
       const int tplh = mi_size_high[tpl_bsize];
       const int nw = mi_size_wide[bsize] / tplw;
       const int nh = mi_size_high[bsize] / tplh;
       if (nw >= 1 && nh >= 1) {
-        const int of_h = mi_row % mi_size_high[cm->seq_params.sb_size];
-        const int of_w = mi_col % mi_size_wide[cm->seq_params.sb_size];
+        const int of_h = mi_row % mi_size_high[sb_size];
+        const int of_w = mi_col % mi_size_wide[sb_size];
         const int start = of_h / tplh * tpl_stride + of_w / tplw;
 
         for (int k = 0; k < nh; k++) {
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index 4890f37..77612d6 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -254,13 +254,16 @@
   uint8_t *dst_buffer = tpl_frame->rec_picture->y_buffer + dst_mb_offset;
   const int dst_buffer_stride = tpl_frame->rec_picture->y_stride;
 
-  // Temporaray buffers
-  DECLARE_ALIGNED(32, uint8_t, predictor8[MC_FLOW_NUM_PELS * 2]);
-  DECLARE_ALIGNED(32, int16_t, src_diff[MC_FLOW_NUM_PELS]);
-  DECLARE_ALIGNED(32, tran_low_t, coeff[MC_FLOW_NUM_PELS]);
-  DECLARE_ALIGNED(32, tran_low_t, qcoeff[MC_FLOW_NUM_PELS]);
-  DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MC_FLOW_NUM_PELS]);
-  DECLARE_ALIGNED(32, tran_low_t, best_coeff[MC_FLOW_NUM_PELS]);
+  // Number of pixels in a tpl block
+  const int tpl_block_pels = tpl_data->tpl_bsize_1d * tpl_data->tpl_bsize_1d;
+  // Allocate temporary buffers used in motion estimation.
+  uint8_t *predictor8 = aom_memalign(32, tpl_block_pels * 2 * sizeof(uint8_t));
+  int16_t *src_diff = aom_memalign(32, tpl_block_pels * sizeof(int16_t));
+  tran_low_t *coeff = aom_memalign(32, tpl_block_pels * sizeof(tran_low_t));
+  tran_low_t *qcoeff = aom_memalign(32, tpl_block_pels * sizeof(tran_low_t));
+  tran_low_t *dqcoeff = aom_memalign(32, tpl_block_pels * sizeof(tran_low_t));
+  tran_low_t *best_coeff =
+      aom_memalign(32, tpl_block_pels * sizeof(tran_low_t));
   uint8_t *predictor =
       is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8;
   int64_t recon_error = 1, sse = 1;
@@ -444,7 +447,7 @@
     tpl_stats->pred_error[rf_idx] = AOMMAX(1, inter_cost);
 
     if (inter_cost < best_inter_cost) {
-      memcpy(best_coeff, coeff, sizeof(best_coeff));
+      memcpy(best_coeff, coeff, tpl_block_pels * sizeof(best_coeff[0]));
       best_rf_idx = rf_idx;
 
       best_inter_cost = inter_cost;
@@ -526,6 +529,14 @@
       }
     }
   }
+
+  // Free temporary buffers.
+  aom_free(predictor8);
+  aom_free(src_diff);
+  aom_free(coeff);
+  aom_free(qcoeff);
+  aom_free(dqcoeff);
+  aom_free(best_coeff);
 }
 
 static int round_floor(int ref_pos, int bsize_pix) {
@@ -679,13 +690,13 @@
   const int mi_height = mi_size_high[bsize];
   const int mi_width = mi_size_wide[bsize];
   const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
-  const BLOCK_SIZE tpl_block_size =
+  const BLOCK_SIZE tpl_stats_block_size =
       convert_length_to_bsize(MI_SIZE << tpl_data->tpl_stats_block_mis_log2);
 
   for (int idy = 0; idy < mi_height; idy += step) {
     for (int idx = 0; idx < mi_width; idx += step) {
-      tpl_model_update_b(tpl_data, mi_row + idy, mi_col + idx, tpl_block_size,
-                         frame_idx);
+      tpl_model_update_b(tpl_data, mi_row + idy, mi_col + idx,
+                         tpl_stats_block_size, frame_idx);
     }
   }
 }
@@ -855,12 +866,15 @@
   TplParams *const tpl_data = &cpi->tpl_data;
   TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx];
   MACROBLOCKD *xd = &x->e_mbd;
-  const int mb_cols_in_tile = mi_params->mb_cols;
-  const int mb_row = (mi_row + 2) >> 2;
-  for (int mi_col = 0, mb_col_in_tile = 0; mi_col < mi_params->mi_cols;
-       mi_col += mi_width, mb_col_in_tile++) {
-    (*tpl_row_mt->sync_read_ptr)(&tpl_data->tpl_mt_sync, mb_row,
-                                 mb_col_in_tile);
+
+  const int tplb_cols_in_tile =
+      ROUND_POWER_OF_TWO(mi_params->mi_cols, mi_size_wide_log2[bsize]);
+  const int tplb_row = ROUND_POWER_OF_TWO(mi_row, mi_size_high_log2[bsize]);
+
+  for (int mi_col = 0, tplb_col_in_tile = 0; mi_col < mi_params->mi_cols;
+       mi_col += mi_width, tplb_col_in_tile++) {
+    (*tpl_row_mt->sync_read_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
+                                 tplb_col_in_tile);
     TplDepStats tpl_stats;
 
     // Motion estimation column boundary
@@ -875,8 +889,8 @@
     tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
                     tpl_frame->stride, &tpl_stats,
                     tpl_data->tpl_stats_block_mis_log2);
-    (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, mb_row,
-                                  mb_col_in_tile, mb_cols_in_tile);
+    (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
+                                  tplb_col_in_tile, tplb_cols_in_tile);
   }
 }
 
@@ -886,7 +900,7 @@
   ThreadData *td = &cpi->td;
   MACROBLOCK *x = &td->mb;
   MACROBLOCKD *xd = &x->e_mbd;
-  const BLOCK_SIZE bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+  const BLOCK_SIZE bsize = convert_length_to_bsize(cpi->tpl_data.tpl_bsize_1d);
   const TX_SIZE tx_size = max_txsize_lookup[bsize];
   const int mi_height = mi_size_high[bsize];
   for (int mi_row = 0; mi_row < mi_params->mi_rows; mi_row += mi_height) {
@@ -908,7 +922,7 @@
 
   TplParams *const tpl_data = &cpi->tpl_data;
 
-  const BLOCK_SIZE bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+  const BLOCK_SIZE bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d);
   const int mi_height = mi_size_high[bsize];
   const int mi_width = mi_size_wide[bsize];
 
diff --git a/av1/encoder/tpl_model.h b/av1/encoder/tpl_model.h
index 1557c50..b4d3db2 100644
--- a/av1/encoder/tpl_model.h
+++ b/av1/encoder/tpl_model.h
@@ -124,6 +124,11 @@
   uint8_t tpl_stats_block_mis_log2;
 
   /*!
+   * Linear size of the block used in tpl motion estimation. Must be >= 16.
+   */
+  uint8_t tpl_bsize_1d;
+
+  /*!
    * Buffer to store the frame level tpl information for each frame in a gf
    * group. tpl_stats_buffer[i] stores the tpl information of ith frame in a gf
    * group