Increase tpl stats storage granularity for high resolution

Previously, tpl stats were stored at 4x4 granularity.
This patch makes the granularity adaptive to the source resolution.
At >=720p resolution, tpl stats are stored at 16x16 granularity;
otherwise, they are stored at 8x8 granularity. The granularity is
controlled by cpi->tpl_stats_block_mis_log2.
This reduces memory usage for 4k encoding by 0.9G.

Coding performance change:
lowres: -0.023%
midres: -0.002%
hdres: -0.008%
netflix4k_8bit: -0.001%

BUG=aomedia:2453

STATS_CHANGED

Change-Id: I4dbcd2c3cec4986fae279d5da2cbd5ede9d4c323
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index d21c126..047ba93 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -66,6 +66,7 @@
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/tokenize.h"
 #include "av1/encoder/var_based_part.h"
+#include "av1/encoder/tpl_model.h"
 
 static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
                               ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
@@ -3443,9 +3444,8 @@
   int tpl_stride = tpl_frame->stride;
   int64_t intra_cost = 0;
   int64_t mc_dep_cost = 0;
-  int mi_wide = mi_size_wide[bsize];
-  int mi_high = mi_size_high[bsize];
-  int row, col;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
 
   if (tpl_frame->is_valid == 0) return orig_rdmult;
 
@@ -3462,10 +3462,12 @@
   const int mi_col_end_sr = av1_coded_to_superres_mi(
       mi_col + mi_wide, cm->superres_scale_denominator);
   const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
-  for (row = mi_row; row < mi_row + mi_high; ++row) {
-    for (col = mi_col_sr; col < mi_col_end_sr; ++col) {
+  const int step = 1 << cpi->tpl_stats_block_mis_log2;
+  for (int row = mi_row; row < mi_row + mi_high; row += step) {
+    for (int col = mi_col_sr; col < mi_col_end_sr; col += step) {
       if (row >= cm->mi_rows || col >= mi_cols_sr) continue;
-      TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col];
+      TplDepStats *this_stats =
+          &tpl_stats[av1_tpl_ptr_pos(cpi, row, col, tpl_stride)];
       intra_cost += this_stats->intra_cost;
       mc_dep_cost += this_stats->intra_cost + this_stats->mc_flow;
 #if !USE_TPL_CLASSIC_MODEL
@@ -3526,9 +3528,8 @@
   int tpl_stride = tpl_frame->stride;
   int64_t intra_cost = 0;
   int64_t mc_dep_cost = 0;
-  int mi_wide = mi_size_wide[bsize];
-  int mi_high = mi_size_high[bsize];
-  int row, col;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
 
   if (cpi->tpl_model_pass == 1) {
     assert(cpi->oxcf.enable_tpl_model == 2);
@@ -3550,10 +3551,12 @@
   const int mi_col_end_sr = av1_coded_to_superres_mi(
       mi_col + mi_wide, cm->superres_scale_denominator);
   const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
-  for (row = mi_row; row < mi_row + mi_high; ++row) {
-    for (col = mi_col_sr; col < mi_col_end_sr; ++col) {
+  const int step = 1 << cpi->tpl_stats_block_mis_log2;
+  for (int row = mi_row; row < mi_row + mi_high; row += step) {
+    for (int col = mi_col_sr; col < mi_col_end_sr; col += step) {
       if (row >= cm->mi_rows || col >= mi_cols_sr) continue;
-      TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col];
+      TplDepStats *this_stats =
+          &tpl_stats[av1_tpl_ptr_pos(cpi, row, col, tpl_stride)];
       intra_cost += this_stats->intra_cost;
       mc_dep_cost += this_stats->intra_cost + this_stats->mc_flow;
 #if !USE_TPL_CLASSIC_MODEL
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index f8ea834..1da9a6a 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -2396,6 +2396,14 @@
                   aom_calloc(cm->mi_rows * cm->mi_cols, 1));
 }
 
+static void set_tpl_stats_block_size(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+
+  // log2 of the tpl stats block size in mi units: 0: 4x4, 1: 8x8, 2: 16x16
+  cpi->tpl_stats_block_mis_log2 = is_720p_or_larger ? 2 : 1;
+}
+
 void av1_alloc_compound_type_rd_buffers(AV1_COMMON *const cm,
                                         CompoundTypeRdBuffers *const bufs) {
   CHECK_MEM_ERROR(
@@ -2779,20 +2787,25 @@
                                sizeof(*cpi->ssim_rdmult_scaling_factors)));
   }
 
+  set_tpl_stats_block_size(cpi);
   for (int frame = 0; frame < MAX_LENGTH_TPL_FRAME_STATS; ++frame) {
-    int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
-    int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
+    const int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+    const int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
+
+    cpi->tpl_stats_buffer[frame].is_valid = 0;
+    cpi->tpl_stats_buffer[frame].width =
+        mi_cols >> cpi->tpl_stats_block_mis_log2;
+    cpi->tpl_stats_buffer[frame].height =
+        mi_rows >> cpi->tpl_stats_block_mis_log2;
+    cpi->tpl_stats_buffer[frame].stride = cpi->tpl_stats_buffer[frame].width;
+    cpi->tpl_stats_buffer[frame].mi_rows = cm->mi_rows;
+    cpi->tpl_stats_buffer[frame].mi_cols = cm->mi_cols;
 
     CHECK_MEM_ERROR(
         cm, cpi->tpl_stats_buffer[frame].tpl_stats_ptr,
-        aom_calloc(mi_rows * mi_cols,
+        aom_calloc(cpi->tpl_stats_buffer[frame].width *
+                       cpi->tpl_stats_buffer[frame].height,
                    sizeof(*cpi->tpl_stats_buffer[frame].tpl_stats_ptr)));
-    cpi->tpl_stats_buffer[frame].is_valid = 0;
-    cpi->tpl_stats_buffer[frame].width = mi_cols;
-    cpi->tpl_stats_buffer[frame].height = mi_rows;
-    cpi->tpl_stats_buffer[frame].stride = mi_cols;
-    cpi->tpl_stats_buffer[frame].mi_rows = cm->mi_rows;
-    cpi->tpl_stats_buffer[frame].mi_cols = cm->mi_cols;
   }
   cpi->tpl_frame = &cpi->tpl_stats_buffer[REF_FRAMES + 1];
 
@@ -3639,12 +3652,13 @@
     int64_t mc_saved_base = 0;
     int64_t mc_count_base = 0;
 #endif  // !USE_TPL_CLASSIC_MODEL
-    int row, col;
-
+    const int step = 1 << cpi->tpl_stats_block_mis_log2;
     const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
-    for (row = 0; row < cm->mi_rows; ++row) {
-      for (col = 0; col < mi_cols_sr; ++col) {
-        TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col];
+
+    for (int row = 0; row < cm->mi_rows; row += step) {
+      for (int col = 0; col < mi_cols_sr; col += step) {
+        TplDepStats *this_stats =
+            &tpl_stats[av1_tpl_ptr_pos(cpi, row, col, tpl_stride)];
         intra_cost_base += this_stats->intra_cost;
         mc_dep_cost_base += this_stats->intra_cost + this_stats->mc_flow;
 #if !USE_TPL_CLASSIC_MODEL
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index da5e001..9a7e25a 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -786,6 +786,7 @@
   YV12_BUFFER_CONFIG *unscaled_last_source;
   YV12_BUFFER_CONFIG scaled_last_source;
 
+  uint8_t tpl_stats_block_mis_log2;  // block granularity of tpl score storage
   TplDepFrame tpl_stats_buffer[MAX_LENGTH_TPL_FRAME_STATS];
   TplDepFrame *tpl_frame;
 
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index 53bcd17..c644c88 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -355,7 +355,13 @@
   return iiratio * iiratio;
 }
 
-static void tpl_model_update_b(TplDepFrame *tpl_frame,
+int av1_tpl_ptr_pos(AV1_COMP *cpi, int mi_row, int mi_col, int stride) {
+  const int right_shift = cpi->tpl_stats_block_mis_log2;
+  // Scale mi coordinates down to tpl-block units before indexing the buffer.
+  return (mi_row >> right_shift) * stride + (mi_col >> right_shift);
+}
+
+static void tpl_model_update_b(AV1_COMP *cpi, TplDepFrame *tpl_frame,
                                TplDepStats *tpl_stats_ptr, int mi_row,
                                int mi_col, double quant_ratio,
                                const BLOCK_SIZE bsize, int ref_frame_index,
@@ -395,12 +401,11 @@
 #if !USE_TPL_CLASSIC_MODEL
       int64_t mc_saved = tpl_stats_ptr->intra_cost - tpl_stats_ptr->inter_cost;
 #endif  // #if !USE_TPL_CLASSIC_MODEL
-      int idx, idy;
-      for (idy = 0; idy < mi_height; ++idy) {
-        for (idx = 0; idx < mi_width; ++idx) {
-          TplDepStats *des_stats =
-              &ref_stats_ptr[(ref_mi_row + idy) * ref_tpl_frame->stride +
-                             (ref_mi_col + idx)];
+      const int step = 1 << cpi->tpl_stats_block_mis_log2;
+      for (int idy = 0; idy < mi_height; idy += step) {
+        for (int idx = 0; idx < mi_width; idx += step) {
+          TplDepStats *des_stats = &ref_stats_ptr[av1_tpl_ptr_pos(
+              cpi, ref_mi_row + idy, ref_mi_col + idx, ref_tpl_frame->stride)];
           des_stats->mc_flow += (mc_flow * overlap_area) / pix_num;
 #if !USE_TPL_CLASSIC_MODEL
           des_stats->mc_count += overlap_area << TPL_DEP_COST_SCALE_LOG2;
@@ -413,30 +418,32 @@
   }
 }
 
-static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats_ptr,
-                             int mi_row, int mi_col, double quant_ratio,
-                             const BLOCK_SIZE bsize, int ref_frame_index,
-                             int_mv mv) {
-  int idx, idy;
+static void tpl_model_update(AV1_COMP *cpi, TplDepFrame *tpl_frame,
+                             TplDepStats *tpl_stats_ptr, int mi_row, int mi_col,
+                             double quant_ratio, const BLOCK_SIZE bsize,
+                             int ref_frame_index, int_mv mv) {
   const int mi_height = mi_size_high[bsize];
   const int mi_width = mi_size_wide[bsize];
+  const int step = 1 << cpi->tpl_stats_block_mis_log2;
+  const BLOCK_SIZE tpl_block_size =
+      convert_length_to_bsize(MI_SIZE << cpi->tpl_stats_block_mis_log2);
 
-  for (idy = 0; idy < mi_height; ++idy) {
-    for (idx = 0; idx < mi_width; ++idx) {
-      TplDepStats *tpl_ptr =
-          &tpl_stats_ptr[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)];
-      tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx,
-                         quant_ratio, BLOCK_4X4, ref_frame_index, mv);
+  for (int idy = 0; idy < mi_height; idy += step) {
+    for (int idx = 0; idx < mi_width; idx += step) {
+      TplDepStats *tpl_ptr = &tpl_stats_ptr[av1_tpl_ptr_pos(
+          cpi, mi_row + idy, mi_col + idx, tpl_frame->stride)];
+      tpl_model_update_b(cpi, tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx,
+                         quant_ratio, tpl_block_size, ref_frame_index, mv);
     }
   }
 }
 
-static void tpl_model_store(TplDepStats *tpl_stats_ptr, int mi_row, int mi_col,
-                            BLOCK_SIZE bsize, int stride,
-                            const TplDepStats *src_stats) {
+static void tpl_model_store(AV1_COMP *cpi, TplDepStats *tpl_stats_ptr,
+                            int mi_row, int mi_col, BLOCK_SIZE bsize,
+                            int stride, const TplDepStats *src_stats) {
   const int mi_height = mi_size_high[bsize];
   const int mi_width = mi_size_wide[bsize];
-  int idx, idy;
+  const int step = 1 << cpi->tpl_stats_block_mis_log2;
 
   int64_t intra_cost = src_stats->intra_cost / (mi_height * mi_width);
   int64_t inter_cost = src_stats->inter_cost / (mi_height * mi_width);
@@ -446,9 +453,10 @@
   intra_cost = AOMMAX(1, intra_cost);
   inter_cost = AOMMAX(1, inter_cost);
 
-  for (idy = 0; idy < mi_height; ++idy) {
-    tpl_ptr = &tpl_stats_ptr[(mi_row + idy) * stride + mi_col];
-    for (idx = 0; idx < mi_width; ++idx) {
+  for (int idy = 0; idy < mi_height; idy += step) {
+    tpl_ptr =
+        &tpl_stats_ptr[av1_tpl_ptr_pos(cpi, mi_row + idy, mi_col, stride)];
+    for (int idx = 0; idx < mi_width; idx += step) {
       tpl_ptr->intra_cost = intra_cost;
       tpl_ptr->inter_cost = inter_cost;
       tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow;
@@ -565,12 +573,13 @@
                       &ref_frame_index, &mv);
 
       // Motion flow dependency dispenser.
-      tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
+      tpl_model_store(cpi, tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
                       tpl_frame->stride, &tpl_stats);
       double quant_ratio = (double)recon_error / sse;
-      if (frame_idx)
-        tpl_model_update(cpi->tpl_frame, tpl_frame->tpl_stats_ptr, mi_row,
+      if (frame_idx) {
+        tpl_model_update(cpi, cpi->tpl_frame, tpl_frame->tpl_stats_ptr, mi_row,
                          mi_col, quant_ratio, bsize, ref_frame_index, mv);
+      }
     }
   }
 
diff --git a/av1/encoder/tpl_model.h b/av1/encoder/tpl_model.h
index d089b3f..e3b1e68 100644
--- a/av1/encoder/tpl_model.h
+++ b/av1/encoder/tpl_model.h
@@ -22,6 +22,8 @@
 
 void av1_tpl_setup_forward_stats(AV1_COMP *cpi);
 
+int av1_tpl_ptr_pos(AV1_COMP *cpi, int mi_row, int mi_col, int stride);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif