Allocate mbmi_ext at BLOCK_8X8 level for 4k+ videos

Previously, mbmi_ext was dynamically allocated up front with one entry
per BLOCK_4X4. This change makes the allocation resolution-dependent:
for 4k and larger videos, one entry is allocated per BLOCK_8X8 instead.

Memory Reduction:
About 1GB for 4K videos, which is about 25% of heap memory.

BUG=aomedia:2453

Change-Id: Ic38349eb19adccbe2bf3355db60cb02f2116272c
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 27c3510..0507f23 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -1275,8 +1275,11 @@
   AV1_COMMON *const cm = &cpi->common;
   const MB_MODE_INFO *const *mbmi =
       *(cm->mi_grid_base + (mi_row * cm->mi_stride + mi_col));
+  const int mi_alloc_size_1d = cpi->mi_alloc_size_1d;
+  const int mi_alloc_row = (mi_row + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+  const int mi_alloc_col = (mi_col + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
   const MB_MODE_INFO_EXT *const *mbmi_ext =
-      cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+      cpi->mbmi_ext_base + (mi_alloc_row * cpi->mi_alloc_cols + mi_alloc_col);
   if (is_inter_block(mbmi)) {
 #define FRAME_TO_CHECK 11
     if (cm->current_frame.frame_number == FRAME_TO_CHECK &&
@@ -1462,7 +1465,12 @@
   const AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
   xd->mi = cm->mi_grid_base + (mi_row * cm->mi_stride + mi_col);
-  cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+
+  const int mi_alloc_size_1d = cpi->mi_alloc_size_1d;
+  const int mi_alloc_row = (mi_row + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+  const int mi_alloc_col = (mi_col + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+  cpi->td.mb.mbmi_ext =
+      cpi->mbmi_ext_base + (mi_alloc_row * cpi->mi_alloc_cols + mi_alloc_col);
 
   const MB_MODE_INFO *mbmi = xd->mi[0];
   const BLOCK_SIZE bsize = mbmi->sb_type;
diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c
index 4c4c712..e139c5b 100644
--- a/av1/encoder/encode_strategy.c
+++ b/av1/encoder/encode_strategy.c
@@ -1054,8 +1054,8 @@
     av1_init_context_buffers(cm);
     setup_mi(cpi, frame_input->source);
     av1_init_macroblockd(cm, xd, NULL);
-    memset(cpi->mbmi_ext_base, 0,
-           cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
+    const int alloc_mi_size = cpi->mi_alloc_rows * cpi->mi_alloc_cols;
+    memset(cpi->mbmi_ext_base, 0, alloc_mi_size * sizeof(*cpi->mbmi_ext_base));
 
     av1_set_speed_features_framesize_independent(cpi, oxcf->speed);
     av1_set_speed_features_framesize_dependent(cpi, oxcf->speed);
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index aed8250..20368d8 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -13,6 +13,7 @@
 #include <math.h>
 #include <stdio.h>
 
+#include "av1/common/enums.h"
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
 #include "config/aom_scale_rtcd.h"
@@ -424,11 +425,23 @@
 
 static void alloc_context_buffers_ext(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
-  int mi_size = cm->mi_cols * cm->mi_rows;
+  const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160;
+
+  cpi->mi_alloc_bsize = is_4k_or_larger ? BLOCK_8X8 : BLOCK_4X4;
+  cpi->mi_alloc_size_1d = mi_size_wide[cpi->mi_alloc_bsize];
+  cpi->mi_alloc_rows =
+      (cm->mi_rows + cpi->mi_alloc_size_1d - 1) / cpi->mi_alloc_size_1d;
+  cpi->mi_alloc_cols =
+      (cm->mi_cols + cpi->mi_alloc_size_1d - 1) / cpi->mi_alloc_size_1d;
+
+  assert(mi_size_wide[cpi->mi_alloc_bsize] ==
+         mi_size_high[cpi->mi_alloc_bsize]);
+
+  const int alloc_mi_size = cpi->mi_alloc_rows * cpi->mi_alloc_cols;
 
   dealloc_context_buffers_ext(cpi);
   CHECK_MEM_ERROR(cm, cpi->mbmi_ext_base,
-                  aom_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)));
+                  aom_calloc(alloc_mi_size, sizeof(*cpi->mbmi_ext_base)));
 }
 
 static void reset_film_grain_chroma_params(aom_film_grain_t *pars) {
@@ -932,8 +945,9 @@
   av1_set_mb_mi(cm, cm->width, cm->height);
   av1_init_context_buffers(cm);
   av1_init_macroblockd(cm, xd, NULL);
-  memset(cpi->mbmi_ext_base, 0,
-         cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
+
+  const int alloc_mi_size = cpi->mi_alloc_rows * cpi->mi_alloc_cols;
+  memset(cpi->mbmi_ext_base, 0, alloc_mi_size * sizeof(*cpi->mbmi_ext_base));
   set_tile_info(cpi);
 }
 
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index b9db4ed..514d536 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -768,6 +768,12 @@
   struct lookahead_entry *alt_ref_source;
   int no_show_kf;
 
+  // The minimum size each allocated mi_ext can correspond to. Currently set to
+  // BLOCK_4X4 for resolutions below 4k, and BLOCK_8X8 for 4k and above.
+  BLOCK_SIZE mi_alloc_bsize;
+  int mi_alloc_size_1d;  // Side length, in 4x4 units, of one allocated mi_ext
+  int mi_alloc_rows, mi_alloc_cols;
+
   int optimize_seg_arr[MAX_SEGMENTS];
 
   YV12_BUFFER_CONFIG *source;
@@ -1372,7 +1378,12 @@
   const int idx_str = xd->mi_stride * mi_row + mi_col;
   xd->mi = cm->mi_grid_base + idx_str;
   xd->mi[0] = cm->mi + idx_str;
-  x->mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+
+  const int mi_alloc_size_1d = cpi->mi_alloc_size_1d;
+  const int mi_alloc_row = (mi_row + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+  const int mi_alloc_col = (mi_col + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+  x->mbmi_ext =
+      cpi->mbmi_ext_base + (mi_alloc_row * cpi->mi_alloc_cols + mi_alloc_col);
 }
 
 // Check to see if the given partition size is allowed for a specified number