Use downsized frame in VMAF RDO update

The RDO multiplier calculation time is greatly reduced with little VMAF
quality loss. Compared to the old tune=vmaf_without_preprocessing, the
VMAF BD-rate changes are:
midres 0.24%, hdres -0.18%, lowres_10bd -0.19%, midres_10bd 0.16%

Encoding cost (instruction count relative to tune=psnr; lower is faster)
------------------------------------------------------------------------
MODE                 480P     1080P
PSNR (baseline)      1.0x      1.0x
VMAF (before)        2.2x     10.2x
VMAF (after)         1.5x      4.1x
VMAF_WO_PP (before)  1.9x      9.5x
VMAF_WO_PP (after)   1.2x      3.4x
VMAF_W_PP            1.2x      1.5x

Full test results
-----------------
Performance counter stats for './aomenc red_kayak_480p.y4m --limit=30
 -o output_new --tune=psnr --cpu-used=1':
   642,437,621,503      instructions:u
      84.889688323 seconds time elapsed
      84.588830000 seconds user
       0.291975000 seconds sys

Performance counter stats for './aomenc red_kayak_480p.y4m --limit=30
 -o output_new --tune=vmaf_with_preprocessing --cpu-used=1':
   775,025,691,607      instructions:u
     109.458205579 seconds time elapsed
     107.989239000 seconds user
       4.284554000 seconds sys

Performance counter stats for './aomenc red_kayak_480p.y4m --limit=30
 -o output_new --tune=vmaf --cpu-used=1':
Before:
 1,405,511,943,075      instructions:u
     129.392291668 seconds time elapsed
     254.063729000 seconds user
      49.802219000 seconds sys
After:
   964,828,402,519      instructions:u
     123.807091424 seconds time elapsed
     161.892974000 seconds user
      42.540103000 seconds sys

Performance counter stats for './aomenc red_kayak_480p.y4m --limit=30
 -o output_new --tune=vmaf_without_preprocessing --cpu-used=1':
Before:
 1,218,021,966,132      instructions:u
      83.938745902 seconds time elapsed
     206.545182000 seconds user
       5.033487000 seconds sys
After:
   762,965,561,826      instructions:u
      79.566220976 seconds time elapsed
     118.461060000 seconds user
       4.023039000 seconds sys
------------------
Performance counter stats for './aomenc basketballdrive_1080p50.y4m
 --limit=30 -o output_new --tune=psnr --cpu-used=1':
 1,582,885,665,086      instructions:u
     204.687949181 seconds time elapsed
     203.775777000 seconds user
       0.811887000 seconds sys

Performance counter stats for './aomenc basketballdrive_1080p50.y4m
 --limit=30 -o output_new --tune=vmaf_with_preprocessing
 --cpu-used=1':
 2,298,587,912,062      instructions:u
     361.306249556 seconds time elapsed
     326.457704000 seconds user
      41.029086000 seconds sys

Performance counter stats for './aomenc basketballdrive_1080p50.y4m
 --limit=30 -o output_new --tune=vmaf --cpu-used=1':
Before:
16,214,552,275,288      instructions:u
     696.679505444 seconds time elapsed
    3579.636014000 seconds user
    1746.357289000 seconds sys
After:
 6,543,459,539,854      instructions:u
     553.099946250 seconds time elapsed
    1257.052953000 seconds user
     250.151702000 seconds sys

Performance counter stats for './aomenc basketballdrive_1080p50.y4m
 --limit=30 -o output_new --tune=vmaf_without_preprocessing
 --cpu-used=1':
Before:
15,040,376,209,757      instructions:u
     398.965821306 seconds time elapsed
    3205.411780000 seconds user
    1157.351525000 seconds sys
After:
 5,311,331,047,231      instructions:u
     247.754365865 seconds time elapsed
     984.609408000 seconds user
      12.773107000 seconds sys

Change-Id: Id64d59c2ef0fc63e0549a648406615edd71e43bd
diff --git a/av1/encoder/tune_vmaf.c b/av1/encoder/tune_vmaf.c
index 79220a6..5aecb4e 100644
--- a/av1/encoder/tune_vmaf.c
+++ b/av1/encoder/tune_vmaf.c
@@ -77,16 +77,14 @@
 // all co-efficients must be even.
 DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 0,  8, 30, 52,
                                                                30, 8, 0,  0 };
-static AOM_INLINE void gaussian_blur(const AV1_COMP *const cpi,
+static AOM_INLINE void gaussian_blur(const int bit_depth,
                                      const YV12_BUFFER_CONFIG *source,
                                      const YV12_BUFFER_CONFIG *dst) {
-  const AV1_COMMON *cm = &cpi->common;
-  const int bit_depth = cpi->td.mb.e_mbd.bd;
   const int block_size = BLOCK_128X128;
-  const int num_mi_w = mi_size_wide[block_size];
-  const int num_mi_h = mi_size_high[block_size];
-  const int num_cols = (cm->mi_cols + num_mi_w - 1) / num_mi_w;
-  const int num_rows = (cm->mi_rows + num_mi_h - 1) / num_mi_h;
+  const int block_w = mi_size_wide[block_size] * 4;
+  const int block_h = mi_size_high[block_size] * 4;
+  const int num_cols = (source->y_width + block_w - 1) / block_w;
+  const int num_rows = (source->y_height + block_h - 1) / block_h;
   int row, col;
 
   ConvolveParams conv_params = get_conv_params(0, 0, bit_depth);
@@ -97,11 +95,8 @@
 
   for (row = 0; row < num_rows; ++row) {
     for (col = 0; col < num_cols; ++col) {
-      const int mi_row = row * num_mi_h;
-      const int mi_col = col * num_mi_w;
-
-      const int row_offset_y = mi_row << 2;
-      const int col_offset_y = mi_col << 2;
+      const int row_offset_y = row * block_h;
+      const int col_offset_y = col * block_w;
 
       uint8_t *src_buf =
           source->y_buffer + row_offset_y * source->y_stride + col_offset_y;
@@ -111,11 +106,11 @@
       if (bit_depth > 8) {
         av1_highbd_convolve_2d_sr(
             CONVERT_TO_SHORTPTR(src_buf), source->y_stride,
-            CONVERT_TO_SHORTPTR(dst_buf), dst->y_stride, num_mi_w << 2,
-            num_mi_h << 2, &filter, &filter, 0, 0, &conv_params, bit_depth);
+            CONVERT_TO_SHORTPTR(dst_buf), dst->y_stride, block_w, block_h,
+            &filter, &filter, 0, 0, &conv_params, bit_depth);
       } else {
         av1_convolve_2d_sr(src_buf, source->y_stride, dst_buf, dst->y_stride,
-                           num_mi_w << 2, num_mi_h << 2, &filter, &filter, 0, 0,
+                           block_w, block_h, &filter, &filter, 0, 0,
                            &conv_params);
       }
     }
@@ -199,6 +194,7 @@
                                   YV12_BUFFER_CONFIG *const source) {
   aom_clear_system_state();
   const AV1_COMMON *const cm = &cpi->common;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
   const int width = source->y_width;
   const int height = source->y_height;
 
@@ -213,7 +209,7 @@
                          cpi->oxcf.border_in_pixels, cm->byte_alignment);
 
   av1_copy_and_extend_frame(source, &source_extended);
-  gaussian_blur(cpi, &source_extended, &blurred);
+  gaussian_blur(bit_depth, &source_extended, &blurred);
   aom_free_frame_buffer(&source_extended);
 
   const double best_frame_unsharp_amount =
@@ -243,19 +239,17 @@
                          cpi->oxcf.border_in_pixels, cm->byte_alignment);
 
   av1_copy_and_extend_frame(source, &source_extended);
-  gaussian_blur(cpi, &source_extended, &blurred);
+  gaussian_blur(bit_depth, &source_extended, &blurred);
   aom_free_frame_buffer(&source_extended);
 
   const double best_frame_unsharp_amount =
       find_best_frame_unsharp_amount(cpi, source, &blurred, 0.0, 0.05, 20);
 
   const int block_size = BLOCK_64X64;
-  const int num_mi_w = mi_size_wide[block_size];
-  const int num_mi_h = mi_size_high[block_size];
-  const int num_cols = (cm->mi_cols + num_mi_w - 1) / num_mi_w;
-  const int num_rows = (cm->mi_rows + num_mi_h - 1) / num_mi_h;
-  const int block_w = num_mi_w << 2;
-  const int block_h = num_mi_h << 2;
+  const int block_w = mi_size_wide[block_size] * 4;
+  const int block_h = mi_size_high[block_size] * 4;
+  const int num_cols = (source->y_width + block_w - 1) / block_w;
+  const int num_rows = (source->y_height + block_h - 1) / block_h;
   double *best_unsharp_amounts =
       aom_malloc(sizeof(*best_unsharp_amounts) * num_cols * num_rows);
   memset(best_unsharp_amounts, 0,
@@ -340,10 +334,8 @@
   // Apply best blur amounts
   for (int row = 0; row < num_rows; ++row) {
     for (int col = 0; col < num_cols; ++col) {
-      const int mi_row = row * num_mi_h;
-      const int mi_col = col * num_mi_w;
-      const int row_offset_y = mi_row << 2;
-      const int col_offset_y = mi_col << 2;
+      const int row_offset_y = row * block_h;
+      const int col_offset_y = col * block_w;
       const int block_width = AOMMIN(source->y_width - col_offset_y, block_w);
       const int block_height = AOMMIN(source->y_height - row_offset_y, block_h);
       const int index = col + row * num_cols;
@@ -463,65 +455,75 @@
 
 void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
-  uint8_t *const y_buffer = cpi->source->y_buffer;
-  const int y_stride = cpi->source->y_stride;
   const int y_width = cpi->source->y_width;
   const int y_height = cpi->source->y_height;
-  const int block_size = BLOCK_64X64;
-
-  const int num_mi_w = mi_size_wide[block_size];
-  const int num_mi_h = mi_size_high[block_size];
-  const int num_cols = (cm->mi_cols + num_mi_w - 1) / num_mi_w;
-  const int num_rows = (cm->mi_rows + num_mi_h - 1) / num_mi_h;
-  const int block_w = num_mi_w << 2;
-  const int block_h = num_mi_h << 2;
+  const int resized_block_size = BLOCK_32X32;
+  const int resize_factor = 2;
   const int bit_depth = cpi->td.mb.e_mbd.bd;
 
   aom_clear_system_state();
-  YV12_BUFFER_CONFIG blurred;
-  memset(&blurred, 0, sizeof(blurred));
-  aom_alloc_frame_buffer(&blurred, y_width, y_height, 1, 1,
+  YV12_BUFFER_CONFIG resized_source;
+  memset(&resized_source, 0, sizeof(resized_source));
+  aom_alloc_frame_buffer(&resized_source, y_width / resize_factor,
+                         y_height / resize_factor, 1, 1,
                          cm->seq_params.use_highbitdepth,
                          cpi->oxcf.border_in_pixels, cm->byte_alignment);
-  gaussian_blur(cpi, cpi->source, &blurred);
+  av1_resize_and_extend_frame(cpi->source, &resized_source, bit_depth,
+                              av1_num_planes(cm));
+
+  const int resized_y_width = resized_source.y_width;
+  const int resized_y_height = resized_source.y_height;
+  const int resized_block_w = mi_size_wide[resized_block_size] * 4;
+  const int resized_block_h = mi_size_high[resized_block_size] * 4;
+  const int num_cols =
+      (resized_y_width + resized_block_w - 1) / resized_block_w;
+  const int num_rows =
+      (resized_y_height + resized_block_h - 1) / resized_block_h;
+
+  YV12_BUFFER_CONFIG blurred;
+  memset(&blurred, 0, sizeof(blurred));
+  aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, 1, 1,
+                         cm->seq_params.use_highbitdepth,
+                         cpi->oxcf.border_in_pixels, cm->byte_alignment);
+  gaussian_blur(bit_depth, &resized_source, &blurred);
 
   double *scores = aom_malloc(sizeof(*scores) * (num_rows * num_cols));
   memset(scores, 0, sizeof(*scores) * (num_rows * num_cols));
   FrameData frame_data;
-  frame_data.source = cpi->source;
+  frame_data.source = &resized_source;
   frame_data.blurred = &blurred;
-  frame_data.block_w = block_w;
-  frame_data.block_h = block_h;
+  frame_data.block_w = resized_block_w;
+  frame_data.block_h = resized_block_h;
   frame_data.num_rows = num_rows;
   frame_data.num_cols = num_cols;
   frame_data.row = 0;
   frame_data.col = 0;
   frame_data.bit_depth = bit_depth;
   aom_calc_vmaf_multi_frame(&frame_data, cpi->oxcf.vmaf_model_path,
-                            update_frame, y_width, y_height, bit_depth, scores);
+                            update_frame, resized_y_width, resized_y_height,
+                            bit_depth, scores);
 
   // Loop through each 'block_size' block.
   for (int row = 0; row < num_rows; ++row) {
     for (int col = 0; col < num_cols; ++col) {
-      const int mi_row = row * num_mi_h;
-      const int mi_col = col * num_mi_w;
       const int index = row * num_cols + col;
-      const int row_offset_y = mi_row << 2;
-      const int col_offset_y = mi_col << 2;
+      const int row_offset_y = row * resized_block_h;
+      const int col_offset_y = col * resized_block_w;
 
-      uint8_t *const orig_buf =
-          y_buffer + row_offset_y * y_stride + col_offset_y;
+      uint8_t *const orig_buf = resized_source.y_buffer +
+                                row_offset_y * resized_source.y_stride +
+                                col_offset_y;
       uint8_t *const blurred_buf =
           blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
 
       const double vmaf = scores[index];
       const double dvmaf = kBaselineVmaf - vmaf;
       unsigned int sse;
-      cpi->fn_ptr[block_size].vf(orig_buf, y_stride, blurred_buf,
-                                 blurred.y_stride, &sse);
+      cpi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride,
+                                         blurred_buf, blurred.y_stride, &sse);
 
-      const double mse = (double)sse / (double)(y_width * y_height);
-
+      const double mse =
+          (double)sse / (double)(resized_y_width * resized_y_height);
       double weight;
       const double eps = 0.01 / (num_rows * num_cols);
       if (dvmaf < eps || mse < eps) {
@@ -536,6 +538,7 @@
     }
   }
 
+  aom_free_frame_buffer(&resized_source);
   aom_free_frame_buffer(&blurred);
   aom_free(scores);
   aom_clear_system_state();
@@ -635,12 +638,11 @@
                          cm->seq_params.use_highbitdepth,
                          cpi->oxcf.border_in_pixels, cm->byte_alignment);
 
-  gaussian_blur(cpi, cur, &blurred_cur);
-  gaussian_blur(cpi, last, &blurred_last);
-  if (next) gaussian_blur(cpi, next, &blurred_next);
+  gaussian_blur(bit_depth, cur, &blurred_cur);
+  gaussian_blur(bit_depth, last, &blurred_last);
+  if (next) gaussian_blur(bit_depth, next, &blurred_next);
 
   double motion1, motion2 = 65536.0;
-
   if (bit_depth > 8) {
     const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8));
     motion1 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer),