Use downsized frame in VMAF RDO update

The RDO multiplier calculation time is largely reduced with little
VMAF quality loss.

Compared to the old tune=vmaf_without_preprocessing, VMAF BD-rate changes:
midres 0.24%, hdres -0.18%, lowres_10bd -0.19%, midres_10bd 0.16%

Compression speed (instruction count) of different modes
--------------------------------------------------------
MODE                   480P    1080P
PSNR (baseline)        1.0x    1.0x
VMAF (before)          2.2x    10.2x
VMAF (after)           1.5x    4.1x
VMAF_WO_PP (before)    1.9x    9.5x
VMAF_WO_PP (after)     1.2x    3.4x
VMAF_W_PP              1.2x    1.5x

Full test results
-----------------
Performance counter stats for './aomenc red_kayak_480p.y4m --limit=30 -o output_new --tune=psnr --cpu-used=1':
    642,437,621,503 instructions:u
    84.889688323 seconds time elapsed
    84.588830000 seconds user
    0.291975000 seconds sys

Performance counter stats for './aomenc red_kayak_480p.y4m --limit=30 -o output_new --tune=vmaf_with_preprocessing --cpu-used=1':
    775,025,691,607 instructions:u
    109.458205579 seconds time elapsed
    107.989239000 seconds user
    4.284554000 seconds sys

Performance counter stats for './aomenc red_kayak_480p.y4m --limit=30 -o output_new --tune=vmaf --cpu-used=1':
  Before:
    1,405,511,943,075 instructions:u
    129.392291668 seconds time elapsed
    254.063729000 seconds user
    49.802219000 seconds sys
  After:
    964,828,402,519 instructions:u
    123.807091424 seconds time elapsed
    161.892974000 seconds user
    42.540103000 seconds sys

Performance counter stats for './aomenc red_kayak_480p.y4m --limit=30 -o output_new --tune=vmaf_without_preprocessing --cpu-used=1':
  Before:
    1,218,021,966,132 instructions:u
    83.938745902 seconds time elapsed
    206.545182000 seconds user
    5.033487000 seconds sys
  After:
    762,965,561,826 instructions:u
    79.566220976 seconds time elapsed
    118.461060000 seconds user
    4.023039000 seconds sys

------------------
Performance counter stats for './aomenc basketballdrive_1080p50.y4m --limit=30 -o output_new --tune=psnr --cpu-used=1':
    1,582,885,665,086 instructions:u
    204.687949181 seconds time elapsed
    203.775777000 seconds user
    0.811887000 seconds sys

Performance counter stats for './aomenc basketballdrive_1080p50.y4m --limit=30 -o output_new --tune=vmaf_with_preprocessing --cpu-used=1':
    2,298,587,912,062 instructions:u
    361.306249556 seconds time elapsed
    326.457704000 seconds user
    41.029086000 seconds sys

Performance counter stats for './aomenc basketballdrive_1080p50.y4m --limit=30 -o output_new --tune=vmaf --cpu-used=1':
  Before:
    16,214,552,275,288 instructions:u
    696.679505444 seconds time elapsed
    3579.636014000 seconds user
    1746.357289000 seconds sys
  After:
    6,543,459,539,854 instructions:u
    553.099946250 seconds time elapsed
    1257.052953000 seconds user
    250.151702000 seconds sys

Performance counter stats for './aomenc basketballdrive_1080p50.y4m --limit=30 -o output_new --tune=vmaf_without_preprocessing --cpu-used=1':
  Before:
    15,040,376,209,757 instructions:u
    398.965821306 seconds time elapsed
    3205.411780000 seconds user
    1157.351525000 seconds sys
  After:
    5,311,331,047,231 instructions:u
    247.754365865 seconds time elapsed
    984.609408000 seconds user
    12.773107000 seconds sys

Change-Id: Id64d59c2ef0fc63e0549a648406615edd71e43bd

commit: f84f7c47ce8c55f310c307ddc97803e270fb8ec4 [log] [tgz]
author: sdeng <sdeng@google.com> Fri Feb 28 15:30:09 2020 -0800
committer: Sai Deng <sdeng@google.com> Thu Mar 19 17:10:51 2020 +0000
tree: 454c238bd851f28f830cdbca0b0c8ea9d2e47e5d
parent: 922db60359be0a68dfb18dd5ec36d615cfb15c76 [diff]
diff --git a/av1/encoder/tune_vmaf.c b/av1/encoder/tune_vmaf.c
index 79220a6..5aecb4e 100644
--- a/av1/encoder/tune_vmaf.c
+++ b/av1/encoder/tune_vmaf.c

@@ -77,16 +77,14 @@
 // all co-efficients must be even.
 DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 0,  8, 30, 52,
                                                                30, 8, 0,  0 };
-static AOM_INLINE void gaussian_blur(const AV1_COMP *const cpi,
+static AOM_INLINE void gaussian_blur(const int bit_depth,
                                      const YV12_BUFFER_CONFIG *source,
                                      const YV12_BUFFER_CONFIG *dst) {
-  const AV1_COMMON *cm = &cpi->common;
-  const int bit_depth = cpi->td.mb.e_mbd.bd;
   const int block_size = BLOCK_128X128;
-  const int num_mi_w = mi_size_wide[block_size];
-  const int num_mi_h = mi_size_high[block_size];
-  const int num_cols = (cm->mi_cols + num_mi_w - 1) / num_mi_w;
-  const int num_rows = (cm->mi_rows + num_mi_h - 1) / num_mi_h;
+  const int block_w = mi_size_wide[block_size] * 4;
+  const int block_h = mi_size_high[block_size] * 4;
+  const int num_cols = (source->y_width + block_w - 1) / block_w;
+  const int num_rows = (source->y_height + block_h - 1) / block_h;
   int row, col;
 
   ConvolveParams conv_params = get_conv_params(0, 0, bit_depth);
@@ -97,11 +95,8 @@
 
   for (row = 0; row < num_rows; ++row) {
     for (col = 0; col < num_cols; ++col) {
-      const int mi_row = row * num_mi_h;
-      const int mi_col = col * num_mi_w;
-
-      const int row_offset_y = mi_row << 2;
-      const int col_offset_y = mi_col << 2;
+      const int row_offset_y = row * block_h;
+      const int col_offset_y = col * block_w;
 
       uint8_t *src_buf =
           source->y_buffer + row_offset_y * source->y_stride + col_offset_y;
@@ -111,11 +106,11 @@
       if (bit_depth > 8) {
         av1_highbd_convolve_2d_sr(
             CONVERT_TO_SHORTPTR(src_buf), source->y_stride,
-            CONVERT_TO_SHORTPTR(dst_buf), dst->y_stride, num_mi_w << 2,
-            num_mi_h << 2, &filter, &filter, 0, 0, &conv_params, bit_depth);
+            CONVERT_TO_SHORTPTR(dst_buf), dst->y_stride, block_w, block_h,
+            &filter, &filter, 0, 0, &conv_params, bit_depth);
       } else {
         av1_convolve_2d_sr(src_buf, source->y_stride, dst_buf, dst->y_stride,
-                           num_mi_w << 2, num_mi_h << 2, &filter, &filter, 0, 0,
+                           block_w, block_h, &filter, &filter, 0, 0,
                            &conv_params);
       }
     }
@@ -199,6 +194,7 @@
                                   YV12_BUFFER_CONFIG *const source) {
   aom_clear_system_state();
   const AV1_COMMON *const cm = &cpi->common;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
   const int width = source->y_width;
   const int height = source->y_height;
 
@@ -213,7 +209,7 @@
                          cpi->oxcf.border_in_pixels, cm->byte_alignment);
 
   av1_copy_and_extend_frame(source, &source_extended);
-  gaussian_blur(cpi, &source_extended, &blurred);
+  gaussian_blur(bit_depth, &source_extended, &blurred);
   aom_free_frame_buffer(&source_extended);
 
   const double best_frame_unsharp_amount =
@@ -243,19 +239,17 @@
                          cpi->oxcf.border_in_pixels, cm->byte_alignment);
 
   av1_copy_and_extend_frame(source, &source_extended);
-  gaussian_blur(cpi, &source_extended, &blurred);
+  gaussian_blur(bit_depth, &source_extended, &blurred);
   aom_free_frame_buffer(&source_extended);
 
   const double best_frame_unsharp_amount =
       find_best_frame_unsharp_amount(cpi, source, &blurred, 0.0, 0.05, 20);
 
   const int block_size = BLOCK_64X64;
-  const int num_mi_w = mi_size_wide[block_size];
-  const int num_mi_h = mi_size_high[block_size];
-  const int num_cols = (cm->mi_cols + num_mi_w - 1) / num_mi_w;
-  const int num_rows = (cm->mi_rows + num_mi_h - 1) / num_mi_h;
-  const int block_w = num_mi_w << 2;
-  const int block_h = num_mi_h << 2;
+  const int block_w = mi_size_wide[block_size] * 4;
+  const int block_h = mi_size_high[block_size] * 4;
+  const int num_cols = (source->y_width + block_w - 1) / block_w;
+  const int num_rows = (source->y_height + block_h - 1) / block_h;
   double *best_unsharp_amounts =
       aom_malloc(sizeof(*best_unsharp_amounts) * num_cols * num_rows);
   memset(best_unsharp_amounts, 0,
@@ -340,10 +334,8 @@
   // Apply best blur amounts
   for (int row = 0; row < num_rows; ++row) {
     for (int col = 0; col < num_cols; ++col) {
-      const int mi_row = row * num_mi_h;
-      const int mi_col = col * num_mi_w;
-      const int row_offset_y = mi_row << 2;
-      const int col_offset_y = mi_col << 2;
+      const int row_offset_y = row * block_h;
+      const int col_offset_y = col * block_w;
       const int block_width = AOMMIN(source->y_width - col_offset_y, block_w);
       const int block_height = AOMMIN(source->y_height - row_offset_y, block_h);
       const int index = col + row * num_cols;
@@ -463,65 +455,75 @@
 
 void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
-  uint8_t *const y_buffer = cpi->source->y_buffer;
-  const int y_stride = cpi->source->y_stride;
   const int y_width = cpi->source->y_width;
   const int y_height = cpi->source->y_height;
-  const int block_size = BLOCK_64X64;
-
-  const int num_mi_w = mi_size_wide[block_size];
-  const int num_mi_h = mi_size_high[block_size];
-  const int num_cols = (cm->mi_cols + num_mi_w - 1) / num_mi_w;
-  const int num_rows = (cm->mi_rows + num_mi_h - 1) / num_mi_h;
-  const int block_w = num_mi_w << 2;
-  const int block_h = num_mi_h << 2;
+  const int resized_block_size = BLOCK_32X32;
+  const int resize_factor = 2;
   const int bit_depth = cpi->td.mb.e_mbd.bd;
 
   aom_clear_system_state();
-  YV12_BUFFER_CONFIG blurred;
-  memset(&blurred, 0, sizeof(blurred));
-  aom_alloc_frame_buffer(&blurred, y_width, y_height, 1, 1,
+  YV12_BUFFER_CONFIG resized_source;
+  memset(&resized_source, 0, sizeof(resized_source));
+  aom_alloc_frame_buffer(&resized_source, y_width / resize_factor,
+                         y_height / resize_factor, 1, 1,
                          cm->seq_params.use_highbitdepth,
                          cpi->oxcf.border_in_pixels, cm->byte_alignment);
-  gaussian_blur(cpi, cpi->source, &blurred);
+  av1_resize_and_extend_frame(cpi->source, &resized_source, bit_depth,
+                              av1_num_planes(cm));
+
+  const int resized_y_width = resized_source.y_width;
+  const int resized_y_height = resized_source.y_height;
+  const int resized_block_w = mi_size_wide[resized_block_size] * 4;
+  const int resized_block_h = mi_size_high[resized_block_size] * 4;
+  const int num_cols =
+      (resized_y_width + resized_block_w - 1) / resized_block_w;
+  const int num_rows =
+      (resized_y_height + resized_block_h - 1) / resized_block_h;
+
+  YV12_BUFFER_CONFIG blurred;
+  memset(&blurred, 0, sizeof(blurred));
+  aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, 1, 1,
+                         cm->seq_params.use_highbitdepth,
+                         cpi->oxcf.border_in_pixels, cm->byte_alignment);
+  gaussian_blur(bit_depth, &resized_source, &blurred);
 
   double *scores = aom_malloc(sizeof(*scores) * (num_rows * num_cols));
   memset(scores, 0, sizeof(*scores) * (num_rows * num_cols));
   FrameData frame_data;
-  frame_data.source = cpi->source;
+  frame_data.source = &resized_source;
   frame_data.blurred = &blurred;
-  frame_data.block_w = block_w;
-  frame_data.block_h = block_h;
+  frame_data.block_w = resized_block_w;
+  frame_data.block_h = resized_block_h;
   frame_data.num_rows = num_rows;
   frame_data.num_cols = num_cols;
   frame_data.row = 0;
   frame_data.col = 0;
   frame_data.bit_depth = bit_depth;
   aom_calc_vmaf_multi_frame(&frame_data, cpi->oxcf.vmaf_model_path,
-                            update_frame, y_width, y_height, bit_depth, scores);
+                            update_frame, resized_y_width, resized_y_height,
+                            bit_depth, scores);
 
   // Loop through each 'block_size' block.
   for (int row = 0; row < num_rows; ++row) {
     for (int col = 0; col < num_cols; ++col) {
-      const int mi_row = row * num_mi_h;
-      const int mi_col = col * num_mi_w;
       const int index = row * num_cols + col;
-      const int row_offset_y = mi_row << 2;
-      const int col_offset_y = mi_col << 2;
+      const int row_offset_y = row * resized_block_h;
+      const int col_offset_y = col * resized_block_w;
 
-      uint8_t *const orig_buf =
-          y_buffer + row_offset_y * y_stride + col_offset_y;
+      uint8_t *const orig_buf = resized_source.y_buffer +
+                                row_offset_y * resized_source.y_stride +
+                                col_offset_y;
       uint8_t *const blurred_buf =
           blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
 
       const double vmaf = scores[index];
       const double dvmaf = kBaselineVmaf - vmaf;
       unsigned int sse;
-      cpi->fn_ptr[block_size].vf(orig_buf, y_stride, blurred_buf,
-                                 blurred.y_stride, &sse);
+      cpi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride,
+                                         blurred_buf, blurred.y_stride, &sse);
 
-      const double mse = (double)sse / (double)(y_width * y_height);
-
+      const double mse =
+          (double)sse / (double)(resized_y_width * resized_y_height);
       double weight;
       const double eps = 0.01 / (num_rows * num_cols);
       if (dvmaf < eps || mse < eps) {
@@ -536,6 +538,7 @@
     }
   }
 
+  aom_free_frame_buffer(&resized_source);
   aom_free_frame_buffer(&blurred);
   aom_free(scores);
   aom_clear_system_state();
@@ -635,12 +638,11 @@
                          cm->seq_params.use_highbitdepth,
                          cpi->oxcf.border_in_pixels, cm->byte_alignment);
 
-  gaussian_blur(cpi, cur, &blurred_cur);
-  gaussian_blur(cpi, last, &blurred_last);
-  if (next) gaussian_blur(cpi, next, &blurred_next);
+  gaussian_blur(bit_depth, cur, &blurred_cur);
+  gaussian_blur(bit_depth, last, &blurred_last);
+  if (next) gaussian_blur(bit_depth, next, &blurred_next);
 
   double motion1, motion2 = 65536.0;
-
   if (bit_depth > 8) {
     const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8));
     motion1 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer),
commit	f84f7c47ce8c55f310c307ddc97803e270fb8ec4	[log] [tgz]
author	sdeng <sdeng@google.com>	Fri Feb 28 15:30:09 2020 -0800
committer	Sai Deng <sdeng@google.com>	Thu Mar 19 17:10:51 2020 +0000
tree	454c238bd851f28f830cdbca0b0c8ea9d2e47e5d
parent	922db60359be0a68dfb18dd5ec36d615cfb15c76 [diff]