Speed up VMAF pre-processing

Use the previous frame's best unsharp filter strength as the initial
guess when searching for the current frame's filter strength.
VMAF BD-rate changes
--------------------
MODE        midres   hdres  lowres_10bd  midres_10bd
VMAF        -0.50%   0.06%    -1.32%        0.01%
VMAF_W_PP   -1.05%  -0.35%    -0.62%       -0.70%


Encoding cost (instruction count, relative to PSNR) of different modes
----------------------------------------------------------------------
MODE                 480P       1080P
PSNR (baseline)      1.00x      1.00x
VMAF (before)        1.50x      4.14x
VMAF (after)         1.49x      3.87x
VMAF_W_PP (before)   1.21x      1.45x
VMAF_W_PP (after)    1.16x      1.15x
VMAF_WO_PP           1.19x      3.36x

Full speed test results
-----------------------
Performance counter stats for './aomenc red_kayak_480p.y4m --limit=30
 -o output_new --tune=psnr --cpu-used=1':
   642,437,621,503      instructions:u
      84.889688323 seconds time elapsed
      84.588830000 seconds user
       0.291975000 seconds sys

Performance counter stats for './aomenc red_kayak_480p.y4m --limit=30
 -o output_new --tune=vmaf_with_preprocessing --cpu-used=1':
Before:
   775,025,691,607      instructions:u
     109.458205579 seconds time elapsed
     107.989239000 seconds user
       4.284554000 seconds sys
After:
   747,907,904,154      instructions:u
     105.304168262 seconds time elapsed
     104.205238000 seconds user
       3.659274000 seconds sys

Performance counter stats for './aomenc red_kayak_480p.y4m --limit=30
 -o output_new --tune=vmaf --cpu-used=1':
Before:
   964,828,402,519      instructions:u
     123.807091424 seconds time elapsed
     161.892974000 seconds user
      42.540103000 seconds sys
After:
   958,203,027,249      instructions:u
     166.730655440 seconds time elapsed
     172.417289000 seconds user
      96.809565000 seconds sys

Performance counter stats for './aomenc red_kayak_480p.y4m --limit=30
 -o output_new --tune=vmaf_without_preprocessing --cpu-used=1':
   762,965,561,826      instructions:u
      79.566220976 seconds time elapsed
     118.461060000 seconds user
       4.023039000 seconds sys
------------------
Performance counter stats for './aomenc basketballdrive_1080p50.y4m
 --limit=30 -o output_new --tune=psnr --cpu-used=1':
 1,582,885,665,086      instructions:u
     204.687949181 seconds time elapsed
     203.775777000 seconds user
       0.811887000 seconds sys

Performance counter stats for './aomenc basketballdrive_1080p50.y4m
 --limit=30 -o output_new --tune=vmaf_with_preprocessing
 --cpu-used=1':
Before:
 2,298,587,912,062      instructions:u
     361.306249556 seconds time elapsed
     326.457704000 seconds user
      41.029086000 seconds sys
After:
 1,821,777,148,459      instructions:u
     237.958643869 seconds time elapsed
     233.000579000 seconds user
       6.904995000 seconds sys

Performance counter stats for './aomenc basketballdrive_1080p50.y4m
 --limit=30 -o output_new --tune=vmaf --cpu-used=1':
Before:
 6,543,459,539,854      instructions:u
     553.099946250 seconds time elapsed
    1257.052953000 seconds user
     250.151702000 seconds sys
After:
 6,118,499,090,110      instructions:u
     610.973700082 seconds time elapsed
    1254.925749000 seconds user
     439.220760000 seconds sys

Performance counter stats for './aomenc basketballdrive_1080p50.y4m
 --limit=30 -o output_new --tune=vmaf_without_preprocessing
 --cpu-used=1':
 5,311,331,047,231      instructions:u
     247.754365865 seconds time elapsed
     984.609408000 seconds user
      12.773107000 seconds sys

Change-Id: Ifdcfbccdd976ba537f23cbbfebe144402fa032c6
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 2aa9a3e..ee104e6 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -3194,6 +3194,7 @@
     CHECK_MEM_ERROR(cm, cpi->vmaf_rdmult_scaling_factors,
                     aom_calloc(num_rows * num_cols,
                                sizeof(*cpi->vmaf_rdmult_scaling_factors)));
+    cpi->last_frame_unsharp_amount = 0.0;
   }
 #endif
 
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 29e8f68..63c3b38 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -1131,6 +1131,7 @@
   double *vmaf_rdmult_scaling_factors;
   double last_frame_ysse;
   double last_frame_vmaf;
+  double last_frame_unsharp_amount;
 #endif
 
   int use_svc;
diff --git a/av1/encoder/tune_vmaf.c b/av1/encoder/tune_vmaf.c
index 6a5ea75..72fd846 100644
--- a/av1/encoder/tune_vmaf.c
+++ b/av1/encoder/tune_vmaf.c
@@ -152,16 +152,51 @@
   return var;
 }
 
+static double cal_approx_vmaf(const AV1_COMP *const cpi, double source_variance,
+                              YV12_BUFFER_CONFIG *const source,
+                              YV12_BUFFER_CONFIG *const sharpened) {
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  double new_vmaf;
+  aom_calc_vmaf(cpi->oxcf.vmaf_model_path, source, sharpened, bit_depth,
+                &new_vmaf);
+  const double sharpened_var = frame_average_variance(cpi, sharpened);
+  return source_variance / sharpened_var * (new_vmaf - kBaselineVmaf);
+}
+
+static double find_best_frame_unsharp_amount_loop(
+    const AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const source,
+    YV12_BUFFER_CONFIG *const blurred, YV12_BUFFER_CONFIG *const sharpened,
+    double best_vmaf, const double baseline_variance,
+    const double unsharp_amount_start, const double step_size,
+    const int max_loop_count, const double max_amount) {
+  const double min_amount = 0.0;
+  int loop_count = 0;
+  double approx_vmaf = best_vmaf;
+  double unsharp_amount = unsharp_amount_start;
+  do {
+    best_vmaf = approx_vmaf;
+    unsharp_amount += step_size;
+    if (unsharp_amount > max_amount || unsharp_amount < min_amount) break;
+    unsharp(cpi, source, blurred, sharpened, unsharp_amount);
+    approx_vmaf = cal_approx_vmaf(cpi, baseline_variance, source, sharpened);
+
+    loop_count++;
+  } while (approx_vmaf > best_vmaf && loop_count < max_loop_count);
+  unsharp_amount =
+      approx_vmaf > best_vmaf ? unsharp_amount : unsharp_amount - step_size;
+  return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount));
+}
+
 static double find_best_frame_unsharp_amount(const AV1_COMP *const cpi,
                                              YV12_BUFFER_CONFIG *const source,
                                              YV12_BUFFER_CONFIG *const blurred,
                                              const double unsharp_amount_start,
                                              const double step_size,
-                                             const int max_loop_count) {
+                                             const int max_loop_count,
+                                             const double max_filter_amount) {
   const AV1_COMMON *const cm = &cpi->common;
   const int width = source->y_width;
   const int height = source->y_height;
-  const int bit_depth = cpi->td.mb.e_mbd.bd;
 
   YV12_BUFFER_CONFIG sharpened;
   memset(&sharpened, 0, sizeof(sharpened));
@@ -170,27 +205,36 @@
                          cpi->oxcf.border_in_pixels, cm->byte_alignment);
 
   const double baseline_variance = frame_average_variance(cpi, source);
-  int loop_count = 0;
-  double approx_vmaf = 0.0;
-  double best_vmaf, new_vmaf, unsharp_amount = unsharp_amount_start;
-  do {
-    best_vmaf = approx_vmaf;
-    unsharp_amount += step_size;
-    unsharp(cpi, source, blurred, &sharpened, unsharp_amount);
-    aom_calc_vmaf(cpi->oxcf.vmaf_model_path, source, &sharpened, bit_depth,
-                  &new_vmaf);
-    const double sharpened_var = frame_average_variance(cpi, &sharpened);
-    approx_vmaf =
-        baseline_variance / sharpened_var * (new_vmaf - kBaselineVmaf);
-
-    loop_count++;
-  } while (approx_vmaf > best_vmaf && loop_count < max_loop_count);
+  double unsharp_amount;
+  if (unsharp_amount_start <= step_size) {
+    unsharp_amount = find_best_frame_unsharp_amount_loop(
+        cpi, source, blurred, &sharpened, 0.0, baseline_variance, 0.0,
+        step_size, max_loop_count, max_filter_amount);
+  } else {
+    double a0 = unsharp_amount_start - step_size, a1 = unsharp_amount_start;
+    double v0, v1;
+    unsharp(cpi, source, blurred, &sharpened, a0);
+    v0 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened);
+    unsharp(cpi, source, blurred, &sharpened, a1);
+    v1 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened);
+    if (fabs(v0 - v1) < 0.01) {
+      unsharp_amount = a0;
+    } else if (v0 > v1) {
+      unsharp_amount = find_best_frame_unsharp_amount_loop(
+          cpi, source, blurred, &sharpened, v0, baseline_variance, a0,
+          -step_size, max_loop_count, max_filter_amount);
+    } else {
+      unsharp_amount = find_best_frame_unsharp_amount_loop(
+          cpi, source, blurred, &sharpened, v1, baseline_variance, a1,
+          step_size, max_loop_count, max_filter_amount);
+    }
+  }
 
   aom_free_frame_buffer(&sharpened);
-  return approx_vmaf > best_vmaf ? unsharp_amount : unsharp_amount - step_size;
+  return unsharp_amount;
 }
 
-void av1_vmaf_frame_preprocessing(const AV1_COMP *const cpi,
+void av1_vmaf_frame_preprocessing(AV1_COMP *const cpi,
                                   YV12_BUFFER_CONFIG *const source) {
   aom_clear_system_state();
   const AV1_COMMON *const cm = &cpi->common;
@@ -212,15 +256,16 @@
   gaussian_blur(bit_depth, &source_extended, &blurred);
   aom_free_frame_buffer(&source_extended);
 
-  const double best_frame_unsharp_amount =
-      find_best_frame_unsharp_amount(cpi, source, &blurred, 0.0, 0.05, 20);
+  const double best_frame_unsharp_amount = find_best_frame_unsharp_amount(
+      cpi, source, &blurred, cpi->last_frame_unsharp_amount, 0.05, 20, 1.01);
+  cpi->last_frame_unsharp_amount = best_frame_unsharp_amount;
 
   unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount);
   aom_free_frame_buffer(&blurred);
   aom_clear_system_state();
 }
 
-void av1_vmaf_blk_preprocessing(const AV1_COMP *const cpi,
+void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi,
                                 YV12_BUFFER_CONFIG *const source) {
   aom_clear_system_state();
   const AV1_COMMON *const cm = &cpi->common;
@@ -242,8 +287,9 @@
   gaussian_blur(bit_depth, &source_extended, &blurred);
   aom_free_frame_buffer(&source_extended);
 
-  const double best_frame_unsharp_amount =
-      find_best_frame_unsharp_amount(cpi, source, &blurred, 0.0, 0.05, 20);
+  const double best_frame_unsharp_amount = find_best_frame_unsharp_amount(
+      cpi, source, &blurred, cpi->last_frame_unsharp_amount, 0.05, 20, 1.01);
+  cpi->last_frame_unsharp_amount = best_frame_unsharp_amount;
 
   const int block_size = BLOCK_64X64;
   const int block_w = mi_size_wide[block_size] * 4;
@@ -325,9 +371,9 @@
         }
       }
 
-      const double amount_start = AOMMAX(best_frame_unsharp_amount - 0.2, 0.0);
       best_unsharp_amounts[index] = find_best_frame_unsharp_amount(
-          cpi, &source_block, &blurred_block, amount_start, 0.1, 5);
+          cpi, &source_block, &blurred_block, best_frame_unsharp_amount, 0.1, 3,
+          1.5);
     }
   }
 
diff --git a/av1/encoder/tune_vmaf.h b/av1/encoder/tune_vmaf.h
index 5ed14b7..c4cf072 100644
--- a/av1/encoder/tune_vmaf.h
+++ b/av1/encoder/tune_vmaf.h
@@ -15,11 +15,9 @@
 #include "aom_scale/yv12config.h"
 #include "av1/encoder/encoder.h"
 
-void av1_vmaf_blk_preprocessing(const AV1_COMP *cpi,
-                                YV12_BUFFER_CONFIG *source);
+void av1_vmaf_blk_preprocessing(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source);
 
-void av1_vmaf_frame_preprocessing(const AV1_COMP *cpi,
-                                  YV12_BUFFER_CONFIG *source);
+void av1_vmaf_frame_preprocessing(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source);
 
 void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi);