Add block based VMAF preprocessing

Compared to frame-based preprocessing, the additional VMAF gains are:
150f/SP1/VBR
ugc360p -2.38%
midres  -1.21%
hdres(95% done) -5.37%

However, because the VMAF computations are slow, this method is not
enabled at the moment.

Change-Id: I281a79f24e2a68afadabae1de46e38d6d522bc7b
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 00da360..ac4cfcf 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -6242,7 +6242,7 @@
 #if CONFIG_TUNE_VMAF
   if (!is_stat_generation_stage(cpi) &&
       cpi->oxcf.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING) {
-    av1_vmaf_preprocessing(cpi, sd);
+    av1_vmaf_preprocessing(cpi, sd, false);
   }
 #endif
 
diff --git a/av1/encoder/tune_vmaf.c b/av1/encoder/tune_vmaf.c
index df37884..4bd086b 100644
--- a/av1/encoder/tune_vmaf.c
+++ b/av1/encoder/tune_vmaf.c
@@ -16,22 +16,28 @@
 #include "av1/encoder/extend.h"
 
 // TODO(sdeng): Add the SIMD implementation.
+static AOM_INLINE void unsharp_rect(const uint8_t *source, int source_stride,
+                                    const uint8_t *blurred, int blurred_stride,
+                                    uint8_t *dst, int dst_stride, int w, int h,
+                                    double amount) {
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; ++j) {
+      const double val =
+          (double)source[j] + amount * ((double)source[j] - (double)blurred[j]);
+      dst[j] = (uint8_t)clamp((int)(val + 0.5), 0, 255);
+    }
+    source += source_stride;
+    blurred += blurred_stride;
+    dst += dst_stride;
+  }
+}
+
 static AOM_INLINE void unsharp(const YV12_BUFFER_CONFIG *source,
                                const YV12_BUFFER_CONFIG *blurred,
                                const YV12_BUFFER_CONFIG *dst, double amount) {
-  uint8_t *src = source->y_buffer;
-  uint8_t *blur = blurred->y_buffer;
-  uint8_t *dstbuf = dst->y_buffer;
-  for (int i = 0; i < source->y_height; ++i) {
-    for (int j = 0; j < source->y_width; ++j) {
-      const double val =
-          (double)src[j] + amount * ((double)src[j] - (double)blur[j]);
-      dstbuf[j] = (uint8_t)clamp((int)(val + 0.5), 0, 255);
-    }
-    src += source->y_stride;
-    blur += blurred->y_stride;
-    dstbuf += dst->y_stride;
-  }
+  unsharp_rect(source->y_buffer, source->y_stride, blurred->y_buffer,
+               blurred->y_stride, dst->y_buffer, dst->y_stride, source->y_width,
+               source->y_height, amount);
 }
 
 // 8-tap Gaussian convolution filter with sigma = 1.0, sums to 128,
@@ -88,37 +94,19 @@
   }
 }
 
-void av1_vmaf_preprocessing(const AV1_COMP *cpi, YV12_BUFFER_CONFIG *source) {
-  const int use_hbd = source->flags & YV12_FLAG_HIGHBITDEPTH;
-  // TODO(sdeng): Add high bit depth support.
-  if (use_hbd) {
-    printf(
-        "VMAF preprocessing for high bit depth videos is unsupported yet.\n");
-    exit(0);
-  }
-
-  aom_clear_system_state();
+static double find_best_frame_unsharp_amount(
+    const AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const source,
+    YV12_BUFFER_CONFIG *const blurred) {
   const AV1_COMMON *const cm = &cpi->common;
   const int width = source->y_width;
   const int height = source->y_height;
 
-  YV12_BUFFER_CONFIG source_extended, blurred, sharpened;
-  memset(&source_extended, 0, sizeof(source_extended));
-  memset(&blurred, 0, sizeof(blurred));
+  YV12_BUFFER_CONFIG sharpened;
   memset(&sharpened, 0, sizeof(sharpened));
-  aom_alloc_frame_buffer(&source_extended, width, height, 1, 1,
-                         cm->seq_params.use_highbitdepth,
-                         cpi->oxcf.border_in_pixels, cm->byte_alignment);
-  aom_alloc_frame_buffer(&blurred, width, height, 1, 1,
-                         cm->seq_params.use_highbitdepth,
-                         cpi->oxcf.border_in_pixels, cm->byte_alignment);
   aom_alloc_frame_buffer(&sharpened, width, height, 1, 1,
                          cm->seq_params.use_highbitdepth,
                          cpi->oxcf.border_in_pixels, cm->byte_alignment);
 
-  av1_copy_and_extend_frame(source, &source_extended);
-  gaussian_blur(cpi, &source_extended, &blurred);
-
   double unsharp_amount = 0.0;
   const double step_size = 0.05;
   const double max_vmaf_score = 100.0;
@@ -135,7 +123,7 @@
   const int max_loop_count = 20;
   while (!exit_loop) {
     unsharp_amount += step_size;
-    unsharp(source, &blurred, &sharpened, unsharp_amount);
+    unsharp(source, blurred, &sharpened, unsharp_amount);
     double new_vmaf;
     aom_calc_vmaf(cpi->oxcf.vmaf_model_path, source, &sharpened, &new_vmaf);
     if (new_vmaf < best_vmaf || loop_count == max_loop_count) {
@@ -149,14 +137,146 @@
     loop_count++;
   }
 
+  aom_free_frame_buffer(&sharpened);
+
   unsharp_amount -= step_size;
   if (best_unsharp_amount_begin >= 0.0) {
     unsharp_amount = (unsharp_amount + best_unsharp_amount_begin) / 2.0;
   }
-  unsharp(source, &blurred, source, unsharp_amount);
 
+  return unsharp_amount;
+}
+
+void av1_vmaf_preprocessing(const AV1_COMP *const cpi,
+                            YV12_BUFFER_CONFIG *const source,
+                            bool use_block_based_method) {
+  const int use_hbd = source->flags & YV12_FLAG_HIGHBITDEPTH;
+  // TODO(sdeng): Add high bit depth support.
+  if (use_hbd) {
+    printf(
+        "VMAF preprocessing for high bit depth videos is unsupported yet.\n");
+    exit(0);
+  }
+
+  aom_clear_system_state();
+  const AV1_COMMON *const cm = &cpi->common;
+  const int width = source->y_width;
+  const int height = source->y_height;
+  YV12_BUFFER_CONFIG source_extended, blurred, sharpened;
+  memset(&source_extended, 0, sizeof(source_extended));
+  memset(&blurred, 0, sizeof(blurred));
+  memset(&sharpened, 0, sizeof(sharpened));
+  aom_alloc_frame_buffer(&source_extended, width, height, 1, 1,
+                         cm->seq_params.use_highbitdepth,
+                         cpi->oxcf.border_in_pixels, cm->byte_alignment);
+  aom_alloc_frame_buffer(&blurred, width, height, 1, 1,
+                         cm->seq_params.use_highbitdepth,
+                         cpi->oxcf.border_in_pixels, cm->byte_alignment);
+  aom_alloc_frame_buffer(&sharpened, width, height, 1, 1,
+                         cm->seq_params.use_highbitdepth,
+                         cpi->oxcf.border_in_pixels, cm->byte_alignment);
+
+  av1_copy_and_extend_frame(source, &source_extended);
+  av1_copy_and_extend_frame(source, &sharpened);
+
+  gaussian_blur(cpi, &source_extended, &blurred);
   aom_free_frame_buffer(&source_extended);
+  const double best_frame_unsharp_amount =
+      find_best_frame_unsharp_amount(cpi, source, &blurred);
+
+  if (!use_block_based_method) {
+    unsharp(source, &blurred, source, best_frame_unsharp_amount);
+    aom_free_frame_buffer(&sharpened);
+    aom_free_frame_buffer(&blurred);
+    aom_clear_system_state();
+    return;
+  }
+
+  const int block_size = BLOCK_128X128;
+  const int num_mi_w = mi_size_wide[block_size];
+  const int num_mi_h = mi_size_high[block_size];
+  const int num_cols = (cm->mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (cm->mi_rows + num_mi_h - 1) / num_mi_h;
+  const int block_w = num_mi_w << 2;
+  const int block_h = num_mi_h << 2;
+  double *best_unsharp_amounts =
+      aom_malloc(sizeof(*best_unsharp_amounts) * num_cols * num_rows);
+  memset(best_unsharp_amounts, 0,
+         sizeof(*best_unsharp_amounts) * num_cols * num_rows);
+
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int mi_row = row * num_mi_h;
+      const int mi_col = col * num_mi_w;
+
+      const int row_offset_y = mi_row << 2;
+      const int col_offset_y = mi_col << 2;
+
+      const int block_width = AOMMIN(source->y_width - col_offset_y, block_w);
+      const int block_height = AOMMIN(source->y_height - row_offset_y, block_h);
+
+      uint8_t *src_buf =
+          source->y_buffer + row_offset_y * source->y_stride + col_offset_y;
+      uint8_t *blurred_buf =
+          blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
+      uint8_t *dst_buf =
+          sharpened.y_buffer + row_offset_y * sharpened.y_stride + col_offset_y;
+
+      const int index = col + row * num_cols;
+      const double step_size = 0.1;
+      double amount = AOMMAX(best_frame_unsharp_amount - 0.2, step_size);
+      unsharp_rect(src_buf, source->y_stride, blurred_buf, blurred.y_stride,
+                   dst_buf, sharpened.y_stride, block_width, block_height,
+                   amount);
+      double best_vmaf;
+      aom_calc_vmaf(cpi->oxcf.vmaf_model_path, source, &sharpened, &best_vmaf);
+
+      // Find the best unsharp amount.
+      bool exit_loop = false;
+      while (!exit_loop && amount < best_frame_unsharp_amount + 0.2) {
+        amount += step_size;
+        unsharp_rect(src_buf, source->y_stride, blurred_buf, blurred.y_stride,
+                     dst_buf, sharpened.y_stride, block_width, block_height,
+                     amount);
+
+        double new_vmaf;
+        aom_calc_vmaf(cpi->oxcf.vmaf_model_path, source, &sharpened, &new_vmaf);
+        if (new_vmaf <= best_vmaf) {
+          exit_loop = true;
+          amount -= step_size;
+        } else {
+          best_vmaf = new_vmaf;
+        }
+      }
+      best_unsharp_amounts[index] = amount;
+      // Reset this block of the sharpened frame to the source pixels.
+      unsharp_rect(src_buf, source->y_stride, blurred_buf, blurred.y_stride,
+                   dst_buf, sharpened.y_stride, block_width, block_height, 0.0);
+    }
+  }
+
+  // Apply the best per-block unsharp amounts to the source frame.
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int mi_row = row * num_mi_h;
+      const int mi_col = col * num_mi_w;
+      const int row_offset_y = mi_row << 2;
+      const int col_offset_y = mi_col << 2;
+      const int block_width = AOMMIN(source->y_width - col_offset_y, block_w);
+      const int block_height = AOMMIN(source->y_height - row_offset_y, block_h);
+      const int index = col + row * num_cols;
+      uint8_t *src_buf =
+          source->y_buffer + row_offset_y * source->y_stride + col_offset_y;
+      uint8_t *blurred_buf =
+          blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
+      unsharp_rect(src_buf, source->y_stride, blurred_buf, blurred.y_stride,
+                   src_buf, source->y_stride, block_width, block_height,
+                   best_unsharp_amounts[index]);
+    }
+  }
+
   aom_free_frame_buffer(&sharpened);
   aom_free_frame_buffer(&blurred);
+  aom_free(best_unsharp_amounts);
   aom_clear_system_state();
 }
diff --git a/av1/encoder/tune_vmaf.h b/av1/encoder/tune_vmaf.h
index 0baa588..27955a0 100644
--- a/av1/encoder/tune_vmaf.h
+++ b/av1/encoder/tune_vmaf.h
@@ -15,6 +15,7 @@
 #include "aom_scale/yv12config.h"
 #include "av1/encoder/encoder.h"
 
-void av1_vmaf_preprocessing(const AV1_COMP *cpi, YV12_BUFFER_CONFIG *source);
+void av1_vmaf_preprocessing(const AV1_COMP *cpi, YV12_BUFFER_CONFIG *source,
+                            bool use_block_based_method);
 
 #endif  // AOM_AV1_ENCODER_TUNE_VMAF_H_