Implement new non-local means algorithm.

Apply the new non-local means filtering for 480p and above.

An experiment flag is set for easy maintainance and modification.

The most recent performance tests on 60 frames show:

Vbr midres: avg_psnr	ovr_psnr	 ssim           vmaf
Spd0	    -0.133	-0.280		-0.144		-0.218
Spd1	    -0.236	-0.377		-0.245		-0.440
Spd2	    -0.273	-0.415		-0.299		-0.436

Vbr hdres:  avg_psnr	ovr_psnr	 ssim		vmaf
Spd0	    -0.235	-0.445		-0.302		-0.768
Spd1	    -0.347	-0.565		-0.437		-0.870
Spd2	    -0.416	-0.630		-0.488		-0.827

q midres:   avg_psnr	ovr_psnr	 ssim		vmaf
Spd0	    -0.077	-0.197		-0.118		-0.253
Spd1	    -0.084	-0.196		-0.141		-0.262
Spd2	    -0.107	-0.224		-0.107		-0.188

q hdres:    avg_psnr	ovr_psnr	 ssim		vmaf
Spd0	     0.107	-0.092		-0.123		-0.878
Spd1	     0.003	-0.198		-0.219		-1.003
Spd2	    -0.015	-0.216		-0.187		-0.901

Test head is 23bc1ee51f81334fef4189d1c6e3156a36407b5c.

STATS_CHANGED

Change-Id: Ia8cd2e31d75a3cfcef607f454ee873d812bde323
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 54127bc..edca3e2 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -34,6 +34,11 @@
 #include "aom_ports/aom_timer.h"
 #include "aom_scale/aom_scale.h"
 
+#define EXPERIMENT_TEMPORAL_FILTER 1
+#define WINDOW_LENGTH 2
+#define WINDOW_SIZE 25
+#define SCALE 1000
+
 #define EDGE_THRESHOLD 50
 #define SQRT_PI_BY_2 1.25331413732
 
@@ -144,7 +149,12 @@
                                        unsigned int block_height,
                                        int filter_weight, uint32_t *accumulator,
                                        uint16_t *count) {
+#if EXPERIMENT_TEMPORAL_FILTER
+  (void)filter_weight;
+  const int modifier = SCALE;
+#else
   const int modifier = filter_weight * 16;
+#endif  // EXPERIMENT_TEMPORAL_FILTER
   unsigned int i, j, k = 0;
   assert(filter_weight == 2);
 
@@ -162,7 +172,12 @@
     const uint8_t *pred8, int buf_stride, unsigned int block_width,
     unsigned int block_height, int filter_weight, uint32_t *accumulator,
     uint16_t *count) {
+#if EXPERIMENT_TEMPORAL_FILTER
+  (void)filter_weight;
+  const int modifier = SCALE;
+#else
   const int modifier = filter_weight * 16;
+#endif  // EXPERIMENT_TEMPORAL_FILTER
   const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   unsigned int i, j, k = 0;
   assert(filter_weight == 2);
@@ -655,6 +670,204 @@
   }
 }
 
+#if EXPERIMENT_TEMPORAL_FILTER
+void av1_temporal_filter_plane_c(uint8_t *frame1, unsigned int stride,
+                                 uint8_t *frame2, unsigned int stride2,
+                                 int block_width, int block_height,
+                                 int strength, double sigma, int decay_control,
+                                 const int *blk_fw, int use_32x32,
+                                 unsigned int *accumulator, uint16_t *count) {
+  (void)strength;
+  (void)blk_fw;
+  (void)use_32x32;
+  const double decay = decay_control * exp(1 - sigma);
+  const double h = decay * sigma;
+  const double beta = 1.0;
+  for (int i = 0, k = 0; i < block_height; i++) {
+    for (int j = 0; j < block_width; j++, k++) {
+      const int pixel_value = frame2[i * stride2 + j];
+
+      int diff_sse = 0;
+      for (int idy = -WINDOW_LENGTH; idy <= WINDOW_LENGTH; ++idy) {
+        for (int idx = -WINDOW_LENGTH; idx <= WINDOW_LENGTH; ++idx) {
+          int row = i + idy;
+          int col = j + idx;
+          if (row < 0) row = 0;
+          if (row >= block_height) row = block_height - 1;
+          if (col < 0) col = 0;
+          if (col >= block_width) col = block_width - 1;
+
+          int diff = frame1[row * (int)stride + col] -
+                     frame2[row * (int)stride2 + col];
+          diff_sse += diff * diff;
+        }
+      }
+      diff_sse /= WINDOW_SIZE;
+
+      double w = exp(-diff_sse / (2 * beta * h * h));
+      const int weight = (int)(w * SCALE);
+
+      count[k] += weight;
+      accumulator[k] += weight * pixel_value;
+    }
+  }
+}
+
+void av1_highbd_temporal_filter_plane_c(
+    uint8_t *frame1_8bit, unsigned int stride, uint8_t *frame2_8bit,
+    unsigned int stride2, int block_width, int block_height, int strength,
+    double sigma, int decay_control, const int *blk_fw, int use_32x32,
+    unsigned int *accumulator, uint16_t *count) {
+  (void)strength;
+  (void)blk_fw;
+  (void)use_32x32;
+  uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8bit);
+  uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8bit);
+  const double decay = decay_control * exp(1 - sigma);
+  const double h = decay * sigma;
+  const double beta = 1.0;
+  for (int i = 0, k = 0; i < block_height; i++) {
+    for (int j = 0; j < block_width; j++, k++) {
+      const int pixel_value = frame2[i * stride2 + j];
+
+      int diff_sse = 0;
+      for (int idy = -WINDOW_LENGTH; idy <= WINDOW_LENGTH; ++idy) {
+        for (int idx = -WINDOW_LENGTH; idx <= WINDOW_LENGTH; ++idx) {
+          int row = i + idy;
+          int col = j + idx;
+          if (row < 0) row = 0;
+          if (row >= block_height) row = block_height - 1;
+          if (col < 0) col = 0;
+          if (col >= block_width) col = block_width - 1;
+
+          int diff = frame1[row * (int)stride + col] -
+                     frame2[row * (int)stride2 + col];
+          diff_sse += diff * diff;
+        }
+      }
+      diff_sse /= WINDOW_SIZE;
+
+      double w = exp(-diff_sse / (2 * beta * h * h));
+      const int weight = (int)(w * SCALE);
+
+      count[k] += weight;
+      accumulator[k] += weight * pixel_value;
+    }
+  }
+}
+
+void apply_temporal_filter_block(YV12_BUFFER_CONFIG *frame, MACROBLOCKD *mbd,
+                                 int mb_y_src_offset, int mb_uv_src_offset,
+                                 int mb_uv_width, int mb_uv_height,
+                                 int num_planes, uint8_t *predictor,
+                                 int frame_height, int strength, double sigma,
+                                 int *blk_fw, int use_32x32,
+                                 unsigned int *accumulator, uint16_t *count) {
+  const int is_hbd = is_cur_buf_hbd(mbd);
+  // High bitdepth
+  if (is_hbd) {
+    if (frame_height >= 480) {
+      // Apply frame size dependent non-local means filtering.
+      int decay_control;
+      // The decay is obtained empirically, subject to better tuning.
+      if (frame_height >= 720) {
+        decay_control = 7;
+      } else if (frame_height >= 480) {
+        decay_control = 5;
+      } else {
+        decay_control = 3;
+      }
+      av1_highbd_temporal_filter_plane_c(frame->y_buffer + mb_y_src_offset,
+                                         frame->y_stride, predictor, BW, BW, BH,
+                                         strength, sigma, decay_control, blk_fw,
+                                         use_32x32, accumulator, count);
+      if (num_planes > 1) {
+        av1_highbd_temporal_filter_plane_c(
+            frame->u_buffer + mb_uv_src_offset, frame->uv_stride,
+            predictor + BLK_PELS, mb_uv_width, mb_uv_width, mb_uv_height,
+            strength, sigma, decay_control, blk_fw, use_32x32,
+            accumulator + BLK_PELS, count + BLK_PELS);
+        av1_highbd_temporal_filter_plane_c(
+            frame->v_buffer + mb_uv_src_offset, frame->uv_stride,
+            predictor + (BLK_PELS << 1), mb_uv_width, mb_uv_width, mb_uv_height,
+            strength, sigma, decay_control, blk_fw, use_32x32,
+            accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1));
+      }
+    } else {
+      // Apply original non-local means filtering for small resolution
+      const int adj_strength = strength + 2 * (mbd->bd - 8);
+      if (num_planes <= 1) {
+        // Single plane case
+        av1_highbd_temporal_filter_apply_c(
+            frame->y_buffer + mb_y_src_offset, frame->y_stride, predictor, BW,
+            BH, adj_strength, blk_fw, use_32x32, accumulator, count);
+      } else {
+        // Process 3 planes together.
+        av1_highbd_apply_temporal_filter(
+            frame->y_buffer + mb_y_src_offset, frame->y_stride, predictor, BW,
+            frame->u_buffer + mb_uv_src_offset,
+            frame->v_buffer + mb_uv_src_offset, frame->uv_stride,
+            predictor + BLK_PELS, predictor + (BLK_PELS << 1), mb_uv_width, BW,
+            BH, mbd->plane[1].subsampling_x, mbd->plane[1].subsampling_y,
+            adj_strength, blk_fw, use_32x32, accumulator, count,
+            accumulator + BLK_PELS, count + BLK_PELS,
+            accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1));
+      }
+    }
+    return;
+  }
+
+  // Low bitdepth
+  if (frame_height >= 480) {
+    // Apply frame size dependent non-local means filtering.
+    int decay_control;
+    // The decay is obtained empirically, subject to better tuning.
+    if (frame_height >= 720) {
+      decay_control = 7;
+    } else if (frame_height >= 480) {
+      decay_control = 5;
+    } else {
+      decay_control = 3;
+    }
+    av1_temporal_filter_plane_c(frame->y_buffer + mb_y_src_offset,
+                                frame->y_stride, predictor, BW, BW, BH,
+                                strength, sigma, decay_control, blk_fw,
+                                use_32x32, accumulator, count);
+    if (num_planes > 1) {
+      av1_temporal_filter_plane_c(
+          frame->u_buffer + mb_uv_src_offset, frame->uv_stride,
+          predictor + BLK_PELS, mb_uv_width, mb_uv_width, mb_uv_height,
+          strength, sigma, decay_control, blk_fw, use_32x32,
+          accumulator + BLK_PELS, count + BLK_PELS);
+      av1_temporal_filter_plane_c(
+          frame->v_buffer + mb_uv_src_offset, frame->uv_stride,
+          predictor + (BLK_PELS << 1), mb_uv_width, mb_uv_width, mb_uv_height,
+          strength, sigma, decay_control, blk_fw, use_32x32,
+          accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1));
+    }
+  } else {
+    // Apply original non-local means filtering for small resolution
+    if (num_planes <= 1) {
+      // Single plane case
+      av1_temporal_filter_apply_c(frame->y_buffer + mb_y_src_offset,
+                                  frame->y_stride, predictor, BW, BH, strength,
+                                  blk_fw, use_32x32, accumulator, count);
+    } else {
+      // Process 3 planes together.
+      av1_apply_temporal_filter(
+          frame->y_buffer + mb_y_src_offset, frame->y_stride, predictor, BW,
+          frame->u_buffer + mb_uv_src_offset,
+          frame->v_buffer + mb_uv_src_offset, frame->uv_stride,
+          predictor + BLK_PELS, predictor + (BLK_PELS << 1), mb_uv_width, BW,
+          BH, mbd->plane[1].subsampling_x, mbd->plane[1].subsampling_y,
+          strength, blk_fw, use_32x32, accumulator, count,
+          accumulator + BLK_PELS, count + BLK_PELS,
+          accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1));
+    }
+  }
+}
+#endif  // EXPERIMENT_TEMPORAL_FILTER
+
 static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
                                               uint8_t *arf_frame_buf,
                                               uint8_t *frame_ptr_buf,
@@ -779,7 +992,7 @@
 static void temporal_filter_iterate_c(AV1_COMP *cpi,
                                       YV12_BUFFER_CONFIG **frames,
                                       int frame_count, int alt_ref_index,
-                                      int strength,
+                                      int strength, double sigma,
                                       struct scale_factors *ref_scale_factors) {
   const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
@@ -944,8 +1157,13 @@
             }
           } else {
             if (is_hbd) {
+#if EXPERIMENT_TEMPORAL_FILTER
+              apply_temporal_filter_block(
+                  f, mbd, mb_y_src_offset, mb_uv_src_offset, mb_uv_width,
+                  mb_uv_height, num_planes, predictor, cm->height, strength,
+                  sigma, blk_fw, use_32x32, accumulator, count);
+#else
               const int adj_strength = strength + 2 * (mbd->bd - 8);
-
               if (num_planes <= 1) {
                 // Single plane case
                 av1_highbd_temporal_filter_apply_c(
@@ -964,7 +1182,14 @@
                     count + BLK_PELS, accumulator + (BLK_PELS << 1),
                     count + (BLK_PELS << 1));
               }
+#endif  // EXPERIMENT_TEMPORAL_FILTER
             } else {
+#if EXPERIMENT_TEMPORAL_FILTER
+              apply_temporal_filter_block(
+                  f, mbd, mb_y_src_offset, mb_uv_src_offset, mb_uv_width,
+                  mb_uv_height, num_planes, predictor, cm->height, strength,
+                  sigma, blk_fw, use_32x32, accumulator, count);
+#else
               if (num_planes <= 1) {
                 // Single plane case
                 av1_temporal_filter_apply_c(
@@ -983,6 +1208,7 @@
                     count + BLK_PELS, accumulator + (BLK_PELS << 1),
                     count + (BLK_PELS << 1));
               }
+#endif  // EXPERIMENT_TEMPORAL_FILTER
             }
           }
         }
@@ -1159,7 +1385,8 @@
 
 // Apply buffer limits and context specific adjustments to arnr filter.
 static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost,
-                               int *arnr_frames, int *arnr_strength) {
+                               int *arnr_frames, int *arnr_strength,
+                               double *sigma) {
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   const int frames_after_arf =
       av1_lookahead_depth(cpi->lookahead) - distance - 1;
@@ -1194,10 +1421,12 @@
     noiselevel = highbd_estimate_noise(
         buf->img.y_buffer, buf->img.y_crop_width, buf->img.y_crop_height,
         buf->img.y_stride, mbd->bd, EDGE_THRESHOLD);
+    *sigma = noiselevel;
   } else {
     noiselevel = estimate_noise(buf->img.y_buffer, buf->img.y_crop_width,
                                 buf->img.y_crop_height, buf->img.y_stride,
                                 EDGE_THRESHOLD);
+    *sigma = noiselevel;
   }
   int adj_strength = oxcf->arnr_strength;
   if (noiselevel > 0) {
@@ -1249,6 +1478,7 @@
   YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
   const GF_GROUP *const gf_group = &cpi->gf_group;
   int rdmult = 0;
+  double sigma = 0;
 
   // Apply context specific adjustments to the arnr filter parameters.
   if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
@@ -1258,8 +1488,8 @@
     strength = 0;
     frames_to_blur = 1;
   } else {
-    adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur,
-                       &strength);
+    adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength,
+                       &sigma);
   }
 
   int which_arf = gf_group->arf_update_idx[gf_group->index];
@@ -1299,5 +1529,5 @@
   av1_initialize_cost_tables(&cpi->common, &cpi->td.mb);
 
   temporal_filter_iterate_c(cpi, frames, frames_to_blur,
-                            frames_to_blur_backward, strength, &sf);
+                            frames_to_blur_backward, strength, sigma, &sf);
 }