Apply temporal filtering on key frames.

Apply temporal filtering on key frames, except when
(1) the key frame interval is smaller than 7, or
(2) the key frame is noise free,
as sketched in the gating condition below.
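
A minimal standalone sketch of that gating condition, with simplified inputs;
the function name and flattened parameters here are illustrative only, the
actual check lives in denoise_and_encode() in encode_strategy.c (see the diff):

#include <stdbool.h>

#define NUM_KEY_FRAME_DENOISING 7

// Returns true when temporal filtering should be applied to a key frame.
static bool should_filter_key_frame(int pass, bool is_key_frame,
                                    int frames_to_key, double noise_level,
                                    bool lossless, int arnr_max_frames) {
  return pass == 2 && is_key_frame &&
         frames_to_key > NUM_KEY_FRAME_DENOISING &&  // key frame interval > 7
         noise_level > 0 &&                          // skip noise-free frames
         !lossless && arnr_max_frames > 0;
}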

Use 7 frames for filtering (6, excluding the current frame itself).
The change is guarded by a macro in encode_strategy.c.
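
For illustration, a minimal sketch of how the filter window is split; the
helper name below is hypothetical, and the real logic sits in
av1_temporal_filter() (see the diff). A key frame has no past frames in the
lookahead, so the whole window extends forward.

// Split the filter window. For a key frame (distance == -1) the window is
// the key frame itself plus frames_to_blur - 1 future frames; alt-ref
// frames keep the centered window as before.
static void split_filter_window(int distance, int frames_to_blur,
                                int *frames_backward, int *frames_forward) {
  if (distance == -1) {
    *frames_backward = 0;
    *frames_forward = frames_to_blur - 1;
  } else {
    *frames_backward = frames_to_blur / 2;
    *frames_forward = (frames_to_blur - 1) / 2;
  }
}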

Use the original non-local means method instead of the new version
applied to alt-ref frames
(https://aomedia-review.googlesource.com/c/aom/+/88481).
(This gives an additional 0.42% gain on hdres.)
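
A minimal sketch of that mode selection, with simplified inputs (the actual
check is in temporal_filter_iterate_c(), see the diff); the new temporal mode
is disabled for key frames, so they take the original non-local means path.

// Returns 1 when the new temporal filtering mode may be used; key frames
// always fall back to the original non-local means method.
static int use_new_temporal_mode(int width, int height,
                                 int is_screen_content, int is_key_frame) {
  const int min_dim = width < height ? width : height;
  return min_dim >= 480 && !is_screen_content && !is_key_frame;
}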

Performance:
(1). AWCY, spd0.
   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-2.8877 | -2.7754 | -2.7226 |  -2.6934 | -2.2295 | -2.5333 |    -1.9622

(2). Google test sets, VBR, 150 frames, spd1.
          avg_psnr    ovr_psnr     ssim     vmaf
lowres      -2.87       -3.14     -3.88    -3.77
midres      -0.96       -0.99     -1.13    -0.86
hdres       -1.48       -1.64     -1.88    -2.08

(3). Google test sets, VBR, 150 frames, spd4.
          avg_psnr    ovr_psnr     ssim     vmaf
lowres      -2.80       -3.04     -3.74    -3.56
midres      -0.87       -0.91     -1.01    -0.57
hdres       -1.26       -1.46     -1.67    -1.71

(4). Google ugc360p, VBR, 150 frames, spd0.
          avg_psnr    ovr_psnr     ssim     vmaf
ugc360p     -1.18       -1.00     -1.45    -1.33

STATS_CHANGED

Change-Id: I18cd5b8a067b9e0d0f2e5bcff01daa66bb331ae7
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index af496d5..b27601a 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -3726,6 +3726,8 @@
 
 int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size,
                        int *const largest_tile_id) {
+  if (!cpi->pack_bitstream) return AOM_CODEC_OK;
+
   uint8_t *data = dst;
   uint32_t data_size;
   AV1_COMMON *const cm = &cpi->common;
diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c
index d6dc11d..caf8b8b 100644
--- a/av1/encoder/encode_strategy.c
+++ b/av1/encoder/encode_strategy.c
@@ -32,6 +32,8 @@
 #include "av1/encoder/temporal_filter.h"
 #include "av1/encoder/tpl_model.h"
 
+#define TEMPORAL_FILTER_KEY_FRAME (CONFIG_REALTIME_ONLY ? 0 : 1)
+
 void av1_configure_buffer_updates(AV1_COMP *const cpi,
                                   EncodeFrameParams *const frame_params,
                                   const FRAME_UPDATE_TYPE type,
@@ -986,6 +988,88 @@
   return refresh_mask;
 }
 
+#if !CONFIG_REALTIME_ONLY
+// Apply temporal filtering to key frames and encode the filtered frame.
+// If the current frame is not a key frame, this function is identical to
+// av1_encode().
+static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest,
+                              EncodeFrameInput *const frame_input,
+                              EncodeFrameParams *const frame_params,
+                              EncodeFrameResults *const frame_results,
+                              int *temporal_filtered) {
+  if (frame_params->frame_type != KEY_FRAME) {
+    cpi->pack_bitstream = 1;
+    if (av1_encode(cpi, dest, frame_input, frame_params, frame_results) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+    return AOM_CODEC_OK;
+  }
+
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  AV1_COMMON *const cm = &cpi->common;
+  double noise_level;
+  const int use_hbd = frame_input->source->flags & YV12_FLAG_HIGHBITDEPTH;
+  if (use_hbd) {
+    noise_level = highbd_estimate_noise(
+        frame_input->source->y_buffer, frame_input->source->y_crop_width,
+        frame_input->source->y_crop_height, frame_input->source->y_stride,
+        cm->seq_params.bit_depth, EDGE_THRESHOLD);
+  } else {
+    noise_level = estimate_noise(frame_input->source->y_buffer,
+                                 frame_input->source->y_crop_width,
+                                 frame_input->source->y_crop_height,
+                                 frame_input->source->y_stride, EDGE_THRESHOLD);
+  }
+  const int apply_filtering =
+      oxcf->pass == 2 && frame_params->frame_type == KEY_FRAME &&
+      cpi->rc.frames_to_key > NUM_KEY_FRAME_DENOISING && noise_level > 0 &&
+      !is_lossless_requested(oxcf) && oxcf->arnr_max_frames > 0;
+
+  // Apply filtering to key frame and encode.
+  if (apply_filtering) {
+    const int num_planes = av1_num_planes(cm);
+    // Keep a copy of the source image.
+    aom_yv12_copy_frame(frame_input->source, &cpi->source_kf_buffer,
+                        num_planes);
+    // TODO(chengchen): Encode the key frame; this is a workaround to get
+    // internal data structures (e.g., mi, x, xd) properly initialized.
+    // Do not pack the bitstream in this case.
+    cpi->pack_bitstream = 0;
+    if (av1_encode(cpi, dest, frame_input, frame_params, frame_results) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+    // Produce the filtered key frame.
+    av1_temporal_filter(cpi, -1);
+    aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes);
+    *temporal_filtered = 1;
+    // Set the frame_input source to the temporally filtered key frame.
+    frame_input->source = &cpi->alt_ref_buffer;
+    // Encode the filtered key frame. Pack bitstream.
+    cpi->pack_bitstream = 1;
+    if (av1_encode(cpi, dest, frame_input, frame_params, frame_results) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+    // Copy the original source back for PSNR calculation.
+    if (oxcf->arnr_max_frames > 0 && *temporal_filtered) {
+      aom_yv12_copy_frame(&cpi->source_kf_buffer, cpi->source, num_planes);
+      aom_yv12_copy_frame(&cpi->source_kf_buffer, cpi->unscaled_source,
+                          num_planes);
+    }
+  } else {
+    // Encode the frame without temporal filtering.
+    cpi->pack_bitstream = 1;
+    if (av1_encode(cpi, dest, frame_input, frame_params, frame_results) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+  }
+  return AOM_CODEC_OK;
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
 int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
                         uint8_t *const dest, unsigned int *frame_flags,
                         int64_t *const time_stamp, int64_t *const time_end,
@@ -1186,10 +1270,18 @@
     }
   }
 
+#if TEMPORAL_FILTER_KEY_FRAME
+  if (denoise_and_encode(cpi, dest, &frame_input, &frame_params, &frame_results,
+                         &temporal_filtered) != AOM_CODEC_OK) {
+    return AOM_CODEC_ERROR;
+  }
+#else   // !TEMPORAL_FILTER_KEY_FRAME
+  cpi->pack_bitstream = 1;
   if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) !=
       AOM_CODEC_OK) {
     return AOM_CODEC_ERROR;
   }
+#endif  // TEMPORAL_FILTER_KEY_FRAME
   if (oxcf->pass != 1) cpi->num_gf_group_show_frames += frame_params.show_frame;
 
   if (oxcf->pass == 0 || oxcf->pass == 2) {
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index f1e17d4..1547bad 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -573,6 +573,7 @@
   aom_free_frame_buffer(&cpi->scaled_source);
   aom_free_frame_buffer(&cpi->scaled_last_source);
   aom_free_frame_buffer(&cpi->alt_ref_buffer);
+  aom_free_frame_buffer(&cpi->source_kf_buffer);
   av1_lookahead_destroy(cpi->lookahead);
 
   aom_free(cpi->tile_tok[0][0]);
@@ -810,6 +811,16 @@
           cm->byte_alignment, NULL, NULL, NULL))
     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate altref buffer");
+
+  // Allocate a frame buffer to hold the source frame when key frame
+  // filtering is applied.
+  if (aom_realloc_frame_buffer(
+          &cpi->source_kf_buffer, oxcf->width, oxcf->height,
+          seq_params->subsampling_x, seq_params->subsampling_y,
+          seq_params->use_highbitdepth, oxcf->border_in_pixels,
+          cm->byte_alignment, NULL, NULL, NULL))
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate source kf buffer");
 }
 
 static void alloc_util_frame_buffers(AV1_COMP *cpi) {
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 7336e87..0f96e89 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -868,6 +868,8 @@
 
   YV12_BUFFER_CONFIG alt_ref_buffer;
 
+  YV12_BUFFER_CONFIG source_kf_buffer;
+
 #if CONFIG_INTERNAL_STATS
   unsigned int mode_chosen_counts[MAX_MODES];
 
@@ -1022,6 +1024,11 @@
   int deltaq_used;
 
   double *ssim_rdmult_scaling_factors;
+
+  // Whether to write the bitstream. This allows a frame to be encoded
+  // multiple times without packing the bitstream, which provides flexibility
+  // for experiments such as temporal filtering on key frames.
+  int pack_bitstream;
 } AV1_COMP;
 
 typedef struct {
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 6c353b1..67402d8 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -39,9 +39,6 @@
 #define WINDOW_SIZE 25
 #define SCALE 1000
 
-#define EDGE_THRESHOLD 50
-#define SQRT_PI_BY_2 1.25331413732
-
 static unsigned int index_mult[14] = { 0,     0,     0,     0,     49152,
                                        39322, 32768, 28087, 24576, 21846,
                                        19661, 17874, 0,     15124 };
@@ -985,6 +982,7 @@
                                       YV12_BUFFER_CONFIG **frames,
                                       int frame_count, int alt_ref_index,
                                       int strength, double sigma,
+                                      int is_key_frame,
                                       struct scale_factors *ref_scale_factors) {
   const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
@@ -1009,8 +1007,8 @@
   const int mb_uv_width = BW >> mbd->plane[1].subsampling_x;
 #if EXPERIMENT_TEMPORAL_FILTER
   const int is_screen_content_type = cm->allow_screen_content_tools != 0;
-  const int use_new_temporal_mode =
-      AOMMIN(cm->width, cm->height) >= 480 && !is_screen_content_type;
+  const int use_new_temporal_mode = AOMMIN(cm->width, cm->height) >= 480 &&
+                                    !is_screen_content_type && !is_key_frame;
 #else
   (void)sigma;
   const int use_new_temporal_mode = 0;
@@ -1316,8 +1314,8 @@
 // Signal Processing, 2008, St Julians, Malta.
 //
 // Return noise estimate, or -1.0 if there was a failure
-static double estimate_noise(const uint8_t *src, int width, int height,
-                             int stride, int edge_thresh) {
+double estimate_noise(const uint8_t *src, int width, int height, int stride,
+                      int edge_thresh) {
   int64_t sum = 0;
   int64_t num = 0;
   for (int i = 1; i < height - 1; ++i) {
@@ -1351,8 +1349,8 @@
 }
 
 // Return noise estimate, or -1.0 if there was a failure
-static double highbd_estimate_noise(const uint8_t *src8, int width, int height,
-                                    int stride, int bd, int edge_thresh) {
+double highbd_estimate_noise(const uint8_t *src8, int width, int height,
+                             int stride, int bd, int edge_thresh) {
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   int64_t sum = 0;
   int64_t num = 0;
@@ -1386,31 +1384,10 @@
   return sigma;
 }
 
-// Apply buffer limits and context specific adjustments to arnr filter.
-static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost,
-                               int *arnr_frames, int *arnr_strength,
-                               double *sigma) {
-  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const int frames_after_arf =
-      av1_lookahead_depth(cpi->lookahead) - distance - 1;
-  int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1;
-  int frames_bwd;
-  int q, frames, strength;
-
-  // Define the forward and backwards filter limits for this arnr group.
-  if (frames_fwd > frames_after_arf) frames_fwd = frames_after_arf;
-  if (frames_fwd > distance) frames_fwd = distance;
-
-  frames_bwd = frames_fwd;
-
-  // For even length filter there is one more frame backward
-  // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
-  if (frames_bwd < distance) frames_bwd += (oxcf->arnr_max_frames + 1) & 0x1;
-
-  // Set the baseline active filter size.
-  frames = frames_bwd + 1 + frames_fwd;
-
+static int estimate_strength(AV1_COMP *cpi, int distance, int group_boost,
+                             double *sigma) {
   // Adjust the strength based on active max q.
+  int q;
   if (cpi->common.current_frame.frame_number > 1)
     q = ((int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME],
                                       cpi->common.seq_params.bit_depth));
@@ -1419,6 +1396,7 @@
                                       cpi->common.seq_params.bit_depth));
   MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
   struct lookahead_entry *buf = av1_lookahead_peek(cpi->lookahead, distance);
+  int strength;
   double noiselevel;
   if (is_cur_buf_hbd(mbd)) {
     noiselevel = highbd_estimate_noise(
@@ -1431,7 +1409,7 @@
                                 EDGE_THRESHOLD);
     *sigma = noiselevel;
   }
-  int adj_strength = oxcf->arnr_strength;
+  int adj_strength = cpi->oxcf.arnr_strength;
   if (noiselevel > 0) {
     // Get 4 integer adjustment levels in [-2, 1]
     int noiselevel_adj;
@@ -1454,18 +1432,45 @@
     if (strength < 0) strength = 0;
   }
 
+  if (strength > group_boost / 300) {
+    strength = group_boost / 300;
+  }
+
+  return strength;
+}
+
+// Apply buffer limits and context specific adjustments to arnr filter.
+static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost,
+                               int *arnr_frames, int *arnr_strength,
+                               double *sigma) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const int frames_after_arf =
+      av1_lookahead_depth(cpi->lookahead) - distance - 1;
+  int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1;
+  int frames_bwd;
+  int frames;
+
+  // Define the forward and backwards filter limits for this arnr group.
+  if (frames_fwd > frames_after_arf) frames_fwd = frames_after_arf;
+  if (frames_fwd > distance) frames_fwd = distance;
+
+  frames_bwd = frames_fwd;
+
+  // For even length filter there is one more frame backward
+  // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+  if (frames_bwd < distance) frames_bwd += (oxcf->arnr_max_frames + 1) & 0x1;
+
+  // Set the baseline active filter size.
+  frames = frames_bwd + 1 + frames_fwd;
+
   // Adjust number of frames in filter and strength based on gf boost level.
   if (frames > group_boost / 150) {
     frames = group_boost / 150;
     frames += !(frames & 1);
   }
 
-  if (strength > group_boost / 300) {
-    strength = group_boost / 300;
-  }
-
   *arnr_frames = frames;
-  *arnr_strength = strength;
+  *arnr_strength = estimate_strength(cpi, distance, group_boost, sigma);
 }
 
 void av1_temporal_filter(AV1_COMP *cpi, int distance) {
@@ -1490,6 +1495,11 @@
     // beneficial to use non-zero strength filtering.
     strength = 0;
     frames_to_blur = 1;
+  } else if (distance == -1) {
+    // Apply temporal filtering on key frame.
+    strength = estimate_strength(cpi, distance, rc->gfu_boost, &sigma);
+    // The number of frames used for temporal filtering; could be tuned.
+    frames_to_blur = NUM_KEY_FRAME_DENOISING;
   } else {
     adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength,
                        &sigma);
@@ -1504,19 +1514,29 @@
     cpi->is_arf_filter_off[which_arf] = 0;
   cpi->common.showable_frame = cpi->is_arf_filter_off[which_arf];
 
-  frames_to_blur_backward = (frames_to_blur / 2);
-  frames_to_blur_forward = ((frames_to_blur - 1) / 2);
-  start_frame = distance + frames_to_blur_forward;
+  if (distance == -1) {
+    frames_to_blur_backward = 0;
+    frames_to_blur_forward = frames_to_blur - 1;
+    start_frame = distance + frames_to_blur_forward;
+  } else {
+    frames_to_blur_backward = (frames_to_blur / 2);
+    frames_to_blur_forward = ((frames_to_blur - 1) / 2);
+    start_frame = distance + frames_to_blur_forward;
+  }
 
   // Setup frame pointers, NULL indicates frame not included in filter.
   for (frame = 0; frame < frames_to_blur; ++frame) {
     const int which_buffer = start_frame - frame;
     struct lookahead_entry *buf =
         av1_lookahead_peek(cpi->lookahead, which_buffer);
-    frames[frames_to_blur - 1 - frame] = &buf->img;
+    if (buf == NULL) {
+      frames[frames_to_blur - 1 - frame] = NULL;
+    } else {
+      frames[frames_to_blur - 1 - frame] = &buf->img;
+    }
   }
 
-  if (frames_to_blur > 0) {
+  if (frames_to_blur > 0 && frames[0] != NULL) {
     // Setup scaling factors. Scaling on each of the arnr frames is not
     // supported.
     // ARF is produced at the native frame size and resized when coded.
@@ -1532,5 +1552,6 @@
   av1_initialize_cost_tables(&cpi->common, &cpi->td.mb);
 
   temporal_filter_iterate_c(cpi, frames, frames_to_blur,
-                            frames_to_blur_backward, strength, sigma, &sf);
+                            frames_to_blur_backward, strength, sigma,
+                            distance == -1, &sf);
 }
diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h
index bb26c36..c0afe6f 100644
--- a/av1/encoder/temporal_filter.h
+++ b/av1/encoder/temporal_filter.h
@@ -30,7 +30,15 @@
 #define SUB_BH 16
 #define SUB_BW 16
 
+#define NUM_KEY_FRAME_DENOISING 7
+#define EDGE_THRESHOLD 50
+#define SQRT_PI_BY_2 1.25331413732
+
 void av1_temporal_filter(AV1_COMP *cpi, int distance);
+double estimate_noise(const uint8_t *src, int width, int height, int stride,
+                      int edge_thresh);
+double highbd_estimate_noise(const uint8_t *src8, int width, int height,
+                             int stride, int bd, int edge_thresh);
 
 #ifdef __cplusplus
 }  // extern "C"