Estimate noises for different planes separately.

The plane-wise temporal filtering strategy is quite sensitive to the
estimated noise level. Currently, the noise level is estimated only on
the Y-plane, yet it is also applied to the U-plane and V-plane.

This CL estimates the noise level of each plane separately and uses
each plane's own noise level to filter that plane. This significantly
improves PSNR on the U-plane and V-plane.

NOTE: The plane-wise strategy is used specifically for midres and hdres.

Experimental results:

Under Speed-4 (two-pass mode):
        avg PSNR   ovr PSNR     SSIM   PSNR_Y   PSNR_U   PSNR_V
midres    -0.047     -0.041   -0.073    0.003   -0.506   -0.529
hdres     -0.075     -0.082   -0.089   -0.025   -0.401   -0.621

Under Speed-1 (two-pass mode):
        avg PSNR   ovr PSNR     SSIM   PSNR_Y   PSNR_U   PSNR_V
midres    -0.047     -0.040   -0.032    0.003   -0.409   -0.479
hdres     -0.089     -0.094   -0.070   -0.036   -0.494   -0.470

STATS_CHANGED

Change-Id: If9a520c8868fc99a63607c34411c150cd94eb77b
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 735c479..c31b0c0 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -289,7 +289,7 @@
   }
 
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
-    add_proto qw/void av1_apply_temporal_filter_planewise/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double noise_level, const uint8_t *pred, uint32_t *accum, uint16_t *count";
+    add_proto qw/void av1_apply_temporal_filter_planewise/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const uint8_t *pred, uint32_t *accum, uint16_t *count";
     specialize qw/av1_apply_temporal_filter_planewise sse2 avx2/;
   }
   add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 7f1a23d..970c6a5 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -798,7 +798,8 @@
 //   mb_row: Row index of the block in the entire frame.
 //   mb_col: Column index of the block in the entire frame.
 //   num_planes: Number of planes in the frame.
-//   noise_level: Noise level of the to-filter frame, estimated with Y-plane.
+//   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
+//                 from each plane (in Y, U, V order).
 //   pred: Pointer to the well-built predictors.
 //   accum: Pointer to the pixel-wise accumulator for filtering.
 //   count: Pointer to the pixel-wise counter fot filtering.
@@ -808,17 +809,10 @@
 void av1_apply_temporal_filter_planewise_c(
     const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double noise_level, const uint8_t *pred,
+    const int num_planes, const double *noise_levels, const uint8_t *pred,
     uint32_t *accum, uint16_t *count) {
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
 
-  // Hyper-parameter for filter weight adjustment.
-  const int frame_height = frame_to_filter->heights[0]
-                           << mbd->plane[0].subsampling_y;
-  const int decay_control = frame_height >= 480 ? 4 : 3;
-  // Control factor for non-local mean approach.
-  const double r = (double)decay_control * (0.7 + log(noise_level + 1.0));
-
   // Block information.
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
@@ -850,6 +844,11 @@
   assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH % 2 == 1);
   const int half_window = TF_PLANEWISE_FILTER_WINDOW_LENGTH >> 1;
 
+  // Hyper-parameter for filter weight adjustment.
+  const int frame_height = frame_to_filter->heights[0]
+                           << mbd->plane[0].subsampling_y;
+  const int decay_control = frame_height >= 480 ? 4 : 3;
+
   // Handle planes in sequence.
   plane_offset = 0;
   for (int plane = 0; plane < num_planes; ++plane) {
@@ -875,6 +874,10 @@
           }
         }
 
+        // Control factor for non-local mean approach.
+        const double r =
+            (double)decay_control * (0.7 + log(noise_levels[plane] + 1.0));
+
         const int idx = plane_offset + pred_idx;  // Index with plane shift.
         const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
         // Scale down the difference for high bit depth input.
@@ -910,15 +913,16 @@
 //                           strategy. If set as 0, YUV or YONLY filtering will
 //                           be used (depending on number of planes).
 //   strength: Strength for filter weight adjustment. (Used in YUV filtering and
-//             YONLY filtering.)
+//             YONLY filtering)
 //   use_subblock: Whether to use 4 sub-blocks to replace the original block.
-//                 (Used in YUV filtering and YONLY filtering.)
+//                 (Used in YUV filtering and YONLY filtering)
 //   subblock_filter_weights: The filter weights for each sub-block (row-major
 //                            order). If `use_subblock` is set as 0, the first
 //                            weight will be applied to the entire block. (Used
-//                            in YUV filtering and YONLY filtering.)
-//   noise_level: Noise level of the to-filter frame, estimated with Y-plane.
-//                (Used in plane-wise filtering.)
+//                            in YUV filtering and YONLY filtering)
+//   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
+//                 from each plane (in Y, U, V order). (Used in plane-wise
+//                 filtering)
 //   pred: Pointer to the well-built predictors.
 //   accum: Pointer to the pixel-wise accumulator for filtering.
 //   count: Pointer to the pixel-wise counter fot filtering.
@@ -930,7 +934,7 @@
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
     const int num_planes, const int use_planewise_strategy, const int strength,
     const int use_subblock, const int *subblock_filter_weights,
-    const double noise_level, const uint8_t *pred, uint32_t *accum,
+    const double *noise_levels, const uint8_t *pred, uint32_t *accum,
     uint16_t *count) {
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
 
@@ -939,11 +943,11 @@
     if (is_frame_high_bitdepth(frame_to_filter)) {
       av1_apply_temporal_filter_planewise_c(frame_to_filter, mbd, block_size,
                                             mb_row, mb_col, num_planes,
-                                            noise_level, pred, accum, count);
+                                            noise_levels, pred, accum, count);
     } else {
       av1_apply_temporal_filter_planewise(frame_to_filter, mbd, block_size,
                                           mb_row, mb_col, num_planes,
-                                          noise_level, pred, accum, count);
+                                          noise_levels, pred, accum, count);
     }
   } else {  // Commonly used for low-resolution video.
     const int adj_strength = strength + 2 * (mbd->bd - 8);
@@ -1062,14 +1066,15 @@
 //   block_size: Block size used for temporal filtering.
 //   scale: Scaling factor.
 //   strength: Pre-estimated strength for filter weight adjustment.
-//   noise_level: Noise level of the to-filter frame, estimated with Y-plane.
+//   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
+//                 from each plane (in Y, U, V order).
 // Returns:
 //   Difference between filtered frame and the original frame.
 static FRAME_DIFF tf_do_filtering(
     AV1_COMP *cpi, YV12_BUFFER_CONFIG **frames, const int num_frames,
     const int filter_frame_idx, const int is_key_frame, const int is_second_arf,
     const BLOCK_SIZE block_size, const struct scale_factors *scale,
-    const int strength, const double noise_level) {
+    const int strength, const double *noise_levels) {
   // Basic information.
   const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
   const int frame_height = frame_to_filter->y_crop_height;
@@ -1163,7 +1168,7 @@
             av1_apply_temporal_filter_others(  // Other reference frames.
                 frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
                 use_planewise_strategy, strength, use_subblock,
-                subblock_filter_weights, noise_level, pred, accum, count);
+                subblock_filter_weights, noise_levels, pred, accum, count);
           }
         }
       }
@@ -1390,10 +1395,14 @@
 
   // Estimate noise and strength.
   const int bit_depth = cpi->common.seq_params.bit_depth;
-  const double y_noise_level = av1_estimate_noise_from_single_plane(
-      frames[filter_frame_idx], 0, bit_depth);
+  const int num_planes = av1_num_planes(&cpi->common);
+  double noise_levels[MAX_MB_PLANE] = { 0 };
+  for (int plane = 0; plane < num_planes; ++plane) {
+    noise_levels[plane] = av1_estimate_noise_from_single_plane(
+        frames[filter_frame_idx], plane, bit_depth);
+  }
   const int strength =
-      tf_estimate_strength(cpi, y_noise_level, cpi->rc.gfu_boost);
+      tf_estimate_strength(cpi, noise_levels[0], cpi->rc.gfu_boost);
   if (filter_frame_lookahead_idx >= 0) {
     cpi->common.showable_frame =
         (strength == 0 && num_frames_for_filtering == 1) || is_second_arf ||
@@ -1414,7 +1423,7 @@
         frames[0]->y_crop_width, frames[0]->y_crop_height);
     diff = tf_do_filtering(cpi, frames, num_frames_for_filtering,
                            filter_frame_idx, is_key_frame, is_second_arf,
-                           block_size, &sf, strength, y_noise_level);
+                           block_size, &sf, strength, noise_levels);
   }
 
   if (is_key_frame) {  // Key frame should always be filtered.
diff --git a/av1/encoder/x86/temporal_filter_avx2.c b/av1/encoder/x86/temporal_filter_avx2.c
index fe09518..93d9186 100644
--- a/av1/encoder/x86/temporal_filter_avx2.c
+++ b/av1/encoder/x86/temporal_filter_avx2.c
@@ -218,12 +218,13 @@
 void av1_apply_temporal_filter_planewise_avx2(
     const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double noise_level, const uint8_t *pred,
+    const int num_planes, const double *noise_levels, const uint8_t *pred,
     uint32_t *accum, uint16_t *count) {
   const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
   if (is_high_bitdepth) {
     assert(0 && "Only support low bit-depth with avx2!");
   }
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
 
   const int frame_height = ref_frame->heights[0] << mbd->plane[0].subsampling_y;
   const int decay_control = frame_height >= 480 ? 4 : 3;
@@ -238,9 +239,9 @@
     const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
 
     const uint8_t *ref = ref_frame->buffers[plane] + frame_offset;
-    apply_temporal_filter_planewise(ref, frame_stride, pred + mb_pels * plane,
-                                    plane_w, plane_w, plane_h, noise_level,
-                                    decay_control, accum + mb_pels * plane,
-                                    count + mb_pels * plane);
+    apply_temporal_filter_planewise(
+        ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h,
+        noise_levels[plane], decay_control, accum + mb_pels * plane,
+        count + mb_pels * plane);
   }
 }
diff --git a/av1/encoder/x86/temporal_filter_sse2.c b/av1/encoder/x86/temporal_filter_sse2.c
index c162c73..0cfa841 100644
--- a/av1/encoder/x86/temporal_filter_sse2.c
+++ b/av1/encoder/x86/temporal_filter_sse2.c
@@ -191,12 +191,13 @@
 void av1_apply_temporal_filter_planewise_sse2(
     const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double noise_level, const uint8_t *pred,
+    const int num_planes, const double *noise_levels, const uint8_t *pred,
     uint32_t *accum, uint16_t *count) {
   const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
   if (is_high_bitdepth) {
     assert(0 && "Only support low bit-depth with sse2!");
   }
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
 
   const int frame_height = ref_frame->heights[0] << mbd->plane[0].subsampling_y;
   const int decay_control = frame_height >= 480 ? 4 : 3;
@@ -211,9 +212,9 @@
     const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
 
     const uint8_t *ref = ref_frame->buffers[plane] + frame_offset;
-    apply_temporal_filter_planewise(ref, frame_stride, pred + mb_pels * plane,
-                                    plane_w, plane_w, plane_h, noise_level,
-                                    decay_control, accum + mb_pels * plane,
-                                    count + mb_pels * plane);
+    apply_temporal_filter_planewise(
+        ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h,
+        noise_levels[plane], decay_control, accum + mb_pels * plane,
+        count + mb_pels * plane);
   }
 }
diff --git a/test/temporal_filter_planewise_test.cc b/test/temporal_filter_planewise_test.cc
index f142beb..19a22ac 100644
--- a/test/temporal_filter_planewise_test.cc
+++ b/test/temporal_filter_planewise_test.cc
@@ -39,7 +39,7 @@
 typedef void (*TemporalFilterPlanewiseFunc)(
     const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double noise_level, const uint8_t *pred,
+    const int num_planes, const double *noise_level, const uint8_t *pred,
     uint32_t *accum, uint16_t *count);
 typedef libaom_test::FuncParam<TemporalFilterPlanewiseFunc>
     TemporalFilterPlanewiseFuncParam;
@@ -111,7 +111,7 @@
         GenExtremeData(width, height, stride, src1_, stride2, src2_, 0);
       }
     }
-    double sigma = 2.1002103677063437;
+    double sigma[1] = { 2.1002103677063437 };
     DECLARE_ALIGNED(16, unsigned int, accumulator_ref[1024 * 3]);
     DECLARE_ALIGNED(16, uint16_t, count_ref[1024 * 3]);
     memset(accumulator_ref, 0, 1024 * 3 * sizeof(accumulator_ref[0]));