Consider motion search error in plane-wise filter.

In the plane-wise temporal filtering strategy, the filter weight is
assigned based on the motion search result. Currently, only a small
neighborhood (i.e., a 5x5 window) is considered for each individual
pixel. However, the motion search error of the entire block also
reflects the search accuracy.

This CL improves the plane-wise strategy by considering both local (5x5
window) and global (the entire filtering block) information when
assigning the filter weight. The window-wise error plays a more
important role than the block-wise error (the block-wise error is
scaled down by a factor of 10 before being added to the window-wise
error), but both of them are involved in the filtering process.

NOTE: This CL only affects the performance on midres and hdres datasets.

Experimental results:

Under Speed-4 (two-pass mode):
          avg PSNR   ovr PSNR     SSIM
midres      -0.056     -0.015   -0.026
midres2     -0.029      0.000   -0.000
hdres       -0.085     -0.076   -0.049
hdres2      -0.050     -0.028   -0.017

Under Speed-1 (two-pass mode):
          avg PSNR   ovr PSNR     SSIM
midres      -0.129     -0.033   -0.036
midres2     -0.074     -0.044   -0.026
hdres       -0.106     -0.091   -0.098
hdres2      -0.070     -0.055   -0.057

STATS_CHANGED

Change-Id: Ie883a0105b1bdf4cd41b40b2e5be7f2f7178dd50
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 3dfc6b5..f78bde4 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -289,7 +289,7 @@
   }
 
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
-    add_proto qw/void av1_apply_temporal_filter_planewise/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const uint8_t *pred, uint32_t *accum, uint16_t *count";
+    add_proto qw/void av1_apply_temporal_filter_planewise/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const int use_subblock, const int block_mse, const int *subblock_mses, const uint8_t *pred, uint32_t *accum, uint16_t *count";
     specialize qw/av1_apply_temporal_filter_planewise sse2 avx2/;
   }
   add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index f827356..4592813 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -642,6 +642,9 @@
 //   num_planes: Number of planes in the frame.
 //   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
 //                 with each plane (in Y, U, V order).
+//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
+//   block_mse: Motion search error (MSE) for the entire block.
+//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
 //   pred: Pointer to the well-built predictors.
 //   accum: Pointer to the pixel-wise accumulator for filtering.
 //   count: Pointer to the pixel-wise counter fot filtering.
@@ -651,7 +654,8 @@
 void av1_apply_temporal_filter_planewise_c(
     const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_levels, const uint8_t *pred,
+    const int num_planes, const double *noise_levels, const int use_subblock,
+    const int block_mse, const int *subblock_mses, const uint8_t *pred,
     uint32_t *accum, uint16_t *count) {
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
 
@@ -733,18 +737,25 @@
           }
         }
 
+        // Scale down the difference for high bit depth input.
+        if (mbd->bd > 8) sum_square_diff >>= (mbd->bd - 8) * (mbd->bd - 8);
+        const double window_error = (double)(sum_square_diff) / num_ref_pixels;
+        const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2);
+        const double block_error =
+            (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+
         // Control factor for non-local mean approach.
         const double r =
             (double)decay_control * (0.7 + log(noise_levels[plane] + 1.0));
 
-        const int idx = plane_offset + pred_idx;  // Index with plane shift.
-        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
-        // Scale down the difference for high bit depth input.
-        if (mbd->bd > 8) sum_square_diff >>= (mbd->bd - 8) * (mbd->bd - 8);
-        const double scaled_diff = AOMMAX(
-            -(double)(sum_square_diff / num_ref_pixels) / (2 * r * r), -15.0);
+        // Compute filter weight.
+        const double scaled_diff =
+            AOMMAX(-(window_error + block_error / 10) / (2 * r * r), -15.0);
         const int adjusted_weight =
             (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
+
+        const int idx = plane_offset + pred_idx;  // Index with plane shift.
+        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
         accum[idx] += adjusted_weight * pred_value;
         count[idx] += adjusted_weight;
 
@@ -779,6 +790,8 @@
 //   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
 //                 with each plane (in Y, U, V order). (Used in plane-wise
 //                 strategy)
+//   block_mse: Motion search error (MSE) for the entire block.
+//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
 //   pred: Pointer to the well-built predictors.
 //   accum: Pointer to the pixel-wise accumulator for filtering.
 //   count: Pointer to the pixel-wise counter fot filtering.
@@ -790,20 +803,22 @@
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
     const int num_planes, const int use_planewise_strategy, const int strength,
     const int use_subblock, const int *subblock_filter_weights,
-    const double *noise_levels, const uint8_t *pred, uint32_t *accum,
-    uint16_t *count) {
+    const double *noise_levels, const int block_mse, const int *subblock_mses,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
 
   if (use_planewise_strategy) {  // Commonly used for high-resolution video.
     // TODO(any): avx2 and sse2 version should also support high bit-depth.
     if (is_frame_high_bitdepth(frame_to_filter)) {
-      av1_apply_temporal_filter_planewise_c(frame_to_filter, mbd, block_size,
-                                            mb_row, mb_col, num_planes,
-                                            noise_levels, pred, accum, count);
+      av1_apply_temporal_filter_planewise_c(
+          frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+          noise_levels, use_subblock, block_mse, subblock_mses, pred, accum,
+          count);
     } else {
       av1_apply_temporal_filter_planewise(frame_to_filter, mbd, block_size,
                                           mb_row, mb_col, num_planes,
-                                          noise_levels, pred, accum, count);
+                                          noise_levels, use_subblock, block_mse,
+                                          subblock_mses, pred, accum, count);
     }
   } else {  // Commonly used for low-resolution video.
     if (subblock_filter_weights[0] == 0 && subblock_filter_weights[1] == 0 &&
@@ -1014,7 +1029,8 @@
           av1_apply_temporal_filter_others(
               frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
               use_planewise_strategy, strength, use_subblock,
-              subblock_filter_weights, noise_levels, pred, accum, count);
+              subblock_filter_weights, noise_levels, block_mse, subblock_mses,
+              pred, accum, count);
         }
       }
 
diff --git a/av1/encoder/x86/temporal_filter_avx2.c b/av1/encoder/x86/temporal_filter_avx2.c
index 1d10179..07e14f7 100644
--- a/av1/encoder/x86/temporal_filter_avx2.c
+++ b/av1/encoder/x86/temporal_filter_avx2.c
@@ -50,7 +50,7 @@
     vsqdiff1 = _mm256_mullo_epi16(vdiff1, vdiff1);
 
     _mm256_storeu_si256((__m256i *)(dst), vsqdiff1);
-    // Set zero to unitialized memory to avoid uninitialized loads later
+    // Set zero to uninitialized memory to avoid uninitialized loads later
     *(uint32_t *)(dst + 16) = _mm_cvtsi128_si32(_mm_setzero_si128());
 
     src1 += stride, src2 += stride2;
@@ -84,7 +84,7 @@
     vres2 = _mm256_mullo_epi16(vdiff2, vdiff2);
     _mm256_storeu_si256((__m256i *)(dst), vres1);
     _mm256_storeu_si256((__m256i *)(dst + 16), vres2);
-    // Set zero to unitialized memory to avoid uninitialized loads later
+    // Set zero to uninitialized memory to avoid uninitialized loads later
     *(uint32_t *)(dst + 32) = _mm_cvtsi128_si32(_mm_setzero_si128());
 
     src1 += stride;
@@ -130,7 +130,8 @@
 static void apply_temporal_filter_planewise(
     const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
     const unsigned int stride2, const int block_width, const int block_height,
-    const double sigma, const int decay_control, unsigned int *accumulator,
+    const double sigma, const int decay_control, const int use_subblock,
+    const int block_mse, const int *subblock_mses, unsigned int *accumulator,
     uint16_t *count, uint16_t *luma_sq_error, uint16_t *chroma_sq_error,
     int plane, int ss_x_shift, int ss_y_shift) {
   assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5);
@@ -217,8 +218,15 @@
           }
         }
       }
+
+      const double window_error = (double)(diff_sse) / num_ref_pixels;
+      const int subblock_idx =
+          (i >= block_height / 2) * 2 + (j >= block_width / 2);
+      const double block_error =
+          (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+
       const double scaled_diff =
-          AOMMAX(-(double)(diff_sse / num_ref_pixels) / (2 * h * h), -15.0);
+          AOMMAX(-(window_error + block_error / 10) / (2 * h * h), -15.0);
       const int adjusted_weight =
           (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
 
@@ -231,7 +239,8 @@
 void av1_apply_temporal_filter_planewise_avx2(
     const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_levels, const uint8_t *pred,
+    const int num_planes, const double *noise_levels, const int use_subblock,
+    const int block_mse, const int *subblock_mses, const uint8_t *pred,
     uint32_t *accum, uint16_t *count) {
   const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
   if (is_high_bitdepth) {
@@ -265,9 +274,9 @@
 
     apply_temporal_filter_planewise(
         ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h,
-        noise_levels[plane], decay_control, accum + mb_pels * plane,
-        count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane,
-        ss_x_shift, ss_y_shift);
+        noise_levels[plane], decay_control, use_subblock, block_mse,
+        subblock_mses, accum + mb_pels * plane, count + mb_pels * plane,
+        luma_sq_error, chroma_sq_error, plane, ss_x_shift, ss_y_shift);
   }
   if (chroma_sq_error != NULL) aom_free(chroma_sq_error);
 }
diff --git a/av1/encoder/x86/temporal_filter_sse2.c b/av1/encoder/x86/temporal_filter_sse2.c
index 5d8e5e6..4fc8738 100644
--- a/av1/encoder/x86/temporal_filter_sse2.c
+++ b/av1/encoder/x86/temporal_filter_sse2.c
@@ -41,7 +41,7 @@
 
   for (int i = 0; i < block_height; i++) {
     for (int j = 0; j < block_width; j += 16) {
-      // Set zero to unitialized memory to avoid uninitialized loads later
+      // Set zero to uninitialized memory to avoid uninitialized loads later
       *(uint32_t *)(dst) = _mm_cvtsi128_si32(_mm_setzero_si128());
 
       __m128i vsrc1 = _mm_loadu_si128((__m128i *)(src1 + j));
@@ -62,7 +62,7 @@
       _mm_storeu_si128((__m128i *)(dst + j + 10), vres2);
     }
 
-    // Set zero to unitialized memory to avoid uninitialized loads later
+    // Set zero to uninitialized memory to avoid uninitialized loads later
     *(uint32_t *)(dst + block_width + 2) =
         _mm_cvtsi128_si32(_mm_setzero_si128());
 
@@ -105,7 +105,8 @@
 static void apply_temporal_filter_planewise(
     const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
     const unsigned int stride2, const int block_width, const int block_height,
-    const double sigma, const int decay_control, unsigned int *accumulator,
+    const double sigma, const int decay_control, const int use_subblock,
+    const int block_mse, const int *subblock_mses, unsigned int *accumulator,
     uint16_t *count, uint16_t *luma_sq_error, uint16_t *chroma_sq_error,
     int plane, int ss_x_shift, int ss_y_shift) {
   assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5);
@@ -196,8 +197,14 @@
         }
       }
 
+      const double window_error = (double)(diff_sse) / num_ref_pixels;
+      const int subblock_idx =
+          (i >= block_height / 2) * 2 + (j >= block_width / 2);
+      const double block_error =
+          (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+
       const double scaled_diff =
-          AOMMAX(-(double)(diff_sse / num_ref_pixels) / (2 * h * h), -15.0);
+          AOMMAX(-(window_error + block_error / 10) / (2 * h * h), -15.0);
       const int adjusted_weight =
           (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
 
@@ -210,7 +217,8 @@
 void av1_apply_temporal_filter_planewise_sse2(
     const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_levels, const uint8_t *pred,
+    const int num_planes, const double *noise_levels, const int use_subblock,
+    const int block_mse, const int *subblock_mses, const uint8_t *pred,
     uint32_t *accum, uint16_t *count) {
   const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
   if (is_high_bitdepth) {
@@ -244,9 +252,9 @@
 
     apply_temporal_filter_planewise(
         ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h,
-        noise_levels[plane], decay_control, accum + mb_pels * plane,
-        count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane,
-        ss_x_shift, ss_y_shift);
+        noise_levels[plane], decay_control, use_subblock, block_mse,
+        subblock_mses, accum + mb_pels * plane, count + mb_pels * plane,
+        luma_sq_error, chroma_sq_error, plane, ss_x_shift, ss_y_shift);
   }
   if (chroma_sq_error != NULL) aom_free(chroma_sq_error);
 }
diff --git a/test/temporal_filter_planewise_test.cc b/test/temporal_filter_planewise_test.cc
index 3c44600..b19ec29 100644
--- a/test/temporal_filter_planewise_test.cc
+++ b/test/temporal_filter_planewise_test.cc
@@ -40,7 +40,8 @@
 typedef void (*TemporalFilterPlanewiseFunc)(
     const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_level, const uint8_t *pred,
+    const int num_planes, const double *noise_level, const int use_subblock,
+    const int block_mse, const int *subblock_mses, const uint8_t *pred,
     uint32_t *accum, uint16_t *count);
 typedef libaom_test::FuncParam<TemporalFilterPlanewiseFunc>
     TemporalFilterPlanewiseFuncParam;
@@ -124,6 +125,9 @@
 
     assert(width == 32 && height == 32);
     const BLOCK_SIZE block_size = BLOCK_32X32;
+    const int use_subblock = 0;
+    const int block_mse = 0;
+    const int subblock_mses[4] = { 0, 0, 0, 0 };
     const int mb_row = 0;
     const int mb_col = 0;
     const int num_planes = 1;
@@ -143,15 +147,18 @@
     mbd->bd = 8;
 
     params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                     sigma, src2_, accumulator_ref, count_ref);
+                     sigma, use_subblock, block_mse, subblock_mses, src2_,
+                     accumulator_ref, count_ref);
     params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                     sigma, src2_, accumulator_mod, count_mod);
+                     sigma, use_subblock, block_mse, subblock_mses, src2_,
+                     accumulator_mod, count_mod);
 
     if (run_times > 1) {
       aom_usec_timer_start(&ref_timer);
       for (int j = 0; j < run_times; j++) {
         params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                         sigma, src2_, accumulator_ref, count_ref);
+                         sigma, use_subblock, block_mse, subblock_mses, src2_,
+                         accumulator_ref, count_ref);
       }
       aom_usec_timer_mark(&ref_timer);
       const int elapsed_time_c =
@@ -160,7 +167,8 @@
       aom_usec_timer_start(&test_timer);
       for (int j = 0; j < run_times; j++) {
         params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                         sigma, src2_, accumulator_mod, count_mod);
+                         sigma, use_subblock, block_mse, subblock_mses, src2_,
+                         accumulator_mod, count_mod);
       }
       aom_usec_timer_mark(&test_timer);
       const int elapsed_time_simd =