Improve RTC temporal filtering

This change focuses on improving performance for resolutions below 360p.
More work will be done and more CLs will follow.

Borg results on the rtc_derf set:
          avg_psnr:  ovr_psnr:   ssim:
speed 9:   -0.405     -0.240    -0.547
speed 10:  -0.344     -0.204    -0.494
Encoder speed drop: <=0.5%

STATS_CHANGED for rtc_derf speed 9 & 10

Change-Id: I28ecc68b70a08d27952eb5a1115e3ed48ab5cb64
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index f8e0d33..857e069 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -803,8 +803,9 @@
  */
 // TODO(any): consolidate sfs to make interface cleaner
 static AOM_INLINE void grade_source_content_sb(AV1_COMP *cpi,
-                                               MACROBLOCK *const x, int mi_row,
-                                               int mi_col) {
+                                               MACROBLOCK *const x,
+                                               TileDataEnc *tile_data,
+                                               int mi_row, int mi_col) {
   AV1_COMMON *const cm = &cpi->common;
   bool calc_src_content = false;
 
@@ -823,7 +824,8 @@
     else
       x->content_state_sb.source_sad_rd = kZeroSad;
   }
-  if (calc_src_content) av1_source_content_sb(cpi, x, mi_row, mi_col);
+  if (calc_src_content)
+    av1_source_content_sb(cpi, x, tile_data, mi_row, mi_col);
 }
 
 /*!\brief Encode a superblock row by breaking it into superblocks
@@ -939,7 +941,7 @@
 
     // Grade the temporal variation of the sb, the grade will be used to decide
     // fast mode search strategy for coding blocks
-    grade_source_content_sb(cpi, x, mi_row, mi_col);
+    grade_source_content_sb(cpi, x, tile_data, mi_row, mi_col);
 
     // encode the superblock
     if (use_nonrd_mode) {
diff --git a/av1/encoder/encodeframe_utils.c b/av1/encoder/encodeframe_utils.c
index e3a606c..e40698c 100644
--- a/av1/encoder/encodeframe_utils.c
+++ b/av1/encoder/encodeframe_utils.c
@@ -1309,10 +1309,78 @@
               CFL_ALPHABET_SIZE);
 }
 
+// Returns 1 when neither the above nor the left neighbor shows large motion.
+static int check_neighbor_blocks(MB_MODE_INFO **mi, int mi_stride,
+                                 const TileInfo *const tile_info, int mi_row,
+                                 int mi_col) {
+  int is_above_low_motion = 1;
+  int is_left_low_motion = 1;
+  const int thr = 24;  // Largest |mv row/col| still counted as low motion.
+
+  // Above block: examined only when mi_row > tile_info->mi_row_start.
+  if (mi_row > tile_info->mi_row_start) {
+    const MB_MODE_INFO *above_mbmi = mi[-mi_stride];
+    const int_mv above_mv = above_mbmi->mv[0];
+    if (above_mbmi->mode >= INTRA_MODE_END &&
+        (abs(above_mv.as_mv.row) > thr || abs(above_mv.as_mv.col) > thr))
+      is_above_low_motion = 0;
+  }
+
+  // Left block: examined only when mi_col > tile_info->mi_col_start.
+  if (mi_col > tile_info->mi_col_start) {
+    const MB_MODE_INFO *left_mbmi = mi[-1];
+    const int_mv left_mv = left_mbmi->mv[0];
+    if (left_mbmi->mode >= INTRA_MODE_END &&
+        (abs(left_mv.as_mv.row) > thr || abs(left_mv.as_mv.col) > thr))
+      is_left_low_motion = 0;
+  }
+
+  return (is_above_low_motion && is_left_low_motion);  // 1 only if both hold.
+}
+
+// Returns 1 if 5/8 of the collocated SAD beats all four 1-pixel-offset SADs.
+static int fast_detect_non_zero_motion(AV1_COMP *cpi, const uint8_t *src_y,
+                                       int src_ystride,
+                                       const uint8_t *last_src_y,
+                                       int last_src_ystride, int mi_row,
+                                       int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const BLOCK_SIZE bsize = cm->seq_params->sb_size;
+  unsigned int blk_sad = INT_MAX;  // Always overwritten by one branch below.
+  if (cpi->src_sad_blk_64x64 != NULL) {  // Use the stored per-block SAD.
+    const int sb_size_by_mb = (bsize == BLOCK_128X128)
+                                  ? (cm->seq_params->mib_size >> 1)
+                                  : cm->seq_params->mib_size;
+    const int sb_cols =
+        (cm->mi_params.mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+    const int sbi_col = mi_col / sb_size_by_mb;
+    const int sbi_row = mi_row / sb_size_by_mb;
+    blk_sad = (unsigned int)cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols];
+  } else {  // Otherwise compute it against last_src_y with sdf().
+    blk_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y,
+                                          last_src_ystride);
+  }
+
+  // Candidates in last_src_y: one row up, one left, one right, one row down.
+  const uint8_t *const search_pos[4] = {
+    last_src_y - last_src_ystride,
+    last_src_y - 1,
+    last_src_y + 1,
+    last_src_y + last_src_ystride,
+  };
+  unsigned int sad_arr[4];
+  cpi->ppi->fn_ptr[bsize].sdx4df(src_y, src_ystride, search_pos,
+                                 last_src_ystride, sad_arr);
+
+  blk_sad = (blk_sad * 5) >> 3;  // Scale the zero-offset SAD by 5/8.
+  return (blk_sad < sad_arr[0] && blk_sad < sad_arr[1] &&
+          blk_sad < sad_arr[2] && blk_sad < sad_arr[3]);
+}
+
 // Grade the temporal variation of the source by comparing the current sb and
 // its collocated block in the last frame.
-void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
-                           int mi_col) {
+void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
+                           int mi_row, int mi_col) {
   unsigned int tmp_sse;
   unsigned int tmp_variance;
   const BLOCK_SIZE bsize = cpi->common.seq_params->sb_size;
@@ -1363,6 +1431,9 @@
       cpi->last_source->y_height != cpi->source->y_height)
     return;
   if (!cpi->sf.rt_sf.use_rtc_tf || tmp_sse == 0) return;
+  if (cpi->sf.rt_sf.use_rtc_tf == 2 &&
+      (cpi->rc.high_source_sad || cpi->rc.frame_source_sad > 20000))
+    return;
 
   // In-place temporal filter. If psnr calculation is enabled, we store the
   // source for that.
@@ -1371,10 +1442,35 @@
   const unsigned int nmean2 = tmp_sse - tmp_variance;
   const int ac_q_step = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0,
                                          cm->seq_params->bit_depth);
-  const unsigned int threshold = 3 * ac_q_step * ac_q_step / 2;
+  // Keep the threshold for >= 360p unchanged. It will be tested and modified
+  // later when needed.
+  const unsigned int threshold = (cpi->sf.rt_sf.use_rtc_tf == 1)
+                                     ? ((3 * ac_q_step * ac_q_step) >> 1)
+                                     : 250 * ac_q_step;
 
   // TODO(yunqing): use a weighted sum instead of averaging in filtering.
   if (tmp_variance <= threshold && nmean2 <= 15) {
+    if (cpi->sf.rt_sf.use_rtc_tf == 2) {
+      // Check neighbor blocks. If neighbor blocks aren't low-motion blocks,
+      // skip temporal filtering for this block.
+      MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
+                          get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
+      const TileInfo *const tile_info = &tile_data->tile_info;
+      const int is_neighbor_blocks_low_motion = check_neighbor_blocks(
+          mi, cm->mi_params.mi_stride, tile_info, mi_row, mi_col);
+      if (!is_neighbor_blocks_low_motion) return;
+
+      // Only consider 64x64 SB for now. Need to extend to 128x128 for large SB
+      // size.
+      // Test several nearby points. If non-zero mv exists, don't do temporal
+      // filtering.
+      const int is_this_blk_low_motion =
+          fast_detect_non_zero_motion(cpi, src_y, src_ystride, last_src_y,
+                                      last_src_ystride, mi_row, mi_col);
+
+      if (!is_this_blk_low_motion) return;
+    }
+
     const int shift_x[2] = { 0, cpi->source->subsampling_x };
     const int shift_y[2] = { 0, cpi->source->subsampling_y };
     const uint8_t h = block_size_high[bsize];
diff --git a/av1/encoder/encodeframe_utils.h b/av1/encoder/encodeframe_utils.h
index 462600c..0e9c439 100644
--- a/av1/encoder/encodeframe_utils.h
+++ b/av1/encoder/encodeframe_utils.h
@@ -391,8 +391,8 @@
 void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr,
                          int wt_left, int wt_tr);
 
-void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
-                           int mi_col);
+void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
+                           int mi_row, int mi_col);
 
 void av1_reset_mbmi(CommonModeInfoParams *const mi_params, BLOCK_SIZE sb_size,
                     int mi_row, int mi_col);
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 80457a2..9f11e87 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1279,6 +1279,8 @@
       sf->rt_sf.use_comp_ref_nonrd = 0;
       sf->rt_sf.nonrd_agressive_skip = 1;
       sf->rt_sf.skip_intra_pred = 1;
+      sf->rt_sf.use_rtc_tf = 2;
+
 // TODO(kyslov) Re-enable when AV1 models are trained
 #if 0
 #if CONFIG_RT_ML_PARTITIONING