rtc: Speedup for dynamic screen content

For rtc screen content mode, speed >= 11:
Detect if the content has high motion, from
source_sad and fast/coarse ME analysis in the
scene detection. If detected, adjust some speed
features and increase the base partition threshold
to make the encoder faster. Coarse ME is done on the center
superblock to avoid setting the high_motion_screen_content flag
for scroll.

This also contains a fix to bsize_select for the
fixed partitioning.

This has a small effect on most clips in the rtc_screen set,
except for very high motion content, where it reduces the
instruction count by ~50-60%. The PSNR loss is ~0.4dB on the
high motion clip.

Change-Id: I68071da2b40731cc81ac15a8010976e6ef234776
diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c
index e9fd771..4d8be31 100644
--- a/av1/encoder/aq_cyclicrefresh.c
+++ b/av1/encoder/aq_cyclicrefresh.c
@@ -439,7 +439,8 @@
   // should we enable cyclic refresh on this frame.
   cr->apply_cyclic_refresh = 1;
   if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf.rc_cfg) ||
-      scene_change_detected || svc->temporal_layer_id > 0 ||
+      cpi->rc.high_motion_screen_content || scene_change_detected ||
+      svc->temporal_layer_id > 0 ||
       svc->prev_number_spatial_layers != svc->number_spatial_layers ||
       p_rc->avg_frame_qindex[INTER_FRAME] < qp_thresh ||
       (svc->number_spatial_layers > 1 &&
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 788ac80..cac8d81 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -545,7 +545,7 @@
     BLOCK_SIZE bsize_select = sf->part_sf.fixed_partition_size;
     if (sf->rt_sf.use_fast_fixed_part &&
         x->content_state_sb.source_sad_nonrd < kLowSad) {
-      bsize_select = BLOCK_64X64;
+      bsize_select = cm->seq_params->sb_size;
     }
     const BLOCK_SIZE bsize = seg_skip ? sb_size : bsize_select;
     av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index cf44db7..94fd17e 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -1993,8 +1993,8 @@
   return best_hash_cost;
 }
 
-static int vector_match(int16_t *ref, int16_t *src, int bwl, int search_size,
-                        int full_search, int *sad) {
+int av1_vector_match(const int16_t *ref, const int16_t *src, int bwl,
+                     int search_size, int full_search, int *sad) {
   int best_sad = INT_MAX;
   int this_sad;
   int d;
@@ -2174,11 +2174,11 @@
 
   // Find the best match per 1-D search
   best_int_mv->as_fullmv.col =
-      vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize], search_size_width,
-                   full_search, &best_sad_col);
+      av1_vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize],
+                       search_size_width, full_search, &best_sad_col);
   best_int_mv->as_fullmv.row =
-      vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize], search_size_height,
-                   full_search, &best_sad_row);
+      av1_vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize],
+                       search_size_height, full_search, &best_sad_row);
 
   // For screen: select between horiz or vert motion.
   if (is_screen) {
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index d6dc8cb..7dd32e0 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -262,6 +262,9 @@
 
 int av1_init_search_range(int size);
 
+int av1_vector_match(const int16_t *ref, const int16_t *src, int bwl,
+                     int search_size, int full_search, int *sad);
+
 unsigned int av1_int_pro_motion_estimation(
     const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
     int mi_col, const MV *ref_mv, unsigned int *y_sad_zero,
diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c
index ec9ae10..a34ce78 100644
--- a/av1/encoder/ratectrl.c
+++ b/av1/encoder/ratectrl.c
@@ -33,6 +33,7 @@
 #include "av1/encoder/encoder_utils.h"
 #include "av1/encoder/encode_strategy.h"
 #include "av1/encoder/gop_structure.h"
+#include "av1/encoder/mcomp.h"
 #include "av1/encoder/random.h"
 #include "av1/encoder/ratectrl.h"
 
@@ -3017,6 +3018,80 @@
   return 0;
 }
 
+// Returns the lower of the best column/row 1-D match SADs; the other MV is 0.
+static unsigned int estimate_scroll_motion(
+    const AV1_COMP *cpi, uint8_t *src_buf, uint8_t *last_src_buf,
+    int src_stride, int ref_stride, BLOCK_SIZE bsize, int pos_col, int pos_row,
+    int *best_intmv_col, int *best_intmv_row) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int full_search = 1;  // Used for both 1-D searches below.
+  // Round border down to a multiple of 16.
+  const int border = (cpi->oxcf.border_in_pixels >> 4) << 4;
+  // Make search_size_height larger to capture more common vertical scroll.
+  // Increase the search if last two frames were dropped.
+  // Values set based on screen test set.
+  int search_size_width = 96;
+  int search_size_height = cpi->rc.drop_count_consec > 1 ? 224 : 192;
+  // If the search range would reach past the frame plus border, use border.
+  if ((pos_col - search_size_width < -border) ||
+      (pos_col + search_size_width > cm->width + border))
+    search_size_width = border;
+  if ((pos_row - search_size_height < -border) ||
+      (pos_row + search_size_height > cm->height + border))
+    search_size_height = border;
+  const uint8_t *ref_buf;
+  const int row_norm_factor = mi_size_high_log2[bsize] + 1;
+  const int col_norm_factor = 3 + (bw >> 5);
+  const int ref_buf_width = (search_size_width << 1) + bw;
+  const int ref_buf_height = (search_size_height << 1) + bh;
+  int16_t *hbuf = (int16_t *)aom_malloc(ref_buf_width * sizeof(*hbuf));
+  int16_t *vbuf = (int16_t *)aom_malloc(ref_buf_height * sizeof(*vbuf));
+  int16_t *src_hbuf = (int16_t *)aom_malloc(bw * sizeof(*src_hbuf));
+  int16_t *src_vbuf = (int16_t *)aom_malloc(bh * sizeof(*src_vbuf));
+  if (!hbuf || !vbuf || !src_hbuf || !src_vbuf) {  // Any allocation failed.
+    aom_free(hbuf);
+    aom_free(vbuf);
+    aom_free(src_hbuf);
+    aom_free(src_vbuf);
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate hbuf, vbuf, src_hbuf, or src_vbuf");
+  }
+  // Set up prediction 1-D reference set for rows.
+  ref_buf = last_src_buf - search_size_width;
+  aom_int_pro_row(hbuf, ref_buf, ref_stride, ref_buf_width, bh,
+                  row_norm_factor);
+  // Set up prediction 1-D reference set for cols
+  ref_buf = last_src_buf - search_size_height * ref_stride;
+  aom_int_pro_col(vbuf, ref_buf, ref_stride, bw, ref_buf_height,
+                  col_norm_factor);
+  // Set up src 1-D reference set
+  aom_int_pro_row(src_hbuf, src_buf, src_stride, bw, bh, row_norm_factor);
+  aom_int_pro_col(src_vbuf, src_buf, src_stride, bw, bh, col_norm_factor);
+  unsigned int best_sad;
+  int best_sad_col, best_sad_row;
+  // Find the best match per 1-D search
+  *best_intmv_col =
+      av1_vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize],
+                       search_size_width, full_search, &best_sad_col);
+  *best_intmv_row =
+      av1_vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize],
+                       search_size_height, full_search, &best_sad_row);
+  if (best_sad_col < best_sad_row) {  // Keep only the lower-SAD direction.
+    *best_intmv_row = 0;
+    best_sad = best_sad_col;
+  } else {
+    *best_intmv_col = 0;
+    best_sad = best_sad_row;
+  }
+  aom_free(hbuf);
+  aom_free(vbuf);
+  aom_free(src_hbuf);
+  aom_free(src_vbuf);
+  return best_sad;
+}
+
 /*!\brief Check for scene detection, for 1 pass real-time mode.
  *
  * Compute average source sad (temporal sad: between current source and
@@ -3184,6 +3259,49 @@
   if (num_samples > 0)
     rc->percent_blocks_with_motion =
         ((num_samples - num_zero_temp_sad) * 100) / num_samples;
+  // Update the high_motion_screen_content flag on TL0. Avoid the update
+  // if too many consecutive frame drops occurred.
+  const uint64_t thresh_high_motion = 9 * 64 * 64;
+  if (cpi->svc.temporal_layer_id == 0 && rc->drop_count_consec < 3) {
+    cpi->rc.high_motion_screen_content = 0;
+    if (cpi->oxcf.speed >= 11 &&
+        cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+        rc->percent_blocks_with_motion > 40 &&
+        rc->prev_avg_source_sad > thresh_high_motion &&
+        rc->avg_source_sad > thresh_high_motion &&
+        rc->avg_frame_low_motion < 60 && unscaled_src->y_width >= 1280 &&
+        unscaled_src->y_height >= 720) {
+      cpi->rc.high_motion_screen_content = 1;
+      // Compute fast coarse/global motion for 128x128 superblock centered
+      // at middle of frames, to determine if motion is scroll.
+      int pos_col = (unscaled_src->y_width >> 1) - 64;
+      int pos_row = (unscaled_src->y_height >> 1) - 64;
+      src_y = unscaled_src->y_buffer + pos_row * src_ystride + pos_col;
+      last_src_y =
+          unscaled_last_src->y_buffer + pos_row * last_src_ystride + pos_col;
+      int best_intmv_col = 0;
+      int best_intmv_row = 0;
+      unsigned int y_sad = estimate_scroll_motion(
+          cpi, src_y, last_src_y, src_ystride, last_src_ystride, BLOCK_128X128,
+          pos_col, pos_row, &best_intmv_col, &best_intmv_row);
+      if (y_sad < 100 && (abs(best_intmv_col) > 16 || abs(best_intmv_row) > 16))
+        cpi->rc.high_motion_screen_content = 0;
+    }
+    // Pass the flag value to all layer frames.
+    if (cpi->svc.number_spatial_layers > 1 ||
+        cpi->svc.number_temporal_layers > 1) {
+      SVC *svc = &cpi->svc;
+      for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+        for (int tl = 1; tl < svc->number_temporal_layers; ++tl) {
+          const int layer =
+              LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+          LAYER_CONTEXT *lc = &svc->layer_context[layer];
+          RATE_CONTROL *lrc = &lc->rc;
+          lrc->high_motion_screen_content = rc->high_motion_screen_content;
+        }
+      }
+    }
+  }
   // Scene detection is only on base SLO, and using full/orignal resolution.
   // Pass the state to the upper spatial layers.
   if (cpi->svc.number_spatial_layers > 1) {
diff --git a/av1/encoder/ratectrl.h b/av1/encoder/ratectrl.h
index 5fcb65e..0a5cfbc 100644
--- a/av1/encoder/ratectrl.h
+++ b/av1/encoder/ratectrl.h
@@ -190,6 +190,7 @@
   int sframe_due;
 
   int high_source_sad;
+  int high_motion_screen_content;
   uint64_t avg_source_sad;
   uint64_t prev_avg_source_sad;
   uint64_t frame_source_sad;
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 31fe03a..4b6ea16 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1608,10 +1608,18 @@
           sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V;
       }
     }
-    if (cpi->rc.max_block_source_sad > 20000 &&
-        cpi->rc.frame_source_sad > 100 && speed >= 6 &&
-        (cpi->rc.percent_blocks_with_motion > 1 ||
-         cpi->svc.last_layer_dropped[0])) {
+    if (speed >= 11 && cpi->rc.high_motion_screen_content) {
+      sf->rt_sf.higher_thresh_scene_detection = 1;
+      sf->rt_sf.force_only_last_ref = 1;
+      sf->rt_sf.use_nonrd_filter_search = 0;
+      sf->part_sf.fixed_partition_size = BLOCK_32X32;
+      sf->rt_sf.use_fast_fixed_part = 1;
+      sf->rt_sf.increase_source_sad_thresh = 1;
+      sf->rt_sf.selective_cdf_update = 1;
+    } else if (cpi->rc.max_block_source_sad > 20000 &&
+               cpi->rc.frame_source_sad > 100 && speed >= 6 &&
+               (cpi->rc.percent_blocks_with_motion > 1 ||
+                cpi->svc.last_layer_dropped[0])) {
       sf->mv_sf.search_method = NSTEP;
       sf->rt_sf.fullpel_search_step_param = 2;
     }
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index e5908f4..0b449e8 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -627,14 +627,12 @@
   }
 }
 
-// Increase partition thresholds for noisy content. Apply it only for
-// superblocks where sumdiff is low, as we assume the sumdiff of superblock
-// whose only change is due to noise will be low (i.e, noise will average
-// out over large block).
-static AOM_INLINE int64_t tune_thresh_noisy_content(AV1_COMP *cpi,
-                                                    int64_t threshold_base,
-                                                    int content_lowsumdiff,
-                                                    int num_pixels) {
+// Increase the base partition threshold, based on content and noise level.
+static AOM_INLINE int64_t tune_base_thresh_content(AV1_COMP *cpi,
+                                                   int64_t threshold_base,
+                                                   int content_lowsumdiff,
+                                                   int source_sad_nonrd,
+                                                   int num_pixels) {
   AV1_COMMON *const cm = &cpi->common;
   int64_t updated_thresh_base = threshold_base;
   if (cpi->noise_estimate.enabled && content_lowsumdiff &&
@@ -647,23 +645,12 @@
              !cpi->sf.rt_sf.prefer_large_partition_blocks)
       updated_thresh_base = (5 * updated_thresh_base) >> 2;
   }
-  // TODO(kyslov) Enable var based partition adjusment on temporal denoising
-#if 0  // CONFIG_AV1_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
-      cpi->oxcf.speed > 5 && cpi->denoiser.denoising_level >= kDenLow)
-      updated_thresh_base =
-          av1_scale_part_thresh(updated_thresh_base, cpi->denoiser.denoising_level,
-                                content_state, cpi->svc.temporal_layer_id);
-  else
-    threshold_base =
-        scale_part_thresh_content(updated_thresh_base, cpi->oxcf.speed, cm->width,
-                                  cm->height, cpi->ppi->rtc_ref.non_reference_frame);
-#else
-  // Increase base variance threshold based on content_state/sum_diff level.
   updated_thresh_base = scale_part_thresh_content(
       updated_thresh_base, cpi->oxcf.speed, cm->width, cm->height,
       cpi->ppi->rtc_ref.non_reference_frame);
-#endif
+  if (cpi->oxcf.speed >= 11 && source_sad_nonrd > kLowSad &&
+      cpi->rc.high_motion_screen_content)
+    updated_thresh_base = updated_thresh_base << 5;
   return updated_thresh_base;
 }
 
@@ -686,8 +673,8 @@
     return;
   }
 
-  threshold_base = tune_thresh_noisy_content(cpi, threshold_base,
-                                             content_lowsumdiff, num_pixels);
+  threshold_base = tune_base_thresh_content(
+      cpi, threshold_base, content_lowsumdiff, source_sad_nonrd, num_pixels);
   thresholds[0] = threshold_base >> 1;
   thresholds[1] = threshold_base;
   thresholds[3] = threshold_base << threshold_left_shift;