rtc-screen : Force zeromv-skip at block level

In the parent version, the zeromv-skip decision was based on the
SAD metric calculated w.r.t. the LAST_FRAME recon at superblock
level. In this CL, the decision is extended to block level
by recalculating the SAD w.r.t. the LAST_FRAME recon. This CL
speeds up encoding by forcing zeromv-skip for the blocks in a
superblock that have a low recon SAD. The decision is also
extended to the blocks in partial superblocks at frame
boundaries.

    Instruction Count        BD-Rate Loss(%)
cpu   Reduction(%)     avg.psnr   ovr.psnr    ssim
 9       1.398         -1.5024    -0.1591   -0.8699

STATS_CHANGED for rtc-screen speed 9

Change-Id: I5be439fd52b293c78c7ba172d446fb2882cfccbe
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index fe46aec..7a35256 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -1017,9 +1017,16 @@
    */
   int cnt_zeromv;
 
-  /*!\brief Flag to force zeromv-skip block, for nonrd path.
+  /*!\brief Flag to force zeromv-skip at superblock level, for nonrd path.
+   *
+   * 0/1 imply zeromv-skip is disabled/enabled. 2 implies that the blocks
+   * in the superblock may be marked as zeromv-skip at block level.
    */
-  int force_zeromv_skip;
+  int force_zeromv_skip_for_sb;
+
+  /*!\brief Flag to force zeromv-skip at block level, for nonrd path.
+   */
+  int force_zeromv_skip_for_blk;
 
   /*! \brief Previous segment id for which qmatrices were updated.
    * This is used to bypass setting of qmatrices if no change in qindex.
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 857e069..e2381cd 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -912,7 +912,7 @@
     x->content_state_sb.source_sad_rd = kMedSad;
     x->content_state_sb.lighting_change = 0;
     x->content_state_sb.low_sumdiff = 0;
-    x->force_zeromv_skip = 0;
+    x->force_zeromv_skip_for_sb = 0;
 
     if (cpi->oxcf.mode == ALLINTRA) {
       x->intra_sb_rdmult_modifier = 128;
@@ -1387,6 +1387,43 @@
 #endif  // !CONFIG_REALTIME_ONLY
 }
 
+#define FORCE_ZMV_SKIP_128X128_BLK_DIFF 10000
+#define FORCE_ZMV_SKIP_MAX_PER_PIXEL_DIFF 4
+
+// Populates block level thresholds for force zeromv-skip decision
+static void populate_thresh_to_force_zeromv_skip(AV1_COMP *cpi) {
+  if (cpi->sf.rt_sf.part_early_exit_zeromv == 0) return;
+
+  // Threshold for forcing zeromv-skip decision is as below:
+  // For 128x128 blocks, threshold is 10000 and per pixel threshold is 0.6103.
+  // For 64x64 blocks, threshold is 5000 and per pixel threshold is 1.221
+  // allowing slightly higher error for smaller blocks.
+  //     Threshold of 64x64 block              Area of 64x64 block         1  1
+  // ------------------------------------=sqrt(---------------------)=sqrt(-)=-
+  //     Threshold of 128x128 block            Area of 128x128 block       4  2
+  // Thus, per pixel thresholds for blocks of size 32x32, 16x16,...  can be
+  // chosen as 2.442, 4.884,.... As the per pixel error tends to be higher for
+  // small blocks, the same is clipped to 4.
+  const unsigned int thresh_exit_128x128_part = FORCE_ZMV_SKIP_128X128_BLK_DIFF;
+  const int num_128x128_pix =
+      block_size_wide[BLOCK_128X128] * block_size_high[BLOCK_128X128];
+
+  for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; bsize++) {
+    const int num_block_pix = block_size_wide[bsize] * block_size_high[bsize];
+
+    // Calculate the threshold for zeromv-skip decision based on area of the
+    // partition
+    unsigned int thresh_exit_part_blk =
+        (unsigned int)(thresh_exit_128x128_part *
+                           sqrt((double)num_block_pix / num_128x128_pix) +
+                       0.5);
+    thresh_exit_part_blk = AOMMIN(
+        thresh_exit_part_blk,
+        (unsigned int)(FORCE_ZMV_SKIP_MAX_PER_PIXEL_DIFF * num_block_pix));
+    cpi->zeromv_skip_thresh_exit_part[bsize] = thresh_exit_part_blk;
+  }
+}
+
 /*!\brief Encoder setup(only for the current frame), encoding, and recontruction
  * for a single frame
  *
@@ -1650,6 +1687,7 @@
   // has to be called after 'skip_mode_flag' is initialized.
   av1_initialize_rd_consts(cpi);
   av1_set_sad_per_bit(cpi, &x->sadperbit, quant_params->base_qindex);
+  populate_thresh_to_force_zeromv_skip(cpi);
 
   enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy;
   enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy;
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index a5a83f4..f35843f 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -3411,6 +3411,11 @@
    * Struct for the reference structure for RTC.
    */
   RTC_REF rtc_ref;
+
+  /*!
+   * Block level thresholds to force zeromv-skip at partition level.
+   */
+  unsigned int zeromv_skip_thresh_exit_part[BLOCK_SIZES_ALL];
 } AV1_COMP;
 
 /*!
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index 9eaa578..73b8a95 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -630,7 +630,7 @@
   TX_SIZE tx_size;
   int k;
 
-  if (x->force_zeromv_skip) {
+  if (x->force_zeromv_skip_for_blk) {
     *early_term = 1;
     rd_stats->rate = 0;
     rd_stats->dist = 0;
@@ -785,7 +785,14 @@
 static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
                               MACROBLOCK *x, MACROBLOCKD *xd,
                               RD_STATS *rd_stats, unsigned int *var_out,
-                              int calculate_rd) {
+                              int calculate_rd, int *early_term) {
+  if (x->force_zeromv_skip_for_blk && early_term != NULL) {
+    *early_term = 1;
+    rd_stats->rate = 0;
+    rd_stats->dist = 0;
+    rd_stats->sse = 0;
+  }
+
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
@@ -1542,7 +1549,7 @@
     xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0];
     av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
     unsigned int var;
-    model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, &var, 1);
+    model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, &var, 1, NULL);
 
     const int16_t mode_ctx =
         av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame);
@@ -1654,7 +1661,7 @@
                                 &pf_rd_stats[i], this_early_term, 1, best_sse,
                                 &curr_var, UINT_MAX);
     else
-      model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], &curr_var, 1);
+      model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], &curr_var, 1, NULL);
     pf_rd_stats[i].rate += av1_get_switchable_rate(
         x, xd, cm->features.interp_filter, cm->seq_params->enable_dual_filter);
     cost = RDCOST(x->rdmult, pf_rd_stats[i].rate, pf_rd_stats[i].dist);
@@ -1801,7 +1808,7 @@
                                   &pf_rd_stats[i], this_early_term, 1, best_sse,
                                   NULL, UINT_MAX);
       else
-        model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], NULL, 1);
+        model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], NULL, 1, NULL);
       pf_rd_stats[i].rate +=
           av1_get_switchable_rate(x, xd, cm->features.interp_filter,
                                   cm->seq_params->enable_dual_filter);
@@ -1864,7 +1871,7 @@
                                     &pf_rd_stats[i], this_early_term, 1,
                                     best_sse, NULL, UINT_MAX);
         else
-          model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], NULL, 1);
+          model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], NULL, 1, NULL);
 
         pf_rd_stats[i].rate +=
             mode_costs->motion_mode_cost[bsize][mi->motion_mode];
@@ -2096,7 +2103,7 @@
   }
 
   if (use_last_ref_frame &&
-      (x->nonrd_prune_ref_frame_search > 2 || x->force_zeromv_skip ||
+      (x->nonrd_prune_ref_frame_search > 2 || x->force_zeromv_skip_for_blk ||
        (x->nonrd_prune_ref_frame_search > 1 && bsize > BLOCK_64X64))) {
     use_golden_ref_frame = 0;
     use_alt_ref_frame = 0;
@@ -2790,7 +2797,7 @@
   int use_zeromv =
       cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN ||
       ((cpi->oxcf.speed >= 9 && cpi->rc.avg_frame_low_motion > 70) ||
-       cpi->sf.rt_sf.nonrd_agressive_skip || x->force_zeromv_skip);
+       cpi->sf.rt_sf.nonrd_agressive_skip || x->force_zeromv_skip_for_blk);
   int skip_pred_mv = 0;
   const int num_inter_modes =
       use_zeromv ? NUM_INTER_MODES_REDUCED : NUM_INTER_MODES_RT;
@@ -2897,7 +2904,7 @@
   get_ref_frame_use_mask(cpi, x, mi, mi_row, mi_col, bsize, gf_temporal_ref,
                          use_ref_frame_mask, &force_skip_low_temp_var);
 
-  skip_pred_mv = x->force_zeromv_skip ||
+  skip_pred_mv = x->force_zeromv_skip_for_blk ||
                  (x->nonrd_prune_ref_frame_search > 2 &&
                   x->color_sensitivity[0] != 2 && x->color_sensitivity[1] != 2);
 
@@ -3004,7 +3011,7 @@
 
     if (!use_ref_frame_mask[ref_frame]) continue;
 
-    if (x->force_zeromv_skip &&
+    if (x->force_zeromv_skip_for_blk &&
         ((!(this_mode == NEARESTMV &&
             frame_mv[this_mode][ref_frame].as_int == 0) &&
           this_mode != GLOBALMV) ||
@@ -3271,7 +3278,8 @@
                                   &this_early_term, 0, best_pickmode.best_sse,
                                   &var, var_threshold);
       } else {
-        model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, &var, 0);
+        model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, &var, 0,
+                          &this_early_term);
       }
       if (!comp_pred) {
         vars[INTER_OFFSET(this_mode)][ref_frame] = var;
@@ -3494,7 +3502,7 @@
   ms_stat.num_nonskipped_searches[bsize][DC_PRED]++;
 #endif
 
-  if (!x->force_zeromv_skip)
+  if (!x->force_zeromv_skip_for_blk)
     estimate_intra_mode(cpi, x, bsize, best_early_term,
                         ref_costs_single[INTRA_FRAME], reuse_inter_pred,
                         &orig_dst, tmp, &this_mode_pred, &best_rdc,
@@ -3507,7 +3515,7 @@
 
   // Check for IDTX: based only on Y channel, so avoid when color_sen is set.
   if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && !skip_idtx_palette &&
-      !cpi->oxcf.txfm_cfg.use_inter_dct_only && !x->force_zeromv_skip &&
+      !cpi->oxcf.txfm_cfg.use_inter_dct_only && !x->force_zeromv_skip_for_blk &&
       is_inter_mode(best_pickmode.best_mode) &&
       (!cpi->sf.rt_sf.prune_idtx_nonrd ||
        (cpi->sf.rt_sf.prune_idtx_nonrd && bsize <= BLOCK_32X32 &&
@@ -3545,7 +3553,7 @@
       av1_allow_palette(cpi->common.features.allow_screen_content_tools,
                         mi->bsize);
   try_palette = try_palette && is_mode_intra(best_pickmode.best_mode) &&
-                x->source_variance > 0 && !x->force_zeromv_skip &&
+                x->source_variance > 0 && !x->force_zeromv_skip_for_blk &&
                 (cpi->rc.high_source_sad || x->source_variance > 500);
 
   if (try_palette) {
diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c
index 9a7aee3..d704bb4 100644
--- a/av1/encoder/partition_search.c
+++ b/av1/encoder/partition_search.c
@@ -2223,6 +2223,46 @@
                            cm->seq_params->sb_size, bsize, mi_row, mi_col);
 }
 
+static int get_force_zeromv_skip_flag_for_blk(const AV1_COMP *cpi,
+                                              const MACROBLOCK *x,
+                                              BLOCK_SIZE bsize) {
+  // Force zero MV skip based on SB level decision
+  if (x->force_zeromv_skip_for_sb < 2) return x->force_zeromv_skip_for_sb;
+
+  // For blocks of size equal to superblock size, the decision has already
+  // been made at superblock level, so zeromv-skip is not forced here.
+  const AV1_COMMON *const cm = &cpi->common;
+  if (bsize == cm->seq_params->sb_size) return 0;
+
+  const int num_planes = av1_num_planes(cm);
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const unsigned int thresh_exit_part_y =
+      cpi->zeromv_skip_thresh_exit_part[bsize];
+  const unsigned int thresh_exit_part_uv =
+      CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part_y);
+  const unsigned int thresh_exit_part[MAX_MB_PLANE] = { thresh_exit_part_y,
+                                                        thresh_exit_part_uv,
+                                                        thresh_exit_part_uv };
+  const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+  const struct scale_factors *const sf =
+      get_ref_scale_factors_const(cm, LAST_FRAME);
+
+  struct buf_2d yv12_mb[MAX_MB_PLANE];
+  av1_setup_pred_block(xd, yv12_mb, yv12, sf, sf, num_planes);
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const struct macroblock_plane *const p = &x->plane[plane];
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE bs =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    const unsigned int plane_sad = cpi->ppi->fn_ptr[bs].sdf(
+        p->src.buf, p->src.stride, yv12_mb[plane].buf, yv12_mb[plane].stride);
+    assert(plane < MAX_MB_PLANE);
+    if (plane_sad >= thresh_exit_part[plane]) return 0;
+  }
+  return 1;
+}
+
 /*!\brief Top level function to pick block mode for non-RD optimized case
  *
  * \ingroup partition_search
@@ -2291,7 +2331,11 @@
     p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
   }
   for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
-  if (!x->force_zeromv_skip) {
+
+  x->force_zeromv_skip_for_blk =
+      get_force_zeromv_skip_flag_for_blk(cpi, x, bsize);
+
+  if (!x->force_zeromv_skip_for_blk) {
     x->source_variance = av1_get_perpixel_variance_facade(
         cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y);
   }
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index ed3e9b4..7e57bdc 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1406,7 +1406,7 @@
     // TODO(marpan): Check settings for speed 7 and 8.
     if (speed >= 9) {
       sf->rt_sf.prune_idtx_nonrd = 1;
-      sf->rt_sf.part_early_exit_zeromv = 1;
+      sf->rt_sf.part_early_exit_zeromv = 2;
       sf->rt_sf.skip_lf_screen = 1;
       sf->rt_sf.use_nonrd_filter_search = 0;
       sf->rt_sf.nonrd_prune_ref_frame_search = 3;
@@ -1421,6 +1421,7 @@
         sf->part_sf.disable_8x8_part_based_on_qidx = 1;
       sf->rt_sf.set_zeromv_skip_based_on_source_sad = 2;
       sf->rt_sf.screen_content_cdef_filter_qindex_thresh = 80;
+      sf->rt_sf.part_early_exit_zeromv = 1;
     }
     sf->rt_sf.skip_cdef_sb = 1;
     sf->rt_sf.use_rtc_tf = 0;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 1888e23..a740cde 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -1550,6 +1550,9 @@
 
   // For nonrd: early exit out of variance partition that sets the
   // block size to superblock size, and sets mode to zeromv-last skip.
+  // 0: disabled
+  // 1: zeromv-skip is enabled at SB level only
+  // 2: zeromv-skip is enabled at SB level and coding block level
   int part_early_exit_zeromv;
 
   // Early terminate inter mode search based on sse in non-rd path.
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index 53d3d2a..235a1d9 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -1373,14 +1373,12 @@
   chroma_check(cpi, x, bsize, y_sad_last, y_sad_g, is_key_frame, zero_motion,
                uv_sad);
 
-  x->force_zeromv_skip = 0;
+  x->force_zeromv_skip_for_sb = 0;
   const bool is_set_force_zeromv_skip =
       is_set_force_zeromv_skip_based_on_src_sad(
           cpi->sf.rt_sf.set_zeromv_skip_based_on_source_sad,
           x->content_state_sb.source_sad_nonrd);
 
-  const unsigned int thresh_exit_part =
-      (cm->seq_params->sb_size == BLOCK_64X64) ? 5000 : 10000;
   // If the superblock is completely static (zero source sad) and
   // the y_sad (relative to LAST ref) is very small, take the sb_size partition
   // and exit, and force zeromv_last skip mode for nonrd_pickmode.
@@ -1391,18 +1389,25 @@
       cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
       cpi->cyclic_refresh->apply_cyclic_refresh &&
       segment_id == CR_SEGMENT_ID_BASE && is_set_force_zeromv_skip &&
-      ref_frame_partition == LAST_FRAME && xd->mi[0]->mv[0].as_int == 0 &&
-      y_sad < thresh_exit_part && uv_sad[0]<(3 * thresh_exit_part)>> 2 &&
-      uv_sad[1]<(3 * thresh_exit_part)>> 2) {
+      ref_frame_partition == LAST_FRAME && xd->mi[0]->mv[0].as_int == 0) {
     const int block_width = mi_size_wide[cm->seq_params->sb_size];
     const int block_height = mi_size_high[cm->seq_params->sb_size];
+    const unsigned int thresh_exit_part_y =
+        cpi->zeromv_skip_thresh_exit_part[bsize];
+    const unsigned int thresh_exit_part_uv =
+        CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part_y);
     if (mi_col + block_width <= tile->mi_col_end &&
-        mi_row + block_height <= tile->mi_row_end) {
+        mi_row + block_height <= tile->mi_row_end &&
+        y_sad < thresh_exit_part_y && uv_sad[0] < thresh_exit_part_uv &&
+        uv_sad[1] < thresh_exit_part_uv) {
       set_block_size(cpi, mi_row, mi_col, bsize);
-      x->force_zeromv_skip = 1;
+      x->force_zeromv_skip_for_sb = 1;
       if (vt2) aom_free(vt2);
       if (vt) aom_free(vt);
       return 0;
+    } else if (x->content_state_sb.source_sad_nonrd == kZeroSad &&
+               cpi->sf.rt_sf.part_early_exit_zeromv >= 2) {
+      x->force_zeromv_skip_for_sb = 2;
     }
   }
 
diff --git a/av1/encoder/var_based_part.h b/av1/encoder/var_based_part.h
index 0136268..7febc0e 100644
--- a/av1/encoder/var_based_part.h
+++ b/av1/encoder/var_based_part.h
@@ -28,6 +28,8 @@
   100  // Use increased thresholds for midres for speed 9 when qindex is above
        // this threshold
 
+#define CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part) \
+  ((3 * (thresh_exit_part)) >> 2)
 /*!\brief Set the thresholds for variance based partition.
  *
  * Set the variance split thresholds for following the block sizes: