Improve VBP motion estimation for RTC speed >= 9

For RTC speed >= 9, enabled evaluation of the neighbours' MVs
for superblocks with low and medium source SAD. This sped up
the RTC encoder.

Borg test results:
            avg_psnr:  ovr_psnr: ssim:  encoding_spdup:
speed 9:
rtc_derf:    0.063      0.059    0.091   0.681
rtc:         0.303      0.302    0.604   1.899
rtc_1080p:   0.082      0.082    0.035   0.639
speed 10:
rtc:         0.342      0.360    0.682   1.817
rtc_1080p:  -0.168     -0.166    0.073   0.450

STATS_CHANGED

Change-Id: I837773254d87b82bc064128ac4afa616492a0454
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 3fd1bcc..7e244d0 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1305,6 +1305,10 @@
       sf->rt_sf.use_adaptive_subpel_search = false;
     }
     if (speed >= 10) {
+      // TODO(yunqingwang@google.com): To be conservative, disable
+      // sf->rt_sf.estimate_motion_for_var_based_partition = 3 for speed 10/qvga
+      // for now. May enable it in the future.
+      sf->rt_sf.estimate_motion_for_var_based_partition = 0;
       sf->rt_sf.skip_intra_pred = 2;
       sf->rt_sf.hybrid_intra_pickmode = 3;
       sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 1;
@@ -1443,6 +1447,7 @@
     // estimate_motion_for_var_based_partition == 2 helps here.
     if (sf->rt_sf.estimate_motion_for_var_based_partition == 2)
       sf->rt_sf.estimate_motion_for_var_based_partition = 1;
+    if (speed >= 9) sf->rt_sf.estimate_motion_for_var_based_partition = 0;
   }
   // Screen settings.
   if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
@@ -1506,6 +1511,7 @@
     // for screen contents.
     if (sf->rt_sf.estimate_motion_for_var_based_partition == 2)
       sf->rt_sf.estimate_motion_for_var_based_partition = 1;
+    if (speed >= 9) sf->rt_sf.estimate_motion_for_var_based_partition = 0;
   }
 }
 
@@ -1777,7 +1783,7 @@
   }
   if (speed >= 9) {
     sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_3;
-    sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+    sf->rt_sf.estimate_motion_for_var_based_partition = 3;
     sf->rt_sf.prefer_large_partition_blocks = 3;
     sf->rt_sf.skip_intra_pred = 2;
     sf->rt_sf.var_part_split_threshold_shift = 9;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index fd475d9..da0fd7a 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -1396,6 +1396,7 @@
   // 0 - Only use zero MV
   // 1 - perform coarse ME
   // 2 - perform coarse ME, and also use neighbours' MVs
+  // 3 - use neighbours' MVs without performing coarse ME
   int estimate_motion_for_var_based_partition;
 
   // For nonrd_use_partition: mode of extra check of leaf partition
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index f9dd0a7..995b64e 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -1210,9 +1210,20 @@
   }
 }
 
+static AOM_FORCE_INLINE int mv_distance(const FULLPEL_MV *mv0,
+                                        const FULLPEL_MV *mv1) {
+  return abs(mv0->row - mv1->row) + abs(mv0->col - mv1->col);
+}
+
 static AOM_INLINE void evaluate_neighbour_mvs(AV1_COMP *cpi, MACROBLOCK *x,
                                               unsigned int *y_sad,
-                                              bool is_small_sb) {
+                                              bool is_small_sb,
+                                              int est_motion) {
+  const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd;
+  // TODO(yunqingwang@google.com): test if this condition works with other
+  // speeds.
+  if (est_motion > 2 && source_sad_nonrd > kMedSad) return;
+
   MACROBLOCKD *xd = &x->e_mbd;
   BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
   MB_MODE_INFO *mi = xd->mi[0];
@@ -1227,6 +1238,7 @@
 
   // Current best MV
   FULLPEL_MV best_mv = get_fullmv_from_mv(&mi->mv[0].as_mv);
+  const int multi = (est_motion > 2 && source_sad_nonrd > kLowSad) ? 7 : 8;
 
   if (xd->up_available) {
     const MB_MODE_INFO *above_mbmi = xd->above_mbmi;
@@ -1236,7 +1248,7 @@
       clamp_mv(&temp, &subpel_mv_limits);
       above_mv = get_fullmv_from_mv(&temp);
 
-      if (above_mv.row != best_mv.row || above_mv.col != best_mv.col) {
+      if (mv_distance(&best_mv, &above_mv) > 0) {
         uint8_t const *ref_buf =
             get_buf_from_fullmv(&xd->plane[0].pre[0], &above_mv);
         above_y_sad = cpi->ppi->fn_ptr[bsize].sdf(
@@ -1253,8 +1265,8 @@
       clamp_mv(&temp, &subpel_mv_limits);
       left_mv = get_fullmv_from_mv(&temp);
 
-      if ((left_mv.row != best_mv.row || left_mv.col != best_mv.col) &&
-          (left_mv.row != above_mv.row || left_mv.col != above_mv.col)) {
+      if (mv_distance(&best_mv, &left_mv) > 0 &&
+          mv_distance(&above_mv, &left_mv) > 0) {
         uint8_t const *ref_buf =
             get_buf_from_fullmv(&xd->plane[0].pre[0], &left_mv);
         left_y_sad = cpi->ppi->fn_ptr[bsize].sdf(
@@ -1264,12 +1276,12 @@
     }
   }
 
-  if (above_y_sad < *y_sad && above_y_sad < left_y_sad) {
+  if (above_y_sad < ((multi * *y_sad) >> 3) && above_y_sad < left_y_sad) {
     *y_sad = above_y_sad;
     mi->mv[0].as_mv = get_mv_from_fullmv(&above_mv);
     clamp_mv(&mi->mv[0].as_mv, &subpel_mv_limits);
   }
-  if (left_y_sad < *y_sad && left_y_sad < above_y_sad) {
+  if (left_y_sad < ((multi * *y_sad) >> 3) && left_y_sad < above_y_sad) {
     *y_sad = left_y_sad;
     mi->mv[0].as_mv = get_mv_from_fullmv(&left_mv);
     clamp_mv(&mi->mv[0].as_mv, &subpel_mv_limits);
@@ -1347,7 +1359,10 @@
     mi->bsize = cm->seq_params->sb_size;
     mi->mv[0].as_int = 0;
     mi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
-    if (cpi->sf.rt_sf.estimate_motion_for_var_based_partition) {
+
+    const int est_motion =
+        cpi->sf.rt_sf.estimate_motion_for_var_based_partition;
+    if (est_motion == 1 || est_motion == 2) {
       if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
         const MV dummy_mv = { 0, 0 };
         *y_sad = av1_int_pro_motion_estimation(cpi, x, cm->seq_params->sb_size,
@@ -1365,9 +1380,8 @@
     // Evaluate if neighbours' MVs give better predictions. Zero MV is tested
     // already, so only non-zero MVs are tested here. Here the neighbour blocks
     // are the first block above or left to this superblock.
-    if (cpi->sf.rt_sf.estimate_motion_for_var_based_partition == 2 &&
-        (xd->up_available || xd->left_available))
-      evaluate_neighbour_mvs(cpi, x, y_sad, is_small_sb);
+    if (est_motion >= 2 && (xd->up_available || xd->left_available))
+      evaluate_neighbour_mvs(cpi, x, y_sad, is_small_sb, est_motion);
 
     *y_sad_last = *y_sad;
   }