rtc: Reduce blk-level MV pel precision

This CL renames the sf force_half_pel_block to
reduce_mv_pel_precision and extends it to speeds 7 and 8 for the
rtc set with a new algorithm. The new algorithm reduces MV precision
for relatively static (e.g. background), low-complexity large blocks
using source variance, SAD, bsize and qp.

The speed-up / BD Rate trade-off is:
 ---------------------------------------------------------
|cpu|Resolution|Instr. Count|    BD-Rate Drop (%)         |
|   |          |Reduction(%)|avg. psnr|ovr. psnr|  ssim   |
 ---------------------------------------------------------
| 7 |  HDRES   |  2.256     |  0.2120 |  0.2121 | 0.2066  |
| 8 |  HDRES   |  2.195     |  0.1821 |  0.1830 | 0.1826  |
 ---------------------------------------------------------

Best- and worst-case drops:
 ----------------------------------------------------------
|      |   |                 |        BD Rate Drop (%)     |
|      |cpu|    Clip         |   (-ve: Gain, +ve: Loss)    |
|      |   |                 |-----------------------------|
|      |   |                 |avg.psnr |ovr. psnr|  ssim   |
|----------------------------------------------------------|
| Best | 7 |testnoise720p    | -0.3958 | -0.2445 | -0.7787 |
|      | 8 |testnoise720p    | -0.7229 | -0.4860 | -0.7079 |
|----------------------------------------------------------|
|Worst | 7 |vidyo1 (avg.psnr)|  0.8261 |  0.7455 |  0.5534 |
|      | 7 |mj1vc720p (ssim) |  0.6386 |  0.6482 |  0.7535 |
|      | 8 |mj1vc720p        |  0.8805 |  0.8451 |  0.9263 |
 ----------------------------------------------------------

No changes to speeds 9 and 10, or to the rtc-derf set.

STATS_CHANGED

Change-Id: I7f9ba58274e003b8cfc188cd80099b28f187be67
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 0ad118d..9fb37e4 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -807,8 +807,9 @@
 } SOURCE_SAD;
 
 typedef struct {
-  //! SAD levels in non-rd path for var-based part and inter-mode search
-  SOURCE_SAD source_sad_nonrd;
+  //! SAD levels in non-rd path
+  //! 0: var-based part and inter-mode search, 1: blk-level mv pel precision
+  SOURCE_SAD source_sad_nonrd[2];
   //! SAD levels in rd-path for var-based part qindex thresholds
   SOURCE_SAD source_sad_rd;
   int lighting_change;
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 2a395aa..64cca4e 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -797,10 +797,12 @@
   if (cpi->sf.rt_sf.source_metrics_sb_nonrd &&
       cpi->svc.number_spatial_layers <= 1 &&
       cm->current_frame.frame_type != KEY_FRAME) {
-    if (!cpi->sf.rt_sf.check_scene_detection || cpi->rc.frame_source_sad > 0)
+    if (!cpi->sf.rt_sf.check_scene_detection || cpi->rc.frame_source_sad > 0) {
       calc_src_content = true;
-    else
-      x->content_state_sb.source_sad_nonrd = kZeroSad;
+    } else {
+      x->content_state_sb.source_sad_nonrd[0] = kZeroSad;
+      x->content_state_sb.source_sad_nonrd[1] = kZeroSad;
+    }
   } else if ((cpi->sf.rt_sf.var_part_based_on_qidx >= 1) &&
              (cm->width * cm->height <= 352 * 288)) {
     if (cpi->rc.frame_source_sad > 0)
@@ -891,7 +893,8 @@
     x->color_sensitivity_sb_g[1] = 0;
     x->color_sensitivity[0] = 0;
     x->color_sensitivity[1] = 0;
-    x->content_state_sb.source_sad_nonrd = kMedSad;
+    x->content_state_sb.source_sad_nonrd[0] = kMedSad;
+    x->content_state_sb.source_sad_nonrd[1] = kMedSad;
     x->content_state_sb.source_sad_rd = kMedSad;
     x->content_state_sb.lighting_change = 0;
     x->content_state_sb.low_sumdiff = 0;
diff --git a/av1/encoder/encodeframe_utils.c b/av1/encoder/encodeframe_utils.c
index 3846b78..a2eddd4 100644
--- a/av1/encoder/encodeframe_utils.c
+++ b/av1/encoder/encodeframe_utils.c
@@ -1321,8 +1321,10 @@
   uint8_t *last_src_y = cpi->last_source->y_buffer;
   int last_src_ystride = cpi->last_source->y_stride;
   const int offset = cpi->source->y_stride * (mi_row << 2) + (mi_col << 2);
-  uint64_t avg_source_sse_threshold[2] = { 100000,   // ~5*5*(64*64)
-                                           36000 };  // ~3*3*(64*64)
+  uint64_t avg_source_sse_threshold_low[3] = { 100000,   // ~5*5*(64*64)
+                                               36000,    // ~3*3*(64*64)
+                                               10000 };  // ~1.5*1.5*(64*64)
+
   uint64_t avg_source_sse_threshold_high = 1000000;  // ~15*15*(64*64)
   uint64_t sum_sq_thresh = 10000;  // sum = sqrt(thresh / 64*64)) ~1.5
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -1334,16 +1336,21 @@
   tmp_variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
                                             last_src_ystride, &tmp_sse);
   // rd thresholds
-  if (tmp_sse < avg_source_sse_threshold[1])
+  if (tmp_sse < avg_source_sse_threshold_low[1])
     x->content_state_sb.source_sad_rd = kLowSad;
 
   // nonrd thresholds
   if (tmp_sse == 0)
-    x->content_state_sb.source_sad_nonrd = kZeroSad;
-  else if (tmp_sse < avg_source_sse_threshold[0])
-    x->content_state_sb.source_sad_nonrd = kLowSad;
+    x->content_state_sb.source_sad_nonrd[0] = kZeroSad;
+  else if (tmp_sse < avg_source_sse_threshold_low[0])
+    x->content_state_sb.source_sad_nonrd[0] = kLowSad;
   else if (tmp_sse > avg_source_sse_threshold_high)
-    x->content_state_sb.source_sad_nonrd = kHighSad;
+    x->content_state_sb.source_sad_nonrd[0] = kHighSad;
+
+  if (tmp_sse == 0)
+    x->content_state_sb.source_sad_nonrd[1] = kZeroSad;
+  else if (tmp_sse < avg_source_sse_threshold_low[2])
+    x->content_state_sb.source_sad_nonrd[1] = kLowSad;
 
   // Detect large lighting change.
   // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12)
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index cf3af34..1553186 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -205,19 +205,34 @@
   memset(&bp->pmi, 0, sizeof(bp->pmi));
 }
 
-static INLINE int subpel_select(AV1_COMP *cpi, BLOCK_SIZE bsize, int_mv *mv) {
-  int mv_thresh = 4;
-  const int is_low_resoln =
-      (cpi->common.width * cpi->common.height <= 320 * 240);
-  mv_thresh = (bsize > BLOCK_32X32) ? 2 : (bsize > BLOCK_16X16) ? 4 : 6;
-  if (cpi->rc.avg_frame_low_motion > 0 && cpi->rc.avg_frame_low_motion < 40)
-    mv_thresh = 12;
-  mv_thresh = (is_low_resoln) ? mv_thresh >> 1 : mv_thresh;
-  if (abs(mv->as_fullmv.row) >= mv_thresh ||
-      abs(mv->as_fullmv.col) >= mv_thresh)
-    return HALF_PEL;
-  else
-    return cpi->sf.mv_sf.subpel_force_stop;
+static INLINE int subpel_select(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                                int_mv *mv) {
+  assert(cpi->sf.rt_sf.reduce_mv_pel_precision);
+  if (cpi->sf.rt_sf.reduce_mv_pel_precision == 2) {
+    int mv_thresh = 4;
+    const int is_low_resoln =
+        (cpi->common.width * cpi->common.height <= 320 * 240);
+    mv_thresh = (bsize > BLOCK_32X32) ? 2 : (bsize > BLOCK_16X16) ? 4 : 6;
+    if (cpi->rc.avg_frame_low_motion > 0 && cpi->rc.avg_frame_low_motion < 40)
+      mv_thresh = 12;
+    mv_thresh = (is_low_resoln) ? mv_thresh >> 1 : mv_thresh;
+    if (abs(mv->as_fullmv.row) >= mv_thresh ||
+        abs(mv->as_fullmv.col) >= mv_thresh)
+      return HALF_PEL;
+  } else if (cpi->sf.rt_sf.reduce_mv_pel_precision == 1) {
+    // Reduce MV precision for relatively static (e.g. background), low-complex
+    // large areas
+    const int qband = x->qindex >> (QINDEX_BITS - 2);
+    assert(qband < 4);
+    if (x->content_state_sb.source_sad_nonrd[1] <= kLowSad &&
+        bsize > BLOCK_16X16 && qband != 0) {
+      if (x->source_variance < 500)
+        return FULL_PEL;
+      else if (x->source_variance < 5000)
+        return HALF_PEL;
+    }
+  }
+  return cpi->sf.mv_sf.subpel_force_stop;
 }
 
 /*!\brief Runs Motion Estimation for a specific block and specific ref frame.
@@ -311,9 +326,9 @@
     SUBPEL_MOTION_SEARCH_PARAMS ms_params;
     av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
                                       cost_list);
-    if (cpi->sf.rt_sf.force_half_pel_block &&
+    if (cpi->sf.rt_sf.reduce_mv_pel_precision &&
         cpi->sf.mv_sf.subpel_force_stop < HALF_PEL)
-      ms_params.forced_stop = subpel_select(cpi, bsize, tmp_mv);
+      ms_params.forced_stop = subpel_select(cpi, x, bsize, tmp_mv);
     if (cpi->sf.rt_sf.reduce_zeromv_mvres && ref_mv.row == 0 &&
         ref_mv.col == 0 && start_mv.row == 0 && start_mv.col == 0) {
       // If both the refmv and the fullpel results show zero mv, then there is
@@ -407,9 +422,9 @@
 
     SUBPEL_MOTION_SEARCH_PARAMS ms_params;
     av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, NULL);
-    if (cpi->sf.rt_sf.force_half_pel_block &&
+    if (cpi->sf.rt_sf.reduce_mv_pel_precision &&
         cpi->sf.mv_sf.subpel_force_stop < HALF_PEL)
-      ms_params.forced_stop = subpel_select(cpi, bsize, &best_mv);
+      ms_params.forced_stop = subpel_select(cpi, x, bsize, &best_mv);
     MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
     cpi->mv_search_params.find_fractional_mv_step(
         xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis,
@@ -1297,7 +1312,8 @@
     int left_mv_valid = 0;
     int above_row = INVALID_MV_ROW_COL, above_col = INVALID_MV_ROW_COL;
     int left_row = INVALID_MV_ROW_COL, left_col = INVALID_MV_ROW_COL;
-    if (bsize >= BLOCK_64X64 && content_state_sb.source_sad_nonrd != kHighSad &&
+    if (bsize >= BLOCK_64X64 &&
+        content_state_sb.source_sad_nonrd[0] != kHighSad &&
         spatial_variance < 300 &&
         (mv_row > 16 || mv_row < -16 || mv_col > 16 || mv_col < -16)) {
       this_rdc->rdcost = this_rdc->rdcost << 2;
@@ -2080,7 +2096,7 @@
     // capture case where only part of frame has high motion.
     // Exclude screen content mode.
     if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
-        x->content_state_sb.source_sad_nonrd >= kHighSad &&
+        x->content_state_sb.source_sad_nonrd[0] >= kHighSad &&
         bsize <= BLOCK_32X32 && cpi->rc.frame_source_sad < 50000)
       use_golden_ref_frame = 1;
   }
@@ -2093,7 +2109,7 @@
 
   // Skip golden reference if color is set, on flat blocks with motion.
   if (x->source_variance < 500 &&
-      x->content_state_sb.source_sad_nonrd > kLowSad &&
+      x->content_state_sb.source_sad_nonrd[0] > kLowSad &&
       (x->color_sensitivity_sb_g[0] == 1 || x->color_sensitivity_sb_g[1] == 1))
     use_golden_ref_frame = 0;
 
@@ -2209,18 +2225,18 @@
       do_early_exit_rdthresh = 0;
     }
     if ((x->source_variance < AOMMAX(50, (spatial_var_thresh >> 1)) &&
-         x->content_state_sb.source_sad_nonrd >= kHighSad) ||
+         x->content_state_sb.source_sad_nonrd[0] >= kHighSad) ||
         (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
          x->source_variance == 0 &&
          ((bsize >= BLOCK_32X32 &&
-           x->content_state_sb.source_sad_nonrd != kZeroSad) ||
+           x->content_state_sb.source_sad_nonrd[0] != kZeroSad) ||
           x->color_sensitivity[0] == 1 || x->color_sensitivity[1] == 1)))
       force_intra_check = 1;
     // For big blocks worth checking intra (since only DC will be checked),
     // even if best_early_term is set.
     if (bsize >= BLOCK_32X32) best_early_term = 0;
   } else if (cpi->sf.rt_sf.source_metrics_sb_nonrd &&
-             x->content_state_sb.source_sad_nonrd == kLowSad) {
+             x->content_state_sb.source_sad_nonrd[0] == kLowSad) {
     perform_intra_pred = 0;
   }
 
@@ -2278,7 +2294,7 @@
         cpi->sf.rt_sf.source_metrics_sb_nonrd) {
       // For spatially flat blocks with zero motion only check
       // DC mode.
-      if (x->content_state_sb.source_sad_nonrd == kZeroSad &&
+      if (x->content_state_sb.source_sad_nonrd[0] == kZeroSad &&
           x->source_variance == 0 && this_mode != DC_PRED)
         continue;
       // Only test Intra for big blocks if spatial_variance is 0.
@@ -2345,7 +2361,7 @@
       // Otherwise bias against intra for blocks with zero
       // motion and no color, on non-scene/slide changes.
       else if (!cpi->rc.high_source_sad && x->source_variance > 0 &&
-               x->content_state_sb.source_sad_nonrd == kZeroSad &&
+               x->content_state_sb.source_sad_nonrd[0] == kZeroSad &&
                x->color_sensitivity[0] == 0 && x->color_sensitivity[1] == 0)
         this_rdc.rdcost = (3 * this_rdc.rdcost) >> 1;
     }
@@ -2426,8 +2442,8 @@
     return 1;
   }
 
-  if (content_state_sb.source_sad_nonrd != kHighSad && bsize >= BLOCK_64X64 &&
-      force_skip_low_temp_var && mode == NEWMV) {
+  if (content_state_sb.source_sad_nonrd[0] != kHighSad &&
+      bsize >= BLOCK_64X64 && force_skip_low_temp_var && mode == NEWMV) {
     return 1;
   }
   return 0;
@@ -2965,7 +2981,7 @@
       use_modeled_non_rd_cost =
           (quant_params->base_qindex > 120 && x->source_variance > 100 &&
            bsize <= BLOCK_16X16 && !x->content_state_sb.lighting_change &&
-           x->content_state_sb.source_sad_nonrd != kHighSad);
+           x->content_state_sb.source_sad_nonrd[0] != kHighSad);
   }
 
 #if COLLECT_PICK_MODE_STAT
@@ -3079,9 +3095,9 @@
       // below after search_new_mv.
       if (cpi->sf.rt_sf.source_metrics_sb_nonrd) {
         if ((frame_mv[this_mode][ref_frame].as_int != 0 &&
-             x->content_state_sb.source_sad_nonrd == kZeroSad) ||
+             x->content_state_sb.source_sad_nonrd[0] == kZeroSad) ||
             (frame_mv[this_mode][ref_frame].as_int == 0 &&
-             x->content_state_sb.source_sad_nonrd != kZeroSad &&
+             x->content_state_sb.source_sad_nonrd[0] != kZeroSad &&
              ((x->color_sensitivity[0] == 0 && x->color_sensitivity[1] == 0) ||
               cpi->rc.high_source_sad) &&
              x->source_variance == 0))
@@ -3175,7 +3191,7 @@
         cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
         cpi->sf.rt_sf.source_metrics_sb_nonrd) {
       if (frame_mv[this_mode][ref_frame].as_int == 0 &&
-          x->content_state_sb.source_sad_nonrd != kZeroSad &&
+          x->content_state_sb.source_sad_nonrd[0] != kZeroSad &&
           ((x->color_sensitivity[0] == 0 && x->color_sensitivity[1] == 0) ||
            cpi->rc.high_source_sad) &&
           x->source_variance == 0)
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index f2e0bd3..10ead95 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -1339,13 +1339,16 @@
   } else {
     if (speed >= 6) sf->rt_sf.skip_newmv_mode_based_on_sse = 3;
     if (speed == 7) sf->rt_sf.prefer_large_partition_blocks = 0;
+    if (speed >= 7) sf->rt_sf.reduce_mv_pel_precision = 1;
     if (speed >= 9) {
       sf->rt_sf.sad_based_adp_altref_lag = 1;
       sf->rt_sf.sad_based_comp_prune = 1;
+      sf->rt_sf.reduce_mv_pel_precision = 0;
     }
     if (speed >= 10) {
       sf->rt_sf.sad_based_adp_altref_lag = 3;
       sf->rt_sf.sad_based_comp_prune = 2;
+      sf->rt_sf.reduce_mv_pel_precision = 2;
     }
   }
   if (cpi->ppi->use_svc) {
@@ -1394,7 +1397,7 @@
       sf->rt_sf.nonrd_prune_ref_frame_search = 3;
       sf->rt_sf.var_part_split_threshold_shift = 10;
       sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
-      sf->rt_sf.force_half_pel_block = 1;
+      sf->rt_sf.reduce_mv_pel_precision = 2;
       sf->rt_sf.reduce_zeromv_mvres = true;
     }
     if (speed >= 10 && cm->width * cm->height > 1920 * 1080)
@@ -1701,6 +1704,7 @@
     sf->rt_sf.var_part_based_on_qidx = 0;
     sf->rt_sf.frame_level_mode_cost_update = true;
     sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true;
+    sf->rt_sf.reduce_mv_pel_precision = 0;
   }
   if (speed >= 10) {
     sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_4;
@@ -1708,7 +1712,7 @@
     sf->rt_sf.nonrd_prune_ref_frame_search = 3;
     sf->rt_sf.var_part_split_threshold_shift = 10;
     sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
-    sf->rt_sf.force_half_pel_block = 1;
+    sf->rt_sf.reduce_mv_pel_precision = 2;
     sf->rt_sf.reduce_zeromv_mvres = true;
     sf->rt_sf.screen_content_cdef_filter_qindex_thresh = 80;
   }
@@ -2023,7 +2027,7 @@
   rt_sf->prune_inter_modes_with_golden_ref = 0;
   rt_sf->prune_inter_modes_wrt_gf_arf_based_on_sad = 0;
   rt_sf->prune_inter_modes_using_temp_var = 0;
-  rt_sf->force_half_pel_block = 0;
+  rt_sf->reduce_mv_pel_precision = 0;
   rt_sf->prune_intra_mode_based_on_mv_range = 0;
   rt_sf->var_part_split_threshold_shift = 7;
   rt_sf->gf_refresh_based_on_qp = 0;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 96a21e4..3c87384 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -1508,8 +1508,10 @@
   // variance wrt LAST reference.
   int prune_inter_modes_using_temp_var;
 
-  // Force half_pel at block level.
-  int force_half_pel_block;
+  // Reduce MV precision at block level, represents various algos (0: disabled)
+  // 1: switch to halfpel, fullpel based on blk SAD, source var, bsize and qp
+  // 2: switch to halfpel based on integer mv size, bsize, frame-level motion
+  int reduce_mv_pel_precision;
 
   // Prune intra mode evaluation in inter frames based on mv range.
   BLOCK_SIZE prune_intra_mode_based_on_mv_range;
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index 8ec7b71..f429190 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -1128,7 +1128,7 @@
   // For non-SVC GOLDEN is another temporal reference. Check if it should be
   // used as reference for partitioning.
   if (!cpi->ppi->use_svc && (cpi->ref_frame_flags & AOM_GOLD_FLAG) &&
-      x->content_state_sb.source_sad_nonrd != kZeroSad) {
+      x->content_state_sb.source_sad_nonrd[0] != kZeroSad) {
     yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
     if (yv12_g && yv12_g != yv12) {
       av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
@@ -1273,12 +1273,12 @@
     const int q =
         av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
     set_vbp_thresholds(cpi, thresholds, q, x->content_state_sb.low_sumdiff,
-                       x->content_state_sb.source_sad_nonrd,
+                       x->content_state_sb.source_sad_nonrd[0],
                        x->content_state_sb.source_sad_rd, 1);
   } else {
     set_vbp_thresholds(cpi, thresholds, cm->quant_params.base_qindex,
                        x->content_state_sb.low_sumdiff,
-                       x->content_state_sb.source_sad_nonrd,
+                       x->content_state_sb.source_sad_nonrd[0],
                        x->content_state_sb.source_sad_rd, 0);
   }
 
@@ -1346,7 +1346,7 @@
       cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
       cpi->cyclic_refresh->apply_cyclic_refresh &&
       segment_id == CR_SEGMENT_ID_BASE &&
-      x->content_state_sb.source_sad_nonrd == kZeroSad &&
+      x->content_state_sb.source_sad_nonrd[0] == kZeroSad &&
       ref_frame_partition == LAST_FRAME && xd->mi[0]->mv[0].as_int == 0 &&
       y_sad < thresh_exit_part && uv_sad[0]<(3 * thresh_exit_part)>> 2 &&
       uv_sad[1]<(3 * thresh_exit_part)>> 2) {
@@ -1427,7 +1427,7 @@
                          (thresholds[2] >> 1) &&
                      maxvar_16x16[m][i] > thresholds[2]) ||
                     (cpi->sf.rt_sf.prefer_large_partition_blocks &&
-                     x->content_state_sb.source_sad_nonrd > kLowSad &&
+                     x->content_state_sb.source_sad_nonrd[0] > kLowSad &&
                      cpi->rc.frame_source_sad < 20000 &&
                      maxvar_16x16[m][i] > (thresholds[2] >> 4) &&
                      maxvar_16x16[m][i] > (minvar_16x16[m][i] << 2)))) {