Extend downsampled SAD computation to lowres and midres

Add a new mode that enables downsampled SAD during motion
search when, at the start MV, the SAD of the even rows and
the SAD of the odd rows deviate only slightly from each other.

    Instruction Count        BD-Rate Loss(%)
cpu   Reduction(%)    avg.psnr   ovr.psnr    ssim
 5       1.748        -0.0222    -0.0224   -0.0220
 6       1.379         0.0284     0.0274    0.0632

STATS_CHANGED for good preset, speed 5 and 6

Change-Id: I65cfd6d85f648ac4b2a55e04b5fe209ca2cd6d86
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index befdb50..c0d658b 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -280,7 +280,7 @@
       cpi->is_screen_content_type && cpi->common.features.allow_intrabc;
   FULLPEL_MOTION_SEARCH_PARAMS ms_params;
   av1_make_default_fullpel_ms_params(&ms_params, cpi, x, bsize, ref_mv,
-                                     first_pass_search_sites,
+                                     start_mv, first_pass_search_sites,
                                      fine_search_interval);
   av1_set_mv_search_method(&ms_params, first_pass_search_sites, NSTEP);
 
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index b8d78b9..b316616 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -94,10 +94,12 @@
 
 void av1_make_default_fullpel_ms_params(
     FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
-    MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv,
+    MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, FULLPEL_MV start_mv,
     const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
     int fine_search_interval) {
   const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+  const int is_key_frame =
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE;
 
   // High level params
   ms_params->bsize = bsize;
@@ -129,19 +131,6 @@
 
   av1_set_mv_search_method(ms_params, search_sites, search_method);
 
-  const int use_downsampled_sad =
-      mv_sf->use_downsampled_sad && block_size_high[bsize] >= 16;
-  if (use_downsampled_sad) {
-    ms_params->sdf = ms_params->vfp->sdsf;
-    ms_params->sdx4df = ms_params->vfp->sdsx4df;
-    // Skip version of sadx3 is not is not available yet
-    ms_params->sdx3df = ms_params->vfp->sdsx4df;
-  } else {
-    ms_params->sdf = ms_params->vfp->sdf;
-    ms_params->sdx4df = ms_params->vfp->sdx4df;
-    ms_params->sdx3df = ms_params->vfp->sdx3df;
-  }
-
   ms_params->mesh_patterns[0] = mv_sf->mesh_patterns;
   ms_params->mesh_patterns[1] = mv_sf->intrabc_mesh_patterns;
   ms_params->force_mesh_thresh = mv_sf->exhaustive_searches_thresh;
@@ -161,6 +150,47 @@
   // Mvcost params
   init_mv_cost_params(&ms_params->mv_cost_params, x->mv_costs, ref_mv,
                       x->errorperbit, x->sadperbit);
+
+  ms_params->sdf = ms_params->vfp->sdf;
+  ms_params->sdx4df = ms_params->vfp->sdx4df;
+  ms_params->sdx3df = ms_params->vfp->sdx3df;
+
+  if (mv_sf->use_downsampled_sad == 2 && block_size_high[bsize] >= 16) {
+    ms_params->sdf = ms_params->vfp->sdsf;
+    ms_params->sdx4df = ms_params->vfp->sdsx4df;
+    // Skip version of sadx3 is not available yet
+    ms_params->sdx3df = ms_params->vfp->sdsx4df;
+  } else if (mv_sf->use_downsampled_sad == 1 && block_size_high[bsize] >= 16 &&
+             !is_key_frame) {
+    FULLPEL_MV start_mv_clamped = start_mv;
+    // adjust start_mv to make sure it is within MV range
+    clamp_fullmv(&start_mv_clamped, &ms_params->mv_limits);
+
+    const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+    const int ref_stride = ref->stride;
+    const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv_clamped);
+    const struct buf_2d *const src = ms_params->ms_buffers.src;
+    const uint8_t *src_buf = src->buf;
+    const int src_stride = src->stride;
+
+    unsigned int start_mv_sad_even_rows, start_mv_sad_odd_rows;
+    start_mv_sad_even_rows =
+        ms_params->vfp->sdsf(src_buf, src_stride, best_address, ref_stride);
+    start_mv_sad_odd_rows =
+        ms_params->vfp->sdsf(src_buf + src_stride, src_stride,
+                             best_address + ref_stride, ref_stride);
+
+    // If the absolute SAD difference computed between the pred-to-src of even
+    // and odd rows is small, skip every other row in sad computation.
+    const int odd_to_even_diff_sad =
+        abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows);
+    const int mult_thresh = 4;
+    if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) {
+      ms_params->sdf = ms_params->vfp->sdsf;
+      ms_params->sdx4df = ms_params->vfp->sdsx4df;
+      ms_params->sdx3df = ms_params->vfp->sdsx4df;
+    }
+  }
 }
 
 void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index c4468cf..51a4777 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -144,7 +144,7 @@
 
 void av1_make_default_fullpel_ms_params(
     FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
-    MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv,
+    MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, FULLPEL_MV start_mv,
     const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
     int fine_search_interval);
 
diff --git a/av1/encoder/motion_search_facade.c b/av1/encoder/motion_search_facade.c
index da6729c..b771b05 100644
--- a/av1/encoder/motion_search_facade.c
+++ b/av1/encoder/motion_search_facade.c
@@ -265,8 +265,6 @@
   // Allow more mesh searches for screen content type on the ARF.
   const int fine_search_interval = use_fine_search_interval(cpi);
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
-  av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv,
-                                     src_search_site_cfg, fine_search_interval);
 
   switch (mbmi->motion_mode) {
     case SIMPLE_TRANSLATION: {
@@ -278,7 +276,11 @@
 
         if (smv.as_int == INVALID_MV) continue;
 
-        int thissme =
+        av1_make_default_fullpel_ms_params(
+            &full_ms_params, cpi, x, bsize, &ref_mv, smv.as_fullmv,
+            src_search_site_cfg, fine_search_interval);
+
+        const int thissme =
             av1_full_pixel_search(smv.as_fullmv, &full_ms_params, step_param,
                                   cond_cost_list(cpi, cost_list), &this_best_mv,
                                   &this_second_best_mv);
@@ -294,6 +296,10 @@
       }
     } break;
     case OBMC_CAUSAL:
+      av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
+                                         &ref_mv, start_mv, src_search_site_cfg,
+                                         fine_search_interval);
+
       bestsme = av1_obmc_full_pixel_search(start_mv, &full_ms_params,
                                            step_param, &best_mv->as_fullmv);
       break;
@@ -618,16 +624,16 @@
     const SEARCH_METHODS search_method = cpi->sf.mv_sf.search_method;
     const search_site_config *src_search_sites =
         av1_get_search_site_config(cpi, x, search_method);
+    // Use the mv result from the single mode as mv predictor.
+    const FULLPEL_MV start_fullmv = get_fullmv_from_mv(&cur_mv[id].as_mv);
     av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
-                                       &ref_mv[id].as_mv, src_search_sites,
+                                       &ref_mv[id].as_mv, start_fullmv,
+                                       src_search_sites,
                                        /*fine_search_interval=*/0);
 
     av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask,
                              mask_stride, id);
 
-    // Use the mv result from the single mode as mv predictor.
-    const FULLPEL_MV start_fullmv = get_fullmv_from_mv(&cur_mv[id].as_mv);
-
     // Small-range full-pixel motion search.
     if (!cpi->sf.mv_sf.disable_extensive_joint_motion_search &&
         mbmi->interinter_comp.type != COMPOUND_WEDGE) {
@@ -772,16 +778,16 @@
   const SEARCH_METHODS search_method = cpi->sf.mv_sf.search_method;
   const search_site_config *src_search_sites =
       av1_get_search_site_config(cpi, x, search_method);
+  // Use the mv result from the single mode as mv predictor.
+  const FULLPEL_MV start_fullmv = get_fullmv_from_mv(this_mv);
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
-                                     &ref_mv.as_mv, src_search_sites,
+                                     &ref_mv.as_mv, start_fullmv,
+                                     src_search_sites,
                                      /*fine_search_interval=*/0);
 
   av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask,
                            mask_stride, ref_idx);
 
-  // Use the mv result from the single mode as mv predictor.
-  const FULLPEL_MV start_fullmv = get_fullmv_from_mv(this_mv);
-
   // Small-range full-pixel motion search.
   bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL,
                                   &best_mv.as_fullmv, NULL);
@@ -999,7 +1005,8 @@
   const search_site_config *src_search_sites =
       av1_get_search_site_config(cpi, x, search_method);
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv,
-                                     src_search_sites, fine_search_interval);
+                                     start_mv, src_search_sites,
+                                     fine_search_interval);
 
   var = av1_full_pixel_search(start_mv, &full_ms_params, step_param,
                               cond_cost_list(cpi, cost_list),
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index 7754cd6..05bcf1d 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -238,7 +238,7 @@
       av1_get_search_site_config(cpi, x, search_method);
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
-                                     src_search_sites,
+                                     start_mv, src_search_sites,
                                      /*fine_search_interval=*/0);
 
   const unsigned int full_var_rd = av1_full_pixel_search(
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 7ce91a9..9fc255d 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -3189,8 +3189,10 @@
   FULLPEL_MOTION_SEARCH_PARAMS fullms_params;
   const search_site_config *lookahead_search_sites =
       cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD];
+  const FULLPEL_MV start_mv = get_fullmv_from_mv(&dv_ref.as_mv);
   av1_make_default_fullpel_ms_params(&fullms_params, cpi, x, bsize,
-                                     &dv_ref.as_mv, lookahead_search_sites,
+                                     &dv_ref.as_mv, start_mv,
+                                     lookahead_search_sites,
                                      /*fine_search_interval=*/0);
   const IntraBCMVCosts *const dv_costs = x->dv_costs;
   av1_set_ms_to_intra_mode(&fullms_params, dv_costs);
@@ -3237,7 +3239,6 @@
     }
 
     const int step_param = cpi->mv_search_params.mv_step_param;
-    const FULLPEL_MV start_mv = get_fullmv_from_mv(&dv_ref.as_mv);
     IntraBCHashInfo *intrabc_hash_info = &x->intrabc_hash_info;
     int_mv best_mv, best_hash_mv;
 
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index c3b00ff..594ecb7 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -206,7 +206,7 @@
   if (is_720p_or_larger) {
     // TODO(chiyotsai@google.com): make this speed feature adaptive based on
     // current block's vertical texture instead of hardcoded with resolution
-    sf->mv_sf.use_downsampled_sad = 1;
+    sf->mv_sf.use_downsampled_sad = 2;
   }
 
   if (speed >= 1) {
@@ -587,6 +587,13 @@
   const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
   const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160;
   const bool use_hbd = cpi->oxcf.use_highbitdepth;
+  // Speed features applicable for temporal filtering and tpl modules may be
+  // changed based on frame type at places where the sf is applied (Example :
+  // use_downsampled_sad). This is because temporal filtering and tpl modules
+  // are called before this function (except for the first key frame).
+  // TODO(deepa.kg@ittiam.com): For the speed features applicable to temporal
+  // filtering and tpl modules, modify the sf initialization appropriately
+  // before calling the modules.
   const int boosted = frame_is_boosted(cpi);
   const int is_boosted_arf2_bwd_type =
       boosted ||
@@ -625,7 +632,7 @@
   if (is_720p_or_larger) {
     // TODO(chiyotsai@google.com): make this speed feature adaptive based on
     // current block's vertical texture instead of hardcoded with resolution
-    sf->mv_sf.use_downsampled_sad = 1;
+    sf->mv_sf.use_downsampled_sad = 2;
   }
 
   if (!is_720p_or_larger) {
@@ -817,6 +824,7 @@
       sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW_SET;
       sf->inter_sf.prune_nearest_near_mv_using_refmv_weight =
           (boosted || allow_screen_content_tools) ? 0 : 1;
+      sf->mv_sf.use_downsampled_sad = 1;
     }
 
     if (!is_480p_or_larger) {
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index ae9b99e..910e191 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -808,7 +808,16 @@
   int full_pixel_search_level;
 
   // Whether to downsample the rows in sad calculation during motion search.
-  // This is only active when there are at least 16 rows.
+  // This is only active when there are at least 16 rows. When this sf is
+  // active, if there is a large discrepancy in the SAD values for the final
+  // motion vector between skipping vs not skipping, motion search is redone
+  // with skip row features off.
+  // 0: Disabled (do not downsample rows)
+  // 1: Skip SAD calculation of odd rows if the SAD deviation of the even and
+  //    odd rows for the starting MV is small. Redo motion search with sf off
+  //    when SAD deviation is high for the final motion vector.
+  // 2: Skip SAD calculation of odd rows. SAD deviation is not tested for the
+  //    start MV and tested only for the final MV.
   int use_downsampled_sad;
 
   // Enable/disable extensive joint motion search.
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 050c701..76897e8 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -146,7 +146,7 @@
   const int q = av1_get_q(cpi);
 
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
-                                     &baseline_mv, search_site_cfg,
+                                     &baseline_mv, start_mv, search_site_cfg,
                                      /*fine_search_interval=*/0);
   av1_set_mv_search_method(&full_ms_params, search_site_cfg, search_method);
   full_ms_params.run_mesh_search = 1;
@@ -205,7 +205,7 @@
         mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset;
         av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb,
                                            subblock_size, &baseline_mv,
-                                           search_site_cfg,
+                                           start_mv, search_site_cfg,
                                            /*fine_search_interval=*/0);
         av1_set_mv_search_method(&full_ms_params, search_site_cfg,
                                  search_method);
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index 55a2b27..c28b6e9 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -278,7 +278,7 @@
 
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
-                                     search_site_cfg,
+                                     start_mv, search_site_cfg,
                                      /*fine_search_interval=*/0);
   av1_set_mv_search_method(&full_ms_params, search_site_cfg,
                            tpl_sf->search_method);