Downsample SAD computation in HD motion search

This CL adds a speed feature that downsamples the SAD computation during
motion search by skipping every other row.

The downsampling is only performed when the block has enough rows to
still give a good estimate. Empirically, this is the case when there are
at least 16 rows.

Currently this speed feature is only enabled for 720p or larger content
(the hdres set).

Performance:
 SPD_SET | OVR_PSNR | AVG_PSNR |   SSIM  |  SPD
    0    |  +0.021% |  +0.025% | +0.084% | +2.3%
    1    |  +0.053% |  +0.071% | +0.136% | +3.8%
    2    |  +0.037% |  +0.055% | +0.091% | +4.3%
    3    |  +0.025% |  +0.038% | +0.074% | +2.6%
    4    |  +0.029% |  +0.043% | +0.093% | +3.0%
    5    |  +0.117% |  +0.128% | +0.194% | +3.3%
    6    |  +0.065% |  +0.082% | +0.179% | +3.2%

BUG=aomedia:2781

STATS_CHANGED

Change-Id: Ibf2afd9a7ffed939897249527b41bbaa4152a62c
(cherry picked from commit 0a32d3c251a0ae49b6e0a76249a699d33244e0be)
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index e8ffc8e..b8aea1d 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -16,6 +16,7 @@
 #include <string.h>
 
 #include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
 
 #if CONFIG_DENOISE
 #include "aom_dsp/grain_table.h"
@@ -1299,6 +1300,35 @@
 
   MBFP(BLOCK_64X16, aom_masked_sad64x16, aom_masked_sub_pixel_variance64x16)
 
+#define SDSFP(BT, SDSF, SDSX4DF) \
+  cpi->fn_ptr[BT].sdsf = SDSF;   \
+  cpi->fn_ptr[BT].sdsx4df = SDSX4DF;
+
+  SDSFP(BLOCK_128X128, aom_sad_skip_128x128, aom_sad_skip_128x128x4d);
+  SDSFP(BLOCK_128X64, aom_sad_skip_128x64, aom_sad_skip_128x64x4d);
+  SDSFP(BLOCK_64X128, aom_sad_skip_64x128, aom_sad_skip_64x128x4d);
+  SDSFP(BLOCK_64X64, aom_sad_skip_64x64, aom_sad_skip_64x64x4d);
+  SDSFP(BLOCK_64X32, aom_sad_skip_64x32, aom_sad_skip_64x32x4d);
+  SDSFP(BLOCK_64X16, aom_sad_skip_64x16, aom_sad_skip_64x16x4d);
+  SDSFP(BLOCK_32X64, aom_sad_skip_32x64, aom_sad_skip_32x64x4d);
+  SDSFP(BLOCK_32X32, aom_sad_skip_32x32, aom_sad_skip_32x32x4d);
+  SDSFP(BLOCK_32X16, aom_sad_skip_32x16, aom_sad_skip_32x16x4d);
+  SDSFP(BLOCK_32X8, aom_sad_skip_32x8, aom_sad_skip_32x8x4d);
+
+  SDSFP(BLOCK_16X64, aom_sad_skip_16x64, aom_sad_skip_16x64x4d);
+  SDSFP(BLOCK_16X32, aom_sad_skip_16x32, aom_sad_skip_16x32x4d);
+  SDSFP(BLOCK_16X16, aom_sad_skip_16x16, aom_sad_skip_16x16x4d);
+  SDSFP(BLOCK_16X8, aom_sad_skip_16x8, aom_sad_skip_16x8x4d);
+  SDSFP(BLOCK_8X16, aom_sad_skip_8x16, aom_sad_skip_8x16x4d);
+  SDSFP(BLOCK_8X8, aom_sad_skip_8x8, aom_sad_skip_8x8x4d);
+  SDSFP(BLOCK_4X16, aom_sad_skip_4x16, aom_sad_skip_4x16x4d);
+  SDSFP(BLOCK_4X8, aom_sad_skip_4x8, aom_sad_skip_4x8x4d);
+  SDSFP(BLOCK_4X16, aom_sad_skip_4x16, aom_sad_skip_4x16x4d);
+  SDSFP(BLOCK_8X32, aom_sad_skip_8x32, aom_sad_skip_8x32x4d);
+  SDSFP(BLOCK_32X8, aom_sad_skip_32x8, aom_sad_skip_32x8x4d);
+  SDSFP(BLOCK_64X16, aom_sad_skip_64x16, aom_sad_skip_64x16x4d);
+#undef SDSFP
+
 #if CONFIG_AV1_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
 #endif
diff --git a/av1/encoder/encoder_utils.h b/av1/encoder/encoder_utils.h
index 36de007..40e7c08 100644
--- a/av1/encoder/encoder_utils.h
+++ b/av1/encoder/encoder_utils.h
@@ -328,6 +328,15 @@
               aom_highbd_masked_sad##WIDTH##x##HEIGHT##_bits##BD, \
               aom_highbd_##BD##_masked_sub_pixel_variance##WIDTH##x##HEIGHT)
 
+#define HIGHBD_SDSFP(BT, SDSF, SDSX4DF) \
+  cpi->fn_ptr[BT].sdsf = SDSF;          \
+  cpi->fn_ptr[BT].sdsx4df = SDSX4DF;
+
+#define HIGHBD_SDSFP_WRAPPER(WIDTH, HEIGHT)            \
+  HIGHBD_SDSFP(BLOCK_##WIDTH##X##HEIGHT,               \
+               aom_highbd_sad_skip_##WIDTH##x##HEIGHT, \
+               aom_highbd_sad_skip_##WIDTH##x##HEIGHT##x4d)
+
 #define MAKE_MBFP_COMPOUND_SAD_WRAPPER(fnname)                           \
   static unsigned int fnname##_bits8(                                    \
       const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
@@ -658,6 +667,29 @@
                "cm->seq_params.bit_depth should be AOM_BITS_8, "
                "AOM_BITS_10 or AOM_BITS_12");
     }
+
+    HIGHBD_SDSFP_WRAPPER(128, 128);
+    HIGHBD_SDSFP_WRAPPER(128, 64);
+    HIGHBD_SDSFP_WRAPPER(64, 128);
+    HIGHBD_SDSFP_WRAPPER(64, 64);
+    HIGHBD_SDSFP_WRAPPER(64, 32);
+    HIGHBD_SDSFP_WRAPPER(64, 16);
+    HIGHBD_SDSFP_WRAPPER(32, 64);
+    HIGHBD_SDSFP_WRAPPER(32, 32);
+    HIGHBD_SDSFP_WRAPPER(32, 16);
+    HIGHBD_SDSFP_WRAPPER(32, 8);
+    HIGHBD_SDSFP_WRAPPER(16, 64);
+    HIGHBD_SDSFP_WRAPPER(16, 32);
+    HIGHBD_SDSFP_WRAPPER(16, 16);
+    HIGHBD_SDSFP_WRAPPER(16, 8);
+    HIGHBD_SDSFP_WRAPPER(8, 16);
+    HIGHBD_SDSFP_WRAPPER(8, 8);
+    HIGHBD_SDSFP_WRAPPER(4, 16);
+    HIGHBD_SDSFP_WRAPPER(4, 8);
+    HIGHBD_SDSFP_WRAPPER(4, 16);
+    HIGHBD_SDSFP_WRAPPER(8, 32);
+    HIGHBD_SDSFP_WRAPPER(32, 8);
+    HIGHBD_SDSFP_WRAPPER(64, 16);
   }
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 4225b00..7c8457f 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -99,6 +99,16 @@
 
   av1_set_mv_search_method(ms_params, search_sites, search_method);
 
+  const int use_downsampled_sad =
+      mv_sf->use_downsampled_sad && block_size_high[bsize] >= 16;
+  if (use_downsampled_sad) {
+    ms_params->sdf = ms_params->vfp->sdsf;
+    ms_params->sdx4df = ms_params->vfp->sdsx4df;
+  } else {
+    ms_params->sdf = ms_params->vfp->sdf;
+    ms_params->sdx4df = ms_params->vfp->sdx4df;
+  }
+
   ms_params->mesh_patterns[0] = mv_sf->mesh_patterns;
   ms_params->mesh_patterns[1] = mv_sf->intrabc_mesh_patterns;
   ms_params->force_mesh_thresh = mv_sf->exhaustive_searches_thresh;
@@ -608,11 +618,10 @@
                                  const struct buf_2d *const src,
                                  const uint8_t *const ref_address,
                                  const int ref_stride) {
-  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
   const uint8_t *src_buf = src->buf;
   const int src_stride = src->stride;
 
-  return vfp->sdf(src_buf, src_stride, ref_address, ref_stride);
+  return ms_params->sdf(src_buf, src_stride, ref_address, ref_stride);
 }
 
 static INLINE int get_mvpred_compound_var_cost(
@@ -668,7 +677,7 @@
   } else if (second_pred) {
     return vfp->sdaf(src_buf, src_stride, ref_address, ref_stride, second_pred);
   } else {
-    return vfp->sdf(src_buf, src_stride, ref_address, ref_stride);
+    return ms_params->sdf(src_buf, src_stride, ref_address, ref_stride);
   }
 }
 
@@ -808,7 +817,6 @@
   const struct buf_2d *const ref = ms_params->ms_buffers.ref;
   const search_site *site = ms_params->search_sites->site[search_step];
 
-  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
   unsigned char const *block_offset[4];
   unsigned int sads[4];
   const uint8_t *best_address;
@@ -819,8 +827,8 @@
   for (int j = 0; j < 4; j++)
     block_offset[j] = site[cand_start + j].offset + best_address;
 
-  // 4-point sad calcuation.
-  vfp->sdx4df(src_buf, src_stride, block_offset, ref->stride, sads);
+  // 4-point sad calculation.
+  ms_params->sdx4df(src_buf, src_stride, block_offset, ref->stride, sads);
 
   for (int j = 0; j < 4; j++) {
     const FULLPEL_MV this_mv = {
@@ -1219,7 +1227,6 @@
   const int ref_stride = ref->stride;
   const uint8_t *best_address;
 
-  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
   const uint8_t *mask = ms_params->ms_buffers.mask;
   const uint8_t *second_pred = ms_params->ms_buffers.second_pred;
   const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
@@ -1268,7 +1275,7 @@
         for (j = 0; j < 4; j++)
           block_offset[j] = site[idx + j].offset + best_address;
 
-        vfp->sdx4df(src_buf, src_stride, block_offset, ref_stride, sads);
+        ms_params->sdx4df(src_buf, src_stride, block_offset, ref_stride, sads);
         for (j = 0; j < 4; j++) {
           if (sads[j] < bestsad) {
             const FULLPEL_MV this_mv = { best_mv->row + site[idx + j].mv.row,
@@ -1390,7 +1397,6 @@
                                   const int range, const int step,
                                   FULLPEL_MV *best_mv,
                                   FULLPEL_MV *second_best_mv) {
-  const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
   const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
   const struct buf_2d *const src = ms_params->ms_buffers.src;
   const struct buf_2d *const ref = ms_params->ms_buffers.ref;
@@ -1430,7 +1436,8 @@
             const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
             addrs[i] = get_buf_from_fullmv(ref, &mv);
           }
-          vfp->sdx4df(src->buf, src->stride, addrs, ref_stride, sads);
+
+          ms_params->sdx4df(src->buf, src->stride, addrs, ref_stride, sads);
 
           for (i = 0; i < 4; ++i) {
             if (sads[i] < best_sad) {
@@ -1683,6 +1690,40 @@
     }
   }
 
+  if (ms_params->sdf != ms_params->vfp->sdf) {
+    // If we are skipping rows when we perform the motion search, we need to
+    // check the quality of skipping. If it's bad, then we run mesh search with
+    // skip row features off.
+    // TODO(chiyotsai@google.com): Handle the case where we have a vertical
+    // offset of 1 before we hit this statement to avoid having to redo
+    // motion search.
+    const struct buf_2d *src = ms_params->ms_buffers.src;
+    const struct buf_2d *ref = ms_params->ms_buffers.ref;
+    const int src_stride = src->stride;
+    const int ref_stride = ref->stride;
+
+    const uint8_t *src_address = src->buf;
+    const uint8_t *best_address = get_buf_from_fullmv(ref, best_mv);
+    const int sad =
+        ms_params->vfp->sdf(src_address, src_stride, best_address, ref_stride);
+    const int skip_sad =
+        ms_params->vfp->sdsf(src_address, src_stride, best_address, ref_stride);
+    // We will keep the result of skipping rows if it's good enough. Here, good
+    // enough means the error is less than 1 per pixel.
+    const int kSADThresh =
+        1 << (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+    if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= AOMMAX(sad, 1) * 9) {
+      // There is a large discrepancy between skipping and not skipping, so we
+      // need to redo the motion search.
+      FULLPEL_MOTION_SEARCH_PARAMS new_ms_params = *ms_params;
+      new_ms_params.sdf = new_ms_params.vfp->sdf;
+      new_ms_params.sdx4df = new_ms_params.vfp->sdx4df;
+
+      return av1_full_pixel_search(start_mv, &new_ms_params, step_param,
+                                   cost_list, best_mv, second_best_mv);
+    }
+  }
+
   if (run_mesh_search) {
     int var_ex;
     FULLPEL_MV tmp_mv_ex;
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index 05ae381..2519cc8 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -191,6 +191,11 @@
 
   // For calculating mv cost
   MV_COST_PARAMS mv_cost_params;
+
+  // Stores the function used to compute the sad. This can be different from the
+  // sdf in vfp (e.g. downsampled sad instead of full sad) to allow speed up.
+  aom_sad_fn_t sdf;
+  aom_sad_multi_d_fn_t sdx4df;
 } FULLPEL_MOTION_SEARCH_PARAMS;
 
 void av1_make_default_fullpel_ms_params(
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index ce4fbee..e287649 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -161,6 +161,12 @@
     sf->part_sf.ml_early_term_after_part_split_level = 1;
   }
 
+  if (is_720p_or_larger) {
+    // TODO(chiyotsai@google.com): make this speed feature adaptive based on
+    // current block's vertical texture instead of hardcoded with resolution
+    sf->mv_sf.use_downsampled_sad = 1;
+  }
+
   if (speed >= 1) {
     if (is_720p_or_larger) {
       sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
@@ -1016,6 +1022,7 @@
   mv_sf->use_accurate_subpel_search = USE_8_TAPS;
   mv_sf->use_bsize_dependent_search_method = 0;
   mv_sf->use_fullpel_costlist = 0;
+  mv_sf->use_downsampled_sad = 0;
 }
 
 static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) {
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 1ed4ac6..a59a172 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -533,6 +533,10 @@
 
   // Accurate full pixel motion search based on TPL stats.
   int full_pixel_search_level;
+
+  // Whether to downsample the rows in sad calculation during motion search.
+  // This is only active when there are at least 16 rows.
+  int use_downsampled_sad;
 } MV_SPEED_FEATURES;
 
 typedef struct INTER_MODE_SPEED_FEATURES {