Add coarse ME to VAR_BASED_PART mode. Ref mode set for nonRD.

Adding coarse ME to variance based partitioning. Before calculating
residual variance do coarse ME. This will allow to go with bigger blocks
if they are well-predicted

Also making better ref mode set for nonRD inter mode selection

Alltogether this brings about 0.5-0.7 dB to RealTime mode speed 8

Change-Id: Ie2481791592edb05e17b61a33712733e8cfe4506
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 0ee0b6e..18a1852 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -847,6 +847,18 @@
   add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
   specialize qw/aom_minmax_8x8 sse2/;
 
+  add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
+  # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
+  #specialize qw/aom_int_pro_row sse2/;
+
+  add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, const int width";
+  # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
+  #specialize qw/aom_int_pro_col sse2/;
+
+  add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
+  # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
+  #specialize qw/aom_vector_var sse2/;
+
   #
   # hamadard transform and satd for implmenting temporal dependency model
   #
diff --git a/aom_dsp/avg.c b/aom_dsp/avg.c
index a97cd0f..43d2760 100644
--- a/aom_dsp/avg.c
+++ b/aom_dsp/avg.c
@@ -180,3 +180,48 @@
   // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
   return satd;
 }
+
+// Integer projection onto row vectors.
+// height: value range {16, 32, 64, 128}.
+void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
+                       const int ref_stride, const int height) {
+  int idx;
+  const int norm_factor = height >> 1;
+  for (idx = 0; idx < 16; ++idx) {
+    int i;
+    hbuf[idx] = 0;
+    // hbuf[idx]: 14 bit, dynamic range [0, 32640].
+    for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
+    // hbuf[idx]: 9 bit, dynamic range [0, 1020].
+    hbuf[idx] /= norm_factor;
+    ++ref;
+  }
+}
+
+// width: value range {16, 32, 64, 128}.
+int16_t aom_int_pro_col_c(const uint8_t *ref, const int width) {
+  int idx;
+  int16_t sum = 0;
+  // sum: 14 bit, dynamic range [0, 32640]
+  for (idx = 0; idx < width; ++idx) sum += ref[idx];
+  return sum;
+}
+
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4, 5}
+int aom_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) {
+  int i;
+  int width = 4 << bwl;
+  int sse = 0, mean = 0, var;
+
+  for (i = 0; i < width; ++i) {
+    int diff = ref[i] - src[i];  // diff: dynamic range [-510, 510], 10 bits.
+    mean += diff;                // mean: dynamic range 16 bits.
+    sse += diff * diff;          // sse:  dynamic range 26 bits.
+  }
+
+  // (mean * mean): dynamic range 31 bits.
+  var = sse - ((mean * mean) >> (bwl + 2));
+  return var;
+}
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 3317a93..d4a4d34 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -2095,6 +2095,216 @@
   return is_allowed;
 }
 
+static int vector_match(int16_t *ref, int16_t *src, int bwl) {
+  int best_sad = INT_MAX;
+  int this_sad;
+  int d;
+  int center, offset = 0;
+  int bw = 4 << bwl;  // redundant variable, to be changed in the experiments.
+  for (d = 0; d <= bw; d += 16) {
+    this_sad = aom_vector_var(&ref[d], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      offset = d;
+    }
+  }
+  center = offset;
+
+  for (d = -8; d <= 8; d += 16) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw) continue;
+    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -4; d <= 4; d += 8) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw) continue;
+    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -2; d <= 2; d += 4) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw) continue;
+    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -1; d <= 1; d += 2) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw) continue;
+    this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+
+  return (center - (bw >> 1));
+}
+
+static const MV search_pos[4] = {
+  { -1, 0 },
+  { 0, -1 },
+  { 0, 1 },
+  { 1, 0 },
+};
+
+unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
+                                           BLOCK_SIZE bsize, int mi_row,
+                                           int mi_col, const MV *ref_mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mi = xd->mi[0];
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+  DECLARE_ALIGNED(16, int16_t, hbuf[256]);
+  DECLARE_ALIGNED(16, int16_t, vbuf[256]);
+  DECLARE_ALIGNED(16, int16_t, src_hbuf[128]);
+  DECLARE_ALIGNED(16, int16_t, src_vbuf[128]);
+  int idx;
+  const int bw = 4 << mi_size_wide_log2[bsize];
+  const int bh = 4 << mi_size_high_log2[bsize];
+  const int search_width = bw << 1;
+  const int search_height = bh << 1;
+  const int src_stride = x->plane[0].src.stride;
+  const int ref_stride = xd->plane[0].pre[0].stride;
+  uint8_t const *ref_buf, *src_buf;
+  MV *tmp_mv = &xd->mi[0]->mv[0].as_mv;
+  unsigned int best_sad, tmp_sad, this_sad[4];
+  MV this_mv;
+  const int norm_factor = 3 + (bw >> 5);
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      av1_get_scaled_ref_frame(cpi, mi->ref_frame[0]);
+  MvLimits subpel_mv_limits;
+
+  if (scaled_ref_frame) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0];
+    av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL,
+                         MAX_MB_PLANE);
+  }
+
+  if (xd->bd != 8) {
+    unsigned int sad;
+    tmp_mv->row = 0;
+    tmp_mv->col = 0;
+    sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+                                 xd->plane[0].pre[0].buf, ref_stride);
+
+    if (scaled_ref_frame) {
+      int i;
+      for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+    }
+    return sad;
+  }
+
+  // Set up prediction 1-D reference set
+  ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
+  for (idx = 0; idx < search_width; idx += 16) {
+    aom_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+    ref_buf += 16;
+  }
+
+  ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
+  for (idx = 0; idx < search_height; ++idx) {
+    vbuf[idx] = aom_int_pro_col(ref_buf, bw) >> norm_factor;
+    ref_buf += ref_stride;
+  }
+
+  // Set up src 1-D reference set
+  for (idx = 0; idx < bw; idx += 16) {
+    src_buf = x->plane[0].src.buf + idx;
+    aom_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+  }
+
+  src_buf = x->plane[0].src.buf;
+  for (idx = 0; idx < bh; ++idx) {
+    src_vbuf[idx] = aom_int_pro_col(src_buf, bw) >> norm_factor;
+    src_buf += src_stride;
+  }
+
+  // Find the best match per 1-D search
+  tmp_mv->col = vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize]);
+  tmp_mv->row = vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize]);
+
+  this_mv = *tmp_mv;
+  src_buf = x->plane[0].src.buf;
+  ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+  best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+
+  {
+    const uint8_t *const pos[4] = {
+      ref_buf - ref_stride,
+      ref_buf - 1,
+      ref_buf + 1,
+      ref_buf + ref_stride,
+    };
+
+    cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad);
+  }
+
+  for (idx = 0; idx < 4; ++idx) {
+    if (this_sad[idx] < best_sad) {
+      best_sad = this_sad[idx];
+      tmp_mv->row = search_pos[idx].row + this_mv.row;
+      tmp_mv->col = search_pos[idx].col + this_mv.col;
+    }
+  }
+
+  if (this_sad[0] < this_sad[3])
+    this_mv.row -= 1;
+  else
+    this_mv.row += 1;
+
+  if (this_sad[1] < this_sad[2])
+    this_mv.col -= 1;
+  else
+    this_mv.col += 1;
+
+  ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+
+  tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+  if (best_sad > tmp_sad) {
+    *tmp_mv = this_mv;
+    best_sad = tmp_sad;
+  }
+
+  tmp_mv->row *= 8;
+  tmp_mv->col *= 8;
+
+  set_subpel_mv_search_range(
+      &x->mv_limits, &subpel_mv_limits.col_min, &subpel_mv_limits.col_max,
+      &subpel_mv_limits.row_min, &subpel_mv_limits.row_max, ref_mv);
+  clamp_mv(tmp_mv, subpel_mv_limits.col_min, subpel_mv_limits.col_max,
+           subpel_mv_limits.row_min, subpel_mv_limits.row_max);
+
+  if (scaled_ref_frame) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+  }
+
+  return best_sad;
+}
+
 int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                           MV *mvp_full, int step_param, int method,
                           int run_mesh_search, int error_per_bit,
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index 1284fa4..4131a36 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -83,6 +83,11 @@
                             int distance, const aom_variance_fn_ptr_t *fn_ptr,
                             const MV *center_mv);
 
+unsigned int av1_int_pro_motion_estimation(const struct AV1_COMP *cpi,
+                                           MACROBLOCK *x, BLOCK_SIZE bsize,
+                                           int mi_row, int mi_col,
+                                           const MV *ref_mv);
+
 // Runs sequence of diamond searches in smaller steps for RD.
 int av1_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
                            MV *mvp_full, int step_param, int sadpb,
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index b386f10..fee0395 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -11141,28 +11141,51 @@
   { ALTREF_FRAME, INTRA_FRAME },  { BWDREF_FRAME, INTRA_FRAME },
 };
 
-static void default_skip_mask(mode_skip_mask_t *mask,
-                              int enable_reduced_reference_set) {
-  if (enable_reduced_reference_set) {
+static const MV_REFERENCE_FRAME real_time_ref_combos[][2] = {
+  { LAST_FRAME, NONE_FRAME },
+  { ALTREF_FRAME, NONE_FRAME },
+  { GOLDEN_FRAME, NONE_FRAME },
+  { INTRA_FRAME, NONE_FRAME },
+  { BWDREF_FRAME, NONE_FRAME }
+};
+
+typedef enum { REF_SET_FULL, REF_SET_REDUCED, REF_SET_REALTIME } REF_SET;
+
+static void default_skip_mask(mode_skip_mask_t *mask, REF_SET ref_set) {
+  if (ref_set == REF_SET_FULL) {
+    // Everything available by default.
+    memset(mask, 0, sizeof(*mask));
+  } else {
     // All modes available by default.
     memset(mask->pred_modes, 0, sizeof(mask->pred_modes));
-
     // All references disabled first.
     for (MV_REFERENCE_FRAME ref1 = INTRA_FRAME; ref1 < REF_FRAMES; ++ref1) {
       for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) {
         mask->ref_combo[ref1][ref2 + 1] = true;
       }
     }
+    const MV_REFERENCE_FRAME(*ref_set_combos)[2];
+    int num_ref_combos;
+
     // Then enable reduced set of references explicitly.
-    const int num_reduced_ref_combos =
-        (int)sizeof(reduced_ref_combos) / sizeof(reduced_ref_combos[0]);
-    for (int i = 0; i < num_reduced_ref_combos; ++i) {
-      const MV_REFERENCE_FRAME *const this_combo = reduced_ref_combos[i];
+    switch (ref_set) {
+      case REF_SET_REDUCED:
+        ref_set_combos = reduced_ref_combos;
+        num_ref_combos =
+            (int)sizeof(reduced_ref_combos) / sizeof(reduced_ref_combos[0]);
+        break;
+      case REF_SET_REALTIME:
+        ref_set_combos = real_time_ref_combos;
+        num_ref_combos =
+            (int)sizeof(real_time_ref_combos) / sizeof(real_time_ref_combos[0]);
+        break;
+      default: assert(0); num_ref_combos = 0;
+    }
+
+    for (int i = 0; i < num_ref_combos; ++i) {
+      const MV_REFERENCE_FRAME *const this_combo = ref_set_combos[i];
       mask->ref_combo[this_combo[0]][this_combo[1] + 1] = false;
     }
-  } else {
-    // Everything available by default.
-    memset(mask, 0, sizeof(*mask));
   }
 }
 
@@ -11174,8 +11197,14 @@
   MB_MODE_INFO *const mbmi = xd->mi[0];
   unsigned char segment_id = mbmi->segment_id;
   const SPEED_FEATURES *const sf = &cpi->sf;
+  REF_SET ref_set = REF_SET_FULL;
 
-  default_skip_mask(mask, cpi->oxcf.enable_reduced_reference_set);
+  if (sf->use_real_time_ref_set)
+    ref_set = REF_SET_REALTIME;
+  else if (cpi->oxcf.enable_reduced_reference_set)
+    ref_set = REF_SET_REDUCED;
+
+  default_skip_mask(mask, ref_set);
 
   int min_pred_mv_sad = INT_MAX;
   MV_REFERENCE_FRAME ref_frame;
@@ -13157,14 +13186,14 @@
   CompoundTypeRdBuffers rd_buffers;
   alloc_compound_type_rd_buffers(cm, &rd_buffers);
 
-  for (int midx = 0; midx < LAST_SINGLE_REF_MODES + 1; ++midx) {
+  for (int midx = 0; midx < MAX_MODES; ++midx) {
     const MODE_DEFINITION *mode_order = &av1_mode_order[midx];
     this_mode = mode_order->mode;
     const MV_REFERENCE_FRAME ref_frame = mode_order->ref_frame[0];
     const MV_REFERENCE_FRAME second_ref_frame = mode_order->ref_frame[1];
     const int comp_pred = second_ref_frame > INTRA_FRAME;
 
-    if (ref_frame > LAST_FRAME) continue;
+    if (second_ref_frame != NONE_FRAME) continue;
 
     // When single ref motion search ends:
     // 1st pass: To evaluate single ref RD results and rewind to the beginning;
@@ -13178,7 +13207,7 @@
         motion_mode_skip_mask = analyze_simple_trans_states(cpi, x);
         continue;
       }
-      if (!comp_pred) {  // single ref mode
+      if (!comp_pred && ref_frame != INTRA_FRAME) {  // single ref mode
         if (args.single_ref_first_pass) {
           // clear stats
           for (int k = 0; k < MAX_REF_MV_SERCH; ++k) {
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 74b1c56..200452d 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -234,6 +234,7 @@
   sf->prune_motion_mode_level = 1;
   sf->cb_pred_filter_search = 0;
   sf->use_nonrd_pick_mode = 0;
+  sf->use_real_time_ref_set = 0;
 
   if (speed >= 1) {
     sf->gm_erroradv_type = GM_ERRORADV_TR_1;
@@ -456,6 +457,7 @@
   sf->prune_motion_mode_level = 1;
   sf->cb_pred_filter_search = 0;
   sf->use_nonrd_pick_mode = 0;
+  sf->use_real_time_ref_set = 0;
 
   if (speed >= 1) {
     sf->gm_erroradv_type = GM_ERRORADV_TR_1;
@@ -612,6 +614,7 @@
     sf->default_max_partition_size = BLOCK_128X128;
     sf->default_min_partition_size = BLOCK_8X8;
     sf->partition_search_type = VAR_BASED_PARTITION;
+    sf->use_real_time_ref_set = 1;
     // Can't use LARGEST TX mode with pre-calculated partition
     // and disabled TX64
     if (!cpi->oxcf.enable_tx64) sf->tx_size_search_method = USE_FAST_RD;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index e18928d..b90ea15 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -675,6 +675,9 @@
 
   // Enable/disable smooth intra modes.
   int disable_smooth_intra;
+
+  // use reduced ref set for real-time mode
+  int use_real_time_ref_set;
 } SPEED_FEATURES;
 
 struct AV1_COMP;
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index 7e13dc9..3cead91 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -512,6 +512,11 @@
     mi->sb_type = cm->seq_params.sb_size;
     mi->mv[0].as_int = 0;
     mi->interp_filters = av1_make_interp_filters(BILINEAR, BILINEAR);
+    if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
+      const MV dummy_mv = { 0, 0 };
+      av1_int_pro_motion_estimation(cpi, x, cm->seq_params.sb_size, mi_row,
+                                    mi_col, &dummy_mv);
+    }
 
 // TODO(kyslov): bring the small SAD functionality back
 #if 0