Refactor subpel motion estimation

Introduce an early exit in the subpel motion search around
second_best_mv: the search stops as soon as it reaches a fractional MV
that was already recorded, at the same iteration, during the subpel
search around best_mv.

When tested on 20 frames of BasketBallDrill_832x480_50 at 1 Mbps, a
~0.5% reduction in encoder time was observed for the speed=1 preset.

Change-Id: If721fb1cae4eccdb6616d3d7e8fc377aea913bd8
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index bf95d2b..98691d9 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -368,6 +368,9 @@
   // Store the second best motion vector during full-pixel motion search
   int_mv second_best_mv;
 
+  // Fractional MV recorded at the start of each sub-pel search iteration
+  int_mv fractional_best_mv[3];
+
   // use default transform and skip transform type search for intra modes
   int use_default_intra_tx_type;
   // use default transform and skip transform type search for inter modes
diff --git a/av1/encoder/mbgraph.c b/av1/encoder/mbgraph.c
index 1a35ff7..f0b537a 100644
--- a/av1/encoder/mbgraph.c
+++ b/av1/encoder/mbgraph.c
@@ -60,7 +60,7 @@
         x, &cpi->common, mb_row, mb_col, ref_mv,
         cpi->common.allow_high_precision_mv, x->errorperbit, &v_fn_ptr, 0,
         mv_sf->subpel_iters_per_step, cond_cost_list(cpi, cost_list), NULL,
-        NULL, &distortion, &sse, NULL, NULL, 0, 0, 0, 0, 0);
+        NULL, &distortion, &sse, NULL, NULL, 0, 0, 0, 0, 0, 1);
   }
 
   if (has_second_ref(xd->mi[0]))
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 4f3c4e2..f11c4a0 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -409,7 +409,7 @@
     int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
     unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
     int mask_stride, int invert_mask, int w, int h,
-    int use_accurate_subpel_search) {
+    int use_accurate_subpel_search, const int do_reset_fractional_mv) {
   SETUP_SUBPEL_SEARCH;
   besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                                src_address, src_stride, y, y_stride,
@@ -426,6 +426,7 @@
   (void)cm;
   (void)mi_row;
   (void)mi_col;
+  (void)do_reset_fractional_mv;
 
   if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
       cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
@@ -481,12 +482,13 @@
     int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
     unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
     int mask_stride, int invert_mask, int w, int h,
-    int use_accurate_subpel_search) {
+    int use_accurate_subpel_search, const int do_reset_fractional_mv) {
   SETUP_SUBPEL_SEARCH;
   (void)use_accurate_subpel_search;
   (void)cm;
   (void)mi_row;
   (void)mi_col;
+  (void)do_reset_fractional_mv;
 
   besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                                src_address, src_stride, y, y_stride,
@@ -549,12 +551,13 @@
     int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
     unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
     int mask_stride, int invert_mask, int w, int h,
-    int use_accurate_subpel_search) {
+    int use_accurate_subpel_search, const int do_reset_fractional_mv) {
   SETUP_SUBPEL_SEARCH;
   (void)use_accurate_subpel_search;
   (void)cm;
   (void)mi_row;
   (void)mi_col;
+  (void)do_reset_fractional_mv;
 
   besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
                                src_address, src_stride, y, y_stride,
@@ -750,7 +753,7 @@
     int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
     unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
     int mask_stride, int invert_mask, int w, int h,
-    int use_accurate_subpel_search) {
+    int use_accurate_subpel_search, const int do_reset_fractional_mv) {
   const uint8_t *const src_address = x->plane[0].src.buf;
   const int src_stride = x->plane[0].src.stride;
   MACROBLOCKD *xd = &x->e_mbd;
@@ -796,7 +799,16 @@
 
   (void)cost_list;  // to silence compiler warning
 
+  if (do_reset_fractional_mv) {
+    av1_set_fractional_mv(x->fractional_best_mv);
+  }
+
   for (iter = 0; iter < round; ++iter) {
+    if ((x->fractional_best_mv[iter].as_mv.row == br) &&
+        (x->fractional_best_mv[iter].as_mv.col == bc))
+      return INT_MAX;
+    x->fractional_best_mv[iter].as_mv.row = br;
+    x->fractional_best_mv[iter].as_mv.col = bc;
     // Check vertical and horizontal sub-pixel positions.
     for (idx = 0; idx < 4; ++idx) {
       tr = br + search_step[idx].row;
@@ -2827,16 +2839,14 @@
   (void)thismse;                    \
   (void)cost_list;
 // Return the maximum MV.
-int av1_return_max_sub_pixel_mv(MACROBLOCK *x, const AV1_COMMON *const cm,
-                                int mi_row, int mi_col, const MV *ref_mv,
-                                int allow_hp, int error_per_bit,
-                                const aom_variance_fn_ptr_t *vfp,
-                                int forced_stop, int iters_per_step,
-                                int *cost_list, int *mvjcost, int *mvcost[2],
-                                int *distortion, unsigned int *sse1,
-                                const uint8_t *second_pred, const uint8_t *mask,
-                                int mask_stride, int invert_mask, int w, int h,
-                                int use_accurate_subpel_search) {
+int av1_return_max_sub_pixel_mv(
+    MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *ref_mv, int allow_hp, int error_per_bit,
+    const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+    int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
+    unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
+    int mask_stride, int invert_mask, int w, int h,
+    int use_accurate_subpel_search, const int do_reset_fractional_mv) {
   COMMON_MV_TEST;
   (void)mask;
   (void)mask_stride;
@@ -2847,6 +2857,7 @@
   (void)cm;
   (void)mi_row;
   (void)mi_col;
+  (void)do_reset_fractional_mv;
 
   bestmv->row = maxr;
   bestmv->col = maxc;
@@ -2857,16 +2868,14 @@
   return besterr;
 }
 // Return the minimum MV.
-int av1_return_min_sub_pixel_mv(MACROBLOCK *x, const AV1_COMMON *const cm,
-                                int mi_row, int mi_col, const MV *ref_mv,
-                                int allow_hp, int error_per_bit,
-                                const aom_variance_fn_ptr_t *vfp,
-                                int forced_stop, int iters_per_step,
-                                int *cost_list, int *mvjcost, int *mvcost[2],
-                                int *distortion, unsigned int *sse1,
-                                const uint8_t *second_pred, const uint8_t *mask,
-                                int mask_stride, int invert_mask, int w, int h,
-                                int use_accurate_subpel_search) {
+int av1_return_min_sub_pixel_mv(
+    MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *ref_mv, int allow_hp, int error_per_bit,
+    const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+    int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
+    unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
+    int mask_stride, int invert_mask, int w, int h,
+    int use_accurate_subpel_search, const int do_reset_fractional_mv) {
   COMMON_MV_TEST;
   (void)maxr;
   (void)maxc;
@@ -2877,6 +2886,7 @@
   (void)cm;
   (void)mi_row;
   (void)mi_col;
+  (void)do_reset_fractional_mv;
 
   bestmv->row = minr;
   bestmv->col = minc;
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index a975218..3f8b3b1 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -103,7 +103,7 @@
     int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
     int *distortion, unsigned int *sse1, const uint8_t *second_pred,
     const uint8_t *mask, int mask_stride, int invert_mask, int w, int h,
-    int use_accurate_subpel_search);
+    int use_accurate_subpel_search, const int do_reset_fractional_mv);
 
 extern fractional_mv_step_fp av1_find_best_sub_pixel_tree;
 extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned;
@@ -154,6 +154,12 @@
                                   int mi_row, int mi_col, int *pts0,
                                   int *pts_inref0, int total_samples);
 
+static INLINE void av1_set_fractional_mv(int_mv *fractional_best_mv) {
+  for (int z = 0; z < 3; z++) {  // 3 entries, matching fractional_best_mv[3]
+    fractional_best_mv[z].as_int = INVALID_MV;  // presumably an "unset" marker
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index a018d8b..f5a1196 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -6593,7 +6593,7 @@
           cpi->common.allow_high_precision_mv, x->errorperbit,
           &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL,
           x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask,
-          mask_stride, id, pw, ph, cpi->sf.use_accurate_subpel_search);
+          mask_stride, id, pw, ph, cpi->sf.use_accurate_subpel_search, 1);
     }
 
     // Restore the pointer to the first prediction buffer.
@@ -6958,13 +6958,12 @@
                                  x->second_best_mv.as_int != x->best_mv.as_int;
           const int pw = block_size_wide[bsize];
           const int ph = block_size_high[bsize];
-
           best_mv_var = cpi->find_fractional_mv_step(
               x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
               x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
               cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
               x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, NULL,
-              0, 0, pw, ph, 1);
+              0, 0, pw, ph, 1, 1);
 
           if (try_second) {
             const int minc =
@@ -6989,7 +6988,7 @@
                   cpi->sf.mv.subpel_force_stop,
                   cpi->sf.mv.subpel_iters_per_step,
                   cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost,
-                  &dis, &x->pred_sse[ref], NULL, NULL, 0, 0, pw, ph, 1);
+                  &dis, &x->pred_sse[ref], NULL, NULL, 0, 0, pw, ph, 1, 0);
               if (this_var < best_mv_var) best_mv = x->best_mv.as_mv;
               x->best_mv.as_mv = best_mv;
             }
@@ -7000,7 +6999,7 @@
               x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
               cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
               x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, NULL,
-              0, 0, 0, 0, 0);
+              0, 0, 0, 0, 0, 1);
         }
         break;
       case OBMC_CAUSAL:
@@ -7176,7 +7175,7 @@
         cpi->common.allow_high_precision_mv, x->errorperbit,
         &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL,
         x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask, mask_stride,
-        ref_idx, pw, ph, cpi->sf.use_accurate_subpel_search);
+        ref_idx, pw, ph, cpi->sf.use_accurate_subpel_search, 1);
   }
 
   // Restore the pointer to the first unscaled prediction buffer.
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 75fdf02..374ea23 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -259,7 +259,7 @@
         cpi->common.allow_high_precision_mv, x->errorperbit,
         &cpi->fn_ptr[BLOCK_16X16], 0, mv_sf->subpel_iters_per_step,
         cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL,
-        NULL, 0, 0, 0, 0, 0);
+        NULL, 0, 0, 0, 0, 0, 1);
   }
 
   x->e_mbd.mi[0]->mv[0] = x->best_mv;