Use better motion search for temporal filtering

This is ported from VP9 work:
https://chromium-review.googlesource.com/c/webm/libvpx/+/1154488
https://chromium-review.googlesource.com/c/webm/libvpx/+/1157910

Tested encoding performance on lowres and midres with 30 frames.
Coding gains (ovr_psnr):
speed 0: lowres 0.06%  midres 0.36%
speed 1: lowres 0.05%  midres 0.29%

STATS_CHANGED

Change-Id: Ib30455465b34215285ca53603cf080c3b410504d
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index d93e01a..ee58802 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -2100,11 +2100,11 @@
 }
 
 int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-                          MV *mvp_full, int step_param, int error_per_bit,
+                          MV *mvp_full, int step_param, int method,
+                          int run_mesh_search, int error_per_bit,
                           int *cost_list, const MV *ref_mv, int var_max, int rd,
                           int x_pos, int y_pos, int intra) {
   const SPEED_FEATURES *const sf = &cpi->sf;
-  const SEARCH_METHODS method = sf->mv.search_method;
   const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
   int var = 0;
 
@@ -2169,11 +2169,35 @@
     default: assert(0 && "Invalid search method.");
   }
 
+  // Should we allow a follow-on exhaustive search?
+  if (!run_mesh_search) {
+    if (method == NSTEP) {
+      if (is_exhaustive_allowed(cpi, x)) {
+        int exhuastive_thr = sf->exhaustive_searches_thresh;
+        exhuastive_thr >>=
+            10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+        // Threshold variance for an exhaustive full search.
+        if (var > exhuastive_thr) run_mesh_search = 1;
+      }
+    }
+  }
+
+  if (run_mesh_search) {
+    int var_ex;
+    MV tmp_mv_ex;
+    var_ex = full_pixel_exhaustive(cpi, x, &x->best_mv.as_mv, error_per_bit,
+                                   cost_list, fn_ptr, ref_mv, &tmp_mv_ex);
+    if (var_ex < var) {
+      var = var_ex;
+      x->best_mv.as_mv = tmp_mv_ex;
+    }
+  }
+
   if (method != NSTEP && rd && var < var_max)
     var = av1_get_mvpred_var(x, &x->best_mv.as_mv, ref_mv, fn_ptr, 1);
 
   do {
-    if (!av1_use_hash_me(&cpi->common)) break;
+    if (!intra || !av1_use_hash_me(&cpi->common)) break;
 
     // already single ME
     // get block size and original buffer of current block
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index 539e8f4..7df15d6 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -120,8 +120,9 @@
 
 int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
                           BLOCK_SIZE bsize, MV *mvp_full, int step_param,
-                          int error_per_bit, int *cost_list, const MV *ref_mv,
-                          int var_max, int rd, int x_pos, int y_pos, int intra);
+                          int method, int run_mesh_search, int error_per_bit,
+                          int *cost_list, const MV *ref_mv, int var_max, int rd,
+                          int x_pos, int y_pos, int intra);
 
 int av1_obmc_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
                                 MV *mvp_full, int step_param, int sadpb,
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index c0aa935..f062870 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -7014,10 +7014,10 @@
 
   switch (mbmi->motion_mode) {
     case SIMPLE_TRANSLATION:
-      bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
-                                      sadpb, cond_cost_list(cpi, cost_list),
-                                      &ref_mv, INT_MAX, 1, (MI_SIZE * mi_col),
-                                      (MI_SIZE * mi_row), 0);
+      bestsme = av1_full_pixel_search(
+          cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0,
+          sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1,
+          (MI_SIZE * mi_col), (MI_SIZE * mi_row), 0);
       break;
     case OBMC_CAUSAL:
       bestsme = av1_obmc_full_pixel_diamond(
@@ -9673,8 +9673,8 @@
     int sadpb = x->sadperbit16;
     int cost_list[5];
     int bestsme = av1_full_pixel_search(
-        cpi, x, bsize, &mvp_full, step_param, sadpb,
-        cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1,
+        cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0,
+        sadpb, cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1,
         (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1);
 
     x->mv_limits = tmp_mv_limits;
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index d335dfc..054eea3 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -218,7 +218,8 @@
 static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
                                               uint8_t *arf_frame_buf,
                                               uint8_t *frame_ptr_buf,
-                                              int stride) {
+                                              int stride, int x_pos,
+                                              int y_pos) {
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
@@ -254,11 +255,9 @@
   x->mvcost = x->mv_cost_stack;
   x->nmvjointcost = x->nmv_vec_cost;
 
-  // Use mv costing from x->mvcost directly
-  av1_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
-                 cond_cost_list(cpi, cost_list), &cpi->fn_ptr[BLOCK_16X16], 0,
-                 &best_ref_mv1);
-
+  av1_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param,
+                        NSTEP, 1, sadpb, cond_cost_list(cpi, cost_list),
+                        &best_ref_mv1, 0, 0, x_pos, y_pos, 0);
   x->mv_limits = tmp_mv_limits;
 
   // Ignore mv costing by sending NULL pointer instead of cost array
@@ -374,7 +373,8 @@
           // Find best match in this frame by MC
           int err = temporal_filter_find_matching_mb_c(
               cpi, frames[alt_ref_index]->y_buffer + mb_y_offset,
-              frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride);
+              frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride,
+              mb_col * 16, mb_row * 16);
 
           // Assign higher weight to matching MB if it's error
           // score is lower. If not applying MC default behavior