Improve full pixel motion search using TPL stats

Use the TPL mv results, combined with the neighbors' mv information, to
improve full-pixel motion search accuracy. This improves encoding quality.

Borg test result at speed 1:
       avg_psnr:  ovr_psnr:  ssim:   avg speed change:
hdres:  -0.142    -0.151    -0.168      -1.2%
midres: -0.173    -0.168    -0.180      -1.1%
lowres: -0.031    -0.031    -0.075      -0.3%
This change is disabled for speed > 2.

STATS_CHANGED

Change-Id: I16c0c156f0e7c1e80353dde0d1eb24097e7fa105
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index eaa586d..e0a96c2 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -476,6 +476,8 @@
   int valid_cost_b;
   int64_t inter_cost_b[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB];
   int64_t intra_cost_b[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB];
+  int_mv mv_b[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB]
+             [INTER_REFS_PER_FRAME];
   int cost_stride;
 
   // The type of mv cost used during motion search
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 1c3038d..5a3c2d3 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -3968,7 +3968,8 @@
 
 static int get_tpl_stats_b(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
                            int mi_col, int64_t *intra_cost_b,
-                           int64_t *inter_cost_b, int *stride) {
+                           int64_t *inter_cost_b,
+                           int_mv mv_b[][INTER_REFS_PER_FRAME], int *stride) {
   if (!cpi->oxcf.enable_tpl_model) return 0;
   if (cpi->superres_mode != SUPERRES_NONE) return 0;
   if (cpi->common.current_frame.frame_type == KEY_FRAME) return 0;
@@ -3995,6 +3996,7 @@
       coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
   const int mi_col_end_sr =
       coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+  // mi_cols_sr is mi_cols in the superres case.
   const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
 
   // TPL store unit size is not the same as the motion estimation unit size.
@@ -4004,7 +4006,10 @@
   const int step = mi_size_wide[tpl_bsize];
   assert(mi_size_wide[tpl_bsize] == mi_size_high[tpl_bsize]);
 
-  *stride = (mi_col_end_sr - mi_col_sr) / step;
+  const int str = (mi_col_end_sr > mi_cols_sr)
+                      ? (mi_cols_sr - mi_col_sr) / step
+                      : (mi_col_end_sr - mi_col_sr) / step;
+  *stride = str;
 
   for (int row = mi_row; row < mi_row + mi_high; row += step) {
     for (int col = mi_col_sr; col < mi_col_end_sr; col += step) {
@@ -4013,6 +4018,7 @@
           row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
       inter_cost_b[mi_count] = this_stats->inter_cost;
       intra_cost_b[mi_count] = this_stats->intra_cost;
+      memcpy(mv_b[mi_count], this_stats->mv, sizeof(this_stats->mv));
       mi_count++;
     }
   }
@@ -4743,7 +4749,7 @@
     // No stats for overlay frames. Exclude key frame.
     x->valid_cost_b =
         get_tpl_stats_b(cpi, sb_size, mi_row, mi_col, x->intra_cost_b,
-                        x->inter_cost_b, &x->cost_stride);
+                        x->inter_cost_b, x->mv_b, &x->cost_stride);
 
     reset_partition(pc_root, sb_size);
 
@@ -4788,7 +4794,8 @@
                         max_sq_size, min_sq_size, &dummy_rdc, dummy_rdc,
                         pc_root, NULL, SB_WET_PASS, NULL);
     }
-
+    // Reset to 0 so that it isn't mistakenly used elsewhere.
+    x->valid_cost_b = 0;
 #if CONFIG_COLLECT_COMPONENT_TIMING
     end_timing(cpi, rd_pick_partition_time);
 #endif
diff --git a/av1/encoder/motion_search_facade.c b/av1/encoder/motion_search_facade.c
index e12fc4e..fe14fdb 100644
--- a/av1/encoder/motion_search_facade.c
+++ b/av1/encoder/motion_search_facade.c
@@ -19,6 +19,23 @@
 #include "av1/encoder/motion_search_facade.h"
 #include "av1/encoder/partition_strategy.h"
 #include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tpl_model.h"
+
+#define RIGHT_SHIFT_MV(x) (((x) + 3 + ((x) >= 0)) >> 3)  // round(x / 8)
+
+typedef struct {   // One start-point candidate for the full-pixel search.
+  FULLPEL_MV fmv;  // Candidate mv passed to av1_full_pixel_search().
+  int weight;      // Ranking weight; compare_weight() sorts larger first.
+} cand_mv_t;
+
+// qsort() comparator: orders cand_mv_t entries by descending weight.
+static int compare_weight(const void *a, const void *b) {
+  const int wa = ((const cand_mv_t *)a)->weight;
+  const int wb = ((const cand_mv_t *)b)->weight;
+  if (wa < wb) return 1;
+  if (wa > wb) return -1;
+  return 0;
+}
 
 void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
                               BLOCK_SIZE bsize, int ref_idx, int *rate_mv,
@@ -100,6 +117,73 @@
   }
 
   const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv;
+  FULLPEL_MV start_mv;
+  if (mbmi->motion_mode != SIMPLE_TRANSLATION)
+    start_mv = get_fullmv_from_mv(&mbmi->mv[0].as_mv);
+  else
+    start_mv = get_fullmv_from_mv(&ref_mv);
+
+  cand_mv_t cand[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB] = { { { 0, 0 },
+                                                                      0 } };
+  cand[0].fmv = start_mv;
+  int cnt = 1;
+  int total_weight = 0;
+
+  if (!cpi->sf.mv_sf.full_pixel_search_level &&
+      mbmi->motion_mode == SIMPLE_TRANSLATION) {
+    if (x->valid_cost_b) {
+      const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+      const int tplw = mi_size_wide[tpl_bsize];
+      const int tplh = mi_size_high[tpl_bsize];
+      const int nw = mi_size_wide[bsize] / tplw;
+      const int nh = mi_size_high[bsize] / tplh;
+
+      if (nw >= 1 && nh >= 1) {
+        const int of_h = mi_row % mi_size_high[cm->seq_params.sb_size];
+        const int of_w = mi_col % mi_size_wide[cm->seq_params.sb_size];
+        const int start = of_h / tplh * x->cost_stride + of_w / tplw;
+        int valid = 1;
+
+        // Assign large weight to start_mv, so it is always tested.
+        cand[0].weight = nw * nh;
+
+        for (int k = 0; k < nh; k++) {
+          for (int l = 0; l < nw; l++) {
+            const int_mv mv =
+                x->mv_b[start + k * x->cost_stride + l][ref - LAST_FRAME];
+            if (mv.as_int == INVALID_MV) {
+              valid = 0;
+              break;
+            }
+
+            const FULLPEL_MV fmv = { GET_MV_RAWPEL(mv.as_mv.row),
+                                     GET_MV_RAWPEL(mv.as_mv.col) };
+            int unique = 1;
+            for (int m = 0; m < cnt; m++) {
+              if (RIGHT_SHIFT_MV(fmv.row) == RIGHT_SHIFT_MV(cand[m].fmv.row) &&
+                  RIGHT_SHIFT_MV(fmv.col) == RIGHT_SHIFT_MV(cand[m].fmv.col)) {
+                unique = 0;
+                cand[m].weight++;
+                break;
+              }
+            }
+
+            if (unique) {
+              cand[cnt].fmv = fmv;
+              cand[cnt].weight = 1;
+              cnt++;
+            }
+          }
+          if (!valid) break;
+        }
+
+        if (valid) {
+          total_weight = 2 * nh * nw;
+          if (cnt > 2) qsort(cand, cnt, sizeof(cand[0]), &compare_weight);
+        }
+      }
+    }
+  }
 
   // Further reduce the search range.
   if (search_range < INT_MAX) {
@@ -115,12 +199,6 @@
     }
   }
 
-  FULLPEL_MV start_mv;
-  if (mbmi->motion_mode != SIMPLE_TRANSLATION)
-    start_mv = get_fullmv_from_mv(&mbmi->mv[0].as_mv);
-  else
-    start_mv = get_fullmv_from_mv(&ref_mv);
-
   int cost_list[5];
   int_mv second_best_mv;
   x->best_mv.as_int = second_best_mv.as_int = INVALID_MV;
@@ -132,11 +210,34 @@
                                      src_search_sites);
 
   switch (mbmi->motion_mode) {
-    case SIMPLE_TRANSLATION:
-      bestsme = av1_full_pixel_search(
-          start_mv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list),
-          &x->best_mv.as_fullmv, &second_best_mv.as_fullmv);
-      break;
+    case SIMPLE_TRANSLATION: {
+      int cur_bestsme = bestsme;
+      FULLPEL_MV best_mv;
+      int sum_weight = 0;
+      FULLPEL_MV second_best_mv0;
+
+      for (int m = 0; m < cnt; m++) {
+        FULLPEL_MV smv = cand[m].fmv;
+        bestsme = av1_full_pixel_search(
+            smv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list),
+            &x->best_mv.as_fullmv, &second_best_mv.as_fullmv);
+
+        // Use first search's second_best_mv
+        // TODO(yunqing): second_best_mv decision will be improved later.
+        if (!m) second_best_mv0 = second_best_mv.as_fullmv;
+
+        if (!m || bestsme < cur_bestsme) {
+          cur_bestsme = bestsme;
+          best_mv = x->best_mv.as_fullmv;
+        }
+
+        sum_weight += cand[m].weight;
+        if (m >= 2 || 4 * sum_weight > 3 * total_weight) break;
+      }
+      x->best_mv.as_fullmv = best_mv;
+      bestsme = cur_bestsme;
+      second_best_mv.as_fullmv = second_best_mv0;
+    } break;
     case OBMC_CAUSAL:
       bestsme = av1_obmc_full_pixel_search(start_mv, &full_ms_params,
                                            step_param, &(x->best_mv.as_fullmv));
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index ecba055..28af368 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -441,6 +441,7 @@
     // The values in x->pred_mv[] differ for single and multi-thread cases.
     // See aomedia:1778.
     // sf->mv_sf.adaptive_motion_search = 1;
+    sf->mv_sf.full_pixel_search_level = 1;
     sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
     sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS;
     sf->mv_sf.search_method = DIAMOND;
@@ -630,6 +631,7 @@
   sf->intra_sf.intra_pruning_with_hog = 1;
   sf->intra_sf.intra_pruning_with_hog_thresh = -1.2f;
 
+  sf->mv_sf.full_pixel_search_level = 1;
   sf->mv_sf.exhaustive_searches_thresh = INT_MAX;
 
   sf->rt_sf.check_intra_pred_nonrd = 1;
@@ -928,6 +930,7 @@
 }
 
 static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) {
+  mv_sf->full_pixel_search_level = 0;
   mv_sf->adaptive_motion_search = 0;
   mv_sf->auto_mv_step_size = 0;
   mv_sf->exhaustive_searches_thresh = 0;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index e43fb10..96baf5d 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -472,6 +472,9 @@
   // 0: obmc_full_pixel_diamond
   // 1: obmc_refining_search_sad (faster)
   int obmc_full_pixel_search_level;
+
+  // TPL-stats-based accurate full pixel motion search: 0 enables, 1 disables.
+  int full_pixel_search_level;
 } MV_SPEED_FEATURES;
 
 typedef struct INTER_MODE_SPEED_FEATURES {