Use rd cost to decide final motion vector

Use the RD cost to pick the final motion vector in the last, subpel step
of the motion vector search (a short sketch of the decision rule follows
the numbers below). Compression performance improves at speed 1, q mode:

         overall PSNR    SSIM
lowres      -0.10%      -0.17%
midres2     -0.07%      -0.05%
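
The decision rule itself is small: build the prediction for each of the
two subpel candidates, estimate the RD cost (mv rate plus the estimated
luma transform rate and distortion), and keep the cheaper one. The
standalone sketch below illustrates just the comparison; the rd_cost()
helper and the input numbers are simplified, made-up stand-ins for
libaom's RDCOST() macro and the encoder's real rate/distortion
estimates.

  #include <stdint.h>
  #include <stdio.h>

  // Simplified stand-in for libaom's RDCOST(): weight the rate by the
  // Lagrange multiplier rdmult and add the distortion. The encoder's
  // exact fixed-point scaling is omitted here.
  static int64_t rd_cost(int rdmult, int rate, int64_t dist) {
    return (int64_t)rdmult * rate + (dist << 7);
  }

  int main(void) {
    const int rdmult = 128;
    // Hypothetical rate (mv bits + txfm bits) and distortion (residual
    // SSE) for the best and the second-best subpel candidates.
    const int rate_best = 240, rate_second = 210;
    const int64_t dist_best = 5000, dist_second = 5200;

    const int64_t rd = rd_cost(rdmult, rate_best, dist_best);
    const int64_t tmp_rd = rd_cost(rdmult, rate_second, dist_second);

    // Keep the second candidate only when its total RD cost is lower,
    // mirroring the "if (tmp_rd < rd)" checks in this patch.
    printf("keep the %s candidate\n", tmp_rd < rd ? "second" : "best");
    return 0;
  }

In the patch, the rate comes from av1_mv_bit_cost() plus
av1_estimate_txfm_yrd(), and the distortion from the same transform
estimate.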

STATS_CHANGED

Change-Id: I7fc3c3933e69262108781df079c417ac7e9066e5
diff --git a/av1/encoder/motion_search_facade.c b/av1/encoder/motion_search_facade.c
index 5113d72..6294895 100644
--- a/av1/encoder/motion_search_facade.c
+++ b/av1/encoder/motion_search_facade.c
@@ -20,6 +20,7 @@
 #include "av1/encoder/partition_strategy.h"
 #include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/tpl_model.h"
+#include "av1/encoder/tx_search.h"
 
 #define RIGHT_SHIFT_MV(x) (((x) + 3 + ((x) >= 0)) >> 3)
 
@@ -303,19 +304,52 @@
         if (cpi->sf.mv_sf.use_accurate_subpel_search) {
           const int try_second = second_best_mv.as_int != INVALID_MV &&
                                  second_best_mv.as_int != best_mv->as_int;
-          const int best_mv_var = mv_search_params->find_fractional_mv_step(
+          mv_search_params->find_fractional_mv_step(
               xd, cm, &ms_params, subpel_start_mv, &best_mv->as_mv, &dis,
               &x->pred_sse[ref], fractional_ms_list);
 
           if (try_second) {
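+            // Build the prediction for the current best subpel MV and
+            // measure its RD cost (mv rate plus estimated luma transform
+            // rate and distortion).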
+            struct macroblockd_plane *p = xd->plane;
+            const BUFFER_SET orig_dst = {
+              { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
+              { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+            };
+            mbmi->mv[0].as_mv = best_mv->as_mv;
+            av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
+                                          bsize, 0, 0);
+            av1_subtract_plane(x, bsize, 0);
+            RD_STATS this_rd_stats;
+            av1_init_rd_stats(&this_rd_stats);
+            av1_estimate_txfm_yrd(cpi, x, &this_rd_stats, INT64_MAX, bsize,
+                                  max_txsize_rect_lookup[bsize]);
+            int this_mv_rate = av1_mv_bit_cost(
+                &best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+                mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+            int64_t rd = RDCOST(x->rdmult, this_mv_rate + this_rd_stats.rate,
+                                this_rd_stats.dist);
+
             MV this_best_mv;
             subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv);
             if (av1_is_subpelmv_in_range(&ms_params.mv_limits,
                                          subpel_start_mv)) {
-              const int this_var = mv_search_params->find_fractional_mv_step(
+              mv_search_params->find_fractional_mv_step(
                   xd, cm, &ms_params, subpel_start_mv, &this_best_mv, &dis,
                   &x->pred_sse[ref], fractional_ms_list);
-              if (this_var < best_mv_var) best_mv->as_mv = this_best_mv;
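+              // Repeat the RD measurement for the second-best candidate
+              // and keep it only if its RD cost is lower.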
+              mbmi->mv[0].as_mv = this_best_mv;
+              av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
+                                            bsize, 0, 0);
+              av1_subtract_plane(x, bsize, 0);
+              RD_STATS tmp_rd_stats;
+              av1_init_rd_stats(&tmp_rd_stats);
+              av1_estimate_txfm_yrd(cpi, x, &tmp_rd_stats, INT64_MAX, bsize,
+                                    max_txsize_rect_lookup[bsize]);
+              int tmp_mv_rate = av1_mv_bit_cost(
+                  &this_best_mv, &ref_mv, mv_costs->nmv_joint_cost,
+                  mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+              int64_t tmp_rd =
+                  RDCOST(x->rdmult, tmp_rd_stats.rate + tmp_mv_rate,
+                         tmp_rd_stats.dist);
+              if (tmp_rd < rd) best_mv->as_mv = this_best_mv;
             }
           }
         } else {
@@ -374,7 +408,8 @@
   // Prediction buffer from second frame.
   DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]);
   uint8_t *second_pred = get_buf_by_bd(xd, second_pred16);
-  int_mv best_mv;
+
+  int_mv best_mv, second_best_mv;
 
   // Allow joint search multiple times iteratively for each reference frame
   // and break out of the search loop if it couldn't find a better mv.
@@ -453,13 +488,18 @@
     // Small-range full-pixel motion search.
     if (!cpi->sf.mv_sf.disable_extensive_joint_motion_search &&
         mbmi->interinter_comp.type != COMPOUND_WEDGE) {
-      bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL,
-                                      &best_mv.as_fullmv, NULL);
+      bestsme =
+          av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL,
+                                &best_mv.as_fullmv, &second_best_mv.as_fullmv);
     } else {
       bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv,
                                          &best_mv.as_fullmv);
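+      // The refining search reports no second-best MV; copying best_mv
+      // here keeps try_second false below.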
+      second_best_mv = best_mv;
     }
 
+    const int try_second = second_best_mv.as_int != INVALID_MV &&
+                           second_best_mv.as_int != best_mv.as_int;
+
     // Restore the pointer to the first (possibly scaled) prediction buffer.
     if (id) xd->plane[plane].pre[0] = ref_yv12[0];
 
@@ -493,6 +533,58 @@
       MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
       bestsme = cpi->mv_search_params.find_fractional_mv_step(
           xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis, &sse, NULL);
+
+      if (try_second) {
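+        // Build the compound prediction for the current best MV of this
+        // reference and measure its RD cost (mv rate plus estimated luma
+        // transform rate and distortion).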
+        struct macroblockd_plane *p = xd->plane;
+        const BUFFER_SET orig_dst = {
+          { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
+          { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+        };
+        mbmi->mv[id].as_mv = best_mv.as_mv;
+        mbmi->mv[!id].as_mv = cur_mv[!id].as_mv;
+
+        xd->plane[plane].pre[0] = ref_yv12[0];
+        xd->plane[plane].pre[1] = ref_yv12[1];
+
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize,
+                                      0, 0);
+        av1_subtract_plane(x, bsize, 0);
+        RD_STATS this_rd_stats;
+        av1_init_rd_stats(&this_rd_stats);
+        av1_estimate_txfm_yrd(cpi, x, &this_rd_stats, INT64_MAX, bsize,
+                              max_txsize_rect_lookup[bsize]);
+        int this_mv_rate = av1_mv_bit_cost(
+            &best_mv.as_mv, &ref_mv[id].as_mv, mv_costs->nmv_joint_cost,
+            mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+        int64_t rd = RDCOST(x->rdmult, this_mv_rate + this_rd_stats.rate,
+                            this_rd_stats.dist);
+
+        MV this_best_mv;
+        MV subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv);
+        if (av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)) {
+          const int this_var = cpi->mv_search_params.find_fractional_mv_step(
+              xd, cm, &ms_params, subpel_start_mv, &this_best_mv, &dis, &sse,
+              NULL);
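+          // Measure the RD cost of the second-best candidate and keep it
+          // only if it is cheaper.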
+          mbmi->mv[id].as_mv = this_best_mv;
+          mbmi->mv[!id].as_mv = cur_mv[!id].as_mv;
+          av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
+                                        bsize, 0, 0);
+          av1_subtract_plane(x, bsize, 0);
+          RD_STATS tmp_rd_stats;
+          av1_init_rd_stats(&tmp_rd_stats);
+          av1_estimate_txfm_yrd(cpi, x, &tmp_rd_stats, INT64_MAX, bsize,
+                                max_txsize_rect_lookup[bsize]);
+          int tmp_mv_rate = av1_mv_bit_cost(
+              &this_best_mv, &ref_mv[id].as_mv, mv_costs->nmv_joint_cost,
+              mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+          int64_t tmp_rd = RDCOST(x->rdmult, tmp_rd_stats.rate + tmp_mv_rate,
+                                  tmp_rd_stats.dist);
+          if (tmp_rd < rd) {
+            best_mv.as_mv = this_best_mv;
+            bestsme = AOMMIN(bestsme, this_var);
+          }
+        }
+      }
     }
 
     // Restore the pointer to the first prediction buffer.