Improve MV prediction accuracy to achieve performance gain

Add vp8_mv_pred() to better predict starting MV for NEWMV
mode in vp8_rd_pick_inter_mode(). Set different search
ranges according to MV prediction accuracy, which improves
encoder performance without hurting the quality. Also,
as Yaowu suggested, using diamond search result as full
search starting point and therefore adjusting(reducing)
full search range helps the performance.

Change-Id: Ie4a3c8df87e697c1f4f6e2ddb693766bba1b77b6
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 8a94fa3..539a28f 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -472,7 +472,7 @@
     xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
 
     // Initial step/diamond search centred on best mv
-    tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost);
+    tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost, ref_mv);
     if ( tmp_err < INT_MAX-new_mv_mode_penalty )
         tmp_err += new_mv_mode_penalty;
 
@@ -495,7 +495,7 @@
             num00--;
         else
         {
-            tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param + n, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost);
+            tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param + n, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost, ref_mv);
             if ( tmp_err < INT_MAX-new_mv_mode_penalty )
                 tmp_err += new_mv_mode_penalty;
 
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index bb85afa..b9d86a2 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -913,7 +913,8 @@
     int *num00,
     vp8_variance_fn_ptr_t *fn_ptr,
     int *mvsadcost[2],
-    int *mvcost[2]
+    int *mvcost[2],
+    MV *center_mv
 )
 {
     int i, j, step;
@@ -949,7 +950,7 @@
     (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
     {
         // Check the starting position
-        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
     }
 
     // search_param determines the length of the initial step and hence the number of iterations
@@ -982,7 +983,7 @@
                 {
                     this_mv.row = this_row_offset << 3;
                     this_mv.col = this_col_offset << 3;
-                    thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+                    thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
 
                     if (thissad < bestsad)
                     {
@@ -1013,7 +1014,7 @@
         return INT_MAX;
 
     return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
-    + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
 }
 
 int vp8_diamond_search_sadx4
@@ -1028,7 +1029,8 @@
     int *num00,
     vp8_variance_fn_ptr_t *fn_ptr,
     int *mvsadcost[2],
-    int *mvcost[2]
+    int *mvcost[2],
+    MV *center_mv
 )
 {
     int i, j, step;
@@ -1064,7 +1066,7 @@
     (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
     {
         // Check the starting position
-        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
     }
 
     // search_param determines the length of the initial step and hence the number of iterations
@@ -1108,7 +1110,7 @@
                     {
                         this_mv.row = (best_mv->row + ss[i].mv.row) << 3;
                         this_mv.col = (best_mv->col + ss[i].mv.col) << 3;
-                        sad_array[t] += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+                        sad_array[t] += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
 
                         if (sad_array[t] < bestsad)
                         {
@@ -1137,7 +1139,7 @@
                     {
                         this_mv.row = this_row_offset << 3;
                         this_mv.col = this_col_offset << 3;
-                        thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+                        thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
 
                         if (thissad < bestsad)
                         {
@@ -1168,12 +1170,12 @@
         return INT_MAX;
 
     return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
-    + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
 }
 
 
 #if !(CONFIG_REALTIME_ONLY)
-int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
+int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
 {
     unsigned char *what = (*(b->base_src) + b->src);
     int what_stride = b->src_stride;
@@ -1211,7 +1213,7 @@
         // Baseline value at the centre
 
         //bestsad = fn_ptr->sf( what,what_stride,bestaddress,in_what_stride) + (int)sqrt(vp8_mv_err_cost(ref_mv,ref_mv, mvcost,error_per_bit*14));
-        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
     }
 
     // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
@@ -1239,7 +1241,7 @@
             this_mv.col = c << 3;
             //thissad += (int)sqrt(vp8_mv_err_cost(&this_mv,ref_mv, mvcost,error_per_bit*14));
             //thissad  += error_per_bit * mv_bits_sadcost[mv_bits(&this_mv, ref_mv, mvcost)];
-            thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
+            thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
 
             if (thissad < bestsad)
             {
@@ -1258,12 +1260,12 @@
 
     if (bestsad < INT_MAX)
         return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-        + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+        + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
     else
         return INT_MAX;
 }
 
-int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
+int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
 {
     unsigned char *what = (*(b->base_src) + b->src);
     int what_stride = b->src_stride;
@@ -1301,7 +1303,7 @@
     (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
     {
         // Baseline value at the centre
-        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
     }
 
     // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
@@ -1336,7 +1338,7 @@
                 if (thissad < bestsad)
                 {
                     this_mv.col = c << 3;
-                    thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+                    thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
 
                     if (thissad < bestsad)
                     {
@@ -1359,7 +1361,7 @@
             if (thissad < bestsad)
             {
                 this_mv.col = c << 3;
-                thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+                thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
 
                 if (thissad < bestsad)
                 {
@@ -1381,14 +1383,14 @@
 
     if (bestsad < INT_MAX)
         return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-        + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+        + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
     else
         return INT_MAX;
 }
 #endif
 
 
-int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
+int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
 {
     unsigned char *what = (*(b->base_src) + b->src);
     int what_stride = b->src_stride;
@@ -1427,7 +1429,7 @@
     (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
     {
         // Baseline value at the centre
-        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
     }
 
     // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
@@ -1462,7 +1464,7 @@
                 if (thissad < bestsad)
                 {
                     this_mv.col = c << 3;
-                    thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+                    thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
 
                     if (thissad < bestsad)
                     {
@@ -1491,7 +1493,7 @@
                 if (thissad < bestsad)
                 {
                     this_mv.col = c << 3;
-                    thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+                    thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
 
                     if (thissad < bestsad)
                     {
@@ -1514,7 +1516,7 @@
             if (thissad < bestsad)
             {
                 this_mv.col = c << 3;
-                thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+                thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
 
                 if (thissad < bestsad)
                 {
@@ -1535,7 +1537,7 @@
 
     if (bestsad < INT_MAX)
         return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-        + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+        + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
     else
         return INT_MAX;
 }
diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h
index 7d60362..b54fe73 100644
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -67,7 +67,8 @@
      int distance, \
      vp8_variance_fn_ptr_t *fn_ptr, \
      int *mvcost[2], \
-     int *mvsadcost[2] \
+     int *mvsadcost[2], \
+     MV *center_mv \
     )
 
 #define prototype_diamond_search_sad(sym)\
@@ -83,7 +84,8 @@
      int *num00, \
      vp8_variance_fn_ptr_t *fn_ptr, \
      int *mvsadcost[2], \
-     int *mvcost[2] \
+     int *mvcost[2], \
+     MV *center_mv \
     )
 
 #if ARCH_X86 || ARCH_X86_64
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 5f02a5a..fd7b566 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -283,6 +283,21 @@
 
 void vp8_dealloc_compressor_data(VP8_COMP *cpi)
 {
+    // Delete last frame MV storage buffers
+    if (cpi->lfmv != 0)
+        vpx_free(cpi->lfmv);
+
+    cpi->lfmv = 0;
+
+    if (cpi->lf_ref_frame_sign_bias != 0)
+        vpx_free(cpi->lf_ref_frame_sign_bias);
+
+    cpi->lf_ref_frame_sign_bias = 0;
+
+    if (cpi->lf_ref_frame != 0)
+        vpx_free(cpi->lf_ref_frame);
+
+    cpi->lf_ref_frame = 0;
 
     // Delete sementation map
     if (cpi->segmentation_map != 0)
@@ -2145,7 +2160,10 @@
     cpi->alt_is_last  = 0 ;
     cpi->gold_is_alt  = 0 ;
 
-
+    // allocate memory for storing last frame's MVs for MV prediction.
+    CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cpi->common.mb_rows+1) * (cpi->common.mb_cols+1), sizeof(int_mv)));
+    CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias, vpx_calloc((cpi->common.mb_rows+1) * (cpi->common.mb_cols+1), sizeof(int)));
+    CHECK_MEM_ERROR(cpi->lf_ref_frame, vpx_calloc((cpi->common.mb_rows+1) * (cpi->common.mb_cols+1), sizeof(int)));
 
     // Create the encoder segmentation map and set all entries to 0
     CHECK_MEM_ERROR(cpi->segmentation_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
@@ -4165,6 +4183,60 @@
     }
 #endif
 
+
+
+
+////////////////////////////////
+////////////////////////////////
+    // This frame's MVs are saved and will be used in next frame's MV prediction.
+    if(cm->show_frame)   //do not save for altref frame
+    {
+      int mb_row;
+      int mb_col;
+      MODE_INFO *tmp = cm->mip; //point to beginning of allocated MODE_INFO arrays.
+      //static int last_video_frame = 0;
+
+      /*
+      if (cm->current_video_frame == 0)   //first frame: set to 0
+      {
+        for (mb_row = 0; mb_row < cm->mb_rows+1; mb_row ++)
+        {
+            for (mb_col = 0; mb_col < cm->mb_cols+1; mb_col ++)
+            {
+                cpi->lfmv[mb_col + mb_row*(cm->mode_info_stride)].as_int = 0;
+                cpi->lf_ref_frame_sign_bias[mb_col + mb_row*(cm->mode_info_stride)] = 0;
+                cpi->lf_ref_frame[mb_col + mb_row*(cm->mode_info_stride)] = 0;
+            }
+        }
+      }else
+      */
+
+      if(cm->frame_type != KEY_FRAME)
+      {
+        for (mb_row = 0; mb_row < cm->mb_rows+1; mb_row ++)
+        {
+          for (mb_col = 0; mb_col < cm->mb_cols+1; mb_col ++)
+          {
+              if(tmp->mbmi.ref_frame != INTRA_FRAME)
+                cpi->lfmv[mb_col + mb_row*(cm->mode_info_stride)].as_int = tmp->mbmi.mv.as_int;
+
+              cpi->lf_ref_frame_sign_bias[mb_col + mb_row*(cm->mode_info_stride)] = cm->ref_frame_sign_bias[tmp->mbmi.ref_frame];
+              cpi->lf_ref_frame[mb_col + mb_row*(cm->mode_info_stride)] = tmp->mbmi.ref_frame;
+              //printf("[%d, %d]  ", cpi->lfmv[mb_col + mb_row*(cm->mode_info_stride-1)].as_mv.row, cpi->lfmv[mb_col + mb_row*(cm->mode_info_stride-1)].as_mv.col);
+              tmp++;
+          }
+        }
+
+      //last_video_frame = cm->current_video_frame;
+      }
+    }
+
+//printf("after: %d   %d \n", cm->current_video_frame, cm->show_frame );
+
+
+
+
+
     // Update the GF useage maps.
     // This is done after completing the compression of a frame when all modes etc. are finalized but before loop filter
     vp8_update_gf_useage_maps(cpi, cm, &cpi->mb);
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 81e32f0..31e627b 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -239,6 +239,12 @@
     BLOCK_MAX_SEGMENTS
 };
 
+typedef union
+{
+    unsigned int as_int;
+    MV           as_mv;
+} int_mv;        /* facilitates rapid equality tests */
+
 typedef struct
 {
 
@@ -661,6 +667,10 @@
     unsigned char *gf_active_flags;   // Record of which MBs still refer to last golden frame either directly or through 0,0
     int gf_active_count;
 
+    //Store last frame's MV info for next frame MV prediction
+    int_mv *lfmv;
+    int *lf_ref_frame_sign_bias;
+    int *lf_ref_frame;
 
 } VP8_COMP;
 
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 2f7dd9c..8dfca35 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -685,7 +685,7 @@
 #if 0
 
             // Initial step Search
-            bestsme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, cpi->mb.mvcost);
+            bestsme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, cpi->mb.mvcost, &best_ref_mv1);
             mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
             mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
 
@@ -698,7 +698,7 @@
                     num00--;
                 else
                 {
-                    thissme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, x->mvcost);
+                    thissme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, x->mvcost, &best_ref_mv1);
 
                     if (thissme < bestsme)
                     {
@@ -724,7 +724,7 @@
             }
             else
             {
-                bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb < 9
+                bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv1); //sadpb < 9
                 mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                 mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
 
@@ -743,7 +743,7 @@
                         num00--;
                     else
                     {
-                        thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb = 9
+                        thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv1); //sadpb = 9
 
                         if (thissme < bestsme)
                         {
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 8a753fd..99129ac 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1236,7 +1236,7 @@
                             bestsme = vp8_hex_search(x, c, e, best_ref_mv, &mode_mv[NEW4X4], step_param, sadpb/*x->errorperbit*/, &num00, v_fn_ptr, x->mvsadcost, mvcost);
                         else
                         {
-                            bestsme = cpi->diamond_search_sad(x, c, e, best_ref_mv, &mode_mv[NEW4X4], step_param, sadpb / 2/*x->errorperbit*/, &num00, v_fn_ptr, x->mvsadcost, mvcost);
+                            bestsme = cpi->diamond_search_sad(x, c, e, best_ref_mv, &mode_mv[NEW4X4], step_param, sadpb / 2/*x->errorperbit*/, &num00, v_fn_ptr, x->mvsadcost, mvcost, best_ref_mv);
 
                             n = num00;
                             num00 = 0;
@@ -1249,7 +1249,7 @@
                                     num00--;
                                 else
                                 {
-                                    thissme = cpi->diamond_search_sad(x, c, e, best_ref_mv, &temp_mv, step_param + n, sadpb / 2/*x->errorperbit*/, &num00, v_fn_ptr, x->mvsadcost, mvcost);
+                                    thissme = cpi->diamond_search_sad(x, c, e, best_ref_mv, &temp_mv, step_param + n, sadpb / 2/*x->errorperbit*/, &num00, v_fn_ptr, x->mvsadcost, mvcost, best_ref_mv);
 
                                     if (thissme < bestsme)
                                     {
@@ -1264,7 +1264,7 @@
                         // Should we do a full search (best quality only)
                         if ((compressor_speed == 0) && (bestsme >> sseshift) > 4000)
                         {
-                            thissme = cpi->full_search_sad(x, c, e, best_ref_mv, sadpb / 4, 16, v_fn_ptr, x->mvcost, x->mvsadcost);
+                            thissme = cpi->full_search_sad(x, c, e, best_ref_mv, sadpb / 4, 16, v_fn_ptr, x->mvcost, x->mvsadcost, best_ref_mv);
 
                             if (thissme < bestsme)
                             {
@@ -1385,6 +1385,273 @@
 }
 
 
+
+/////////////////////////
+static void mv_bias(const MODE_INFO *x, int refframe, int_mv *mvp, const int *ref_frame_sign_bias)
+{
+    MV xmv;
+    xmv = x->mbmi.mv.as_mv;
+
+    if (ref_frame_sign_bias[x->mbmi.ref_frame] != ref_frame_sign_bias[refframe])
+    {
+        xmv.row *= -1;
+        xmv.col *= -1;
+    }
+
+    mvp->as_mv = xmv;
+}
+
+static void lf_mv_bias(const int lf_ref_frame_sign_bias, int refframe, int_mv *mvp, const int *ref_frame_sign_bias)
+{
+    MV xmv;
+    xmv = mvp->as_mv;
+
+    if (lf_ref_frame_sign_bias != ref_frame_sign_bias[refframe])
+    {
+        xmv.row *= -1;
+        xmv.col *= -1;
+    }
+
+    mvp->as_mv = xmv;
+}
+
+static void vp8_clamp_mv(MV *mv, const MACROBLOCKD *xd)
+{
+    if (mv->col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
+        mv->col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+    else if (mv->col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
+        mv->col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+
+    if (mv->row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
+        mv->row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+    else if (mv->row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
+        mv->row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+}
+
+static void swap(int *x,int *y)
+{
+   int tmp;
+
+   tmp = *x;
+   *x = *y;
+   *y = tmp;
+}
+
+static void quicksortmv(int arr[],int left, int right)
+{
+   int lidx,ridx,pivot;
+
+   lidx = left;
+   ridx = right;
+
+   if( left < right)
+   {
+      pivot = (left + right)/2;
+
+      while(lidx <=pivot && ridx >=pivot)
+      {
+          while(arr[lidx] < arr[pivot] && lidx <= pivot)
+              lidx++;
+          while(arr[ridx] > arr[pivot] && ridx >= pivot)
+              ridx--;
+          swap(&arr[lidx], &arr[ridx]);
+          lidx++;
+          ridx--;
+          if(lidx-1 == pivot)
+          {
+              ridx++;
+              pivot = ridx;
+          }
+          else if(ridx+1 == pivot)
+          {
+              lidx--;
+              pivot = lidx;
+          }
+      }
+      quicksortmv(arr, left, pivot - 1);
+      quicksortmv(arr, pivot + 1, right);
+   }
+}
+
+static void quicksortsad(int arr[],int idx[], int left, int right)
+{
+   int lidx,ridx,pivot;
+
+   lidx = left;
+   ridx = right;
+
+   if( left < right)
+   {
+      pivot = (left + right)/2;
+
+      while(lidx <=pivot && ridx >=pivot)
+      {
+          while(arr[lidx] < arr[pivot] && lidx <= pivot)
+              lidx++;
+          while(arr[ridx] > arr[pivot] && ridx >= pivot)
+              ridx--;
+          swap(&arr[lidx], &arr[ridx]);
+          swap(&idx[lidx], &idx[ridx]);
+          lidx++;
+          ridx--;
+          if(lidx-1 == pivot)
+          {
+              ridx++;
+              pivot = ridx;
+          }
+          else if(ridx+1 == pivot)
+          {
+              lidx--;
+              pivot = lidx;
+          }
+      }
+      quicksortsad(arr, idx, left, pivot - 1);
+      quicksortsad(arr, idx, pivot + 1, right);
+   }
+}
+
+//The improved MV prediction
+static void vp8_mv_pred
+(
+    VP8_COMP *cpi,
+    MACROBLOCKD *xd,
+    const MODE_INFO *here,
+    MV *mvp,
+    int refframe,
+    int *ref_frame_sign_bias,
+    int *sr,
+    int near_sadidx[]
+)
+{
+    const MODE_INFO *above = here - xd->mode_info_stride;
+    const MODE_INFO *left = here - 1;
+    const MODE_INFO *aboveleft = above - 1;
+    int_mv           near_mvs[7];
+    int              near_ref[7];
+    int_mv           mv;
+    int              vcnt=0;
+    int              find=0;
+    int              mb_offset;
+
+    int              mvx[7];
+    int              mvy[7];
+    int              i;
+
+    mv.as_int = 0;
+
+    if(here->mbmi.ref_frame != INTRA_FRAME)
+    {
+        near_mvs[0].as_int = near_mvs[1].as_int = near_mvs[2].as_int = near_mvs[3].as_int = near_mvs[4].as_int = near_mvs[5].as_int = near_mvs[6].as_int = 0;
+        near_ref[0] = near_ref[1] = near_ref[2] = near_ref[3] = near_ref[4] = near_ref[5] = near_ref[6] = 0;
+
+        // read in 3 nearby block's MVs from current frame as prediction candidates.
+        if (above->mbmi.ref_frame != INTRA_FRAME)
+        {
+            near_mvs[vcnt].as_int = above->mbmi.mv.as_int;
+            mv_bias(above, refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+            near_ref[vcnt] =  above->mbmi.ref_frame;
+        }
+        vcnt++;
+        if (left->mbmi.ref_frame != INTRA_FRAME)
+        {
+            near_mvs[vcnt].as_int = left->mbmi.mv.as_int;
+            mv_bias(left, refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+            near_ref[vcnt] =  left->mbmi.ref_frame;
+        }
+        vcnt++;
+        if (aboveleft->mbmi.ref_frame != INTRA_FRAME)
+        {
+            near_mvs[vcnt].as_int = aboveleft->mbmi.mv.as_int;
+            mv_bias(aboveleft, refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+            near_ref[vcnt] =  aboveleft->mbmi.ref_frame;
+        }
+        vcnt++;
+
+        // read in 4 nearby block's MVs from last frame.
+        if(cpi->common.last_frame_type != KEY_FRAME)
+        {
+            mb_offset = (-xd->mb_to_top_edge/128 + 1) * (xd->mode_info_stride) + (-xd->mb_to_left_edge/128 +1) ;
+
+            // current in last frame
+            if (cpi->lf_ref_frame[mb_offset] != INTRA_FRAME)
+            {
+                near_mvs[vcnt].as_int = cpi->lfmv[mb_offset].as_int;
+                lf_mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+                near_ref[vcnt] =  cpi->lf_ref_frame[mb_offset];
+            }
+            vcnt++;
+
+            // above in last frame
+            if (cpi->lf_ref_frame[mb_offset - xd->mode_info_stride] != INTRA_FRAME)
+            {
+                near_mvs[vcnt].as_int = cpi->lfmv[mb_offset - xd->mode_info_stride].as_int;
+                lf_mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset - xd->mode_info_stride], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+                near_ref[vcnt] =  cpi->lf_ref_frame[mb_offset - xd->mode_info_stride];
+            }
+            vcnt++;
+
+            // left in last frame
+            if (cpi->lf_ref_frame[mb_offset-1] != INTRA_FRAME)
+            {
+                near_mvs[vcnt].as_int = cpi->lfmv[mb_offset -1].as_int;
+                lf_mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset -1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+                near_ref[vcnt] =  cpi->lf_ref_frame[mb_offset - 1];
+            }
+            vcnt++;
+
+            // aboveleft in last frame
+            if (cpi->lf_ref_frame[mb_offset - xd->mode_info_stride -1] != INTRA_FRAME)
+            {
+                near_mvs[vcnt].as_int = cpi->lfmv[mb_offset - xd->mode_info_stride -1].as_int;
+                lf_mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset - xd->mode_info_stride -1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+                near_ref[vcnt] =  cpi->lf_ref_frame[mb_offset - xd->mode_info_stride -1];
+            }
+            vcnt++;
+        }
+
+        for(i=0; i< vcnt; i++)
+        {
+            if(near_ref[near_sadidx[i]] != INTRA_FRAME)
+            {
+                if(here->mbmi.ref_frame == near_ref[near_sadidx[i]])
+                {
+                    mv.as_int = near_mvs[near_sadidx[i]].as_int;
+                    find = 1;
+                    if(vcnt<2)
+                        *sr = 4;
+                    else if (vcnt<4)
+                        *sr = 3;
+                    else
+                        *sr = 2;
+                    break;
+                }
+            }
+        }
+
+        if(!find)
+        {
+            for(i=0; i<vcnt; i++)
+            {
+                mvx[i] = near_mvs[i].as_mv.row;
+                mvy[i] = near_mvs[i].as_mv.col;
+            }
+
+            quicksortmv (mvx, 0, vcnt-1);
+            quicksortmv (mvy, 0, vcnt-1);
+            mv.as_mv.row = mvx[vcnt/2];
+            mv.as_mv.col = mvy[vcnt/2];
+
+            find = 1;
+            //sr is set to 0 to allow calling function to decide the search range.
+            *sr = 0;
+        }
+    }
+
+    /* Set up return values */
+    *mvp = mv.as_mv;
+    vp8_clamp_mv(mvp, xd);
+}
+
 int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra)
 {
     BLOCK *b = &x->block[0];
@@ -1422,6 +1689,12 @@
     int tteob = 0;
     int force_no_skip = 0;
 
+    MV mvp;
+    int near_sad[7]; // 0-cf above, 1-cf left, 2-cf aboveleft, 3-lf current, 4-lf above, 5-lf left, 6-lf aboveleft
+    int near_sadidx[7] = {0, 1, 2, 3, 4, 5, 6};
+    int saddone=0;
+    int sr=0;    //search range got from mv_pred(). It uses step_param levels. (0-7)
+
     *returnintra = INT_MAX;
 
     vpx_memset(&best_mbmode, 0, sizeof(best_mbmode)); // clean
@@ -1589,6 +1862,72 @@
                           &mode_mv[NEARESTMV], &mode_mv[NEARMV], &best_ref_mv,
                           mdcounts, x->e_mbd.mode_info_context->mbmi.ref_frame, cpi->common.ref_frame_sign_bias);
 
+        if(x->e_mbd.mode_info_context->mbmi.mode == NEWMV)
+        {
+            if(!saddone)
+            {
+                //calculate sad for current frame 3 nearby MBs.
+                if( xd->mb_to_top_edge==0 && xd->mb_to_left_edge ==0)
+                {
+                    near_sad[0] = near_sad[1] = near_sad[2] = INT_MAX;
+                }else if(xd->mb_to_top_edge==0)
+                {   //only has left MB for sad calculation.
+                    near_sad[0] = near_sad[2] = INT_MAX;
+                    near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, 0x7fffffff);
+                }else if(xd->mb_to_left_edge ==0)
+                {   //only has left MB for sad calculation.
+                    near_sad[1] = near_sad[2] = INT_MAX;
+                    near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, 0x7fffffff);
+                }else
+                {
+                    near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, 0x7fffffff);
+                    near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, 0x7fffffff);
+                    near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride, 0x7fffffff);
+                }
+
+                if(cpi->common.last_frame_type != KEY_FRAME)
+                {
+                    //calculate sad for last frame 4 nearby MBs.
+                    unsigned char *pre_y_buffer = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_buffer + recon_yoffset;
+                    int pre_y_stride = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_stride;
+
+                    if( xd->mb_to_top_edge==0 && xd->mb_to_left_edge ==0)
+                    {
+                        near_sad[4] = near_sad[5] = near_sad[6] = INT_MAX;
+                        near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer, pre_y_stride, 0x7fffffff);
+                    }else if(xd->mb_to_top_edge==0)
+                    {   //only has left MB for sad calculation.
+                        near_sad[4] = near_sad[6] = INT_MAX;
+                        near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer, pre_y_stride, 0x7fffffff);
+                        near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer - 16, pre_y_stride, 0x7fffffff);
+                    }else if(xd->mb_to_left_edge ==0)
+                    {   //only has left MB for sad calculation.
+                        near_sad[5] = near_sad[6] = INT_MAX;
+                        near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer, pre_y_stride, 0x7fffffff);
+                        near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride, 0x7fffffff);
+                    }else
+                    {
+                        near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer, pre_y_stride, 0x7fffffff);
+                        near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride, 0x7fffffff);
+                        near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer - 16, pre_y_stride, 0x7fffffff);
+                        near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer - pre_y_stride *16 -16, pre_y_stride, 0x7fffffff);
+                    }
+                }
+
+                if(cpi->common.last_frame_type != KEY_FRAME)
+                {
+                    quicksortsad(near_sad, near_sadidx, 0, 6);
+                }else
+                {
+                    quicksortsad(near_sad, near_sadidx, 0, 2);
+                }
+
+                saddone = 1;
+            }
+
+            vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
+                        x->e_mbd.mode_info_context->mbmi.ref_frame, cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]);
+        }
 
         // Estimate the reference frame signaling cost and add it to the rolling cost variable.
         frame_cost = ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
@@ -1764,6 +2103,10 @@
                 int further_steps;
                 int n;
 
+                //adjust search range according to sr from mv prediction
+                if(sr > step_param)
+                    step_param = sr;
+
                 // Work out how long a search we should do
                 search_range = MAXF(abs(best_ref_mv.col), abs(best_ref_mv.row)) >> 3;
 
@@ -1784,7 +2127,7 @@
                     }
                     else
                     {
-                        bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb < 9
+                        bestsme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); //sadpb < 9
                         mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                         mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
 
@@ -1803,7 +2146,7 @@
                                 num00--;
                             else
                             {
-                                thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb = 9
+                                thissme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); //sadpb = 9
 
                                 if (thissme < bestsme)
                                 {
@@ -1827,9 +2170,14 @@
                 {
                     int thissme;
                     int full_flag_thresh = 0;
+                    MV full_mvp;
+
+                    full_mvp.row = d->bmi.mv.as_mv.row <<3;    // use diamond search result as full search staring point
+                    full_mvp.col = d->bmi.mv.as_mv.col <<3;
 
                     // Update x->vector_range based on best vector found in step search
-                    search_range = MAXF(abs(d->bmi.mv.as_mv.row), abs(d->bmi.mv.as_mv.col));
+                    search_range = MAXF(abs((mvp.row>>3) - d->bmi.mv.as_mv.row), abs((mvp.col>>3) - d->bmi.mv.as_mv.col));
+                    //search_range *= 1.4;  //didn't improve PSNR
 
                     if (search_range > x->vector_range)
                         x->vector_range = search_range;
@@ -1838,9 +2186,20 @@
 
                     // Apply limits
                     search_range = (search_range > cpi->sf.max_fs_radius) ? cpi->sf.max_fs_radius : search_range;
+
+                    //add this to reduce full search range.
+                    if(sr<=3 && search_range > 8) search_range = 8;
+
                     {
                         int sadpb = x->sadperbit16 >> 2;
-                        thissme = cpi->full_search_sad(x, b, d, &best_ref_mv, sadpb, search_range, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, x->mvsadcost);
+                        thissme = cpi->full_search_sad(x, b, d, &full_mvp, sadpb, search_range, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, x->mvsadcost,&best_ref_mv);
+                        /*
+                        MV dia_ref_mv;
+                        dia_ref_mv.row = d->bmi.mv.as_mv.row << 3;
+                        dia_ref_mv.col = d->bmi.mv.as_mv.col << 3;
+                        thissme = cpi->full_search_sad(x, b, d, &dia_ref_mv, sadpb, search_range, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, x->mvsadcost,&best_ref_mv);
+                        */
+
                     }
 
                     // Barrier threshold to initiating full search
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index fd5dd7e..745dc31 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -246,7 +246,7 @@
             step_param,
             sadpb / 2/*x->errorperbit*/,
             &num00, &cpi->fn_ptr[BLOCK_16X16],
-            mvsadcost, mvcost); //sadpb < 9
+            mvsadcost, mvcost, &best_ref_mv1); //sadpb < 9
 
         // Further step/diamond searches as necessary
         n = 0;
@@ -268,7 +268,7 @@
                     step_param + n,
                     sadpb / 4/*x->errorperbit*/,
                     &num00, &cpi->fn_ptr[BLOCK_16X16],
-                    mvsadcost, mvcost); //sadpb = 9
+                    mvsadcost, mvcost, &best_ref_mv1); //sadpb = 9
 
                 if (thissme < bestsme)
                 {