A new way of determining the reference motion vector

Use the surrounding reconstructed pixels to the left and above to select
the best-matching MV to use as the reference motion vector for MV encoding.

Test results:
       AVGPSNR  GLBPSNR VPXSSIM
Derf:  1.107%   1.062%  0.992%
Std-hd:1.209%   1.176%  1.029%

Change-Id: I8f10e09ee6538c05df2fb9f069abcaf1edb3fca6
diff --git a/configure b/configure
index b135874..2593a0e 100755
--- a/configure
+++ b/configure
@@ -226,6 +226,7 @@
     hybridtransform8x8
     switchable_interp
     tx16x16
+    newbestrefmv
 "
 CONFIG_LIST="
     external_build
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index 2932fd4..a0fe46c 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -267,6 +267,9 @@
   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   TX_SIZE txfm_size;
   int_mv mv, second_mv;
+#if CONFIG_NEWBESTREFMV
+  int_mv ref_mv, second_ref_mv;
+#endif
   unsigned char partitioning;
   unsigned char mb_skip_coeff;                                /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
   unsigned char need_to_clamp_mvs;
@@ -423,6 +426,9 @@
 #endif
 
   int mb_index;   // Index of the MB in the SB (0..3)
+#if CONFIG_NEWBESTREFMV
+  int_mv ref_mv[4];
+#endif
 
 #if CONFIG_HYBRIDTRANSFORM
   int q_index;
diff --git a/vp8/common/findnearmv.c b/vp8/common/findnearmv.c
index d35e2c4..303893d 100644
--- a/vp8/common/findnearmv.c
+++ b/vp8/common/findnearmv.c
@@ -10,6 +10,7 @@
 
 
 #include "findnearmv.h"
+#include <limits.h>
 
 const unsigned char vp8_mbsplit_offset[4][16] = {
   { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
@@ -18,6 +19,15 @@
   { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
 };
 
+static void lower_mv_precision(int_mv *mv)  /* make row/col even, toward 0 */
+{
+  if (mv->as_mv.row & 1)  /* odd row: move one unit toward zero */
+    mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1);
+  if (mv->as_mv.col & 1)  /* odd col: move one unit toward zero */
+    mv->as_mv.col += (mv->as_mv.col > 0 ? -1 : 1);
+}
+
+
 /* Predict motion vectors using those from already-decoded nearby blocks.
    Note that we only consider one 4x4 subblock from each candidate 16x16
    macroblock.   */
@@ -32,8 +42,7 @@
   int_mv *best_mv,
   int cnt[4],
   int refframe,
-  int *ref_frame_sign_bias
-) {
+  int *ref_frame_sign_bias) {
   const MODE_INFO *above = here - xd->mode_info_stride;
   const MODE_INFO *left = here - 1;
   const MODE_INFO *aboveleft = above - 1;
@@ -43,16 +52,30 @@
   int             *cntx = cnt;
   enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
 
+#if CONFIG_NEWBESTREFMV
+  int_mv          *ref_mv = xd->ref_mv;
+#endif
+
   /* Zero accumulators */
   mv[0].as_int = mv[1].as_int = mv[2].as_int = 0;
   cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
+#if CONFIG_NEWBESTREFMV
+  ref_mv[0].as_int = ref_mv[1].as_int
+                   = ref_mv[2].as_int
+                   = ref_mv[3].as_int
+                   = 0;
+#endif
 
   /* Process above */
   if (above->mbmi.ref_frame != INTRA_FRAME) {
     if (above->mbmi.mv.as_int) {
-      (++mv)->as_int = above->mbmi.mv.as_int;
+      ++ mv;
+      mv->as_int = above->mbmi.mv.as_int;
       mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame],
               refframe, mv, ref_frame_sign_bias);
+#if CONFIG_NEWBESTREFMV
+      ref_mv[0].as_int = mv->as_int;
+#endif
       ++cntx;
     }
     *cntx += 2;
@@ -65,10 +88,13 @@
       this_mv.as_int = left->mbmi.mv.as_int;
       mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame],
               refframe, &this_mv, ref_frame_sign_bias);
-
+#if CONFIG_NEWBESTREFMV
+      ref_mv[1].as_int = this_mv.as_int;
+#endif
       if (this_mv.as_int != mv->as_int) {
-        (++mv)->as_int = this_mv.as_int;
-        ++cntx;
+        ++ mv;
+        mv->as_int = this_mv.as_int;
+        ++ cntx;
       }
       *cntx += 2;
     } else
@@ -79,9 +105,21 @@
       (lf_here->mbmi.ref_frame == LAST_FRAME && refframe == LAST_FRAME)) {
     if (aboveleft->mbmi.mv.as_int) {
       third = aboveleft;
+#if CONFIG_NEWBESTREFMV
+      ref_mv[2].as_int = aboveleft->mbmi.mv.as_int;
+      mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame],
+              refframe, (ref_mv+2), ref_frame_sign_bias);
+#endif
     } else if (lf_here->mbmi.mv.as_int) {
       third = lf_here;
     }
+#if CONFIG_NEWBESTREFMV
+    if (lf_here->mbmi.mv.as_int) {
+      ref_mv[3].as_int = lf_here->mbmi.mv.as_int;
+      mv_bias(ref_frame_sign_bias[lf_here->mbmi.ref_frame],
+              refframe, (ref_mv+3), ref_frame_sign_bias);
+    }
+#endif
     if (third) {
       int_mv this_mv;
       this_mv.as_int = third->mbmi.mv.as_int;
@@ -89,8 +127,9 @@
               refframe, &this_mv, ref_frame_sign_bias);
 
       if (this_mv.as_int != mv->as_int) {
-        (++mv)->as_int = this_mv.as_int;
-        ++cntx;
+        ++ mv;
+        mv->as_int = this_mv.as_int;
+        ++ cntx;
       }
       *cntx += 1;
     } else
@@ -134,18 +173,9 @@
    * is not being used, by truncating the last bit towards 0
    */
   if (!xd->allow_high_precision_mv) {
-    if (best_mv->as_mv.row & 1)
-      best_mv->as_mv.row += (best_mv->as_mv.row > 0 ? -1 : 1);
-    if (best_mv->as_mv.col & 1)
-      best_mv->as_mv.col += (best_mv->as_mv.col > 0 ? -1 : 1);
-    if (nearest->as_mv.row & 1)
-      nearest->as_mv.row += (nearest->as_mv.row > 0 ? -1 : 1);
-    if (nearest->as_mv.col & 1)
-      nearest->as_mv.col += (nearest->as_mv.col > 0 ? -1 : 1);
-    if (nearby->as_mv.row & 1)
-      nearby->as_mv.row += (nearby->as_mv.row > 0 ? -1 : 1);
-    if (nearby->as_mv.col & 1)
-      nearby->as_mv.col += (nearby->as_mv.col > 0 ? -1 : 1);
+    lower_mv_precision(best_mv);
+    lower_mv_precision(nearest);
+    lower_mv_precision(nearby);
   }
 
   // TODO: move clamp outside findnearmv
@@ -163,3 +193,72 @@
   p[3] = pc->fc.vp8_mode_contexts [near_mv_ref_ct[3]] [3];
   return p;
 }
+
+#if CONFIG_NEWBESTREFMV
+/* Select the reference MV for the current macroblock: each non-zero candidate
+ * in xd->ref_mv[0..3] is scored by the SAD between the pixels bordering the
+ * block above and on the left in xd->dst and the same border displaced by the
+ * candidate in the reference buffer; the zero MV is the baseline. */
+void vp8_find_best_ref_mvs(MACROBLOCKD *xd,
+                           unsigned char *ref_y_buffer,
+                           int ref_y_stride,
+                           int_mv *best_mv){
+  int_mv *ref_mv = xd->ref_mv;
+  int bestsad = INT_MAX;
+  int i;
+  unsigned char *above_src;
+  unsigned char *left_src;
+  unsigned char *above_ref;
+  unsigned char *left_ref;
+  int sad;
+  /* Start 2 rows above / 2 pixels left of the block in both buffers. */
+  above_src = xd->dst.y_buffer - xd->dst.y_stride * 2;
+  left_src  = xd->dst.y_buffer - 2;
+  above_ref = ref_y_buffer - ref_y_stride * 2;
+  left_ref  = ref_y_buffer - 2;
+  /* Baseline score: zero displacement, i.e. the zero motion vector. */
+  bestsad = vp8_sad16x2_c(above_src, xd->dst.y_stride,
+                          above_ref, ref_y_stride,
+                          INT_MAX);
+  bestsad += vp8_sad2x16_c(left_src, xd->dst.y_stride,
+                           left_ref, ref_y_stride,
+                           INT_MAX);
+  best_mv->as_int = 0;
+  /* Any candidate with a strictly lower SAD replaces the baseline. */
+  for(i = 0; i < 4; ++i) {
+    if (ref_mv[i].as_int) {
+      int_mv this_mv;
+      int offset=0;
+      int row_offset, col_offset;
+      this_mv.as_int = ref_mv[i].as_int;
+      vp8_clamp_mv(&this_mv,  /* top/left limits tightened by 16 units */
+                   xd->mb_to_left_edge - LEFT_TOP_MARGIN + 16,
+                   xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+                   xd->mb_to_top_edge - LEFT_TOP_MARGIN + 16,
+                   xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+      /* Whole-pixel displacement: MV >> 3 after adding a rounding bias. */
+      row_offset = (this_mv.as_mv.row > 0) ?
+        ((this_mv.as_mv.row + 3) >> 3):((this_mv.as_mv.row + 4) >> 3);
+      col_offset = (this_mv.as_mv.col > 0) ?
+        ((this_mv.as_mv.col + 3) >> 3):((this_mv.as_mv.col + 4) >> 3);
+      offset = ref_y_stride * row_offset + col_offset;
+
+      sad = vp8_sad16x2_c(above_src, xd->dst.y_stride,
+                          above_ref + offset, ref_y_stride, INT_MAX);
+
+      sad += vp8_sad2x16_c(left_src, xd->dst.y_stride,
+                           left_ref + offset, ref_y_stride, INT_MAX);
+
+      if (sad < bestsad) {
+        bestsad = sad;
+        best_mv->as_int = this_mv.as_int;
+      }
+    }
+  }
+  if (!xd->allow_high_precision_mv)  /* drop the lowest MV bit */
+    lower_mv_precision(best_mv);
+
+  vp8_clamp_mv2(best_mv, xd);
+}
+
+#endif
diff --git a/vp8/common/findnearmv.h b/vp8/common/findnearmv.h
index d4769e6..3bb2024 100644
--- a/vp8/common/findnearmv.h
+++ b/vp8/common/findnearmv.h
@@ -33,20 +33,14 @@
 
 #define LEFT_TOP_MARGIN (16 << 3)
 #define RIGHT_BOTTOM_MARGIN (16 << 3)
-static void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) {
-  if (mv->as_mv.col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
-    mv->as_mv.col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
-  else if (mv->as_mv.col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
-    mv->as_mv.col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
 
-  if (mv->as_mv.row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
-    mv->as_mv.row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
-  else if (mv->as_mv.row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
-    mv->as_mv.row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
-}
 
-static void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge, int mb_to_right_edge,
-                         int mb_to_top_edge, int mb_to_bottom_edge) {
+
+static void vp8_clamp_mv(int_mv *mv,
+                         int mb_to_left_edge,
+                         int mb_to_right_edge,
+                         int mb_to_top_edge,
+                         int mb_to_bottom_edge) {
   mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
                   mb_to_left_edge : mv->as_mv.col;
   mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ?
@@ -56,15 +50,26 @@
   mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
                   mb_to_bottom_edge : mv->as_mv.row;
 }
-static unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge,
-                                        int mb_to_right_edge, int mb_to_top_edge,
+
+static void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) {
+  vp8_clamp_mv(mv,  /* limits: xd's mb_to_*_edge widened by the margins */
+              xd->mb_to_left_edge - LEFT_TOP_MARGIN,
+              xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+              xd->mb_to_top_edge - LEFT_TOP_MARGIN,
+              xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+}
+
+
+
+static unsigned int vp8_check_mv_bounds(int_mv *mv,
+                                        int mb_to_left_edge,
+                                        int mb_to_right_edge,
+                                        int mb_to_top_edge,
                                         int mb_to_bottom_edge) {
-  unsigned int need_to_clamp;
-  need_to_clamp = (mv->as_mv.col < mb_to_left_edge) ? 1 : 0;
-  need_to_clamp |= (mv->as_mv.col > mb_to_right_edge) ? 1 : 0;
-  need_to_clamp |= (mv->as_mv.row < mb_to_top_edge) ? 1 : 0;
-  need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge) ? 1 : 0;
-  return need_to_clamp;
+  return (mv->as_mv.col < mb_to_left_edge) ||
+         (mv->as_mv.col > mb_to_right_edge) ||
+         (mv->as_mv.row < mb_to_top_edge) ||
+         (mv->as_mv.row > mb_to_bottom_edge);
 }
 
 void vp8_find_near_mvs
diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c
index 5a11f64..2e0049d 100644
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c
@@ -619,10 +619,44 @@
     int_mv nearest_second, nearby_second, best_mv_second;
     vp8_prob mv_ref_p [VP8_MVREFS - 1];
 
+#if CONFIG_NEWBESTREFMV
+    int recon_y_stride, recon_yoffset;
+    int recon_uv_stride, recon_uvoffset;
+#endif
+
     vp8_find_near_mvs(xd, mi,
                       prev_mi,
                       &nearest, &nearby, &best_mv, rct,
-                      mbmi->ref_frame, pbi->common.ref_frame_sign_bias);
+                      mbmi->ref_frame, cm->ref_frame_sign_bias);
+#if CONFIG_NEWBESTREFMV
+    {
+      int ref_fb_idx;
+
+      /* Select the appropriate reference frame for this MB */
+      if (mbmi->ref_frame == LAST_FRAME)
+        ref_fb_idx = cm->lst_fb_idx;
+      else if (mbmi->ref_frame == GOLDEN_FRAME)
+        ref_fb_idx = cm->gld_fb_idx;
+      else
+        ref_fb_idx = cm->alt_fb_idx;
+
+      recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride  ;
+      recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+
+      recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+      recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
+
+      xd->pre.y_buffer = cm->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+      xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+      xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+      vp8_find_best_ref_mvs(xd,
+                            xd->pre.y_buffer,
+                            recon_y_stride,
+                            &best_mv);
+    }
+#endif
+
     vp8_mv_ref_probs(&pbi->common, mv_ref_p, rct);
 
     // Is the segment level mode feature enabled for this segment
@@ -672,11 +706,41 @@
       mbmi->second_ref_frame = mbmi->ref_frame + 1;
       if (mbmi->second_ref_frame == 4)
         mbmi->second_ref_frame = 1;
+#if CONFIG_NEWBESTREFMV
+      if (mbmi->second_ref_frame) {
+        int second_ref_fb_idx;
+        /* Select the appropriate reference frame for this MB */
+        if (mbmi->second_ref_frame == LAST_FRAME)
+          second_ref_fb_idx = cm->lst_fb_idx;
+        else if (mbmi->second_ref_frame ==
+          GOLDEN_FRAME)
+          second_ref_fb_idx = cm->gld_fb_idx;
+        else
+          second_ref_fb_idx = cm->alt_fb_idx;
 
-      vp8_find_near_mvs(xd, mi,
-                        prev_mi,
-                        &nearest_second, &nearby_second, &best_mv_second, rct,
-                        mbmi->second_ref_frame, pbi->common.ref_frame_sign_bias);
+        xd->second_pre.y_buffer =
+          cm->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
+        xd->second_pre.u_buffer =
+          cm->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
+        xd->second_pre.v_buffer =
+          cm->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
+        vp8_find_near_mvs(xd, mi, prev_mi,
+                          &nearest_second, &nearby_second, &best_mv_second,
+                          rct,
+                          mbmi->second_ref_frame,
+                          cm->ref_frame_sign_bias);
+        vp8_find_best_ref_mvs(xd,
+                              xd->second_pre.y_buffer,
+                              recon_y_stride,
+                              &best_mv_second);
+      }
+#else
+      vp8_find_near_mvs(xd, mi, prev_mi,
+                        &nearest_second, &nearby_second, &best_mv_second,
+                        rct,
+                        mbmi->second_ref_frame,
+                        pbi->common.ref_frame_sign_bias);
+#endif
     } else {
       mbmi->second_ref_frame = 0;
     }
@@ -941,7 +1005,7 @@
       }
     } else {
       mbmi->uv_mode = (MB_PREDICTION_MODE)vp8_read_uv_mode(
-		      bc, pbi->common.fc.uv_mode_prob[mbmi->mode]);
+        bc, pbi->common.fc.uv_mode_prob[mbmi->mode]);
       pbi->common.fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
     }
 
@@ -1021,7 +1085,7 @@
 void vpx_decode_mode_mvs_init(VP8D_COMP *pbi){
   VP8_COMMON *cm = &pbi->common;
   mb_mode_mv_init(pbi);
-  if (cm->frame_type == KEY_FRAME &&!cm->kf_ymode_probs_update)
+  if (cm->frame_type == KEY_FRAME && !cm->kf_ymode_probs_update)
     cm->kf_ymode_probs_index = vp8_read_literal(&pbi->bc, 3);
 }
 void vpx_decode_mb_mode_mv(VP8D_COMP *pbi,
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 59f453e..ffa7f0c 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -631,10 +631,6 @@
       xd->up_available = (mb_row != 0);
       xd->left_available = (mb_col != 0);
 
-      if(pbi->interleaved_decoding)
-        vpx_decode_mb_mode_mv(pbi, xd, mb_row, mb_col);
-
-      update_blockd_bmi(xd);
 
       recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
       recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
@@ -643,6 +639,11 @@
       xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
       xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
 
+      if(pbi->interleaved_decoding)
+        vpx_decode_mb_mode_mv(pbi, xd, mb_row, mb_col);
+
+      update_blockd_bmi(xd);
+
       /* Select the appropriate reference frame for this MB */
       if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
         ref_fb_idx = pc->lst_fb_idx;
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index f7d93b2..f9195b6 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -149,7 +149,7 @@
 
   pbi->decoded_key_frame = 0;
 
-  pbi->interleaved_decoding = 1;
+  pbi->interleaved_decoding = CONFIG_NEWBESTREFMV;
 
   return (VP8D_PTR) pbi;
 }
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 574427c..103391c 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -933,6 +933,9 @@
 
             vp8_find_near_mvs(xd, m, prev_m, &n1, &n2, &best_mv, ct,
                               rf, cpi->common.ref_frame_sign_bias);
+#if CONFIG_NEWBESTREFMV
+            best_mv.as_int = mi->ref_mv.as_int;
+#endif
             vp8_mv_ref_probs(&cpi->common, mv_ref_p, ct);
 
 #ifdef ENTROPY_STATS
@@ -983,7 +986,11 @@
             vp8_find_near_mvs(xd, m,
                               prev_m,
                               &n1, &n2, &best_second_mv, ct,
-                              mi->second_ref_frame, cpi->common.ref_frame_sign_bias);
+                              mi->second_ref_frame,
+                              cpi->common.ref_frame_sign_bias);
+#if CONFIG_NEWBESTREFMV
+            best_second_mv.as_int = mi->second_ref_mv.as_int;
+#endif
           }
 
           // does the feature use compound prediction or not
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index e1d0bf4..460c160 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -47,7 +47,8 @@
 int mb_row_debug, mb_col_debug;
 #endif
 
-extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t);
+extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x,
+                         TOKENEXTRA **t, int dry_run);
 
 extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex);
 extern void vp8_auto_select_speed(VP8_COMP *cpi);
@@ -1467,8 +1468,12 @@
   if (output_enabled) {
     // Tokenize
     sum_intra_stats(cpi, x);
-    vp8_tokenize_mb(cpi, &x->e_mbd, t);
+    vp8_tokenize_mb(cpi, &x->e_mbd, t, 0);
   }
+#if CONFIG_NEWBESTREFMV
+  else
+    vp8_tokenize_mb(cpi, &x->e_mbd, t, 1);
+#endif
 }
 #ifdef SPEEDSTATS
 extern int cnt_pm;
@@ -1624,8 +1629,9 @@
       fflush(stdout);
     }
 #endif
-    if (output_enabled)
-      vp8_tokenize_mb(cpi, xd, t);
+
+    vp8_tokenize_mb(cpi, xd, t, !output_enabled);
+
 #ifdef ENC_DEBUG
     if (enc_debug) {
       printf("Tokenized\n");
@@ -1640,12 +1646,14 @@
       0;
     if (cpi->common.mb_no_coeff_skip) {
       xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-      cpi->skip_true_count[mb_skip_context]++;
+      if (output_enabled)
+        cpi->skip_true_count[mb_skip_context]++;
       vp8_fix_contexts(xd);
     } else {
-      vp8_stuff_mb(cpi, xd, t);
+      vp8_stuff_mb(cpi, xd, t, !output_enabled);
       xd->mode_info_context->mbmi.mb_skip_coeff = 0;
-      cpi->skip_false_count[mb_skip_context]++;
+      if (output_enabled)
+        cpi->skip_false_count[mb_skip_context]++;
     }
   }
 }
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 6a0a902..6b6167b 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -761,7 +761,7 @@
 
 int rd_cost_intra_mb(MACROBLOCKD *x);
 
-void vp8_tokenize_mb(VP8_COMP *, MACROBLOCKD *, TOKENEXTRA **);
+void vp8_tokenize_mb(VP8_COMP *, MACROBLOCKD *, TOKENEXTRA **, int dry_run);
 
 void vp8_set_speed_features(VP8_COMP *cpi);
 
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 67bf33d..df76fc3 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -2730,6 +2730,10 @@
 #if CONFIG_PRED_FILTER
   int best_filter_state;
 #endif
+#if CONFIG_NEWBESTREFMV
+  int_mv ref_mv[MAX_REF_FRAMES] = {0};
+#endif
+
   // int all_rds[MAX_MODES];        // Experimental debug code.
   // int all_rates[MAX_MODES];
   // int all_dist[MAX_MODES];
@@ -2789,6 +2793,13 @@
     y_buffer[LAST_FRAME] = lst_yv12->y_buffer + recon_yoffset;
     u_buffer[LAST_FRAME] = lst_yv12->u_buffer + recon_uvoffset;
     v_buffer[LAST_FRAME] = lst_yv12->v_buffer + recon_uvoffset;
+#if CONFIG_NEWBESTREFMV
+    vp8_find_best_ref_mvs(&x->e_mbd,
+                          y_buffer[LAST_FRAME],
+                          lst_yv12->y_stride,
+                          &frame_best_ref_mv[LAST_FRAME]);
+    ref_mv[LAST_FRAME].as_int = frame_best_ref_mv[LAST_FRAME].as_int;
+#endif
   }
 
   if (cpi->ref_frame_flags & VP8_GOLD_FLAG) {
@@ -2802,6 +2813,13 @@
     y_buffer[GOLDEN_FRAME] = gld_yv12->y_buffer + recon_yoffset;
     u_buffer[GOLDEN_FRAME] = gld_yv12->u_buffer + recon_uvoffset;
     v_buffer[GOLDEN_FRAME] = gld_yv12->v_buffer + recon_uvoffset;
+#if CONFIG_NEWBESTREFMV
+    vp8_find_best_ref_mvs(&x->e_mbd,
+                          y_buffer[GOLDEN_FRAME],
+                          gld_yv12->y_stride,
+                          &frame_best_ref_mv[GOLDEN_FRAME]);
+    ref_mv[GOLDEN_FRAME].as_int = frame_best_ref_mv[GOLDEN_FRAME].as_int;
+#endif
   }
 
   if (cpi->ref_frame_flags & VP8_ALT_FLAG) {
@@ -2815,6 +2833,13 @@
     y_buffer[ALTREF_FRAME] = alt_yv12->y_buffer + recon_yoffset;
     u_buffer[ALTREF_FRAME] = alt_yv12->u_buffer + recon_uvoffset;
     v_buffer[ALTREF_FRAME] = alt_yv12->v_buffer + recon_uvoffset;
+#if CONFIG_NEWBESTREFMV
+    vp8_find_best_ref_mvs(&x->e_mbd,
+                          y_buffer[ALTREF_FRAME],
+                          alt_yv12->y_stride,
+                          &frame_best_ref_mv[ALTREF_FRAME]);
+    ref_mv[ALTREF_FRAME].as_int = frame_best_ref_mv[ALTREF_FRAME].as_int;
+#endif
   }
 
   *returnintra = INT64_MAX;
@@ -2872,6 +2897,12 @@
       vp8_mode_order[mode_index].ref_frame;
     xd->mode_info_context->mbmi.second_ref_frame =
       vp8_mode_order[mode_index].second_ref_frame;
+#if CONFIG_NEWBESTREFMV
+    x->e_mbd.mode_info_context->mbmi.ref_mv =
+      ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
+    x->e_mbd.mode_info_context->mbmi.second_ref_mv =
+      ref_mv[x->e_mbd.mode_info_context->mbmi.second_ref_frame];
+#endif
 #if CONFIG_PRED_FILTER
     xd->mode_info_context->mbmi.pred_filter_enabled = 0;
 #endif
@@ -3851,8 +3882,14 @@
   }
 
   // macroblock modes
-  vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
-
+  vpx_memcpy(&x->e_mbd.mode_info_context->mbmi,
+             &best_mbmode, sizeof(MB_MODE_INFO));
+#if CONFIG_NEWBESTREFMV
+  x->e_mbd.mode_info_context->mbmi.ref_mv =
+    ref_mv[best_mbmode.ref_frame];
+  x->e_mbd.mode_info_context->mbmi.second_ref_mv =
+    ref_mv[best_mbmode.second_ref_frame];
+#endif
   if (best_mbmode.mode == B_PRED) {
     for (i = 0; i < 16; i++) {
       xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
diff --git a/vp8/encoder/sad_c.c b/vp8/encoder/sad_c.c
index 5ce13ec..78a87f3 100644
--- a/vp8/encoder/sad_c.c
+++ b/vp8/encoder/sad_c.c
@@ -104,7 +104,24 @@
 
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
 }
-
+#if CONFIG_NEWBESTREFMV
+unsigned int vp8_sad2x16_c(  /* SAD of src vs. ref over a 2x16 region */
+  const unsigned char *src_ptr,
+  int  src_stride,
+  const unsigned char *ref_ptr,
+  int  ref_stride,
+  int max_sad){  /* max_sad is not referenced in this C version */
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 2, 16);
+}
+unsigned int vp8_sad16x2_c(  /* SAD of src vs. ref over a 16x2 region */
+  const unsigned char *src_ptr,
+  int  src_stride,
+  const unsigned char *ref_ptr,
+  int  ref_stride,
+  int max_sad){  /* max_sad is not referenced in this C version */
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 2);
+}
+#endif
 void vp8_sad16x16x3_c(
   const unsigned char *src_ptr,
   int  src_stride,
diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c
index 105aa6a..a8b6436 100644
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@@ -38,10 +38,13 @@
                     [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
 #endif
 #endif
-void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t);
-void vp8_stuff_mb_8x8(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t);
+void vp8_stuff_mb(VP8_COMP *cpi,
+                  MACROBLOCKD *x, TOKENEXTRA **t, int dry_run);
+void vp8_stuff_mb_8x8(VP8_COMP *cpi,
+                      MACROBLOCKD *x, TOKENEXTRA **t, int dry_run);
 #if CONFIG_TX16X16
-void vp8_stuff_mb_16x16(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t);
+void vp8_stuff_mb_16x16(VP8_COMP *cpi, MACROBLOCKD *x,
+                        TOKENEXTRA **t, int dry_run);
 #endif
 void vp8_fix_contexts(MACROBLOCKD *x);
 
@@ -110,9 +113,15 @@
 }
 
 #if CONFIG_TX16X16
-static void tokenize1st_order_b_16x16(MACROBLOCKD *xd, const BLOCKD *const b, TOKENEXTRA **tp,
-                                      const int type, const FRAME_TYPE frametype, ENTROPY_CONTEXT *a,
-                                      ENTROPY_CONTEXT *l, VP8_COMP *cpi) {
+static void tokenize1st_order_b_16x16(MACROBLOCKD *xd,
+                                      const BLOCKD *const b,
+                                      TOKENEXTRA **tp,
+                                      const int type,
+                                      const FRAME_TYPE frametype,
+                                      ENTROPY_CONTEXT *a,
+                                      ENTROPY_CONTEXT *l,
+                                      VP8_COMP *cpi,
+                                      int dry_run) {
   int pt; /* near block/prev token context index */
   int c = 0;                  /* start at DC unless type 0 */
   const int eob = b->eob;     /* one beyond last nonzero coeff */
@@ -147,8 +156,8 @@
     t->context_tree = cpi->common.fc.coef_probs_16x16[type][band][pt];
 
     t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));
-
-    ++cpi->coef_counts_16x16[type][band][pt][x];
+    if (!dry_run)
+      ++cpi->coef_counts_16x16[type][band][pt][x];
   } while (pt = vp8_prev_token_class[x], ++t, c < eob  &&  ++c < seg_eob);
 
   *tp = t;
@@ -166,8 +175,8 @@
   const FRAME_TYPE frametype,
   ENTROPY_CONTEXT *a,
   ENTROPY_CONTEXT *l,
-  VP8_COMP *cpi
-) {
+  VP8_COMP *cpi,
+  int dry_run) {
   int pt; /* near block/prev token context index */
   int c = 0;          /* start at DC */
   const int eob = b->eob;     /* one beyond last nonzero coeff */
@@ -213,8 +222,8 @@
              x, vp8_coef_encodings[x].Len, t->skip_eob_node, eob, c, band, type,
              cpi->count, mb_row_debug, mb_col_debug);
 #endif
-
-    ++cpi->coef_counts_8x8       [type] [band] [pt] [x];
+    if (!dry_run)
+      ++cpi->coef_counts_8x8       [type] [band] [pt] [x];
   } while (pt = vp8_prev_token_class[x], ++t, c < eob  &&  ++c < seg_eob);
 
 
@@ -224,8 +233,10 @@
 
 }
 
-static void tokenize2nd_order_b(MACROBLOCKD *xd, TOKENEXTRA **tp,
-                                VP8_COMP *cpi) {
+static void tokenize2nd_order_b(MACROBLOCKD *xd,
+                                TOKENEXTRA **tp,
+                                VP8_COMP *cpi,
+                                int dry_run) {
   int pt;             /* near block/prev token context index */
   int c;              /* start at DC */
   TOKENEXTRA *t = *tp;/* store tokens starting here */
@@ -261,7 +272,8 @@
 
     t->skip_eob_node = ((pt == 0) && (band > 0));
 
-    ++cpi->coef_counts       [1] [band] [pt] [token];
+    if (!dry_run)
+      ++cpi->coef_counts       [1] [band] [pt] [token];
 
     pt = vp8_prev_token_class[token];
     t++;
@@ -274,7 +286,8 @@
 
     t->skip_eob_node = ((pt == 0) && (band > 0));
 
-    ++cpi->coef_counts       [1] [band] [pt] [DCT_EOB_TOKEN];
+    if (!dry_run)
+      ++cpi->coef_counts       [1] [band] [pt] [DCT_EOB_TOKEN];
 
     t++;
   }
@@ -295,8 +308,8 @@
   const FRAME_TYPE frametype,
   ENTROPY_CONTEXT *a,
   ENTROPY_CONTEXT *l,
-  VP8_COMP *cpi
-) {
+  VP8_COMP *cpi,
+  int dry_run) {
   int pt; /* near block/prev token context index */
   int c = type ? 0 : 1;       /* start at DC unless type 0 */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
@@ -324,7 +337,9 @@
     t->context_tree = cpi->common.fc.coef_probs_8x8[type][band][pt];
 
     t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));
-    ++cpi->coef_counts_8x8[type][band][pt][x];
+
+    if (!dry_run)
+      ++cpi->coef_counts_8x8[type][band][pt][x];
 
     pt = vp8_prev_token_class[x];
     ++t;
@@ -337,7 +352,8 @@
     t->context_tree = cpi->common.fc.coef_probs_8x8 [type] [band] [pt];
     t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));
 
-    ++cpi->coef_counts_8x8[type][band][pt][DCT_EOB_TOKEN];
+    if (!dry_run)
+      ++cpi->coef_counts_8x8[type][band][pt][DCT_EOB_TOKEN];
     ++t;
   }
 
@@ -350,7 +366,8 @@
 static void tokenize1st_order_ht(   MACROBLOCKD *xd,
                                     TOKENEXTRA **tp,
                                     int type,
-                                    VP8_COMP    *cpi) {
+                                    VP8_COMP *cpi,
+                                    int dry_run) {
   unsigned int block;
   const BLOCKD *b;
   int pt;             /* near block/prev token context index */
@@ -428,7 +445,8 @@
       t->skip_eob_node = pt == 0 &&
           ((band > 0 && type > 0) || (band > 1 && type == 0));
 
-      ++cpi->coef_counts       [type] [band] [pt] [token];
+      if (!dry_run)
+        ++cpi->coef_counts       [type] [band] [pt] [token];
 
       pt = vp8_prev_token_class[token];
       t++;
@@ -441,8 +459,8 @@
 
       t->skip_eob_node = pt == 0 &&
           ((band > 0 && type > 0) || (band > 1 && type == 0));
-
-      ++cpi->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];
+      if (!dry_run)
+        ++cpi->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];
 
       t++;
     }
@@ -478,7 +496,8 @@
 
       t->skip_eob_node = ((pt == 0) && (band > 0));
 
-      ++cpi->coef_counts       [2] [band] [pt] [token];
+      if (!dry_run)
+        ++cpi->coef_counts       [2] [band] [pt] [token];
 
       pt = vp8_prev_token_class[token];
       t++;
@@ -490,9 +509,8 @@
       t->context_tree = cpi->common.fc.coef_probs [2] [band] [pt];
 
       t->skip_eob_node = ((pt == 0) && (band > 0));
-
-      ++cpi->coef_counts       [2] [band] [pt] [DCT_EOB_TOKEN];
-
+      if (!dry_run)
+        ++cpi->coef_counts       [2] [band] [pt] [DCT_EOB_TOKEN];
       t++;
     }
 
@@ -510,8 +528,8 @@
   MACROBLOCKD *xd,
   TOKENEXTRA **tp,
   int type,           /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
-  VP8_COMP *cpi
-) {
+  VP8_COMP *cpi,
+  int dry_run) {
   unsigned int block;
   const BLOCKD *b;
   int pt;             /* near block/prev token context index */
@@ -557,7 +575,8 @@
 
       t->skip_eob_node = ((pt == 0) && (band > 0));
 
-      ++cpi->coef_counts       [2] [band] [pt] [token];
+      if (!dry_run)
+        ++cpi->coef_counts       [2] [band] [pt] [token];
 
       pt = vp8_prev_token_class[token];
       t++;
@@ -570,7 +589,8 @@
 
       t->skip_eob_node = ((pt == 0) && (band > 0));
 
-      ++cpi->coef_counts       [2] [band] [pt] [DCT_EOB_TOKEN];
+      if (!dry_run)
+        ++cpi->coef_counts       [2] [band] [pt] [DCT_EOB_TOKEN];
 
       t++;
     }
@@ -586,8 +606,8 @@
   MACROBLOCKD *xd,
   TOKENEXTRA **tp,
   int type,           /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
-  VP8_COMP *cpi
-) {
+  VP8_COMP *cpi,
+  int dry_run) {
   unsigned int block;
   const BLOCKD *b;
   int pt;             /* near block/prev token context index */
@@ -626,8 +646,8 @@
 
       t->skip_eob_node = pt == 0 &&
                          ((band > 0 && type > 0) || (band > 1 && type == 0));
-
-      ++cpi->coef_counts       [type] [band] [pt] [token];
+      if (!dry_run)
+        ++cpi->coef_counts       [type] [band] [pt] [token];
 
       pt = vp8_prev_token_class[token];
       t++;
@@ -640,8 +660,8 @@
 
       t->skip_eob_node = pt == 0 &&
                          ((band > 0 && type > 0) || (band > 1 && type == 0));
-
-      ++cpi->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];
+      if (!dry_run)
+        ++cpi->coef_counts       [type] [band] [pt] [DCT_EOB_TOKEN];
 
       t++;
     }
@@ -671,7 +691,8 @@
 
       t->skip_eob_node = ((pt == 0) && (band > 0));
 
-      ++cpi->coef_counts       [2] [band] [pt] [token];
+      if (!dry_run)
+        ++cpi->coef_counts       [2] [band] [pt] [token];
 
       pt = vp8_prev_token_class[token];
       t++;
@@ -684,7 +705,8 @@
 
       t->skip_eob_node = ((pt == 0) && (band > 0));
 
-      ++cpi->coef_counts       [2] [band] [pt] [DCT_EOB_TOKEN];
+      if (!dry_run)
+        ++cpi->coef_counts       [2] [band] [pt] [DCT_EOB_TOKEN];
 
       t++;
     }
@@ -757,12 +779,16 @@
 }
 #endif
 
-void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
+void vp8_tokenize_mb(VP8_COMP *cpi,
+                     MACROBLOCKD *x,
+                     TOKENEXTRA **t,
+                     int dry_run) {
   int plane_type;
   int has_y2_block;
   int b;
   int tx_type = x->mode_info_context->mbmi.txfm_size;
   int mb_skip_context = get_pred_context(&cpi->common, x, PRED_MBSKIP);
+  TOKENEXTRA *t_backup = *t;
 
   // If the MB is going to be skipped because of a segment level flag
   // exclude this from the skip count stats used to calculate the
@@ -804,25 +830,28 @@
   }
 
   if (x->mode_info_context->mbmi.mb_skip_coeff) {
-    cpi->skip_true_count[mb_skip_context] += skip_inc;
+    if (!dry_run)
+      cpi->skip_true_count[mb_skip_context] += skip_inc;
     if (!cpi->common.mb_no_coeff_skip) {
 #if CONFIG_TX16X16
       if (tx_type == TX_16X16)
-        vp8_stuff_mb_16x16(cpi, x, t);
+        vp8_stuff_mb_16x16(cpi, x, t, dry_run);
       else
 #endif
       if (tx_type == TX_8X8)
-        vp8_stuff_mb_8x8(cpi, x, t);
+        vp8_stuff_mb_8x8(cpi, x, t, dry_run);
       else
-        vp8_stuff_mb(cpi, x, t);
+        vp8_stuff_mb(cpi, x, t, dry_run);
     } else {
       vp8_fix_contexts(x);
     }
-
+    if (dry_run)
+      *t = t_backup;
     return;
   }
 
-  cpi->skip_false_count[mb_skip_context] += skip_inc;
+  if (!dry_run)
+    cpi->skip_false_count[mb_skip_context] += skip_inc;
 
   plane_type = 3;
   if (has_y2_block) {
@@ -832,9 +861,10 @@
       tokenize2nd_order_b_8x8(x,
                               x->block + 24, t, 1, x->frame_type,
                               A + vp8_block2above_8x8[24],
-                              L + vp8_block2left_8x8[24], cpi);
+                              L + vp8_block2left_8x8[24],
+                              cpi, dry_run);
     } else
-      tokenize2nd_order_b(x, t, cpi);
+      tokenize2nd_order_b(x, t, cpi, dry_run);
 
     plane_type = 0;
   }
@@ -843,14 +873,15 @@
   if (tx_type == TX_16X16) {
     ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)x->above_context;
     ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)x->left_context;
-    tokenize1st_order_b_16x16(x, x->block, t, 3, x->frame_type, A, L, cpi);
+    tokenize1st_order_b_16x16(x, x->block, t, 3,
+                              x->frame_type, A, L, cpi, dry_run);
     for (b = 1; b < 16; b++) {
       *(A + vp8_block2above[b]) = *(A);
       *(L + vp8_block2left[b] ) = *(L);
     }
     for (b = 16; b < 24; b += 4) {
       tokenize1st_order_b_8x8(x, x->block + b, t, 2, x->frame_type,
-          A + vp8_block2above_8x8[b], L + vp8_block2left_8x8[b], cpi);
+          A + vp8_block2above_8x8[b], L + vp8_block2left_8x8[b], cpi, dry_run);
       *(A + vp8_block2above_8x8[b]+1) = *(A + vp8_block2above_8x8[b]);
       *(L + vp8_block2left_8x8[b]+1 ) = *(L + vp8_block2left_8x8[b]);
     }
@@ -867,7 +898,7 @@
                               x->block + b, t, plane_type, x->frame_type,
                               A + vp8_block2above_8x8[b],
                               L + vp8_block2left_8x8[b],
-                              cpi);
+                              cpi, dry_run);
       *(A + vp8_block2above_8x8[b] + 1) = *(A + vp8_block2above_8x8[b]);
       *(L + vp8_block2left_8x8[b] + 1)  = *(L + vp8_block2left_8x8[b]);
     }
@@ -876,14 +907,14 @@
                               x->block + b, t, 2, x->frame_type,
                               A + vp8_block2above_8x8[b],
                               L + vp8_block2left_8x8[b],
-                              cpi);
+                              cpi, dry_run);
       *(A + vp8_block2above_8x8[b] + 1) = *(A + vp8_block2above_8x8[b]);
       *(L + vp8_block2left_8x8[b] + 1) = *(L + vp8_block2left_8x8[b]);
     }
   } else {
 #if CONFIG_HYBRIDTRANSFORM
     if(active_ht) {
-      tokenize1st_order_ht(x, t, plane_type, cpi);
+      tokenize1st_order_ht(x, t, plane_type, cpi, dry_run);
     } else {
 
 #if CONFIG_HYBRIDTRANSFORM8X8
@@ -896,23 +927,25 @@
                                   x->frame_type,
                                   A + vp8_block2above_8x8[b],
                                   L + vp8_block2left_8x8[b],
-                                  cpi);
+                                  cpi, dry_run);
           *(A + vp8_block2above_8x8[b] + 1) = *(A + vp8_block2above_8x8[b]);
           *(L + vp8_block2left_8x8[b] + 1)  = *(L + vp8_block2left_8x8[b]);
         }
-        tokenize1st_order_chroma(x, t, PLANE_TYPE_UV, cpi);
+        tokenize1st_order_chroma(x, t, PLANE_TYPE_UV, cpi, dry_run);
       } else {
-        tokenize1st_order_b(x, t, plane_type, cpi);
+        tokenize1st_order_b(x, t, plane_type, cpi, dry_run);
       }
 #else
-      tokenize1st_order_b(x, t, plane_type, cpi);
+      tokenize1st_order_b(x, t, plane_type, cpi, dry_run);
 #endif
 
     }
 #else
-    tokenize1st_order_b(x, t, plane_type, cpi);
+    tokenize1st_order_b(x, t, plane_type, cpi, dry_run);
 #endif
   }
+  if (dry_run)
+    *t = t_backup;
 }
 
 
@@ -1178,16 +1211,15 @@
 }
 
 
-static __inline void stuff2nd_order_b_8x8
-(
+static __inline void stuff2nd_order_b_8x8(
   const BLOCKD *const b,
   TOKENEXTRA **tp,
   const int type,     /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
   const FRAME_TYPE frametype,
   ENTROPY_CONTEXT *a,
   ENTROPY_CONTEXT *l,
-  VP8_COMP *cpi
-) {
+  VP8_COMP *cpi,
+  int dry_run) {
   int pt; /* near block/prev token context index */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
@@ -1202,7 +1234,8 @@
   ++t;
 
   *tp = t;
-  ++cpi->coef_counts_8x8       [1] [0] [pt] [DCT_EOB_TOKEN];
+  if (!dry_run)
+    ++cpi->coef_counts_8x8       [1] [0] [pt] [DCT_EOB_TOKEN];
   pt = 0;
   *a = *l = pt;
 
@@ -1216,8 +1249,8 @@
   const FRAME_TYPE frametype,
   ENTROPY_CONTEXT *a,
   ENTROPY_CONTEXT *l,
-  VP8_COMP *cpi
-) {
+  VP8_COMP *cpi,
+  int dry_run) {
   int pt; /* near block/prev token context index */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
@@ -1231,7 +1264,8 @@
   t->skip_eob_node = 0;
   ++t;
   *tp = t;
-  ++cpi->coef_counts_8x8       [0] [1] [pt] [DCT_EOB_TOKEN];
+  if (!dry_run)
+    ++cpi->coef_counts_8x8[0] [1] [pt] [DCT_EOB_TOKEN];
   pt = 0; /* 0 <-> all coeff data is zero */
   *a = *l = pt;
 
@@ -1247,8 +1281,8 @@
   const FRAME_TYPE frametype,
   ENTROPY_CONTEXT *a,
   ENTROPY_CONTEXT *l,
-  VP8_COMP *cpi
-) {
+  VP8_COMP *cpi,
+  int dry_run) {
   int pt; /* near block/prev token context index */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
@@ -1262,13 +1296,17 @@
   t->skip_eob_node = 0;
   ++t;
   *tp = t;
-  ++cpi->coef_counts_8x8[2] [0] [pt] [DCT_EOB_TOKEN];
+  if (!dry_run)
+    ++cpi->coef_counts_8x8[2] [0] [pt] [DCT_EOB_TOKEN];
   pt = 0; /* 0 <-> all coeff data is zero */
   *a = *l = pt;
 
 }
 
-void vp8_stuff_mb_8x8(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
+void vp8_stuff_mb_8x8(VP8_COMP *cpi,
+                      MACROBLOCKD *x,
+                      TOKENEXTRA **t,
+                      int dry_run) {
   ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
   ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)x->left_context;
   int plane_type;
@@ -1276,14 +1314,14 @@
 
   stuff2nd_order_b_8x8(x->block + 24, t, 1, x->frame_type,
                        A + vp8_block2above_8x8[24],
-                       L + vp8_block2left_8x8[24], cpi);
+                       L + vp8_block2left_8x8[24], cpi, dry_run);
   plane_type = 0;
 
   for (b = 0; b < 16; b += 4) {
     stuff1st_order_b_8x8(x->block + b, t, plane_type, x->frame_type,
                          A + vp8_block2above_8x8[b],
                          L + vp8_block2left_8x8[b],
-                         cpi);
+                         cpi, dry_run);
     *(A + vp8_block2above_8x8[b] + 1) = *(A + vp8_block2above_8x8[b]);
     *(L + vp8_block2left_8x8[b] + 1)  = *(L + vp8_block2left_8x8[b]);
   }
@@ -1292,7 +1330,7 @@
     stuff1st_order_buv_8x8(x->block + b, t, 2, x->frame_type,
                            A + vp8_block2above[b],
                            L + vp8_block2left[b],
-                           cpi);
+                           cpi, dry_run);
     *(A + vp8_block2above_8x8[b] + 1) = *(A + vp8_block2above_8x8[b]);
     *(L + vp8_block2left_8x8[b] + 1) = *(L + vp8_block2left_8x8[b]);
   }
@@ -1301,9 +1339,13 @@
 
 #if CONFIG_TX16X16
 static __inline
-void stuff1st_order_b_16x16(const BLOCKD *const b, TOKENEXTRA **tp, const FRAME_TYPE frametype,
-                            ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, VP8_COMP *cpi)
-{
+void stuff1st_order_b_16x16(const BLOCKD *const b,
+                            TOKENEXTRA **tp,
+                            const FRAME_TYPE frametype,
+                            ENTROPY_CONTEXT *a,
+                            ENTROPY_CONTEXT *l,
+                            VP8_COMP *cpi,
+                            int dry_run){
     int pt; /* near block/prev token context index */
     TOKENEXTRA *t = *tp;        /* store tokens starting here */
     VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
@@ -1315,17 +1357,21 @@
     t->skip_eob_node = 0;
     ++t;
     *tp = t;
-    ++cpi->coef_counts_16x16[3][1][pt][DCT_EOB_TOKEN];
+    if (!dry_run)
+      ++cpi->coef_counts_16x16[3][1][pt][DCT_EOB_TOKEN];
     pt = 0; /* 0 <-> all coeff data is zero */
     *a = *l = pt;
 }
 
-void vp8_stuff_mb_16x16(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
+void vp8_stuff_mb_16x16(VP8_COMP *cpi,
+                        MACROBLOCKD *x,
+                        TOKENEXTRA **t,
+                        int dry_run) {
   ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)x->above_context;
   ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)x->left_context;
   int b, i;
 
-  stuff1st_order_b_16x16(x->block, t, x->frame_type, A, L, cpi);
+  stuff1st_order_b_16x16(x->block, t, x->frame_type, A, L, cpi, dry_run);
   for (i = 1; i < 16; i++) {
     *(A + vp8_block2above[i]) = *(A);
     *(L +  vp8_block2left[i]) = *(L);
@@ -1334,7 +1380,7 @@
     stuff1st_order_buv_8x8(x->block + b, t, 2, x->frame_type,
         A + vp8_block2above[b],
         L + vp8_block2left[b],
-        cpi);
+        cpi, dry_run);
     *(A + vp8_block2above_8x8[b]+1) = *(A + vp8_block2above_8x8[b]);
     *(L + vp8_block2left_8x8[b]+1 ) = *(L + vp8_block2left_8x8[b]);
   }
@@ -1348,8 +1394,8 @@
   TOKENEXTRA **tp,
   ENTROPY_CONTEXT *a,
   ENTROPY_CONTEXT *l,
-  VP8_COMP *cpi
-) {
+  VP8_COMP *cpi,
+  int dry_run) {
   int pt; /* near block/prev token context index */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
@@ -1359,20 +1405,19 @@
   t->skip_eob_node = 0;
   ++t;
   *tp = t;
-  ++cpi->coef_counts       [1] [0] [pt] [DCT_EOB_TOKEN];
+  if (!dry_run)
+    ++cpi->coef_counts       [1] [0] [pt] [DCT_EOB_TOKEN];
 
   pt = 0;
   *a = *l = pt;
 
 }
 
-static __inline void stuff1st_order_b
-(
-  TOKENEXTRA **tp,
-  ENTROPY_CONTEXT *a,
-  ENTROPY_CONTEXT *l,
-  VP8_COMP *cpi
-) {
+static __inline void stuff1st_order_b(TOKENEXTRA **tp,
+                                      ENTROPY_CONTEXT *a,
+                                      ENTROPY_CONTEXT *l,
+                                      VP8_COMP *cpi,
+                                      int dry_run) {
   int pt; /* near block/prev token context index */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
@@ -1382,7 +1427,8 @@
   t->skip_eob_node = 0;
   ++t;
   *tp = t;
-  ++cpi->coef_counts       [0] [1] [pt] [DCT_EOB_TOKEN];
+  if (!dry_run)
+    ++cpi->coef_counts[0] [1] [pt] [DCT_EOB_TOKEN];
   pt = 0; /* 0 <-> all coeff data is zero */
   *a = *l = pt;
 
@@ -1393,8 +1439,8 @@
   TOKENEXTRA **tp,
   ENTROPY_CONTEXT *a,
   ENTROPY_CONTEXT *l,
-  VP8_COMP *cpi
-) {
+  VP8_COMP *cpi,
+  int dry_run) {
   int pt; /* near block/prev token context index */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
@@ -1404,31 +1450,39 @@
   t->skip_eob_node = 0;
   ++t;
   *tp = t;
-  ++cpi->coef_counts[2] [0] [pt] [DCT_EOB_TOKEN];
+  if (!dry_run)
+    ++cpi->coef_counts[2] [0] [pt] [DCT_EOB_TOKEN];
   pt = 0; /* 0 <-> all coeff data is zero */
   *a = *l = pt;
 }
 
-void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
+void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t, int dry_run) {
   ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
   ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)x->left_context;
   int plane_type;
   int b;
+  TOKENEXTRA *t_backup = *t;  /* entry value of *t, restored on dry run */
 
   stuff2nd_order_b(t,
-                   A + vp8_block2above[24], L + vp8_block2left[24], cpi);
+                   A + vp8_block2above[24],
+                   L + vp8_block2left[24],
+                   cpi, dry_run);
   plane_type = 0;
 
   for (b = 0; b < 16; b++)
     stuff1st_order_b(t,
                      A + vp8_block2above[b],
-                     L + vp8_block2left[b], cpi);
+                     L + vp8_block2left[b],
+                     cpi, dry_run);  /* dry_run suppresses coef_counts++ */
 
   for (b = 16; b < 24; b++)
     stuff1st_order_buv(t,
                        A + vp8_block2above[b],
-                       L + vp8_block2left[b], cpi);
+                       L + vp8_block2left[b],
+                       cpi, dry_run);
 
+  if (dry_run)
+    *t = t_backup;  /* dry run: leave caller's token pointer unchanged */
 }
 void vp8_fix_contexts(MACROBLOCKD *x) {
   /* Clear entropy contexts for Y2 blocks */