Merge "Fix highbd obmc_variance unit test" into nextgenv2
diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c
index 53fd1a6..d2fc980 100644
--- a/vp10/common/reconinter.c
+++ b/vp10/common/reconinter.c
@@ -1298,97 +1298,63 @@
 #endif  // CONFIG_SUPERTX
 
 #if CONFIG_OBMC
-// obmc_mask_N[is_neighbor_predictor][overlap_position]
-static const uint8_t obmc_mask_1[2][1] = {
-    { 55},
-    {  9}
+// obmc_mask_N[overlap_position]
+static const uint8_t obmc_mask_1[1] = {
+  55
 };
 
-static const uint8_t obmc_mask_2[2][2] = {
-    { 45, 62},
-    { 19,  2}
+static const uint8_t obmc_mask_2[2] = {
+  45, 62
 };
 
-static const uint8_t obmc_mask_4[2][4] = {
-    { 39, 50, 59, 64},
-    { 25, 14,  5,  0}
+static const uint8_t obmc_mask_4[4] = {
+  39, 50, 59, 64
 };
 
-static const uint8_t obmc_mask_8[2][8] = {
-    { 36, 42, 48, 53, 57, 61, 63, 64},
-    { 28, 22, 16, 11,  7,  3,  1,  0}
+static const uint8_t obmc_mask_8[8] = {
+  36, 42, 48, 53, 57, 61, 63, 64
 };
 
-static const uint8_t obmc_mask_16[2][16] = {
-    { 34, 37, 40, 43, 46, 49, 52, 54, 56, 58, 60, 61, 63, 64, 64, 64},
-    { 30, 27, 24, 21, 18, 15, 12, 10,  8,  6,  4,  3,  1,  0,  0,  0}
+static const uint8_t obmc_mask_16[16] = {
+  34, 37, 40, 43, 46, 49, 52, 54, 56, 58, 60, 61, 63, 64, 64, 64
 };
 
-static const uint8_t obmc_mask_32[2][32] = {
-    { 33, 35, 36, 38, 40, 41, 43, 44,
-      45, 47, 48, 50, 51, 52, 53, 55,
-      56, 57, 58, 59, 60, 60, 61, 62,
-      62, 63, 63, 64, 64, 64, 64, 64 },
-    { 31, 29, 28, 26, 24, 23, 21, 20,
-      19, 17, 16, 14, 13, 12, 11,  9,
-       8,  7,  6,  5,  4,  4,  3,  2,
-       2,  1,  1,  0,  0,  0,  0,  0 }
+static const uint8_t obmc_mask_32[32] = {
+  33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55,
+  56, 57, 58, 59, 60, 60, 61, 62, 62, 63, 63, 64, 64, 64, 64, 64
 };
 
 #if CONFIG_EXT_PARTITION
-static const uint8_t obmc_mask_64[2][64] = {
-    {
-      33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44,
-      45, 46, 47, 47, 48, 49, 50, 51, 51, 51, 52, 52, 53, 54, 55, 56,
-      56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60, 61, 62, 62, 62,
-      62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
-    }, {
-      31, 30, 29, 29, 28, 27, 26, 25, 24, 24, 23, 22, 21, 20, 20, 20,
-      19, 18, 17, 17, 16, 15, 14, 13, 13, 13, 12, 12, 11, 10,  9,  8,
-      8,  8,  7,  7,  6,  6,  5, 4,  4,  4,  4,  4,  3,  2,  2,  2,
-      2,  2,  1,  1, 1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-    }
+static const uint8_t obmc_mask_64[64] = {
+  33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44,
+  45, 46, 47, 47, 48, 49, 50, 51, 51, 51, 52, 52, 53, 54, 55, 56,
+  56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60, 61, 62, 62, 62,
+  62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
 };
 #endif  // CONFIG_EXT_PARTITION
 
 
-void setup_obmc_mask(int length, const uint8_t *mask[2]) {
+const uint8_t* vp10_get_obmc_mask(int length) {
   switch (length) {
     case 1:
-      mask[0] = obmc_mask_1[0];
-      mask[1] = obmc_mask_1[1];
-      break;
+      return obmc_mask_1;
     case 2:
-      mask[0] = obmc_mask_2[0];
-      mask[1] = obmc_mask_2[1];
-      break;
+      return obmc_mask_2;
     case 4:
-      mask[0] = obmc_mask_4[0];
-      mask[1] = obmc_mask_4[1];
-      break;
+      return obmc_mask_4;
     case 8:
-      mask[0] = obmc_mask_8[0];
-      mask[1] = obmc_mask_8[1];
-      break;
+      return obmc_mask_8;
     case 16:
-      mask[0] = obmc_mask_16[0];
-      mask[1] = obmc_mask_16[1];
-      break;
+      return obmc_mask_16;
     case 32:
-      mask[0] = obmc_mask_32[0];
-      mask[1] = obmc_mask_32[1];
-      break;
+      return obmc_mask_32;
 #if CONFIG_EXT_PARTITION
     case 64:
-      mask[0] = obmc_mask_64[0];
-      mask[1] = obmc_mask_64[1];
-      break;
+      return obmc_mask_64;
 #endif  // CONFIG_EXT_PARTITION
     default:
-      mask[0] = NULL;
-      mask[1] = NULL;
       assert(0);
-      break;
+      return NULL;
   }
 }
 
@@ -1398,168 +1364,101 @@
 // xd->plane[].dst.buf
 void vp10_build_obmc_inter_prediction(VP10_COMMON *cm,
                                       MACROBLOCKD *xd, int mi_row, int mi_col,
-                                      int use_tmp_dst_buf,
-                                      uint8_t *final_buf[MAX_MB_PLANE],
-                                      int final_stride[MAX_MB_PLANE],
-                                      uint8_t *tmp_buf1[MAX_MB_PLANE],
-                                      int tmp_stride1[MAX_MB_PLANE],
-                                      uint8_t *tmp_buf2[MAX_MB_PLANE],
-                                      int tmp_stride2[MAX_MB_PLANE]) {
-  const TileInfo *const tile = &xd->tile;
-  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-  int plane, i, mi_step;
-  int above_available = mi_row > tile->mi_row_start;
+                                      uint8_t *above[MAX_MB_PLANE],
+                                      int above_stride[MAX_MB_PLANE],
+                                      uint8_t *left[MAX_MB_PLANE],
+                                      int left_stride[MAX_MB_PLANE]) {
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  int plane, i;
 #if CONFIG_VP9_HIGHBITDEPTH
-  int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-  if (use_tmp_dst_buf) {
-    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-      const struct macroblockd_plane *pd = &xd->plane[plane];
-      int bw = (xd->n8_w * 8) >> pd->subsampling_x;
-      int bh = (xd->n8_h * 8) >> pd->subsampling_y;
-      int row;
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (is_hbd) {
-        uint16_t *final_buf16 = CONVERT_TO_SHORTPTR(final_buf[plane]);
-        uint16_t *bmc_buf16 = CONVERT_TO_SHORTPTR(pd->dst.buf);
-        for (row = 0; row < bh; ++row)
-          memcpy(final_buf16 + row * final_stride[plane],
-                 bmc_buf16 + row * pd->dst.stride, bw * sizeof(uint16_t));
-      } else {
-#endif
-      for (row = 0; row < bh; ++row)
-        memcpy(final_buf[plane] + row * final_stride[plane],
-               pd->dst.buf + row * pd->dst.stride, bw);
-#if CONFIG_VP9_HIGHBITDEPTH
-      }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-    }
-  }
-
   // handle above row
-  for (i = 0; above_available && i < VPXMIN(xd->n8_w, cm->mi_cols - mi_col);
-       i += mi_step) {
-    int mi_row_offset = -1;
-    int mi_col_offset = i;
-    int overlap;
-    MODE_INFO *above_mi = xd->mi[mi_col_offset +
-                                 mi_row_offset * xd->mi_stride];
-    MB_MODE_INFO *above_mbmi = &above_mi->mbmi;
+  if (xd->up_available) {
+    const int overlap = num_4x4_blocks_high_lookup[bsize] * 2;
+    const int miw = VPXMIN(xd->n8_w, cm->mi_cols - mi_col);
+    const int mi_row_offset = -1;
 
-    mi_step = VPXMIN(xd->n8_w,
-                     num_8x8_blocks_wide_lookup[above_mbmi->sb_type]);
+    assert(miw > 0);
 
-    if (!is_neighbor_overlappable(above_mbmi))
-      continue;
+    i = 0;
+    do {  // for each mi in the above row
+      const int mi_col_offset = i;
+      const MB_MODE_INFO *const above_mbmi =
+          &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
+      const int mi_step =
+          VPXMIN(xd->n8_w, num_8x8_blocks_wide_lookup[above_mbmi->sb_type]);
 
-    overlap = num_4x4_blocks_high_lookup[bsize] << 1;
-
-    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-      const struct macroblockd_plane *pd = &xd->plane[plane];
-      int bw = (mi_step * MI_SIZE) >> pd->subsampling_x;
-      int bh = overlap >> pd->subsampling_y;
-      int row, col;
-      int dst_stride = use_tmp_dst_buf ? final_stride[plane] : pd->dst.stride;
-      uint8_t *dst = use_tmp_dst_buf ?
-          &final_buf[plane][(i * MI_SIZE) >> pd->subsampling_x] :
-          &pd->dst.buf[(i * MI_SIZE) >> pd->subsampling_x];
-      int tmp_stride = tmp_stride1[plane];
-      uint8_t *tmp = &tmp_buf1[plane][(i * MI_SIZE) >> pd->subsampling_x];
-      const uint8_t *mask[2];
-
-      setup_obmc_mask(bh, mask);
+      if (is_neighbor_overlappable(above_mbmi)) {
+        for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+          const struct macroblockd_plane *pd = &xd->plane[plane];
+          const int bw = (mi_step * MI_SIZE) >> pd->subsampling_x;
+          const int bh = overlap >> pd->subsampling_y;
+          const int dst_stride = pd->dst.stride;
+          uint8_t *const dst =
+              &pd->dst.buf[(i * MI_SIZE) >> pd->subsampling_x];
+          const int tmp_stride = above_stride[plane];
+          const uint8_t *const tmp =
+              &above[plane][(i * MI_SIZE) >> pd->subsampling_x];
+          const uint8_t *const mask = vp10_get_obmc_mask(bh);
 
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (is_hbd) {
-        uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
-        uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
-
-        for (row = 0; row < bh; ++row) {
-          for (col = 0; col < bw; ++col)
-            dst16[col] = ROUND_POWER_OF_TWO(mask[0][row] * dst16[col] +
-                                            mask[1][row] * tmp16[col], 6);
-
-          dst16 += dst_stride;
-          tmp16 += tmp_stride;
+          if (is_hbd)
+            vpx_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride,
+                                       tmp, tmp_stride, mask, bh, bw, xd->bd);
+          else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+            vpx_blend_a64_vmask(dst, dst_stride, dst, dst_stride,
+                                tmp, tmp_stride, mask, bh, bw);
         }
-      } else {
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-      for (row = 0; row < bh; ++row) {
-        for (col = 0; col < bw; ++col)
-          dst[col] = ROUND_POWER_OF_TWO(mask[0][row] * dst[col] +
-                                        mask[1][row] * tmp[col], 6);
-        dst += dst_stride;
-        tmp += tmp_stride;
       }
-#if CONFIG_VP9_HIGHBITDEPTH
-      }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-    }
-  }  // each mi in the above row
+      i += mi_step;
+    } while (i < miw);
+  }
 
-  if (mi_col == 0 || (mi_col - 1 < tile->mi_col_start))
-    return;
   // handle left column
-  for (i = 0; i < VPXMIN(xd->n8_h, cm->mi_rows - mi_row);
-       i += mi_step) {
-    int mi_row_offset = i;
-    int mi_col_offset = -1;
-    int overlap;
-    MODE_INFO *left_mi = xd->mi[mi_col_offset +
-                                mi_row_offset * xd->mi_stride];
-    MB_MODE_INFO *left_mbmi = &left_mi->mbmi;
+  if (xd->left_available) {
+    const int overlap = num_4x4_blocks_wide_lookup[bsize] * 2;
+    const int mih = VPXMIN(xd->n8_h, cm->mi_rows - mi_row);
+    const int mi_col_offset = -1;
 
-    mi_step = VPXMIN(xd->n8_h,
-                     num_8x8_blocks_high_lookup[left_mbmi->sb_type]);
+    assert(mih > 0);
 
-    if (!is_neighbor_overlappable(left_mbmi))
-      continue;
+    i = 0;
+    do {  // for each mi in the left column
+      const int mi_row_offset = i;
+      const MB_MODE_INFO *const left_mbmi =
+          &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
+      const int mi_step =
+          VPXMIN(xd->n8_h, num_8x8_blocks_high_lookup[left_mbmi->sb_type]);
 
-    overlap = num_4x4_blocks_wide_lookup[bsize] << 1;
-
-    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-      const struct macroblockd_plane *pd = &xd->plane[plane];
-      int bw = overlap >> pd->subsampling_x;
-      int bh = (mi_step * MI_SIZE) >> pd->subsampling_y;
-      int row, col;
-      int dst_stride = use_tmp_dst_buf ? final_stride[plane] : pd->dst.stride;
-      uint8_t *dst = use_tmp_dst_buf ?
-          &final_buf[plane][(i * MI_SIZE * dst_stride) >> pd->subsampling_y] :
-          &pd->dst.buf[(i * MI_SIZE * dst_stride) >> pd->subsampling_y];
-      int tmp_stride = tmp_stride2[plane];
-      uint8_t *tmp = &tmp_buf2[plane]
-                              [(i * MI_SIZE * tmp_stride) >> pd->subsampling_y];
-      const uint8_t *mask[2];
-
-      setup_obmc_mask(bw, mask);
+      if (is_neighbor_overlappable(left_mbmi)) {
+        for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+          const struct macroblockd_plane *pd = &xd->plane[plane];
+          const int bw = overlap >> pd->subsampling_x;
+          const int bh = (mi_step * MI_SIZE) >> pd->subsampling_y;
+          const int dst_stride = pd->dst.stride;
+          uint8_t *const dst =
+              &pd->dst.buf[(i * MI_SIZE * dst_stride) >> pd->subsampling_y];
+          const int tmp_stride = left_stride[plane];
+          const uint8_t *const tmp =
+              &left[plane][(i * MI_SIZE * tmp_stride) >> pd->subsampling_y];
+          const uint8_t *const mask = vp10_get_obmc_mask(bw);
 
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (is_hbd) {
-        uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
-        uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
-
-        for (row = 0; row < bh; ++row) {
-          for (col = 0; col < bw; ++col)
-            dst16[col] = ROUND_POWER_OF_TWO(mask[0][col] * dst16[col] +
-                                            mask[1][col] * tmp16[col], 6);
-          dst16 += dst_stride;
-          tmp16 += tmp_stride;
+          if (is_hbd)
+            vpx_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride,
+                                       tmp, tmp_stride, mask, bh, bw, xd->bd);
+          else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+            vpx_blend_a64_hmask(dst, dst_stride, dst, dst_stride,
+                                tmp, tmp_stride, mask, bh, bw);
         }
-      } else {
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-      for (row = 0; row < bh; ++row) {
-        for (col = 0; col < bw; ++col)
-          dst[col] = ROUND_POWER_OF_TWO(mask[0][col] * dst[col] +
-                                        mask[1][col] * tmp[col], 6);
-        dst += dst_stride;
-        tmp += tmp_stride;
       }
-#if CONFIG_VP9_HIGHBITDEPTH
-      }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-    }
-  }  // each mi in the left column
+      i += mi_step;
+    } while (i < mih);
+  }
 }
 
 #if CONFIG_EXT_INTER
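
Note on the reconinter.c hunks above: in the removed two-row mask tables the
second row is always the 64's complement of the first (e.g. 45 + 19 == 64,
62 + 2 == 64), so a single row suffices and the complementary weight can be
derived at blend time. The vpx_blend_a64_vmask()/vpx_blend_a64_hmask() helpers
are assumed to perform the same rounding as the removed loops; a minimal
reference sketch of the vertical-mask case, equivalent to
ROUND_POWER_OF_TWO(m * dst + (64 - m) * tmp, 6), is:

#include <stdint.h>

// Reference sketch only (not the library implementation): blend two 8-bit
// predictors with a per-row weight in [0, 64], matching the removed loop in
// vp10_build_obmc_inter_prediction(). dst may alias src0, as in the calls
// above.
static void blend_a64_vmask_ref(uint8_t *dst, int dst_stride,
                                const uint8_t *src0, int src0_stride,
                                const uint8_t *src1, int src1_stride,
                                const uint8_t *mask, int h, int w) {
  int row, col;
  for (row = 0; row < h; ++row) {
    const int m = mask[row];  // weight for src0; src1 implicitly gets 64 - m
    for (col = 0; col < w; ++col)
      dst[row * dst_stride + col] =
          (uint8_t)((m * src0[row * src0_stride + col] +
                     (64 - m) * src1[row * src1_stride + col] + 32) >> 6);
  }
}
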
diff --git a/vp10/common/reconinter.h b/vp10/common/reconinter.h
index ac4a004..c32596e 100644
--- a/vp10/common/reconinter.h
+++ b/vp10/common/reconinter.h
@@ -562,16 +562,13 @@
 #endif  // CONFIG_EXT_INTERP
 
 #if CONFIG_OBMC
-void setup_obmc_mask(int length, const uint8_t *mask[2]);
+const uint8_t* vp10_get_obmc_mask(int length);
 void vp10_build_obmc_inter_prediction(VP10_COMMON *cm,
                                       MACROBLOCKD *xd, int mi_row, int mi_col,
-                                      int use_tmp_dst_buf,
-                                      uint8_t *final_buf[MAX_MB_PLANE],
-                                      int final_stride[MAX_MB_PLANE],
-                                      uint8_t *tmp_buf1[MAX_MB_PLANE],
-                                      int tmp_stride1[MAX_MB_PLANE],
-                                      uint8_t *tmp_buf2[MAX_MB_PLANE],
-                                      int tmp_stride2[MAX_MB_PLANE]);
+                                      uint8_t *above[MAX_MB_PLANE],
+                                      int above_stride[MAX_MB_PLANE],
+                                      uint8_t *left[MAX_MB_PLANE],
+                                      int left_stride[MAX_MB_PLANE]);
 void vp10_build_prediction_by_above_preds(VP10_COMMON *cm,
                                           MACROBLOCKD *xd,
                                           int mi_row, int mi_col,
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index c4487a9..2f341b5 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -1385,7 +1385,7 @@
                                           dst_buf2, dst_stride2);
       vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm),
                             mi_row, mi_col);
-      vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, 0, NULL, NULL,
+      vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col,
                                        dst_buf1, dst_stride1,
                                        dst_buf2, dst_stride2);
     }
diff --git a/vp10/encoder/block.h b/vp10/encoder/block.h
index 88f1236..4c9f8a5 100644
--- a/vp10/encoder/block.h
+++ b/vp10/encoder/block.h
@@ -165,6 +165,9 @@
   // Used to store sub partition's choices.
   MV pred_mv[MAX_REF_FRAMES];
 
+  // Store the best motion vector during motion search
+  int_mv best_mv;
+
   // Strong color activity detection. Used in RTC coding mode to enhance
   // the visual quality at the boundary of moving color objects.
   uint8_t color_sensitivity[2];
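
The new best_mv field is an int_mv, so search code can write it either as a
packed 32-bit value (e.g. x->best_mv.as_int = INVALID_MV in rdopt.c) or as
row/col components. A minimal, self-contained sketch of that union layout,
with the types re-declared locally purely for illustration (the real
definitions live in vp10's common mv header):

#include <stdint.h>
#include <stdio.h>

// Local stand-ins for illustration only.
typedef struct { int16_t row, col; } MV;
typedef union { uint32_t as_int; MV as_mv; } int_mv;

int main(void) {
  int_mv best_mv;
  best_mv.as_int = 0;      // zero MV, cleared in one store
  best_mv.as_mv.row = -4;  // ...later overwritten with a search result
  best_mv.as_mv.col = 12;
  printf("best_mv = (%d, %d)\n", best_mv.as_mv.row, best_mv.as_mv.col);
  return 0;
}
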
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index 69f766a..ff1ee6b 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -5091,7 +5091,7 @@
                                           dst_stride2);
       vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm),
                             mi_row, mi_col);
-      vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, 0, NULL, NULL,
+      vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col,
                                        dst_buf1, dst_stride1,
                                        dst_buf2, dst_stride2);
     }
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index 519cd03..608cace 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -447,7 +447,7 @@
   ActiveMap active_map;
 
   fractional_mv_step_fp *find_fractional_mv_step;
-  vp10_full_search_fn_t full_search_sad;
+  vp10_full_search_fn_t full_search_sad;  // It is currently unused.
   vp10_diamond_search_fn_t diamond_search_sad;
   vp10_variance_fn_ptr_t fn_ptr[BLOCK_SIZES];
   uint64_t time_receive_data;
diff --git a/vp10/encoder/mbgraph.c b/vp10/encoder/mbgraph.c
index 5e66ce5..46cff80 100644
--- a/vp10/encoder/mbgraph.c
+++ b/vp10/encoder/mbgraph.c
@@ -25,7 +25,6 @@
 
 static unsigned int do_16x16_motion_iteration(VP10_COMP *cpi,
                                               const MV *ref_mv,
-                                              MV *dst_mv,
                                               int mb_row,
                                               int mb_col) {
   MACROBLOCK *const x = &cpi->td.mb;
@@ -51,8 +50,7 @@
 
   /*cpi->sf.search_method == HEX*/
   vp10_hex_search(x, &ref_full, step_param, x->errorperbit, 0,
-                 cond_cost_list(cpi, cost_list),
-                 &v_fn_ptr, 0, ref_mv, dst_mv);
+                  cond_cost_list(cpi, cost_list), &v_fn_ptr, 0, ref_mv);
 
   // Try sub-pixel MC
   // if (bestsme > error_thresh && bestsme < INT_MAX)
@@ -60,7 +58,7 @@
     int distortion;
     unsigned int sse;
     cpi->find_fractional_mv_step(
-        x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
+        x, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
         &v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
         cond_cost_list(cpi, cost_list),
         NULL, NULL,
@@ -74,7 +72,7 @@
 #endif  // CONFIG_EXT_INTER
   xd->mi[0]->mbmi.mode = NEWMV;
 
-  xd->mi[0]->mbmi.mv[0].as_mv = *dst_mv;
+  xd->mi[0]->mbmi.mv[0] = x->best_mv;
 #if CONFIG_EXT_INTER
   xd->mi[0]->mbmi.ref_frame[1] = NONE;
 #endif  // CONFIG_EXT_INTER
@@ -92,40 +90,40 @@
 }
 
 static int do_16x16_motion_search(VP10_COMP *cpi, const MV *ref_mv,
-                                  int_mv *dst_mv, int mb_row, int mb_col) {
+                                  int mb_row, int mb_col) {
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   unsigned int err, tmp_err;
-  MV tmp_mv;
+  MV best_mv;
 
   // Try zero MV first
   // FIXME should really use something like near/nearest MV and/or MV prediction
   err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
                      xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
-  dst_mv->as_int = 0;
+  best_mv.col = best_mv.row = 0;
 
   // Test last reference frame using the previous best mv as the
   // starting point (best reference) for the search
-  tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv, mb_row, mb_col);
+  tmp_err = do_16x16_motion_iteration(cpi, ref_mv, mb_row, mb_col);
   if (tmp_err < err) {
     err = tmp_err;
-    dst_mv->as_mv = tmp_mv;
+    best_mv = x->best_mv.as_mv;
   }
 
   // If the current best reference mv is not centered on 0,0 then do a 0,0
   // based search as well.
   if (ref_mv->row != 0 || ref_mv->col != 0) {
     unsigned int tmp_err;
-    MV zero_ref_mv = {0, 0}, tmp_mv;
+    MV zero_ref_mv = {0, 0};
 
-    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv,
-                                        mb_row, mb_col);
+    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, mb_row, mb_col);
     if (tmp_err < err) {
-      dst_mv->as_mv = tmp_mv;
       err = tmp_err;
+      best_mv = x->best_mv.as_mv;
     }
   }
 
+  x->best_mv.as_mv = best_mv;
   return err;
 }
 
@@ -213,8 +211,8 @@
     xd->plane[0].pre[0].stride = golden_ref->y_stride;
     g_motion_error = do_16x16_motion_search(cpi,
                                             prev_golden_ref_mv,
-                                            &stats->ref[GOLDEN_FRAME].m.mv,
                                             mb_row, mb_col);
+    stats->ref[GOLDEN_FRAME].m.mv = x->best_mv;
     stats->ref[GOLDEN_FRAME].err = g_motion_error;
   } else {
     stats->ref[GOLDEN_FRAME].err = INT_MAX;
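
In mbgraph.c the result now travels through x->best_mv instead of a dst_mv
argument. Because a second call to do_16x16_motion_iteration() overwrites
x->best_mv, do_16x16_motion_search() parks the intermediate winner in a local
MV and publishes it on exit; a condensed sketch of that flow, taken from the
hunk above (the optional zero-reference retry elided):

MV best_mv;
best_mv.col = best_mv.row = 0;                 // zero-MV baseline
tmp_err = do_16x16_motion_iteration(cpi, ref_mv, mb_row, mb_col);
if (tmp_err < err) {
  err = tmp_err;
  best_mv = x->best_mv.as_mv;                  // keep this candidate
}
x->best_mv.as_mv = best_mv;                    // publish the winner
return err;
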
diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c
index e7bd700..afbf3e9 100644
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c
@@ -315,6 +315,7 @@
   unsigned int sse;                                                        \
   unsigned int whichdir;                                                   \
   int thismse;                                                             \
+  MV *bestmv = &x->best_mv.as_mv;                                          \
   const unsigned int halfiters = iters_per_step;                           \
   const unsigned int quarteriters = iters_per_step;                        \
   const unsigned int eighthiters = iters_per_step;                         \
@@ -411,8 +412,8 @@
 }
 
 int vp10_find_best_sub_pixel_tree_pruned_evenmore(
-    const MACROBLOCK *x,
-    MV *bestmv, const MV *ref_mv,
+    MACROBLOCK *x,
+    const MV *ref_mv,
     int allow_hp,
     int error_per_bit,
     const vp10_variance_fn_ptr_t *vfp,
@@ -491,20 +492,20 @@
   return besterr;
 }
 
-int vp10_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
-                                             MV *bestmv, const MV *ref_mv,
-                                             int allow_hp,
-                                             int error_per_bit,
-                                             const vp10_variance_fn_ptr_t *vfp,
-                                             int forced_stop,
-                                             int iters_per_step,
-                                             int *cost_list,
-                                             int *mvjcost, int *mvcost[2],
-                                             int *distortion,
-                                             unsigned int *sse1,
-                                             const uint8_t *second_pred,
-                                             int w, int h,
-                                             int use_upsampled_ref) {
+int vp10_find_best_sub_pixel_tree_pruned_more(MACROBLOCK *x,
+                                              const MV *ref_mv,
+                                              int allow_hp,
+                                              int error_per_bit,
+                                              const vp10_variance_fn_ptr_t *vfp,
+                                              int forced_stop,
+                                              int iters_per_step,
+                                              int *cost_list,
+                                              int *mvjcost, int *mvcost[2],
+                                              int *distortion,
+                                              unsigned int *sse1,
+                                              const uint8_t *second_pred,
+                                              int w, int h,
+                                              int use_upsampled_ref) {
   SETUP_SUBPEL_SEARCH;
   (void) use_upsampled_ref;
 
@@ -568,19 +569,19 @@
   return besterr;
 }
 
-int vp10_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
-                                        MV *bestmv, const MV *ref_mv,
-                                        int allow_hp,
-                                        int error_per_bit,
-                                        const vp10_variance_fn_ptr_t *vfp,
-                                        int forced_stop,
-                                        int iters_per_step,
-                                        int *cost_list,
-                                        int *mvjcost, int *mvcost[2],
-                                        int *distortion,
-                                        unsigned int *sse1,
-                                        const uint8_t *second_pred,
-                                        int w, int h, int use_upsampled_ref) {
+int vp10_find_best_sub_pixel_tree_pruned(MACROBLOCK *x,
+                                         const MV *ref_mv,
+                                         int allow_hp,
+                                         int error_per_bit,
+                                         const vp10_variance_fn_ptr_t *vfp,
+                                         int forced_stop,
+                                         int iters_per_step,
+                                         int *cost_list,
+                                         int *mvjcost, int *mvcost[2],
+                                         int *distortion,
+                                         unsigned int *sse1,
+                                         const uint8_t *second_pred,
+                                         int w, int h, int use_upsampled_ref) {
   SETUP_SUBPEL_SEARCH;
   (void) use_upsampled_ref;
 
@@ -724,19 +725,19 @@
   return besterr;
 }
 
-int vp10_find_best_sub_pixel_tree(const MACROBLOCK *x,
-                                 MV *bestmv, const MV *ref_mv,
-                                 int allow_hp,
-                                 int error_per_bit,
-                                 const vp10_variance_fn_ptr_t *vfp,
-                                 int forced_stop,
-                                 int iters_per_step,
-                                 int *cost_list,
-                                 int *mvjcost, int *mvcost[2],
-                                 int *distortion,
-                                 unsigned int *sse1,
-                                 const uint8_t *second_pred,
-                                 int w, int h, int use_upsampled_ref) {
+int vp10_find_best_sub_pixel_tree(MACROBLOCK *x,
+                                  const MV *ref_mv,
+                                  int allow_hp,
+                                  int error_per_bit,
+                                  const vp10_variance_fn_ptr_t *vfp,
+                                  int forced_stop,
+                                  int iters_per_step,
+                                  int *cost_list,
+                                  int *mvjcost, int *mvcost[2],
+                                  int *distortion,
+                                  unsigned int *sse1,
+                                  const uint8_t *second_pred,
+                                  int w, int h, int use_upsampled_ref) {
   const uint8_t *const src_address = x->plane[0].src.buf;
   const int src_stride = x->plane[0].src.stride;
   const MACROBLOCKD *xd = &x->e_mbd;
@@ -744,6 +745,7 @@
   unsigned int sse;
   unsigned int thismse;
   const int y_stride = xd->plane[0].pre[0].stride;
+  MV *bestmv = &x->best_mv.as_mv;
   const int offset = bestmv->row * y_stride + bestmv->col;
   const uint8_t *const y = xd->plane[0].pre[0].buf;
 
@@ -1037,19 +1039,18 @@
 // candidates as indicated in the num_candidates and candidates arrays
 // passed into this function
 //
-static int vp10_pattern_search(const MACROBLOCK *x,
-                               MV *start_mv,
-                               int search_param,
-                               int sad_per_bit,
-                               int do_init_search,
-                               int *cost_list,
-                               const vp10_variance_fn_ptr_t *vfp,
-                               int use_mvcost,
-                               const MV *center_mv,
-                               MV *best_mv,
-                               const int num_candidates[MAX_PATTERN_SCALES],
-                               const MV candidates[MAX_PATTERN_SCALES]
-                                                  [MAX_PATTERN_CANDIDATES]) {
+static int pattern_search(MACROBLOCK *x,
+                          MV *start_mv,
+                          int search_param,
+                          int sad_per_bit,
+                          int do_init_search,
+                          int *cost_list,
+                          const vp10_variance_fn_ptr_t *vfp,
+                          int use_mvcost,
+                          const MV *center_mv,
+                          const int num_candidates[MAX_PATTERN_SCALES],
+                          const MV candidates[MAX_PATTERN_SCALES]
+                                              [MAX_PATTERN_CANDIDATES]) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
     10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
@@ -1294,8 +1295,8 @@
       calc_int_cost_list(x, center_mv, sad_per_bit, vfp, &best_mv, cost_list);
     }
   }
-  best_mv->row = br;
-  best_mv->col = bc;
+  x->best_mv.as_mv.row = br;
+  x->best_mv.as_mv.col = bc;
   return bestsad;
 }
 
@@ -1332,15 +1333,15 @@
                                  x->mvcost, x->errorperbit) : 0);
 }
 
-int vp10_hex_search(const MACROBLOCK *x,
-                   MV *start_mv,
-                   int search_param,
-                   int sad_per_bit,
-                   int do_init_search,
-                   int *cost_list,
-                   const vp10_variance_fn_ptr_t *vfp,
-                   int use_mvcost,
-                   const MV *center_mv, MV *best_mv) {
+int vp10_hex_search(MACROBLOCK *x,
+                    MV *start_mv,
+                    int search_param,
+                    int sad_per_bit,
+                    int do_init_search,
+                    int *cost_list,
+                    const vp10_variance_fn_ptr_t *vfp,
+                    int use_mvcost,
+                    const MV *center_mv) {
   // First scale has 8-closest points, the rest have 6 points in hex shape
   // at increasing scales
   static const int hex_num_candidates[MAX_PATTERN_SCALES] = {
@@ -1361,22 +1362,20 @@
     {{-512, -1024}, {512, -1024}, {1024, 0}, {512, 1024}, { -512, 1024},
       { -1024, 0}},
   };
-  return vp10_pattern_search(x, start_mv, search_param, sad_per_bit,
-                             do_init_search, cost_list, vfp, use_mvcost,
-                             center_mv, best_mv, hex_num_candidates,
-                             hex_candidates);
+  return pattern_search(x, start_mv, search_param, sad_per_bit,
+                        do_init_search, cost_list, vfp, use_mvcost,
+                        center_mv, hex_num_candidates, hex_candidates);
 }
 
-int vp10_bigdia_search(const MACROBLOCK *x,
-                      MV *start_mv,
-                      int search_param,
-                      int sad_per_bit,
-                      int do_init_search,
-                      int *cost_list,
-                      const vp10_variance_fn_ptr_t *vfp,
-                      int use_mvcost,
-                      const MV *center_mv,
-                      MV *best_mv) {
+static int bigdia_search(MACROBLOCK *x,
+                         MV *start_mv,
+                         int search_param,
+                         int sad_per_bit,
+                         int do_init_search,
+                         int *cost_list,
+                         const vp10_variance_fn_ptr_t *vfp,
+                         int use_mvcost,
+                         const MV *center_mv) {
   // First scale has 4-closest points, the rest have 8 points in diamond
   // shape at increasing scales
   static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
@@ -1403,25 +1402,23 @@
     {{-512, -512}, {0, -1024}, {512, -512}, {1024, 0}, {512, 512}, {0, 1024},
       {-512, 512}, {-1024, 0}},
   };
-  return vp10_pattern_search(x, start_mv, search_param, sad_per_bit,
-                             do_init_search, cost_list, vfp, use_mvcost,
-                             center_mv, best_mv, bigdia_num_candidates,
-                             bigdia_candidates);
+  return pattern_search(x, start_mv, search_param, sad_per_bit,
+                        do_init_search, cost_list, vfp, use_mvcost,
+                        center_mv, bigdia_num_candidates, bigdia_candidates);
 }
 
-int vp10_square_search(const MACROBLOCK *x,
-                      MV *start_mv,
-                      int search_param,
-                      int sad_per_bit,
-                      int do_init_search,
-                      int *cost_list,
-                      const vp10_variance_fn_ptr_t *vfp,
-                      int use_mvcost,
-                      const MV *center_mv,
-                      MV *best_mv) {
+static int square_search(MACROBLOCK *x,
+                         MV *start_mv,
+                         int search_param,
+                         int sad_per_bit,
+                         int do_init_search,
+                         int *cost_list,
+                         const vp10_variance_fn_ptr_t *vfp,
+                         int use_mvcost,
+                         const MV *center_mv) {
   // All scales have 8 closest points in square shape
   static const int square_num_candidates[MAX_PATTERN_SCALES] = {
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
   };
   // Note that the largest candidate step at each scale is 2^scale
   static const MV square_candidates[MAX_PATTERN_SCALES]
@@ -1445,40 +1442,37 @@
     {{-1024, -1024}, {0, -1024}, {1024, -1024}, {1024, 0}, {1024, 1024},
       {0, 1024}, {-1024, 1024}, {-1024, 0}},
   };
-  return vp10_pattern_search(x, start_mv, search_param, sad_per_bit,
-                             do_init_search, cost_list, vfp, use_mvcost,
-                             center_mv, best_mv, square_num_candidates,
-                             square_candidates);
+  return pattern_search(x, start_mv, search_param, sad_per_bit,
+                        do_init_search, cost_list, vfp, use_mvcost,
+                        center_mv, square_num_candidates, square_candidates);
 }
 
-int vp10_fast_hex_search(const MACROBLOCK *x,
-                        MV *ref_mv,
-                        int search_param,
-                        int sad_per_bit,
-                        int do_init_search,  // must be zero for fast_hex
-                        int *cost_list,
-                        const vp10_variance_fn_ptr_t *vfp,
-                        int use_mvcost,
-                        const MV *center_mv,
-                        MV *best_mv) {
+static int fast_hex_search(MACROBLOCK *x,
+                           MV *ref_mv,
+                           int search_param,
+                           int sad_per_bit,
+                           int do_init_search,  // must be zero for fast_hex
+                           int *cost_list,
+                           const vp10_variance_fn_ptr_t *vfp,
+                           int use_mvcost,
+                           const MV *center_mv) {
   return vp10_hex_search(
       x, ref_mv, VPXMAX(MAX_MVSEARCH_STEPS - 2, search_param), sad_per_bit,
-      do_init_search, cost_list, vfp, use_mvcost, center_mv, best_mv);
+      do_init_search, cost_list, vfp, use_mvcost, center_mv);
 }
 
-int vp10_fast_dia_search(const MACROBLOCK *x,
-                        MV *ref_mv,
-                        int search_param,
-                        int sad_per_bit,
-                        int do_init_search,
-                        int *cost_list,
-                        const vp10_variance_fn_ptr_t *vfp,
-                        int use_mvcost,
-                        const MV *center_mv,
-                        MV *best_mv) {
-  return vp10_bigdia_search(
+static int fast_dia_search(MACROBLOCK *x,
+                           MV *ref_mv,
+                           int search_param,
+                           int sad_per_bit,
+                           int do_init_search,
+                           int *cost_list,
+                           const vp10_variance_fn_ptr_t *vfp,
+                           int use_mvcost,
+                           const MV *center_mv) {
+  return bigdia_search(
       x, ref_mv, VPXMAX(MAX_MVSEARCH_STEPS - 2, search_param), sad_per_bit,
-      do_init_search, cost_list, vfp, use_mvcost, center_mv, best_mv);
+      do_init_search, cost_list, vfp, use_mvcost, center_mv);
 }
 
 #undef CHECK_BETTER
@@ -1924,12 +1918,12 @@
 /* do_refine: If last step (1-away) of n-step search doesn't pick the center
               point as the best match, we will do a final 1-away diamond
               refining search  */
-int vp10_full_pixel_diamond(const VP10_COMP *cpi, MACROBLOCK *x,
-                           MV *mvp_full, int step_param,
-                           int sadpb, int further_steps, int do_refine,
-                           int *cost_list,
-                           const vp10_variance_fn_ptr_t *fn_ptr,
-                           const MV *ref_mv, MV *dst_mv) {
+static int full_pixel_diamond(VP10_COMP *cpi, MACROBLOCK *x,
+                              MV *mvp_full, int step_param,
+                              int sadpb, int further_steps, int do_refine,
+                              int *cost_list,
+                              const vp10_variance_fn_ptr_t *fn_ptr,
+                              const MV *ref_mv) {
   MV temp_mv;
   int thissme, n, num00 = 0;
   int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
@@ -1937,7 +1931,7 @@
                                         fn_ptr, ref_mv);
   if (bestsme < INT_MAX)
     bestsme = vp10_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
-  *dst_mv = temp_mv;
+  x->best_mv.as_mv = temp_mv;
 
   // If there won't be more n-step search, check to see if refining search is
   // needed.
@@ -1962,7 +1956,7 @@
 
       if (thissme < bestsme) {
         bestsme = thissme;
-        *dst_mv = temp_mv;
+        x->best_mv.as_mv = temp_mv;
       }
     }
   }
@@ -1970,20 +1964,20 @@
   // final 1-away diamond refining search
   if (do_refine) {
     const int search_range = 8;
-    MV best_mv = *dst_mv;
+    MV best_mv = x->best_mv.as_mv;
     thissme = vp10_refining_search_sad(x, &best_mv, sadpb, search_range,
                                        fn_ptr, ref_mv);
     if (thissme < INT_MAX)
       thissme = vp10_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1);
     if (thissme < bestsme) {
       bestsme = thissme;
-      *dst_mv = best_mv;
+      x->best_mv.as_mv = best_mv;
     }
   }
 
   // Return cost list.
   if (cost_list) {
-    calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+    calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, &x->best_mv.as_mv, cost_list);
   }
   return bestsme;
 }
@@ -1994,7 +1988,8 @@
 // Runs a limited range exhaustive mesh search using a pattern set
 // according to the encode speed profile.
 static int full_pixel_exhaustive(VP10_COMP *cpi, MACROBLOCK *x,
-                                 MV *centre_mv_full, int sadpb,  int *cost_list,
+                                 const MV *centre_mv_full, int sadpb,
+                                 int *cost_list,
                                  const vp10_variance_fn_ptr_t *fn_ptr,
                                  const MV *ref_mv, MV *dst_mv) {
   const SPEED_FEATURES *const sf = &cpi->sf;
@@ -2053,9 +2048,9 @@
 }
 
 int vp10_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
-                          int sad_per_bit, int distance,
-                          const vp10_variance_fn_ptr_t *fn_ptr,
-                          const MV *center_mv, MV *best_mv) {
+                           int sad_per_bit, int distance,
+                           const vp10_variance_fn_ptr_t *fn_ptr,
+                           const MV *center_mv, MV *best_mv) {
   int r, c;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const what = &x->plane[0].src;
@@ -2086,9 +2081,9 @@
 }
 
 int vp10_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv,
-                          int sad_per_bit, int distance,
-                          const vp10_variance_fn_ptr_t *fn_ptr,
-                          const MV *center_mv, MV *best_mv) {
+                           int sad_per_bit, int distance,
+                           const vp10_variance_fn_ptr_t *fn_ptr,
+                           const MV *center_mv, MV *best_mv) {
   int r;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const what = &x->plane[0].src;
@@ -2151,9 +2146,9 @@
 }
 
 int vp10_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
-                          int sad_per_bit, int distance,
-                          const vp10_variance_fn_ptr_t *fn_ptr,
-                          const MV *center_mv, MV *best_mv) {
+                           int sad_per_bit, int distance,
+                           const vp10_variance_fn_ptr_t *fn_ptr,
+                           const MV *center_mv, MV *best_mv) {
   int r;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const what = &x->plane[0].src;
@@ -2318,29 +2313,30 @@
 
 // This function is called when we do joint motion search in comp_inter_inter
 // mode.
-int vp10_refining_search_8p_c(const MACROBLOCK *x,
-                             MV *ref_mv, int error_per_bit,
-                             int search_range,
-                             const vp10_variance_fn_ptr_t *fn_ptr,
-                             const MV *center_mv,
-                             const uint8_t *second_pred) {
+int vp10_refining_search_8p_c(MACROBLOCK *x,
+                              int error_per_bit,
+                              int search_range,
+                              const vp10_variance_fn_ptr_t *fn_ptr,
+                              const MV *center_mv,
+                              const uint8_t *second_pred) {
   const MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
                            {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const what = &x->plane[0].src;
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  MV *best_mv = &x->best_mv.as_mv;
   unsigned int best_sad = fn_ptr->sdaf(what->buf, what->stride,
-      get_buf_from_mv(in_what, ref_mv), in_what->stride, second_pred) +
-      mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
+      get_buf_from_mv(in_what, best_mv), in_what->stride, second_pred) +
+      mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit);
   int i, j;
 
   for (i = 0; i < search_range; ++i) {
     int best_site = -1;
 
     for (j = 0; j < 8; ++j) {
-      const MV mv = {ref_mv->row + neighbors[j].row,
-                     ref_mv->col + neighbors[j].col};
+      const MV mv = {best_mv->row + neighbors[j].row,
+                     best_mv->col + neighbors[j].col};
 
       if (is_mv_in(x, &mv)) {
         unsigned int sad = fn_ptr->sdaf(what->buf, what->stride,
@@ -2358,8 +2354,8 @@
     if (best_site == -1) {
       break;
     } else {
-      ref_mv->row += neighbors[best_site].row;
-      ref_mv->col += neighbors[best_site].col;
+      best_mv->row += neighbors[best_site].row;
+      best_mv->col += neighbors[best_site].col;
     }
   }
   return best_sad;
@@ -2378,15 +2374,15 @@
 }
 
 int vp10_full_pixel_search(VP10_COMP *cpi, MACROBLOCK *x,
-                          BLOCK_SIZE bsize, MV *mvp_full,
-                          int step_param, int error_per_bit,
-                          int *cost_list,
-                          const MV *ref_mv, MV *tmp_mv,
-                          int var_max, int rd) {
+                           BLOCK_SIZE bsize, MV *mvp_full,
+                           int step_param, int error_per_bit,
+                           int *cost_list, const MV *ref_mv,
+                           int var_max, int rd) {
   const SPEED_FEATURES *const sf = &cpi->sf;
   const SEARCH_METHODS method = sf->mv.search_method;
   vp10_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
   int var = 0;
+
   if (cost_list) {
     cost_list[0] = INT_MAX;
     cost_list[1] = INT_MAX;
@@ -2400,29 +2396,29 @@
 
   switch (method) {
     case FAST_DIAMOND:
-      var = vp10_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
-                                cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+      var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
+                            cost_list, fn_ptr, 1, ref_mv);
       break;
     case FAST_HEX:
-      var = vp10_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
-                                cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+      var = fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
+                            cost_list, fn_ptr, 1, ref_mv);
       break;
     case HEX:
       var = vp10_hex_search(x, mvp_full, step_param, error_per_bit, 1,
-                           cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+                            cost_list, fn_ptr, 1, ref_mv);
       break;
     case SQUARE:
-      var = vp10_square_search(x, mvp_full, step_param, error_per_bit, 1,
-                              cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+      var = square_search(x, mvp_full, step_param, error_per_bit, 1,
+                          cost_list, fn_ptr, 1, ref_mv);
       break;
     case BIGDIA:
-      var = vp10_bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
-                              cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+      var = bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
+                          cost_list, fn_ptr, 1, ref_mv);
       break;
     case NSTEP:
-      var = vp10_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
-                                   MAX_MVSEARCH_STEPS - 1 - step_param,
-                                   1, cost_list, fn_ptr, ref_mv, tmp_mv);
+      var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
+                               MAX_MVSEARCH_STEPS - 1 - step_param,
+                               1, cost_list, fn_ptr, ref_mv);
 
       // Should we allow a follow-on exhaustive search?
       if (is_exhaustive_allowed(cpi, x)) {
@@ -2434,13 +2430,13 @@
         if (var > exhuastive_thr) {
             int var_ex;
           MV tmp_mv_ex;
-          var_ex = full_pixel_exhaustive(cpi, x, tmp_mv,
+          var_ex = full_pixel_exhaustive(cpi, x, &x->best_mv.as_mv,
                                          error_per_bit, cost_list, fn_ptr,
                                          ref_mv, &tmp_mv_ex);
 
           if (var_ex < var) {
             var = var_ex;
-            *tmp_mv = tmp_mv_ex;
+            x->best_mv.as_mv = tmp_mv_ex;
           }
         }
       }
@@ -2452,7 +2448,7 @@
   }
 
   if (method != NSTEP && rd && var < var_max)
-    var = vp10_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1);
+    var = vp10_get_mvpred_var(x, &x->best_mv.as_mv, ref_mv, fn_ptr, 1);
 
   return var;
 }
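
The integer-pel and sub-pel searches in mcomp.c now share one output
convention: the winner is left in x->best_mv rather than returned through a
trailing MV *best_mv / MV *dst_mv argument (SETUP_SUBPEL_SEARCH and
vp10_find_best_sub_pixel_tree alias it as MV *bestmv = &x->best_mv.as_mv).
A hedged sketch of the caller side, reusing the vp10_hex_search() call from
the mbgraph.c hunk; full_pel_mv is an illustrative local, and the surrounding
setup is omitted:

vp10_hex_search(x, &ref_full, step_param, x->errorperbit, 0,
                cond_cost_list(cpi, cost_list), &v_fn_ptr, 0, ref_mv);
full_pel_mv = x->best_mv.as_mv;  // previously written to the *dst_mv argument
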
diff --git a/vp10/encoder/mcomp.h b/vp10/encoder/mcomp.h
index 704e26c..f97f6c7 100644
--- a/vp10/encoder/mcomp.h
+++ b/vp10/encoder/mcomp.h
@@ -82,31 +82,24 @@
 
 // Perform integral projection based motion estimation.
 unsigned int vp10_int_pro_motion_estimation(const struct VP10_COMP *cpi,
-                                           MACROBLOCK *x,
-                                           BLOCK_SIZE bsize,
-                                           int mi_row, int mi_col);
+                                            MACROBLOCK *x,
+                                            BLOCK_SIZE bsize,
+                                            int mi_row, int mi_col);
 
-typedef int (integer_mv_pattern_search_fn) (
-    const MACROBLOCK *x,
-    MV *ref_mv,
-    int search_param,
-    int error_per_bit,
-    int do_init_search,
-    int *cost_list,
-    const vp10_variance_fn_ptr_t *vf,
-    int use_mvcost,
-    const MV *center_mv,
-    MV *best_mv);
 
-integer_mv_pattern_search_fn vp10_hex_search;
-integer_mv_pattern_search_fn vp10_bigdia_search;
-integer_mv_pattern_search_fn vp10_square_search;
-integer_mv_pattern_search_fn vp10_fast_hex_search;
-integer_mv_pattern_search_fn vp10_fast_dia_search;
+int vp10_hex_search(MACROBLOCK *x,
+                    MV *start_mv,
+                    int search_param,
+                    int sad_per_bit,
+                    int do_init_search,
+                    int *cost_list,
+                    const vp10_variance_fn_ptr_t *vfp,
+                    int use_mvcost,
+                    const MV *center_mv);
 
 typedef int (fractional_mv_step_fp) (
-    const MACROBLOCK *x,
-    MV *bestmv, const MV *ref_mv,
+    MACROBLOCK *x,
+    const MV *ref_mv,
     int allow_hp,
     int error_per_bit,
     const vp10_variance_fn_ptr_t *vfp,
@@ -124,39 +117,32 @@
 extern fractional_mv_step_fp vp10_find_best_sub_pixel_tree_pruned_evenmore;
 
 typedef int (*vp10_full_search_fn_t)(const MACROBLOCK *x,
-                                    const MV *ref_mv, int sad_per_bit,
-                                    int distance,
-                                    const vp10_variance_fn_ptr_t *fn_ptr,
-                                    const MV *center_mv, MV *best_mv);
+                                     const MV *ref_mv, int sad_per_bit,
+                                     int distance,
+                                     const vp10_variance_fn_ptr_t *fn_ptr,
+                                     const MV *center_mv, MV *best_mv);
 
-typedef int (*vp10_refining_search_fn_t)(const MACROBLOCK *x,
-                                        MV *ref_mv, int sad_per_bit,
-                                        int distance,
+typedef int (*vp10_diamond_search_fn_t)(const MACROBLOCK *x,
+                                        const search_site_config *cfg,
+                                        MV *ref_mv, MV *best_mv,
+                                        int search_param, int sad_per_bit,
+                                        int *num00,
                                         const vp10_variance_fn_ptr_t *fn_ptr,
                                         const MV *center_mv);
 
-typedef int (*vp10_diamond_search_fn_t)(const MACROBLOCK *x,
-                                       const search_site_config *cfg,
-                                       MV *ref_mv, MV *best_mv,
-                                       int search_param, int sad_per_bit,
-                                       int *num00,
-                                       const vp10_variance_fn_ptr_t *fn_ptr,
-                                       const MV *center_mv);
-
-int vp10_refining_search_8p_c(const MACROBLOCK *x,
-                             MV *ref_mv, int error_per_bit,
-                             int search_range,
-                             const vp10_variance_fn_ptr_t *fn_ptr,
-                             const MV *center_mv, const uint8_t *second_pred);
+int vp10_refining_search_8p_c(MACROBLOCK *x,
+                              int error_per_bit,
+                              int search_range,
+                              const vp10_variance_fn_ptr_t *fn_ptr,
+                              const MV *center_mv, const uint8_t *second_pred);
 
 struct VP10_COMP;
 
 int vp10_full_pixel_search(struct VP10_COMP *cpi, MACROBLOCK *x,
-                          BLOCK_SIZE bsize, MV *mvp_full,
-                          int step_param, int error_per_bit,
-                          int *cost_list,
-                          const MV *ref_mv, MV *tmp_mv,
-                          int var_max, int rd);
+                           BLOCK_SIZE bsize, MV *mvp_full,
+                           int step_param, int error_per_bit,
+                           int *cost_list, const MV *ref_mv,
+                           int var_max, int rd);
 
 #if CONFIG_EXT_INTER
 int vp10_find_best_masked_sub_pixel_tree(const MACROBLOCK *x,
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index b96e6e4..45c10cd 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -15,6 +15,7 @@
 #include "./vpx_dsp_rtcd.h"
 
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/system_state.h"
@@ -4753,7 +4754,7 @@
     struct buf_2d ref_yv12[2];
     int bestsme = INT_MAX;
     int sadpb = x->sadperbit16;
-    MV tmp_mv;
+    MV *const best_mv = &x->best_mv.as_mv;
     int search_range = 3;
 
     int tmp_col_min = x->mv_col_min;
@@ -4814,23 +4815,22 @@
     vp10_set_mv_search_range(x, &ref_mv[id].as_mv);
 
     // Use the mv result from the single mode as mv predictor.
-    tmp_mv = frame_mv[refs[id]].as_mv;
+    *best_mv = frame_mv[refs[id]].as_mv;
 
-    tmp_mv.col >>= 3;
-    tmp_mv.row >>= 3;
+    best_mv->col >>= 3;
+    best_mv->row >>= 3;
 
 #if CONFIG_REF_MV
     vp10_set_mvcost(x, refs[id]);
 #endif
 
     // Small-range full-pixel motion search.
-    bestsme = vp10_refining_search_8p_c(x, &tmp_mv, sadpb,
-                                       search_range,
-                                       &cpi->fn_ptr[bsize],
-                                       &ref_mv[id].as_mv, second_pred);
+    bestsme = vp10_refining_search_8p_c(x, sadpb, search_range,
+                                        &cpi->fn_ptr[bsize],
+                                        &ref_mv[id].as_mv, second_pred);
     if (bestsme < INT_MAX)
-      bestsme = vp10_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv,
-                                      second_pred, &cpi->fn_ptr[bsize], 1);
+      bestsme = vp10_get_mvpred_av_var(x, best_mv, &ref_mv[id].as_mv,
+                                       second_pred, &cpi->fn_ptr[bsize], 1);
 
     x->mv_col_min = tmp_col_min;
     x->mv_col_max = tmp_col_max;
@@ -4859,8 +4859,7 @@
               pd->pre[0].stride)) << 3];
 
         bestsme = cpi->find_fractional_mv_step(
-            x, &tmp_mv,
-            &ref_mv[id].as_mv,
+            x, &ref_mv[id].as_mv,
             cpi->common.allow_high_precision_mv,
             x->errorperbit,
             &cpi->fn_ptr[bsize],
@@ -4875,8 +4874,7 @@
       } else {
         (void) block;
         bestsme = cpi->find_fractional_mv_step(
-            x, &tmp_mv,
-            &ref_mv[id].as_mv,
+            x, &ref_mv[id].as_mv,
             cpi->common.allow_high_precision_mv,
             x->errorperbit,
             &cpi->fn_ptr[bsize],
@@ -4893,7 +4891,7 @@
       xd->plane[0].pre[0] = ref_yv12[0];
 
     if (bestsme < last_besterr[id]) {
-      frame_mv[refs[id]].as_mv = tmp_mv;
+      frame_mv[refs[id]].as_mv = *best_mv;
       last_besterr[id] = bestsme;
     } else {
       break;
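
For the compound (joint) search, vp10_refining_search_8p_c() no longer takes
an MV * in/out argument; it refines x->best_mv in place. A condensed sketch of
the new flow, taken from the rdopt.c hunks above (bounds and reference-buffer
setup omitted):

MV *const best_mv = &x->best_mv.as_mv;
*best_mv = frame_mv[refs[id]].as_mv;  // seed with the single-ref result
best_mv->col >>= 3;                   // 1/8-pel -> full-pel units
best_mv->row >>= 3;
bestsme = vp10_refining_search_8p_c(x, sadpb, search_range,
                                    &cpi->fn_ptr[bsize],
                                    &ref_mv[id].as_mv, second_pred);
if (bestsme < INT_MAX)
  bestsme = vp10_get_mvpred_av_var(x, best_mv, &ref_mv[id].as_mv,
                                   second_pred, &cpi->fn_ptr[bsize], 1);
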
@@ -5196,11 +5194,6 @@
                 run_mv_search)
 #endif  // CONFIG_EXT_INTER
             ) {
-#if CONFIG_EXT_INTER
-          MV *const new_mv = &mode_mv[this_mode][0].as_mv;
-#else
-          MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
-#endif  // CONFIG_EXT_INTER
           int step_param = 0;
           int bestsme = INT_MAX;
           int sadpb = x->sadperbit4;
@@ -5268,8 +5261,7 @@
           bestsme = vp10_full_pixel_search(
               cpi, x, bsize, &mvp_full, step_param, sadpb,
               cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL,
-              &bsi->ref_mv[0]->as_mv, new_mv,
-              INT_MAX, 1);
+              &bsi->ref_mv[0]->as_mv, INT_MAX, 1);
 
           if (bestsme < INT_MAX) {
             int distortion;
@@ -5294,9 +5286,7 @@
                   pd->pre[0].stride)) << 3];
 
               cpi->find_fractional_mv_step(
-                  x,
-                  new_mv,
-                  &bsi->ref_mv[0]->as_mv,
+                  x, &bsi->ref_mv[0]->as_mv,
                   cm->allow_high_precision_mv,
                   x->errorperbit, &cpi->fn_ptr[bsize],
                   cpi->sf.mv.subpel_force_stop,
@@ -5311,9 +5301,7 @@
               pd->pre[0] = backup_pred;
             } else {
               cpi->find_fractional_mv_step(
-                  x,
-                  new_mv,
-                  &bsi->ref_mv[0]->as_mv,
+                  x, &bsi->ref_mv[0]->as_mv,
                   cm->allow_high_precision_mv,
                   x->errorperbit, &cpi->fn_ptr[bsize],
                   cpi->sf.mv.subpel_force_stop,
@@ -5327,14 +5315,20 @@
 
             // save motion search result for use in compound prediction
 #if CONFIG_EXT_INTER
-            seg_mvs[i][mv_idx][mbmi->ref_frame[0]].as_mv = *new_mv;
+            seg_mvs[i][mv_idx][mbmi->ref_frame[0]].as_mv = x->best_mv.as_mv;
 #else
-            seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv;
+            seg_mvs[i][mbmi->ref_frame[0]].as_mv = x->best_mv.as_mv;
 #endif  // CONFIG_EXT_INTER
           }
 
           if (cpi->sf.adaptive_motion_search)
-            x->pred_mv[mbmi->ref_frame[0]] = *new_mv;
+            x->pred_mv[mbmi->ref_frame[0]] = x->best_mv.as_mv;
+
+#if CONFIG_EXT_INTER
+          mode_mv[this_mode][0] = x->best_mv;
+#else
+          mode_mv[NEWMV][0] = x->best_mv;
+#endif  // CONFIG_EXT_INTER
 
           // restore src pointers
           mi_buf_restore(x, orig_src, orig_pre);
@@ -5903,7 +5897,7 @@
                                  int ref_idx,
                                  int mv_idx,
 #endif  // CONFIG_EXT_INTER
-                                 int_mv *tmp_mv, int *rate_mv) {
+                                 int *rate_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
   const VP10_COMMON *cm = &cpi->common;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
@@ -5985,7 +5979,7 @@
         if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
           x->pred_mv[ref].row = 0;
           x->pred_mv[ref].col = 0;
-          tmp_mv->as_int = INVALID_MV;
+          x->best_mv.as_int = INVALID_MV;
 
           if (scaled_ref_frame) {
             int i;
@@ -6005,7 +5999,7 @@
 
   bestsme = vp10_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
                                    cond_cost_list(cpi, cost_list),
-                                   &ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
+                                   &ref_mv, INT_MAX, 1);
 
   x->mv_col_min = tmp_col_min;
   x->mv_col_max = tmp_col_max;
@@ -6027,7 +6021,7 @@
                        upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
                        NULL, pd->subsampling_x, pd->subsampling_y);
 
-      bestsme = cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
+      bestsme = cpi->find_fractional_mv_step(x, &ref_mv,
                                              cm->allow_high_precision_mv,
                                              x->errorperbit,
                                              &cpi->fn_ptr[bsize],
@@ -6041,7 +6035,7 @@
       // Restore the reference frames.
       pd->pre[ref_idx] = backup_pred;
     } else {
-      cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
+      cpi->find_fractional_mv_step(x, &ref_mv,
                                    cm->allow_high_precision_mv,
                                    x->errorperbit,
                                    &cpi->fn_ptr[bsize],
@@ -6052,11 +6046,11 @@
                                    &dis, &x->pred_sse[ref], NULL, 0, 0, 0);
     }
   }
-  *rate_mv = vp10_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
+  *rate_mv = vp10_mv_bit_cost(&x->best_mv.as_mv, &ref_mv,
                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
 
   if (cpi->sf.adaptive_motion_search)
-    x->pred_mv[ref] = tmp_mv->as_mv;
+    x->pred_mv[ref] = x->best_mv.as_mv;
 
   if (scaled_ref_frame) {
     int i;
@@ -6990,34 +6984,32 @@
       }
 #endif  // CONFIG_EXT_INTER
     } else {
-      int_mv tmp_mv;
-
 #if CONFIG_EXT_INTER
       if (is_comp_interintra_pred) {
-        tmp_mv = single_newmvs[mv_idx][refs[0]];
+        x->best_mv = single_newmvs[mv_idx][refs[0]];
         rate_mv = single_newmvs_rate[mv_idx][refs[0]];
       } else {
         single_motion_search(cpi, x, bsize, mi_row, mi_col,
-                             0, mv_idx, &tmp_mv, &rate_mv);
-        single_newmvs[mv_idx][refs[0]] = tmp_mv;
+                             0, mv_idx, &rate_mv);
+        single_newmvs[mv_idx][refs[0]] = x->best_mv;
         single_newmvs_rate[mv_idx][refs[0]] = rate_mv;
       }
 #else
-      single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv);
-      single_newmv[refs[0]] = tmp_mv;
+      single_motion_search(cpi, x, bsize, mi_row, mi_col, &rate_mv);
+      single_newmv[refs[0]] = x->best_mv;
 #endif  // CONFIG_EXT_INTER
 
-      if (tmp_mv.as_int == INVALID_MV)
+      if (x->best_mv.as_int == INVALID_MV)
         return INT64_MAX;
 
-      frame_mv[refs[0]] = tmp_mv;
-      xd->mi[0]->bmi[0].as_mv[0] = tmp_mv;
+      frame_mv[refs[0]] = x->best_mv;
+      xd->mi[0]->bmi[0].as_mv[0] = x->best_mv;
 
       // Estimate the rate implications of a new mv but discount this
       // under certain circumstances where we want to help initiate a weak
       // motion field, where the distortion gain for a single block may not
       // be enough to overcome the cost of a new mv.
-      if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) {
+      if (discount_newmv_test(cpi, this_mode, x->best_mv, mode_mv, refs[0])) {
         rate_mv = VPXMAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
       }
     }
@@ -7780,8 +7772,7 @@
         vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
 #endif  // CONFIG_EXT_INTER
       }
-      vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, 0,
-                                       NULL, NULL,
+      vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col,
                                        dst_buf1, dst_stride1,
                                        dst_buf2, dst_stride2);
       model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1,
@@ -8399,6 +8390,18 @@
 }
 #endif  // CONFIG_EXT_INTRA
 
+#if CONFIG_OBMC
+static void calc_target_weighted_pred(
+    const VP10_COMMON *cm,
+    const MACROBLOCK *x,
+    const MACROBLOCKD *xd,
+    int mi_row, int mi_col,
+    const uint8_t *above, int above_stride,
+    const uint8_t *left, int left_stride,
+    int32_t *mask_buf,
+    int32_t *wsrc_buf);
+#endif  // CONFIG_OBMC
+
 void vp10_rd_pick_inter_mode_sb(VP10_COMP *cpi,
                                 TileDataEnc *tile_data,
                                 MACROBLOCK *x,
@@ -9579,7 +9582,7 @@
       vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
 #if CONFIG_OBMC
       if (mbmi->motion_variation == OBMC_CAUSAL)
-        vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, 0, NULL, NULL,
+        vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col,
                                          dst_buf1, dst_stride1,
                                          dst_buf2, dst_stride2);
 #endif  // CONFIG_OBMC
@@ -10980,189 +10983,225 @@
 }
 
 #if CONFIG_OBMC
-void calc_target_weighted_pred(VP10_COMMON *cm,
-                               MACROBLOCK *x,
-                               MACROBLOCKD *xd,
-                               int mi_row, int mi_col,
-                               uint8_t *above_buf, int above_stride,
-                               uint8_t *left_buf,  int left_stride,
-                               int32_t *mask_buf,
-                               int32_t *weighted_src_buf) {
-  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-  int row, col, i, mi_step;
-  int bw = 8 * xd->n8_w;
-  int bh = 8 * xd->n8_h;
+// This function has a structure similar to vp10_build_obmc_inter_prediction
+//
+// The OBMC predictor is computed as:
+//
+//  PObmc(x,y) =
+//    VPX_BLEND_A64(Mh(x),
+//                  VPX_BLEND_A64(Mv(y), P(x,y), PAbove(x,y)),
+//                  PLeft(x, y))
+//
+// Scaling up by VPX_BLEND_A64_MAX_ALPHA ** 2 and omitting the intermediate
+// rounding, this can be written as:
+//
+//  VPX_BLEND_A64_MAX_ALPHA * VPX_BLEND_A64_MAX_ALPHA * PObmc(x,y) =
+//    Mh(x) * Mv(y) * P(x,y) +
+//      Mh(x) * Cv(y) * PAbove(x,y) +
+//      VPX_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+// Where:
+//
+//  Cv(y) = VPX_BLEND_A64_MAX_ALPHA - Mv(y)
+//  Ch(x) = VPX_BLEND_A64_MAX_ALPHA - Mh(x)
+//
+// This function computes 'wsrc' and 'mask' as:
+//
+//  wsrc(x, y) =
+//    VPX_BLEND_A64_MAX_ALPHA * VPX_BLEND_A64_MAX_ALPHA * src(x, y) -
+//      Mh(x) * Cv(y) * PAbove(x,y) -
+//      VPX_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+//  mask(x, y) = Mh(x) * Mv(y)
+//
+// These can then be used to efficiently approximate the error for any
+// predictor P in the context of the provided neighbouring predictors by
+// computing:
+//
+//  error(x, y) =
+//    (wsrc(x, y) - mask(x, y) * P(x, y)) / (VPX_BLEND_A64_MAX_ALPHA ** 2)
+//
+static void calc_target_weighted_pred(
+    const VP10_COMMON *cm,
+    const MACROBLOCK *x,
+    const MACROBLOCKD *xd,
+    int mi_row, int mi_col,
+    const uint8_t *above, int above_stride,
+    const uint8_t *left,  int left_stride,
+    int32_t *mask_buf,
+    int32_t *wsrc_buf) {
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  int row, col, i;
+  const int bw = 8 * xd->n8_w;
+  const int bh = 8 * xd->n8_h;
+  const int wsrc_stride = bw;
   const int mask_stride = bw;
-  const int weighted_src_stride = bw;
-  int32_t *dst = weighted_src_buf;
-  int32_t *mask2d = mask_buf;
-  uint8_t *src;
+  const int src_scale = VPX_BLEND_A64_MAX_ALPHA * VPX_BLEND_A64_MAX_ALPHA;
 #if CONFIG_VP9_HIGHBITDEPTH
-  int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#else
+  const int is_hbd = 0;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-  for (row = 0; row < bh; ++row) {
-    for (col = 0; col < bw; ++col) {
-      dst[col] = 0;
-      mask2d[col] = 64;
-    }
-    dst += weighted_src_stride;
-    mask2d += mask_stride;
-  }
+
+  // plane 0 should not be subsampled
+  assert(xd->plane[0].subsampling_x == 0);
+  assert(xd->plane[0].subsampling_y == 0);
+
+  vp10_zero_array(wsrc_buf, bw * bh);
+  for (i = 0; i < bw * bh; ++i)
+    mask_buf[i] = VPX_BLEND_A64_MAX_ALPHA;
 
   // handle above row
   if (xd->up_available) {
-    for (i = 0; i < VPXMIN(xd->n8_w, cm->mi_cols - mi_col); i += mi_step) {
-      int mi_row_offset = -1;
-      int mi_col_offset = i;
-      MODE_INFO *above_mi = xd->mi[mi_col_offset +
-                                   mi_row_offset * xd->mi_stride];
-      MB_MODE_INFO *above_mbmi = &above_mi->mbmi;
-      int overlap = num_4x4_blocks_high_lookup[bsize] << 1;
+    const int overlap = num_4x4_blocks_high_lookup[bsize] * 2;
+    const int miw = VPXMIN(xd->n8_w, cm->mi_cols - mi_col);
+    const int mi_row_offset = -1;
+    const uint8_t *const mask1d = vp10_get_obmc_mask(overlap);
 
-      mi_step = VPXMIN(xd->n8_w,
-                       num_8x8_blocks_wide_lookup[above_mbmi->sb_type]);
+    assert(miw > 0);
+
+    i = 0;
+    do {  // for each mi in the above row
+      const int mi_col_offset = i;
+      const MB_MODE_INFO *const above_mbmi =
+          &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
+      const int mi_step =
+          VPXMIN(xd->n8_w, num_8x8_blocks_wide_lookup[above_mbmi->sb_type]);
+      const int neighbor_bw = mi_step * MI_SIZE;
 
       if (is_neighbor_overlappable(above_mbmi)) {
-        const struct macroblockd_plane *pd = &xd->plane[0];
-        int bw = (mi_step * MI_SIZE) >> pd->subsampling_x;
-        int bh = overlap >> pd->subsampling_y;
-        int dst_stride = weighted_src_stride;
-        int32_t *dst = weighted_src_buf + (i * MI_SIZE >> pd->subsampling_x);
-        int tmp_stride = above_stride;
-        uint8_t *tmp = above_buf + (i * MI_SIZE >> pd->subsampling_x);
-        int mask2d_stride = mask_stride;
-        int32_t *mask2d = mask_buf + (i * MI_SIZE >> pd->subsampling_x);
-        const uint8_t *mask1d[2];
+        const int tmp_stride = above_stride;
+        int32_t *wsrc = wsrc_buf + (i * MI_SIZE);
+        int32_t *mask = mask_buf + (i * MI_SIZE);
 
-        setup_obmc_mask(bh, mask1d);
+        if (!is_hbd) {
+          const uint8_t *tmp = above;
 
-#if CONFIG_VP9_HIGHBITDEPTH
-        if (is_hbd) {
-          uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
-
-          for (row = 0; row < bh; ++row) {
-            for (col = 0; col < bw; ++col) {
-              dst[col] = mask1d[1][row] * tmp16[col];
-              mask2d[col] = mask1d[0][row];
+          for (row = 0; row < overlap; ++row) {
+            const uint8_t m0 = mask1d[row];
+            const uint8_t m1 = VPX_BLEND_A64_MAX_ALPHA - m0;
+            for (col = 0; col < neighbor_bw; ++col) {
+              wsrc[col] = m1 * tmp[col];
+              mask[col] = m0;
             }
-            dst += dst_stride;
-            tmp16 += tmp_stride;
-            mask2d += mask2d_stride;
+            wsrc += wsrc_stride;
+            mask += mask_stride;
+            tmp += tmp_stride;
           }
-        } else {
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-        for (row = 0; row < bh; ++row) {
-          for (col = 0; col < bw; ++col) {
-            dst[col] = mask1d[1][row] * tmp[col];
-            mask2d[col] = mask1d[0][row];
-          }
-          dst += dst_stride;
-          tmp += tmp_stride;
-          mask2d += mask2d_stride;
-        }
 #if CONFIG_VP9_HIGHBITDEPTH
-        }
+        } else {
+          const uint16_t *tmp = CONVERT_TO_SHORTPTR(above);
+
+          for (row = 0; row < overlap; ++row) {
+            const uint8_t m0 = mask1d[row];
+            const uint8_t m1 = VPX_BLEND_A64_MAX_ALPHA - m0;
+            for (col = 0; col < neighbor_bw; ++col) {
+              wsrc[col] = m1 * tmp[col];
+              mask[col] = m0;
+            }
+            wsrc += wsrc_stride;
+            mask += mask_stride;
+            tmp += tmp_stride;
+          }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
       }
-    }  // each mi in the above row
+
+      above += neighbor_bw;
+      i += mi_step;
+    } while (i < miw);
+  }
+
+  for (i = 0; i < bw * bh; ++i) {
+    wsrc_buf[i] *= VPX_BLEND_A64_MAX_ALPHA;
+    mask_buf[i] *= VPX_BLEND_A64_MAX_ALPHA;
   }
 
   // handle left column
-  dst = weighted_src_buf;
-  mask2d = mask_buf;
-  for (row = 0; row < bh; ++row) {
-    for (col = 0; col < bw; ++col) {
-      dst[col] = dst[col] << 6;
-      mask2d[col] = mask2d[col] << 6;
-    }
-    dst += weighted_src_stride;
-    mask2d += mask_stride;
-  }
-
   if (xd->left_available) {
-    for (i = 0; i < VPXMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
-      int mi_row_offset = i;
-      int mi_col_offset = -1;
-      int overlap = num_4x4_blocks_wide_lookup[bsize] << 1;
-      MODE_INFO *left_mi = xd->mi[mi_col_offset +
-                                  mi_row_offset * xd->mi_stride];
-      MB_MODE_INFO *left_mbmi = &left_mi->mbmi;
+    const int overlap = num_4x4_blocks_wide_lookup[bsize] * 2;
+    const int mih = VPXMIN(xd->n8_h, cm->mi_rows - mi_row);
+    const int mi_col_offset = -1;
+    const uint8_t *const mask1d = vp10_get_obmc_mask(overlap);
 
-      mi_step = VPXMIN(xd->n8_h,
-                       num_8x8_blocks_high_lookup[left_mbmi->sb_type]);
+    assert(mih > 0);
+
+    i = 0;
+    do {  // for each mi in the left column
+      const int mi_row_offset = i;
+      const MB_MODE_INFO *const left_mbmi =
+          &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
+      const int mi_step =
+          VPXMIN(xd->n8_h, num_8x8_blocks_high_lookup[left_mbmi->sb_type]);
+      const int neighbor_bh = mi_step * MI_SIZE;
 
       if (is_neighbor_overlappable(left_mbmi)) {
-        const struct macroblockd_plane *pd = &xd->plane[0];
-        int bw = overlap >> pd->subsampling_x;
-        int bh = (mi_step * MI_SIZE) >> pd->subsampling_y;
-        int dst_stride = weighted_src_stride;
-        int32_t *dst = weighted_src_buf +
-                   (i * MI_SIZE * dst_stride >> pd->subsampling_y);
-        int tmp_stride = left_stride;
-        uint8_t *tmp = left_buf +
-                       (i * MI_SIZE * tmp_stride >> pd->subsampling_y);
-        int mask2d_stride = mask_stride;
-        int32_t *mask2d = mask_buf +
-                          (i * MI_SIZE * mask2d_stride >> pd->subsampling_y);
-        const uint8_t *mask1d[2];
+        const int tmp_stride = left_stride;
+        int32_t *wsrc = wsrc_buf + (i * MI_SIZE * wsrc_stride);
+        int32_t *mask = mask_buf + (i * MI_SIZE * mask_stride);
 
-        setup_obmc_mask(bw, mask1d);
+        if (!is_hbd) {
+          const uint8_t *tmp = left;
 
-#if CONFIG_VP9_HIGHBITDEPTH
-        if (is_hbd) {
-          uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
-
-          for (row = 0; row < bh; ++row) {
-            for (col = 0; col < bw; ++col) {
-              dst[col] = (dst[col] >> 6) * mask1d[0][col] +
-                         (tmp16[col] << 6) * mask1d[1][col];
-              mask2d[col] = (mask2d[col] >> 6) * mask1d[0][col];
+          for (row = 0; row < neighbor_bh; ++row) {
+            for (col = 0; col < overlap; ++col) {
+              const uint8_t m0 = mask1d[col];
+              const uint8_t m1 = VPX_BLEND_A64_MAX_ALPHA - m0;
+              wsrc[col] = (wsrc[col] >> VPX_BLEND_A64_ROUND_BITS) * m0 +
+                          (tmp[col] << VPX_BLEND_A64_ROUND_BITS) * m1;
+              mask[col] = (mask[col] >> VPX_BLEND_A64_ROUND_BITS) * m0;
             }
-            dst += dst_stride;
-            tmp16 += tmp_stride;
-            mask2d += mask2d_stride;
+            wsrc += wsrc_stride;
+            mask += mask_stride;
+            tmp += tmp_stride;
           }
-        } else {
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-        for (row = 0; row < bh; ++row) {
-          for (col = 0; col < bw; ++col) {
-            dst[col] = (dst[col] >> 6) * mask1d[0][col] +
-                       (tmp[col] << 6) * mask1d[1][col];
-            mask2d[col] = (mask2d[col] >> 6) * mask1d[0][col];
-          }
-          dst += dst_stride;
-          tmp += tmp_stride;
-          mask2d += mask2d_stride;
-        }
 #if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          const uint16_t *tmp = CONVERT_TO_SHORTPTR(left);
+
+          for (row = 0; row < neighbor_bh; ++row) {
+            for (col = 0; col < overlap; ++col) {
+              const uint8_t m0 = mask1d[col];
+              const uint8_t m1 = VPX_BLEND_A64_MAX_ALPHA - m0;
+              wsrc[col] = (wsrc[col] >> VPX_BLEND_A64_ROUND_BITS) * m0 +
+                          (tmp[col] << VPX_BLEND_A64_ROUND_BITS) * m1;
+              mask[col] = (mask[col] >> VPX_BLEND_A64_ROUND_BITS) * m0;
+            }
+            wsrc += wsrc_stride;
+            mask += mask_stride;
+            tmp += tmp_stride;
+          }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
         }
-#endif  //  CONFIG_VP9_HIGHBITDEPTH
       }
-    }  // each mi in the left column
+
+      left += neighbor_bh * left_stride;
+      i += mi_step;
+    } while (i < mih);
   }
 
-  dst = weighted_src_buf;
-  src = x->plane[0].src.buf;
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (is_hbd) {
-    uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+  if (!is_hbd) {
+    const uint8_t *src = x->plane[0].src.buf;
 
     for (row = 0; row < bh; ++row) {
-      for (col = 0; col < bw; ++col)
-        dst[col] = (src16[col] << 12) - dst[col];
-      dst += weighted_src_stride;
-      src16 += x->plane[0].src.stride;
+      for (col = 0; col < bw; ++col) {
+        wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+      }
+      wsrc_buf += wsrc_stride;
+      src += x->plane[0].src.stride;
     }
-  } else {
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-  for (row = 0; row < bh; ++row) {
-    for (col = 0; col < bw; ++col)
-      dst[col] = (src[col] << 12) - dst[col];
-    dst += weighted_src_stride;
-    src += x->plane[0].src.stride;
-  }
 #if CONFIG_VP9_HIGHBITDEPTH
-  }
+  } else {
+    const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
+
+    for (row = 0; row < bh; ++row) {
+      for (col = 0; col < bw; ++col) {
+        wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+      }
+      wsrc_buf += wsrc_stride;
+      src += x->plane[0].src.stride;
+    }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
 }
 #endif  // CONFIG_OBMC
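For reference, here is a minimal sketch (not part of this patch) of how a consumer
of the 'wsrc'/'mask' buffers produced by calc_target_weighted_pred() could
approximate the OBMC prediction error, following the error(x, y) formula in the
comment above. The function name is illustrative, and 4096 stands in for
VPX_BLEND_A64_MAX_ALPHA squared:

#include <stdint.h>
#include <stdlib.h>

/* Illustrative only: sum of absolute OBMC prediction error for a candidate
 * predictor 'pre', given 'wsrc' and 'mask' laid out with stride == w, as in
 * calc_target_weighted_pred().  Per the comment above,
 *   error(x, y) = (wsrc(x, y) - mask(x, y) * pre(x, y)) / (64 * 64).  */
static unsigned int obmc_sad_sketch(const uint8_t *pre, int pre_stride,
                                    const int32_t *wsrc, const int32_t *mask,
                                    int w, int h) {
  unsigned int sad = 0;
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      const int32_t diff = wsrc[x] - mask[x] * pre[x];
      sad += (abs(diff) + 2048) >> 12;  /* round, then divide by 64 * 64 */
    }
    pre += pre_stride;
    wsrc += w;
    mask += w;
  }
  return sad;
}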
diff --git a/vp10/encoder/rdopt.h b/vp10/encoder/rdopt.h
index b660e23..da70a22 100644
--- a/vp10/encoder/rdopt.h
+++ b/vp10/encoder/rdopt.h
@@ -90,16 +90,6 @@
                                    int use_fast_coef_casting);
 #endif  // CONFIG_SUPERTX
 
-#if CONFIG_OBMC
-void calc_target_weighted_pred(VP10_COMMON *cm,
-                               MACROBLOCK *x,
-                               MACROBLOCKD *xd,
-                               int mi_row, int mi_col,
-                               uint8_t *above_buf, int above_stride,
-                               uint8_t *left_buf, int left_stride,
-                               int32_t *mask_buf, int32_t *weighted_src_buf);
-#endif  // CONFIG_OBMC
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
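The thread running through the rdopt.c hunks above is a calling-convention change:
the full-pixel and sub-pixel search helpers no longer take an MV output parameter,
they leave the winner in x->best_mv, and each caller copies it out afterwards
(into frame_mv[], seg_mvs[], single_newmvs[], x->pred_mv[], and so on). A minimal
standalone sketch of that pattern, using hypothetical stand-in names and types
rather than the real MACROBLOCK and search routines:

#include <limits.h>
#include <stdint.h>

/* Stand-in types for the sketch; the real code uses MV / int_mv inside
 * MACROBLOCK (x->best_mv). */
typedef struct { int16_t row, col; } sketch_mv;
typedef struct { sketch_mv best_mv; } sketch_ctx;

/* A search routine in the new style: the cost comes back through the return
 * value, while the winning vector is left in ctx->best_mv instead of being
 * written through an output parameter. */
static int sketch_search(sketch_ctx *ctx, sketch_mv start) {
  ctx->best_mv = start;  /* placeholder: a real search would refine this */
  return 0;              /* placeholder cost */
}

/* Caller-side copy-out, mirroring e.g. "frame_mv[refs[id]].as_mv = *best_mv"
 * and "single_newmvs[mv_idx][refs[0]] = x->best_mv" in the hunks above. */
static void sketch_caller(sketch_ctx *ctx, sketch_mv pred, sketch_mv *frame_mv) {
  const int cost = sketch_search(ctx, pred);
  if (cost < INT_MAX)
    *frame_mv = ctx->best_mv;
}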
diff --git a/vp10/encoder/temporal_filter.c b/vp10/encoder/temporal_filter.c
index 39be057..d125dae 100644
--- a/vp10/encoder/temporal_filter.c
+++ b/vp10/encoder/temporal_filter.c
@@ -288,7 +288,6 @@
 
   MV best_ref_mv1 = {0, 0};
   MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
-  MV *ref_mv = &x->e_mbd.mi[0]->bmi[0].as_mv[0].as_mv;
 
   // Save input state
   struct buf_2d src = x->plane[0].src;
@@ -315,12 +314,11 @@
 
   // Ignore mv costing by sending NULL pointer instead of cost arrays
   vp10_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
-                 cond_cost_list(cpi, cost_list),
-                 &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1, ref_mv);
+                  cond_cost_list(cpi, cost_list),
+                  &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1);
 
   // Ignore mv costing by sending NULL pointer instead of cost array
-  bestsme = cpi->find_fractional_mv_step(x, ref_mv,
-                                         &best_ref_mv1,
+  bestsme = cpi->find_fractional_mv_step(x, &best_ref_mv1,
                                          cpi->common.allow_high_precision_mv,
                                          x->errorperbit,
                                          &cpi->fn_ptr[BLOCK_16X16],
@@ -329,6 +327,8 @@
                                          NULL, NULL,
                                          &distortion, &sse, NULL, 0, 0, 0);
 
+  x->e_mbd.mi[0]->bmi[0].as_mv[0] = x->best_mv;
+
   // Restore input state
   x->plane[0].src = src;
   xd->plane[0].pre[0] = pre;