Merge "Add single motion search for OBMC predictor" into nextgenv2
diff --git a/vp10/common/reconinter.h b/vp10/common/reconinter.h
index 86959b0..1e8679b 100644
--- a/vp10/common/reconinter.h
+++ b/vp10/common/reconinter.h
@@ -487,6 +487,7 @@
 #endif  // CONFIG_EXT_INTERP
 
 #if CONFIG_OBMC
+void setup_obmc_mask(int length, const uint8_t *mask[2]);
 void vp10_build_obmc_inter_prediction(VP10_COMMON *cm,
                                       MACROBLOCKD *xd, int mi_row, int mi_col,
                                       int use_tmp_dst_buf,
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index fc55133..43b5401 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -1219,6 +1219,49 @@
 MAKE_MBFP_SAD_WRAPPER(vpx_highbd_masked_sad4x4)
 #endif  // CONFIG_EXT_INTER
 
+#if CONFIG_OBMC
+/* Installs the OBMC (overlapped block motion compensation) distortion
+ * function pointers for block size BT into cpi->fn_ptr:
+ *   osdf = OBMC SAD, ovf = OBMC variance, osvf = OBMC sub-pel variance. */
+#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF)                                   \
+  cpi->fn_ptr[BT].osdf            = OSDF;                                  \
+  cpi->fn_ptr[BT].ovf             = OVF;                                   \
+  cpi->fn_ptr[BT].osvf            = OSVF;
+
+/* Generates three bit-depth wrappers around a high-bitdepth OBMC SAD
+ * function: _bits8 passes the value through, while _bits10 / _bits12 shift
+ * the SAD down by 2 / 4 bits so results are comparable across bit depths. */
+#define MAKE_OBFP_SAD_WRAPPER(fnname)                                      \
+static unsigned int fnname##_bits8(const uint8_t *ref, int ref_stride,     \
+                                   const int *wsrc, int wsrc_stride,       \
+                                   const int *msk, int msk_stride) {       \
+  return fnname(ref, ref_stride, wsrc, wsrc_stride, msk, msk_stride);      \
+}                                                                          \
+static unsigned int fnname##_bits10(const uint8_t *ref, int ref_stride,    \
+                                    const int *wsrc, int wsrc_stride,      \
+                                    const int *msk, int msk_stride) {      \
+  return fnname(ref, ref_stride, wsrc, wsrc_stride, msk, msk_stride) >> 2; \
+}                                                                          \
+static unsigned int fnname##_bits12(const uint8_t *ref, int ref_stride,    \
+                                    const int *wsrc, int wsrc_stride,      \
+                                    const int *msk, int msk_stride) {      \
+  return fnname(ref, ref_stride, wsrc, wsrc_stride, msk, msk_stride) >> 4; \
+}
+
+/* Instantiate the wrappers for every supported block size. */
+#if CONFIG_EXT_PARTITION
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad128x128)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad128x64)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad64x128)
+#endif  // CONFIG_EXT_PARTITION
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad64x64)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad64x32)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad32x64)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad32x32)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad32x16)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad16x32)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad16x16)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad16x8)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad8x16)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad8x8)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad8x4)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad4x8)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad4x4)
+#endif  // CONFIG_OBMC
+
 static void  highbd_set_var_fns(VP10_COMP *const cpi) {
   VP10_COMMON *const cm = &cpi->common;
   if (cm->use_highbitdepth) {
@@ -1454,6 +1497,74 @@
                     vpx_highbd_masked_variance4x4,
                     vpx_highbd_masked_sub_pixel_variance4x4)
 #endif  // CONFIG_EXT_INTER
+#if CONFIG_OBMC
+#if CONFIG_EXT_PARTITION
+        HIGHBD_OBFP(BLOCK_128X128,
+                    vpx_highbd_obmc_sad128x128_bits8,
+                    vpx_highbd_obmc_variance128x128,
+                    vpx_highbd_obmc_sub_pixel_variance128x128)
+        HIGHBD_OBFP(BLOCK_128X64,
+                    vpx_highbd_obmc_sad128x64_bits8,
+                    vpx_highbd_obmc_variance128x64,
+                    vpx_highbd_obmc_sub_pixel_variance128x64)
+        HIGHBD_OBFP(BLOCK_64X128,
+                    vpx_highbd_obmc_sad64x128_bits8,
+                    vpx_highbd_obmc_variance64x128,
+                    vpx_highbd_obmc_sub_pixel_variance64x128)
+#endif  // CONFIG_EXT_PARTITION
+        HIGHBD_OBFP(BLOCK_64X64,
+                    vpx_highbd_obmc_sad64x64_bits8,
+                    vpx_highbd_obmc_variance64x64,
+                    vpx_highbd_obmc_sub_pixel_variance64x64)
+        HIGHBD_OBFP(BLOCK_64X32,
+                    vpx_highbd_obmc_sad64x32_bits8,
+                    vpx_highbd_obmc_variance64x32,
+                    vpx_highbd_obmc_sub_pixel_variance64x32)
+        HIGHBD_OBFP(BLOCK_32X64,
+                    vpx_highbd_obmc_sad32x64_bits8,
+                    vpx_highbd_obmc_variance32x64,
+                    vpx_highbd_obmc_sub_pixel_variance32x64)
+        HIGHBD_OBFP(BLOCK_32X32,
+                    vpx_highbd_obmc_sad32x32_bits8,
+                    vpx_highbd_obmc_variance32x32,
+                    vpx_highbd_obmc_sub_pixel_variance32x32)
+        HIGHBD_OBFP(BLOCK_32X16,
+                    vpx_highbd_obmc_sad32x16_bits8,
+                    vpx_highbd_obmc_variance32x16,
+                    vpx_highbd_obmc_sub_pixel_variance32x16)
+        HIGHBD_OBFP(BLOCK_16X32,
+                    vpx_highbd_obmc_sad16x32_bits8,
+                    vpx_highbd_obmc_variance16x32,
+                    vpx_highbd_obmc_sub_pixel_variance16x32)
+        HIGHBD_OBFP(BLOCK_16X16,
+                    vpx_highbd_obmc_sad16x16_bits8,
+                    vpx_highbd_obmc_variance16x16,
+                    vpx_highbd_obmc_sub_pixel_variance16x16)
+        HIGHBD_OBFP(BLOCK_8X16,
+                    vpx_highbd_obmc_sad8x16_bits8,
+                    vpx_highbd_obmc_variance8x16,
+                    vpx_highbd_obmc_sub_pixel_variance8x16)
+        HIGHBD_OBFP(BLOCK_16X8,
+                    vpx_highbd_obmc_sad16x8_bits8,
+                    vpx_highbd_obmc_variance16x8,
+                    vpx_highbd_obmc_sub_pixel_variance16x8)
+        HIGHBD_OBFP(BLOCK_8X8,
+                    vpx_highbd_obmc_sad8x8_bits8,
+                    vpx_highbd_obmc_variance8x8,
+                    vpx_highbd_obmc_sub_pixel_variance8x8)
+        HIGHBD_OBFP(BLOCK_4X8,
+                    vpx_highbd_obmc_sad4x8_bits8,
+                    vpx_highbd_obmc_variance4x8,
+                    vpx_highbd_obmc_sub_pixel_variance4x8)
+        HIGHBD_OBFP(BLOCK_8X4,
+                    vpx_highbd_obmc_sad8x4_bits8,
+                    vpx_highbd_obmc_variance8x4,
+                    vpx_highbd_obmc_sub_pixel_variance8x4)
+        HIGHBD_OBFP(BLOCK_4X4,
+                    vpx_highbd_obmc_sad4x4_bits8,
+                    vpx_highbd_obmc_variance4x4,
+                    vpx_highbd_obmc_sub_pixel_variance4x4)
+#endif  // CONFIG_OBMC
         break;
 
       case VPX_BITS_10:
@@ -1687,6 +1798,74 @@
                     vpx_highbd_10_masked_variance4x4,
                     vpx_highbd_10_masked_sub_pixel_variance4x4)
 #endif  // CONFIG_EXT_INTER
+#if CONFIG_OBMC
+#if CONFIG_EXT_PARTITION
+        HIGHBD_OBFP(BLOCK_128X128,
+                    vpx_highbd_obmc_sad128x128_bits10,
+                    vpx_highbd_10_obmc_variance128x128,
+                    vpx_highbd_10_obmc_sub_pixel_variance128x128)
+        HIGHBD_OBFP(BLOCK_128X64,
+                    vpx_highbd_obmc_sad128x64_bits10,
+                    vpx_highbd_10_obmc_variance128x64,
+                    vpx_highbd_10_obmc_sub_pixel_variance128x64)
+        HIGHBD_OBFP(BLOCK_64X128,
+                    vpx_highbd_obmc_sad64x128_bits10,
+                    vpx_highbd_10_obmc_variance64x128,
+                    vpx_highbd_10_obmc_sub_pixel_variance64x128)
+#endif  // CONFIG_EXT_PARTITION
+        HIGHBD_OBFP(BLOCK_64X64,
+                    vpx_highbd_obmc_sad64x64_bits10,
+                    vpx_highbd_10_obmc_variance64x64,
+                    vpx_highbd_10_obmc_sub_pixel_variance64x64)
+        HIGHBD_OBFP(BLOCK_64X32,
+                    vpx_highbd_obmc_sad64x32_bits10,
+                    vpx_highbd_10_obmc_variance64x32,
+                    vpx_highbd_10_obmc_sub_pixel_variance64x32)
+        HIGHBD_OBFP(BLOCK_32X64,
+                    vpx_highbd_obmc_sad32x64_bits10,
+                    vpx_highbd_10_obmc_variance32x64,
+                    vpx_highbd_10_obmc_sub_pixel_variance32x64)
+        HIGHBD_OBFP(BLOCK_32X32,
+                    vpx_highbd_obmc_sad32x32_bits10,
+                    vpx_highbd_10_obmc_variance32x32,
+                    vpx_highbd_10_obmc_sub_pixel_variance32x32)
+        HIGHBD_OBFP(BLOCK_32X16,
+                    vpx_highbd_obmc_sad32x16_bits10,
+                    vpx_highbd_10_obmc_variance32x16,
+                    vpx_highbd_10_obmc_sub_pixel_variance32x16)
+        HIGHBD_OBFP(BLOCK_16X32,
+                    vpx_highbd_obmc_sad16x32_bits10,
+                    vpx_highbd_10_obmc_variance16x32,
+                    vpx_highbd_10_obmc_sub_pixel_variance16x32)
+        HIGHBD_OBFP(BLOCK_16X16,
+                    vpx_highbd_obmc_sad16x16_bits10,
+                    vpx_highbd_10_obmc_variance16x16,
+                    vpx_highbd_10_obmc_sub_pixel_variance16x16)
+        HIGHBD_OBFP(BLOCK_8X16,
+                    vpx_highbd_obmc_sad8x16_bits10,
+                    vpx_highbd_10_obmc_variance8x16,
+                    vpx_highbd_10_obmc_sub_pixel_variance8x16)
+        HIGHBD_OBFP(BLOCK_16X8,
+                    vpx_highbd_obmc_sad16x8_bits10,
+                    vpx_highbd_10_obmc_variance16x8,
+                    vpx_highbd_10_obmc_sub_pixel_variance16x8)
+        HIGHBD_OBFP(BLOCK_8X8,
+                    vpx_highbd_obmc_sad8x8_bits10,
+                    vpx_highbd_10_obmc_variance8x8,
+                    vpx_highbd_10_obmc_sub_pixel_variance8x8)
+        HIGHBD_OBFP(BLOCK_4X8,
+                    vpx_highbd_obmc_sad4x8_bits10,
+                    vpx_highbd_10_obmc_variance4x8,
+                    vpx_highbd_10_obmc_sub_pixel_variance4x8)
+        HIGHBD_OBFP(BLOCK_8X4,
+                    vpx_highbd_obmc_sad8x4_bits10,
+                    vpx_highbd_10_obmc_variance8x4,
+                    vpx_highbd_10_obmc_sub_pixel_variance8x4)
+        HIGHBD_OBFP(BLOCK_4X4,
+                    vpx_highbd_obmc_sad4x4_bits10,
+                    vpx_highbd_10_obmc_variance4x4,
+                    vpx_highbd_10_obmc_sub_pixel_variance4x4)
+#endif  // CONFIG_OBMC
         break;
 
       case VPX_BITS_12:
@@ -1920,6 +2099,75 @@
                     vpx_highbd_12_masked_variance4x4,
                     vpx_highbd_12_masked_sub_pixel_variance4x4)
 #endif  // CONFIG_EXT_INTER
+
+#if CONFIG_OBMC
+#if CONFIG_EXT_PARTITION
+        HIGHBD_OBFP(BLOCK_128X128,
+                    vpx_highbd_obmc_sad128x128_bits12,
+                    vpx_highbd_12_obmc_variance128x128,
+                    vpx_highbd_12_obmc_sub_pixel_variance128x128)
+        HIGHBD_OBFP(BLOCK_128X64,
+                    vpx_highbd_obmc_sad128x64_bits12,
+                    vpx_highbd_12_obmc_variance128x64,
+                    vpx_highbd_12_obmc_sub_pixel_variance128x64)
+        HIGHBD_OBFP(BLOCK_64X128,
+                    vpx_highbd_obmc_sad64x128_bits12,
+                    vpx_highbd_12_obmc_variance64x128,
+                    vpx_highbd_12_obmc_sub_pixel_variance64x128)
+#endif  // CONFIG_EXT_PARTITION
+        HIGHBD_OBFP(BLOCK_64X64,
+                    vpx_highbd_obmc_sad64x64_bits12,
+                    vpx_highbd_12_obmc_variance64x64,
+                    vpx_highbd_12_obmc_sub_pixel_variance64x64)
+        HIGHBD_OBFP(BLOCK_64X32,
+                    vpx_highbd_obmc_sad64x32_bits12,
+                    vpx_highbd_12_obmc_variance64x32,
+                    vpx_highbd_12_obmc_sub_pixel_variance64x32)
+        HIGHBD_OBFP(BLOCK_32X64,
+                    vpx_highbd_obmc_sad32x64_bits12,
+                    vpx_highbd_12_obmc_variance32x64,
+                    vpx_highbd_12_obmc_sub_pixel_variance32x64)
+        HIGHBD_OBFP(BLOCK_32X32,
+                    vpx_highbd_obmc_sad32x32_bits12,
+                    vpx_highbd_12_obmc_variance32x32,
+                    vpx_highbd_12_obmc_sub_pixel_variance32x32)
+        HIGHBD_OBFP(BLOCK_32X16,
+                    vpx_highbd_obmc_sad32x16_bits12,
+                    vpx_highbd_12_obmc_variance32x16,
+                    vpx_highbd_12_obmc_sub_pixel_variance32x16)
+        HIGHBD_OBFP(BLOCK_16X32,
+                    vpx_highbd_obmc_sad16x32_bits12,
+                    vpx_highbd_12_obmc_variance16x32,
+                    vpx_highbd_12_obmc_sub_pixel_variance16x32)
+        HIGHBD_OBFP(BLOCK_16X16,
+                    vpx_highbd_obmc_sad16x16_bits12,
+                    vpx_highbd_12_obmc_variance16x16,
+                    vpx_highbd_12_obmc_sub_pixel_variance16x16)
+        HIGHBD_OBFP(BLOCK_8X16,
+                    vpx_highbd_obmc_sad8x16_bits12,
+                    vpx_highbd_12_obmc_variance8x16,
+                    vpx_highbd_12_obmc_sub_pixel_variance8x16)
+        HIGHBD_OBFP(BLOCK_16X8,
+                    vpx_highbd_obmc_sad16x8_bits12,
+                    vpx_highbd_12_obmc_variance16x8,
+                    vpx_highbd_12_obmc_sub_pixel_variance16x8)
+        HIGHBD_OBFP(BLOCK_8X8,
+                    vpx_highbd_obmc_sad8x8_bits12,
+                    vpx_highbd_12_obmc_variance8x8,
+                    vpx_highbd_12_obmc_sub_pixel_variance8x8)
+        HIGHBD_OBFP(BLOCK_4X8,
+                    vpx_highbd_obmc_sad4x8_bits12,
+                    vpx_highbd_12_obmc_variance4x8,
+                    vpx_highbd_12_obmc_sub_pixel_variance4x8)
+        HIGHBD_OBFP(BLOCK_8X4,
+                    vpx_highbd_obmc_sad8x4_bits12,
+                    vpx_highbd_12_obmc_variance8x4,
+                    vpx_highbd_12_obmc_sub_pixel_variance8x4)
+        HIGHBD_OBFP(BLOCK_4X4,
+                    vpx_highbd_obmc_sad4x4_bits12,
+                    vpx_highbd_12_obmc_variance4x4,
+                    vpx_highbd_12_obmc_sub_pixel_variance4x4)
+#endif  // CONFIG_OBMC
         break;
 
       default:
@@ -2414,6 +2662,48 @@
       vpx_sub_pixel_avg_variance4x4,
       vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
 
+#if CONFIG_OBMC
+#define OBFP(BT, OSDF, OVF, OSVF)         \
+  cpi->fn_ptr[BT].osdf            = OSDF; \
+  cpi->fn_ptr[BT].ovf             = OVF;  \
+  cpi->fn_ptr[BT].osvf            = OSVF;
+
+#if CONFIG_EXT_PARTITION
+  OBFP(BLOCK_128X128, vpx_obmc_sad128x128, vpx_obmc_variance128x128,
+       vpx_obmc_sub_pixel_variance128x128)
+  OBFP(BLOCK_128X64, vpx_obmc_sad128x64, vpx_obmc_variance128x64,
+       vpx_obmc_sub_pixel_variance128x64)
+  OBFP(BLOCK_64X128, vpx_obmc_sad64x128, vpx_obmc_variance64x128,
+       vpx_obmc_sub_pixel_variance64x128)
+#endif  // CONFIG_EXT_PARTITION
+  OBFP(BLOCK_64X64, vpx_obmc_sad64x64, vpx_obmc_variance64x64,
+       vpx_obmc_sub_pixel_variance64x64)
+  OBFP(BLOCK_64X32, vpx_obmc_sad64x32, vpx_obmc_variance64x32,
+       vpx_obmc_sub_pixel_variance64x32)
+  OBFP(BLOCK_32X64, vpx_obmc_sad32x64, vpx_obmc_variance32x64,
+       vpx_obmc_sub_pixel_variance32x64)
+  OBFP(BLOCK_32X32, vpx_obmc_sad32x32, vpx_obmc_variance32x32,
+       vpx_obmc_sub_pixel_variance32x32)
+  OBFP(BLOCK_32X16, vpx_obmc_sad32x16, vpx_obmc_variance32x16,
+       vpx_obmc_sub_pixel_variance32x16)
+  OBFP(BLOCK_16X32, vpx_obmc_sad16x32, vpx_obmc_variance16x32,
+       vpx_obmc_sub_pixel_variance16x32)
+  OBFP(BLOCK_16X16, vpx_obmc_sad16x16, vpx_obmc_variance16x16,
+       vpx_obmc_sub_pixel_variance16x16)
+  OBFP(BLOCK_16X8, vpx_obmc_sad16x8, vpx_obmc_variance16x8,
+       vpx_obmc_sub_pixel_variance16x8)
+  OBFP(BLOCK_8X16, vpx_obmc_sad8x16, vpx_obmc_variance8x16,
+       vpx_obmc_sub_pixel_variance8x16)
+  OBFP(BLOCK_8X8, vpx_obmc_sad8x8, vpx_obmc_variance8x8,
+       vpx_obmc_sub_pixel_variance8x8)
+  OBFP(BLOCK_4X8, vpx_obmc_sad4x8, vpx_obmc_variance4x8,
+       vpx_obmc_sub_pixel_variance4x8)
+  OBFP(BLOCK_8X4, vpx_obmc_sad8x4, vpx_obmc_variance8x4,
+       vpx_obmc_sub_pixel_variance8x4)
+  OBFP(BLOCK_4X4, vpx_obmc_sad4x4, vpx_obmc_variance4x4,
+       vpx_obmc_sub_pixel_variance4x4)
+#endif  // CONFIG_OBMC
+
 #if CONFIG_EXT_INTER
 #define MBFP(BT, MSDF, MVF, MSVF)         \
   cpi->fn_ptr[BT].msdf            = MSDF; \
diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c
index dd0c311..14a11c3 100644
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c
@@ -3253,3 +3253,544 @@
   return bestsme;
 }
 #endif  // CONFIG_EXT_INTER
+
+#if CONFIG_OBMC
+/* returns subpixel variance error function */
+#define DIST(r, c)                                                     \
+  vfp->osvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z,         \
+            src_stride, mask, mask_stride, &sse)
+
+/* checks if (r, c) has better score than previous best */
+/* MVC computes the mv rate cost of (r, c) relative to (rr, rc), scaled by
+ * error_per_bit and rounded; evaluates to 0 when mvcost is NULL. */
+#define MVC(r, c)                                                      \
+  (mvcost ?                                                            \
+    ((mvjcost[((r) != rr) * 2 + ((c) != rc)] +                         \
+      mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) *                 \
+      error_per_bit + 4096) >> 13 : 0)
+
+/* Probes (r, c) with the (non-upsampled) sub-pel variance path; updates
+ * besterr / br / bc / *distortion / *sse1 on improvement, and sets v to
+ * INT_MAX when (r, c) lies outside [minr,maxr] x [minc,maxc]. */
+#define CHECK_BETTER(v, r, c) \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    thismse = (DIST(r, c));                                            \
+    if ((v = MVC(r, c) + thismse) < besterr) {                         \
+      besterr = v;                                                     \
+      br = r;                                                          \
+      bc = c;                                                          \
+      *distortion = thismse;                                           \
+      *sse1 = sse;                                                     \
+    }                                                                  \
+  } else {                                                             \
+    v = INT_MAX;                                                       \
+  }
+
+/* Variant selected by SECOND_LEVEL_CHECKS_BEST(0): plain sub-pel path. */
+#undef CHECK_BETTER0
+#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
+
+/* Variant selected by SECOND_LEVEL_CHECKS_BEST(1): evaluates the probe
+ * against the 8x-upsampled reference via upsampled_obmc_pref_error(). */
+#undef CHECK_BETTER1
+#define CHECK_BETTER1(v, r, c) \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    thismse = upsampled_obmc_pref_error(xd,                            \
+                                        mask, mask_stride,             \
+                                        vfp, z, src_stride,            \
+                                        upre(y, y_stride, r, c),       \
+                                        y_stride,                      \
+                                        w, h, &sse);                   \
+    if ((v = MVC(r, c) + thismse) < besterr) {                         \
+      besterr = v;                                                     \
+      br = r;                                                          \
+      bc = c;                                                          \
+      *distortion = thismse;                                           \
+      *sse1 = sse;                                                     \
+    }                                                                  \
+  } else {                                                             \
+    v = INT_MAX;                                                       \
+  }
+
+/* Evaluates the OBMC error at the starting (center) position: the masked
+ * variance of the predictor at y + offset against the weighted source wsrc,
+ * plus the rate cost of coding bestmv relative to ref_mv.
+ * Writes the raw distortion to *distortion and the SSE to *sse1; returns
+ * distortion + mv rate cost. */
+static unsigned int setup_obmc_center_error(const int *mask,
+                                            int mask_stride,
+                                            const MV *bestmv,
+                                            const MV *ref_mv,
+                                            int error_per_bit,
+                                            const vp10_variance_fn_ptr_t *vfp,
+                                            const int *const wsrc,
+                                            const int wsrc_stride,
+                                            const uint8_t *const y,
+                                            int y_stride,
+                                            int offset,
+                                            int *mvjcost, int *mvcost[2],
+                                            unsigned int *sse1,
+                                            int *distortion) {
+  unsigned int besterr;
+  besterr = vfp->ovf(y + offset, y_stride, wsrc, wsrc_stride,
+                     mask, mask_stride, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+  return besterr;
+}
+
+/* Computes the OBMC prediction error against an upsampled reference: builds
+ * a w x h predictor from (y, y_stride) with vpx_(highbd_)upsampled_pred()
+ * into a stack buffer, then evaluates the masked variance of that predictor
+ * against the weighted source wsrc.  Writes SSE to *sse and returns the
+ * distortion.  The high-bitdepth branch is taken when the current frame
+ * buffer is flagged YV12_FLAG_HIGHBITDEPTH. */
+static int upsampled_obmc_pref_error(const MACROBLOCKD *xd,
+                                     const int *mask, int mask_stride,
+                                     const vp10_variance_fn_ptr_t *vfp,
+                                     const int *const wsrc,
+                                     const int wsrc_stride,
+                                     const uint8_t *const y, int y_stride,
+                                     int w, int h, unsigned int *sse) {
+  unsigned int besterr;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+    vpx_highbd_upsampled_pred(pred16, w, h, y, y_stride);
+
+    // The prediction buffer is packed; its stride equals the block width w.
+    besterr = vfp->ovf(CONVERT_TO_BYTEPTR(pred16), w, wsrc, wsrc_stride,
+                       mask, mask_stride, sse);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+#else
+    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+    (void) xd;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    vpx_upsampled_pred(pred, w, h, y, y_stride);
+
+    besterr = vfp->ovf(pred, w, wsrc, wsrc_stride, mask, mask_stride, sse);
+#if CONFIG_VP9_HIGHBITDEPTH
+  }
+#endif
+  return besterr;
+}
+
+/* Same contract as setup_obmc_center_error(), but evaluates the center
+ * position against the upsampled reference via upsampled_obmc_pref_error().
+ * Writes *distortion / *sse1 and returns distortion + mv rate cost. */
+static unsigned int upsampled_setup_obmc_center_error(
+                        const MACROBLOCKD *xd,
+                        const int *mask, int mask_stride,
+                        const MV *bestmv, const MV *ref_mv,
+                        int error_per_bit, const vp10_variance_fn_ptr_t *vfp,
+                        const int *const wsrc, const int wsrc_stride,
+                        const uint8_t *const y, int y_stride,
+                        int w, int h, int offset, int *mvjcost, int *mvcost[2],
+                        unsigned int *sse1, int *distortion) {
+  unsigned int besterr = upsampled_obmc_pref_error(xd, mask, mask_stride, vfp,
+                                                   wsrc, wsrc_stride,
+                                                   y + offset, y_stride,
+                                                   w, h, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+  return besterr;
+}
+
+/* Iterative sub-pixel motion search for the OBMC predictor.  Starting from
+ * *bestmv (full-pel units on entry), refines the mv down to half-, quarter-
+ * and optionally eighth-pel precision (controlled by forced_stop and
+ * allow_hp) around ref_mv, scoring candidates with the OBMC variance
+ * functions in vfp against the weighted source (wsrc) and mask.  When
+ * use_upsampled_ref is set, candidates are evaluated against an 8x
+ * upsampled reference plane instead of interpolating on the fly.
+ * On return *bestmv is in 1/8-pel units; *distortion and *sse1 hold the
+ * best candidate's distortion/SSE.  Returns the best total error
+ * (distortion + mv rate cost), or INT_MAX if the resulting mv exceeds
+ * MAX_FULL_PEL_VAL from ref_mv. */
+int vp10_find_best_obmc_sub_pixel_tree_up(VP10_COMP *cpi, MACROBLOCK *x,
+                                          const int *wsrc, int wsrc_stride,
+                                          const int *mask, int mask_stride,
+                                          int mi_row, int mi_col,
+                                          MV *bestmv, const MV *ref_mv,
+                                          int allow_hp, int error_per_bit,
+                                          const vp10_variance_fn_ptr_t *vfp,
+                                          int forced_stop, int iters_per_step,
+                                          int *mvjcost, int *mvcost[2],
+                                          int *distortion, unsigned int *sse1,
+                                          int is_second,
+                                          int use_upsampled_ref) {
+  const int *const z = wsrc;
+  const int *const src_address = z;
+  const int src_stride = wsrc_stride;
+  MACROBLOCKD *xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  unsigned int besterr = INT_MAX;
+  unsigned int sse;
+  unsigned int thismse;
+
+  int rr = ref_mv->row;
+  int rc = ref_mv->col;
+  int br = bestmv->row * 8;
+  int bc = bestmv->col * 8;
+  int hstep = 4;
+  int iter;
+  // Number of refinement rounds: 3 (down to 1/8 pel) minus the caller's
+  // forced_stop level.
+  int round = 3 - forced_stop;
+  // Search window in 1/8-pel units, clamped to the frame border limits and
+  // to MV_MAX around the reference mv.
+  const int minc = VPXMAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+  const int maxc = VPXMIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+  const int minr = VPXMAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+  const int maxr = VPXMIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
+  int tr = br;
+  int tc = bc;
+  const MV *search_step = search_step_table;
+  int idx, best_idx = -1;
+  unsigned int cost_array[5];
+  int kr, kc;
+  const int w = 4 * num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  const int h = 4 * num_4x4_blocks_high_lookup[mbmi->sb_type];
+  int offset;
+  int y_stride;
+  const uint8_t *y;
+
+  // Temporarily repoint the prediction buffer at the upsampled reference so
+  // every probe below reads upsampled pixels; restored before returning.
+  const struct buf_2d backup_pred = pd->pre[is_second];
+  if (use_upsampled_ref) {
+    int ref = xd->mi[0]->mbmi.ref_frame[is_second];
+    const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
+    setup_pred_plane(&pd->pre[is_second], upsampled_ref->y_buffer,
+                     upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
+                     NULL, pd->subsampling_x, pd->subsampling_y);
+  }
+  y = pd->pre[is_second].buf;
+  y_stride = pd->pre[is_second].stride;
+  offset = bestmv->row * y_stride + bestmv->col;
+
+  // Without high-precision (1/8-pel) mvs, stop after the quarter-pel round.
+  if (!(allow_hp && vp10_use_mv_hp(ref_mv)))
+    if (round == 3)
+      round = 2;
+
+  bestmv->row *= 8;
+  bestmv->col *= 8;
+  // use_upsampled_ref can be 0 or 1
+  if (use_upsampled_ref)
+    besterr = upsampled_setup_obmc_center_error(
+        xd, mask, mask_stride, bestmv, ref_mv, error_per_bit,
+        vfp, z, src_stride, y, y_stride,
+        w, h, (offset << 3),
+        mvjcost, mvcost, sse1, distortion);
+  else
+    besterr = setup_obmc_center_error(
+        mask, mask_stride, bestmv, ref_mv, error_per_bit,
+        vfp, z, src_stride, y, y_stride,
+        offset, mvjcost, mvcost, sse1, distortion);
+
+  for (iter = 0; iter < round; ++iter) {
+    // Check vertical and horizontal sub-pixel positions.
+    for (idx = 0; idx < 4; ++idx) {
+      tr = br + search_step[idx].row;
+      tc = bc + search_step[idx].col;
+      if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+        MV this_mv = {tr, tc};
+
+        if (use_upsampled_ref) {
+          // Upsampled plane: (tr, tc) index it directly at 1/8-pel units.
+          const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+          thismse = upsampled_obmc_pref_error(xd, mask, mask_stride,
+                                              vfp, src_address, src_stride,
+                                              pre_address, y_stride,
+                                              w, h, &sse);
+        } else {
+          // Normal plane: full-pel base address plus sub-pel offsets sp().
+          const uint8_t *const pre_address = y + (tr >> 3) * y_stride +
+              (tc >> 3);
+          thismse = vfp->osvf(pre_address, y_stride, sp(tc), sp(tr),
+                              src_address, src_stride,
+                              mask, mask_stride, &sse);
+        }
+
+        cost_array[idx] = thismse +
+            mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+        if (cost_array[idx] < besterr) {
+          best_idx = idx;
+          besterr = cost_array[idx];
+          *distortion = thismse;
+          *sse1 = sse;
+        }
+      } else {
+        cost_array[idx] = INT_MAX;
+      }
+    }
+
+    // Check diagonal sub-pixel position
+    kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+    kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+    tc = bc + kc;
+    tr = br + kr;
+    if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+      MV this_mv = {tr, tc};
+
+      if (use_upsampled_ref) {
+        const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+        thismse = upsampled_obmc_pref_error(xd, mask, mask_stride,
+                                            vfp, src_address, src_stride,
+                                            pre_address, y_stride,
+                                            w, h, &sse);
+      } else {
+        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+
+        thismse = vfp->osvf(pre_address, y_stride, sp(tc), sp(tr),
+                            src_address, src_stride, mask, mask_stride, &sse);
+      }
+
+      cost_array[4] = thismse +
+          mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+      if (cost_array[4] < besterr) {
+        best_idx = 4;
+        besterr = cost_array[4];
+        *distortion = thismse;
+        *sse1 = sse;
+      }
+    } else {
+      // NOTE(review): this relies on idx == 4 after the preceding for loop,
+      // so it writes cost_array[4]; spelling it cost_array[4] explicitly
+      // would be clearer and less fragile — confirm intent.
+      cost_array[idx] = INT_MAX;
+    }
+
+    if (best_idx < 4 && best_idx >= 0) {
+      br += search_step[best_idx].row;
+      bc += search_step[best_idx].col;
+    } else if (best_idx == 4) {
+      br = tr;
+      bc = tc;
+    }
+
+    // Optional second-level refinement around the winner; the argument
+    // selects CHECK_BETTER1 (upsampled) vs CHECK_BETTER0 (interpolated).
+    if (iters_per_step > 1 && best_idx != -1) {
+      if (use_upsampled_ref) {
+        SECOND_LEVEL_CHECKS_BEST(1);
+      } else {
+        SECOND_LEVEL_CHECKS_BEST(0);
+      }
+    }
+
+    tr = br;
+    tc = bc;
+
+    search_step += 4;
+    hstep >>= 1;
+    best_idx = -1;
+  }
+
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if (use_upsampled_ref) {
+    pd->pre[is_second] = backup_pred;
+  }
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
+#undef DIST
+#undef MVC
+#undef CHECK_BETTER
+
+/* Returns the OBMC variance of the predictor at full-pel best_mv against
+ * the weighted source, plus (if use_mvcost) the rate cost of coding best_mv
+ * (converted to 1/8-pel units) relative to center_mv. */
+static int get_obmc_mvpred_var(const MACROBLOCK *x,
+                               const int *wsrc, int wsrc_stride,
+                               const int *mask, int mask_stride,
+                               const MV *best_mv, const MV *center_mv,
+                               const vp10_variance_fn_ptr_t *vfp,
+                               int use_mvcost, int is_second) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+  const MV mv = {best_mv->row * 8, best_mv->col * 8};
+  unsigned int unused;
+
+  return vfp->ovf(get_buf_from_mv(in_what, best_mv), in_what->stride,
+                  wsrc, wsrc_stride, mask, mask_stride, &unused) +
+         (use_mvcost ?  mv_err_cost(&mv, center_mv, x->nmvjointcost,
+                                    x->mvcost, x->errorperbit) : 0);
+}
+
+/* Full-pel refinement search for OBMC: repeatedly probes the four direct
+ * neighbours (up/left/right/down) of *ref_mv with the OBMC SAD plus mv rate
+ * cost, moving to the best neighbour each iteration, for at most
+ * search_range iterations or until no neighbour improves.  Updates *ref_mv
+ * in place and returns the best SAD-based cost found. */
+int obmc_refining_search_sad(const MACROBLOCK *x,
+                             const int *wsrc, int wsrc_stride,
+                             const int *mask, int mask_stride,
+                             MV *ref_mv, int error_per_bit,
+                             int search_range,
+                             const vp10_variance_fn_ptr_t *fn_ptr,
+                             const MV *center_mv, int is_second) {
+  const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  unsigned int best_sad = fn_ptr->osdf(get_buf_from_mv(in_what, ref_mv),
+                                       in_what->stride,
+                                       wsrc, wsrc_stride, mask, mask_stride) +
+                         mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
+  int i, j;
+
+  for (i = 0; i < search_range; i++) {
+    int best_site = -1;
+
+    for (j = 0; j < 4; j++) {
+      const MV mv = {ref_mv->row + neighbors[j].row,
+                     ref_mv->col + neighbors[j].col};
+      if (is_mv_in(x, &mv)) {
+        unsigned int sad = fn_ptr->osdf(get_buf_from_mv(in_what, &mv),
+                                        in_what->stride, wsrc, wsrc_stride,
+                                        mask, mask_stride);
+        // Cheap SAD test first; add the mv rate cost only when the raw SAD
+        // already beats the current best.
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+          if (sad < best_sad) {
+            best_sad = sad;
+            best_site = j;
+          }
+        }
+      }
+    }
+
+    if (best_site == -1) {
+      // No neighbour improved: converged.
+      break;
+    } else {
+      ref_mv->row += neighbors[best_site].row;
+      ref_mv->col += neighbors[best_site].col;
+    }
+  }
+  return best_sad;
+}
+
+int obmc_diamond_search_sad(const MACROBLOCK *x,
+                            const search_site_config *cfg,
+                            const int *wsrc, int wsrc_stride,
+                            const int *mask, int mask_stride,
+                            MV *ref_mv, MV *best_mv,
+                            int search_param,
+                            int sad_per_bit, int *num00,
+                            const vp10_variance_fn_ptr_t *fn_ptr,
+                            const MV *center_mv, int is_second) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+  // search_param determines the length of the initial step and hence the number
+  // of iterations
+  // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 =
+  // (MAX_FIRST_STEP/4) pel... etc.
+  const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step];
+  const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  const uint8_t *best_address, *in_what_ref;
+  int best_sad = INT_MAX;
+  int best_site = 0;
+  int last_site = 0;
+  int i, j, step;
+
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  in_what_ref = in_what->buf + ref_mv->row * in_what->stride + ref_mv->col;
+  best_address = in_what_ref;
+  *num00 = 0;
+  *best_mv = *ref_mv;
+
+  // Check the starting position
+  best_sad = fn_ptr->osdf(best_address, in_what->stride,
+                          wsrc, wsrc_stride, mask, mask_stride) +
+             mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
+
+  i = 1;
+
+  for (step = 0; step < tot_steps; step++) {
+    for (j = 0; j < cfg->searches_per_step; j++) {
+      const MV mv = {best_mv->row + ss[i].mv.row,
+                     best_mv->col + ss[i].mv.col};
+      if (is_mv_in(x, &mv)) {
+       int sad = fn_ptr->osdf(best_address + ss[i].offset, in_what->stride,
+                              wsrc, wsrc_stride, mask, mask_stride);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+          if (sad < best_sad) {
+            best_sad = sad;
+            best_site = i;
+          }
+        }
+      }
+
+      i++;
+    }
+
+    if (best_site != last_site) {
+      best_mv->row += ss[best_site].mv.row;
+      best_mv->col += ss[best_site].mv.col;
+      best_address += ss[best_site].offset;
+      last_site = best_site;
+#if defined(NEW_DIAMOND_SEARCH)
+      while (1) {
+        const MV this_mv = {best_mv->row + ss[best_site].mv.row,
+                            best_mv->col + ss[best_site].mv.col};
+        if (is_mv_in(x, &this_mv)) {
+          int sad = fn_ptr->osdf(best_address + ss[best_site].offset,
+                                 in_what->stride, wsrc, wsrc_stride,
+                                 mask, mask_stride);
+          if (sad < best_sad) {
+            sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
+              best_mv->row += ss[best_site].mv.row;
+              best_mv->col += ss[best_site].mv.col;
+              best_address += ss[best_site].offset;
+              continue;
+            }
+          }
+        }
+        break;
+      }
+#endif
+    } else if (best_address == in_what_ref) {
+      (*num00)++;
+    }
+  }
+  return best_sad;
+}
+
// Full-pel motion search for the OBMC predictor: an initial diamond search,
// optional further diamond searches with progressively finer starting steps,
// and an optional final 1-away refining search.  Candidates are ranked by
// weighted SAD during the searches, then re-scored with the weighted
// variance metric (get_obmc_mvpred_var, which includes the MV cost).
// Returns the best error found; *dst_mv receives the winning full-pel MV.
int vp10_obmc_full_pixel_diamond(const VP10_COMP *cpi, MACROBLOCK *x,
                                 const int *wsrc, int wsrc_stride,
                                 const int *mask, int mask_stride,
                                 MV *mvp_full, int step_param,
                                 int sadpb, int further_steps, int do_refine,
                                 const vp10_variance_fn_ptr_t *fn_ptr,
                                 const MV *ref_mv, MV *dst_mv,
                                 int is_second) {
  MV temp_mv;
  int thissme, n, num00 = 0;
  int bestsme = obmc_diamond_search_sad(x, &cpi->ss_cfg,
                                        wsrc, wsrc_stride,
                                        mask, mask_stride,
                                        mvp_full, &temp_mv,
                                        step_param, sadpb, &n,
                                        fn_ptr, ref_mv, is_second);
  // Re-score the SAD winner with the variance metric.
  if (bestsme < INT_MAX)
    bestsme = get_obmc_mvpred_var(x, wsrc, wsrc_stride, mask, mask_stride,
                                  &temp_mv, ref_mv, fn_ptr, 1, is_second);
  *dst_mv = temp_mv;

  // If there won't be more n-step search, check to see if refining search is
  // needed.
  if (n > further_steps)
    do_refine = 0;

  while (n < further_steps) {
    ++n;

    // num00 counts diamond steps in which the centre stayed best; restarting
    // at those step sizes would revisit the same candidate points.
    if (num00) {
      num00--;
    } else {
      thissme = obmc_diamond_search_sad(x, &cpi->ss_cfg,
                                        wsrc, wsrc_stride,
                                        mask, mask_stride,
                                        mvp_full, &temp_mv,
                                        step_param + n, sadpb, &num00,
                                        fn_ptr, ref_mv, is_second);
      if (thissme < INT_MAX)
        thissme = get_obmc_mvpred_var(x, wsrc, wsrc_stride, mask, mask_stride,
                                      &temp_mv, ref_mv, fn_ptr, 1, is_second);

      // check to see if refining search is needed.
      if (num00 > further_steps - n)
        do_refine = 0;

      if (thissme < bestsme) {
        bestsme = thissme;
        *dst_mv = temp_mv;
      }
    }
  }

  // final 1-away diamond refining search
  if (do_refine) {
    const int search_range = 8;
    MV best_mv = *dst_mv;
    thissme = obmc_refining_search_sad(x, wsrc, wsrc_stride, mask, mask_stride,
                                       &best_mv, sadpb, search_range,
                                       fn_ptr, ref_mv, is_second);
    if (thissme < INT_MAX)
      thissme = get_obmc_mvpred_var(x, wsrc, wsrc_stride, mask, mask_stride,
                                    &best_mv, ref_mv, fn_ptr, 1, is_second);
    if (thissme < bestsme) {
      bestsme = thissme;
      *dst_mv = best_mv;
    }
  }
  return bestsme;
}
+#endif  // CONFIG_OBMC
diff --git a/vp10/encoder/mcomp.h b/vp10/encoder/mcomp.h
index c12e7af..1b4e613 100644
--- a/vp10/encoder/mcomp.h
+++ b/vp10/encoder/mcomp.h
@@ -195,6 +195,29 @@
                                    const MV *ref_mv, MV *dst_mv,
                                    int is_second);
 #endif  // CONFIG_EXT_INTER
+
+#if CONFIG_OBMC
+int vp10_obmc_full_pixel_diamond(const struct VP10_COMP *cpi, MACROBLOCK *x,
+                                 const int *wsrc, int wsrc_stride,
+                                 const int *mask, int mask_stride,
+                                 MV *mvp_full, int step_param,
+                                 int sadpb, int further_steps, int do_refine,
+                                 const vp10_variance_fn_ptr_t *fn_ptr,
+                                 const MV *ref_mv, MV *dst_mv,
+                                 int is_second);
+int vp10_find_best_obmc_sub_pixel_tree_up(struct VP10_COMP *cpi, MACROBLOCK *x,
+                                          const int *wsrc, int wsrc_stride,
+                                          const int *mask, int mask_stride,
+                                          int mi_row, int mi_col,
+                                          MV *bestmv, const MV *ref_mv,
+                                          int allow_hp, int error_per_bit,
+                                          const vp10_variance_fn_ptr_t *vfp,
+                                          int forced_stop, int iters_per_step,
+                                          int *mvjcost, int *mvcost[2],
+                                          int *distortion, unsigned int *sse1,
+                                          int is_second,
+                                          int use_upsampled_ref);
+#endif  // CONFIG_OBMC
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 32c64cb..6d7c1a8 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -6036,6 +6036,149 @@
   }
 }
 
+#if CONFIG_OBMC
// Single-reference motion search for the OBMC predictor.  Follows the same
// flow as the regular single motion search (scaled-reference swap, adaptive
// step size, full-pel diamond, sub-pel refinement) but scores candidates
// against the OBMC target wsrc under the per-pixel weights in mask (both
// produced by calc_target_weighted_pred()).
// NOTE(review): on the adaptive-motion-search early exit, tmp_mv is set to
// INVALID_MV but *rate_mv is left untouched — callers must pre-initialize it.
static void single_motion_search_obmc(VP10_COMP *cpi, MACROBLOCK *x,
                                      BLOCK_SIZE bsize, int mi_row, int mi_col,
                                      const int* wsrc, int wsrc_stride,
                                      const int* mask, int mask_stride,
#if CONFIG_EXT_INTER
                                      int ref_idx,
                                      int mv_idx,
#endif  // CONFIG_EXT_INTER
                                      int_mv *tmp_mv, int_mv pred_mv,
                                      int *rate_mv) {
  MACROBLOCKD *xd = &x->e_mbd;
  const VP10_COMMON *cm = &cpi->common;
  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
  int bestsme = INT_MAX;
  int step_param;
  int sadpb = x->sadperbit16;
  MV mvp_full;
#if CONFIG_EXT_INTER
  int ref = mbmi->ref_frame[ref_idx];
  MV ref_mv = x->mbmi_ext->ref_mvs[ref][mv_idx].as_mv;
#else
  int ref = mbmi->ref_frame[0];
  MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
  int ref_idx = 0;
#endif  // CONFIG_EXT_INTER

  // Saved so the MV range clamped by vp10_set_mv_search_range() can be
  // restored after the full-pel search.
  int tmp_col_min = x->mv_col_min;
  int tmp_col_max = x->mv_col_max;
  int tmp_row_min = x->mv_row_min;
  int tmp_row_max = x->mv_row_max;

  const YV12_BUFFER_CONFIG *scaled_ref_frame = vp10_get_scaled_ref_frame(cpi,
                                                                         ref);

#if CONFIG_REF_MV
  vp10_set_mvcost(x, ref);
#endif

  if (scaled_ref_frame) {
    int i;
    // Swap out the reference frame for a version that's been scaled to
    // match the resolution of the current frame, allowing the existing
    // motion search code to be used without additional modifications.
    for (i = 0; i < MAX_MB_PLANE; i++)
      backup_yv12[i] = xd->plane[i].pre[ref_idx];

    vp10_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
  }

  vp10_set_mv_search_range(x, &ref_mv);

  // Work out the size of the first step in the mv step search.
  // 0 here is maximum length first step. 1 is VPXMAX >> 1 etc.
  if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
    // Take wtd average of the step_params based on the last frame's
    // max mv magnitude and that based on the best ref mvs of the current
    // block for the given reference.
    step_param = (vp10_init_search_range(x->max_mv_context[ref]) +
                    cpi->mv_step_param) / 2;
  } else {
    step_param = cpi->mv_step_param;
  }

  // Smaller blocks inside a superblock get a larger initial step.
  if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size) {
    int boffset =  2 * (b_width_log2_lookup[cm->sb_size] -
         VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
    step_param = VPXMAX(step_param, boffset);
  }

  if (cpi->sf.adaptive_motion_search) {
    int bwl = b_width_log2_lookup[bsize];
    int bhl = b_height_log2_lookup[bsize];
    int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);

    if (tlevel < 5)
      step_param += 2;

    // prev_mv_sad is not setup for dynamically scaled frames.
    if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) {
      int i;
      for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
        // Skip the search when another reference predicts this block much
        // better; INVALID_MV tells the caller nothing was found.
        if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
          x->pred_mv[ref].row = 0;
          x->pred_mv[ref].col = 0;
          tmp_mv->as_int = INVALID_MV;

          if (scaled_ref_frame) {
            int i;
            for (i = 0; i < MAX_MB_PLANE; ++i)
              xd->plane[i].pre[ref_idx] = backup_yv12[i];
          }
          return;
        }
      }
    }
  }

  // Start the full-pel search from the prediction MV, in full-pel units.
  mvp_full = pred_mv.as_mv;
  mvp_full.col >>= 3;
  mvp_full.row >>= 3;

  bestsme = vp10_obmc_full_pixel_diamond(cpi, x, wsrc, wsrc_stride,
                                         mask, mask_stride,
                                         &mvp_full, step_param, sadpb,
                                         MAX_MVSEARCH_STEPS - 1 - step_param,
                                         1, &cpi->fn_ptr[bsize],
                                         &ref_mv, &tmp_mv->as_mv, ref_idx);

  // Restore the MV search range clamped above.
  x->mv_col_min = tmp_col_min;
  x->mv_col_max = tmp_col_max;
  x->mv_row_min = tmp_row_min;
  x->mv_row_max = tmp_row_max;

  if (bestsme < INT_MAX) {
    int dis;
    // Sub-pel refinement of the full-pel winner.
    vp10_find_best_obmc_sub_pixel_tree_up(cpi, x,
                                          wsrc, wsrc_stride,
                                          mask, mask_stride,
                                          mi_row, mi_col,
                                          &tmp_mv->as_mv, &ref_mv,
                                          cm->allow_high_precision_mv,
                                          x->errorperbit,
                                          &cpi->fn_ptr[bsize],
                                          cpi->sf.mv.subpel_force_stop,
                                          cpi->sf.mv.subpel_iters_per_step,
                                          x->nmvjointcost, x->mvcost,
                                          &dis, &x->pred_sse[ref],
                                          ref_idx,
                                          cpi->sf.use_upsampled_references);
  }
  *rate_mv = vp10_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);

  if (scaled_ref_frame) {
    int i;
    for (i = 0; i < MAX_MB_PLANE; i++)
      xd->plane[i].pre[ref_idx] = backup_yv12[i];
  }
}
+#endif  // CONFIG_OBMC
+
 #if CONFIG_EXT_INTER
 static void do_masked_motion_search(VP10_COMP *cpi, MACROBLOCK *x,
                                     const uint8_t *mask, int mask_stride,
@@ -6370,10 +6513,10 @@
                                  int_mv (*mode_mv)[MAX_REF_FRAMES],
                                  int mi_row, int mi_col,
 #if CONFIG_OBMC
-                                 uint8_t *dst_buf1[3],
-                                 int dst_stride1[3],
-                                 uint8_t *dst_buf2[3],
-                                 int dst_stride2[3],
+                                 uint8_t *dst_buf1[3], int dst_stride1[3],
+                                 uint8_t *dst_buf2[3], int dst_stride2[3],
+                                 int *wsrc, int wsrc_strides,
+                                 int *mask2d, int mask2d_strides,
 #endif  // CONFIG_OBMC
 #if CONFIG_EXT_INTER
                                  int_mv single_newmvs[2][MAX_REF_FRAMES],
@@ -6435,6 +6578,7 @@
   MB_MODE_INFO best_mbmi;
 #if CONFIG_EXT_INTER
   int rate2_bmc_nocoeff;
+  int rate_mv_bmc;
   MB_MODE_INFO best_bmc_mbmi;
 #endif  // CONFIG_EXT_INTER
 #endif  // CONFIG_OBMC
@@ -6873,6 +7017,7 @@
 #if CONFIG_EXT_INTER
 #if CONFIG_OBMC
   best_bmc_mbmi = *mbmi;
+  rate_mv_bmc = rate_mv;
   rate2_bmc_nocoeff = *rate2;
   if (cm->interp_filter == SWITCHABLE)
     rate2_bmc_nocoeff += rs;
@@ -7386,14 +7531,45 @@
   for (mbmi->obmc = 0; mbmi->obmc <= allow_obmc; mbmi->obmc++) {
     int64_t tmp_rd, tmp_dist;
     int tmp_rate;
+#if CONFIG_EXT_INTER
+    int tmp_rate2 = mbmi->obmc ? rate2_bmc_nocoeff : rate2_nocoeff;
+#else
+    int tmp_rate2 = rate2_nocoeff;
+#endif  // CONFIG_EXT_INTER
 
     if (mbmi->obmc) {
 #if CONFIG_EXT_INTER
       *mbmi = best_bmc_mbmi;
-      assert(!mbmi->use_wedge_interinter);
-      vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
       mbmi->obmc = 1;
 #endif  // CONFIG_EXT_INTER
+      if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) {
+        int_mv tmp_mv;
+        int_mv pred_mv;
+        int tmp_rate_mv = 0;
+
+        pred_mv.as_int = mbmi->mv[0].as_int;
+        single_motion_search_obmc(cpi, x, bsize, mi_row, mi_col,
+                                  wsrc, wsrc_strides,
+                                  mask2d, mask2d_strides,
+#if CONFIG_EXT_INTER
+                                  0, mv_idx,
+#endif  // CONFIG_EXT_INTER
+                                  &tmp_mv, pred_mv, &tmp_rate_mv);
+        mbmi->mv[0].as_int = tmp_mv.as_int;
+        if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) {
+          tmp_rate_mv = VPXMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
+        }
+#if CONFIG_EXT_INTER
+        tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv;
+#else
+        tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
+#endif  // CONFIG_EXT_INTER
+        vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+#if CONFIG_EXT_INTER
+      } else {
+        vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+#endif  // CONFIG_EXT_INTER
+      }
       vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, 0,
                                        NULL, NULL,
                                        dst_buf1, dst_stride1,
@@ -7415,11 +7591,7 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     x->skip = 0;
 
-#if CONFIG_EXT_INTER
-    *rate2 = mbmi->obmc ? rate2_bmc_nocoeff : rate2_nocoeff;
-#else
-    *rate2 = rate2_nocoeff;
-#endif  // CONFIG_EXT_INTER
+    *rate2 = tmp_rate2;
     if (allow_obmc)
       *rate2 += cpi->obmc_cost[bsize][mbmi->obmc];
     *distortion = 0;
@@ -7927,9 +8099,13 @@
   DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]);
   DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, int, weighted_src_buf[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, int, mask2d_buf[MAX_SB_SQUARE]);
   uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
   int dst_stride1[MAX_MB_PLANE] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE};
   int dst_stride2[MAX_MB_PLANE] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE};
+  int weighted_src_stride = MAX_SB_SIZE;
+  int mask2d_stride = MAX_SB_SIZE;
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -8031,6 +8207,11 @@
   vp10_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
                                       dst_stride2);
   vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
+  calc_target_weighted_pred(cm, x, xd, mi_row, mi_col,
+                            dst_buf1[0], dst_stride1[0],
+                            dst_buf2[0], dst_stride2[0],
+                            mask2d_buf, mask2d_stride,
+                            weighted_src_buf, weighted_src_stride);
 #endif  // CONFIG_OBMC
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
@@ -8577,6 +8758,8 @@
 #if CONFIG_OBMC
                                   dst_buf1, dst_stride1,
                                   dst_buf2, dst_stride2,
+                                  weighted_src_buf, weighted_src_stride,
+                                  mask2d_buf, mask2d_stride,
 #endif  // CONFIG_OBMC
 #if CONFIG_EXT_INTER
                                   single_newmvs,
@@ -8688,6 +8871,9 @@
 #if CONFIG_OBMC
                                            dst_buf1, dst_stride1,
                                            dst_buf2, dst_stride2,
+                                           weighted_src_buf,
+                                           weighted_src_stride,
+                                           mask2d_buf, mask2d_stride,
 #endif  // CONFIG_OBMC
 #if CONFIG_EXT_INTER
                                            dummy_single_newmvs,
@@ -10245,3 +10431,194 @@
   store_coding_context(x, ctx, best_ref_index,
                        best_pred_diff, 0);
 }
+
+#if CONFIG_OBMC
// Builds the target and weights for OBMC motion search over the current
// luma block (8*n8_w x 8*n8_h pixels):
//  - mask_buf: per-pixel weight remaining for the current block's own
//    predictor after the above/left neighbour overlaps, scaled to Q12
//    (x4096) by the end of the function.
//  - weighted_src_buf: src * 4096 minus the Q12-weighted contribution of
//    the neighbours' predictions (above_buf / left_buf), so a candidate
//    predictor P is scored by obmc_sad/obmc_variance as
//    weighted_src - mask * P.
void calc_target_weighted_pred(VP10_COMMON *cm,
                               MACROBLOCK *x,
                               MACROBLOCKD *xd,
                               int mi_row, int mi_col,
                               uint8_t *above_buf, int above_stride,
                               uint8_t *left_buf,  int left_stride,
                               int *mask_buf, int mask_stride,
                               int *weighted_src_buf, int weighted_src_stride) {
  const TileInfo *const tile = &xd->tile;
  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
  int row, col, i, mi_step;
  int bw = 8 * xd->n8_w;
  int bh = 8 * xd->n8_h;
  int *dst = weighted_src_buf;
  int *mask2d = mask_buf;
  uint8_t *src;
#if CONFIG_VP9_HIGHBITDEPTH
  int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
#endif  // CONFIG_VP9_HIGHBITDEPTH
  // Initialise: no neighbour contribution yet, full weight (64, Q6) for the
  // current block's own predictor.
  for (row = 0; row < bh; ++row) {
    for (col = 0; col < bw; ++col) {
      dst[col] = 0;
      mask2d[col] = 64;
    }
    dst += weighted_src_stride;
    mask2d += mask_stride;
  }

  // handle above row
#if CONFIG_EXT_TILE
  if (mi_row > 0 && (mi_row - 1 >= tile->mi_row_start)) {
#else
  if (mi_row > 0) {
#endif  // CONFIG_EXT_TILE
    for (i = 0; i < VPXMIN(xd->n8_w, cm->mi_cols - mi_col); i += mi_step) {
      int mi_row_offset = -1;
      int mi_col_offset = i;
      MODE_INFO *above_mi = xd->mi[mi_col_offset +
                                   mi_row_offset * xd->mi_stride];
      MB_MODE_INFO *above_mbmi = &above_mi->mbmi;
      int overlap = num_4x4_blocks_high_lookup[bsize] << 1;

      mi_step = VPXMIN(xd->n8_w,
                       num_8x8_blocks_wide_lookup[above_mbmi->sb_type]);

      if (is_neighbor_overlappable(above_mbmi)) {
        const struct macroblockd_plane *pd = &xd->plane[0];
        // These locals intentionally shadow the outer bw/bh/dst/mask2d for
        // the overlap region under this neighbour only.
        int bw = (mi_step * MI_SIZE) >> pd->subsampling_x;
        int bh = overlap >> pd->subsampling_y;
        int dst_stride = weighted_src_stride;
        int *dst = weighted_src_buf + (i * MI_SIZE >> pd->subsampling_x);
        int tmp_stride = above_stride;
        uint8_t *tmp = above_buf + (i * MI_SIZE >> pd->subsampling_x);
        int mask2d_stride = mask_stride;
        int *mask2d = mask_buf + (i * MI_SIZE >> pd->subsampling_x);
        const uint8_t *mask1d[2];

        setup_obmc_mask(bh, mask1d);

#if CONFIG_VP9_HIGHBITDEPTH
        if (is_hbd) {
          uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);

          // Overwrite with the above neighbour's weighted pixels (Q6) and
          // the weight left for the current predictor (Q6).
          for (row = 0; row < bh; ++row) {
            for (col = 0; col < bw; ++col) {
              dst[col] = mask1d[1][row] * tmp16[col];
              mask2d[col] = mask1d[0][row];
            }
            dst += dst_stride;
            tmp16 += tmp_stride;
            mask2d += mask2d_stride;
          }
        } else {
#endif  // CONFIG_VP9_HIGHBITDEPTH
        for (row = 0; row < bh; ++row) {
          for (col = 0; col < bw; ++col) {
            dst[col] = mask1d[1][row] * tmp[col];
            mask2d[col] = mask1d[0][row];
          }
          dst += dst_stride;
          tmp += tmp_stride;
          mask2d += mask2d_stride;
        }
#if CONFIG_VP9_HIGHBITDEPTH
        }
#endif  // CONFIG_VP9_HIGHBITDEPTH
      }
    }  // each mi in the above row
  }

  // Promote the above-pass output (Q6) to Q12 before blending in the left
  // column at the same precision.
  dst = weighted_src_buf;
  mask2d = mask_buf;
  for (row = 0; row < bh; ++row) {
    for (col = 0; col < bw; ++col) {
      dst[col] = dst[col] << 6;
      mask2d[col] = mask2d[col] << 6;
    }
    dst += weighted_src_stride;
    mask2d += mask_stride;
  }

  // handle left column
  if (mi_col > 0 && (mi_col - 1 >= tile->mi_col_start)) {
    for (i = 0; i < VPXMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
      int mi_row_offset = i;
      int mi_col_offset = -1;
      int overlap = num_4x4_blocks_wide_lookup[bsize] << 1;
      MODE_INFO *left_mi = xd->mi[mi_col_offset +
                                  mi_row_offset * xd->mi_stride];
      MB_MODE_INFO *left_mbmi = &left_mi->mbmi;

      mi_step = VPXMIN(xd->n8_h,
                       num_8x8_blocks_high_lookup[left_mbmi->sb_type]);

      if (is_neighbor_overlappable(left_mbmi)) {
        const struct macroblockd_plane *pd = &xd->plane[0];
        // As above: locals shadow the outer bw/bh/dst/mask2d for the
        // overlap region under this neighbour only.
        int bw = overlap >> pd->subsampling_x;
        int bh = (mi_step * MI_SIZE) >> pd->subsampling_y;
        int dst_stride = weighted_src_stride;
        int *dst = weighted_src_buf +
                   (i * MI_SIZE * dst_stride >> pd->subsampling_y);
        int tmp_stride = left_stride;
        uint8_t *tmp = left_buf +
                       (i * MI_SIZE * tmp_stride >> pd->subsampling_y);
        int mask2d_stride = mask_stride;
        int *mask2d = mask_buf +
                      (i * MI_SIZE * mask2d_stride >> pd->subsampling_y);
        const uint8_t *mask1d[2];

        setup_obmc_mask(bw, mask1d);

#if CONFIG_VP9_HIGHBITDEPTH
        if (is_hbd) {
          uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);

          // Blend the left neighbour into the Q12 accumulators.
          for (row = 0; row < bh; ++row) {
            for (col = 0; col < bw; ++col) {
              dst[col] = (dst[col] >> 6) * mask1d[0][col] +
                         (tmp16[col] << 6) * mask1d[1][col];
              mask2d[col] = (mask2d[col] >> 6) * mask1d[0][col];
            }
            dst += dst_stride;
            tmp16 += tmp_stride;
            mask2d += mask2d_stride;
          }
        } else {
#endif  // CONFIG_VP9_HIGHBITDEPTH
        for (row = 0; row < bh; ++row) {
          for (col = 0; col < bw; ++col) {
            dst[col] = (dst[col] >> 6) * mask1d[0][col] +
                       (tmp[col] << 6) * mask1d[1][col];
            mask2d[col] = (mask2d[col] >> 6) * mask1d[0][col];
          }
          dst += dst_stride;
          tmp += tmp_stride;
          mask2d += mask2d_stride;
        }
#if CONFIG_VP9_HIGHBITDEPTH
        }
#endif  //  CONFIG_VP9_HIGHBITDEPTH
      }
    }  // each mi in the left column
  }

  // Form the final search target: wsrc = src * 4096 - weighted neighbour
  // prediction (Q12).
  dst = weighted_src_buf;
  src = x->plane[0].src.buf;
#if CONFIG_VP9_HIGHBITDEPTH
  if (is_hbd) {
    uint16_t *src16 = CONVERT_TO_SHORTPTR(src);

    for (row = 0; row < bh; ++row) {
      for (col = 0; col < bw; ++col)
        dst[col] = (src16[col] << 12) - dst[col];
      dst += weighted_src_stride;
      src16 += x->plane[0].src.stride;
    }
  } else {
#endif  // CONFIG_VP9_HIGHBITDEPTH
  for (row = 0; row < bh; ++row) {
    for (col = 0; col < bw; ++col)
      dst[col] = (src[col] << 12) - dst[col];
    dst += weighted_src_stride;
    src += x->plane[0].src.stride;
  }
#if CONFIG_VP9_HIGHBITDEPTH
  }
#endif  // CONFIG_VP9_HIGHBITDEPTH
}
+#endif  // CONFIG_OBMC
diff --git a/vp10/encoder/rdopt.h b/vp10/encoder/rdopt.h
index 53920bc..2ca39a5 100644
--- a/vp10/encoder/rdopt.h
+++ b/vp10/encoder/rdopt.h
@@ -108,6 +108,17 @@
   return &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[ref_idx]].buf;
 }
 
+#if CONFIG_OBMC
+void calc_target_weighted_pred(VP10_COMMON *cm,
+                               MACROBLOCK *x,
+                               MACROBLOCKD *xd,
+                               int mi_row, int mi_col,
+                               uint8_t *above_buf, int above_stride,
+                               uint8_t *left_buf, int left_stride,
+                               int *mask_buf, int mask_stride,
+                               int *weighted_src_buf, int weighted_src_stride);
+#endif  // CONFIG_OBMC
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c
index c500206..b3ed410 100644
--- a/vpx_dsp/sad.c
+++ b/vpx_dsp/sad.c
@@ -450,3 +450,109 @@
 HIGHBD_MASKSADMXN(4, 4)
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_VP10 && CONFIG_EXT_INTER
+
+#if CONFIG_VP10 && CONFIG_OBMC
+// a: pred
+// b: target weighted prediction (has been *4096 to keep precision)
+// m: 2d weights (scaled by 4096)
+static INLINE unsigned int obmc_sad(const uint8_t *a, int a_stride,
+                                    const int *b, int b_stride,
+                                    const int *m, int m_stride,
+                                    int width, int height) {
+  int y, x;
+  unsigned int sad = 0;
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++) {
+      int abs_diff = abs(b[x] - a[x] * m[x]);
+      sad += (abs_diff + 2048) >> 12;
+    }
+
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+
+  return sad;
+}
+
// Expands to vpx_obmc_sad<m>x<n>_c(): the weighted-SAD entry point for an
// m-wide by n-high block, forwarding to obmc_sad() above.
#define OBMCSADMxN(m, n)                                                      \
unsigned int vpx_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride,    \
                                       const int *wsrc, int wsrc_stride,      \
                                       const int *msk, int msk_stride) {      \
  return obmc_sad(ref, ref_stride, wsrc, wsrc_stride, msk, msk_stride, m, n); \
}

#if CONFIG_EXT_PARTITION
OBMCSADMxN(128, 128)
OBMCSADMxN(128, 64)
OBMCSADMxN(64, 128)
#endif  // CONFIG_EXT_PARTITION
OBMCSADMxN(64, 64)
OBMCSADMxN(64, 32)
OBMCSADMxN(32, 64)
OBMCSADMxN(32, 32)
OBMCSADMxN(32, 16)
OBMCSADMxN(16, 32)
OBMCSADMxN(16, 16)
OBMCSADMxN(16, 8)
OBMCSADMxN(8, 16)
OBMCSADMxN(8, 8)
OBMCSADMxN(8, 4)
OBMCSADMxN(4, 8)
OBMCSADMxN(4, 4)
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE unsigned int highbd_obmc_sad(const uint8_t *a8, int a_stride,
+                                           const int *b, int b_stride,
+                                           const int *m, int m_stride,
+                                           int width, int height) {
+  int y, x;
+  unsigned int sad = 0;
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++) {
+      int abs_diff = abs(b[x] - a[x] * m[x]);
+      sad += (abs_diff + 2048) >> 12;
+    }
+
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+
+  return sad;
+}
+
// Expands to vpx_highbd_obmc_sad<m>x<n>_c(), forwarding to highbd_obmc_sad()
// with the block's fixed width/height.
#define HIGHBD_OBMCSADMXN(m, n)                                               \
unsigned int vpx_highbd_obmc_sad##m##x##n##_c(const uint8_t *ref,             \
                                              int ref_stride,                 \
                                              const int *wsrc,                \
                                              int wsrc_stride,                \
                                              const int *msk,                 \
                                              int msk_stride) {               \
  return highbd_obmc_sad(ref, ref_stride, wsrc, wsrc_stride,                  \
                         msk, msk_stride, m, n);                              \
}

#if CONFIG_EXT_PARTITION
HIGHBD_OBMCSADMXN(128, 128)
HIGHBD_OBMCSADMXN(128, 64)
HIGHBD_OBMCSADMXN(64, 128)
#endif  // CONFIG_EXT_PARTITION
HIGHBD_OBMCSADMXN(64, 64)
HIGHBD_OBMCSADMXN(64, 32)
HIGHBD_OBMCSADMXN(32, 64)
HIGHBD_OBMCSADMXN(32, 32)
HIGHBD_OBMCSADMXN(32, 16)
HIGHBD_OBMCSADMXN(16, 32)
HIGHBD_OBMCSADMXN(16, 16)
HIGHBD_OBMCSADMXN(16, 8)
HIGHBD_OBMCSADMXN(8, 16)
HIGHBD_OBMCSADMXN(8, 8)
HIGHBD_OBMCSADMXN(8, 4)
HIGHBD_OBMCSADMXN(4, 8)
HIGHBD_OBMCSADMXN(4, 4)
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_VP10 && CONFIG_OBMC
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index cc99d25..ab3d8bb 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -7,6 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <stdlib.h>
 
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
@@ -1022,3 +1023,322 @@
 #endif  // CONFIG_EXT_PARTITION
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_VP10 && CONFIG_EXT_INTER
+
+#if CONFIG_VP10 && CONFIG_OBMC
+void obmc_variance(const uint8_t *a, int  a_stride,
+                   const int *b, int  b_stride,
+                   const int *m, int  m_stride,
+                   int w, int h, unsigned int *sse, int *sum) {
+  int i, j;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      int scaled_diff = b[j] - a[j] * m[j];
+      int abs_diff = (abs(scaled_diff) + 2048) >> 12;
+      int diff = (scaled_diff >= 0) ? abs_diff : -abs_diff;
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+}
+
+#define OBMC_VAR(W, H) \
+unsigned int vpx_obmc_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                            const int *b, int b_stride, \
+                                            const int *m, int m_stride, \
+                                            unsigned int *sse) { \
+  int sum; \
+  obmc_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+}
+
+#define OBMC_SUBPIX_VAR(W, H) \
+unsigned int vpx_obmc_sub_pixel_variance##W##x##H##_c(                        \
+                                        const uint8_t *pre, int pre_stride,   \
+                                        int xoffset, int  yoffset,            \
+                                        const int *wsrc, int wsrc_stride,     \
+                                        const int *msk, int msk_stride,       \
+                                        unsigned int *sse) {                  \
+  uint16_t fdata3[(H + 1) * W];                                               \
+  uint8_t temp2[H * W];                                                       \
+                                                                              \
+  var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, H + 1, W,     \
+                                    bilinear_filters_2t[xoffset]);            \
+  var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,               \
+                                     bilinear_filters_2t[yoffset]);           \
+                                                                              \
+  return vpx_obmc_variance##W##x##H##_c(temp2, W, wsrc, wsrc_stride,          \
+                                        msk, msk_stride, sse);                \
+}
+
+OBMC_VAR(4, 4)
+OBMC_SUBPIX_VAR(4, 4)
+
+OBMC_VAR(4, 8)
+OBMC_SUBPIX_VAR(4, 8)
+
+OBMC_VAR(8, 4)
+OBMC_SUBPIX_VAR(8, 4)
+
+OBMC_VAR(8, 8)
+OBMC_SUBPIX_VAR(8, 8)
+
+OBMC_VAR(8, 16)
+OBMC_SUBPIX_VAR(8, 16)
+
+OBMC_VAR(16, 8)
+OBMC_SUBPIX_VAR(16, 8)
+
+OBMC_VAR(16, 16)
+OBMC_SUBPIX_VAR(16, 16)
+
+OBMC_VAR(16, 32)
+OBMC_SUBPIX_VAR(16, 32)
+
+OBMC_VAR(32, 16)
+OBMC_SUBPIX_VAR(32, 16)
+
+OBMC_VAR(32, 32)
+OBMC_SUBPIX_VAR(32, 32)
+
+OBMC_VAR(32, 64)
+OBMC_SUBPIX_VAR(32, 64)
+
+OBMC_VAR(64, 32)
+OBMC_SUBPIX_VAR(64, 32)
+
+OBMC_VAR(64, 64)
+OBMC_SUBPIX_VAR(64, 64)
+
+#if CONFIG_EXT_PARTITION
+OBMC_VAR(64, 128)
+OBMC_SUBPIX_VAR(64, 128)
+
+OBMC_VAR(128, 64)
+OBMC_SUBPIX_VAR(128, 64)
+
+OBMC_VAR(128, 128)
+OBMC_SUBPIX_VAR(128, 128)
+#endif  // CONFIG_EXT_PARTITION
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void highbd_obmc_variance64(const uint8_t *a8, int  a_stride,
+                            const int *b, int  b_stride,
+                            const int *m, int  m_stride,
+                            int w, int h, uint64_t *sse, int64_t *sum) {
+  int i, j;
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      int scaled_diff = b[j] - a[j] * m[j];
+      int abs_diff = (abs(scaled_diff) + 2048) >> 12;
+      int diff = (scaled_diff >= 0) ? abs_diff : -abs_diff;
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+}
+
+void highbd_obmc_variance(const uint8_t *a8, int  a_stride,
+                          const int *b, int  b_stride,
+                          const int *m, int  m_stride,
+                          int  w, int  h, unsigned int *sse, int *sum) {
+  int64_t sum64;
+  uint64_t sse64;
+  highbd_obmc_variance64(a8, a_stride, b, b_stride, m, m_stride,
+                         w, h, &sse64, &sum64);
+  *sum = (int)sum64;
+  *sse = (unsigned int)sse64;
+}
+
+void highbd_10_obmc_variance(const uint8_t *a8, int  a_stride,
+                             const int *b, int  b_stride,
+                             const int *m, int  m_stride,
+                             int  w, int  h, unsigned int *sse, int *sum) {
+  int64_t sum64;
+  uint64_t sse64;
+  highbd_obmc_variance64(a8, a_stride, b, b_stride, m, m_stride,
+                         w, h, &sse64, &sum64);
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
+
+void highbd_12_obmc_variance(const uint8_t *a8, int  a_stride,
+                             const int *b, int  b_stride,
+                             const int *m, int  m_stride,
+                             int  w, int  h, unsigned int *sse, int *sum) {
+  int64_t sum64;
+  uint64_t sse64;
+  highbd_obmc_variance64(a8, a_stride, b, b_stride, m, m_stride,
+                         w, h, &sse64, &sum64);
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
+
+#define HIGHBD_OBMC_VAR(W, H)                                                 \
+unsigned int vpx_highbd_obmc_variance##W##x##H##_c(const uint8_t *a,          \
+                                                   int a_stride,              \
+                                                   const int *b,              \
+                                                   int b_stride,              \
+                                                   const int *m,              \
+                                                   int m_stride,              \
+                                                   unsigned int *sse) {       \
+  int sum;                                                                    \
+  highbd_obmc_variance(a, a_stride, b, b_stride, m, m_stride,                 \
+                       W, H, sse, &sum);                                      \
+  return *sse - (((int64_t)sum * sum) / (W * H));                             \
+}                                                                             \
+                                                                              \
+unsigned int vpx_highbd_10_obmc_variance##W##x##H##_c(const uint8_t *a,       \
+                                                      int a_stride,           \
+                                                      const int *b,           \
+                                                      int b_stride,           \
+                                                      const int *m,           \
+                                                      int m_stride,           \
+                                                      unsigned int *sse) {    \
+  int sum;                                                                    \
+  highbd_10_obmc_variance(a, a_stride, b, b_stride, m, m_stride,              \
+                          W, H, sse, &sum);                                   \
+  return *sse - (((int64_t)sum * sum) / (W * H));                             \
+}                                                                             \
+                                                                              \
+unsigned int vpx_highbd_12_obmc_variance##W##x##H##_c(const uint8_t *a,       \
+                                                      int a_stride,           \
+                                                      const int *b,           \
+                                                      int b_stride,           \
+                                                      const int *m,           \
+                                                      int m_stride,           \
+                                                      unsigned int *sse) {    \
+  int sum;                                                                    \
+  highbd_12_obmc_variance(a, a_stride, b, b_stride, m, m_stride,              \
+                          W, H, sse, &sum);                                   \
+  return *sse - (((int64_t)sum * sum) / (W * H));                             \
+}
+
+#define HIGHBD_OBMC_SUBPIX_VAR(W, H)                                          \
+unsigned int vpx_highbd_obmc_sub_pixel_variance##W##x##H##_c(                 \
+                                        const uint8_t *pre, int pre_stride,   \
+                                        int xoffset, int  yoffset,            \
+                                        const int *wsrc, int wsrc_stride,     \
+                                        const int *msk, int msk_stride,       \
+                                        unsigned int *sse) {                  \
+  uint16_t fdata3[(H + 1) * W];                                               \
+  uint16_t temp2[H * W];                                                      \
+                                                                              \
+  vpx_highbd_var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1,    \
+                                               H + 1, W,                      \
+                                               bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,    \
+                                               bilinear_filters_2t[yoffset]); \
+                                                                              \
+  return vpx_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2),     \
+                                               W, wsrc, wsrc_stride,          \
+                                               msk, msk_stride, sse);         \
+}                                                                             \
+                                                                              \
+unsigned int vpx_highbd_10_obmc_sub_pixel_variance##W##x##H##_c(              \
+                                        const uint8_t *pre, int pre_stride,   \
+                                        int xoffset, int  yoffset,            \
+                                        const int *wsrc, int wsrc_stride,     \
+                                        const int *msk, int msk_stride,       \
+                                        unsigned int *sse) {                  \
+  uint16_t fdata3[(H + 1) * W];                                               \
+  uint16_t temp2[H * W];                                                      \
+                                                                              \
+  vpx_highbd_var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1,    \
+                                               H + 1, W,                      \
+                                               bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,    \
+                                               bilinear_filters_2t[yoffset]); \
+                                                                              \
+  return vpx_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2),  \
+                                                  W, wsrc, wsrc_stride,       \
+                                                  msk, msk_stride, sse);      \
+}                                                                             \
+                                                                              \
+unsigned int vpx_highbd_12_obmc_sub_pixel_variance##W##x##H##_c(              \
+                                        const uint8_t *pre, int pre_stride,   \
+                                        int xoffset, int  yoffset,            \
+                                        const int *wsrc, int wsrc_stride,     \
+                                        const int *msk, int msk_stride,       \
+                                        unsigned int *sse) {                  \
+  uint16_t fdata3[(H + 1) * W];                                               \
+  uint16_t temp2[H * W];                                                      \
+                                                                              \
+  vpx_highbd_var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1,    \
+                                               H + 1, W,                      \
+                                               bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,    \
+                                               bilinear_filters_2t[yoffset]); \
+                                                                              \
+  return vpx_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2),  \
+                                                  W, wsrc, wsrc_stride,       \
+                                                  msk, msk_stride, sse);      \
+}
+
+HIGHBD_OBMC_VAR(4, 4)
+HIGHBD_OBMC_SUBPIX_VAR(4, 4)
+
+HIGHBD_OBMC_VAR(4, 8)
+HIGHBD_OBMC_SUBPIX_VAR(4, 8)
+
+HIGHBD_OBMC_VAR(8, 4)
+HIGHBD_OBMC_SUBPIX_VAR(8, 4)
+
+HIGHBD_OBMC_VAR(8, 8)
+HIGHBD_OBMC_SUBPIX_VAR(8, 8)
+
+HIGHBD_OBMC_VAR(8, 16)
+HIGHBD_OBMC_SUBPIX_VAR(8, 16)
+
+HIGHBD_OBMC_VAR(16, 8)
+HIGHBD_OBMC_SUBPIX_VAR(16, 8)
+
+HIGHBD_OBMC_VAR(16, 16)
+HIGHBD_OBMC_SUBPIX_VAR(16, 16)
+
+HIGHBD_OBMC_VAR(16, 32)
+HIGHBD_OBMC_SUBPIX_VAR(16, 32)
+
+HIGHBD_OBMC_VAR(32, 16)
+HIGHBD_OBMC_SUBPIX_VAR(32, 16)
+
+HIGHBD_OBMC_VAR(32, 32)
+HIGHBD_OBMC_SUBPIX_VAR(32, 32)
+
+HIGHBD_OBMC_VAR(32, 64)
+HIGHBD_OBMC_SUBPIX_VAR(32, 64)
+
+HIGHBD_OBMC_VAR(64, 32)
+HIGHBD_OBMC_SUBPIX_VAR(64, 32)
+
+HIGHBD_OBMC_VAR(64, 64)
+HIGHBD_OBMC_SUBPIX_VAR(64, 64)
+
+#if CONFIG_EXT_PARTITION
+HIGHBD_OBMC_VAR(64, 128)
+HIGHBD_OBMC_SUBPIX_VAR(64, 128)
+
+HIGHBD_OBMC_VAR(128, 64)
+HIGHBD_OBMC_SUBPIX_VAR(128, 64)
+
+HIGHBD_OBMC_VAR(128, 128)
+HIGHBD_OBMC_SUBPIX_VAR(128, 128)
+#endif  // CONFIG_EXT_PARTITION
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_VP10 && CONFIG_OBMC
diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h
index dea2af9..88ab5e3 100644
--- a/vpx_dsp/variance.h
+++ b/vpx_dsp/variance.h
@@ -98,6 +98,30 @@
                                                        unsigned int *sse);
 #endif  // CONFIG_VP10 && CONFIG_EXT_INTER
 
+#if CONFIG_VP10 && CONFIG_OBMC
+typedef unsigned int(*vpx_obmc_sad_fn_t)(const uint8_t *pred,
+                                         int pred_stride,
+                                         const int *wsrc,
+                                         int wsrc_stride,
+                                         const int *msk,
+                                         int msk_stride);
+typedef unsigned int (*vpx_obmc_variance_fn_t)(const uint8_t *pred,
+                                               int pred_stride,
+                                               const int *wsrc,
+                                               int wsrc_stride,
+                                               const int *msk,
+                                               int msk_stride,
+                                               unsigned int *sse);
+typedef unsigned int (*vpx_obmc_subpixvariance_fn_t)(const uint8_t *pred,
+                                                     int pred_stride,
+                                                     int xoffset, int yoffset,
+                                                     const int *wsrc,
+                                                     int wsrc_stride,
+                                                     const int *msk,
+                                                     int msk_stride,
+                                                     unsigned int *sse);
+#endif  // CONFIG_VP10 && CONFIG_OBMC
+
 #if CONFIG_VP9
 typedef struct vp9_variance_vtable {
   vpx_sad_fn_t               sdf;
@@ -126,6 +150,11 @@
   vpx_masked_variance_fn_t       mvf;
   vpx_masked_subpixvariance_fn_t msvf;
 #endif  // CONFIG_EXT_INTER
+#if CONFIG_OBMC
+  vpx_obmc_sad_fn_t              osdf;
+  vpx_obmc_variance_fn_t         ovf;
+  vpx_obmc_subpixvariance_fn_t   osvf;
+#endif  // CONFIG_OBMC
 } vp10_variance_fn_ptr_t;
 #endif  // CONFIG_VP10
 
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 6e566c8..ad524a2 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1095,6 +1095,25 @@
 }
 
 #
+# OBMC SAD
+#
+if (vpx_config("CONFIG_OBMC") eq "yes") {
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/unsigned int/, "vpx_obmc_sad${w}x${h}", "const uint8_t *ref_ptr, int ref_stride, const int *wsrc_ptr, int wsrc_stride, const int *mask, int mask_stride";
+    specialize "vpx_obmc_sad${w}x${h}";
+  }
+
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    foreach (@block_sizes) {
+      ($w, $h) = @$_;
+      add_proto qw/unsigned int/, "vpx_highbd_obmc_sad${w}x${h}", "const uint8_t *ref_ptr, int ref_stride, const int *wsrc_ptr, int wsrc_stride, const int *mask, int mask_stride";
+      specialize "vpx_highbd_obmc_sad${w}x${h}";
+    }
+  }
+}
+
+#
 # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
 #
 # Blocks of 3
@@ -1365,6 +1384,31 @@
 }
 
 #
+# OBMC Variance / OBMC Subpixel Variance
+#
+if (vpx_config("CONFIG_OBMC") eq "yes") {
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/unsigned int/, "vpx_obmc_variance${w}x${h}", "const uint8_t *pre_ptr, int pre_stride, const int *wsrc_ptr, int wsrc_stride, const int *mask, int mask_stride, unsigned int *sse";
+    add_proto qw/unsigned int/, "vpx_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre_ptr, int pre_stride, int xoffset, int  yoffset, const int *wsrc_ptr, int wsrc_stride, const int *mask, int mask_stride, unsigned int *sse";
+    specialize "vpx_obmc_variance${w}x${h}";
+    specialize "vpx_obmc_sub_pixel_variance${w}x${h}";
+  }
+
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    foreach $bd ("_", "_10_", "_12_") {
+      foreach (@block_sizes) {
+        ($w, $h) = @$_;
+        add_proto qw/unsigned int/, "vpx_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre_ptr, int pre_stride, const int *wsrc_ptr, int wsrc_stride, const int *mask, int mask_stride, unsigned int *sse";
+        add_proto qw/unsigned int/, "vpx_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre_ptr, int pre_stride, int xoffset, int  yoffset, const int *wsrc_ptr, int wsrc_stride, const int *mask, int mask_stride, unsigned int *sse";
+        specialize "vpx_highbd${bd}obmc_variance${w}x${h}";
+        specialize "vpx_highbd${bd}obmc_sub_pixel_variance${w}x${h}";
+      }
+    }
+  }
+}
+
+#
 # Specialty Subpixel
 #
 add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";