Merge "Refactoring and cosmetic changes to ext-inter expt" into nextgenv2
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 0092e8c..78aabe6 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -74,6 +74,10 @@
   return res;
 }
 
+/* Note:
+ *  Our codebase calculates the "diff" value in the variance algorithm by
+ *  (src - ref).
+ */
 static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref,
                              int l2w, int l2h, int src_stride_coeff,
                              int ref_stride_coeff, uint32_t *sse_ptr,
@@ -87,14 +91,14 @@
     for (int x = 0; x < w; x++) {
       int diff;
       if (!use_high_bit_depth_) {
-        diff = ref[w * y * ref_stride_coeff + x] -
-               src[w * y * src_stride_coeff + x];
+        diff = src[w * y * src_stride_coeff + x] -
+               ref[w * y * ref_stride_coeff + x];
         se += diff;
         sse += diff * diff;
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
-        diff = CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x] -
-               CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x];
+        diff = CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x] -
+               CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x];
         se += diff;
         sse += diff * diff;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -328,8 +332,10 @@
                                            log2height_, stride_coeff,
                                            stride_coeff, &sse2,
                                            use_high_bit_depth_, bit_depth_);
-    EXPECT_EQ(sse1, sse2);
-    EXPECT_EQ(var1, var2);
+    EXPECT_EQ(sse1, sse2)
+        << "Error at test index: " << i;
+    EXPECT_EQ(var1, var2)
+        << "Error at test index: " << i;
   }
 }
 
@@ -361,8 +367,10 @@
                                            log2height_, src_stride_coeff,
                                            ref_stride_coeff, &sse2,
                                            use_high_bit_depth_, bit_depth_);
-    EXPECT_EQ(sse1, sse2);
-    EXPECT_EQ(var1, var2);
+    EXPECT_EQ(sse1, sse2)
+        << "Error at test index: " << i;
+    EXPECT_EQ(var1, var2)
+        << "Error at test index: " << i;
   }
 }
 
@@ -866,36 +874,36 @@
                       make_tuple(6, 7, &vpx_highbd_12_variance64x128_c, 12),
 #endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
                       make_tuple(6, 6, &vpx_highbd_12_variance64x64_c, 12),
-                      // make_tuple(6, 5, &vpx_highbd_12_variance64x32_c, 12),
-                      // make_tuple(5, 6, &vpx_highbd_12_variance32x64_c, 12),
-                      // make_tuple(5, 5, &vpx_highbd_12_variance32x32_c, 12),
+                      make_tuple(6, 5, &vpx_highbd_12_variance64x32_c, 12),
+                      make_tuple(5, 6, &vpx_highbd_12_variance32x64_c, 12),
+                      make_tuple(5, 5, &vpx_highbd_12_variance32x32_c, 12),
                       make_tuple(5, 4, &vpx_highbd_12_variance32x16_c, 12),
                       make_tuple(4, 5, &vpx_highbd_12_variance16x32_c, 12),
-                      // make_tuple(4, 4, &vpx_highbd_12_variance16x16_c, 12),
-                      // make_tuple(4, 3, &vpx_highbd_12_variance16x8_c, 12),
-                      // make_tuple(3, 4, &vpx_highbd_12_variance8x16_c, 12),
-                      // make_tuple(3, 3, &vpx_highbd_12_variance8x8_c, 12),
-                      // make_tuple(3, 2, &vpx_highbd_12_variance8x4_c, 12),
-                      // make_tuple(2, 3, &vpx_highbd_12_variance4x8_c, 12),
+                      make_tuple(4, 4, &vpx_highbd_12_variance16x16_c, 12),
+                      make_tuple(4, 3, &vpx_highbd_12_variance16x8_c, 12),
+                      make_tuple(3, 4, &vpx_highbd_12_variance8x16_c, 12),
+                      make_tuple(3, 3, &vpx_highbd_12_variance8x8_c, 12),
+                      make_tuple(3, 2, &vpx_highbd_12_variance8x4_c, 12),
+                      make_tuple(2, 3, &vpx_highbd_12_variance4x8_c, 12),
                       make_tuple(2, 2, &vpx_highbd_12_variance4x4_c, 12),
 #if CONFIG_VP10 && CONFIG_EXT_PARTITION
                       make_tuple(7, 7, &vpx_highbd_10_variance128x128_c, 10),
-                      // make_tuple(7, 6, &vpx_highbd_10_variance128x64_c, 10),
-                      // make_tuple(6, 7, &vpx_highbd_10_variance64x128_c, 10),
+                      make_tuple(7, 6, &vpx_highbd_10_variance128x64_c, 10),
+                      make_tuple(6, 7, &vpx_highbd_10_variance64x128_c, 10),
 #endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
-                      // make_tuple(6, 6, &vpx_highbd_10_variance64x64_c, 10),
-                      // make_tuple(6, 5, &vpx_highbd_10_variance64x32_c, 10),
-                      // make_tuple(5, 6, &vpx_highbd_10_variance32x64_c, 10),
-                      // make_tuple(5, 5, &vpx_highbd_10_variance32x32_c, 10),
-                      // make_tuple(5, 4, &vpx_highbd_10_variance32x16_c, 10),
-                      // make_tuple(4, 5, &vpx_highbd_10_variance16x32_c, 10),
-                      // make_tuple(4, 4, &vpx_highbd_10_variance16x16_c, 10),
-                      // make_tuple(4, 3, &vpx_highbd_10_variance16x8_c, 10),
-                      // make_tuple(3, 4, &vpx_highbd_10_variance8x16_c, 10),
-                      // make_tuple(3, 3, &vpx_highbd_10_variance8x8_c, 10),
-                      // make_tuple(3, 2, &vpx_highbd_10_variance8x4_c, 10),
-                      // make_tuple(2, 3, &vpx_highbd_10_variance4x8_c, 10),
-                      // make_tuple(2, 2, &vpx_highbd_10_variance4x4_c, 10),
+                      make_tuple(6, 6, &vpx_highbd_10_variance64x64_c, 10),
+                      make_tuple(6, 5, &vpx_highbd_10_variance64x32_c, 10),
+                      make_tuple(5, 6, &vpx_highbd_10_variance32x64_c, 10),
+                      make_tuple(5, 5, &vpx_highbd_10_variance32x32_c, 10),
+                      make_tuple(5, 4, &vpx_highbd_10_variance32x16_c, 10),
+                      make_tuple(4, 5, &vpx_highbd_10_variance16x32_c, 10),
+                      make_tuple(4, 4, &vpx_highbd_10_variance16x16_c, 10),
+                      make_tuple(4, 3, &vpx_highbd_10_variance16x8_c, 10),
+                      make_tuple(3, 4, &vpx_highbd_10_variance8x16_c, 10),
+                      make_tuple(3, 3, &vpx_highbd_10_variance8x8_c, 10),
+                      make_tuple(3, 2, &vpx_highbd_10_variance8x4_c, 10),
+                      make_tuple(2, 3, &vpx_highbd_10_variance4x8_c, 10),
+                      make_tuple(2, 2, &vpx_highbd_10_variance4x4_c, 10),
 #if CONFIG_VP10 && CONFIG_EXT_PARTITION
                       make_tuple(7, 7, &vpx_highbd_8_variance128x128_c, 8),
                       make_tuple(7, 6, &vpx_highbd_8_variance128x64_c, 8),
@@ -1138,25 +1146,25 @@
 INSTANTIATE_TEST_CASE_P(
     SSE2, VpxHBDVarianceTest,
     ::testing::Values(make_tuple(6, 6, &vpx_highbd_12_variance64x64_sse2, 12),
-                    // make_tuple(6, 5, &vpx_highbd_12_variance64x32_sse2, 12),
-                    // make_tuple(5, 6, &vpx_highbd_12_variance32x64_sse2, 12),
-                    // make_tuple(5, 5, &vpx_highbd_12_variance32x32_sse2, 12),
+                      make_tuple(6, 5, &vpx_highbd_12_variance64x32_sse2, 12),
+                      make_tuple(5, 6, &vpx_highbd_12_variance32x64_sse2, 12),
+                      make_tuple(5, 5, &vpx_highbd_12_variance32x32_sse2, 12),
                       make_tuple(5, 4, &vpx_highbd_12_variance32x16_sse2, 12),
                       make_tuple(4, 5, &vpx_highbd_12_variance16x32_sse2, 12),
-                    // make_tuple(4, 4, &vpx_highbd_12_variance16x16_sse2, 12),
-                    // make_tuple(4, 3, &vpx_highbd_12_variance16x8_sse2, 12),
-                    // make_tuple(3, 4, &vpx_highbd_12_variance8x16_sse2, 12),
-                    // make_tuple(3, 3, &vpx_highbd_12_variance8x8_sse2, 12),
-                    // make_tuple(6, 6, &vpx_highbd_10_variance64x64_sse2, 10),
-                    // make_tuple(6, 5, &vpx_highbd_10_variance64x32_sse2, 10),
-                    // make_tuple(5, 6, &vpx_highbd_10_variance32x64_sse2, 10),
-                    // make_tuple(5, 5, &vpx_highbd_10_variance32x32_sse2, 10),
-                    // make_tuple(5, 4, &vpx_highbd_10_variance32x16_sse2, 10),
-                    // make_tuple(4, 5, &vpx_highbd_10_variance16x32_sse2, 10),
-                    // make_tuple(4, 4, &vpx_highbd_10_variance16x16_sse2, 10),
-                    // make_tuple(4, 3, &vpx_highbd_10_variance16x8_sse2, 10),
-                    // make_tuple(3, 4, &vpx_highbd_10_variance8x16_sse2, 10),
-                    // make_tuple(3, 3, &vpx_highbd_10_variance8x8_sse2, 10),
+                      make_tuple(4, 4, &vpx_highbd_12_variance16x16_sse2, 12),
+                      make_tuple(4, 3, &vpx_highbd_12_variance16x8_sse2, 12),
+                      make_tuple(3, 4, &vpx_highbd_12_variance8x16_sse2, 12),
+                      make_tuple(3, 3, &vpx_highbd_12_variance8x8_sse2, 12),
+                      make_tuple(6, 6, &vpx_highbd_10_variance64x64_sse2, 10),
+                      make_tuple(6, 5, &vpx_highbd_10_variance64x32_sse2, 10),
+                      make_tuple(5, 6, &vpx_highbd_10_variance32x64_sse2, 10),
+                      make_tuple(5, 5, &vpx_highbd_10_variance32x32_sse2, 10),
+                      make_tuple(5, 4, &vpx_highbd_10_variance32x16_sse2, 10),
+                      make_tuple(4, 5, &vpx_highbd_10_variance16x32_sse2, 10),
+                      make_tuple(4, 4, &vpx_highbd_10_variance16x16_sse2, 10),
+                      make_tuple(4, 3, &vpx_highbd_10_variance16x8_sse2, 10),
+                      make_tuple(3, 4, &vpx_highbd_10_variance8x16_sse2, 10),
+                      make_tuple(3, 3, &vpx_highbd_10_variance8x8_sse2, 10),
                       make_tuple(6, 6, &vpx_highbd_8_variance64x64_sse2, 8),
                       make_tuple(6, 5, &vpx_highbd_8_variance64x32_sse2, 8),
                       make_tuple(5, 6, &vpx_highbd_8_variance32x64_sse2, 8),
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index ae11556..5cecf79 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -363,40 +363,12 @@
 
 static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
                                      PARTITION_TYPE partition) {
-  return subsize_lookup[partition][bsize];
+  if (partition == PARTITION_INVALID)
+    return BLOCK_INVALID;
+  else
+    return subsize_lookup[partition][bsize];
 }
 
-#if CONFIG_EXT_PARTITION_TYPES
-static INLINE PARTITION_TYPE get_partition(const MODE_INFO *const mi,
-                                           int mi_stride, int mi_rows,
-                                           int mi_cols, int mi_row,
-                                           int mi_col, BLOCK_SIZE bsize) {
-  const int bsl = b_width_log2_lookup[bsize];
-  const int bs = (1 << bsl) / 4;
-  MODE_INFO m = mi[mi_row * mi_stride + mi_col];
-  PARTITION_TYPE partition = partition_lookup[bsl][m.mbmi.sb_type];
-  if (partition != PARTITION_NONE && bsize > BLOCK_8X8 &&
-      mi_row + bs < mi_rows && mi_col + bs < mi_cols) {
-    BLOCK_SIZE h = get_subsize(bsize, PARTITION_HORZ_A);
-    BLOCK_SIZE v = get_subsize(bsize, PARTITION_VERT_A);
-    MODE_INFO m_right = mi[mi_row * mi_stride + mi_col + bs];
-    MODE_INFO m_below = mi[(mi_row + bs) * mi_stride + mi_col];
-    if (m.mbmi.sb_type == h) {
-      return m_below.mbmi.sb_type == h ? PARTITION_HORZ : PARTITION_HORZ_B;
-    } else if (m.mbmi.sb_type == v) {
-      return m_right.mbmi.sb_type == v ? PARTITION_VERT : PARTITION_VERT_B;
-    } else if (m_below.mbmi.sb_type == h) {
-      return PARTITION_HORZ_A;
-    } else if (m_right.mbmi.sb_type == v) {
-      return PARTITION_VERT_A;
-    } else {
-      return PARTITION_SPLIT;
-    }
-  }
-  return partition;
-}
-#endif  // CONFIG_EXT_PARTITION_TYPES
-
 static const TX_TYPE intra_mode_to_tx_type_context[INTRA_MODES] = {
   DCT_DCT,    // DC
   ADST_DCT,   // V
diff --git a/vp10/common/mfqe.c b/vp10/common/mfqe.c
index bd0b25b..52756bd 100644
--- a/vp10/common/mfqe.c
+++ b/vp10/common/mfqe.c
@@ -355,6 +355,12 @@
   const YV12_BUFFER_CONFIG *show = cm->frame_to_show;
   // Last decoded frame and will store the MFQE result.
   YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+
+#if CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
+  // TODO(any): Fix for ext partition types and 128x128 superblocks
+  assert(0);
+#endif  // CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
+
   // Loop through each super block.
   for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MAX_MIB_SIZE) {
     for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
diff --git a/vp10/common/onyxc_int.h b/vp10/common/onyxc_int.h
index cc443e7..d122495 100644
--- a/vp10/common/onyxc_int.h
+++ b/vp10/common/onyxc_int.h
@@ -372,7 +372,8 @@
   return &cm->buffer_pool->frame_bufs[cm->ref_frame_map[index]].buf;
 }
 
-static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP10_COMMON *cm) {
+static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(
+    const VP10_COMMON *const cm) {
   return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf;
 }
 
@@ -651,6 +652,51 @@
 }
 #endif
 
+static INLINE PARTITION_TYPE get_partition(const VP10_COMMON *const cm,
+                                           const int mi_row,
+                                           const int mi_col,
+                                           const BLOCK_SIZE bsize) {
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) {
+    return PARTITION_INVALID;
+  } else {
+    const int offset = mi_row * cm->mi_stride + mi_col;
+    MODE_INFO **mi = cm->mi_grid_visible + offset;
+    const MB_MODE_INFO *const mbmi = &mi[0]->mbmi;
+    const int bsl = b_width_log2_lookup[bsize];
+    const PARTITION_TYPE partition = partition_lookup[bsl][mbmi->sb_type];
+#if !CONFIG_EXT_PARTITION_TYPES
+    return partition;
+#else
+    const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+
+    assert(cm->mi_grid_visible[offset] == &cm->mi[offset]);
+
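+    // The base partition_lookup cannot distinguish the extended (A/B)
+    // partitions, so refine the result from the sb_type of the blocks to
+    // the right of and below this one.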
+    if (partition != PARTITION_NONE &&
+        bsize > BLOCK_8X8 &&
+        mi_row + hbs < cm->mi_rows &&
+        mi_col + hbs < cm->mi_cols) {
+      const BLOCK_SIZE h = get_subsize(bsize, PARTITION_HORZ_A);
+      const BLOCK_SIZE v = get_subsize(bsize, PARTITION_VERT_A);
+      const MB_MODE_INFO *const mbmi_right = &mi[hbs]->mbmi;
+      const MB_MODE_INFO *const mbmi_below = &mi[hbs * cm->mi_stride]->mbmi;
+      if (mbmi->sb_type == h) {
+        return mbmi_below->sb_type == h ? PARTITION_HORZ : PARTITION_HORZ_B;
+      } else if (mbmi->sb_type == v) {
+        return mbmi_right->sb_type == v ? PARTITION_VERT : PARTITION_VERT_B;
+      } else if (mbmi_below->sb_type == h) {
+        return PARTITION_HORZ_A;
+      } else if (mbmi_right->sb_type == v) {
+        return PARTITION_VERT_A;
+      } else {
+        return PARTITION_SPLIT;
+      }
+    }
+
+    return partition;
+#endif  // !CONFIG_EXT_PARTITION_TYPES
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index 43ee719..8cc6b84 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -1026,7 +1026,11 @@
   set_mi_row_col(xd, tile, mi_row_pred, bh, mi_col_pred, bw,
                  cm->mi_rows, cm->mi_cols);
 
+#if CONFIG_EXT_TILE
+  xd->up_available    = (mi_row_ori > tile->mi_row_start);
+#else
   xd->up_available    = (mi_row_ori != 0);
+#endif  // CONFIG_EXT_TILE
   xd->left_available  = (mi_col_ori > tile->mi_col_start);
 
   set_plane_n4(xd, bw, bh, bwl, bhl);
@@ -1292,17 +1296,15 @@
                                    int mi_row_top, int mi_col_top,
                                    BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
                                    uint8_t *dst_buf[3], int dst_stride[3]) {
-  VP10_COMMON *const cm = &pbi->common;
-  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
-  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize;
-#if !CONFIG_EXT_PARTITION_TYPES
-  MB_MODE_INFO *mbmi;
-#endif
-  int i, offset = mi_row * cm->mi_stride + mi_col;
+  const VP10_COMMON *const cm = &pbi->common;
+  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
 #if CONFIG_EXT_PARTITION_TYPES
-  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+  const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
 #endif
+  int i;
+  const int mi_offset = mi_row * cm->mi_stride + mi_col;
   uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
 
   DECLARE_ALIGNED(16, uint8_t,
@@ -1345,16 +1347,8 @@
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  xd->mi = cm->mi_grid_visible + offset;
-  xd->mi[0] = cm->mi + offset;
-#if CONFIG_EXT_PARTITION_TYPES
-  partition = get_partition(cm->mi, cm->mi_stride, cm->mi_rows, cm->mi_cols,
-                            mi_row, mi_col, bsize);
-#else
-  mbmi = &xd->mi[0]->mbmi;
-  partition = partition_lookup[bsl][mbmi->sb_type];
-#endif
-  subsize = get_subsize(bsize, partition);
+  xd->mi = cm->mi_grid_visible + mi_offset;
+  xd->mi[0] = cm->mi + mi_offset;
 
   for (i = 0; i < MAX_MB_PLANE; i++) {
     xd->plane[i].dst.buf = dst_buf[i];
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index e46a0fc..f402acb 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -1655,13 +1655,12 @@
                            int mi_row, int mi_col, BLOCK_SIZE bsize) {
   const VP10_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
-
-  const int bsl = b_width_log2_lookup[bsize];
-  const int bs = (1 << bsl) / 4;
-  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize;
-  MODE_INFO *m = NULL;
+  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
 #if CONFIG_SUPERTX
+  const int mi_offset = mi_row * cm->mi_stride + mi_col;
+  MB_MODE_INFO *mbmi = NULL;
   const int pack_token = !supertx_enabled;
   TX_SIZE supertx_size;
   int plane;
@@ -1670,17 +1669,10 @@
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
-
-  partition = partition_lookup[bsl][m->mbmi.sb_type];
-#if CONFIG_EXT_PARTITION_TYPES
-  partition = get_partition(cm->mi, cm->mi_stride, cm->mi_rows, cm->mi_cols,
-                            mi_row, mi_col, bsize);
-#endif
-  write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
-  subsize = get_subsize(bsize, partition);
+  write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w);
 #if CONFIG_SUPERTX
-  xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
+  mbmi = &cm->mi_grid_visible[mi_offset]->mbmi;
+  xd->mi = cm->mi_grid_visible + mi_offset;
   set_mi_row_col(xd, tile,
                  mi_row, num_8x8_blocks_high_lookup[bsize],
                  mi_col, num_8x8_blocks_wide_lookup[bsize],
@@ -1731,59 +1723,59 @@
       case PARTITION_HORZ:
         write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
                               mi_row, mi_col);
-        if (mi_row + bs < cm->mi_rows)
+        if (mi_row + hbs < cm->mi_rows)
           write_modes_b_wrapper(cpi, tile, w, tok, tok_end,
-                                supertx_enabled, mi_row + bs, mi_col);
+                                supertx_enabled, mi_row + hbs, mi_col);
         break;
       case PARTITION_VERT:
         write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
                               mi_row, mi_col);
-        if (mi_col + bs < cm->mi_cols)
+        if (mi_col + hbs < cm->mi_cols)
           write_modes_b_wrapper(cpi, tile, w, tok, tok_end,
-                                supertx_enabled, mi_row, mi_col + bs);
+                                supertx_enabled, mi_row, mi_col + hbs);
         break;
       case PARTITION_SPLIT:
         write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
                                mi_row, mi_col, subsize);
         write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                               mi_row, mi_col + bs, subsize);
+                               mi_row, mi_col + hbs, subsize);
         write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                               mi_row + bs, mi_col, subsize);
+                               mi_row + hbs, mi_col, subsize);
         write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                               mi_row + bs, mi_col + bs, subsize);
+                               mi_row + hbs, mi_col + hbs, subsize);
         break;
 #if CONFIG_EXT_PARTITION_TYPES
       case PARTITION_HORZ_A:
         write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
                       mi_row, mi_col);
         write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                      mi_row, mi_col + bs);
+                      mi_row, mi_col + hbs);
         write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                      mi_row + bs, mi_col);
+                      mi_row + hbs, mi_col);
         break;
       case PARTITION_HORZ_B:
         write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
                       mi_row, mi_col);
         write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                      mi_row + bs, mi_col);
+                      mi_row + hbs, mi_col);
         write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                      mi_row + bs, mi_col + bs);
+                      mi_row + hbs, mi_col + hbs);
         break;
       case PARTITION_VERT_A:
         write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
                       mi_row, mi_col);
         write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                      mi_row + bs, mi_col);
+                      mi_row + hbs, mi_col);
         write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                      mi_row, mi_col + bs);
+                      mi_row, mi_col + hbs);
         break;
       case PARTITION_VERT_B:
         write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
                       mi_row, mi_col);
         write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                      mi_row, mi_col + bs);
+                      mi_row, mi_col + hbs);
         write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
-                      mi_row + bs, mi_col + bs);
+                      mi_row + hbs, mi_col + hbs);
         break;
 #endif  // CONFIG_EXT_PARTITION_TYPES
       default:
@@ -1792,15 +1784,15 @@
   }
 #if CONFIG_SUPERTX
   if (partition != PARTITION_NONE && supertx_enabled && pack_token &&
-      !m->mbmi.skip) {
+      !mbmi->skip) {
     assert(*tok < tok_end);
     for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-      const int mbmi_txb_size = txsize_to_bsize[m->mbmi.tx_size];
+      const int mbmi_txb_size = txsize_to_bsize[mbmi->tx_size];
       const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi_txb_size];
       const int num_4x4_h = num_4x4_blocks_high_lookup[mbmi_txb_size];
       int row, col;
-      TX_SIZE tx = plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane])
-                         : m->mbmi.tx_size;
+      TX_SIZE tx = plane ? get_uv_tx_size(mbmi, &xd->plane[plane])
+                         : mbmi->tx_size;
       BLOCK_SIZE txb_size = txsize_to_bsize[tx];
       int bw = num_4x4_blocks_wide_lookup[txb_size];
 
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index 3da16cc..d1230d3 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -373,7 +373,11 @@
   assert(!(mi_col_pred & (mi_width - 1)) && !(mi_row_pred & (mi_height - 1)));
   set_mi_row_col(xd, tile, mi_row_pred, mi_height, mi_col_pred, mi_width,
                  cm->mi_rows, cm->mi_cols);
+#if CONFIG_EXT_TILE
+  xd->up_available    = (mi_row_ori > tile->mi_row_start);
+#else
   xd->up_available    = (mi_row_ori != 0);
+#endif  // CONFIG_EXT_TILE
   xd->left_available  = (mi_col_ori > tile->mi_col_start);
 
   // R/D setup.
@@ -2264,35 +2268,24 @@
                       TOKENEXTRA **tp, int mi_row, int mi_col,
                       int output_enabled, BLOCK_SIZE bsize,
                       PC_TREE *pc_tree) {
-  VP10_COMMON *const cm = &cpi->common;
+  const VP10_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
-  int ctx;
-  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize = bsize;
+  const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+  const PARTITION_TYPE partition = pc_tree->partitioning;
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
 #if CONFIG_EXT_PARTITION_TYPES
-  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+  const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
 #endif
 
+  assert(bsize >= BLOCK_8X8);
+
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  if (bsize >= BLOCK_8X8) {
-    ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
-    subsize = get_subsize(bsize, pc_tree->partitioning);
-  } else {
-    ctx = 0;
-    subsize = BLOCK_4X4;
-  }
-
-  partition = partition_lookup[bsl][subsize];
-#if CONFIG_EXT_PARTITION_TYPES
-  if (bsize > BLOCK_8X8)
-    partition = pc_tree->partitioning;
-#endif
-  if (output_enabled && bsize != BLOCK_4X4)
+  if (output_enabled)
     td->counts->partition[ctx][partition]++;
 
 #if CONFIG_SUPERTX
@@ -2583,12 +2576,11 @@
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int mis = cm->mi_stride;
-  const int bsl = b_width_log2_lookup[bsize];
-  const int mi_step = num_4x4_blocks_wide_lookup[bsize] / 2;
-  const int bss = (1 << bsl) / 4;
+  const int bs = num_8x8_blocks_wide_lookup[bsize];
+  const int hbs = bs / 2;
   int i, pl;
-  PARTITION_TYPE partition = PARTITION_NONE;
-  BLOCK_SIZE subsize;
+  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
   RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
   RD_COST last_part_rdc, none_rdc, chosen_rdc;
   BLOCK_SIZE sub_subsize = BLOCK_4X4;
@@ -2616,9 +2608,6 @@
   vp10_rd_cost_reset(&none_rdc);
   vp10_rd_cost_reset(&chosen_rdc);
 
-  partition = partition_lookup[bsl][bs_type];
-  subsize = get_subsize(bsize, partition);
-
   pc_tree->partitioning = partition;
 
 #if CONFIG_VAR_TX
@@ -2643,7 +2632,7 @@
       splits_below = 1;
       for (i = 0; i < 4; i++) {
         int jj = i >> 1, ii = i & 0x01;
-        MODE_INFO *this_mi = mi_8x8[jj * bss * mis + ii * bss];
+        MODE_INFO *this_mi = mi_8x8[jj * hbs * mis + ii * hbs];
         if (this_mi && this_mi->mbmi.sb_type >= sub_subsize) {
           splits_below = 0;
         }
@@ -2653,8 +2642,8 @@
     // If partition is not none try none unless each of the 4 splits are split
     // even further..
     if (partition != PARTITION_NONE && !splits_below &&
-        mi_row + (mi_step >> 1) < cm->mi_rows &&
-        mi_col + (mi_step >> 1) < cm->mi_cols) {
+        mi_row + hbs < cm->mi_rows &&
+        mi_col + hbs < cm->mi_cols) {
       pc_tree->partitioning = PARTITION_NONE;
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
 #if CONFIG_SUPERTX
@@ -2705,7 +2694,7 @@
                        subsize, &pc_tree->horizontal[0],
                        INT64_MAX);
       if (last_part_rdc.rate != INT_MAX &&
-          bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) {
+          bsize >= BLOCK_8X8 && mi_row + hbs < cm->mi_rows) {
         RD_COST tmp_rdc;
 #if CONFIG_SUPERTX
         int rt_nocoef = 0;
@@ -2715,7 +2704,7 @@
         update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
         encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
         rd_pick_sb_modes(cpi, tile_data, x,
-                         mi_row + (mi_step >> 1), mi_col, &tmp_rdc,
+                         mi_row + hbs, mi_col, &tmp_rdc,
 #if CONFIG_SUPERTX
                          &rt_nocoef,
 #endif
@@ -2748,7 +2737,7 @@
 #endif
                        subsize, &pc_tree->vertical[0], INT64_MAX);
       if (last_part_rdc.rate != INT_MAX &&
-          bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) {
+          bsize >= BLOCK_8X8 && mi_col + hbs < cm->mi_cols) {
         RD_COST tmp_rdc;
 #if CONFIG_SUPERTX
         int rt_nocoef = 0;
@@ -2758,7 +2747,7 @@
         update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
         encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
         rd_pick_sb_modes(cpi, tile_data, x,
-                         mi_row, mi_col + (mi_step >> 1), &tmp_rdc,
+                         mi_row, mi_col + hbs, &tmp_rdc,
 #if CONFIG_SUPERTX
                          &rt_nocoef,
 #endif
@@ -2801,8 +2790,8 @@
       last_part_rate_nocoef = 0;
 #endif
       for (i = 0; i < 4; i++) {
-        int x_idx = (i & 1) * (mi_step >> 1);
-        int y_idx = (i >> 1) * (mi_step >> 1);
+        int x_idx = (i & 1) * hbs;
+        int y_idx = (i >> 1) * hbs;
         int jj = i >> 1, ii = i & 0x01;
         RD_COST tmp_rdc;
 #if CONFIG_SUPERTX
@@ -2813,7 +2802,7 @@
 
         vp10_rd_cost_init(&tmp_rdc);
         rd_use_partition(cpi, td, tile_data,
-                         mi_8x8 + jj * bss * mis + ii * bss, tp,
+                         mi_8x8 + jj * hbs * mis + ii * hbs, tp,
                          mi_row + y_idx, mi_col + x_idx, subsize,
                          &tmp_rdc.rate, &tmp_rdc.dist,
 #if CONFIG_SUPERTX
@@ -2853,10 +2842,10 @@
       && cpi->sf.adjust_partitioning_from_last_frame
       && cpi->sf.partition_search_type == SEARCH_PARTITION
       && partition != PARTITION_SPLIT && bsize > BLOCK_8X8
-      && (mi_row + mi_step < cm->mi_rows ||
-          mi_row + (mi_step >> 1) == cm->mi_rows)
-      && (mi_col + mi_step < cm->mi_cols ||
-          mi_col + (mi_step >> 1) == cm->mi_cols)) {
+      && (mi_row + bs < cm->mi_rows ||
+          mi_row + hbs == cm->mi_rows)
+      && (mi_col + bs < cm->mi_cols ||
+          mi_col + hbs == cm->mi_cols)) {
     BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
     chosen_rdc.rate = 0;
     chosen_rdc.dist = 0;
@@ -2870,8 +2859,8 @@
 
     // Split partition.
     for (i = 0; i < 4; i++) {
-      int x_idx = (i & 1) * (mi_step >> 1);
-      int y_idx = (i >> 1) * (mi_step >> 1);
+      int x_idx = (i & 1) * hbs;
+      int y_idx = (i >> 1) * hbs;
       RD_COST tmp_rdc;
 #if CONFIG_SUPERTX
       int rt_nocoef = 0;
@@ -5164,29 +5153,20 @@
 static int check_intra_sb(VP10_COMP *cpi, const TileInfo *const tile,
                           int mi_row, int mi_col, BLOCK_SIZE bsize,
                           PC_TREE *pc_tree) {
-  VP10_COMMON *const cm = &cpi->common;
+  const VP10_COMMON *const cm = &cpi->common;
 
-  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
-  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize = bsize;
+  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+  const PARTITION_TYPE partition = pc_tree->partitioning;
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
 #if CONFIG_EXT_PARTITION_TYPES
   int i;
 #endif
 
+  assert(bsize >= BLOCK_8X8);
+
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return 1;
 
-  if (bsize >= BLOCK_8X8)
-    subsize = get_subsize(bsize, pc_tree->partitioning);
-  else
-    subsize = BLOCK_4X4;
-
-  partition = partition_lookup[bsl][subsize];
-#if CONFIG_EXT_PARTITION_TYPES
-  if (bsize > BLOCK_8X8)
-    partition = pc_tree->partitioning;
-#endif
-
   switch (partition) {
     case PARTITION_NONE:
       return check_intra_b(&pc_tree->none);
@@ -5522,14 +5502,15 @@
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
-  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize;
+  const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+  const PARTITION_TYPE partition = pc_tree->partitioning;
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
 #if CONFIG_EXT_PARTITION_TYPES
-  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+  const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
 #endif
 
-  int i, ctx;
+  int i;
   uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
   DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
   DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
@@ -5537,6 +5518,12 @@
   int dst_stride1[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE};
   int dst_stride2[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE};
   int dst_stride3[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE};
+
+  assert(bsize >= BLOCK_8X8);
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     int len = sizeof(uint16_t);
@@ -5564,23 +5551,8 @@
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
-    return;
-
-  if (bsize >= BLOCK_8X8) {
-    ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
-    subsize = get_subsize(bsize, pc_tree->partitioning);
-  } else {
-    ctx = 0;
-    subsize = BLOCK_4X4;
-  }
-  partition = partition_lookup[bsl][subsize];
-#if CONFIG_EXT_PARTITION_TYPES
-  if (bsize > BLOCK_8X8)
-    partition = pc_tree->partitioning;
-#endif
-  if (output_enabled && bsize != BLOCK_4X4 && bsize < top_bsize)
-      cm->counts.partition[ctx][partition]++;
+  if (output_enabled && bsize < top_bsize)
+    cm->counts.partition[ctx][partition]++;
 
   for (i = 0; i < MAX_MB_PLANE; i++) {
     xd->plane[i].dst.buf = dst_buf[i];
diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c
index 4327d97..0c8ec43 100644
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c
@@ -367,8 +367,8 @@
   if (second_pred != NULL) {
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
-      vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
-                               y_stride);
+      vpx_highbd_comp_avg_pred_c(comp_pred16, second_pred, w, h, y + offset,
+                                 y_stride);
       besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride,
                         sse1);
     } else {
diff --git a/vp10/encoder/segmentation.c b/vp10/encoder/segmentation.c
index e7f746f..8628b99 100644
--- a/vp10/encoder/segmentation.c
+++ b/vp10/encoder/segmentation.c
@@ -180,8 +180,7 @@
   if (bsize == BLOCK_8X8)
     partition = PARTITION_NONE;
   else
-    partition = get_partition(cm->mi, cm->mi_stride, cm->mi_rows, cm->mi_cols,
-                              mi_row, mi_col, bsize);
+    partition = get_partition(cm, mi_row, mi_col, bsize);
   switch (partition) {
     case PARTITION_NONE:
       count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index 24f42df..90c8bed 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -433,7 +433,7 @@
   return *sse; \
 }
 
-static void highbd_var_filter_block2d_bil_first_pass(
+void highbd_var_filter_block2d_bil_first_pass(
     const uint8_t *src_ptr8,
     uint16_t *output_ptr,
     unsigned int src_pixels_per_line,
@@ -459,7 +459,7 @@
   }
 }
 
-static void highbd_var_filter_block2d_bil_second_pass(
+void highbd_var_filter_block2d_bil_second_pass(
     const uint16_t *src_ptr,
     uint16_t *output_ptr,
     unsigned int src_pixels_per_line,
@@ -551,8 +551,8 @@
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                             bilinear_filters_2t[yoffset]); \
 \
-  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
-                           CONVERT_TO_BYTEPTR(temp2), W); \
+  vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+                             CONVERT_TO_BYTEPTR(temp2), W); \
 \
   return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                           dst_stride, sse); \
@@ -573,8 +573,8 @@
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                             bilinear_filters_2t[yoffset]); \
 \
-  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
-                           CONVERT_TO_BYTEPTR(temp2), W); \
+  vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+                             CONVERT_TO_BYTEPTR(temp2), W); \
 \
   return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
                                              W, dst, dst_stride, sse); \
@@ -595,8 +595,8 @@
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                             bilinear_filters_2t[yoffset]); \
 \
-  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
-                           CONVERT_TO_BYTEPTR(temp2), W); \
+  vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+                             CONVERT_TO_BYTEPTR(temp2), W); \
 \
   return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
                                              W, dst, dst_stride, sse); \
@@ -635,9 +635,9 @@
 HIGHBD_MSE(8, 16)
 HIGHBD_MSE(8, 8)
 
-void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
-                              int width, int height, const uint8_t *ref8,
-                              int ref_stride) {
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
+                                int width, int height, const uint8_t *ref8,
+                                int ref_stride) {
   int i, j;
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h
index 161d647..4ad23f8 100644
--- a/vpx_dsp/variance.h
+++ b/vpx_dsp/variance.h
@@ -130,6 +130,24 @@
 } vp10_variance_fn_ptr_t;
 #endif  // CONFIG_VP10
 
+void highbd_var_filter_block2d_bil_first_pass(
+    const uint8_t *src_ptr8,
+    uint16_t *output_ptr,
+    unsigned int src_pixels_per_line,
+    int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const uint8_t *filter);
+
+void highbd_var_filter_block2d_bil_second_pass(
+    const uint16_t *src_ptr,
+    uint16_t *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const uint8_t *filter);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index e371849..a9805d7 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -350,6 +350,7 @@
 
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_sse2.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_variance_sse4.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_impl_sse2.asm
 ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_subpel_variance_impl_sse2.asm
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index d01e81d..10a5280 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1316,10 +1316,17 @@
       if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
         specialize "vpx_highbd_${bd}_variance${w}x${h}", "sse2";
       }
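+      # 4x4 blocks, which have no SSE2 version, get dedicated SSE4.1 kernels.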
+      if ($w == 4 && $h == 4) {
+        specialize "vpx_highbd_${bd}_variance${w}x${h}", "sse4_1";
+      }
       if ($w != 128 && $h != 128 && $w != 4) {
         specialize "vpx_highbd_${bd}_sub_pixel_variance${w}x${h}", $sse2_x86inc;
         specialize "vpx_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", $sse2_x86inc;
       }
+      if ($w == 4 && $h == 4) {
+        specialize "vpx_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1";
+        specialize "vpx_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
+      }
     }
   }
 }  # CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/highbd_variance_sse4.c b/vpx_dsp/x86/highbd_variance_sse4.c
new file mode 100644
index 0000000..18ecc7e
--- /dev/null
+++ b/vpx_dsp/x86/highbd_variance_sse4.c
@@ -0,0 +1,236 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/variance.h"
+#include "vpx_dsp/vpx_filter.h"
+
+static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
+                                         const uint8_t *b8, int b_stride,
+                                         uint64_t *sse, int64_t *sum) {
+  __m128i u0, u1, u2, u3;
+  __m128i s0, s1, s2, s3;
+  __m128i t0, t1, x0, y0;
+  __m128i a0, a1, a2, a3;
+  __m128i b0, b1, b2, b3;
+  __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
+
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+  a0 = _mm_loadu_si128((__m128i const *) (a + 0 * a_stride));
+  a1 = _mm_loadu_si128((__m128i const *) (a + 1 * a_stride));
+  a2 = _mm_loadu_si128((__m128i const *) (a + 2 * a_stride));
+  a3 = _mm_loadu_si128((__m128i const *) (a + 3 * a_stride));
+
+  b0 = _mm_loadu_si128((__m128i const *) (b + 0 * b_stride));
+  b1 = _mm_loadu_si128((__m128i const *) (b + 1 * b_stride));
+  b2 = _mm_loadu_si128((__m128i const *) (b + 2 * b_stride));
+  b3 = _mm_loadu_si128((__m128i const *) (b + 3 * b_stride));
+
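+  // Each load brings in one 4-pixel row (only the low 64 bits are used).
+  // Interleaving pairs of rows packs the 16 source and 16 reference pixels
+  // into two 8-lane vectors each.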
+  u0 = _mm_unpacklo_epi16(a0, a1);
+  u1 = _mm_unpacklo_epi16(a2, a3);
+  u2 = _mm_unpacklo_epi16(b0, b1);
+  u3 = _mm_unpacklo_epi16(b2, b3);
+
+  s0 = _mm_sub_epi16(u0, u2);
+  s1 = _mm_sub_epi16(u1, u3);
+
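+  // Multiply-add against ones produces pairwise 32-bit sums of the
+  // differences; the horizontal-add chain reduces them to the total sum of
+  // differences in the low lane of y0.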
+  t0 = _mm_madd_epi16(s0, k_one_epi16);
+  t1 = _mm_madd_epi16(s1, k_one_epi16);
+
+  s2 = _mm_hadd_epi32(t0, t1);
+  s3 = _mm_hadd_epi32(s2, s2);
+  y0 = _mm_hadd_epi32(s3, s3);
+
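+  // madd(s, s) accumulates pairs of squared differences into 32-bit lanes;
+  // the same reduction leaves the sum of squares in the low lane of x0.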
+  t0 = _mm_madd_epi16(s0, s0);
+  t1 = _mm_madd_epi16(s1, s1);
+
+  s2 = _mm_hadd_epi32(t0, t1);
+  s3 = _mm_hadd_epi32(s2, s2);
+  x0 = _mm_hadd_epi32(s3, s3);
+
+  *sse = (uint64_t)_mm_extract_epi32(x0, 0);
+  *sum = (int64_t)_mm_extract_epi32(y0, 0);
+}
+
+uint32_t vpx_highbd_8_variance4x4_sse4_1(const uint8_t *a,
+                                         int a_stride,
+                                         const uint8_t *b,
+                                         int b_stride,
+                                         uint32_t *sse) {
+  int64_t sum;
+  uint64_t local_sse;
+
+  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+  *sse = (uint32_t)local_sse;
+
+  return *sse - ((sum * sum) >> 4);
+}
+
+uint32_t vpx_highbd_10_variance4x4_sse4_1(const uint8_t *a,
+                                          int a_stride,
+                                          const uint8_t *b,
+                                          int b_stride,
+                                          uint32_t *sse) {
+  int64_t sum;
+  uint64_t local_sse;
+
+  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
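+  // Scale back to an 8-bit basis: 10-bit inputs carry 2 extra bits, so the
+  // sum is reduced by 2 bits and the sum of squares by 4.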
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
+  sum = ROUND_POWER_OF_TWO(sum, 2);
+
+  return *sse - ((sum * sum) >> 4);
+}
+
+uint32_t vpx_highbd_12_variance4x4_sse4_1(const uint8_t *a,
+                                          int a_stride,
+                                          const uint8_t *b,
+                                          int b_stride,
+                                          uint32_t *sse) {
+  int64_t sum;
+  uint64_t local_sse;
+
+  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
+  sum = ROUND_POWER_OF_TWO(sum, 4);
+
+  return *sse - ((sum * sum) >> 4);
+}
+
+// Sub-pixel
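+// The sub-pixel variants run the shared C bilinear filter passes, then feed
+// the filtered block to the RTCD-dispatched 4x4 variance.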
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse) {
+
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
+                                           4, bilinear_filters_2t[xoffset]);
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+                                            bilinear_filters_2t[yoffset]);
+
+  return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2),
+                                  4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse) {
+
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
+                                           4, bilinear_filters_2t[xoffset]);
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+                                            bilinear_filters_2t[yoffset]);
+
+  return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2),
+                                   4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse) {
+
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
+                                           4, bilinear_filters_2t[xoffset]);
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+                                            bilinear_filters_2t[yoffset]);
+
+  return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2),
+                                   4, dst, dst_stride, sse);
+}
+
+// Sub-pixel average
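+// As above, but the filtered block is first averaged with second_pred via
+// vpx_highbd_comp_avg_pred_c().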
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse,
+    const uint8_t *second_pred) {
+
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
+                                           4, bilinear_filters_2t[xoffset]);
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+                                            bilinear_filters_2t[yoffset]);
+
+  vpx_highbd_comp_avg_pred_c(temp3, second_pred, 4, 4,
+                             CONVERT_TO_BYTEPTR(temp2), 4);
+
+  return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3),
+                                  4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse,
+    const uint8_t *second_pred) {
+
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
+                                           4, bilinear_filters_2t[xoffset]);
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+                                            bilinear_filters_2t[yoffset]);
+
+  vpx_highbd_comp_avg_pred_c(temp3, second_pred, 4, 4,
+                             CONVERT_TO_BYTEPTR(temp2), 4);
+
+  return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3),
+                                   4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse,
+    const uint8_t *second_pred) {
+
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
+                                           4, bilinear_filters_2t[xoffset]);
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+                                            bilinear_filters_2t[yoffset]);
+
+  vpx_highbd_comp_avg_pred_c(temp3, second_pred, 4, 4,
+                             CONVERT_TO_BYTEPTR(temp2), 4);
+
+  return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3),
+                                   4, dst, dst_stride, sse);
+}