Merge changes I92819356,I50b5a313,I807e60c6,I8a8df9fd into nextgenv2

* changes:
  Branch dct to new implementation for bd12
  Change dct32x32's range
  Fit dct's stage range into 32-bit when bitdepth is 12
  Pass tx_type into get_tx_scale
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index cd0dca2..777ac49 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -164,6 +164,7 @@
     mismatch_psnr_ += mismatch_psnr;
     ++mismatch_nframes_;
     // std::cout << "Mismatch frame psnr: " << mismatch_psnr << "\n";
+    ASSERT_TRUE(0) << "Encode/Decode mismatch found";
   }
 
   void SetErrorFrames(int num, unsigned int *list) {
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index a3f0152..48edf1e 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -15,12 +15,16 @@
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
+#include "test/util.h"
 #if CONFIG_VP10
 #include "vp10/common/blockd.h"
 #elif CONFIG_VP9
 #include "vp9/common/vp9_blockd.h"
 #endif
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#define USE_SPEED_TEST (0)
 
 typedef void (*SubtractFunc)(int rows, int cols,
                              int16_t *diff_ptr, ptrdiff_t diff_stride,
@@ -108,4 +112,151 @@
 INSTANTIATE_TEST_CASE_P(MSA, VP9SubtractBlockTest,
                         ::testing::Values(vpx_subtract_block_msa));
 #endif
+
+typedef void (*HBDSubtractFunc)(int rows, int cols,
+                                int16_t *diff_ptr, ptrdiff_t diff_stride,
+                                const uint8_t *src_ptr, ptrdiff_t src_stride,
+                                const uint8_t *pred_ptr, ptrdiff_t pred_stride,
+                                int bd);
+
+using ::std::tr1::get;
+using ::std::tr1::make_tuple;
+using ::std::tr1::tuple;
+
+// <width, height, bit_depth, subtract>
+typedef tuple<int, int, int, HBDSubtractFunc> Params;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+class VP10HBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
+ public:
+  virtual void SetUp() {
+    block_width_ = GET_PARAM(0);
+    block_height_ = GET_PARAM(1);
+    bit_depth_ = static_cast<vpx_bit_depth_t>(GET_PARAM(2));
+    func_ = GET_PARAM(3);
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+
+    const size_t max_width = 128;
+    const size_t max_block_size = max_width * max_width;
+    src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+        vpx_memalign(16, max_block_size * sizeof(uint16_t))));
+    pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+        vpx_memalign(16, max_block_size * sizeof(uint16_t))));
+    diff_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, max_block_size * sizeof(int16_t)));
+  }
+
+  virtual void TearDown() {
+    vpx_free(CONVERT_TO_SHORTPTR(src_));
+    vpx_free(CONVERT_TO_SHORTPTR(pred_));
+    vpx_free(diff_);
+  }
+
+ protected:
+  void RunForSpeed();
+  void CheckResult();
+
+ private:
+  ACMRandom rnd_;
+  int block_height_;
+  int block_width_;
+  vpx_bit_depth_t bit_depth_;
+  HBDSubtractFunc func_;
+  uint8_t *src_;
+  uint8_t *pred_;
+  int16_t *diff_;
+};
+
+void VP10HBDSubtractBlockTest::RunForSpeed() {
+  const int test_num = 200000;
+  const int max_width = 128;
+  const int max_block_size = max_width * max_width;
+  const int mask = (1 << bit_depth_) - 1;
+  int i, j;
+
+  for (j = 0; j < max_block_size; ++j) {
+    CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+    CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+  }
+
+  for (i = 0; i < test_num; ++i) {
+    func_(block_height_, block_width_, diff_, block_width_,
+          src_, block_width_, pred_, block_width_, bit_depth_);
+  }
+}
+
+void VP10HBDSubtractBlockTest::CheckResult() {
+  const int test_num = 100;
+  const int max_width = 128;
+  const int max_block_size = max_width * max_width;
+  const int mask = (1 << bit_depth_) - 1;
+  int i, j;
+
+  for (i = 0; i < test_num; ++i) {
+    for (j = 0; j < max_block_size; ++j) {
+      CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+      CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+    }
+
+    func_(block_height_, block_width_, diff_, block_width_,
+          src_, block_width_, pred_, block_width_, bit_depth_);
+
+    for (int r = 0; r < block_height_; ++r) {
+      for (int c = 0; c < block_width_; ++c) {
+        EXPECT_EQ(diff_[r * block_width_ + c],
+                  (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] -
+                   CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c]))
+            << "r = " << r << ", c = " << c << ", test: " << i;
+      }
+    }
+  }
+}
+
+TEST_P(VP10HBDSubtractBlockTest, CheckResult) {
+  CheckResult();
+}
+
+#if USE_SPEED_TEST
+TEST_P(VP10HBDSubtractBlockTest, CheckSpeed) {
+  RunForSpeed();
+}
+#endif  // USE_SPEED_TEST
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, VP10HBDSubtractBlockTest, ::testing::Values(
+    make_tuple(4, 4, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(4, 4, 12, vpx_highbd_subtract_block_c),
+    make_tuple(4, 8, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(4, 8, 12, vpx_highbd_subtract_block_c),
+    make_tuple(8, 4, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(8, 4, 12, vpx_highbd_subtract_block_c),
+    make_tuple(8, 8, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(8, 8, 12, vpx_highbd_subtract_block_c),
+    make_tuple(8, 16, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(8, 16, 12, vpx_highbd_subtract_block_c),
+    make_tuple(16, 8, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(16, 8, 12, vpx_highbd_subtract_block_c),
+    make_tuple(16, 16, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(16, 16, 12, vpx_highbd_subtract_block_c),
+    make_tuple(16, 32, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(16, 32, 12, vpx_highbd_subtract_block_c),
+    make_tuple(32, 16, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(32, 16, 12, vpx_highbd_subtract_block_c),
+    make_tuple(32, 32, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(32, 32, 12, vpx_highbd_subtract_block_c),
+    make_tuple(32, 64, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(32, 64, 12, vpx_highbd_subtract_block_c),
+    make_tuple(64, 32, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(64, 32, 12, vpx_highbd_subtract_block_c),
+    make_tuple(64, 64, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(64, 64, 12, vpx_highbd_subtract_block_c),
+    make_tuple(64, 128, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(64, 128, 12, vpx_highbd_subtract_block_c),
+    make_tuple(128, 64, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(128, 64, 12, vpx_highbd_subtract_block_c),
+    make_tuple(128, 128, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(128, 128, 12, vpx_highbd_subtract_block_c)));
+#endif  // HAVE_SSE2
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 97c5516..79f4e10 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -74,6 +74,10 @@
   return res;
 }
 
+/* Note:
+ *  Our codebase calculates the "diff" value in the variance algorithm by
+ *  (src - ref).
+ */
 static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref,
                              int l2w, int l2h, int src_stride_coeff,
                              int ref_stride_coeff, uint32_t *sse_ptr,
@@ -87,14 +91,14 @@
     for (int x = 0; x < w; x++) {
       int diff;
       if (!use_high_bit_depth_) {
-        diff = ref[w * y * ref_stride_coeff + x] -
-               src[w * y * src_stride_coeff + x];
+        diff = src[w * y * src_stride_coeff + x] -
+               ref[w * y * ref_stride_coeff + x];
         se += diff;
         sse += diff * diff;
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
-        diff = CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x] -
-               CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x];
+        diff = CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x] -
+               CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x];
         se += diff;
         sse += diff * diff;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -309,15 +313,15 @@
 void VarianceTest<VarianceFunctionType>::RefTest() {
   for (int i = 0; i < 10; ++i) {
     for (int j = 0; j < block_size_; j++) {
-    if (!use_high_bit_depth_) {
-      src_[j] = rnd_.Rand8();
-      ref_[j] = rnd_.Rand8();
+      if (!use_high_bit_depth_) {
+        src_[j] = rnd_.Rand8();
+        ref_[j] = rnd_.Rand8();
 #if CONFIG_VP9_HIGHBITDEPTH
-    } else {
-      CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() && mask_;
-      CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() && mask_;
+      } else {
+        CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+        CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    }
+      }
     }
     unsigned int sse1, sse2;
     unsigned int var1;
@@ -328,8 +332,10 @@
                                            log2height_, stride_coeff,
                                            stride_coeff, &sse2,
                                            use_high_bit_depth_, bit_depth_);
-    EXPECT_EQ(sse1, sse2);
-    EXPECT_EQ(var1, var2);
+    EXPECT_EQ(sse1, sse2)
+        << "Error at test index: " << i;
+    EXPECT_EQ(var1, var2)
+        << "Error at test index: " << i;
   }
 }
 
@@ -346,8 +352,8 @@
         ref_[ref_ind] = rnd_.Rand8();
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
-        CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() && mask_;
-        CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() && mask_;
+        CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() & mask_;
+        CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() & mask_;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       }
     }
@@ -361,8 +367,10 @@
                                            log2height_, src_stride_coeff,
                                            ref_stride_coeff, &sse2,
                                            use_high_bit_depth_, bit_depth_);
-    EXPECT_EQ(sse1, sse2);
-    EXPECT_EQ(var1, var2);
+    EXPECT_EQ(sse1, sse2)
+        << "Error at test index: " << i;
+    EXPECT_EQ(var1, var2)
+        << "Error at test index: " << i;
   }
 }
 
@@ -915,6 +923,15 @@
                       make_tuple(2, 3, &vpx_highbd_8_variance4x8_c, 8),
                       make_tuple(2, 2, &vpx_highbd_8_variance4x4_c, 8)));
 
+#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VpxHBDVarianceTest,
+    ::testing::Values(
+         make_tuple(2, 2, &vpx_highbd_8_variance4x4_sse4_1, 8),
+         make_tuple(2, 2, &vpx_highbd_10_variance4x4_sse4_1, 10),
+         make_tuple(2, 2, &vpx_highbd_12_variance4x4_sse4_1, 12)));
+#endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+
 INSTANTIATE_TEST_CASE_P(
     C, VpxHBDSubpelVarianceTest,
     ::testing::Values(
@@ -1117,6 +1134,22 @@
         make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse, 0)));
 #endif  // CONFIG_USE_X86INC
 
+#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VpxSubpelVarianceTest,
+    ::testing::Values(
+         make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_sse4_1, 8),
+         make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_sse4_1, 10),
+         make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_sse4_1, 12)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VpxSubpelAvgVarianceTest,
+    ::testing::Values(
+        make_tuple(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_sse4_1, 8),
+        make_tuple(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_sse4_1, 10),
+        make_tuple(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_sse4_1, 12)));
+#endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+
 #if CONFIG_VP9_HIGHBITDEPTH
 /* TODO(debargha): This test does not support the highbd version
 INSTANTIATE_TEST_CASE_P(
diff --git a/test/vp10_ans_test.cc b/test/vp10_ans_test.cc
index 0e75157..9c93dd8 100644
--- a/test/vp10_ans_test.cc
+++ b/test/vp10_ans_test.cc
@@ -218,14 +218,16 @@
   tree[2 * i - 1] = sym;
 }
 
-// treep are the probabilites of tree nodes like:
-//          *
-//         / \
-//    -sym0  *
-//          / \
-//     -sym1  *
-//           / \
-//      -sym2  -sym3
+/* The treep array contains the probabilities of nodes of a tree structured
+ * like:
+ *          *
+ *         / \
+ *    -sym0   *
+ *           / \
+ *       -sym1  *
+ *             / \
+ *        -sym2  -sym3
+ */
 void tab2tree(const rans_sym *tab, int tab_size, vpx_prob *treep) {
   const unsigned basep = 256;
   unsigned pleft = basep;
diff --git a/test/vp10_fwd_txfm1d_test.cc b/test/vp10_fwd_txfm1d_test.cc
index 2d09e0d..d6643e5 100644
--- a/test/vp10_fwd_txfm1d_test.cc
+++ b/test/vp10_fwd_txfm1d_test.cc
@@ -12,23 +12,28 @@
 #include "test/vp10_txfm_test.h"
 
 using libvpx_test::ACMRandom;
+using libvpx_test::base;
+using libvpx_test::reference_hybrid_1d;
+using libvpx_test::TYPE_TXFM;
+using libvpx_test::TYPE_DCT;
+using libvpx_test::TYPE_ADST;
 
 namespace {
-static int txfm_type_num = 2;
-static TYPE_TXFM txfm_type_ls[2] = {TYPE_DCT, TYPE_ADST};
+const int txfm_type_num = 2;
+const TYPE_TXFM txfm_type_ls[2] = {TYPE_DCT, TYPE_ADST};
 
-static int txfm_size_num = 5;
-static int txfm_size_ls[5] = {4, 8, 16, 32, 64};
+const int txfm_size_num = 5;
+const int txfm_size_ls[5] = {4, 8, 16, 32, 64};
 
-static TxfmFunc fwd_txfm_func_ls[2][5] = {
+const TxfmFunc fwd_txfm_func_ls[2][5] = {
     {vp10_fdct4_new, vp10_fdct8_new, vp10_fdct16_new, vp10_fdct32_new,
      vp10_fdct64_new},
     {vp10_fadst4_new, vp10_fadst8_new, vp10_fadst16_new, vp10_fadst32_new,
      NULL}};
 
 // the maximum stage number of fwd/inv 1d dct/adst txfm is 12
-static int8_t cos_bit[12] = {14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
-static int8_t range_bit[12] = {32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
+const int8_t cos_bit[12] = {14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
+const int8_t range_bit[12] = {32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
 
 TEST(vp10_fwd_txfm1d, round_shift) {
   EXPECT_EQ(round_shift(7, 1), 4);
diff --git a/test/vp10_fwd_txfm2d_sse4_test.cc b/test/vp10_fwd_txfm2d_sse4_test.cc
index d3882cd..ab9450b 100644
--- a/test/vp10_fwd_txfm2d_sse4_test.cc
+++ b/test/vp10_fwd_txfm2d_sse4_test.cc
@@ -8,6 +8,9 @@
 #include "vp10/common/vp10_fwd_txfm2d_cfg.h"
 
 using libvpx_test::ACMRandom;
+using libvpx_test::Fwd_Txfm2d_Func;
+using libvpx_test::base;
+using libvpx_test::bd;
 
 namespace {
 
@@ -58,8 +61,8 @@
       }
     }
 
-    txfm2d_func_c(input, output_c, cfg.txfm_size, &cfg, 10);
-    txfm2d_func_sse4_1(input, output_sse4_1, cfg.txfm_size, &cfg, 10);
+    txfm2d_func_c(input, output_c, cfg.txfm_size, &cfg, bd);
+    txfm2d_func_sse4_1(input, output_sse4_1, cfg.txfm_size, &cfg, bd);
     for (int r = 0; r < txfm_size; r++) {
       for (int c = 0; c < txfm_size; c++) {
         EXPECT_EQ(output_c[r * txfm_size + c],
diff --git a/test/vp10_fwd_txfm2d_test.cc b/test/vp10_fwd_txfm2d_test.cc
index 137f653..668103b 100644
--- a/test/vp10_fwd_txfm2d_test.cc
+++ b/test/vp10_fwd_txfm2d_test.cc
@@ -18,6 +18,13 @@
 #include "./vp10_rtcd.h"
 
 using libvpx_test::ACMRandom;
+using libvpx_test::base;
+using libvpx_test::bd;
+using libvpx_test::compute_avg_abs_error;
+using libvpx_test::Fwd_Txfm2d_Func;
+using libvpx_test::TYPE_TXFM;
+using libvpx_test::TYPE_DCT;
+using libvpx_test::TYPE_ADST;
 
 namespace {
 
@@ -36,8 +43,8 @@
     {&fwd_txfm_2d_cfg_dct_dct_64, NULL, NULL, NULL}};
 
 const Fwd_Txfm2d_Func fwd_txfm_func_ls[5] = {
-    vp10_fwd_txfm2d_4x4, vp10_fwd_txfm2d_8x8, vp10_fwd_txfm2d_16x16,
-    vp10_fwd_txfm2d_32x32, vp10_fwd_txfm2d_64x64};
+    vp10_fwd_txfm2d_4x4_c, vp10_fwd_txfm2d_8x8_c, vp10_fwd_txfm2d_16x16_c,
+    vp10_fwd_txfm2d_32x32_c, vp10_fwd_txfm2d_64x64_c};
 
 const int txfm_type_num = 4;
 const TYPE_TXFM type_ls_0[4] = {TYPE_DCT, TYPE_DCT, TYPE_ADST, TYPE_ADST};
@@ -106,4 +113,4 @@
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-}  // anonymous namespace
+}  // namespace
diff --git a/test/vp10_inv_txfm1d_test.cc b/test/vp10_inv_txfm1d_test.cc
index 2e9e58d..98b2777 100644
--- a/test/vp10_inv_txfm1d_test.cc
+++ b/test/vp10_inv_txfm1d_test.cc
@@ -13,27 +13,28 @@
 #include "vp10/common/vp10_inv_txfm1d.h"
 
 using libvpx_test::ACMRandom;
+using libvpx_test::base;
 
 namespace {
-static int txfm_type_num = 2;
-static int txfm_size_num = 5;
-static int txfm_size_ls[5] = {4, 8, 16, 32, 64};
+const int txfm_type_num = 2;
+const int txfm_size_num = 5;
+const int txfm_size_ls[5] = {4, 8, 16, 32, 64};
 
-static TxfmFunc fwd_txfm_func_ls[2][5] = {
+const TxfmFunc fwd_txfm_func_ls[2][5] = {
     {vp10_fdct4_new, vp10_fdct8_new, vp10_fdct16_new, vp10_fdct32_new,
      vp10_fdct64_new},
     {vp10_fadst4_new, vp10_fadst8_new, vp10_fadst16_new, vp10_fadst32_new,
      NULL}};
 
-static TxfmFunc inv_txfm_func_ls[2][5] = {
+const TxfmFunc inv_txfm_func_ls[2][5] = {
     {vp10_idct4_new, vp10_idct8_new, vp10_idct16_new, vp10_idct32_new,
      vp10_idct64_new},
     {vp10_iadst4_new, vp10_iadst8_new, vp10_iadst16_new, vp10_iadst32_new,
      NULL}};
 
 // the maximum stage number of fwd/inv 1d dct/adst txfm is 12
-static int8_t cos_bit[12] = {14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
-static int8_t range_bit[12] = {32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
+const int8_t cos_bit[12] = {14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
+const int8_t range_bit[12] = {32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
 
 TEST(vp10_inv_txfm1d, round_trip) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
diff --git a/test/vp10_inv_txfm2d_test.cc b/test/vp10_inv_txfm2d_test.cc
index 9257244..7acb329 100644
--- a/test/vp10_inv_txfm2d_test.cc
+++ b/test/vp10_inv_txfm2d_test.cc
@@ -19,6 +19,11 @@
 #include "vp10/common/vp10_inv_txfm2d_cfg.h"
 
 using libvpx_test::ACMRandom;
+using libvpx_test::base;
+using libvpx_test::bd;
+using libvpx_test::compute_avg_abs_error;
+using libvpx_test::Fwd_Txfm2d_Func;
+using libvpx_test::Inv_Txfm2d_Func;
 
 namespace {
 
@@ -116,4 +121,4 @@
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-}  // anonymous namespace
+}  // namespace
diff --git a/test/vp10_txfm_test.h b/test/vp10_txfm_test.h
index a3a4258..c5bbb48 100644
--- a/test/vp10_txfm_test.h
+++ b/test/vp10_txfm_test.h
@@ -23,6 +23,7 @@
 #include "test/acm_random.h"
 #include "vp10/common/vp10_txfm.h"
 
+namespace libvpx_test {
 typedef enum {
   TYPE_DCT = 0,
   TYPE_ADST,
@@ -109,5 +110,5 @@
 
 static const int bd = 10;
 static const int base = (1 << bd);
-
+}  // namespace libvpx_test
 #endif  // VP10_TXFM_TEST_H_
diff --git a/vp10/common/alloccommon.c b/vp10/common/alloccommon.c
index dd58e6d..abdc72b 100644
--- a/vp10/common/alloccommon.c
+++ b/vp10/common/alloccommon.c
@@ -134,7 +134,8 @@
     // TODO(geza.lore): These are bigger than they need to be.
     // cm->tile_width would be enough but it complicates indexing a
     // little elsewhere.
-    const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+    const int aligned_mi_cols =
+        ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
     int i;
 
     for (i = 0 ; i < MAX_MB_PLANE ; i++) {
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index 821d67c..4ed7f81 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -70,6 +70,16 @@
     return WEDGE_BITS_BIG;
 }
 
+static INLINE int is_interinter_wedge_used(BLOCK_SIZE sb_type) {
+  (void) sb_type;
+  return get_wedge_bits(sb_type) > 0;
+}
+
+static INLINE int is_interintra_wedge_used(BLOCK_SIZE sb_type) {
+  (void) sb_type;
+  return 0;  // get_wedge_bits(sb_type) > 0;
+}
+
 static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) {
   return mode >= NEARESTMV && mode <= NEWFROMNEARMV;
 }
@@ -166,7 +176,7 @@
 #if CONFIG_VAR_TX
   // TODO(jingning): This effectively assigned a separate entry for each
   // 8x8 block. Apparently it takes much more space than needed.
-  TX_SIZE inter_tx_size[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+  TX_SIZE inter_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
 #endif
   int8_t skip;
   int8_t has_no_coeffs;
@@ -190,8 +200,8 @@
 #endif  // CONFIG_EXT_INTRA
 
 #if CONFIG_EXT_INTER
-  PREDICTION_MODE interintra_mode;
-  PREDICTION_MODE interintra_uv_mode;
+  INTERINTRA_MODE interintra_mode;
+  INTERINTRA_MODE interintra_uv_mode;
   // TODO(debargha): Consolidate these flags
   int use_wedge_interintra;
   int interintra_wedge_index;
@@ -315,15 +325,15 @@
   const YV12_BUFFER_CONFIG *cur_buf;
 
   ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
-  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][2 * MI_BLOCK_SIZE];
+  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][2 * MAX_MIB_SIZE];
 
   PARTITION_CONTEXT *above_seg_context;
-  PARTITION_CONTEXT left_seg_context[MI_BLOCK_SIZE];
+  PARTITION_CONTEXT left_seg_context[MAX_MIB_SIZE];
 
 #if CONFIG_VAR_TX
   TXFM_CONTEXT *above_txfm_context;
   TXFM_CONTEXT *left_txfm_context;
-  TXFM_CONTEXT left_txfm_context_buffer[MI_BLOCK_SIZE];
+  TXFM_CONTEXT left_txfm_context_buffer[MAX_MIB_SIZE];
 
   TX_SIZE max_tx_size;
 #if CONFIG_SUPERTX
@@ -353,40 +363,12 @@
 
 static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
                                      PARTITION_TYPE partition) {
-  return subsize_lookup[partition][bsize];
+  if (partition == PARTITION_INVALID)
+    return PARTITION_INVALID;
+  else
+    return subsize_lookup[partition][bsize];
 }
 
-#if CONFIG_EXT_PARTITION_TYPES
-static INLINE PARTITION_TYPE get_partition(const MODE_INFO *const mi,
-                                           int mi_stride, int mi_rows,
-                                           int mi_cols, int mi_row,
-                                           int mi_col, BLOCK_SIZE bsize) {
-  const int bsl = b_width_log2_lookup[bsize];
-  const int bs = (1 << bsl) / 4;
-  MODE_INFO m = mi[mi_row * mi_stride + mi_col];
-  PARTITION_TYPE partition = partition_lookup[bsl][m.mbmi.sb_type];
-  if (partition != PARTITION_NONE && bsize > BLOCK_8X8 &&
-      mi_row + bs < mi_rows && mi_col + bs < mi_cols) {
-    BLOCK_SIZE h = get_subsize(bsize, PARTITION_HORZ_A);
-    BLOCK_SIZE v = get_subsize(bsize, PARTITION_VERT_A);
-    MODE_INFO m_right = mi[mi_row * mi_stride + mi_col + bs];
-    MODE_INFO m_below = mi[(mi_row + bs) * mi_stride + mi_col];
-    if (m.mbmi.sb_type == h) {
-      return m_below.mbmi.sb_type == h ? PARTITION_HORZ : PARTITION_HORZ_B;
-    } else if (m.mbmi.sb_type == v) {
-      return m_right.mbmi.sb_type == v ? PARTITION_VERT : PARTITION_VERT_B;
-    } else if (m_below.mbmi.sb_type == h) {
-      return PARTITION_HORZ_A;
-    } else if (m_right.mbmi.sb_type == v) {
-      return PARTITION_VERT_A;
-    } else {
-      return PARTITION_SPLIT;
-    }
-  }
-  return partition;
-}
-#endif  // CONFIG_EXT_PARTITION_TYPES
-
 static const TX_TYPE intra_mode_to_tx_type_context[INTRA_MODES] = {
   DCT_DCT,    // DC
   ADST_DCT,   // V
@@ -409,15 +391,16 @@
 #endif  // CONFIG_SUPERTX
 
 #if CONFIG_EXT_TX
-#define ALLOW_INTRA_EXT_TX       1
+#define ALLOW_INTRA_EXT_TX          1
 // whether masked transforms are used for 32X32
-#define USE_MSKTX_FOR_32X32      0
+#define USE_MSKTX_FOR_32X32         0
+#define USE_REDUCED_TXSET_FOR_16X16 1
 
 static const int num_ext_tx_set_inter[EXT_TX_SETS_INTER] = {
   1, 16, 12, 2
 };
 static const int num_ext_tx_set_intra[EXT_TX_SETS_INTRA] = {
-  1, 12, 10
+  1, 12, 5
 };
 
 #if EXT_TX_SIZES == 4
@@ -426,7 +409,11 @@
   if (tx_size > TX_32X32 || bs < BLOCK_8X8) return 0;
   if (tx_size == TX_32X32)
     return is_inter ? 3 - 2 * USE_MSKTX_FOR_32X32 : 0;
-  return ((is_inter || tx_size < TX_16X16) ? 1 : 2);
+#if USE_REDUCED_TXSET_FOR_16X16
+  return (tx_size == TX_16X16 ? 2 : 1);
+#else
+  return (tx_size == TX_16X16 && !is_inter ? 2 : 1);
+#endif  // USE_REDUCED_TXSET_FOR_16X16
 }
 
 static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA][TX_SIZES] = {
@@ -462,7 +449,7 @@
   { 0, 0, 0, 0, },  // unused
   { 1, 1, 0, 0, },
   { 0, 0, 1, 0, },
-  { 0, 0, 0, 0, },
+  { 0, 0, 0, 1, },
 };
 #endif  // EXT_TX_SIZES == 4
 
@@ -470,14 +457,14 @@
 static const int ext_tx_used_intra[EXT_TX_SETS_INTRA][TX_TYPES] = {
   {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
   {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0},
-  {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0},
+  {1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0},
 };
 
 // Transform types used in each inter set
 static const int ext_tx_used_inter[EXT_TX_SETS_INTER][TX_TYPES] = {
   {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
   {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
-  {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1},
+  {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0},
   {1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0},
 };
 
@@ -701,6 +688,16 @@
           && is_interintra_allowed_ref(mbmi->ref_frame);
 }
 
+static INLINE int is_interintra_allowed_bsize_group(const int group) {
+  int i;
+  for (i = 0; i < BLOCK_SIZES; i++) {
+    if (size_group_lookup[i] == group &&
+        is_interintra_allowed_bsize(i))
+      return 1;
+  }
+  return 0;
+}
+
 static INLINE int is_interintra_pred(const MB_MODE_INFO *mbmi) {
   return (mbmi->ref_frame[1] == INTRA_FRAME) && is_interintra_allowed(mbmi);
 }
diff --git a/vp10/common/entropymode.c b/vp10/common/entropymode.c
index 29d5419..f1c8e30 100644
--- a/vp10/common/entropymode.c
+++ b/vp10/common/entropymode.c
@@ -268,24 +268,29 @@
   {25,  29,  50, 192, 192, 128, 180, 180},   // 6 = two intra neighbours
 };
 
-static const vpx_prob default_interintra_prob[BLOCK_SIZES] = {
-  192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
-#if CONFIG_EXT_PARTITION
-  192, 192, 192
-#endif  // CONFIG_EXT_PARTITION
+static const vpx_prob default_interintra_prob[BLOCK_SIZE_GROUPS] = {
+  208, 208, 208, 208,
+};
+
+static const vpx_prob
+    default_interintra_mode_prob[BLOCK_SIZE_GROUPS][INTERINTRA_MODES - 1] = {
+  {  65,  32,  18, 144, 162, 194,  41,  51,  98 },  // block_size < 8x8
+  { 132,  68,  18, 165, 217, 196,  45,  40,  78 },  // block_size < 16x16
+  { 173,  80,  19, 176, 240, 193,  64,  35,  46 },  // block_size < 32x32
+  { 221, 135,  38, 194, 248, 121,  96,  85,  29 }   // block_size >= 32x32
 };
 
 static const vpx_prob default_wedge_interintra_prob[BLOCK_SIZES] = {
-  192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
+  208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208,
 #if CONFIG_EXT_PARTITION
-  192, 192, 192
+  208, 208, 208
 #endif  // CONFIG_EXT_PARTITION
 };
 
 static const vpx_prob default_wedge_interinter_prob[BLOCK_SIZES] = {
-  192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
+  208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208,
 #if CONFIG_EXT_PARTITION
-  192, 192, 192
+  208, 208, 208
 #endif  // CONFIG_EXT_PARTITION
 };
 #endif  // CONFIG_EXT_INTER
@@ -310,7 +315,7 @@
   -D135_PRED, -D117_PRED,           /* 5 = D135_NODE */
   -D45_PRED, 14,                    /* 6 = D45_NODE */
   -D63_PRED, 16,                    /* 7 = D63_NODE */
-  -D153_PRED, -D207_PRED             /* 8 = D153_NODE */
+  -D153_PRED, -D207_PRED            /* 8 = D153_NODE */
 };
 
 const vpx_tree_index vp10_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
@@ -325,6 +330,18 @@
 };
 
 #if CONFIG_EXT_INTER
+const vpx_tree_index vp10_interintra_mode_tree[TREE_SIZE(INTERINTRA_MODES)] = {
+  -II_DC_PRED, 2,                   /* 0 = II_DC_NODE     */
+  -II_TM_PRED, 4,                   /* 1 = II_TM_NODE     */
+  -II_V_PRED, 6,                    /* 2 = II_V_NODE      */
+  8, 12,                            /* 3 = II_COM_NODE    */
+  -II_H_PRED, 10,                   /* 4 = II_H_NODE      */
+  -II_D135_PRED, -II_D117_PRED,     /* 5 = II_D135_NODE   */
+  -II_D45_PRED, 14,                 /* 6 = II_D45_NODE    */
+  -II_D63_PRED, 16,                 /* 7 = II_D63_NODE    */
+  -II_D153_PRED, -II_D207_PRED      /* 8 = II_D153_NODE   */
+};
+
 const vpx_tree_index vp10_inter_compound_mode_tree
       [TREE_SIZE(INTER_COMPOUND_MODES)] = {
   -INTER_COMPOUND_OFFSET(ZERO_ZEROMV), 2,
@@ -965,17 +982,12 @@
     -FLIPADST_DCT, -DCT_FLIPADST,
     18, 20,
     -ADST_ADST, -FLIPADST_FLIPADST,
-    -ADST_FLIPADST, -FLIPADST_ADST
+    -ADST_FLIPADST, -FLIPADST_ADST,
   }, {
     -IDTX, 2,
     -DCT_DCT, 4,
-    6, 12,
-    8, 10,
+    -ADST_ADST, 6,
     -ADST_DCT, -DCT_ADST,
-    -FLIPADST_DCT, -DCT_FLIPADST,
-    14, 16,
-    -ADST_ADST, -FLIPADST_FLIPADST,
-    -ADST_FLIPADST, -FLIPADST_ADST
   }
 };
 
@@ -1077,50 +1089,50 @@
     },
   }, {
     {
-      {   8, 176, 128, 128, 128, 128, 128, 128, 128, },
-      {  10,  28, 176, 192, 208, 128, 128, 128, 128, },
-      {  10,  28, 176, 192,  48, 128, 128, 128, 128, },
-      {   9, 160, 128, 128, 128, 128, 128, 128, 128, },
-      {   8,  28,  96, 128, 128, 128, 160, 192, 128, },
-      {   7,  28, 160, 176, 192, 128, 128, 128, 128, },
-      {   7,  20, 160, 176,  64, 128, 128, 128, 128, },
-      {  10,  23, 160, 176,  64, 128, 128, 128, 128, },
-      {   8,  29, 160, 176, 192, 128, 128, 128, 128, },
-      {   3,  20,  96, 128, 128, 128, 160, 192, 128, },
+      {   8, 224,  64, 128, },
+      {  10,  32,  16, 192, },
+      {  10,  32,  16,  64, },
+      {   9, 200,  64, 128, },
+      {   8,   8, 224, 128, },
+      {  10,  32,  16, 192, },
+      {  10,  32,  16,  64, },
+      {  10,  23,  80, 176, },
+      {  10,  23,  80, 176, },
+      {  10,  32,  16,  64, },
     }, {
-      {   2, 176, 128, 128, 128, 128, 128, 128, 128, },
-      {   4,  28, 176, 192, 208, 128, 128, 128, 128, },
-      {   4,  28, 176, 192,  48, 128, 128, 128, 128, },
-      {   8, 160, 128, 128, 128, 128, 128, 128, 128, },
-      {   2,  28,  96, 128, 128, 128, 160, 192, 128, },
-      {   3,  28, 160, 176, 192, 128, 128, 128, 128, },
-      {   3,  26, 160, 176,  64, 128, 128, 128, 128, },
-      {   9,  24, 160, 176,  64, 128, 128, 128, 128, },
-      {   5,  24, 160, 176, 192, 128, 128, 128, 128, },
-      {   2,  25,  96, 128, 128, 128, 160, 192, 128, },
+      {   8, 224,  64, 128, },
+      {  10,  32,  16, 192, },
+      {  10,  32,  16,  64, },
+      {   9, 200,  64, 128, },
+      {   8,   8, 224, 128, },
+      {  10,  32,  16, 192, },
+      {  10,  32,  16,  64, },
+      {  10,  23,  80, 176, },
+      {  10,  23,  80, 176, },
+      {  10,  32,  16,  64, },
     }, {
-      {   2, 176, 128, 128, 128, 128, 128, 128, 128, },
-      {   1,  28, 176, 192, 208, 128, 128, 128, 128, },
-      {   1,  28, 176, 192,  48, 128, 128, 128, 128, },
-      {   4, 160, 128, 128, 128, 128, 128, 128, 128, },
-      {   2,  28,  96, 128, 128, 128, 160, 192, 128, },
-      {   2,  28, 160, 176, 192, 128, 128, 128, 128, },
-      {   3,  29, 160, 176,  64, 128, 128, 128, 128, },
-      {   4,  27, 160, 176,  64, 128, 128, 128, 128, },
-      {   2,  34, 160, 176, 192, 128, 128, 128, 128, },
-      {   1,  25,  96, 128, 128, 128, 160, 192, 128, },
+      {   8, 224,  64, 128, },
+      {  10,  32,  16, 192, },
+      {  10,  32,  16,  64, },
+      {   9, 200,  64, 128, },
+      {   8,   8, 224, 128, },
+      {  10,  32,  16, 192, },
+      {  10,  32,  16,  64, },
+      {  10,  23,  80, 176, },
+      {  10,  23,  80, 176, },
+      {  10,  32,  16,  64, },
 #if EXT_TX_SIZES == 4
     }, {
-      {   2, 176, 128, 128, 128, 128, 128, 128, 128, },
-      {   1,  12, 160, 176, 192, 128, 128, 128, 128, },
-      {   1,  17, 160, 176,  64, 128, 128, 128, 128, },
-      {   4,  41, 128, 128, 128, 128, 128, 128, 128, },
-      {   2,  17,  96, 128, 128, 128, 160, 192, 128, },
-      {   2,  14, 160, 176, 192, 128, 128, 128, 128, },
-      {   3,  19, 160, 176,  64, 128, 128, 128, 128, },
-      {   4,  27, 160, 176,  64, 128, 128, 128, 128, },
-      {   2,  34, 160, 176, 192, 128, 128, 128, 128, },
-      {   1,  15,  96, 128, 128, 128, 160, 192, 128, },
+      {   8, 224,  64, 128, },
+      {  10,  32,  16, 192, },
+      {  10,  32,  16,  64, },
+      {   9, 200,  64, 128, },
+      {   8,   8, 224, 128, },
+      {  10,  32,  16, 192, },
+      {  10,  32,  16,  64, },
+      {  10,  23,  80, 176, },
+      {  10,  23,  80, 176, },
+      {  10,  32,  16,  64, },
 #endif
     },
   },
@@ -1152,11 +1164,11 @@
 #if CONFIG_EXT_INTRA
 static const vpx_prob
 default_intra_filter_probs[INTRA_FILTERS + 1][INTRA_FILTERS - 1] = {
-    { 98,  63,  60,  },
-    { 98,  82,  80,  },
-    { 94,  65, 103,  },
-    { 49,  25,  24,  },
-    { 72,  38,  50,  },
+  { 98,  63,  60,  },
+  { 98,  82,  80,  },
+  { 94,  65, 103,  },
+  { 49,  25,  24,  },
+  { 72,  38,  50,  },
 };
 static const vpx_prob default_ext_intra_probs[2] = {230, 230};
 
@@ -1211,6 +1223,7 @@
 #if CONFIG_EXT_INTER
   vp10_copy(fc->inter_compound_mode_probs, default_inter_compound_mode_probs);
   vp10_copy(fc->interintra_prob, default_interintra_prob);
+  vp10_copy(fc->interintra_mode_prob, default_interintra_mode_prob);
   vp10_copy(fc->wedge_interintra_prob, default_wedge_interintra_prob);
   vp10_copy(fc->wedge_interinter_prob, default_wedge_interinter_prob);
 #endif  // CONFIG_EXT_INTER
@@ -1317,18 +1330,23 @@
                          pre_fc->inter_compound_mode_probs[i],
                          counts->inter_compound_mode[i],
                          fc->inter_compound_mode_probs[i]);
-  for (i = 0; i < BLOCK_SIZES; ++i) {
-    if (is_interintra_allowed_bsize(i))
+  for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
+    if (is_interintra_allowed_bsize_group(i))
       fc->interintra_prob[i] = mode_mv_merge_probs(pre_fc->interintra_prob[i],
                                                    counts->interintra[i]);
   }
+  for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+    vpx_tree_merge_probs(
+        vp10_interintra_mode_tree, pre_fc->interintra_mode_prob[i],
+        counts->interintra_mode[i], fc->interintra_mode_prob[i]);
+  }
   for (i = 0; i < BLOCK_SIZES; ++i) {
-    if (is_interintra_allowed_bsize(i) && get_wedge_bits(i))
+    if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i))
       fc->wedge_interintra_prob[i] = mode_mv_merge_probs(
           pre_fc->wedge_interintra_prob[i], counts->wedge_interintra[i]);
   }
   for (i = 0; i < BLOCK_SIZES; ++i) {
-    if (get_wedge_bits(i))
+    if (is_interinter_wedge_used(i))
       fc->wedge_interinter_prob[i] = mode_mv_merge_probs(
           pre_fc->wedge_interinter_prob[i], counts->wedge_interinter[i]);
   }
diff --git a/vp10/common/entropymode.h b/vp10/common/entropymode.h
index 8219dc5..f8e507e 100644
--- a/vp10/common/entropymode.h
+++ b/vp10/common/entropymode.h
@@ -34,6 +34,7 @@
 #define PALETTE_MAX_SIZE 8
 #define PALETTE_BLOCK_SIZES (BLOCK_LARGEST - BLOCK_8X8 + 1)
 #define PALETTE_Y_MODE_CONTEXTS 3
+#define PALETTE_MAX_BLOCK_SIZE (64 * 64)
 
 struct VP10Common;
 
@@ -70,7 +71,8 @@
 #if CONFIG_EXT_INTER
   vpx_prob inter_compound_mode_probs[INTER_MODE_CONTEXTS]
                                     [INTER_COMPOUND_MODES - 1];
-  vpx_prob interintra_prob[BLOCK_SIZES];
+  vpx_prob interintra_prob[BLOCK_SIZE_GROUPS];
+  vpx_prob interintra_mode_prob[BLOCK_SIZE_GROUPS][INTERINTRA_MODES - 1];
   vpx_prob wedge_interintra_prob[BLOCK_SIZES];
   vpx_prob wedge_interinter_prob[BLOCK_SIZES];
 #endif  // CONFIG_EXT_INTER
@@ -137,7 +139,8 @@
   unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES];
 #if CONFIG_EXT_INTER
   unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
-  unsigned int interintra[BLOCK_SIZES][2];
+  unsigned int interintra[BLOCK_SIZE_GROUPS][2];
+  unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
   unsigned int wedge_interintra[BLOCK_SIZES][2];
   unsigned int wedge_interinter[BLOCK_SIZES][2];
 #endif  // CONFIG_EXT_INTER
@@ -195,6 +198,8 @@
 extern const vpx_tree_index vp10_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
 extern const vpx_tree_index vp10_inter_mode_tree[TREE_SIZE(INTER_MODES)];
 #if CONFIG_EXT_INTER
+extern const vpx_tree_index vp10_interintra_mode_tree
+                            [TREE_SIZE(INTERINTRA_MODES)];
 extern const vpx_tree_index vp10_inter_compound_mode_tree
                             [TREE_SIZE(INTER_COMPOUND_MODES)];
 #endif  // CONFIG_EXT_INTER
diff --git a/vp10/common/enums.h b/vp10/common/enums.h
index 5615cee..01f1e78 100644
--- a/vp10/common/enums.h
+++ b/vp10/common/enums.h
@@ -20,24 +20,34 @@
 
 #undef MAX_SB_SIZE
 
+// Max superblock size
 #if CONFIG_EXT_PARTITION
 # define MAX_SB_SIZE_LOG2 7
 #else
 # define MAX_SB_SIZE_LOG2 6
 #endif  // CONFIG_EXT_PARTITION
-
-#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
+#define MAX_SB_SIZE   (1 << MAX_SB_SIZE_LOG2)
 #define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
 
-#define MI_SIZE_LOG2 3
-#define MI_SIZE (1 << MI_SIZE_LOG2)  // pixels per mi-unit
+// Min superblock size
+#define MIN_SB_SIZE_LOG2 6
 
-#define MI_BLOCK_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
-#define MI_BLOCK_SIZE (1 << MI_BLOCK_SIZE_LOG2)  // mi-units per max block
+// Pixels per Mode Info (MI) unit
+#define MI_SIZE_LOG2  3
+#define MI_SIZE       (1 << MI_SIZE_LOG2)
 
-#define MI_MASK (MI_BLOCK_SIZE - 1)
-#define MI_MASK_2 (MI_BLOCK_SIZE * 2 - 1)
+// MI-units per max superblock (MI Block - MIB)
+#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
+#define MAX_MIB_SIZE      (1 << MAX_MIB_SIZE_LOG2)
 
+// MI-units per min superblock
+#define MIN_MIB_SIZE_LOG2 (MIN_SB_SIZE_LOG2 - MI_SIZE_LOG2)
+
+// Mask to extract MI offset within max MIB
+#define MAX_MIB_MASK    (MAX_MIB_SIZE - 1)
+#define MAX_MIB_MASK_2  (MAX_MIB_SIZE * 2 - 1)
+
+// Maximum number of tile rows and tile columns
 #if CONFIG_EXT_TILE
 # define  MAX_TILE_ROWS 1024
 # define  MAX_TILE_COLS 1024
@@ -184,9 +194,11 @@
   VP9_LAST4_FLAG = 1 << 3,
   VP9_GOLD_FLAG = 1 << 4,
   VP9_ALT_FLAG = 1 << 5,
+  VP9_REFFRAME_ALL = (1 << 6) - 1
 #else
   VP9_GOLD_FLAG = 1 << 1,
   VP9_ALT_FLAG = 1 << 2,
+  VP9_REFFRAME_ALL = (1 << 3) - 1
 #endif  // CONFIG_EXT_REFS
 } VP9_REFFRAME;
 
@@ -252,6 +264,23 @@
 
 #define INTRA_MODES (TM_PRED + 1)
 
+#if CONFIG_EXT_INTER
+typedef enum {
+  II_DC_PRED = 0,
+  II_V_PRED,
+  II_H_PRED,
+  II_D45_PRED,
+  II_D135_PRED,
+  II_D117_PRED,
+  II_D153_PRED,
+  II_D207_PRED,
+  II_D63_PRED,
+  II_TM_PRED,
+  INTERINTRA_MODES
+} INTERINTRA_MODE;
+
+#endif  // CONFIG_EXT_INTER
+
 #if CONFIG_EXT_INTRA
 typedef enum {
   FILTER_DC_PRED,
diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index fe9b13c..23c131d 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -731,10 +731,8 @@
   } else {
     const int w = num_8x8_blocks_wide_lookup[block_size];
     const int h = num_8x8_blocks_high_lookup[block_size];
-    int index = shift_y;
     for (i = 0; i < h; i++) {
-      memset(&lfm->lfl_y[index], filter_level, w);
-      index += 8;
+      memset(&lfm->lfl_y[i][shift_y], filter_level, w);
     }
   }
 
@@ -813,10 +811,8 @@
   } else {
     const int w = num_8x8_blocks_wide_lookup[block_size];
     const int h = num_8x8_blocks_high_lookup[block_size];
-    int index = shift_y;
     for (i = 0; i < h; i++) {
-      memset(&lfm->lfl_y[index], filter_level, w);
-      index += 8;
+      memset(&lfm->lfl_y[i][shift_y], filter_level, w);
     }
   }
 
@@ -867,10 +863,8 @@
   const int shift_32_uv[] = {0, 2, 8, 10};
   const int shift_16_uv[] = {0, 1, 4, 5};
   int i;
-  const int max_rows = (mi_row + MI_BLOCK_SIZE > cm->mi_rows ?
-                        cm->mi_rows - mi_row : MI_BLOCK_SIZE);
-  const int max_cols = (mi_col + MI_BLOCK_SIZE > cm->mi_cols ?
-                        cm->mi_cols - mi_col : MI_BLOCK_SIZE);
+  const int max_rows = VPXMIN(cm->mi_rows - mi_row, MAX_MIB_SIZE);
+  const int max_cols = VPXMIN(cm->mi_cols - mi_col, MAX_MIB_SIZE);
 #if CONFIG_EXT_PARTITION
   assert(0 && "Not yet updated");
 #endif  // CONFIG_EXT_PARTITION
@@ -1044,14 +1038,14 @@
   lfm->above_uv[TX_4X4] &= ~above_border_uv;
 
   // We do some special edge handling.
-  if (mi_row + MI_BLOCK_SIZE > cm->mi_rows) {
+  if (mi_row + MAX_MIB_SIZE > cm->mi_rows) {
     const uint64_t rows = cm->mi_rows - mi_row;
 
     // Each pixel inside the border gets a 1,
     const uint64_t mask_y =
-      (((uint64_t) 1 << (rows << MI_BLOCK_SIZE_LOG2)) - 1);
+      (((uint64_t) 1 << (rows << MAX_MIB_SIZE_LOG2)) - 1);
     const uint16_t mask_uv =
-      (((uint16_t) 1 << (((rows + 1) >> 1) << (MI_BLOCK_SIZE_LOG2 - 1))) - 1);
+      (((uint16_t) 1 << (((rows + 1) >> 1) << (MAX_MIB_SIZE_LOG2 - 1))) - 1);
 
     // Remove values completely outside our border.
     for (i = 0; i < TX_32X32; i++) {
@@ -1075,7 +1069,7 @@
     }
   }
 
-  if (mi_col + MI_BLOCK_SIZE > cm->mi_cols) {
+  if (mi_col + MAX_MIB_SIZE > cm->mi_cols) {
     const uint64_t columns = cm->mi_cols - mi_col;
 
     // Each pixel inside the border gets a 1, the multiply copies the border
@@ -1210,31 +1204,30 @@
 
 void vp10_filter_block_plane_non420(VP10_COMMON *cm,
                                     struct macroblockd_plane *plane,
-                                    MODE_INFO **mi_8x8,
+                                    MODE_INFO **mib,
                                     int mi_row, int mi_col) {
   const int ss_x = plane->subsampling_x;
   const int ss_y = plane->subsampling_y;
   const int row_step = 1 << ss_y;
   const int col_step = 1 << ss_x;
-  const int row_step_stride = cm->mi_stride * row_step;
   struct buf_2d *const dst = &plane->dst;
   uint8_t* const dst0 = dst->buf;
-  unsigned int mask_16x16[MI_BLOCK_SIZE] = {0};
-  unsigned int mask_8x8[MI_BLOCK_SIZE] = {0};
-  unsigned int mask_4x4[MI_BLOCK_SIZE] = {0};
-  unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
-  uint8_t lfl[MI_BLOCK_SIZE * MI_BLOCK_SIZE];
+  unsigned int mask_16x16[MAX_MIB_SIZE] = {0};
+  unsigned int mask_8x8[MAX_MIB_SIZE] = {0};
+  unsigned int mask_4x4[MAX_MIB_SIZE] = {0};
+  unsigned int mask_4x4_int[MAX_MIB_SIZE] = {0};
+  uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE];
   int r, c;
 
-  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+  for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += row_step) {
     unsigned int mask_16x16_c = 0;
     unsigned int mask_8x8_c = 0;
     unsigned int mask_4x4_c = 0;
     unsigned int border_mask;
 
     // Determine the vertical edges that need filtering
-    for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
-      const MODE_INFO *mi = mi_8x8[c];
+    for (c = 0; c < cm->mib_size && mi_col + c < cm->mi_cols; c += col_step) {
+      const MODE_INFO *mi = mib[c];
       const MB_MODE_INFO *mbmi = &mi[0].mbmi;
       const BLOCK_SIZE sb_type = mbmi->sb_type;
       const int skip_this = mbmi->skip && is_inter_block(mbmi);
@@ -1267,8 +1260,7 @@
 
       int tx_size_mask = 0;
       // Filter level can vary per MI
-      if (!(lfl[(r << MI_BLOCK_SIZE_LOG2) + (c >> ss_x)] =
-            get_filter_level(&cm->lf_info, mbmi)))
+      if (!(lfl[r][c >> ss_x] = get_filter_level(&cm->lf_info, mbmi)))
         continue;
 
       if (tx_size == TX_32X32)
@@ -1288,10 +1280,10 @@
       tx_size_r = VPXMIN(tx_size,
                          cm->above_txfm_context[mi_col + c]);
       tx_size_c = VPXMIN(tx_size,
-                         cm->left_txfm_context[(mi_row + r) & MI_MASK]);
+                         cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK]);
 
       cm->above_txfm_context[mi_col + c] = tx_size;
-      cm->left_txfm_context[(mi_row + r) & MI_MASK] = tx_size;
+      cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK] = tx_size;
 #endif
 
       // Build masks based on the transform size of each block
@@ -1365,7 +1357,7 @@
           mask_8x8_c & border_mask,
           mask_4x4_c & border_mask,
           mask_4x4_int[r],
-          &cm->lf_info, &lfl[r << MI_BLOCK_SIZE_LOG2],
+          &cm->lf_info, &lfl[r][0],
           (int)cm->bit_depth);
     } else {
       filter_selectively_vert(dst->buf, dst->stride,
@@ -1373,7 +1365,7 @@
                               mask_8x8_c & border_mask,
                               mask_4x4_c & border_mask,
                               mask_4x4_int[r],
-                              &cm->lf_info, &lfl[r << MI_BLOCK_SIZE_LOG2]);
+                              &cm->lf_info, &lfl[r][0]);
     }
 #else
     filter_selectively_vert(dst->buf, dst->stride,
@@ -1381,15 +1373,15 @@
                             mask_8x8_c & border_mask,
                             mask_4x4_c & border_mask,
                             mask_4x4_int[r],
-                            &cm->lf_info, &lfl[r << MI_BLOCK_SIZE_LOG2]);
+                            &cm->lf_info, &lfl[r][0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    dst->buf += 8 * dst->stride;
-    mi_8x8 += row_step_stride;
+    dst->buf += MI_SIZE * dst->stride;
+    mib += row_step * cm->mi_stride;
   }
 
   // Now do horizontal pass
   dst->buf = dst0;
-  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+  for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += row_step) {
     const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
     const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
 
@@ -1415,7 +1407,7 @@
           mask_8x8_r,
           mask_4x4_r,
           mask_4x4_int_r,
-          &cm->lf_info, &lfl[r << MI_BLOCK_SIZE_LOG2],
+          &cm->lf_info, &lfl[r][0],
           (int)cm->bit_depth);
     } else {
       filter_selectively_horiz(dst->buf, dst->stride,
@@ -1423,7 +1415,7 @@
                                mask_8x8_r,
                                mask_4x4_r,
                                mask_4x4_int_r,
-                               &cm->lf_info, &lfl[r << MI_BLOCK_SIZE_LOG2]);
+                               &cm->lf_info, &lfl[r][0]);
     }
 #else
     filter_selectively_horiz(dst->buf, dst->stride,
@@ -1431,9 +1423,9 @@
                              mask_8x8_r,
                              mask_4x4_r,
                              mask_4x4_int_r,
-                             &cm->lf_info, &lfl[r << MI_BLOCK_SIZE_LOG2]);
+                             &cm->lf_info, &lfl[r][0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    dst->buf += 8 * dst->stride;
+    dst->buf += MI_SIZE * dst->stride;
   }
 }
 
@@ -1452,7 +1444,7 @@
   assert(plane->subsampling_x == 0 && plane->subsampling_y == 0);
 
   // Vertical pass: do 2 rows at one time
-  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+  for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 2) {
     unsigned int mask_16x16_l = mask_16x16 & 0xffff;
     unsigned int mask_8x8_l = mask_8x8 & 0xffff;
     unsigned int mask_4x4_l = mask_4x4 & 0xffff;
@@ -1464,24 +1456,24 @@
       highbd_filter_selectively_vert_row2(
           plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
           mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-          &lfm->lfl_y[r << MI_BLOCK_SIZE_LOG2], (int)cm->bit_depth);
+          &lfm->lfl_y[r][0], (int)cm->bit_depth);
     } else {
       filter_selectively_vert_row2(
           plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
           mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-          &lfm->lfl_y[r << MI_BLOCK_SIZE_LOG2]);
+          &lfm->lfl_y[r][0]);
     }
 #else
     filter_selectively_vert_row2(
         plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
         mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-        &lfm->lfl_y[r << MI_BLOCK_SIZE_LOG2]);
+        &lfm->lfl_y[r][0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    dst->buf += 16 * dst->stride;
-    mask_16x16 >>= 16;
-    mask_8x8 >>= 16;
-    mask_4x4 >>= 16;
-    mask_4x4_int >>= 16;
+    dst->buf += 2 * MI_SIZE * dst->stride;
+    mask_16x16 >>= 2 * MI_SIZE;
+    mask_8x8 >>= 2 * MI_SIZE;
+    mask_4x4 >>= 2 * MI_SIZE;
+    mask_4x4_int >>= 2 * MI_SIZE;
   }
 
   // Horizontal pass
@@ -1491,7 +1483,7 @@
   mask_4x4 = lfm->above_y[TX_4X4];
   mask_4x4_int = lfm->int_4x4_y;
 
-  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) {
+  for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r++) {
     unsigned int mask_16x16_r;
     unsigned int mask_8x8_r;
     unsigned int mask_4x4_r;
@@ -1511,24 +1503,24 @@
       highbd_filter_selectively_horiz(
           CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r,
           mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
-          &lfm->lfl_y[r << MI_BLOCK_SIZE_LOG2],
+          &lfm->lfl_y[r][0],
           (int)cm->bit_depth);
     } else {
       filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
                                mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
-                               &lfm->lfl_y[r << MI_BLOCK_SIZE_LOG2]);
+                               &lfm->lfl_y[r][0]);
     }
 #else
     filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
                              mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
-                             &lfm->lfl_y[r << MI_BLOCK_SIZE_LOG2]);
+                             &lfm->lfl_y[r][0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    dst->buf += 8 * dst->stride;
-    mask_16x16 >>= 8;
-    mask_8x8 >>= 8;
-    mask_4x4 >>= 8;
-    mask_4x4_int >>= 8;
+    dst->buf += MI_SIZE * dst->stride;
+    mask_16x16 >>= MI_SIZE;
+    mask_8x8 >>= MI_SIZE;
+    mask_4x4 >>= MI_SIZE;
+    mask_4x4_int >>= MI_SIZE;
   }
 }
 
@@ -1546,16 +1538,13 @@
   uint16_t mask_4x4_int = lfm->left_int_4x4_uv;
 
   assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
+  assert(plane->plane_type == PLANE_TYPE_UV);
 
   // Vertical pass: do 2 rows at one time
-  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) {
-    if (plane->plane_type == 1) {
-      for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) {
-        lfm->lfl_uv[(r << 1) + c] =
-          lfm->lfl_y[(r << MI_BLOCK_SIZE_LOG2) + (c << 1)];
-        lfm->lfl_uv[((r + 2) << 1) + c] =
-          lfm->lfl_y[((r + 2) << MI_BLOCK_SIZE_LOG2) + (c << 1)];
-      }
+  for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 4) {
+    for (c = 0; c < (cm->mib_size >> 1); c++) {
+      lfm->lfl_uv[r >> 1][c] = lfm->lfl_y[r][c << 1];
+      lfm->lfl_uv[(r + 2) >> 1][c] = lfm->lfl_y[r + 2][c << 1];
     }
 
     {
@@ -1570,25 +1559,25 @@
         highbd_filter_selectively_vert_row2(
             plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
             mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
+            &lfm->lfl_uv[r >> 1][0], (int)cm->bit_depth);
       } else {
         filter_selectively_vert_row2(
             plane->subsampling_x, dst->buf, dst->stride,
             mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfm->lfl_uv[r << 1]);
+            &lfm->lfl_uv[r >> 1][0]);
       }
 #else
       filter_selectively_vert_row2(
           plane->subsampling_x, dst->buf, dst->stride,
           mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-          &lfm->lfl_uv[r << 1]);
+          &lfm->lfl_uv[r >> 1][0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-      dst->buf += 16 * dst->stride;
-      mask_16x16 >>= 8;
-      mask_8x8 >>= 8;
-      mask_4x4 >>= 8;
-      mask_4x4_int >>= 8;
+      dst->buf += 2 * MI_SIZE * dst->stride;
+      mask_16x16 >>= MI_SIZE;
+      mask_8x8 >>= MI_SIZE;
+      mask_4x4 >>= MI_SIZE;
+      mask_4x4_int >>= MI_SIZE;
     }
   }
 
@@ -1599,7 +1588,7 @@
   mask_4x4 = lfm->above_uv[TX_4X4];
   mask_4x4_int = lfm->above_int_4x4_uv;
 
-  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+  for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 2) {
     const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
     const unsigned int mask_4x4_int_r =
         skip_border_4x4_r ? 0 : (mask_4x4_int & 0xf);
@@ -1622,23 +1611,24 @@
       highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
                                       dst->stride, mask_16x16_r, mask_8x8_r,
                                       mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                                      &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
+                                      &lfm->lfl_uv[r >> 1][0],
+                                      (int)cm->bit_depth);
     } else {
       filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
                                mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                               &lfm->lfl_uv[r << 1]);
+                               &lfm->lfl_uv[r >> 1][0]);
     }
 #else
     filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
                              mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                             &lfm->lfl_uv[r << 1]);
+                             &lfm->lfl_uv[r >> 1][0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    dst->buf += 8 * dst->stride;
-    mask_16x16 >>= 4;
-    mask_8x8 >>= 4;
-    mask_4x4 >>= 4;
-    mask_4x4_int >>= 4;
+    dst->buf += MI_SIZE * dst->stride;
+    mask_16x16 >>= MI_SIZE / 2;
+    mask_8x8 >>= MI_SIZE / 2;
+    mask_4x4 >>= MI_SIZE / 2;
+    mask_4x4_int >>= MI_SIZE / 2;
   }
 }
 
@@ -1653,12 +1643,12 @@
 # if CONFIG_VAR_TX
   memset(cm->above_txfm_context, TX_SIZES, cm->mi_cols);
 # endif  // CONFIG_VAR_TX
-  for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
+  for (mi_row = start; mi_row < stop; mi_row += cm->mib_size) {
     MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
 # if CONFIG_VAR_TX
-    memset(cm->left_txfm_context, TX_SIZES, MI_BLOCK_SIZE);
+    memset(cm->left_txfm_context, TX_SIZES, MAX_MIB_SIZE);
 # endif  // CONFIG_VAR_TX
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += cm->mib_size) {
       int plane;
 
       vp10_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
@@ -1683,9 +1673,9 @@
   else
     path = LF_PATH_SLOW;
 
-  for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
+  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
     MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
       int plane;
 
       vp10_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
diff --git a/vp10/common/loopfilter.h b/vp10/common/loopfilter.h
index 8fa0b80..2a88003 100644
--- a/vp10/common/loopfilter.h
+++ b/vp10/common/loopfilter.h
@@ -84,8 +84,8 @@
   uint16_t above_uv[TX_SIZES];
   uint16_t left_int_4x4_uv;
   uint16_t above_int_4x4_uv;
-  uint8_t lfl_y[MI_BLOCK_SIZE * MI_BLOCK_SIZE];
-  uint8_t lfl_uv[MI_BLOCK_SIZE / 2 * MI_BLOCK_SIZE / 2];
+  uint8_t lfl_y[MAX_MIB_SIZE][MAX_MIB_SIZE];
+  uint8_t lfl_uv[MAX_MIB_SIZE / 2][MAX_MIB_SIZE / 2];
 } LOOP_FILTER_MASK;
 
 /* assorted loopfilter functions which get used elsewhere */
diff --git a/vp10/common/mfqe.c b/vp10/common/mfqe.c
index c715ef7..52756bd 100644
--- a/vp10/common/mfqe.c
+++ b/vp10/common/mfqe.c
@@ -355,9 +355,15 @@
   const YV12_BUFFER_CONFIG *show = cm->frame_to_show;
   // Last decoded frame and will store the MFQE result.
   YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+
+#if CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
+  // TODO(any): Fix for ext parition types and 128 superblocks
+  assert(0);
+#endif  // CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
+
   // Loop through each super block.
-  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MAX_MIB_SIZE) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
       MODE_INFO *mi;
       MODE_INFO *mi_local = cm->mi + (mi_row * cm->mi_stride + mi_col);
       // Motion Info in last frame.
diff --git a/vp10/common/mvref_common.c b/vp10/common/mvref_common.c
index aa651a2..7c6633f 100644
--- a/vp10/common/mvref_common.c
+++ b/vp10/common/mvref_common.c
@@ -260,7 +260,7 @@
   // For each 4x4 group of blocks, when the bottom right is decoded the blocks
   // to the right have not been decoded therefore the bottom right does
   // not have a top right
-  while (bs < MI_BLOCK_SIZE) {
+  while (bs < MAX_MIB_SIZE) {
     if (mi_col & bs) {
       if ((mi_col & (2 * bs)) && (mi_row & (2 * bs))) {
         has_tr = 0;
diff --git a/vp10/common/onyxc_int.h b/vp10/common/onyxc_int.h
index bdd9ffe..3ac17e2 100644
--- a/vp10/common/onyxc_int.h
+++ b/vp10/common/onyxc_int.h
@@ -312,7 +312,7 @@
   int log2_tile_cols, log2_tile_rows;
 #endif  // !CONFIG_EXT_TILE
   int tile_cols, tile_rows;
-  int tile_width, tile_height;
+  int tile_width, tile_height;  // In MI units
 
   int byte_alignment;
   int skip_loop_filter;
@@ -332,7 +332,7 @@
   ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
 #if CONFIG_VAR_TX
   TXFM_CONTEXT *above_txfm_context;
-  TXFM_CONTEXT left_txfm_context[MI_BLOCK_SIZE];
+  TXFM_CONTEXT left_txfm_context[MAX_MIB_SIZE];
 #endif
   int above_context_alloc_cols;
 
@@ -343,6 +343,10 @@
 #if CONFIG_ANS
   rans_dec_lut token_tab[COEFF_PROB_MODELS];
 #endif  // CONFIG_ANS
+
+  BLOCK_SIZE sb_size;   // Size of the superblock used for this frame
+  int mib_size;         // Size of the superblock in units of MI blocks
+  int mib_size_log2;    // Log 2 of above.
 } VP10_COMMON;
 
 // TODO(hkuang): Don't need to lock the whole pool after implementing atomic
@@ -372,7 +376,8 @@
   return &cm->buffer_pool->frame_bufs[cm->ref_frame_map[index]].buf;
 }
 
-static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP10_COMMON *cm) {
+static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(
+    const VP10_COMMON *const cm) {
   return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf;
 }
 
@@ -407,8 +412,12 @@
   bufs[new_idx].ref_count++;
 }
 
-static INLINE int mi_cols_aligned_to_sb(int n_mis) {
-  return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2);
+static INLINE int mi_cols_aligned_to_sb(const VP10_COMMON *cm) {
+  return ALIGN_POWER_OF_TWO(cm->mi_cols, cm->mib_size_log2);
+}
+
+static INLINE int mi_rows_aligned_to_sb(const VP10_COMMON *cm) {
+  return ALIGN_POWER_OF_TWO(cm->mi_rows, cm->mib_size_log2);
 }
 
 static INLINE int frame_is_intra_only(const VP10_COMMON *const cm) {
@@ -440,7 +449,7 @@
 
 static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) {
   const int above_idx = mi_col * 2;
-  const int left_idx = (mi_row * 2) & MI_MASK_2;
+  const int left_idx = (mi_row * 2) & MAX_MIB_MASK_2;
   int i;
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     struct macroblockd_plane *const pd = &xd->plane[i];
@@ -451,7 +460,7 @@
 
 static INLINE int calc_mi_size(int len) {
   // len is in mi units.
-  return len + MI_BLOCK_SIZE;
+  return len + MAX_MIB_SIZE;
 }
 
 static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
@@ -517,7 +526,8 @@
                                             BLOCK_SIZE subsize,
                                             BLOCK_SIZE bsize) {
   PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col;
-  PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+  PARTITION_CONTEXT *const left_ctx =
+    xd->left_seg_context + (mi_row & MAX_MIB_MASK);
 
 #if CONFIG_EXT_PARTITION_TYPES
   const int bw = num_8x8_blocks_wide_lookup[bsize];
@@ -581,7 +591,8 @@
                                           int mi_row, int mi_col,
                                           BLOCK_SIZE bsize) {
   const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
-  const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+  const PARTITION_CONTEXT *left_ctx =
+    xd->left_seg_context + (mi_row & MAX_MIB_MASK);
   const int bsl = mi_width_log2_lookup[bsize];
   int above = (*above_ctx >> bsl) & 1 , left = (*left_ctx >> bsl) & 1;
 
@@ -649,6 +660,58 @@
 }
 #endif
 
+static INLINE PARTITION_TYPE get_partition(const VP10_COMMON *const cm,
+                                           const int mi_row,
+                                           const int mi_col,
+                                           const BLOCK_SIZE bsize) {
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) {
+    return PARTITION_INVALID;
+  } else {
+    const int offset = mi_row * cm->mi_stride + mi_col;
+    MODE_INFO **mi = cm->mi_grid_visible + offset;
+    const MB_MODE_INFO *const mbmi = &mi[0]->mbmi;
+    const int bsl = b_width_log2_lookup[bsize];
+    const PARTITION_TYPE partition = partition_lookup[bsl][mbmi->sb_type];
+#if !CONFIG_EXT_PARTITION_TYPES
+    return partition;
+#else
+    const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+
+    assert(cm->mi_grid_visible[offset] == &cm->mi[offset]);
+
+    if (partition != PARTITION_NONE &&
+        bsize > BLOCK_8X8 &&
+        mi_row + hbs < cm->mi_rows &&
+        mi_col + hbs < cm->mi_cols) {
+      const BLOCK_SIZE h = get_subsize(bsize, PARTITION_HORZ_A);
+      const BLOCK_SIZE v = get_subsize(bsize, PARTITION_VERT_A);
+      const MB_MODE_INFO *const mbmi_right = &mi[hbs]->mbmi;
+      const MB_MODE_INFO *const mbmi_below = &mi[hbs * cm->mi_stride]->mbmi;
+      if (mbmi->sb_type == h) {
+        return mbmi_below->sb_type == h ? PARTITION_HORZ : PARTITION_HORZ_B;
+      } else if (mbmi->sb_type == v) {
+        return mbmi_right->sb_type == v ? PARTITION_VERT : PARTITION_VERT_B;
+      } else if (mbmi_below->sb_type == h) {
+        return PARTITION_HORZ_A;
+      } else if (mbmi_right->sb_type == v) {
+        return PARTITION_VERT_A;
+      } else {
+        return PARTITION_SPLIT;
+      }
+    }
+
+    return partition;
+#endif  // !CONFIG_EXT_PARTITION_TYPES
+  }
+}
+
+static INLINE void set_sb_size(VP10_COMMON *const cm,
+                               const BLOCK_SIZE sb_size) {
+  cm->sb_size = sb_size;
+  cm->mib_size = num_8x8_blocks_wide_lookup[cm->sb_size];
+  cm->mib_size_log2 = mi_width_log2_lookup[cm->sb_size];
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c
index de91a21..2be4cf6 100644
--- a/vp10/common/reconinter.c
+++ b/vp10/common/reconinter.c
@@ -762,7 +762,7 @@
 }
 
 void vp10_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                     BLOCK_SIZE bsize) {
+                                      BLOCK_SIZE bsize) {
   build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1,
                                     MAX_MB_PLANE - 1);
 #if CONFIG_EXT_INTER
@@ -1176,24 +1176,18 @@
 };
 
 #if CONFIG_EXT_PARTITION
-// TODO(debargha): What are the correct values here?
 static const uint8_t obmc_mask_64[2][64] = {
-    { 33, 33, 35, 35, 36, 36, 38, 38,
-      40, 40, 41, 41, 43, 43, 44, 44,
-      45, 45, 47, 47, 48, 48, 50, 50,
-      51, 51, 52, 52, 53, 53, 55, 55,
-      56, 56, 57, 57, 58, 58, 59, 59,
-      60, 60, 60, 60, 61, 61, 62, 62,
-      62, 62, 63, 63, 63, 63, 64, 64,
-      64, 64, 64, 64, 64, 64, 64, 64 },
-    { 31, 31, 29, 29, 28, 28, 26, 26,
-      24, 24, 23, 23, 21, 21, 20, 20,
-      19, 19, 17, 17, 16, 16, 14, 14,
-      13, 13, 12, 12, 11, 11,  9,  9,
-       8,  8,  7,  7,  6,  6,  5,  5,
-       4,  4,  4,  4,  3,  3,  2,  2,
-       2,  2,  1,  1,  1,  1,  0,  0,
-       0,  0,  0,  0,  0,  0,  0,  0 }
+    {
+      33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44,
+      45, 46, 47, 47, 48, 49, 50, 51, 51, 51, 52, 52, 53, 54, 55, 56,
+      56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60, 61, 62, 62, 62,
+      62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+    }, {
+      31, 30, 29, 29, 28, 27, 26, 25, 24, 24, 23, 22, 21, 20, 20, 20,
+      19, 18, 17, 17, 16, 15, 14, 13, 13, 13, 12, 12, 11, 10,  9,  8,
+      8,  8,  7,  7,  6,  6,  5, 4,  4,  4,  4,  4,  3,  2,  2,  2,
+      2,  2,  1,  1, 1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    }
 };
 #endif  // CONFIG_EXT_PARTITION
 
@@ -1599,7 +1593,45 @@
 #endif  // CONFIG_OBMC
 
 #if CONFIG_EXT_INTER
-static void combine_interintra(PREDICTION_MODE mode,
+#if CONFIG_EXT_PARTITION
+static const int ii_weights1d[MAX_SB_SIZE] = {
+  128, 127, 125, 124, 123, 122, 120, 119,
+  118, 117, 116, 115, 113, 112, 111, 110,
+  109, 108, 107, 106, 105, 104, 103, 103,
+  102, 101, 100,  99,  98,  97,  97,  96,
+  95,  94,  94,  93,  92,  91,  91,  90,
+  89,  89,  88,  87,  87,  86,  86,  85,
+  84,  84,  83,  83,  82,  82,  81,  81,
+  80,  80,  79,  79,  78,  78,  77,  77,
+  76,  76,  75,  75,  75,  74,  74,  73,
+  73,  73,  72,  72,  72,  71,  71,  70,
+  70,  70,  69,  69,  69,  69,  68,  68,
+  68,  67,  67,  67,  67,  66,  66,  66,
+  66,  65,  65,  65,  65,  64,  64,  64,
+  64,  63,  63,  63,  63,  63,  62,  62,
+  62,  62,  62,  61,  61,  61,  61,  61,
+  61,  60,  60,  60,  60,  60,  60,  60,
+};
+static int ii_size_scales[BLOCK_SIZES] = {
+  32, 16, 16, 16, 8, 8, 8, 4, 4, 4, 2, 2, 2, 1, 1, 1
+};
+#else
+static const int ii_weights1d[MAX_SB_SIZE] = {
+  102, 100,  97,  95,  92,  90,  88,  86,
+  84,  82,  80,  78,  76,  74,  73,  71,
+  69,  68,  67,  65,  64,  62,  61,  60,
+  59,  58,  57,  55,  54,  53,  52,  52,
+  51,  50,  49,  48,  47,  47,  46,  45,
+  45,  44,  43,  43,  42,  41,  41,  40,
+  40,  39,  39,  38,  38,  38,  37,  37,
+  36,  36,  36,  35,  35,  35,  34,  34,
+};
+static int ii_size_scales[BLOCK_SIZES] = {
+  16, 8, 8, 8, 4, 4, 4, 2, 2, 2, 1, 1, 1
+};
+#endif  // CONFIG_EXT_PARTITION
+
+static void combine_interintra(INTERINTRA_MODE mode,
                                int use_wedge_interintra,
                                int wedge_index,
                                BLOCK_SIZE bsize,
@@ -1613,149 +1645,112 @@
   static const int scale_bits = 8;
   static const int scale_max = 256;
   static const int scale_round = 127;
-#if CONFIG_EXT_PARTITION
-  // TODO(debargha): Fill in the correct weights for 128 wide blocks.
-  static const int weights1d[MAX_SB_SIZE] = {
-      128, 128, 125, 125, 122, 122, 119, 119,
-      116, 116, 114, 114, 111, 111, 109, 109,
-      107, 107, 105, 105, 103, 103, 101, 101,
-       99,  99,  97,  97,  96,  96,  94,  94,
-       93,  93,  91,  91,  90,  90,  89,  89,
-       88,  88,  86,  86,  85,  85,  84,  84,
-       83,  83,  82,  82,  81,  81,  81,  81,
-       80,  80,  79,  79,  78,  78,  78,  78,
-       77,  77,  76,  76,  76,  76,  75,  75,
-       75,  75,  74,  74,  74,  74,  73,  73,
-       73,  73,  72,  72,  72,  72,  71,  71,
-       71,  71,  71,  71,  70,  70,  70,  70,
-       70,  70,  70,  70,  69,  69,  69,  69,
-       69,  69,  69,  69,  68,  68,  68,  68,
-       68,  68,  68,  68,  68,  68,  67,  67,
-       67,  67,  67,  67,  67,  67,  67,  67,
-  };
-  static int size_scales[BLOCK_SIZES] = {
-      32, 16, 16, 16, 8, 8, 8, 4, 4, 4, 2, 2, 2, 1, 1, 1
-  };
-#else
-  static const int weights1d[MAX_SB_SIZE] = {
-      128, 125, 122, 119, 116, 114, 111, 109,
-      107, 105, 103, 101,  99,  97,  96,  94,
-       93,  91,  90,  89,  88,  86,  85,  84,
-       83,  82,  81,  81,  80,  79,  78,  78,
-       77,  76,  76,  75,  75,  74,  74,  73,
-       73,  72,  72,  71,  71,  71,  70,  70,
-       70,  70,  69,  69,  69,  69,  68,  68,
-       68,  68,  68,  67,  67,  67,  67,  67,
-  };
-  static int size_scales[BLOCK_SIZES] = {
-      16, 8, 8, 8, 4, 4, 4, 2, 2, 2, 1, 1, 1
-  };
-#endif  // CONFIG_EXT_PARTITION
-
   const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
-  const int size_scale = size_scales[plane_bsize];
+  const int size_scale = ii_size_scales[plane_bsize];
   int i, j;
 
-  if (use_wedge_interintra && get_wedge_bits(bsize)) {
-    const uint8_t *mask = vp10_get_soft_mask(wedge_index, bsize, bh, bw);
-    for (i = 0; i < bh; ++i) {
-      for (j = 0; j < bw; ++j) {
-        int m = mask[i * MASK_MASTER_STRIDE + j];
-        comppred[i * compstride + j] =
-            (intrapred[i * intrastride + j] * m +
-             interpred[i * interstride + j] * ((1 << WEDGE_WEIGHT_BITS) - m) +
-             (1 << (WEDGE_WEIGHT_BITS - 1))) >> WEDGE_WEIGHT_BITS;
+  if (use_wedge_interintra) {
+    if (get_wedge_bits(bsize)) {
+      const uint8_t *mask = vp10_get_soft_mask(wedge_index, bsize, bh, bw);
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
+          int m = mask[i * MASK_MASTER_STRIDE + j];
+          comppred[i * compstride + j] =
+              (intrapred[i * intrastride + j] * m +
+               interpred[i * interstride + j] * ((1 << WEDGE_WEIGHT_BITS) - m) +
+               (1 << (WEDGE_WEIGHT_BITS - 1))) >> WEDGE_WEIGHT_BITS;
+        }
       }
     }
     return;
   }
 
   switch (mode) {
-    case V_PRED:
+    case II_V_PRED:
       for (i = 0; i < bh; ++i) {
         for (j = 0; j < bw; ++j) {
-          int scale = weights1d[i * size_scale];
+          int scale = ii_weights1d[i * size_scale];
           comppred[i * compstride + j] =
               ((scale_max - scale) * interpred[i * interstride + j] +
                scale * intrapred[i * intrastride + j] + scale_round)
-               >> scale_bits;
+              >> scale_bits;
         }
       }
-     break;
+      break;
 
-    case H_PRED:
+    case II_H_PRED:
       for (i = 0; i < bh; ++i) {
         for (j = 0; j < bw; ++j) {
-          int scale = weights1d[j * size_scale];
-            comppred[i * compstride + j] =
+          int scale = ii_weights1d[j * size_scale];
+          comppred[i * compstride + j] =
               ((scale_max - scale) * interpred[i * interstride + j] +
                scale * intrapred[i * intrastride + j] + scale_round)
-               >> scale_bits;
+              >> scale_bits;
         }
       }
-     break;
+      break;
 
-    case D63_PRED:
-    case D117_PRED:
+    case II_D63_PRED:
+    case II_D117_PRED:
       for (i = 0; i < bh; ++i) {
         for (j = 0; j < bw; ++j) {
-          int scale = (weights1d[i * size_scale] * 3 +
-                       weights1d[j * size_scale]) >> 2;
-            comppred[i * compstride + j] =
+          int scale = (ii_weights1d[i * size_scale] * 3 +
+                       ii_weights1d[j * size_scale]) >> 2;
+          comppred[i * compstride + j] =
               ((scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j] + scale_round)
-                  >> scale_bits;
+               scale * intrapred[i * intrastride + j] + scale_round)
+              >> scale_bits;
         }
       }
-     break;
+      break;
 
-    case D207_PRED:
-    case D153_PRED:
+    case II_D207_PRED:
+    case II_D153_PRED:
       for (i = 0; i < bh; ++i) {
         for (j = 0; j < bw; ++j) {
-          int scale = (weights1d[j * size_scale] * 3 +
-                       weights1d[i * size_scale]) >> 2;
-            comppred[i * compstride + j] =
+          int scale = (ii_weights1d[j * size_scale] * 3 +
+                       ii_weights1d[i * size_scale]) >> 2;
+          comppred[i * compstride + j] =
               ((scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j] + scale_round)
-                  >> scale_bits;
+               scale * intrapred[i * intrastride + j] + scale_round)
+              >> scale_bits;
         }
       }
-     break;
+      break;
 
-    case D135_PRED:
+    case II_D135_PRED:
       for (i = 0; i < bh; ++i) {
         for (j = 0; j < bw; ++j) {
-          int scale = weights1d[(i < j ? i : j) * size_scale];
-            comppred[i * compstride + j] =
+          int scale = ii_weights1d[(i < j ? i : j) * size_scale];
+          comppred[i * compstride + j] =
               ((scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j] + scale_round)
-                  >> scale_bits;
+               scale * intrapred[i * intrastride + j] + scale_round)
+              >> scale_bits;
         }
       }
-     break;
+      break;
 
-    case D45_PRED:
+    case II_D45_PRED:
       for (i = 0; i < bh; ++i) {
         for (j = 0; j < bw; ++j) {
-          int scale = (weights1d[i * size_scale] +
-                       weights1d[j * size_scale]) >> 1;
-            comppred[i * compstride + j] =
+          int scale = (ii_weights1d[i * size_scale] +
+                       ii_weights1d[j * size_scale]) >> 1;
+          comppred[i * compstride + j] =
               ((scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j] + scale_round)
-                  >> scale_bits;
+               scale * intrapred[i * intrastride + j] + scale_round)
+              >> scale_bits;
         }
       }
-     break;
+      break;
 
-    case TM_PRED:
-    case DC_PRED:
+    case II_TM_PRED:
+    case II_DC_PRED:
     default:
       for (i = 0; i < bh; ++i) {
         for (j = 0; j < bw; ++j) {
-            comppred[i * compstride + j] = (interpred[i * interstride + j] +
-                intrapred[i * intrastride + j]) >> 1;
+          comppred[i * compstride + j] = (interpred[i * interstride + j] +
+                                          intrapred[i * intrastride + j]) >> 1;
         }
       }
       break;
@@ -1763,7 +1758,7 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static void combine_interintra_highbd(PREDICTION_MODE mode,
+static void combine_interintra_highbd(INTERINTRA_MODE mode,
                                       int use_wedge_interintra,
                                       int wedge_index,
                                       BLOCK_SIZE bsize,
@@ -1777,48 +1772,9 @@
   static const int scale_bits = 8;
   static const int scale_max = 256;
   static const int scale_round = 127;
-#if CONFIG_EXT_PARTITION
-  // TODO(debargha): Fill in the correct weights for 128 wide blocks.
-  static const int weights1d[MAX_SB_SIZE] = {
-      128, 128, 125, 125, 122, 122, 119, 119,
-      116, 116, 114, 114, 111, 111, 109, 109,
-      107, 107, 105, 105, 103, 103, 101, 101,
-       99,  99,  97,  97,  96,  96,  94,  94,
-       93,  93,  91,  91,  90,  90,  89,  89,
-       88,  88,  86,  86,  85,  85,  84,  84,
-       83,  83,  82,  82,  81,  81,  81,  81,
-       80,  80,  79,  79,  78,  78,  78,  78,
-       77,  77,  76,  76,  76,  76,  75,  75,
-       75,  75,  74,  74,  74,  74,  73,  73,
-       73,  73,  72,  72,  72,  72,  71,  71,
-       71,  71,  71,  71,  70,  70,  70,  70,
-       70,  70,  70,  70,  69,  69,  69,  69,
-       69,  69,  69,  69,  68,  68,  68,  68,
-       68,  68,  68,  68,  68,  68,  67,  67,
-       67,  67,  67,  67,  67,  67,  67,  67,
-  };
-  static int size_scales[BLOCK_SIZES] = {
-      32, 16, 16, 16, 8, 8, 8, 4, 4, 4, 2, 2, 2, 1, 1, 1
-  };
-#else
-  static const int weights1d[MAX_SB_SIZE] = {
-      128, 125, 122, 119, 116, 114, 111, 109,
-      107, 105, 103, 101,  99,  97,  96,  94,
-       93,  91,  90,  89,  88,  86,  85,  84,
-       83,  82,  81,  81,  80,  79,  78,  78,
-       77,  76,  76,  75,  75,  74,  74,  73,
-       73,  72,  72,  71,  71,  71,  70,  70,
-       70,  70,  69,  69,  69,  69,  68,  68,
-       68,  68,  68,  67,  67,  67,  67,  67,
-  };
-  static int size_scales[BLOCK_SIZES] = {
-      16, 8, 8, 8, 4, 4, 4, 2, 2, 2, 1, 1, 1
-  };
-#endif  // CONFIG_EXT_PARTITION
-
   const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
-  const int size_scale = size_scales[plane_bsize];
+  const int size_scale = ii_size_scales[plane_bsize];
   int i, j;
 
   uint16_t *comppred = CONVERT_TO_SHORTPTR(comppred8);
@@ -1826,105 +1782,107 @@
   uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8);
   (void) bd;
 
-  if (use_wedge_interintra && get_wedge_bits(bsize)) {
-    const uint8_t *mask = vp10_get_soft_mask(wedge_index, bsize, bh, bw);
-    for (i = 0; i < bh; ++i) {
-      for (j = 0; j < bw; ++j) {
-        int m = mask[i * MASK_MASTER_STRIDE + j];
-        comppred[i * compstride + j] =
-            (intrapred[i * intrastride + j] * m +
-             interpred[i * interstride + j] * ((1 << WEDGE_WEIGHT_BITS) - m) +
-             (1 << (WEDGE_WEIGHT_BITS - 1))) >> WEDGE_WEIGHT_BITS;
+  if (use_wedge_interintra) {
+    if (get_wedge_bits(bsize)) {
+      const uint8_t *mask = vp10_get_soft_mask(wedge_index, bsize, bh, bw);
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
+          int m = mask[i * MASK_MASTER_STRIDE + j];
+          comppred[i * compstride + j] =
+              (intrapred[i * intrastride + j] * m +
+               interpred[i * interstride + j] * ((1 << WEDGE_WEIGHT_BITS) - m) +
+               (1 << (WEDGE_WEIGHT_BITS - 1))) >> WEDGE_WEIGHT_BITS;
+        }
       }
     }
     return;
   }
 
   switch (mode) {
-    case V_PRED:
+    case II_V_PRED:
       for (i = 0; i < bh; ++i) {
         for (j = 0; j < bw; ++j) {
-          int scale = weights1d[i * size_scale];
+          int scale = ii_weights1d[i * size_scale];
           comppred[i * compstride + j] =
               ((scale_max - scale) * interpred[i * interstride + j] +
                scale * intrapred[i * intrastride + j] + scale_round)
               >> scale_bits;
         }
       }
-     break;
+      break;
 
-    case H_PRED:
+    case II_H_PRED:
       for (i = 0; i < bh; ++i) {
         for (j = 0; j < bw; ++j) {
-          int scale = weights1d[j * size_scale];
-            comppred[i * compstride + j] =
+          int scale = ii_weights1d[j * size_scale];
+          comppred[i * compstride + j] =
               ((scale_max - scale) * interpred[i * interstride + j] +
                scale * intrapred[i * intrastride + j] + scale_round)
-               >> scale_bits;
+              >> scale_bits;
         }
       }
-     break;
+      break;
 
-    case D63_PRED:
-    case D117_PRED:
+    case II_D63_PRED:
+    case II_D117_PRED:
       for (i = 0; i < bh; ++i) {
         for (j = 0; j < bw; ++j) {
-          int scale = (weights1d[i * size_scale] * 3 +
-                       weights1d[j * size_scale]) >> 2;
-            comppred[i * compstride + j] =
+          int scale = (ii_weights1d[i * size_scale] * 3 +
+                       ii_weights1d[j * size_scale]) >> 2;
+          comppred[i * compstride + j] =
               ((scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j] + scale_round)
-                  >> scale_bits;
+               scale * intrapred[i * intrastride + j] + scale_round)
+              >> scale_bits;
         }
       }
-     break;
+      break;
 
-    case D207_PRED:
-    case D153_PRED:
+    case II_D207_PRED:
+    case II_D153_PRED:
       for (i = 0; i < bh; ++i) {
         for (j = 0; j < bw; ++j) {
-          int scale = (weights1d[j * size_scale] * 3 +
-                       weights1d[i * size_scale]) >> 2;
-            comppred[i * compstride + j] =
+          int scale = (ii_weights1d[j * size_scale] * 3 +
+                       ii_weights1d[i * size_scale]) >> 2;
+          comppred[i * compstride + j] =
               ((scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j] + scale_round)
-                  >> scale_bits;
+               scale * intrapred[i * intrastride + j] + scale_round)
+              >> scale_bits;
         }
       }
-     break;
+      break;
 
-    case D135_PRED:
+    case II_D135_PRED:
       for (i = 0; i < bh; ++i) {
         for (j = 0; j < bw; ++j) {
-          int scale = weights1d[(i < j ? i : j) * size_scale];
-            comppred[i * compstride + j] =
+          int scale = ii_weights1d[(i < j ? i : j) * size_scale];
+          comppred[i * compstride + j] =
               ((scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j] + scale_round)
-                  >> scale_bits;
+               scale * intrapred[i * intrastride + j] + scale_round)
+              >> scale_bits;
         }
       }
-     break;
+      break;
 
-    case D45_PRED:
+    case II_D45_PRED:
       for (i = 0; i < bh; ++i) {
         for (j = 0; j < bw; ++j) {
-          int scale = (weights1d[i * size_scale] +
-                       weights1d[j * size_scale]) >> 1;
-            comppred[i * compstride + j] =
+          int scale = (ii_weights1d[i * size_scale] +
+                       ii_weights1d[j * size_scale]) >> 1;
+          comppred[i * compstride + j] =
               ((scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j] + scale_round)
-                  >> scale_bits;
+               scale * intrapred[i * intrastride + j] + scale_round)
+              >> scale_bits;
         }
       }
-     break;
+      break;
 
-    case TM_PRED:
-    case DC_PRED:
+    case II_TM_PRED:
+    case II_DC_PRED:
     default:
       for (i = 0; i < bh; ++i) {
         for (j = 0; j < bw; ++j) {
-            comppred[i * compstride + j] = (interpred[i * interstride + j] +
-                intrapred[i * intrastride + j]) >> 1;
+          comppred[i * compstride + j] = (interpred[i * interstride + j] +
+                                          intrapred[i * intrastride + j]) >> 1;
         }
       }
       break;
@@ -1957,57 +1915,122 @@
     vp10_predict_intra_block(xd, bwl, bhl, max_tx_size, mode,
                              ref, ref_stride, dst, dst_stride,
                              0, 0, plane);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      uint16_t *src_216 = CONVERT_TO_SHORTPTR(src_2);
+      uint16_t *dst_216 = CONVERT_TO_SHORTPTR(dst_2);
+      memcpy(src_216 - ref_stride, dst_216 - dst_stride,
+             sizeof(*src_216) * (4 << bhl));
+    } else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    {
+      memcpy(src_2 - ref_stride, dst_2 - dst_stride,
+             sizeof(*src_2) * (4 << bhl));
+    }
     vp10_predict_intra_block(xd, bwl, bhl, max_tx_size, mode,
                              src_2, ref_stride, dst_2, dst_stride,
                              0, 1 << bwl, plane);
   } else {
+    int i;
     uint8_t *src_2 = ref + (4 << bhl);
     uint8_t *dst_2 = dst + (4 << bhl);
     vp10_predict_intra_block(xd, bwl, bhl, max_tx_size, mode,
                              ref, ref_stride, dst, dst_stride,
                              0, 0, plane);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      uint16_t *src_216 = CONVERT_TO_SHORTPTR(src_2);
+      uint16_t *dst_216 = CONVERT_TO_SHORTPTR(dst_2);
+      for (i = 0; i < (4 << bwl); ++i)
+        src_216[i * ref_stride - 1] = dst_216[i * dst_stride - 1];
+    } else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    {
+      for (i = 0; i < (4 << bwl); ++i)
+        src_2[i * ref_stride - 1] = dst_2[i * dst_stride - 1];
+    }
     vp10_predict_intra_block(xd, bwl, bhl, max_tx_size, mode,
                              src_2, ref_stride, dst_2, dst_stride,
                              1 << bhl, 0, plane);
   }
 }
 
-void vp10_build_interintra_predictors_sby(MACROBLOCKD *xd,
-                                          uint8_t *ypred,
-                                          int ystride,
-                                          BLOCK_SIZE bsize) {
-  const int bw = 4 << b_width_log2_lookup[bsize];
+// Mapping of interintra to intra mode for use in the intra component
+static const int interintra_to_intra_mode[INTERINTRA_MODES] = {
+  DC_PRED,
+  V_PRED,
+  H_PRED,
+  D45_PRED,
+  D135_PRED,
+  D117_PRED,
+  D153_PRED,
+  D207_PRED,
+  D63_PRED,
+  TM_PRED
+};
+
+void vp10_build_intra_predictors_for_interintra(
+    MACROBLOCKD *xd,
+    BLOCK_SIZE bsize, int plane,
+    uint8_t *dst, int dst_stride) {
+  build_intra_predictors_for_interintra(
+      xd, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
+      dst, dst_stride,
+      interintra_to_intra_mode[xd->mi[0]->mbmi.interintra_mode],
+      bsize, plane);
+}
+
+void vp10_combine_interintra(MACROBLOCKD *xd,
+                             BLOCK_SIZE bsize, int plane,
+                             uint8_t *inter_pred, int inter_stride,
+                             uint8_t *intra_pred, int intra_stride) {
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
-    build_intra_predictors_for_interintra(
-        xd, xd->plane[0].dst.buf, xd->plane[0].dst.stride,
-        CONVERT_TO_BYTEPTR(intrapredictor), bw,
-        xd->mi[0]->mbmi.interintra_mode, bsize, 0);
     combine_interintra_highbd(xd->mi[0]->mbmi.interintra_mode,
                               xd->mi[0]->mbmi.use_wedge_interintra,
                               xd->mi[0]->mbmi.interintra_wedge_index,
                               bsize,
-                              bsize,
-                              xd->plane[0].dst.buf, xd->plane[0].dst.stride,
-                              ypred, ystride,
-                              CONVERT_TO_BYTEPTR(intrapredictor), bw, xd->bd);
+                              plane_bsize,
+                              xd->plane[plane].dst.buf,
+                              xd->plane[plane].dst.stride,
+                              inter_pred, inter_stride,
+                              intra_pred, intra_stride,
+                              xd->bd);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  combine_interintra(xd->mi[0]->mbmi.interintra_mode,
+                     xd->mi[0]->mbmi.use_wedge_interintra,
+                     xd->mi[0]->mbmi.interintra_wedge_index,
+                     bsize,
+                     plane_bsize,
+                     xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
+                     inter_pred, inter_stride,
+                     intra_pred, intra_stride);
+}
+
+void vp10_build_interintra_predictors_sby(MACROBLOCKD *xd,
+                                          uint8_t *ypred,
+                                          int ystride,
+                                          BLOCK_SIZE bsize) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        DECLARE_ALIGNED(16, uint16_t,
+                        intrapredictor[MAX_SB_SQUARE]);
+    vp10_build_intra_predictors_for_interintra(
+        xd, bsize, 0, CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
+    vp10_combine_interintra(xd, bsize, 0, ypred, ystride,
+                            CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
     return;
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   {
     uint8_t intrapredictor[MAX_SB_SQUARE];
-    build_intra_predictors_for_interintra(
-        xd, xd->plane[0].dst.buf, xd->plane[0].dst.stride,
-        intrapredictor, bw,
-        xd->mi[0]->mbmi.interintra_mode, bsize, 0);
-    combine_interintra(xd->mi[0]->mbmi.interintra_mode,
-                       xd->mi[0]->mbmi.use_wedge_interintra,
-                       xd->mi[0]->mbmi.interintra_wedge_index,
-                       bsize,
-                       bsize,
-                       xd->plane[0].dst.buf, xd->plane[0].dst.stride,
-                       ypred, ystride, intrapredictor, bw);
+    vp10_build_intra_predictors_for_interintra(
+        xd, bsize, 0, intrapredictor, MAX_SB_SIZE);
+    vp10_combine_interintra(xd, bsize, 0, ypred, ystride,
+                            intrapredictor, MAX_SB_SIZE);
   }
 }
 
@@ -2016,41 +2039,23 @@
                                           int ustride,
                                           int plane,
                                           BLOCK_SIZE bsize) {
-  const BLOCK_SIZE uvbsize = get_plane_block_size(bsize, &xd->plane[plane]);
-  const int bw = 4 << b_width_log2_lookup[uvbsize];
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    DECLARE_ALIGNED(16, uint16_t, uintrapredictor[MAX_SB_SQUARE]);
-    build_intra_predictors_for_interintra(
-        xd, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
-        CONVERT_TO_BYTEPTR(uintrapredictor), bw,
-        xd->mi[0]->mbmi.interintra_uv_mode, bsize, plane);
-    combine_interintra_highbd(xd->mi[0]->mbmi.interintra_uv_mode,
-                              xd->mi[0]->mbmi.use_wedge_interintra,
-                              xd->mi[0]->mbmi.interintra_uv_wedge_index,
-                              bsize,
-                              uvbsize,
-                              xd->plane[plane].dst.buf,
-                              xd->plane[plane].dst.stride,
-                              upred, ustride,
-                              CONVERT_TO_BYTEPTR(uintrapredictor), bw, xd->bd);
+    DECLARE_ALIGNED(16, uint16_t,
+                    uintrapredictor[MAX_SB_SQUARE]);
+    vp10_build_intra_predictors_for_interintra(
+        xd, bsize, plane, CONVERT_TO_BYTEPTR(uintrapredictor), MAX_SB_SIZE);
+    vp10_combine_interintra(xd, bsize, plane, upred, ustride,
+                            CONVERT_TO_BYTEPTR(uintrapredictor), MAX_SB_SIZE);
     return;
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   {
     uint8_t uintrapredictor[MAX_SB_SQUARE];
-    build_intra_predictors_for_interintra(
-        xd, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
-        uintrapredictor, bw,
-        xd->mi[0]->mbmi.interintra_uv_mode, bsize, plane);
-    combine_interintra(xd->mi[0]->mbmi.interintra_uv_mode,
-                       xd->mi[0]->mbmi.use_wedge_interintra,
-                       xd->mi[0]->mbmi.interintra_uv_wedge_index,
-                       bsize,
-                       uvbsize,
-                       xd->plane[plane].dst.buf,
-                       xd->plane[plane].dst.stride,
-                       upred, ustride, uintrapredictor, bw);
+    vp10_build_intra_predictors_for_interintra(
+        xd, bsize, plane, uintrapredictor, MAX_SB_SIZE);
+    vp10_combine_interintra(xd, bsize, plane, upred, ustride,
+                            uintrapredictor, MAX_SB_SIZE);
   }
 }
 
diff --git a/vp10/common/reconinter.h b/vp10/common/reconinter.h
index eda1658..9067c4b 100644
--- a/vp10/common/reconinter.h
+++ b/vp10/common/reconinter.h
@@ -436,6 +436,25 @@
                                            int ustride, int vstride,
                                            BLOCK_SIZE bsize);
 
+void vp10_build_intra_predictors_for_interintra(
+    MACROBLOCKD *xd,
+    BLOCK_SIZE bsize, int plane,
+    uint8_t *intra_pred, int intra_stride);
+void vp10_combine_interintra(
+    MACROBLOCKD *xd,
+    BLOCK_SIZE bsize, int plane,
+    uint8_t *inter_pred, int inter_stride,
+    uint8_t *intra_pred, int intra_stride);
+void vp10_build_interintra_predictors_sbuv(MACROBLOCKD *xd,
+                                           uint8_t *upred,
+                                           uint8_t *vpred,
+                                           int ustride, int vstride,
+                                           BLOCK_SIZE bsize);
+void vp10_build_interintra_predictors_sby(MACROBLOCKD *xd,
+                                          uint8_t *ypred,
+                                          int ystride,
+                                          BLOCK_SIZE bsize);
+
 // Encoder only
 void vp10_build_inter_predictors_for_planes_single_buf(
     MACROBLOCKD *xd, BLOCK_SIZE bsize,
diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c
index 300005f..bafd0d6 100644
--- a/vp10/common/reconintra.c
+++ b/vp10/common/reconintra.c
@@ -272,21 +272,19 @@
       if (x + step < w)
         return 1;
 
-      mi_row = (mi_row & MI_MASK) >> hl;
-      mi_col = (mi_col & MI_MASK) >> wl;
+      mi_row = (mi_row & MAX_MIB_MASK) >> hl;
+      mi_col = (mi_col & MAX_MIB_MASK) >> wl;
 
       // If top row of coding unit
       if (mi_row == 0)
         return 1;
 
       // If rightmost column of coding unit
-      if (((mi_col + 1) << wl) >= MI_BLOCK_SIZE)
+      if (((mi_col + 1) << wl) >= MAX_MIB_SIZE)
         return 0;
 
-      my_order =
-        order[((mi_row + 0) << (MI_BLOCK_SIZE_LOG2 - wl)) + mi_col + 0];
-      tr_order =
-        order[((mi_row - 1) << (MI_BLOCK_SIZE_LOG2 - wl)) + mi_col + 1];
+      my_order = order[((mi_row + 0) << (MAX_MIB_SIZE_LOG2 - wl)) + mi_col + 0];
+      tr_order = order[((mi_row - 1) << (MAX_MIB_SIZE_LOG2 - wl)) + mi_col + 1];
 
       return my_order > tr_order;
     } else {
@@ -315,17 +313,17 @@
     if (y + step < h)
       return 1;
 
-    mi_row = (mi_row & MI_MASK) >> hl;
-    mi_col = (mi_col & MI_MASK) >> wl;
+    mi_row = (mi_row & MAX_MIB_MASK) >> hl;
+    mi_col = (mi_col & MAX_MIB_MASK) >> wl;
 
     if (mi_col == 0)
-      return (mi_row << (hl + !ss_y)) + y + step < (MI_BLOCK_SIZE << !ss_y);
+      return (mi_row << (hl + !ss_y)) + y + step < (MAX_MIB_SIZE << !ss_y);
 
-    if (((mi_row + 1) << hl) >= MI_BLOCK_SIZE)
+    if (((mi_row + 1) << hl) >= MAX_MIB_SIZE)
       return 0;
 
-    my_order = order[((mi_row + 0) << (MI_BLOCK_SIZE_LOG2 - wl)) + mi_col + 0];
-    bl_order = order[((mi_row + 1) << (MI_BLOCK_SIZE_LOG2 - wl)) + mi_col - 1];
+    my_order = order[((mi_row + 0) << (MAX_MIB_SIZE_LOG2 - wl)) + mi_col + 0];
+    bl_order = order[((mi_row + 1) << (MAX_MIB_SIZE_LOG2 - wl)) + mi_col - 1];
 
     return bl_order < my_order;
   }
diff --git a/vp10/common/scan.c b/vp10/common/scan.c
index 2644ecf..9fef038 100644
--- a/vp10/common/scan.c
+++ b/vp10/common/scan.c
@@ -793,8 +793,9 @@
 // -1 indicates the neighbor does not exist.
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 0, 0, 1, 4, 4, 4, 1, 1, 8, 8, 5, 8, 2, 2, 2, 5, 9, 12, 6, 9,
-  3, 6, 10, 13, 7, 10, 11, 14, 0, 0,
+  0,   0,   0,   0,   4,   0,   1,   4,   4,   5,   5,   1,
+  8,   8,   5,   8,   2,   2,   2,   5,   9,  12,   6,   9,
+  3,   6,  10,  13,   7,  10,  11,  14,   0,   0,
 };
 
 #if CONFIG_EXT_TX
@@ -813,25 +814,31 @@
 
 DECLARE_ALIGNED(16, static const int16_t,
                 col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 4, 4, 0, 0, 8, 8, 1, 1, 5, 5, 1, 1, 9, 9, 2, 2, 6, 6, 2, 2, 3,
-  3, 10, 10, 7, 7, 11, 11, 0, 0,
+  0,   0,   0,   0,   4,   4,   4,   0,   8,   8,   1,   4,
+  5,   8,   5,   1,   9,  12,   2,   5,   6,   9,   6,   2,
+  3,   6,  10,  13,   7,  10,  11,  14,   0,   0,
 };
 
 DECLARE_ALIGNED(16, static const int16_t,
                 row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 0, 0, 1, 1, 4, 4, 2, 2, 5, 5, 4, 4, 8, 8, 6, 6, 8, 8, 9, 9, 12,
-  12, 10, 10, 13, 13, 14, 14, 0, 0,
+  0,   0,   0,   0,   0,   1,   1,   1,   1,   4,   2,   2,
+  2,   5,   4,   5,   5,   8,   3,   6,   8,   9,   6,   9,
+  9,  12,   7,  10,  10,  13,  11,  14,   0,   0,
 };
 
 DECLARE_ALIGNED(16, static const int16_t,
                 col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 8, 8, 0, 0, 16, 16, 1, 1, 24, 24, 9, 9, 1, 1, 32, 32, 17, 17, 2,
-  2, 25, 25, 10, 10, 40, 40, 2, 2, 18, 18, 33, 33, 3, 3, 48, 48, 11, 11, 26,
-  26, 3, 3, 41, 41, 19, 19, 34, 34, 4, 4, 27, 27, 12, 12, 49, 49, 42, 42, 20,
-  20, 4, 4, 35, 35, 5, 5, 28, 28, 50, 50, 43, 43, 13, 13, 36, 36, 5, 5, 21, 21,
-  51, 51, 29, 29, 6, 6, 44, 44, 14, 14, 6, 6, 37, 37, 52, 52, 22, 22, 7, 7, 30,
-  30, 45, 45, 15, 15, 38, 38, 23, 23, 53, 53, 31, 31, 46, 46, 39, 39, 54, 54,
-  47, 47, 55, 55, 0, 0,
+  0,   0,   0,   0,   8,   8,   8,   0,  16,  16,   1,   8,
+  24,  24,   9,  16,   9,   1,  32,  32,  17,  24,   2,   9,
+  25,  32,  10,  17,  40,  40,  10,   2,  18,  25,  33,  40,
+  3,  10,  48,  48,  11,  18,  26,  33,  11,   3,  41,  48,
+  19,  26,  34,  41,   4,  11,  27,  34,  12,  19,  49,  56,
+  42,  49,  20,  27,  12,   4,  35,  42,   5,  12,  28,  35,
+  50,  57,  43,  50,  13,  20,  36,  43,  13,   5,  21,  28,
+  51,  58,  29,  36,   6,  13,  44,  51,  14,  21,  14,   6,
+  37,  44,  52,  59,  22,  29,   7,  14,  30,  37,  45,  52,
+  15,  22,  38,  45,  23,  30,  53,  60,  31,  38,  46,  53,
+  39,  46,  54,  61,  47,  54,  55,  62,   0,   0,
 };
 
 #if CONFIG_EXT_TX
@@ -860,24 +867,32 @@
 
 DECLARE_ALIGNED(16, static const int16_t,
                 row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 1, 1, 0, 0, 8, 8, 2, 2, 8, 8, 9, 9, 3, 3, 16, 16, 10, 10, 16, 16,
-  4, 4, 17, 17, 24, 24, 11, 11, 18, 18, 25, 25, 24, 24, 5, 5, 12, 12, 19, 19,
-  32, 32, 26, 26, 6, 6, 33, 33, 32, 32, 20, 20, 27, 27, 40, 40, 13, 13, 34, 34,
-  40, 40, 41, 41, 28, 28, 35, 35, 48, 48, 21, 21, 42, 42, 14, 14, 48, 48, 36,
-  36, 49, 49, 43, 43, 29, 29, 56, 56, 22, 22, 50, 50, 57, 57, 44, 44, 37, 37,
-  51, 51, 30, 30, 58, 58, 52, 52, 45, 45, 59, 59, 38, 38, 60, 60, 46, 46, 53,
-  53, 54, 54, 61, 61, 62, 62, 0, 0,
+  0,   0,   0,   0,   1,   1,   0,   1,   1,   8,   2,   2,
+  8,   9,   2,   9,   3,   3,   9,  16,   3,  10,  16,  17,
+  4,   4,  10,  17,  17,  24,   4,  11,  11,  18,  18,  25,
+  24,  25,   5,   5,   5,  12,  12,  19,  25,  32,  19,  26,
+  6,   6,  26,  33,  32,  33,  13,  20,  20,  27,  33,  40,
+  6,  13,  27,  34,  40,  41,  34,  41,  21,  28,  28,  35,
+  41,  48,  14,  21,  35,  42,   7,  14,  48,  49,  29,  36,
+  42,  49,  36,  43,  22,  29,  49,  56,  15,  22,  43,  50,
+  50,  57,  37,  44,  30,  37,  44,  51,  23,  30,  51,  58,
+  45,  52,  38,  45,  52,  59,  31,  38,  53,  60,  39,  46,
+  46,  53,  47,  54,  54,  61,  55,  62,   0,   0,
 };
 
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 0, 0, 8, 8, 1, 8, 1, 1, 9, 16, 16, 16, 2, 9, 2, 2, 10, 17, 17,
-  24, 24, 24, 3, 10, 3, 3, 18, 25, 25, 32, 11, 18, 32, 32, 4, 11, 26, 33, 19,
-  26, 4, 4, 33, 40, 12, 19, 40, 40, 5, 12, 27, 34, 34, 41, 20, 27, 13, 20, 5,
-  5, 41, 48, 48, 48, 28, 35, 35, 42, 21, 28, 6, 6, 6, 13, 42, 49, 49, 56, 36,
-  43, 14, 21, 29, 36, 7, 14, 43, 50, 50, 57, 22, 29, 37, 44, 15, 22, 44, 51,
-  51, 58, 30, 37, 23, 30, 52, 59, 45, 52, 38, 45, 31, 38, 53, 60, 46, 53, 39,
-  46, 54, 61, 47, 54, 55, 62, 0, 0,
+  0,   0,   0,   0,   8,   0,   8,   8,   1,   8,   9,   1,
+  9,  16,  16,  17,   2,   9,  10,   2,  10,  17,  17,  24,
+  24,  25,   3,  10,  11,   3,  18,  25,  25,  32,  11,  18,
+  32,  33,   4,  11,  26,  33,  19,  26,  12,   4,  33,  40,
+  12,  19,  40,  41,   5,  12,  27,  34,  34,  41,  20,  27,
+  13,  20,  13,   5,  41,  48,  48,  49,  28,  35,  35,  42,
+  21,  28,   6,   6,   6,  13,  42,  49,  49,  56,  36,  43,
+  14,  21,  29,  36,   7,  14,  43,  50,  50,  57,  22,  29,
+  37,  44,  15,  22,  44,  51,  51,  58,  30,  37,  23,  30,
+  52,  59,  45,  52,  38,  45,  31,  38,  53,  60,  46,  53,
+  39,  46,  54,  61,  47,  54,  55,  62,   0,   0,
 };
 
 #if CONFIG_EXT_TX
@@ -992,113 +1007,143 @@
 
 DECLARE_ALIGNED(16, static const int16_t,
                 col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 16, 16, 32, 32, 0, 0, 48, 48, 1, 1, 64, 64,
-  17, 17, 80, 80, 33, 33, 1, 1, 49, 49, 96, 96, 2, 2, 65, 65,
-  18, 18, 112, 112, 34, 34, 81, 81, 2, 2, 50, 50, 128, 128, 3, 3,
-  97, 97, 19, 19, 66, 66, 144, 144, 82, 82, 35, 35, 113, 113, 3, 3,
-  51, 51, 160, 160, 4, 4, 98, 98, 129, 129, 67, 67, 20, 20, 83, 83,
-  114, 114, 36, 36, 176, 176, 4, 4, 145, 145, 52, 52, 99, 99, 5, 5,
-  130, 130, 68, 68, 192, 192, 161, 161, 21, 21, 115, 115, 84, 84, 37, 37,
-  146, 146, 208, 208, 53, 53, 5, 5, 100, 100, 177, 177, 131, 131, 69, 69,
-  6, 6, 224, 224, 116, 116, 22, 22, 162, 162, 85, 85, 147, 147, 38, 38,
-  193, 193, 101, 101, 54, 54, 6, 6, 132, 132, 178, 178, 70, 70, 163, 163,
-  209, 209, 7, 7, 117, 117, 23, 23, 148, 148, 7, 7, 86, 86, 194, 194,
-  225, 225, 39, 39, 179, 179, 102, 102, 133, 133, 55, 55, 164, 164, 8, 8,
-  71, 71, 210, 210, 118, 118, 149, 149, 195, 195, 24, 24, 87, 87, 40, 40,
-  56, 56, 134, 134, 180, 180, 226, 226, 103, 103, 8, 8, 165, 165, 211, 211,
-  72, 72, 150, 150, 9, 9, 119, 119, 25, 25, 88, 88, 196, 196, 41, 41,
-  135, 135, 181, 181, 104, 104, 57, 57, 227, 227, 166, 166, 120, 120, 151, 151,
-  197, 197, 73, 73, 9, 9, 212, 212, 89, 89, 136, 136, 182, 182, 10, 10,
-  26, 26, 105, 105, 167, 167, 228, 228, 152, 152, 42, 42, 121, 121, 213, 213,
-  58, 58, 198, 198, 74, 74, 137, 137, 183, 183, 168, 168, 10, 10, 90, 90,
-  229, 229, 11, 11, 106, 106, 214, 214, 153, 153, 27, 27, 199, 199, 43, 43,
-  184, 184, 122, 122, 169, 169, 230, 230, 59, 59, 11, 11, 75, 75, 138, 138,
-  200, 200, 215, 215, 91, 91, 12, 12, 28, 28, 185, 185, 107, 107, 154, 154,
-  44, 44, 231, 231, 216, 216, 60, 60, 123, 123, 12, 12, 76, 76, 201, 201,
-  170, 170, 232, 232, 139, 139, 92, 92, 13, 13, 108, 108, 29, 29, 186, 186,
-  217, 217, 155, 155, 45, 45, 13, 13, 61, 61, 124, 124, 14, 14, 233, 233,
-  77, 77, 14, 14, 171, 171, 140, 140, 202, 202, 30, 30, 93, 93, 109, 109,
-  46, 46, 156, 156, 62, 62, 187, 187, 15, 15, 125, 125, 218, 218, 78, 78,
-  31, 31, 172, 172, 47, 47, 141, 141, 94, 94, 234, 234, 203, 203, 63, 63,
-  110, 110, 188, 188, 157, 157, 126, 126, 79, 79, 173, 173, 95, 95, 219, 219,
-  142, 142, 204, 204, 235, 235, 111, 111, 158, 158, 127, 127, 189, 189, 220,
-  220, 143, 143, 174, 174, 205, 205, 236, 236, 159, 159, 190, 190, 221, 221,
-  175, 175, 237, 237, 206, 206, 222, 222, 191, 191, 238, 238, 207, 207, 223,
-  223, 239, 239, 0, 0,
+  0,   0,   0,   0,  16,  16,  32,  32,  16,   0,  48,  48,
+  1,  16,  64,  64,  17,  32,  80,  80,  33,  48,  17,   1,
+  49,  64,  96,  96,   2,  17,  65,  80,  18,  33, 112, 112,
+  34,  49,  81,  96,  18,   2,  50,  65, 128, 128,   3,  18,
+  97, 112,  19,  34,  66,  81, 144, 144,  82,  97,  35,  50,
+  113, 128,  19,   3,  51,  66, 160, 160,   4,  19,  98, 113,
+  129, 144,  67,  82,  20,  35,  83,  98, 114, 129,  36,  51,
+  176, 176,  20,   4, 145, 160,  52,  67,  99, 114,   5,  20,
+  130, 145,  68,  83, 192, 192, 161, 176,  21,  36, 115, 130,
+  84,  99,  37,  52, 146, 161, 208, 208,  53,  68,  21,   5,
+  100, 115, 177, 192, 131, 146,  69,  84,   6,  21, 224, 224,
+  116, 131,  22,  37, 162, 177,  85, 100, 147, 162,  38,  53,
+  193, 208, 101, 116,  54,  69,  22,   6, 132, 147, 178, 193,
+  70,  85, 163, 178, 209, 224,   7,  22, 117, 132,  23,  38,
+  148, 163,  23,   7,  86, 101, 194, 209, 225, 240,  39,  54,
+  179, 194, 102, 117, 133, 148,  55,  70, 164, 179,   8,  23,
+  71,  86, 210, 225, 118, 133, 149, 164, 195, 210,  24,  39,
+  87, 102,  40,  55,  56,  71, 134, 149, 180, 195, 226, 241,
+  103, 118,  24,   8, 165, 180, 211, 226,  72,  87, 150, 165,
+  9,  24, 119, 134,  25,  40,  88, 103, 196, 211,  41,  56,
+  135, 150, 181, 196, 104, 119,  57,  72, 227, 242, 166, 181,
+  120, 135, 151, 166, 197, 212,  73,  88,  25,   9, 212, 227,
+  89, 104, 136, 151, 182, 197,  10,  25,  26,  41, 105, 120,
+  167, 182, 228, 243, 152, 167,  42,  57, 121, 136, 213, 228,
+  58,  73, 198, 213,  74,  89, 137, 152, 183, 198, 168, 183,
+  26,  10,  90, 105, 229, 244,  11,  26, 106, 121, 214, 229,
+  153, 168,  27,  42, 199, 214,  43,  58, 184, 199, 122, 137,
+  169, 184, 230, 245,  59,  74,  27,  11,  75,  90, 138, 153,
+  200, 215, 215, 230,  91, 106,  12,  27,  28,  43, 185, 200,
+  107, 122, 154, 169,  44,  59, 231, 246, 216, 231,  60,  75,
+  123, 138,  28,  12,  76,  91, 201, 216, 170, 185, 232, 247,
+  139, 154,  92, 107,  13,  28, 108, 123,  29,  44, 186, 201,
+  217, 232, 155, 170,  45,  60,  29,  13,  61,  76, 124, 139,
+  14,  14, 233, 248,  77,  92,  14,  29, 171, 186, 140, 155,
+  202, 217,  30,  45,  93, 108, 109, 124,  46,  61, 156, 171,
+  62,  77, 187, 202,  15,  30, 125, 140, 218, 233,  78,  93,
+  31,  46, 172, 187,  47,  62, 141, 156,  94, 109, 234, 249,
+  203, 218,  63,  78, 110, 125, 188, 203, 157, 172, 126, 141,
+  79,  94, 173, 188,  95, 110, 219, 234, 142, 157, 204, 219,
+  235, 250, 111, 126, 158, 173, 127, 142, 189, 204, 220, 235,
+  143, 158, 174, 189, 205, 220, 236, 251, 159, 174, 190, 205,
+  221, 236, 175, 190, 237, 252, 206, 221, 222, 237, 191, 206,
+  238, 253, 207, 222, 223, 238, 239, 254,   0,   0,
 };
 
 DECLARE_ALIGNED(16, static const int16_t,
                 row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 16, 16, 3, 3, 17, 17,
-  16, 16, 4, 4, 32, 32, 18, 18, 5, 5, 33, 33, 32, 32, 19, 19,
-  48, 48, 6, 6, 34, 34, 20, 20, 49, 49, 48, 48, 7, 7, 35, 35,
-  64, 64, 21, 21, 50, 50, 36, 36, 64, 64, 8, 8, 65, 65, 51, 51,
-  22, 22, 37, 37, 80, 80, 66, 66, 9, 9, 52, 52, 23, 23, 81, 81,
-  67, 67, 80, 80, 38, 38, 10, 10, 53, 53, 82, 82, 96, 96, 68, 68,
-  24, 24, 97, 97, 83, 83, 39, 39, 96, 96, 54, 54, 11, 11, 69, 69,
-  98, 98, 112, 112, 84, 84, 25, 25, 40, 40, 55, 55, 113, 113, 99, 99,
-  12, 12, 70, 70, 112, 112, 85, 85, 26, 26, 114, 114, 100, 100, 128, 128,
-  41, 41, 56, 56, 71, 71, 115, 115, 13, 13, 86, 86, 129, 129, 101, 101,
-  128, 128, 72, 72, 130, 130, 116, 116, 27, 27, 57, 57, 14, 14, 87, 87,
-  42, 42, 144, 144, 102, 102, 131, 131, 145, 145, 117, 117, 73, 73, 144, 144,
-  88, 88, 132, 132, 103, 103, 28, 28, 58, 58, 146, 146, 118, 118, 43, 43,
-  160, 160, 147, 147, 89, 89, 104, 104, 133, 133, 161, 161, 119, 119, 160, 160,
-  74, 74, 134, 134, 148, 148, 29, 29, 59, 59, 162, 162, 176, 176, 44, 44,
-  120, 120, 90, 90, 105, 105, 163, 163, 177, 177, 149, 149, 176, 176, 135, 135,
-  164, 164, 178, 178, 30, 30, 150, 150, 192, 192, 75, 75, 121, 121, 60, 60,
-  136, 136, 193, 193, 106, 106, 151, 151, 179, 179, 192, 192, 45, 45, 165, 165,
-  166, 166, 194, 194, 91, 91, 180, 180, 137, 137, 208, 208, 122, 122, 152, 152,
-  208, 208, 195, 195, 76, 76, 167, 167, 209, 209, 181, 181, 224, 224, 107, 107,
-  196, 196, 61, 61, 153, 153, 224, 224, 182, 182, 168, 168, 210, 210, 46, 46,
-  138, 138, 92, 92, 183, 183, 225, 225, 211, 211, 240, 240, 197, 197, 169, 169,
-  123, 123, 154, 154, 198, 198, 77, 77, 212, 212, 184, 184, 108, 108, 226, 226,
-  199, 199, 62, 62, 227, 227, 241, 241, 139, 139, 213, 213, 170, 170, 185, 185,
-  155, 155, 228, 228, 242, 242, 124, 124, 93, 93, 200, 200, 243, 243, 214, 214,
-  215, 215, 229, 229, 140, 140, 186, 186, 201, 201, 78, 78, 171, 171, 109, 109,
-  156, 156, 244, 244, 216, 216, 230, 230, 94, 94, 245, 245, 231, 231, 125, 125,
-  202, 202, 246, 246, 232, 232, 172, 172, 217, 217, 141, 141, 110, 110, 157,
-  157, 187, 187, 247, 247, 126, 126, 233, 233, 218, 218, 248, 248, 188, 188,
-  203, 203, 142, 142, 173, 173, 158, 158, 249, 249, 234, 234, 204, 204, 219,
-  219, 174, 174, 189, 189, 250, 250, 220, 220, 190, 190, 205, 205, 235, 235,
-  206, 206, 236, 236, 251, 251, 221, 221, 252, 252, 222, 222, 237, 237, 238,
-  238, 253, 253, 254, 254, 0, 0,
+  0,   0,   0,   0,   1,   1,   0,   1,   2,   2,   1,  16,
+  3,   3,   2,  17,  16,  17,   4,   4,  17,  32,   3,  18,
+  5,   5,  18,  33,  32,  33,   4,  19,  33,  48,   6,   6,
+  19,  34,   5,  20,  34,  49,  48,  49,   7,   7,  20,  35,
+  49,  64,   6,  21,  35,  50,  21,  36,  64,  65,   8,   8,
+  50,  65,  36,  51,   7,  22,  22,  37,  65,  80,  51,  66,
+  9,   9,  37,  52,   8,  23,  66,  81,  52,  67,  80,  81,
+  23,  38,  10,  10,  38,  53,  67,  82,  81,  96,  53,  68,
+  9,  24,  82,  97,  68,  83,  24,  39,  96,  97,  39,  54,
+  11,  11,  54,  69,  83,  98,  97, 112,  69,  84,  10,  25,
+  25,  40,  40,  55,  98, 113,  84,  99,  12,  12,  55,  70,
+  112, 113,  70,  85,  11,  26,  99, 114,  85, 100, 113, 128,
+  26,  41,  41,  56,  56,  71, 100, 115,  13,  13,  71,  86,
+  114, 129,  86, 101, 128, 129,  57,  72, 115, 130, 101, 116,
+  12,  27,  42,  57,  14,  14,  72,  87,  27,  42, 129, 144,
+  87, 102, 116, 131, 130, 145, 102, 117,  58,  73, 144, 145,
+  73,  88, 117, 132,  88, 103,  13,  28,  43,  58, 131, 146,
+  103, 118,  28,  43, 145, 160, 132, 147,  74,  89,  89, 104,
+  118, 133, 146, 161, 104, 119, 160, 161,  59,  74, 119, 134,
+  133, 148,  14,  29,  44,  59, 147, 162, 161, 176,  29,  44,
+  105, 120,  75,  90,  90, 105, 148, 163, 162, 177, 134, 149,
+  176, 177, 120, 135, 149, 164, 163, 178,  15,  30, 135, 150,
+  177, 192,  60,  75, 106, 121,  45,  60, 121, 136, 178, 193,
+  91, 106, 136, 151, 164, 179, 192, 193,  30,  45, 150, 165,
+  151, 166, 179, 194,  76,  91, 165, 180, 122, 137, 193, 208,
+  107, 122, 137, 152, 208, 209, 180, 195,  61,  76, 152, 167,
+  194, 209, 166, 181, 224, 224,  92, 107, 181, 196,  46,  61,
+  138, 153, 209, 224, 167, 182, 153, 168, 195, 210,  31,  46,
+  123, 138,  77,  92, 168, 183, 210, 225, 196, 211, 225, 240,
+  182, 197, 154, 169, 108, 123, 139, 154, 183, 198,  62,  77,
+  197, 212, 169, 184,  93, 108, 211, 226, 184, 199,  47,  62,
+  212, 227, 226, 241, 124, 139, 198, 213, 155, 170, 170, 185,
+  140, 155, 213, 228, 227, 242, 109, 124,  78,  93, 185, 200,
+  228, 243, 199, 214, 200, 215, 214, 229, 125, 140, 171, 186,
+  186, 201,  63,  78, 156, 171,  94, 109, 141, 156, 229, 244,
+  201, 216, 215, 230,  79,  94, 230, 245, 216, 231, 110, 125,
+  187, 202, 231, 246, 217, 232, 157, 172, 202, 217, 126, 141,
+  95, 110, 142, 157, 172, 187, 232, 247, 111, 126, 218, 233,
+  203, 218, 233, 248, 173, 188, 188, 203, 127, 142, 158, 173,
+  143, 158, 234, 249, 219, 234, 189, 204, 204, 219, 159, 174,
+  174, 189, 235, 250, 205, 220, 175, 190, 190, 205, 220, 235,
+  191, 206, 221, 236, 236, 251, 206, 221, 237, 252, 207, 222,
+  222, 237, 223, 238, 238, 253, 239, 254,   0,   0,
 };
 
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 0, 0, 16, 16, 1, 16, 1, 1, 32, 32, 17, 32,
-  2, 17, 2, 2, 48, 48, 18, 33, 33, 48, 3, 18, 49, 64, 64, 64,
-  34, 49, 3, 3, 19, 34, 50, 65, 4, 19, 65, 80, 80, 80, 35, 50,
-  4, 4, 20, 35, 66, 81, 81, 96, 51, 66, 96, 96, 5, 20, 36, 51,
-  82, 97, 21, 36, 67, 82, 97, 112, 5, 5, 52, 67, 112, 112, 37, 52,
-  6, 21, 83, 98, 98, 113, 68, 83, 6, 6, 113, 128, 22, 37, 53, 68,
-  84, 99, 99, 114, 128, 128, 114, 129, 69, 84, 38, 53, 7, 22, 7, 7,
-  129, 144, 23, 38, 54, 69, 100, 115, 85, 100, 115, 130, 144, 144, 130, 145,
-  39, 54, 70, 85, 8, 23, 55, 70, 116, 131, 101, 116, 145, 160, 24, 39,
-  8, 8, 86, 101, 131, 146, 160, 160, 146, 161, 71, 86, 40, 55, 9, 24,
-  117, 132, 102, 117, 161, 176, 132, 147, 56, 71, 87, 102, 25, 40, 147, 162,
-  9, 9, 176, 176, 162, 177, 72, 87, 41, 56, 118, 133, 133, 148, 103, 118,
-  10, 25, 148, 163, 57, 72, 88, 103, 177, 192, 26, 41, 163, 178, 192, 192,
-  10, 10, 119, 134, 73, 88, 149, 164, 104, 119, 134, 149, 42, 57, 178, 193,
-  164, 179, 11, 26, 58, 73, 193, 208, 89, 104, 135, 150, 120, 135, 27, 42,
-  74, 89, 208, 208, 150, 165, 179, 194, 165, 180, 105, 120, 194, 209, 43, 58,
-  11, 11, 136, 151, 90, 105, 151, 166, 180, 195, 59, 74, 121, 136, 209, 224,
-  195, 210, 224, 224, 166, 181, 106, 121, 75, 90, 12, 27, 181, 196, 12, 12,
-  210, 225, 152, 167, 167, 182, 137, 152, 28, 43, 196, 211, 122, 137, 91, 106,
-  225, 240, 44, 59, 13, 28, 107, 122, 182, 197, 168, 183, 211, 226, 153, 168,
-  226, 241, 60, 75, 197, 212, 138, 153, 29, 44, 76, 91, 13, 13, 183, 198,
-  123, 138, 45, 60, 212, 227, 198, 213, 154, 169, 169, 184, 227, 242, 92, 107,
-  61, 76, 139, 154, 14, 29, 14, 14, 184, 199, 213, 228, 108, 123, 199, 214,
-  228, 243, 77, 92, 30, 45, 170, 185, 155, 170, 185, 200, 93, 108, 124, 139,
-  214, 229, 46, 61, 200, 215, 229, 244, 15, 30, 109, 124, 62, 77, 140, 155,
-  215, 230, 31, 46, 171, 186, 186, 201, 201, 216, 78, 93, 230, 245, 125, 140,
-  47, 62, 216, 231, 156, 171, 94, 109, 231, 246, 141, 156, 63, 78, 202, 217,
-  187, 202, 110, 125, 217, 232, 172, 187, 232, 247, 79, 94, 157, 172, 126, 141,
-  203, 218, 95, 110, 233, 248, 218, 233, 142, 157, 111, 126, 173, 188, 188, 203,
-  234, 249, 219, 234, 127, 142, 158, 173, 204, 219, 189, 204, 143, 158, 235,
-  250, 174, 189, 205, 220, 159, 174, 220, 235, 221, 236, 175, 190, 190, 205,
-  236, 251, 206, 221, 237, 252, 191, 206, 222, 237, 207, 222, 238, 253, 223,
-  238, 239, 254, 0, 0,
+  0,   0,   0,   0,  16,   0,  16,  16,   1,  16,  17,   1,
+  32,  32,  17,  32,   2,  17,  18,   2,  48,  48,  18,  33,
+  33,  48,   3,  18,  49,  64,  64,  65,  34,  49,  19,   3,
+  19,  34,  50,  65,   4,  19,  65,  80,  80,  81,  35,  50,
+  20,   4,  20,  35,  66,  81,  81,  96,  51,  66,  96,  97,
+  5,  20,  36,  51,  82,  97,  21,  36,  67,  82,  97, 112,
+  21,   5,  52,  67, 112, 113,  37,  52,   6,  21,  83,  98,
+  98, 113,  68,  83,  22,   6, 113, 128,  22,  37,  53,  68,
+  84,  99,  99, 114, 128, 129, 114, 129,  69,  84,  38,  53,
+  7,  22,  23,   7, 129, 144,  23,  38,  54,  69, 100, 115,
+  85, 100, 115, 130, 144, 145, 130, 145,  39,  54,  70,  85,
+  8,  23,  55,  70, 116, 131, 101, 116, 145, 160,  24,  39,
+  24,   8,  86, 101, 131, 146, 160, 161, 146, 161,  71,  86,
+  40,  55,   9,  24, 117, 132, 102, 117, 161, 176, 132, 147,
+  56,  71,  87, 102,  25,  40, 147, 162,  25,   9, 176, 177,
+  162, 177,  72,  87,  41,  56, 118, 133, 133, 148, 103, 118,
+  10,  25, 148, 163,  57,  72,  88, 103, 177, 192,  26,  41,
+  163, 178, 192, 193,  26,  10, 119, 134,  73,  88, 149, 164,
+  104, 119, 134, 149,  42,  57, 178, 193, 164, 179,  11,  26,
+  58,  73, 193, 208,  89, 104, 135, 150, 120, 135,  27,  42,
+  74,  89, 208, 209, 150, 165, 179, 194, 165, 180, 105, 120,
+  194, 209,  43,  58,  27,  11, 136, 151,  90, 105, 151, 166,
+  180, 195,  59,  74, 121, 136, 209, 224, 195, 210, 224, 225,
+  166, 181, 106, 121,  75,  90,  12,  27, 181, 196,  28,  12,
+  210, 225, 152, 167, 167, 182, 137, 152,  28,  43, 196, 211,
+  122, 137,  91, 106, 225, 240,  44,  59,  13,  28, 107, 122,
+  182, 197, 168, 183, 211, 226, 153, 168, 226, 241,  60,  75,
+  197, 212, 138, 153,  29,  44,  76,  91,  29,  13, 183, 198,
+  123, 138,  45,  60, 212, 227, 198, 213, 154, 169, 169, 184,
+  227, 242,  92, 107,  61,  76, 139, 154,  14,  29,  30,  14,
+  184, 199, 213, 228, 108, 123, 199, 214, 228, 243,  77,  92,
+  30,  45, 170, 185, 155, 170, 185, 200,  93, 108, 124, 139,
+  214, 229,  46,  61, 200, 215, 229, 244,  15,  30, 109, 124,
+  62,  77, 140, 155, 215, 230,  31,  46, 171, 186, 186, 201,
+  201, 216,  78,  93, 230, 245, 125, 140,  47,  62, 216, 231,
+  156, 171,  94, 109, 231, 246, 141, 156,  63,  78, 202, 217,
+  187, 202, 110, 125, 217, 232, 172, 187, 232, 247,  79,  94,
+  157, 172, 126, 141, 203, 218,  95, 110, 233, 248, 218, 233,
+  142, 157, 111, 126, 173, 188, 188, 203, 234, 249, 219, 234,
+  127, 142, 158, 173, 204, 219, 189, 204, 143, 158, 235, 250,
+  174, 189, 205, 220, 159, 174, 220, 235, 221, 236, 175, 190,
+  190, 205, 236, 251, 206, 221, 237, 252, 191, 206, 222, 237,
+  207, 222, 238, 253, 223, 238, 239, 254,   0,   0,
 };
 
 #if CONFIG_EXT_TX
@@ -1493,139 +1538,177 @@
 
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 0, 0, 32, 32, 1, 32, 1, 1, 64, 64, 33, 64,
-  2, 33, 96, 96, 2, 2, 65, 96, 34, 65, 128, 128, 97, 128, 3, 34,
-  66, 97, 3, 3, 35, 66, 98, 129, 129, 160, 160, 160, 4, 35, 67, 98,
-  192, 192, 4, 4, 130, 161, 161, 192, 36, 67, 99, 130, 5, 36, 68, 99,
-  193, 224, 162, 193, 224, 224, 131, 162, 37, 68, 100, 131, 5, 5, 194, 225,
-  225, 256, 256, 256, 163, 194, 69, 100, 132, 163, 6, 37, 226, 257, 6, 6,
-  195, 226, 257, 288, 101, 132, 288, 288, 38, 69, 164, 195, 133, 164, 258, 289,
-  227, 258, 196, 227, 7, 38, 289, 320, 70, 101, 320, 320, 7, 7, 165, 196,
-  39, 70, 102, 133, 290, 321, 259, 290, 228, 259, 321, 352, 352, 352, 197, 228,
-  134, 165, 71, 102, 8, 39, 322, 353, 291, 322, 260, 291, 103, 134, 353, 384,
-  166, 197, 229, 260, 40, 71, 8, 8, 384, 384, 135, 166, 354, 385, 323, 354,
-  198, 229, 292, 323, 72, 103, 261, 292, 9, 40, 385, 416, 167, 198, 104, 135,
-  230, 261, 355, 386, 416, 416, 293, 324, 324, 355, 9, 9, 41, 72, 386, 417,
-  199, 230, 136, 167, 417, 448, 262, 293, 356, 387, 73, 104, 387, 418, 231, 262,
-  10, 41, 168, 199, 325, 356, 418, 449, 105, 136, 448, 448, 42, 73, 294, 325,
-  200, 231, 10, 10, 357, 388, 137, 168, 263, 294, 388, 419, 74, 105, 419, 450,
-  449, 480, 326, 357, 232, 263, 295, 326, 169, 200, 11, 42, 106, 137, 480, 480,
-  450, 481, 358, 389, 264, 295, 201, 232, 138, 169, 389, 420, 43, 74, 420, 451,
-  327, 358, 11, 11, 481, 512, 233, 264, 451, 482, 296, 327, 75, 106, 170, 201,
-  482, 513, 512, 512, 390, 421, 359, 390, 421, 452, 107, 138, 12, 43, 202, 233,
-  452, 483, 265, 296, 328, 359, 139, 170, 44, 75, 483, 514, 513, 544, 234, 265,
-  297, 328, 422, 453, 12, 12, 391, 422, 171, 202, 76, 107, 514, 545, 453, 484,
-  544, 544, 266, 297, 203, 234, 108, 139, 329, 360, 298, 329, 140, 171, 515,
-  546, 13, 44, 423, 454, 235, 266, 545, 576, 454, 485, 45, 76, 172, 203, 330,
-  361, 576, 576, 13, 13, 267, 298, 546, 577, 77, 108, 204, 235, 455, 486, 577,
-  608, 299, 330, 109, 140, 547, 578, 14, 45, 14, 14, 141, 172, 578, 609, 331,
-  362, 46, 77, 173, 204, 15, 15, 78, 109, 205, 236, 579, 610, 110, 141, 15, 46,
-  142, 173, 47, 78, 174, 205, 16, 16, 79, 110, 206, 237, 16, 47, 111, 142,
-  48, 79, 143, 174, 80, 111, 175, 206, 17, 48, 17, 17, 207, 238, 49, 80,
-  81, 112, 18, 18, 18, 49, 50, 81, 82, 113, 19, 50, 51, 82, 83, 114, 608, 608,
-  484, 515, 360, 391, 236, 267, 112, 143, 19, 19, 640, 640, 609, 640, 516, 547,
-  485, 516, 392, 423, 361, 392, 268, 299, 237, 268, 144, 175, 113, 144, 20, 51,
-  20, 20, 672, 672, 641, 672, 610, 641, 548, 579, 517, 548, 486, 517, 424, 455,
-  393, 424, 362, 393, 300, 331, 269, 300, 238, 269, 176, 207, 145, 176, 114,
-  145, 52, 83, 21, 52, 21, 21, 704, 704, 673, 704, 642, 673, 611, 642, 580,
-  611, 549, 580, 518, 549, 487, 518, 456, 487, 425, 456, 394, 425, 363, 394,
-  332, 363, 301, 332, 270, 301, 239, 270, 208, 239, 177, 208, 146, 177, 115,
-  146, 84, 115, 53, 84, 22, 53, 22, 22, 705, 736, 674, 705, 643, 674, 581, 612,
-  550, 581, 519, 550, 457, 488, 426, 457, 395, 426, 333, 364, 302, 333, 271,
-  302, 209, 240, 178, 209, 147, 178, 85, 116, 54, 85, 23, 54, 706, 737, 675,
-  706, 582, 613, 551, 582, 458, 489, 427, 458, 334, 365, 303, 334, 210, 241,
-  179, 210, 86, 117, 55, 86, 707, 738, 583, 614, 459, 490, 335, 366, 211, 242,
-  87, 118, 736, 736, 612, 643, 488, 519, 364, 395, 240, 271, 116, 147, 23, 23,
-  768, 768, 737, 768, 644, 675, 613, 644, 520, 551, 489, 520, 396, 427, 365,
-  396, 272, 303, 241, 272, 148, 179, 117, 148, 24, 55, 24, 24, 800, 800, 769,
-  800, 738, 769, 676, 707, 645, 676, 614, 645, 552, 583, 521, 552, 490, 521,
-  428, 459, 397, 428, 366, 397, 304, 335, 273, 304, 242, 273, 180, 211, 149,
-  180, 118, 149, 56, 87, 25, 56, 25, 25, 832, 832, 801, 832, 770, 801, 739,
-  770, 708, 739, 677, 708, 646, 677, 615, 646, 584, 615, 553, 584, 522, 553,
-  491, 522, 460, 491, 429, 460, 398, 429, 367, 398, 336, 367, 305, 336, 274,
-  305, 243, 274, 212, 243, 181, 212, 150, 181, 119, 150, 88, 119, 57, 88, 26,
-  57, 26, 26, 833, 864, 802, 833, 771, 802, 709, 740, 678, 709, 647, 678, 585,
-  616, 554, 585, 523, 554, 461, 492, 430, 461, 399, 430, 337, 368, 306, 337,
-  275, 306, 213, 244, 182, 213, 151, 182, 89, 120, 58, 89, 27, 58, 834, 865,
-  803, 834, 710, 741, 679, 710, 586, 617, 555, 586, 462, 493, 431, 462, 338,
-  369, 307, 338, 214, 245, 183, 214, 90, 121, 59, 90, 835, 866, 711, 742, 587,
-  618, 463, 494, 339, 370, 215, 246, 91, 122, 864, 864, 740, 771, 616, 647,
-  492, 523, 368, 399, 244, 275, 120, 151, 27, 27, 896, 896, 865, 896, 772, 803,
-  741, 772, 648, 679, 617, 648, 524, 555, 493, 524, 400, 431, 369, 400, 276,
-  307, 245, 276, 152, 183, 121, 152, 28, 59, 28, 28, 928, 928, 897, 928, 866,
-  897, 804, 835, 773, 804, 742, 773, 680, 711, 649, 680, 618, 649, 556, 587,
-  525, 556, 494, 525, 432, 463, 401, 432, 370, 401, 308, 339, 277, 308, 246,
-  277, 184, 215, 153, 184, 122, 153, 60, 91, 29, 60, 29, 29, 960, 960, 929,
-  960, 898, 929, 867, 898, 836, 867, 805, 836, 774, 805, 743, 774, 712, 743,
-  681, 712, 650, 681, 619, 650, 588, 619, 557, 588, 526, 557, 495, 526, 464,
-  495, 433, 464, 402, 433, 371, 402, 340, 371, 309, 340, 278, 309, 247, 278,
-  216, 247, 185, 216, 154, 185, 123, 154, 92, 123, 61, 92, 30, 61, 30, 30,
-  961, 992, 930, 961, 899, 930, 837, 868, 806, 837, 775, 806, 713, 744, 682,
-  713, 651, 682, 589, 620, 558, 589, 527, 558, 465, 496, 434, 465, 403, 434,
-  341, 372, 310, 341, 279, 310, 217, 248, 186, 217, 155, 186, 93, 124, 62, 93,
-  31, 62, 962, 993, 931, 962, 838, 869, 807, 838, 714, 745, 683, 714, 590, 621,
-  559, 590, 466, 497, 435, 466, 342, 373, 311, 342, 218, 249, 187, 218, 94,
-  125, 63, 94, 963, 994, 839, 870, 715, 746, 591, 622, 467, 498, 343, 374, 219,
-  250, 95, 126, 868, 899, 744, 775, 620, 651, 496, 527, 372, 403, 248, 279,
-  124, 155, 900, 931, 869, 900, 776, 807, 745, 776, 652, 683, 621, 652, 528,
-  559, 497, 528, 404, 435, 373, 404, 280, 311, 249, 280, 156, 187, 125, 156,
-  932, 963, 901, 932, 870, 901, 808, 839, 777, 808, 746, 777, 684, 715, 653,
-  684, 622, 653, 560, 591, 529, 560, 498, 529, 436, 467, 405, 436, 374, 405,
-  312, 343, 281, 312, 250, 281, 188, 219, 157, 188, 126, 157, 964, 995, 933,
-  964, 902, 933, 871, 902, 840, 871, 809, 840, 778, 809, 747, 778, 716, 747,
-  685, 716, 654, 685, 623, 654, 592, 623, 561, 592, 530, 561, 499, 530, 468,
-  499, 437, 468, 406, 437, 375, 406, 344, 375, 313, 344, 282, 313, 251, 282,
-  220, 251, 189, 220, 158, 189, 127, 158, 965, 996, 934, 965, 903, 934, 841,
-  872, 810, 841, 779, 810, 717, 748, 686, 717, 655, 686, 593, 624, 562, 593,
-  531, 562, 469, 500, 438, 469, 407, 438, 345, 376, 314, 345, 283, 314, 221,
-  252, 190, 221, 159, 190, 966, 997, 935, 966, 842, 873, 811, 842, 718, 749,
-  687, 718, 594, 625, 563, 594, 470, 501, 439, 470, 346, 377, 315, 346, 222,
-  253, 191, 222, 967, 998, 843, 874, 719, 750, 595, 626, 471, 502, 347, 378,
-  223, 254, 872, 903, 748, 779, 624, 655, 500, 531, 376, 407, 252, 283, 904,
-  935, 873, 904, 780, 811, 749, 780, 656, 687, 625, 656, 532, 563, 501, 532,
-  408, 439, 377, 408, 284, 315, 253, 284, 936, 967, 905, 936, 874, 905, 812,
-  843, 781, 812, 750, 781, 688, 719, 657, 688, 626, 657, 564, 595, 533, 564,
-  502, 533, 440, 471, 409, 440, 378, 409, 316, 347, 285, 316, 254, 285, 968,
-  999, 937, 968, 906, 937, 875, 906, 844, 875, 813, 844, 782, 813, 751, 782,
-  720, 751, 689, 720, 658, 689, 627, 658, 596, 627, 565, 596, 534, 565, 503,
-  534, 472, 503, 441, 472, 410, 441, 379, 410, 348, 379, 317, 348, 286, 317,
-  255, 286, 969, 1000, 938, 969, 907, 938, 845, 876, 814, 845, 783, 814, 721,
-  752, 690, 721, 659, 690, 597, 628, 566, 597, 535, 566, 473, 504, 442, 473,
-  411, 442, 349, 380, 318, 349, 287, 318, 970, 1001, 939, 970, 846, 877, 815,
-  846, 722, 753, 691, 722, 598, 629, 567, 598, 474, 505, 443, 474, 350, 381,
-  319, 350, 971, 1002, 847, 878, 723, 754, 599, 630, 475, 506, 351, 382, 876,
-  907, 752, 783, 628, 659, 504, 535, 380, 411, 908, 939, 877, 908, 784, 815,
-  753, 784, 660, 691, 629, 660, 536, 567, 505, 536, 412, 443, 381, 412, 940,
-  971, 909, 940, 878, 909, 816, 847, 785, 816, 754, 785, 692, 723, 661, 692,
-  630, 661, 568, 599, 537, 568, 506, 537, 444, 475, 413, 444, 382, 413, 972,
-  1003, 941, 972, 910, 941, 879, 910, 848, 879, 817, 848, 786, 817, 755, 786,
-  724, 755, 693, 724, 662, 693, 631, 662, 600, 631, 569, 600, 538, 569, 507,
-  538, 476, 507, 445, 476, 414, 445, 383, 414, 973, 1004, 942, 973, 911, 942,
-  849, 880, 818, 849, 787, 818, 725, 756, 694, 725, 663, 694, 601, 632, 570,
-  601, 539, 570, 477, 508, 446, 477, 415, 446, 974, 1005, 943, 974, 850, 881,
-  819, 850, 726, 757, 695, 726, 602, 633, 571, 602, 478, 509, 447, 478, 975,
-  1006, 851, 882, 727, 758, 603, 634, 479, 510, 880, 911, 756, 787, 632, 663,
-  508, 539, 912, 943, 881, 912, 788, 819, 757, 788, 664, 695, 633, 664, 540,
-  571, 509, 540, 944, 975, 913, 944, 882, 913, 820, 851, 789, 820, 758, 789,
-  696, 727, 665, 696, 634, 665, 572, 603, 541, 572, 510, 541, 976, 1007, 945,
-  976, 914, 945, 883, 914, 852, 883, 821, 852, 790, 821, 759, 790, 728, 759,
-  697, 728, 666, 697, 635, 666, 604, 635, 573, 604, 542, 573, 511, 542, 977,
-  1008, 946, 977, 915, 946, 853, 884, 822, 853, 791, 822, 729, 760, 698, 729,
-  667, 698, 605, 636, 574, 605, 543, 574, 978, 1009, 947, 978, 854, 885, 823,
-  854, 730, 761, 699, 730, 606, 637, 575, 606, 979, 1010, 855, 886, 731, 762,
-  607, 638, 884, 915, 760, 791, 636, 667, 916, 947, 885, 916, 792, 823, 761,
-  792, 668, 699, 637, 668, 948, 979, 917, 948, 886, 917, 824, 855, 793, 824,
-  762, 793, 700, 731, 669, 700, 638, 669, 980, 1011, 949, 980, 918, 949, 887,
-  918, 856, 887, 825, 856, 794, 825, 763, 794, 732, 763, 701, 732, 670, 701,
-  639, 670, 981, 1012, 950, 981, 919, 950, 857, 888, 826, 857, 795, 826, 733,
-  764, 702, 733, 671, 702, 982, 1013, 951, 982, 858, 889, 827, 858, 734, 765,
-  703, 734, 983, 1014, 859, 890, 735, 766, 888, 919, 764, 795, 920, 951, 889,
-  920, 796, 827, 765, 796, 952, 983, 921, 952, 890, 921, 828, 859, 797, 828,
-  766, 797, 984, 1015, 953, 984, 922, 953, 891, 922, 860, 891, 829, 860, 798,
-  829, 767, 798, 985, 1016, 954, 985, 923, 954, 861, 892, 830, 861, 799, 830,
-  986, 1017, 955, 986, 862, 893, 831, 862, 987, 1018, 863, 894, 892, 923, 924,
-  955, 893, 924, 956, 987, 925, 956, 894, 925, 988, 1019, 957, 988, 926, 957,
-  895, 926, 989, 1020, 958, 989, 927, 958, 990, 1021, 959, 990, 991, 1022, 0, 0,
+  0,   0,   0,   0,  32,   0,  32,  32,   1,  32,  33,   1,
+  64,  64,  33,  64,   2,  33,  96,  96,  34,   2,  65,  96,
+  34,  65, 128, 128,  97, 128,   3,  34,  66,  97,  35,   3,
+  35,  66,  98, 129, 129, 160, 160, 161,   4,  35,  67,  98,
+  192, 192,  36,   4, 130, 161, 161, 192,  36,  67,  99, 130,
+  5,  36,  68,  99, 193, 224, 162, 193, 224, 225, 131, 162,
+  37,  68, 100, 131,  37,   5, 194, 225, 225, 256, 256, 257,
+  163, 194,  69, 100, 132, 163,   6,  37, 226, 257,  38,   6,
+  195, 226, 257, 288, 101, 132, 288, 289,  38,  69, 164, 195,
+  133, 164, 258, 289, 227, 258, 196, 227,   7,  38, 289, 320,
+  70, 101, 320, 321,  39,   7, 165, 196,  39,  70, 102, 133,
+  290, 321, 259, 290, 228, 259, 321, 352, 352, 353, 197, 228,
+  134, 165,  71, 102,   8,  39, 322, 353, 291, 322, 260, 291,
+  103, 134, 353, 384, 166, 197, 229, 260,  40,  71,  40,   8,
+  384, 385, 135, 166, 354, 385, 323, 354, 198, 229, 292, 323,
+  72, 103, 261, 292,   9,  40, 385, 416, 167, 198, 104, 135,
+  230, 261, 355, 386, 416, 417, 293, 324, 324, 355,  41,   9,
+  41,  72, 386, 417, 199, 230, 136, 167, 417, 448, 262, 293,
+  356, 387,  73, 104, 387, 418, 231, 262,  10,  41, 168, 199,
+  325, 356, 418, 449, 105, 136, 448, 449,  42,  73, 294, 325,
+  200, 231,  42,  10, 357, 388, 137, 168, 263, 294, 388, 419,
+  74, 105, 419, 450, 449, 480, 326, 357, 232, 263, 295, 326,
+  169, 200,  11,  42, 106, 137, 480, 481, 450, 481, 358, 389,
+  264, 295, 201, 232, 138, 169, 389, 420,  43,  74, 420, 451,
+  327, 358,  43,  11, 481, 512, 233, 264, 451, 482, 296, 327,
+  75, 106, 170, 201, 482, 513, 512, 513, 390, 421, 359, 390,
+  421, 452, 107, 138,  12,  43, 202, 233, 452, 483, 265, 296,
+  328, 359, 139, 170,  44,  75, 483, 514, 513, 544, 234, 265,
+  297, 328, 422, 453,  44,  12, 391, 422, 171, 202,  76, 107,
+  514, 545, 453, 484, 544, 545, 266, 297, 203, 234, 108, 139,
+  329, 360, 298, 329, 140, 171, 515, 546,  13,  44, 423, 454,
+  235, 266, 545, 576, 454, 485,  45,  76, 172, 203, 330, 361,
+  576, 577,  45,  13, 267, 298, 546, 577,  77, 108, 204, 235,
+  455, 486, 577, 608, 299, 330, 109, 140, 547, 578,  14,  45,
+  46,  14, 141, 172, 578, 609, 331, 362,  46,  77, 173, 204,
+  15,  15,  78, 109, 205, 236, 579, 610, 110, 141,  15,  46,
+  142, 173,  47,  78, 174, 205,  16,  16,  79, 110, 206, 237,
+  16,  47, 111, 142,  48,  79, 143, 174,  80, 111, 175, 206,
+  17,  48,  49,  17, 207, 238,  49,  80,  81, 112,  18,  18,
+  18,  49,  50,  81,  82, 113,  19,  50,  51,  82,  83, 114,
+  608, 609, 484, 515, 360, 391, 236, 267, 112, 143,  51,  19,
+  640, 640, 609, 640, 516, 547, 485, 516, 392, 423, 361, 392,
+  268, 299, 237, 268, 144, 175, 113, 144,  20,  51,  52,  20,
+  672, 672, 641, 672, 610, 641, 548, 579, 517, 548, 486, 517,
+  424, 455, 393, 424, 362, 393, 300, 331, 269, 300, 238, 269,
+  176, 207, 145, 176, 114, 145,  52,  83,  21,  52,  53,  21,
+  704, 704, 673, 704, 642, 673, 611, 642, 580, 611, 549, 580,
+  518, 549, 487, 518, 456, 487, 425, 456, 394, 425, 363, 394,
+  332, 363, 301, 332, 270, 301, 239, 270, 208, 239, 177, 208,
+  146, 177, 115, 146,  84, 115,  53,  84,  22,  53,  54,  22,
+  705, 736, 674, 705, 643, 674, 581, 612, 550, 581, 519, 550,
+  457, 488, 426, 457, 395, 426, 333, 364, 302, 333, 271, 302,
+  209, 240, 178, 209, 147, 178,  85, 116,  54,  85,  23,  54,
+  706, 737, 675, 706, 582, 613, 551, 582, 458, 489, 427, 458,
+  334, 365, 303, 334, 210, 241, 179, 210,  86, 117,  55,  86,
+  707, 738, 583, 614, 459, 490, 335, 366, 211, 242,  87, 118,
+  736, 737, 612, 643, 488, 519, 364, 395, 240, 271, 116, 147,
+  55,  23, 768, 768, 737, 768, 644, 675, 613, 644, 520, 551,
+  489, 520, 396, 427, 365, 396, 272, 303, 241, 272, 148, 179,
+  117, 148,  24,  55,  56,  24, 800, 800, 769, 800, 738, 769,
+  676, 707, 645, 676, 614, 645, 552, 583, 521, 552, 490, 521,
+  428, 459, 397, 428, 366, 397, 304, 335, 273, 304, 242, 273,
+  180, 211, 149, 180, 118, 149,  56,  87,  25,  56,  57,  25,
+  832, 832, 801, 832, 770, 801, 739, 770, 708, 739, 677, 708,
+  646, 677, 615, 646, 584, 615, 553, 584, 522, 553, 491, 522,
+  460, 491, 429, 460, 398, 429, 367, 398, 336, 367, 305, 336,
+  274, 305, 243, 274, 212, 243, 181, 212, 150, 181, 119, 150,
+  88, 119,  57,  88,  26,  57,  58,  26, 833, 864, 802, 833,
+  771, 802, 709, 740, 678, 709, 647, 678, 585, 616, 554, 585,
+  523, 554, 461, 492, 430, 461, 399, 430, 337, 368, 306, 337,
+  275, 306, 213, 244, 182, 213, 151, 182,  89, 120,  58,  89,
+  27,  58, 834, 865, 803, 834, 710, 741, 679, 710, 586, 617,
+  555, 586, 462, 493, 431, 462, 338, 369, 307, 338, 214, 245,
+  183, 214,  90, 121,  59,  90, 835, 866, 711, 742, 587, 618,
+  463, 494, 339, 370, 215, 246,  91, 122, 864, 865, 740, 771,
+  616, 647, 492, 523, 368, 399, 244, 275, 120, 151,  59,  27,
+  896, 896, 865, 896, 772, 803, 741, 772, 648, 679, 617, 648,
+  524, 555, 493, 524, 400, 431, 369, 400, 276, 307, 245, 276,
+  152, 183, 121, 152,  28,  59,  60,  28, 928, 928, 897, 928,
+  866, 897, 804, 835, 773, 804, 742, 773, 680, 711, 649, 680,
+  618, 649, 556, 587, 525, 556, 494, 525, 432, 463, 401, 432,
+  370, 401, 308, 339, 277, 308, 246, 277, 184, 215, 153, 184,
+  122, 153,  60,  91,  29,  60,  61,  29, 960, 960, 929, 960,
+  898, 929, 867, 898, 836, 867, 805, 836, 774, 805, 743, 774,
+  712, 743, 681, 712, 650, 681, 619, 650, 588, 619, 557, 588,
+  526, 557, 495, 526, 464, 495, 433, 464, 402, 433, 371, 402,
+  340, 371, 309, 340, 278, 309, 247, 278, 216, 247, 185, 216,
+  154, 185, 123, 154,  92, 123,  61,  92,  30,  61,  62,  30,
+  961, 992, 930, 961, 899, 930, 837, 868, 806, 837, 775, 806,
+  713, 744, 682, 713, 651, 682, 589, 620, 558, 589, 527, 558,
+  465, 496, 434, 465, 403, 434, 341, 372, 310, 341, 279, 310,
+  217, 248, 186, 217, 155, 186,  93, 124,  62,  93,  31,  62,
+  962, 993, 931, 962, 838, 869, 807, 838, 714, 745, 683, 714,
+  590, 621, 559, 590, 466, 497, 435, 466, 342, 373, 311, 342,
+  218, 249, 187, 218,  94, 125,  63,  94, 963, 994, 839, 870,
+  715, 746, 591, 622, 467, 498, 343, 374, 219, 250,  95, 126,
+  868, 899, 744, 775, 620, 651, 496, 527, 372, 403, 248, 279,
+  124, 155, 900, 931, 869, 900, 776, 807, 745, 776, 652, 683,
+  621, 652, 528, 559, 497, 528, 404, 435, 373, 404, 280, 311,
+  249, 280, 156, 187, 125, 156, 932, 963, 901, 932, 870, 901,
+  808, 839, 777, 808, 746, 777, 684, 715, 653, 684, 622, 653,
+  560, 591, 529, 560, 498, 529, 436, 467, 405, 436, 374, 405,
+  312, 343, 281, 312, 250, 281, 188, 219, 157, 188, 126, 157,
+  964, 995, 933, 964, 902, 933, 871, 902, 840, 871, 809, 840,
+  778, 809, 747, 778, 716, 747, 685, 716, 654, 685, 623, 654,
+  592, 623, 561, 592, 530, 561, 499, 530, 468, 499, 437, 468,
+  406, 437, 375, 406, 344, 375, 313, 344, 282, 313, 251, 282,
+  220, 251, 189, 220, 158, 189, 127, 158, 965, 996, 934, 965,
+  903, 934, 841, 872, 810, 841, 779, 810, 717, 748, 686, 717,
+  655, 686, 593, 624, 562, 593, 531, 562, 469, 500, 438, 469,
+  407, 438, 345, 376, 314, 345, 283, 314, 221, 252, 190, 221,
+  159, 190, 966, 997, 935, 966, 842, 873, 811, 842, 718, 749,
+  687, 718, 594, 625, 563, 594, 470, 501, 439, 470, 346, 377,
+  315, 346, 222, 253, 191, 222, 967, 998, 843, 874, 719, 750,
+  595, 626, 471, 502, 347, 378, 223, 254, 872, 903, 748, 779,
+  624, 655, 500, 531, 376, 407, 252, 283, 904, 935, 873, 904,
+  780, 811, 749, 780, 656, 687, 625, 656, 532, 563, 501, 532,
+  408, 439, 377, 408, 284, 315, 253, 284, 936, 967, 905, 936,
+  874, 905, 812, 843, 781, 812, 750, 781, 688, 719, 657, 688,
+  626, 657, 564, 595, 533, 564, 502, 533, 440, 471, 409, 440,
+  378, 409, 316, 347, 285, 316, 254, 285, 968, 999, 937, 968,
+  906, 937, 875, 906, 844, 875, 813, 844, 782, 813, 751, 782,
+  720, 751, 689, 720, 658, 689, 627, 658, 596, 627, 565, 596,
+  534, 565, 503, 534, 472, 503, 441, 472, 410, 441, 379, 410,
+  348, 379, 317, 348, 286, 317, 255, 286, 969, 1000, 938, 969,
+  907, 938, 845, 876, 814, 845, 783, 814, 721, 752, 690, 721,
+  659, 690, 597, 628, 566, 597, 535, 566, 473, 504, 442, 473,
+  411, 442, 349, 380, 318, 349, 287, 318, 970, 1001, 939, 970,
+  846, 877, 815, 846, 722, 753, 691, 722, 598, 629, 567, 598,
+  474, 505, 443, 474, 350, 381, 319, 350, 971, 1002, 847, 878,
+  723, 754, 599, 630, 475, 506, 351, 382, 876, 907, 752, 783,
+  628, 659, 504, 535, 380, 411, 908, 939, 877, 908, 784, 815,
+  753, 784, 660, 691, 629, 660, 536, 567, 505, 536, 412, 443,
+  381, 412, 940, 971, 909, 940, 878, 909, 816, 847, 785, 816,
+  754, 785, 692, 723, 661, 692, 630, 661, 568, 599, 537, 568,
+  506, 537, 444, 475, 413, 444, 382, 413, 972, 1003, 941, 972,
+  910, 941, 879, 910, 848, 879, 817, 848, 786, 817, 755, 786,
+  724, 755, 693, 724, 662, 693, 631, 662, 600, 631, 569, 600,
+  538, 569, 507, 538, 476, 507, 445, 476, 414, 445, 383, 414,
+  973, 1004, 942, 973, 911, 942, 849, 880, 818, 849, 787, 818,
+  725, 756, 694, 725, 663, 694, 601, 632, 570, 601, 539, 570,
+  477, 508, 446, 477, 415, 446, 974, 1005, 943, 974, 850, 881,
+  819, 850, 726, 757, 695, 726, 602, 633, 571, 602, 478, 509,
+  447, 478, 975, 1006, 851, 882, 727, 758, 603, 634, 479, 510,
+  880, 911, 756, 787, 632, 663, 508, 539, 912, 943, 881, 912,
+  788, 819, 757, 788, 664, 695, 633, 664, 540, 571, 509, 540,
+  944, 975, 913, 944, 882, 913, 820, 851, 789, 820, 758, 789,
+  696, 727, 665, 696, 634, 665, 572, 603, 541, 572, 510, 541,
+  976, 1007, 945, 976, 914, 945, 883, 914, 852, 883, 821, 852,
+  790, 821, 759, 790, 728, 759, 697, 728, 666, 697, 635, 666,
+  604, 635, 573, 604, 542, 573, 511, 542, 977, 1008, 946, 977,
+  915, 946, 853, 884, 822, 853, 791, 822, 729, 760, 698, 729,
+  667, 698, 605, 636, 574, 605, 543, 574, 978, 1009, 947, 978,
+  854, 885, 823, 854, 730, 761, 699, 730, 606, 637, 575, 606,
+  979, 1010, 855, 886, 731, 762, 607, 638, 884, 915, 760, 791,
+  636, 667, 916, 947, 885, 916, 792, 823, 761, 792, 668, 699,
+  637, 668, 948, 979, 917, 948, 886, 917, 824, 855, 793, 824,
+  762, 793, 700, 731, 669, 700, 638, 669, 980, 1011, 949, 980,
+  918, 949, 887, 918, 856, 887, 825, 856, 794, 825, 763, 794,
+  732, 763, 701, 732, 670, 701, 639, 670, 981, 1012, 950, 981,
+  919, 950, 857, 888, 826, 857, 795, 826, 733, 764, 702, 733,
+  671, 702, 982, 1013, 951, 982, 858, 889, 827, 858, 734, 765,
+  703, 734, 983, 1014, 859, 890, 735, 766, 888, 919, 764, 795,
+  920, 951, 889, 920, 796, 827, 765, 796, 952, 983, 921, 952,
+  890, 921, 828, 859, 797, 828, 766, 797, 984, 1015, 953, 984,
+  922, 953, 891, 922, 860, 891, 829, 860, 798, 829, 767, 798,
+  985, 1016, 954, 985, 923, 954, 861, 892, 830, 861, 799, 830,
+  986, 1017, 955, 986, 862, 893, 831, 862, 987, 1018, 863, 894,
+  892, 923, 924, 955, 893, 924, 956, 987, 925, 956, 894, 925,
+  988, 1019, 957, 988, 926, 957, 895, 926, 989, 1020, 958, 989,
+  927, 958, 990, 1021, 959, 990, 991, 1022,   0,   0,
 };
 
 #if CONFIG_EXT_TX
diff --git a/vp10/common/thread_common.c b/vp10/common/thread_common.c
index b2339c6..a94aafd 100644
--- a/vp10/common/thread_common.c
+++ b/vp10/common/thread_common.c
@@ -94,7 +94,7 @@
                              int start, int stop, int y_only,
                              VP9LfSync *const lf_sync) {
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
-  const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
+  const int sb_cols = mi_cols_aligned_to_sb(cm) >> cm->mib_size_log2;
   int mi_row, mi_col;
 #if !CONFIG_EXT_PARTITION_TYPES
   enum lf_path path;
@@ -116,12 +116,12 @@
 #endif  // CONFIG_EXT_PARTITION
 
   for (mi_row = start; mi_row < stop;
-       mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {
+       mi_row += lf_sync->num_workers * cm->mib_size) {
     MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
 
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
-      const int r = mi_row >> MI_BLOCK_SIZE_LOG2;
-      const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += cm->mib_size) {
+      const int r = mi_row >> cm->mib_size_log2;
+      const int c = mi_col >> cm->mib_size_log2;
       int plane;
 
       sync_read(lf_sync, r, c);
@@ -175,7 +175,7 @@
                                 VP9LfSync *lf_sync) {
   const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
   // Number of superblock rows and cols
-  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  const int sb_rows = mi_rows_aligned_to_sb(cm) >> cm->mib_size_log2;
   // Decoder may allocate more threads than number of tiles based on user's
   // input.
   const int tile_cols = cm->tile_cols;
@@ -215,7 +215,7 @@
 
     // Loopfilter data
     vp10_loop_filter_data_reset(lf_data, frame, cm, planes);
-    lf_data->start = start + i * MI_BLOCK_SIZE;
+    lf_data->start = start + i * cm->mib_size;
     lf_data->stop = stop;
     lf_data->y_only = y_only;
 
@@ -428,10 +428,14 @@
       cm->counts.inter_compound_mode[i][j] +=
           counts->inter_compound_mode[i][j];
 
-  for (i = 0; i < BLOCK_SIZES; i++)
+  for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
     for (j = 0; j < 2; j++)
       cm->counts.interintra[i][j] += counts->interintra[i][j];
 
+  for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
+    for (j = 0; j < INTERINTRA_MODES; j++)
+      cm->counts.interintra_mode[i][j] += counts->interintra_mode[i][j];
+
   for (i = 0; i < BLOCK_SIZES; i++)
     for (j = 0; j < 2; j++)
       cm->counts.wedge_interintra[i][j] += counts->wedge_interintra[i][j];
diff --git a/vp10/common/tile_common.c b/vp10/common/tile_common.c
index de5f921..04b19eb 100644
--- a/vp10/common/tile_common.c
+++ b/vp10/common/tile_common.c
@@ -12,9 +12,6 @@
 #include "vp10/common/onyxc_int.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
-#define MIN_TILE_WIDTH_B64 4
-#define MAX_TILE_WIDTH_B64 64
-
 void vp10_tile_set_row(TileInfo *tile, const VP10_COMMON *cm, int row) {
   tile->mi_row_start = row * cm->tile_height;
   tile->mi_row_end   = VPXMIN(tile->mi_row_start + cm->tile_height,
@@ -33,26 +30,35 @@
 }
 
 #if !CONFIG_EXT_TILE
-// TODO(geza.lore): CU_SIZE dependent.
-static int get_min_log2_tile_cols(const int sb64_cols) {
+
+# if CONFIG_EXT_PARTITION
+#   define MIN_TILE_WIDTH_MAX_SB 2
+#   define MAX_TILE_WIDTH_MAX_SB 32
+# else
+#   define MIN_TILE_WIDTH_MAX_SB 4
+#   define MAX_TILE_WIDTH_MAX_SB 64
+# endif  // CONFIG_EXT_PARTITION
+
+static int get_min_log2_tile_cols(const int max_sb_cols) {
   int min_log2 = 0;
-  while ((MAX_TILE_WIDTH_B64 << min_log2) < sb64_cols)
+  while ((MAX_TILE_WIDTH_MAX_SB << min_log2) < max_sb_cols)
     ++min_log2;
   return min_log2;
 }
 
-static int get_max_log2_tile_cols(const int sb64_cols) {
+static int get_max_log2_tile_cols(const int max_sb_cols) {
   int max_log2 = 1;
-  while ((sb64_cols >> max_log2) >= MIN_TILE_WIDTH_B64)
+  while ((max_sb_cols >> max_log2) >= MIN_TILE_WIDTH_MAX_SB)
     ++max_log2;
   return max_log2 - 1;
 }
 
-void vp10_get_tile_n_bits(int mi_cols,
+void vp10_get_tile_n_bits(const int mi_cols,
                           int *min_log2_tile_cols, int *max_log2_tile_cols) {
-  const int sb64_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
-  *min_log2_tile_cols = get_min_log2_tile_cols(sb64_cols);
-  *max_log2_tile_cols = get_max_log2_tile_cols(sb64_cols);
+  const int max_sb_cols =
+      ALIGN_POWER_OF_TWO(mi_cols, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2;
+  *min_log2_tile_cols = get_min_log2_tile_cols(max_sb_cols);
+  *max_log2_tile_cols = get_max_log2_tile_cols(max_sb_cols);
   assert(*min_log2_tile_cols <= *max_log2_tile_cols);
 }
 #endif  // !CONFIG_EXT_TILE
diff --git a/vp10/common/tile_common.h b/vp10/common/tile_common.h
index 09cf060..2babc89 100644
--- a/vp10/common/tile_common.h
+++ b/vp10/common/tile_common.h
@@ -30,8 +30,8 @@
 void vp10_tile_set_row(TileInfo *tile, const struct VP10Common *cm, int row);
 void vp10_tile_set_col(TileInfo *tile, const struct VP10Common *cm, int col);
 
-void vp10_get_tile_n_bits(int mi_cols,
-                         int *min_log2_tile_cols, int *max_log2_tile_cols);
+void vp10_get_tile_n_bits(const int mi_cols,
+                          int *min_log2_tile_cols, int *max_log2_tile_cols);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index f592539..0e51b15 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -16,7 +16,7 @@
 #include "./vpx_scale_rtcd.h"
 
 #include "vpx_dsp/bitreader_buffer.h"
-#include "vpx_dsp/bitreader.h"
+#include "vp10/decoder/bitreader.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
@@ -100,14 +100,14 @@
   return vpx_rb_read_bit(rb) ? TX_MODE_SELECT : vpx_rb_read_literal(rb, 2);
 }
 
-static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
+static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp10_reader *r) {
   int i, j;
   for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
     for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
       vp10_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
 }
 
-static void read_inter_mode_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
+static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp10_reader *r) {
   int i;
 #if CONFIG_REF_MV
   for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
@@ -130,9 +130,9 @@
 }
 
 #if CONFIG_EXT_INTER
-static void read_inter_compound_mode_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
+static void read_inter_compound_mode_probs(FRAME_CONTEXT *fc, vp10_reader *r) {
   int i, j;
-  if (vpx_read(r, GROUP_DIFF_UPDATE_PROB)) {
+  if (vp10_read(r, GROUP_DIFF_UPDATE_PROB)) {
     for (j = 0; j < INTER_MODE_CONTEXTS; ++j) {
       for (i = 0; i < INTER_COMPOUND_MODES - 1; ++i) {
         vp10_diff_update_prob(r, &fc->inter_compound_mode_probs[j][i]);
@@ -153,7 +153,7 @@
   }
 }
 
-static void read_frame_reference_mode_probs(VP10_COMMON *cm, vpx_reader *r) {
+static void read_frame_reference_mode_probs(VP10_COMMON *cm, vp10_reader *r) {
   FRAME_CONTEXT *const fc = cm->fc;
   int i, j;
 
@@ -178,13 +178,13 @@
   }
 }
 
-static void update_mv_probs(vpx_prob *p, int n, vpx_reader *r) {
+static void update_mv_probs(vpx_prob *p, int n, vp10_reader *r) {
   int i;
   for (i = 0; i < n; ++i)
     vp10_diff_update_prob(r, &p[i]);
 }
 
-static void read_mv_probs(nmv_context *ctx, int allow_hp, vpx_reader *r) {
+static void read_mv_probs(nmv_context *ctx, int allow_hp, vp10_reader *r) {
   int i, j;
 
   update_mv_probs(ctx->joints, MV_JOINTS - 1, r);
@@ -262,7 +262,7 @@
                                          const rans_dec_lut *const token_tab,
                                                 struct AnsDecoder *const r,
 #else
-                                                vpx_reader *r,
+                                                vp10_reader *r,
 #endif  // CONFIG_ANS
                                                 MB_MODE_INFO *const mbmi,
                                                 int plane,
@@ -298,7 +298,7 @@
 }
 
 #if CONFIG_VAR_TX
-static void decode_reconstruct_tx(MACROBLOCKD *const xd, vpx_reader *r,
+static void decode_reconstruct_tx(MACROBLOCKD *const xd, vp10_reader *r,
                                   MB_MODE_INFO *const mbmi,
                                   int plane, BLOCK_SIZE plane_bsize,
                                   int block, int blk_row, int blk_col,
@@ -360,7 +360,7 @@
                                    const rans_dec_lut *const token_tab,
                                    struct AnsDecoder *const r,
 #else
-                                   vpx_reader *r,
+                                   vp10_reader *r,
 #endif
                                    MB_MODE_INFO *const mbmi, int plane,
                                    int row, int col, TX_SIZE tx_size) {
@@ -503,7 +503,7 @@
     buf_ptr = ((uint8_t *)mc_buf_high) + border_offset;
   }
 #if CONFIG_EXT_INTER
-  if (ref && get_wedge_bits(xd->mi[0]->mbmi.sb_type) &&
+  if (ref && is_interinter_wedge_used(xd->mi[0]->mbmi.sb_type) &&
       xd->mi[0]->mbmi.use_wedge_interinter)
     vp10_make_masked_inter_predictor(
         buf_ptr, b_w, dst, dst_buf_stride,
@@ -544,7 +544,7 @@
                   x0, y0, b_w, b_h, frame_width, frame_height);
   buf_ptr = mc_buf + border_offset;
 #if CONFIG_EXT_INTER
-  if (ref && get_wedge_bits(xd->mi[0]->mbmi.sb_type) &&
+  if (ref && is_interinter_wedge_used(xd->mi[0]->mbmi.sb_type) &&
       xd->mi[0]->mbmi.use_wedge_interinter)
     vp10_make_masked_inter_predictor(
         buf_ptr, b_w, dst, dst_buf_stride,
@@ -753,7 +753,7 @@
      }
   }
 #if CONFIG_EXT_INTER
-  if (ref && get_wedge_bits(mi->mbmi.sb_type) &&
+  if (ref && is_interinter_wedge_used(mi->mbmi.sb_type) &&
       mi->mbmi.use_wedge_interinter) {
     vp10_make_masked_inter_predictor(
         buf_ptr, buf_stride, dst, dst_buf->stride,
@@ -1027,7 +1027,11 @@
   set_mi_row_col(xd, tile, mi_row_pred, bh, mi_col_pred, bw,
                  cm->mi_rows, cm->mi_cols);
 
+#if CONFIG_EXT_TILE
+  xd->up_available    = (mi_row_ori > tile->mi_row_start);
+#else
   xd->up_available    = (mi_row_ori != 0);
+#endif  // CONFIG_EXT_TILE
   xd->left_available  = (mi_col_ori > tile->mi_col_start);
 
   set_plane_n4(xd, bw, bh, bwl, bhl);
@@ -1095,7 +1099,8 @@
     }
 #if CONFIG_VAR_TX
   xd->above_txfm_context = cm->above_txfm_context + mi_col;
-  xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MI_MASK);
+  xd->left_txfm_context =
+    xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
   set_txfm_ctx(xd->left_txfm_context, xd->mi[0]->mbmi.tx_size, bh);
   set_txfm_ctx(xd->above_txfm_context, xd->mi[0]->mbmi.tx_size, bw);
 #endif
@@ -1292,17 +1297,15 @@
                                    int mi_row_top, int mi_col_top,
                                    BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
                                    uint8_t *dst_buf[3], int dst_stride[3]) {
-  VP10_COMMON *const cm = &pbi->common;
-  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
-  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize;
-#if !CONFIG_EXT_PARTITION_TYPES
-  MB_MODE_INFO *mbmi;
-#endif
-  int i, offset = mi_row * cm->mi_stride + mi_col;
+  const VP10_COMMON *const cm = &pbi->common;
+  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
 #if CONFIG_EXT_PARTITION_TYPES
-  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+  const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
 #endif
+  int i;
+  const int mi_offset = mi_row * cm->mi_stride + mi_col;
   uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
 
   DECLARE_ALIGNED(16, uint8_t,
@@ -1345,16 +1348,8 @@
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  xd->mi = cm->mi_grid_visible + offset;
-  xd->mi[0] = cm->mi + offset;
-#if CONFIG_EXT_PARTITION_TYPES
-  partition = get_partition(cm->mi, cm->mi_stride, cm->mi_rows, cm->mi_cols,
-                            mi_row, mi_col, bsize);
-#else
-  mbmi = &xd->mi[0]->mbmi;
-  partition = partition_lookup[bsl][mbmi->sb_type];
-#endif
-  subsize = get_subsize(bsize, partition);
+  xd->mi = cm->mi_grid_visible + mi_offset;
+  xd->mi[0] = cm->mi + mi_offset;
 
   for (i = 0; i < MAX_MB_PLANE; i++) {
     xd->plane[i].dst.buf = dst_buf[i];
@@ -1808,10 +1803,7 @@
                          int supertx_enabled,
 #endif  // CONFIG_SUPERTX
                          int mi_row, int mi_col,
-                         vpx_reader *r,
-#if CONFIG_ANS
-                         struct AnsDecoder *const tok,
-#endif  // CONFIG_ANS
+                         vp10_reader *r,
 #if CONFIG_EXT_PARTITION_TYPES
                          PARTITION_TYPE partition,
 #endif  // CONFIG_EXT_PARTITION_TYPES
@@ -1887,10 +1879,9 @@
           for (col = 0; col < max_blocks_wide; col += step)
             predict_and_reconstruct_intra_block(xd,
 #if CONFIG_ANS
-                                                cm->token_tab, tok,
-#else
-                                                r,
+                                                cm->token_tab,
 #endif
+                                                r,
                                                 mbmi, plane,
                                                 row, col, tx_size);
       }
@@ -1991,10 +1982,9 @@
             for (col = 0; col < max_blocks_wide; col += step)
               eobtotal += reconstruct_inter_block(xd,
 #if CONFIG_ANS
-                                                  cm->token_tab, tok,
-#else
-                                                  r,
+                                                  cm->token_tab,
 #endif
+                                                  r,
                                                   mbmi, plane, row, col,
                                                   tx_size);
 #endif
@@ -2008,14 +1998,15 @@
   }
 #endif  // CONFIG_SUPERTX
 
-  xd->corrupted |= vpx_reader_has_error(r);
+  xd->corrupted |= vp10_reader_has_error(r);
 }
 
 static INLINE int dec_partition_plane_context(const MACROBLOCKD *xd,
                                               int mi_row, int mi_col,
                                               int bsl) {
   const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
-  const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+  const PARTITION_CONTEXT *left_ctx =
+    xd->left_seg_context + (mi_row & MAX_MIB_MASK);
   int above = (*above_ctx >> bsl) & 1 , left = (*left_ctx >> bsl) & 1;
 
 //  assert(bsl >= 0);
@@ -2029,7 +2020,8 @@
                                                 BLOCK_SIZE subsize,
                                                 int bw) {
   PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col;
-  PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+  PARTITION_CONTEXT *const left_ctx =
+    xd->left_seg_context + (mi_row & MAX_MIB_MASK);
 
   // update the partition context at the end notes. set partition bits
   // of block sizes larger than the current one to be one, and partition
@@ -2040,7 +2032,7 @@
 #endif  // !CONFIG_EXT_PARTITION_TYPES
 
 static PARTITION_TYPE read_partition(VP10_COMMON *cm, MACROBLOCKD *xd,
-                                     int mi_row, int mi_col, vpx_reader *r,
+                                     int mi_row, int mi_col, vp10_reader *r,
                                      int has_rows, int has_cols,
 #if CONFIG_EXT_PARTITION_TYPES
                                      BLOCK_SIZE bsize,
@@ -2054,16 +2046,16 @@
   if (has_rows && has_cols)
 #if CONFIG_EXT_PARTITION_TYPES
     if (bsize <= BLOCK_8X8)
-      p = (PARTITION_TYPE)vpx_read_tree(r, vp10_partition_tree, probs);
+      p = (PARTITION_TYPE)vp10_read_tree(r, vp10_partition_tree, probs);
     else
-      p = (PARTITION_TYPE)vpx_read_tree(r, vp10_ext_partition_tree, probs);
+      p = (PARTITION_TYPE)vp10_read_tree(r, vp10_ext_partition_tree, probs);
 #else
-    p = (PARTITION_TYPE)vpx_read_tree(r, vp10_partition_tree, probs);
+    p = (PARTITION_TYPE)vp10_read_tree(r, vp10_partition_tree, probs);
 #endif  // CONFIG_EXT_PARTITION_TYPES
   else if (!has_rows && has_cols)
-    p = vpx_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
+    p = vp10_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
   else if (has_rows && !has_cols)
-    p = vpx_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
+    p = vp10_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
   else
     p = PARTITION_SPLIT;
 
@@ -2075,9 +2067,9 @@
 
 #if CONFIG_SUPERTX
 static int read_skip_without_seg(VP10_COMMON *cm, const MACROBLOCKD *xd,
-                                 vpx_reader *r) {
+                                 vp10_reader *r) {
   const int ctx = vp10_get_skip_context(xd);
-  const int skip = vpx_read(r, cm->fc->skip_probs[ctx]);
+  const int skip = vp10_read(r, cm->fc->skip_probs[ctx]);
   FRAME_COUNTS *counts = xd->counts;
   if (counts)
     ++counts->skip[ctx][skip];
@@ -2091,10 +2083,7 @@
                              int supertx_enabled,
 #endif
                              int mi_row, int mi_col,
-                             vpx_reader* r,
-#if CONFIG_ANS
-                             struct AnsDecoder *const tok,
-#endif  // CONFIG_ANS
+                             vp10_reader* r,
                              BLOCK_SIZE bsize, int n4x4_l2) {
   VP10_COMMON *const cm = &pbi->common;
   const int n8x8_l2 = n4x4_l2 - 1;
@@ -2132,7 +2121,7 @@
       !xd->lossless[0]) {
     const int supertx_context =
         partition_supertx_context_lookup[partition];
-    supertx_enabled = vpx_read(
+    supertx_enabled = vp10_read(
         r, cm->fc->supertx_prob[supertx_context][supertx_size]);
     if (xd->counts)
       xd->counts->supertx[supertx_context][supertx_size][supertx_enabled]++;
@@ -2154,7 +2143,7 @@
       if (get_ext_tx_types(supertx_size, bsize, 1) > 1) {
         int eset = get_ext_tx_set(supertx_size, bsize, 1);
         if (eset > 0) {
-          txfm = vpx_read_tree(r, vp10_ext_tx_inter_tree[eset],
+          txfm = vp10_read_tree(r, vp10_ext_tx_inter_tree[eset],
                                cm->fc->inter_ext_tx_prob[eset][supertx_size]);
           if (xd->counts)
             ++xd->counts->inter_ext_tx[eset][supertx_size][txfm];
@@ -2162,7 +2151,7 @@
       }
 #else
       if (supertx_size < TX_32X32) {
-        txfm = vpx_read_tree(r, vp10_ext_tx_tree,
+        txfm = vp10_read_tree(r, vp10_ext_tx_tree,
                              cm->fc->inter_ext_tx_prob[supertx_size]);
         if (xd->counts)
           ++xd->counts->inter_ext_tx[supertx_size][txfm];
@@ -2183,9 +2172,6 @@
                  supertx_enabled,
 #endif  // CONFIG_SUPERTX
                  mi_row, mi_col, r,
-#if CONFIG_ANS
-                 tok,
-#endif  // CONFIG_ANS
 #if CONFIG_EXT_PARTITION_TYPES
                  partition,
 #endif  // CONFIG_EXT_PARTITION_TYPES
@@ -2198,9 +2184,6 @@
                      supertx_enabled,
 #endif  // CONFIG_SUPERTX
                      mi_row, mi_col, r,
-#if CONFIG_ANS
-                     tok,
-#endif  // CONFIG_ANS
 #if CONFIG_EXT_PARTITION_TYPES
                  partition,
 #endif  // CONFIG_EXT_PARTITION_TYPES
@@ -2212,9 +2195,6 @@
                      supertx_enabled,
 #endif  // CONFIG_SUPERTX
                      mi_row, mi_col, r,
-#if CONFIG_ANS
-                     tok,
-#endif  // CONFIG_ANS
 #if CONFIG_EXT_PARTITION_TYPES
                  partition,
 #endif  // CONFIG_EXT_PARTITION_TYPES
@@ -2225,9 +2205,6 @@
                        supertx_enabled,
 #endif  // CONFIG_SUPERTX
                        mi_row + hbs, mi_col, r,
-#if CONFIG_ANS
-                       tok,
-#endif  // CONFIG_ANS
 #if CONFIG_EXT_PARTITION_TYPES
                  partition,
 #endif  // CONFIG_EXT_PARTITION_TYPES
@@ -2239,9 +2216,6 @@
                      supertx_enabled,
 #endif  // CONFIG_SUPERTX
                      mi_row, mi_col, r,
-#if CONFIG_ANS
-                     tok,
-#endif  // CONFIG_ANS
 #if CONFIG_EXT_PARTITION_TYPES
                  partition,
 #endif  // CONFIG_EXT_PARTITION_TYPES
@@ -2252,9 +2226,6 @@
                        supertx_enabled,
 #endif  // CONFIG_SUPERTX
                        mi_row, mi_col + hbs, r,
-#if CONFIG_ANS
-                       tok,
-#endif  // CONFIG_ANS
 #if CONFIG_EXT_PARTITION_TYPES
                  partition,
 #endif  // CONFIG_EXT_PARTITION_TYPES
@@ -2266,36 +2237,24 @@
                          supertx_enabled,
 #endif  // CONFIG_SUPERTX
                          mi_row, mi_col, r,
-#if CONFIG_ANS
-                         tok,
-#endif  // CONFIG_ANS
                          subsize, n8x8_l2);
         decode_partition(pbi, xd,
 #if CONFIG_SUPERTX
                          supertx_enabled,
 #endif  // CONFIG_SUPERTX
                          mi_row, mi_col + hbs, r,
-#if CONFIG_ANS
-                         tok,
-#endif  // CONFIG_ANS
                          subsize, n8x8_l2);
         decode_partition(pbi, xd,
 #if CONFIG_SUPERTX
                          supertx_enabled,
 #endif  // CONFIG_SUPERTX
                          mi_row + hbs, mi_col, r,
-#if CONFIG_ANS
-                         tok,
-#endif  // CONFIG_ANS
                          subsize, n8x8_l2);
         decode_partition(pbi, xd,
 #if CONFIG_SUPERTX
                          supertx_enabled,
 #endif  // CONFIG_SUPERTX
                          mi_row + hbs, mi_col + hbs, r,
-#if CONFIG_ANS
-                         tok,
-#endif  // CONFIG_ANS
                          subsize, n8x8_l2);
         break;
 #if CONFIG_EXT_PARTITION_TYPES
@@ -2305,27 +2264,18 @@
                      supertx_enabled,
 #endif
                      mi_row,       mi_col,       r,
-#if CONFIG_ANS
-                     tok,
-#endif  // CONFIG_ANS
                      partition, bsize2, n8x8_l2, n8x8_l2);
         decode_block(pbi, xd,
 #if CONFIG_SUPERTX
                      supertx_enabled,
 #endif
                      mi_row,       mi_col + hbs, r,
-#if CONFIG_ANS
-                     tok,
-#endif  // CONFIG_ANS
                      partition, bsize2, n8x8_l2, n8x8_l2);
         decode_block(pbi, xd,
 #if CONFIG_SUPERTX
                      supertx_enabled,
 #endif
                      mi_row + hbs, mi_col, r,
-#if CONFIG_ANS
-                     tok,
-#endif  // CONFIG_ANS
                      partition, subsize, n4x4_l2, n8x8_l2);
         break;
       case PARTITION_HORZ_B:
@@ -2334,27 +2284,18 @@
                      supertx_enabled,
 #endif
                      mi_row, mi_col, r,
-#if CONFIG_ANS
-                     tok,
-#endif  // CONFIG_ANS
                      partition, subsize, n4x4_l2, n8x8_l2);
         decode_block(pbi, xd,
 #if CONFIG_SUPERTX
                      supertx_enabled,
 #endif
                      mi_row + hbs, mi_col,       r,
-#if CONFIG_ANS
-                     tok,
-#endif  // CONFIG_ANS
                      partition, bsize2, n8x8_l2, n8x8_l2);
         decode_block(pbi, xd,
 #if CONFIG_SUPERTX
                      supertx_enabled,
 #endif
                      mi_row + hbs, mi_col + hbs, r,
-#if CONFIG_ANS
-                     tok,
-#endif  // CONFIG_ANS
                      partition, bsize2, n8x8_l2, n8x8_l2);
         break;
       case PARTITION_VERT_A:
@@ -2363,27 +2304,18 @@
                      supertx_enabled,
 #endif
                      mi_row,       mi_col,       r,
-#if CONFIG_ANS
-                     tok,
-#endif  // CONFIG_ANS
                      partition, bsize2, n8x8_l2, n8x8_l2);
         decode_block(pbi, xd,
 #if CONFIG_SUPERTX
                      supertx_enabled,
 #endif
                      mi_row + hbs, mi_col,       r,
-#if CONFIG_ANS
-                     tok,
-#endif  // CONFIG_ANS
                      partition, bsize2, n8x8_l2, n8x8_l2);
         decode_block(pbi, xd,
 #if CONFIG_SUPERTX
                      supertx_enabled,
 #endif
                      mi_row, mi_col + hbs, r,
-#if CONFIG_ANS
-                     tok,
-#endif  // CONFIG_ANS
                      partition, subsize, n8x8_l2, n4x4_l2);
         break;
       case PARTITION_VERT_B:
@@ -2392,27 +2324,18 @@
                      supertx_enabled,
 #endif
                      mi_row, mi_col, r,
-#if CONFIG_ANS
-                     tok,
-#endif  // CONFIG_ANS
                      partition, subsize, n8x8_l2, n4x4_l2);
         decode_block(pbi, xd,
 #if CONFIG_SUPERTX
                      supertx_enabled,
 #endif
                      mi_row,       mi_col + hbs, r,
-#if CONFIG_ANS
-                     tok,
-#endif  // CONFIG_ANS
                      partition, bsize2, n8x8_l2, n8x8_l2);
         decode_block(pbi, xd,
 #if CONFIG_SUPERTX
                      supertx_enabled,
 #endif
                      mi_row + hbs, mi_col + hbs, r,
-#if CONFIG_ANS
-                     tok,
-#endif  // CONFIG_ANS
                      partition, bsize2, n8x8_l2, n8x8_l2);
         break;
 #endif
@@ -2459,10 +2382,9 @@
           for (col = 0; col < max_blocks_wide; col += step)
             eobtotal += reconstruct_inter_block(xd,
 #if CONFIG_ANS
-                                                cm->token_tab, tok,
-#else
-                                                r,
+                                                cm->token_tab,
 #endif
+                                                r,
                                                 mbmi, i, row, col,
                                                 tx_size);
       }
@@ -2512,11 +2434,12 @@
 #endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
+#if !CONFIG_ANS
 static void setup_bool_decoder(const uint8_t *data,
                                const uint8_t *data_end,
                                const size_t read_size,
                                struct vpx_internal_error_info *error_info,
-                               vpx_reader *r,
+                               vp10_reader *r,
                                vpx_decrypt_cb decrypt_cb,
                                void *decrypt_state) {
   // Validate the calculated partition length. If the buffer
@@ -2530,7 +2453,7 @@
     vpx_internal_error(error_info, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder %d", 1);
 }
-#if CONFIG_ANS
+#else
 static void setup_token_decoder(const uint8_t *data,
                                 const uint8_t *data_end,
                                 const size_t read_size,
@@ -2554,10 +2477,10 @@
 #endif
 
 static void read_coef_probs_common(vp10_coeff_probs_model *coef_probs,
-                                   vpx_reader *r) {
+                                   vp10_reader *r) {
   int i, j, k, l, m;
 
-  if (vpx_read_bit(r))
+  if (vp10_read_bit(r))
     for (i = 0; i < PLANE_TYPES; ++i)
       for (j = 0; j < REF_TYPES; ++j)
         for (k = 0; k < COEF_BANDS; ++k)
@@ -2567,7 +2490,7 @@
 }
 
 static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode,
-                            vpx_reader *r) {
+                            vp10_reader *r) {
     const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
     TX_SIZE tx_size;
     for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
@@ -2895,16 +2818,24 @@
   pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
 }
 
-static void setup_tile_info(VP10Decoder *const pbi,
+static void read_tile_info(VP10Decoder *const pbi,
                             struct vpx_read_bit_buffer *const rb) {
   VP10_COMMON *const cm = &pbi->common;
 #if CONFIG_EXT_TILE
   // Read the tile width/height
-  cm->tile_width  = vpx_rb_read_literal(rb, 6) + 1;   // in [1, 64]
-  cm->tile_height = vpx_rb_read_literal(rb, 6) + 1;   // in [1, 64]
+#if CONFIG_EXT_PARTITION
+  if (cm->sb_size == BLOCK_128X128) {
+    cm->tile_width  = vpx_rb_read_literal(rb, 5) + 1;
+    cm->tile_height = vpx_rb_read_literal(rb, 5) + 1;
+  } else
+#endif  // CONFIG_EXT_PARTITION
+  {
+    cm->tile_width  = vpx_rb_read_literal(rb, 6) + 1;
+    cm->tile_height = vpx_rb_read_literal(rb, 6) + 1;
+  }
 
-  cm->tile_width  = cm->tile_width << MI_BLOCK_SIZE_LOG2;
-  cm->tile_height = cm->tile_height << MI_BLOCK_SIZE_LOG2;
+  cm->tile_width  <<= cm->mib_size_log2;
+  cm->tile_height <<= cm->mib_size_log2;
 
   cm->tile_width  = VPXMIN(cm->tile_width, cm->mi_cols);
   cm->tile_height = VPXMIN(cm->tile_height, cm->mi_rows);
@@ -2945,12 +2876,14 @@
   cm->tile_cols = 1 << cm->log2_tile_cols;
   cm->tile_rows = 1 << cm->log2_tile_rows;
 
-  cm->tile_width = (mi_cols_aligned_to_sb(cm->mi_cols) >> cm->log2_tile_cols);
-  cm->tile_height = (mi_cols_aligned_to_sb(cm->mi_rows) >> cm->log2_tile_rows);
+  cm->tile_width = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+  cm->tile_width >>= cm->log2_tile_cols;
+  cm->tile_height = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
+  cm->tile_height >>= cm->log2_tile_rows;
 
-  // round to integer multiples of 8
-  cm->tile_width  = mi_cols_aligned_to_sb(cm->tile_width);
-  cm->tile_height = mi_cols_aligned_to_sb(cm->tile_height);
+  // round to integer multiples of superblock size
+  cm->tile_width  = ALIGN_POWER_OF_TWO(cm->tile_width, MAX_MIB_SIZE_LOG2);
+  cm->tile_height = ALIGN_POWER_OF_TWO(cm->tile_height, MAX_MIB_SIZE_LOG2);
 
   // tile size magnitude
   if (cm->tile_rows > 1 || cm->tile_cols > 1) {
@@ -3185,8 +3118,7 @@
   int tile_row, tile_col;
 
 #if CONFIG_ENTROPY
-  cm->do_subframe_update =
-      cm->log2_tile_cols == 0 && cm->log2_tile_rows == 0;
+  cm->do_subframe_update = cm->tile_cols == 1 && cm->tile_rows == 1;
 #endif  // CONFIG_ENTROPY
 
   if (cm->lf.filter_level && !cm->skip_loop_filter &&
@@ -3237,14 +3169,15 @@
               &cm->counts : NULL;
       vp10_zero(td->dqcoeff);
       vp10_tile_init(&td->xd.tile, td->cm, tile_row, tile_col);
+#if !CONFIG_ANS
       setup_bool_decoder(buf->data, data_end, buf->size, &cm->error,
-                         &td->bit_reader,
-                         pbi->decrypt_cb, pbi->decrypt_state);
-#if CONFIG_ANS
+                         &td->bit_reader, pbi->decrypt_cb,
+                         pbi->decrypt_state);
+#else
       setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
-                          &td->token_ans,
-                          pbi->decrypt_cb, pbi->decrypt_state);
-#endif  // CONFIG_ANS
+                          &td->bit_reader, pbi->decrypt_cb,
+                          pbi->decrypt_state);
+#endif
       vp10_init_macroblockd(cm, &td->xd, td->dqcoeff);
       td->xd.plane[0].color_index_map = td->color_index_map[0];
       td->xd.plane[1].color_index_map = td->color_index_map[1];
@@ -3266,22 +3199,19 @@
       vp10_zero_above_context(cm, tile_info.mi_col_start, tile_info.mi_col_end);
 
       for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
-           mi_row += MI_BLOCK_SIZE) {
+           mi_row += cm->mib_size) {
         int mi_col;
 
         vp10_zero_left_context(&td->xd);
 
         for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
-             mi_col += MI_BLOCK_SIZE) {
+             mi_col += cm->mib_size) {
           decode_partition(pbi, &td->xd,
 #if CONFIG_SUPERTX
                            0,
 #endif  // CONFIG_SUPERTX
                            mi_row, mi_col, &td->bit_reader,
-#if CONFIG_ANS
-                           &td->token_ans,
-#endif  // CONFIG_ANS
-                           BLOCK_LARGEST, MAX_SB_SIZE_LOG2 - 2);
+                           cm->sb_size, b_width_log2_lookup[cm->sb_size]);
         }
         pbi->mb.corrupted |= td->xd.corrupted;
         if (pbi->mb.corrupted)
@@ -3308,8 +3238,8 @@
     // Loopfilter one tile row.
     if (cm->lf.filter_level && !cm->skip_loop_filter) {
       LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
-      const int lf_start = VPXMAX(0, tile_info.mi_row_start - MI_BLOCK_SIZE);
-      const int lf_end = tile_info.mi_row_end - MI_BLOCK_SIZE;
+      const int lf_start = VPXMAX(0, tile_info.mi_row_start - cm->mib_size);
+      const int lf_end = tile_info.mi_row_end - cm->mib_size;
 
       // Delay the loopfilter if the first tile row is only
       // a single superblock high.
@@ -3333,7 +3263,7 @@
     // After loopfiltering, the last 7 row pixels in each superblock row may
     // still be changed by the longest loopfilter of the next superblock row.
     if (cm->frame_parallel_decode)
-      vp10_frameworker_broadcast(pbi->cur_buf, mi_row << MI_BLOCK_SIZE_LOG2);
+      vp10_frameworker_broadcast(pbi->cur_buf, mi_row << cm->mib_size_log2);
 #endif  // !CONFIG_VAR_TX
   }
 
@@ -3369,6 +3299,7 @@
 static int tile_worker_hook(TileWorkerData *const tile_data,
                             const TileInfo *const tile) {
   VP10Decoder *const pbi = tile_data->pbi;
+  const VP10_COMMON *const cm = &pbi->common;
   int mi_row, mi_col;
 
   if (setjmp(tile_data->error_info.jmp)) {
@@ -3383,20 +3314,17 @@
   vp10_zero_above_context(&pbi->common, tile->mi_col_start, tile->mi_col_end);
 
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
-       mi_row += MI_BLOCK_SIZE) {
+       mi_row += cm->mib_size) {
     vp10_zero_left_context(&tile_data->xd);
 
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
-         mi_col += MI_BLOCK_SIZE) {
+         mi_col += cm->mib_size) {
       decode_partition(pbi, &tile_data->xd,
 #if CONFIG_SUPERTX
                        0,
 #endif
                        mi_row, mi_col, &tile_data->bit_reader,
-#if CONFIG_ANS
-                       &tile_data->token_ans,
-#endif  // CONFIG_ANS
-                       BLOCK_LARGEST, MAX_SB_SIZE_LOG2 - 2);
+                       cm->sb_size, b_width_log2_lookup[cm->sb_size]);
     }
   }
   return !tile_data->xd.corrupted;
@@ -3535,12 +3463,13 @@
         vp10_zero(twd->dqcoeff);
         vp10_tile_init(tile_info, cm, tile_row, buf->col);
         vp10_tile_init(&twd->xd.tile, cm, tile_row, buf->col);
+#if !CONFIG_ANS
         setup_bool_decoder(buf->data, data_end, buf->size, &cm->error,
                            &twd->bit_reader,
                            pbi->decrypt_cb, pbi->decrypt_state);
-#if CONFIG_ANS
+#else
         setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
-                            &twd->token_ans, pbi->decrypt_cb,
+                            &twd->bit_reader, pbi->decrypt_cb,
                             pbi->decrypt_state);
 #endif  // CONFIG_ANS
         vp10_init_macroblockd(cm, &twd->xd, twd->dqcoeff);
@@ -3848,6 +3777,12 @@
   if (frame_is_intra_only(cm) || cm->error_resilient_mode)
     vp10_setup_past_independence(cm);
 
+#if CONFIG_EXT_PARTITION
+  set_sb_size(cm, vpx_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64);
+#else
+  set_sb_size(cm, BLOCK_64X64);
+#endif  // CONFIG_EXT_PARTITION
+
   setup_loopfilter(cm, rb);
 #if CONFIG_LOOP_RESTORATION
   setup_restoration(cm, rb);
@@ -3887,7 +3822,7 @@
                                                       : read_tx_mode(rb);
   cm->reference_mode = read_frame_reference_mode(cm, rb);
 
-  setup_tile_info(pbi, rb);
+  read_tile_info(pbi, rb);
   sz = vpx_rb_read_literal(rb, 16);
 
   if (sz == 0)
@@ -3898,11 +3833,11 @@
 }
 
 #if CONFIG_EXT_TX
-static void read_ext_tx_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
+static void read_ext_tx_probs(FRAME_CONTEXT *fc, vp10_reader *r) {
   int i, j, k;
   int s;
   for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
-    if (vpx_read(r, GROUP_DIFF_UPDATE_PROB)) {
+    if (vp10_read(r, GROUP_DIFF_UPDATE_PROB)) {
       for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
         if (!use_inter_ext_tx_for_txsize[s][i]) continue;
         for (j = 0; j < num_ext_tx_set_inter[s] - 1; ++j)
@@ -3912,7 +3847,7 @@
   }
 
   for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
-    if (vpx_read(r, GROUP_DIFF_UPDATE_PROB)) {
+    if (vp10_read(r, GROUP_DIFF_UPDATE_PROB)) {
       for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
         if (!use_intra_ext_tx_for_txsize[s][i]) continue;
         for (j = 0; j < INTRA_MODES; ++j)
@@ -3925,16 +3860,16 @@
 
 #else
 
-static void read_ext_tx_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
+static void read_ext_tx_probs(FRAME_CONTEXT *fc, vp10_reader *r) {
   int i, j, k;
-  if (vpx_read(r, GROUP_DIFF_UPDATE_PROB)) {
+  if (vp10_read(r, GROUP_DIFF_UPDATE_PROB)) {
     for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
       for (j = 0; j < TX_TYPES; ++j)
         for (k = 0; k < TX_TYPES - 1; ++k)
           vp10_diff_update_prob(r, &fc->intra_ext_tx_prob[i][j][k]);
     }
   }
-  if (vpx_read(r, GROUP_DIFF_UPDATE_PROB)) {
+  if (vp10_read(r, GROUP_DIFF_UPDATE_PROB)) {
     for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
       for (k = 0; k < TX_TYPES - 1; ++k)
         vp10_diff_update_prob(r, &fc->inter_ext_tx_prob[i][k]);
@@ -3944,9 +3879,9 @@
 #endif  // CONFIG_EXT_TX
 
 #if CONFIG_SUPERTX
-static void read_supertx_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
+static void read_supertx_probs(FRAME_CONTEXT *fc, vp10_reader *r) {
   int i, j;
-  if (vpx_read(r, GROUP_DIFF_UPDATE_PROB)) {
+  if (vp10_read(r, GROUP_DIFF_UPDATE_PROB)) {
     for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
       for (j = 1; j < TX_SIZES; ++j) {
         vp10_diff_update_prob(r, &fc->supertx_prob[i][j]);
@@ -3963,13 +3898,19 @@
   MACROBLOCKD *const xd = &pbi->mb;
 #endif
   FRAME_CONTEXT *const fc = cm->fc;
-  vpx_reader r;
+  vp10_reader r;
   int k, i, j;
 
+#if !CONFIG_ANS
   if (vpx_reader_init(&r, data, partition_size, pbi->decrypt_cb,
                       pbi->decrypt_state))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder 0");
+#else
+  if (ans_read_init(&r, data, partition_size))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate compressed header ANS decoder");
+#endif  // !CONFIG_ANS
 
   if (cm->tx_mode == TX_MODE_SELECT) {
     for (i = 0; i < TX_SIZES - 1; ++i)
@@ -4035,20 +3976,24 @@
 #if CONFIG_EXT_INTER
     read_inter_compound_mode_probs(fc, &r);
     if (cm->reference_mode != COMPOUND_REFERENCE) {
-      for (i = 0; i < BLOCK_SIZES; i++) {
-        if (is_interintra_allowed_bsize(i)) {
+      for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+        if (is_interintra_allowed_bsize_group(i)) {
           vp10_diff_update_prob(&r, &fc->interintra_prob[i]);
         }
       }
+      for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+        for (j = 0; j < INTERINTRA_MODES - 1; j++)
+          vp10_diff_update_prob(&r, &fc->interintra_mode_prob[i][j]);
+      }
       for (i = 0; i < BLOCK_SIZES; i++) {
-        if (is_interintra_allowed_bsize(i) && get_wedge_bits(i)) {
+        if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i)) {
           vp10_diff_update_prob(&r, &fc->wedge_interintra_prob[i]);
         }
       }
     }
     if (cm->reference_mode != SINGLE_REFERENCE) {
       for (i = 0; i < BLOCK_SIZES; i++) {
-        if (get_wedge_bits(i)) {
+        if (is_interinter_wedge_used(i)) {
           vp10_diff_update_prob(&r, &fc->wedge_interinter_prob[i]);
         }
       }
@@ -4087,7 +4032,7 @@
 #endif
   }
 
-  return vpx_reader_has_error(&r);
+  return vp10_reader_has_error(&r);
 }
 
 #ifdef NDEBUG
diff --git a/vp10/decoder/decodemv.c b/vp10/decoder/decodemv.c
index 8035e06..2c9cd58 100644
--- a/vp10/decoder/decodemv.c
+++ b/vp10/decoder/decodemv.c
@@ -24,25 +24,25 @@
 
 #include "vpx_dsp/vpx_dsp_common.h"
 
-static INLINE int read_uniform(vpx_reader *r, int n) {
+static INLINE int read_uniform(vp10_reader *r, int n) {
   int l = get_unsigned_bits(n);
   int m = (1 << l) - n;
-  int v = vpx_read_literal(r, l-1);
+  int v = vp10_read_literal(r, l-1);
 
   assert(l != 0);
 
   if (v < m)
     return v;
   else
-    return (v << 1) - m + vpx_read_literal(r, 1);
+    return (v << 1) - m + vp10_read_literal(r, 1);
 }
 
-static PREDICTION_MODE read_intra_mode(vpx_reader *r, const vpx_prob *p) {
-  return (PREDICTION_MODE)vpx_read_tree(r, vp10_intra_mode_tree, p);
+static PREDICTION_MODE read_intra_mode(vp10_reader *r, const vpx_prob *p) {
+  return (PREDICTION_MODE)vp10_read_tree(r, vp10_intra_mode_tree, p);
 }
 
 static PREDICTION_MODE read_intra_mode_y(VP10_COMMON *cm, MACROBLOCKD *xd,
-                                         vpx_reader *r, int size_group) {
+                                         vp10_reader *r, int size_group) {
   const PREDICTION_MODE y_mode =
       read_intra_mode(r, cm->fc->y_mode_prob[size_group]);
   FRAME_COUNTS *counts = xd->counts;
@@ -52,7 +52,7 @@
 }
 
 static PREDICTION_MODE read_intra_mode_uv(VP10_COMMON *cm, MACROBLOCKD *xd,
-                                          vpx_reader *r,
+                                          vp10_reader *r,
                                           PREDICTION_MODE y_mode) {
   const PREDICTION_MODE uv_mode = read_intra_mode(r,
                                          cm->fc->uv_mode_prob[y_mode]);
@@ -62,17 +62,30 @@
   return uv_mode;
 }
 
+#if CONFIG_EXT_INTER
+static INTERINTRA_MODE read_interintra_mode(VP10_COMMON *cm, MACROBLOCKD *xd,
+                                            vp10_reader *r, int size_group) {
+  const INTERINTRA_MODE ii_mode =
+      (INTERINTRA_MODE)vp10_read_tree(r, vp10_interintra_mode_tree,
+                                      cm->fc->interintra_mode_prob[size_group]);
+  FRAME_COUNTS *counts = xd->counts;
+  if (counts)
+    ++counts->interintra_mode[size_group][ii_mode];
+  return ii_mode;
+}
+#endif  // CONFIG_EXT_INTER
+
 static PREDICTION_MODE read_inter_mode(VP10_COMMON *cm, MACROBLOCKD *xd,
 #if CONFIG_REF_MV && CONFIG_EXT_INTER
                                        MB_MODE_INFO *mbmi,
 #endif
-                                       vpx_reader *r, int16_t ctx) {
+                                       vp10_reader *r, int16_t ctx) {
 #if CONFIG_REF_MV
   FRAME_COUNTS *counts = xd->counts;
   int16_t mode_ctx = ctx & NEWMV_CTX_MASK;
   vpx_prob mode_prob = cm->fc->newmv_prob[mode_ctx];
 
-  if (vpx_read(r, mode_prob) == 0) {
+  if (vp10_read(r, mode_prob) == 0) {
     if (counts)
       ++counts->newmv_mode[mode_ctx][0];
 
@@ -83,7 +96,7 @@
 #if CONFIG_EXT_INTER
     } else {
       mode_prob = cm->fc->new2mv_prob;
-      if (vpx_read(r, mode_prob) == 0) {
+      if (vp10_read(r, mode_prob) == 0) {
         if (counts)
           ++counts->new2mv_mode[0];
         return NEWMV;
@@ -104,7 +117,7 @@
   mode_ctx = (ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
 
   mode_prob = cm->fc->zeromv_prob[mode_ctx];
-  if (vpx_read(r, mode_prob) == 0) {
+  if (vp10_read(r, mode_prob) == 0) {
     if (counts)
       ++counts->zeromv_mode[mode_ctx][0];
     return ZEROMV;
@@ -123,7 +136,7 @@
 
   mode_prob = cm->fc->refmv_prob[mode_ctx];
 
-  if (vpx_read(r, mode_prob) == 0) {
+  if (vp10_read(r, mode_prob) == 0) {
     if (counts)
       ++counts->refmv_mode[mode_ctx][0];
 
@@ -137,7 +150,7 @@
   // Invalid prediction mode.
   assert(0);
 #else
-  const int mode = vpx_read_tree(r, vp10_inter_mode_tree,
+  const int mode = vp10_read_tree(r, vp10_inter_mode_tree,
                                  cm->fc->inter_mode_probs[ctx]);
   FRAME_COUNTS *counts = xd->counts;
   if (counts)
@@ -151,7 +164,7 @@
 static void read_drl_idx(const VP10_COMMON *cm,
                          MACROBLOCKD *xd,
                          MB_MODE_INFO *mbmi,
-                         vpx_reader *r) {
+                         vp10_reader *r) {
   uint8_t ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
   mbmi->ref_mv_idx = 0;
 
@@ -161,7 +174,7 @@
       if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
         uint8_t drl_ctx = vp10_drl_ctx(xd->ref_mv_stack[ref_frame_type], idx);
         vpx_prob drl_prob = cm->fc->drl_prob[drl_ctx];
-        if (!vpx_read(r, drl_prob)) {
+        if (!vp10_read(r, drl_prob)) {
           mbmi->ref_mv_idx = idx;
           if (xd->counts)
             ++xd->counts->drl_mode[drl_ctx][0];
@@ -183,7 +196,7 @@
       if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
         uint8_t drl_ctx = vp10_drl_ctx(xd->ref_mv_stack[ref_frame_type], idx);
         vpx_prob drl_prob = cm->fc->drl_prob[drl_ctx];
-        if (!vpx_read(r, drl_prob)) {
+        if (!vp10_read(r, drl_prob)) {
           mbmi->ref_mv_idx = idx - 1;
           if (xd->counts)
             ++xd->counts->drl_mode[drl_ctx][0];
@@ -201,8 +214,8 @@
 #if CONFIG_EXT_INTER
 static PREDICTION_MODE read_inter_compound_mode(VP10_COMMON *cm,
                                                 MACROBLOCKD *xd,
-                                                vpx_reader *r, int16_t ctx) {
-  const int mode = vpx_read_tree(r, vp10_inter_compound_mode_tree,
+                                                vp10_reader *r, int16_t ctx) {
+  const int mode = vp10_read_tree(r, vp10_inter_compound_mode_tree,
                                  cm->fc->inter_compound_mode_probs[ctx]);
   FRAME_COUNTS *counts = xd->counts;
 
@@ -214,16 +227,16 @@
 }
 #endif  // CONFIG_EXT_INTER
 
-static int read_segment_id(vpx_reader *r,
+static int read_segment_id(vp10_reader *r,
     const struct segmentation_probs *segp) {
-  return vpx_read_tree(r, vp10_segment_tree, segp->tree_probs);
+  return vp10_read_tree(r, vp10_segment_tree, segp->tree_probs);
 }
 
 #if CONFIG_VAR_TX
 static void read_tx_size_inter(VP10_COMMON *cm, MACROBLOCKD *xd,
                                MB_MODE_INFO *mbmi, FRAME_COUNTS *counts,
                                TX_SIZE tx_size, int blk_row, int blk_col,
-                               vpx_reader *r) {
+                               vp10_reader *r) {
   int is_split = 0;
   const int tx_row = blk_row >> 1;
   const int tx_col = blk_col >> 1;
@@ -232,8 +245,8 @@
   int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
                                    xd->left_txfm_context + tx_row,
                                    tx_size);
-  TX_SIZE (*const inter_tx_size)[MI_BLOCK_SIZE] =
-    (TX_SIZE (*)[MI_BLOCK_SIZE])&mbmi->inter_tx_size[tx_row][tx_col];
+  TX_SIZE (*const inter_tx_size)[MAX_MIB_SIZE] =
+    (TX_SIZE (*)[MAX_MIB_SIZE])&mbmi->inter_tx_size[tx_row][tx_col];
 
   if (xd->mb_to_bottom_edge < 0)
     max_blocks_high += xd->mb_to_bottom_edge >> 5;
@@ -243,7 +256,7 @@
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
      return;
 
-  is_split = vpx_read(r, cm->fc->txfm_partition_prob[ctx]);
+  is_split = vp10_read(r, cm->fc->txfm_partition_prob[ctx]);
 
   if (is_split) {
     BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
@@ -285,11 +298,11 @@
 #endif
 
 static TX_SIZE read_selected_tx_size(VP10_COMMON *cm, MACROBLOCKD *xd,
-                                     TX_SIZE max_tx_size, vpx_reader *r) {
+                                     TX_SIZE max_tx_size, vp10_reader *r) {
   FRAME_COUNTS *counts = xd->counts;
   const int ctx = get_tx_size_context(xd);
   const int tx_size_cat = max_tx_size - TX_8X8;
-  int tx_size = vpx_read_tree(r, vp10_tx_size_tree[tx_size_cat],
+  int tx_size = vp10_read_tree(r, vp10_tx_size_tree[tx_size_cat],
                               cm->fc->tx_size_probs[tx_size_cat][ctx]);
   if (counts)
     ++counts->tx_size[tx_size_cat][ctx][tx_size];
@@ -297,7 +310,7 @@
 }
 
 static TX_SIZE read_tx_size(VP10_COMMON *cm, MACROBLOCKD *xd,
-                            int allow_select, vpx_reader *r) {
+                            int allow_select, vp10_reader *r) {
   TX_MODE tx_mode = cm->tx_mode;
   BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
@@ -335,7 +348,7 @@
 
 static int read_intra_segment_id(VP10_COMMON *const cm, MACROBLOCKD *const xd,
                                  int mi_offset, int x_mis, int y_mis,
-                                 vpx_reader *r) {
+                                 vp10_reader *r) {
   struct segmentation *const seg = &cm->seg;
   FRAME_COUNTS *counts = xd->counts;
   struct segmentation_probs *const segp = &cm->fc->seg;
@@ -366,7 +379,7 @@
 }
 
 static int read_inter_segment_id(VP10_COMMON *const cm, MACROBLOCKD *const xd,
-                                 int mi_row, int mi_col, vpx_reader *r) {
+                                 int mi_row, int mi_col, vp10_reader *r) {
   struct segmentation *const seg = &cm->seg;
   FRAME_COUNTS *counts = xd->counts;
   struct segmentation_probs *const segp = &cm->fc->seg;
@@ -396,7 +409,7 @@
   if (seg->temporal_update) {
     const int ctx = vp10_get_pred_context_seg_id(xd);
     const vpx_prob pred_prob = segp->pred_probs[ctx];
-    mbmi->seg_id_predicted = vpx_read(r, pred_prob);
+    mbmi->seg_id_predicted = vp10_read(r, pred_prob);
     if (counts)
       ++counts->seg.pred[ctx][mbmi->seg_id_predicted];
     if (mbmi->seg_id_predicted) {
@@ -416,12 +429,12 @@
 }
 
 static int read_skip(VP10_COMMON *cm, const MACROBLOCKD *xd,
-                     int segment_id, vpx_reader *r) {
+                     int segment_id, vp10_reader *r) {
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
     const int ctx = vp10_get_skip_context(xd);
-    const int skip = vpx_read(r, cm->fc->skip_probs[ctx]);
+    const int skip = vp10_read(r, cm->fc->skip_probs[ctx]);
     FRAME_COUNTS *counts = xd->counts;
     if (counts)
       ++counts->skip[ctx][skip];
@@ -431,7 +444,7 @@
 
 static void read_palette_mode_info(VP10_COMMON *const cm,
                                    MACROBLOCKD *const xd,
-                                   vpx_reader *r) {
+                                   vp10_reader *r) {
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const MODE_INFO *const above_mi = xd->above_mi;
@@ -445,14 +458,14 @@
       palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
     if (left_mi)
       palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-    if (vpx_read(r, vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
+    if (vp10_read(r, vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
                                                      [palette_ctx])) {
       pmi->palette_size[0] =
-        vpx_read_tree(r, vp10_palette_size_tree,
+        vp10_read_tree(r, vp10_palette_size_tree,
                       vp10_default_palette_y_size_prob[bsize - BLOCK_8X8]) + 2;
       n = pmi->palette_size[0];
       for (i = 0; i < n; ++i)
-        pmi->palette_colors[i] = vpx_read_literal(r, cm->bit_depth);
+        pmi->palette_colors[i] = vp10_read_literal(r, cm->bit_depth);
 
       xd->plane[0].color_index_map[0] = read_uniform(r, n);
       assert(xd->plane[0].color_index_map[0] < n);
@@ -460,18 +473,18 @@
   }
 
   if (mbmi->uv_mode == DC_PRED) {
-    if (vpx_read(r,
+    if (vp10_read(r,
                  vp10_default_palette_uv_mode_prob[pmi->palette_size[0] > 0])) {
       pmi->palette_size[1] =
-          vpx_read_tree(r, vp10_palette_size_tree,
+          vp10_read_tree(r, vp10_palette_size_tree,
                         vp10_default_palette_uv_size_prob[bsize - BLOCK_8X8])
                         + 2;
       n = pmi->palette_size[1];
       for (i = 0; i < n; ++i) {
         pmi->palette_colors[PALETTE_MAX_SIZE + i] =
-            vpx_read_literal(r, cm->bit_depth);
+            vp10_read_literal(r, cm->bit_depth);
         pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] =
-            vpx_read_literal(r, cm->bit_depth);
+            vp10_read_literal(r, cm->bit_depth);
       }
       xd->plane[1].color_index_map[0] = read_uniform(r, n);
       assert(xd->plane[1].color_index_map[0] < n);
@@ -481,7 +494,7 @@
 
 #if CONFIG_EXT_INTRA
 static void read_ext_intra_mode_info(VP10_COMMON *const cm,
-                                     MACROBLOCKD *const xd, vpx_reader *r) {
+                                     MACROBLOCKD *const xd, vp10_reader *r) {
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   FRAME_COUNTS *counts = xd->counts;
@@ -492,7 +505,7 @@
   if (mbmi->mode == DC_PRED &&
       mbmi->palette_mode_info.palette_size[0] == 0) {
     mbmi->ext_intra_mode_info.use_ext_intra_mode[0] =
-        vpx_read(r, cm->fc->ext_intra_probs[0]);
+        vp10_read(r, cm->fc->ext_intra_probs[0]);
     if (mbmi->ext_intra_mode_info.use_ext_intra_mode[0]) {
       mbmi->ext_intra_mode_info.ext_intra_mode[0] =
           read_uniform(r, FILTER_INTRA_MODES);
@@ -503,7 +516,7 @@
   if (mbmi->uv_mode == DC_PRED &&
       mbmi->palette_mode_info.palette_size[1] == 0) {
     mbmi->ext_intra_mode_info.use_ext_intra_mode[1] =
-        vpx_read(r, cm->fc->ext_intra_probs[1]);
+        vp10_read(r, cm->fc->ext_intra_probs[1]);
     if (mbmi->ext_intra_mode_info.use_ext_intra_mode[1]) {
       mbmi->ext_intra_mode_info.ext_intra_mode[1] =
           read_uniform(r, FILTER_INTRA_MODES);
@@ -516,7 +529,7 @@
 
 static void read_intra_frame_mode_info(VP10_COMMON *const cm,
                                        MACROBLOCKD *const xd,
-                                       int mi_row, int mi_col, vpx_reader *r) {
+                                       int mi_row, int mi_col, vp10_reader *r) {
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const MODE_INFO *above_mi = xd->above_mi;
@@ -569,7 +582,7 @@
             mbmi->angle_delta[0] * ANGLE_STEP;
         if (pick_intra_filter(p_angle)) {
           FRAME_COUNTS *counts = xd->counts;
-          mbmi->intra_filter = vpx_read_tree(r, vp10_intra_filter_tree,
+          mbmi->intra_filter = vp10_read_tree(r, vp10_intra_filter_tree,
                                              cm->fc->intra_filter_probs[ctx]);
           if (counts)
             ++counts->intra_filter[ctx][mbmi->intra_filter];
@@ -602,7 +615,7 @@
       FRAME_COUNTS *counts = xd->counts;
       int eset = get_ext_tx_set(mbmi->tx_size, mbmi->sb_type, 0);
       if (eset > 0) {
-        mbmi->tx_type = vpx_read_tree(
+        mbmi->tx_type = vp10_read_tree(
             r, vp10_ext_tx_intra_tree[eset],
             cm->fc->intra_ext_tx_prob[eset][mbmi->tx_size][mbmi->mode]);
         if (counts)
@@ -618,7 +631,7 @@
         !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
       FRAME_COUNTS *counts = xd->counts;
       TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode];
-      mbmi->tx_type = vpx_read_tree(
+      mbmi->tx_type = vp10_read_tree(
           r, vp10_ext_tx_tree,
           cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom]);
       if (counts)
@@ -637,16 +650,16 @@
 #endif  // CONFIG_EXT_INTRA
 }
 
-static int read_mv_component(vpx_reader *r,
+static int read_mv_component(vp10_reader *r,
                              const nmv_component *mvcomp, int usehp) {
   int mag, d, fr, hp;
-  const int sign = vpx_read(r, mvcomp->sign);
-  const int mv_class = vpx_read_tree(r, vp10_mv_class_tree, mvcomp->classes);
+  const int sign = vp10_read(r, mvcomp->sign);
+  const int mv_class = vp10_read_tree(r, vp10_mv_class_tree, mvcomp->classes);
   const int class0 = mv_class == MV_CLASS_0;
 
   // Integer part
   if (class0) {
-    d = vpx_read_tree(r, vp10_mv_class0_tree, mvcomp->class0);
+    d = vp10_read_tree(r, vp10_mv_class0_tree, mvcomp->class0);
     mag = 0;
   } else {
     int i;
@@ -654,16 +667,16 @@
 
     d = 0;
     for (i = 0; i < n; ++i)
-      d |= vpx_read(r, mvcomp->bits[i]) << i;
+      d |= vp10_read(r, mvcomp->bits[i]) << i;
     mag = CLASS0_SIZE << (mv_class + 2);
   }
 
   // Fractional part
-  fr = vpx_read_tree(r, vp10_mv_fp_tree, class0 ? mvcomp->class0_fp[d]
+  fr = vp10_read_tree(r, vp10_mv_fp_tree, class0 ? mvcomp->class0_fp[d]
                                                : mvcomp->fp);
 
   // High precision part (if hp is not used, the default value of the hp is 1)
-  hp = usehp ? vpx_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp)
+  hp = usehp ? vp10_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp)
              : 1;
 
   // Result
@@ -671,11 +684,11 @@
   return sign ? -mag : mag;
 }
 
-static INLINE void read_mv(vpx_reader *r, MV *mv, const MV *ref,
+static INLINE void read_mv(vp10_reader *r, MV *mv, const MV *ref,
                            const nmv_context *ctx,
                            nmv_context_counts *counts, int allow_hp) {
   const MV_JOINT_TYPE joint_type =
-      (MV_JOINT_TYPE)vpx_read_tree(r, vp10_mv_joint_tree, ctx->joints);
+      (MV_JOINT_TYPE)vp10_read_tree(r, vp10_mv_joint_tree, ctx->joints);
   const int use_hp = allow_hp && vp10_use_mv_hp(ref);
   MV diff = {0, 0};
 
@@ -693,11 +706,11 @@
 
 static REFERENCE_MODE read_block_reference_mode(VP10_COMMON *cm,
                                                 const MACROBLOCKD *xd,
-                                                vpx_reader *r) {
+                                                vp10_reader *r) {
   if (cm->reference_mode == REFERENCE_MODE_SELECT) {
     const int ctx = vp10_get_reference_mode_context(cm, xd);
     const REFERENCE_MODE mode =
-        (REFERENCE_MODE)vpx_read(r, cm->fc->comp_inter_prob[ctx]);
+        (REFERENCE_MODE)vp10_read(r, cm->fc->comp_inter_prob[ctx]);
     FRAME_COUNTS *counts = xd->counts;
     if (counts)
       ++counts->comp_inter[ctx][mode];
@@ -709,7 +722,7 @@
 
 // Read the referncence frame
 static void read_ref_frames(VP10_COMMON *const cm, MACROBLOCKD *const xd,
-                            vpx_reader *r,
+                            vp10_reader *r,
                             int segment_id, MV_REFERENCE_FRAME ref_frame[2]) {
   FRAME_CONTEXT *const fc = cm->fc;
   FRAME_COUNTS *counts = xd->counts;
@@ -724,7 +737,7 @@
     if (mode == COMPOUND_REFERENCE) {
       const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
       const int ctx = vp10_get_pred_context_comp_ref_p(cm, xd);
-      const int bit = vpx_read(r, fc->comp_ref_prob[ctx][0]);
+      const int bit = vp10_read(r, fc->comp_ref_prob[ctx][0]);
       if (counts)
         ++counts->comp_ref[ctx][0][bit];
       ref_frame[idx] = cm->comp_fixed_ref;
@@ -732,18 +745,18 @@
 #if CONFIG_EXT_REFS
       if (!bit) {
         const int ctx1 = vp10_get_pred_context_comp_ref_p1(cm, xd);
-        const int bit1 = vpx_read(r, fc->comp_ref_prob[ctx1][1]);
+        const int bit1 = vp10_read(r, fc->comp_ref_prob[ctx1][1]);
         if (counts)
           ++counts->comp_ref[ctx1][1][bit1];
         ref_frame[!idx] = cm->comp_var_ref[bit1 ? 0 : 1];
       } else {
         const int ctx2 = vp10_get_pred_context_comp_ref_p2(cm, xd);
-        const int bit2 = vpx_read(r, fc->comp_ref_prob[ctx2][2]);
+        const int bit2 = vp10_read(r, fc->comp_ref_prob[ctx2][2]);
         if (counts)
           ++counts->comp_ref[ctx2][2][bit2];
         if (!bit2) {
           const int ctx3 = vp10_get_pred_context_comp_ref_p3(cm, xd);
-          const int bit3 = vpx_read(r, fc->comp_ref_prob[ctx3][3]);
+          const int bit3 = vp10_read(r, fc->comp_ref_prob[ctx3][3]);
           if (counts)
             ++counts->comp_ref[ctx3][3][bit3];
           ref_frame[!idx] = cm->comp_var_ref[bit3 ? 2 : 3];
@@ -757,29 +770,29 @@
     } else if (mode == SINGLE_REFERENCE) {
 #if CONFIG_EXT_REFS
       const int ctx0 = vp10_get_pred_context_single_ref_p1(xd);
-      const int bit0 = vpx_read(r, fc->single_ref_prob[ctx0][0]);
+      const int bit0 = vp10_read(r, fc->single_ref_prob[ctx0][0]);
       if (counts)
         ++counts->single_ref[ctx0][0][bit0];
       if (bit0) {
         const int ctx1 = vp10_get_pred_context_single_ref_p2(xd);
-        const int bit1 = vpx_read(r, fc->single_ref_prob[ctx1][1]);
+        const int bit1 = vp10_read(r, fc->single_ref_prob[ctx1][1]);
         if (counts)
           ++counts->single_ref[ctx1][1][bit1];
         ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
       } else {
         const int ctx2 = vp10_get_pred_context_single_ref_p3(xd);
-        const int bit2 = vpx_read(r, fc->single_ref_prob[ctx2][2]);
+        const int bit2 = vp10_read(r, fc->single_ref_prob[ctx2][2]);
         if (counts)
           ++counts->single_ref[ctx2][2][bit2];
         if (bit2) {
           const int ctx4 = vp10_get_pred_context_single_ref_p5(xd);
-          const int bit4 = vpx_read(r, fc->single_ref_prob[ctx4][4]);
+          const int bit4 = vp10_read(r, fc->single_ref_prob[ctx4][4]);
           if (counts)
             ++counts->single_ref[ctx4][4][bit4];
           ref_frame[0] = bit4 ? LAST4_FRAME : LAST3_FRAME;
         } else {
           const int ctx3 = vp10_get_pred_context_single_ref_p4(xd);
-          const int bit3 = vpx_read(r, fc->single_ref_prob[ctx3][3]);
+          const int bit3 = vp10_read(r, fc->single_ref_prob[ctx3][3]);
           if (counts)
             ++counts->single_ref[ctx3][3][bit3];
           ref_frame[0] = bit3 ? LAST2_FRAME : LAST_FRAME;
@@ -787,12 +800,12 @@
       }
 #else
       const int ctx0 = vp10_get_pred_context_single_ref_p1(xd);
-      const int bit0 = vpx_read(r, fc->single_ref_prob[ctx0][0]);
+      const int bit0 = vp10_read(r, fc->single_ref_prob[ctx0][0]);
       if (counts)
         ++counts->single_ref[ctx0][0][bit0];
       if (bit0) {
         const int ctx1 = vp10_get_pred_context_single_ref_p2(xd);
-        const int bit1 = vpx_read(r, fc->single_ref_prob[ctx1][1]);
+        const int bit1 = vp10_read(r, fc->single_ref_prob[ctx1][1]);
         if (counts)
           ++counts->single_ref[ctx1][1][bit1];
         ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
@@ -811,13 +824,13 @@
 
 #if CONFIG_OBMC
 static int read_is_obmc_block(VP10_COMMON *const cm, MACROBLOCKD *const xd,
-                              vpx_reader *r) {
+                              vp10_reader *r) {
   BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   FRAME_COUNTS *counts = xd->counts;
   int is_obmc;
 
   if (is_obmc_allowed(&xd->mi[0]->mbmi)) {
-    is_obmc = vpx_read(r, cm->fc->obmc_prob[bsize]);
+    is_obmc = vp10_read(r, cm->fc->obmc_prob[bsize]);
     if (counts)
       ++counts->obmc[bsize][is_obmc];
     return is_obmc;
@@ -829,14 +842,14 @@
 
 static INLINE INTERP_FILTER read_switchable_interp_filter(
     VP10_COMMON *const cm, MACROBLOCKD *const xd,
-    vpx_reader *r) {
+    vp10_reader *r) {
   const int ctx = vp10_get_pred_context_switchable_interp(xd);
   FRAME_COUNTS *counts = xd->counts;
   INTERP_FILTER type;
 #if CONFIG_EXT_INTERP
   if (!vp10_is_interp_needed(xd)) return EIGHTTAP_REGULAR;
 #endif
-  type = (INTERP_FILTER)vpx_read_tree(r, vp10_switchable_interp_tree,
+  type = (INTERP_FILTER)vp10_read_tree(r, vp10_switchable_interp_tree,
                                       cm->fc->switchable_interp_prob[ctx]);
   if (counts)
     ++counts->switchable_interp[ctx][type];
@@ -845,7 +858,7 @@
 
 static void read_intra_block_mode_info(VP10_COMMON *const cm,
                                        MACROBLOCKD *const xd, MODE_INFO *mi,
-                                       vpx_reader *r) {
+                                       vp10_reader *r) {
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mi->mbmi.sb_type;
   int i;
@@ -884,7 +897,7 @@
         if (pick_intra_filter(p_angle)) {
           FRAME_COUNTS *counts = xd->counts;
           const int ctx = vp10_get_pred_context_intra_interp(xd);
-          mbmi->intra_filter = vpx_read_tree(r, vp10_intra_filter_tree,
+          mbmi->intra_filter = vp10_read_tree(r, vp10_intra_filter_tree,
                                              cm->fc->intra_filter_probs[ctx]);
           if (counts)
             ++counts->intra_filter[ctx][mbmi->intra_filter];
@@ -926,7 +939,7 @@
 #endif
                             int_mv mv[2], int_mv ref_mv[2],
                             int_mv nearest_mv[2], int_mv near_mv[2],
-                            int is_compound, int allow_hp, vpx_reader *r) {
+                            int is_compound, int allow_hp, vp10_reader *r) {
   int i;
   int ret = 1;
 #if CONFIG_REF_MV
@@ -1141,12 +1154,12 @@
 }
 
 static int read_is_inter_block(VP10_COMMON *const cm, MACROBLOCKD *const xd,
-                               int segment_id, vpx_reader *r) {
+                               int segment_id, vp10_reader *r) {
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
     return get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
   } else {
     const int ctx = vp10_get_intra_inter_context(xd);
-    const int is_inter = vpx_read(r, cm->fc->intra_inter_prob[ctx]);
+    const int is_inter = vp10_read(r, cm->fc->intra_inter_prob[ctx]);
     FRAME_COUNTS *counts = xd->counts;
     if (counts)
       ++counts->intra_inter[ctx][is_inter];
@@ -1157,17 +1170,17 @@
 static void fpm_sync(void *const data, int mi_row) {
   VP10Decoder *const pbi = (VP10Decoder *)data;
   vp10_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame,
-                       mi_row << MI_BLOCK_SIZE_LOG2);
+                       mi_row << pbi->common.mib_size_log2);
 }
 
 static void read_inter_block_mode_info(VP10Decoder *const pbi,
                                        MACROBLOCKD *const xd,
                                        MODE_INFO *const mi,
 #if (CONFIG_OBMC || CONFIG_EXT_INTER) && CONFIG_SUPERTX
-                                       int mi_row, int mi_col, vpx_reader *r,
+                                       int mi_row, int mi_col, vp10_reader *r,
                                        int supertx_enabled) {
 #else
-                                       int mi_row, int mi_col, vpx_reader *r) {
+                                       int mi_row, int mi_col, vp10_reader *r) {
 #endif  // CONFIG_OBMC && CONFIG_SUPERTX
   VP10_COMMON *const cm = &pbi->common;
   MB_MODE_INFO *const mbmi = &mi->mbmi;
@@ -1513,13 +1526,14 @@
       !supertx_enabled &&
 #endif
       is_interintra_allowed(mbmi)) {
-    const int interintra = vpx_read(r, cm->fc->interintra_prob[bsize]);
+    const int bsize_group = size_group_lookup[bsize];
+    const int interintra = vp10_read(r, cm->fc->interintra_prob[bsize_group]);
     if (xd->counts)
-      xd->counts->interintra[bsize][interintra]++;
+      xd->counts->interintra[bsize_group][interintra]++;
     assert(mbmi->ref_frame[1] == NONE);
     if (interintra) {
-      const PREDICTION_MODE interintra_mode =
-          read_intra_mode_y(cm, xd, r, size_group_lookup[bsize]);
+      const INTERINTRA_MODE interintra_mode =
+          read_interintra_mode(cm, xd, r, bsize_group);
       mbmi->ref_frame[1] = INTRA_FRAME;
       mbmi->interintra_mode = interintra_mode;
       mbmi->interintra_uv_mode = interintra_mode;
@@ -1530,15 +1544,15 @@
       mbmi->angle_delta[1] = 0;
       mbmi->intra_filter = INTRA_FILTER_LINEAR;
 #endif  // CONFIG_EXT_INTRA
-      if (get_wedge_bits(bsize)) {
+      if (is_interintra_wedge_used(bsize)) {
         mbmi->use_wedge_interintra =
-            vpx_read(r, cm->fc->wedge_interintra_prob[bsize]);
+            vp10_read(r, cm->fc->wedge_interintra_prob[bsize]);
         if (xd->counts)
           xd->counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
         if (mbmi->use_wedge_interintra) {
           mbmi->interintra_wedge_index =
           mbmi->interintra_uv_wedge_index =
-              vpx_read_literal(r, get_wedge_bits(bsize));
+              vp10_read_literal(r, get_wedge_bits(bsize));
         }
       }
     }
@@ -1548,14 +1562,14 @@
 #if CONFIG_OBMC
       !(is_obmc_allowed(mbmi) && mbmi->obmc) &&
 #endif  // CONFIG_OBMC
-      get_wedge_bits(bsize)) {
+      is_interinter_wedge_used(bsize)) {
     mbmi->use_wedge_interinter =
-        vpx_read(r, cm->fc->wedge_interinter_prob[bsize]);
+        vp10_read(r, cm->fc->wedge_interinter_prob[bsize]);
     if (xd->counts)
       xd->counts->wedge_interinter[bsize][mbmi->use_wedge_interinter]++;
     if (mbmi->use_wedge_interinter) {
       mbmi->interinter_wedge_index =
-          vpx_read_literal(r, get_wedge_bits(bsize));
+          vp10_read_literal(r, get_wedge_bits(bsize));
     }
   }
 #endif  // CONFIG_EXT_INTER
@@ -1572,7 +1586,7 @@
 #if CONFIG_SUPERTX
                                        int supertx_enabled,
 #endif  // CONFIG_SUPERTX
-                                       int mi_row, int mi_col, vpx_reader *r) {
+                                       int mi_row, int mi_col, vp10_reader *r) {
   VP10_COMMON *const cm = &pbi->common;
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
@@ -1592,7 +1606,8 @@
 
 #if CONFIG_VAR_TX
     xd->above_txfm_context = cm->above_txfm_context + mi_col;
-    xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MI_MASK);
+    xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
     if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
         !mbmi->skip && inter_block) {
       const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
@@ -1668,14 +1683,14 @@
       if (inter_block) {
         if (eset > 0) {
           mbmi->tx_type =
-              vpx_read_tree(r, vp10_ext_tx_inter_tree[eset],
+              vp10_read_tree(r, vp10_ext_tx_inter_tree[eset],
                             cm->fc->inter_ext_tx_prob[eset][mbmi->tx_size]);
           if (counts)
             ++counts->inter_ext_tx[eset][mbmi->tx_size][mbmi->tx_type];
         }
       } else if (ALLOW_INTRA_EXT_TX) {
         if (eset > 0) {
-          mbmi->tx_type = vpx_read_tree(r, vp10_ext_tx_intra_tree[eset],
+          mbmi->tx_type = vp10_read_tree(r, vp10_ext_tx_intra_tree[eset],
                                         cm->fc->intra_ext_tx_prob[eset]
                                                 [mbmi->tx_size][mbmi->mode]);
           if (counts)
@@ -1695,14 +1710,14 @@
         !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
       FRAME_COUNTS *counts = xd->counts;
       if (inter_block) {
-        mbmi->tx_type = vpx_read_tree(
+        mbmi->tx_type = vp10_read_tree(
             r, vp10_ext_tx_tree,
             cm->fc->inter_ext_tx_prob[mbmi->tx_size]);
         if (counts)
           ++counts->inter_ext_tx[mbmi->tx_size][mbmi->tx_type];
       } else {
         const TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode];
-        mbmi->tx_type = vpx_read_tree(
+        mbmi->tx_type = vp10_read_tree(
             r, vp10_ext_tx_tree,
             cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom]);
         if (counts)
@@ -1719,7 +1734,7 @@
 #if CONFIG_SUPERTX
                          int supertx_enabled,
 #endif  // CONFIG_SUPERTX
-                         int mi_row, int mi_col, vpx_reader *r,
+                         int mi_row, int mi_col, vp10_reader *r,
                          int x_mis, int y_mis) {
   VP10_COMMON *const cm = &pbi->common;
   MODE_INFO *const mi = xd->mi[0];
diff --git a/vp10/decoder/decodemv.h b/vp10/decoder/decodemv.h
index 959a001..c10c6bf 100644
--- a/vp10/decoder/decodemv.h
+++ b/vp10/decoder/decodemv.h
@@ -11,7 +11,7 @@
 #ifndef VP10_DECODER_DECODEMV_H_
 #define VP10_DECODER_DECODEMV_H_
 
-#include "vpx_dsp/bitreader.h"
+#include "vp10/decoder/bitreader.h"
 
 #include "vp10/decoder/decoder.h"
 
@@ -24,7 +24,7 @@
                          int supertx_enabled,
 #endif
 
-                         int mi_row, int mi_col, vpx_reader *r,
+                         int mi_row, int mi_col, vp10_reader *r,
                          int x_mis, int y_mis);
 
 #ifdef __cplusplus
diff --git a/vp10/decoder/decoder.h b/vp10/decoder/decoder.h
index 23c7424..c349252 100644
--- a/vp10/decoder/decoder.h
+++ b/vp10/decoder/decoder.h
@@ -14,13 +14,10 @@
 #include "./vpx_config.h"
 
 #include "vpx/vpx_codec.h"
-#include "vpx_dsp/bitreader.h"
+#include "vp10/decoder/bitreader.h"
 #include "vpx_scale/yv12config.h"
 #include "vpx_util/vpx_thread.h"
 
-#if CONFIG_ANS
-#include "vp10/common/ans.h"
-#endif
 #include "vp10/common/thread_common.h"
 #include "vp10/common/onyxc_int.h"
 #include "vp10/common/ppflags.h"
@@ -33,10 +30,7 @@
 // TODO(hkuang): combine this with TileWorkerData.
 typedef struct TileData {
   VP10_COMMON *cm;
-  vpx_reader bit_reader;
-#if CONFIG_ANS
-  struct AnsDecoder token_ans;
-#endif  // CONFIG_ANS
+  vp10_reader bit_reader;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
   /* dqcoeff are shared by all the planes. So planes must be decoded serially */
   DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
@@ -45,10 +39,7 @@
 
 typedef struct TileWorkerData {
   struct VP10Decoder *pbi;
-  vpx_reader bit_reader;
-#if CONFIG_ANS
-  struct AnsDecoder token_ans;
-#endif  // CONFIG_ANS
+  vp10_reader bit_reader;
   FRAME_COUNTS counts;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
   /* dqcoeff are shared by all the planes. So planes must be decoded serially */
diff --git a/vp10/decoder/detokenize.c b/vp10/decoder/detokenize.c
index 343c7c8..def3d7a 100644
--- a/vp10/decoder/detokenize.c
+++ b/vp10/decoder/detokenize.c
@@ -38,10 +38,10 @@
   } while (0)
 
 #if !CONFIG_ANS
-static INLINE int read_coeff(const vpx_prob *probs, int n, vpx_reader *r) {
+static INLINE int read_coeff(const vpx_prob *probs, int n, vp10_reader *r) {
   int i, val = 0;
   for (i = 0; i < n; ++i)
-    val = (val << 1) | vpx_read(r, probs[i]);
+    val = (val << 1) | vp10_read(r, probs[i]);
   return val;
 }
 
@@ -50,7 +50,7 @@
                         tran_low_t *dqcoeff, TX_SIZE tx_size, TX_TYPE tx_type,
                         const int16_t *dq,
                         int ctx, const int16_t *scan, const int16_t *nb,
-                        vpx_reader *r) {
+                        vp10_reader *r) {
   FRAME_COUNTS *counts = xd->counts;
   const int max_eob = 16 << (tx_size << 1);
   const FRAME_CONTEXT *const fc = xd->fc;
@@ -120,12 +120,12 @@
     prob = coef_probs[band][ctx];
     if (counts)
       ++eob_branch_count[band][ctx];
-    if (!vpx_read(r, prob[EOB_CONTEXT_NODE])) {
+    if (!vp10_read(r, prob[EOB_CONTEXT_NODE])) {
       INCREMENT_COUNT(EOB_MODEL_TOKEN);
       break;
     }
 
-    while (!vpx_read(r, prob[ZERO_CONTEXT_NODE])) {
+    while (!vp10_read(r, prob[ZERO_CONTEXT_NODE])) {
       INCREMENT_COUNT(ZERO_TOKEN);
       dqv = dq[1];
       token_cache[scan[c]] = 0;
@@ -137,13 +137,13 @@
       prob = coef_probs[band][ctx];
     }
 
-    if (!vpx_read(r, prob[ONE_CONTEXT_NODE])) {
+    if (!vp10_read(r, prob[ONE_CONTEXT_NODE])) {
       INCREMENT_COUNT(ONE_TOKEN);
       token = ONE_TOKEN;
       val = 1;
     } else {
       INCREMENT_COUNT(TWO_TOKEN);
-      token = vpx_read_tree(r, vp10_coef_con_tree,
+      token = vp10_read_tree(r, vp10_coef_con_tree,
                             vp10_pareto8_full[prob[PIVOT_NODE] - 1]);
       switch (token) {
         case TWO_TOKEN:
@@ -194,13 +194,13 @@
     v = (val * dqv) >> dq_shift;
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
 #if CONFIG_VP9_HIGHBITDEPTH
-    dqcoeff[scan[c]] = highbd_check_range((vpx_read_bit(r) ? -v : v),
+    dqcoeff[scan[c]] = highbd_check_range((vp10_read_bit(r) ? -v : v),
                                           xd->bd);
 #else
-    dqcoeff[scan[c]] = check_range(vpx_read_bit(r) ? -v : v);
+    dqcoeff[scan[c]] = check_range(vp10_read_bit(r) ? -v : v);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #else
-    dqcoeff[scan[c]] = vpx_read_bit(r) ? -v : v;
+    dqcoeff[scan[c]] = vp10_read_bit(r) ? -v : v;
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
     token_cache[scan[c]] = vp10_pt_energy_class[token];
     ++c;
@@ -429,7 +429,7 @@
 }
 
 void vp10_decode_palette_tokens(MACROBLOCKD *const xd, int plane,
-                                vpx_reader *r) {
+                                vp10_reader *r) {
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
@@ -449,7 +449,7 @@
     for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
       color_ctx = vp10_get_palette_color_context(color_map, cols, i, j, n,
                                                  color_order);
-      color_idx = vpx_read_tree(r, vp10_palette_color_tree[n - 2],
+      color_idx = vp10_read_tree(r, vp10_palette_color_tree[n - 2],
                                 prob[n - 2][color_ctx]);
       assert(color_idx >= 0 && color_idx < n);
       color_map[i * cols + j] = color_order[color_idx];
@@ -468,7 +468,7 @@
 #if CONFIG_ANS
                              struct AnsDecoder *const r,
 #else
-                             vpx_reader *r,
+                             vp10_reader *r,
 #endif  // CONFIG_ANS
                              int seg_id) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
diff --git a/vp10/decoder/detokenize.h b/vp10/decoder/detokenize.h
index 1c9e22e..a19d90f 100644
--- a/vp10/decoder/detokenize.h
+++ b/vp10/decoder/detokenize.h
@@ -21,7 +21,7 @@
 #endif
 
 void vp10_decode_palette_tokens(MACROBLOCKD *const xd, int plane,
-                                vpx_reader *r);
+                                vp10_reader *r);
 int vp10_decode_block_tokens(MACROBLOCKD *const xd,
 #if CONFIG_ANS
                              const rans_dec_lut *const token_tab,
@@ -33,7 +33,7 @@
 #if CONFIG_ANS
                              struct AnsDecoder *const r,
 #else
-                             vpx_reader *r,
+                             vp10_reader *r,
 #endif  // CONFIG_ANS
                              int seg_id);
 
diff --git a/vp10/decoder/dsubexp.c b/vp10/decoder/dsubexp.c
index 7d2872e..4d53e12 100644
--- a/vp10/decoder/dsubexp.c
+++ b/vp10/decoder/dsubexp.c
@@ -21,11 +21,11 @@
   return (v & 1) ? m - ((v + 1) >> 1) : m + (v >> 1);
 }
 
-static int decode_uniform(vpx_reader *r) {
+static int decode_uniform(vp10_reader *r) {
   const int l = 8;
   const int m = (1 << l) - 190;
-  const int v = vpx_read_literal(r, l - 1);
-  return v < m ?  v : (v << 1) - m + vpx_read_bit(r);
+  const int v = vp10_read_literal(r, l - 1);
+  return v < m ?  v : (v << 1) - m + vp10_read_bit(r);
 }
 
 static int inv_remap_prob(int v, int m) {
@@ -58,18 +58,18 @@
   }
 }
 
-static int decode_term_subexp(vpx_reader *r) {
-  if (!vpx_read_bit(r))
-    return vpx_read_literal(r, 4);
-  if (!vpx_read_bit(r))
-    return vpx_read_literal(r, 4) + 16;
-  if (!vpx_read_bit(r))
-    return vpx_read_literal(r, 5) + 32;
+static int decode_term_subexp(vp10_reader *r) {
+  if (!vp10_read_bit(r))
+    return vp10_read_literal(r, 4);
+  if (!vp10_read_bit(r))
+    return vp10_read_literal(r, 4) + 16;
+  if (!vp10_read_bit(r))
+    return vp10_read_literal(r, 5) + 32;
   return decode_uniform(r) + 64;
 }
 
-void vp10_diff_update_prob(vpx_reader *r, vpx_prob* p) {
-  if (vpx_read(r, DIFF_UPDATE_PROB)) {
+void vp10_diff_update_prob(vp10_reader *r, vpx_prob* p) {
+  if (vp10_read(r, DIFF_UPDATE_PROB)) {
     const int delp = decode_term_subexp(r);
     *p = (vpx_prob)inv_remap_prob(delp, *p);
   }
diff --git a/vp10/decoder/dsubexp.h b/vp10/decoder/dsubexp.h
index 1a7ed99..c05ec6e 100644
--- a/vp10/decoder/dsubexp.h
+++ b/vp10/decoder/dsubexp.h
@@ -12,13 +12,13 @@
 #ifndef VP10_DECODER_DSUBEXP_H_
 #define VP10_DECODER_DSUBEXP_H_
 
-#include "vpx_dsp/bitreader.h"
+#include "vp10/decoder/bitreader.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void vp10_diff_update_prob(vpx_reader *r, vpx_prob* p);
+void vp10_diff_update_prob(vp10_reader *r, vpx_prob* p);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp10/encoder/aq_complexity.c b/vp10/encoder/aq_complexity.c
index 9f73ecc..a4c38d1 100644
--- a/vp10/encoder/aq_complexity.c
+++ b/vp10/encoder/aq_complexity.c
@@ -116,8 +116,6 @@
   VP10_COMMON *const cm = &cpi->common;
 
   const int mi_offset = mi_row * cm->mi_cols + mi_col;
-  const int bw = num_8x8_blocks_wide_lookup[BLOCK_LARGEST];
-  const int bh = num_8x8_blocks_high_lookup[BLOCK_LARGEST];
   const int xmis = VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[bs]);
   const int ymis = VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[bs]);
   int x, y;
@@ -130,7 +128,7 @@
     // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh).
     // It is converted to bits * 256 units.
     const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) /
-                            (bw * bh);
+                            (cm->mib_size * cm->mib_size);
     double logvar;
     double low_var_thresh;
     const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
diff --git a/vp10/encoder/aq_cyclicrefresh.c b/vp10/encoder/aq_cyclicrefresh.c
index defb974..057c057 100644
--- a/vp10/encoder/aq_cyclicrefresh.c
+++ b/vp10/encoder/aq_cyclicrefresh.c
@@ -388,8 +388,8 @@
   int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
   int xmis, ymis, x, y;
   memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols);
-  sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
-  sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+  sb_cols = (cm->mi_cols + cm->mib_size - 1) / cm->mib_size;
+  sb_rows = (cm->mi_rows + cm->mib_size - 1) / cm->mib_size;
   sbs_in_frame = sb_cols * sb_rows;
   // Number of target blocks to get the q delta (segment 1).
   block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
@@ -404,8 +404,8 @@
     // Get the mi_row/mi_col corresponding to superblock index i.
     int sb_row_index = (i / sb_cols);
     int sb_col_index = i - sb_row_index * sb_cols;
-    int mi_row = sb_row_index * MI_BLOCK_SIZE;
-    int mi_col = sb_col_index * MI_BLOCK_SIZE;
+    int mi_row = sb_row_index * cm->mib_size;
+    int mi_col = sb_col_index * cm->mib_size;
     int qindex_thresh =
         cpi->oxcf.content == VP9E_CONTENT_SCREEN
             ? vp10_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
@@ -413,11 +413,9 @@
     assert(mi_row >= 0 && mi_row < cm->mi_rows);
     assert(mi_col >= 0 && mi_col < cm->mi_cols);
     bl_index = mi_row * cm->mi_cols + mi_col;
-    // Loop through all 8x8 blocks in superblock and update map.
-    xmis =
-        VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[BLOCK_LARGEST]);
-    ymis =
-        VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_LARGEST]);
+    // Loop through all MI blocks in superblock and update map.
+    xmis = VPXMIN(cm->mi_cols - mi_col, cm->mib_size);
+    ymis = VPXMIN(cm->mi_rows - mi_row, cm->mib_size);
     for (y = 0; y < ymis; y++) {
       for (x = 0; x < xmis; x++) {
         const int bl_index2 = bl_index + y * cm->mi_cols + x;
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 721a7a6..12e7e71 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -93,16 +93,16 @@
     {{0, 1}, {2, 2}, {6, 3}, {7, 3}},  // Max tx_size is 32X32
 };
 
-static INLINE void write_uniform(vpx_writer *w, int n, int v) {
+static INLINE void write_uniform(vp10_writer *w, int n, int v) {
   int l = get_unsigned_bits(n);
   int m = (1 << l) - n;
   if (l == 0)
     return;
   if (v < m) {
-    vpx_write_literal(w, v, l - 1);
+    vp10_write_literal(w, v, l - 1);
   } else {
-    vpx_write_literal(w, m + ((v - m) >> 1), l - 1);
-    vpx_write_literal(w, (v - m) & 1, 1);
+    vp10_write_literal(w, m + ((v - m) >> 1), l - 1);
+    vp10_write_literal(w, (v - m) & 1, 1);
   }
 }
 
@@ -115,6 +115,9 @@
 #if CONFIG_EXT_INTRA
 static struct vp10_token intra_filter_encodings[INTRA_FILTERS];
 #endif  // CONFIG_EXT_INTRA
+#if CONFIG_EXT_INTER
+static struct vp10_token interintra_mode_encodings[INTERINTRA_MODES];
+#endif  // CONFIG_EXT_INTER
 
 void vp10_encode_token_init() {
 #if CONFIG_EXT_TX
@@ -131,15 +134,26 @@
 #if CONFIG_EXT_INTRA
   vp10_tokens_from_tree(intra_filter_encodings, vp10_intra_filter_tree);
 #endif  // CONFIG_EXT_INTRA
+#if CONFIG_EXT_INTER
+  vp10_tokens_from_tree(interintra_mode_encodings, vp10_interintra_mode_tree);
+#endif  // CONFIG_EXT_INTER
 }
 
-static void write_intra_mode(vpx_writer *w, PREDICTION_MODE mode,
+static void write_intra_mode(vp10_writer *w, PREDICTION_MODE mode,
                              const vpx_prob *probs) {
   vp10_write_token(w, vp10_intra_mode_tree, probs, &intra_mode_encodings[mode]);
 }
 
+#if CONFIG_EXT_INTER
+static void write_interintra_mode(vp10_writer *w, INTERINTRA_MODE mode,
+                                  const vpx_prob *probs) {
+  vp10_write_token(w, vp10_interintra_mode_tree, probs,
+                   &interintra_mode_encodings[mode]);
+}
+#endif  // CONFIG_EXT_INTER
+
 static void write_inter_mode(VP10_COMMON *cm,
-                             vpx_writer *w, PREDICTION_MODE mode,
+                             vp10_writer *w, PREDICTION_MODE mode,
 #if CONFIG_REF_MV && CONFIG_EXT_INTER
                              int is_compound,
 #endif  // CONFIG_REF_MV && CONFIG_EXT_INTER
@@ -148,14 +162,14 @@
   const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
   const vpx_prob newmv_prob = cm->fc->newmv_prob[newmv_ctx];
 #if CONFIG_EXT_INTER
-  vpx_write(w, mode != NEWMV && mode != NEWFROMNEARMV, newmv_prob);
+  vp10_write(w, mode != NEWMV && mode != NEWFROMNEARMV, newmv_prob);
 
   if (!is_compound && (mode == NEWMV || mode == NEWFROMNEARMV))
-    vpx_write(w, mode == NEWFROMNEARMV, cm->fc->new2mv_prob);
+    vp10_write(w, mode == NEWFROMNEARMV, cm->fc->new2mv_prob);
 
   if (mode != NEWMV && mode != NEWFROMNEARMV) {
 #else
-  vpx_write(w, mode != NEWMV, newmv_prob);
+  vp10_write(w, mode != NEWMV, newmv_prob);
 
   if (mode != NEWMV) {
 #endif  // CONFIG_EXT_INTER
@@ -167,7 +181,7 @@
       return;
     }
 
-    vpx_write(w, mode != ZEROMV, zeromv_prob);
+    vp10_write(w, mode != ZEROMV, zeromv_prob);
 
     if (mode != ZEROMV) {
       int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
@@ -181,7 +195,7 @@
         refmv_ctx = 8;
 
       refmv_prob = cm->fc->refmv_prob[refmv_ctx];
-      vpx_write(w, mode != NEARESTMV, refmv_prob);
+      vp10_write(w, mode != NEARESTMV, refmv_prob);
     }
   }
 #else
@@ -196,7 +210,7 @@
 static void write_drl_idx(const VP10_COMMON *cm,
                           const MB_MODE_INFO *mbmi,
                           const MB_MODE_INFO_EXT *mbmi_ext,
-                          vpx_writer *w) {
+                          vp10_writer *w) {
   uint8_t ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
 
   assert(mbmi->ref_mv_idx < 3);
@@ -209,7 +223,7 @@
             vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
         vpx_prob drl_prob = cm->fc->drl_prob[drl_ctx];
 
-        vpx_write(w, mbmi->ref_mv_idx != idx, drl_prob);
+        vp10_write(w, mbmi->ref_mv_idx != idx, drl_prob);
         if (mbmi->ref_mv_idx == idx)
           return;
       }
@@ -226,7 +240,7 @@
             vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
         vpx_prob drl_prob = cm->fc->drl_prob[drl_ctx];
 
-        vpx_write(w, mbmi->ref_mv_idx != (idx - 1), drl_prob);
+        vp10_write(w, mbmi->ref_mv_idx != (idx - 1), drl_prob);
         if (mbmi->ref_mv_idx == (idx - 1))
           return;
       }
@@ -237,7 +251,7 @@
 #endif
 
 #if CONFIG_EXT_INTER
-static void write_inter_compound_mode(VP10_COMMON *cm, vpx_writer *w,
+static void write_inter_compound_mode(VP10_COMMON *cm, vp10_writer *w,
                                       PREDICTION_MODE mode,
                                       const int16_t mode_ctx) {
   const vpx_prob *const inter_compound_probs =
@@ -257,7 +271,7 @@
 static void prob_diff_update(const vpx_tree_index *tree,
                              vpx_prob probs[/*n - 1*/],
                              const unsigned int counts[/*n - 1*/],
-                             int n, vpx_writer *w) {
+                             int n, vp10_writer *w) {
   int i;
   unsigned int branch_ct[32][2];
 
@@ -292,7 +306,7 @@
                                 const MACROBLOCKD *xd,
                                 const MB_MODE_INFO *mbmi,
                                 TX_SIZE tx_size, int blk_row, int blk_col,
-                                vpx_writer *w) {
+                                vp10_writer *w) {
   const int tx_row = blk_row >> 1;
   const int tx_col = blk_col >> 1;
   int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
@@ -310,14 +324,14 @@
      return;
 
   if (tx_size == mbmi->inter_tx_size[tx_row][tx_col]) {
-    vpx_write(w, 0, cm->fc->txfm_partition_prob[ctx]);
+    vp10_write(w, 0, cm->fc->txfm_partition_prob[ctx]);
     txfm_partition_update(xd->above_txfm_context + tx_col,
                           xd->left_txfm_context + tx_row, tx_size);
   } else {
     const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
     int bsl = b_width_log2_lookup[bsize];
     int i;
-    vpx_write(w, 1, cm->fc->txfm_partition_prob[ctx]);
+    vp10_write(w, 1, cm->fc->txfm_partition_prob[ctx]);
 
     if (tx_size == TX_8X8) {
       txfm_partition_update(xd->above_txfm_context + tx_col,
@@ -335,7 +349,7 @@
   }
 }
 
-static void update_txfm_partition_probs(VP10_COMMON *cm, vpx_writer *w,
+static void update_txfm_partition_probs(VP10_COMMON *cm, vp10_writer *w,
                                         FRAME_COUNTS *counts) {
   int k;
   for (k = 0; k < TXFM_PARTITION_CONTEXTS; ++k)
@@ -345,7 +359,7 @@
 #endif
 
 static void write_selected_tx_size(const VP10_COMMON *cm,
-                                   const MACROBLOCKD *xd, vpx_writer *w) {
+                                   const MACROBLOCKD *xd, vp10_writer *w) {
   TX_SIZE tx_size = xd->mi[0]->mbmi.tx_size;
   BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
@@ -358,7 +372,7 @@
 }
 
 #if CONFIG_REF_MV
-static void update_inter_mode_probs(VP10_COMMON *cm, vpx_writer *w,
+static void update_inter_mode_probs(VP10_COMMON *cm, vp10_writer *w,
                                     FRAME_COUNTS *counts) {
   int i;
   for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
@@ -380,7 +394,7 @@
 #endif
 
 #if CONFIG_EXT_INTER
-static void update_inter_compound_mode_probs(VP10_COMMON *cm, vpx_writer *w) {
+static void update_inter_compound_mode_probs(VP10_COMMON *cm, vp10_writer *w) {
   const int savings_thresh = vp10_cost_one(GROUP_DIFF_UPDATE_PROB) -
                              vp10_cost_zero(GROUP_DIFF_UPDATE_PROB);
   int i;
@@ -393,7 +407,7 @@
                                         INTER_COMPOUND_MODES);
   }
   do_update = savings > savings_thresh;
-  vpx_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+  vp10_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
   if (do_update) {
     for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
       prob_diff_update(vp10_inter_compound_mode_tree,
@@ -406,17 +420,17 @@
 #endif  // CONFIG_EXT_INTER
 
 static int write_skip(const VP10_COMMON *cm, const MACROBLOCKD *xd,
-                      int segment_id, const MODE_INFO *mi, vpx_writer *w) {
+                      int segment_id, const MODE_INFO *mi, vp10_writer *w) {
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
     const int skip = mi->mbmi.skip;
-    vpx_write(w, skip, vp10_get_skip_prob(cm, xd));
+    vp10_write(w, skip, vp10_get_skip_prob(cm, xd));
     return skip;
   }
 }
 
-static void update_skip_probs(VP10_COMMON *cm, vpx_writer *w,
+static void update_skip_probs(VP10_COMMON *cm, vp10_writer *w,
                               FRAME_COUNTS *counts) {
   int k;
 
@@ -424,7 +438,7 @@
     vp10_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k]);
 }
 
-static void update_switchable_interp_probs(VP10_COMMON *cm, vpx_writer *w,
+static void update_switchable_interp_probs(VP10_COMMON *cm, vp10_writer *w,
                                            FRAME_COUNTS *counts) {
   int j;
   for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
@@ -435,7 +449,7 @@
 
 
 #if CONFIG_EXT_TX
-static void update_ext_tx_probs(VP10_COMMON *cm, vpx_writer *w) {
+static void update_ext_tx_probs(VP10_COMMON *cm, vp10_writer *w) {
   const int savings_thresh = vp10_cost_one(GROUP_DIFF_UPDATE_PROB) -
                              vp10_cost_zero(GROUP_DIFF_UPDATE_PROB);
   int i, j;
@@ -450,7 +464,7 @@
           cm->counts.inter_ext_tx[s][i], num_ext_tx_set_inter[s]);
     }
     do_update = savings > savings_thresh;
-    vpx_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+    vp10_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
     if (do_update) {
       for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
         if (!use_inter_ext_tx_for_txsize[s][i]) continue;
@@ -473,7 +487,7 @@
             cm->counts.intra_ext_tx[s][i][j], num_ext_tx_set_intra[s]);
     }
     do_update = savings > savings_thresh;
-    vpx_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+    vp10_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
     if (do_update) {
       for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
         if (!use_intra_ext_tx_for_txsize[s][i]) continue;
@@ -489,7 +503,7 @@
 
 #else
 
-static void update_ext_tx_probs(VP10_COMMON *cm, vpx_writer *w) {
+static void update_ext_tx_probs(VP10_COMMON *cm, vp10_writer *w) {
   const int savings_thresh = vp10_cost_one(GROUP_DIFF_UPDATE_PROB) -
                              vp10_cost_zero(GROUP_DIFF_UPDATE_PROB);
   int i, j;
@@ -503,7 +517,7 @@
           cm->counts.intra_ext_tx[i][j], TX_TYPES);
   }
   do_update = savings > savings_thresh;
-  vpx_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+  vp10_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
   if (do_update) {
     for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
       for (j = 0; j < TX_TYPES; ++j)
@@ -521,7 +535,7 @@
         cm->counts.inter_ext_tx[i], TX_TYPES);
   }
   do_update = savings > savings_thresh;
-  vpx_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+  vp10_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
   if (do_update) {
     for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
       prob_diff_update(vp10_ext_tx_tree,
@@ -533,7 +547,7 @@
 }
 #endif  // CONFIG_EXT_TX
 
-static void pack_palette_tokens(vpx_writer *w, const TOKENEXTRA **tp,
+static void pack_palette_tokens(vp10_writer *w, const TOKENEXTRA **tp,
                                 int n, int num) {
   int i;
   const TOKENEXTRA *p = *tp;
@@ -548,7 +562,7 @@
 }
 
 #if CONFIG_SUPERTX
-static void update_supertx_probs(VP10_COMMON *cm, vpx_writer *w) {
+static void update_supertx_probs(VP10_COMMON *cm, vp10_writer *w) {
   const int savings_thresh = vp10_cost_one(GROUP_DIFF_UPDATE_PROB) -
                              vp10_cost_zero(GROUP_DIFF_UPDATE_PROB);
   int i, j;
@@ -561,7 +575,7 @@
     }
   }
   do_update = savings > savings_thresh;
-  vpx_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+  vp10_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
   if (do_update) {
     for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
       for (j = 1; j < TX_SIZES; ++j) {
@@ -574,7 +588,7 @@
 #endif  // CONFIG_SUPERTX
 
 #if !CONFIG_ANS
-static void pack_mb_tokens(vpx_writer *w,
+static void pack_mb_tokens(vp10_writer *w,
                            const TOKENEXTRA **tp, const TOKENEXTRA *const stop,
                            vpx_bit_depth_t bit_depth, const TX_SIZE tx) {
   const TOKENEXTRA *p = *tp;
@@ -605,13 +619,13 @@
     if (p->skip_eob_node)
       n -= p->skip_eob_node;
     else
-      vpx_write(w, t != EOB_TOKEN, p->context_tree[0]);
+      vp10_write(w, t != EOB_TOKEN, p->context_tree[0]);
 
     if (t != EOB_TOKEN) {
-      vpx_write(w, t != ZERO_TOKEN, p->context_tree[1]);
+      vp10_write(w, t != ZERO_TOKEN, p->context_tree[1]);
 
       if (t != ZERO_TOKEN) {
-        vpx_write(w, t != ONE_TOKEN, p->context_tree[2]);
+        vp10_write(w, t != ONE_TOKEN, p->context_tree[2]);
 
         if (t != ONE_TOKEN) {
           int len = UNCONSTRAINED_NODES - p->skip_eob_node;
@@ -639,13 +653,13 @@
             skip_bits--;
             assert(!bb);
           } else {
-            vpx_write(w, bb, pb[i >> 1]);
+            vp10_write(w, bb, pb[i >> 1]);
           }
           i = b->tree[i + bb];
         } while (n);
       }
 
-      vpx_write_bit(w, e & 1);
+      vp10_write_bit(w, e & 1);
     }
     ++p;
 
@@ -742,7 +756,7 @@
 #endif  // !CONFIG_ANS
 
 #if CONFIG_VAR_TX
-static void pack_txb_tokens(vpx_writer *w,
+static void pack_txb_tokens(vp10_writer *w,
                            const TOKENEXTRA **tp,
                            const TOKENEXTRA *const tok_end,
                            MACROBLOCKD *xd, MB_MODE_INFO *mbmi, int plane,
@@ -793,7 +807,7 @@
 }
 #endif
 
-static void write_segment_id(vpx_writer *w, const struct segmentation *seg,
+static void write_segment_id(vp10_writer *w, const struct segmentation *seg,
                              const struct segmentation_probs *segp,
                              int segment_id) {
   if (seg->enabled && seg->update_map)
@@ -802,7 +816,7 @@
 
 // This function encodes the reference frame
 static void write_ref_frames(const VP10_COMMON *cm, const MACROBLOCKD *xd,
-                             vpx_writer *w) {
+                             vp10_writer *w) {
   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const int is_compound = has_second_ref(mbmi);
   const int segment_id = mbmi->segment_id;
@@ -817,9 +831,9 @@
     // does the feature use compound prediction or not
     // (if not specified at the frame/segment level)
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
-      vpx_write(w, is_compound, vp10_get_reference_mode_prob(cm, xd));
+      vp10_write(w, is_compound, vp10_get_reference_mode_prob(cm, xd));
     } else {
-      assert(!is_compound == (cm->reference_mode == SINGLE_REFERENCE));
+      assert((!is_compound) == (cm->reference_mode == SINGLE_REFERENCE));
     }
 
     if (is_compound) {
@@ -830,18 +844,18 @@
 #else
       const int bit = mbmi->ref_frame[0] == GOLDEN_FRAME;
 #endif  // CONFIG_EXT_REFS
-      vpx_write(w, bit, vp10_get_pred_prob_comp_ref_p(cm, xd));
+      vp10_write(w, bit, vp10_get_pred_prob_comp_ref_p(cm, xd));
 
 #if CONFIG_EXT_REFS
       if (!bit) {
         const int bit1 = mbmi->ref_frame[0] == LAST_FRAME;
-        vpx_write(w, bit1, vp10_get_pred_prob_comp_ref_p1(cm, xd));
+        vp10_write(w, bit1, vp10_get_pred_prob_comp_ref_p1(cm, xd));
       } else {
         const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME;
-        vpx_write(w, bit2, vp10_get_pred_prob_comp_ref_p2(cm, xd));
+        vp10_write(w, bit2, vp10_get_pred_prob_comp_ref_p2(cm, xd));
         if (!bit2) {
           const int bit3 = mbmi->ref_frame[0] == LAST3_FRAME;
-          vpx_write(w, bit3, vp10_get_pred_prob_comp_ref_p3(cm, xd));
+          vp10_write(w, bit3, vp10_get_pred_prob_comp_ref_p3(cm, xd));
         }
       }
 #endif  // CONFIG_EXT_REFS
@@ -849,30 +863,30 @@
 #if CONFIG_EXT_REFS
       const int bit0 = (mbmi->ref_frame[0] == GOLDEN_FRAME ||
                         mbmi->ref_frame[0] == ALTREF_FRAME);
-      vpx_write(w, bit0, vp10_get_pred_prob_single_ref_p1(cm, xd));
+      vp10_write(w, bit0, vp10_get_pred_prob_single_ref_p1(cm, xd));
 
       if (bit0) {
         const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME;
-        vpx_write(w, bit1, vp10_get_pred_prob_single_ref_p2(cm, xd));
+        vp10_write(w, bit1, vp10_get_pred_prob_single_ref_p2(cm, xd));
       } else {
         const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME ||
                           mbmi->ref_frame[0] == LAST4_FRAME);
-        vpx_write(w, bit2, vp10_get_pred_prob_single_ref_p3(cm, xd));
+        vp10_write(w, bit2, vp10_get_pred_prob_single_ref_p3(cm, xd));
 
         if (!bit2) {
           const int bit3 = mbmi->ref_frame[0] != LAST_FRAME;
-          vpx_write(w, bit3, vp10_get_pred_prob_single_ref_p4(cm, xd));
+          vp10_write(w, bit3, vp10_get_pred_prob_single_ref_p4(cm, xd));
         } else {
           const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME;
-          vpx_write(w, bit4, vp10_get_pred_prob_single_ref_p5(cm, xd));
+          vp10_write(w, bit4, vp10_get_pred_prob_single_ref_p5(cm, xd));
         }
       }
 #else
       const int bit0 = mbmi->ref_frame[0] != LAST_FRAME;
-      vpx_write(w, bit0, vp10_get_pred_prob_single_ref_p1(cm, xd));
+      vp10_write(w, bit0, vp10_get_pred_prob_single_ref_p1(cm, xd));
       if (bit0) {
         const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME;
-        vpx_write(w, bit1, vp10_get_pred_prob_single_ref_p2(cm, xd));
+        vp10_write(w, bit1, vp10_get_pred_prob_single_ref_p2(cm, xd));
       }
 #endif  // CONFIG_EXT_REFS
     }
@@ -882,13 +896,13 @@
 #if CONFIG_EXT_INTRA
 static void write_ext_intra_mode_info(const VP10_COMMON *const cm,
                                       const MB_MODE_INFO *const mbmi,
-                                      vpx_writer *w) {
+                                      vp10_writer *w) {
 #if !ALLOW_FILTER_INTRA_MODES
   return;
 #endif
   if (mbmi->mode == DC_PRED &&
       mbmi->palette_mode_info.palette_size[0] == 0) {
-    vpx_write(w, mbmi->ext_intra_mode_info.use_ext_intra_mode[0],
+    vp10_write(w, mbmi->ext_intra_mode_info.use_ext_intra_mode[0],
               cm->fc->ext_intra_probs[0]);
     if (mbmi->ext_intra_mode_info.use_ext_intra_mode[0]) {
       EXT_INTRA_MODE mode = mbmi->ext_intra_mode_info.ext_intra_mode[0];
@@ -897,7 +911,7 @@
   }
   if (mbmi->uv_mode == DC_PRED &&
       mbmi->palette_mode_info.palette_size[1] == 0) {
-    vpx_write(w, mbmi->ext_intra_mode_info.use_ext_intra_mode[1],
+    vp10_write(w, mbmi->ext_intra_mode_info.use_ext_intra_mode[1],
               cm->fc->ext_intra_probs[1]);
     if (mbmi->ext_intra_mode_info.use_ext_intra_mode[1]) {
       EXT_INTRA_MODE mode = mbmi->ext_intra_mode_info.ext_intra_mode[1];
@@ -909,7 +923,7 @@
 
 static void write_switchable_interp_filter(VP10_COMP *cpi,
                                            const MACROBLOCKD *xd,
-                                           vpx_writer *w) {
+                                           vp10_writer *w) {
   VP10_COMMON *const cm = &cpi->common;
   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   if (cm->interp_filter == SWITCHABLE) {
@@ -930,7 +944,7 @@
 static void write_palette_mode_info(const VP10_COMMON *cm,
                                     const MACROBLOCKD *xd,
                                     const MODE_INFO *const mi,
-                                    vpx_writer *w) {
+                                    vp10_writer *w) {
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
   const MODE_INFO *const above_mi = xd->above_mi;
   const MODE_INFO *const left_mi = xd->left_mi;
@@ -945,30 +959,30 @@
       palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
     if (left_mi)
       palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-    vpx_write(w, n > 0,
+    vp10_write(w, n > 0,
               vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx]);
     if (n > 0) {
       vp10_write_token(w, vp10_palette_size_tree,
                        vp10_default_palette_y_size_prob[bsize - BLOCK_8X8],
                        &palette_size_encodings[n - 2]);
       for (i = 0; i < n; ++i)
-        vpx_write_literal(w, pmi->palette_colors[i], cm->bit_depth);
+        vp10_write_literal(w, pmi->palette_colors[i], cm->bit_depth);
       write_uniform(w, n, pmi->palette_first_color_idx[0]);
     }
   }
 
   if (mbmi->uv_mode == DC_PRED) {
     n = pmi->palette_size[1];
-    vpx_write(w, n > 0,
+    vp10_write(w, n > 0,
               vp10_default_palette_uv_mode_prob[pmi->palette_size[0] > 0]);
     if (n > 0) {
       vp10_write_token(w, vp10_palette_size_tree,
                        vp10_default_palette_uv_size_prob[bsize - BLOCK_8X8],
                        &palette_size_encodings[n - 2]);
       for (i = 0; i < n; ++i) {
-        vpx_write_literal(w, pmi->palette_colors[PALETTE_MAX_SIZE + i],
+        vp10_write_literal(w, pmi->palette_colors[PALETTE_MAX_SIZE + i],
                           cm->bit_depth);
-        vpx_write_literal(w, pmi->palette_colors[2 * PALETTE_MAX_SIZE + i],
+        vp10_write_literal(w, pmi->palette_colors[2 * PALETTE_MAX_SIZE + i],
                           cm->bit_depth);
       }
       write_uniform(w, n, pmi->palette_first_color_idx[1]);
@@ -980,7 +994,7 @@
 #if CONFIG_SUPERTX
                                 int supertx_enabled,
 #endif
-                                vpx_writer *w) {
+                                vp10_writer *w) {
   VP10_COMMON *const cm = &cpi->common;
 #if !CONFIG_REF_MV
   const nmv_context *nmvc = &cm->fc->nmvc;
@@ -1003,7 +1017,7 @@
     if (seg->temporal_update) {
       const int pred_flag = mbmi->seg_id_predicted;
       vpx_prob pred_prob = vp10_get_pred_prob_seg_id(segp, xd);
-      vpx_write(w, pred_flag, pred_prob);
+      vp10_write(w, pred_flag, pred_prob);
       if (!pred_flag)
         write_segment_id(w, seg, segp, segment_id);
     } else {
@@ -1024,7 +1038,7 @@
   if (!supertx_enabled)
 #endif  // CONFIG_SUPERTX
     if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
-      vpx_write(w, is_inter, vp10_get_intra_inter_prob(cm, xd));
+      vp10_write(w, is_inter, vp10_get_intra_inter_prob(cm, xd));
 
   if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
 #if CONFIG_SUPERTX
@@ -1106,7 +1120,7 @@
     if (!supertx_enabled)
 #endif  // CONFIG_SUPERTX
       if (is_obmc_allowed(mbmi))
-        vpx_write(w, mbmi->obmc, cm->fc->obmc_prob[bsize]);
+        vp10_write(w, mbmi->obmc, cm->fc->obmc_prob[bsize]);
 #endif  // CONFIG_OBMC
 
 #if CONFIG_REF_MV
@@ -1280,16 +1294,18 @@
 #endif  // CONFIG_SUPERTX
         is_interintra_allowed(mbmi)) {
       const int interintra = mbmi->ref_frame[1] == INTRA_FRAME;
-      vpx_write(w, interintra, cm->fc->interintra_prob[bsize]);
+      const int bsize_group = size_group_lookup[bsize];
+      vp10_write(w, interintra, cm->fc->interintra_prob[bsize_group]);
       if (interintra) {
-        write_intra_mode(w, mbmi->interintra_mode,
-                         cm->fc->y_mode_prob[size_group_lookup[bsize]]);
+        write_interintra_mode(
+            w, mbmi->interintra_mode,
+            cm->fc->interintra_mode_prob[bsize_group]);
         assert(mbmi->interintra_mode == mbmi->interintra_uv_mode);
-        if (get_wedge_bits(bsize)) {
-          vpx_write(w, mbmi->use_wedge_interintra,
-                    cm->fc->wedge_interintra_prob[bsize]);
+        if (is_interintra_wedge_used(bsize)) {
+          vp10_write(w, mbmi->use_wedge_interintra,
+                     cm->fc->wedge_interintra_prob[bsize]);
           if (mbmi->use_wedge_interintra) {
-            vpx_write_literal(w, mbmi->interintra_wedge_index,
+            vp10_write_literal(w, mbmi->interintra_wedge_index,
                               get_wedge_bits(bsize));
           }
         }
@@ -1300,11 +1316,11 @@
 #if CONFIG_OBMC
         !(is_obmc_allowed(mbmi) && mbmi->obmc) &&
 #endif  // CONFIG_OBMC
-        get_wedge_bits(bsize)) {
-      vpx_write(w, mbmi->use_wedge_interinter,
-                cm->fc->wedge_interinter_prob[bsize]);
+        is_interinter_wedge_used(bsize)) {
+      vp10_write(w, mbmi->use_wedge_interinter,
+                 cm->fc->wedge_interinter_prob[bsize]);
       if (mbmi->use_wedge_interinter)
-        vpx_write_literal(w, mbmi->interinter_wedge_index,
+        vp10_write_literal(w, mbmi->interinter_wedge_index,
                           get_wedge_bits(bsize));
     }
 #endif  // CONFIG_EXT_INTER
@@ -1368,7 +1384,7 @@
 }
 
 static void write_mb_modes_kf(const VP10_COMMON *cm, const MACROBLOCKD *xd,
-                              MODE_INFO **mi_8x8, vpx_writer *w) {
+                              MODE_INFO **mi_8x8, vp10_writer *w) {
   const struct segmentation *const seg = &cm->seg;
   const struct segmentation_probs *const segp = &cm->fc->seg;
   const MODE_INFO *const mi = mi_8x8[0];
@@ -1460,30 +1476,18 @@
 #endif  // CONFIG_EXT_INTRA
 }
 
-#if CONFIG_ANS && CONFIG_SUPERTX
-#define write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end,            \
-                              supertx_enabled, mi_row, mi_col)            \
-  write_modes_b(cpi, tile, w, ans, tok, tok_end, supertx_enabled, mi_row, \
-                mi_col)
-#elif CONFIG_SUPERTX
-#define write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end, \
+#if CONFIG_SUPERTX
+#define write_modes_b_wrapper(cpi, tile, w, tok, tok_end,      \
                               supertx_enabled, mi_row, mi_col) \
   write_modes_b(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col)
-#elif CONFIG_ANS
-#define write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end, \
-                              supertx_enabled, mi_row, mi_col) \
-  write_modes_b(cpi, tile, w, ans, tok, tok_end, mi_row, mi_col)
 #else
-#define write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end, \
+#define write_modes_b_wrapper(cpi, tile, w, tok, tok_end,      \
                               supertx_enabled, mi_row, mi_col) \
   write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col)
 #endif  // CONFIG_ANS && CONFIG_SUPERTX
 
 static void write_modes_b(VP10_COMP *cpi, const TileInfo *const tile,
-                          vpx_writer *w,
-#if CONFIG_ANS
-                          struct BufAnsCoder *ans,
-#endif  // CONFIG_ANS
+                          vp10_writer *w,
                           const TOKENEXTRA **tok,
                           const TOKENEXTRA *const tok_end,
 #if CONFIG_SUPERTX
@@ -1494,6 +1498,7 @@
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   MODE_INFO *m;
   int plane;
+  int bh, bw;
 #if CONFIG_ANS
   (void) tok;
   (void) tok_end;
@@ -1503,18 +1508,21 @@
   xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
   m = xd->mi[0];
 
+  assert(m->mbmi.sb_type <= cm->sb_size);
+
+  bh = num_8x8_blocks_high_lookup[m->mbmi.sb_type];
+  bw = num_8x8_blocks_wide_lookup[m->mbmi.sb_type];
+
   cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
 
-  set_mi_row_col(xd, tile,
-                 mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
-                 mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type],
-                 cm->mi_rows, cm->mi_cols);
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
   if (frame_is_intra_only(cm)) {
     write_mb_modes_kf(cm, xd, xd->mi, w);
   } else {
 #if CONFIG_VAR_TX
     xd->above_txfm_context = cm->above_txfm_context + mi_col;
-    xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MI_MASK);
+    xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
 #endif
     pack_inter_mode_mvs(cpi, m,
 #if CONFIG_SUPERTX
@@ -1576,7 +1584,7 @@
         for (row = 0; row < num_4x4_h; row += bw)
           for (col = 0; col < num_4x4_w; col += bw)
 #if CONFIG_ANS
-            pack_mb_tokens_ans(ans, cm->token_tab, tok, tok_end, cm->bit_depth,
+            pack_mb_tokens_ans(w, cm->token_tab, tok, tok_end, cm->bit_depth,
                                tx);
 #else
             pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
@@ -1586,7 +1594,7 @@
       TX_SIZE tx = plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane])
                          : m->mbmi.tx_size;
 #if CONFIG_ANS
-      pack_mb_tokens_ans(ans, cm->token_tab, tok, tok_end, cm->bit_depth, tx);
+      pack_mb_tokens_ans(w, cm->token_tab, tok, tok_end, cm->bit_depth, tx);
 #else
       pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
 #endif  // CONFIG_ANS
@@ -1600,7 +1608,8 @@
 static void write_partition(const VP10_COMMON *const cm,
                             const MACROBLOCKD *const xd,
                             int hbs, int mi_row, int mi_col,
-                            PARTITION_TYPE p, BLOCK_SIZE bsize, vpx_writer *w) {
+                            PARTITION_TYPE p, BLOCK_SIZE bsize,
+                            vp10_writer *w) {
   const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
   const vpx_prob *const probs = cm->fc->partition_prob[ctx];
   const int has_rows = (mi_row + hbs) < cm->mi_rows;
@@ -1618,41 +1627,29 @@
 #endif  // CONFIG_EXT_PARTITION_TYPES
   } else if (!has_rows && has_cols) {
     assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
-    vpx_write(w, p == PARTITION_SPLIT, probs[1]);
+    vp10_write(w, p == PARTITION_SPLIT, probs[1]);
   } else if (has_rows && !has_cols) {
     assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
-    vpx_write(w, p == PARTITION_SPLIT, probs[2]);
+    vp10_write(w, p == PARTITION_SPLIT, probs[2]);
   } else {
     assert(p == PARTITION_SPLIT);
   }
 }
 
-#if CONFIG_ANS && CONFIG_SUPERTX
-#define write_modes_sb_wrapper(cpi, tile, w, ans, tok, tok_end,            \
-                               supertx_enabled, mi_row, mi_col, bsize)     \
-  write_modes_sb(cpi, tile, w, ans, tok, tok_end, supertx_enabled, mi_row, \
-                 mi_col, bsize)
-#elif CONFIG_SUPERTX
-#define write_modes_sb_wrapper(cpi, tile, w, ans, tok, tok_end,               \
+#if CONFIG_SUPERTX
+#define write_modes_sb_wrapper(cpi, tile, w, tok, tok_end,                    \
                                supertx_enabled, mi_row, mi_col, bsize)        \
   write_modes_sb(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col, \
                  bsize)
-#elif CONFIG_ANS
-#define write_modes_sb_wrapper(cpi, tile, w, ans, tok, tok_end,        \
-                               supertx_enabled, mi_row, mi_col, bsize) \
-  write_modes_sb(cpi, tile, w, ans, tok, tok_end, mi_row, mi_col, bsize)
 #else
-#define write_modes_sb_wrapper(cpi, tile, w, ans, tok, tok_end,        \
+#define write_modes_sb_wrapper(cpi, tile, w, tok, tok_end,             \
                                supertx_enabled, mi_row, mi_col, bsize) \
   write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, bsize)
 #endif  // CONFIG_ANS && CONFIG_SUPERTX
 
 static void write_modes_sb(VP10_COMP *const cpi,
                            const TileInfo *const tile,
-                           vpx_writer *const w,
-#if CONFIG_ANS
-                           struct BufAnsCoder *ans,
-#endif  // CONFIG_ANS
+                           vp10_writer *const w,
                            const TOKENEXTRA **tok,
                            const TOKENEXTRA *const tok_end,
 #if CONFIG_SUPERTX
@@ -1661,13 +1658,12 @@
                            int mi_row, int mi_col, BLOCK_SIZE bsize) {
   const VP10_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
-
-  const int bsl = b_width_log2_lookup[bsize];
-  const int bs = (1 << bsl) / 4;
-  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize;
-  MODE_INFO *m = NULL;
+  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+  const BLOCK_SIZE subsize =  get_subsize(bsize, partition);
 #if CONFIG_SUPERTX
+  const int mi_offset = mi_row * cm->mi_stride + mi_col;
+  MB_MODE_INFO *mbmi;
   const int pack_token = !supertx_enabled;
   TX_SIZE supertx_size;
   int plane;
@@ -1676,17 +1672,10 @@
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
-
-  partition = partition_lookup[bsl][m->mbmi.sb_type];
-#if CONFIG_EXT_PARTITION_TYPES
-  partition = get_partition(cm->mi, cm->mi_stride, cm->mi_rows, cm->mi_cols,
-                            mi_row, mi_col, bsize);
-#endif
-  write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
-  subsize = get_subsize(bsize, partition);
+  write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w);
 #if CONFIG_SUPERTX
-  xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
+  mbmi = &cm->mi_grid_visible[mi_offset]->mbmi;
+  xd->mi = cm->mi_grid_visible + mi_offset;
   set_mi_row_col(xd, tile,
                  mi_row, num_8x8_blocks_high_lookup[bsize],
                  mi_col, num_8x8_blocks_wide_lookup[bsize],
@@ -1700,9 +1689,9 @@
     prob = cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
                                [supertx_size];
     supertx_enabled = (xd->mi[0]->mbmi.tx_size == supertx_size);
-    vpx_write(w, supertx_enabled, prob);
+    vp10_write(w, supertx_enabled, prob);
     if (supertx_enabled) {
-      vpx_write(w, xd->mi[0]->mbmi.skip, vp10_get_skip_prob(cm, xd));
+      vp10_write(w, xd->mi[0]->mbmi.skip, vp10_get_skip_prob(cm, xd));
 #if CONFIG_EXT_TX
       if (get_ext_tx_types(supertx_size, bsize, 1) > 1 &&
           !xd->mi[0]->mbmi.skip) {
@@ -1726,106 +1715,70 @@
   }
 #endif  // CONFIG_SUPERTX
   if (subsize < BLOCK_8X8) {
-    write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end, supertx_enabled,
+    write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
                           mi_row, mi_col);
   } else {
     switch (partition) {
       case PARTITION_NONE:
-        write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end, supertx_enabled,
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
                               mi_row, mi_col);
         break;
       case PARTITION_HORZ:
-        write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end, supertx_enabled,
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
                               mi_row, mi_col);
-        if (mi_row + bs < cm->mi_rows)
-          write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end,
-                                supertx_enabled, mi_row + bs, mi_col);
+        if (mi_row + hbs < cm->mi_rows)
+          write_modes_b_wrapper(cpi, tile, w, tok, tok_end,
+                                supertx_enabled, mi_row + hbs, mi_col);
         break;
       case PARTITION_VERT:
-        write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end, supertx_enabled,
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
                               mi_row, mi_col);
-        if (mi_col + bs < cm->mi_cols)
-          write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end,
-                                supertx_enabled, mi_row, mi_col + bs);
+        if (mi_col + hbs < cm->mi_cols)
+          write_modes_b_wrapper(cpi, tile, w, tok, tok_end,
+                                supertx_enabled, mi_row, mi_col + hbs);
         break;
       case PARTITION_SPLIT:
-        write_modes_sb_wrapper(cpi, tile, w, ans, tok, tok_end, supertx_enabled,
+        write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
                                mi_row, mi_col, subsize);
-        write_modes_sb_wrapper(cpi, tile, w, ans, tok, tok_end, supertx_enabled,
-                               mi_row, mi_col + bs, subsize);
-        write_modes_sb_wrapper(cpi, tile, w, ans, tok, tok_end, supertx_enabled,
-                               mi_row + bs, mi_col, subsize);
-        write_modes_sb_wrapper(cpi, tile, w, ans, tok, tok_end, supertx_enabled,
-                               mi_row + bs, mi_col + bs, subsize);
+        write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                               mi_row, mi_col + hbs, subsize);
+        write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                               mi_row + hbs, mi_col, subsize);
+        write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                               mi_row + hbs, mi_col + hbs, subsize);
         break;
 #if CONFIG_EXT_PARTITION_TYPES
       case PARTITION_HORZ_A:
-        write_modes_b(cpi, tile, w, tok, tok_end,
-#if CONFIG_SUPERTX
-                      supertx_enabled,
-#endif
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
                       mi_row, mi_col);
-        write_modes_b(cpi, tile, w, tok, tok_end,
-#if CONFIG_SUPERTX
-                      supertx_enabled,
-#endif
-                      mi_row, mi_col + bs);
-        write_modes_b(cpi, tile, w, tok, tok_end,
-#if CONFIG_SUPERTX
-                      supertx_enabled,
-#endif
-                      mi_row + bs, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                      mi_row, mi_col + hbs);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                      mi_row + hbs, mi_col);
         break;
       case PARTITION_HORZ_B:
-        write_modes_b(cpi, tile, w, tok, tok_end,
-#if CONFIG_SUPERTX
-                      supertx_enabled,
-#endif
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
                       mi_row, mi_col);
-        write_modes_b(cpi, tile, w, tok, tok_end,
-#if CONFIG_SUPERTX
-                      supertx_enabled,
-#endif
-                      mi_row + bs, mi_col);
-        write_modes_b(cpi, tile, w, tok, tok_end,
-#if CONFIG_SUPERTX
-                      supertx_enabled,
-#endif
-                      mi_row + bs, mi_col + bs);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                      mi_row + hbs, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                      mi_row + hbs, mi_col + hbs);
         break;
       case PARTITION_VERT_A:
-        write_modes_b(cpi, tile, w, tok, tok_end,
-#if CONFIG_SUPERTX
-                      supertx_enabled,
-#endif
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
                       mi_row, mi_col);
-        write_modes_b(cpi, tile, w, tok, tok_end,
-#if CONFIG_SUPERTX
-                      supertx_enabled,
-#endif
-                      mi_row + bs, mi_col);
-        write_modes_b(cpi, tile, w, tok, tok_end,
-#if CONFIG_SUPERTX
-                      supertx_enabled,
-#endif
-                      mi_row, mi_col + bs);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                      mi_row + hbs, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                      mi_row, mi_col + hbs);
         break;
       case PARTITION_VERT_B:
-        write_modes_b(cpi, tile, w, tok, tok_end,
-#if CONFIG_SUPERTX
-                      supertx_enabled,
-#endif
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
                       mi_row, mi_col);
-        write_modes_b(cpi, tile, w, tok, tok_end,
-#if CONFIG_SUPERTX
-                      supertx_enabled,
-#endif
-                      mi_row, mi_col + bs);
-        write_modes_b(cpi, tile, w, tok, tok_end,
-#if CONFIG_SUPERTX
-                      supertx_enabled,
-#endif
-                      mi_row + bs, mi_col + bs);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                      mi_row, mi_col + hbs);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                      mi_row + hbs, mi_col + hbs);
         break;
 #endif  // CONFIG_EXT_PARTITION_TYPES
       default:
@@ -1834,22 +1787,22 @@
   }
 #if CONFIG_SUPERTX
   if (partition != PARTITION_NONE && supertx_enabled && pack_token &&
-      !m->mbmi.skip) {
+      !mbmi->skip) {
     assert(*tok < tok_end);
     for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-      const int mbmi_txb_size = txsize_to_bsize[m->mbmi.tx_size];
+      const int mbmi_txb_size = txsize_to_bsize[mbmi->tx_size];
       const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi_txb_size];
       const int num_4x4_h = num_4x4_blocks_high_lookup[mbmi_txb_size];
       int row, col;
-      TX_SIZE tx = plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane])
-                         : m->mbmi.tx_size;
+      TX_SIZE tx = plane ? get_uv_tx_size(mbmi, &xd->plane[plane])
+                         : mbmi->tx_size;
       BLOCK_SIZE txb_size = txsize_to_bsize[tx];
       int bw = num_4x4_blocks_wide_lookup[txb_size];
 
       for (row = 0; row < num_4x4_h; row += bw)
         for (col = 0; col < num_4x4_w; col += bw)
 #if CONFIG_ANS
-          pack_mb_tokens_ans(ans, cm->token_tab, tok, tok_end, cm->bit_depth,
+          pack_mb_tokens_ans(w, cm->token_tab, tok, tok_end, cm->bit_depth,
                              tx);
 #else
           pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
@@ -1872,10 +1825,7 @@
 
 static void write_modes(VP10_COMP *const cpi,
                         const TileInfo *const tile,
-                        vpx_writer *const w,
-#if CONFIG_ANS
-                        struct BufAnsCoder *ans,
-#endif  // CONFIG_ANS
+                        vp10_writer *const w,
                         const TOKENEXTRA **tok,
                         const TOKENEXTRA *const tok_end) {
   VP10_COMMON *const cm = &cpi->common;
@@ -1888,12 +1838,12 @@
 
   vp10_zero_above_context(cm, mi_col_start, mi_col_end);
 
-  for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE) {
+  for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += cm->mib_size) {
     vp10_zero_left_context(xd);
 
-    for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) {
-      write_modes_sb_wrapper(cpi, tile, w, ans, tok, tok_end, 0,
-                             mi_row, mi_col, BLOCK_LARGEST);
+    for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += cm->mib_size) {
+      write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, 0,
+                             mi_row, mi_col, cm->sb_size);
     }
   }
 }
@@ -1925,7 +1875,7 @@
   }
 }
 
-static void update_coef_probs_common(vpx_writer* const bc, VP10_COMP *cpi,
+static void update_coef_probs_common(vp10_writer* const bc, VP10_COMP *cpi,
                                      TX_SIZE tx_size,
                                      vp10_coeff_stats *frame_branch_ct,
                                      vp10_coeff_probs_model *new_coef_probs) {
@@ -1971,10 +1921,10 @@
 
       /* Is coef updated at all */
       if (update[1] == 0 || savings < 0) {
-        vpx_write_bit(bc, 0);
+        vp10_write_bit(bc, 0);
         return;
       }
-      vpx_write_bit(bc, 1);
+      vp10_write_bit(bc, 1);
       for (i = 0; i < PLANE_TYPES; ++i) {
         for (j = 0; j < REF_TYPES; ++j) {
           for (k = 0; k < COEF_BANDS; ++k) {
@@ -1996,7 +1946,7 @@
                       *oldp, &newp, upd);
                 if (s > 0 && newp != *oldp)
                   u = 1;
-                vpx_write(bc, u, upd);
+                vp10_write(bc, u, upd);
                 if (u) {
                   /* send/use new probability */
                   vp10_write_prob_diff_update(bc, newp, *oldp);
@@ -2044,11 +1994,11 @@
                 if (u == 1 && updates == 1) {
                   int v;
                   // first update
-                  vpx_write_bit(bc, 1);
+                  vp10_write_bit(bc, 1);
                   for (v = 0; v < noupdates_before_first; ++v)
-                    vpx_write(bc, 0, upd);
+                    vp10_write(bc, 0, upd);
                 }
-                vpx_write(bc, u, upd);
+                vp10_write(bc, u, upd);
                 if (u) {
                   /* send/use new probability */
                   vp10_write_prob_diff_update(bc, newp, *oldp);
@@ -2060,7 +2010,7 @@
         }
       }
       if (updates == 0) {
-        vpx_write_bit(bc, 0);  // no updates
+        vp10_write_bit(bc, 0);  // no updates
       }
       return;
     }
@@ -2118,7 +2068,7 @@
           }
 }
 
-static void update_coef_probs_subframe(vpx_writer* const bc, VP10_COMP *cpi,
+static void update_coef_probs_subframe(vp10_writer* const bc, VP10_COMP *cpi,
                                        TX_SIZE tx_size,
                                        vp10_coeff_stats
                                        branch_ct[COEF_PROBS_BUFS][TX_SIZES]
@@ -2177,10 +2127,10 @@
 
       /* Is coef updated at all */
       if (update[1] == 0 || savings < 0) {
-        vpx_write_bit(bc, 0);
+        vp10_write_bit(bc, 0);
         return;
       }
-      vpx_write_bit(bc, 1);
+      vp10_write_bit(bc, 1);
       for (i = 0; i < PLANE_TYPES; ++i) {
         for (j = 0; j < REF_TYPES; ++j) {
           for (k = 0; k < COEF_BANDS; ++k) {
@@ -2209,7 +2159,7 @@
                                                        max_idx);
                 if (s > 0 && newp != *oldp)
                   u = 1;
-                vpx_write(bc, u, upd);
+                vp10_write(bc, u, upd);
                 if (u) {
                   /* send/use new probability */
                   vp10_write_prob_diff_update(bc, newp, *oldp);
@@ -2261,11 +2211,11 @@
                 if (u == 1 && updates == 1) {
                   int v;
                   // first update
-                  vpx_write_bit(bc, 1);
+                  vp10_write_bit(bc, 1);
                   for (v = 0; v < noupdates_before_first; ++v)
-                    vpx_write(bc, 0, upd);
+                    vp10_write(bc, 0, upd);
                 }
-                vpx_write(bc, u, upd);
+                vp10_write(bc, u, upd);
                 if (u) {
                   /* send/use new probability */
                   vp10_write_prob_diff_update(bc, newp, *oldp);
@@ -2277,7 +2227,7 @@
         }
       }
       if (updates == 0) {
-        vpx_write_bit(bc, 0);  // no updates
+        vp10_write_bit(bc, 0);  // no updates
       }
       return;
     }
@@ -2287,7 +2237,7 @@
 }
 #endif  // CONFIG_ENTROPY
 
-static void update_coef_probs(VP10_COMP *cpi, vpx_writer* w) {
+static void update_coef_probs(VP10_COMP *cpi, vp10_writer* w) {
   const TX_MODE tx_mode = cpi->common.tx_mode;
   const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
   TX_SIZE tx_size;
@@ -2318,7 +2268,7 @@
     vp10_coeff_probs_model frame_coef_probs[PLANE_TYPES];
     if (cpi->td.counts->tx_size_totals[tx_size] <= 20 ||
         (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8)) {
-      vpx_write_bit(w, 0);
+      vp10_write_bit(w, 0);
     } else {
 #if CONFIG_ENTROPY
       if (cm->do_subframe_update &&
@@ -2363,8 +2313,8 @@
     vp10_copy(eob_counts_copy, cm->counts.eob_branch);
     for (i = 1; i <= cpi->common.coef_probs_update_idx; ++i) {
       for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
-        full_to_model_counts(cm->counts.coef[tx_size],
-                             subframe_stats->coef_counts_buf[i][tx_size]);
+        vp10_full_to_model_counts(cm->counts.coef[tx_size],
+                                  subframe_stats->coef_counts_buf[i][tx_size]);
       vp10_copy(cm->counts.eob_branch, subframe_stats->eob_counts_buf[i]);
       vp10_partial_adapt_probs(cm, 0, 0);
       vp10_copy(subframe_stats->coef_probs_buf[i], cm->fc->coef_probs);
@@ -2512,7 +2462,7 @@
   }
 }
 
-static void update_seg_probs(VP10_COMP *cpi, vpx_writer *w) {
+static void update_seg_probs(VP10_COMP *cpi, vp10_writer *w) {
   VP10_COMMON *cm = &cpi->common;
 
   if (!cpi->common.seg.enabled)
@@ -2540,7 +2490,7 @@
 }
 
 
-static void update_txfm_probs(VP10_COMMON *cm, vpx_writer *w,
+static void update_txfm_probs(VP10_COMMON *cm, vp10_writer *w,
                               FRAME_COUNTS *counts) {
   if (cm->tx_mode == TX_MODE_SELECT) {
     int i, j;
@@ -2582,21 +2532,32 @@
   }
 }
 
-static void write_tile_info(VP10_COMMON *const cm,
+static void write_tile_info(const VP10_COMMON *const cm,
                             struct vpx_write_bit_buffer *wb) {
 #if CONFIG_EXT_TILE
-  // TODO(geza.lore): Dependent on CU_SIZE
   const int tile_width  =
-            mi_cols_aligned_to_sb(cm->tile_width) >> MI_BLOCK_SIZE_LOG2;
+    ALIGN_POWER_OF_TWO(cm->tile_width, cm->mib_size_log2) >> cm->mib_size_log2;
   const int tile_height =
-            mi_cols_aligned_to_sb(cm->tile_height) >> MI_BLOCK_SIZE_LOG2;
+    ALIGN_POWER_OF_TWO(cm->tile_height, cm->mib_size_log2) >> cm->mib_size_log2;
 
-  assert(tile_width > 0 && tile_width <= 64);
-  assert(tile_height > 0 && tile_height <= 64);
+  assert(tile_width > 0);
+  assert(tile_height > 0);
 
   // Write the tile sizes
-  vpx_wb_write_literal(wb, tile_width - 1, 6);
-  vpx_wb_write_literal(wb, tile_height - 1, 6);
+#if CONFIG_EXT_PARTITION
+  if (cm->sb_size == BLOCK_128X128) {
+    assert(tile_width <= 32);
+    assert(tile_height <= 32);
+    vpx_wb_write_literal(wb, tile_width - 1, 5);
+    vpx_wb_write_literal(wb, tile_height - 1, 5);
+  } else
+#endif  // CONFIG_EXT_PARTITION
+  {
+    assert(tile_width <= 64);
+    assert(tile_height <= 64);
+    vpx_wb_write_literal(wb, tile_width - 1, 6);
+    vpx_wb_write_literal(wb, tile_height - 1, 6);
+  }
 #else
   int min_log2_tile_cols, max_log2_tile_cols, ones;
   vp10_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
@@ -2713,11 +2674,10 @@
                            uint8_t *const dst,
                            unsigned int *max_tile_size,
                            unsigned int *max_tile_col_size) {
-  VP10_COMMON *const cm = &cpi->common;
-  vpx_writer mode_bc;
+  const VP10_COMMON *const cm = &cpi->common;
+  vp10_writer mode_bc;
 #if CONFIG_ANS
   struct AnsCoder token_ans;
-  struct BufAnsCoder buffered_ans;
 #endif  // CONFIG_ANS
   int tile_row, tile_col;
   TOKENEXTRA *(*const tok_buffers)[MAX_TILE_COLS] = cpi->tile_tok;
@@ -2758,6 +2718,7 @@
       unsigned int tile_size;
       const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
       const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
+      const int data_offset = have_tiles ? 4 : 0;
 
       vp10_tile_set_row(&tile_info, cm, tile_row);
 
@@ -2765,28 +2726,20 @@
 
       // Is CONFIG_EXT_TILE = 1, every tile in the row has a header,
       // even for the last one, unless no tiling is used at all.
-      if (have_tiles) {
-        total_size += 4;
-        vpx_start_encode(&mode_bc, buf->data + 4);
-      } else {
-        vpx_start_encode(&mode_bc, buf->data);
-      }
-
+      total_size += data_offset;
 #if !CONFIG_ANS
+      vpx_start_encode(&mode_bc, buf->data + data_offset);
       write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
       assert(tok == tok_end);
       vpx_stop_encode(&mode_bc);
       tile_size = mode_bc.pos;
 #else
-      buf_ans_write_init(&buffered_ans, uco_ans_buf, ans_window_size);
-      write_modes(cpi, &tile_info, &mode_bc, &buffered_ans, &tok, tok_end);
+      buf_ans_write_init(&mode_bc, uco_ans_buf, ans_window_size);
+      write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
       assert(tok == tok_end);
-      vpx_stop_encode(&mode_bc);
-      tile_size = mode_bc.pos;
-
-      ans_write_init(&token_ans, dst + total_size + tile_size);
-      buf_ans_flush(&buffered_ans, &token_ans);
-      tile_size += ans_write_end(&token_ans);
+      ans_write_init(&token_ans, buf->data + data_offset);
+      buf_ans_flush(&mode_bc, &token_ans);
+      tile_size = ans_write_end(&token_ans);
 #endif  // !CONFIG_ANS
 
       buf->size = tile_size;
@@ -2850,23 +2803,19 @@
       if (!is_last_tile)
         total_size += 4;
 
-      vpx_start_encode(&mode_bc, dst + total_size);
-
 #if !CONFIG_ANS
+      vpx_start_encode(&mode_bc, dst + total_size);
       write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
       assert(tok == tok_end);
       vpx_stop_encode(&mode_bc);
       tile_size = mode_bc.pos;
 #else
-      buf_ans_write_init(&buffered_ans, uco_ans_buf, ans_window_size);
-      write_modes(cpi, &tile_info, &mode_bc, &buffered_ans, &tok, tok_end);
+      buf_ans_write_init(&mode_bc, uco_ans_buf, ans_window_size);
+      write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
       assert(tok == tok_end);
-      vpx_stop_encode(&mode_bc);
-      tile_size = mode_bc.pos;
-
-      ans_write_init(&token_ans, dst + total_size + tile_size);
-      buf_ans_flush(&buffered_ans, &token_ans);
-      tile_size += ans_write_end(&token_ans);
+      ans_write_init(&token_ans, dst + total_size);
+      buf_ans_flush(&mode_bc, &token_ans);
+      tile_size = ans_write_end(&token_ans);
 #endif  // !CONFIG_ANS
 
       assert(tile_size > 0);
@@ -3059,6 +3008,15 @@
 
   vpx_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
 
+  assert(cm->mib_size == num_8x8_blocks_wide_lookup[cm->sb_size]);
+  assert(cm->mib_size == 1 << cm->mib_size_log2);
+#if CONFIG_EXT_PARTITION
+  assert(cm->sb_size == BLOCK_128X128 || cm->sb_size == BLOCK_64X64);
+  vpx_wb_write_bit(wb, cm->sb_size == BLOCK_128X128 ? 1 : 0);
+#else
+  assert(cm->sb_size == BLOCK_64X64);
+#endif  // CONFIG_EXT_PARTITION
+
   encode_loopfilter(cm, wb);
 #if CONFIG_LOOP_RESTORATION
   encode_restoration(cm, wb);
@@ -3088,10 +3046,20 @@
 #endif  // CONFIG_SUPERTX
   FRAME_CONTEXT *const fc = cm->fc;
   FRAME_COUNTS *counts = cpi->td.counts;
-  vpx_writer header_bc;
+  vp10_writer header_bc;
   int i, j;
 
+#if CONFIG_ANS
+  struct AnsCoder header_ans;
+  struct buffered_ans_symbol *uco_ans_buf;
+  const int ans_window_size = 50000;  // TODO(aconverse): revisit window size
+  int header_size;
+  CHECK_MEM_ERROR(cm, uco_ans_buf,
+                  vpx_malloc(ans_window_size * sizeof(*uco_ans_buf)));
+  buf_ans_write_init(&header_bc, uco_ans_buf, ans_window_size);
+#else
   vpx_start_encode(&header_bc, data);
+#endif
   update_txfm_probs(cm, &header_bc, counts);
   update_coef_probs(cpi, &header_bc);
 
@@ -3144,15 +3112,21 @@
     update_inter_compound_mode_probs(cm, &header_bc);
 
     if (cm->reference_mode != COMPOUND_REFERENCE) {
-      for (i = 0; i < BLOCK_SIZES; i++) {
-        if (is_interintra_allowed_bsize(i)) {
+      for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+        if (is_interintra_allowed_bsize_group(i)) {
           vp10_cond_prob_diff_update(&header_bc,
                                      &fc->interintra_prob[i],
                                      cm->counts.interintra[i]);
         }
       }
+      for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+        prob_diff_update(vp10_interintra_mode_tree,
+                         cm->fc->interintra_mode_prob[i],
+                         counts->interintra_mode[i],
+                         INTERINTRA_MODES, &header_bc);
+      }
       for (i = 0; i < BLOCK_SIZES; i++) {
-        if (is_interintra_allowed_bsize(i) && get_wedge_bits(i))
+        if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i))
           vp10_cond_prob_diff_update(&header_bc,
                                      &fc->wedge_interintra_prob[i],
                                      cm->counts.wedge_interintra[i]);
@@ -3160,7 +3134,7 @@
     }
     if (cm->reference_mode != SINGLE_REFERENCE) {
       for (i = 0; i < BLOCK_SIZES; i++)
-        if (get_wedge_bits(i))
+        if (is_interinter_wedge_used(i))
           vp10_cond_prob_diff_update(&header_bc,
                                      &fc->wedge_interinter_prob[i],
                                      cm->counts.wedge_interinter[i]);
@@ -3223,10 +3197,18 @@
 #endif  // CONFIG_SUPERTX
   }
 
+#if CONFIG_ANS
+  ans_write_init(&header_ans, data);
+  buf_ans_flush(&header_bc, &header_ans);
+  vpx_free(uco_ans_buf);
+  header_size = ans_write_end(&header_ans);
+  assert(header_size <= 0xffff);
+  return header_size;
+#else
   vpx_stop_encode(&header_bc);
   assert(header_bc.pos <= 0xffff);
-
   return header_bc.pos;
+#endif  // CONFIG_ANS
 }
 
 static int choose_size_bytes(uint32_t size, int spare_msbs) {
diff --git a/vp10/encoder/block.h b/vp10/encoder/block.h
index b5e61d9..2e8af98 100644
--- a/vp10/encoder/block.h
+++ b/vp10/encoder/block.h
@@ -64,7 +64,7 @@
 
 typedef struct {
   uint8_t best_palette_color_map[MAX_SB_SQUARE];
-  double kmeans_data_buf[2 * MAX_SB_SQUARE];
+  float kmeans_data_buf[2 * MAX_SB_SQUARE];
   uint8_t kmeans_indices_buf[MAX_SB_SQUARE];
   uint8_t kmeans_pre_indices_buf[MAX_SB_SQUARE];
 } PALETTE_BUFFER;
@@ -140,11 +140,11 @@
 
   // Notes transform blocks where no coefficents are coded.
   // Set during mode selection. Read during block encoding.
-  uint8_t zcoeff_blk[TX_SIZES][MI_BLOCK_SIZE * MI_BLOCK_SIZE * 4];
+  uint8_t zcoeff_blk[TX_SIZES][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
 #if CONFIG_VAR_TX
-  uint8_t blk_skip[MAX_MB_PLANE][MI_BLOCK_SIZE * MI_BLOCK_SIZE * 4];
+  uint8_t blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
 #if CONFIG_REF_MV
-  uint8_t blk_skip_drl[MAX_MB_PLANE][MI_BLOCK_SIZE * MI_BLOCK_SIZE * 4];
+  uint8_t blk_skip_drl[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
 #endif
 #endif
 
diff --git a/vp10/encoder/context_tree.c b/vp10/encoder/context_tree.c
index b7c8260..41155c9 100644
--- a/vp10/encoder/context_tree.c
+++ b/vp10/encoder/context_tree.c
@@ -244,8 +244,16 @@
     }
     ++square_index;
   }
-  td->pc_root = &td->pc_tree[tree_nodes - 1];
-  td->pc_root[0].none.best_mode_index = 2;
+
+  // Set up the root node for the largest superblock size
+  i = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2;
+  td->pc_root[i] = &td->pc_tree[tree_nodes - 1];
+  td->pc_root[i]->none.best_mode_index = 2;
+  // Set up the root nodes for the rest of the possible superblock sizes
+  while (--i >= 0) {
+    td->pc_root[i] = td->pc_root[i+1]->split[0];
+    td->pc_root[i]->none.best_mode_index = 2;
+  }
 }
 
 void vp10_free_pc_tree(ThreadData *td) {
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index b73f66c..06463c1 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -49,6 +49,12 @@
 #include "vp10/encoder/segmentation.h"
 #include "vp10/encoder/tokenize.h"
 
+#if CONFIG_VP9_HIGHBITDEPTH
+# define IF_HBD(...) __VA_ARGS__
+#else
+# define IF_HBD(...)
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static void encode_superblock(VP10_COMP *cpi, ThreadData * td,
                               TOKENEXTRA **t, int output_enabled,
                               int mi_row, int mi_col, BLOCK_SIZE bsize,
@@ -276,7 +282,8 @@
 
 #if CONFIG_VAR_TX
   xd->above_txfm_context = cm->above_txfm_context + mi_col;
-  xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MI_MASK);
+  xd->left_txfm_context =
+    xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
   xd->max_tx_size = max_txsize_lookup[bsize];
 #endif
 
@@ -372,7 +379,11 @@
   assert(!(mi_col_pred & (mi_width - 1)) && !(mi_row_pred & (mi_height - 1)));
   set_mi_row_col(xd, tile, mi_row_pred, mi_height, mi_col_pred, mi_width,
                  cm->mi_rows, cm->mi_cols);
+#if CONFIG_EXT_TILE
+  xd->up_available    = (mi_row_ori > tile->mi_row_start);
+#else
   xd->up_available    = (mi_row_ori != 0);
+#endif  // CONFIG_EXT_TILE
   xd->left_available  = (mi_col_ori > tile->mi_col_start);
 
   // R/D setup.
@@ -408,234 +419,102 @@
   }
 }
 
-typedef struct {
-  int64_t sum_square_error;
-  int64_t sum_error;
-  int log2_count;
-  int variance;
-} var;
-
-typedef struct {
-  var none;
-  var horz[2];
-  var vert[2];
-} partition_variance;
-
-typedef struct {
-  partition_variance part_variances;
-  var split[4];
-} v4x4;
-
-typedef struct {
-  partition_variance part_variances;
-  v4x4 split[4];
-} v8x8;
-
-typedef struct {
-  partition_variance part_variances;
-  v8x8 split[4];
-} v16x16;
-
-typedef struct {
-  partition_variance part_variances;
-  v16x16 split[4];
-} v32x32;
-
-typedef struct {
-  partition_variance part_variances;
-  v32x32 split[4];
-} v64x64;
-
-#if CONFIG_EXT_PARTITION
-typedef struct {
-  partition_variance part_variances;
-  v64x64 split[4];
-} v128x128;
-#endif  // CONFIG_EXT_PARTITION
-
-typedef struct {
-  partition_variance *part_variances;
-  var *split[4];
-} variance_node;
-
-typedef enum {
-  V16X16,
-  V32X32,
-  V64X64,
-#if CONFIG_EXT_PARTITION
-  V128X128,
-#endif  // CONFIG_EXT_PARTITION
-} TREE_LEVEL;
-
-static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
-  int i;
-  node->part_variances = NULL;
-  switch (bsize) {
-#if CONFIG_EXT_PARTITION
-    case BLOCK_128X128: {
-      v128x128 *vt = (v128x128 *) data;
-      node->part_variances = &vt->part_variances;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i].part_variances.none;
-      break;
-    }
-#endif  // CONFIG_EXT_PARTITION
-    case BLOCK_64X64: {
-      v64x64 *vt = (v64x64 *) data;
-      node->part_variances = &vt->part_variances;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i].part_variances.none;
-      break;
-    }
-    case BLOCK_32X32: {
-      v32x32 *vt = (v32x32 *) data;
-      node->part_variances = &vt->part_variances;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i].part_variances.none;
-      break;
-    }
-    case BLOCK_16X16: {
-      v16x16 *vt = (v16x16 *) data;
-      node->part_variances = &vt->part_variances;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i].part_variances.none;
-      break;
-    }
-    case BLOCK_8X8: {
-      v8x8 *vt = (v8x8 *) data;
-      node->part_variances = &vt->part_variances;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i].part_variances.none;
-      break;
-    }
-    case BLOCK_4X4: {
-      v4x4 *vt = (v4x4 *) data;
-      node->part_variances = &vt->part_variances;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i];
-      break;
-    }
-    default: {
-      assert(0);
-      break;
-    }
-  }
-}
-
-// Set variance values given sum square error, sum error, count.
-static void fill_variance(int64_t s2, int64_t s, int c, var *v) {
-  v->sum_square_error = s2;
-  v->sum_error = s;
-  v->log2_count = c;
-}
-
-static void get_variance(var *v) {
-  v->variance = (int)(256 * (v->sum_square_error -
-      ((v->sum_error * v->sum_error) >> v->log2_count)) >> v->log2_count);
-}
-
-static void sum_2_variances(const var *a, const var *b, var *r) {
-  assert(a->log2_count == b->log2_count);
-  fill_variance(a->sum_square_error + b->sum_square_error,
-                a->sum_error + b->sum_error, a->log2_count + 1, r);
-}
-
-static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
-  variance_node node;
-  memset(&node, 0, sizeof(node));
-  tree_to_node(data, bsize, &node);
-  sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]);
-  sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]);
-  sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]);
-  sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]);
-  sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1],
-                  &node.part_variances->none);
-}
-
-static int set_vt_partitioning(VP10_COMP *cpi,
+static void set_vt_partitioning(VP10_COMP *cpi,
                                MACROBLOCK *const x,
                                MACROBLOCKD *const xd,
-                               void *data,
-                               BLOCK_SIZE bsize,
+                               VAR_TREE *vt,
                                int mi_row,
                                int mi_col,
-                               int64_t threshold,
-                               BLOCK_SIZE bsize_min,
-                               int force_split) {
+                               const int64_t *const threshold,
+                               const BLOCK_SIZE *const bsize_min) {
   VP10_COMMON * const cm = &cpi->common;
-  variance_node vt;
-  const int block_width = num_8x8_blocks_wide_lookup[bsize];
-  const int block_height = num_8x8_blocks_high_lookup[bsize];
-  const int low_res = (cm->width <= 352 && cm->height <= 288);
+  const int hbw = num_8x8_blocks_wide_lookup[vt->bsize] / 2;
+  const int hbh = num_8x8_blocks_high_lookup[vt->bsize] / 2;
+  const int has_cols = mi_col + hbw < cm->mi_cols;
+  const int has_rows = mi_row + hbh < cm->mi_rows;
 
-  assert(block_height == block_width);
-  tree_to_node(data, bsize, &vt);
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
 
-  if (force_split == 1)
-    return 0;
+  assert(vt->bsize >= BLOCK_8X8);
+
+  assert(hbh == hbw);
+
+  if (vt->force_split || (!has_cols && !has_rows))
+    goto split;
 
   // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
   // variance is below threshold, otherwise split will be selected.
   // No check for vert/horiz split as too few samples for variance.
-  if (bsize == bsize_min) {
-    // Variance already computed to set the force_split.
-    if (low_res || cm->frame_type == KEY_FRAME)
-      get_variance(&vt.part_variances->none);
-    if (mi_col + block_width / 2 < cm->mi_cols &&
-        mi_row + block_height / 2 < cm->mi_rows &&
-        vt.part_variances->none.variance < threshold) {
-      set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
-      return 1;
+  if (vt->bsize == bsize_min[0]) {
+    if (has_cols && has_rows &&
+        vt->variances.none.variance < threshold[0]) {
+      set_block_size(cpi, x, xd, mi_row, mi_col, vt->bsize);
+      return;
+    } else {
+      BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_SPLIT);
+      set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+      if (vt->bsize > BLOCK_8X8) {
+        set_block_size(cpi, x, xd, mi_row, mi_col + hbw, subsize);
+        set_block_size(cpi, x, xd, mi_row + hbh, mi_col, subsize);
+        set_block_size(cpi, x, xd, mi_row + hbh, mi_col + hbw, subsize);
+      }
+      return;
     }
-    return 0;
-  } else if (bsize > bsize_min) {
-    // Variance already computed to set the force_split.
-    if (low_res || cm->frame_type == KEY_FRAME)
-      get_variance(&vt.part_variances->none);
+  } else if (vt->bsize > bsize_min[0]) {
     // For key frame: take split for bsize above 32X32 or very high variance.
     if (cm->frame_type == KEY_FRAME &&
-        (bsize > BLOCK_32X32 ||
-        vt.part_variances->none.variance > (threshold << 4))) {
-      return 0;
+        (vt->bsize > BLOCK_32X32 ||
+        vt->variances.none.variance > (threshold[0] << 4))) {
+      goto split;
     }
     // If variance is low, take the bsize (no split).
-    if (mi_col + block_width / 2 < cm->mi_cols &&
-        mi_row + block_height / 2 < cm->mi_rows &&
-        vt.part_variances->none.variance < threshold) {
-      set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
-      return 1;
+    if (has_cols && has_rows &&
+        vt->variances.none.variance < threshold[0]) {
+      set_block_size(cpi, x, xd, mi_row, mi_col, vt->bsize);
+      return;
     }
 
     // Check vertical split.
-    if (mi_row + block_height / 2 < cm->mi_rows) {
-      BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
-      get_variance(&vt.part_variances->vert[0]);
-      get_variance(&vt.part_variances->vert[1]);
-      if (vt.part_variances->vert[0].variance < threshold &&
-          vt.part_variances->vert[1].variance < threshold &&
+    if (has_rows) {
+      BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_VERT);
+      if (vt->variances.vert[0].variance < threshold[0] &&
+          vt->variances.vert[1].variance < threshold[0] &&
           get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
         set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
-        set_block_size(cpi, x, xd, mi_row, mi_col + block_width / 2, subsize);
-        return 1;
+        set_block_size(cpi, x, xd, mi_row, mi_col + hbw, subsize);
+        return;
       }
     }
     // Check horizontal split.
-    if (mi_col + block_width / 2 < cm->mi_cols) {
-      BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
-      get_variance(&vt.part_variances->horz[0]);
-      get_variance(&vt.part_variances->horz[1]);
-      if (vt.part_variances->horz[0].variance < threshold &&
-          vt.part_variances->horz[1].variance < threshold &&
+    if (has_cols) {
+      BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_HORZ);
+      if (vt->variances.horz[0].variance < threshold[0] &&
+          vt->variances.horz[1].variance < threshold[0] &&
           get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
         set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
-        set_block_size(cpi, x, xd, mi_row + block_height / 2, mi_col, subsize);
-        return 1;
+        set_block_size(cpi, x, xd, mi_row + hbh, mi_col, subsize);
+        return;
       }
     }
-
-    return 0;
   }
-  return 0;
+
+split:
+  {
+    set_vt_partitioning(cpi, x, xd, vt->split[0],
+                        mi_row, mi_col,
+                        threshold + 1, bsize_min + 1);
+    set_vt_partitioning(cpi, x, xd, vt->split[1],
+                        mi_row, mi_col + hbw,
+                        threshold + 1, bsize_min + 1);
+    set_vt_partitioning(cpi, x, xd, vt->split[2],
+                        mi_row + hbh, mi_col,
+                        threshold + 1, bsize_min + 1);
+    set_vt_partitioning(cpi, x, xd, vt->split[3],
+                        mi_row + hbh, mi_col + hbw,
+                        threshold + 1, bsize_min + 1);
+    return;
+  }
 }
 
 // Set the variance split thresholds for following the block sizes:
@@ -649,23 +528,24 @@
   const int64_t threshold_base = (int64_t)(threshold_multiplier *
       cpi->y_dequant[q][1]);
   if (is_key_frame) {
-    thresholds[0] = threshold_base;
-    thresholds[1] = threshold_base >> 2;
-    thresholds[2] = threshold_base >> 2;
-    thresholds[3] = threshold_base << 2;
-  } else {
     thresholds[1] = threshold_base;
+    thresholds[2] = threshold_base >> 2;
+    thresholds[3] = threshold_base >> 2;
+    thresholds[4] = threshold_base << 2;
+  } else {
+    thresholds[2] = threshold_base;
     if (cm->width <= 352 && cm->height <= 288) {
-      thresholds[0] = threshold_base >> 2;
-      thresholds[2] = threshold_base << 3;
+      thresholds[1] = threshold_base >> 2;
+      thresholds[3] = threshold_base << 3;
     } else {
-      thresholds[0] = threshold_base;
-      thresholds[1] = (5 * threshold_base) >> 2;
+      thresholds[1] = threshold_base;
+      thresholds[2] = (5 * threshold_base) >> 2;
       if (cm->width >= 1920 && cm->height >= 1080)
-        thresholds[1] = (7 * threshold_base) >> 2;
-      thresholds[2] = threshold_base << cpi->oxcf.speed;
+        thresholds[2] = (7 * threshold_base) >> 2;
+      thresholds[3] = threshold_base << cpi->oxcf.speed;
     }
   }
+  thresholds[0] = INT64_MIN;
 }
 
 void vp10_set_variance_partition_thresholds(VP10_COMP *cpi, int q) {
@@ -694,10 +574,10 @@
 }
 
 // Compute the minmax over the 8x8 subblocks.
-static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
-                              int dp, int x16_idx, int y16_idx,
+static int compute_minmax_8x8(const uint8_t *src, int src_stride,
+                              const uint8_t *ref, int ref_stride,
 #if CONFIG_VP9_HIGHBITDEPTH
-                              int highbd_flag,
+                              int highbd,
 #endif
                               int pixels_wide,
                               int pixels_high) {
@@ -706,24 +586,26 @@
   int minmax_min = 255;
   // Loop over the 4 8x8 subblocks.
   for (k = 0; k < 4; k++) {
-    int x8_idx = x16_idx + ((k & 1) << 3);
-    int y8_idx = y16_idx + ((k >> 1) << 3);
+    const int x8_idx = ((k & 1) << 3);
+    const int y8_idx = ((k >> 1) << 3);
     int min = 0;
     int max = 0;
     if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+      const int src_offset = y8_idx * src_stride + x8_idx;
+      const int ref_offset = y8_idx * ref_stride + x8_idx;
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        vpx_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
-                              d + y8_idx * dp + x8_idx, dp,
+      if (highbd) {
+        vpx_highbd_minmax_8x8(src + src_offset, src_stride,
+                              ref + ref_offset, ref_stride,
                               &min, &max);
       } else {
-        vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
-                       d + y8_idx * dp + x8_idx, dp,
+        vpx_minmax_8x8(src + src_offset, src_stride,
+                       ref + ref_offset, ref_stride,
                        &min, &max);
       }
 #else
-      vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
-                     d + y8_idx * dp + x8_idx, dp,
+      vpx_minmax_8x8(src + src_offset, src_stride,
+                     ref + ref_offset, ref_stride,
                      &min, &max);
 #endif
       if ((max - min) > minmax_max)
@@ -735,117 +617,259 @@
   return (minmax_max - minmax_min);
 }
 
-static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
-                                 int dp, int x8_idx, int y8_idx, v8x8 *vst,
 #if CONFIG_VP9_HIGHBITDEPTH
-                                 int highbd_flag,
-#endif
-                                 int pixels_wide,
-                                 int pixels_high,
-                                 int is_key_frame) {
-  int k;
-  for (k = 0; k < 4; k++) {
-    int x4_idx = x8_idx + ((k & 1) << 2);
-    int y4_idx = y8_idx + ((k >> 1) << 2);
-    unsigned int sse = 0;
-    int sum = 0;
-    if (x4_idx < pixels_wide && y4_idx < pixels_high) {
-      int s_avg;
-      int d_avg = 128;
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        s_avg = vpx_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
-        if (!is_key_frame)
-          d_avg = vpx_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
-      } else {
-        s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
-        if (!is_key_frame)
-          d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
-      }
+static INLINE int avg_4x4(const uint8_t *const src, const int stride,
+                          const int highbd) {
+  if (highbd) {
+    return vpx_highbd_avg_4x4(src, stride);
+  } else {
+    return vpx_avg_4x4(src, stride);
+  }
+}
 #else
-      s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
-      if (!is_key_frame)
-        d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+static INLINE int avg_4x4(const uint8_t *const src, const int stride) {
+  return vpx_avg_4x4(src, stride);
+}
 #endif
-      sum = s_avg - d_avg;
-      sse = sum * sum;
-    }
-    fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE int avg_8x8(const uint8_t *const src, const int stride,
+                          const int highbd) {
+  if (highbd) {
+    return vpx_highbd_avg_8x8(src, stride);
+  } else {
+    return vpx_avg_8x8(src, stride);
+  }
+}
+#else
+static INLINE int avg_8x8(const uint8_t *const src, const int stride) {
+  return vpx_avg_8x8(src, stride);
+}
+#endif
+
+static void init_variance_tree(VAR_TREE *const vt,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               const int highbd,
+#endif
+                               BLOCK_SIZE bsize,
+                               BLOCK_SIZE leaf_size,
+                               const int width, const int height,
+                               const uint8_t *const src, const int src_stride,
+                               const uint8_t *const ref, const int ref_stride) {
+  assert(bsize >= leaf_size);
+
+  vt->bsize = bsize;
+
+  vt->force_split = 0;
+
+  vt->src = src;
+  vt->src_stride = src_stride;
+  vt->ref = ref;
+  vt->ref_stride = ref_stride;
+
+  vt->width = width;
+  vt->height = height;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  vt->highbd = highbd;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  if (bsize > leaf_size) {
+    const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
+    const int px = num_4x4_blocks_wide_lookup[subsize] * 4;
+
+    init_variance_tree(vt->split[0],
+#if CONFIG_VP9_HIGHBITDEPTH
+                       highbd,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                       subsize, leaf_size,
+                       VPXMIN(px, width), VPXMIN(px, height),
+                       src, src_stride,
+                       ref, ref_stride);
+    init_variance_tree(vt->split[1],
+#if CONFIG_VP9_HIGHBITDEPTH
+                       highbd,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                       subsize, leaf_size,
+                       width - px, VPXMIN(px, height),
+                       src + px, src_stride,
+                       ref + px, ref_stride);
+    init_variance_tree(vt->split[2],
+#if CONFIG_VP9_HIGHBITDEPTH
+                       highbd,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                       subsize, leaf_size,
+                       VPXMIN(px, width), height - px,
+                       src + px * src_stride, src_stride,
+                       ref + px * ref_stride, ref_stride);
+    init_variance_tree(vt->split[3],
+#if CONFIG_VP9_HIGHBITDEPTH
+                       highbd,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                       subsize, leaf_size,
+                       width - px, height - px,
+                       src + px * src_stride + px, src_stride,
+                       ref + px * ref_stride + px, ref_stride);
   }
 }
 
-static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
-                                 int dp, int x16_idx, int y16_idx, v16x16 *vst,
-#if CONFIG_VP9_HIGHBITDEPTH
-                                 int highbd_flag,
-#endif
-                                 int pixels_wide,
-                                 int pixels_high,
-                                 int is_key_frame) {
-  int k;
-  for (k = 0; k < 4; k++) {
-    int x8_idx = x16_idx + ((k & 1) << 3);
-    int y8_idx = y16_idx + ((k >> 1) << 3);
+
+// Fill the variance tree based on averaging pixel values (sub-sampling), at
+// the leaf node size.
+static void fill_variance_tree(VAR_TREE *const vt,
+                               const BLOCK_SIZE leaf_size) {
+  if (vt->bsize > leaf_size) {
+    fill_variance_tree(vt->split[0], leaf_size);
+    fill_variance_tree(vt->split[1], leaf_size);
+    fill_variance_tree(vt->split[2], leaf_size);
+    fill_variance_tree(vt->split[3], leaf_size);
+    fill_variance_node(vt);
+  } else if (vt->width <= 0 || vt->height <= 0) {
+    fill_variance(0, 0, 0, &vt->variances.none);
+  } else {
     unsigned int sse = 0;
     int sum = 0;
-    if (x8_idx < pixels_wide && y8_idx < pixels_high) {
-      int s_avg;
-      int d_avg = 128;
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        s_avg = vpx_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
-        if (!is_key_frame)
-          d_avg = vpx_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
-      } else {
-        s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
-        if (!is_key_frame)
-          d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
-      }
-#else
-      s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
-      if (!is_key_frame)
-        d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
-#endif
-      sum = s_avg - d_avg;
-      sse = sum * sum;
+    int src_avg;
+    int ref_avg;
+    assert(leaf_size == BLOCK_4X4 || leaf_size == BLOCK_8X8);
+    if (leaf_size == BLOCK_4X4) {
+      src_avg = avg_4x4(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+      ref_avg = avg_4x4(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
+    } else {
+      src_avg = avg_8x8(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+      ref_avg = avg_8x8(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
     }
-    fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+    sum = src_avg - ref_avg;
+    sse = sum * sum;
+    fill_variance(sse, sum, 0, &vt->variances.none);
   }
 }
 
+static void refine_variance_tree(VAR_TREE *const vt, const int64_t threshold) {
+  if (vt->bsize >= BLOCK_8X8) {
+    if (vt->bsize == BLOCK_16X16) {
+      if (vt->variances.none.variance <= threshold)
+        return;
+      else
+        vt->force_split = 0;
+    }
+
+    refine_variance_tree(vt->split[0], threshold);
+    refine_variance_tree(vt->split[1], threshold);
+    refine_variance_tree(vt->split[2], threshold);
+    refine_variance_tree(vt->split[3], threshold);
+
+    if (vt->bsize <= BLOCK_16X16)
+      fill_variance_node(vt);
+  } else if (vt->width <= 0 || vt->height <= 0) {
+    fill_variance(0, 0, 0, &vt->variances.none);
+  } else {
+    const int src_avg = avg_4x4(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+    const int ref_avg = avg_4x4(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
+    const int sum = src_avg - ref_avg;
+    const unsigned int sse =  sum * sum;
+    assert(vt->bsize == BLOCK_4X4);
+    fill_variance(sse, sum, 0, &vt->variances.none);
+  }
+}
+
+static int check_split_key_frame(VAR_TREE *const vt,
+                                 const int64_t threshold) {
+  if (vt->bsize == BLOCK_32X32) {
+    vt->force_split = vt->variances.none.variance > threshold;
+  } else {
+    vt->force_split |= check_split_key_frame(vt->split[0], threshold);
+    vt->force_split |= check_split_key_frame(vt->split[1], threshold);
+    vt->force_split |= check_split_key_frame(vt->split[2], threshold);
+    vt->force_split |= check_split_key_frame(vt->split[3], threshold);
+  }
+  return vt->force_split;
+}
+
+static int check_split(VP10_COMP *const cpi,
+                       VAR_TREE *const vt,
+                       const int segment_id,
+                       const int64_t *const thresholds
+                       ) {
+  if (vt->bsize == BLOCK_16X16) {
+    vt->force_split = vt->variances.none.variance > thresholds[0];
+    if (!vt->force_split &&
+        vt->variances.none.variance > thresholds[-1] &&
+         !cyclic_refresh_segment_id_boosted(segment_id)) {
+      // We have some nominal amount of 16x16 variance (based on average),
+      // compute the minmax over the 8x8 sub-blocks, and if above threshold,
+      // force split to 8x8 block for this 16x16 block.
+      int minmax = compute_minmax_8x8(vt->src, vt->src_stride,
+                                      vt->ref, vt->ref_stride,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                      vt->highbd,
+#endif
+                                      vt->width, vt->height);
+      vt->force_split = minmax > cpi->vbp_threshold_minmax;
+    }
+  } else {
+    vt->force_split |= check_split(cpi, vt->split[0],
+                                   segment_id, thresholds + 1);
+    vt->force_split |= check_split(cpi, vt->split[1],
+                                   segment_id, thresholds + 1);
+    vt->force_split |= check_split(cpi, vt->split[2],
+                                   segment_id, thresholds + 1);
+    vt->force_split |= check_split(cpi, vt->split[3],
+                                   segment_id, thresholds + 1);
+
+    if (vt->bsize == BLOCK_32X32 && !vt->force_split) {
+      vt->force_split = vt->variances.none.variance > thresholds[0];
+    }
+  }
+
+  return vt->force_split;
+}
+
 // This function chooses partitioning based on the variance between source and
-// reconstructed last, where variance is computed for down-sampled inputs.
-static int choose_partitioning(VP10_COMP *cpi,
+// reconstructed last (or golden), where variance is computed for down-sampled
+// inputs.
+static void choose_partitioning(VP10_COMP *const cpi,
+                                ThreadData *const td,
                                 const TileInfo *const tile,
-                                MACROBLOCK *x,
-                                int mi_row, int mi_col) {
-  VP10_COMMON * const cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int i, j, k, m;
-  v64x64 vt;
-  v16x16 vt2[16];
-  int force_split[21];
-  uint8_t *s;
-  const uint8_t *d;
-  int sp;
-  int dp;
-  int pixels_wide = 8 * num_8x8_blocks_wide_lookup[BLOCK_LARGEST];
-  int pixels_high = 8 * num_8x8_blocks_high_lookup[BLOCK_LARGEST];
-  int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
-      cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]};
+                                MACROBLOCK *const x,
+                                const int mi_row, const int mi_col) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  VAR_TREE *const vt = td->var_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2];
+  int i;
+  const uint8_t *src;
+  const uint8_t *ref;
+  int src_stride;
+  int ref_stride;
+  int pixels_wide = 8 * num_8x8_blocks_wide_lookup[cm->sb_size];
+  int pixels_high = 8 * num_8x8_blocks_high_lookup[cm->sb_size];
+  int64_t thresholds[5] = {
+    cpi->vbp_thresholds[0],
+    cpi->vbp_thresholds[1],
+    cpi->vbp_thresholds[2],
+    cpi->vbp_thresholds[3],
+    cpi->vbp_thresholds[4],
+  };
+  BLOCK_SIZE bsize_min[5] = {
+      BLOCK_16X16,
+      BLOCK_16X16,
+      BLOCK_16X16,
+      cpi->vbp_bsize_min,
+      BLOCK_8X8
+  };
+  const int start_level = cm->sb_size == BLOCK_64X64 ? 1 : 0;
+  const int64_t *const thre = thresholds + start_level;
+  const BLOCK_SIZE *const bmin = bsize_min + start_level;
 
-  // Always use 4x4 partition for key frame.
   const int is_key_frame = (cm->frame_type == KEY_FRAME);
-  const int use_4x4_partition = is_key_frame;
   const int low_res = (cm->width <= 352 && cm->height <= 288);
-  int variance4x4downsample[16];
 
   int segment_id = CR_SEGMENT_ID_BASE;
 
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
     const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map :
                                                     cm->last_frame_seg_map;
-    segment_id = get_segment_id(cm, map, BLOCK_LARGEST, mi_row, mi_col);
+    segment_id = get_segment_id(cm, map, cm->sb_size, mi_row, mi_col);
 
     if (cyclic_refresh_segment_id_boosted(segment_id)) {
       int q = vp10_get_qindex(&cm->seg, segment_id, cm->base_qindex);
@@ -853,45 +877,38 @@
     }
   }
 
-#if CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
-  printf("Not yet implemented: choose_partitioning\n");
-  exit(-1);
-#endif  // CONFIG_EXT_PARTITION
-
-  set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_LARGEST);
+  set_offsets(cpi, tile, x, mi_row, mi_col, cm->sb_size);
 
   if (xd->mb_to_right_edge < 0)
     pixels_wide += (xd->mb_to_right_edge >> 3);
   if (xd->mb_to_bottom_edge < 0)
     pixels_high += (xd->mb_to_bottom_edge >> 3);
 
-  s = x->plane[0].src.buf;
-  sp = x->plane[0].src.stride;
+  src = x->plane[0].src.buf;
+  src_stride = x->plane[0].src.stride;
 
   if (!is_key_frame) {
     MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
     unsigned int uv_sad;
     const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
-
-    const YV12_BUFFER_CONFIG *yv12_g = NULL;
+    const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
     unsigned int y_sad, y_sad_g;
 
-    const int max_mi_block_size = num_8x8_blocks_wide_lookup[BLOCK_LARGEST];
-    const int is_right_edge = mi_col + max_mi_block_size / 2 > cm->mi_cols;
-    const int is_left_edge = mi_row + max_mi_block_size / 2 > cm->mi_rows;
+    const int hbs = cm->mib_size / 2;
+    const int split_vert = mi_col + hbs >= cm->mi_cols;
+    const int split_horz = mi_row + hbs >= cm->mi_rows;
     BLOCK_SIZE bsize;
 
-    if (is_right_edge && is_left_edge)
-      bsize = get_subsize(BLOCK_LARGEST, PARTITION_SPLIT);
-    else if (is_right_edge)
-      bsize = get_subsize(BLOCK_LARGEST, PARTITION_VERT);
-    else if (is_left_edge)
-      bsize = get_subsize(BLOCK_LARGEST, PARTITION_HORZ);
+    if (split_vert && split_horz)
+      bsize = get_subsize(cm->sb_size, PARTITION_SPLIT);
+    else if (split_vert)
+      bsize = get_subsize(cm->sb_size, PARTITION_VERT);
+    else if (split_horz)
+      bsize = get_subsize(cm->sb_size, PARTITION_HORZ);
     else
-      bsize = BLOCK_LARGEST;
+      bsize = cm->sb_size;
 
     assert(yv12 != NULL);
-    yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
 
     if (yv12_g && yv12_g != yv12) {
       vp10_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
@@ -908,11 +925,12 @@
                          &cm->frame_refs[LAST_FRAME - 1].sf);
     mbmi->ref_frame[0] = LAST_FRAME;
     mbmi->ref_frame[1] = NONE;
-    mbmi->sb_type = BLOCK_LARGEST;
+    mbmi->sb_type = cm->sb_size;
     mbmi->mv[0].as_int = 0;
     mbmi->interp_filter = BILINEAR;
 
     y_sad = vp10_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
+
     if (y_sad_g < y_sad) {
       vp10_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
                            &cm->frame_refs[GOLDEN_FRAME - 1].sf);
@@ -923,9 +941,9 @@
       x->pred_mv[LAST_FRAME] = mbmi->mv[0].as_mv;
     }
 
-    vp10_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_LARGEST);
+    vp10_build_inter_predictors_sb(xd, mi_row, mi_col, cm->sb_size);
 
-    for (i = 1; i <= 2; ++i) {
+    for (i = 1; i < MAX_MB_PLANE; ++i) {
       struct macroblock_plane  *p = &x->plane[i];
       struct macroblockd_plane *pd = &xd->plane[i];
       const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
@@ -939,196 +957,65 @@
       x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
     }
 
-    d = xd->plane[0].dst.buf;
-    dp = xd->plane[0].dst.stride;
+    ref = xd->plane[0].dst.buf;
+    ref_stride = xd->plane[0].dst.stride;
 
     // If the y_sad is very small, take the largest partition and exit.
     // Don't check on boosted segment for now, as largest is suppressed there.
     if (segment_id == CR_SEGMENT_ID_BASE && y_sad < cpi->vbp_threshold_sad) {
-      if (!is_right_edge && !is_left_edge) {
-        set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_LARGEST);
-        return 0;
+      if (!split_vert && !split_horz) {
+        set_block_size(cpi, x, xd, mi_row, mi_col, cm->sb_size);
+        return;
       }
     }
   } else {
-    d = VP10_VAR_OFFS;
-    dp = 0;
+    ref = VP10_VAR_OFFS;
+    ref_stride = 0;
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       switch (xd->bd) {
         case 10:
-          d = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_10);
+          ref = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_10);
           break;
         case 12:
-          d = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_12);
+          ref = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_12);
           break;
         case 8:
         default:
-          d = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_8);
+          ref = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_8);
           break;
       }
     }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
-  // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
-  // 5-20 for the 16x16 blocks.
-  force_split[0] = 0;
-  // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
-  // for splits.
-  for (i = 0; i < 4; i++) {
-    const int x32_idx = ((i & 1) << 5);
-    const int y32_idx = ((i >> 1) << 5);
-    const int i2 = i << 2;
-    force_split[i + 1] = 0;
-    for (j = 0; j < 4; j++) {
-      const int x16_idx = x32_idx + ((j & 1) << 4);
-      const int y16_idx = y32_idx + ((j >> 1) << 4);
-      const int split_index = 5 + i2 + j;
-      v16x16 *vst = &vt.split[i].split[j];
-      force_split[split_index] = 0;
-      variance4x4downsample[i2 + j] = 0;
-      if (!is_key_frame) {
-        fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
+  init_variance_tree(vt,
 #if CONFIG_VP9_HIGHBITDEPTH
-                            xd->cur_buf->flags,
-#endif
-                            pixels_wide,
-                            pixels_high,
-                            is_key_frame);
-        fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
-        get_variance(&vt.split[i].split[j].part_variances.none);
-        if (vt.split[i].split[j].part_variances.none.variance >
-            thresholds[2]) {
-          // 16X16 variance is above threshold for split, so force split to 8x8
-          // for this 16x16 block (this also forces splits for upper levels).
-          force_split[split_index] = 1;
-          force_split[i + 1] = 1;
-          force_split[0] = 1;
-        } else if (vt.split[i].split[j].part_variances.none.variance >
-                   thresholds[1] &&
-                   !cyclic_refresh_segment_id_boosted(segment_id)) {
-          // We have some nominal amount of 16x16 variance (based on average),
-          // compute the minmax over the 8x8 sub-blocks, and if above threshold,
-          // force split to 8x8 block for this 16x16 block.
-          int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
-#if CONFIG_VP9_HIGHBITDEPTH
-                                          xd->cur_buf->flags,
-#endif
-                                          pixels_wide, pixels_high);
-          if (minmax > cpi->vbp_threshold_minmax) {
-            force_split[split_index] = 1;
-            force_split[i + 1] = 1;
-            force_split[0] = 1;
-          }
-        }
-      }
-      if (is_key_frame || (low_res &&
-          vt.split[i].split[j].part_variances.none.variance >
-          (thresholds[1] << 1))) {
-        force_split[split_index] = 0;
-        // Go down to 4x4 down-sampling for variance.
-        variance4x4downsample[i2 + j] = 1;
-        for (k = 0; k < 4; k++) {
-          int x8_idx = x16_idx + ((k & 1) << 3);
-          int y8_idx = y16_idx + ((k >> 1) << 3);
-          v8x8 *vst2 = is_key_frame ? &vst->split[k] :
-              &vt2[i2 + j].split[k];
-          fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
-#if CONFIG_VP9_HIGHBITDEPTH
-                               xd->cur_buf->flags,
-#endif
-                               pixels_wide,
-                               pixels_high,
-                               is_key_frame);
-        }
-      }
+                     xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                     cm->sb_size,
+                     (is_key_frame || low_res) ? BLOCK_4X4 : BLOCK_8X8,
+                     pixels_wide, pixels_high,
+                     src, src_stride, ref, ref_stride);
+
+  // Fill in the entire tree of variances and compute splits.
+  if (is_key_frame)  {
+    fill_variance_tree(vt, BLOCK_4X4);
+    check_split_key_frame(vt, thre[1]);
+  } else {
+    fill_variance_tree(vt, BLOCK_8X8);
+    check_split(cpi, vt, segment_id, thre);
+    if (low_res) {
+      refine_variance_tree(vt, thre[1] << 1);
     }
   }
 
-  // Fill the rest of the variance tree by summing split partition values.
-  for (i = 0; i < 4; i++) {
-    const int i2 = i << 2;
-    for (j = 0; j < 4; j++) {
-      if (variance4x4downsample[i2 + j] == 1) {
-        v16x16 *vtemp = (!is_key_frame) ? &vt2[i2 + j] :
-            &vt.split[i].split[j];
-        for (m = 0; m < 4; m++)
-          fill_variance_tree(&vtemp->split[m], BLOCK_8X8);
-        fill_variance_tree(vtemp, BLOCK_16X16);
-      }
-    }
-    fill_variance_tree(&vt.split[i], BLOCK_32X32);
-    // If variance of this 32x32 block is above the threshold, force the block
-    // to split. This also forces a split on the upper (64x64) level.
-    if (!force_split[i + 1]) {
-      get_variance(&vt.split[i].part_variances.none);
-      if (vt.split[i].part_variances.none.variance > thresholds[1]) {
-        force_split[i + 1] = 1;
-        force_split[0] = 1;
-      }
-    }
-  }
-  if (!force_split[0]) {
-    fill_variance_tree(&vt, BLOCK_64X64);
-    get_variance(&vt.part_variances.none);
-  }
+  vt->force_split |= mi_col + cm->mib_size > cm->mi_cols ||
+                     mi_row + cm->mib_size > cm->mi_rows;
 
   // Now go through the entire structure, splitting every block size until
   // we get to one that's got a variance lower than our threshold.
-  if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
-      !set_vt_partitioning(cpi, x, xd, &vt, BLOCK_64X64, mi_row, mi_col,
-                           thresholds[0], BLOCK_16X16, force_split[0])) {
-    for (i = 0; i < 4; ++i) {
-      const int x32_idx = ((i & 1) << 2);
-      const int y32_idx = ((i >> 1) << 2);
-      const int i2 = i << 2;
-      if (!set_vt_partitioning(cpi, x, xd, &vt.split[i], BLOCK_32X32,
-                               (mi_row + y32_idx), (mi_col + x32_idx),
-                               thresholds[1], BLOCK_16X16,
-                               force_split[i + 1])) {
-        for (j = 0; j < 4; ++j) {
-          const int x16_idx = ((j & 1) << 1);
-          const int y16_idx = ((j >> 1) << 1);
-          // For inter frames: if variance4x4downsample[] == 1 for this 16x16
-          // block, then the variance is based on 4x4 down-sampling, so use vt2
-          // in set_vt_partioning(), otherwise use vt.
-          v16x16 *vtemp = (!is_key_frame &&
-                           variance4x4downsample[i2 + j] == 1) ?
-                           &vt2[i2 + j] : &vt.split[i].split[j];
-          if (!set_vt_partitioning(cpi, x, xd, vtemp, BLOCK_16X16,
-                                   mi_row + y32_idx + y16_idx,
-                                   mi_col + x32_idx + x16_idx,
-                                   thresholds[2],
-                                   cpi->vbp_bsize_min,
-                                   force_split[5 + i2  + j])) {
-            for (k = 0; k < 4; ++k) {
-              const int x8_idx = (k & 1);
-              const int y8_idx = (k >> 1);
-              if (use_4x4_partition) {
-                if (!set_vt_partitioning(cpi, x, xd, &vtemp->split[k],
-                                         BLOCK_8X8,
-                                         mi_row + y32_idx + y16_idx + y8_idx,
-                                         mi_col + x32_idx + x16_idx + x8_idx,
-                                         thresholds[3], BLOCK_8X8, 0)) {
-                  set_block_size(cpi, x, xd,
-                                 (mi_row + y32_idx + y16_idx + y8_idx),
-                                 (mi_col + x32_idx + x16_idx + x8_idx),
-                                 BLOCK_4X4);
-                }
-              } else {
-                set_block_size(cpi, x, xd,
-                               (mi_row + y32_idx + y16_idx + y8_idx),
-                               (mi_col + x32_idx + x16_idx + x8_idx),
-                               BLOCK_8X8);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  return 0;
+  set_vt_partitioning(cpi, x, xd, vt, mi_row, mi_col, thre, bmin);
 }
 
 static void update_state(VP10_COMP *cpi, ThreadData *td,
@@ -1309,7 +1196,7 @@
                                  int mi_row, int mi_col, BLOCK_SIZE bsize,
                                  int output_enabled) {
   int y, x_idx;
-#if CONFIG_VAR_TX
+#if CONFIG_VAR_TX || CONFIG_REF_MV
   int i;
 #endif
   VP10_COMMON *const cm = &cpi->common;
@@ -2022,13 +1909,14 @@
         !supertx_enabled &&
 #endif
         is_interintra_allowed(mbmi)) {
+      const int bsize_group = size_group_lookup[bsize];
       if (mbmi->ref_frame[1] == INTRA_FRAME) {
-        counts->y_mode[size_group_lookup[bsize]][mbmi->interintra_mode]++;
-        counts->interintra[bsize][1]++;
-        if (get_wedge_bits(bsize))
+        counts->interintra[bsize_group][1]++;
+        counts->interintra_mode[bsize_group][mbmi->interintra_mode]++;
+        if (is_interintra_wedge_used(bsize))
           counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
       } else {
-        counts->interintra[bsize][0]++;
+        counts->interintra[bsize_group][0]++;
       }
     }
     if (cm->reference_mode != SINGLE_REFERENCE &&
@@ -2036,7 +1924,7 @@
 #if CONFIG_OBMC
         !(is_obmc_allowed(mbmi) && mbmi->obmc) &&
 #endif  // CONFIG_OBMC
-        get_wedge_bits(bsize)) {
+        is_interinter_wedge_used(bsize)) {
       counts->wedge_interinter[bsize][mbmi->use_wedge_interinter]++;
     }
 #endif  // CONFIG_EXT_INTER
@@ -2146,15 +2034,15 @@
 }
 
 typedef struct {
-  ENTROPY_CONTEXT a[2 * MI_BLOCK_SIZE * MAX_MB_PLANE];
-  ENTROPY_CONTEXT l[2 * MI_BLOCK_SIZE * MAX_MB_PLANE];
-  PARTITION_CONTEXT sa[MI_BLOCK_SIZE];
-  PARTITION_CONTEXT sl[MI_BLOCK_SIZE];
+  ENTROPY_CONTEXT a[2 * MAX_MIB_SIZE * MAX_MB_PLANE];
+  ENTROPY_CONTEXT l[2 * MAX_MIB_SIZE * MAX_MB_PLANE];
+  PARTITION_CONTEXT sa[MAX_MIB_SIZE];
+  PARTITION_CONTEXT sl[MAX_MIB_SIZE];
 #if CONFIG_VAR_TX
   TXFM_CONTEXT *p_ta;
   TXFM_CONTEXT *p_tl;
-  TXFM_CONTEXT ta[MI_BLOCK_SIZE];
-  TXFM_CONTEXT tl[MI_BLOCK_SIZE];
+  TXFM_CONTEXT ta[MAX_MIB_SIZE];
+  TXFM_CONTEXT tl[MAX_MIB_SIZE];
 #endif
 } RD_SEARCH_MACROBLOCK_CONTEXT;
 
@@ -2175,14 +2063,14 @@
         xd->plane[p].subsampling_x);
     memcpy(
         xd->left_context[p]
-            + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+            + ((mi_row & MAX_MIB_MASK) * 2 >> xd->plane[p].subsampling_y),
         ctx->l + num_4x4_blocks_high * p,
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
         xd->plane[p].subsampling_y);
   }
   memcpy(xd->above_seg_context + mi_col, ctx->sa,
          sizeof(*xd->above_seg_context) * mi_width);
-  memcpy(xd->left_seg_context + (mi_row & MI_MASK), ctx->sl,
+  memcpy(xd->left_seg_context + (mi_row & MAX_MIB_MASK), ctx->sl,
          sizeof(xd->left_seg_context[0]) * mi_height);
 #if CONFIG_VAR_TX
   xd->above_txfm_context = ctx->p_ta;
@@ -2214,13 +2102,13 @@
     memcpy(
         ctx->l + num_4x4_blocks_high * p,
         xd->left_context[p]
-            + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+            + ((mi_row & MAX_MIB_MASK) * 2 >> xd->plane[p].subsampling_y),
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
         xd->plane[p].subsampling_y);
   }
   memcpy(ctx->sa, xd->above_seg_context + mi_col,
          sizeof(*xd->above_seg_context) * mi_width);
-  memcpy(ctx->sl, xd->left_seg_context + (mi_row & MI_MASK),
+  memcpy(ctx->sl, xd->left_seg_context + (mi_row & MAX_MIB_MASK),
          sizeof(xd->left_seg_context[0]) * mi_height);
 #if CONFIG_VAR_TX
   memcpy(ctx->ta, xd->above_txfm_context,
@@ -2262,35 +2150,24 @@
                       TOKENEXTRA **tp, int mi_row, int mi_col,
                       int output_enabled, BLOCK_SIZE bsize,
                       PC_TREE *pc_tree) {
-  VP10_COMMON *const cm = &cpi->common;
+  const VP10_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
-  int ctx;
-  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize = bsize;
+  const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+  const PARTITION_TYPE partition = pc_tree->partitioning;
+  const BLOCK_SIZE subsize =  get_subsize(bsize, partition);
 #if CONFIG_EXT_PARTITION_TYPES
-  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+  const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
 #endif
 
+  assert(bsize >= BLOCK_8X8);
+
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  if (bsize >= BLOCK_8X8) {
-    ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
-    subsize = get_subsize(bsize, pc_tree->partitioning);
-  } else {
-    ctx = 0;
-    subsize = BLOCK_4X4;
-  }
-
-  partition = partition_lookup[bsl][subsize];
-#if CONFIG_EXT_PARTITION_TYPES
-  if (bsize > BLOCK_8X8)
-    partition = pc_tree->partitioning;
-#endif
-  if (output_enabled && bsize != BLOCK_4X4)
+  if (output_enabled)
     td->counts->partition[ctx][partition]++;
 
 #if CONFIG_SUPERTX
@@ -2494,7 +2371,7 @@
 }
 
 // Check to see if the given partition size is allowed for a specified number
-// of 8x8 block rows and columns remaining in the image.
+// of mi block rows and columns remaining in the image.
 // If not then return the largest allowed partition size
 static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
                                       int rows_left, int cols_left,
@@ -2513,62 +2390,64 @@
   return bsize;
 }
 
-static void set_partial_b64x64_partition(MODE_INFO *mi, int mis,
-    int bh_in, int bw_in, int row8x8_remaining, int col8x8_remaining,
-    BLOCK_SIZE bsize, MODE_INFO **mi_8x8) {
+static void set_partial_sb_partition(const VP10_COMMON *const cm,
+                                     MODE_INFO *mi,
+                                     int bh_in, int bw_in,
+                                     int mi_rows_remaining,
+                                     int mi_cols_remaining,
+                                     BLOCK_SIZE bsize, MODE_INFO **mib) {
   int bh = bh_in;
   int r, c;
-  for (r = 0; r < MI_BLOCK_SIZE; r += bh) {
+  for (r = 0; r < cm->mib_size; r += bh) {
     int bw = bw_in;
-    for (c = 0; c < MI_BLOCK_SIZE; c += bw) {
-      const int index = r * mis + c;
-      mi_8x8[index] = mi + index;
-      mi_8x8[index]->mbmi.sb_type = find_partition_size(bsize,
-          row8x8_remaining - r, col8x8_remaining - c, &bh, &bw);
+    for (c = 0; c < cm->mib_size; c += bw) {
+      const int index = r * cm->mi_stride + c;
+      mib[index] = mi + index;
+      mib[index]->mbmi.sb_type = find_partition_size(bsize,
+          mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw);
     }
   }
 }
 
-// This function attempts to set all mode info entries in a given SB64
+// This function attempts to set all mode info entries in a given superblock
 // to the same block partition size.
 // However, at the bottom and right borders of the image the requested size
 // may not be allowed in which case this code attempts to choose the largest
 // allowable partition.
 static void set_fixed_partitioning(VP10_COMP *cpi, const TileInfo *const tile,
-                                   MODE_INFO **mi_8x8, int mi_row, int mi_col,
+                                   MODE_INFO **mib, int mi_row, int mi_col,
                                    BLOCK_SIZE bsize) {
   VP10_COMMON *const cm = &cpi->common;
-  const int mis = cm->mi_stride;
-  const int row8x8_remaining = tile->mi_row_end - mi_row;
-  const int col8x8_remaining = tile->mi_col_end - mi_col;
+  const int mi_rows_remaining = tile->mi_row_end - mi_row;
+  const int mi_cols_remaining = tile->mi_col_end - mi_col;
   int block_row, block_col;
-  MODE_INFO *mi_upper_left = cm->mi + mi_row * mis + mi_col;
+  MODE_INFO *const mi_upper_left = cm->mi + mi_row * cm->mi_stride + mi_col;
   int bh = num_8x8_blocks_high_lookup[bsize];
   int bw = num_8x8_blocks_wide_lookup[bsize];
 
-  assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
+  assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0));
 
-  // Apply the requested partition size to the SB64 if it is all "in image"
-  if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
-      (row8x8_remaining >= MI_BLOCK_SIZE)) {
-    for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) {
-      for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
-        int index = block_row * mis + block_col;
-        mi_8x8[index] = mi_upper_left + index;
-        mi_8x8[index]->mbmi.sb_type = bsize;
+  // Apply the requested partition size to the SB if it is all "in image"
+  if ((mi_cols_remaining >= cm->mib_size) &&
+      (mi_rows_remaining >= cm->mib_size)) {
+    for (block_row = 0; block_row < cm->mib_size; block_row += bh) {
+      for (block_col = 0; block_col < cm->mib_size; block_col += bw) {
+        int index = block_row * cm->mi_stride + block_col;
+        mib[index] = mi_upper_left + index;
+        mib[index]->mbmi.sb_type = bsize;
       }
     }
   } else {
-    // Else this is a partial SB64.
-    set_partial_b64x64_partition(mi_upper_left, mis, bh, bw, row8x8_remaining,
-        col8x8_remaining, bsize, mi_8x8);
+    // Else this is a partial SB.
+    set_partial_sb_partition(cm, mi_upper_left, bh, bw,
+                             mi_rows_remaining, mi_cols_remaining, bsize, mib);
   }
 }
 
 static void rd_use_partition(VP10_COMP *cpi,
                              ThreadData *td,
                              TileDataEnc *tile_data,
-                             MODE_INFO **mi_8x8, TOKENEXTRA **tp,
+                             MODE_INFO **mib, TOKENEXTRA **tp,
                              int mi_row, int mi_col,
                              BLOCK_SIZE bsize,
                              int *rate, int64_t *dist,
@@ -2580,18 +2459,17 @@
   TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const int mis = cm->mi_stride;
-  const int bsl = b_width_log2_lookup[bsize];
-  const int mi_step = num_4x4_blocks_wide_lookup[bsize] / 2;
-  const int bss = (1 << bsl) / 4;
-  int i, pl;
-  PARTITION_TYPE partition = PARTITION_NONE;
-  BLOCK_SIZE subsize;
+  const int bs = num_8x8_blocks_wide_lookup[bsize];
+  const int hbs = bs / 2;
+  int i;
+  const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
   RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
   RD_COST last_part_rdc, none_rdc, chosen_rdc;
   BLOCK_SIZE sub_subsize = BLOCK_4X4;
   int splits_below = 0;
-  BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type;
+  BLOCK_SIZE bs_type = mib[0]->mbmi.sb_type;
   int do_partition_search = 1;
   PICK_MODE_CONTEXT *ctx = &pc_tree->none;
 #if CONFIG_SUPERTX
@@ -2600,10 +2478,6 @@
   int chosen_rate_nocoef = INT_MAX;
 #endif
 
-#if CONFIG_EXT_PARTITION_TYPES
-  assert(0);
-#endif
-
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
@@ -2614,14 +2488,12 @@
   vp10_rd_cost_reset(&none_rdc);
   vp10_rd_cost_reset(&chosen_rdc);
 
-  partition = partition_lookup[bsl][bs_type];
-  subsize = get_subsize(bsize, partition);
-
   pc_tree->partitioning = partition;
 
 #if CONFIG_VAR_TX
   xd->above_txfm_context = cm->above_txfm_context + mi_col;
-  xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MI_MASK);
+  xd->left_txfm_context =
+    xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
 #endif
 
   save_context(x, &x_ctx, mi_row, mi_col, bsize);
@@ -2640,7 +2512,7 @@
       splits_below = 1;
       for (i = 0; i < 4; i++) {
         int jj = i >> 1, ii = i & 0x01;
-        MODE_INFO *this_mi = mi_8x8[jj * bss * mis + ii * bss];
+        MODE_INFO *this_mi = mib[jj * hbs * cm->mi_stride + ii * hbs];
         if (this_mi && this_mi->mbmi.sb_type >= sub_subsize) {
           splits_below = 0;
         }
@@ -2650,8 +2522,8 @@
     // If partition is not none try none unless each of the 4 splits are split
     // even further..
     if (partition != PARTITION_NONE && !splits_below &&
-        mi_row + (mi_step >> 1) < cm->mi_rows &&
-        mi_col + (mi_step >> 1) < cm->mi_cols) {
+        mi_row + hbs < cm->mi_rows &&
+        mi_col + hbs < cm->mi_cols) {
       pc_tree->partitioning = PARTITION_NONE;
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
 #if CONFIG_SUPERTX
@@ -2662,8 +2534,6 @@
 #endif
                        bsize, ctx, INT64_MAX);
 
-      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-
       if (none_rdc.rate < INT_MAX) {
         none_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
         none_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, none_rdc.rate,
@@ -2675,7 +2545,7 @@
 
       restore_context(x, &x_ctx, mi_row, mi_col, bsize);
 
-      mi_8x8[0]->mbmi.sb_type = bs_type;
+      mib[0]->mbmi.sb_type = bs_type;
       pc_tree->partitioning = partition;
     }
   }
@@ -2702,7 +2572,7 @@
                        subsize, &pc_tree->horizontal[0],
                        INT64_MAX);
       if (last_part_rdc.rate != INT_MAX &&
-          bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) {
+          bsize >= BLOCK_8X8 && mi_row + hbs < cm->mi_rows) {
         RD_COST tmp_rdc;
 #if CONFIG_SUPERTX
         int rt_nocoef = 0;
@@ -2712,7 +2582,7 @@
         update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
         encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
         rd_pick_sb_modes(cpi, tile_data, x,
-                         mi_row + (mi_step >> 1), mi_col, &tmp_rdc,
+                         mi_row + hbs, mi_col, &tmp_rdc,
 #if CONFIG_SUPERTX
                          &rt_nocoef,
 #endif
@@ -2745,7 +2615,7 @@
 #endif
                        subsize, &pc_tree->vertical[0], INT64_MAX);
       if (last_part_rdc.rate != INT_MAX &&
-          bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) {
+          bsize >= BLOCK_8X8 && mi_col + hbs < cm->mi_cols) {
         RD_COST tmp_rdc;
 #if CONFIG_SUPERTX
         int rt_nocoef = 0;
@@ -2755,7 +2625,7 @@
         update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
         encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
         rd_pick_sb_modes(cpi, tile_data, x,
-                         mi_row, mi_col + (mi_step >> 1), &tmp_rdc,
+                         mi_row, mi_col + hbs, &tmp_rdc,
 #if CONFIG_SUPERTX
                          &rt_nocoef,
 #endif
@@ -2798,8 +2668,8 @@
       last_part_rate_nocoef = 0;
 #endif
       for (i = 0; i < 4; i++) {
-        int x_idx = (i & 1) * (mi_step >> 1);
-        int y_idx = (i >> 1) * (mi_step >> 1);
+        int x_idx = (i & 1) * hbs;
+        int y_idx = (i >> 1) * hbs;
         int jj = i >> 1, ii = i & 0x01;
         RD_COST tmp_rdc;
 #if CONFIG_SUPERTX
@@ -2810,7 +2680,7 @@
 
         vp10_rd_cost_init(&tmp_rdc);
         rd_use_partition(cpi, td, tile_data,
-                         mi_8x8 + jj * bss * mis + ii * bss, tp,
+                         mib + jj * hbs * cm->mi_stride + ii * hbs, tp,
                          mi_row + y_idx, mi_col + x_idx, subsize,
                          &tmp_rdc.rate, &tmp_rdc.dist,
 #if CONFIG_SUPERTX
@@ -2831,12 +2701,18 @@
 #endif
       }
       break;
+#if CONFIG_EXT_PARTITION_TYPES
+    case PARTITION_VERT_A:
+    case PARTITION_VERT_B:
+    case PARTITION_HORZ_A:
+    case PARTITION_HORZ_B:
+      assert(0 && "Cannot handle extended partition types");
+#endif  //  CONFIG_EXT_PARTITION_TYPES
     default:
       assert(0);
       break;
   }
 
-  pl = partition_plane_context(xd, mi_row, mi_col, bsize);
   if (last_part_rdc.rate < INT_MAX) {
     last_part_rdc.rate += cpi->partition_cost[pl][partition];
     last_part_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
@@ -2850,10 +2726,10 @@
       && cpi->sf.adjust_partitioning_from_last_frame
       && cpi->sf.partition_search_type == SEARCH_PARTITION
       && partition != PARTITION_SPLIT && bsize > BLOCK_8X8
-      && (mi_row + mi_step < cm->mi_rows ||
-          mi_row + (mi_step >> 1) == cm->mi_rows)
-      && (mi_col + mi_step < cm->mi_cols ||
-          mi_col + (mi_step >> 1) == cm->mi_cols)) {
+      && (mi_row + bs < cm->mi_rows ||
+          mi_row + hbs == cm->mi_rows)
+      && (mi_col + bs < cm->mi_cols ||
+          mi_col + hbs == cm->mi_cols)) {
     BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
     chosen_rdc.rate = 0;
     chosen_rdc.dist = 0;
@@ -2867,8 +2743,8 @@
 
     // Split partition.
     for (i = 0; i < 4; i++) {
-      int x_idx = (i & 1) * (mi_step >> 1);
-      int y_idx = (i >> 1) * (mi_step >> 1);
+      int x_idx = (i & 1) * hbs;
+      int y_idx = (i >> 1) * hbs;
       RD_COST tmp_rdc;
 #if CONFIG_SUPERTX
       int rt_nocoef = 0;
@@ -2910,14 +2786,11 @@
         encode_sb(cpi, td, tile_info, tp,  mi_row + y_idx, mi_col + x_idx, 0,
                   split_subsize, pc_tree->split[i]);
 
-      pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
-                                   split_subsize);
       chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
 #if CONFIG_SUPERTX
       chosen_rate_nocoef += cpi->partition_cost[pl][PARTITION_SPLIT];
 #endif
     }
-    pl = partition_plane_context(xd, mi_row, mi_col, bsize);
     if (chosen_rdc.rate < INT_MAX) {
       chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
       chosen_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
@@ -2930,7 +2803,7 @@
 
   // If last_part is better set the partitioning to that.
   if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
-    mi_8x8[0]->mbmi.sb_type = bsize;
+    mib[0]->mbmi.sb_type = bsize;
     if (bsize >= BLOCK_8X8)
       pc_tree->partitioning = partition;
     chosen_rdc = last_part_rdc;
@@ -2952,11 +2825,11 @@
 
   // We must have chosen a partitioning and encoding or we'll fail later on.
   // No other opportunities for success.
-  if (bsize == BLOCK_LARGEST)
+  if (bsize == cm->sb_size)
     assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
 
   if (do_recon) {
-    int output_enabled = (bsize == BLOCK_LARGEST);
+    int output_enabled = (bsize == cm->sb_size);
     encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
               pc_tree);
   }
@@ -2980,13 +2853,13 @@
 };
 
 static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = {
-                               BLOCK_8X8,   //                     4x4
-  BLOCK_16X16, BLOCK_16X16,  BLOCK_16X16,   //    4x8,    8x4,     8x8
-  BLOCK_32X32, BLOCK_32X32,  BLOCK_32X32,   //   8x16,   16x8,   16x16
-  BLOCK_64X64, BLOCK_64X64,  BLOCK_64X64,   //  16x32,  32x16,   32x32
-  BLOCK_64X64, BLOCK_64X64,  BLOCK_64X64,   //  32x64,  64x32,   64x64
+                                    BLOCK_8X8,  //                     4x4
+    BLOCK_16X16,   BLOCK_16X16,   BLOCK_16X16,  //    4x8,    8x4,     8x8
+    BLOCK_32X32,   BLOCK_32X32,   BLOCK_32X32,  //   8x16,   16x8,   16x16
+    BLOCK_64X64,   BLOCK_64X64,   BLOCK_64X64,  //  16x32,  32x16,   32x32
+  BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST,  //  32x64,  64x32,   64x64
 #if CONFIG_EXT_PARTITION
-  BLOCK_64X64, BLOCK_64X64, BLOCK_128X128   // 64x128, 128x64, 128x128
+  BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST   // 64x128, 128x64, 128x128
 #endif  // CONFIG_EXT_PARTITION
 };
 
@@ -3004,26 +2877,24 @@
 
 // Look at all the mode_info entries for blocks that are part of this
 // partition and find the min and max values for sb_type.
-// At the moment this is designed to work on a 64x64 SB but could be
+// At the moment this is designed to work on a superblock but could be
 // adjusted to use a size parameter.
 //
 // The min and max are assumed to have been initialized prior to calling this
-// function so repeat calls can accumulate a min and max of more than one sb64.
-static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO **mi_8x8,
+// function so repeat calls can accumulate a min and max of more than one
+// superblock.
+static void get_sb_partition_size_range(const VP10_COMMON *const cm,
+                                        MACROBLOCKD *xd, MODE_INFO **mib,
                                         BLOCK_SIZE *min_block_size,
-                                        BLOCK_SIZE *max_block_size,
-                                        int bs_hist[BLOCK_SIZES]) {
-  int sb_width_in_blocks = MI_BLOCK_SIZE;
-  int sb_height_in_blocks  = MI_BLOCK_SIZE;
+                                        BLOCK_SIZE *max_block_size) {
   int i, j;
   int index = 0;
 
   // Check the sb_type for each block that belongs to this region.
-  for (i = 0; i < sb_height_in_blocks; ++i) {
-    for (j = 0; j < sb_width_in_blocks; ++j) {
-      MODE_INFO *mi = mi_8x8[index+j];
-      BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : 0;
-      bs_hist[sb_type]++;
+  for (i = 0; i < cm->mib_size; ++i) {
+    for (j = 0; j < cm->mib_size; ++j) {
+      MODE_INFO *mi = mib[index+j];
+      BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : BLOCK_4X4;
       *min_block_size = VPXMIN(*min_block_size, sb_type);
       *max_block_size = VPXMAX(*max_block_size, sb_type);
     }
@@ -3042,12 +2913,11 @@
   MODE_INFO **mi = xd->mi;
   const int left_in_image = xd->left_available && mi[-1];
   const int above_in_image = xd->up_available && mi[-xd->mi_stride];
-  const int row8x8_remaining = tile->mi_row_end - mi_row;
-  const int col8x8_remaining = tile->mi_col_end - mi_col;
+  const int mi_rows_remaining = tile->mi_row_end - mi_row;
+  const int mi_cols_remaining = tile->mi_col_end - mi_col;
   int bh, bw;
   BLOCK_SIZE min_size = BLOCK_4X4;
   BLOCK_SIZE max_size = BLOCK_LARGEST;
-  int bs_hist[BLOCK_SIZES] = {0};
 
   // Trap case where we do not have a prediction.
   if (left_in_image || above_in_image || cm->frame_type != KEY_FRAME) {
@@ -3061,19 +2931,17 @@
     if (cm->frame_type != KEY_FRAME) {
       MODE_INFO **prev_mi =
           &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col];
-      get_sb_partition_size_range(xd, prev_mi, &min_size, &max_size, bs_hist);
+      get_sb_partition_size_range(cm, xd, prev_mi, &min_size, &max_size);
     }
-    // Find the min and max partition sizes used in the left SB64
+    // Find the min and max partition sizes used in the left superblock
     if (left_in_image) {
-      MODE_INFO **left_sb64_mi = &mi[-MI_BLOCK_SIZE];
-      get_sb_partition_size_range(xd, left_sb64_mi, &min_size, &max_size,
-                                  bs_hist);
+      MODE_INFO **left_sb_mi = &mi[-cm->mib_size];
+      get_sb_partition_size_range(cm, xd, left_sb_mi, &min_size, &max_size);
     }
-    // Find the min and max partition sizes used in the above SB64.
+    // Find the min and max partition sizes used in the above superblock.
     if (above_in_image) {
-      MODE_INFO **above_sb64_mi = &mi[-xd->mi_stride * MI_BLOCK_SIZE];
-      get_sb_partition_size_range(xd, above_sb64_mi, &min_size, &max_size,
-                                  bs_hist);
+      MODE_INFO **above_sb_mi = &mi[-xd->mi_stride * cm->mib_size];
+      get_sb_partition_size_range(cm, xd, above_sb_mi, &min_size, &max_size);
     }
 
     // Adjust observed min and max for "relaxed" auto partition case.
@@ -3084,29 +2952,28 @@
   }
 
   // Check border cases where max and min from neighbors may not be legal.
-  max_size = find_partition_size(max_size,
-                                 row8x8_remaining, col8x8_remaining,
+  max_size = find_partition_size(max_size, mi_rows_remaining, mi_cols_remaining,
                                  &bh, &bw);
+  min_size = VPXMIN(min_size, max_size);
+
   // Test for blocks at the edge of the active image.
   // This may be the actual edge of the image or where there are formatting
   // bars.
   if (vp10_active_edge_sb(cpi, mi_row, mi_col)) {
     min_size = BLOCK_4X4;
   } else {
-    min_size =
-        VPXMIN(cpi->sf.rd_auto_partition_min_limit, VPXMIN(min_size, max_size));
+    min_size = VPXMIN(cpi->sf.rd_auto_partition_min_limit, min_size);
   }
 
   // When use_square_partition_only is true, make sure at least one square
   // partition is allowed by selecting the next smaller square size as
   // *min_block_size.
-  if (cpi->sf.use_square_partition_only &&
-      next_square_size[max_size] < min_size) {
-     min_size = next_square_size[max_size];
+  if (cpi->sf.use_square_partition_only) {
+    min_size = VPXMIN(min_size, next_square_size[max_size]);
   }
 
-  *min_block_size = min_size;
-  *max_block_size = max_size;
+  *min_block_size = VPXMIN(min_size, cm->sb_size);
+  *max_block_size = VPXMIN(max_size, cm->sb_size);
 }
 
 // TODO(jingning) refactor functions setting partition search range
@@ -3159,8 +3026,8 @@
     max_size = max_partition_size[max_size];
   }
 
-  *min_bs = min_size;
-  *max_bs = max_size;
+  *min_bs = VPXMIN(min_size, cm->sb_size);
+  *max_bs = VPXMIN(max_size, cm->sb_size);
 }
 
 static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
@@ -3508,7 +3375,8 @@
 
 #if CONFIG_VAR_TX
   xd->above_txfm_context = cm->above_txfm_context + mi_col;
-  xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MI_MASK);
+  xd->left_txfm_context =
+    xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
 #endif
 
   save_context(x, &x_ctx, mi_row, mi_col, bsize);
@@ -4203,12 +4071,12 @@
 
   if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
       pc_tree->index != 3) {
-    int output_enabled = (bsize == BLOCK_LARGEST);
+    int output_enabled = (bsize == cm->sb_size);
     encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
               bsize, pc_tree);
   }
 
-  if (bsize == BLOCK_LARGEST) {
+  if (bsize == cm->sb_size) {
     assert(tp_orig < *tp || (tp_orig == *tp && xd->mi[0]->mbmi.skip));
     assert(best_rdc.rate < INT_MAX);
     assert(best_rdc.dist < INT64_MAX);
@@ -4239,7 +4107,7 @@
 
   // Code each SB in the row
   for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
-       mi_col += MI_BLOCK_SIZE) {
+       mi_col += cm->mib_size) {
     const struct segmentation *const seg = &cm->seg;
     int dummy_rate;
     int64_t dummy_dist;
@@ -4252,6 +4120,7 @@
 
     const int idx_str = cm->mi_stride * mi_row + mi_col;
     MODE_INFO **mi = cm->mi_grid_visible + idx_str;
+    PC_TREE *const pc_root = td->pc_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2];
 
     if (sf->adaptive_pred_interp_filter) {
       for (i = 0; i < leaf_nodes; ++i)
@@ -4266,61 +4135,60 @@
     }
 
     vp10_zero(x->pred_mv);
-    td->pc_root->index = 0;
+    pc_root->index = 0;
 
     if (seg->enabled) {
       const uint8_t *const map = seg->update_map ? cpi->segmentation_map
                                                  : cm->last_frame_seg_map;
-      int segment_id = get_segment_id(cm, map, BLOCK_LARGEST, mi_row, mi_col);
+      int segment_id = get_segment_id(cm, map, cm->sb_size, mi_row, mi_col);
       seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
     }
 
     x->source_variance = UINT_MAX;
     if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
-      const BLOCK_SIZE bsize =
-          seg_skip ? BLOCK_LARGEST : sf->always_this_block_size;
-      set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_LARGEST);
+      BLOCK_SIZE bsize;
+      set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
+      bsize = seg_skip ? cm->sb_size : sf->always_this_block_size;
       set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
-                       BLOCK_LARGEST, &dummy_rate, &dummy_dist,
+                       cm->sb_size, &dummy_rate, &dummy_dist,
 #if CONFIG_SUPERTX
                        &dummy_rate_nocoef,
 #endif  // CONFIG_SUPERTX
-                       1, td->pc_root);
+                       1, pc_root);
     } else if (cpi->partition_search_skippable_frame) {
       BLOCK_SIZE bsize;
-      set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_LARGEST);
+      set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
       bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
       set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
-                       BLOCK_LARGEST, &dummy_rate, &dummy_dist,
+                       cm->sb_size, &dummy_rate, &dummy_dist,
 #if CONFIG_SUPERTX
                        &dummy_rate_nocoef,
 #endif  // CONFIG_SUPERTX
-                       1, td->pc_root);
-    } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
-               cm->frame_type != KEY_FRAME) {
-      choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+                       1, pc_root);
+    } else if (sf->partition_search_type == VAR_BASED_PARTITION) {
+      choose_partitioning(cpi, td, tile_info, x, mi_row, mi_col);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
-                       BLOCK_LARGEST, &dummy_rate, &dummy_dist,
+                       cm->sb_size, &dummy_rate, &dummy_dist,
 #if CONFIG_SUPERTX
                        &dummy_rate_nocoef,
 #endif  // CONFIG_SUPERTX
-                       1, td->pc_root);
+                       1, pc_root);
     } else {
       // If required set upper and lower partition size limits
       if (sf->auto_min_max_partition_size) {
-        set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_LARGEST);
+        set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
         rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col,
                                 &x->min_partition_size,
                                 &x->max_partition_size);
       }
-      rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_LARGEST,
+      rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, cm->sb_size,
                         &dummy_rdc,
 #if CONFIG_SUPERTX
                         &dummy_rate_nocoef,
 #endif  // CONFIG_SUPERTX
-                        INT64_MAX, td->pc_root);
+                        INT64_MAX, pc_root);
     }
   }
 #if CONFIG_ENTROPY
@@ -4334,8 +4202,8 @@
       SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats;
 
       for (t = TX_4X4; t <= TX_32X32; ++t)
-        full_to_model_counts(cpi->td.counts->coef[t],
-                             cpi->td.rd_counts.coef_counts[t]);
+        vp10_full_to_model_counts(cpi->td.counts->coef[t],
+                                  cpi->td.rd_counts.coef_counts[t]);
       vp10_partial_adapt_probs(cm, mi_row, mi_col);
       ++cm->coef_probs_update_idx;
       vp10_copy(subframe_stats->coef_probs_buf[cm->coef_probs_update_idx],
@@ -4344,7 +4212,7 @@
                 cpi->td.rd_counts.coef_counts);
       vp10_copy(subframe_stats->eob_counts_buf[cm->coef_probs_update_idx],
                 cm->counts.eob_branch);
-      fill_token_costs(x->token_costs, cm->fc->coef_probs);
+      vp10_fill_token_costs(x->token_costs, cm->fc->coef_probs);
     }
   }
 #endif  // CONFIG_ENTROPY
@@ -4476,7 +4344,7 @@
   td->mb.ex_search_count_ptr = &td->rd_counts.ex_search_count;
 
   for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
-       mi_row += MI_BLOCK_SIZE) {
+       mi_row += cm->mib_size) {
     encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
   }
 
@@ -4519,6 +4387,9 @@
   RD_COUNTS *const rdc = &cpi->td.rd_counts;
   int i;
 
+  x->min_partition_size = VPXMIN(x->min_partition_size, cm->sb_size);
+  x->max_partition_size = VPXMIN(x->max_partition_size, cm->sb_size);
+
   xd->mi = cm->mi_grid_visible;
   xd->mi[0] = cm->mi;
 
@@ -4566,6 +4437,10 @@
 #endif
 #endif
 
+  if (cpi->sf.partition_search_type == VAR_BASED_PARTITION &&
+      cpi->td.var_root[0] == NULL)
+    vp10_setup_var_tree(&cpi->common, &cpi->td);
+
   {
     struct vpx_usec_timer emr_timer;
     vpx_usec_timer_start(&emr_timer);
@@ -4744,7 +4619,6 @@
     }
 #endif
   } else {
-    cm->reference_mode = SINGLE_REFERENCE;
     encode_frame_internal(cpi);
   }
 }
@@ -4848,7 +4722,8 @@
   int idx, idy;
 
   xd->above_txfm_context = cm->above_txfm_context + mi_col;
-  xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MI_MASK);
+  xd->left_txfm_context =
+    xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
 
   for (idy = 0; idy < mi_height; idy += bh)
     for (idx = 0; idx < mi_width; idx += bh)
@@ -4913,7 +4788,8 @@
   int idx, idy;
 
   xd->above_txfm_context = cm->above_txfm_context + mi_col;
-  xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MI_MASK);
+  xd->left_txfm_context =
+    xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
 
   for (idy = 0; idy < mi_height; idy += bh)
     for (idx = 0; idx < mi_width; idx += bh)
@@ -5158,29 +5034,20 @@
 static int check_intra_sb(VP10_COMP *cpi, const TileInfo *const tile,
                           int mi_row, int mi_col, BLOCK_SIZE bsize,
                           PC_TREE *pc_tree) {
-  VP10_COMMON *const cm = &cpi->common;
+  const VP10_COMMON *const cm = &cpi->common;
 
-  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
-  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize = bsize;
+  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+  const PARTITION_TYPE partition = pc_tree->partitioning;
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
 #if CONFIG_EXT_PARTITION_TYPES
   int i;
 #endif
 
+  assert(bsize >= BLOCK_8X8);
+
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return 1;
 
-  if (bsize >= BLOCK_8X8)
-    subsize = get_subsize(bsize, pc_tree->partitioning);
-  else
-    subsize = BLOCK_4X4;
-
-  partition = partition_lookup[bsl][subsize];
-#if CONFIG_EXT_PARTITION_TYPES
-  if (bsize > BLOCK_8X8)
-    partition = pc_tree->partitioning;
-#endif
-
   switch (partition) {
     case PARTITION_NONE:
       return check_intra_b(&pc_tree->none);
@@ -5516,14 +5383,15 @@
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
-  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize;
+  const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+  const PARTITION_TYPE partition = pc_tree->partitioning;
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
 #if CONFIG_EXT_PARTITION_TYPES
-  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+  const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
 #endif
 
-  int i, ctx;
+  int i;
   uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
   DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
   DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
@@ -5531,6 +5399,12 @@
   int dst_stride1[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE};
   int dst_stride2[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE};
   int dst_stride3[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE};
+
+  assert(bsize >= BLOCK_8X8);
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     int len = sizeof(uint16_t);
@@ -5558,23 +5432,8 @@
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
-    return;
-
-  if (bsize >= BLOCK_8X8) {
-    ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
-    subsize = get_subsize(bsize, pc_tree->partitioning);
-  } else {
-    ctx = 0;
-    subsize = BLOCK_4X4;
-  }
-  partition = partition_lookup[bsl][subsize];
-#if CONFIG_EXT_PARTITION_TYPES
-  if (bsize > BLOCK_8X8)
-    partition = pc_tree->partitioning;
-#endif
-  if (output_enabled && bsize != BLOCK_4X4 && bsize < top_bsize)
-      cm->counts.partition[ctx][partition]++;
+  if (output_enabled && bsize < top_bsize)
+    cm->counts.partition[ctx][partition]++;
 
   for (i = 0; i < MAX_MB_PLANE; i++) {
     xd->plane[i].dst.buf = dst_buf[i];
@@ -6112,8 +5971,8 @@
   sse_uv = 0;
   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
 #if CONFIG_VAR_TX
-    ENTROPY_CONTEXT ctxa[2 * MI_BLOCK_SIZE];
-    ENTROPY_CONTEXT ctxl[2 * MI_BLOCK_SIZE];
+    ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
+    ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
     const struct macroblockd_plane *const pd = &xd->plane[plane];
     int coeff_ctx = 1;
 
@@ -6157,8 +6016,8 @@
 #endif  // CONFIG_EXT_TX
   for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
 #if CONFIG_VAR_TX
-    ENTROPY_CONTEXT ctxa[2 * MI_BLOCK_SIZE];
-    ENTROPY_CONTEXT ctxl[2 * MI_BLOCK_SIZE];
+    ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
+    ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
     const struct macroblockd_plane *const pd = &xd->plane[0];
     int coeff_ctx = 1;
 #endif  // CONFIG_VAR_TX
diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c
index 060fe0b..9acf00c 100644
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c
@@ -29,8 +29,8 @@
 #include "vp10/encoder/tokenize.h"
 
 struct optimize_ctx {
-  ENTROPY_CONTEXT ta[MAX_MB_PLANE][2 * MI_BLOCK_SIZE];
-  ENTROPY_CONTEXT tl[MAX_MB_PLANE][2 * MI_BLOCK_SIZE];
+  ENTROPY_CONTEXT ta[MAX_MB_PLANE][2 * MAX_MIB_SIZE];
+  ENTROPY_CONTEXT tl[MAX_MB_PLANE][2 * MAX_MIB_SIZE];
 };
 
 void vp10_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
diff --git a/vp10/encoder/encodemv.c b/vp10/encoder/encodemv.c
index a5bfd1a..a2d0659 100644
--- a/vp10/encoder/encodemv.c
+++ b/vp10/encoder/encodemv.c
@@ -31,7 +31,7 @@
   vp10_tokens_from_tree(mv_fp_encodings, vp10_mv_fp_tree);
 }
 
-static void encode_mv_component(vpx_writer* w, int comp,
+static void encode_mv_component(vp10_writer* w, int comp,
                                 const nmv_component* mvcomp, int usehp) {
   int offset;
   const int sign = comp < 0;
@@ -44,7 +44,7 @@
   assert(comp != 0);
 
   // Sign
-  vpx_write(w, sign, mvcomp->sign);
+  vp10_write(w, sign, mvcomp->sign);
 
   // Class
   vp10_write_token(w, vp10_mv_class_tree, mvcomp->classes,
@@ -58,7 +58,7 @@
     int i;
     const int n = mv_class + CLASS0_BITS - 1;  // number of bits
     for (i = 0; i < n; ++i)
-      vpx_write(w, (d >> i) & 1, mvcomp->bits[i]);
+      vp10_write(w, (d >> i) & 1, mvcomp->bits[i]);
   }
 
   // Fractional bits
@@ -68,7 +68,7 @@
 
   // High precision bit
   if (usehp)
-    vpx_write(w, hp,
+    vp10_write(w, hp,
               mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp);
 }
 
@@ -135,7 +135,7 @@
   }
 }
 
-static void update_mv(vpx_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
+static void update_mv(vp10_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
                       vpx_prob upd_p) {
   (void) upd_p;
   vp10_cond_prob_diff_update(w, cur_p, ct);
@@ -144,7 +144,7 @@
 static void write_mv_update(const vpx_tree_index *tree,
                             vpx_prob probs[/*n - 1*/],
                             const unsigned int counts[/*n - 1*/],
-                            int n, vpx_writer *w) {
+                            int n, vp10_writer *w) {
   int i;
   unsigned int branch_ct[32][2];
 
@@ -156,7 +156,7 @@
     update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB);
 }
 
-void vp10_write_nmv_probs(VP10_COMMON *cm, int usehp, vpx_writer *w,
+void vp10_write_nmv_probs(VP10_COMMON *cm, int usehp, vp10_writer *w,
                           nmv_context_counts *const nmv_counts) {
   int i, j;
 #if CONFIG_REF_MV
@@ -235,7 +235,7 @@
 #endif
 }
 
-void vp10_encode_mv(VP10_COMP* cpi, vpx_writer* w,
+void vp10_encode_mv(VP10_COMP* cpi, vp10_writer* w,
                    const MV* mv, const MV* ref,
                    const nmv_context* mvctx, int usehp) {
   const MV diff = {mv->row - ref->row,
diff --git a/vp10/encoder/encodemv.h b/vp10/encoder/encodemv.h
index c753d34..a026b04 100644
--- a/vp10/encoder/encodemv.h
+++ b/vp10/encoder/encodemv.h
@@ -20,10 +20,10 @@
 
 void vp10_entropy_mv_init(void);
 
-void vp10_write_nmv_probs(VP10_COMMON *cm, int usehp, vpx_writer *w,
+void vp10_write_nmv_probs(VP10_COMMON *cm, int usehp, vp10_writer *w,
                           nmv_context_counts *const counts);
 
-void vp10_encode_mv(VP10_COMP *cpi, vpx_writer* w, const MV* mv, const MV* ref,
+void vp10_encode_mv(VP10_COMP *cpi, vp10_writer* w, const MV* mv, const MV* ref,
                    const nmv_context* mvctx, int usehp);
 
 void vp10_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index 77af3dd..b34b15e 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -248,6 +248,29 @@
 #endif
 }
 
+static BLOCK_SIZE select_sb_size(const VP10_COMP *const cpi) {
+#if CONFIG_EXT_PARTITION
+  if (cpi->oxcf.superblock_size == VPX_SUPERBLOCK_SIZE_64X64)
+    return BLOCK_64X64;
+
+  if (cpi->oxcf.superblock_size == VPX_SUPERBLOCK_SIZE_128X128)
+    return BLOCK_128X128;
+
+  assert(cpi->oxcf.superblock_size == VPX_SUPERBLOCK_SIZE_DYNAMIC);
+
+  assert(IMPLIES(cpi->common.tile_cols > 1,
+                 cpi->common.tile_width % MAX_MIB_SIZE == 0));
+  assert(IMPLIES(cpi->common.tile_rows > 1,
+                 cpi->common.tile_height % MAX_MIB_SIZE == 0));
+
+  // TODO(any): Possibly could improve this with a heuristic.
+  return BLOCK_128X128;
+#else
+  (void)cpi;
+  return BLOCK_64X64;
+#endif  //  CONFIG_EXT_PARTITION
+}
+
 static void setup_frame(VP10_COMP *cpi) {
   VP10_COMMON *const cm = &cpi->common;
   // Set up entropy context depending on frame type. The decoder mandates
@@ -269,6 +292,8 @@
     *cm->fc = cm->frame_contexts[cm->frame_context_idx];
     vp10_zero(cpi->interp_filter_selected[0]);
   }
+
+  set_sb_size(cm, select_sb_size(cpi));
 }
 
 static void vp10_enc_setup_mi(VP10_COMMON *cm) {
@@ -438,6 +463,9 @@
 
   vp10_free_pc_tree(&cpi->td);
 
+  if (cpi->sf.partition_search_type == VAR_BASED_PARTITION)
+    vp10_free_var_tree(&cpi->td);
+
   if (cpi->common.allow_screen_content_tools)
     vpx_free(cpi->td.mb.palette_buffer);
 
@@ -786,15 +814,31 @@
   vp10_rc_update_framerate(cpi);
 }
 
-static void set_tile_limits(VP10_COMP *cpi) {
+static void set_tile_info(VP10_COMP *cpi) {
   VP10_COMMON *const cm = &cpi->common;
+
 #if CONFIG_EXT_TILE
-  cm->tile_width  = clamp(cpi->oxcf.tile_columns, 1, 64) << MI_BLOCK_SIZE_LOG2;
-  cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64) << MI_BLOCK_SIZE_LOG2;
+#if CONFIG_EXT_PARTITION
+  if (cpi->oxcf.superblock_size != VPX_SUPERBLOCK_SIZE_64X64) {
+    cm->tile_width  = clamp(cpi->oxcf.tile_columns, 1, 32);
+    cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 32);
+    cm->tile_width  <<= MAX_MIB_SIZE_LOG2;
+    cm->tile_height <<= MAX_MIB_SIZE_LOG2;
+  } else
+#endif  // CONFIG_EXT_PARTITION
+  {
+    cm->tile_width  = clamp(cpi->oxcf.tile_columns, 1, 64);
+    cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64);
+    cm->tile_width  <<= MAX_MIB_SIZE_LOG2 - 1;
+    cm->tile_height <<= MAX_MIB_SIZE_LOG2 - 1;
+  }
 
   cm->tile_width  = VPXMIN(cm->tile_width, cm->mi_cols);
   cm->tile_height = VPXMIN(cm->tile_height, cm->mi_rows);
 
+  assert(cm->tile_width >> MAX_MIB_SIZE <= 32);
+  assert(cm->tile_height >> MAX_MIB_SIZE <= 32);
+
   // Get the number of tiles
   cm->tile_cols = 1;
   while (cm->tile_cols * cm->tile_width < cm->mi_cols)
@@ -814,11 +858,14 @@
   cm->tile_cols = 1 << cm->log2_tile_cols;
   cm->tile_rows = 1 << cm->log2_tile_rows;
 
-  cm->tile_width = (mi_cols_aligned_to_sb(cm->mi_cols) >> cm->log2_tile_cols);
-  cm->tile_height = (mi_cols_aligned_to_sb(cm->mi_rows) >> cm->log2_tile_rows);
-  // round to integer multiples of 8
-  cm->tile_width  = mi_cols_aligned_to_sb(cm->tile_width);
-  cm->tile_height = mi_cols_aligned_to_sb(cm->tile_height);
+  cm->tile_width = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+  cm->tile_width >>= cm->log2_tile_cols;
+  cm->tile_height = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
+  cm->tile_height >>= cm->log2_tile_rows;
+
+  // round to integer multiples of max superblock size
+  cm->tile_width  = ALIGN_POWER_OF_TWO(cm->tile_width, MAX_MIB_SIZE_LOG2);
+  cm->tile_height = ALIGN_POWER_OF_TWO(cm->tile_height, MAX_MIB_SIZE_LOG2);
 #endif  // CONFIG_EXT_TILE
 }
 
@@ -832,7 +879,7 @@
   memset(cpi->mbmi_ext_base, 0,
          cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
 
-  set_tile_limits(cpi);
+  set_tile_info(cpi);
 }
 
 static void init_buffer_indices(VP10_COMP *cpi) {
@@ -1955,6 +2002,8 @@
       CHECK_MEM_ERROR(cm, x->palette_buffer,
                       vpx_memalign(16, sizeof(*x->palette_buffer)));
     }
+    // Reallocate the pc_tree, as its contents depend on
+    // the state of cm->allow_screen_content_tools
     vp10_free_pc_tree(&cpi->td);
     vp10_setup_pc_tree(&cpi->common, &cpi->td);
   }
@@ -2015,7 +2064,7 @@
   cpi->last_frame_distortion = 0;
 #endif
 
-  set_tile_limits(cpi);
+  set_tile_info(cpi);
 
   cpi->ext_refresh_frame_flags_pending = 0;
   cpi->ext_refresh_frame_context_pending = 0;
@@ -2542,6 +2591,8 @@
         vpx_free(thread_data->td->mb.palette_buffer);
       vpx_free(thread_data->td->counts);
       vp10_free_pc_tree(thread_data->td);
+      if (cpi->sf.partition_search_type == VAR_BASED_PARTITION)
+        vp10_free_var_tree(thread_data->td);
       vpx_free(thread_data->td);
     }
   }
@@ -3362,13 +3413,9 @@
   model_count[EOB_MODEL_TOKEN] = full_count[EOB_TOKEN];
 }
 
-#if CONFIG_ENTROPY
-void full_to_model_counts(vp10_coeff_count_model *model_count,
-                                 vp10_coeff_count *full_count) {
-#else
-static void full_to_model_counts(vp10_coeff_count_model *model_count,
-                                 vp10_coeff_count *full_count) {
-#endif  // CONFIG_ENTROPY
+
+void vp10_full_to_model_counts(vp10_coeff_count_model *model_count,
+                               vp10_coeff_count *full_count) {
   int i, j, k, l;
 
   for (i = 0; i < PLANE_TYPES; ++i)
@@ -3699,8 +3746,7 @@
   setup_frame(cpi);
 
 #if CONFIG_ENTROPY
-  cm->do_subframe_update =
-      cm->log2_tile_cols == 0 && cm->log2_tile_rows == 0;
+  cm->do_subframe_update = cm->tile_cols == 1 && cm->tile_rows == 1;
   vp10_copy(cm->starting_coef_probs, cm->fc->coef_probs);
   vp10_copy(cpi->subframe_stats.enc_starting_coef_probs,
             cm->fc->coef_probs);
@@ -3827,8 +3873,7 @@
 #endif  // CONFIG_ENTROPY
 
 #if CONFIG_ENTROPY
-    cm->do_subframe_update =
-        cm->log2_tile_cols == 0 && cm->log2_tile_rows == 0;
+    cm->do_subframe_update = cm->tile_cols == 1 && cm->tile_rows == 1;
     if (loop_count == 0 || frame_is_intra_only(cm) ||
         cm->error_resilient_mode) {
       vp10_copy(cm->starting_coef_probs, cm->fc->coef_probs);
@@ -4361,8 +4406,8 @@
   vp10_update_reference_frames(cpi);
 
   for (t = TX_4X4; t <= TX_32X32; t++)
-    full_to_model_counts(cpi->td.counts->coef[t],
-                         cpi->td.rd_counts.coef_counts[t]);
+    vp10_full_to_model_counts(cpi->td.counts->coef[t],
+                              cpi->td.rd_counts.coef_counts[t]);
 
   if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
 #if CONFIG_ENTROPY
@@ -5092,10 +5137,16 @@
 void vp10_apply_encoding_flags(VP10_COMP *cpi, vpx_enc_frame_flags_t flags) {
   if (flags & (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF |
                VP8_EFLAG_NO_REF_ARF)) {
-    int ref = 7;
+    int ref = VP9_REFFRAME_ALL;
 
-    if (flags & VP8_EFLAG_NO_REF_LAST)
+    if (flags & VP8_EFLAG_NO_REF_LAST) {
       ref ^= VP9_LAST_FLAG;
+#if CONFIG_EXT_REFS
+      ref ^= VP9_LAST2_FLAG;
+      ref ^= VP9_LAST3_FLAG;
+      ref ^= VP9_LAST4_FLAG;
+#endif  // CONFIG_EXT_REFS
+    }
 
     if (flags & VP8_EFLAG_NO_REF_GF)
       ref ^= VP9_GOLD_FLAG;
@@ -5109,10 +5160,16 @@
   if (flags & (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF |
                VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_FORCE_GF |
                VP8_EFLAG_FORCE_ARF)) {
-    int upd = 7;
+    int upd = VP9_REFFRAME_ALL;
 
-    if (flags & VP8_EFLAG_NO_UPD_LAST)
+    if (flags & VP8_EFLAG_NO_UPD_LAST) {
       upd ^= VP9_LAST_FLAG;
+#if CONFIG_EXT_REFS
+      upd ^= VP9_LAST2_FLAG;
+      upd ^= VP9_LAST3_FLAG;
+      upd ^= VP9_LAST4_FLAG;
+#endif  // CONFIG_EXT_REFS
+    }
 
     if (flags & VP8_EFLAG_NO_UPD_GF)
       upd ^= VP9_GOLD_FLAG;
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index 9e1b6fb..0f0d1f3 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -34,6 +34,7 @@
 #include "vp10/encoder/rd.h"
 #include "vp10/encoder/speed_features.h"
 #include "vp10/encoder/tokenize.h"
+#include "vp10/encoder/variance_tree.h"
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
 #include "vp10/encoder/denoiser.h"
@@ -235,6 +236,10 @@
   int color_range;
   int render_width;
   int render_height;
+
+#if CONFIG_EXT_PARTITION
+  vpx_superblock_size_t superblock_size;
+#endif  // CONFIG_EXT_PARTITION
 } VP10EncoderConfig;
 
 static INLINE int is_lossless_requested(const VP10EncoderConfig *cfg) {
@@ -262,7 +267,10 @@
 
   PICK_MODE_CONTEXT *leaf_tree;
   PC_TREE *pc_tree;
-  PC_TREE *pc_root;
+  PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
+
+  VAR_TREE *var_tree;
+  VAR_TREE *var_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
 } ThreadData;
 
 struct EncWorkerData;
@@ -418,7 +426,7 @@
   // clips, and 300 for < HD clips.
   int encode_breakout;
 
-  unsigned char *segmentation_map;
+  uint8_t *segmentation_map;
 
   // segment threashold for encode breakout
   int  segment_encode_breakout[MAX_SEGMENTS];
@@ -507,6 +515,7 @@
 #if CONFIG_EXT_INTER
   unsigned int inter_compound_mode_cost[INTER_MODE_CONTEXTS]
                                        [INTER_COMPOUND_MODES];
+  unsigned int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
 #endif  // CONFIG_EXT_INTER
 #if CONFIG_OBMC
   int obmc_cost[BLOCK_SIZES][2];
@@ -563,9 +572,12 @@
   int resize_count;
 
   // VAR_BASED_PARTITION thresholds
-  // 0 - threshold_64x64; 1 - threshold_32x32;
-  // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
-  int64_t vbp_thresholds[4];
+  // 0 - threshold_128x128;
+  // 1 - threshold_64x64;
+  // 2 - threshold_32x32;
+  // 3 - threshold_16x16;
+  // 4 - threshold_8x8;
+  int64_t vbp_thresholds[5];
   int64_t vbp_threshold_minmax;
   int64_t vbp_threshold_sad;
   BLOCK_SIZE vbp_bsize_min;
@@ -625,10 +637,8 @@
 
 int vp10_get_quantizer(struct VP10_COMP *cpi);
 
-#if CONFIG_ENTROPY
-void full_to_model_counts(vp10_coeff_count_model *model_count,
-                          vp10_coeff_count *full_count);
-#endif  // CONFIG_ENTROPY
+void vp10_full_to_model_counts(vp10_coeff_count_model *model_count,
+                               vp10_coeff_count *full_count);
 
 static INLINE int frame_is_kf_gf_arf(const VP10_COMP *cpi) {
   return frame_is_intra_only(&cpi->common) ||
diff --git a/vp10/encoder/ethread.c b/vp10/encoder/ethread.c
index 2742ed2..e552ec5 100644
--- a/vp10/encoder/ethread.c
+++ b/vp10/encoder/ethread.c
@@ -93,6 +93,10 @@
         thread_data->td->pc_tree = NULL;
         vp10_setup_pc_tree(cm, thread_data->td);
 
+        // Set up variance tree if needed.
+        if (cpi->sf.partition_search_type == VAR_BASED_PARTITION)
+          vp10_setup_var_tree(cm, &cpi->td);
+
         // Allocate frame counters in thread data.
         CHECK_MEM_ERROR(cm, thread_data->td->counts,
                         vpx_calloc(1, sizeof(*thread_data->td->counts)));
diff --git a/vp10/encoder/firstpass.c b/vp10/encoder/firstpass.c
index dd3e437..5936a24 100644
--- a/vp10/encoder/firstpass.c
+++ b/vp10/encoder/firstpass.c
@@ -491,7 +491,8 @@
   TileInfo tile;
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = xd->plane;
-  const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
+  const PICK_MODE_CONTEXT *ctx =
+      &cpi->td.pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2]->none;
   int i;
 
   int recon_yoffset, recon_uvoffset;
diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c
index 4327d97..9423ed2 100644
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c
@@ -24,6 +24,7 @@
 
 #include "vp10/encoder/encoder.h"
 #include "vp10/encoder/mcomp.h"
+#include "vp10/encoder/rdopt.h"
 
 // #define NEW_DIAMOND_SEARCH
 
@@ -2655,6 +2656,29 @@
     v = INT_MAX;                                                       \
   }
 
+#undef CHECK_BETTER0
+#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
+
+#undef CHECK_BETTER1
+#define CHECK_BETTER1(v, r, c) \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    thismse = upsampled_masked_pref_error(xd,                          \
+                                          mask, mask_stride,           \
+                                          vfp, z, src_stride,          \
+                                          upre(y, y_stride, r, c),     \
+                                          y_stride,                    \
+                                          w, h, &sse);    \
+    if ((v = MVC(r, c) + thismse) < besterr) {                         \
+      besterr = v;                                                     \
+      br = r;                                                          \
+      bc = c;                                                          \
+      *distortion = thismse;                                           \
+      *sse1 = sse;                                                     \
+    }                                                                  \
+  } else {                                                             \
+    v = INT_MAX;                                                       \
+  }
+
 int vp10_find_best_masked_sub_pixel_tree(const MACROBLOCK *x,
                                          const uint8_t *mask, int mask_stride,
                                          MV *bestmv, const MV *ref_mv,
@@ -2671,8 +2695,8 @@
   const MACROBLOCKD *xd = &x->e_mbd;
   unsigned int besterr = INT_MAX;
   unsigned int sse;
-  unsigned int whichdir;
   int thismse;
+  unsigned int whichdir;
   unsigned int halfiters = iters_per_step;
   unsigned int quarteriters = iters_per_step;
   unsigned int eighthiters = iters_per_step;
@@ -2747,6 +2771,276 @@
   return besterr;
 }
 
+static unsigned int setup_masked_center_error(const uint8_t *mask,
+                                              int mask_stride,
+                                              const MV *bestmv,
+                                              const MV *ref_mv,
+                                              int error_per_bit,
+                                              const vp10_variance_fn_ptr_t *vfp,
+                                              const uint8_t *const src,
+                                              const int src_stride,
+                                              const uint8_t *const y,
+                                              int y_stride,
+                                              int offset,
+                                              int *mvjcost, int *mvcost[2],
+                                              unsigned int *sse1,
+                                              int *distortion) {
+  unsigned int besterr;
+  besterr = vfp->mvf(y + offset, y_stride, src, src_stride,
+                     mask, mask_stride, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+  return besterr;
+}
+
+static int upsampled_masked_pref_error(const MACROBLOCKD *xd,
+                                       const uint8_t *mask,
+                                       int mask_stride,
+                                       const vp10_variance_fn_ptr_t *vfp,
+                                       const uint8_t *const src,
+                                       const int src_stride,
+                                       const uint8_t *const y, int y_stride,
+                                       int w, int h, unsigned int *sse) {
+  unsigned int besterr;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+    vpx_highbd_upsampled_pred(pred16, w, h, y, y_stride);
+
+    besterr = vfp->mvf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride,
+                       mask, mask_stride, sse);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+#else
+    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+    (void) xd;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    vpx_upsampled_pred(pred, w, h, y, y_stride);
+
+    besterr = vfp->mvf(pred, w, src, src_stride,
+                       mask, mask_stride, sse);
+#if CONFIG_VP9_HIGHBITDEPTH
+  }
+#endif
+  return besterr;
+}
+
+static unsigned int upsampled_setup_masked_center_error(
+    const MACROBLOCKD *xd,
+    const uint8_t *mask, int mask_stride,
+    const MV *bestmv, const MV *ref_mv,
+    int error_per_bit, const vp10_variance_fn_ptr_t *vfp,
+    const uint8_t *const src, const int src_stride,
+    const uint8_t *const y, int y_stride,
+    int w, int h, int offset, int *mvjcost, int *mvcost[2],
+    unsigned int *sse1, int *distortion) {
+  unsigned int besterr = upsampled_masked_pref_error(
+      xd, mask, mask_stride, vfp, src, src_stride,
+      y + offset, y_stride, w, h, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+  return besterr;
+}
+
+int vp10_find_best_masked_sub_pixel_tree_up(VP10_COMP *cpi,
+                                            MACROBLOCK *x,
+                                            const uint8_t *mask,
+                                            int mask_stride,
+                                            int mi_row, int mi_col,
+                                            MV *bestmv, const MV *ref_mv,
+                                            int allow_hp,
+                                            int error_per_bit,
+                                            const vp10_variance_fn_ptr_t *vfp,
+                                            int forced_stop,
+                                            int iters_per_step,
+                                            int *mvjcost, int *mvcost[2],
+                                            int *distortion,
+                                            unsigned int *sse1,
+                                            int is_second,
+                                            int use_upsampled_ref) {
+  const uint8_t *const z = x->plane[0].src.buf;
+  const uint8_t *const src_address = z;
+  const int src_stride = x->plane[0].src.stride;
+  MACROBLOCKD *xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  unsigned int besterr = INT_MAX;
+  unsigned int sse;
+  unsigned int thismse;
+
+  int rr = ref_mv->row;
+  int rc = ref_mv->col;
+  int br = bestmv->row * 8;
+  int bc = bestmv->col * 8;
+  int hstep = 4;
+  int iter;
+  int round = 3 - forced_stop;
+  const int minc = VPXMAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+  const int maxc = VPXMIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+  const int minr = VPXMAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+  const int maxr = VPXMIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
+  int tr = br;
+  int tc = bc;
+  const MV *search_step = search_step_table;
+  int idx, best_idx = -1;
+  unsigned int cost_array[5];
+  int kr, kc;
+  const int w = 4 * num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  const int h = 4 * num_4x4_blocks_high_lookup[mbmi->sb_type];
+  int offset;
+  int y_stride;
+  const uint8_t *y;
+
+  const struct buf_2d backup_pred = pd->pre[is_second];
+  if (use_upsampled_ref) {
+    int ref = xd->mi[0]->mbmi.ref_frame[is_second];
+    const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
+    setup_pred_plane(&pd->pre[is_second], upsampled_ref->y_buffer,
+                     upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
+                     NULL, pd->subsampling_x, pd->subsampling_y);
+  }
+  y = pd->pre[is_second].buf;
+  y_stride = pd->pre[is_second].stride;
+  offset = bestmv->row * y_stride + bestmv->col;
+
+  if (!(allow_hp && vp10_use_mv_hp(ref_mv)))
+    if (round == 3)
+      round = 2;
+
+  bestmv->row *= 8;
+  bestmv->col *= 8;
+
+  // use_upsampled_ref can be 0 or 1
+  if (use_upsampled_ref)
+    besterr = upsampled_setup_masked_center_error(
+        xd, mask, mask_stride, bestmv, ref_mv, error_per_bit,
+        vfp, z, src_stride, y, y_stride,
+        w, h, (offset << 3),
+        mvjcost, mvcost, sse1, distortion);
+  else
+    besterr = setup_masked_center_error(
+        mask, mask_stride, bestmv, ref_mv, error_per_bit,
+        vfp, z, src_stride, y, y_stride,
+        offset, mvjcost, mvcost, sse1, distortion);
+
+  for (iter = 0; iter < round; ++iter) {
+    // Check vertical and horizontal sub-pixel positions.
+    for (idx = 0; idx < 4; ++idx) {
+      tr = br + search_step[idx].row;
+      tc = bc + search_step[idx].col;
+      if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+        MV this_mv = {tr, tc};
+
+        if (use_upsampled_ref) {
+          const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+          thismse = upsampled_masked_pref_error(xd,
+                                                mask, mask_stride,
+                                                vfp, src_address, src_stride,
+                                                pre_address, y_stride,
+                                                w, h, &sse);
+        } else {
+          const uint8_t *const pre_address = y + (tr >> 3) * y_stride +
+              (tc >> 3);
+          thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr),
+                              src_address, src_stride,
+                              mask, mask_stride, &sse);
+        }
+
+        cost_array[idx] = thismse +
+            mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+        if (cost_array[idx] < besterr) {
+          best_idx = idx;
+          besterr = cost_array[idx];
+          *distortion = thismse;
+          *sse1 = sse;
+        }
+      } else {
+        cost_array[idx] = INT_MAX;
+      }
+    }
+
+    // Check diagonal sub-pixel position
+    kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+    kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+    tc = bc + kc;
+    tr = br + kr;
+    if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+      MV this_mv = {tr, tc};
+
+      if (use_upsampled_ref) {
+        const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+        thismse = upsampled_masked_pref_error(xd,
+                                              mask, mask_stride,
+                                              vfp, src_address, src_stride,
+                                              pre_address, y_stride,
+                                              w, h, &sse);
+      } else {
+        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+
+        thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr),
+                            src_address, src_stride, mask, mask_stride, &sse);
+      }
+
+      cost_array[4] = thismse +
+          mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+      if (cost_array[4] < besterr) {
+        best_idx = 4;
+        besterr = cost_array[4];
+        *distortion = thismse;
+        *sse1 = sse;
+      }
+    } else {
+      cost_array[idx] = INT_MAX;
+    }
+
+    if (best_idx < 4 && best_idx >= 0) {
+      br += search_step[best_idx].row;
+      bc += search_step[best_idx].col;
+    } else if (best_idx == 4) {
+      br = tr;
+      bc = tc;
+    }
+
+    if (iters_per_step > 1 && best_idx != -1) {
+      if (use_upsampled_ref) {
+        SECOND_LEVEL_CHECKS_BEST(1);
+      } else {
+        SECOND_LEVEL_CHECKS_BEST(0);
+      }
+    }
+
+    tr = br;
+    tc = bc;
+
+    search_step += 4;
+    hstep >>= 1;
+    best_idx = -1;
+  }
+
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if (use_upsampled_ref) {
+    pd->pre[is_second] = backup_pred;
+  }
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
 #undef DIST
 #undef MVC
 #undef CHECK_BETTER
diff --git a/vp10/encoder/mcomp.h b/vp10/encoder/mcomp.h
index f99cd8b..c12e7af 100644
--- a/vp10/encoder/mcomp.h
+++ b/vp10/encoder/mcomp.h
@@ -169,7 +169,24 @@
                                          int iters_per_step,
                                          int *mvjcost, int *mvcost[2],
                                          int *distortion,
-                                         unsigned int *sse1, int is_second);
+                                         unsigned int *sse1,
+                                         int is_second);
+int vp10_find_best_masked_sub_pixel_tree_up(struct VP10_COMP *cpi,
+                                            MACROBLOCK *x,
+                                            const uint8_t *mask,
+                                            int mask_stride,
+                                            int mi_row, int mi_col,
+                                            MV *bestmv, const MV *ref_mv,
+                                            int allow_hp,
+                                            int error_per_bit,
+                                            const vp10_variance_fn_ptr_t *vfp,
+                                            int forced_stop,
+                                            int iters_per_step,
+                                            int *mvjcost, int *mvcost[2],
+                                            int *distortion,
+                                            unsigned int *sse1,
+                                            int is_second,
+                                            int use_upsampled_ref);
 int vp10_masked_full_pixel_diamond(const struct VP10_COMP *cpi, MACROBLOCK *x,
                                    const uint8_t *mask, int mask_stride,
                                    MV *mvp_full, int step_param,
diff --git a/vp10/encoder/palette.c b/vp10/encoder/palette.c
index d413935..cbc3582 100644
--- a/vp10/encoder/palette.c
+++ b/vp10/encoder/palette.c
@@ -11,20 +11,21 @@
 #include <math.h>
 #include "vp10/encoder/palette.h"
 
-static double calc_dist(const double *p1, const double *p2, int dim) {
-  double dist = 0;
+static float calc_dist(const float *p1, const float *p2, int dim) {
+  float dist = 0;
   int i = 0;
 
   for (i = 0; i < dim; ++i) {
-    dist = dist + (p1[i] - round(p2[i])) * (p1[i] - round(p2[i]));
+    float diff = p1[i] - roundf(p2[i]);
+    dist += diff * diff;
   }
   return dist;
 }
 
-void vp10_calc_indices(const double *data, const double *centroids,
+void vp10_calc_indices(const float *data, const float *centroids,
                        uint8_t *indices, int n, int k, int dim) {
   int i, j;
-  double min_dist, this_dist;
+  float min_dist, this_dist;
 
   for (i = 0; i < n; ++i) {
     min_dist = calc_dist(data + i * dim, centroids, dim);
@@ -45,7 +46,7 @@
   return *state / 65536 % 32768;
 }
 
-static void calc_centroids(const double *data, double *centroids,
+static void calc_centroids(const float *data, float *centroids,
                            const uint8_t *indices, int n, int k, int dim) {
   int i, j, index;
   int count[PALETTE_MAX_SIZE];
@@ -70,16 +71,16 @@
       memcpy(centroids + i * dim, data + (lcg_rand16(&rand_state) % n) * dim,
                  sizeof(centroids[0]) * dim);
     } else {
-      const double norm = 1.0 / count[i];
+      const float norm = 1.0f / count[i];
       for (j = 0; j < dim; ++j)
         centroids[i * dim + j] *= norm;
     }
   }
 }
 
-static double calc_total_dist(const double *data, const double *centroids,
+static float calc_total_dist(const float *data, const float *centroids,
                               const uint8_t *indices, int n, int k, int dim) {
-  double dist = 0;
+  float dist = 0;
   int i;
   (void) k;
 
@@ -89,11 +90,11 @@
   return dist;
 }
 
-int vp10_k_means(const double *data, double *centroids, uint8_t *indices,
+int vp10_k_means(const float *data, float *centroids, uint8_t *indices,
                  uint8_t *pre_indices, int n, int k, int dim, int max_itr) {
   int i = 0;
-  double pre_dist, this_dist;
-  double pre_centroids[2 * PALETTE_MAX_SIZE];
+  float pre_dist, this_dist;
+  float pre_centroids[2 * PALETTE_MAX_SIZE];
 
   vp10_calc_indices(data, centroids, indices, n, k, dim);
   pre_dist = calc_total_dist(data, centroids, indices, n, k, dim);
@@ -121,9 +122,9 @@
   return i;
 }
 
-void vp10_insertion_sort(double *data, int n) {
+void vp10_insertion_sort(float *data, int n) {
   int i, j, k;
-  double val;
+  float val;
 
   if (n <= 1)
     return;
diff --git a/vp10/encoder/palette.h b/vp10/encoder/palette.h
index 124cf74..40d9ef9 100644
--- a/vp10/encoder/palette.h
+++ b/vp10/encoder/palette.h
@@ -17,10 +17,10 @@
 extern "C" {
 #endif
 
-void vp10_insertion_sort(double *data, int n);
-void vp10_calc_indices(const double *data, const double *centroids,
+void vp10_insertion_sort(float *data, int n);
+void vp10_calc_indices(const float *data, const float *centroids,
                        uint8_t *indices, int n, int k, int dim);
-int vp10_k_means(const double *data, double *centroids, uint8_t *indices,
+int vp10_k_means(const float *data, float *centroids, uint8_t *indices,
                  uint8_t *pre_indices, int n, int k, int dim, int max_itr);
 int vp10_count_colors(const uint8_t *src, int stride, int rows, int cols);
 #if CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp10/encoder/rd.c b/vp10/encoder/rd.c
index 203ac42..dc34f1f 100644
--- a/vp10/encoder/rd.c
+++ b/vp10/encoder/rd.c
@@ -152,13 +152,8 @@
 #endif  // CONFIG_EXT_INTRA
 }
 
-#if CONFIG_ENTROPY
-void fill_token_costs(vp10_coeff_cost *c,
-                      vp10_coeff_probs_model (*p)[PLANE_TYPES]) {
-#else
-static void fill_token_costs(vp10_coeff_cost *c,
-                             vp10_coeff_probs_model (*p)[PLANE_TYPES]) {
-#endif  // CONFIG_ENTROPY
+void vp10_fill_token_costs(vp10_coeff_cost *c,
+                           vp10_coeff_probs_model (*p)[PLANE_TYPES]) {
   int i, j, k, l;
   TX_SIZE t;
   for (t = TX_4X4; t <= TX_32X32; ++t)
@@ -397,7 +392,7 @@
 #endif
   }
   if (cpi->oxcf.pass != 1) {
-    fill_token_costs(x->token_costs, cm->fc->coef_probs);
+    vp10_fill_token_costs(x->token_costs, cm->fc->coef_probs);
 
     if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
         cm->frame_type == KEY_FRAME) {
@@ -445,12 +440,16 @@
       for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
         vp10_cost_tokens((int *)cpi->inter_mode_cost[i],
                          cm->fc->inter_mode_probs[i], vp10_inter_mode_tree);
-#endif
+#endif  // CONFIG_REF_MV
 #if CONFIG_EXT_INTER
       for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
         vp10_cost_tokens((int *)cpi->inter_compound_mode_cost[i],
                          cm->fc->inter_compound_mode_probs[i],
                          vp10_inter_compound_mode_tree);
+      for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+        vp10_cost_tokens((int *)cpi->interintra_mode_cost[i],
+                         cm->fc->interintra_mode_prob[i],
+                         vp10_interintra_mode_tree);
 #endif  // CONFIG_EXT_INTER
 #if CONFIG_OBMC
       for (i = BLOCK_8X8; i < BLOCK_SIZES; i++) {
@@ -563,8 +562,8 @@
 
 void vp10_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
                               const struct macroblockd_plane *pd,
-                              ENTROPY_CONTEXT t_above[2 * MI_BLOCK_SIZE],
-                              ENTROPY_CONTEXT t_left[2 * MI_BLOCK_SIZE]) {
+                              ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
+                              ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) {
   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
   const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
   const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
@@ -931,14 +930,15 @@
   memcpy(rd->thresh_mult_sub8x8, thresh_mult[idx], sizeof(thresh_mult[idx]));
 }
 
-void vp10_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
-                               int bsize, int best_mode_index) {
+void vp10_update_rd_thresh_fact(const VP10_COMMON *const cm,
+                                int (*factor_buf)[MAX_MODES], int rd_thresh,
+                                int bsize, int best_mode_index) {
   if (rd_thresh > 0) {
     const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
     int mode;
     for (mode = 0; mode < top_mode; ++mode) {
       const BLOCK_SIZE min_size = VPXMAX(bsize - 1, BLOCK_4X4);
-      const BLOCK_SIZE max_size = VPXMIN(bsize + 2, BLOCK_LARGEST);
+      const BLOCK_SIZE max_size = VPXMIN(bsize + 2, cm->sb_size);
       BLOCK_SIZE bs;
       for (bs = min_size; bs <= max_size; ++bs) {
         int *const fact = &factor_buf[bs][mode];
diff --git a/vp10/encoder/rd.h b/vp10/encoder/rd.h
index 533e775..7aad9eb 100644
--- a/vp10/encoder/rd.h
+++ b/vp10/encoder/rd.h
@@ -330,20 +330,19 @@
 
 void vp10_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
                               const struct macroblockd_plane *pd,
-                              ENTROPY_CONTEXT t_above[2 * MI_BLOCK_SIZE],
-                              ENTROPY_CONTEXT t_left[2 * MI_BLOCK_SIZE]);
+                              ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
+                              ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]);
 
 void vp10_set_rd_speed_thresholds(struct VP10_COMP *cpi);
 
 void vp10_set_rd_speed_thresholds_sub8x8(struct VP10_COMP *cpi);
 
-void vp10_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh,
-                               int bsize, int best_mode_index);
+void vp10_update_rd_thresh_fact(const VP10_COMMON *const cm,
+                                int (*fact)[MAX_MODES], int rd_thresh,
+                                int bsize, int best_mode_index);
 
-#if CONFIG_ENTROPY
-void fill_token_costs(vp10_coeff_cost *c,
-                      vp10_coeff_probs_model (*p)[PLANE_TYPES]);
-#endif  // CONFIG_ENTROPY
+void vp10_fill_token_costs(vp10_coeff_cost *c,
+                           vp10_coeff_probs_model (*p)[PLANE_TYPES]);
 
 static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
                                       int thresh_fact) {
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 656e9f7..b3f8336 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -87,8 +87,8 @@
 const double ext_tx_th = 0.99;
 #endif
 
-const double ADST_FLIP_SVM[8] = {-7.3283, -3.0450, -3.2450, 3.6403,  // vert
-                                 -9.4204, -3.1821, -4.6851, 4.1469};  // horz
+const double ADST_FLIP_SVM[8] = {-6.6623, -2.8062, -3.2531, 3.1671,  // vert
+                                 -7.7051, -3.2234, -3.6193, 3.4533};  // horz
 
 typedef struct {
   PREDICTION_MODE mode;
@@ -102,8 +102,8 @@
 struct rdcost_block_args {
   const VP10_COMP *cpi;
   MACROBLOCK *x;
-  ENTROPY_CONTEXT t_above[2 * MI_BLOCK_SIZE];
-  ENTROPY_CONTEXT t_left[2 * MI_BLOCK_SIZE];
+  ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE];
+  ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE];
   int this_rate;
   int64_t this_dist;
   int64_t this_sse;
@@ -355,14 +355,14 @@
 // constants for prune 1 and prune 2 decision boundaries
 #define FAST_EXT_TX_CORR_MID 0.0
 #define FAST_EXT_TX_EDST_MID 0.1
-#define FAST_EXT_TX_CORR_MARGIN 0.5
-#define FAST_EXT_TX_EDST_MARGIN 0.05
+#define FAST_EXT_TX_CORR_MARGIN 0.3
+#define FAST_EXT_TX_EDST_MARGIN 0.5
 
 typedef enum {
   DCT_1D = 0,
   ADST_1D = 1,
   FLIPADST_1D = 2,
-  DST_1D = 3,
+  IDTX_1D = 3,
   TX_TYPES_1D = 4,
 } TX_TYPE_1D;
 
@@ -568,18 +568,18 @@
   }
 }
 
-int dct_vs_dst(int16_t *diff, int stride, int w, int h,
-               double *hcorr, double *vcorr) {
+int dct_vs_idtx(int16_t *diff, int stride, int w, int h,
+                double *hcorr, double *vcorr) {
   int prune_bitmask = 0;
   get_horver_correlation(diff, stride, w, h, hcorr, vcorr);
 
   if (*vcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN)
-    prune_bitmask |= 1 << DST_1D;
+    prune_bitmask |= 1 << IDTX_1D;
   else if (*vcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN)
     prune_bitmask |= 1 << DCT_1D;
 
   if (*hcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN)
-    prune_bitmask |= 1 << (DST_1D + 8);
+    prune_bitmask |= 1 << (IDTX_1D + 8);
   else if (*hcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN)
     prune_bitmask |= 1 << (DCT_1D + 8);
   return prune_bitmask;
@@ -600,7 +600,7 @@
   vp10_subtract_plane(x, bsize, 0);
   return adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride, pd->dst.buf,
                           pd->dst.stride, hdist, vdist) |
-         dct_vs_dst(p->src_diff, bw, bw, bh, &hcorr, &vcorr);
+         dct_vs_idtx(p->src_diff, bw, bw, bh, &hcorr, &vcorr);
 }
 
 #endif  // CONFIG_EXT_TX
@@ -653,13 +653,13 @@
     FLIPADST_1D,
     ADST_1D,
     FLIPADST_1D,
-    DST_1D,
+    IDTX_1D,
     DCT_1D,
-    DST_1D,
+    IDTX_1D,
     ADST_1D,
-    DST_1D,
+    IDTX_1D,
     FLIPADST_1D,
-    DST_1D,
+    IDTX_1D,
   };
   static TX_TYPE_1D htx_tab[TX_TYPES] = {
     DCT_1D,
@@ -671,16 +671,14 @@
     FLIPADST_1D,
     FLIPADST_1D,
     ADST_1D,
+    IDTX_1D,
+    IDTX_1D,
     DCT_1D,
-    DST_1D,
+    IDTX_1D,
     ADST_1D,
-    DST_1D,
+    IDTX_1D,
     FLIPADST_1D,
-    DST_1D,
-    DST_1D,
   };
-  if (tx_type >= IDTX)
-    return 1;
   return !(((prune >> vtx_tab[tx_type]) & 1) |
          ((prune >> (htx_tab[tx_type] + 8)) & 1));
 #else
@@ -794,8 +792,8 @@
       dist_sum += dist;
     } else {
       vp10_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
-                                   pd->dequant[1] >> dequant_shift,
-                                   &rate, &dist);
+                                    pd->dequant[1] >> dequant_shift,
+                                    &rate, &dist);
       rate_sum += rate;
       dist_sum += dist;
     }
@@ -1440,7 +1438,9 @@
   last_rd = INT64_MAX;
   for (n = start_tx; n >= end_tx; --n) {
     if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, n))
-        continue;
+      continue;
+    if (max_tx_size == TX_32X32 && n == TX_4X4)
+      continue;
 #if CONFIG_EXT_TX
     ext_tx_set = get_ext_tx_set(n, bs, is_inter);
     if (is_inter) {
@@ -1769,12 +1769,12 @@
     const int max_itr = 50;
     int color_ctx, color_idx = 0;
     int color_order[PALETTE_MAX_SIZE];
-    double *const data = x->palette_buffer->kmeans_data_buf;
+    float *const data = x->palette_buffer->kmeans_data_buf;
     uint8_t *const indices = x->palette_buffer->kmeans_indices_buf;
     uint8_t *const pre_indices = x->palette_buffer->kmeans_pre_indices_buf;
-    double centroids[PALETTE_MAX_SIZE];
+    float centroids[PALETTE_MAX_SIZE];
     uint8_t *const color_map = xd->plane[0].color_index_map;
-    double lb, ub, val;
+    float lb, ub, val;
     MB_MODE_INFO *const mbmi = &mic->mbmi;
     PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -1818,6 +1818,9 @@
     mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 0;
 #endif  // CONFIG_EXT_INTRA
 
+    if (rows * cols > PALETTE_MAX_BLOCK_SIZE)
+      return 0;
+
     for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors;
         n >= 2; --n) {
       for (i = 0; i < n; ++i)
@@ -1826,7 +1829,7 @@
                    n, 1, max_itr);
       vp10_insertion_sort(centroids, n);
       for (i = 0; i < n; ++i)
-        centroids[i] = round(centroids[i]);
+        centroids[i] = roundf(centroids[i]);
       // remove duplicates
       i = 1;
       k = n;
@@ -1846,12 +1849,12 @@
 #if CONFIG_VP9_HIGHBITDEPTH
       if (cpi->common.use_highbitdepth)
         for (i = 0; i < k; ++i)
-          pmi->palette_colors[i] = clip_pixel_highbd((int)round(centroids[i]),
+          pmi->palette_colors[i] = clip_pixel_highbd((int)lroundf(centroids[i]),
                                                      cpi->common.bit_depth);
       else
 #endif  // CONFIG_VP9_HIGHBITDEPTH
         for (i = 0; i < k; ++i)
-          pmi->palette_colors[i] = clip_pixel((int)round(centroids[i]));
+          pmi->palette_colors[i] = clip_pixel((int)lroundf(centroids[i]));
       pmi->palette_size[0] = k;
 
       vp10_calc_indices(data, centroids, indices, rows * cols, k, 1);
@@ -2949,8 +2952,8 @@
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int tx_row = blk_row >> (1 - pd->subsampling_y);
   const int tx_col = blk_col >> (1 - pd->subsampling_x);
-  TX_SIZE (*const inter_tx_size)[MI_BLOCK_SIZE] =
-    (TX_SIZE (*)[MI_BLOCK_SIZE])&mbmi->inter_tx_size[tx_row][tx_col];
+  TX_SIZE (*const inter_tx_size)[MAX_MIB_SIZE] =
+    (TX_SIZE (*)[MAX_MIB_SIZE])&mbmi->inter_tx_size[tx_row][tx_col];
   int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
   int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
   int64_t this_rd = INT64_MAX;
@@ -3120,10 +3123,10 @@
     int idx, idy;
     int block = 0;
     int step = 1 << (max_txsize_lookup[plane_bsize] * 2);
-    ENTROPY_CONTEXT ctxa[2 * MI_BLOCK_SIZE];
-    ENTROPY_CONTEXT ctxl[2 * MI_BLOCK_SIZE];
-    TXFM_CONTEXT tx_above[MI_BLOCK_SIZE];
-    TXFM_CONTEXT tx_left[MI_BLOCK_SIZE];
+    ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
+    ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
+    TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+    TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
 
     int pnrate = 0, pnskip = 1;
     int64_t pndist = 0, pnsse = 0;
@@ -3235,9 +3238,9 @@
   int64_t best_rd = INT64_MAX;
   TX_TYPE tx_type, best_tx_type = DCT_DCT;
   const int is_inter = is_inter_block(mbmi);
-  TX_SIZE best_tx_size[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+  TX_SIZE best_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
   TX_SIZE best_tx = TX_SIZES;
-  uint8_t best_blk_skip[MI_BLOCK_SIZE * MI_BLOCK_SIZE * 4];
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
   const int n4 = 1 << (num_pels_log2_lookup[bsize] - 4);
   int idx, idy;
   int prune = 0;
@@ -3420,8 +3423,8 @@
     int step = 1 << (max_txsize_lookup[plane_bsize] * 2);
     int pnrate = 0, pnskip = 1;
     int64_t pndist = 0, pnsse = 0;
-    ENTROPY_CONTEXT ta[2 * MI_BLOCK_SIZE];
-    ENTROPY_CONTEXT tl[2 * MI_BLOCK_SIZE];
+    ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE];
+    ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE];
 
     vp10_get_entropy_contexts(bsize, TX_4X4, pd, ta, tl);
 
@@ -3547,6 +3550,9 @@
   const uint8_t *const src_u = x->plane[1].src.buf;
   const uint8_t *const src_v = x->plane[2].src.buf;
 
+  if (rows * cols > PALETTE_MAX_BLOCK_SIZE)
+    return;
+
 #if CONFIG_EXT_INTRA
   mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
 #endif  // CONFIG_EXT_INTRA
@@ -3572,12 +3578,12 @@
     int color_ctx, color_idx = 0;
     int color_order[PALETTE_MAX_SIZE];
     int64_t this_sse;
-    double lb_u, ub_u, val_u;
-    double lb_v, ub_v, val_v;
-    double *const data = x->palette_buffer->kmeans_data_buf;
+    float lb_u, ub_u, val_u;
+    float lb_v, ub_v, val_v;
+    float *const data = x->palette_buffer->kmeans_data_buf;
     uint8_t *const indices = x->palette_buffer->kmeans_indices_buf;
     uint8_t *const pre_indices = x->palette_buffer->kmeans_pre_indices_buf;
-    double centroids[2 * PALETTE_MAX_SIZE];
+    float centroids[2 * PALETTE_MAX_SIZE];
     uint8_t *const color_map = xd->plane[1].color_index_map;
     PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
 
@@ -3646,12 +3652,12 @@
 #if CONFIG_VP9_HIGHBITDEPTH
           if (cpi->common.use_highbitdepth)
             pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
-                clip_pixel_highbd(round(centroids[j * 2 + i - 1]),
+                clip_pixel_highbd(roundf(centroids[j * 2 + i - 1]),
                                   cpi->common.bit_depth);
           else
 #endif  // CONFIG_VP9_HIGHBITDEPTH
             pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
-                clip_pixel(round(centroids[j * 2 + i - 1]));
+                clip_pixel(roundf(centroids[j * 2 + i - 1]));
         }
       }
       for (r = 0; r < rows; ++r)
@@ -4952,44 +4958,54 @@
 #if !CONFIG_EXT_INTER
         if (filter_idx > 0) {
           BEST_SEG_INFO* ref_bsi = bsi_buf;
-          if (seg_mvs[i][mbmi->ref_frame[0]].as_int ==
-              ref_bsi->rdstat[i][mode_idx].mvs[0].as_int &&
-              ref_bsi->rdstat[i][mode_idx].mvs[0].as_int != INVALID_MV)
-            if (bsi->ref_mv[0]->as_int ==
-                ref_bsi->rdstat[i][mode_idx].pred_mv[0].as_int)
-              --run_mv_search;
+          SEG_RDSTAT *ref_rdstat = &ref_bsi->rdstat[i][mode_idx];
 
-          if (!has_second_rf) {
-            --run_mv_search;
-          } else {
-            if (seg_mvs[i][mbmi->ref_frame[1]].as_int ==
-                ref_bsi->rdstat[i][mode_idx].mvs[1].as_int &&
-                ref_bsi->rdstat[i][mode_idx].mvs[1].as_int != INVALID_MV)
-              if (bsi->ref_mv[1]->as_int ==
-                  ref_bsi->rdstat[i][mode_idx].pred_mv[1].as_int)
+          if (has_second_rf) {
+            if (seg_mvs[i][mbmi->ref_frame[0]].as_int ==
+                    ref_rdstat->mvs[0].as_int &&
+                ref_rdstat->mvs[0].as_int != INVALID_MV)
+              if (bsi->ref_mv[0]->as_int == ref_rdstat->pred_mv[0].as_int)
                 --run_mv_search;
+
+            if (seg_mvs[i][mbmi->ref_frame[1]].as_int ==
+                    ref_rdstat->mvs[1].as_int &&
+                ref_rdstat->mvs[1].as_int != INVALID_MV)
+              if (bsi->ref_mv[1]->as_int == ref_rdstat->pred_mv[1].as_int)
+                --run_mv_search;
+          } else {
+            if (bsi->ref_mv[0]->as_int == ref_rdstat->pred_mv[0].as_int &&
+                ref_rdstat->mvs[0].as_int != INVALID_MV) {
+              run_mv_search = 0;
+              seg_mvs[i][mbmi->ref_frame[0]].as_int =
+                  ref_rdstat->mvs[0].as_int;
+            }
           }
 
           if (run_mv_search != 0 && filter_idx > 1) {
             ref_bsi = bsi_buf + 1;
+            ref_rdstat = &ref_bsi->rdstat[i][mode_idx];
             run_mv_search = 2;
 
-            if (seg_mvs[i][mbmi->ref_frame[0]].as_int ==
-                ref_bsi->rdstat[i][mode_idx].mvs[0].as_int &&
-                ref_bsi->rdstat[i][mode_idx].mvs[0].as_int != INVALID_MV)
-              if (bsi->ref_mv[0]->as_int ==
-                  ref_bsi->rdstat[i][mode_idx].pred_mv[0].as_int)
-                --run_mv_search;
-
-            if (!has_second_rf) {
-              --run_mv_search;
-            } else {
-              if (seg_mvs[i][mbmi->ref_frame[1]].as_int ==
-                  ref_bsi->rdstat[i][mode_idx].mvs[1].as_int &&
-                  ref_bsi->rdstat[i][mode_idx].mvs[1].as_int != INVALID_MV)
-                if (bsi->ref_mv[1]->as_int ==
-                    ref_bsi->rdstat[i][mode_idx].pred_mv[1].as_int)
+            if (has_second_rf) {
+              if (seg_mvs[i][mbmi->ref_frame[0]].as_int ==
+                      ref_rdstat->mvs[0].as_int &&
+                  ref_rdstat->mvs[0].as_int != INVALID_MV)
+                if (bsi->ref_mv[0]->as_int == ref_rdstat->pred_mv[0].as_int)
                   --run_mv_search;
+
+              if (seg_mvs[i][mbmi->ref_frame[1]].as_int ==
+                      ref_rdstat->mvs[1].as_int &&
+                  ref_rdstat->mvs[1].as_int != INVALID_MV)
+                if (bsi->ref_mv[1]->as_int == ref_rdstat->pred_mv[1].as_int)
+                  --run_mv_search;
+            } else {
+              if (bsi->ref_mv[0]->as_int ==
+                      ref_rdstat->pred_mv[0].as_int &&
+                  ref_rdstat->mvs[0].as_int != INVALID_MV) {
+                run_mv_search = 0;
+                seg_mvs[i][mbmi->ref_frame[0]].as_int =
+                    ref_rdstat->mvs[0].as_int;
+              }
             }
           }
         }
@@ -5069,8 +5085,8 @@
           }
 
 #if CONFIG_REF_MV
-          mvp_full.row = best_ref_mv->as_mv.row >> 3;
-          mvp_full.col = best_ref_mv->as_mv.col >> 3;
+          mvp_full.row = bsi->ref_mv[0]->as_mv.row >> 3;
+          mvp_full.col = bsi->ref_mv[0]->as_mv.col >> 3;
 #else
           mvp_full.row = bsi->mvp.as_mv.row >> 3;
           mvp_full.col = bsi->mvp.as_mv.col >> 3;
@@ -5731,10 +5747,9 @@
     step_param = cpi->mv_step_param;
   }
 
-  if (cpi->sf.adaptive_motion_search && bsize < BLOCK_LARGEST) {
-    int boffset =
-        2 * (b_width_log2_lookup[BLOCK_LARGEST] -
-             VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
+  if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size) {
+    int boffset =  2 * (b_width_log2_lookup[cm->sb_size] -
+         VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
     step_param = VPXMAX(step_param, boffset);
   }
 
@@ -5905,9 +5920,9 @@
   }
 
   // TODO(debargha): is show_frame needed here?
-  if (cpi->sf.adaptive_motion_search && bsize < BLOCK_LARGEST &&
+  if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size &&
       cm->show_frame) {
-    int boffset = 2 * (b_width_log2_lookup[BLOCK_LARGEST] -
+    int boffset = 2 * (b_width_log2_lookup[cm->sb_size] -
           VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
     step_param = VPXMAX(step_param, boffset);
   }
@@ -5958,15 +5973,18 @@
 
   if (bestsme < INT_MAX) {
     int dis;  /* TODO: use dis in distortion calculation later. */
-    vp10_find_best_masked_sub_pixel_tree(x, mask, mask_stride,
-                                         &tmp_mv->as_mv, &ref_mv,
-                                         cm->allow_high_precision_mv,
-                                         x->errorperbit,
-                                         &cpi->fn_ptr[bsize],
-                                         cpi->sf.mv.subpel_force_stop,
-                                         cpi->sf.mv.subpel_iters_per_step,
-                                         x->nmvjointcost, x->mvcost,
-                                         &dis, &x->pred_sse[ref], ref_idx);
+    vp10_find_best_masked_sub_pixel_tree_up(cpi, x, mask, mask_stride,
+                                            mi_row, mi_col,
+                                            &tmp_mv->as_mv, &ref_mv,
+                                            cm->allow_high_precision_mv,
+                                            x->errorperbit,
+                                            &cpi->fn_ptr[bsize],
+                                            cpi->sf.mv.subpel_force_stop,
+                                            cpi->sf.mv.subpel_iters_per_step,
+                                            x->nmvjointcost, x->mvcost,
+                                            &dis, &x->pred_sse[ref],
+                                            ref_idx,
+                                            cpi->sf.use_upsampled_references);
   }
   *rate_mv = vp10_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
                               x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
@@ -6197,8 +6215,8 @@
 #if CONFIG_EXT_INTER
   int mv_idx = (this_mode == NEWFROMNEARMV) ? 1 : 0;
   int_mv single_newmv[MAX_REF_FRAMES];
-  const int * const intra_mode_cost =
-    cpi->mbmode_cost[size_group_lookup[bsize]];
+  const unsigned int *const interintra_mode_cost =
+    cpi->interintra_mode_cost[size_group_lookup[bsize]];
   const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME);
 #if CONFIG_REF_MV
   uint8_t ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
@@ -6223,7 +6241,7 @@
   int best_rate_y, best_rate_uv;
 #endif  // CONFIG_SUPERTX
 #if CONFIG_VAR_TX
-  uint8_t best_blk_skip[MAX_MB_PLANE][MI_BLOCK_SIZE * MI_BLOCK_SIZE * 4];
+  uint8_t best_blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
 #endif  // CONFIG_VAR_TX
   int64_t best_distortion = INT64_MAX;
   unsigned int best_pred_var = UINT_MAX;
@@ -6623,7 +6641,7 @@
   rs = cm->interp_filter == SWITCHABLE ? vp10_get_switchable_rate(cpi, xd) : 0;
 
 #if CONFIG_EXT_INTER
-  if (is_comp_pred && get_wedge_bits(bsize)) {
+  if (is_comp_pred && is_interinter_wedge_used(bsize)) {
     int wedge_index, best_wedge_index = WEDGE_NONE, rs;
     int rate_sum;
     int64_t dist_sum;
@@ -6773,7 +6791,7 @@
   }
 
   if (is_comp_interintra_pred) {
-    PREDICTION_MODE interintra_mode, best_interintra_mode = DC_PRED;
+    INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
     int64_t best_interintra_rd = INT64_MAX;
     int rmode, rate_sum;
     int64_t dist_sum;
@@ -6786,6 +6804,16 @@
         bh = 4 << b_height_log2_lookup[mbmi->sb_type];
     int_mv tmp_mv;
     int tmp_rate_mv = 0;
+    DECLARE_ALIGNED(16, uint8_t,
+                    intrapred_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
+    uint8_t *intrapred;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      intrapred = CONVERT_TO_BYTEPTR(intrapred_);
+    else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      intrapred = intrapred_;
+
     mbmi->ref_frame[1] = NONE;
     for (j = 0; j < MAX_MB_PLANE; j++) {
       xd->plane[j].dst.buf = tmp_buf + j * MAX_SB_SQUARE;
@@ -6795,44 +6823,56 @@
     restore_dst_buf(xd, orig_dst, orig_dst_stride);
     mbmi->ref_frame[1] = INTRA_FRAME;
 
-    for (interintra_mode = DC_PRED; interintra_mode <= TM_PRED;
-         ++interintra_mode) {
-      mbmi->interintra_mode = interintra_mode;
-      mbmi->interintra_uv_mode = interintra_mode;
-      rmode = intra_mode_cost[mbmi->interintra_mode];
-      vp10_build_interintra_predictors(xd,
-                                       tmp_buf,
-                                       tmp_buf + MAX_SB_SQUARE,
-                                       tmp_buf + 2 * MAX_SB_SQUARE,
-                                       MAX_SB_SIZE,
-                                       MAX_SB_SIZE,
-                                       MAX_SB_SIZE,
-                                       bsize);
+    for (j = 0; j < INTERINTRA_MODES; ++j) {
+      mbmi->interintra_mode = (INTERINTRA_MODE)j;
+      mbmi->interintra_uv_mode = (INTERINTRA_MODE)j;
+      rmode = interintra_mode_cost[mbmi->interintra_mode];
+      vp10_build_intra_predictors_for_interintra(
+          xd, bsize, 0, intrapred, MAX_SB_SIZE);
+      vp10_combine_interintra(xd, bsize, 0, tmp_buf, MAX_SB_SIZE,
+                              intrapred, MAX_SB_SIZE);
+      vp10_build_intra_predictors_for_interintra(
+          xd, bsize, 1, intrapred + MAX_SB_SQUARE, MAX_SB_SIZE);
+      vp10_build_intra_predictors_for_interintra(
+          xd, bsize, 2, intrapred + 2 * MAX_SB_SQUARE, MAX_SB_SIZE);
+      vp10_combine_interintra(xd, bsize, 1,
+                              tmp_buf + MAX_SB_SQUARE, MAX_SB_SIZE,
+                              intrapred + MAX_SB_SQUARE, MAX_SB_SIZE);
+      vp10_combine_interintra(xd, bsize, 2,
+                              tmp_buf + 2 * MAX_SB_SQUARE, MAX_SB_SIZE,
+                              intrapred + 2 * MAX_SB_SQUARE, MAX_SB_SIZE);
       model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum,
                       &skip_txfm_sb, &skip_sse_sb);
       rd = RDCOST(x->rdmult, x->rddiv, rate_mv + rmode + rate_sum, dist_sum);
       if (rd < best_interintra_rd) {
         best_interintra_rd = rd;
-        best_interintra_mode = interintra_mode;
+        best_interintra_mode = mbmi->interintra_mode;
       }
     }
     mbmi->interintra_mode = best_interintra_mode;
     mbmi->interintra_uv_mode = best_interintra_mode;
     if (ref_best_rd < INT64_MAX &&
-        best_interintra_rd / 2 > ref_best_rd) {
+        best_interintra_rd > 2 * ref_best_rd) {
       return INT64_MAX;
     }
-    wedge_bits = get_wedge_bits(bsize);
-    rmode = intra_mode_cost[mbmi->interintra_mode];
-    if (wedge_bits) {
-      vp10_build_interintra_predictors(xd,
-                                       tmp_buf,
-                                       tmp_buf + MAX_SB_SQUARE,
-                                       tmp_buf + 2 * MAX_SB_SQUARE,
-                                       MAX_SB_SIZE,
-                                       MAX_SB_SIZE,
-                                       MAX_SB_SIZE,
-                                       bsize);
+    vp10_build_intra_predictors_for_interintra(
+        xd, bsize, 0, intrapred, MAX_SB_SIZE);
+    vp10_build_intra_predictors_for_interintra(
+        xd, bsize, 1, intrapred + MAX_SB_SQUARE, MAX_SB_SIZE);
+    vp10_build_intra_predictors_for_interintra(
+        xd, bsize, 2, intrapred + 2 * MAX_SB_SQUARE, MAX_SB_SIZE);
+
+    rmode = interintra_mode_cost[mbmi->interintra_mode];
+    if (is_interintra_wedge_used(bsize)) {
+      wedge_bits = get_wedge_bits(bsize);
+      vp10_combine_interintra(xd, bsize, 0, tmp_buf, MAX_SB_SIZE,
+                              intrapred, MAX_SB_SIZE);
+      vp10_combine_interintra(xd, bsize, 1,
+                              tmp_buf + MAX_SB_SQUARE, MAX_SB_SIZE,
+                              intrapred + MAX_SB_SQUARE, MAX_SB_SIZE);
+      vp10_combine_interintra(xd, bsize, 2,
+                              tmp_buf + 2 * MAX_SB_SQUARE, MAX_SB_SIZE,
+                              intrapred + 2 * MAX_SB_SQUARE, MAX_SB_SIZE);
       model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum,
                       &skip_txfm_sb, &skip_sse_sb);
       rwedge = vp10_cost_bit(cm->fc->wedge_interintra_prob[bsize], 0);
@@ -6847,14 +6887,15 @@
       for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
         mbmi->interintra_wedge_index = wedge_index;
         mbmi->interintra_uv_wedge_index = wedge_index;
-        vp10_build_interintra_predictors(xd,
-                                         tmp_buf,
-                                         tmp_buf + MAX_SB_SQUARE,
-                                         tmp_buf + 2 * MAX_SB_SQUARE,
-                                         MAX_SB_SIZE,
-                                         MAX_SB_SIZE,
-                                         MAX_SB_SIZE,
-                                         bsize);
+        vp10_combine_interintra(xd, bsize, 0,
+                                tmp_buf, MAX_SB_SIZE,
+                                intrapred, MAX_SB_SIZE);
+        vp10_combine_interintra(xd, bsize, 1,
+                                tmp_buf + MAX_SB_SQUARE, MAX_SB_SIZE,
+                                intrapred + MAX_SB_SQUARE, MAX_SB_SIZE);
+        vp10_combine_interintra(xd, bsize, 2,
+                                tmp_buf + 2 * MAX_SB_SQUARE, MAX_SB_SIZE,
+                                intrapred + 2 * MAX_SB_SQUARE, MAX_SB_SIZE);
         model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum,
                         &skip_txfm_sb, &skip_sse_sb);
         rd = RDCOST(x->rdmult, x->rddiv,
@@ -6908,9 +6949,9 @@
     pred_exists = 0;
     tmp_rd = best_interintra_rd;
     *compmode_interintra_cost =
-        vp10_cost_bit(cm->fc->interintra_prob[bsize], 1);
-    *compmode_interintra_cost += intra_mode_cost[mbmi->interintra_mode];
-    if (get_wedge_bits(bsize)) {
+        vp10_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 1);
+    *compmode_interintra_cost += interintra_mode_cost[mbmi->interintra_mode];
+    if (is_interintra_wedge_used(bsize)) {
       *compmode_interintra_cost += vp10_cost_bit(
           cm->fc->wedge_interintra_prob[bsize], mbmi->use_wedge_interintra);
       if (mbmi->use_wedge_interintra) {
@@ -6919,7 +6960,7 @@
     }
   } else if (is_interintra_allowed(mbmi)) {
     *compmode_interintra_cost =
-      vp10_cost_bit(cm->fc->interintra_prob[bsize], 0);
+      vp10_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 0);
   }
 
 #if CONFIG_EXT_INTERP
@@ -7416,8 +7457,8 @@
 // bars embedded in the stream.
 int vp10_active_edge_sb(VP10_COMP *cpi,
                        int mi_row, int mi_col) {
-  return vp10_active_h_edge(cpi, mi_row, MI_BLOCK_SIZE) ||
-         vp10_active_v_edge(cpi, mi_col, MI_BLOCK_SIZE);
+  return vp10_active_h_edge(cpi, mi_row, cpi->common.mib_size) ||
+         vp10_active_v_edge(cpi, mi_col, cpi->common.mib_size);
 }
 
 static void restore_uv_color_map(VP10_COMP *cpi, MACROBLOCK *x) {
@@ -7432,9 +7473,9 @@
   int src_stride = x->plane[1].src.stride;
   const uint8_t *const src_u = x->plane[1].src.buf;
   const uint8_t *const src_v = x->plane[2].src.buf;
-  double *const data = x->palette_buffer->kmeans_data_buf;
+  float *const data = x->palette_buffer->kmeans_data_buf;
   uint8_t *const indices = x->palette_buffer->kmeans_indices_buf;
-  double centroids[2 * PALETTE_MAX_SIZE];
+  float centroids[2 * PALETTE_MAX_SIZE];
   uint8_t *const color_map = xd->plane[1].color_index_map;
   int r, c;
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -7552,6 +7593,8 @@
   uint8_t ref_frame_skip_mask[2] = { 0 };
 #if CONFIG_EXT_INTER
   uint32_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
+  MV_REFERENCE_FRAME best_single_inter_ref = LAST_FRAME;
+  int64_t best_single_inter_rd = INT64_MAX;
 #else
   uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
 #endif  // CONFIG_EXT_INTER
@@ -8178,6 +8221,8 @@
 #endif
 #if CONFIG_EXT_INTER
       if (second_ref_frame == INTRA_FRAME) {
+        if (best_single_inter_ref != ref_frame)
+          continue;
         mbmi->interintra_mode = best_intra_mode;
         mbmi->interintra_uv_mode = best_intra_mode;
 #if CONFIG_EXT_INTRA
@@ -8477,7 +8522,6 @@
 #endif  // CONFIG_OBMC
     }
 
-
     // Apply an adjustment to the rd value based on the similarity of the
     // source variance and reconstructed variance.
     rd_variance_adjustment(cpi, x, bsize, &this_rd, ref_frame,
@@ -8487,11 +8531,18 @@
                            x->source_variance);
 
     if (ref_frame == INTRA_FRAME) {
-    // Keep record of best intra rd
+      // Keep record of best intra rd
       if (this_rd < best_intra_rd) {
         best_intra_rd = this_rd;
         best_intra_mode = mbmi->mode;
       }
+#if CONFIG_EXT_INTER
+    } else if (second_ref_frame == NONE) {
+      if (this_rd < best_single_inter_rd) {
+        best_single_inter_rd = this_rd;
+        best_single_inter_ref = mbmi->ref_frame[0];
+      }
+#endif  // CONFIG_EXT_INTER
     }
 
     if (!disable_skip && ref_frame == INTRA_FRAME) {
@@ -8525,7 +8576,7 @@
         *returnrate_nocoef -= vp10_cost_bit(vp10_get_intra_inter_prob(cm, xd),
                                             mbmi->ref_frame[0] != INTRA_FRAME);
 #if CONFIG_OBMC
-        if (is_inter_block(mbmi) && is_obmc_allowed(mbmi))
+        if (is_neighbor_overlappable(mbmi) && is_obmc_allowed(mbmi))
           *returnrate_nocoef -= cpi->obmc_cost[bsize][mbmi->obmc];
 #endif  // CONFIG_OBMC
 #endif  // CONFIG_SUPERTX
@@ -8898,8 +8949,8 @@
          !is_inter_block(&best_mbmode));
 
   if (!cpi->rc.is_src_frame_alt_ref)
-    vp10_update_rd_thresh_fact(tile_data->thresh_freq_fact,
-                              sf->adaptive_rd_thresh, bsize, best_mode_index);
+    vp10_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+                               sf->adaptive_rd_thresh, bsize, best_mode_index);
 
   // macroblock modes
   *mbmi = best_mbmode;
@@ -9045,8 +9096,8 @@
   assert((cm->interp_filter == SWITCHABLE) ||
          (cm->interp_filter == mbmi->interp_filter));
 
-  vp10_update_rd_thresh_fact(tile_data->thresh_freq_fact,
-                            cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
+  vp10_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+                             cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
 
   vp10_zero(best_pred_diff);
 
@@ -9767,8 +9818,8 @@
          (cm->interp_filter == best_mbmode.interp_filter) ||
          !is_inter_block(&best_mbmode));
 
-  vp10_update_rd_thresh_fact(tile_data->thresh_freq_fact,
-                            sf->adaptive_rd_thresh, bsize, best_ref_index);
+  vp10_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+                             sf->adaptive_rd_thresh, bsize, best_ref_index);
 
   // macroblock modes
   *mbmi = best_mbmode;
diff --git a/vp10/encoder/segmentation.c b/vp10/encoder/segmentation.c
index f719467..f3fa210 100644
--- a/vp10/encoder/segmentation.c
+++ b/vp10/encoder/segmentation.c
@@ -180,8 +180,7 @@
   if (bsize == BLOCK_8X8)
     partition = PARTITION_NONE;
   else
-    partition = get_partition(cm->mi, cm->mi_stride, cm->mi_rows, cm->mi_cols,
-                              mi_row, mi_col, bsize);
+    partition = get_partition(cm, mi_row, mi_col, bsize);
   switch (partition) {
     case PARTITION_NONE:
       count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
@@ -328,13 +327,13 @@
       mi_ptr = cm->mi_grid_visible + tile_info.mi_row_start * cm->mi_stride +
                  tile_info.mi_col_start;
       for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
-           mi_row += MI_BLOCK_SIZE, mi_ptr += MI_BLOCK_SIZE * cm->mi_stride) {
+           mi_row += cm->mib_size, mi_ptr += cm->mib_size * cm->mi_stride) {
         MODE_INFO **mi = mi_ptr;
         for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
-             mi_col += MI_BLOCK_SIZE, mi += MI_BLOCK_SIZE) {
+             mi_col += cm->mib_size, mi += cm->mib_size) {
           count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts,
                         temporal_predictor_count, t_unpred_seg_counts,
-                        mi_row, mi_col, BLOCK_LARGEST);
+                        mi_row, mi_col, cm->sb_size);
         }
       }
     }
diff --git a/vp10/encoder/speed_features.h b/vp10/encoder/speed_features.h
index ea4df6e..6ba074d 100644
--- a/vp10/encoder/speed_features.h
+++ b/vp10/encoder/speed_features.h
@@ -319,8 +319,8 @@
   // Disable testing non square partitions. (eg 16x32)
   int use_square_partition_only;
 
-  // Sets min and max partition sizes for this 64x64 region based on the
-  // same 64x64 in last encoded frame, and the left and above neighbor.
+  // Sets min and max partition sizes for this superblock based on the
+  // same superblock in last encoded frame, and the left and above neighbor.
   AUTO_MIN_MAX_MODE auto_min_max_partition_size;
   // Ensures the rd based auto partition search will always
   // go down at least to the specified level.
diff --git a/vp10/encoder/subexp.c b/vp10/encoder/subexp.c
index d944d01..6d9c45f 100644
--- a/vp10/encoder/subexp.c
+++ b/vp10/encoder/subexp.c
@@ -7,7 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "vpx_dsp/bitwriter.h"
+#include "vp10/encoder/bitwriter.h"
 
 #include "vp10/common/common.h"
 #include "vp10/common/entropy.h"
@@ -83,35 +83,35 @@
   return update_bits[delp] << VP9_PROB_COST_SHIFT;
 }
 
-static void encode_uniform(vpx_writer *w, int v) {
+static void encode_uniform(vp10_writer *w, int v) {
   const int l = 8;
   const int m = (1 << l) - 190;
   if (v < m) {
-    vpx_write_literal(w, v, l - 1);
+    vp10_write_literal(w, v, l - 1);
   } else {
-    vpx_write_literal(w, m + ((v - m) >> 1), l - 1);
-    vpx_write_literal(w, (v - m) & 1, 1);
+    vp10_write_literal(w, m + ((v - m) >> 1), l - 1);
+    vp10_write_literal(w, (v - m) & 1, 1);
   }
 }
 
-static INLINE int write_bit_gte(vpx_writer *w, int word, int test) {
-  vpx_write_literal(w, word >= test, 1);
+static INLINE int write_bit_gte(vp10_writer *w, int word, int test) {
+  vp10_write_literal(w, word >= test, 1);
   return word >= test;
 }
 
-static void encode_term_subexp(vpx_writer *w, int word) {
+static void encode_term_subexp(vp10_writer *w, int word) {
   if (!write_bit_gte(w, word, 16)) {
-    vpx_write_literal(w, word, 4);
+    vp10_write_literal(w, word, 4);
   } else if (!write_bit_gte(w, word, 32)) {
-    vpx_write_literal(w, word - 16, 4);
+    vp10_write_literal(w, word - 16, 4);
   } else if (!write_bit_gte(w, word, 64)) {
-    vpx_write_literal(w, word - 32, 5);
+    vp10_write_literal(w, word - 32, 5);
   } else {
     encode_uniform(w, word - 64);
   }
 }
 
-void vp10_write_prob_diff_update(vpx_writer *w, vpx_prob newp, vpx_prob oldp) {
+void vp10_write_prob_diff_update(vp10_writer *w, vpx_prob newp, vpx_prob oldp) {
   const int delp = remap_prob(newp, oldp);
   encode_term_subexp(w, delp);
 }
@@ -262,7 +262,7 @@
 }
 #endif  // CONFIG_ENTROPY
 
-void vp10_cond_prob_diff_update(vpx_writer *w, vpx_prob *oldp,
+void vp10_cond_prob_diff_update(vp10_writer *w, vpx_prob *oldp,
                                const unsigned int ct[2]) {
   const vpx_prob upd = DIFF_UPDATE_PROB;
   vpx_prob newp = get_binary_prob(ct[0], ct[1]);
@@ -270,11 +270,11 @@
                                                           upd);
   assert(newp >= 1);
   if (savings > 0) {
-    vpx_write(w, 1, upd);
+    vp10_write(w, 1, upd);
     vp10_write_prob_diff_update(w, newp, *oldp);
     *oldp = newp;
   } else {
-    vpx_write(w, 0, upd);
+    vp10_write(w, 0, upd);
   }
 }
 
diff --git a/vp10/encoder/subexp.h b/vp10/encoder/subexp.h
index 0f9227c..756b499 100644
--- a/vp10/encoder/subexp.h
+++ b/vp10/encoder/subexp.h
@@ -18,12 +18,12 @@
 
 #include "vpx_dsp/prob.h"
 
-struct vpx_writer;
+struct vp10_writer;
 
-void vp10_write_prob_diff_update(struct vpx_writer *w,
+void vp10_write_prob_diff_update(struct vp10_writer *w,
                                 vpx_prob newp, vpx_prob oldp);
 
-void vp10_cond_prob_diff_update(struct vpx_writer *w, vpx_prob *oldp,
+void vp10_cond_prob_diff_update(struct vp10_writer *w, vpx_prob *oldp,
                                const unsigned int ct[2]);
 
 int vp10_prob_diff_update_savings_search(const unsigned int *ct,
diff --git a/vp10/encoder/variance_tree.c b/vp10/encoder/variance_tree.c
new file mode 100644
index 0000000..d11ef2d
--- /dev/null
+++ b/vp10/encoder/variance_tree.c
@@ -0,0 +1,63 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/encoder/variance_tree.h"
+#include "vp10/encoder/encoder.h"
+
+
+
+void vp10_setup_var_tree(struct VP10Common *cm, ThreadData *td) {
+  int i, j;
+#if CONFIG_EXT_PARTITION
+  const int leaf_nodes = 1024;
+  const int tree_nodes = 1024 + 256 + 64 + 16 + 4 + 1;
+#else
+  const int leaf_nodes = 256;
+  const int tree_nodes = 256 + 64 + 16 + 4 + 1;
+#endif  // CONFIG_EXT_PARTITION
+  int index = 0;
+  VAR_TREE *this_var;
+  int nodes;
+
+  vpx_free(td->var_tree);
+  CHECK_MEM_ERROR(cm, td->var_tree, vpx_calloc(tree_nodes,
+                                              sizeof(*td->var_tree)));
+
+  this_var = &td->var_tree[0];
+
+  // Sets up all the leaf nodes in the tree.
+  for (index = 0; index < leaf_nodes; ++index) {
+    VAR_TREE *const leaf = &td->var_tree[index];
+    leaf->split[0] = NULL;
+  }
+
+  // Each node has 4 leaf nodes; fill in the child pointers
+  // from the leaves up to the root.
+  for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+    for (i = 0; i < nodes; ++i, ++index) {
+      VAR_TREE *const node = &td->var_tree[index];
+      for (j = 0; j < 4; j++)
+        node->split[j] = this_var++;
+    }
+  }
+
+  // Set up the root node for the largest superblock size
+  i = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2;
+  td->var_root[i] = &td->var_tree[tree_nodes - 1];
+  // Set up the root nodes for the rest of the possible superblock sizes
+  while (--i >= 0) {
+    td->var_root[i] = td->var_root[i+1]->split[0];
+  }
+}
+
+void vp10_free_var_tree(ThreadData *td) {
+  vpx_free(td->var_tree);
+  td->var_tree = NULL;
+}
diff --git a/vp10/encoder/variance_tree.h b/vp10/encoder/variance_tree.h
new file mode 100644
index 0000000..a10f7e7
--- /dev/null
+++ b/vp10/encoder/variance_tree.h
@@ -0,0 +1,98 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_VARIANCE_TREE_H_
+#define VP10_ENCODER_VARIANCE_TREE_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+#include "vp10/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP10Common;
+struct ThreadData;
+
+typedef struct {
+  int64_t sum_square_error;
+  int64_t sum_error;
+  int log2_count;
+  int variance;
+} var;
+
+typedef struct {
+  var none;
+  var horz[2];
+  var vert[2];
+} partition_variance;
+
+typedef struct VAR_TREE {
+  int force_split;
+  partition_variance variances;
+  struct VAR_TREE *split[4];
+  BLOCK_SIZE bsize;
+  const uint8_t *src;
+  const uint8_t *ref;
+  int src_stride;
+  int ref_stride;
+  int width;
+  int height;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int highbd;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+} VAR_TREE;
+
+void vp10_setup_var_tree(struct VP10Common *cm, struct ThreadData *td);
+void vp10_free_var_tree(struct ThreadData *td);
+
+// Set variance values given sum square error, sum error, count.
+static INLINE void fill_variance(int64_t s2, int64_t s, int c, var *v) {
+  v->sum_square_error = s2;
+  v->sum_error = s;
+  v->log2_count = c;
+  v->variance = (int)(256 * (v->sum_square_error -
+      ((v->sum_error * v->sum_error) >> v->log2_count)) >> v->log2_count);
+}
+
+static INLINE void sum_2_variances(const var *a, const var *b, var *r) {
+  assert(a->log2_count == b->log2_count);
+  fill_variance(a->sum_square_error + b->sum_square_error,
+                a->sum_error + b->sum_error, a->log2_count + 1, r);
+}
+
+static INLINE void fill_variance_node(VAR_TREE *vt) {
+  sum_2_variances(&vt->split[0]->variances.none,
+                  &vt->split[1]->variances.none,
+                  &vt->variances.horz[0]);
+  sum_2_variances(&vt->split[2]->variances.none,
+                  &vt->split[3]->variances.none,
+                  &vt->variances.horz[1]);
+  sum_2_variances(&vt->split[0]->variances.none,
+                  &vt->split[2]->variances.none,
+                  &vt->variances.vert[0]);
+  sum_2_variances(&vt->split[1]->variances.none,
+                  &vt->split[3]->variances.none,
+                  &vt->variances.vert[1]);
+  sum_2_variances(&vt->variances.vert[0],
+                  &vt->variances.vert[1],
+                  &vt->variances.none);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif /* VP10_ENCODER_VARIANCE_TREE_H_ */
diff --git a/vp10/vp10_cx_iface.c b/vp10/vp10_cx_iface.c
index 047fcfb..0cad961 100644
--- a/vp10/vp10_cx_iface.c
+++ b/vp10/vp10_cx_iface.c
@@ -49,40 +49,42 @@
   int                         color_range;
   int                         render_width;
   int                         render_height;
+  vpx_superblock_size_t       superblock_size;
 };
 
 static struct vp10_extracfg default_extra_cfg = {
-  0,                          // cpu_used
-  1,                          // enable_auto_alt_ref
-  0,                          // noise_sensitivity
-  0,                          // sharpness
-  0,                          // static_thresh
+  0,                            // cpu_used
+  1,                            // enable_auto_alt_ref
+  0,                            // noise_sensitivity
+  0,                            // sharpness
+  0,                            // static_thresh
 #if CONFIG_EXT_TILE
-  64,                         // tile_columns
-  64,                         // tile_rows
+  UINT_MAX,                     // tile_columns
+  UINT_MAX,                     // tile_rows
 #else
-  0,                          // tile_columns
-  0,                          // tile_rows
+  0,                            // tile_columns
+  0,                            // tile_rows
 #endif  // CONFIG_EXT_TILE
-  7,                          // arnr_max_frames
-  5,                          // arnr_strength
-  0,                          // min_gf_interval; 0 -> default decision
-  0,                          // max_gf_interval; 0 -> default decision
-  VP8_TUNE_PSNR,              // tuning
-  10,                         // cq_level
-  0,                          // rc_max_intra_bitrate_pct
-  0,                          // rc_max_inter_bitrate_pct
-  0,                          // gf_cbr_boost_pct
-  0,                          // lossless
-  1,                          // frame_parallel_decoding_mode
-  NO_AQ,                      // aq_mode
-  0,                          // frame_periodic_delta_q
-  VPX_BITS_8,                 // Bit depth
-  VP9E_CONTENT_DEFAULT,       // content
-  VPX_CS_UNKNOWN,             // color space
-  0,                          // color range
-  0,                          // render width
-  0,                          // render height
+  7,                            // arnr_max_frames
+  5,                            // arnr_strength
+  0,                            // min_gf_interval; 0 -> default decision
+  0,                            // max_gf_interval; 0 -> default decision
+  VP8_TUNE_PSNR,                // tuning
+  10,                           // cq_level
+  0,                            // rc_max_intra_bitrate_pct
+  0,                            // rc_max_inter_bitrate_pct
+  0,                            // gf_cbr_boost_pct
+  0,                            // lossless
+  1,                            // frame_parallel_decoding_mode
+  NO_AQ,                        // aq_mode
+  0,                            // frame_periodic_delta_q
+  VPX_BITS_8,                   // Bit depth
+  VP9E_CONTENT_DEFAULT,         // content
+  VPX_CS_UNKNOWN,               // color space
+  0,                            // color range
+  0,                            // render width
+  0,                            // render height
+  VPX_SUPERBLOCK_SIZE_DYNAMIC   // superblock_size
 };
 
 struct vpx_codec_alg_priv {
@@ -199,12 +201,26 @@
   RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
   RANGE_CHECK(extra_cfg, cpu_used, -8, 8);
   RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
+  RANGE_CHECK(extra_cfg, superblock_size,
+              VPX_SUPERBLOCK_SIZE_64X64, VPX_SUPERBLOCK_SIZE_DYNAMIC);
 #if CONFIG_EXT_TILE
   // TODO(any): Waring. If CONFIG_EXT_TILE is true, tile_columns really
   // means tile_width, and tile_rows really means tile_hight. The interface
   // should be sanitized.
-  RANGE_CHECK(extra_cfg, tile_columns, 1, 64);
-  RANGE_CHECK(extra_cfg, tile_rows, 1, 64);
+#if CONFIG_EXT_PARTITION
+  if (extra_cfg->superblock_size != VPX_SUPERBLOCK_SIZE_64X64) {
+    if (extra_cfg->tile_columns != UINT_MAX)
+      RANGE_CHECK(extra_cfg, tile_columns, 1, 32);
+    if (extra_cfg->tile_rows != UINT_MAX)
+      RANGE_CHECK(extra_cfg, tile_rows, 1, 32);
+  } else
+#endif  // CONFIG_EXT_PARTITION
+  {
+    if (extra_cfg->tile_columns != UINT_MAX)
+      RANGE_CHECK(extra_cfg, tile_columns, 1, 64);
+    if (extra_cfg->tile_rows != UINT_MAX)
+      RANGE_CHECK(extra_cfg, tile_rows, 1, 64);
+  }
 #else
   RANGE_CHECK(extra_cfg, tile_columns, 0, 6);
   RANGE_CHECK(extra_cfg, tile_rows, 0, 2);
@@ -416,8 +432,25 @@
   oxcf->tuning = extra_cfg->tuning;
   oxcf->content = extra_cfg->content;
 
+#if CONFIG_EXT_PARTITION
+  oxcf->superblock_size = extra_cfg->superblock_size;
+#endif  // CONFIG_EXT_PARTITION
+
+#if CONFIG_EXT_TILE
+  {
+#if CONFIG_EXT_PARTITION
+    const unsigned int max =
+      extra_cfg->superblock_size == VPX_SUPERBLOCK_SIZE_64X64 ? 64 : 32;
+#else
+    const unsigned int max = 64;
+#endif  // CONFIG_EXT_PARTITION
+    oxcf->tile_columns = VPXMIN(extra_cfg->tile_columns, max);
+    oxcf->tile_rows    = VPXMIN(extra_cfg->tile_rows, max);
+  }
+#else
   oxcf->tile_columns = extra_cfg->tile_columns;
   oxcf->tile_rows    = extra_cfg->tile_rows;
+#endif  // CONFIG_EXT_TILE
 
   oxcf->error_resilient_mode         = cfg->g_error_resilient;
   oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode;
@@ -1247,6 +1280,13 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_superblock_size(vpx_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct vp10_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.superblock_size = CAST(VP10E_SET_SUPERBLOCK_SIZE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   {VP8_COPY_REFERENCE,                ctrl_copy_reference},
   {VP8E_USE_REFERENCE,                ctrl_use_reference},
@@ -1283,6 +1323,7 @@
   {VP9E_SET_MIN_GF_INTERVAL,          ctrl_set_min_gf_interval},
   {VP9E_SET_MAX_GF_INTERVAL,          ctrl_set_max_gf_interval},
   {VP9E_SET_RENDER_SIZE,              ctrl_set_render_size},
+  {VP10E_SET_SUPERBLOCK_SIZE,         ctrl_set_superblock_size},
 
   // Getters
   {VP8E_GET_LAST_QUANTIZER,           ctrl_get_quantizer},
diff --git a/vp10/vp10cx.mk b/vp10/vp10cx.mk
index 34b766f..d174c8b 100644
--- a/vp10/vp10cx.mk
+++ b/vp10/vp10cx.mk
@@ -21,6 +21,8 @@
 VP10_CX_SRCS-yes += encoder/bitwriter.h
 VP10_CX_SRCS-yes += encoder/context_tree.c
 VP10_CX_SRCS-yes += encoder/context_tree.h
+VP10_CX_SRCS-yes += encoder/variance_tree.c
+VP10_CX_SRCS-yes += encoder/variance_tree.h
 VP10_CX_SRCS-yes += encoder/cost.h
 VP10_CX_SRCS-yes += encoder/cost.c
 VP10_CX_SRCS-yes += encoder/dct.c
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 5600ed4..9f8004b 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -210,7 +210,7 @@
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
       vpx_write(w, is_compound, vp9_get_reference_mode_prob(cm, xd));
     } else {
-      assert(!is_compound == (cm->reference_mode == SINGLE_REFERENCE));
+      assert((!is_compound) == (cm->reference_mode == SINGLE_REFERENCE));
     }
 
     if (is_compound) {
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index d9764a4..1306481 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -560,6 +560,15 @@
    * Supported in codecs: VP9
    */
   VP9E_SET_RENDER_SIZE,
+
+  /*!\brief Codec control function to set intended superblock size.
+   *
+   * By default, the superblock size is determined separately for each
+   * frame by the encoder.
+   *
+   * Supported in codecs: VP10
+   */
+  VP10E_SET_SUPERBLOCK_SIZE,
 };
 
 /*!\brief vpx 1-D scaling mode
@@ -820,6 +829,9 @@
  */
 #define VPX_CTRL_VP9E_SET_RENDER_SIZE
 VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *)
+
+VPX_CTRL_USE_TYPE(VP10E_SET_SUPERBLOCK_SIZE, unsigned int)
+#define VPX_CTRL_VP10E_SET_SUPERBLOCK_SIZE
 /*!\endcond */
 /*! @} - end defgroup vp8_encoder */
 #ifdef __cplusplus
diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h
index b6037bb..e65e3f4 100644
--- a/vpx/vpx_codec.h
+++ b/vpx/vpx_codec.h
@@ -222,6 +222,18 @@
     VPX_BITS_12 = 12,  /**< 12 bits */
   } vpx_bit_depth_t;
 
+  /*!\brief Superblock size selection.
+   *
+   * Defines the superblock size used for encoding. The superblock size can
+   * either be fixed at 64x64 or 128x128 pixels, or it can be dynamically
+   * selected by the encoder for each frame.
+   */
+  typedef enum vpx_superblock_size {
+    VPX_SUPERBLOCK_SIZE_64X64,    /**< Always use 64x64 superblocks. */
+    VPX_SUPERBLOCK_SIZE_128X128,  /**< Always use 128x128 superblocks. */
+    VPX_SUPERBLOCK_SIZE_DYNAMIC   /**< Select superblock size dynamically. */
+  } vpx_superblock_size_t;
+
   /*
    * Library Version Number Interface
    *
diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c
index 26fe785..d3695a9 100644
--- a/vpx_dsp/avg.c
+++ b/vpx_dsp/avg.c
@@ -12,22 +12,22 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 
-unsigned int vpx_avg_8x8_c(const uint8_t *s, int p) {
+unsigned int vpx_avg_8x8_c(const uint8_t *src, int stride) {
   int i, j;
   int sum = 0;
-  for (i = 0; i < 8; ++i, s+=p)
-    for (j = 0; j < 8; sum += s[j], ++j) {}
+  for (i = 0; i < 8; ++i, src += stride)
+    for (j = 0; j < 8; sum += src[j], ++j) {}
 
-  return (sum + 32) >> 6;
+  return ROUND_POWER_OF_TWO(sum, 6);
 }
 
-unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) {
+unsigned int vpx_avg_4x4_c(const uint8_t *src, int stride) {
   int i, j;
   int sum = 0;
-  for (i = 0; i < 4; ++i, s+=p)
-    for (j = 0; j < 4; sum += s[j], ++j) {}
+  for (i = 0; i < 4; ++i, src += stride)
+    for (j = 0; j < 4; sum += src[j], ++j) {}
 
-  return (sum + 8) >> 4;
+  return ROUND_POWER_OF_TWO(sum, 4);
 }
 
 // src_diff: first pass, 9 bit, dynamic range [-255, 255]
@@ -176,14 +176,15 @@
   return var;
 }
 
-void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+void vpx_minmax_8x8_c(const uint8_t *src, int src_stride,
+                      const uint8_t *ref, int ref_stride,
                       int *min, int *max) {
   int i, j;
   *min = 255;
   *max = 0;
-  for (i = 0; i < 8; ++i, s += p, d += dp) {
+  for (i = 0; i < 8; ++i, src += src_stride, ref += ref_stride) {
     for (j = 0; j < 8; ++j) {
-      int diff = abs(s[j]-d[j]);
+      int diff = abs(src[j]-ref[j]);
       *min = diff < *min ? diff : *min;
       *max = diff > *max ? diff : *max;
     }
@@ -191,24 +192,24 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p) {
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *src, int stride) {
   int i, j;
   int sum = 0;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
-  for (i = 0; i < 8; ++i, s+=p)
+  const uint16_t* s = CONVERT_TO_SHORTPTR(src);
+  for (i = 0; i < 8; ++i, s += stride)
     for (j = 0; j < 8; sum += s[j], ++j) {}
 
-  return (sum + 32) >> 6;
+  return ROUND_POWER_OF_TWO(sum, 6);
 }
 
-unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *src, int stride) {
   int i, j;
   int sum = 0;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
-  for (i = 0; i < 4; ++i, s+=p)
+  const uint16_t* s = CONVERT_TO_SHORTPTR(src);
+  for (i = 0; i < 4; ++i, s+=stride)
     for (j = 0; j < 4; sum += s[j], ++j) {}
 
-  return (sum + 8) >> 4;
+  return ROUND_POWER_OF_TWO(sum, 4);
 }
 
 void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index 24f42df..e6be1dd 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -433,7 +433,7 @@
   return *sse; \
 }
 
-static void highbd_var_filter_block2d_bil_first_pass(
+void vpx_highbd_var_filter_block2d_bil_first_pass(
     const uint8_t *src_ptr8,
     uint16_t *output_ptr,
     unsigned int src_pixels_per_line,
@@ -459,7 +459,7 @@
   }
 }
 
-static void highbd_var_filter_block2d_bil_second_pass(
+void vpx_highbd_var_filter_block2d_bil_second_pass(
     const uint16_t *src_ptr,
     uint16_t *output_ptr,
     unsigned int src_pixels_per_line,
@@ -492,13 +492,14 @@
   uint16_t fdata3[(H + 1) * W]; \
   uint16_t temp2[H * W]; \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters_2t[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters_2t[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass( \
+      src, fdata3, src_stride, 1, H + 1, \
+      W, bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
-                                          dst_stride, sse); \
+                                            dst_stride, sse); \
 } \
 \
 uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \
@@ -509,10 +510,11 @@
   uint16_t fdata3[(H + 1) * W]; \
   uint16_t temp2[H * W]; \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters_2t[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters_2t[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass( \
+      src, fdata3, src_stride, 1, H + 1, \
+      W, bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                              W, dst, dst_stride, sse); \
@@ -526,10 +528,11 @@
   uint16_t fdata3[(H + 1) * W]; \
   uint16_t temp2[H * W]; \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters_2t[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters_2t[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass( \
+      src, fdata3, src_stride, 1, H + 1, \
+      W, bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                              W, dst, dst_stride, sse); \
@@ -546,16 +549,17 @@
   uint16_t temp2[H * W]; \
   DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters_2t[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters_2t[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass( \
+      src, fdata3, src_stride, 1, H + 1, \
+      W, bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
-  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
-                           CONVERT_TO_BYTEPTR(temp2), W); \
+  vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+                             CONVERT_TO_BYTEPTR(temp2), W); \
 \
   return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
-                                          dst_stride, sse); \
+                                            dst_stride, sse);           \
 } \
 \
 uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
@@ -568,13 +572,14 @@
   uint16_t temp2[H * W]; \
   DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters_2t[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters_2t[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass( \
+      src, fdata3, src_stride, 1, H + 1, \
+      W, bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
-  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
-                           CONVERT_TO_BYTEPTR(temp2), W); \
+  vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+                             CONVERT_TO_BYTEPTR(temp2), W); \
 \
   return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
                                              W, dst, dst_stride, sse); \
@@ -590,13 +595,14 @@
   uint16_t temp2[H * W]; \
   DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters_2t[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters_2t[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass( \
+      src, fdata3, src_stride, 1, H + 1, \
+      W, bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
-  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
-                           CONVERT_TO_BYTEPTR(temp2), W); \
+  vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+                             CONVERT_TO_BYTEPTR(temp2), W); \
 \
   return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
                                              W, dst, dst_stride, sse); \
@@ -635,9 +641,9 @@
 HIGHBD_MSE(8, 16)
 HIGHBD_MSE(8, 8)
 
-void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
-                              int width, int height, const uint8_t *ref8,
-                              int ref_stride) {
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
+                                int width, int height, const uint8_t *ref8,
+                                int ref_stride) {
   int i, j;
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
@@ -914,11 +920,11 @@
   uint16_t fdata3[(H + 1) * W]; \
   uint16_t temp2[H * W]; \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
-                                           H + 1, W, \
-                                           bilinear_filters_2t[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters_2t[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
+                                               H + 1, W, \
+                                               bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                  W, dst, dst_stride, \
@@ -934,11 +940,11 @@
   uint16_t fdata3[(H + 1) * W]; \
   uint16_t temp2[H * W]; \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
-                                           H + 1, W, \
-                                           bilinear_filters_2t[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters_2t[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
+                                               H + 1, W, \
+                                               bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_10_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                     W, dst, dst_stride, \
@@ -954,11 +960,11 @@
   uint16_t fdata3[(H + 1) * W]; \
   uint16_t temp2[H * W]; \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
-                                           H + 1, W, \
-                                           bilinear_filters_2t[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters_2t[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
+                                               H + 1, W, \
+                                               bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_12_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                                     W, dst, dst_stride, \
diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h
index 161d647..dea2af9 100644
--- a/vpx_dsp/variance.h
+++ b/vpx_dsp/variance.h
@@ -23,10 +23,10 @@
 #define FILTER_WEIGHT 128
 
 typedef unsigned int(*vpx_sad_fn_t)(const uint8_t *a, int a_stride,
-                                    const uint8_t *b_ptr, int b_stride);
+                                    const uint8_t *b, int b_stride);
 
-typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride,
-                                        const uint8_t *b_ptr, int b_stride,
+typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a, int a_stride,
+                                        const uint8_t *b, int b_stride,
                                         const uint8_t *second_pred);
 
 typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride,
@@ -50,10 +50,10 @@
                                                 const uint8_t *b, int b_stride,
                                                 unsigned int *sse);
 
-typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a_ptr,
+typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a,
                                                    int a_stride,
                                                    int xoffset, int yoffset,
-                                                   const uint8_t *b_ptr,
+                                                   const uint8_t *b,
                                                    int b_stride,
                                                    unsigned int *sse,
                                                    const uint8_t *second_pred);
@@ -75,26 +75,25 @@
 #endif  // CONFIG_VP8
 
 #if CONFIG_VP10 && CONFIG_EXT_INTER
-typedef unsigned int(*vpx_masked_sad_fn_t)(const uint8_t *src_ptr,
-                                           int source_stride,
-                                           const uint8_t *ref_ptr,
+typedef unsigned int(*vpx_masked_sad_fn_t)(const uint8_t *src,
+                                           int src_stride,
+                                           const uint8_t *ref,
                                            int ref_stride,
                                            const uint8_t *msk_ptr,
                                            int msk_stride);
-typedef unsigned int (*vpx_masked_variance_fn_t)(const uint8_t *src_ptr,
-                                                 int source_stride,
-                                                 const uint8_t *ref_ptr,
+typedef unsigned int (*vpx_masked_variance_fn_t)(const uint8_t *src,
+                                                 int src_stride,
+                                                 const uint8_t *ref,
                                                  int ref_stride,
-                                                 const uint8_t *msk_ptr,
+                                                 const uint8_t *msk,
                                                  int msk_stride,
                                                  unsigned int *sse);
-typedef unsigned int (*vpx_masked_subpixvariance_fn_t)(const uint8_t *src_ptr,
-                                                       int source_stride,
-                                                       int xoffset,
-                                                       int yoffset,
-                                                       const uint8_t *ref_ptr,
-                                                       int Refstride,
-                                                       const uint8_t *msk_ptr,
+typedef unsigned int (*vpx_masked_subpixvariance_fn_t)(const uint8_t *src,
+                                                       int src_stride,
+                                                       int xoffset, int yoffset,
+                                                       const uint8_t *ref,
+                                                       int ref_stride,
+                                                       const uint8_t *msk,
                                                        int msk_stride,
                                                        unsigned int *sse);
 #endif  // CONFIG_VP10 && CONFIG_EXT_INTER
@@ -130,6 +129,24 @@
 } vp10_variance_fn_ptr_t;
 #endif  // CONFIG_VP10
 
+void vpx_highbd_var_filter_block2d_bil_first_pass(
+    const uint8_t *src_ptr8,
+    uint16_t *output_ptr,
+    unsigned int src_pixels_per_line,
+    int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const uint8_t *filter);
+
+void vpx_highbd_var_filter_block2d_bil_second_pass(
+    const uint16_t *src_ptr,
+    uint16_t *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const uint8_t *filter);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index e371849..46ef5fc 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -266,6 +266,11 @@
 endif
 endif
 
+# high bit depth subtract
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_subtract_sse2.c
+endif
+
 endif  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
 
 ifeq ($(CONFIG_VP10_ENCODER),yes)
@@ -350,6 +355,7 @@
 
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_sse2.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_variance_sse4.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_impl_sse2.asm
 ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_subpel_variance_impl_sse2.asm
diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h
index e127031..3571eea 100644
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -30,6 +30,8 @@
 #define VPXMIN(x, y) (((x) < (y)) ? (x) : (y))
 #define VPXMAX(x, y) (((x) > (y)) ? (x) : (y))
 
+#define IMPLIES(a, b)  (!(a) || (b))  //  Logical 'a implies b' (or 'a -> b')
+
 // These can be used to give a hint about branch outcomes.
 // This can have an effect, even if your target processor has a
 // good branch predictor, as these hints can affect basic block
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index d01e81d..a648e45 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -965,10 +965,6 @@
 #
 add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
 specialize qw/vpx_subtract_block neon msa/, "$sse2_x86inc";
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
-  specialize qw/vpx_highbd_subtract_block/;
-}
 
 if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
   #
@@ -991,6 +987,8 @@
     specialize qw/vpx_highbd_avg_8x8/;
     add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";
     specialize qw/vpx_highbd_avg_4x4/;
+    add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+    specialize qw/vpx_highbd_subtract_block sse2/;
   }
 
   #
@@ -1316,10 +1314,17 @@
       if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
         specialize "vpx_highbd_${bd}_variance${w}x${h}", "sse2";
       }
+      if ($w == 4 && $h == 4) {
+        specialize "vpx_highbd_${bd}_variance${w}x${h}", "sse4_1";
+      }
       if ($w != 128 && $h != 128 && $w != 4) {
         specialize "vpx_highbd_${bd}_sub_pixel_variance${w}x${h}", $sse2_x86inc;
         specialize "vpx_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", $sse2_x86inc;
       }
+      if ($w == 4 && $h == 4) {
+        specialize "vpx_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1";
+        specialize "vpx_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
+      }
     }
   }
 }  # CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/highbd_subtract_sse2.c b/vpx_dsp/x86/highbd_subtract_sse2.c
new file mode 100644
index 0000000..33e464b
--- /dev/null
+++ b/vpx_dsp/x86/highbd_subtract_sse2.c
@@ -0,0 +1,366 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stddef.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+typedef void (*SubtractWxHFuncType)(
+    int16_t *diff, ptrdiff_t diff_stride,
+    const uint16_t *src, ptrdiff_t src_stride,
+    const uint16_t *pred, ptrdiff_t pred_stride);
+
+static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3;
+  __m128i x0, x1, x2, x3;
+  int64_t *store_diff = (int64_t *) (diff + 0 * diff_stride);
+
+  u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride));
+  u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride));
+  u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride));
+  u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride));
+
+  v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride));
+  v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride));
+  v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride));
+  v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride));
+
+  x0 = _mm_sub_epi16(u0, v0);
+  x1 = _mm_sub_epi16(u1, v1);
+  x2 = _mm_sub_epi16(u2, v2);
+  x3 = _mm_sub_epi16(u3, v3);
+
+  _mm_storel_epi64((__m128i *)store_diff, x0);
+  store_diff = (int64_t *) (diff + 1 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x1);
+  store_diff = (int64_t *) (diff + 2 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x2);
+  store_diff = (int64_t *) (diff + 3 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x3);
+}
+
+static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  int64_t *store_diff = (int64_t *) (diff + 0 * diff_stride);
+
+  u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride));
+  u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride));
+  u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride));
+  u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride));
+  u4 = _mm_loadu_si128((__m128i const *) (src + 4 * src_stride));
+  u5 = _mm_loadu_si128((__m128i const *) (src + 5 * src_stride));
+  u6 = _mm_loadu_si128((__m128i const *) (src + 6 * src_stride));
+  u7 = _mm_loadu_si128((__m128i const *) (src + 7 * src_stride));
+
+  v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride));
+  v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride));
+  v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride));
+  v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride));
+  v4 = _mm_loadu_si128((__m128i const *) (pred + 4 * pred_stride));
+  v5 = _mm_loadu_si128((__m128i const *) (pred + 5 * pred_stride));
+  v6 = _mm_loadu_si128((__m128i const *) (pred + 6 * pred_stride));
+  v7 = _mm_loadu_si128((__m128i const *) (pred + 7 * pred_stride));
+
+  x0 = _mm_sub_epi16(u0, v0);
+  x1 = _mm_sub_epi16(u1, v1);
+  x2 = _mm_sub_epi16(u2, v2);
+  x3 = _mm_sub_epi16(u3, v3);
+  x4 = _mm_sub_epi16(u4, v4);
+  x5 = _mm_sub_epi16(u5, v5);
+  x6 = _mm_sub_epi16(u6, v6);
+  x7 = _mm_sub_epi16(u7, v7);
+
+  _mm_storel_epi64((__m128i *)store_diff, x0);
+  store_diff = (int64_t *) (diff + 1 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x1);
+  store_diff = (int64_t *) (diff + 2 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x2);
+  store_diff = (int64_t *) (diff + 3 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x3);
+  store_diff = (int64_t *) (diff + 4 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x4);
+  store_diff = (int64_t *) (diff + 5 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x5);
+  store_diff = (int64_t *) (diff + 6 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x6);
+  store_diff = (int64_t *) (diff + 7 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x7);
+}
+
+static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3;
+  __m128i x0, x1, x2, x3;
+
+  u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride));
+  u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride));
+  u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride));
+  u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride));
+
+  v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride));
+  v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride));
+  v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride));
+  v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride));
+
+  x0 = _mm_sub_epi16(u0, v0);
+  x1 = _mm_sub_epi16(u1, v1);
+  x2 = _mm_sub_epi16(u2, v2);
+  x3 = _mm_sub_epi16(u3, v3);
+
+  _mm_storeu_si128((__m128i *) (diff + 0 * diff_stride), x0);
+  _mm_storeu_si128((__m128i *) (diff + 1 * diff_stride), x1);
+  _mm_storeu_si128((__m128i *) (diff + 2 * diff_stride), x2);
+  _mm_storeu_si128((__m128i *) (diff + 3 * diff_stride), x3);
+}
+
+static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+
+  u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride));
+  u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride));
+  u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride));
+  u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride));
+  u4 = _mm_loadu_si128((__m128i const *) (src + 4 * src_stride));
+  u5 = _mm_loadu_si128((__m128i const *) (src + 5 * src_stride));
+  u6 = _mm_loadu_si128((__m128i const *) (src + 6 * src_stride));
+  u7 = _mm_loadu_si128((__m128i const *) (src + 7 * src_stride));
+
+  v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride));
+  v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride));
+  v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride));
+  v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride));
+  v4 = _mm_loadu_si128((__m128i const *) (pred + 4 * pred_stride));
+  v5 = _mm_loadu_si128((__m128i const *) (pred + 5 * pred_stride));
+  v6 = _mm_loadu_si128((__m128i const *) (pred + 6 * pred_stride));
+  v7 = _mm_loadu_si128((__m128i const *) (pred + 7 * pred_stride));
+
+  x0 = _mm_sub_epi16(u0, v0);
+  x1 = _mm_sub_epi16(u1, v1);
+  x2 = _mm_sub_epi16(u2, v2);
+  x3 = _mm_sub_epi16(u3, v3);
+  x4 = _mm_sub_epi16(u4, v4);
+  x5 = _mm_sub_epi16(u5, v5);
+  x6 = _mm_sub_epi16(u6, v6);
+  x7 = _mm_sub_epi16(u7, v7);
+
+  _mm_storeu_si128((__m128i *) (diff + 0 * diff_stride), x0);
+  _mm_storeu_si128((__m128i *) (diff + 1 * diff_stride), x1);
+  _mm_storeu_si128((__m128i *) (diff + 2 * diff_stride), x2);
+  _mm_storeu_si128((__m128i *) (diff + 3 * diff_stride), x3);
+  _mm_storeu_si128((__m128i *) (diff + 4 * diff_stride), x4);
+  _mm_storeu_si128((__m128i *) (diff + 5 * diff_stride), x5);
+  _mm_storeu_si128((__m128i *) (diff + 6 * diff_stride), x6);
+  _mm_storeu_si128((__m128i *) (diff + 7 * diff_stride), x7);
+}
+
+static void subtract_8x16(int16_t *diff, ptrdiff_t diff_stride,
+                          const uint16_t *src, ptrdiff_t src_stride,
+                          const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 3;
+  src += src_stride << 3;
+  pred += pred_stride << 3;
+  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_16x8(int16_t *diff, ptrdiff_t diff_stride,
+                          const uint16_t *src, ptrdiff_t src_stride,
+                          const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += 8;
+  src += 8;
+  pred += 8;
+  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_16x16(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 3;
+  src += src_stride << 3;
+  pred += pred_stride << 3;
+  subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_16x32(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 4;
+  src += src_stride << 4;
+  pred += pred_stride << 4;
+  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_32x16(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += 16;
+  src += 16;
+  pred += 16;
+  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_32x32(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 4;
+  src += src_stride << 4;
+  pred += pred_stride << 4;
+  subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_32x64(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 5;
+  src += src_stride << 5;
+  pred += pred_stride << 5;
+  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_64x32(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += 32;
+  src += 32;
+  pred += 32;
+  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_64x64(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 5;
+  src += src_stride << 5;
+  pred += pred_stride << 5;
+  subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_64x128(int16_t *diff, ptrdiff_t diff_stride,
+                            const uint16_t *src, ptrdiff_t src_stride,
+                            const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 6;
+  src += src_stride << 6;
+  pred += pred_stride << 6;
+  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_128x64(int16_t *diff, ptrdiff_t diff_stride,
+                            const uint16_t *src, ptrdiff_t src_stride,
+                            const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += 64;
+  src += 64;
+  pred += 64;
+  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_128x128(int16_t *diff, ptrdiff_t diff_stride,
+                             const uint16_t *src, ptrdiff_t src_stride,
+                             const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 6;
+  src += src_stride << 6;
+  pred += pred_stride << 6;
+  subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
+  SubtractWxHFuncType ret_func_ptr = NULL;
+  if (rows == 4) {
+    if (cols == 4) {
+      ret_func_ptr = subtract_4x4;
+    } else if (cols == 8) {
+      ret_func_ptr = subtract_8x4;
+    }
+  } else if (rows == 8) {
+    if (cols == 4) {
+      ret_func_ptr = subtract_4x8;
+    } else if (cols == 8) {
+      ret_func_ptr = subtract_8x8;
+    } else if (cols == 16) {
+      ret_func_ptr = subtract_16x8;
+    }
+  } else if (rows == 16) {
+    if (cols == 8) {
+      ret_func_ptr = subtract_8x16;
+    } else if (cols == 16) {
+      ret_func_ptr = subtract_16x16;
+    } else if (cols == 32) {
+      ret_func_ptr = subtract_32x16;
+    }
+  } else if (rows == 32) {
+    if (cols == 16) {
+      ret_func_ptr = subtract_16x32;
+    } else if (cols == 32) {
+      ret_func_ptr = subtract_32x32;
+    } else if (cols == 64) {
+      ret_func_ptr = subtract_64x32;
+    }
+  } else if (rows == 64) {
+    if (cols == 32) {
+      ret_func_ptr = subtract_32x64;
+    } else if (cols == 64) {
+      ret_func_ptr = subtract_64x64;
+    } else if (cols == 128) {
+      ret_func_ptr = subtract_128x64;
+    }
+  } else if (rows == 128) {
+    if (cols == 64) {
+      ret_func_ptr = subtract_64x128;
+    } else if (cols == 128) {
+      ret_func_ptr = subtract_128x128;
+    }
+  }
+  if (!ret_func_ptr) {
+    assert(0);
+  }
+  return ret_func_ptr;
+}
+
+void vpx_highbd_subtract_block_sse2(
+    int rows, int cols,
+    int16_t *diff, ptrdiff_t diff_stride,
+    const uint8_t *src8, ptrdiff_t src_stride,
+    const uint8_t *pred8,
+    ptrdiff_t pred_stride,
+    int bd) {
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  SubtractWxHFuncType func;
+  (void) bd;
+
+  func = getSubtractFunc(rows, cols);
+  func(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
diff --git a/vpx_dsp/x86/highbd_variance_sse4.c b/vpx_dsp/x86/highbd_variance_sse4.c
new file mode 100644
index 0000000..5c1dfe4
--- /dev/null
+++ b/vpx_dsp/x86/highbd_variance_sse4.c
@@ -0,0 +1,248 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/variance.h"
+#include "vpx_dsp/vpx_filter.h"
+
+static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
+                                         const uint8_t *b8, int b_stride,
+                                         uint64_t *sse, int64_t *sum) {
+  __m128i u0, u1, u2, u3;
+  __m128i s0, s1, s2, s3;
+  __m128i t0, t1, x0, y0;
+  __m128i a0, a1, a2, a3;
+  __m128i b0, b1, b2, b3;
+  __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
+
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+  a0 = _mm_loadu_si128((__m128i const *) (a + 0 * a_stride));
+  a1 = _mm_loadu_si128((__m128i const *) (a + 1 * a_stride));
+  a2 = _mm_loadu_si128((__m128i const *) (a + 2 * a_stride));
+  a3 = _mm_loadu_si128((__m128i const *) (a + 3 * a_stride));
+
+  b0 = _mm_loadu_si128((__m128i const *) (b + 0 * b_stride));
+  b1 = _mm_loadu_si128((__m128i const *) (b + 1 * b_stride));
+  b2 = _mm_loadu_si128((__m128i const *) (b + 2 * b_stride));
+  b3 = _mm_loadu_si128((__m128i const *) (b + 3 * b_stride));
+
+  u0 = _mm_unpacklo_epi16(a0, a1);
+  u1 = _mm_unpacklo_epi16(a2, a3);
+  u2 = _mm_unpacklo_epi16(b0, b1);
+  u3 = _mm_unpacklo_epi16(b2, b3);
+
+  s0 = _mm_sub_epi16(u0, u2);
+  s1 = _mm_sub_epi16(u1, u3);
+
+  t0 = _mm_madd_epi16(s0, k_one_epi16);
+  t1 = _mm_madd_epi16(s1, k_one_epi16);
+
+  s2 = _mm_hadd_epi32(t0, t1);
+  s3 = _mm_hadd_epi32(s2, s2);
+  y0 = _mm_hadd_epi32(s3, s3);
+
+  t0 = _mm_madd_epi16(s0, s0);
+  t1 = _mm_madd_epi16(s1, s1);
+
+  s2 = _mm_hadd_epi32(t0, t1);
+  s3 = _mm_hadd_epi32(s2, s2);
+  x0 = _mm_hadd_epi32(s3, s3);
+
+  *sse = (uint64_t)_mm_extract_epi32(x0, 0);
+  *sum = (int64_t)_mm_extract_epi32(y0, 0);
+}
+
+uint32_t vpx_highbd_8_variance4x4_sse4_1(const uint8_t *a,
+                                         int a_stride,
+                                         const uint8_t *b,
+                                         int b_stride,
+                                         uint32_t *sse) {
+  int64_t sum;
+  uint64_t local_sse;
+
+  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+  *sse = (uint32_t)local_sse;
+
+  return *sse - ((sum * sum) >> 4);
+}
+
+uint32_t vpx_highbd_10_variance4x4_sse4_1(const uint8_t *a,
+                                          int a_stride,
+                                          const uint8_t *b,
+                                          int b_stride,
+                                          uint32_t *sse) {
+  int64_t sum;
+  uint64_t local_sse;
+
+  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
+  sum = ROUND_POWER_OF_TWO(sum, 2);
+
+  return *sse - ((sum * sum) >> 4);
+}
+
+uint32_t vpx_highbd_12_variance4x4_sse4_1(const uint8_t *a,
+                                          int a_stride,
+                                          const uint8_t *b,
+                                          int b_stride,
+                                          uint32_t *sse) {
+  int64_t sum;
+  uint64_t local_sse;
+
+  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
+  sum = ROUND_POWER_OF_TWO(sum, 4);
+
+  return *sse - ((sum * sum) >> 4);
+}
+
+// Sub-pixel
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse) {
+
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
+
+  return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2),
+                                  4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse) {
+
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
+
+  return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2),
+                                   4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse) {
+
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
+
+  return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2),
+                                   4, dst, dst_stride, sse);
+}
+
+// Sub-pixel average
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse,
+    const uint8_t *second_pred) {
+
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
+
+  vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4,
+                           CONVERT_TO_BYTEPTR(temp2), 4);
+
+  return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3),
+                                  4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse,
+    const uint8_t *second_pred) {
+
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
+
+  vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4,
+                           CONVERT_TO_BYTEPTR(temp2), 4);
+
+  return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3),
+                                   4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse,
+    const uint8_t *second_pred) {
+
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
+
+  vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4,
+                           CONVERT_TO_BYTEPTR(temp2), 4);
+
+  return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3),
+                                   4, dst, dst_stride, sse);
+}
diff --git a/vpx_ports/msvc.h b/vpx_ports/msvc.h
index cab7740..d6b8503 100644
--- a/vpx_ports/msvc.h
+++ b/vpx_ports/msvc.h
@@ -26,6 +26,20 @@
   else
     return floor(x + 0.5);
 }
+
+static INLINE float roundf(float x) {
+  if (x < 0)
+    return (float)ceil(x - 0.5f);
+  else
+    return (float)floor(x + 0.5f);
+}
+
+static INLINE long lroundf(float x) {
+  if (x < 0)
+    return (long)(x - 0.5f);
+  else
+    return (long)(x + 0.5f);
+}
 #endif  // _MSC_VER < 1800
 
 #endif  // _MSC_VER
diff --git a/vpxenc.c b/vpxenc.c
index 5e14934..ca29816 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -476,6 +476,17 @@
 #endif
 
 #if CONFIG_VP10_ENCODER
+#if CONFIG_EXT_PARTITION
+static const struct arg_enum_list superblock_size_enum[] = {
+  {"dynamic", VPX_SUPERBLOCK_SIZE_DYNAMIC},
+  {"64", VPX_SUPERBLOCK_SIZE_64X64},
+  {"128", VPX_SUPERBLOCK_SIZE_128X128},
+  {NULL, 0}
+};
+static const arg_def_t superblock_size = ARG_DEF_ENUM(
+    NULL, "sb-size", 1, "Superblock size to use", superblock_size_enum);
+#endif  // CONFIG_EXT_PARTITION
+
 static const arg_def_t *vp10_args[] = {
   &cpu_used_vp9, &auto_altref, &sharpness, &static_thresh,
   &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type,
@@ -484,6 +495,9 @@
   &frame_parallel_decoding, &aq_mode, &frame_periodic_boost,
   &noise_sens, &tune_content, &input_color_space,
   &min_gf_interval, &max_gf_interval,
+#if CONFIG_EXT_PARTITION
+  &superblock_size,
+#endif  // CONFIG_EXT_PARTITION
 #if CONFIG_VP9_HIGHBITDEPTH
   &bitdeptharg, &inbitdeptharg,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -500,6 +514,9 @@
   VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_NOISE_SENSITIVITY,
   VP9E_SET_TUNE_CONTENT, VP9E_SET_COLOR_SPACE,
   VP9E_SET_MIN_GF_INTERVAL, VP9E_SET_MAX_GF_INTERVAL,
+#if CONFIG_EXT_PARTITION
+  VP10E_SET_SUPERBLOCK_SIZE,
+#endif  // CONFIG_EXT_PARTITION
   0
 };
 #endif