Merge "Small speed up for super_block_uvrd" into nextgenv2
diff --git a/configure b/configure
index ed1d048..eda83f1 100755
--- a/configure
+++ b/configure
@@ -282,6 +282,7 @@
     ans
     loop_restoration
     ext_partition
+    ext_partition_types
     ext_tile
     obmc
     entropy
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index 193bd45..e074461 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -86,11 +86,11 @@
   const vpx_rational timebase = { 33333333, 1000000000 };
   cfg_.g_timebase = timebase;
   cfg_.rc_target_bitrate = 500;
-  cfg_.g_lag_in_frames = 25;
+  cfg_.g_lag_in_frames = 12;
   cfg_.rc_end_usage = VPX_VBR;
 
   libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 144,
-                                     timebase.den, timebase.num, 0, 30);
+                                     timebase.den, timebase.num, 0, 15);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   const char *md5_fw_str = md5_fw_order_.Get();
@@ -104,5 +104,5 @@
 
 VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1));
 
-VP10_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1));
+VP10_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 1, 1));
 }  // namespace
diff --git a/test/vp10_fht16x16_test.cc b/test/vp10_fht16x16_test.cc
index d501e10..3967149 100644
--- a/test/vp10_fht16x16_test.cc
+++ b/test/vp10_fht16x16_test.cc
@@ -103,20 +103,6 @@
       make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 7,
                  VPX_BITS_8, 256),
       make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 8,
-                 VPX_BITS_8, 256),
-      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 9,
-                 VPX_BITS_8, 256),
-      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 10,
-                 VPX_BITS_8, 256),
-      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 11,
-                 VPX_BITS_8, 256),
-      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 12,
-                 VPX_BITS_8, 256),
-      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 13,
-                 VPX_BITS_8, 256),
-      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 14,
-                 VPX_BITS_8, 256),
-      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 15,
                  VPX_BITS_8, 256)));
 #endif  // !CONFIG_EXT_TX
 #endif  // HAVE_SSE2
diff --git a/test/vp10_fht4x4_test.cc b/test/vp10_fht4x4_test.cc
index d2598f9..bee1a0c 100644
--- a/test/vp10_fht4x4_test.cc
+++ b/test/vp10_fht4x4_test.cc
@@ -102,20 +102,6 @@
       make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 7,
                  VPX_BITS_8, 16),
       make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 8,
-                 VPX_BITS_8, 16),
-      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 9,
-                 VPX_BITS_8, 16),
-      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 10,
-                 VPX_BITS_8, 16),
-      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 11,
-                 VPX_BITS_8, 16),
-      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 12,
-                 VPX_BITS_8, 16),
-      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 13,
-                 VPX_BITS_8, 16),
-      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 14,
-                 VPX_BITS_8, 16),
-      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 15,
                  VPX_BITS_8, 16)));
 #endif  // !CONFIG_EXT_TX
 #endif  // HAVE_SSE2
diff --git a/test/vp10_fht8x8_test.cc b/test/vp10_fht8x8_test.cc
index 47feb3d..96f5632 100644
--- a/test/vp10_fht8x8_test.cc
+++ b/test/vp10_fht8x8_test.cc
@@ -102,20 +102,6 @@
       make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 7,
                  VPX_BITS_8, 64),
       make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 8,
-                 VPX_BITS_8, 64),
-      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 9,
-                 VPX_BITS_8, 64),
-      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 10,
-                 VPX_BITS_8, 64),
-      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 11,
-                 VPX_BITS_8, 64),
-      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 12,
-                 VPX_BITS_8, 64),
-      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 13,
-                 VPX_BITS_8, 64),
-      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 14,
-                 VPX_BITS_8, 64),
-      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 15,
                  VPX_BITS_8, 64)));
 #endif  // !CONFIG_EXT_TX
 #endif  // HAVE_SSE2
diff --git a/test/vp10_fwd_txfm1d_test.cc b/test/vp10_fwd_txfm1d_test.cc
index a39e0ef..bcbc617 100644
--- a/test/vp10_fwd_txfm1d_test.cc
+++ b/test/vp10_fwd_txfm1d_test.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "test/vp10_txfm_test.h"
 #include "vp10/common/vp10_fwd_txfm1d.h"
+#include "test/vp10_txfm_test.h"
 
 using libvpx_test::ACMRandom;
 
@@ -17,12 +17,14 @@
 static int txfm_type_num = 2;
 static TYPE_TXFM txfm_type_ls[2] = {TYPE_DCT, TYPE_ADST};
 
-static int txfm_size_num = 4;
-static int txfm_size_ls[4] = {4, 8, 16, 32};
+static int txfm_size_num = 5;
+static int txfm_size_ls[5] = {4, 8, 16, 32, 64};
 
-static TxfmFunc fwd_txfm_func_ls[2][4] = {
-    {vp10_fdct4_new, vp10_fdct8_new, vp10_fdct16_new, vp10_fdct32_new},
-    {vp10_fadst4_new, vp10_fadst8_new, vp10_fadst16_new, vp10_fadst32_new}};
+static TxfmFunc fwd_txfm_func_ls[2][5] = {
+    {vp10_fdct4_new, vp10_fdct8_new, vp10_fdct16_new, vp10_fdct32_new,
+     vp10_fdct64_new},
+    {vp10_fadst4_new, vp10_fadst8_new, vp10_fadst16_new, vp10_fadst32_new,
+     NULL}};
 
 // the maximum stage number of fwd/inv 1d dct/adst txfm is 12
 static int8_t cos_bit[12] = {14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
@@ -104,19 +106,21 @@
       int max_error = 7;
 
       const int count_test_block = 5000;
-      for (int ti = 0; ti < count_test_block; ++ti) {
-        for (int ni = 0; ni < txfm_size; ++ni) {
-          input[ni] = rnd.Rand16() % base - rnd.Rand16() % base;
-          ref_input[ni] = static_cast<double>(input[ni]);
-        }
+      if (fwd_txfm_func != NULL) {
+        for (int ti = 0; ti < count_test_block; ++ti) {
+          for (int ni = 0; ni < txfm_size; ++ni) {
+            input[ni] = rnd.Rand16() % base - rnd.Rand16() % base;
+            ref_input[ni] = static_cast<double>(input[ni]);
+          }
 
-        fwd_txfm_func(input, output, cos_bit, range_bit);
-        reference_hybrid_1d(ref_input, ref_output, txfm_size, txfm_type);
+          fwd_txfm_func(input, output, cos_bit, range_bit);
+          reference_hybrid_1d(ref_input, ref_output, txfm_size, txfm_type);
 
-        for (int ni = 0; ni < txfm_size; ++ni) {
-          EXPECT_LE(
-              abs(output[ni] - static_cast<int32_t>(round(ref_output[ni]))),
-              max_error);
+          for (int ni = 0; ni < txfm_size; ++ni) {
+            EXPECT_LE(
+                abs(output[ni] - static_cast<int32_t>(round(ref_output[ni]))),
+                max_error);
+          }
         }
       }
     }
diff --git a/test/vp10_fwd_txfm2d_test.cc b/test/vp10_fwd_txfm2d_test.cc
index e6416cc..137f653 100644
--- a/test/vp10_fwd_txfm2d_test.cc
+++ b/test/vp10_fwd_txfm2d_test.cc
@@ -8,36 +8,36 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <math.h>
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "test/acm_random.h"
 #include "test/vp10_txfm_test.h"
-#include "vp10/common/vp10_fwd_txfm2d.h"
 #include "vp10/common/vp10_fwd_txfm2d_cfg.h"
+#include "./vp10_rtcd.h"
 
 using libvpx_test::ACMRandom;
 
 namespace {
 
-const int txfm_size_num = 4;
-const int txfm_size_ls[4] = {4, 8, 16, 32};
-const TXFM_2D_CFG fwd_txfm_cfg_ls[4][4] = {
-    {fwd_txfm_2d_cfg_dct_dct_4, fwd_txfm_2d_cfg_dct_adst_4,
-     fwd_txfm_2d_cfg_adst_adst_4, fwd_txfm_2d_cfg_adst_dct_4},
-    {fwd_txfm_2d_cfg_dct_dct_8, fwd_txfm_2d_cfg_dct_adst_8,
-     fwd_txfm_2d_cfg_adst_adst_8, fwd_txfm_2d_cfg_adst_dct_8},
-    {fwd_txfm_2d_cfg_dct_dct_16, fwd_txfm_2d_cfg_dct_adst_16,
-     fwd_txfm_2d_cfg_adst_adst_16, fwd_txfm_2d_cfg_adst_dct_16},
-    {fwd_txfm_2d_cfg_dct_dct_32, fwd_txfm_2d_cfg_dct_adst_32,
-     fwd_txfm_2d_cfg_adst_adst_32, fwd_txfm_2d_cfg_adst_dct_32}};
+#if CONFIG_VP9_HIGHBITDEPTH
+const int txfm_size_num = 5;
+const int txfm_size_ls[5] = {4, 8, 16, 32, 64};
+const TXFM_2D_CFG* fwd_txfm_cfg_ls[5][4] = {
+    {&fwd_txfm_2d_cfg_dct_dct_4, &fwd_txfm_2d_cfg_dct_adst_4,
+     &fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_dct_4},
+    {&fwd_txfm_2d_cfg_dct_dct_8, &fwd_txfm_2d_cfg_dct_adst_8,
+     &fwd_txfm_2d_cfg_adst_adst_8, &fwd_txfm_2d_cfg_adst_dct_8},
+    {&fwd_txfm_2d_cfg_dct_dct_16, &fwd_txfm_2d_cfg_dct_adst_16,
+     &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_dct_16},
+    {&fwd_txfm_2d_cfg_dct_dct_32, &fwd_txfm_2d_cfg_dct_adst_32,
+     &fwd_txfm_2d_cfg_adst_adst_32, &fwd_txfm_2d_cfg_adst_dct_32},
+    {&fwd_txfm_2d_cfg_dct_dct_64, NULL, NULL, NULL}};
 
-const Fwd_Txfm2d_Func fwd_txfm_func_ls[4] = {
+const Fwd_Txfm2d_Func fwd_txfm_func_ls[5] = {
     vp10_fwd_txfm2d_4x4, vp10_fwd_txfm2d_8x8, vp10_fwd_txfm2d_16x16,
-    vp10_fwd_txfm2d_32x32};
+    vp10_fwd_txfm2d_32x32, vp10_fwd_txfm2d_64x64};
 
 const int txfm_type_num = 4;
 const TYPE_TXFM type_ls_0[4] = {TYPE_DCT, TYPE_DCT, TYPE_ADST, TYPE_ADST};
@@ -54,44 +54,48 @@
 
     for (int txfm_type_idx = 0; txfm_type_idx < txfm_type_num;
          ++txfm_type_idx) {
-      TXFM_2D_CFG fwd_txfm_cfg = fwd_txfm_cfg_ls[txfm_size_idx][txfm_type_idx];
-      Fwd_Txfm2d_Func fwd_txfm_func = fwd_txfm_func_ls[txfm_size_idx];
-      TYPE_TXFM type0 = type_ls_0[txfm_type_idx];
-      TYPE_TXFM type1 = type_ls_1[txfm_type_idx];
-      int amplify_bit =
-          fwd_txfm_cfg.shift[0] + fwd_txfm_cfg.shift[1] + fwd_txfm_cfg.shift[2];
-      double amplify_factor =
-          amplify_bit >= 0 ? (1 << amplify_bit) : (1.0 / (1 << -amplify_bit));
+      const TXFM_2D_CFG* fwd_txfm_cfg =
+          fwd_txfm_cfg_ls[txfm_size_idx][txfm_type_idx];
+      if (fwd_txfm_cfg != NULL) {
+        Fwd_Txfm2d_Func fwd_txfm_func = fwd_txfm_func_ls[txfm_size_idx];
+        TYPE_TXFM type0 = type_ls_0[txfm_type_idx];
+        TYPE_TXFM type1 = type_ls_1[txfm_type_idx];
+        int amplify_bit = fwd_txfm_cfg->shift[0] + fwd_txfm_cfg->shift[1] +
+                          fwd_txfm_cfg->shift[2];
+        double amplify_factor =
+            amplify_bit >= 0 ? (1 << amplify_bit) : (1.0 / (1 << -amplify_bit));
 
-      ACMRandom rnd(ACMRandom::DeterministicSeed());
-      int count = 5000;
-      double avg_abs_error = 0;
-      for (int ci = 0; ci < count; ci++) {
-        for (int ni = 0; ni < sqr_txfm_size; ++ni) {
-          input[ni] = rnd.Rand16() % base;
-          ref_input[ni] = static_cast<double>(input[ni]);
-          output[ni] = 0;
-          ref_output[ni] = 0;
+        ACMRandom rnd(ACMRandom::DeterministicSeed());
+        int count = 500;
+        double avg_abs_error = 0;
+        for (int ci = 0; ci < count; ci++) {
+          for (int ni = 0; ni < sqr_txfm_size; ++ni) {
+            input[ni] = rnd.Rand16() % base;
+            ref_input[ni] = static_cast<double>(input[ni]);
+            output[ni] = 0;
+            ref_output[ni] = 0;
+          }
+
+          fwd_txfm_func(input, output, txfm_size, fwd_txfm_cfg, bd);
+          reference_hybrid_2d(ref_input, ref_output, txfm_size, type0, type1);
+
+          for (int ni = 0; ni < sqr_txfm_size; ++ni) {
+            ref_output[ni] = round(ref_output[ni] * amplify_factor);
+            EXPECT_LE(fabs(output[ni] - ref_output[ni]) / amplify_factor, 60);
+          }
+          avg_abs_error += compute_avg_abs_error<int32_t, double>(
+              output, ref_output, sqr_txfm_size);
         }
 
-        fwd_txfm_func(input, output, txfm_size, &fwd_txfm_cfg, bd);
-        reference_hybrid_2d(ref_input, ref_output, txfm_size, type0, type1);
-
-        for (int ni = 0; ni < sqr_txfm_size; ++ni) {
-          ref_output[ni] = round(ref_output[ni] * amplify_factor);
-          EXPECT_LE(fabs(output[ni] - ref_output[ni]) / amplify_factor, 30);
-        }
-        avg_abs_error += compute_avg_abs_error<int32_t, double>(
-            output, ref_output, sqr_txfm_size);
+        avg_abs_error /= amplify_factor;
+        avg_abs_error /= count;
+        // max_abs_avg_error comes from upper bound of avg_abs_error
+        // printf("type0: %d type1: %d txfm_size: %d accuracy_avg_abs_error:
+        // %f\n",
+        // type0, type1, txfm_size, avg_abs_error);
+        double max_abs_avg_error = 5;
+        EXPECT_LE(avg_abs_error, max_abs_avg_error);
       }
-
-      avg_abs_error /= amplify_factor;
-      avg_abs_error /= count;
-      // max_abs_avg_error comes from upper bound of avg_abs_error
-      // printf("type0: %d type1: %d txfm_size: %d accuracy_avg_abs_error:
-      // %f\n", type0, type1, txfm_size, avg_abs_error);
-      double max_abs_avg_error = 1.5;
-      EXPECT_LE(avg_abs_error, max_abs_avg_error);
     }
 
     delete[] input;
@@ -100,5 +104,6 @@
     delete[] ref_output;
   }
 }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 }  // anonymous namespace
diff --git a/test/vp10_inv_txfm1d_test.cc b/test/vp10_inv_txfm1d_test.cc
index 3b716c8..2e9e58d 100644
--- a/test/vp10_inv_txfm1d_test.cc
+++ b/test/vp10_inv_txfm1d_test.cc
@@ -16,16 +16,20 @@
 
 namespace {
 static int txfm_type_num = 2;
-static int txfm_size_num = 4;
-static int txfm_size_ls[4] = {4, 8, 16, 32};
+static int txfm_size_num = 5;
+static int txfm_size_ls[5] = {4, 8, 16, 32, 64};
 
-static TxfmFunc fwd_txfm_func_ls[2][4] = {
-    {vp10_fdct4_new, vp10_fdct8_new, vp10_fdct16_new, vp10_fdct32_new},
-    {vp10_fadst4_new, vp10_fadst8_new, vp10_fadst16_new, vp10_fadst32_new}};
+static TxfmFunc fwd_txfm_func_ls[2][5] = {
+    {vp10_fdct4_new, vp10_fdct8_new, vp10_fdct16_new, vp10_fdct32_new,
+     vp10_fdct64_new},
+    {vp10_fadst4_new, vp10_fadst8_new, vp10_fadst16_new, vp10_fadst32_new,
+     NULL}};
 
-static TxfmFunc inv_txfm_func_ls[2][4] = {
-    {vp10_idct4_new, vp10_idct8_new, vp10_idct16_new, vp10_idct32_new},
-    {vp10_iadst4_new, vp10_iadst8_new, vp10_iadst16_new, vp10_iadst32_new}};
+static TxfmFunc inv_txfm_func_ls[2][5] = {
+    {vp10_idct4_new, vp10_idct8_new, vp10_idct16_new, vp10_idct32_new,
+     vp10_idct64_new},
+    {vp10_iadst4_new, vp10_iadst8_new, vp10_iadst16_new, vp10_iadst32_new,
+     NULL}};
 
 // the maximum stage number of fwd/inv 1d dct/adst txfm is 12
 static int8_t cos_bit[12] = {14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
@@ -44,19 +48,22 @@
       TxfmFunc inv_txfm_func = inv_txfm_func_ls[ti][si];
       int max_error = 2;
 
-      const int count_test_block = 5000;
-      for (int ci = 0; ci < count_test_block; ++ci) {
-        for (int ni = 0; ni < txfm_size; ++ni) {
-          input[ni] = rnd.Rand16() % base - rnd.Rand16() % base;
-        }
+      if (fwd_txfm_func != NULL) {
+        const int count_test_block = 5000;
+        for (int ci = 0; ci < count_test_block; ++ci) {
+          for (int ni = 0; ni < txfm_size; ++ni) {
+            input[ni] = rnd.Rand16() % base - rnd.Rand16() % base;
+          }
 
-        fwd_txfm_func(input, output, cos_bit, range_bit);
-        inv_txfm_func(output, round_trip_output, cos_bit, range_bit);
+          fwd_txfm_func(input, output, cos_bit, range_bit);
+          inv_txfm_func(output, round_trip_output, cos_bit, range_bit);
 
-        for (int ni = 0; ni < txfm_size; ++ni) {
-          EXPECT_LE(abs(input[ni] - round_shift(round_trip_output[ni],
-                                                get_max_bit(txfm_size) - 1)),
-                    max_error);
+          for (int ni = 0; ni < txfm_size; ++ni) {
+            int node_err =
+                abs(input[ni] - round_shift(round_trip_output[ni],
+                                            get_max_bit(txfm_size) - 1));
+            EXPECT_LE(node_err, max_error);
+          }
         }
       }
     }
diff --git a/test/vp10_inv_txfm2d_test.cc b/test/vp10_inv_txfm2d_test.cc
index 603821e..9257244 100644
--- a/test/vp10_inv_txfm2d_test.cc
+++ b/test/vp10_inv_txfm2d_test.cc
@@ -12,47 +12,48 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
+#include "./vp10_rtcd.h"
 #include "test/acm_random.h"
 #include "test/vp10_txfm_test.h"
-#include "vp10/common/vp10_fwd_txfm2d.h"
 #include "vp10/common/vp10_fwd_txfm2d_cfg.h"
-#include "vp10/common/vp10_inv_txfm2d.h"
 #include "vp10/common/vp10_inv_txfm2d_cfg.h"
 
 using libvpx_test::ACMRandom;
 
 namespace {
 
-const int txfm_size_num = 4;
-const int txfm_size_ls[4] = {4, 8, 16, 32};
-const TXFM_2D_CFG fwd_txfm_cfg_ls[4][4] = {
-    {fwd_txfm_2d_cfg_dct_dct_4, fwd_txfm_2d_cfg_dct_adst_4,
-     fwd_txfm_2d_cfg_adst_adst_4, fwd_txfm_2d_cfg_adst_dct_4},
-    {fwd_txfm_2d_cfg_dct_dct_8, fwd_txfm_2d_cfg_dct_adst_8,
-     fwd_txfm_2d_cfg_adst_adst_8, fwd_txfm_2d_cfg_adst_dct_8},
-    {fwd_txfm_2d_cfg_dct_dct_16, fwd_txfm_2d_cfg_dct_adst_16,
-     fwd_txfm_2d_cfg_adst_adst_16, fwd_txfm_2d_cfg_adst_dct_16},
-    {fwd_txfm_2d_cfg_dct_dct_32, fwd_txfm_2d_cfg_dct_adst_32,
-     fwd_txfm_2d_cfg_adst_adst_32, fwd_txfm_2d_cfg_adst_dct_32}};
+#if CONFIG_VP9_HIGHBITDEPTH
+const int txfm_size_num = 5;
+const int txfm_size_ls[5] = {4, 8, 16, 32, 64};
+const TXFM_2D_CFG* fwd_txfm_cfg_ls[5][4] = {
+    {&fwd_txfm_2d_cfg_dct_dct_4, &fwd_txfm_2d_cfg_dct_adst_4,
+     &fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_dct_4},
+    {&fwd_txfm_2d_cfg_dct_dct_8, &fwd_txfm_2d_cfg_dct_adst_8,
+     &fwd_txfm_2d_cfg_adst_adst_8, &fwd_txfm_2d_cfg_adst_dct_8},
+    {&fwd_txfm_2d_cfg_dct_dct_16, &fwd_txfm_2d_cfg_dct_adst_16,
+     &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_dct_16},
+    {&fwd_txfm_2d_cfg_dct_dct_32, &fwd_txfm_2d_cfg_dct_adst_32,
+     &fwd_txfm_2d_cfg_adst_adst_32, &fwd_txfm_2d_cfg_adst_dct_32},
+    {&fwd_txfm_2d_cfg_dct_dct_64, NULL, NULL, NULL}};
 
-const TXFM_2D_CFG inv_txfm_cfg_ls[4][4] = {
-    {inv_txfm_2d_cfg_dct_dct_4, inv_txfm_2d_cfg_dct_adst_4,
-     inv_txfm_2d_cfg_adst_adst_4, inv_txfm_2d_cfg_adst_dct_4},
-    {inv_txfm_2d_cfg_dct_dct_8, inv_txfm_2d_cfg_dct_adst_8,
-     inv_txfm_2d_cfg_adst_adst_8, inv_txfm_2d_cfg_adst_dct_8},
-    {inv_txfm_2d_cfg_dct_dct_16, inv_txfm_2d_cfg_dct_adst_16,
-     inv_txfm_2d_cfg_adst_adst_16, inv_txfm_2d_cfg_adst_dct_16},
-    {inv_txfm_2d_cfg_dct_dct_32, inv_txfm_2d_cfg_dct_adst_32,
-     inv_txfm_2d_cfg_adst_adst_32, inv_txfm_2d_cfg_adst_dct_32}};
+const TXFM_2D_CFG* inv_txfm_cfg_ls[5][4] = {
+    {&inv_txfm_2d_cfg_dct_dct_4, &inv_txfm_2d_cfg_dct_adst_4,
+     &inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_dct_4},
+    {&inv_txfm_2d_cfg_dct_dct_8, &inv_txfm_2d_cfg_dct_adst_8,
+     &inv_txfm_2d_cfg_adst_adst_8, &inv_txfm_2d_cfg_adst_dct_8},
+    {&inv_txfm_2d_cfg_dct_dct_16, &inv_txfm_2d_cfg_dct_adst_16,
+     &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_dct_16},
+    {&inv_txfm_2d_cfg_dct_dct_32, &inv_txfm_2d_cfg_dct_adst_32,
+     &inv_txfm_2d_cfg_adst_adst_32, &inv_txfm_2d_cfg_adst_dct_32},
+    {&inv_txfm_2d_cfg_dct_dct_64, NULL, NULL, NULL}};
 
-const Fwd_Txfm2d_Func fwd_txfm_func_ls[4] = {
-    vp10_fwd_txfm2d_4x4, vp10_fwd_txfm2d_8x8, vp10_fwd_txfm2d_16x16,
-    vp10_fwd_txfm2d_32x32};
-const Inv_Txfm2d_Func inv_txfm_func_ls[4] = {
-    vp10_inv_txfm2d_add_4x4, vp10_inv_txfm2d_add_8x8, vp10_inv_txfm2d_add_16x16,
-    vp10_inv_txfm2d_add_32x32};
+const Fwd_Txfm2d_Func fwd_txfm_func_ls[5] = {
+    vp10_fwd_txfm2d_4x4_c, vp10_fwd_txfm2d_8x8_c, vp10_fwd_txfm2d_16x16_c,
+    vp10_fwd_txfm2d_32x32_c, vp10_fwd_txfm2d_64x64_c};
+const Inv_Txfm2d_Func inv_txfm_func_ls[5] = {
+    vp10_inv_txfm2d_add_4x4_c, vp10_inv_txfm2d_add_8x8_c,
+    vp10_inv_txfm2d_add_16x16_c, vp10_inv_txfm2d_add_32x32_c,
+    vp10_inv_txfm2d_add_64x64_c};
 
 const int txfm_type_num = 4;
 
@@ -66,44 +67,46 @@
 
     for (int txfm_type_idx = 0; txfm_type_idx < txfm_type_num;
          ++txfm_type_idx) {
-      const TXFM_2D_CFG fwd_txfm_cfg =
+      const TXFM_2D_CFG* fwd_txfm_cfg =
           fwd_txfm_cfg_ls[txfm_size_idx][txfm_type_idx];
-      const TXFM_2D_CFG inv_txfm_cfg =
+      const TXFM_2D_CFG* inv_txfm_cfg =
           inv_txfm_cfg_ls[txfm_size_idx][txfm_type_idx];
-      const Fwd_Txfm2d_Func fwd_txfm_func = fwd_txfm_func_ls[txfm_size_idx];
-      const Inv_Txfm2d_Func inv_txfm_func = inv_txfm_func_ls[txfm_size_idx];
-      const int count = 5000;
-      double avg_abs_error = 0;
-      ACMRandom rnd(ACMRandom::DeterministicSeed());
-      for (int ci = 0; ci < count; ci++) {
-        for (int ni = 0; ni < sqr_txfm_size; ++ni) {
-          if (ci == 0) {
-            int extreme_input = base - 1;
-            input[ni] = extreme_input;  // extreme case
-            ref_input[ni] = 0;
-          } else {
-            input[ni] = rnd.Rand16() % base;
-            ref_input[ni] = 0;
+      if (fwd_txfm_cfg != NULL) {
+        const Fwd_Txfm2d_Func fwd_txfm_func = fwd_txfm_func_ls[txfm_size_idx];
+        const Inv_Txfm2d_Func inv_txfm_func = inv_txfm_func_ls[txfm_size_idx];
+        const int count = 1000;
+        double avg_abs_error = 0;
+        ACMRandom rnd(ACMRandom::DeterministicSeed());
+        for (int ci = 0; ci < count; ci++) {
+          for (int ni = 0; ni < sqr_txfm_size; ++ni) {
+            if (ci == 0) {
+              int extreme_input = base - 1;
+              input[ni] = extreme_input;  // extreme case
+              ref_input[ni] = 0;
+            } else {
+              input[ni] = rnd.Rand16() % base;
+              ref_input[ni] = 0;
+            }
           }
+
+          fwd_txfm_func(input, output, txfm_size, fwd_txfm_cfg, bd);
+          inv_txfm_func(output, ref_input, txfm_size, inv_txfm_cfg, bd);
+
+          for (int ni = 0; ni < sqr_txfm_size; ++ni) {
+            EXPECT_LE(abs(input[ni] - ref_input[ni]), 2);
+          }
+          avg_abs_error += compute_avg_abs_error<int16_t, uint16_t>(
+              input, ref_input, sqr_txfm_size);
         }
 
-        fwd_txfm_func(input, output, txfm_size, &fwd_txfm_cfg, bd);
-        inv_txfm_func(output, ref_input, txfm_size, &inv_txfm_cfg, bd);
-
-        for (int ni = 0; ni < sqr_txfm_size; ++ni) {
-          EXPECT_LE(abs(input[ni] - ref_input[ni]), 2);
-        }
-        avg_abs_error += compute_avg_abs_error<int16_t, uint16_t>(
-            input, ref_input, sqr_txfm_size);
+        avg_abs_error /= count;
+        // max_abs_avg_error comes from upper bound of
+        // printf("txfm_size: %d accuracy_avg_abs_error: %f\n",
+        // txfm_size, avg_abs_error);
+        // TODO(angiebird): this upper bound is from adst_adst_8
+        const double max_abs_avg_error = 0.024;
+        EXPECT_LE(avg_abs_error, max_abs_avg_error);
       }
-
-      avg_abs_error /= count;
-      // max_abs_avg_error comes from upper bound of
-      // printf("txfm_size: %d accuracy_avg_abs_error: %f\n", txfm_size,
-      // avg_abs_error);
-      // TODO(angiebird): this upper bound is from adst_adst_8
-      const double max_abs_avg_error = 0.024;
-      EXPECT_LE(avg_abs_error, max_abs_avg_error);
     }
 
     delete[] input;
@@ -111,5 +114,6 @@
     delete[] output;
   }
 }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 }  // anonymous namespace
diff --git a/vp10/common/alloccommon.c b/vp10/common/alloccommon.c
index e14aee7..b3c216e 100644
--- a/vp10/common/alloccommon.c
+++ b/vp10/common/alloccommon.c
@@ -97,10 +97,13 @@
 }
 
 void vp10_free_context_buffers(VP10_COMMON *cm) {
+  int i;
   cm->free_mi(cm);
   free_seg_map(cm);
-  vpx_free(cm->above_context);
-  cm->above_context = NULL;
+  for (i = 0 ; i < MAX_MB_PLANE ; i++) {
+    vpx_free(cm->above_context[i]);
+    cm->above_context[i] = NULL;
+  }
   vpx_free(cm->above_seg_context);
   cm->above_seg_context = NULL;
 #if CONFIG_VAR_TX
@@ -128,11 +131,14 @@
   }
 
   if (cm->above_context_alloc_cols < cm->mi_cols) {
-    vpx_free(cm->above_context);
-    cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc(
-        2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE,
-        sizeof(*cm->above_context));
-    if (!cm->above_context) goto fail;
+    int i;
+    for (i = 0 ; i < MAX_MB_PLANE ; i++) {
+    vpx_free(cm->above_context[i]);
+      cm->above_context[i] = (ENTROPY_CONTEXT *)vpx_calloc(
+          2 * mi_cols_aligned_to_sb(cm->mi_cols),
+          sizeof(*cm->above_context[0]));
+      if (!cm->above_context[i]) goto fail;
+    }
 
     vpx_free(cm->above_seg_context);
     cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc(
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index 016fc75..de91431 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -167,9 +167,9 @@
   PREDICTION_MODE mode;
   TX_SIZE tx_size;
 #if CONFIG_VAR_TX
-  // TODO(jingning): This effectively assigned 64 entries for each 8x8 block.
+  // TODO(jingning): This effectively assigned an entry for each 8x8 block.
   // Apparently it takes much more space than needed.
-  TX_SIZE inter_tx_size[64];
+  TX_SIZE inter_tx_size[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
 #endif
   int8_t skip;
   int8_t has_no_coeffs;
@@ -212,6 +212,9 @@
 #if CONFIG_REF_MV
   uint8_t ref_mv_idx;
 #endif
+#if CONFIG_EXT_PARTITION_TYPES
+  PARTITION_TYPE partition;
+#endif
 } MB_MODE_INFO;
 
 typedef struct MODE_INFO {
@@ -356,6 +359,37 @@
   return subsize_lookup[partition][bsize];
 }
 
+#if CONFIG_EXT_PARTITION_TYPES
+static INLINE PARTITION_TYPE get_partition(const MODE_INFO *const mi,
+                                           int mi_stride, int mi_rows,
+                                           int mi_cols, int mi_row,
+                                           int mi_col, BLOCK_SIZE bsize) {
+  const int bsl = b_width_log2_lookup[bsize];
+  const int bs = (1 << bsl) / 4;
+  MODE_INFO m = mi[mi_row * mi_stride + mi_col];
+  PARTITION_TYPE partition = partition_lookup[bsl][m.mbmi.sb_type];
+  if (partition != PARTITION_NONE && bsize > BLOCK_8X8 &&
+      mi_row + bs < mi_rows && mi_col + bs < mi_cols) {
+    BLOCK_SIZE h = get_subsize(bsize, PARTITION_HORZ_A);
+    BLOCK_SIZE v = get_subsize(bsize, PARTITION_VERT_A);
+    MODE_INFO m_right = mi[mi_row * mi_stride + mi_col + bs];
+    MODE_INFO m_below = mi[(mi_row + bs) * mi_stride + mi_col];
+    if (m.mbmi.sb_type == h) {
+      return m_below.mbmi.sb_type == h ? PARTITION_HORZ : PARTITION_HORZ_B;
+    } else if (m.mbmi.sb_type == v) {
+      return m_right.mbmi.sb_type == v ? PARTITION_VERT : PARTITION_VERT_B;
+    } else if (m_below.mbmi.sb_type == h) {
+      return PARTITION_HORZ_A;
+    } else if (m_right.mbmi.sb_type == v) {
+      return PARTITION_VERT_A;
+    } else {
+      return PARTITION_SPLIT;
+    }
+  }
+  return partition;
+}
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
 static const TX_TYPE intra_mode_to_tx_type_context[INTRA_MODES] = {
   DCT_DCT,    // DC
   ADST_DCT,   // V
@@ -383,10 +417,10 @@
 #define USE_MSKTX_FOR_32X32      0
 
 static const int num_ext_tx_set_inter[EXT_TX_SETS_INTER] = {
-  1, 19, 12, 2
+  1, 16, 12, 2
 };
 static const int num_ext_tx_set_intra[EXT_TX_SETS_INTRA] = {
-  1, 17, 10
+  1, 12, 10
 };
 
 #if EXT_TX_SIZES == 4
@@ -437,17 +471,17 @@
 
 // Transform types used in each intra set
 static const int ext_tx_used_intra[EXT_TX_SETS_INTRA][TX_TYPES] = {
-  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
-  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, },
-  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, },
+  {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+  {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0},
+  {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0},
 };
 
 // Transform types used in each inter set
 static const int ext_tx_used_inter[EXT_TX_SETS_INTER][TX_TYPES] = {
-  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
-  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1},
-  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0},
+  {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+  {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+  {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1},
+  {1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0},
 };
 
 static INLINE int get_ext_tx_types(TX_SIZE tx_size, BLOCK_SIZE bs,
diff --git a/vp10/common/common_data.h b/vp10/common/common_data.h
index 84476fa..67d6e3a 100644
--- a/vp10/common/common_data.h
+++ b/vp10/common/common_data.h
@@ -80,6 +80,59 @@
   }
 };
 
+#if CONFIG_EXT_PARTITION_TYPES
+static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][BLOCK_SIZES] = {
+  {     // PARTITION_NONE
+    BLOCK_4X4,   BLOCK_4X8,   BLOCK_8X4,
+    BLOCK_8X8,   BLOCK_8X16,  BLOCK_16X8,
+    BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
+    BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
+    BLOCK_64X64,
+  }, {  // PARTITION_HORZ
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_8X4,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_16X8,    BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_32X16,   BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_64X32,
+  }, {  // PARTITION_VERT
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_4X8,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_8X16,    BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_16X32,   BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_32X64,
+  }, {  // PARTITION_SPLIT
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_4X4,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_8X8,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_16X16,   BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_32X32,
+  }, {  // PARTITION_HORZ_A
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_8X4,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_16X8,    BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_32X16,   BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_64X32,
+  }, {  // PARTITION_HORZ_B
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_8X4,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_16X8,    BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_32X16,   BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_64X32,
+  }, {  // PARTITION_VERT_A
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_4X8,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_8X16,    BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_16X32,   BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_32X64,
+  }, {  // PARTITION_VERT_B
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_4X8,     BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_8X16,    BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_16X32,   BLOCK_INVALID, BLOCK_INVALID,
+    BLOCK_32X64,
+  }
+};
+#else
 static const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = {
   {     // PARTITION_NONE
     BLOCK_4X4,   BLOCK_4X8,   BLOCK_8X4,
@@ -107,6 +160,7 @@
     BLOCK_32X32,
   }
 };
+#endif  // CONFIG_EXT_PARTITION_TYPES
 
 static const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = {
   TX_4X4,   TX_4X4,   TX_4X4,
@@ -180,9 +234,16 @@
   {{TX_32X32, TX_16X16}, {TX_16X16, TX_16X16}},
 };
 
+#if CONFIG_EXT_PARTITION_TYPES
+static const int partition_supertx_context_lookup[EXT_PARTITION_TYPES] = {
+  -1, 0, 0, 1, 0, 0, 0, 0
+};
+
+#else
 static const int partition_supertx_context_lookup[PARTITION_TYPES] = {
   -1, 0, 0, 1
 };
+#endif  // CONFIG_EXT_PARTITION_TYPES
 #endif  // CONFIG_SUPERTX
 
 #ifdef __cplusplus
diff --git a/vp10/common/entropymode.c b/vp10/common/entropymode.c
index d48679e..b57ed7a 100644
--- a/vp10/common/entropymode.c
+++ b/vp10/common/entropymode.c
@@ -148,6 +148,31 @@
   { 101,  21, 107, 181, 192, 103,  19,  67, 125 }   // y = tm
 };
 
+#if CONFIG_EXT_PARTITION_TYPES
+static const vpx_prob default_partition_probs[PARTITION_CONTEXTS]
+                                             [EXT_PARTITION_TYPES - 1] = {
+  // 8x8 -> 4x4
+  { 199, 122, 141, 128, 128, 128, 128 },  // a/l both not split
+  { 147,  63, 159, 128, 128, 128, 128 },  // a split, l not split
+  { 148, 133, 118, 128, 128, 128, 128 },  // l split, a not split
+  { 121, 104, 114, 128, 128, 128, 128 },  // a/l both split
+  // 16x16 -> 8x8
+  { 174,  73,  87, 128, 128, 128, 128 },  // a/l both not split
+  {  92,  41,  83, 128, 128, 128, 128 },  // a split, l not split
+  {  82,  99,  50, 128, 128, 128, 128 },  // l split, a not split
+  {  53,  39,  39, 128, 128, 128, 128 },  // a/l both split
+  // 32x32 -> 16x16
+  { 177,  58,  59, 128, 128, 128, 128 },  // a/l both not split
+  {  68,  26,  63, 128, 128, 128, 128 },  // a split, l not split
+  {  52,  79,  25, 128, 128, 128, 128 },  // l split, a not split
+  {  17,  14,  12, 128, 128, 128, 128 },  // a/l both split
+  // 64x64 -> 32x32
+  { 222,  34,  30, 128, 128, 128, 128 },  // a/l both not split
+  {  72,  16,  44, 128, 128, 128, 128 },  // a split, l not split
+  {  58,  32,  12, 128, 128, 128, 128 },  // l split, a not split
+  {  10,   7,   6, 128, 128, 128, 128 },  // a/l both split
+};
+#else
 static const vpx_prob default_partition_probs[PARTITION_CONTEXTS]
                                              [PARTITION_TYPES - 1] = {
   // 8x8 -> 4x4
@@ -171,6 +196,7 @@
   {  58,  32,  12 },  // l split, a not split
   {  10,   7,   6 },  // a/l both split
 };
+#endif  // CONFIG_EXT_PARTITION_TYPES
 
 #if CONFIG_REF_MV
 static const vpx_prob default_newmv_prob[NEWMV_MODE_CONTEXTS] = {
@@ -186,7 +212,7 @@
 };
 
 static const vpx_prob default_drl_prob[DRL_MODE_CONTEXTS] = {
-    128, 128, 128,
+    128, 160, 180, 128, 160
 };
 
 #if CONFIG_EXT_INTER
@@ -292,6 +318,18 @@
   -PARTITION_VERT, -PARTITION_SPLIT
 };
 
+#if CONFIG_EXT_PARTITION_TYPES
+const vpx_tree_index vp10_ext_partition_tree[TREE_SIZE(EXT_PARTITION_TYPES)] = {
+  -PARTITION_NONE, 2,
+  6, 4,
+  8, -PARTITION_SPLIT,
+  -PARTITION_HORZ, 10,
+  -PARTITION_VERT, 12,
+  -PARTITION_HORZ_A, -PARTITION_HORZ_B,
+  -PARTITION_VERT_A, -PARTITION_VERT_B
+};
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
 static const vpx_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
   9, 102, 187, 225
 };
@@ -836,47 +874,27 @@
 const vpx_tree_index vp10_ext_tx_inter_tree[EXT_TX_SETS_INTER]
                                            [TREE_SIZE(TX_TYPES)] = {
   { // ToDo(yaowu): remove used entry 0.
-    -IDTX, 2,
-    -V_DCT, 4,
-    -H_DCT, 6,
-    -DCT_DCT, 8,
-    -DST_DST, 10,
-    12, 22,
-    14, 16,
-    -DST_DCT, -DCT_DST,
-    18, 20,
-    -ADST_DCT, -DCT_ADST,
-    -FLIPADST_DCT, -DCT_FLIPADST,
-    24, 30,
-    26, 28,
-    -DST_ADST, -ADST_DST,
-    -DST_FLIPADST, -FLIPADST_DST,
-    32, 34,
-    -ADST_ADST, -FLIPADST_FLIPADST,
-    -ADST_FLIPADST, -FLIPADST_ADST,
+    0
   }, {
     -IDTX, 2,
-    -V_DCT, 4,
-    -H_DCT, 6,
-    -DCT_DCT, 8,
-    -DST_DST, 10,
-    12, 22,
-    14, 16,
-    -DST_DCT, -DCT_DST,
-    18, 20,
+    4, 14,
+    6, 8,
+    -V_DCT, -H_DCT,
+    10, 12,
+    -V_ADST, -H_ADST,
+    -V_FLIPADST, -H_FLIPADST,
+    -DCT_DCT, 16,
+    18, 24,
+    20, 22,
     -ADST_DCT, -DCT_ADST,
     -FLIPADST_DCT, -DCT_FLIPADST,
-    24, 30,
     26, 28,
-    -DST_ADST, -ADST_DST,
-    -DST_FLIPADST, -FLIPADST_DST,
-    32, 34,
     -ADST_ADST, -FLIPADST_FLIPADST,
-    -ADST_FLIPADST, -FLIPADST_ADST,
+    -ADST_FLIPADST, -FLIPADST_ADST
   }, {
     -IDTX, 2,
-    -V_DCT, 4,
-    -H_DCT, 6,
+    4, 6,
+    -V_DCT, -H_DCT,
     -DCT_DCT, 8,
     10, 16,
     12, 14,
@@ -893,39 +911,19 @@
 const vpx_tree_index vp10_ext_tx_intra_tree[EXT_TX_SETS_INTRA]
                                            [TREE_SIZE(TX_TYPES)] = {
   {  // ToDo(yaowu): remove unused entry 0.
-    -IDTX, 2,
-    -DCT_DCT, 4,
-    -DST_DST, 6,
-    8, 18,
-    10, 12,
-    -DST_DCT, -DCT_DST,
-    14, 16,
-    -ADST_DCT, -DCT_ADST,
-    -FLIPADST_DCT, -DCT_FLIPADST,
-    20, 26,
-    22, 24,
-    -DST_ADST, -ADST_DST,
-    -DST_FLIPADST, -FLIPADST_DST,
-    28, 30,
-    -ADST_ADST, -FLIPADST_FLIPADST,
-    -ADST_FLIPADST, -FLIPADST_ADST,
+    0
   }, {
     -IDTX, 2,
     -DCT_DCT, 4,
-    -DST_DST, 6,
-    8, 18,
-    10, 12,
-    -DST_DCT, -DCT_DST,
-    14, 16,
+    6, 8,
+    -V_DCT, -H_DCT,
+    10, 16,
+    12, 14,
     -ADST_DCT, -DCT_ADST,
     -FLIPADST_DCT, -DCT_FLIPADST,
-    20, 26,
-    22, 24,
-    -DST_ADST, -ADST_DST,
-    -DST_FLIPADST, -FLIPADST_DST,
-    28, 30,
+    18, 20,
     -ADST_ADST, -FLIPADST_FLIPADST,
-    -ADST_FLIPADST, -FLIPADST_ADST,
+    -ADST_FLIPADST, -FLIPADST_ADST
   }, {
     -IDTX, 2,
     -DCT_DCT, 4,
@@ -942,33 +940,25 @@
 static const vpx_prob
 default_inter_ext_tx_prob[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES - 1] = {
   { // ToDo(yaowu): remove unused entry 0.
-    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128 },
-    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128 },
-    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128 },
+    { 0 },
+    { 0 },
+    { 0 },
 #if EXT_TX_SIZES == 4
-    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128 },
+    { 0 },
 #endif
   }, {
-    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128 },
-    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128 },
-    { 12, 15, 15, 112, 16, 128, 128, 128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128 },
+    { 10, 24, 30, 128, 128, 128, 128, 112, 160, 128, 128, 128, 128, 128, 128},
+    { 10, 24, 30, 128, 128, 128, 128, 112, 160, 128, 128, 128, 128, 128, 128},
+    { 10, 24, 30, 128, 128, 128, 128, 112, 160, 128, 128, 128, 128, 128, 128},
 #if EXT_TX_SIZES == 4
-    { 12, 15, 15, 160, 16, 144, 160, 128, 128, 128,
-      128, 128, 128, 128, 128, 128, 128, 128 },
+    { 10, 24, 30, 128, 128, 128, 128, 112, 160, 128, 128, 128, 128, 128, 128},
 #endif
   }, {
-    { 12, 15, 15, 112, 128, 128, 128, 128, 128, 128, 128 },
-    { 12, 15, 15, 112, 128, 128, 128, 128, 128, 128, 128 },
-    { 12, 15, 15, 112, 128, 128, 128, 128, 128, 128, 128 },
+    { 10, 30, 128, 112, 160, 128, 128, 128, 128, 128, 128 },
+    { 10, 30, 128, 112, 160, 128, 128, 128, 128, 128, 128 },
+    { 10, 30, 128, 112, 160, 128, 128, 128, 128, 128, 128 },
 #if EXT_TX_SIZES == 4
-    { 12, 15, 15, 160, 128, 128, 128, 128, 128, 128, 128 },
+    { 10, 30, 128, 112, 160, 128, 128, 128, 128, 128, 128 },
 #endif
   }, {
     { 12, },
@@ -985,266 +975,110 @@
                          [INTRA_MODES][TX_TYPES - 1] = {
   { // ToDo(yaowu): remove unused entry 0.
     {
-      { 8, 11, 24, 112, 87, 137, 127, 134,
-      128, 86, 128, 124, 125, 133, 176, 123, },
-      { 10, 9, 39, 106, 73, 155, 163, 228,
-      35, 62, 129, 127, 133, 114, 213, 234, },
-      { 10, 9, 14, 88, 91, 127, 151, 51,
-      210, 89, 126, 58, 52, 116, 217, 24, },
-      { 9, 6, 29, 113, 98, 131, 149, 210,
-      119, 60, 124, 93, 90, 143, 170, 197, },
-      { 8, 8, 38, 101, 111, 166, 167, 141,
-      130, 105, 128, 75, 75, 118, 197, 117, },
-      { 7, 8, 39, 91, 101, 153, 166, 200,
-      99, 77, 123, 90, 83, 144, 224, 192, },
-      { 7, 10, 26, 86, 119, 154, 130, 101,
-      152, 91, 129, 75, 79, 137, 219, 77, },
-      { 10, 13, 20, 86, 102, 162, 112, 76,
-      171, 86, 134, 122, 106, 124, 196, 44, },
-      { 8, 9, 33, 108, 100, 144, 148, 215,
-      77, 60, 125, 125, 128, 126, 198, 220, },
-      { 3, 10, 29, 111, 69, 141, 204, 141,
-      139, 93, 120, 75, 77, 163, 242, 124, },
+      { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 },
     }, {
-      { 2, 53, 18, 147, 96, 98, 136, 133,
-      131, 120, 153, 163, 169, 137, 173, 124, },
-      { 4, 18, 34, 133, 54, 130, 179, 228,
-      28, 72, 153, 164, 168, 118, 227, 239, },
-      { 4, 18, 13, 125, 72, 110, 176, 36,
-      221, 104, 148, 75, 72, 117, 225, 19, },
-      { 8, 33, 24, 162, 113, 99, 147, 226,
-      103, 85, 153, 143, 153, 124, 155, 210, },
-      { 2, 15, 35, 107, 127, 158, 192, 128,
-      126, 116, 151, 95, 88, 182, 241, 119, },
-      { 3, 15, 36, 112, 100, 146, 194, 189,
-      90, 98, 152, 99, 100, 165, 235, 175, },
-      { 3, 16, 29, 109, 103, 140, 182, 76,
-      173, 104, 147, 82, 85, 159, 235, 70, },
-      { 9, 24, 14, 120, 86, 156, 161, 34,
-      177, 121, 142, 128, 128, 126, 185, 37, },
-      { 5, 24, 29, 152, 98, 99, 174, 228,
-      82, 76, 147, 149, 128, 132, 191, 225, },
-      { 2, 15, 29, 111, 77, 126, 200, 135,
-      117, 93, 152, 96, 84, 191, 245, 135, },
+      { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 },
     }, {
-      { 2, 69, 13, 173, 111, 69, 137, 159,
-      159, 146, 151, 193, 203, 131, 180, 123, },
-      { 1, 12, 33, 164, 32, 98, 204, 242,
-      23, 99, 149, 215, 232, 110, 239, 245, },
-      { 1, 17, 9, 136, 82, 83, 171, 28,
-      231, 128, 135, 76, 64, 118, 235, 17, },
-      { 4, 41, 17, 195, 131, 58, 161, 237,
-      141, 97, 153, 189, 191, 117, 182, 202, },
-      { 2, 17, 36, 104, 149, 137, 217, 139,
-      191, 119, 125, 107, 115, 223, 249, 110, },
-      { 2, 14, 24, 127, 91, 135, 219, 198,
-      113, 91, 164, 125, 173, 211, 250, 116, },
-      { 3, 19, 24, 120, 102, 130, 209, 81,
-      187, 95, 143, 102, 50, 190, 244, 56, },
-      { 4, 27, 10, 128, 91, 157, 181, 33,
-      181, 150, 141, 141, 166, 114, 215, 25, },
-      { 2, 34, 27, 187, 102, 77, 210, 245,
-      113, 107, 136, 184, 188, 121, 210, 234, },
-      { 1, 15, 22, 141, 59, 94, 208, 133,
-      154, 95, 152, 112, 105, 191, 242, 111, },
+      { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 },
 #if EXT_TX_SIZES == 4
     }, {
-      { 2, 69, 13, 173, 111, 69, 137, 159,
-      159, 146, 151, 193, 203, 131, 180, 123, },
-      { 1, 12, 33, 164, 32, 98, 204, 242,
-      23, 99, 149, 215, 232, 110, 239, 245, },
-      { 1, 17, 9, 136, 82, 83, 171, 28,
-      231, 128, 135, 76, 64, 118, 235, 17, },
-      { 4, 41, 17, 195, 131, 58, 161, 237,
-      141, 97, 153, 189, 191, 117, 182, 202, },
-      { 2, 17, 36, 104, 149, 137, 217, 139,
-      191, 119, 125, 107, 115, 223, 249, 110, },
-      { 2, 14, 24, 127, 91, 135, 219, 198,
-      113, 91, 164, 125, 173, 211, 250, 116, },
-      { 3, 19, 24, 120, 102, 130, 209, 81,
-      187, 95, 143, 102, 50, 190, 244, 56, },
-      { 4, 27, 10, 128, 91, 157, 181, 33,
-      181, 150, 141, 141, 166, 114, 215, 25, },
-      { 2, 34, 27, 187, 102, 77, 210, 245,
-      113, 107, 136, 184, 188, 121, 210, 234, },
-      { 1, 15, 22, 141, 59, 94, 208, 133,
-      154, 95, 152, 112, 105, 191, 242, 111, },
+      { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 },
 #endif
     },
   }, {
     {
-      {   8,  11,  24, 112,  87, 137, 127, 134,
-        128,  86, 128, 124, 125, 133, 176, 123, },
-      {  10,   9,  39, 106,  73, 155, 163, 228,
-        35,  62, 129, 127, 133, 114, 213, 234, },
-      {  10,   9,  14,  88,  91, 127, 151,  51,
-        210,  89, 126,  58,  52, 116, 217,  24, },
-      {   9,   6,  29, 113,  98, 131, 149, 210,
-        119,  60, 124,  93,  90, 143, 170, 197, },
-      {   8,   8,  38, 101, 111, 166, 167, 141,
-        130, 105, 128,  75,  75, 118, 197, 117, },
-      {   7,   8,  39,  91, 101, 153, 166, 200,
-        99,  77, 123,  90,  83, 144, 224, 192, },
-      {   7,  10,  26,  86, 119, 154, 130, 101,
-        152,  91, 129,  75,  79, 137, 219,  77, },
-      {  10,  13,  20,  86, 102, 162, 112,  76,
-        171,  86, 134, 122, 106, 124, 196,  44, },
-      {   8,   9,  33, 108, 100, 144, 148, 215,
-        77,  60, 125, 125, 128, 126, 198, 220, },
-      {   3,  10,  29, 111,  69, 141, 204, 141,
-        139,  93, 120,  75,  77, 163, 242, 124, },
+      {   8, 176,  32, 128, 128, 128, 128, 128, 128, 128, 128, },
+      {  10,  28,  32, 128, 176, 192, 208, 128, 128, 128, 128, },
+      {  10,  28,  32, 128, 176, 192,  48, 128, 128, 128, 128, },
+      {   9, 160,  32, 128, 128, 128, 128, 128, 128, 128, 128, },
+      {   8,  28,  32, 128,  96, 128, 128, 128, 160, 192, 128, },
+      {   7,  28,  32, 128, 160, 176, 192, 128, 128, 128, 128, },
+      {   7,  20,  32, 128, 160, 176,  64, 128, 128, 128, 128, },
+      {  10,  23,  32, 128, 160, 176,  64, 128, 128, 128, 128, },
+      {   8,  29,  32, 128, 160, 176, 192, 128, 128, 128, 128, },
+      {   3,  20,  32, 128,  96, 128, 128, 128, 160, 192, 128, },
     }, {
-      {   2,  53,  18, 147,  96,  98, 136, 133,
-        131, 120, 153, 163, 169, 137, 173, 124, },
-      {   4,  18,  34, 133,  54, 130, 179, 228,
-        28,  72, 153, 164, 168, 118, 227, 239, },
-      {   4,  18,  13, 125,  72, 110, 176,  36,
-        221, 104, 148,  75,  72, 117, 225,  19, },
-      {   8,  33,  24, 162, 113,  99, 147, 226,
-        103,  85, 153, 143, 153, 124, 155, 210, },
-      {   2,  15,  35, 107, 127, 158, 192, 128,
-        126, 116, 151,  95,  88, 182, 241, 119, },
-      {   3,  15,  36, 112, 100, 146, 194, 189,
-        90,  98, 152,  99, 100, 165, 235, 175, },
-      {   3,  16,  29, 109, 103, 140, 182,  76,
-        173, 104, 147,  82,  85, 159, 235,  70, },
-      {   9,  24,  14, 120,  86, 156, 161,  34,
-        177, 121, 142, 128, 128, 126, 185,  37, },
-      {   5,  24,  29, 152,  98,  99, 174, 228,
-        82,  76, 147, 149, 128, 132, 191, 225, },
-      {   2,  15,  29, 111,  77, 126, 200, 135,
-        117,  93, 152,  96,  84, 191, 245, 135, },
+      {   2, 176,  32, 128, 128, 128, 128, 128, 128, 128, 128, },
+      {   4,  28,  32, 128, 176, 192, 208, 128, 128, 128, 128, },
+      {   4,  28,  32, 128, 176, 192,  48, 128, 128, 128, 128, },
+      {   8, 160,  32, 128, 128, 128, 128, 128, 128, 128, 128, },
+      {   2,  28,  32, 128,  96, 128, 128, 128, 160, 192, 128, },
+      {   3,  28,  32, 128, 160, 176, 192, 128, 128, 128, 128, },
+      {   3,  26,  32, 128, 160, 176,  64, 128, 128, 128, 128, },
+      {   9,  24,  32, 128, 160, 176,  64, 128, 128, 128, 128, },
+      {   5,  24,  32, 128, 160, 176, 192, 128, 128, 128, 128, },
+      {   2,  25,  32, 128,  96, 128, 128, 128, 160, 192, 128, },
     }, {
-      {   2,  69,  13, 173, 111,  69, 137, 159,
-        159, 146, 151, 193, 203, 131, 180, 123, },
-      {   1,  12,  33, 164,  32,  98, 204, 242,
-         23,  99, 149, 215, 232, 110, 239, 245, },
-      {   1,  17,   9, 136,  82,  83, 171,  28,
-        231, 128, 135,  76,  64, 118, 235,  17, },
-      {   4,  41,  17, 195, 131,  58, 161, 237,
-        141,  97, 153, 189, 191, 117, 182, 202, },
-      {   2,  17,  36, 104, 149, 137, 217, 139,
-        191, 119, 125, 107, 115, 223, 249, 110, },
-      {   2,  14,  24, 127,  91, 135, 219, 198,
-        113,  91, 164, 125, 173, 211, 250, 116, },
-      {   3,  19,  24, 120, 102, 130, 209,  81,
-        187,  95, 143, 102,  50, 190, 244,  56, },
-      {   4,  27,  10, 128,  91, 157, 181,  33,
-        181, 150, 141, 141, 166, 114, 215,  25, },
-      {   2,  34,  27, 187, 102,  77, 210, 245,
-        113, 107, 136, 184, 188, 121, 210, 234, },
-      {   1,  15,  22, 141,  59,  94, 208, 133,
-        154,  95, 152, 112, 105, 191, 242, 111, },
+      {   2, 176,  32, 128, 128, 128, 128, 128, 128, 128, 128, },
+      {   1,  28,  32, 128, 176, 192, 208, 128, 128, 128, 128, },
+      {   1,  28,  32, 128, 176, 192,  48, 128, 128, 128, 128, },
+      {   4, 160,  32, 128, 128, 128, 128, 128, 128, 128, 128, },
+      {   2,  28,  32, 128,  96, 128, 128, 128, 160, 192, 128, },
+      {   2,  28,  32, 128, 160, 176, 192, 128, 128, 128, 128, },
+      {   3,  29,  32, 128, 160, 176,  64, 128, 128, 128, 128, },
+      {   4,  27,  32, 128, 160, 176,  64, 128, 128, 128, 128, },
+      {   2,  34,  32, 128, 160, 176, 192, 128, 128, 128, 128, },
+      {   1,  25,  32, 128,  96, 128, 128, 128, 160, 192, 128, },
 #if EXT_TX_SIZES == 4
     }, {
-      {   2,  69,  13, 173, 111,  69, 137, 159,
-        159, 146, 151, 193, 203, 131, 180, 123, },
-      {   1,  12,  33, 164,  32,  98, 204, 242,
-        23,  99, 149, 215, 232, 110, 239, 245, },
-      {   1,  17,   9, 136,  82,  83, 171,  28,
-        231, 128, 135,  76,  64, 118, 235,  17, },
-      {   4,  41,  17, 195, 131,  58, 161, 237,
-        141,  97, 153, 189, 191, 117, 182, 202, },
-      {   2,  17,  36, 104, 149, 137, 217, 139,
-        191, 119, 125, 107, 115, 223, 249, 110, },
-      {   2,  14,  24, 127,  91, 135, 219, 198,
-        113,  91, 164, 125, 173, 211, 250, 116, },
-      {   3,  19,  24, 120, 102, 130, 209,  81,
-        187,  95, 143, 102,  50, 190, 244,  56, },
-      {   4,  27,  10, 128,  91, 157, 181,  33,
-        181, 150, 141, 141, 166, 114, 215,  25, },
-      {   2,  34,  27, 187, 102,  77, 210, 245,
-        113, 107, 136, 184, 188, 121, 210, 234, },
-      {   1,  15,  22, 141,  59,  94, 208, 133,
-        154,  95, 152, 112, 105, 191, 242, 111, },
+      {   2, 176,  32, 128, 128, 128, 128, 128, 128, 128, 128, },
+      {   1,  12,  32, 128, 160, 176, 192, 128, 128, 128, 128, },
+      {   1,  17,  32, 128, 160, 176,  64, 128, 128, 128, 128, },
+      {   4,  41,  32, 128, 128, 128, 128, 128, 128, 128, 128, },
+      {   2,  17,  32, 128,  96, 128, 128, 128, 160, 192, 128, },
+      {   2,  14,  32, 128, 160, 176, 192, 128, 128, 128, 128, },
+      {   3,  19,  32, 128, 160, 176,  64, 128, 128, 128, 128, },
+      {   4,  27,  32, 128, 160, 176,  64, 128, 128, 128, 128, },
+      {   2,  34,  32, 128, 160, 176, 192, 128, 128, 128, 128, },
+      {   1,  15,  32, 128,  96, 128, 128, 128, 160, 192, 128, },
 #endif
     },
   }, {
     {
-      {   8, 176, 128, 128, 128, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {  10,  28, 176, 192, 208, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {  10,  28, 176, 192,  48, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   9, 160, 128, 128, 128, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   8,  28,  96, 128, 128, 128, 160, 192,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   7,  28, 160, 176, 192, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   7,  20, 160, 176,  64, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {  10,  23, 160, 176,  64, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   8,  29, 160, 176, 192, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   3,  20,  96, 128, 128, 128, 160, 192,
-        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   8, 176, 128, 128, 128, 128, 128, 128, 128, },
+      {  10,  28, 176, 192, 208, 128, 128, 128, 128, },
+      {  10,  28, 176, 192,  48, 128, 128, 128, 128, },
+      {   9, 160, 128, 128, 128, 128, 128, 128, 128, },
+      {   8,  28,  96, 128, 128, 128, 160, 192, 128, },
+      {   7,  28, 160, 176, 192, 128, 128, 128, 128, },
+      {   7,  20, 160, 176,  64, 128, 128, 128, 128, },
+      {  10,  23, 160, 176,  64, 128, 128, 128, 128, },
+      {   8,  29, 160, 176, 192, 128, 128, 128, 128, },
+      {   3,  20,  96, 128, 128, 128, 160, 192, 128, },
     }, {
-      {   2, 176, 128, 128, 128, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   4,  28, 176, 192, 208, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   4,  28, 176, 192,  48, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   8, 160, 128, 128, 128, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   2,  28,  96, 128, 128, 128, 160, 192,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   3,  28, 160, 176, 192, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   3,  26, 160, 176,  64, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   9,  24, 160, 176,  64, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   5,  24, 160, 176, 192, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   2,  25,  96, 128, 128, 128, 160, 192,
-        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   2, 176, 128, 128, 128, 128, 128, 128, 128, },
+      {   4,  28, 176, 192, 208, 128, 128, 128, 128, },
+      {   4,  28, 176, 192,  48, 128, 128, 128, 128, },
+      {   8, 160, 128, 128, 128, 128, 128, 128, 128, },
+      {   2,  28,  96, 128, 128, 128, 160, 192, 128, },
+      {   3,  28, 160, 176, 192, 128, 128, 128, 128, },
+      {   3,  26, 160, 176,  64, 128, 128, 128, 128, },
+      {   9,  24, 160, 176,  64, 128, 128, 128, 128, },
+      {   5,  24, 160, 176, 192, 128, 128, 128, 128, },
+      {   2,  25,  96, 128, 128, 128, 160, 192, 128, },
     }, {
-      {   2, 176, 128, 128, 128, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   1,  28, 176, 192, 208, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   1,  28, 176, 192,  48, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   4, 160, 128, 128, 128, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   2,  28,  96, 128, 128, 128, 160, 192,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   2,  28, 160, 176, 192, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   3,  29, 160, 176,  64, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   4,  27, 160, 176,  64, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   2,  34, 160, 176, 192, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   1,  25,  96, 128, 128, 128, 160, 192,
-        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   2, 176, 128, 128, 128, 128, 128, 128, 128, },
+      {   1,  28, 176, 192, 208, 128, 128, 128, 128, },
+      {   1,  28, 176, 192,  48, 128, 128, 128, 128, },
+      {   4, 160, 128, 128, 128, 128, 128, 128, 128, },
+      {   2,  28,  96, 128, 128, 128, 160, 192, 128, },
+      {   2,  28, 160, 176, 192, 128, 128, 128, 128, },
+      {   3,  29, 160, 176,  64, 128, 128, 128, 128, },
+      {   4,  27, 160, 176,  64, 128, 128, 128, 128, },
+      {   2,  34, 160, 176, 192, 128, 128, 128, 128, },
+      {   1,  25,  96, 128, 128, 128, 160, 192, 128, },
 #if EXT_TX_SIZES == 4
     }, {
-      {   2, 176, 128, 128, 128, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   1,  12, 160, 176, 192, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   1,  17, 160, 176,  64, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   4,  41, 128, 128, 128, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   2,  17,  96, 128, 128, 128, 160, 192,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   2,  14, 160, 176, 192, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   3,  19, 160, 176,  64, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   4,  27, 160, 176,  64, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   2,  34, 160, 176, 192, 128, 128, 128,
-        128, 128, 128, 128, 128, 128, 128, 128, },
-      {   1,  15,  96, 128, 128, 128, 160, 192,
-        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   2, 176, 128, 128, 128, 128, 128, 128, 128, },
+      {   1,  12, 160, 176, 192, 128, 128, 128, 128, },
+      {   1,  17, 160, 176,  64, 128, 128, 128, 128, },
+      {   4,  41, 128, 128, 128, 128, 128, 128, 128, },
+      {   2,  17,  96, 128, 128, 128, 160, 192, 128, },
+      {   2,  14, 160, 176, 192, 128, 128, 128, 128, },
+      {   3,  19, 160, 176,  64, 128, 128, 128, 128, },
+      {   4,  27, 160, 176,  64, 128, 128, 128, 128, },
+      {   2,  34, 160, 176, 192, 128, 128, 128, 128, },
+      {   1,  15,  96, 128, 128, 128, 160, 192, 128, },
 #endif
     },
   },
@@ -1323,8 +1157,7 @@
   vp10_copy(fc->newmv_prob, default_newmv_prob);
   vp10_copy(fc->zeromv_prob, default_zeromv_prob);
   vp10_copy(fc->refmv_prob, default_refmv_prob);
-  vp10_copy(fc->drl_prob0, default_drl_prob);
-  vp10_copy(fc->drl_prob1, default_drl_prob);
+  vp10_copy(fc->drl_prob, default_drl_prob);
 #if CONFIG_EXT_INTER
   fc->new2mv_prob = default_new2mv_prob;
 #endif  // CONFIG_EXT_INTER
@@ -1408,12 +1241,8 @@
                                             counts->refmv_mode[i]);
 
   for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
-    fc->drl_prob0[i] = mode_mv_merge_probs(pre_fc->drl_prob0[i],
-                                           counts->drl_mode0[i]);
-  for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
-    fc->drl_prob1[i] = mode_mv_merge_probs(pre_fc->drl_prob1[i],
-                                           counts->drl_mode1[i]);
-
+    fc->drl_prob[i] = mode_mv_merge_probs(pre_fc->drl_prob[i],
+                                          counts->drl_mode[i]);
 #if CONFIG_EXT_INTER
   fc->new2mv_prob = mode_mv_merge_probs(pre_fc->new2mv_prob,
                                         counts->new2mv_mode);
@@ -1558,9 +1387,17 @@
     vpx_tree_merge_probs(vp10_intra_mode_tree, pre_fc->uv_mode_prob[i],
                          counts->uv_mode[i], fc->uv_mode_prob[i]);
 
+#if CONFIG_EXT_PARTITION_TYPES
+  vpx_tree_merge_probs(vp10_partition_tree, pre_fc->partition_prob[0],
+                       counts->partition[0], fc->partition_prob[0]);
+  for (i = 1; i < PARTITION_CONTEXTS; i++)
+    vpx_tree_merge_probs(vp10_ext_partition_tree, pre_fc->partition_prob[i],
+                         counts->partition[i], fc->partition_prob[i]);
+#else
   for (i = 0; i < PARTITION_CONTEXTS; i++)
     vpx_tree_merge_probs(vp10_partition_tree, pre_fc->partition_prob[i],
                          counts->partition[i], fc->partition_prob[i]);
+#endif  // CONFIG_EXT_PARTITION_TYPES
 
 #if CONFIG_EXT_INTRA
   for (i = 0; i < PLANE_TYPES; ++i) {
diff --git a/vp10/common/entropymode.h b/vp10/common/entropymode.h
index 2443d60..3d5fe9e 100644
--- a/vp10/common/entropymode.h
+++ b/vp10/common/entropymode.h
@@ -46,7 +46,11 @@
 typedef struct frame_contexts {
   vpx_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
   vpx_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
+#if CONFIG_EXT_PARTITION_TYPES
+  vpx_prob partition_prob[PARTITION_CONTEXTS][EXT_PARTITION_TYPES - 1];
+#else
   vpx_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
+#endif
   vp10_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES];
   vpx_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
                                  [SWITCHABLE_FILTERS - 1];
@@ -55,8 +59,7 @@
   vpx_prob newmv_prob[NEWMV_MODE_CONTEXTS];
   vpx_prob zeromv_prob[ZEROMV_MODE_CONTEXTS];
   vpx_prob refmv_prob[REFMV_MODE_CONTEXTS];
-  vpx_prob drl_prob0[DRL_MODE_CONTEXTS];
-  vpx_prob drl_prob1[DRL_MODE_CONTEXTS];
+  vpx_prob drl_prob[DRL_MODE_CONTEXTS];
 
 #if CONFIG_EXT_INTER
   vpx_prob new2mv_prob;
@@ -111,7 +114,11 @@
   unsigned int kf_y_mode[INTRA_MODES][INTRA_MODES][INTRA_MODES];
   unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
   unsigned int uv_mode[INTRA_MODES][INTRA_MODES];
+#if CONFIG_EXT_PARTITION_TYPES
+  unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+#else
   unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES];
+#endif
   vp10_coeff_count_model coef[TX_SIZES][PLANE_TYPES];
   unsigned int eob_branch[TX_SIZES][PLANE_TYPES][REF_TYPES]
                          [COEF_BANDS][COEFF_CONTEXTS];
@@ -121,8 +128,7 @@
   unsigned int newmv_mode[NEWMV_MODE_CONTEXTS][2];
   unsigned int zeromv_mode[ZEROMV_MODE_CONTEXTS][2];
   unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2];
-  unsigned int drl_mode0[DRL_MODE_CONTEXTS][2];
-  unsigned int drl_mode1[DRL_MODE_CONTEXTS][2];
+  unsigned int drl_mode[DRL_MODE_CONTEXTS][2];
 #if CONFIG_EXT_INTER
   unsigned int new2mv_mode[2];
 #endif  // CONFIG_EXT_INTER
@@ -193,6 +199,10 @@
                             [TREE_SIZE(INTER_COMPOUND_MODES)];
 #endif  // CONFIG_EXT_INTER
 extern const vpx_tree_index vp10_partition_tree[TREE_SIZE(PARTITION_TYPES)];
+#if CONFIG_EXT_PARTITION_TYPES
+extern const vpx_tree_index vp10_ext_partition_tree
+                                [TREE_SIZE(EXT_PARTITION_TYPES)];
+#endif
 extern const vpx_tree_index vp10_switchable_interp_tree
                                 [TREE_SIZE(SWITCHABLE_FILTERS)];
 extern const vpx_tree_index vp10_palette_size_tree[TREE_SIZE(PALETTE_SIZES)];
diff --git a/vp10/common/enums.h b/vp10/common/enums.h
index 87bcc8a..16e4520 100644
--- a/vp10/common/enums.h
+++ b/vp10/common/enums.h
@@ -69,6 +69,21 @@
 
 typedef uint8_t BLOCK_SIZE;
 
+#if CONFIG_EXT_PARTITION_TYPES
+typedef enum PARTITION_TYPE {
+  PARTITION_NONE,
+  PARTITION_HORZ,
+  PARTITION_VERT,
+  PARTITION_SPLIT,
+  PARTITION_HORZ_A,  // HORZ split and the left partition is split again
+  PARTITION_HORZ_B,  // HORZ split and the right partition is split again
+  PARTITION_VERT_A,  // VERT split and the top partition is split again
+  PARTITION_VERT_B,  // VERT split and the bottom partition is split again
+  EXT_PARTITION_TYPES,
+  PARTITION_TYPES = PARTITION_SPLIT + 1,
+  PARTITION_INVALID = EXT_PARTITION_TYPES
+} PARTITION_TYPE;
+#else
 typedef enum PARTITION_TYPE {
   PARTITION_NONE,
   PARTITION_HORZ,
@@ -77,10 +92,11 @@
   PARTITION_TYPES,
   PARTITION_INVALID = PARTITION_TYPES
 } PARTITION_TYPE;
+#endif  // CONFIG_EXT_PARTITION_TYPES
 
 typedef char PARTITION_CONTEXT;
 #define PARTITION_PLOFFSET   4  // number of probability models per block size
-#define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
+#define PARTITION_CONTEXTS   (4 * PARTITION_PLOFFSET)
 
 // block transform size
 typedef uint8_t TX_SIZE;
@@ -111,21 +127,17 @@
   FLIPADST_FLIPADST = 6,
   ADST_FLIPADST = 7,
   FLIPADST_ADST = 8,
-  DST_DCT = 9,
-  DCT_DST = 10,
-  DST_ADST = 11,
-  ADST_DST = 12,
-  DST_FLIPADST = 13,
-  FLIPADST_DST = 14,
-  DST_DST = 15,
-  IDTX = 16,
-  V_DCT = 17,
-  H_DCT = 18,
+  IDTX = 9,
+  V_DCT = 10,
+  H_DCT = 11,
+  V_ADST = 12,
+  H_ADST = 13,
+  V_FLIPADST = 14,
+  H_FLIPADST = 15,
 #endif  // CONFIG_EXT_TX
   TX_TYPES,
 } TX_TYPE;
 
-
 #if CONFIG_EXT_TX
 #define EXT_TX_SIZES       4  // number of sizes that use extended transforms
 #define EXT_TX_SETS_INTER  4  // Sets of transform selections for INTER
@@ -247,7 +259,7 @@
 #define NEWMV_MODE_CONTEXTS  7
 #define ZEROMV_MODE_CONTEXTS 2
 #define REFMV_MODE_CONTEXTS  9
-#define DRL_MODE_CONTEXTS    3
+#define DRL_MODE_CONTEXTS    5
 
 #define ZEROMV_OFFSET 3
 #define REFMV_OFFSET  4
diff --git a/vp10/common/idct.c b/vp10/common/idct.c
index 863f0db..0e211ad 100644
--- a/vp10/common/idct.c
+++ b/vp10/common/idct.c
@@ -19,247 +19,6 @@
 #include "vpx_ports/mem.h"
 
 #if CONFIG_EXT_TX
-void idst4_c(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step[4];
-  tran_high_t temp1, temp2;
-  // stage 1
-  temp1 = (input[3] + input[1]) * cospi_16_64;
-  temp2 = (input[3] - input[1]) * cospi_16_64;
-  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = input[2] * cospi_24_64 - input[0] * cospi_8_64;
-  temp2 = input[2] * cospi_8_64 + input[0] * cospi_24_64;
-  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  // stage 2
-  output[0] = WRAPLOW(step[0] + step[3], 8);
-  output[1] = WRAPLOW(-step[1] - step[2], 8);
-  output[2] = WRAPLOW(step[1] - step[2], 8);
-  output[3] = WRAPLOW(step[3] - step[0], 8);
-}
-
-void idst8_c(const tran_low_t *input, tran_low_t *output) {
-  // vp9_igentx8(input, output, Tx8);
-  tran_low_t step1[8], step2[8];
-  tran_high_t temp1, temp2;
-  // stage 1
-  step1[0] = input[7];
-  step1[2] = input[3];
-  step1[1] = input[5];
-  step1[3] = input[1];
-  temp1 = input[6] * cospi_28_64 - input[0] * cospi_4_64;
-  temp2 = input[6] * cospi_4_64 + input[0] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = input[2] * cospi_12_64 - input[4] * cospi_20_64;
-  temp2 = input[2] * cospi_20_64 + input[4] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  // stage 2
-  temp1 = (step1[0] + step1[2]) * cospi_16_64;
-  temp2 = (step1[0] - step1[2]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
-
-  // stage 3
-  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
-  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
-  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step1[7] = step2[7];
-
-  // stage 4
-  output[0] = WRAPLOW(step1[0] + step1[7], 8);
-  output[1] = WRAPLOW(-step1[1] - step1[6], 8);
-  output[2] = WRAPLOW(step1[2] + step1[5], 8);
-  output[3] = WRAPLOW(-step1[3] - step1[4], 8);
-  output[4] = WRAPLOW(step1[3] - step1[4], 8);
-  output[5] = WRAPLOW(-step1[2] + step1[5], 8);
-  output[6] = WRAPLOW(step1[1] - step1[6], 8);
-  output[7] = WRAPLOW(-step1[0] + step1[7], 8);
-}
-
-void idst16_c(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step1[16], step2[16];
-  tran_high_t temp1, temp2;
-
-  // stage 1
-  step1[0] = input[15];
-  step1[1] = input[7];
-  step1[2] = input[11];
-  step1[3] = input[3];
-  step1[4] = input[13];
-  step1[5] = input[5];
-  step1[6] = input[9];
-  step1[7] = input[1];
-  step1[8] = input[14];
-  step1[9] = input[6];
-  step1[10] = input[10];
-  step1[11] = input[2];
-  step1[12] = input[12];
-  step1[13] = input[4];
-  step1[14] = input[8];
-  step1[15] = input[0];
-
-  // stage 2
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[4] = step1[4];
-  step2[5] = step1[5];
-  step2[6] = step1[6];
-  step2[7] = step1[7];
-
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  // stage 3
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
-  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
-  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
-  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
-  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
-  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
-
-  // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
-
-  step2[8] = step1[8];
-  step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-
-  // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
-  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
-  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step1[7] = step2[7];
-
-  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
-  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
-  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
-  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
-  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
-  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
-
-  // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
-  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
-  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
-  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
-  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
-  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
-  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
-  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  // stage 7
-  output[0] = WRAPLOW(step2[0] + step2[15], 8);
-  output[1] = WRAPLOW(-step2[1] - step2[14], 8);
-  output[2] = WRAPLOW(step2[2] + step2[13], 8);
-  output[3] = WRAPLOW(-step2[3] - step2[12], 8);
-  output[4] = WRAPLOW(step2[4] + step2[11], 8);
-  output[5] = WRAPLOW(-step2[5] - step2[10], 8);
-  output[6] = WRAPLOW(step2[6] + step2[9], 8);
-  output[7] = WRAPLOW(-step2[7] - step2[8], 8);
-  output[8] = WRAPLOW(step2[7] - step2[8], 8);
-  output[9] = WRAPLOW(-step2[6] + step2[9], 8);
-  output[10] = WRAPLOW(step2[5] - step2[10], 8);
-  output[11] = WRAPLOW(-step2[4] + step2[11], 8);
-  output[12] = WRAPLOW(step2[3] - step2[12], 8);
-  output[13] = WRAPLOW(-step2[2] + step2[13], 8);
-  output[14] = WRAPLOW(step2[1] - step2[14], 8);
-  output[15] = WRAPLOW(-step2[0] + step2[15], 8);
-}
-
-#if CONFIG_EXT_TX
 static void iidtx4_c(const tran_low_t *input, tran_low_t *output) {
   int i;
   for (i = 0; i < 4; ++i)
@@ -285,21 +44,6 @@
 }
 
 // For use in lieu of DST
-static void ihalfcenter32_c(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  tran_low_t inputhalf[16];
-  for (i = 0; i < 8; ++i) {
-    output[i] = input[16 + i] * 4;
-    output[24 + i] = input[24 + i] * 4;
-  }
-  // Multiply input by sqrt(2)
-  for (i = 0; i < 16; ++i) {
-    inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
-  }
-  idct16_c(inputhalf, output + 8);
-  // Note overall scaling factor is 4 times orthogonal
-}
-
 static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
   int i;
   tran_low_t inputhalf[16];
@@ -379,7 +123,6 @@
   // Note overall scaling factor is 4 times orthogonal
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-#endif  // CONFIG_EXT_TX
 
 // Inverse identity transform and add.
 static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
@@ -412,24 +155,21 @@
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
-    case DST_DST:
-    case DCT_DST:
-    case DST_DCT:
-    case DST_ADST:
-    case ADST_DST:
     case IDTX:
     case V_DCT:
     case H_DCT:
+    case V_ADST:
+    case H_ADST:
       break;
     case FLIPADST_DCT:
     case FLIPADST_ADST:
-    case FLIPADST_DST:
+    case V_FLIPADST:
       // flip UD
       FLIPUD_PTR(*dst, *dstride, size);
       break;
     case DCT_FLIPADST:
     case ADST_FLIPADST:
-    case DST_FLIPADST:
+    case H_FLIPADST:
       // flip LR
       FLIPUD_PTR(*src, *sstride, size);
       break;
@@ -716,24 +456,21 @@
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
-    case DST_DST:
-    case DCT_DST:
-    case DST_DCT:
-    case DST_ADST:
-    case ADST_DST:
     case IDTX:
     case V_DCT:
     case H_DCT:
+    case V_ADST:
+    case H_ADST:
       break;
     case FLIPADST_DCT:
     case FLIPADST_ADST:
-    case FLIPADST_DST:
+    case V_FLIPADST:
       // flip UD
       FLIPUD_PTR(*dst, *dstride, size);
       break;
     case DCT_FLIPADST:
     case ADST_FLIPADST:
-    case DST_FLIPADST:
+    case H_FLIPADST:
       // flip LR
       FLIPUD_PTR(*src, *sstride, size);
       break;
@@ -754,26 +491,23 @@
 void vp10_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                           int tx_type) {
   static const transform_2d IHT_4[] = {
-    { idct4_c,  idct4_c  },  // DCT_DCT           = 0,
-    { iadst4_c, idct4_c  },  // ADST_DCT          = 1,
-    { idct4_c,  iadst4_c },  // DCT_ADST          = 2,
-    { iadst4_c, iadst4_c },  // ADST_ADST         = 3,
+    { idct4_c,  idct4_c  },  // DCT_DCT
+    { iadst4_c, idct4_c  },  // ADST_DCT
+    { idct4_c,  iadst4_c },  // DCT_ADST
+    { iadst4_c, iadst4_c },  // ADST_ADST
 #if CONFIG_EXT_TX
-    { iadst4_c, idct4_c  },  // FLIPADST_DCT      = 4,
-    { idct4_c,  iadst4_c },  // DCT_FLIPADST      = 5,
-    { iadst4_c, iadst4_c },  // FLIPADST_FLIPADST = 6,
-    { iadst4_c, iadst4_c },  // ADST_FLIPADST     = 7,
-    { iadst4_c, iadst4_c },  // FLIPADST_ADST     = 8,
-    { idst4_c,  idct4_c  },  // DST_DCT           = 9,
-    { idct4_c,  idst4_c  },  // DCT_DST           = 10,
-    { idst4_c,  iadst4_c },  // DST_ADST          = 11,
-    { iadst4_c, idst4_c  },  // ADST_DST          = 12,
-    { idst4_c,  iadst4_c },  // DST_FLIPADST      = 13,
-    { iadst4_c, idst4_c  },  // FLIPADST_DST      = 14,
-    { idst4_c,  idst4_c  },  // DST_DST           = 15
-    { iidtx4_c, iidtx4_c },  // IDTX              = 16
-    { idct4_c,  iidtx4_c },  // V_DCT             = 17
-    { iidtx4_c, idct4_c  },  // H_DCT             = 18
+    { iadst4_c, idct4_c  },  // FLIPADST_DCT
+    { idct4_c,  iadst4_c },  // DCT_FLIPADST
+    { iadst4_c, iadst4_c },  // FLIPADST_FLIPADST
+    { iadst4_c, iadst4_c },  // ADST_FLIPADST
+    { iadst4_c, iadst4_c },  // FLIPADST_ADST
+    { iidtx4_c, iidtx4_c },  // IDTX
+    { idct4_c,  iidtx4_c },  // V_DCT
+    { iidtx4_c, idct4_c  },  // H_DCT
+    { iadst4_c, iidtx4_c },  // V_ADST
+    { iidtx4_c, iadst4_c },  // H_ADST
+    { iadst4_c, iidtx4_c },  // V_FLIPADST
+    { iidtx4_c, iadst4_c },  // H_FLIPADST
 #endif  // CONFIG_EXT_TX
   };
 
@@ -820,26 +554,23 @@
 void vp10_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   static const transform_2d IHT_8[] = {
-    { idct8_c,  idct8_c  },  // DCT_DCT           = 0,
-    { iadst8_c, idct8_c  },  // ADST_DCT          = 1,
-    { idct8_c,  iadst8_c },  // DCT_ADST          = 2,
-    { iadst8_c, iadst8_c },  // ADST_ADST         = 3,
+    { idct8_c,  idct8_c  },  // DCT_DCT
+    { iadst8_c, idct8_c  },  // ADST_DCT
+    { idct8_c,  iadst8_c },  // DCT_ADST
+    { iadst8_c, iadst8_c },  // ADST_ADST
 #if CONFIG_EXT_TX
-    { iadst8_c, idct8_c  },  // FLIPADST_DCT      = 4,
-    { idct8_c,  iadst8_c },  // DCT_FLIPADST      = 5,
-    { iadst8_c, iadst8_c },  // FLIPADST_FLIPADST = 6,
-    { iadst8_c, iadst8_c },  // ADST_FLIPADST     = 7,
-    { iadst8_c, iadst8_c },  // FLIPADST_ADST     = 8,
-    { idst8_c,  idct8_c  },  // DST_DCT           = 9,
-    { idct8_c,  idst8_c  },  // DCT_DST           = 10,
-    { idst8_c,  iadst8_c },  // DST_ADST          = 11,
-    { iadst8_c, idst8_c  },  // ADST_DST          = 12,
-    { idst8_c,  iadst8_c },  // DST_FLIPADST      = 13,
-    { iadst8_c, idst8_c  },  // FLIPADST_DST      = 14,
-    { idst8_c,  idst8_c  },  // DST_DST           = 15
-    { iidtx8_c, iidtx8_c },  // IDTX              = 16
-    { idct8_c,  iidtx8_c },  // V_DCT             = 17
-    { iidtx8_c, idct8_c  },  // H_DCT             = 18
+    { iadst8_c, idct8_c  },  // FLIPADST_DCT
+    { idct8_c,  iadst8_c },  // DCT_FLIPADST
+    { iadst8_c, iadst8_c },  // FLIPADST_FLIPADST
+    { iadst8_c, iadst8_c },  // ADST_FLIPADST
+    { iadst8_c, iadst8_c },  // FLIPADST_ADST
+    { iidtx8_c, iidtx8_c },  // IDTX
+    { idct8_c,  iidtx8_c },  // V_DCT
+    { iidtx8_c, idct8_c  },  // H_DCT
+    { iadst8_c, iidtx8_c },  // V_ADST
+    { iidtx8_c, iadst8_c },  // H_ADST
+    { iadst8_c, iidtx8_c },  // V_FLIPADST
+    { iidtx8_c, iadst8_c },  // H_FLIPADST
 #endif  // CONFIG_EXT_TX
   };
 
@@ -886,26 +617,23 @@
 void vp10_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
   static const transform_2d IHT_16[] = {
-    { idct16_c,  idct16_c  },  // DCT_DCT           = 0,
-    { iadst16_c, idct16_c  },  // ADST_DCT          = 1,
-    { idct16_c,  iadst16_c },  // DCT_ADST          = 2,
-    { iadst16_c, iadst16_c },  // ADST_ADST         = 3,
+    { idct16_c,  idct16_c  },  // DCT_DCT
+    { iadst16_c, idct16_c  },  // ADST_DCT
+    { idct16_c,  iadst16_c },  // DCT_ADST
+    { iadst16_c, iadst16_c },  // ADST_ADST
 #if CONFIG_EXT_TX
-    { iadst16_c, idct16_c  },  // FLIPADST_DCT      = 4,
-    { idct16_c,  iadst16_c },  // DCT_FLIPADST      = 5,
-    { iadst16_c, iadst16_c },  // FLIPADST_FLIPADST = 6,
-    { iadst16_c, iadst16_c },  // ADST_FLIPADST     = 7,
-    { iadst16_c, iadst16_c },  // FLIPADST_ADST     = 8,
-    { idst16_c,  idct16_c  },  // DST_DCT           = 9,
-    { idct16_c,  idst16_c  },  // DCT_DST           = 10,
-    { idst16_c,  iadst16_c },  // DST_ADST          = 11,
-    { iadst16_c, idst16_c  },  // ADST_DST          = 12,
-    { idst16_c,  iadst16_c },  // DST_FLIPADST      = 13,
-    { iadst16_c, idst16_c  },  // FLIPADST_DST      = 14,
-    { idst16_c,  idst16_c  },  // DST_DST           = 15
-    { iidtx16_c, iidtx16_c },  // IDTX              = 16
-    { idct16_c,  iidtx16_c },  // V_DCT             = 17
-    { iidtx16_c, idct16_c  },  // H_DCT             = 18
+    { iadst16_c, idct16_c  },  // FLIPADST_DCT
+    { idct16_c,  iadst16_c },  // DCT_FLIPADST
+    { iadst16_c, iadst16_c },  // FLIPADST_FLIPADST
+    { iadst16_c, iadst16_c },  // ADST_FLIPADST
+    { iadst16_c, iadst16_c },  // FLIPADST_ADST
+    { iidtx16_c, iidtx16_c },  // IDTX
+    { idct16_c,  iidtx16_c },  // V_DCT
+    { iidtx16_c, idct16_c  },  // H_DCT
+    { iadst16_c, iidtx16_c },  // V_ADST
+    { iidtx16_c, iadst16_c },  // H_ADST
+    { iadst16_c, iidtx16_c },  // V_FLIPADST
+    { iidtx16_c, iadst16_c },  // H_FLIPADST
 #endif  // CONFIG_EXT_TX
   };
 
@@ -953,25 +681,22 @@
 void vp10_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
                               int stride, int tx_type) {
   static const transform_2d IHT_32[] = {
-    { idct32_c,  idct32_c  },                // DCT_DCT           = 0,
-    { ihalfright32_c, idct32_c  },           // ADST_DCT          = 1,
-    { idct32_c,  ihalfright32_c },           // DCT_ADST          = 2,
-    { ihalfright32_c, ihalfright32_c },      // ADST_ADST         = 3,
-    { ihalfright32_c, idct32_c  },           // FLIPADST_DCT      = 4,
-    { idct32_c,  ihalfright32_c },           // DCT_FLIPADST      = 5,
-    { ihalfright32_c, ihalfright32_c },      // FLIPADST_FLIPADST = 6,
-    { ihalfright32_c, ihalfright32_c },      // ADST_FLIPADST     = 7,
-    { ihalfright32_c, ihalfright32_c },      // FLIPADST_ADST     = 8,
-    { ihalfcenter32_c,  idct32_c  },         // DST_DCT           = 9,
-    { idct32_c,  ihalfcenter32_c  },         // DCT_DST           = 10,
-    { ihalfcenter32_c,  ihalfright32_c },    // DST_ADST          = 11,
-    { ihalfright32_c, ihalfcenter32_c  },    // ADST_DST          = 12,
-    { ihalfcenter32_c,  ihalfright32_c },    // DST_FLIPADST      = 13,
-    { ihalfright32_c, ihalfcenter32_c  },    // FLIPADST_DST      = 14,
-    { ihalfcenter32_c,  ihalfcenter32_c  },  // DST_DST           = 15
-    { iidtx32_c, iidtx32_c },                // IDTX              = 16
-    { idct32_c,  iidtx32_c },                // V_DCT             = 17
-    { iidtx32_c, idct32_c  },                // H_DCT             = 18
+    { idct32_c,  idct32_c  },                // DCT_DCT
+    { ihalfright32_c, idct32_c  },           // ADST_DCT
+    { idct32_c,  ihalfright32_c },           // DCT_ADST
+    { ihalfright32_c, ihalfright32_c },      // ADST_ADST
+    { ihalfright32_c, idct32_c  },           // FLIPADST_DCT
+    { idct32_c,  ihalfright32_c },           // DCT_FLIPADST
+    { ihalfright32_c, ihalfright32_c },      // FLIPADST_FLIPADST
+    { ihalfright32_c, ihalfright32_c },      // ADST_FLIPADST
+    { ihalfright32_c, ihalfright32_c },      // FLIPADST_ADST
+    { iidtx32_c, iidtx32_c },                // IDTX
+    { idct32_c,  iidtx32_c },                // V_DCT
+    { iidtx32_c, idct32_c  },                // H_DCT
+    { ihalfright32_c, iidtx16_c },           // V_ADST
+    { iidtx16_c, ihalfright32_c },           // H_ADST
+    { ihalfright32_c, iidtx16_c },           // V_FLIPADST
+    { iidtx16_c, ihalfright32_c },           // H_FLIPADST
   };
 
   int i, j;
@@ -1098,15 +823,12 @@
     case FLIPADST_ADST:
       vp10_iht4x4_16_add(input, dest, stride, tx_type);
       break;
-    case DST_DST:
-    case DST_DCT:
-    case DCT_DST:
-    case DST_ADST:
-    case ADST_DST:
-    case FLIPADST_DST:
-    case DST_FLIPADST:
-    case H_DCT:
     case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
       // Use C version since DST only exists in C code
       vp10_iht4x4_16_add_c(input, dest, stride, tx_type);
       break;
@@ -1139,15 +861,12 @@
     case FLIPADST_ADST:
       vp10_iht8x8_64_add(input, dest, stride, tx_type);
       break;
-    case DST_DST:
-    case DST_DCT:
-    case DCT_DST:
-    case DST_ADST:
-    case ADST_DST:
-    case FLIPADST_DST:
-    case DST_FLIPADST:
-    case H_DCT:
     case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
       // Use C version since DST only exists in C code
       vp10_iht8x8_64_add_c(input, dest, stride, tx_type);
       break;
@@ -1180,15 +899,12 @@
     case FLIPADST_ADST:
       vp10_iht16x16_256_add(input, dest, stride, tx_type);
       break;
-    case DST_DST:
-    case DST_DCT:
-    case DCT_DST:
-    case DST_ADST:
-    case ADST_DST:
-    case FLIPADST_DST:
-    case DST_FLIPADST:
-    case H_DCT:
     case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
       // Use C version since DST only exists in C code
       vp10_iht16x16_256_add_c(input, dest, stride, tx_type);
       break;
@@ -1217,15 +933,12 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-    case DST_DST:
-    case DST_DCT:
-    case DCT_DST:
-    case DST_ADST:
-    case ADST_DST:
-    case FLIPADST_DST:
-    case DST_FLIPADST:
-    case H_DCT:
     case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
       vp10_iht32x32_1024_add_c(input, dest, stride, tx_type);
       break;
     case IDTX:
@@ -1242,26 +955,23 @@
 void vp10_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int tx_type, int bd) {
   static const highbd_transform_2d HIGH_IHT_4[] = {
-    { vpx_highbd_idct4_c,  vpx_highbd_idct4_c  },  // DCT_DCT           = 0,
-    { vpx_highbd_iadst4_c, vpx_highbd_idct4_c  },  // ADST_DCT          = 1,
-    { vpx_highbd_idct4_c,  vpx_highbd_iadst4_c },  // DCT_ADST          = 2,
-    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c },  // ADST_ADST         = 3,
+    { vpx_highbd_idct4_c,  vpx_highbd_idct4_c  },  // DCT_DCT
+    { vpx_highbd_iadst4_c, vpx_highbd_idct4_c  },  // ADST_DCT
+    { vpx_highbd_idct4_c,  vpx_highbd_iadst4_c },  // DCT_ADST
+    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c },  // ADST_ADST
 #if CONFIG_EXT_TX
-    { vpx_highbd_iadst4_c, vpx_highbd_idct4_c  },  // FLIPADST_DCT      = 4,
-    { vpx_highbd_idct4_c,  vpx_highbd_iadst4_c },  // DCT_FLIPADST      = 5,
-    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c },  // FLIPADST_FLIPADST = 6,
-    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c },  // ADST_FLIPADST     = 7,
-    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c },  // FLIPADST_ADST     = 8,
-    {     highbd_idst4_c,  vpx_highbd_idct4_c  },  // DST_DCT           = 9,
-    { vpx_highbd_idct4_c,      highbd_idst4_c  },  // DCT_DST           = 10,
-    {     highbd_idst4_c,  vpx_highbd_iadst4_c },  // DST_ADST          = 11,
-    { vpx_highbd_iadst4_c,     highbd_idst4_c  },  // ADST_DST          = 12,
-    {     highbd_idst4_c,  vpx_highbd_iadst4_c },  // DST_FLIPADST      = 13,
-    { vpx_highbd_iadst4_c,     highbd_idst4_c  },  // FLIPADST_DST      = 14,
-    {     highbd_idst4_c,      highbd_idst4_c  },  // DST_DST           = 15
-    {     highbd_iidtx4_c,     highbd_iidtx4_c },  // IDTX              = 16
-    { vpx_highbd_idct4_c,      highbd_iidtx4_c },  // V_DCT             = 17
-    {     highbd_iidtx4_c, vpx_highbd_idct4_c  },  // H_DCT             = 18
+    { vpx_highbd_iadst4_c, vpx_highbd_idct4_c  },  // FLIPADST_DCT
+    { vpx_highbd_idct4_c,  vpx_highbd_iadst4_c },  // DCT_FLIPADST
+    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c },  // FLIPADST_FLIPADST
+    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c },  // ADST_FLIPADST
+    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c },  // FLIPADST_ADST
+    {     highbd_iidtx4_c,     highbd_iidtx4_c },  // IDTX
+    { vpx_highbd_idct4_c,      highbd_iidtx4_c },  // V_DCT
+    {     highbd_iidtx4_c, vpx_highbd_idct4_c  },  // H_DCT
+    { vpx_highbd_iadst4_c,     highbd_iidtx4_c },  // V_ADST
+    {     highbd_iidtx4_c, vpx_highbd_iadst4_c },  // H_ADST
+    { vpx_highbd_iadst4_c,     highbd_iidtx4_c },  // V_FLIPADST
+    {     highbd_iidtx4_c, vpx_highbd_iadst4_c },  // H_FLIPADST
 #endif  // CONFIG_EXT_TX
   };
 
@@ -1311,26 +1021,23 @@
 void vp10_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int tx_type, int bd) {
   static const highbd_transform_2d HIGH_IHT_8[] = {
-    { vpx_highbd_idct8_c,  vpx_highbd_idct8_c  },  // DCT_DCT           = 0,
-    { vpx_highbd_iadst8_c, vpx_highbd_idct8_c  },  // ADST_DCT          = 1,
-    { vpx_highbd_idct8_c,  vpx_highbd_iadst8_c },  // DCT_ADST          = 2,
-    { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c },  // ADST_ADST         = 3,
+    { vpx_highbd_idct8_c,  vpx_highbd_idct8_c  },  // DCT_DCT
+    { vpx_highbd_iadst8_c, vpx_highbd_idct8_c  },  // ADST_DCT
+    { vpx_highbd_idct8_c,  vpx_highbd_iadst8_c },  // DCT_ADST
+    { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c },  // ADST_ADST
 #if CONFIG_EXT_TX
-    { vpx_highbd_iadst8_c, vpx_highbd_idct8_c  },  // FLIPADST_DCT      = 4,
-    { vpx_highbd_idct8_c,  vpx_highbd_iadst8_c },  // DCT_FLIPADST      = 5,
-    { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c },  // FLIPADST_FLIPADST = 6,
-    { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c },  // ADST_FLIPADST     = 7,
-    { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c },  // FLIPADST_ADST     = 8,
-    {     highbd_idst8_c,  vpx_highbd_idct8_c  },  // DST_DCT           = 9,
-    { vpx_highbd_idct8_c,      highbd_idst8_c  },  // DCT_DST           = 10,
-    {     highbd_idst8_c,  vpx_highbd_iadst8_c },  // DST_ADST          = 11,
-    { vpx_highbd_iadst8_c,     highbd_idst8_c  },  // ADST_DST          = 12,
-    {     highbd_idst8_c,  vpx_highbd_iadst8_c },  // DST_FLIPADST      = 13,
-    { vpx_highbd_iadst8_c,     highbd_idst8_c  },  // FLIPADST_DST      = 14,
-    {     highbd_idst8_c,      highbd_idst8_c  },  // DST_DST           = 15
-    {     highbd_iidtx8_c,     highbd_iidtx8_c },  // IDTX              = 16
-    { vpx_highbd_idct8_c,      highbd_iidtx8_c },  // V_DCT             = 17
-    {     highbd_iidtx8_c, vpx_highbd_idct8_c  },  // H_DCT             = 18
+    { vpx_highbd_iadst8_c, vpx_highbd_idct8_c  },  // FLIPADST_DCT
+    { vpx_highbd_idct8_c,  vpx_highbd_iadst8_c },  // DCT_FLIPADST
+    { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c },  // FLIPADST_FLIPADST
+    { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c },  // ADST_FLIPADST
+    { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c },  // FLIPADST_ADST
+    {     highbd_iidtx8_c,     highbd_iidtx8_c },  // IDTX
+    { vpx_highbd_idct8_c,      highbd_iidtx8_c },  // V_DCT
+    {     highbd_iidtx8_c, vpx_highbd_idct8_c  },  // H_DCT
+    { vpx_highbd_iadst8_c,     highbd_iidtx8_c },  // V_ADST
+    {     highbd_iidtx8_c, vpx_highbd_iadst8_c },  // H_ADST
+    { vpx_highbd_iadst8_c,     highbd_iidtx8_c },  // V_FLIPADST
+    {     highbd_iidtx8_c, vpx_highbd_iadst8_c },  // H_FLIPADST
 #endif  // CONFIG_EXT_TX
   };
 
@@ -1380,26 +1087,23 @@
 void vp10_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int tx_type, int bd) {
   static const highbd_transform_2d HIGH_IHT_16[] = {
-    { vpx_highbd_idct16_c,  vpx_highbd_idct16_c  },  // DCT_DCT           = 0,
-    { vpx_highbd_iadst16_c, vpx_highbd_idct16_c  },  // ADST_DCT          = 1,
-    { vpx_highbd_idct16_c,  vpx_highbd_iadst16_c },  // DCT_ADST          = 2,
-    { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c },  // ADST_ADST         = 3,
+    { vpx_highbd_idct16_c,  vpx_highbd_idct16_c  },  // DCT_DCT
+    { vpx_highbd_iadst16_c, vpx_highbd_idct16_c  },  // ADST_DCT
+    { vpx_highbd_idct16_c,  vpx_highbd_iadst16_c },  // DCT_ADST
+    { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c },  // ADST_ADST
 #if CONFIG_EXT_TX
-    { vpx_highbd_iadst16_c, vpx_highbd_idct16_c  },  // FLIPADST_DCT      = 4,
-    { vpx_highbd_idct16_c,  vpx_highbd_iadst16_c },  // DCT_FLIPADST      = 5,
-    { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c },  // FLIPADST_FLIPADST = 6,
-    { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c },  // ADST_FLIPADST     = 7,
-    { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c },  // FLIPADST_ADST     = 8,
-    {     highbd_idst16_c,  vpx_highbd_idct16_c  },  // DST_DCT           = 9,
-    { vpx_highbd_idct16_c,      highbd_idst16_c  },  // DCT_DST           = 10,
-    {     highbd_idst16_c,  vpx_highbd_iadst16_c },  // DST_ADST          = 11,
-    { vpx_highbd_iadst16_c,     highbd_idst16_c  },  // ADST_DST          = 12,
-    {     highbd_idst16_c,  vpx_highbd_iadst16_c },  // DST_FLIPADST      = 13,
-    { vpx_highbd_iadst16_c,     highbd_idst16_c  },  // FLIPADST_DST      = 14,
-    {     highbd_idst16_c,      highbd_idst16_c  },  // DST_DST           = 15
-    {     highbd_iidtx16_c,     highbd_iidtx16_c },  // IDTX              = 16
-    { vpx_highbd_idct16_c,      highbd_iidtx16_c },  // V_DCT             = 17
-    {     highbd_iidtx16_c, vpx_highbd_idct16_c  },  // H_DCT             = 18
+    { vpx_highbd_iadst16_c, vpx_highbd_idct16_c  },  // FLIPADST_DCT
+    { vpx_highbd_idct16_c,  vpx_highbd_iadst16_c },  // DCT_FLIPADST
+    { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c },  // FLIPADST_FLIPADST
+    { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c },  // ADST_FLIPADST
+    { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c },  // FLIPADST_ADST
+    {     highbd_iidtx16_c,     highbd_iidtx16_c },  // IDTX
+    { vpx_highbd_idct16_c,      highbd_iidtx16_c },  // V_DCT
+    {     highbd_iidtx16_c, vpx_highbd_idct16_c  },  // H_DCT
+    { vpx_highbd_iadst16_c,     highbd_iidtx16_c },  // V_ADST
+    {     highbd_iidtx16_c, vpx_highbd_iadst16_c },  // H_ADST
+    { vpx_highbd_iadst16_c,     highbd_iidtx16_c },  // V_FLIPADST
+    {     highbd_iidtx16_c, vpx_highbd_iadst16_c },  // H_FLIPADST
 #endif  // CONFIG_EXT_TX
   };
 
@@ -1459,16 +1163,13 @@
     { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // FLIPADST_FLIPADST
     { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // ADST_FLIPADST
     { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // FLIPADST_ADST
-    { highbd_ihalfcenter32_c, vpx_highbd_idct32_c    },  // DST_DCT
-    { vpx_highbd_idct32_c,    highbd_ihalfcenter32_c },  // DCT_DST
-    { highbd_ihalfcenter32_c, highbd_ihalfright32_c  },  // DST_ADST
-    { highbd_ihalfright32_c,  highbd_ihalfcenter32_c },  // ADST_DST
-    { highbd_ihalfcenter32_c, highbd_ihalfright32_c  },  // DST_FLIPADST
-    { highbd_ihalfright32_c,  highbd_ihalfcenter32_c },  // FLIPADST_DST
-    { highbd_ihalfcenter32_c, highbd_ihalfcenter32_c },  // DST_DST
-    {     highbd_iidtx32_c,   highbd_iidtx32_c       },  // IDTX
+    { highbd_iidtx32_c,       highbd_iidtx32_c       },  // IDTX
     { vpx_highbd_idct32_c,    highbd_iidtx32_c       },  // V_DCT
-    {     highbd_iidtx32_c,   vpx_highbd_idct32_c    },  // H_DCT
+    { highbd_iidtx32_c,       vpx_highbd_idct32_c    },  // H_DCT
+    { highbd_ihalfright32_c,  highbd_iidtx32_c       },  // V_ADST
+    { highbd_iidtx32_c,       highbd_ihalfright32_c  },  // H_ADST
+    { highbd_ihalfright32_c,  highbd_iidtx32_c       },  // V_FLIPADST
+    { highbd_iidtx32_c,       highbd_ihalfright32_c  },  // H_FLIPADST
   };
 
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
@@ -1602,15 +1303,12 @@
     case FLIPADST_ADST:
       vp10_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
       break;
-    case DST_DST:
-    case DST_DCT:
-    case DCT_DST:
-    case DST_ADST:
-    case ADST_DST:
-    case FLIPADST_DST:
-    case DST_FLIPADST:
-    case H_DCT:
     case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
       // Use C version since DST only exists in C code
       vp10_highbd_iht4x4_16_add_c(input, dest, stride, tx_type, bd);
       break;
@@ -1644,15 +1342,12 @@
     case FLIPADST_ADST:
       vp10_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
       break;
-    case DST_DST:
-    case DST_DCT:
-    case DCT_DST:
-    case DST_ADST:
-    case ADST_DST:
-    case FLIPADST_DST:
-    case DST_FLIPADST:
-    case H_DCT:
     case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
       // Use C version since DST only exists in C code
       vp10_highbd_iht8x8_64_add_c(input, dest, stride, tx_type, bd);
       break;
@@ -1686,15 +1381,12 @@
     case FLIPADST_ADST:
       vp10_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
       break;
-    case DST_DST:
-    case DST_DCT:
-    case DCT_DST:
-    case DST_ADST:
-    case ADST_DST:
-    case FLIPADST_DST:
-    case DST_FLIPADST:
-    case H_DCT:
     case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
       // Use C version since DST only exists in C code
       vp10_highbd_iht16x16_256_add_c(input, dest, stride, tx_type, bd);
       break;
@@ -1724,15 +1416,12 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-    case DST_DST:
-    case DST_DCT:
-    case DCT_DST:
-    case DST_ADST:
-    case ADST_DST:
-    case FLIPADST_DST:
-    case DST_FLIPADST:
-    case H_DCT:
     case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
       vp10_highbd_iht32x32_1024_add_c(input, dest, stride, tx_type, bd);
       break;
     case IDTX:
diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index d9891bb..25941d0 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -1276,9 +1276,9 @@
 #if CONFIG_VAR_TX
       if (is_inter_block(mbmi) && !mbmi->skip)
         tx_size = (plane->plane_type == PLANE_TYPE_UV) ?
-            get_uv_tx_size_impl(mbmi->inter_tx_size[blk_row * 8 + blk_col],
+            get_uv_tx_size_impl(mbmi->inter_tx_size[blk_row][ blk_col],
                                 sb_type, ss_x, ss_y) :
-            mbmi->inter_tx_size[blk_row * 8 + blk_col];
+            mbmi->inter_tx_size[blk_row][blk_col];
 
       tx_size_r = VPXMIN(tx_size, cm->above_txfm_context[mi_col + c]);
       tx_size_c = VPXMIN(tx_size, cm->left_txfm_context[(mi_row + r) & 0x07]);
@@ -1634,7 +1634,7 @@
                            int start, int stop, int y_only) {
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
   int mi_row, mi_col;
-#if !CONFIG_VAR_TX
+#if !CONFIG_VAR_TX && !CONFIG_EXT_PARTITION_TYPES
   enum lf_path path;
   LOOP_FILTER_MASK lfm;
 
@@ -1646,7 +1646,7 @@
     path = LF_PATH_444;
   else
     path = LF_PATH_SLOW;
-#endif
+#endif  // !CONFIG_VAR_TX && !CONFIG_EXT_PARTITION_TYPES
 
 #if CONFIG_VAR_TX
   memset(cm->above_txfm_context, TX_SIZES, cm->mi_cols);
@@ -1661,7 +1661,7 @@
 
       vp10_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
 
-#if CONFIG_VAR_TX
+#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION_TYPES
       for (plane = 0; plane < num_planes; ++plane)
         vp10_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
                                        mi_row, mi_col);
@@ -1684,7 +1684,7 @@
             break;
         }
       }
-#endif
+#endif  // CONFIG_VAR_TX || CONFIG_EXT_PARTITION_TYPES
     }
   }
 }
diff --git a/vp10/common/mvref_common.c b/vp10/common/mvref_common.c
index c67beed..2a8bc78 100644
--- a/vp10/common/mvref_common.c
+++ b/vp10/common/mvref_common.c
@@ -246,31 +246,51 @@
   return newmv_count;
 }
 
+// This function assumes MI blocks are 8x8 and coding units are 64x64
 static int has_top_right(const MACROBLOCKD *xd,
                          int mi_row, int mi_col, int bs) {
+  // In a split partition, all blocks apart from the bottom right have a top right
   int has_tr = !((mi_row & bs) & (bs * 2 - 1)) ||
                !((mi_col & bs) & (bs * 2 - 1));
 
   // Filter out partial right-most boundaries
+  // For each 4x4 group of blocks, when the bottom right is decoded the blocks
+  // to the right have not yet been decoded; therefore the second from bottom in
+  // the right-most column does not have a top right
   if ((mi_col & bs) & (bs * 2 - 1)) {
     if (((mi_col & (2 * bs)) & (bs * 4 - 1)) &&
         ((mi_row & (2 * bs)) & (bs * 4 - 1)))
       has_tr = 0;
   }
 
+  // If the right hand side of the block lines up with the right hand edge of
+  // a group of 8x8 MI blocks (i.e. edge of a coding unit) and is not on the top
+  // row of that coding unit, it does not have a top right
   if (has_tr)
     if (((mi_col + xd->n8_w) & 0x07) == 0)
       if ((mi_row & 0x07) > 0)
         has_tr = 0;
 
+  // The left hand of two vertical rectangles always has a top right (as the
+  // block above will have been decoded)
   if (xd->n8_w < xd->n8_h)
     if (!xd->is_sec_rect)
       has_tr = 1;
 
+  // The bottom of two horizontal rectangles never has a top right (as the block
+  // to the right won't have been decoded)
   if (xd->n8_w > xd->n8_h)
     if (xd->is_sec_rect)
       has_tr = 0;
 
+#if CONFIG_EXT_PARTITION_TYPES
+  // The bottom left square of a Vertical A does not have a top right as it is
+  // decoded before the right hand rectangle of the partition
+  if (xd->mi[0]->mbmi.partition == PARTITION_VERT_A)
+    if ((mi_row & bs) && !(mi_col & bs))
+      has_tr = 0;
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
   return has_tr;
 }
 
@@ -749,6 +769,10 @@
 
 void vp10_append_sub8x8_mvs_for_idx(VP10_COMMON *cm, MACROBLOCKD *xd,
                                     int block, int ref, int mi_row, int mi_col,
+#if CONFIG_REF_MV
+                                    CANDIDATE_MV *ref_mv_stack,
+                                    uint8_t *ref_mv_count,
+#endif
 #if CONFIG_EXT_INTER
                                     int_mv *mv_list,
 #endif  // CONFIG_EXT_INTER
@@ -760,11 +784,11 @@
   b_mode_info *bmi = mi->bmi;
   int n;
 #if CONFIG_REF_MV
-  CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE];
   CANDIDATE_MV tmp_mv;
-  uint8_t ref_mv_count = 0, idx;
+  uint8_t idx;
   uint8_t above_count = 0, left_count = 0;
   MV_REFERENCE_FRAME rf[2] = { mi->mbmi.ref_frame[ref], NONE };
+  *ref_mv_count = 0;
 #endif
 
   assert(MAX_MV_REF_CANDIDATES == 2);
@@ -774,12 +798,12 @@
 
 #if CONFIG_REF_MV
   scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf,
-                -1, 0, ref_mv_stack, &ref_mv_count);
-  above_count = ref_mv_count;
+                -1, 0, ref_mv_stack, ref_mv_count);
+  above_count = *ref_mv_count;
 
   scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf,
-                0, -1, ref_mv_stack, &ref_mv_count);
-  left_count = ref_mv_count - above_count;
+                0, -1, ref_mv_stack, ref_mv_count);
+  left_count = *ref_mv_count - above_count;
 
   if (above_count > 1 && left_count > 0) {
     tmp_mv = ref_mv_stack[1];
@@ -787,7 +811,7 @@
     ref_mv_stack[above_count] = tmp_mv;
   }
 
-  for (idx = 0; idx < VPXMIN(MAX_MV_REF_CANDIDATES, ref_mv_count); ++idx) {
+  for (idx = 0; idx < VPXMIN(MAX_MV_REF_CANDIDATES, *ref_mv_count); ++idx) {
     mv_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int;
     clamp_mv_ref(&mv_list[idx].as_mv,
                  xd->n8_w << 3, xd->n8_h << 3, xd);
diff --git a/vp10/common/mvref_common.h b/vp10/common/mvref_common.h
index bc6d824..76530e9 100644
--- a/vp10/common/mvref_common.h
+++ b/vp10/common/mvref_common.h
@@ -289,16 +289,24 @@
 static INLINE uint8_t vp10_drl_ctx(const CANDIDATE_MV *ref_mv_stack,
                                    int ref_idx) {
   if (ref_mv_stack[ref_idx].weight > REF_CAT_LEVEL &&
-      ref_mv_stack[ref_idx + 1].weight > REF_CAT_LEVEL)
-    return 0;
+      ref_mv_stack[ref_idx + 1].weight > REF_CAT_LEVEL) {
+    if (ref_mv_stack[ref_idx].weight == ref_mv_stack[ref_idx + 1].weight)
+      return 0;
+    else
+      return 1;
+  }
 
   if (ref_mv_stack[ref_idx].weight > REF_CAT_LEVEL &&
       ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL)
-    return 1;
+    return 2;
 
   if (ref_mv_stack[ref_idx].weight < REF_CAT_LEVEL &&
-      ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL)
-    return 2;
+      ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL) {
+    if (ref_mv_stack[ref_idx].weight == ref_mv_stack[ref_idx + 1].weight)
+      return 3;
+    else
+      return 4;
+  }
 
   assert(0);
   return 0;
@@ -327,6 +335,10 @@
 
 void vp10_append_sub8x8_mvs_for_idx(VP10_COMMON *cm, MACROBLOCKD *xd,
                                     int block, int ref, int mi_row, int mi_col,
+#if CONFIG_REF_MV
+                                    CANDIDATE_MV *ref_mv_stack,
+                                    uint8_t *ref_mv_count,
+#endif
 #if CONFIG_EXT_INTER
                                     int_mv *mv_list,
 #endif  // CONFIG_EXT_INTER
diff --git a/vp10/common/onyxc_int.h b/vp10/common/onyxc_int.h
index bcc69f3..2dd09b5 100644
--- a/vp10/common/onyxc_int.h
+++ b/vp10/common/onyxc_int.h
@@ -313,7 +313,7 @@
   BufferPool *buffer_pool;
 
   PARTITION_CONTEXT *above_seg_context;
-  ENTROPY_CONTEXT *above_context;
+  ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
 #if CONFIG_VAR_TX
   TXFM_CONTEXT *above_txfm_context;
   TXFM_CONTEXT left_txfm_context[8];
@@ -405,9 +405,7 @@
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     xd->plane[i].dqcoeff = dqcoeff;
-    xd->above_context[i] = cm->above_context +
-        i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols);
-
+    xd->above_context[i] = cm->above_context[i];
     if (xd->plane[i].plane_type == PLANE_TYPE_Y) {
       memcpy(xd->plane[i].seg_dequant, cm->y_dequant, sizeof(cm->y_dequant));
     } else {
@@ -501,6 +499,12 @@
   PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col;
   PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
 
+#if CONFIG_EXT_PARTITION_TYPES
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  memset(above_ctx, partition_context_lookup[subsize].above, bw);
+  memset(left_ctx, partition_context_lookup[subsize].left, bh);
+#else
   // num_4x4_blocks_wide_lookup[bsize] / 2
   const int bs = num_8x8_blocks_wide_lookup[bsize];
 
@@ -509,8 +513,50 @@
   // bits of smaller block sizes to be zero.
   memset(above_ctx, partition_context_lookup[subsize].above, bs);
   memset(left_ctx, partition_context_lookup[subsize].left, bs);
+#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
+#if CONFIG_EXT_PARTITION_TYPES
+static INLINE void update_ext_partition_context(MACROBLOCKD *xd,
+                                                int mi_row, int mi_col,
+                                                BLOCK_SIZE subsize,
+                                                BLOCK_SIZE bsize,
+                                                PARTITION_TYPE partition) {
+  if (bsize >= BLOCK_8X8) {
+    const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+    BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+    switch (partition) {
+      case PARTITION_SPLIT:
+        if (bsize != BLOCK_8X8)
+          break;
+      case PARTITION_NONE:
+      case PARTITION_HORZ:
+      case PARTITION_VERT:
+        update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+        break;
+      case PARTITION_HORZ_A:
+        update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+        update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize);
+        break;
+      case PARTITION_HORZ_B:
+        update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+        update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize);
+        break;
+      case PARTITION_VERT_A:
+        update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+        update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize);
+        break;
+      case PARTITION_VERT_B:
+        update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+        update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize);
+        break;
+      default:
+        assert(0 && "Invalid partition type");
+    }
+  }
+}
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
 static INLINE int partition_plane_context(const MACROBLOCKD *xd,
                                           int mi_row, int mi_col,
                                           BLOCK_SIZE bsize) {
@@ -525,6 +571,27 @@
   return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
 }
 
+static INLINE void vp10_zero_above_context(VP10_COMMON *const cm,
+                             int mi_col_start, int mi_col_end) {
+  const int width = mi_col_end - mi_col_start;
+  int i;
+
+  for (i = 0 ; i < MAX_MB_PLANE ; i++)
+    vp10_zero_array(cm->above_context[i] + 2 * mi_col_start, 2 * width);
+  vp10_zero_array(cm->above_seg_context + mi_col_start, width);
+#if CONFIG_VAR_TX
+  vp10_zero_array(cm->above_txfm_context + mi_col_start, width);
+#endif  // CONFIG_VAR_TX
+}
+
+static INLINE void vp10_zero_left_context(MACROBLOCKD *const xd) {
+  vp10_zero(xd->left_context);
+  vp10_zero(xd->left_seg_context);
+#if CONFIG_VAR_TX
+  vp10_zero(xd->left_txfm_context_buffer);
+#endif
+}
+
 #if CONFIG_VAR_TX
 static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx,
                                 TX_SIZE tx_size,
diff --git a/vp10/common/pred_common.h b/vp10/common/pred_common.h
index 83a3597..385a3e1 100644
--- a/vp10/common/pred_common.h
+++ b/vp10/common/pred_common.h
@@ -192,9 +192,9 @@
                              TX_SIZE max_tx_size, int ctx) {
   const struct macroblockd_plane *const pd = &xd->plane[0];
   const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-  int tx_idx = (blk_row >> (1 - pd->subsampling_y)) * 8 +
-               (blk_col >> (1 - pd->subsampling_x));
-  TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_idx];
+  const int tx_row = blk_row >> (1 - pd->subsampling_y);
+  const int tx_col = blk_col >> (1 - pd->subsampling_x);
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
   int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
   int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
 
diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c
index 174ff80..5175389 100644
--- a/vp10/common/reconinter.c
+++ b/vp10/common/reconinter.c
@@ -1372,6 +1372,189 @@
     }
   }  // each mi in the left column
 }
+
+void vp10_build_prediction_by_above_preds(VP10_COMMON *cm,
+                                          MACROBLOCKD *xd,
+                                          int mi_row, int mi_col,
+                                          uint8_t *tmp_buf[MAX_MB_PLANE],
+                                          int tmp_stride[MAX_MB_PLANE]) {
+  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  int i, j, mi_step, ref;
+
+  if (mi_row == 0)
+    return;
+
+  for (i = 0; i < VPXMIN(xd->n8_w, cm->mi_cols - mi_col); i += mi_step) {
+    int mi_row_offset = -1;
+    int mi_col_offset = i;
+    int mi_x, mi_y, bw, bh;
+    MODE_INFO *above_mi = xd->mi[mi_col_offset +
+                                 mi_row_offset * xd->mi_stride];
+    MB_MODE_INFO *above_mbmi = &above_mi->mbmi;
+
+    mi_step = VPXMIN(xd->n8_w,
+                     num_8x8_blocks_wide_lookup[above_mbmi->sb_type]);
+
+    if (!is_neighbor_overlappable(above_mbmi))
+      continue;
+
+    for (j = 0; j < MAX_MB_PLANE; ++j) {
+      struct macroblockd_plane *const pd = &xd->plane[j];
+      setup_pred_plane(&pd->dst,
+                       tmp_buf[j], tmp_stride[j],
+                       0, i, NULL,
+                       pd->subsampling_x, pd->subsampling_y);
+    }
+    for (ref = 0; ref < 1 + has_second_ref(above_mbmi); ++ref) {
+      MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
+      RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
+
+      xd->block_refs[ref] = ref_buf;
+      if ((!vp10_is_valid_scale(&ref_buf->sf)))
+        vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
+                           "Reference frame has invalid dimensions");
+      vp10_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col + i,
+                            &ref_buf->sf);
+    }
+
+    xd->mb_to_left_edge   = -(((mi_col + i) * MI_SIZE) * 8);
+    mi_x = (mi_col + i) << MI_SIZE_LOG2;
+    mi_y = mi_row << MI_SIZE_LOG2;
+
+    for (j = 0; j < MAX_MB_PLANE; ++j) {
+      const struct macroblockd_plane *pd = &xd->plane[j];
+      bw = (mi_step * 8) >> pd->subsampling_x;
+      bh = VPXMAX((num_4x4_blocks_high_lookup[bsize] * 2) >> pd->subsampling_y,
+                  4);
+
+      if (above_mbmi->sb_type < BLOCK_8X8) {
+        const PARTITION_TYPE bp = BLOCK_8X8 - above_mbmi->sb_type;
+        const int have_vsplit = bp != PARTITION_HORZ;
+        const int have_hsplit = bp != PARTITION_VERT;
+        const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
+        const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
+        const int pw = 8 >> (have_vsplit | pd->subsampling_x);
+        int x, y;
+
+        for (y = 0; y < num_4x4_h; ++y)
+          for (x = 0; x < num_4x4_w; ++x) {
+            if ((bp == PARTITION_HORZ || bp == PARTITION_SPLIT)
+                && y == 0 && !pd->subsampling_y)
+              continue;
+
+            build_inter_predictors(xd, j, mi_col_offset, mi_row_offset,
+                                   y * 2 + x, bw, bh,
+                                   4 * x, 0, pw, bh,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+                                   0, 0,
+#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
+                                   mi_x, mi_y);
+          }
+      } else {
+        build_inter_predictors(xd, j, mi_col_offset, mi_row_offset,
+                               0, bw, bh, 0, 0, bw, bh,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+                               0, 0,
+#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
+                               mi_x, mi_y);
+      }
+    }
+  }
+  xd->mb_to_left_edge   = -((mi_col * MI_SIZE) * 8);
+}
+
+void vp10_build_prediction_by_left_preds(VP10_COMMON *cm,
+                                         MACROBLOCKD *xd,
+                                         int mi_row, int mi_col,
+                                         uint8_t *tmp_buf[MAX_MB_PLANE],
+                                         int tmp_stride[MAX_MB_PLANE]) {
+  const TileInfo *const tile = &xd->tile;
+  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  int i, j, mi_step, ref;
+
+  if (mi_col == 0 || (mi_col - 1 < tile->mi_col_start) ||
+      (mi_col - 1) >= tile->mi_col_end)
+    return;
+
+  for (i = 0; i < VPXMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
+    int mi_row_offset = i;
+    int mi_col_offset = -1;
+    int mi_x, mi_y, bw, bh;
+    MODE_INFO *left_mi = xd->mi[mi_col_offset +
+                                mi_row_offset * xd->mi_stride];
+    MB_MODE_INFO *left_mbmi = &left_mi->mbmi;
+    const int is_compound = has_second_ref(left_mbmi);
+
+    mi_step = VPXMIN(xd->n8_h,
+                     num_8x8_blocks_high_lookup[left_mbmi->sb_type]);
+
+    if (!is_neighbor_overlappable(left_mbmi))
+      continue;
+
+    for (j = 0; j < MAX_MB_PLANE; ++j) {
+      struct macroblockd_plane *const pd = &xd->plane[j];
+      setup_pred_plane(&pd->dst,
+                       tmp_buf[j], tmp_stride[j],
+                       i, 0, NULL,
+                       pd->subsampling_x, pd->subsampling_y);
+    }
+    for (ref = 0; ref < 1 + is_compound; ++ref) {
+      MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref];
+      RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
+
+      xd->block_refs[ref] = ref_buf;
+      if ((!vp10_is_valid_scale(&ref_buf->sf)))
+        vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
+                           "Reference frame has invalid dimensions");
+      vp10_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + i, mi_col,
+                            &ref_buf->sf);
+    }
+
+    xd->mb_to_top_edge    = -(((mi_row + i) * MI_SIZE) * 8);
+    mi_x = mi_col << MI_SIZE_LOG2;
+    mi_y = (mi_row + i) << MI_SIZE_LOG2;
+
+    for (j = 0; j < MAX_MB_PLANE; ++j) {
+      const struct macroblockd_plane *pd = &xd->plane[j];
+      bw = VPXMAX((num_4x4_blocks_wide_lookup[bsize] * 2) >> pd->subsampling_x,
+                  4);
+      bh = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y;
+
+      if (left_mbmi->sb_type < BLOCK_8X8) {
+        const PARTITION_TYPE bp = BLOCK_8X8 - left_mbmi->sb_type;
+        const int have_vsplit = bp != PARTITION_HORZ;
+        const int have_hsplit = bp != PARTITION_VERT;
+        const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
+        const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
+        const int ph = 8 >> (have_hsplit | pd->subsampling_y);
+        int x, y;
+
+        for (y = 0; y < num_4x4_h; ++y)
+          for (x = 0; x < num_4x4_w; ++x) {
+            if ((bp == PARTITION_VERT || bp == PARTITION_SPLIT)
+                && x == 0 && !pd->subsampling_x)
+              continue;
+
+            build_inter_predictors(xd, j, mi_col_offset, mi_row_offset,
+                                   y * 2 + x, bw, bh,
+                                   0, 4 * y, bw, ph,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+                                   0, 0,
+#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
+                                   mi_x, mi_y);
+          }
+      } else {
+        build_inter_predictors(xd, j, mi_col_offset, mi_row_offset, 0,
+                               bw, bh, 0, 0, bw, bh,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+                               0, 0,
+#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
+                               mi_x, mi_y);
+      }
+    }
+  }
+  xd->mb_to_top_edge    = -((mi_row * MI_SIZE) * 8);
+}
 #endif  // CONFIG_OBMC
 
 #if CONFIG_EXT_INTER
diff --git a/vp10/common/reconinter.h b/vp10/common/reconinter.h
index c6e89df..4dcd203 100644
--- a/vp10/common/reconinter.h
+++ b/vp10/common/reconinter.h
@@ -430,6 +430,16 @@
                                       int tmp_stride1[MAX_MB_PLANE],
                                       uint8_t *tmp_buf2[MAX_MB_PLANE],
                                       int tmp_stride2[MAX_MB_PLANE]);
+void vp10_build_prediction_by_above_preds(VP10_COMMON *cm,
+                                          MACROBLOCKD *xd,
+                                          int mi_row, int mi_col,
+                                          uint8_t *tmp_buf[MAX_MB_PLANE],
+                                          int tmp_stride[MAX_MB_PLANE]);
+void vp10_build_prediction_by_left_preds(VP10_COMMON *cm,
+                                         MACROBLOCKD *xd,
+                                         int mi_row, int mi_col,
+                                         uint8_t *tmp_buf[MAX_MB_PLANE],
+                                         int tmp_stride[MAX_MB_PLANE]);
 #endif  // CONFIG_OBMC
 
 #if CONFIG_EXT_INTER
diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c
index e28f01c..10a66f8 100644
--- a/vp10/common/reconintra.c
+++ b/vp10/common/reconintra.c
@@ -99,9 +99,40 @@
   orders_16x32, orders_32x16, orders_32x32,
   orders_32x64, orders_64x32, orders_64x64,
 };
+#if CONFIG_EXT_PARTITION_TYPES
+static const uint8_t orders_verta_32x32[4] = {
+  0, 2,
+  1, 2,
+};
+static const uint8_t orders_verta_16x16[16] = {
+  0,   2,  4,  6,
+  1,   2,  5,  6,
+  8,  10, 12, 14,
+  9,  10, 13, 14,
+};
+static const uint8_t orders_verta_8x8[64] = {
+  0,   2,  4,  6, 16, 18, 20, 22,
+  1,   2,  5,  6, 17, 18, 21, 22,
+  8,  10, 12, 14, 24, 26, 28, 30,
+  9,  10, 13, 14, 25, 26, 29, 30,
+  32, 34, 36, 38, 48, 50, 52, 54,
+  33, 34, 37, 38, 49, 50, 53, 54,
+  40, 42, 44, 46, 56, 58, 60, 62,
+  41, 42, 45, 46, 57, 58, 61, 62,
+};
+static const uint8_t *const orders_verta[BLOCK_SIZES] = {
+  orders_verta_8x8, orders_verta_8x8, orders_verta_8x8, orders_verta_8x8,
+  orders_8x16, orders_16x8, orders_verta_16x16,
+  orders_16x32, orders_32x16, orders_verta_32x32,
+  orders_32x64, orders_64x32, orders_64x64,
+};
+#endif  // CONFIG_EXT_PARTITION_TYPES
 
 static int vp10_has_right(BLOCK_SIZE bsize, int mi_row, int mi_col,
                           int right_available,
+#if CONFIG_EXT_PARTITION_TYPES
+                          PARTITION_TYPE partition,
+#endif
                           TX_SIZE txsz, int y, int x, int ss_x) {
   const int wl = mi_width_log2_lookup[bsize];
   const int w = VPXMAX(num_4x4_blocks_wide_lookup[bsize] >> ss_x, 1);
@@ -113,8 +144,14 @@
 
   if (y == 0) {
     const int hl = mi_height_log2_lookup[bsize];
-    const uint8_t *order = orders[bsize];
+    const uint8_t *order;
     int my_order, tr_order;
+#if CONFIG_EXT_PARTITION_TYPES
+    if (partition == PARTITION_VERT_A)
+      order = orders_verta[bsize];
+    else
+#endif  // CONFIG_EXT_PARTITION_TYPES
+    order = orders[bsize];
 
     if (x + step < w)
       return 1;
@@ -122,9 +159,11 @@
     mi_row = (mi_row & 7) >> hl;
     mi_col = (mi_col & 7) >> wl;
 
+    // If top row of coding unit
     if (mi_row == 0)
       return right_available;
 
+    // If rightmost column of coding unit
     if (((mi_col + 1) << wl) >= 8)
       return 0;
 
@@ -1346,8 +1385,14 @@
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const int right_available =
       mi_col + (1 << mi_width_log2_lookup[bsize]) < xd->tile.mi_col_end;
+#if CONFIG_EXT_PARTITION_TYPES
+  const PARTITION_TYPE partition = xd->mi[0]->mbmi.partition;
+#endif
   const int have_right = vp10_has_right(bsize, mi_row, mi_col,
                                         right_available,
+#if CONFIG_EXT_PARTITION_TYPES
+                                        partition,
+#endif
                                         tx_size, row_off, col_off,
                                         pd->subsampling_x);
   const int have_bottom = vp10_has_bottom(bsize, mi_row, mi_col,
diff --git a/vp10/common/scan.c b/vp10/common/scan.c
index 6dc5604..2644ecf 100644
--- a/vp10/common/scan.c
+++ b/vp10/common/scan.c
@@ -2882,13 +2882,10 @@
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
-    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
-    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
-    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
-    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
-    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
-    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
-    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {row_scan_4x4,     vp10_row_iscan_4x4,     row_scan_4x4_neighbors},
+    {col_scan_4x4,     vp10_col_iscan_4x4,     col_scan_4x4_neighbors},
+    {row_scan_4x4,     vp10_row_iscan_4x4,     row_scan_4x4_neighbors},
+    {col_scan_4x4,     vp10_col_iscan_4x4,     col_scan_4x4_neighbors},
     {row_scan_4x4,     vp10_row_iscan_4x4,     row_scan_4x4_neighbors},
     {col_scan_4x4,     vp10_col_iscan_4x4,     col_scan_4x4_neighbors},
   }, {  // TX_8X8
@@ -2902,13 +2899,10 @@
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
-    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
-    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
-    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
-    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
-    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
-    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
-    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {row_scan_8x8,     vp10_row_iscan_8x8,     row_scan_8x8_neighbors},
+    {col_scan_8x8,     vp10_col_iscan_8x8,     col_scan_8x8_neighbors},
+    {row_scan_8x8,     vp10_row_iscan_8x8,     row_scan_8x8_neighbors},
+    {col_scan_8x8,     vp10_col_iscan_8x8,     col_scan_8x8_neighbors},
     {row_scan_8x8,     vp10_row_iscan_8x8,     row_scan_8x8_neighbors},
     {col_scan_8x8,     vp10_col_iscan_8x8,     col_scan_8x8_neighbors},
   }, {  // TX_16X16
@@ -2930,22 +2924,12 @@
      default_scan_16x16_neighbors},
     {default_scan_16x16, vp10_default_iscan_16x16,
      default_scan_16x16_neighbors},
-    {default_scan_16x16, vp10_default_iscan_16x16,
-     default_scan_16x16_neighbors},
-    {default_scan_16x16, vp10_default_iscan_16x16,
-     default_scan_16x16_neighbors},
-    {default_scan_16x16, vp10_default_iscan_16x16,
-     default_scan_16x16_neighbors},
-    {default_scan_16x16, vp10_default_iscan_16x16,
-     default_scan_16x16_neighbors},
-    {default_scan_16x16, vp10_default_iscan_16x16,
-     default_scan_16x16_neighbors},
-    {default_scan_16x16, vp10_default_iscan_16x16,
-     default_scan_16x16_neighbors},
-    {default_scan_16x16, vp10_default_iscan_16x16,
-     default_scan_16x16_neighbors},
-     {row_scan_16x16,     vp10_row_iscan_16x16,     row_scan_16x16_neighbors},
-     {col_scan_16x16,     vp10_col_iscan_16x16,     col_scan_16x16_neighbors},
+    {row_scan_16x16,     vp10_row_iscan_16x16,     row_scan_16x16_neighbors},
+    {col_scan_16x16,     vp10_col_iscan_16x16,     col_scan_16x16_neighbors},
+    {row_scan_16x16,     vp10_row_iscan_16x16,     row_scan_16x16_neighbors},
+    {col_scan_16x16,     vp10_col_iscan_16x16,     col_scan_16x16_neighbors},
+    {row_scan_16x16,     vp10_row_iscan_16x16,     row_scan_16x16_neighbors},
+    {col_scan_16x16,     vp10_col_iscan_16x16,     col_scan_16x16_neighbors},
   }, {  // TX_32X32
     {default_scan_32x32, vp10_default_iscan_32x32,
      default_scan_32x32_neighbors},
@@ -2965,26 +2949,14 @@
      qtr_scan_32x32_neighbors},
     {qtr_scan_32x32, vp10_qtr_iscan_32x32,
      qtr_scan_32x32_neighbors},
-    {h2_scan_32x32, vp10_h2_iscan_32x32,
-     h2_scan_32x32_neighbors},
-    {v2_scan_32x32, vp10_v2_iscan_32x32,
-     v2_scan_32x32_neighbors},
-    {qtr_scan_32x32, vp10_qtr_iscan_32x32,
-     qtr_scan_32x32_neighbors},
-    {qtr_scan_32x32, vp10_qtr_iscan_32x32,
-     qtr_scan_32x32_neighbors},
-    {qtr_scan_32x32, vp10_qtr_iscan_32x32,
-     qtr_scan_32x32_neighbors},
-    {qtr_scan_32x32, vp10_qtr_iscan_32x32,
-     qtr_scan_32x32_neighbors},
-    {qtr_scan_32x32, vp10_qtr_iscan_32x32,
-     qtr_scan_32x32_neighbors},
     {default_scan_32x32, vp10_default_iscan_32x32,
      default_scan_32x32_neighbors},
-     {h2_scan_32x32, vp10_h2_iscan_32x32,
-      h2_scan_32x32_neighbors},
-     {v2_scan_32x32, vp10_v2_iscan_32x32,
-      v2_scan_32x32_neighbors},
+    {mrow_scan_32x32,  vp10_mrow_iscan_32x32,  mrow_scan_32x32_neighbors},
+    {mcol_scan_32x32,  vp10_mcol_iscan_32x32,  mcol_scan_32x32_neighbors},
+    {mrow_scan_32x32,  vp10_mrow_iscan_32x32,  mrow_scan_32x32_neighbors},
+    {mcol_scan_32x32,  vp10_mcol_iscan_32x32,  mcol_scan_32x32_neighbors},
+    {mrow_scan_32x32,  vp10_mrow_iscan_32x32,  mrow_scan_32x32_neighbors},
+    {mcol_scan_32x32,  vp10_mcol_iscan_32x32,  mcol_scan_32x32_neighbors},
   }
 };
 
@@ -3000,13 +2972,10 @@
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
-    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
-    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
-    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
-    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
-    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
-    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
-    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {mrow_scan_4x4,    vp10_mrow_iscan_4x4,    mrow_scan_4x4_neighbors},
+    {mcol_scan_4x4,    vp10_mcol_iscan_4x4,    mcol_scan_4x4_neighbors},
+    {mrow_scan_4x4,    vp10_mrow_iscan_4x4,    mrow_scan_4x4_neighbors},
+    {mcol_scan_4x4,    vp10_mcol_iscan_4x4,    mcol_scan_4x4_neighbors},
     {mrow_scan_4x4,    vp10_mrow_iscan_4x4,    mrow_scan_4x4_neighbors},
     {mcol_scan_4x4,    vp10_mcol_iscan_4x4,    mcol_scan_4x4_neighbors},
   }, {  // TX_8X8
@@ -3020,13 +2989,10 @@
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
-    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
-    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
-    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
-    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
-    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
-    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
-    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {mrow_scan_8x8,    vp10_mrow_iscan_8x8,    mrow_scan_8x8_neighbors},
+    {mcol_scan_8x8,    vp10_mcol_iscan_8x8,    mcol_scan_8x8_neighbors},
+    {mrow_scan_8x8,    vp10_mrow_iscan_8x8,    mrow_scan_8x8_neighbors},
+    {mcol_scan_8x8,    vp10_mcol_iscan_8x8,    mcol_scan_8x8_neighbors},
     {mrow_scan_8x8,    vp10_mrow_iscan_8x8,    mrow_scan_8x8_neighbors},
     {mcol_scan_8x8,    vp10_mcol_iscan_8x8,    mcol_scan_8x8_neighbors},
   }, {  // TX_16X16
@@ -3050,22 +3016,12 @@
      default_scan_16x16_neighbors},
     {default_scan_16x16, vp10_default_iscan_16x16,
      default_scan_16x16_neighbors},
-    {default_scan_16x16, vp10_default_iscan_16x16,
-     default_scan_16x16_neighbors},
-    {default_scan_16x16, vp10_default_iscan_16x16,
-     default_scan_16x16_neighbors},
-    {default_scan_16x16, vp10_default_iscan_16x16,
-     default_scan_16x16_neighbors},
-    {default_scan_16x16, vp10_default_iscan_16x16,
-     default_scan_16x16_neighbors},
-    {default_scan_16x16, vp10_default_iscan_16x16,
-     default_scan_16x16_neighbors},
-    {default_scan_16x16, vp10_default_iscan_16x16,
-     default_scan_16x16_neighbors},
-    {default_scan_16x16, vp10_default_iscan_16x16,
-     default_scan_16x16_neighbors},
-     {mrow_scan_16x16,  vp10_mrow_iscan_16x16,  mrow_scan_16x16_neighbors},
-     {mcol_scan_16x16,  vp10_mcol_iscan_16x16,  mcol_scan_16x16_neighbors},
+    {mrow_scan_16x16,  vp10_mrow_iscan_16x16,  mrow_scan_16x16_neighbors},
+    {mcol_scan_16x16,  vp10_mcol_iscan_16x16,  mcol_scan_16x16_neighbors},
+    {mrow_scan_16x16,  vp10_mrow_iscan_16x16,  mrow_scan_16x16_neighbors},
+    {mcol_scan_16x16,  vp10_mcol_iscan_16x16,  mcol_scan_16x16_neighbors},
+    {mrow_scan_16x16,  vp10_mrow_iscan_16x16,  mrow_scan_16x16_neighbors},
+    {mcol_scan_16x16,  vp10_mcol_iscan_16x16,  mcol_scan_16x16_neighbors},
   }, {  // TX_32X32
     {default_scan_32x32, vp10_default_iscan_32x32,
      default_scan_32x32_neighbors},
@@ -3085,24 +3041,14 @@
      qtr_scan_32x32_neighbors},
     {qtr_scan_32x32, vp10_qtr_iscan_32x32,
      qtr_scan_32x32_neighbors},
-    {h2_scan_32x32, vp10_h2_iscan_32x32,
-     h2_scan_32x32_neighbors},
-    {v2_scan_32x32, vp10_v2_iscan_32x32,
-     v2_scan_32x32_neighbors},
-    {qtr_scan_32x32, vp10_qtr_iscan_32x32,
-     qtr_scan_32x32_neighbors},
-    {qtr_scan_32x32, vp10_qtr_iscan_32x32,
-     qtr_scan_32x32_neighbors},
-    {qtr_scan_32x32, vp10_qtr_iscan_32x32,
-     qtr_scan_32x32_neighbors},
-    {qtr_scan_32x32, vp10_qtr_iscan_32x32,
-     qtr_scan_32x32_neighbors},
-    {qtr_scan_32x32, vp10_qtr_iscan_32x32,
-     qtr_scan_32x32_neighbors},
     {default_scan_32x32, vp10_default_iscan_32x32,
      default_scan_32x32_neighbors},
     {mrow_scan_32x32,  vp10_mrow_iscan_32x32,  mrow_scan_32x32_neighbors},
     {mcol_scan_32x32,  vp10_mcol_iscan_32x32,  mcol_scan_32x32_neighbors},
+    {mrow_scan_32x32,  vp10_mrow_iscan_32x32,  mrow_scan_32x32_neighbors},
+    {mcol_scan_32x32,  vp10_mcol_iscan_32x32,  mcol_scan_32x32_neighbors},
+    {mrow_scan_32x32,  vp10_mrow_iscan_32x32,  mrow_scan_32x32_neighbors},
+    {mcol_scan_32x32,  vp10_mcol_iscan_32x32,  mcol_scan_32x32_neighbors},
   }
 };
 
diff --git a/vp10/common/thread_common.c b/vp10/common/thread_common.c
index f8bfc89..aebcb11 100644
--- a/vp10/common/thread_common.c
+++ b/vp10/common/thread_common.c
@@ -96,7 +96,9 @@
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
   const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
   int mi_row, mi_col;
+#if !CONFIG_EXT_PARTITION_TYPES
   enum lf_path path;
+  LOOP_FILTER_MASK lfm;
   if (y_only)
     path = LF_PATH_444;
   else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
@@ -105,6 +107,7 @@
     path = LF_PATH_444;
   else
     path = LF_PATH_SLOW;
+#endif  // !CONFIG_EXT_PARTITION_TYPES
 
   for (mi_row = start; mi_row < stop;
        mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {
@@ -113,13 +116,17 @@
     for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
       const int r = mi_row >> MI_BLOCK_SIZE_LOG2;
       const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
-      LOOP_FILTER_MASK lfm;
       int plane;
 
       sync_read(lf_sync, r, c);
 
       vp10_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
 
+#if CONFIG_EXT_PARTITION_TYPES
+      for (plane = 0; plane < num_planes; ++plane)
+        vp10_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
+                                       mi_row, mi_col);
+#else
       // TODO(JBB): Make setup_mask work for non 420.
       vp10_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
                      &lfm);
@@ -139,7 +146,7 @@
             break;
         }
       }
-
+#endif  // CONFIG_EXT_PARTITION_TYPES
       sync_write(lf_sync, r, c, sb_cols);
     }
   }
@@ -331,7 +338,11 @@
       cm->counts.uv_mode[i][j] += counts->uv_mode[i][j];
 
   for (i = 0; i < PARTITION_CONTEXTS; i++)
+#if CONFIG_EXT_PARTITION_TYPES
+    for (j = 0; j < (i ? EXT_PARTITION_TYPES : PARTITION_TYPES); j++)
+#else
     for (j = 0; j < PARTITION_TYPES; j++)
+#endif
       cm->counts.partition[i][j] += counts->partition[i][j];
 
   if (is_dec) {
@@ -387,11 +398,7 @@
 
   for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
     for (j = 0; j < 2; ++j)
-      cm->counts.drl_mode0[i][j] += counts->drl_mode0[i][j];
-
-  for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
-    for (j = 0; j < 2; ++j)
-      cm->counts.drl_mode1[i][j] += counts->drl_mode1[i][j];
+      cm->counts.drl_mode[i][j] += counts->drl_mode[i][j];
 
 #if CONFIG_EXT_INTER
   for (j = 0; j < 2; ++j)
diff --git a/vp10/common/vp10_fwd_txfm1d.c b/vp10/common/vp10_fwd_txfm1d.c
index f3da5c9..ef24362 100644
--- a/vp10/common/vp10_fwd_txfm1d.c
+++ b/vp10/common/vp10_fwd_txfm1d.c
@@ -15,8 +15,8 @@
   {                                                                       \
     int i, j;                                                             \
     for (i = 0; i < size; ++i) {                                          \
-      int buf_bit = get_max_bit(abs(buf[i])) + 1;                         \
-      if (buf_bit > bit) {                                                \
+        int buf_bit = get_max_bit(abs(buf[i])) + 1;                       \
+        if (buf_bit > bit) {                                              \
         printf("======== %s overflow ========\n", __func__);              \
         printf("stage: %d node: %d\n", stage, i);                         \
         printf("bit: %d buf_bit: %d buf[i]: %d\n", bit, buf_bit, buf[i]); \
@@ -32,11 +32,11 @@
 #else
 #define range_check(stage, input, buf, size, bit) \
   {                                               \
-    (void) stage;                                 \
-    (void) input;                                 \
-    (void) buf;                                   \
-    (void) size;                                  \
-    (void) bit;                                   \
+    (void)stage;                                  \
+    (void)input;                                  \
+    (void)buf;                                    \
+    (void)size;                                   \
+    (void)bit;                                    \
   }
 #endif
 
@@ -1092,7 +1092,6 @@
   bf1[14] = bf0[9];
   bf1[15] = -bf0[1];
   range_check(stage, input, bf1, size, stage_range[stage]);
-
 }
 
 void vp10_fadst32_new(const int32_t *input, int32_t *output,
@@ -1529,3 +1528,796 @@
   bf1[31] = -bf0[1];
   range_check(stage, input, bf1, size, stage_range[stage]);
 }
+
+void vp10_fdct64_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 64;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[64];
+
+  // stage 0;
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1;
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf1 = output;
+  bf1[0] = input[0] + input[63];
+  bf1[1] = input[1] + input[62];
+  bf1[2] = input[2] + input[61];
+  bf1[3] = input[3] + input[60];
+  bf1[4] = input[4] + input[59];
+  bf1[5] = input[5] + input[58];
+  bf1[6] = input[6] + input[57];
+  bf1[7] = input[7] + input[56];
+  bf1[8] = input[8] + input[55];
+  bf1[9] = input[9] + input[54];
+  bf1[10] = input[10] + input[53];
+  bf1[11] = input[11] + input[52];
+  bf1[12] = input[12] + input[51];
+  bf1[13] = input[13] + input[50];
+  bf1[14] = input[14] + input[49];
+  bf1[15] = input[15] + input[48];
+  bf1[16] = input[16] + input[47];
+  bf1[17] = input[17] + input[46];
+  bf1[18] = input[18] + input[45];
+  bf1[19] = input[19] + input[44];
+  bf1[20] = input[20] + input[43];
+  bf1[21] = input[21] + input[42];
+  bf1[22] = input[22] + input[41];
+  bf1[23] = input[23] + input[40];
+  bf1[24] = input[24] + input[39];
+  bf1[25] = input[25] + input[38];
+  bf1[26] = input[26] + input[37];
+  bf1[27] = input[27] + input[36];
+  bf1[28] = input[28] + input[35];
+  bf1[29] = input[29] + input[34];
+  bf1[30] = input[30] + input[33];
+  bf1[31] = input[31] + input[32];
+  bf1[32] = -input[32] + input[31];
+  bf1[33] = -input[33] + input[30];
+  bf1[34] = -input[34] + input[29];
+  bf1[35] = -input[35] + input[28];
+  bf1[36] = -input[36] + input[27];
+  bf1[37] = -input[37] + input[26];
+  bf1[38] = -input[38] + input[25];
+  bf1[39] = -input[39] + input[24];
+  bf1[40] = -input[40] + input[23];
+  bf1[41] = -input[41] + input[22];
+  bf1[42] = -input[42] + input[21];
+  bf1[43] = -input[43] + input[20];
+  bf1[44] = -input[44] + input[19];
+  bf1[45] = -input[45] + input[18];
+  bf1[46] = -input[46] + input[17];
+  bf1[47] = -input[47] + input[16];
+  bf1[48] = -input[48] + input[15];
+  bf1[49] = -input[49] + input[14];
+  bf1[50] = -input[50] + input[13];
+  bf1[51] = -input[51] + input[12];
+  bf1[52] = -input[52] + input[11];
+  bf1[53] = -input[53] + input[10];
+  bf1[54] = -input[54] + input[9];
+  bf1[55] = -input[55] + input[8];
+  bf1[56] = -input[56] + input[7];
+  bf1[57] = -input[57] + input[6];
+  bf1[58] = -input[58] + input[5];
+  bf1[59] = -input[59] + input[4];
+  bf1[60] = -input[60] + input[3];
+  bf1[61] = -input[61] + input[2];
+  bf1[62] = -input[62] + input[1];
+  bf1[63] = -input[63] + input[0];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[31];
+  bf1[1] = bf0[1] + bf0[30];
+  bf1[2] = bf0[2] + bf0[29];
+  bf1[3] = bf0[3] + bf0[28];
+  bf1[4] = bf0[4] + bf0[27];
+  bf1[5] = bf0[5] + bf0[26];
+  bf1[6] = bf0[6] + bf0[25];
+  bf1[7] = bf0[7] + bf0[24];
+  bf1[8] = bf0[8] + bf0[23];
+  bf1[9] = bf0[9] + bf0[22];
+  bf1[10] = bf0[10] + bf0[21];
+  bf1[11] = bf0[11] + bf0[20];
+  bf1[12] = bf0[12] + bf0[19];
+  bf1[13] = bf0[13] + bf0[18];
+  bf1[14] = bf0[14] + bf0[17];
+  bf1[15] = bf0[15] + bf0[16];
+  bf1[16] = -bf0[16] + bf0[15];
+  bf1[17] = -bf0[17] + bf0[14];
+  bf1[18] = -bf0[18] + bf0[13];
+  bf1[19] = -bf0[19] + bf0[12];
+  bf1[20] = -bf0[20] + bf0[11];
+  bf1[21] = -bf0[21] + bf0[10];
+  bf1[22] = -bf0[22] + bf0[9];
+  bf1[23] = -bf0[23] + bf0[8];
+  bf1[24] = -bf0[24] + bf0[7];
+  bf1[25] = -bf0[25] + bf0[6];
+  bf1[26] = -bf0[26] + bf0[5];
+  bf1[27] = -bf0[27] + bf0[4];
+  bf1[28] = -bf0[28] + bf0[3];
+  bf1[29] = -bf0[29] + bf0[2];
+  bf1[30] = -bf0[30] + bf0[1];
+  bf1[31] = -bf0[31] + bf0[0];
+  bf1[32] = bf0[32];
+  bf1[33] = bf0[33];
+  bf1[34] = bf0[34];
+  bf1[35] = bf0[35];
+  bf1[36] = bf0[36];
+  bf1[37] = bf0[37];
+  bf1[38] = bf0[38];
+  bf1[39] = bf0[39];
+  bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
+  bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
+  bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
+  bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
+  bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
+  bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
+  bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit[stage]);
+  bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit[stage]);
+  bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit[stage]);
+  bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit[stage]);
+  bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit[stage]);
+  bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit[stage]);
+  bf1[56] = bf0[56];
+  bf1[57] = bf0[57];
+  bf1[58] = bf0[58];
+  bf1[59] = bf0[59];
+  bf1[60] = bf0[60];
+  bf1[61] = bf0[61];
+  bf1[62] = bf0[62];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[15];
+  bf1[1] = bf0[1] + bf0[14];
+  bf1[2] = bf0[2] + bf0[13];
+  bf1[3] = bf0[3] + bf0[12];
+  bf1[4] = bf0[4] + bf0[11];
+  bf1[5] = bf0[5] + bf0[10];
+  bf1[6] = bf0[6] + bf0[9];
+  bf1[7] = bf0[7] + bf0[8];
+  bf1[8] = -bf0[8] + bf0[7];
+  bf1[9] = -bf0[9] + bf0[6];
+  bf1[10] = -bf0[10] + bf0[5];
+  bf1[11] = -bf0[11] + bf0[4];
+  bf1[12] = -bf0[12] + bf0[3];
+  bf1[13] = -bf0[13] + bf0[2];
+  bf1[14] = -bf0[14] + bf0[1];
+  bf1[15] = -bf0[15] + bf0[0];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit[stage]);
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  bf1[32] = bf0[32] + bf0[47];
+  bf1[33] = bf0[33] + bf0[46];
+  bf1[34] = bf0[34] + bf0[45];
+  bf1[35] = bf0[35] + bf0[44];
+  bf1[36] = bf0[36] + bf0[43];
+  bf1[37] = bf0[37] + bf0[42];
+  bf1[38] = bf0[38] + bf0[41];
+  bf1[39] = bf0[39] + bf0[40];
+  bf1[40] = -bf0[40] + bf0[39];
+  bf1[41] = -bf0[41] + bf0[38];
+  bf1[42] = -bf0[42] + bf0[37];
+  bf1[43] = -bf0[43] + bf0[36];
+  bf1[44] = -bf0[44] + bf0[35];
+  bf1[45] = -bf0[45] + bf0[34];
+  bf1[46] = -bf0[46] + bf0[33];
+  bf1[47] = -bf0[47] + bf0[32];
+  bf1[48] = -bf0[48] + bf0[63];
+  bf1[49] = -bf0[49] + bf0[62];
+  bf1[50] = -bf0[50] + bf0[61];
+  bf1[51] = -bf0[51] + bf0[60];
+  bf1[52] = -bf0[52] + bf0[59];
+  bf1[53] = -bf0[53] + bf0[58];
+  bf1[54] = -bf0[54] + bf0[57];
+  bf1[55] = -bf0[55] + bf0[56];
+  bf1[56] = bf0[56] + bf0[55];
+  bf1[57] = bf0[57] + bf0[54];
+  bf1[58] = bf0[58] + bf0[53];
+  bf1[59] = bf0[59] + bf0[52];
+  bf1[60] = bf0[60] + bf0[51];
+  bf1[61] = bf0[61] + bf0[50];
+  bf1[62] = bf0[62] + bf0[49];
+  bf1[63] = bf0[63] + bf0[48];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[7];
+  bf1[1] = bf0[1] + bf0[6];
+  bf1[2] = bf0[2] + bf0[5];
+  bf1[3] = bf0[3] + bf0[4];
+  bf1[4] = -bf0[4] + bf0[3];
+  bf1[5] = -bf0[5] + bf0[2];
+  bf1[6] = -bf0[6] + bf0[1];
+  bf1[7] = -bf0[7] + bf0[0];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[23];
+  bf1[17] = bf0[17] + bf0[22];
+  bf1[18] = bf0[18] + bf0[21];
+  bf1[19] = bf0[19] + bf0[20];
+  bf1[20] = -bf0[20] + bf0[19];
+  bf1[21] = -bf0[21] + bf0[18];
+  bf1[22] = -bf0[22] + bf0[17];
+  bf1[23] = -bf0[23] + bf0[16];
+  bf1[24] = -bf0[24] + bf0[31];
+  bf1[25] = -bf0[25] + bf0[30];
+  bf1[26] = -bf0[26] + bf0[29];
+  bf1[27] = -bf0[27] + bf0[28];
+  bf1[28] = bf0[28] + bf0[27];
+  bf1[29] = bf0[29] + bf0[26];
+  bf1[30] = bf0[30] + bf0[25];
+  bf1[31] = bf0[31] + bf0[24];
+  bf1[32] = bf0[32];
+  bf1[33] = bf0[33];
+  bf1[34] = bf0[34];
+  bf1[35] = bf0[35];
+  bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit[stage]);
+  bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit[stage]);
+  bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit[stage]);
+  bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit[stage]);
+  bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit[stage]);
+  bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit[stage]);
+  bf1[44] = bf0[44];
+  bf1[45] = bf0[45];
+  bf1[46] = bf0[46];
+  bf1[47] = bf0[47];
+  bf1[48] = bf0[48];
+  bf1[49] = bf0[49];
+  bf1[50] = bf0[50];
+  bf1[51] = bf0[51];
+  bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit[stage]);
+  bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit[stage]);
+  bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit[stage]);
+  bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit[stage]);
+  bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit[stage]);
+  bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit[stage]);
+  bf1[60] = bf0[60];
+  bf1[61] = bf0[61];
+  bf1[62] = bf0[62];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = -bf0[2] + bf0[1];
+  bf1[3] = -bf0[3] + bf0[0];
+  bf1[4] = bf0[4];
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8] + bf0[11];
+  bf1[9] = bf0[9] + bf0[10];
+  bf1[10] = -bf0[10] + bf0[9];
+  bf1[11] = -bf0[11] + bf0[8];
+  bf1[12] = -bf0[12] + bf0[15];
+  bf1[13] = -bf0[13] + bf0[14];
+  bf1[14] = bf0[14] + bf0[13];
+  bf1[15] = bf0[15] + bf0[12];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit[stage]);
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  bf1[32] = bf0[32] + bf0[39];
+  bf1[33] = bf0[33] + bf0[38];
+  bf1[34] = bf0[34] + bf0[37];
+  bf1[35] = bf0[35] + bf0[36];
+  bf1[36] = -bf0[36] + bf0[35];
+  bf1[37] = -bf0[37] + bf0[34];
+  bf1[38] = -bf0[38] + bf0[33];
+  bf1[39] = -bf0[39] + bf0[32];
+  bf1[40] = -bf0[40] + bf0[47];
+  bf1[41] = -bf0[41] + bf0[46];
+  bf1[42] = -bf0[42] + bf0[45];
+  bf1[43] = -bf0[43] + bf0[44];
+  bf1[44] = bf0[44] + bf0[43];
+  bf1[45] = bf0[45] + bf0[42];
+  bf1[46] = bf0[46] + bf0[41];
+  bf1[47] = bf0[47] + bf0[40];
+  bf1[48] = bf0[48] + bf0[55];
+  bf1[49] = bf0[49] + bf0[54];
+  bf1[50] = bf0[50] + bf0[53];
+  bf1[51] = bf0[51] + bf0[52];
+  bf1[52] = -bf0[52] + bf0[51];
+  bf1[53] = -bf0[53] + bf0[50];
+  bf1[54] = -bf0[54] + bf0[49];
+  bf1[55] = -bf0[55] + bf0[48];
+  bf1[56] = -bf0[56] + bf0[63];
+  bf1[57] = -bf0[57] + bf0[62];
+  bf1[58] = -bf0[58] + bf0[61];
+  bf1[59] = -bf0[59] + bf0[60];
+  bf1[60] = bf0[60] + bf0[59];
+  bf1[61] = bf0[61] + bf0[58];
+  bf1[62] = bf0[62] + bf0[57];
+  bf1[63] = bf0[63] + bf0[56];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+  bf1[4] = bf0[4] + bf0[5];
+  bf1[5] = -bf0[5] + bf0[4];
+  bf1[6] = -bf0[6] + bf0[7];
+  bf1[7] = bf0[7] + bf0[6];
+  bf1[8] = bf0[8];
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[19];
+  bf1[17] = bf0[17] + bf0[18];
+  bf1[18] = -bf0[18] + bf0[17];
+  bf1[19] = -bf0[19] + bf0[16];
+  bf1[20] = -bf0[20] + bf0[23];
+  bf1[21] = -bf0[21] + bf0[22];
+  bf1[22] = bf0[22] + bf0[21];
+  bf1[23] = bf0[23] + bf0[20];
+  bf1[24] = bf0[24] + bf0[27];
+  bf1[25] = bf0[25] + bf0[26];
+  bf1[26] = -bf0[26] + bf0[25];
+  bf1[27] = -bf0[27] + bf0[24];
+  bf1[28] = -bf0[28] + bf0[31];
+  bf1[29] = -bf0[29] + bf0[30];
+  bf1[30] = bf0[30] + bf0[29];
+  bf1[31] = bf0[31] + bf0[28];
+  bf1[32] = bf0[32];
+  bf1[33] = bf0[33];
+  bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit[stage]);
+  bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit[stage]);
+  bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit[stage]);
+  bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit[stage]);
+  bf1[38] = bf0[38];
+  bf1[39] = bf0[39];
+  bf1[40] = bf0[40];
+  bf1[41] = bf0[41];
+  bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit[stage]);
+  bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit[stage]);
+  bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit[stage]);
+  bf1[46] = bf0[46];
+  bf1[47] = bf0[47];
+  bf1[48] = bf0[48];
+  bf1[49] = bf0[49];
+  bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit[stage]);
+  bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit[stage]);
+  bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit[stage]);
+  bf1[54] = bf0[54];
+  bf1[55] = bf0[55];
+  bf1[56] = bf0[56];
+  bf1[57] = bf0[57];
+  bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit[stage]);
+  bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit[stage]);
+  bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit[stage]);
+  bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit[stage]);
+  bf1[62] = bf0[62];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+  bf1[8] = bf0[8] + bf0[9];
+  bf1[9] = -bf0[9] + bf0[8];
+  bf1[10] = -bf0[10] + bf0[11];
+  bf1[11] = bf0[11] + bf0[10];
+  bf1[12] = bf0[12] + bf0[13];
+  bf1[13] = -bf0[13] + bf0[12];
+  bf1[14] = -bf0[14] + bf0[15];
+  bf1[15] = bf0[15] + bf0[14];
+  bf1[16] = bf0[16];
+  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit[stage]);
+  bf1[27] = bf0[27];
+  bf1[28] = bf0[28];
+  bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit[stage]);
+  bf1[31] = bf0[31];
+  bf1[32] = bf0[32] + bf0[35];
+  bf1[33] = bf0[33] + bf0[34];
+  bf1[34] = -bf0[34] + bf0[33];
+  bf1[35] = -bf0[35] + bf0[32];
+  bf1[36] = -bf0[36] + bf0[39];
+  bf1[37] = -bf0[37] + bf0[38];
+  bf1[38] = bf0[38] + bf0[37];
+  bf1[39] = bf0[39] + bf0[36];
+  bf1[40] = bf0[40] + bf0[43];
+  bf1[41] = bf0[41] + bf0[42];
+  bf1[42] = -bf0[42] + bf0[41];
+  bf1[43] = -bf0[43] + bf0[40];
+  bf1[44] = -bf0[44] + bf0[47];
+  bf1[45] = -bf0[45] + bf0[46];
+  bf1[46] = bf0[46] + bf0[45];
+  bf1[47] = bf0[47] + bf0[44];
+  bf1[48] = bf0[48] + bf0[51];
+  bf1[49] = bf0[49] + bf0[50];
+  bf1[50] = -bf0[50] + bf0[49];
+  bf1[51] = -bf0[51] + bf0[48];
+  bf1[52] = -bf0[52] + bf0[55];
+  bf1[53] = -bf0[53] + bf0[54];
+  bf1[54] = bf0[54] + bf0[53];
+  bf1[55] = bf0[55] + bf0[52];
+  bf1[56] = bf0[56] + bf0[59];
+  bf1[57] = bf0[57] + bf0[58];
+  bf1[58] = -bf0[58] + bf0[57];
+  bf1[59] = -bf0[59] + bf0[56];
+  bf1[60] = -bf0[60] + bf0[63];
+  bf1[61] = -bf0[61] + bf0[62];
+  bf1[62] = bf0[62] + bf0[61];
+  bf1[63] = bf0[63] + bf0[60];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 8
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
+  bf1[16] = bf0[16] + bf0[17];
+  bf1[17] = -bf0[17] + bf0[16];
+  bf1[18] = -bf0[18] + bf0[19];
+  bf1[19] = bf0[19] + bf0[18];
+  bf1[20] = bf0[20] + bf0[21];
+  bf1[21] = -bf0[21] + bf0[20];
+  bf1[22] = -bf0[22] + bf0[23];
+  bf1[23] = bf0[23] + bf0[22];
+  bf1[24] = bf0[24] + bf0[25];
+  bf1[25] = -bf0[25] + bf0[24];
+  bf1[26] = -bf0[26] + bf0[27];
+  bf1[27] = bf0[27] + bf0[26];
+  bf1[28] = bf0[28] + bf0[29];
+  bf1[29] = -bf0[29] + bf0[28];
+  bf1[30] = -bf0[30] + bf0[31];
+  bf1[31] = bf0[31] + bf0[30];
+  bf1[32] = bf0[32];
+  bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit[stage]);
+  bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit[stage]);
+  bf1[35] = bf0[35];
+  bf1[36] = bf0[36];
+  bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit[stage]);
+  bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit[stage]);
+  bf1[39] = bf0[39];
+  bf1[40] = bf0[40];
+  bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit[stage]);
+  bf1[43] = bf0[43];
+  bf1[44] = bf0[44];
+  bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit[stage]);
+  bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit[stage]);
+  bf1[47] = bf0[47];
+  bf1[48] = bf0[48];
+  bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit[stage]);
+  bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit[stage]);
+  bf1[51] = bf0[51];
+  bf1[52] = bf0[52];
+  bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit[stage]);
+  bf1[55] = bf0[55];
+  bf1[56] = bf0[56];
+  bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit[stage]);
+  bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit[stage]);
+  bf1[59] = bf0[59];
+  bf1[60] = bf0[60];
+  bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit[stage]);
+  bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit[stage]);
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 9
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit[stage]);
+  bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit[stage]);
+  bf1[32] = bf0[32] + bf0[33];
+  bf1[33] = -bf0[33] + bf0[32];
+  bf1[34] = -bf0[34] + bf0[35];
+  bf1[35] = bf0[35] + bf0[34];
+  bf1[36] = bf0[36] + bf0[37];
+  bf1[37] = -bf0[37] + bf0[36];
+  bf1[38] = -bf0[38] + bf0[39];
+  bf1[39] = bf0[39] + bf0[38];
+  bf1[40] = bf0[40] + bf0[41];
+  bf1[41] = -bf0[41] + bf0[40];
+  bf1[42] = -bf0[42] + bf0[43];
+  bf1[43] = bf0[43] + bf0[42];
+  bf1[44] = bf0[44] + bf0[45];
+  bf1[45] = -bf0[45] + bf0[44];
+  bf1[46] = -bf0[46] + bf0[47];
+  bf1[47] = bf0[47] + bf0[46];
+  bf1[48] = bf0[48] + bf0[49];
+  bf1[49] = -bf0[49] + bf0[48];
+  bf1[50] = -bf0[50] + bf0[51];
+  bf1[51] = bf0[51] + bf0[50];
+  bf1[52] = bf0[52] + bf0[53];
+  bf1[53] = -bf0[53] + bf0[52];
+  bf1[54] = -bf0[54] + bf0[55];
+  bf1[55] = bf0[55] + bf0[54];
+  bf1[56] = bf0[56] + bf0[57];
+  bf1[57] = -bf0[57] + bf0[56];
+  bf1[58] = -bf0[58] + bf0[59];
+  bf1[59] = bf0[59] + bf0[58];
+  bf1[60] = bf0[60] + bf0[61];
+  bf1[61] = -bf0[61] + bf0[60];
+  bf1[62] = -bf0[62] + bf0[63];
+  bf1[63] = bf0[63] + bf0[62];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 10
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = bf0[21];
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = bf0[26];
+  bf1[27] = bf0[27];
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit[stage]);
+  bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit[stage]);
+  bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit[stage]);
+  bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit[stage]);
+  bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit[stage]);
+  bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit[stage]);
+  bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit[stage]);
+  bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit[stage]);
+  bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit[stage]);
+  bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit[stage]);
+  bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit[stage]);
+  bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit[stage]);
+  bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit[stage]);
+  bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit[stage]);
+  bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit[stage]);
+  bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit[stage]);
+  bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit[stage]);
+  bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit[stage]);
+  bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit[stage]);
+  bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit[stage]);
+  bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit[stage]);
+  bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit[stage]);
+  bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit[stage]);
+  bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit[stage]);
+  bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit[stage]);
+  bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit[stage]);
+  bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit[stage]);
+  bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 11
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[32];
+  bf1[2] = bf0[16];
+  bf1[3] = bf0[48];
+  bf1[4] = bf0[8];
+  bf1[5] = bf0[40];
+  bf1[6] = bf0[24];
+  bf1[7] = bf0[56];
+  bf1[8] = bf0[4];
+  bf1[9] = bf0[36];
+  bf1[10] = bf0[20];
+  bf1[11] = bf0[52];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[44];
+  bf1[14] = bf0[28];
+  bf1[15] = bf0[60];
+  bf1[16] = bf0[2];
+  bf1[17] = bf0[34];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[50];
+  bf1[20] = bf0[10];
+  bf1[21] = bf0[42];
+  bf1[22] = bf0[26];
+  bf1[23] = bf0[58];
+  bf1[24] = bf0[6];
+  bf1[25] = bf0[38];
+  bf1[26] = bf0[22];
+  bf1[27] = bf0[54];
+  bf1[28] = bf0[14];
+  bf1[29] = bf0[46];
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[62];
+  bf1[32] = bf0[1];
+  bf1[33] = bf0[33];
+  bf1[34] = bf0[17];
+  bf1[35] = bf0[49];
+  bf1[36] = bf0[9];
+  bf1[37] = bf0[41];
+  bf1[38] = bf0[25];
+  bf1[39] = bf0[57];
+  bf1[40] = bf0[5];
+  bf1[41] = bf0[37];
+  bf1[42] = bf0[21];
+  bf1[43] = bf0[53];
+  bf1[44] = bf0[13];
+  bf1[45] = bf0[45];
+  bf1[46] = bf0[29];
+  bf1[47] = bf0[61];
+  bf1[48] = bf0[3];
+  bf1[49] = bf0[35];
+  bf1[50] = bf0[19];
+  bf1[51] = bf0[51];
+  bf1[52] = bf0[11];
+  bf1[53] = bf0[43];
+  bf1[54] = bf0[27];
+  bf1[55] = bf0[59];
+  bf1[56] = bf0[7];
+  bf1[57] = bf0[39];
+  bf1[58] = bf0[23];
+  bf1[59] = bf0[55];
+  bf1[60] = bf0[15];
+  bf1[61] = bf0[47];
+  bf1[62] = bf0[31];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
diff --git a/vp10/common/vp10_fwd_txfm1d.h b/vp10/common/vp10_fwd_txfm1d.h
index d5b9f40..d06e305 100644
--- a/vp10/common/vp10_fwd_txfm1d.h
+++ b/vp10/common/vp10_fwd_txfm1d.h
@@ -25,6 +25,8 @@
                      const int8_t *cos_bit, const int8_t *stage_range);
 void vp10_fdct32_new(const int32_t *input, int32_t *output,
                      const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_fdct64_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
 
 void vp10_fadst4_new(const int32_t *input, int32_t *output,
                      const int8_t *cos_bit, const int8_t *stage_range);
diff --git a/vp10/common/vp10_fwd_txfm2d.c b/vp10/common/vp10_fwd_txfm2d.c
index 67449ec..32214ae 100644
--- a/vp10/common/vp10_fwd_txfm2d.c
+++ b/vp10/common/vp10_fwd_txfm2d.c
@@ -9,8 +9,47 @@
  */
 
 #include "vp10/common/vp10_txfm.h"
+#include "vp10/common/vp10_fwd_txfm1d.h"
 
-static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
+typedef void (*TxfmFunc)(const int32_t *input, int32_t *output,
+                         const int8_t *cos_bit, const int8_t *stage_range);
+
+static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {  // map cfg enum -> 1-D transform; INLINE macro keeps MSVC C builds working
+  switch (txfm_type) {
+    case TXFM_TYPE_DCT4:
+      return vp10_fdct4_new;
+      break;  // unreachable; kept for switch-case symmetry
+    case TXFM_TYPE_DCT8:
+      return vp10_fdct8_new;
+      break;
+    case TXFM_TYPE_DCT16:
+      return vp10_fdct16_new;
+      break;
+    case TXFM_TYPE_DCT32:
+      return vp10_fdct32_new;
+      break;
+    case TXFM_TYPE_DCT64:
+      return vp10_fdct64_new;
+      break;
+    case TXFM_TYPE_ADST4:
+      return vp10_fadst4_new;
+      break;
+    case TXFM_TYPE_ADST8:
+      return vp10_fadst8_new;
+      break;
+    case TXFM_TYPE_ADST16:
+      return vp10_fadst16_new;
+      break;
+    case TXFM_TYPE_ADST32:
+      return vp10_fadst32_new;
+      break;
+    default:
+      assert(0);  // unknown TXFM_TYPE is a programming error
+      return NULL;
+  }
+}
+
+static inline void fwd_txfm2d_c(const int16_t *input, int32_t *output,
                                 const int stride, const TXFM_2D_CFG *cfg,
                                 int32_t *txfm_buf) {
   int i, j;
@@ -20,8 +59,8 @@
   const int8_t *stage_range_row = cfg->stage_range_row;
   const int8_t *cos_bit_col = cfg->cos_bit_col;
   const int8_t *cos_bit_row = cfg->cos_bit_row;
-  const TxfmFunc txfm_func_col = cfg->txfm_func_col;
-  const TxfmFunc txfm_func_row = cfg->txfm_func_row;
+  const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+  const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
 
   // txfm_buf's length is  txfm_size * txfm_size + 2 * txfm_size
   // it is used for intermediate data buffering
@@ -51,7 +90,7 @@
   }
 }
 
-void vp10_fwd_txfm2d_4x4(const int16_t *input, int32_t *output,
+void vp10_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output,
                          const int stride, const TXFM_2D_CFG *cfg,
                          const int bd) {
   int txfm_buf[4 * 4 + 4 + 4];
@@ -59,7 +98,7 @@
   fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
 }
 
-void vp10_fwd_txfm2d_8x8(const int16_t *input, int32_t *output,
+void vp10_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output,
                          const int stride, const TXFM_2D_CFG *cfg,
                          const int bd) {
   int txfm_buf[8 * 8 + 8 + 8];
@@ -67,7 +106,7 @@
   fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
 }
 
-void vp10_fwd_txfm2d_16x16(const int16_t *input, int32_t *output,
+void vp10_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output,
                            const int stride, const TXFM_2D_CFG *cfg,
                            const int bd) {
   int txfm_buf[16 * 16 + 16 + 16];
@@ -75,10 +114,18 @@
   fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
 }
 
-void vp10_fwd_txfm2d_32x32(const int16_t *input, int32_t *output,
+void vp10_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output,
                            const int stride, const TXFM_2D_CFG *cfg,
                            const int bd) {
   int txfm_buf[32 * 32 + 32 + 32];
   (void)bd;
   fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
 }
+
+void vp10_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output,
+                           const int stride, const TXFM_2D_CFG *cfg,
+                           const int bd) {
+  int txfm_buf[64 * 64 + 64 + 64];
+  (void)bd;
+  fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
+}
diff --git a/vp10/common/vp10_fwd_txfm2d.h b/vp10/common/vp10_fwd_txfm2d.h
deleted file mode 100644
index 64e6f56..0000000
--- a/vp10/common/vp10_fwd_txfm2d.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP10_FWD_TXFM2D_H_
-#define VP10_FWD_TXFM2D_H_
-
-#include "vp10/common/vp10_txfm.h"
-#ifdef __cplusplus
-extern "C" {
-#endif
-void vp10_fwd_txfm2d_4x4(const int16_t *input, int32_t *output,
-                         const int stride, const TXFM_2D_CFG *cfg,
-                         const int bd);
-void vp10_fwd_txfm2d_8x8(const int16_t *input, int32_t *output,
-                         const int stride, const TXFM_2D_CFG *cfg,
-                         const int bd);
-void vp10_fwd_txfm2d_16x16(const int16_t *input, int32_t *output,
-                           const int stride, const TXFM_2D_CFG *cfg,
-                           const int bd);
-void vp10_fwd_txfm2d_32x32(const int16_t *input, int32_t *output,
-                           const int stride, const TXFM_2D_CFG *cfg,
-                           const int bd);
-#ifdef __cplusplus
-}
-#endif
-#endif  // VP10_FWD_TXFM2D_H_
diff --git a/vp10/common/vp10_fwd_txfm2d_cfg.h b/vp10/common/vp10_fwd_txfm2d_cfg.h
index 5c2b4ca..3c0a906 100644
--- a/vp10/common/vp10_fwd_txfm2d_cfg.h
+++ b/vp10/common/vp10_fwd_txfm2d_cfg.h
@@ -27,8 +27,8 @@
     fwd_stage_range_row_dct_dct_4,  // .stage_range_row
     fwd_cos_bit_col_dct_dct_4,      // .cos_bit_col
     fwd_cos_bit_row_dct_dct_4,      // .cos_bit_row
-    vp10_fdct4_new,                 // .txfm_func_col
-    vp10_fdct4_new};                // .txfm_func_row;
+    TXFM_TYPE_DCT4,                 // .txfm_type_col
+    TXFM_TYPE_DCT4};                // .txfm_type_row
 
 //  ---------------- config fwd_dct_dct_8 ----------------
 static const int8_t fwd_shift_dct_dct_8[3] = {5, -3, -1};
@@ -46,8 +46,8 @@
     fwd_stage_range_row_dct_dct_8,  // .stage_range_row
     fwd_cos_bit_col_dct_dct_8,      // .cos_bit_col
     fwd_cos_bit_row_dct_dct_8,      // .cos_bit_row
-    vp10_fdct8_new,                 // .txfm_func_col
-    vp10_fdct8_new};                // .txfm_func_row;
+    TXFM_TYPE_DCT8,                 // .txfm_type_col
+    TXFM_TYPE_DCT8};                // .txfm_type_row
 
 //  ---------------- config fwd_dct_dct_16 ----------------
 static const int8_t fwd_shift_dct_dct_16[3] = {4, -3, -1};
@@ -69,8 +69,8 @@
     fwd_stage_range_row_dct_dct_16,  // .stage_range_row
     fwd_cos_bit_col_dct_dct_16,      // .cos_bit_col
     fwd_cos_bit_row_dct_dct_16,      // .cos_bit_row
-    vp10_fdct16_new,                 // .txfm_func_col
-    vp10_fdct16_new};                // .txfm_func_row;
+    TXFM_TYPE_DCT16,                 // .txfm_type_col
+    TXFM_TYPE_DCT16};                // .txfm_type_row
 
 //  ---------------- config fwd_dct_dct_32 ----------------
 static const int8_t fwd_shift_dct_dct_32[3] = {3, -3, -1};
@@ -92,8 +92,31 @@
     fwd_stage_range_row_dct_dct_32,  // .stage_range_row
     fwd_cos_bit_col_dct_dct_32,      // .cos_bit_col
     fwd_cos_bit_row_dct_dct_32,      // .cos_bit_row
-    vp10_fdct32_new,                 // .txfm_func_col
-    vp10_fdct32_new};                // .txfm_func_row;
+    TXFM_TYPE_DCT32,                 // .txfm_type_col
+    TXFM_TYPE_DCT32};                // .txfm_type_row
+
+//  ---------------- config fwd_dct_dct_64 ----------------
+static const int8_t fwd_shift_dct_dct_64[3] = {2, -2, -2};
+static const int8_t fwd_stage_range_col_dct_dct_64[12] = {
+    13, 14, 15, 16, 17, 18, 19, 19, 19, 19, 19, 19};
+static const int8_t fwd_stage_range_row_dct_dct_64[12] = {
+    17, 18, 19, 20, 21, 22, 22, 22, 22, 22, 22, 22};
+static const int8_t fwd_cos_bit_col_dct_dct_64[12] = {15, 15, 15, 15, 15, 14,
+                                                      13, 13, 13, 13, 13, 13};
+static const int8_t fwd_cos_bit_row_dct_dct_64[12] = {15, 14, 13, 12, 11, 10,
+                                                      10, 10, 10, 10, 10, 10};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_64 = {
+    64,                              // .txfm_size
+    12,                              // .stage_num_col
+    12,                              // .stage_num_row
+    fwd_shift_dct_dct_64,            // .shift
+    fwd_stage_range_col_dct_dct_64,  // .stage_range_col
+    fwd_stage_range_row_dct_dct_64,  // .stage_range_row
+    fwd_cos_bit_col_dct_dct_64,      // .cos_bit_col
+    fwd_cos_bit_row_dct_dct_64,      // .cos_bit_row
+    TXFM_TYPE_DCT64,                 // .txfm_type_col
+    TXFM_TYPE_DCT64};                // .txfm_type_row
 
 //  ---------------- config fwd_dct_adst_4 ----------------
 static const int8_t fwd_shift_dct_adst_4[3] = {5, -2, -1};
@@ -112,8 +135,8 @@
     fwd_stage_range_row_dct_adst_4,  // .stage_range_row
     fwd_cos_bit_col_dct_adst_4,      // .cos_bit_col
     fwd_cos_bit_row_dct_adst_4,      // .cos_bit_row
-    vp10_fdct4_new,                  // .txfm_func_col
-    vp10_fadst4_new};                // .txfm_func_row;
+    TXFM_TYPE_DCT4,                  // .txfm_type_col
+    TXFM_TYPE_ADST4};                // .txfm_type_row
 
 //  ---------------- config fwd_dct_adst_8 ----------------
 static const int8_t fwd_shift_dct_adst_8[3] = {7, -3, -3};
@@ -134,8 +157,8 @@
     fwd_stage_range_row_dct_adst_8,  // .stage_range_row
     fwd_cos_bit_col_dct_adst_8,      // .cos_bit_col
     fwd_cos_bit_row_dct_adst_8,      // .cos_bit_row
-    vp10_fdct8_new,                  // .txfm_func_col
-    vp10_fadst8_new};                // .txfm_func_row;
+    TXFM_TYPE_DCT8,                  // .txfm_type_col
+    TXFM_TYPE_ADST8};                // .txfm_type_row
 
 //  ---------------- config fwd_dct_adst_16 ----------------
 static const int8_t fwd_shift_dct_adst_16[3] = {4, -1, -3};
@@ -157,8 +180,8 @@
     fwd_stage_range_row_dct_adst_16,  // .stage_range_row
     fwd_cos_bit_col_dct_adst_16,      // .cos_bit_col
     fwd_cos_bit_row_dct_adst_16,      // .cos_bit_row
-    vp10_fdct16_new,                  // .txfm_func_col
-    vp10_fadst16_new};                // .txfm_func_row;
+    TXFM_TYPE_DCT16,                  // .txfm_type_col
+    TXFM_TYPE_ADST16};                // .txfm_type_row
 
 //  ---------------- config fwd_dct_adst_32 ----------------
 static const int8_t fwd_shift_dct_adst_32[3] = {3, -1, -3};
@@ -180,8 +203,8 @@
     fwd_stage_range_row_dct_adst_32,  // .stage_range_row
     fwd_cos_bit_col_dct_adst_32,      // .cos_bit_col
     fwd_cos_bit_row_dct_adst_32,      // .cos_bit_row
-    vp10_fdct32_new,                  // .txfm_func_col
-    vp10_fadst32_new};                // .txfm_func_row;
+    TXFM_TYPE_DCT32,                  // .txfm_type_col
+    TXFM_TYPE_ADST32};                // .txfm_type_row
 
 //  ---------------- config fwd_adst_adst_4 ----------------
 static const int8_t fwd_shift_adst_adst_4[3] = {6, 1, -5};
@@ -201,8 +224,8 @@
     fwd_stage_range_row_adst_adst_4,  // .stage_range_row
     fwd_cos_bit_col_adst_adst_4,      // .cos_bit_col
     fwd_cos_bit_row_adst_adst_4,      // .cos_bit_row
-    vp10_fadst4_new,                  // .txfm_func_col
-    vp10_fadst4_new};                 // .txfm_func_row;
+    TXFM_TYPE_ADST4,                  // .txfm_type_col
+    TXFM_TYPE_ADST4};                 // .txfm_type_row
 
 //  ---------------- config fwd_adst_adst_8 ----------------
 static const int8_t fwd_shift_adst_adst_8[3] = {3, -1, -1};
@@ -224,8 +247,8 @@
     fwd_stage_range_row_adst_adst_8,  // .stage_range_row
     fwd_cos_bit_col_adst_adst_8,      // .cos_bit_col
     fwd_cos_bit_row_adst_adst_8,      // .cos_bit_row
-    vp10_fadst8_new,                  // .txfm_func_col
-    vp10_fadst8_new};                 // .txfm_func_row;
+    TXFM_TYPE_ADST8,                  // .txfm_type_col
+    TXFM_TYPE_ADST8};                 // .txfm_type_row
 
 //  ---------------- config fwd_adst_adst_16 ----------------
 static const int8_t fwd_shift_adst_adst_16[3] = {2, 0, -2};
@@ -247,8 +270,8 @@
     fwd_stage_range_row_adst_adst_16,  // .stage_range_row
     fwd_cos_bit_col_adst_adst_16,      // .cos_bit_col
     fwd_cos_bit_row_adst_adst_16,      // .cos_bit_row
-    vp10_fadst16_new,                  // .txfm_func_col
-    vp10_fadst16_new};                 // .txfm_func_row;
+    TXFM_TYPE_ADST16,                  // .txfm_type_col
+    TXFM_TYPE_ADST16};                 // .txfm_type_row
 
 //  ---------------- config fwd_adst_adst_32 ----------------
 static const int8_t fwd_shift_adst_adst_32[3] = {4, -2, -3};
@@ -270,8 +293,8 @@
     fwd_stage_range_row_adst_adst_32,  // .stage_range_row
     fwd_cos_bit_col_adst_adst_32,      // .cos_bit_col
     fwd_cos_bit_row_adst_adst_32,      // .cos_bit_row
-    vp10_fadst32_new,                  // .txfm_func_col
-    vp10_fadst32_new};                 // .txfm_func_row;
+    TXFM_TYPE_ADST32,                  // .txfm_type_col
+    TXFM_TYPE_ADST32};                 // .txfm_type_row
 
 //  ---------------- config fwd_adst_dct_4 ----------------
 static const int8_t fwd_shift_adst_dct_4[3] = {5, -4, 1};
@@ -290,8 +313,8 @@
     fwd_stage_range_row_adst_dct_4,  // .stage_range_row
     fwd_cos_bit_col_adst_dct_4,      // .cos_bit_col
     fwd_cos_bit_row_adst_dct_4,      // .cos_bit_row
-    vp10_fadst4_new,                 // .txfm_func_col
-    vp10_fdct4_new};                 // .txfm_func_row;
+    TXFM_TYPE_ADST4,                 // .txfm_type_col
+    TXFM_TYPE_DCT4};                 // .txfm_type_row
 
 //  ---------------- config fwd_adst_dct_8 ----------------
 static const int8_t fwd_shift_adst_dct_8[3] = {5, 1, -5};
@@ -312,8 +335,8 @@
     fwd_stage_range_row_adst_dct_8,  // .stage_range_row
     fwd_cos_bit_col_adst_dct_8,      // .cos_bit_col
     fwd_cos_bit_row_adst_dct_8,      // .cos_bit_row
-    vp10_fadst8_new,                 // .txfm_func_col
-    vp10_fdct8_new};                 // .txfm_func_row;
+    TXFM_TYPE_ADST8,                 // .txfm_type_col
+    TXFM_TYPE_DCT8};                 // .txfm_type_row
 
 //  ---------------- config fwd_adst_dct_16 ----------------
 static const int8_t fwd_shift_adst_dct_16[3] = {4, -3, -1};
@@ -335,8 +358,8 @@
     fwd_stage_range_row_adst_dct_16,  // .stage_range_row
     fwd_cos_bit_col_adst_dct_16,      // .cos_bit_col
     fwd_cos_bit_row_adst_dct_16,      // .cos_bit_row
-    vp10_fadst16_new,                 // .txfm_func_col
-    vp10_fdct16_new};                 // .txfm_func_row;
+    TXFM_TYPE_ADST16,                 // .txfm_type_col
+    TXFM_TYPE_DCT16};                 // .txfm_type_row
 
 //  ---------------- config fwd_adst_dct_32 ----------------
 static const int8_t fwd_shift_adst_dct_32[3] = {5, -4, -2};
@@ -358,7 +381,7 @@
     fwd_stage_range_row_adst_dct_32,  // .stage_range_row
     fwd_cos_bit_col_adst_dct_32,      // .cos_bit_col
     fwd_cos_bit_row_adst_dct_32,      // .cos_bit_row
-    vp10_fadst32_new,                 // .txfm_func_col
-    vp10_fdct32_new};                 // .txfm_func_row;
+    TXFM_TYPE_ADST32,                 // .txfm_type_col
+    TXFM_TYPE_DCT32};                 // .txfm_type_row
 
 #endif  // VP10_FWD_TXFM2D_CFG_H_
diff --git a/vp10/common/vp10_inv_txfm1d.c b/vp10/common/vp10_inv_txfm1d.c
index 606ca55..494000f 100644
--- a/vp10/common/vp10_inv_txfm1d.c
+++ b/vp10/common/vp10_inv_txfm1d.c
@@ -32,11 +32,11 @@
 #else
 #define range_check(stage, input, buf, size, bit) \
   {                                               \
-    (void) stage;                                 \
-    (void) input;                                 \
-    (void) buf;                                   \
-    (void) size;                                  \
-    (void) bit;                                   \
+    (void)stage;                                  \
+    (void)input;                                  \
+    (void)buf;                                    \
+    (void)size;                                   \
+    (void)bit;                                    \
   }
 #endif
 
@@ -1535,3 +1535,796 @@
   bf1[31] = bf0[0];
   range_check(stage, input, bf1, size, stage_range[stage]);
 }
+
+void vp10_idct64_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 64;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[64];
+
+  // stage 0;
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1;
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = input[32];
+  bf1[2] = input[16];
+  bf1[3] = input[48];
+  bf1[4] = input[8];
+  bf1[5] = input[40];
+  bf1[6] = input[24];
+  bf1[7] = input[56];
+  bf1[8] = input[4];
+  bf1[9] = input[36];
+  bf1[10] = input[20];
+  bf1[11] = input[52];
+  bf1[12] = input[12];
+  bf1[13] = input[44];
+  bf1[14] = input[28];
+  bf1[15] = input[60];
+  bf1[16] = input[2];
+  bf1[17] = input[34];
+  bf1[18] = input[18];
+  bf1[19] = input[50];
+  bf1[20] = input[10];
+  bf1[21] = input[42];
+  bf1[22] = input[26];
+  bf1[23] = input[58];
+  bf1[24] = input[6];
+  bf1[25] = input[38];
+  bf1[26] = input[22];
+  bf1[27] = input[54];
+  bf1[28] = input[14];
+  bf1[29] = input[46];
+  bf1[30] = input[30];
+  bf1[31] = input[62];
+  bf1[32] = input[1];
+  bf1[33] = input[33];
+  bf1[34] = input[17];
+  bf1[35] = input[49];
+  bf1[36] = input[9];
+  bf1[37] = input[41];
+  bf1[38] = input[25];
+  bf1[39] = input[57];
+  bf1[40] = input[5];
+  bf1[41] = input[37];
+  bf1[42] = input[21];
+  bf1[43] = input[53];
+  bf1[44] = input[13];
+  bf1[45] = input[45];
+  bf1[46] = input[29];
+  bf1[47] = input[61];
+  bf1[48] = input[3];
+  bf1[49] = input[35];
+  bf1[50] = input[19];
+  bf1[51] = input[51];
+  bf1[52] = input[11];
+  bf1[53] = input[43];
+  bf1[54] = input[27];
+  bf1[55] = input[59];
+  bf1[56] = input[7];
+  bf1[57] = input[39];
+  bf1[58] = input[23];
+  bf1[59] = input[55];
+  bf1[60] = input[15];
+  bf1[61] = input[47];
+  bf1[62] = input[31];
+  bf1[63] = input[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = bf0[21];
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = bf0[26];
+  bf1[27] = bf0[27];
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit[stage]);
+  bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit[stage]);
+  bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit[stage]);
+  bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit[stage]);
+  bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit[stage]);
+  bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit[stage]);
+  bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit[stage]);
+  bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit[stage]);
+  bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit[stage]);
+  bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit[stage]);
+  bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit[stage]);
+  bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit[stage]);
+  bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit[stage]);
+  bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit[stage]);
+  bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit[stage]);
+  bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit[stage]);
+  bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit[stage]);
+  bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit[stage]);
+  bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit[stage]);
+  bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit[stage]);
+  bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit[stage]);
+  bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit[stage]);
+  bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit[stage]);
+  bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit[stage]);
+  bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit[stage]);
+  bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit[stage]);
+  bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit[stage]);
+  bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit[stage]);
+  bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit[stage]);
+  bf1[32] = bf0[32] + bf0[33];
+  bf1[33] = bf0[32] - bf0[33];
+  bf1[34] = -bf0[34] + bf0[35];
+  bf1[35] = bf0[34] + bf0[35];
+  bf1[36] = bf0[36] + bf0[37];
+  bf1[37] = bf0[36] - bf0[37];
+  bf1[38] = -bf0[38] + bf0[39];
+  bf1[39] = bf0[38] + bf0[39];
+  bf1[40] = bf0[40] + bf0[41];
+  bf1[41] = bf0[40] - bf0[41];
+  bf1[42] = -bf0[42] + bf0[43];
+  bf1[43] = bf0[42] + bf0[43];
+  bf1[44] = bf0[44] + bf0[45];
+  bf1[45] = bf0[44] - bf0[45];
+  bf1[46] = -bf0[46] + bf0[47];
+  bf1[47] = bf0[46] + bf0[47];
+  bf1[48] = bf0[48] + bf0[49];
+  bf1[49] = bf0[48] - bf0[49];
+  bf1[50] = -bf0[50] + bf0[51];
+  bf1[51] = bf0[50] + bf0[51];
+  bf1[52] = bf0[52] + bf0[53];
+  bf1[53] = bf0[52] - bf0[53];
+  bf1[54] = -bf0[54] + bf0[55];
+  bf1[55] = bf0[54] + bf0[55];
+  bf1[56] = bf0[56] + bf0[57];
+  bf1[57] = bf0[56] - bf0[57];
+  bf1[58] = -bf0[58] + bf0[59];
+  bf1[59] = bf0[58] + bf0[59];
+  bf1[60] = bf0[60] + bf0[61];
+  bf1[61] = bf0[60] - bf0[61];
+  bf1[62] = -bf0[62] + bf0[63];
+  bf1[63] = bf0[62] + bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]);
+  bf1[16] = bf0[16] + bf0[17];
+  bf1[17] = bf0[16] - bf0[17];
+  bf1[18] = -bf0[18] + bf0[19];
+  bf1[19] = bf0[18] + bf0[19];
+  bf1[20] = bf0[20] + bf0[21];
+  bf1[21] = bf0[20] - bf0[21];
+  bf1[22] = -bf0[22] + bf0[23];
+  bf1[23] = bf0[22] + bf0[23];
+  bf1[24] = bf0[24] + bf0[25];
+  bf1[25] = bf0[24] - bf0[25];
+  bf1[26] = -bf0[26] + bf0[27];
+  bf1[27] = bf0[26] + bf0[27];
+  bf1[28] = bf0[28] + bf0[29];
+  bf1[29] = bf0[28] - bf0[29];
+  bf1[30] = -bf0[30] + bf0[31];
+  bf1[31] = bf0[30] + bf0[31];
+  bf1[32] = bf0[32];
+  bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit[stage]);
+  bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit[stage]);
+  bf1[35] = bf0[35];
+  bf1[36] = bf0[36];
+  bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit[stage]);
+  bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit[stage]);
+  bf1[39] = bf0[39];
+  bf1[40] = bf0[40];
+  bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit[stage]);
+  bf1[43] = bf0[43];
+  bf1[44] = bf0[44];
+  bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit[stage]);
+  bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit[stage]);
+  bf1[47] = bf0[47];
+  bf1[48] = bf0[48];
+  bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit[stage]);
+  bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit[stage]);
+  bf1[51] = bf0[51];
+  bf1[52] = bf0[52];
+  bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit[stage]);
+  bf1[55] = bf0[55];
+  bf1[56] = bf0[56];
+  bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit[stage]);
+  bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit[stage]);
+  bf1[59] = bf0[59];
+  bf1[60] = bf0[60];
+  bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit[stage]);
+  bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit[stage]);
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
+  bf1[8] = bf0[8] + bf0[9];
+  bf1[9] = bf0[8] - bf0[9];
+  bf1[10] = -bf0[10] + bf0[11];
+  bf1[11] = bf0[10] + bf0[11];
+  bf1[12] = bf0[12] + bf0[13];
+  bf1[13] = bf0[12] - bf0[13];
+  bf1[14] = -bf0[14] + bf0[15];
+  bf1[15] = bf0[14] + bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit[stage]);
+  bf1[27] = bf0[27];
+  bf1[28] = bf0[28];
+  bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit[stage]);
+  bf1[31] = bf0[31];
+  bf1[32] = bf0[32] + bf0[35];
+  bf1[33] = bf0[33] + bf0[34];
+  bf1[34] = bf0[33] - bf0[34];
+  bf1[35] = bf0[32] - bf0[35];
+  bf1[36] = -bf0[36] + bf0[39];
+  bf1[37] = -bf0[37] + bf0[38];
+  bf1[38] = bf0[37] + bf0[38];
+  bf1[39] = bf0[36] + bf0[39];
+  bf1[40] = bf0[40] + bf0[43];
+  bf1[41] = bf0[41] + bf0[42];
+  bf1[42] = bf0[41] - bf0[42];
+  bf1[43] = bf0[40] - bf0[43];
+  bf1[44] = -bf0[44] + bf0[47];
+  bf1[45] = -bf0[45] + bf0[46];
+  bf1[46] = bf0[45] + bf0[46];
+  bf1[47] = bf0[44] + bf0[47];
+  bf1[48] = bf0[48] + bf0[51];
+  bf1[49] = bf0[49] + bf0[50];
+  bf1[50] = bf0[49] - bf0[50];
+  bf1[51] = bf0[48] - bf0[51];
+  bf1[52] = -bf0[52] + bf0[55];
+  bf1[53] = -bf0[53] + bf0[54];
+  bf1[54] = bf0[53] + bf0[54];
+  bf1[55] = bf0[52] + bf0[55];
+  bf1[56] = bf0[56] + bf0[59];
+  bf1[57] = bf0[57] + bf0[58];
+  bf1[58] = bf0[57] - bf0[58];
+  bf1[59] = bf0[56] - bf0[59];
+  bf1[60] = -bf0[60] + bf0[63];
+  bf1[61] = -bf0[61] + bf0[62];
+  bf1[62] = bf0[61] + bf0[62];
+  bf1[63] = bf0[60] + bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+  bf1[4] = bf0[4] + bf0[5];
+  bf1[5] = bf0[4] - bf0[5];
+  bf1[6] = -bf0[6] + bf0[7];
+  bf1[7] = bf0[6] + bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]);
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[19];
+  bf1[17] = bf0[17] + bf0[18];
+  bf1[18] = bf0[17] - bf0[18];
+  bf1[19] = bf0[16] - bf0[19];
+  bf1[20] = -bf0[20] + bf0[23];
+  bf1[21] = -bf0[21] + bf0[22];
+  bf1[22] = bf0[21] + bf0[22];
+  bf1[23] = bf0[20] + bf0[23];
+  bf1[24] = bf0[24] + bf0[27];
+  bf1[25] = bf0[25] + bf0[26];
+  bf1[26] = bf0[25] - bf0[26];
+  bf1[27] = bf0[24] - bf0[27];
+  bf1[28] = -bf0[28] + bf0[31];
+  bf1[29] = -bf0[29] + bf0[30];
+  bf1[30] = bf0[29] + bf0[30];
+  bf1[31] = bf0[28] + bf0[31];
+  bf1[32] = bf0[32];
+  bf1[33] = bf0[33];
+  bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit[stage]);
+  bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit[stage]);
+  bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit[stage]);
+  bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit[stage]);
+  bf1[38] = bf0[38];
+  bf1[39] = bf0[39];
+  bf1[40] = bf0[40];
+  bf1[41] = bf0[41];
+  bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit[stage]);
+  bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit[stage]);
+  bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit[stage]);
+  bf1[46] = bf0[46];
+  bf1[47] = bf0[47];
+  bf1[48] = bf0[48];
+  bf1[49] = bf0[49];
+  bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit[stage]);
+  bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit[stage]);
+  bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit[stage]);
+  bf1[54] = bf0[54];
+  bf1[55] = bf0[55];
+  bf1[56] = bf0[56];
+  bf1[57] = bf0[57];
+  bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit[stage]);
+  bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit[stage]);
+  bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit[stage]);
+  bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit[stage]);
+  bf1[62] = bf0[62];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = bf0[1] - bf0[2];
+  bf1[3] = bf0[0] - bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8] + bf0[11];
+  bf1[9] = bf0[9] + bf0[10];
+  bf1[10] = bf0[9] - bf0[10];
+  bf1[11] = bf0[8] - bf0[11];
+  bf1[12] = -bf0[12] + bf0[15];
+  bf1[13] = -bf0[13] + bf0[14];
+  bf1[14] = bf0[13] + bf0[14];
+  bf1[15] = bf0[12] + bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit[stage]);
+  bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit[stage]);
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  bf1[32] = bf0[32] + bf0[39];
+  bf1[33] = bf0[33] + bf0[38];
+  bf1[34] = bf0[34] + bf0[37];
+  bf1[35] = bf0[35] + bf0[36];
+  bf1[36] = bf0[35] - bf0[36];
+  bf1[37] = bf0[34] - bf0[37];
+  bf1[38] = bf0[33] - bf0[38];
+  bf1[39] = bf0[32] - bf0[39];
+  bf1[40] = -bf0[40] + bf0[47];
+  bf1[41] = -bf0[41] + bf0[46];
+  bf1[42] = -bf0[42] + bf0[45];
+  bf1[43] = -bf0[43] + bf0[44];
+  bf1[44] = bf0[43] + bf0[44];
+  bf1[45] = bf0[42] + bf0[45];
+  bf1[46] = bf0[41] + bf0[46];
+  bf1[47] = bf0[40] + bf0[47];
+  bf1[48] = bf0[48] + bf0[55];
+  bf1[49] = bf0[49] + bf0[54];
+  bf1[50] = bf0[50] + bf0[53];
+  bf1[51] = bf0[51] + bf0[52];
+  bf1[52] = bf0[51] - bf0[52];
+  bf1[53] = bf0[50] - bf0[53];
+  bf1[54] = bf0[49] - bf0[54];
+  bf1[55] = bf0[48] - bf0[55];
+  bf1[56] = -bf0[56] + bf0[63];
+  bf1[57] = -bf0[57] + bf0[62];
+  bf1[58] = -bf0[58] + bf0[61];
+  bf1[59] = -bf0[59] + bf0[60];
+  bf1[60] = bf0[59] + bf0[60];
+  bf1[61] = bf0[58] + bf0[61];
+  bf1[62] = bf0[57] + bf0[62];
+  bf1[63] = bf0[56] + bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 8
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[7];
+  bf1[1] = bf0[1] + bf0[6];
+  bf1[2] = bf0[2] + bf0[5];
+  bf1[3] = bf0[3] + bf0[4];
+  bf1[4] = bf0[3] - bf0[4];
+  bf1[5] = bf0[2] - bf0[5];
+  bf1[6] = bf0[1] - bf0[6];
+  bf1[7] = bf0[0] - bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[23];
+  bf1[17] = bf0[17] + bf0[22];
+  bf1[18] = bf0[18] + bf0[21];
+  bf1[19] = bf0[19] + bf0[20];
+  bf1[20] = bf0[19] - bf0[20];
+  bf1[21] = bf0[18] - bf0[21];
+  bf1[22] = bf0[17] - bf0[22];
+  bf1[23] = bf0[16] - bf0[23];
+  bf1[24] = -bf0[24] + bf0[31];
+  bf1[25] = -bf0[25] + bf0[30];
+  bf1[26] = -bf0[26] + bf0[29];
+  bf1[27] = -bf0[27] + bf0[28];
+  bf1[28] = bf0[27] + bf0[28];
+  bf1[29] = bf0[26] + bf0[29];
+  bf1[30] = bf0[25] + bf0[30];
+  bf1[31] = bf0[24] + bf0[31];
+  bf1[32] = bf0[32];
+  bf1[33] = bf0[33];
+  bf1[34] = bf0[34];
+  bf1[35] = bf0[35];
+  bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit[stage]);
+  bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit[stage]);
+  bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit[stage]);
+  bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit[stage]);
+  bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit[stage]);
+  bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit[stage]);
+  bf1[44] = bf0[44];
+  bf1[45] = bf0[45];
+  bf1[46] = bf0[46];
+  bf1[47] = bf0[47];
+  bf1[48] = bf0[48];
+  bf1[49] = bf0[49];
+  bf1[50] = bf0[50];
+  bf1[51] = bf0[51];
+  bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit[stage]);
+  bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit[stage]);
+  bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit[stage]);
+  bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit[stage]);
+  bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit[stage]);
+  bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit[stage]);
+  bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit[stage]);
+  bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit[stage]);
+  bf1[60] = bf0[60];
+  bf1[61] = bf0[61];
+  bf1[62] = bf0[62];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 9
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[15];
+  bf1[1] = bf0[1] + bf0[14];
+  bf1[2] = bf0[2] + bf0[13];
+  bf1[3] = bf0[3] + bf0[12];
+  bf1[4] = bf0[4] + bf0[11];
+  bf1[5] = bf0[5] + bf0[10];
+  bf1[6] = bf0[6] + bf0[9];
+  bf1[7] = bf0[7] + bf0[8];
+  bf1[8] = bf0[7] - bf0[8];
+  bf1[9] = bf0[6] - bf0[9];
+  bf1[10] = bf0[5] - bf0[10];
+  bf1[11] = bf0[4] - bf0[11];
+  bf1[12] = bf0[3] - bf0[12];
+  bf1[13] = bf0[2] - bf0[13];
+  bf1[14] = bf0[1] - bf0[14];
+  bf1[15] = bf0[0] - bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  bf1[32] = bf0[32] + bf0[47];
+  bf1[33] = bf0[33] + bf0[46];
+  bf1[34] = bf0[34] + bf0[45];
+  bf1[35] = bf0[35] + bf0[44];
+  bf1[36] = bf0[36] + bf0[43];
+  bf1[37] = bf0[37] + bf0[42];
+  bf1[38] = bf0[38] + bf0[41];
+  bf1[39] = bf0[39] + bf0[40];
+  bf1[40] = bf0[39] - bf0[40];
+  bf1[41] = bf0[38] - bf0[41];
+  bf1[42] = bf0[37] - bf0[42];
+  bf1[43] = bf0[36] - bf0[43];
+  bf1[44] = bf0[35] - bf0[44];
+  bf1[45] = bf0[34] - bf0[45];
+  bf1[46] = bf0[33] - bf0[46];
+  bf1[47] = bf0[32] - bf0[47];
+  bf1[48] = -bf0[48] + bf0[63];
+  bf1[49] = -bf0[49] + bf0[62];
+  bf1[50] = -bf0[50] + bf0[61];
+  bf1[51] = -bf0[51] + bf0[60];
+  bf1[52] = -bf0[52] + bf0[59];
+  bf1[53] = -bf0[53] + bf0[58];
+  bf1[54] = -bf0[54] + bf0[57];
+  bf1[55] = -bf0[55] + bf0[56];
+  bf1[56] = bf0[55] + bf0[56];
+  bf1[57] = bf0[54] + bf0[57];
+  bf1[58] = bf0[53] + bf0[58];
+  bf1[59] = bf0[52] + bf0[59];
+  bf1[60] = bf0[51] + bf0[60];
+  bf1[61] = bf0[50] + bf0[61];
+  bf1[62] = bf0[49] + bf0[62];
+  bf1[63] = bf0[48] + bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 10
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[31];
+  bf1[1] = bf0[1] + bf0[30];
+  bf1[2] = bf0[2] + bf0[29];
+  bf1[3] = bf0[3] + bf0[28];
+  bf1[4] = bf0[4] + bf0[27];
+  bf1[5] = bf0[5] + bf0[26];
+  bf1[6] = bf0[6] + bf0[25];
+  bf1[7] = bf0[7] + bf0[24];
+  bf1[8] = bf0[8] + bf0[23];
+  bf1[9] = bf0[9] + bf0[22];
+  bf1[10] = bf0[10] + bf0[21];
+  bf1[11] = bf0[11] + bf0[20];
+  bf1[12] = bf0[12] + bf0[19];
+  bf1[13] = bf0[13] + bf0[18];
+  bf1[14] = bf0[14] + bf0[17];
+  bf1[15] = bf0[15] + bf0[16];
+  bf1[16] = bf0[15] - bf0[16];
+  bf1[17] = bf0[14] - bf0[17];
+  bf1[18] = bf0[13] - bf0[18];
+  bf1[19] = bf0[12] - bf0[19];
+  bf1[20] = bf0[11] - bf0[20];
+  bf1[21] = bf0[10] - bf0[21];
+  bf1[22] = bf0[9] - bf0[22];
+  bf1[23] = bf0[8] - bf0[23];
+  bf1[24] = bf0[7] - bf0[24];
+  bf1[25] = bf0[6] - bf0[25];
+  bf1[26] = bf0[5] - bf0[26];
+  bf1[27] = bf0[4] - bf0[27];
+  bf1[28] = bf0[3] - bf0[28];
+  bf1[29] = bf0[2] - bf0[29];
+  bf1[30] = bf0[1] - bf0[30];
+  bf1[31] = bf0[0] - bf0[31];
+  bf1[32] = bf0[32];
+  bf1[33] = bf0[33];
+  bf1[34] = bf0[34];
+  bf1[35] = bf0[35];
+  bf1[36] = bf0[36];
+  bf1[37] = bf0[37];
+  bf1[38] = bf0[38];
+  bf1[39] = bf0[39];
+  bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
+  bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
+  bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
+  bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
+  bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
+  bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
+  bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
+  bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
+  bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
+  bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
+  bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
+  bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
+  bf1[56] = bf0[56];
+  bf1[57] = bf0[57];
+  bf1[58] = bf0[58];
+  bf1[59] = bf0[59];
+  bf1[60] = bf0[60];
+  bf1[61] = bf0[61];
+  bf1[62] = bf0[62];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 11
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[63];
+  bf1[1] = bf0[1] + bf0[62];
+  bf1[2] = bf0[2] + bf0[61];
+  bf1[3] = bf0[3] + bf0[60];
+  bf1[4] = bf0[4] + bf0[59];
+  bf1[5] = bf0[5] + bf0[58];
+  bf1[6] = bf0[6] + bf0[57];
+  bf1[7] = bf0[7] + bf0[56];
+  bf1[8] = bf0[8] + bf0[55];
+  bf1[9] = bf0[9] + bf0[54];
+  bf1[10] = bf0[10] + bf0[53];
+  bf1[11] = bf0[11] + bf0[52];
+  bf1[12] = bf0[12] + bf0[51];
+  bf1[13] = bf0[13] + bf0[50];
+  bf1[14] = bf0[14] + bf0[49];
+  bf1[15] = bf0[15] + bf0[48];
+  bf1[16] = bf0[16] + bf0[47];
+  bf1[17] = bf0[17] + bf0[46];
+  bf1[18] = bf0[18] + bf0[45];
+  bf1[19] = bf0[19] + bf0[44];
+  bf1[20] = bf0[20] + bf0[43];
+  bf1[21] = bf0[21] + bf0[42];
+  bf1[22] = bf0[22] + bf0[41];
+  bf1[23] = bf0[23] + bf0[40];
+  bf1[24] = bf0[24] + bf0[39];
+  bf1[25] = bf0[25] + bf0[38];
+  bf1[26] = bf0[26] + bf0[37];
+  bf1[27] = bf0[27] + bf0[36];
+  bf1[28] = bf0[28] + bf0[35];
+  bf1[29] = bf0[29] + bf0[34];
+  bf1[30] = bf0[30] + bf0[33];
+  bf1[31] = bf0[31] + bf0[32];
+  bf1[32] = bf0[31] - bf0[32];
+  bf1[33] = bf0[30] - bf0[33];
+  bf1[34] = bf0[29] - bf0[34];
+  bf1[35] = bf0[28] - bf0[35];
+  bf1[36] = bf0[27] - bf0[36];
+  bf1[37] = bf0[26] - bf0[37];
+  bf1[38] = bf0[25] - bf0[38];
+  bf1[39] = bf0[24] - bf0[39];
+  bf1[40] = bf0[23] - bf0[40];
+  bf1[41] = bf0[22] - bf0[41];
+  bf1[42] = bf0[21] - bf0[42];
+  bf1[43] = bf0[20] - bf0[43];
+  bf1[44] = bf0[19] - bf0[44];
+  bf1[45] = bf0[18] - bf0[45];
+  bf1[46] = bf0[17] - bf0[46];
+  bf1[47] = bf0[16] - bf0[47];
+  bf1[48] = bf0[15] - bf0[48];
+  bf1[49] = bf0[14] - bf0[49];
+  bf1[50] = bf0[13] - bf0[50];
+  bf1[51] = bf0[12] - bf0[51];
+  bf1[52] = bf0[11] - bf0[52];
+  bf1[53] = bf0[10] - bf0[53];
+  bf1[54] = bf0[9] - bf0[54];
+  bf1[55] = bf0[8] - bf0[55];
+  bf1[56] = bf0[7] - bf0[56];
+  bf1[57] = bf0[6] - bf0[57];
+  bf1[58] = bf0[5] - bf0[58];
+  bf1[59] = bf0[4] - bf0[59];
+  bf1[60] = bf0[3] - bf0[60];
+  bf1[61] = bf0[2] - bf0[61];
+  bf1[62] = bf0[1] - bf0[62];
+  bf1[63] = bf0[0] - bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
diff --git a/vp10/common/vp10_inv_txfm1d.h b/vp10/common/vp10_inv_txfm1d.h
index 0609b65..fd547a6 100644
--- a/vp10/common/vp10_inv_txfm1d.h
+++ b/vp10/common/vp10_inv_txfm1d.h
@@ -25,6 +25,8 @@
                      const int8_t *cos_bit, const int8_t *stage_range);
 void vp10_idct32_new(const int32_t *input, int32_t *output,
                      const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_idct64_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
 
 void vp10_iadst4_new(const int32_t *input, int32_t *output,
                      const int8_t *cos_bit, const int8_t *stage_range);
diff --git a/vp10/common/vp10_inv_txfm2d.c b/vp10/common/vp10_inv_txfm2d.c
index c894a42..d9f713c 100644
--- a/vp10/common/vp10_inv_txfm2d.c
+++ b/vp10/common/vp10_inv_txfm2d.c
@@ -9,8 +9,47 @@
  */
 
 #include "vp10/common/vp10_txfm.h"
+#include "vp10/common/vp10_inv_txfm1d.h"
 
-static INLINE void inv_txfm2d_add_c(const int32_t *input, int16_t *output,
+typedef void (*TxfmFunc)(const int32_t *input, int32_t *output,
+                         const int8_t *cos_bit, const int8_t *stage_range);
+
+// Maps a 1-D inverse transform type to its implementation. Uses the
+// portable INLINE macro (not bare `inline`) so C90/MSVC C builds keep
+// compiling, matching the convention of the inv_txfm2d_add_c line below.
+static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) {
+  switch (txfm_type) {
+    case TXFM_TYPE_DCT4:
+      return vp10_idct4_new;
+    case TXFM_TYPE_DCT8:
+      return vp10_idct8_new;
+    case TXFM_TYPE_DCT16:
+      return vp10_idct16_new;
+    case TXFM_TYPE_DCT32:
+      return vp10_idct32_new;
+    case TXFM_TYPE_DCT64:
+      return vp10_idct64_new;
+    case TXFM_TYPE_ADST4:
+      return vp10_iadst4_new;
+    case TXFM_TYPE_ADST8:
+      return vp10_iadst8_new;
+    case TXFM_TYPE_ADST16:
+      return vp10_iadst16_new;
+    case TXFM_TYPE_ADST32:
+      return vp10_iadst32_new;
+    default:
+      // Unknown transform type is a programming error.
+      assert(0);
+      return NULL;
+  }
+}
+
+static INLINE void inv_txfm2d_add_c(const int32_t *input, int16_t *output,
                                     int stride, const TXFM_2D_CFG *cfg,
                                     int32_t *txfm_buf) {
   const int txfm_size = cfg->txfm_size;
@@ -19,8 +58,8 @@
   const int8_t *stage_range_row = cfg->stage_range_row;
   const int8_t *cos_bit_col = cfg->cos_bit_col;
   const int8_t *cos_bit_row = cfg->cos_bit_row;
-  const TxfmFunc txfm_func_col = cfg->txfm_func_col;
-  const TxfmFunc txfm_func_row = cfg->txfm_func_row;
+  const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->txfm_type_col);
+  const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->txfm_type_row);
 
   // txfm_buf's length is  txfm_size * txfm_size + 2 * txfm_size
   // it is used for intermediate data buffering
@@ -49,7 +88,7 @@
   }
 }
 
-void vp10_inv_txfm2d_add_4x4(const int32_t *input, uint16_t *output,
+void vp10_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output,
                              const int stride, const TXFM_2D_CFG *cfg,
                              const int bd) {
   int txfm_buf[4 * 4 + 4 + 4];
@@ -61,7 +100,7 @@
   clamp_block((int16_t *)output, 4, stride, 0, (1 << bd) - 1);
 }
 
-void vp10_inv_txfm2d_add_8x8(const int32_t *input, uint16_t *output,
+void vp10_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output,
                              const int stride, const TXFM_2D_CFG *cfg,
                              const int bd) {
   int txfm_buf[8 * 8 + 8 + 8];
@@ -73,7 +112,7 @@
   clamp_block((int16_t *)output, 8, stride, 0, (1 << bd) - 1);
 }
 
-void vp10_inv_txfm2d_add_16x16(const int32_t *input, uint16_t *output,
+void vp10_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output,
                                const int stride, const TXFM_2D_CFG *cfg,
                                const int bd) {
   int txfm_buf[16 * 16 + 16 + 16];
@@ -85,7 +124,7 @@
   clamp_block((int16_t *)output, 16, stride, 0, (1 << bd) - 1);
 }
 
-void vp10_inv_txfm2d_add_32x32(const int32_t *input, uint16_t *output,
+void vp10_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output,
                                const int stride, const TXFM_2D_CFG *cfg,
                                const int bd) {
   int txfm_buf[32 * 32 + 32 + 32];
@@ -96,3 +135,15 @@
   inv_txfm2d_add_c(input, (int16_t *)output, stride, cfg, txfm_buf);
   clamp_block((int16_t *)output, 32, stride, 0, (1 << bd) - 1);
 }
+
+void vp10_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output,
+                                 const int stride, const TXFM_2D_CFG *cfg,
+                                 const int bd) {
+  int txfm_buf[64 * 64 + 64 + 64];
+  // The prediction signal in output is always non-negative and bounded above
+  // by (1 << bd) - 1.
+  // Since bd < 16 - 1 the values fit in int16_t, so the uint16_t* output
+  // buffer can safely be treated as an int16_t* here.
+  inv_txfm2d_add_c(input, (int16_t *)output, stride, cfg, txfm_buf);
+  clamp_block((int16_t *)output, 64, stride, 0, (1 << bd) - 1);
+}
diff --git a/vp10/common/vp10_inv_txfm2d.h b/vp10/common/vp10_inv_txfm2d.h
deleted file mode 100644
index 1b570ef..0000000
--- a/vp10/common/vp10_inv_txfm2d.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP10_INV_TXFM2D_C_H_
-#define VP10_INV_TXFM2D_C_H_
-
-#include "vp10/common/vp10_inv_txfm2d_cfg.h"
-#ifdef __cplusplus
-extern "C" {
-#endif
-void vp10_inv_txfm2d_add_4x4(const int32_t *input, uint16_t *output,
-                             const int stride, const TXFM_2D_CFG *cfg,
-                             const int bd);
-void vp10_inv_txfm2d_add_8x8(const int32_t *input, uint16_t *output,
-                             const int stride, const TXFM_2D_CFG *cfg,
-                             const int bd);
-void vp10_inv_txfm2d_add_16x16(const int32_t *input, uint16_t *output,
-                               const int stride, const TXFM_2D_CFG *cfg,
-                               const int bd);
-void vp10_inv_txfm2d_add_32x32(const int32_t *input, uint16_t *output,
-                               const int stride, const TXFM_2D_CFG *cfg,
-                               const int bd);
-#ifdef __cplusplus
-}
-#endif
-#endif  // VP10_INV_TXFM2D_C_H_
diff --git a/vp10/common/vp10_inv_txfm2d_cfg.h b/vp10/common/vp10_inv_txfm2d_cfg.h
index fc552fe..ee965ba 100644
--- a/vp10/common/vp10_inv_txfm2d_cfg.h
+++ b/vp10/common/vp10_inv_txfm2d_cfg.h
@@ -11,7 +11,6 @@
 #ifndef VP10_INV_TXFM2D_CFG_H_
 #define VP10_INV_TXFM2D_CFG_H_
 #include "vp10/common/vp10_inv_txfm1d.h"
-
 //  ---------------- config inv_dct_dct_4 ----------------
 static const int8_t inv_shift_dct_dct_4[2] = {1, -5};
 static const int8_t inv_stage_range_col_dct_dct_4[4] = {17, 17, 16, 16};
@@ -28,8 +27,8 @@
     inv_stage_range_row_dct_dct_4,  // .stage_range_row
     inv_cos_bit_col_dct_dct_4,      // .cos_bit_col
     inv_cos_bit_row_dct_dct_4,      // .cos_bit_row
-    vp10_idct4_new,                 // .txfm_func_col
-    vp10_idct4_new};                // .txfm_func_row;
+    TXFM_TYPE_DCT4,                 // .txfm_type_col
+    TXFM_TYPE_DCT4};                // .txfm_type_row
 
 //  ---------------- config inv_dct_dct_8 ----------------
 static const int8_t inv_shift_dct_dct_8[2] = {0, -5};
@@ -47,8 +46,8 @@
     inv_stage_range_row_dct_dct_8,  // .stage_range_row
     inv_cos_bit_col_dct_dct_8,      // .cos_bit_col
     inv_cos_bit_row_dct_dct_8,      // .cos_bit_row
-    vp10_idct8_new,                 // .txfm_func_col
-    vp10_idct8_new};                // .txfm_func_row;
+    TXFM_TYPE_DCT8,                 // .txfm_type_col
+    TXFM_TYPE_DCT8};                // .txfm_type_row
 
 //  ---------------- config inv_dct_dct_16 ----------------
 static const int8_t inv_shift_dct_dct_16[2] = {0, -6};
@@ -70,8 +69,8 @@
     inv_stage_range_row_dct_dct_16,  // .stage_range_row
     inv_cos_bit_col_dct_dct_16,      // .cos_bit_col
     inv_cos_bit_row_dct_dct_16,      // .cos_bit_row
-    vp10_idct16_new,                 // .txfm_func_col
-    vp10_idct16_new};                // .txfm_func_row;
+    TXFM_TYPE_DCT16,                 // .txfm_type_col
+    TXFM_TYPE_DCT16};                // .txfm_type_row
 
 //  ---------------- config inv_dct_dct_32 ----------------
 static const int8_t inv_shift_dct_dct_32[2] = {-1, -6};
@@ -93,8 +92,31 @@
     inv_stage_range_row_dct_dct_32,  // .stage_range_row
     inv_cos_bit_col_dct_dct_32,      // .cos_bit_col
     inv_cos_bit_row_dct_dct_32,      // .cos_bit_row
-    vp10_idct32_new,                 // .txfm_func_col
-    vp10_idct32_new};                // .txfm_func_row;
+    TXFM_TYPE_DCT32,                 // .txfm_type_col
+    TXFM_TYPE_DCT32};                // .txfm_type_row
+
+//  ---------------- config inv_dct_dct_64 ----------------
+static const int8_t inv_shift_dct_dct_64[2] = {-1, -7};
+static const int8_t inv_stage_range_col_dct_dct_64[12] = {
+    19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18};
+static const int8_t inv_stage_range_row_dct_dct_64[12] = {
+    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
+static const int8_t inv_cos_bit_col_dct_dct_64[12] = {13, 13, 13, 13, 13, 13,
+                                                      13, 13, 13, 13, 13, 14};
+static const int8_t inv_cos_bit_row_dct_dct_64[12] = {12, 12, 12, 12, 12, 12,
+                                                      12, 12, 12, 12, 12, 12};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_64 = {
+    64,                              // .txfm_size
+    12,                              // .stage_num_col
+    12,                              // .stage_num_row
+    inv_shift_dct_dct_64,            // .shift
+    inv_stage_range_col_dct_dct_64,  // .stage_range_col
+    inv_stage_range_row_dct_dct_64,  // .stage_range_row
+    inv_cos_bit_col_dct_dct_64,      // .cos_bit_col
+    inv_cos_bit_row_dct_dct_64,      // .cos_bit_row
+    TXFM_TYPE_DCT64,                 // .txfm_type_col
+    TXFM_TYPE_DCT64};                // .txfm_type_row
 
 //  ---------------- config inv_dct_adst_4 ----------------
 static const int8_t inv_shift_dct_adst_4[2] = {1, -5};
@@ -113,8 +135,8 @@
     inv_stage_range_row_dct_adst_4,  // .stage_range_row
     inv_cos_bit_col_dct_adst_4,      // .cos_bit_col
     inv_cos_bit_row_dct_adst_4,      // .cos_bit_row
-    vp10_idct4_new,                  // .txfm_func_col
-    vp10_iadst4_new};                // .txfm_func_row;
+    TXFM_TYPE_DCT4,                  // .txfm_type_col
+    TXFM_TYPE_ADST4};                // .txfm_type_row
 
 //  ---------------- config inv_dct_adst_8 ----------------
 static const int8_t inv_shift_dct_adst_8[2] = {-1, -4};
@@ -135,8 +157,8 @@
     inv_stage_range_row_dct_adst_8,  // .stage_range_row
     inv_cos_bit_col_dct_adst_8,      // .cos_bit_col
     inv_cos_bit_row_dct_adst_8,      // .cos_bit_row
-    vp10_idct8_new,                  // .txfm_func_col
-    vp10_iadst8_new};                // .txfm_func_row;
+    TXFM_TYPE_DCT8,                  // .txfm_type_col
+    TXFM_TYPE_ADST8};                // .txfm_type_row
 
 //  ---------------- config inv_dct_adst_16 ----------------
 static const int8_t inv_shift_dct_adst_16[2] = {1, -7};
@@ -158,8 +180,8 @@
     inv_stage_range_row_dct_adst_16,  // .stage_range_row
     inv_cos_bit_col_dct_adst_16,      // .cos_bit_col
     inv_cos_bit_row_dct_adst_16,      // .cos_bit_row
-    vp10_idct16_new,                  // .txfm_func_col
-    vp10_iadst16_new};                // .txfm_func_row;
+    TXFM_TYPE_DCT16,                  // .txfm_type_col
+    TXFM_TYPE_ADST16};                // .txfm_type_row
 
 //  ---------------- config inv_dct_adst_32 ----------------
 static const int8_t inv_shift_dct_adst_32[2] = {-1, -6};
@@ -181,8 +203,8 @@
     inv_stage_range_row_dct_adst_32,  // .stage_range_row
     inv_cos_bit_col_dct_adst_32,      // .cos_bit_col
     inv_cos_bit_row_dct_adst_32,      // .cos_bit_row
-    vp10_idct32_new,                  // .txfm_func_col
-    vp10_iadst32_new};                // .txfm_func_row;
+    TXFM_TYPE_DCT32,                  // .txfm_type_col
+    TXFM_TYPE_ADST32};                // .txfm_type_row
 
 //  ---------------- config inv_adst_adst_4 ----------------
 static const int8_t inv_shift_adst_adst_4[2] = {0, -4};
@@ -202,8 +224,8 @@
     inv_stage_range_row_adst_adst_4,  // .stage_range_row
     inv_cos_bit_col_adst_adst_4,      // .cos_bit_col
     inv_cos_bit_row_adst_adst_4,      // .cos_bit_row
-    vp10_iadst4_new,                  // .txfm_func_col
-    vp10_iadst4_new};                 // .txfm_func_row;
+    TXFM_TYPE_ADST4,                  // .txfm_type_col
+    TXFM_TYPE_ADST4};                 // .txfm_type_row
 
 //  ---------------- config inv_adst_adst_8 ----------------
 static const int8_t inv_shift_adst_adst_8[2] = {-1, -4};
@@ -225,8 +247,8 @@
     inv_stage_range_row_adst_adst_8,  // .stage_range_row
     inv_cos_bit_col_adst_adst_8,      // .cos_bit_col
     inv_cos_bit_row_adst_adst_8,      // .cos_bit_row
-    vp10_iadst8_new,                  // .txfm_func_col
-    vp10_iadst8_new};                 // .txfm_func_row;
+    TXFM_TYPE_ADST8,                  // .txfm_type_col
+    TXFM_TYPE_ADST8};                 // .txfm_type_row
 
 //  ---------------- config inv_adst_adst_16 ----------------
 static const int8_t inv_shift_adst_adst_16[2] = {0, -6};
@@ -248,8 +270,8 @@
     inv_stage_range_row_adst_adst_16,  // .stage_range_row
     inv_cos_bit_col_adst_adst_16,      // .cos_bit_col
     inv_cos_bit_row_adst_adst_16,      // .cos_bit_row
-    vp10_iadst16_new,                  // .txfm_func_col
-    vp10_iadst16_new};                 // .txfm_func_row;
+    TXFM_TYPE_ADST16,                  // .txfm_type_col
+    TXFM_TYPE_ADST16};                 // .txfm_type_row
 
 //  ---------------- config inv_adst_adst_32 ----------------
 static const int8_t inv_shift_adst_adst_32[2] = {-1, -6};
@@ -271,8 +293,8 @@
     inv_stage_range_row_adst_adst_32,  // .stage_range_row
     inv_cos_bit_col_adst_adst_32,      // .cos_bit_col
     inv_cos_bit_row_adst_adst_32,      // .cos_bit_row
-    vp10_iadst32_new,                  // .txfm_func_col
-    vp10_iadst32_new};                 // .txfm_func_row;
+    TXFM_TYPE_ADST32,                  // .txfm_type_col
+    TXFM_TYPE_ADST32};                 // .txfm_type_row
 
 //  ---------------- config inv_adst_dct_4 ----------------
 static const int8_t inv_shift_adst_dct_4[2] = {1, -5};
@@ -291,8 +313,8 @@
     inv_stage_range_row_adst_dct_4,  // .stage_range_row
     inv_cos_bit_col_adst_dct_4,      // .cos_bit_col
     inv_cos_bit_row_adst_dct_4,      // .cos_bit_row
-    vp10_iadst4_new,                 // .txfm_func_col
-    vp10_idct4_new};                 // .txfm_func_row;
+    TXFM_TYPE_ADST4,                 // .txfm_type_col
+    TXFM_TYPE_DCT4};                 // .txfm_type_row
 
 //  ---------------- config inv_adst_dct_8 ----------------
 static const int8_t inv_shift_adst_dct_8[2] = {-1, -4};
@@ -313,8 +335,8 @@
     inv_stage_range_row_adst_dct_8,  // .stage_range_row
     inv_cos_bit_col_adst_dct_8,      // .cos_bit_col
     inv_cos_bit_row_adst_dct_8,      // .cos_bit_row
-    vp10_iadst8_new,                 // .txfm_func_col
-    vp10_idct8_new};                 // .txfm_func_row;
+    TXFM_TYPE_ADST8,                 // .txfm_type_col
+    TXFM_TYPE_DCT8};                 // .txfm_type_row
 
 //  ---------------- config inv_adst_dct_16 ----------------
 static const int8_t inv_shift_adst_dct_16[2] = {-1, -5};
@@ -336,8 +358,8 @@
     inv_stage_range_row_adst_dct_16,  // .stage_range_row
     inv_cos_bit_col_adst_dct_16,      // .cos_bit_col
     inv_cos_bit_row_adst_dct_16,      // .cos_bit_row
-    vp10_iadst16_new,                 // .txfm_func_col
-    vp10_idct16_new};                 // .txfm_func_row;
+    TXFM_TYPE_ADST16,                 // .txfm_type_col
+    TXFM_TYPE_DCT16};                 // .txfm_type_row
 
 //  ---------------- config inv_adst_dct_32 ----------------
 static const int8_t inv_shift_adst_dct_32[2] = {-1, -6};
@@ -359,7 +381,7 @@
     inv_stage_range_row_adst_dct_32,  // .stage_range_row
     inv_cos_bit_col_adst_dct_32,      // .cos_bit_col
     inv_cos_bit_row_adst_dct_32,      // .cos_bit_row
-    vp10_iadst32_new,                 // .txfm_func_col
-    vp10_idct32_new};                 // .txfm_func_row;
+    TXFM_TYPE_ADST32,                 // .txfm_type_col
+    TXFM_TYPE_DCT32};                 // .txfm_type_row
 
 #endif  // VP10_INV_TXFM2D_CFG_H_
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index f617ff6..4612395 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -7,6 +7,7 @@
 #include "vpx/vpx_integer.h"
 #include "vp10/common/common.h"
 #include "vp10/common/enums.h"
+#include "vp10/common/vp10_txfm.h"
 
 struct macroblockd;
 
@@ -611,6 +612,32 @@
   }  # CONFIG_EMULATE_HARDWARE
 }  # CONFIG_VP9_HIGHBITDEPTH
 
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  #fwd txfm
+  add_proto qw/void vp10_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
+  specialize qw/vp10_fwd_txfm2d_4x4/;
+  add_proto qw/void vp10_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
+  specialize qw/vp10_fwd_txfm2d_8x8/;
+  add_proto qw/void vp10_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
+  specialize qw/vp10_fwd_txfm2d_16x16/;
+  add_proto qw/void vp10_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
+  specialize qw/vp10_fwd_txfm2d_32x32/;
+  add_proto qw/void vp10_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
+  specialize qw/vp10_fwd_txfm2d_64x64/;
+
+  #inv txfm
+  add_proto qw/void vp10_inv_txfm2d_add_4x4/, "const int32_t *input, uint16_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
+  specialize qw/vp10_inv_txfm2d_add_4x4/;
+  add_proto qw/void vp10_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
+  specialize qw/vp10_inv_txfm2d_add_8x8/;
+  add_proto qw/void vp10_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
+  specialize qw/vp10_inv_txfm2d_add_16x16/;
+  add_proto qw/void vp10_inv_txfm2d_add_32x32/, "const int32_t *input, uint16_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
+  specialize qw/vp10_inv_txfm2d_add_32x32/;
+  add_proto qw/void vp10_inv_txfm2d_add_64x64/, "const int32_t *input, uint16_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
+  specialize qw/vp10_inv_txfm2d_add_64x64/;
+}
+
 #
 # Motion search
 #
diff --git a/vp10/common/vp10_txfm.h b/vp10/common/vp10_txfm.h
index b4fd753..ad7b38f 100644
--- a/vp10/common/vp10_txfm.h
+++ b/vp10/common/vp10_txfm.h
@@ -118,9 +118,9 @@
   int64_t result_64 = (int64_t)w0 * (int64_t)in0 + (int64_t)w1 * (int64_t)in1;
   if (result_32 != result_64) {
     printf(
-        "%s overflow result_32: %d result_64: %ld w0: %d in0: %d w1: %d in1: "
+        "%s overflow result_32: %d result_64: %lld w0: %d in0: %d w1: %d in1: "
         "%d\n",
-        __func__, result_32, result_64, w0, in0, w1, in1);
+        __func__, result_32, (long long int)result_64, w0, in0, w1, in1);
     assert(0 && "half_btf overflow");
   }
 #endif
@@ -150,6 +150,18 @@
 typedef void (*TxfmFunc)(const int32_t *input, int32_t *output,
                          const int8_t *cos_bit, const int8_t *stage_range);
 
+typedef enum TXFM_TYPE {
+  TXFM_TYPE_DCT4,
+  TXFM_TYPE_DCT8,
+  TXFM_TYPE_DCT16,
+  TXFM_TYPE_DCT32,
+  TXFM_TYPE_DCT64,
+  TXFM_TYPE_ADST4,
+  TXFM_TYPE_ADST8,
+  TXFM_TYPE_ADST16,
+  TXFM_TYPE_ADST32,
+} TXFM_TYPE;
+
 typedef struct TXFM_2D_CFG {
   const int txfm_size;
   const int stage_num_col;
@@ -160,8 +172,8 @@
   const int8_t *stage_range_row;
   const int8_t *cos_bit_col;
   const int8_t *cos_bit_row;
-  const TxfmFunc txfm_func_col;
-  const TxfmFunc txfm_func_row;
+  const TXFM_TYPE txfm_type_col;
+  const TXFM_TYPE txfm_type_row;
 } TXFM_2D_CFG;
 
 #endif  // VP10_TXFM_H_
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index ce6317c..935e4f7 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -117,9 +117,7 @@
   for (i = 0; i < REFMV_MODE_CONTEXTS; ++i)
     vp10_diff_update_prob(r, &fc->refmv_prob[i]);
   for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
-    vp10_diff_update_prob(r, &fc->drl_prob0[i]);
-  for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
-    vp10_diff_update_prob(r, &fc->drl_prob1[i]);
+    vp10_diff_update_prob(r, &fc->drl_prob[i]);
 #if CONFIG_EXT_INTER
   vp10_diff_update_prob(r, &fc->new2mv_prob);
 #endif  // CONFIG_EXT_INTER
@@ -215,58 +213,28 @@
   }
 }
 
-static void inverse_transform_block_inter(MACROBLOCKD* xd, int plane,
-                                          const TX_SIZE tx_size,
-                                          uint8_t *dst, int stride,
-                                          int eob, int block) {
+static void inverse_transform_block(MACROBLOCKD* xd, int plane,
+                                    const TX_TYPE tx_type,
+                                    const TX_SIZE tx_size,
+                                    uint8_t *dst, int stride,
+                                    int eob) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  TX_TYPE tx_type = get_tx_type(pd->plane_type, xd, block, tx_size);
   const int seg_id = xd->mi[0]->mbmi.segment_id;
   if (eob > 0) {
     tran_low_t *const dqcoeff = pd->dqcoeff;
+    INV_TXFM_PARAM inv_txfm_param;
+    inv_txfm_param.tx_type = tx_type;
+    inv_txfm_param.tx_size = tx_size;
+    inv_txfm_param.eob = eob;
+    inv_txfm_param.lossless = xd->lossless[seg_id];
+
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      switch (tx_size) {
-        case TX_4X4:
-          vp10_highbd_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, xd->bd,
-                                       tx_type, xd->lossless[seg_id]);
-          break;
-        case TX_8X8:
-          vp10_highbd_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, xd->bd,
-                                       tx_type);
-          break;
-        case TX_16X16:
-          vp10_highbd_inv_txfm_add_16x16(dqcoeff, dst, stride, eob, xd->bd,
-                                         tx_type);
-          break;
-        case TX_32X32:
-          vp10_highbd_inv_txfm_add_32x32(dqcoeff, dst, stride, eob, xd->bd,
-                                         tx_type);
-          break;
-        default:
-          assert(0 && "Invalid transform size");
-          return;
-      }
+      inv_txfm_param.bd = xd->bd;
+      highbd_inv_txfm_add(dqcoeff, dst, stride, &inv_txfm_param);
     } else {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-      switch (tx_size) {
-        case TX_4X4:
-          vp10_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, tx_type,
-                                xd->lossless[seg_id]);
-          break;
-        case TX_8X8:
-          vp10_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, tx_type);
-          break;
-        case TX_16X16:
-          vp10_inv_txfm_add_16x16(dqcoeff, dst, stride, eob, tx_type);
-          break;
-        case TX_32X32:
-          vp10_inv_txfm_add_32x32(dqcoeff, dst, stride, eob, tx_type);
-          break;
-        default:
-          assert(0 && "Invalid transform size");
-          return;
-      }
+      inv_txfm_add(dqcoeff, dst, stride, &inv_txfm_param);
 #if CONFIG_VP9_HIGHBITDEPTH
     }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -289,75 +257,6 @@
   }
 }
 
-static void inverse_transform_block_intra(MACROBLOCKD* xd, int plane,
-                                          const TX_TYPE tx_type,
-                                          const TX_SIZE tx_size,
-                                          uint8_t *dst, int stride,
-                                          int eob) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int seg_id = xd->mi[0]->mbmi.segment_id;
-  if (eob > 0) {
-    tran_low_t *const dqcoeff = pd->dqcoeff;
-#if CONFIG_VP9_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      switch (tx_size) {
-        case TX_4X4:
-          vp10_highbd_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, xd->bd,
-                                       tx_type, xd->lossless[seg_id]);
-          break;
-        case TX_8X8:
-          vp10_highbd_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, xd->bd,
-                                       tx_type);
-          break;
-        case TX_16X16:
-          vp10_highbd_inv_txfm_add_16x16(dqcoeff, dst, stride, eob, xd->bd,
-                                         tx_type);
-          break;
-        case TX_32X32:
-          vp10_highbd_inv_txfm_add_32x32(dqcoeff, dst, stride, eob, xd->bd,
-                                         tx_type);
-          break;
-        default:
-          assert(0 && "Invalid transform size");
-          return;
-      }
-    } else {
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-      switch (tx_size) {
-        case TX_4X4:
-          vp10_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, tx_type,
-                                xd->lossless[seg_id]);
-          break;
-        case TX_8X8:
-          vp10_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, tx_type);
-          break;
-        case TX_16X16:
-          vp10_inv_txfm_add_16x16(dqcoeff, dst, stride, eob, tx_type);
-          break;
-        case TX_32X32:
-          vp10_inv_txfm_add_32x32(dqcoeff, dst, stride, eob, tx_type);
-          break;
-        default:
-          assert(0 && "Invalid transform size");
-          return;
-      }
-#if CONFIG_VP9_HIGHBITDEPTH
-    }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-    if (eob == 1) {
-      dqcoeff[0] = 0;
-    } else {
-      if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
-        memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
-      else if (tx_size == TX_32X32 && eob <= 34)
-        memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
-      else
-        memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
-    }
-  }
-}
-
 static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd,
 #if CONFIG_ANS
                                          const rans_dec_lut *const token_tab,
@@ -393,8 +292,8 @@
 #endif  // CONFIG_ANS
                                              plane, sc, col, row, tx_size,
                                              r, mbmi->segment_id);
-    inverse_transform_block_intra(xd, plane, tx_type, tx_size,
-                                  dst, pd->dst.stride, eob);
+    inverse_transform_block(xd, plane, tx_type, tx_size,
+                            dst, pd->dst.stride, eob);
   }
 }
 
@@ -406,11 +305,11 @@
                                   TX_SIZE tx_size, int *eob_total) {
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-  int tx_idx = (blk_row >> (1 - pd->subsampling_y)) * 8 +
-               (blk_col >> (1 - pd->subsampling_x));
-  TX_SIZE plane_tx_size = plane ?
-      get_uv_tx_size_impl(mbmi->inter_tx_size[tx_idx], bsize, 0, 0) :
-      mbmi->inter_tx_size[tx_idx];
+  const int tx_row = blk_row >> (1 - pd->subsampling_y);
+  const int tx_col = blk_col >> (1 - pd->subsampling_x);
+  const TX_SIZE plane_tx_size = plane ?
+      get_uv_tx_size_impl(mbmi->inter_tx_size[tx_row][tx_col], bsize, 0, 0) :
+      mbmi->inter_tx_size[tx_row][tx_col];
   int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
   int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
 
@@ -429,9 +328,9 @@
     const int eob = vp10_decode_block_tokens(xd, plane, sc,
                                              blk_col, blk_row, tx_size,
                                              r, mbmi->segment_id);
-    inverse_transform_block_inter(xd, plane, tx_size,
+    inverse_transform_block(xd, plane, tx_type, tx_size,
         &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col],
-        pd->dst.stride, eob, block);
+        pd->dst.stride, eob);
     *eob_total += eob;
   } else {
     int bsl = b_width_log2_lookup[bsize];
@@ -477,14 +376,14 @@
                                            plane, sc, col, row, tx_size, r,
                                            mbmi->segment_id);
 
-  inverse_transform_block_inter(xd, plane, tx_size,
-                            &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
-                            pd->dst.stride, eob, block_idx);
+  inverse_transform_block(xd, plane, tx_type, tx_size,
+                          &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
+                          pd->dst.stride, eob);
   return eob;
 }
 #endif  // !CONFIG_VAR_TX || CONFIG_SUPER_TX
 
-#if (CONFIG_SUPERTX || CONFIG_OBMC)
+#if CONFIG_SUPERTX
 static void build_mc_border(const uint8_t *src, int src_stride,
                             uint8_t *dst, int dst_stride,
                             int x, int y, int b_w, int b_h, int w, int h) {
@@ -873,9 +772,7 @@
                             interp_filter, xs, ys, xd);
 #endif  // CONFIG_EXT_INTER
 }
-#endif  // (CONFIG_SUPERTX || CONFIG_OBMC)
 
-#if CONFIG_SUPERTX
 static void dec_build_inter_predictors_sb_extend(
     VP10Decoder *const pbi, MACROBLOCKD *xd,
 #if CONFIG_EXT_INTER
@@ -1046,237 +943,6 @@
 }
 #endif  // CONFIG_SUPERTX
 
-#if CONFIG_OBMC
-static void dec_build_prediction_by_above_preds(VP10Decoder *const pbi,
-                                                MACROBLOCKD *xd,
-                                                int mi_row, int mi_col,
-                                                uint8_t *tmp_buf[MAX_MB_PLANE],
-                                                int tmp_stride[MAX_MB_PLANE]) {
-  VP10_COMMON *const cm = &pbi->common;
-  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-  int i, j, mi_step, ref;
-
-  if (mi_row == 0)
-    return;
-
-  for (i = 0; i < VPXMIN(xd->n8_w, cm->mi_cols - mi_col); i += mi_step) {
-    int mi_row_offset = -1;
-    int mi_col_offset = i;
-    int mi_x, mi_y, bw, bh;
-    const MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * cm->mi_stride];
-    const MB_MODE_INFO *mbmi = &mi->mbmi;
-    const BLOCK_SIZE sb_type = mbmi->sb_type;
-    const int is_compound = has_second_ref(mbmi);
-    const INTERP_FILTER interp_filter = mbmi->interp_filter;
-
-    mi_step = VPXMIN(xd->n8_w, num_8x8_blocks_wide_lookup[sb_type]);
-
-    if (!is_neighbor_overlappable(mbmi))
-      continue;
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      struct macroblockd_plane *const pd = &xd->plane[j];
-      setup_pred_plane(&pd->dst,
-                       tmp_buf[j], tmp_stride[j],
-                       0, i, NULL,
-                       pd->subsampling_x, pd->subsampling_y);
-    }
-    for (ref = 0; ref < 1 + is_compound; ++ref) {
-      MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
-      RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
-
-      xd->block_refs[ref] = ref_buf;
-      if ((!vp10_is_valid_scale(&ref_buf->sf)))
-        vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
-                           "Reference frame has invalid dimensions");
-      vp10_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col + i,
-                            &ref_buf->sf);
-    }
-
-    xd->mb_to_left_edge = -(((mi_col + i) * MI_SIZE) * 8);
-    mi_x = (mi_col + i) << MI_SIZE_LOG2;
-    mi_y = mi_row << MI_SIZE_LOG2;
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      struct macroblockd_plane *pd = &xd->plane[j];
-      struct buf_2d *const dst_buf = &pd->dst;
-      bw = (mi_step * 8) >> pd->subsampling_x;
-      bh = VPXMAX((num_4x4_blocks_high_lookup[bsize] * 2) >> pd->subsampling_y,
-                  4);
-
-      for (ref = 0; ref < 1 + is_compound; ++ref) {
-        const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
-        struct buf_2d *const pre_buf = &pd->pre[ref];
-        const int idx = xd->block_refs[ref]->idx;
-        BufferPool *const pool = pbi->common.buffer_pool;
-        RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
-        const int is_scaled = vp10_is_scaled(sf);
-
-        if (sb_type < BLOCK_8X8) {
-          const PARTITION_TYPE bp = BLOCK_8X8 - sb_type;
-          const int have_vsplit = bp != PARTITION_HORZ;
-          const int have_hsplit = bp != PARTITION_VERT;
-          const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
-          const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
-          const int pw = 8 >> (have_vsplit | pd->subsampling_x);
-          int x, y;
-
-          for (y = 0; y < num_4x4_h; ++y)
-            for (x = 0; x < num_4x4_w; ++x) {
-              const MV mv = average_split_mvs(pd, mi, ref, y * 2 + x);
-              if ((bp == PARTITION_HORZ || bp == PARTITION_SPLIT)
-                  && y == 0 && !pd->subsampling_y)
-                continue;
-
-              dec_build_inter_predictors(pbi, xd, j,
-                                         mi_col_offset, mi_row_offset,
-                                         bw, bh,
-                                         4 * x, 0, pw, bh,
-#if CONFIG_EXT_INTER && CONFIG_SUPERTX
-                                         0, 0,
-#endif  // CONFIG_EXT_INTER && CONFIG_SUPERTX
-                                         mi_x, mi_y,
-                                         interp_filter, sf, pre_buf, dst_buf,
-                                         &mv, ref_frame_buf, is_scaled, ref);
-            }
-        } else {
-          const MV mv = mi->mbmi.mv[ref].as_mv;
-          dec_build_inter_predictors(pbi, xd, j,
-                                     mi_col_offset, mi_row_offset,
-                                     bw, bh,
-                                     0, 0, bw, bh,
-#if CONFIG_EXT_INTER && CONFIG_SUPERTX
-                                     0, 0,
-#endif  // CONFIG_EXT_INTER && CONFIG_SUPERTX
-                                     mi_x, mi_y, interp_filter,
-                                     sf, pre_buf, dst_buf, &mv, ref_frame_buf,
-                                     is_scaled, ref);
-        }
-      }
-    }
-  }
-  xd->mb_to_left_edge   = -((mi_col * MI_SIZE) * 8);
-}
-
-static void dec_build_prediction_by_left_preds(VP10Decoder *const pbi,
-                                               MACROBLOCKD *xd,
-                                               int mi_row, int mi_col,
-                                               uint8_t *tmp_buf[MAX_MB_PLANE],
-                                               int tmp_stride[MAX_MB_PLANE]) {
-  VP10_COMMON *const cm = &pbi->common;
-  const TileInfo *const tile = &xd->tile;
-  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-  int i, j, mi_step, ref;
-
-  if (mi_col == 0 || (mi_col - 1 < tile->mi_col_start) ||
-      (mi_col - 1) >= tile->mi_col_end)
-    return;
-
-  for (i = 0; i < VPXMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
-    int mi_row_offset = i;
-    int mi_col_offset = -1;
-    int mi_x, mi_y, bw, bh;
-    const MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * cm->mi_stride];
-    const MB_MODE_INFO *mbmi = &mi->mbmi;
-    const BLOCK_SIZE sb_type = mbmi->sb_type;
-    const int is_compound = has_second_ref(mbmi);
-    const INTERP_FILTER interp_filter = mbmi->interp_filter;
-
-    mi_step = VPXMIN(xd->n8_h, num_8x8_blocks_high_lookup[sb_type]);
-
-    if (!is_neighbor_overlappable(mbmi))
-      continue;
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      struct macroblockd_plane *const pd = &xd->plane[j];
-      setup_pred_plane(&pd->dst,
-                       tmp_buf[j], tmp_stride[j],
-                       i, 0, NULL,
-                       pd->subsampling_x, pd->subsampling_y);
-    }
-
-    for (ref = 0; ref < 1 + is_compound; ++ref) {
-      MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
-      RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
-
-      xd->block_refs[ref] = ref_buf;
-      if ((!vp10_is_valid_scale(&ref_buf->sf)))
-        vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
-                           "Reference frame has invalid dimensions");
-      vp10_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + i, mi_col,
-                            &ref_buf->sf);
-    }
-
-    xd->mb_to_top_edge    = -(((mi_row + i) * MI_SIZE) * 8);
-    mi_x = mi_col << MI_SIZE_LOG2;
-    mi_y = (mi_row + i) << MI_SIZE_LOG2;
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      struct macroblockd_plane *pd = &xd->plane[j];
-      struct buf_2d *const dst_buf = &pd->dst;
-      bw = VPXMAX((num_4x4_blocks_wide_lookup[bsize] * 2) >> pd->subsampling_x,
-                  4);
-      bh = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y;
-
-      for (ref = 0; ref < 1 + is_compound; ++ref) {
-        const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
-        struct buf_2d *const pre_buf = &pd->pre[ref];
-        const int idx = xd->block_refs[ref]->idx;
-        BufferPool *const pool = pbi->common.buffer_pool;
-        RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
-        const int is_scaled = vp10_is_scaled(sf);
-
-        if (sb_type < BLOCK_8X8) {
-          const PARTITION_TYPE bp = BLOCK_8X8 - sb_type;
-          const int have_vsplit = bp != PARTITION_HORZ;
-          const int have_hsplit = bp != PARTITION_VERT;
-          const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
-          const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
-          const int ph = 8 >> (have_hsplit | pd->subsampling_y);
-          int x, y;
-
-          for (y = 0; y < num_4x4_h; ++y)
-            for (x = 0; x < num_4x4_w; ++x) {
-              const MV mv = average_split_mvs(pd, mi, ref, y * 2 + x);
-              if ((bp == PARTITION_VERT || bp == PARTITION_SPLIT)
-                  && x == 0 && !pd->subsampling_x)
-                continue;
-
-              dec_build_inter_predictors(pbi, xd, j,
-#if CONFIG_OBMC
-                                         mi_col_offset, mi_row_offset,
-#endif  // CONFIG_OBMC
-                                         bw, bh,
-                                         0, 4 * y, bw, ph,
-#if CONFIG_EXT_INTER && CONFIG_SUPERTX
-                                         0, 0,
-#endif  // CONFIG_EXT_INTER && CONFIG_SUPERTX
-                                         mi_x, mi_y,
-                                         interp_filter, sf, pre_buf, dst_buf,
-                                         &mv, ref_frame_buf, is_scaled, ref);
-            }
-        } else {
-          const MV mv = mi->mbmi.mv[ref].as_mv;
-          dec_build_inter_predictors(pbi, xd, j,
-#if CONFIG_OBMC
-                                     mi_col_offset, mi_row_offset,
-#endif  // CONFIG_OBMC
-                                     bw, bh,
-                                     0, 0, bw, bh,
-#if CONFIG_EXT_INTER && CONFIG_SUPERTX
-                                     0, 0,
-#endif  // CONFIG_EXT_INTER && CONFIG_SUPERTX
-                                     mi_x, mi_y, interp_filter,
-                                     sf, pre_buf, dst_buf, &mv, ref_frame_buf,
-                                     is_scaled, ref);
-        }
-      }
-    }
-  }
-  xd->mb_to_top_edge    = -((mi_row * MI_SIZE) * 8);
-}
-#endif  // CONFIG_OBMC
-
 static INLINE TX_SIZE dec_get_uv_tx_size(const MB_MODE_INFO *mbmi,
                                          int n4_wl, int n4_hl) {
   // get minimum log2 num4x4s dimension
@@ -1628,8 +1294,13 @@
   const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
+#if !CONFIG_EXT_PARTITION_TYPES
   MB_MODE_INFO *mbmi;
+#endif
   int i, offset = mi_row * cm->mi_stride + mi_col;
+#if CONFIG_EXT_PARTITION_TYPES
+  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
   uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
 
   DECLARE_ALIGNED(16, uint8_t,
@@ -1674,8 +1345,13 @@
 
   xd->mi = cm->mi_grid_visible + offset;
   xd->mi[0] = cm->mi + offset;
+#if CONFIG_EXT_PARTITION_TYPES
+  partition = get_partition(cm->mi, cm->mi_stride, cm->mi_rows, cm->mi_cols,
+                            mi_row, mi_col, bsize);
+#else
   mbmi = &xd->mi[0]->mbmi;
   partition = partition_lookup[bsl][mbmi->sb_type];
+#endif
   subsize = get_subsize(bsize, partition);
 
   for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -1921,6 +1597,204 @@
           }
         }
       break;
+#if CONFIG_EXT_PARTITION_TYPES
+    case PARTITION_HORZ_A:
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, dst_buf, dst_stride,
+                           top_bsize, bsize2, 0, 0);
+      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs, mi_row,
+                           mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, top_bsize, bsize2, 0, 0);
+      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+                     mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col,
+                           mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+                           dst_buf2, dst_stride2, top_bsize, subsize, 0, 0);
+      if (bsize < top_bsize)
+        dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize,
+                       mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+                       dst_buf2, dst_stride2);
+      else
+        dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize,
+                       mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+                       dst_buf2, dst_stride2, 1);
+
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf[i];
+        xd->plane[i].dst.stride = dst_stride[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                 dst_buf[i], dst_stride[i],
+                                                 dst_buf1[i], dst_stride1[i],
+                                                 &xd->plane[i],
+                                                 mi_row, mi_col,
+                                                 mi_row_top, mi_col_top,
+                                                 bsize, top_bsize,
+                                                 PARTITION_VERT, i);
+      }
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                 dst_buf[i], dst_stride[i],
+                                                 dst_buf2[i], dst_stride2[i],
+                                                 &xd->plane[i],
+                                                 mi_row, mi_col,
+                                                 mi_row_top, mi_col_top,
+                                                 bsize, top_bsize,
+                                                 PARTITION_HORZ, i);
+      }
+      break;
+    case PARTITION_VERT_A:
+
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, dst_buf, dst_stride,
+                           top_bsize, bsize2, 0, 0);
+      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col,
+                           mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, top_bsize, bsize2, 0, 0);
+      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+                     mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs,
+                           mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf2, dst_stride2,
+                           top_bsize, subsize, 0, 0);
+      if (bsize < top_bsize)
+        dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize,
+                       mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                       dst_buf2, dst_stride2);
+      else
+        dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize,
+                       mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                       dst_buf2, dst_stride2, 2);
+
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf[i];
+        xd->plane[i].dst.stride = dst_stride[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                 dst_buf[i], dst_stride[i],
+                                                 dst_buf1[i], dst_stride1[i],
+                                                 &xd->plane[i],
+                                                 mi_row, mi_col,
+                                                 mi_row_top, mi_col_top,
+                                                 bsize, top_bsize,
+                                                 PARTITION_HORZ, i);
+      }
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                 dst_buf[i], dst_stride[i],
+                                                 dst_buf2[i], dst_stride2[i],
+                                                 &xd->plane[i],
+                                                 mi_row, mi_col,
+                                                 mi_row_top, mi_col_top,
+                                                 bsize, top_bsize,
+                                                 PARTITION_VERT, i);
+      }
+      break;
+    case PARTITION_HORZ_B:
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, dst_buf, dst_stride,
+                           top_bsize, subsize, 0, 0);
+      if (bsize < top_bsize)
+        dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride);
+      else
+        dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride, 0);
+
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+                           mi_col, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, top_bsize, bsize2, 0, 0);
+      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+                     mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col + hbs,
+                           mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf2, dst_stride2, top_bsize, bsize2, 0, 0);
+      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize,
+                     mi_row + hbs, mi_col + hbs,
+                     mi_row_top, mi_col_top, dst_buf2, dst_stride2);
+
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf1[i];
+        xd->plane[i].dst.stride = dst_stride1[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                 dst_buf1[i], dst_stride1[i],
+                                                 dst_buf2[i], dst_stride2[i],
+                                                 &xd->plane[i],
+                                                 mi_row, mi_col,
+                                                 mi_row_top, mi_col_top,
+                                                 bsize, top_bsize,
+                                                 PARTITION_VERT, i);
+      }
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf[i];
+        xd->plane[i].dst.stride = dst_stride[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                 dst_buf[i], dst_stride[i],
+                                                 dst_buf1[i], dst_stride1[i],
+                                                 &xd->plane[i],
+                                                 mi_row, mi_col,
+                                                 mi_row_top, mi_col_top,
+                                                 bsize, top_bsize,
+                                                 PARTITION_HORZ, i);
+      }
+      break;
+    case PARTITION_VERT_B:
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, dst_buf, dst_stride,
+                           top_bsize, subsize, 0, 0);
+      if (bsize < top_bsize)
+        dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride);
+      else
+        dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride, 3);
+
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs,
+                           mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, top_bsize, bsize2, 0, 0);
+      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+                     mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col + hbs,
+                           mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf2, dst_stride2, top_bsize, bsize2, 0, 0);
+      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize,
+                     mi_row + hbs, mi_col + hbs,
+                     mi_row_top, mi_col_top, dst_buf2, dst_stride2);
+
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf1[i];
+        xd->plane[i].dst.stride = dst_stride1[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                 dst_buf1[i], dst_stride1[i],
+                                                 dst_buf2[i], dst_stride2[i],
+                                                 &xd->plane[i],
+                                                 mi_row, mi_col,
+                                                 mi_row_top, mi_col_top,
+                                                 bsize, top_bsize,
+                                                 PARTITION_HORZ, i);
+      }
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf[i];
+        xd->plane[i].dst.stride = dst_stride[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                 dst_buf[i], dst_stride[i],
+                                                 dst_buf1[i], dst_stride1[i],
+                                                 &xd->plane[i],
+                                                 mi_row, mi_col,
+                                                 mi_row_top, mi_col_top,
+                                                 bsize, top_bsize,
+                                                 PARTITION_VERT, i);
+      }
+      break;
+#endif  // CONFIG_EXT_PARTITION_TYPES
     default:
       assert(0);
   }
@@ -1936,6 +1810,9 @@
 #if CONFIG_ANS
                          struct AnsDecoder *const tok,
 #endif  // CONFIG_ANS
+#if CONFIG_EXT_PARTITION_TYPES
+                         PARTITION_TYPE partition,
+#endif  // CONFIG_EXT_PARTITION_TYPES
                          BLOCK_SIZE bsize,
                          int bwl, int bhl) {
   VP10_COMMON *const cm = &pbi->common;
@@ -1954,11 +1831,17 @@
     mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col,
                        bw, bh, x_mis, y_mis, bwl, bhl);
   }
+#if CONFIG_EXT_PARTITION_TYPES
+  xd->mi[0]->mbmi.partition = partition;
+#endif
   vp10_read_mode_info(pbi, xd, supertx_enabled,
                       mi_row, mi_col, r, x_mis, y_mis);
 #else
   MB_MODE_INFO *mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col,
                                    bw, bh, x_mis, y_mis, bwl, bhl);
+#if CONFIG_EXT_PARTITION_TYPES
+  xd->mi[0]->mbmi.partition = partition;
+#endif
   vp10_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
 #endif  // CONFIG_SUPERTX
 
@@ -1998,6 +1881,9 @@
             (xd->mb_to_bottom_edge >= 0 ?
              0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
 
+        if (plane <= 1 && mbmi->palette_mode_info.palette_size[plane])
+          vp10_decode_palette_tokens(xd, plane, r);
+
         for (row = 0; row < max_blocks_high; row += step)
           for (col = 0; col < max_blocks_wide; col += step)
             predict_and_reconstruct_intra_block(xd,
@@ -2053,10 +1939,10 @@
 #if CONFIG_VP9_HIGHBITDEPTH
         }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-        dec_build_prediction_by_above_preds(pbi, xd, mi_row, mi_col,
-                                            dst_buf1, dst_stride1);
-        dec_build_prediction_by_left_preds(pbi, xd, mi_row, mi_col,
-                                           dst_buf2, dst_stride2);
+        vp10_build_prediction_by_above_preds(cm, xd, mi_row, mi_col,
+                                             dst_buf1, dst_stride1);
+        vp10_build_prediction_by_left_preds(cm, xd, mi_row, mi_col,
+                                            dst_buf2, dst_stride2);
         vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm),
                               mi_row, mi_col);
         vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, 0, NULL, NULL,
@@ -2140,6 +2026,7 @@
   return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
 }
 
+#if !CONFIG_EXT_PARTITION_TYPES
 static INLINE void dec_update_partition_context(MACROBLOCKD *xd,
                                                 int mi_row, int mi_col,
                                                 BLOCK_SIZE subsize,
@@ -2153,17 +2040,29 @@
   memset(above_ctx, partition_context_lookup[subsize].above, bw);
   memset(left_ctx, partition_context_lookup[subsize].left, bw);
 }
+#endif  // !CONFIG_EXT_PARTITION_TYPES
 
 static PARTITION_TYPE read_partition(VP10_COMMON *cm, MACROBLOCKD *xd,
                                      int mi_row, int mi_col, vpx_reader *r,
-                                     int has_rows, int has_cols, int bsl) {
+                                     int has_rows, int has_cols,
+#if CONFIG_EXT_PARTITION_TYPES
+                                     BLOCK_SIZE bsize,
+#endif
+                                     int bsl) {
   const int ctx = dec_partition_plane_context(xd, mi_row, mi_col, bsl);
   const vpx_prob *const probs = cm->fc->partition_prob[ctx];
   FRAME_COUNTS *counts = xd->counts;
   PARTITION_TYPE p;
 
   if (has_rows && has_cols)
+#if CONFIG_EXT_PARTITION_TYPES
+    if (bsize <= BLOCK_8X8)
+      p = (PARTITION_TYPE)vpx_read_tree(r, vp10_partition_tree, probs);
+    else
+      p = (PARTITION_TYPE)vpx_read_tree(r, vp10_ext_partition_tree, probs);
+#else
     p = (PARTITION_TYPE)vpx_read_tree(r, vp10_partition_tree, probs);
+#endif  // CONFIG_EXT_PARTITION_TYPES
   else if (!has_rows && has_cols)
     p = vpx_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
   else if (has_rows && !has_cols)
@@ -2206,6 +2105,9 @@
   const int hbs = num_8x8_wh >> 1;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
+#if CONFIG_EXT_PARTITION_TYPES
+  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
   const int has_rows = (mi_row + hbs) < cm->mi_rows;
   const int has_cols = (mi_col + hbs) < cm->mi_cols;
 #if CONFIG_SUPERTX
@@ -2220,6 +2122,9 @@
     return;
 
   partition = read_partition(cm, xd, mi_row, mi_col, r, has_rows, has_cols,
+#if CONFIG_EXT_PARTITION_TYPES
+                             bsize,
+#endif
                              n8x8_l2);
   subsize = subsize_lookup[partition][bsize];  // get_subsize(bsize, partition);
 #if CONFIG_SUPERTX
@@ -2284,6 +2189,9 @@
 #if CONFIG_ANS
                  tok,
 #endif  // CONFIG_ANS
+#if CONFIG_EXT_PARTITION_TYPES
+                 partition,
+#endif  // CONFIG_EXT_PARTITION_TYPES
                  subsize, 1, 1);
   } else {
     switch (partition) {
@@ -2296,6 +2204,9 @@
 #if CONFIG_ANS
                      tok,
 #endif  // CONFIG_ANS
+#if CONFIG_EXT_PARTITION_TYPES
+                 partition,
+#endif  // CONFIG_EXT_PARTITION_TYPES
                      subsize, n4x4_l2, n4x4_l2);
         break;
       case PARTITION_HORZ:
@@ -2307,6 +2218,9 @@
 #if CONFIG_ANS
                      tok,
 #endif  // CONFIG_ANS
+#if CONFIG_EXT_PARTITION_TYPES
+                 partition,
+#endif  // CONFIG_EXT_PARTITION_TYPES
                      subsize, n4x4_l2, n8x8_l2);
         if (has_rows)
           decode_block(pbi, xd,
@@ -2317,6 +2231,9 @@
 #if CONFIG_ANS
                        tok,
 #endif  // CONFIG_ANS
+#if CONFIG_EXT_PARTITION_TYPES
+                 partition,
+#endif  // CONFIG_EXT_PARTITION_TYPES
                        subsize, n4x4_l2, n8x8_l2);
         break;
       case PARTITION_VERT:
@@ -2328,6 +2245,9 @@
 #if CONFIG_ANS
                      tok,
 #endif  // CONFIG_ANS
+#if CONFIG_EXT_PARTITION_TYPES
+                 partition,
+#endif  // CONFIG_EXT_PARTITION_TYPES
                      subsize, n8x8_l2, n4x4_l2);
         if (has_cols)
           decode_block(pbi, xd,
@@ -2338,6 +2258,9 @@
 #if CONFIG_ANS
                        tok,
 #endif  // CONFIG_ANS
+#if CONFIG_EXT_PARTITION_TYPES
+                 partition,
+#endif  // CONFIG_EXT_PARTITION_TYPES
                        subsize, n8x8_l2, n4x4_l2);
         break;
       case PARTITION_SPLIT:
@@ -2378,6 +2301,124 @@
 #endif  // CONFIG_ANS
                          subsize, n8x8_l2);
         break;
+#if CONFIG_EXT_PARTITION_TYPES
+      case PARTITION_HORZ_A:
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row,       mi_col,       r,
+#if CONFIG_ANS
+                     tok,
+#endif  // CONFIG_ANS
+                     partition, bsize2, n8x8_l2, n8x8_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row,       mi_col + hbs, r,
+#if CONFIG_ANS
+                     tok,
+#endif  // CONFIG_ANS
+                     partition, bsize2, n8x8_l2, n8x8_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row + hbs, mi_col, r,
+#if CONFIG_ANS
+                     tok,
+#endif  // CONFIG_ANS
+                     partition, subsize, n4x4_l2, n8x8_l2);
+        break;
+      case PARTITION_HORZ_B:
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row, mi_col, r,
+#if CONFIG_ANS
+                     tok,
+#endif  // CONFIG_ANS
+                     partition, subsize, n4x4_l2, n8x8_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row + hbs, mi_col,       r,
+#if CONFIG_ANS
+                     tok,
+#endif  // CONFIG_ANS
+                     partition, bsize2, n8x8_l2, n8x8_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row + hbs, mi_col + hbs, r,
+#if CONFIG_ANS
+                     tok,
+#endif  // CONFIG_ANS
+                     partition, bsize2, n8x8_l2, n8x8_l2);
+        break;
+      case PARTITION_VERT_A:
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row,       mi_col,       r,
+#if CONFIG_ANS
+                     tok,
+#endif  // CONFIG_ANS
+                     partition, bsize2, n8x8_l2, n8x8_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row + hbs, mi_col,       r,
+#if CONFIG_ANS
+                     tok,
+#endif  // CONFIG_ANS
+                     partition, bsize2, n8x8_l2, n8x8_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row, mi_col + hbs, r,
+#if CONFIG_ANS
+                     tok,
+#endif  // CONFIG_ANS
+                     partition, subsize, n8x8_l2, n4x4_l2);
+        break;
+      case PARTITION_VERT_B:
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row, mi_col, r,
+#if CONFIG_ANS
+                     tok,
+#endif  // CONFIG_ANS
+                     partition, subsize, n8x8_l2, n4x4_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row,       mi_col + hbs, r,
+#if CONFIG_ANS
+                     tok,
+#endif  // CONFIG_ANS
+                     partition, bsize2, n8x8_l2, n8x8_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row + hbs, mi_col + hbs, r,
+#if CONFIG_ANS
+                     tok,
+#endif  // CONFIG_ANS
+                     partition, bsize2, n8x8_l2, n8x8_l2);
+        break;
+#endif
       default:
         assert(0 && "Invalid partition type");
     }
@@ -2421,7 +2462,7 @@
           for (col = 0; col < max_blocks_wide; col += step)
             eobtotal += reconstruct_inter_block(xd,
 #if CONFIG_ANS
-                                                pbi->token_tab, tok,
+                                                cm->token_tab, tok,
 #else
                                                 r,
 #endif
@@ -2435,10 +2476,43 @@
   }
 #endif  // CONFIG_SUPERTX
 
+#if CONFIG_EXT_PARTITION_TYPES
+  if (bsize >= BLOCK_8X8) {
+    switch (partition) {
+      case PARTITION_SPLIT:
+        if (bsize > BLOCK_8X8)
+          break;
+      case PARTITION_NONE:
+      case PARTITION_HORZ:
+      case PARTITION_VERT:
+        update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+        break;
+      case PARTITION_HORZ_A:
+        update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+        update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize);
+        break;
+      case PARTITION_HORZ_B:
+        update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+        update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize);
+        break;
+      case PARTITION_VERT_A:
+        update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+        update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize);
+        break;
+      case PARTITION_VERT_B:
+        update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+        update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize);
+        break;
+      default:
+        assert(0 && "Invalid partition type");
+    }
+  }
+#else
   // update partition context
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
     dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
+#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
 static void setup_bool_decoder(const uint8_t *data,
@@ -2962,18 +3036,7 @@
   assert(tile_rows <= 4);
   assert(tile_cols <= (1 << 6));
 
-  // Note: this memset assumes above_context[0], [1] and [2]
-  // are allocated as part of the same buffer.
-  memset(cm->above_context, 0,
-         sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_cols);
-
-  memset(cm->above_seg_context, 0,
-         sizeof(*cm->above_seg_context) * aligned_cols);
-
-#if CONFIG_VAR_TX
-  memset(cm->above_txfm_context, 0,
-         sizeof(*cm->above_txfm_context) * aligned_cols);
-#endif
+  vp10_zero_above_context(cm, 0, aligned_cols);
 
   get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
 
@@ -3032,11 +3095,7 @@
                         tile_cols - tile_col - 1 : tile_col;
         tile_data = pbi->tile_data + tile_cols * tile_row + col;
         vp10_tile_set_col(&tile, tile_data->cm, col);
-        vp10_zero(tile_data->xd.left_context);
-        vp10_zero(tile_data->xd.left_seg_context);
-#if CONFIG_VAR_TX
-        vp10_zero(tile_data->xd.left_txfm_context_buffer);
-#endif
+        vp10_zero_left_context(&tile_data->xd);
         for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
              mi_col += MI_BLOCK_SIZE) {
           decode_partition(pbi, &tile_data->xd,
@@ -3126,11 +3185,7 @@
 
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
        mi_row += MI_BLOCK_SIZE) {
-    vp10_zero(tile_data->xd.left_context);
-    vp10_zero(tile_data->xd.left_seg_context);
-#if CONFIG_VAR_TX
-    vp10_zero(tile_data->xd.left_txfm_context_buffer);
-#endif
+    vp10_zero_left_context(&tile_data->xd);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE) {
       decode_partition(tile_data->pbi, &tile_data->xd,
@@ -3211,16 +3266,8 @@
     worker->data2 = &pbi->tile_worker_info[n];
   }
 
-  // Note: this memset assumes above_context[0], [1] and [2]
-  // are allocated as part of the same buffer.
-  memset(cm->above_context, 0,
-         sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
-  memset(cm->above_seg_context, 0,
-         sizeof(*cm->above_seg_context) * aligned_mi_cols);
-#if CONFIG_VAR_TX
-  memset(cm->above_txfm_context, 0,
-         sizeof(*cm->above_txfm_context) * aligned_mi_cols);
-#endif
+  vp10_zero_above_context(cm, 0, aligned_mi_cols);
+
   // Load tile data into tile_buffers
   get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
 
@@ -3720,9 +3767,17 @@
     for (i = 0; i < INTRA_MODES - 1; ++i)
       vp10_diff_update_prob(&r, &fc->uv_mode_prob[j][i]);
 
+#if CONFIG_EXT_PARTITION_TYPES
+    for (i = 0; i < PARTITION_TYPES - 1; ++i)
+      vp10_diff_update_prob(&r, &fc->partition_prob[0][i]);
+    for (j = 1; j < PARTITION_CONTEXTS; ++j)
+      for (i = 0; i < EXT_PARTITION_TYPES - 1; ++i)
+        vp10_diff_update_prob(&r, &fc->partition_prob[j][i]);
+#else
   for (j = 0; j < PARTITION_CONTEXTS; ++j)
     for (i = 0; i < PARTITION_TYPES - 1; ++i)
       vp10_diff_update_prob(&r, &fc->partition_prob[j][i]);
+#endif  // CONFIG_EXT_PARTITION_TYPES
 
 #if CONFIG_EXT_INTRA
   for (i = 0; i < INTRA_FILTERS + 1; ++i)
diff --git a/vp10/decoder/decodemv.c b/vp10/decoder/decodemv.c
index eb336be..5b2fa1f 100644
--- a/vp10/decoder/decodemv.c
+++ b/vp10/decoder/decodemv.c
@@ -155,33 +155,45 @@
   uint8_t ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
   mbmi->ref_mv_idx = 0;
 
-  if (xd->ref_mv_count[ref_frame_type] > 2) {
-    uint8_t drl0_ctx = vp10_drl_ctx(xd->ref_mv_stack[ref_frame_type], 1);
-    vpx_prob drl0_prob = cm->fc->drl_prob0[drl0_ctx];
-    if (vpx_read(r, drl0_prob)) {
-      mbmi->ref_mv_idx = 1;
-      if (xd->counts)
-        ++xd->counts->drl_mode0[drl0_ctx][1];
-      if (xd->ref_mv_count[ref_frame_type] > 3) {
-        uint8_t drl1_ctx =
-            vp10_drl_ctx(xd->ref_mv_stack[ref_frame_type], 2);
-        vpx_prob drl1_prob = cm->fc->drl_prob1[drl1_ctx];
-        if (vpx_read(r, drl1_prob)) {
-          mbmi->ref_mv_idx = 2;
+  if (mbmi->mode == NEWMV) {
+    int idx;
+    for (idx = 0; idx < 2; ++idx) {
+      if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
+        uint8_t drl_ctx = vp10_drl_ctx(xd->ref_mv_stack[ref_frame_type], idx);
+        vpx_prob drl_prob = cm->fc->drl_prob[drl_ctx];
+        if (!vpx_read(r, drl_prob)) {
+          mbmi->ref_mv_idx = idx;
           if (xd->counts)
-            ++xd->counts->drl_mode1[drl1_ctx][1];
-
+            ++xd->counts->drl_mode[drl_ctx][0];
           return;
         }
-
+        mbmi->ref_mv_idx = idx + 1;
         if (xd->counts)
-          ++xd->counts->drl_mode1[drl1_ctx][0];
+          ++xd->counts->drl_mode[drl_ctx][1];
       }
-      return;
     }
+  }
 
-    if (xd->counts)
-      ++xd->counts->drl_mode0[drl0_ctx][0];
+  if (mbmi->mode == NEARMV) {
+    int idx;
+    // Offset the NEARESTMV mode.
+    // TODO(jingning): Unify the two syntax decoding loops after the NEARESTMV
+    // mode is factored in.
+    for (idx = 1; idx < 3; ++idx) {
+      if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
+        uint8_t drl_ctx = vp10_drl_ctx(xd->ref_mv_stack[ref_frame_type], idx);
+        vpx_prob drl_prob = cm->fc->drl_prob[drl_ctx];
+        if (!vpx_read(r, drl_prob)) {
+          mbmi->ref_mv_idx = idx - 1;
+          if (xd->counts)
+            ++xd->counts->drl_mode[drl_ctx][0];
+          return;
+        }
+        mbmi->ref_mv_idx = idx;
+        if (xd->counts)
+          ++xd->counts->drl_mode[drl_ctx][1];
+      }
+    }
   }
 }
 #endif
@@ -213,12 +225,15 @@
                                TX_SIZE tx_size, int blk_row, int blk_col,
                                vpx_reader *r) {
   int is_split = 0;
-  const int tx_idx = (blk_row >> 1) * 8 + (blk_col >> 1);
+  const int tx_row = blk_row >> 1;
+  const int tx_col = blk_col >> 1;
   int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
   int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
-  int ctx = txfm_partition_context(xd->above_txfm_context + (blk_col >> 1),
-                                   xd->left_txfm_context + (blk_row >> 1),
+  int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
+                                   xd->left_txfm_context + tx_row,
                                    tx_size);
+  TX_SIZE (*const inter_tx_size)[MI_BLOCK_SIZE] =
+    (TX_SIZE (*)[MI_BLOCK_SIZE])&mbmi->inter_tx_size[tx_row][tx_col];
 
   if (xd->mb_to_bottom_edge < 0)
     max_blocks_high += xd->mb_to_bottom_edge >> 5;
@@ -239,10 +254,10 @@
       ++counts->txfm_partition[ctx][1];
 
     if (tx_size == TX_8X8) {
-      mbmi->inter_tx_size[tx_idx] = TX_4X4;
-      mbmi->tx_size = mbmi->inter_tx_size[tx_idx];
-      txfm_partition_update(xd->above_txfm_context + (blk_col >> 1),
-                            xd->left_txfm_context + (blk_row >> 1), TX_4X4);
+      inter_tx_size[0][0] = TX_4X4;
+      mbmi->tx_size = TX_4X4;
+      txfm_partition_update(xd->above_txfm_context + tx_col,
+                            xd->left_txfm_context + tx_row, TX_4X4);
       return;
     }
 
@@ -256,15 +271,15 @@
     }
   } else {
     int idx, idy;
-    mbmi->inter_tx_size[tx_idx] = tx_size;
+    inter_tx_size[0][0] = tx_size;
     for (idy = 0; idy < (1 << tx_size) / 2; ++idy)
       for (idx = 0; idx < (1 << tx_size) / 2; ++idx)
-        mbmi->inter_tx_size[tx_idx + (idy << 3) + idx] = tx_size;
-    mbmi->tx_size = mbmi->inter_tx_size[tx_idx];
+        inter_tx_size[idy][idx] = tx_size;
+    mbmi->tx_size = tx_size;
     if (counts)
       ++counts->txfm_partition[ctx][0];
-    txfm_partition_update(xd->above_txfm_context + (blk_col >> 1),
-                          xd->left_txfm_context + (blk_row >> 1), tx_size);
+    txfm_partition_update(xd->above_txfm_context + tx_col,
+                          xd->left_txfm_context + tx_row, tx_size);
   }
 }
 #endif
@@ -1243,7 +1258,7 @@
 #endif  // CONFIG_REF_MV && CONFIG_EXT_INTER
                                    r, mode_ctx);
 #if CONFIG_REF_MV
-      if (mbmi->mode == NEARMV)
+      if (mbmi->mode == NEARMV || mbmi->mode == NEWMV)
         read_drl_idx(cm, xd, mbmi, r);
 #endif
     }
@@ -1376,6 +1391,10 @@
 #else
         if (b_mode == NEARESTMV || b_mode == NEARMV) {
 #endif  // CONFIG_EXT_INTER
+#if CONFIG_REF_MV
+          CANDIDATE_MV ref_mv_stack[2][MAX_REF_MV_STACK_SIZE];
+          uint8_t ref_mv_count[2];
+#endif
           for (ref = 0; ref < 1 + is_compound; ++ref)
 #if CONFIG_EXT_INTER
           {
@@ -1384,6 +1403,10 @@
                                    mv_ref_list, j, mi_row, mi_col, NULL);
 #endif  // CONFIG_EXT_INTER
             vp10_append_sub8x8_mvs_for_idx(cm, xd, j, ref, mi_row, mi_col,
+#if CONFIG_REF_MV
+                                           ref_mv_stack[ref],
+                                           &ref_mv_count[ref],
+#endif
 #if CONFIG_EXT_INTER
                                            mv_ref_list,
 #endif  // CONFIG_EXT_INTER
@@ -1436,6 +1459,22 @@
     mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
     mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
   } else {
+    int ref;
+    for (ref = 0; ref < 1 + is_compound && mbmi->mode == NEWMV; ++ref) {
+      int_mv ref_mv = nearestmv[ref];
+#if CONFIG_REF_MV
+      uint8_t ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
+      if (xd->ref_mv_count[ref_frame_type] > 1) {
+        ref_mv = (ref == 0) ?
+            xd->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx].this_mv :
+            xd->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx].comp_mv;
+        clamp_mv_ref(&ref_mv.as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
+        lower_mv_precision(&ref_mv.as_mv, allow_hp);
+      }
+#endif
+      nearestmv[ref] = ref_mv;
+    }
+
     xd->corrupted |= !assign_mv(cm, xd, mbmi->mode,
 #if CONFIG_REF_MV
                                 0,
@@ -1565,7 +1604,7 @@
         int idx, idy;
         for (idy = 0; idy < height; ++idy)
           for (idx = 0; idx < width; ++idx)
-            mbmi->inter_tx_size[(idy >> 1) * 8 + (idx >> 1)] = mbmi->tx_size;
+            mbmi->inter_tx_size[idy >> 1][idx >> 1] = mbmi->tx_size;
       }
 
       set_txfm_ctx(xd->left_txfm_context, mbmi->tx_size, xd->n8_h);
@@ -1584,7 +1623,7 @@
     xd->mi[0]->mbmi.tx_size = xd->supertx_size;
     for (idy = 0; idy < height; ++idy)
       for (idx = 0; idx < width; ++idx)
-        xd->mi[0]->mbmi.inter_tx_size[(idy >> 1) * 8 + (idx >> 1)] =
+        xd->mi[0]->mbmi.inter_tx_size[idy >> 1][idx >> 1] =
             xd->supertx_size;
   }
 #endif  // CONFIG_VAR_TX
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 1ef2ea5..7f58b52 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -28,6 +28,7 @@
 #include "vp10/common/seg_common.h"
 #include "vp10/common/tile_common.h"
 
+#include "vp10/encoder/buf_ans.h"
 #include "vp10/encoder/cost.h"
 #include "vp10/encoder/bitstream.h"
 #include "vp10/encoder/encodemv.h"
@@ -49,6 +50,10 @@
 static const struct vp10_token switchable_interp_encodings[SWITCHABLE_FILTERS] =
   {{0, 1}, {2, 2}, {3, 2}};
 #endif  // CONFIG_EXT_INTERP && SWITCHABLE_FILTERS == 4
+#if CONFIG_EXT_PARTITION_TYPES
+static const struct vp10_token ext_partition_encodings[EXT_PARTITION_TYPES] =
+  {{0, 1}, {4, 3}, {12, 4}, {7, 3}, {10, 4}, {11, 4}, {26, 5}, {27, 5}};
+#endif
 static const struct vp10_token partition_encodings[PARTITION_TYPES] =
   {{0, 1}, {2, 2}, {6, 3}, {7, 3}};
 #if !CONFIG_REF_MV
@@ -193,18 +198,40 @@
                           const MB_MODE_INFO_EXT *mbmi_ext,
                           vpx_writer *w) {
   uint8_t ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
-  if (mbmi_ext->ref_mv_count[ref_frame_type] > 2) {
-    uint8_t drl0_ctx =
-        vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], 1);
-    vpx_prob drl0_prob = cm->fc->drl_prob0[drl0_ctx];
-    vpx_write(w, mbmi->ref_mv_idx != 0, drl0_prob);
-    if (mbmi_ext->ref_mv_count[ref_frame_type] > 3 &&
-        mbmi->ref_mv_idx > 0) {
-      uint8_t drl1_ctx =
-          vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], 2);
-      vpx_prob drl1_prob = cm->fc->drl_prob1[drl1_ctx];
-      vpx_write(w, mbmi->ref_mv_idx != 1, drl1_prob);
+
+  assert(mbmi->ref_mv_idx < 3);
+
+  if (mbmi->mode == NEWMV) {
+    int idx;
+    for (idx = 0; idx < 2; ++idx) {
+      if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+        uint8_t drl_ctx =
+            vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+        vpx_prob drl_prob = cm->fc->drl_prob[drl_ctx];
+
+        vpx_write(w, mbmi->ref_mv_idx != idx, drl_prob);
+        if (mbmi->ref_mv_idx == idx)
+          return;
+      }
     }
+    return;
+  }
+
+  if (mbmi->mode == NEARMV) {
+    int idx;
+    // TODO(jingning): Temporary solution to compensate the NEARESTMV offset.
+    for (idx = 1; idx < 3; ++idx) {
+      if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+        uint8_t drl_ctx =
+            vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+        vpx_prob drl_prob = cm->fc->drl_prob[drl_ctx];
+
+        vpx_write(w, mbmi->ref_mv_idx != (idx - 1), drl_prob);
+        if (mbmi->ref_mv_idx == (idx - 1))
+          return;
+      }
+    }
+    return;
   }
 }
 #endif
@@ -266,11 +293,12 @@
                                 const MB_MODE_INFO *mbmi,
                                 TX_SIZE tx_size, int blk_row, int blk_col,
                                 vpx_writer *w) {
-  const int tx_idx = (blk_row >> 1) * 8 + (blk_col >> 1);
+  const int tx_row = blk_row >> 1;
+  const int tx_col = blk_col >> 1;
   int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
   int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
-  int ctx = txfm_partition_context(xd->above_txfm_context + (blk_col >> 1),
-                                   xd->left_txfm_context + (blk_row >> 1),
+  int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
+                                   xd->left_txfm_context + tx_row,
                                    tx_size);
 
   if (xd->mb_to_bottom_edge < 0)
@@ -281,10 +309,10 @@
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
      return;
 
-  if (tx_size == mbmi->inter_tx_size[tx_idx]) {
+  if (tx_size == mbmi->inter_tx_size[tx_row][tx_col]) {
     vpx_write(w, 0, cm->fc->txfm_partition_prob[ctx]);
-    txfm_partition_update(xd->above_txfm_context + (blk_col >> 1),
-                          xd->left_txfm_context + (blk_row >> 1), tx_size);
+    txfm_partition_update(xd->above_txfm_context + tx_col,
+                          xd->left_txfm_context + tx_row, tx_size);
   } else {
     const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
     int bsl = b_width_log2_lookup[bsize];
@@ -292,8 +320,8 @@
     vpx_write(w, 1, cm->fc->txfm_partition_prob[ctx]);
 
     if (tx_size == TX_8X8) {
-      txfm_partition_update(xd->above_txfm_context + (blk_col >> 1),
-                            xd->left_txfm_context + (blk_row >> 1), TX_4X4);
+      txfm_partition_update(xd->above_txfm_context + tx_col,
+                            xd->left_txfm_context + tx_row, TX_4X4);
       return;
     }
 
@@ -343,11 +371,8 @@
     vp10_cond_prob_diff_update(w, &cm->fc->refmv_prob[i],
                                counts->refmv_mode[i]);
   for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
-    vp10_cond_prob_diff_update(w, &cm->fc->drl_prob0[i],
-                               counts->drl_mode0[i]);
-  for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
-    vp10_cond_prob_diff_update(w, &cm->fc->drl_prob1[i],
-                               counts->drl_mode1[i]);
+    vp10_cond_prob_diff_update(w, &cm->fc->drl_prob[i],
+                               counts->drl_mode[i]);
 #if CONFIG_EXT_INTER
   vp10_cond_prob_diff_update(w, &cm->fc->new2mv_prob, counts->new2mv_mode);
 #endif  // CONFIG_EXT_INTER
@@ -634,65 +659,85 @@
   *tp = p;
 }
 #else
-// This function serializes the tokens backwards both in token order and
-// bit order in each token.
-static void pack_mb_tokens_ans(struct AnsCoder *const ans,
-                               rans_dec_lut token_tab[COEFF_PROB_MODELS],
-                               const TOKENEXTRA *const start,
+// This function serializes the tokens in forward order using a buffered ans
+// coder.
+static void pack_mb_tokens_ans(struct BufAnsCoder *ans,
+                               const rans_dec_lut token_tab[COEFF_PROB_MODELS],
+                               TOKENEXTRA **tp,
                                const TOKENEXTRA *const stop,
-                               vpx_bit_depth_t bit_depth) {
-  const TOKENEXTRA *p;
-  TX_SIZE tx_size = TX_SIZES;
+                               vpx_bit_depth_t bit_depth,
+                               const TX_SIZE tx) {
+  TOKENEXTRA *p = *tp;
+#if CONFIG_VAR_TX
+  int count = 0;
+  const int seg_eob = 16 << (tx << 1);
+#endif  // CONFIG_VAR_TX
 
-  for (p = stop - 1; p >= start; --p) {
+  while (p < stop && p->token != EOSB_TOKEN) {
     const int t = p->token;
-    if (t == EOSB_TOKEN) {
-      tx_size = (TX_SIZE)p->extra;
-    } else {
 #if CONFIG_VP9_HIGHBITDEPTH
-    const vp10_extra_bit *const b =
-      (bit_depth == VPX_BITS_12) ? &vp10_extra_bits_high12[t] :
-      (bit_depth == VPX_BITS_10) ? &vp10_extra_bits_high10[t] :
-      &vp10_extra_bits[t];
+    const vp10_extra_bit *b;
+    if (bit_depth == VPX_BITS_12)
+      b = &vp10_extra_bits_high12[t];
+    else if (bit_depth == VPX_BITS_10)
+      b = &vp10_extra_bits_high10[t];
+    else
+      b = &vp10_extra_bits[t];
 #else
     const vp10_extra_bit *const b = &vp10_extra_bits[t];
-    (void) bit_depth;
+    (void)bit_depth;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    if (t != EOB_TOKEN && t != ZERO_TOKEN) {
-      // Write extra bits first
-      const int e = p->extra;
-      const int l = b->len;
-      const int skip_bits = (t == CATEGORY6_TOKEN) ? TX_SIZES - 1 - tx_size : 0;
-      assert(tx_size < TX_SIZES);
-      uabs_write(ans, e & 1, 128);
-      if (l) {
-        const int v = e >> 1;
-        int n;
-        for (n = 0; n < l - skip_bits; ++n) {
-          const int bb = (v >> n) & 1;
-          uabs_write(ans, bb, b->prob[l - 1 - n]);
-        }
-        for (; n < l; ++n) {
-          assert(((v >> n) & 1) == 0);
-        }
-      }
+    /* skip one or two nodes */
+    if (!p->skip_eob_node)
+      buf_uabs_write(ans, t != EOB_TOKEN, p->context_tree[0]);
 
-      {
+    if (t != EOB_TOKEN) {
+      buf_uabs_write(ans, t != ZERO_TOKEN, p->context_tree[1]);
+
+      if (t != ZERO_TOKEN) {
         struct rans_sym s;
         const rans_dec_lut *token_cdf =
             &token_tab[p->context_tree[PIVOT_NODE] - 1];
         s.cum_prob = (*token_cdf)[t - ONE_TOKEN];
         s.prob = (*token_cdf)[t - ONE_TOKEN + 1] - s.cum_prob;
-        rans_write(ans, &s);
+        buf_rans_write(ans, &s);
       }
     }
-    if (t != EOB_TOKEN)
-      uabs_write(ans, t != ZERO_TOKEN, p->context_tree[1]);
-    if (!p->skip_eob_node)
-      uabs_write(ans, t != EOB_TOKEN, p->context_tree[0]);
+
+    if (b->base_val) {
+      const int e = p->extra, l = b->len;
+      int skip_bits = (b->base_val == CAT6_MIN_VAL) ? TX_SIZES - 1 - tx : 0;
+
+      if (l) {
+        const unsigned char *pb = b->prob;
+        int v = e >> 1;
+        int n = l; /* number of bits in v, assumed nonzero */
+        int i = 0;
+
+        do {
+          const int bb = (v >> --n) & 1;
+          if (skip_bits) {
+            skip_bits--;
+            assert(!bb);
+          } else {
+            buf_uabs_write(ans, bb, pb[i >> 1]);
+          }
+          i = b->tree[i + bb];
+        } while (n);
+      }
+
+      buf_uabs_write(ans, e & 1, 128);
+    }
+    ++p;
+
+#if CONFIG_VAR_TX
+    ++count;
+    if (t == EOB_TOKEN || count == seg_eob) break;
+#endif  // CONFIG_VAR_TX
   }
-  }
+
+  *tp = p;
 }
 #endif  // !CONFIG_ANS
 
@@ -706,11 +751,11 @@
                            int blk_row, int blk_col, TX_SIZE tx_size) {
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-  int tx_idx = (blk_row >> (1 - pd->subsampling_y)) * 8 +
-               (blk_col >> (1 - pd->subsampling_x));
-  TX_SIZE plane_tx_size = plane ?
-      get_uv_tx_size_impl(mbmi->inter_tx_size[tx_idx], bsize, 0, 0) :
-      mbmi->inter_tx_size[tx_idx];
+  const int tx_row = blk_row >> (1 - pd->subsampling_y);
+  const int tx_col = blk_col >> (1 - pd->subsampling_x);
+  const TX_SIZE plane_tx_size = plane ?
+      get_uv_tx_size_impl(mbmi->inter_tx_size[tx_row][tx_col], bsize, 0, 0) :
+      mbmi->inter_tx_size[tx_row][tx_col];
   int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
   int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
 
@@ -1088,7 +1133,7 @@
                          mode_ctx);
 
 #if CONFIG_REF_MV
-        if (mode == NEARMV)
+        if (mode == NEARMV || mode == NEWMV)
           write_drl_idx(cm, mbmi, mbmi_ext, w);
 #endif
       }
@@ -1175,13 +1220,15 @@
 #else
       if (mode == NEWMV) {
 #endif  // CONFIG_EXT_INTER
+        int_mv ref_mv;
         for (ref = 0; ref < 1 + is_compound; ++ref) {
 #if CONFIG_REF_MV
-              int nmv_ctx =
-                  vp10_nmv_ctx(mbmi_ext->ref_mv_count[mbmi->ref_frame[ref]],
-                               mbmi_ext->ref_mv_stack[mbmi->ref_frame[ref]]);
-              const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+          int nmv_ctx =
+              vp10_nmv_ctx(mbmi_ext->ref_mv_count[mbmi->ref_frame[ref]],
+                           mbmi_ext->ref_mv_stack[mbmi->ref_frame[ref]]);
+          const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
 #endif
+          ref_mv = mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0];
 #if CONFIG_EXT_INTER
           if (mode == NEWFROMNEARMV)
             vp10_encode_mv(cpi, w, &mbmi->mv[ref].as_mv,
@@ -1190,8 +1237,8 @@
           else
 #endif  // CONFIG_EXT_INTER
           vp10_encode_mv(cpi, w, &mbmi->mv[ref].as_mv,
-                        &mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0].as_mv, nmvc,
-                        allow_hp);
+                         &ref_mv.as_mv, nmvc,
+                         allow_hp);
         }
 #if CONFIG_EXT_INTER
       } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
@@ -1408,9 +1455,31 @@
 #endif  // CONFIG_EXT_INTRA
 }
 
+#if CONFIG_ANS && CONFIG_SUPERTX
+#define write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end,            \
+                              supertx_enabled, mi_row, mi_col)            \
+  write_modes_b(cpi, tile, w, ans, tok, tok_end, supertx_enabled, mi_row, \
+                mi_col)
+#elif CONFIG_SUPERTX
+#define write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end, \
+                              supertx_enabled, mi_row, mi_col) \
+  write_modes_b(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col)
+#elif CONFIG_ANS
+#define write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end, \
+                              supertx_enabled, mi_row, mi_col) \
+  write_modes_b(cpi, tile, w, ans, tok, tok_end, mi_row, mi_col)
+#else
+#define write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end, \
+                              supertx_enabled, mi_row, mi_col) \
+  write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col)
+#endif  // CONFIG_ANS && CONFIG_SUPERTX
+
 static void write_modes_b(VP10_COMP *cpi, const TileInfo *const tile,
-                          vpx_writer *w, TOKENEXTRA **tok,
-                          const TOKENEXTRA *const tok_end,
+                          vpx_writer *w,
+#if CONFIG_ANS
+                          struct BufAnsCoder *ans,
+#endif  // CONFIG_ANS
+                          TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
 #if CONFIG_SUPERTX
                           int supertx_enabled,
 #endif
@@ -1457,7 +1526,7 @@
       assert(*tok < tok_end);
       pack_palette_tokens(w, tok, m->mbmi.palette_mode_info.palette_size[plane],
                           rows * cols - 1);
-      assert(*tok < tok_end);
+      assert(*tok < tok_end + m->mbmi.skip);
     }
   }
 
@@ -1465,7 +1534,6 @@
   if (supertx_enabled) return;
 #endif  // CONFIG_SUPERTX
 
-#if !CONFIG_ANS
   if (!m->mbmi.skip) {
     assert(*tok < tok_end);
     for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
@@ -1501,18 +1569,26 @@
 
         for (row = 0; row < num_4x4_h; row += bw)
           for (col = 0; col < num_4x4_w; col += bw)
+#if CONFIG_ANS
+            pack_mb_tokens_ans(ans, cm->token_tab, tok, tok_end, cm->bit_depth,
+                               tx);
+#else
             pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
+#endif  // CONFIG_ANS
       }
 #else
       TX_SIZE tx = plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane])
                          : m->mbmi.tx_size;
+#if CONFIG_ANS
+      pack_mb_tokens_ans(ans, cm->token_tab, tok, tok_end, cm->bit_depth, tx);
+#else
       pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
+#endif  // CONFIG_ANS
 #endif  // CONFIG_VAR_TX
       assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
       (*tok)++;
     }
   }
-#endif
 }
 
 static void write_partition(const VP10_COMMON *const cm,
@@ -1525,7 +1601,15 @@
   const int has_cols = (mi_col + hbs) < cm->mi_cols;
 
   if (has_rows && has_cols) {
+#if CONFIG_EXT_PARTITION_TYPES
+    if (bsize <= BLOCK_8X8)
+      vp10_write_token(w, vp10_partition_tree, probs, &partition_encodings[p]);
+    else
+      vp10_write_token(w, vp10_ext_partition_tree, probs,
+                      &ext_partition_encodings[p]);
+#else
     vp10_write_token(w, vp10_partition_tree, probs, &partition_encodings[p]);
+#endif  // CONFIG_EXT_PARTITION_TYPES
   } else if (!has_rows && has_cols) {
     assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
     vpx_write(w, p == PARTITION_SPLIT, probs[1]);
@@ -1537,8 +1621,31 @@
   }
 }
 
-static void write_modes_sb(VP10_COMP *cpi,
-                           const TileInfo *const tile, vpx_writer *w,
+#if CONFIG_ANS && CONFIG_SUPERTX
+#define write_modes_sb_wrapper(cpi, tile, w, ans, tok, tok_end,            \
+                               supertx_enabled, mi_row, mi_col, bsize)     \
+  write_modes_sb(cpi, tile, w, ans, tok, tok_end, supertx_enabled, mi_row, \
+                 mi_col, bsize)
+#elif CONFIG_SUPERTX
+#define write_modes_sb_wrapper(cpi, tile, w, ans, tok, tok_end,               \
+                               supertx_enabled, mi_row, mi_col, bsize)        \
+  write_modes_sb(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col, \
+                 bsize)
+#elif CONFIG_ANS
+#define write_modes_sb_wrapper(cpi, tile, w, ans, tok, tok_end,        \
+                               supertx_enabled, mi_row, mi_col, bsize) \
+  write_modes_sb(cpi, tile, w, ans, tok, tok_end, mi_row, mi_col, bsize)
+#else
+#define write_modes_sb_wrapper(cpi, tile, w, ans, tok, tok_end,        \
+                               supertx_enabled, mi_row, mi_col, bsize) \
+  write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, bsize)
+#endif  // CONFIG_ANS && CONFIG_SUPERTX
+
+static void write_modes_sb(VP10_COMP *cpi, const TileInfo *const tile,
+                           vpx_writer *w,
+#if CONFIG_ANS
+                           struct BufAnsCoder *ans,
+#endif  // CONFIG_ANS
                            TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
 #if CONFIG_SUPERTX
                            int supertx_enabled,
@@ -1564,6 +1671,10 @@
   m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
 
   partition = partition_lookup[bsl][m->mbmi.sb_type];
+#if CONFIG_EXT_PARTITION_TYPES
+  partition = get_partition(cm->mi, cm->mi_stride, cm->mi_rows, cm->mi_cols,
+                            mi_row, mi_col, bsize);
+#endif
   write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
   subsize = get_subsize(bsize, partition);
 #if CONFIG_SUPERTX
@@ -1607,68 +1718,108 @@
   }
 #endif  // CONFIG_SUPERTX
   if (subsize < BLOCK_8X8) {
-    write_modes_b(cpi, tile, w, tok, tok_end,
-#if CONFIG_SUPERTX
-                  supertx_enabled,
-#endif  // CONFIG_SUPERTX
-                  mi_row, mi_col);
+    write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end, supertx_enabled,
+                          mi_row, mi_col);
   } else {
     switch (partition) {
       case PARTITION_NONE:
-        write_modes_b(cpi, tile, w, tok, tok_end,
-#if CONFIG_SUPERTX
-                      supertx_enabled,
-#endif  // CONFIG_SUPERTX
-                      mi_row, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
         break;
       case PARTITION_HORZ:
-        write_modes_b(cpi, tile, w, tok, tok_end,
-#if CONFIG_SUPERTX
-                      supertx_enabled,
-#endif  // CONFIG_SUPERTX
-                      mi_row, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
         if (mi_row + bs < cm->mi_rows)
-          write_modes_b(cpi, tile, w, tok, tok_end,
-#if CONFIG_SUPERTX
-                        supertx_enabled,
-#endif  // CONFIG_SUPERTX
-                        mi_row + bs, mi_col);
+          write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end,
+                                supertx_enabled, mi_row + bs, mi_col);
         break;
       case PARTITION_VERT:
+        write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
+        if (mi_col + bs < cm->mi_cols)
+          write_modes_b_wrapper(cpi, tile, w, ans, tok, tok_end,
+                                supertx_enabled, mi_row, mi_col + bs);
+        break;
+      case PARTITION_SPLIT:
+        write_modes_sb_wrapper(cpi, tile, w, ans, tok, tok_end, supertx_enabled,
+                               mi_row, mi_col, subsize);
+        write_modes_sb_wrapper(cpi, tile, w, ans, tok, tok_end, supertx_enabled,
+                               mi_row, mi_col + bs, subsize);
+        write_modes_sb_wrapper(cpi, tile, w, ans, tok, tok_end, supertx_enabled,
+                               mi_row + bs, mi_col, subsize);
+        write_modes_sb_wrapper(cpi, tile, w, ans, tok, tok_end, supertx_enabled,
+                               mi_row + bs, mi_col + bs, subsize);
+        break;
+#if CONFIG_EXT_PARTITION_TYPES
+      case PARTITION_HORZ_A:
         write_modes_b(cpi, tile, w, tok, tok_end,
 #if CONFIG_SUPERTX
                       supertx_enabled,
-#endif  // CONFIG_SUPERTX
+#endif
                       mi_row, mi_col);
-        if (mi_col + bs < cm->mi_cols)
-          write_modes_b(cpi, tile, w, tok, tok_end,
+        write_modes_b(cpi, tile, w, tok, tok_end,
 #if CONFIG_SUPERTX
-                        supertx_enabled,
-#endif  // CONFIG_SUPERTX
-                        mi_row, mi_col + bs);
+                      supertx_enabled,
+#endif
+                      mi_row, mi_col + bs);
+        write_modes_b(cpi, tile, w, tok, tok_end,
+#if CONFIG_SUPERTX
+                      supertx_enabled,
+#endif
+                      mi_row + bs, mi_col);
         break;
-      case PARTITION_SPLIT:
-        write_modes_sb(cpi, tile, w, tok, tok_end,
+      case PARTITION_HORZ_B:
+        write_modes_b(cpi, tile, w, tok, tok_end,
 #if CONFIG_SUPERTX
-                       supertx_enabled,
-#endif  // CONFIG_SUPERTX
-                       mi_row, mi_col, subsize);
-        write_modes_sb(cpi, tile, w, tok, tok_end,
+                      supertx_enabled,
+#endif
+                      mi_row, mi_col);
+        write_modes_b(cpi, tile, w, tok, tok_end,
 #if CONFIG_SUPERTX
-                       supertx_enabled,
-#endif  // CONFIG_SUPERTX
-                       mi_row, mi_col + bs, subsize);
-        write_modes_sb(cpi, tile, w, tok, tok_end,
+                      supertx_enabled,
+#endif
+                      mi_row + bs, mi_col);
+        write_modes_b(cpi, tile, w, tok, tok_end,
 #if CONFIG_SUPERTX
-                       supertx_enabled,
-#endif  // CONFIG_SUPERTX
-                       mi_row + bs, mi_col, subsize);
-        write_modes_sb(cpi, tile, w, tok, tok_end,
-#if CONFIG_SUPERTX
-                       supertx_enabled,
-#endif  // CONFIG_SUPERTX
-                       mi_row + bs, mi_col + bs, subsize);
+                      supertx_enabled,
+#endif
+                      mi_row + bs, mi_col + bs);
         break;
+      case PARTITION_VERT_A:
+        write_modes_b(cpi, tile, w, tok, tok_end,
+#if CONFIG_SUPERTX
+                      supertx_enabled,
+#endif
+                      mi_row, mi_col);
+        write_modes_b(cpi, tile, w, tok, tok_end,
+#if CONFIG_SUPERTX
+                      supertx_enabled,
+#endif
+                      mi_row + bs, mi_col);
+        write_modes_b(cpi, tile, w, tok, tok_end,
+#if CONFIG_SUPERTX
+                      supertx_enabled,
+#endif
+                      mi_row, mi_col + bs);
+        break;
+      case PARTITION_VERT_B:
+        write_modes_b(cpi, tile, w, tok, tok_end,
+#if CONFIG_SUPERTX
+                      supertx_enabled,
+#endif
+                      mi_row, mi_col);
+        write_modes_b(cpi, tile, w, tok, tok_end,
+#if CONFIG_SUPERTX
+                      supertx_enabled,
+#endif
+                      mi_row, mi_col + bs);
+        write_modes_b(cpi, tile, w, tok, tok_end,
+#if CONFIG_SUPERTX
+                      supertx_enabled,
+#endif
+                      mi_row + bs, mi_col + bs);
+        break;
+#endif  // CONFIG_EXT_PARTITION_TYPES
       default:
         assert(0);
     }
@@ -1689,7 +1840,12 @@
 
       for (row = 0; row < num_4x4_h; row += bw)
         for (col = 0; col < num_4x4_w; col += bw)
+#if CONFIG_ANS
+          pack_mb_tokens_ans(ans, cm->token_tab, tok, tok_end, cm->bit_depth,
+                             tx);
+#else
           pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
+#endif
       assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
       (*tok)++;
     }
@@ -1697,30 +1853,31 @@
 #endif  // CONFIG_SUPERTX
 
   // update partition context
+#if CONFIG_EXT_PARTITION_TYPES
+  update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+#else
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
-static void write_modes(VP10_COMP *cpi,
-                        const TileInfo *const tile, vpx_writer *w,
+static void write_modes(VP10_COMP *cpi, const TileInfo *const tile,
+                        vpx_writer *w,
+#if CONFIG_ANS
+                        struct BufAnsCoder *ans,
+#endif  // CONFIG_ANS
                         TOKENEXTRA **tok, const TOKENEXTRA *const tok_end) {
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   int mi_row, mi_col;
 
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
        mi_row += MI_BLOCK_SIZE) {
-    vp10_zero(xd->left_seg_context);
-#if CONFIG_VAR_TX
-    vp10_zero(xd->left_txfm_context_buffer);
-#endif
+    vp10_zero_left_context(xd);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE)
-      write_modes_sb(cpi, tile, w, tok, tok_end,
-#if CONFIG_SUPERTX
-                     0,
-#endif
-                     mi_row, mi_col, BLOCK_64X64);
+      write_modes_sb_wrapper(cpi, tile, w, ans, tok, tok_end, 0, mi_row, mi_col,
+                             BLOCK_64X64);
   }
 }
 
@@ -2182,20 +2339,20 @@
   vpx_writer mode_bc;
 #if CONFIG_ANS
   struct AnsCoder token_ans;
-#endif
+  struct BufAnsCoder buffered_ans;
+#endif  // CONFIG_ANS
   int tile_row, tile_col;
   TOKENEXTRA *tok_end;
   size_t total_size = 0;
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
   unsigned int max_tile = 0;
+  const int ans_window_size = get_token_alloc(cm->mb_rows, cm->mb_cols) * 3;
+  struct buffered_ans_symbol *uco_ans_buf =
+      malloc(ans_window_size * sizeof(*uco_ans_buf));
+  assert(uco_ans_buf);
 
-  memset(cm->above_seg_context, 0,
-         sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
-#if CONFIG_VAR_TX
-  memset(cm->above_txfm_context, 0,
-         sizeof(*cm->above_txfm_context) * mi_cols_aligned_to_sb(cm->mi_cols));
-#endif
+  vp10_zero_above_context(cm, 0, mi_cols_aligned_to_sb(cm->mi_cols));
 
   for (tile_row = 0; tile_row < tile_rows; tile_row++) {
     for (tile_col = 0; tile_col < tile_cols; tile_col++) {
@@ -2213,8 +2370,8 @@
 
 #if !CONFIG_ANS
       (void) token_section_size;
-      write_modes(cpi, &cpi->tile_data[tile_idx].tile_info,
-                  &mode_bc, &tok, tok_end);
+      write_modes(cpi, &cpi->tile_data[tile_idx].tile_info, &mode_bc, &tok,
+                  tok_end);
       assert(tok == tok_end);
       vpx_stop_encode(&mode_bc);
       if (put_tile_size) {
@@ -2229,12 +2386,13 @@
       }
       total_size += mode_bc.pos;
 #else
+      buf_ans_write_init(&buffered_ans, uco_ans_buf, ans_window_size);
       write_modes(cpi, &cpi->tile_data[tile_idx].tile_info, &mode_bc,
-                  NULL, NULL);
+                  &buffered_ans, &tok, tok_end);
+      assert(tok == tok_end);
       vpx_stop_encode(&mode_bc);
       ans_write_init(&token_ans, mode_data_start + mode_bc.pos);
-      pack_mb_tokens_ans(&token_ans, cm->token_tab, tok, tok_end,
-                         cm->bit_depth);
+      buf_ans_flush(&buffered_ans, &token_ans);
       token_section_size = ans_write_end(&token_ans);
       if (put_tile_size) {
         // size of this tile
@@ -2248,6 +2406,9 @@
   }
   *max_tile_sz = max_tile;
 
+#if CONFIG_ANS
+  free(uco_ans_buf);
+#endif  // CONFIG_ANS
   return total_size;
 }
 
@@ -2467,9 +2628,18 @@
     prob_diff_update(vp10_intra_mode_tree, fc->uv_mode_prob[i],
                      counts->uv_mode[i], INTRA_MODES, &header_bc);
 
+#if CONFIG_EXT_PARTITION_TYPES
+  prob_diff_update(vp10_partition_tree, fc->partition_prob[0],
+                   counts->partition[0], PARTITION_TYPES, &header_bc);
+  for (i = 1; i < PARTITION_CONTEXTS; ++i)
+    prob_diff_update(vp10_ext_partition_tree, fc->partition_prob[i],
+                     counts->partition[i], EXT_PARTITION_TYPES,
+                     &header_bc);
+#else
   for (i = 0; i < PARTITION_CONTEXTS; ++i)
     prob_diff_update(vp10_partition_tree, fc->partition_prob[i],
                      counts->partition[i], PARTITION_TYPES, &header_bc);
+#endif  // CONFIG_EXT_PARTITION_TYPES
 
 #if CONFIG_EXT_INTRA
   for (i = 0; i < INTRA_FILTERS + 1; ++i)
diff --git a/vp10/encoder/block.h b/vp10/encoder/block.h
index ce650b1..295213f 100644
--- a/vp10/encoder/block.h
+++ b/vp10/encoder/block.h
@@ -164,12 +164,12 @@
   int quant_fp;
 
   // skip forward transform and quantization
-  uint8_t skip_txfm[MAX_MB_PLANE << 2];
+  uint8_t skip_txfm[MAX_MB_PLANE][4];
   #define SKIP_TXFM_NONE 0
   #define SKIP_TXFM_AC_DC 1
   #define SKIP_TXFM_AC_ONLY 2
 
-  int64_t bsse[MAX_MB_PLANE << 2];
+  int64_t bsse[MAX_MB_PLANE][4];
 
   // Used to store sub partition's choices.
   MV pred_mv[MAX_REF_FRAMES];
diff --git a/vp10/encoder/buf_ans.h b/vp10/encoder/buf_ans.h
new file mode 100644
index 0000000..ae76873
--- /dev/null
+++ b/vp10/encoder/buf_ans.h
@@ -0,0 +1,87 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_BUF_ANS_H_
+#define VP10_ENCODER_BUF_ANS_H_
+// Buffered forward ANS writer.
+// Symbols are written to the writer in forward (decode) order and serialzed
+// backwards due to ANS's stack like behavior.
+
+#include <assert.h>
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem_ops.h"
+#include "vp10/common/ans.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#define ANS_METHOD_UABS 0
+#define ANS_METHOD_RANS 1
+
+struct buffered_ans_symbol {
+  uint8_t method;    // one of ANS_METHOD_UABS or ANS_METHOD_RANS
+  // TODO(aconverse): Should be possible to write this interms of start for ABS
+  AnsP8 val_start;  // Boolean value for ABS, start in symbol cycle for Rans
+  AnsP8 prob;  // Probability of this symbol
+};
+
+struct BufAnsCoder {
+  struct buffered_ans_symbol *buf;
+  int size;
+  int offset;
+};
+
+static INLINE void buf_ans_write_init(struct BufAnsCoder *const c,
+                                      struct buffered_ans_symbol *sym_arr,
+                                      int size) {
+  c->buf = sym_arr;
+  c->size = size;
+  c->offset = 0;
+}
+
+static INLINE void buf_uabs_write(struct BufAnsCoder *const c,
+                             uint8_t val, AnsP8 prob) {
+  assert(c->offset < c->size);
+  c->buf[c->offset].method = ANS_METHOD_UABS;
+  c->buf[c->offset].val_start = val;
+  c->buf[c->offset].prob = prob;
+  ++c->offset;
+}
+
+static INLINE void buf_rans_write(struct BufAnsCoder *const c,
+                                  const struct rans_sym *const sym) {
+  assert(c->offset < c->size);
+  c->buf[c->offset].method = ANS_METHOD_RANS;
+  c->buf[c->offset].val_start = sym->cum_prob;
+  c->buf[c->offset].prob = sym->prob;
+  ++c->offset;
+}
+
+static INLINE void buf_ans_flush(const struct BufAnsCoder *const c,
+                                 struct AnsCoder *ans) {
+  int offset;
+  for (offset = c->offset - 1; offset >= 0; --offset) {
+    if (c->buf[offset].method == ANS_METHOD_RANS) {
+      struct rans_sym sym;
+      sym.prob = c->buf[offset].prob;
+      sym.cum_prob = c->buf[offset].val_start;
+      rans_write(ans, &sym);
+    } else {
+      uabs_write(ans, c->buf[offset].val_start, c->buf[offset].prob);
+    }
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // VP10_ENCODER_BUF_ANS_H_
diff --git a/vp10/encoder/context_tree.c b/vp10/encoder/context_tree.c
index 3cd23ec..0a76195 100644
--- a/vp10/encoder/context_tree.c
+++ b/vp10/encoder/context_tree.c
@@ -19,11 +19,17 @@
 };
 
 static void alloc_mode_context(VP10_COMMON *cm, int num_4x4_blk,
+#if CONFIG_EXT_PARTITION_TYPES
+                               PARTITION_TYPE partition,
+#endif
                                PICK_MODE_CONTEXT *ctx) {
   const int num_blk = (num_4x4_blk < 4 ? 4 : num_4x4_blk);
   const int num_pix = num_blk << 4;
   int i, k;
   ctx->num_4x4_blk = num_blk;
+#if CONFIG_EXT_PARTITION_TYPES
+  ctx->partition = partition;
+#endif
 
   CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
                   vpx_calloc(num_blk, sizeof(uint8_t)));
@@ -78,6 +84,46 @@
 
 static void alloc_tree_contexts(VP10_COMMON *cm, PC_TREE *tree,
                                 int num_4x4_blk) {
+#if CONFIG_EXT_PARTITION_TYPES
+  alloc_mode_context(cm, num_4x4_blk, PARTITION_NONE, &tree->none);
+  alloc_mode_context(cm, num_4x4_blk/2, PARTITION_HORZ, &tree->horizontal[0]);
+  alloc_mode_context(cm, num_4x4_blk/2, PARTITION_VERT, &tree->vertical[0]);
+  alloc_mode_context(cm, num_4x4_blk/2, PARTITION_VERT, &tree->horizontal[1]);
+  alloc_mode_context(cm, num_4x4_blk/2, PARTITION_VERT, &tree->vertical[1]);
+
+  alloc_mode_context(cm, num_4x4_blk/4, PARTITION_HORZ_A,
+                     &tree->horizontala[0]);
+  alloc_mode_context(cm, num_4x4_blk/4, PARTITION_HORZ_A,
+                     &tree->horizontala[1]);
+  alloc_mode_context(cm, num_4x4_blk/2, PARTITION_HORZ_A,
+                     &tree->horizontala[2]);
+  alloc_mode_context(cm, num_4x4_blk/2, PARTITION_HORZ_B,
+                     &tree->horizontalb[0]);
+  alloc_mode_context(cm, num_4x4_blk/4, PARTITION_HORZ_B,
+                     &tree->horizontalb[1]);
+  alloc_mode_context(cm, num_4x4_blk/4, PARTITION_HORZ_B,
+                     &tree->horizontalb[2]);
+  alloc_mode_context(cm, num_4x4_blk/4, PARTITION_VERT_A, &tree->verticala[0]);
+  alloc_mode_context(cm, num_4x4_blk/4, PARTITION_VERT_A, &tree->verticala[1]);
+  alloc_mode_context(cm, num_4x4_blk/2, PARTITION_VERT_A, &tree->verticala[2]);
+  alloc_mode_context(cm, num_4x4_blk/2, PARTITION_VERT_B, &tree->verticalb[0]);
+  alloc_mode_context(cm, num_4x4_blk/4, PARTITION_VERT_B, &tree->verticalb[1]);
+  alloc_mode_context(cm, num_4x4_blk/4, PARTITION_VERT_B, &tree->verticalb[2]);
+#ifdef CONFIG_SUPERTX
+  alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ,
+                     &tree->horizontal_supertx);
+  alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT, &tree->vertical_supertx);
+  alloc_mode_context(cm, num_4x4_blk, PARTITION_SPLIT, &tree->split_supertx);
+  alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ_A,
+                     &tree->horizontala_supertx);
+  alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ_B,
+                     &tree->horizontalb_supertx);
+  alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT_A,
+                     &tree->verticala_supertx);
+  alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT_B,
+                     &tree->verticalb_supertx);
+#endif  // CONFIG_SUPERTX
+#else
   alloc_mode_context(cm, num_4x4_blk, &tree->none);
   alloc_mode_context(cm, num_4x4_blk/2, &tree->horizontal[0]);
   alloc_mode_context(cm, num_4x4_blk/2, &tree->vertical[0]);
@@ -94,9 +140,19 @@
     memset(&tree->horizontal[1], 0, sizeof(tree->horizontal[1]));
     memset(&tree->vertical[1], 0, sizeof(tree->vertical[1]));
   }
+#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
 static void free_tree_contexts(PC_TREE *tree) {
+#if CONFIG_EXT_PARTITION_TYPES
+  int i;
+  for (i = 0; i < 3; i++) {
+    free_mode_context(&tree->horizontala[i]);
+    free_mode_context(&tree->horizontalb[i]);
+    free_mode_context(&tree->verticala[i]);
+    free_mode_context(&tree->verticalb[i]);
+  }
+#endif  // CONFIG_EXT_PARTITION_TYPES
   free_mode_context(&tree->none);
   free_mode_context(&tree->horizontal[0]);
   free_mode_context(&tree->horizontal[1]);
@@ -106,7 +162,13 @@
   free_mode_context(&tree->horizontal_supertx);
   free_mode_context(&tree->vertical_supertx);
   free_mode_context(&tree->split_supertx);
-#endif
+#if CONFIG_EXT_PARTITION_TYPES
+  free_mode_context(&tree->horizontala_supertx);
+  free_mode_context(&tree->horizontalb_supertx);
+  free_mode_context(&tree->verticala_supertx);
+  free_mode_context(&tree->verticalb_supertx);
+#endif  // CONFIG_EXT_PARTITION_TYPES
+#endif  // CONFIG_SUPERTX
 }
 
 // This function sets up a tree of contexts such that at each square
@@ -135,8 +197,13 @@
 
   // 4x4 blocks smaller than 8x8 but in the same 8x8 block share the same
   // context so we only need to allocate 1 for each 8x8 block.
-  for (i = 0; i < leaf_nodes; ++i)
+  for (i = 0; i < leaf_nodes; ++i) {
+#if CONFIG_EXT_PARTITION_TYPES
+    alloc_mode_context(cm, 1, PARTITION_NONE, &td->leaf_tree[i]);
+#else
     alloc_mode_context(cm, 1, &td->leaf_tree[i]);
+#endif
+  }
 
   // Sets up all the leaf nodes in the tree.
   for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
diff --git a/vp10/encoder/context_tree.h b/vp10/encoder/context_tree.h
index 4fa5806..de17e3e 100644
--- a/vp10/encoder/context_tree.h
+++ b/vp10/encoder/context_tree.h
@@ -54,7 +54,6 @@
   int hybrid_pred_diff;
   int comp_pred_diff;
   int single_pred_diff;
-  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
 
   // TODO(jingning) Use RD_COST struct here instead. This involves a boarder
   // scope of refactoring.
@@ -74,6 +73,9 @@
   // search loop
   MV pred_mv[MAX_REF_FRAMES];
   INTERP_FILTER pred_interp_filter;
+#if CONFIG_EXT_PARTITION_TYPES
+  PARTITION_TYPE partition;
+#endif
 } PICK_MODE_CONTEXT;
 
 typedef struct PC_TREE {
@@ -83,6 +85,12 @@
   PICK_MODE_CONTEXT none;
   PICK_MODE_CONTEXT horizontal[2];
   PICK_MODE_CONTEXT vertical[2];
+#if CONFIG_EXT_PARTITION_TYPES
+  PICK_MODE_CONTEXT horizontala[3];
+  PICK_MODE_CONTEXT horizontalb[3];
+  PICK_MODE_CONTEXT verticala[3];
+  PICK_MODE_CONTEXT verticalb[3];
+#endif
   union {
     struct PC_TREE *split[4];
     PICK_MODE_CONTEXT *leaf_split[4];
@@ -91,6 +99,12 @@
   PICK_MODE_CONTEXT horizontal_supertx;
   PICK_MODE_CONTEXT vertical_supertx;
   PICK_MODE_CONTEXT split_supertx;
+#if CONFIG_EXT_PARTITION_TYPES
+  PICK_MODE_CONTEXT horizontala_supertx;
+  PICK_MODE_CONTEXT horizontalb_supertx;
+  PICK_MODE_CONTEXT verticala_supertx;
+  PICK_MODE_CONTEXT verticalb_supertx;
+#endif
 #endif
 } PC_TREE;
 
diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c
index 8a1ee20..11d4a8e 100644
--- a/vp10/encoder/dct.c
+++ b/vp10/encoder/dct.c
@@ -36,219 +36,6 @@
 #endif
 }
 
-#if CONFIG_EXT_TX
-void fdst4(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t step[4];
-  tran_high_t temp1, temp2;
-
-  step[0] = input[0] - input[3];
-  step[1] = -input[1] + input[2];
-  step[2] = -input[1] - input[2];
-  step[3] = input[0] + input[3];
-
-  temp1 = (step[0] + step[1]) * cospi_16_64;
-  temp2 = (step[0] - step[1]) * cospi_16_64;
-  output[3] = fdct_round_shift(temp1);
-  output[1] = fdct_round_shift(temp2);
-  temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
-  temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
-  output[2] = fdct_round_shift(temp1);
-  output[0] = fdct_round_shift(temp2);
-}
-
-void fdst8(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
-  tran_high_t t0, t1, t2, t3;                  // needs32
-  tran_high_t x0, x1, x2, x3;                  // canbe16
-
-  // stage 1
-  s0 = input[0] - input[7];
-  s1 = -input[1] + input[6];
-  s2 = input[2] - input[5];
-  s3 = -input[3] + input[4];
-  s4 = -input[3] - input[4];
-  s5 = input[2] + input[5];
-  s6 = -input[1] - input[6];
-  s7 = input[0] + input[7];
-
-  x0 = s0 + s3;
-  x1 = s1 + s2;
-  x2 = s1 - s2;
-  x3 = s0 - s3;
-  t0 = (x0 + x1) * cospi_16_64;
-  t1 = (x0 - x1) * cospi_16_64;
-  t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
-  t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
-  output[7] = fdct_round_shift(t0);
-  output[5] = fdct_round_shift(t2);
-  output[3] = fdct_round_shift(t1);
-  output[1] = fdct_round_shift(t3);
-
-  // Stage 2
-  t0 = (s6 - s5) * cospi_16_64;
-  t1 = (s6 + s5) * cospi_16_64;
-  t2 = fdct_round_shift(t0);
-  t3 = fdct_round_shift(t1);
-
-  // Stage 3
-  x0 = s4 + t2;
-  x1 = s4 - t2;
-  x2 = s7 - t3;
-  x3 = s7 + t3;
-
-  // Stage 4
-  t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
-  t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
-  t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-  t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
-  output[6] = fdct_round_shift(t0);
-  output[4] = fdct_round_shift(t2);
-  output[2] = fdct_round_shift(t1);
-  output[0] = fdct_round_shift(t3);
-}
-
-void fdst16(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t step1[8];      // canbe16
-  tran_high_t step2[8];      // canbe16
-  tran_high_t step3[8];      // canbe16
-  tran_high_t in[8];         // canbe16
-  tran_high_t temp1, temp2;  // needs32
-
-  // step 1
-  in[0] = input[0] - input[15];
-  in[1] = -input[1] + input[14];
-  in[2] = input[2] - input[13];
-  in[3] = -input[3] + input[12];
-  in[4] = input[4] - input[11];
-  in[5] = -input[5] + input[10];
-  in[6] = input[6] - input[ 9];
-  in[7] = -input[7] + input[ 8];
-
-  step1[0] = -input[7] - input[ 8];
-  step1[1] = input[6] + input[ 9];
-  step1[2] = -input[5] - input[10];
-  step1[3] = input[4] + input[11];
-  step1[4] = -input[3] - input[12];
-  step1[5] = input[2] + input[13];
-  step1[6] = -input[1] - input[14];
-  step1[7] = input[0] + input[15];
-
-  // fdct8(step, step);
-  {
-    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
-    tran_high_t t0, t1, t2, t3;                  // needs32
-    tran_high_t x0, x1, x2, x3;                  // canbe16
-
-    // stage 1
-    s0 = in[0] + in[7];
-    s1 = in[1] + in[6];
-    s2 = in[2] + in[5];
-    s3 = in[3] + in[4];
-    s4 = in[3] - in[4];
-    s5 = in[2] - in[5];
-    s6 = in[1] - in[6];
-    s7 = in[0] - in[7];
-
-    // fdct4(step, step);
-    x0 = s0 + s3;
-    x1 = s1 + s2;
-    x2 = s1 - s2;
-    x3 = s0 - s3;
-    t0 = (x0 + x1) * cospi_16_64;
-    t1 = (x0 - x1) * cospi_16_64;
-    t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
-    t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
-    output[15] = fdct_round_shift(t0);
-    output[11] = fdct_round_shift(t2);
-    output[7] = fdct_round_shift(t1);
-    output[3] = fdct_round_shift(t3);
-
-    // Stage 2
-    t0 = (s6 - s5) * cospi_16_64;
-    t1 = (s6 + s5) * cospi_16_64;
-    t2 = fdct_round_shift(t0);
-    t3 = fdct_round_shift(t1);
-
-    // Stage 3
-    x0 = s4 + t2;
-    x1 = s4 - t2;
-    x2 = s7 - t3;
-    x3 = s7 + t3;
-
-    // Stage 4
-    t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
-    t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
-    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-    t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
-    output[13] = fdct_round_shift(t0);
-    output[9] = fdct_round_shift(t2);
-    output[5] = fdct_round_shift(t1);
-    output[1] = fdct_round_shift(t3);
-  }
-
-  // step 2
-  temp1 = (step1[5] - step1[2]) * cospi_16_64;
-  temp2 = (step1[4] - step1[3]) * cospi_16_64;
-  step2[2] = fdct_round_shift(temp1);
-  step2[3] = fdct_round_shift(temp2);
-  temp1 = (step1[4] + step1[3]) * cospi_16_64;
-  temp2 = (step1[5] + step1[2]) * cospi_16_64;
-  step2[4] = fdct_round_shift(temp1);
-  step2[5] = fdct_round_shift(temp2);
-
-  // step 3
-  step3[0] = step1[0] + step2[3];
-  step3[1] = step1[1] + step2[2];
-  step3[2] = step1[1] - step2[2];
-  step3[3] = step1[0] - step2[3];
-  step3[4] = step1[7] - step2[4];
-  step3[5] = step1[6] - step2[5];
-  step3[6] = step1[6] + step2[5];
-  step3[7] = step1[7] + step2[4];
-
-  // step 4
-  temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
-  temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
-  step2[1] = fdct_round_shift(temp1);
-  step2[2] = fdct_round_shift(temp2);
-  temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
-  temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
-  step2[5] = fdct_round_shift(temp1);
-  step2[6] = fdct_round_shift(temp2);
-
-  // step 5
-  step1[0] = step3[0] + step2[1];
-  step1[1] = step3[0] - step2[1];
-  step1[2] = step3[3] + step2[2];
-  step1[3] = step3[3] - step2[2];
-  step1[4] = step3[4] - step2[5];
-  step1[5] = step3[4] + step2[5];
-  step1[6] = step3[7] - step2[6];
-  step1[7] = step3[7] + step2[6];
-
-  // step 6
-  temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
-  temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
-  output[14] = fdct_round_shift(temp1);
-  output[6] = fdct_round_shift(temp2);
-
-  temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
-  temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
-  output[10] = fdct_round_shift(temp1);
-  output[2] = fdct_round_shift(temp2);
-
-  temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
-  temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
-  output[12] = fdct_round_shift(temp1);
-  output[4] = fdct_round_shift(temp2);
-
-  temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
-  temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
-  output[8] = fdct_round_shift(temp1);
-  output[0] = fdct_round_shift(temp2);
-}
-#endif  // CONFIG_EXT_TX
-
 static void fdct4(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
   tran_low_t step[4];
@@ -1236,22 +1023,6 @@
     output[i] = input[i] * 4;
 }
 
-// For use in lieu of DST
-static void fhalfcenter32(const tran_low_t *input, tran_low_t *output) {
-  int i;
-  tran_low_t inputhalf[16];
-  for (i = 0; i < 8; ++i) {
-    output[16 + i] = input[i] * 4;
-    output[24 + i] = input[24 + i] * 4;
-  }
-  // Multiply input by sqrt(2)
-  for (i = 0; i < 16; ++i) {
-    inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 8] * Sqrt2);
-  }
-  fdct16(inputhalf, output);
-  // Note overall scaling factor is 4 times orthogonal
-}
-
 // For use in lieu of ADST
 static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
   int i;
@@ -1334,25 +1105,22 @@
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
-    case DST_DST:
-    case DCT_DST:
-    case DST_DCT:
-    case DST_ADST:
-    case ADST_DST:
     case IDTX:
-    case H_DCT:
     case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
       break;
     case FLIPADST_DCT:
     case FLIPADST_ADST:
-    case FLIPADST_DST:
+    case V_FLIPADST:
       copy_flipud(*src, *src_stride, l, buff, l);
       *src = buff;
       *src_stride = l;
       break;
     case DCT_FLIPADST:
     case ADST_FLIPADST:
-    case DST_FLIPADST:
+    case H_FLIPADST:
       copy_fliplr(*src, *src_stride, l, buff, l);
       *src = buff;
       *src_stride = l;
@@ -1370,98 +1138,86 @@
 #endif  // CONFIG_EXT_TX
 
 static const transform_2d FHT_4[] = {
-  { fdct4,  fdct4  },  // DCT_DCT           = 0,
-  { fadst4, fdct4  },  // ADST_DCT          = 1,
-  { fdct4,  fadst4 },  // DCT_ADST          = 2,
-  { fadst4, fadst4 },  // ADST_ADST         = 3,
+  { fdct4,  fdct4  },  // DCT_DCT
+  { fadst4, fdct4  },  // ADST_DCT
+  { fdct4,  fadst4 },  // DCT_ADST
+  { fadst4, fadst4 },  // ADST_ADST
 #if CONFIG_EXT_TX
-  { fadst4, fdct4  },  // FLIPADST_DCT      = 4,
-  { fdct4,  fadst4 },  // DCT_FLIPADST      = 5,
-  { fadst4, fadst4 },  // FLIPADST_FLIPADST = 6,
-  { fadst4, fadst4 },  // ADST_FLIPADST     = 7,
-  { fadst4, fadst4 },  // FLIPADST_ADST     = 8,
-  { fdst4,  fdct4  },  // DST_DCT           = 9,
-  { fdct4,  fdst4  },  // DCT_DST           = 10,
-  { fdst4,  fadst4 },  // DST_ADST          = 11,
-  { fadst4, fdst4  },  // ADST_DST          = 12,
-  { fdst4,  fadst4 },  // DST_FLIPADST      = 13,
-  { fadst4, fdst4  },  // FLIPADST_DST      = 14,
-  { fdst4,  fdst4  },  // DST_DST           = 15
-  { fidtx4, fidtx4 },  // IDTX              = 16
-  { fdct4,  fidtx4 },  // V_DCT             = 17
-  { fidtx4, fdct4  },  // H_DCT             = 18
+  { fadst4, fdct4  },  // FLIPADST_DCT
+  { fdct4,  fadst4 },  // DCT_FLIPADST
+  { fadst4, fadst4 },  // FLIPADST_FLIPADST
+  { fadst4, fadst4 },  // ADST_FLIPADST
+  { fadst4, fadst4 },  // FLIPADST_ADST
+  { fidtx4, fidtx4 },  // IDTX
+  { fdct4,  fidtx4 },  // V_DCT
+  { fidtx4, fdct4  },  // H_DCT
+  { fadst4, fidtx4 },  // V_ADST
+  { fidtx4, fadst4 },  // H_ADST
+  { fadst4, fidtx4 },  // V_FLIPADST
+  { fidtx4, fadst4 },  // H_FLIPADST
 #endif  // CONFIG_EXT_TX
 };
 
 static const transform_2d FHT_8[] = {
-  { fdct8,  fdct8  },  // DCT_DCT           = 0,
-  { fadst8, fdct8  },  // ADST_DCT          = 1,
-  { fdct8,  fadst8 },  // DCT_ADST          = 2,
-  { fadst8, fadst8 },  // ADST_ADST         = 3,
+  { fdct8,  fdct8  },  // DCT_DCT
+  { fadst8, fdct8  },  // ADST_DCT
+  { fdct8,  fadst8 },  // DCT_ADST
+  { fadst8, fadst8 },  // ADST_ADST
 #if CONFIG_EXT_TX
-  { fadst8, fdct8  },  // FLIPADST_DCT      = 4,
-  { fdct8,  fadst8 },  // DCT_FLIPADST      = 5,
-  { fadst8, fadst8 },  // FLIPADST_FLIPADST = 6,
-  { fadst8, fadst8 },  // ADST_FLIPADST     = 7,
-  { fadst8, fadst8 },  // FLIPADST_ADST     = 8,
-  { fdst8,  fdct8  },  // DST_DCT           = 9,
-  { fdct8,  fdst8  },  // DCT_DST           = 10,
-  { fdst8,  fadst8 },  // DST_ADST          = 11,
-  { fadst8, fdst8  },  // ADST_DST          = 12,
-  { fdst8,  fadst8 },  // DST_FLIPADST      = 13,
-  { fadst8, fdst8  },  // FLIPADST_DST      = 14,
-  { fdst8,  fdst8  },  // DST_DST           = 15
-  { fidtx8, fidtx8 },  // IDTX              = 16
-  { fdct8,  fidtx8 },  // V_DCT             = 17
-  { fidtx8, fdct8  },  // H_DCT             = 18
+  { fadst8, fdct8  },  // FLIPADST_DCT
+  { fdct8,  fadst8 },  // DCT_FLIPADST
+  { fadst8, fadst8 },  // FLIPADST_FLIPADST
+  { fadst8, fadst8 },  // ADST_FLIPADST
+  { fadst8, fadst8 },  // FLIPADST_ADST
+  { fidtx8, fidtx8 },  // IDTX
+  { fdct8,  fidtx8 },  // V_DCT
+  { fidtx8, fdct8  },  // H_DCT
+  { fadst8, fidtx8 },  // V_ADST
+  { fidtx8, fadst8 },  // H_ADST
+  { fadst8, fidtx8 },  // V_FLIPADST
+  { fidtx8, fadst8 },  // H_FLIPADST
 #endif  // CONFIG_EXT_TX
 };
 
 static const transform_2d FHT_16[] = {
-  { fdct16,  fdct16  },  // DCT_DCT           = 0,
-  { fadst16, fdct16  },  // ADST_DCT          = 1,
-  { fdct16,  fadst16 },  // DCT_ADST          = 2,
-  { fadst16, fadst16 },  // ADST_ADST         = 3,
+  { fdct16,  fdct16  },  // DCT_DCT
+  { fadst16, fdct16  },  // ADST_DCT
+  { fdct16,  fadst16 },  // DCT_ADST
+  { fadst16, fadst16 },  // ADST_ADST
 #if CONFIG_EXT_TX
-  { fadst16, fdct16  },  // FLIPADST_DCT      = 4,
-  { fdct16,  fadst16 },  // DCT_FLIPADST      = 5,
-  { fadst16, fadst16 },  // FLIPADST_FLIPADST = 6,
-  { fadst16, fadst16 },  // ADST_FLIPADST     = 7,
-  { fadst16, fadst16 },  // FLIPADST_ADST     = 8,
-  { fdst16,  fdct16  },  // DST_DCT           = 9,
-  { fdct16,  fdst16  },  // DCT_DST           = 10,
-  { fdst16,  fadst16 },  // DST_ADST          = 11,
-  { fadst16, fdst16  },  // ADST_DST          = 12,
-  { fdst16,  fadst16 },  // DST_FLIPADST      = 13,
-  { fadst16, fdst16  },  // FLIPADST_DST      = 14,
-  { fdst16,  fdst16  },  // DST_DST           = 15
-  { fidtx16, fidtx16 },  // IDTX              = 16
-  { fdct16,  fidtx16 },  // V_DCT             = 17
-  { fidtx16, fdct16  },  // H_DCT             = 18
+  { fadst16, fdct16  },  // FLIPADST_DCT
+  { fdct16,  fadst16 },  // DCT_FLIPADST
+  { fadst16, fadst16 },  // FLIPADST_FLIPADST
+  { fadst16, fadst16 },  // ADST_FLIPADST
+  { fadst16, fadst16 },  // FLIPADST_ADST
+  { fidtx16, fidtx16 },  // IDTX
+  { fdct16,  fidtx16 },  // V_DCT
+  { fidtx16, fdct16  },  // H_DCT
+  { fadst16, fidtx16 },  // V_ADST
+  { fidtx16, fadst16 },  // H_ADST
+  { fadst16, fidtx16 },  // V_FLIPADST
+  { fidtx16, fadst16 },  // H_FLIPADST
 #endif  // CONFIG_EXT_TX
 };
 
 #if CONFIG_EXT_TX
 static const transform_2d FHT_32[] = {
-  { fdct32,  fdct32  },                // DCT_DCT           = 0,
-  { fhalfright32, fdct32  },           // ADST_DCT          = 1,
-  { fdct32,  fhalfright32 },           // DCT_ADST          = 2,
-  { fhalfright32, fhalfright32 },      // ADST_ADST         = 3,
-  { fhalfright32, fdct32  },           // FLIPADST_DCT      = 4,
-  { fdct32,  fhalfright32 },           // DCT_FLIPADST      = 5,
-  { fhalfright32, fhalfright32 },      // FLIPADST_FLIPADST = 6,
-  { fhalfright32, fhalfright32 },      // ADST_FLIPADST     = 7,
-  { fhalfright32, fhalfright32 },      // FLIPADST_ADST     = 8,
-  { fhalfcenter32,  fdct32  },         // DST_DCT           = 9,
-  { fdct32,  fhalfcenter32  },         // DCT_DST           = 10,
-  { fhalfcenter32,  fhalfright32 },    // DST_ADST          = 11,
-  { fhalfright32, fhalfcenter32  },    // ADST_DST          = 12,
-  { fhalfcenter32,  fhalfright32 },    // DST_FLIPADST      = 13,
-  { fhalfright32, fhalfcenter32  },    // FLIPADST_DST      = 14,
-  { fhalfcenter32,  fhalfcenter32  },  // DST_DST           = 15
-  { fidtx32, fidtx32 },                // IDTX              = 16
-  { fdct32,  fidtx32 },                // V_DCT             = 17
-  { fidtx32, fdct32  },                // H_DCT             = 18
+  { fdct32,  fdct32  },                // DCT_DCT
+  { fhalfright32, fdct32  },           // ADST_DCT
+  { fdct32,  fhalfright32 },           // DCT_ADST
+  { fhalfright32, fhalfright32 },      // ADST_ADST
+  { fhalfright32, fdct32  },           // FLIPADST_DCT
+  { fdct32,  fhalfright32 },           // DCT_FLIPADST
+  { fhalfright32, fhalfright32 },      // FLIPADST_FLIPADST
+  { fhalfright32, fhalfright32 },      // ADST_FLIPADST
+  { fhalfright32, fhalfright32 },      // FLIPADST_ADST
+  { fidtx32, fidtx32 },                // IDTX
+  { fdct32,  fidtx32 },                // V_DCT
+  { fidtx32, fdct32  },                // H_DCT
+  { fhalfright32, fidtx32 },           // V_ADST
+  { fidtx32, fhalfright32 },           // H_ADST
+  { fhalfright32, fidtx32 },           // V_FLIPADST
+  { fidtx32, fhalfright32 },           // H_FLIPADST
 };
 #endif  // CONFIG_EXT_TX
 
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index c5a68a9..61f6e9c 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -792,6 +792,10 @@
     }
   }
 
+#if CONFIG_EXT_PARTITION_TYPES
+  assert(0);
+#endif
+
   set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
 
   if (xd->mb_to_right_edge < 0)
@@ -1085,6 +1089,10 @@
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
   int max_plane;
 
+#if CONFIG_REF_MV
+  int8_t rf_type;
+#endif
+
 #if !CONFIG_SUPERTX
   assert(mi->mbmi.sb_type == bsize);
 #endif
@@ -1092,6 +1100,23 @@
   *mi_addr = *mi;
   *x->mbmi_ext = ctx->mbmi_ext;
 
+#if CONFIG_REF_MV
+  rf_type = vp10_ref_frame_type(mbmi->ref_frame);
+  if (x->mbmi_ext->ref_mv_count[rf_type] > 1 &&
+      mbmi->sb_type >= BLOCK_8X8 &&
+      mbmi->mode == NEWMV) {
+    for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
+      int_mv this_mv = (i == 0) ?
+          x->mbmi_ext->ref_mv_stack[rf_type][mbmi->ref_mv_idx].this_mv :
+          x->mbmi_ext->ref_mv_stack[rf_type][mbmi->ref_mv_idx].comp_mv;
+      clamp_mv_ref(&this_mv.as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
+      lower_mv_precision(&this_mv.as_mv, cm->allow_high_precision_mv);
+      x->mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0] = this_mv;
+      mbmi->pred_mv[i] = this_mv;
+    }
+  }
+#endif
+
   // If segmentation in use
   if (seg->enabled) {
     // For in frame complexity AQ copy the segment id from the segment map.
@@ -1194,9 +1219,6 @@
     rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
     rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
     rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
-
-    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-      rdc->filter_diff[i] += ctx->best_filter_diff[i];
   }
 
   for (h = 0; h < y_mis; ++h) {
@@ -1216,7 +1238,10 @@
                                  PICK_MODE_CONTEXT *ctx,
                                  int mi_row, int mi_col, BLOCK_SIZE bsize,
                                  int output_enabled) {
-  int i, y, x_idx;
+  int y, x_idx;
+#if CONFIG_VAR_TX
+  int i;
+#endif
   VP10_COMMON *const cm = &cpi->common;
   RD_COUNTS *const rdc = &td->rd_counts;
   MACROBLOCK *const x = &td->mb;
@@ -1234,11 +1259,32 @@
       cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
   int w, h;
 
+#if CONFIG_REF_MV
+  int8_t rf_type;
+#endif
+
   *mi_addr = *mi;
   *x->mbmi_ext = ctx->mbmi_ext;
   assert(is_inter_block(mbmi));
   assert(mbmi->tx_size == ctx->mic.mbmi.tx_size);
 
+#if CONFIG_REF_MV
+  rf_type = vp10_ref_frame_type(mbmi->ref_frame);
+  if (x->mbmi_ext->ref_mv_count[rf_type] > 1 &&
+      mbmi->sb_type >= BLOCK_8X8 &&
+      mbmi->mode == NEWMV) {
+    for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
+      int_mv this_mv = (i == 0) ?
+          x->mbmi_ext->ref_mv_stack[rf_type][mbmi->ref_mv_idx].this_mv :
+          x->mbmi_ext->ref_mv_stack[rf_type][mbmi->ref_mv_idx].comp_mv;
+      clamp_mv_ref(&this_mv.as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
+      lower_mv_precision(&this_mv.as_mv, cm->allow_high_precision_mv);
+      x->mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0] = this_mv;
+      mbmi->pred_mv[i] = this_mv;
+    }
+  }
+#endif
+
   // If segmentation in use
   if (seg->enabled && output_enabled) {
     // For in frame complexity AQ copy the segment id from the segment map.
@@ -1290,7 +1336,7 @@
     int idy, idx;
     for (idy = 0; idy < (1 << mtx) / 2; ++idy)
       for (idx = 0; idx < (1 << mtx) / 2; ++idx)
-        mbmi->inter_tx_size[(idy << 3) + idx] = mbmi->tx_size;
+        mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
   }
 #endif  // CONFIG_VAR_TX
 #if CONFIG_OBMC
@@ -1316,9 +1362,6 @@
     rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
     rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
     rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
-
-    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-      rdc->filter_diff[i] += ctx->best_filter_diff[i];
   }
 
   for (h = 0; h < y_mis; ++h) {
@@ -1347,6 +1390,9 @@
   PARTITION_TYPE partition = pc_tree->partitioning;
   BLOCK_SIZE subsize = get_subsize(bsize, partition);
   int i;
+#if CONFIG_EXT_PARTITION_TYPES
+  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
   PICK_MODE_CONTEXT *pmc = NULL;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
@@ -1401,6 +1447,56 @@
       }
       pmc = &pc_tree->split_supertx;
       break;
+#if CONFIG_EXT_PARTITION_TYPES
+    case PARTITION_HORZ_A:
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2);
+      update_state_supertx(cpi, td, &pc_tree->horizontala[0], mi_row, mi_col,
+                           bsize2, output_enabled);
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2);
+      update_state_supertx(cpi, td, &pc_tree->horizontala[1], mi_row,
+                           mi_col + hbs, bsize2, output_enabled);
+      set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
+      update_state_supertx(cpi, td, &pc_tree->horizontala[2], mi_row + hbs,
+                           mi_col, subsize, output_enabled);
+      pmc = &pc_tree->horizontala_supertx;
+      break;
+    case PARTITION_HORZ_B:
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+      update_state_supertx(cpi, td, &pc_tree->horizontalb[0], mi_row, mi_col,
+                           subsize, output_enabled);
+      set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2);
+      update_state_supertx(cpi, td, &pc_tree->horizontalb[1], mi_row + hbs,
+                           mi_col, bsize2, output_enabled);
+      set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2);
+      update_state_supertx(cpi, td, &pc_tree->horizontalb[2], mi_row + hbs,
+                           mi_col + hbs, bsize2, output_enabled);
+      pmc = &pc_tree->horizontalb_supertx;
+      break;
+    case PARTITION_VERT_A:
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2);
+      update_state_supertx(cpi, td, &pc_tree->verticala[0], mi_row, mi_col,
+                           bsize2, output_enabled);
+      set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2);
+      update_state_supertx(cpi, td, &pc_tree->verticala[1], mi_row + hbs,
+                           mi_col, bsize2, output_enabled);
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
+      update_state_supertx(cpi, td, &pc_tree->verticala[2], mi_row,
+                           mi_col + hbs, subsize, output_enabled);
+      pmc = &pc_tree->verticala_supertx;
+      break;
+    case PARTITION_VERT_B:
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+      update_state_supertx(cpi, td, &pc_tree->verticalb[0], mi_row, mi_col,
+                           subsize, output_enabled);
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2);
+      update_state_supertx(cpi, td, &pc_tree->verticalb[1], mi_row,
+                           mi_col + hbs, bsize2, output_enabled);
+      set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2);
+      update_state_supertx(cpi, td, &pc_tree->verticalb[2], mi_row + hbs,
+                           mi_col + hbs, bsize2, output_enabled);
+      pmc = &pc_tree->verticalb_supertx;
+      break;
+#endif  // CONFIG_EXT_PARTITION_TYPES
     default:
       assert(0);
   }
@@ -1449,6 +1545,9 @@
   int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
   PARTITION_TYPE partition = pc_tree->partitioning;
   BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_EXT_PARTITION_TYPES
+  int i;
+#endif
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
@@ -1497,6 +1596,28 @@
                                 supertx_size, pc_tree->split[3]);
       }
       break;
+#if CONFIG_EXT_PARTITION_TYPES
+    case PARTITION_HORZ_A:
+      for (i = 0; i < 3; i++)
+        update_supertx_param(td, &pc_tree->horizontala[i], best_tx,
+                             supertx_size);
+      break;
+    case PARTITION_HORZ_B:
+      for (i = 0; i < 3; i++)
+        update_supertx_param(td, &pc_tree->horizontalb[i], best_tx,
+                             supertx_size);
+      break;
+    case PARTITION_VERT_A:
+      for (i = 0; i < 3; i++)
+        update_supertx_param(td, &pc_tree->verticala[i], best_tx,
+                             supertx_size);
+      break;
+    case PARTITION_VERT_B:
+      for (i = 0; i < 3; i++)
+        update_supertx_param(td, &pc_tree->verticalb[i], best_tx,
+                             supertx_size);
+      break;
+#endif  // CONFIG_EXT_PARTITION_TYPES
     default:
       assert(0);
   }
@@ -1537,6 +1658,9 @@
 #if CONFIG_SUPERTX
                              int *totalrate_nocoef,
 #endif
+#if CONFIG_EXT_PARTITION_TYPES
+                             PARTITION_TYPE partition,
+#endif
                              BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
                              int64_t best_rd) {
   VP10_COMMON *const cm = &cpi->common;
@@ -1564,6 +1688,9 @@
   // block as a supertx block, even if rdopt did not pick it as such.
   mbmi->tx_size = max_txsize_lookup[bsize];
 #endif
+#if CONFIG_EXT_PARTITION_TYPES
+  mbmi->partition = partition;
+#endif
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     p[i].coeff = ctx->coeff_pbuf[i][0];
@@ -1654,6 +1781,9 @@
                                    totalrate_nocoef,
 #endif  // CONFIG_SUPERTX
                                    bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+        assert(*totalrate_nocoef >= 0);
+#endif  // CONFIG_SUPERTX
       }
     } else {
       vp10_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost,
@@ -1661,6 +1791,9 @@
                                      totalrate_nocoef,
 #endif  // CONFIG_SUPERTX
                                      bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+      assert(*totalrate_nocoef >= 0);
+#endif  // CONFIG_SUPERTX
     }
   }
 
@@ -1867,24 +2000,34 @@
 #endif  // CONFIG_EXT_INTER
                                 mode_ctx);
 
+        if (mode == NEWMV) {
+          uint8_t ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
+          int idx;
+
+          for (idx = 0; idx < 2; ++idx) {
+            if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+              uint8_t drl_ctx =
+                  vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+              ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx];
+
+              if (mbmi->ref_mv_idx == idx)
+                break;
+            }
+          }
+        }
+
         if (mode == NEARMV) {
           uint8_t ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
-          if (mbmi_ext->ref_mv_count[ref_frame_type] > 2) {
-            uint8_t drl0_ctx =
-                vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], 1);
-            if (mbmi->ref_mv_idx == 0)
-              ++counts->drl_mode0[drl0_ctx][0];
-            else
-              ++counts->drl_mode0[drl0_ctx][1];
+          int idx;
 
-            if (mbmi_ext->ref_mv_count[ref_frame_type] > 3 &&
-                mbmi->ref_mv_idx > 0) {
-              uint8_t drl1_ctx =
-                  vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], 2);
-              if (mbmi->ref_mv_idx == 1)
-                ++counts->drl_mode1[drl1_ctx][0];
-              else
-                ++counts->drl_mode1[drl1_ctx][1];
+          for (idx = 1; idx < 3; ++idx) {
+            if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+              uint8_t drl_ctx =
+                  vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+              ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1];
+
+              if (mbmi->ref_mv_idx == idx - 1)
+                break;
             }
           }
         }
@@ -1941,15 +2084,24 @@
   }
 }
 
-static void restore_context(MACROBLOCK *const x, int mi_row, int mi_col,
-                            ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
-                            ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
-                            PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
+
+typedef struct {
+  ENTROPY_CONTEXT a[16 * MAX_MB_PLANE];
+  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE];
+  PARTITION_CONTEXT sa[8];
+  PARTITION_CONTEXT sl[8];
 #if CONFIG_VAR_TX
-                            TXFM_CONTEXT ta[8], TXFM_CONTEXT tl[8],
+  TXFM_CONTEXT *p_ta;
+  TXFM_CONTEXT *p_tl;
+  TXFM_CONTEXT ta[8];
+  TXFM_CONTEXT tl[8];
 #endif
-                            BLOCK_SIZE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
+} RD_SEARCH_MACROBLOCK_CONTEXT;
+
+static void restore_context(MACROBLOCK *x,
+                            const RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+                            int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  MACROBLOCKD *xd = &x->e_mbd;
   int p;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
@@ -1958,37 +2110,34 @@
   for (p = 0; p < MAX_MB_PLANE; p++) {
     memcpy(
         xd->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
-        a + num_4x4_blocks_wide * p,
+        ctx->a + num_4x4_blocks_wide * p,
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
         xd->plane[p].subsampling_x);
     memcpy(
         xd->left_context[p]
             + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
-        l + num_4x4_blocks_high * p,
+        ctx->l + num_4x4_blocks_high * p,
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
         xd->plane[p].subsampling_y);
   }
-  memcpy(xd->above_seg_context + mi_col, sa,
+  memcpy(xd->above_seg_context + mi_col, ctx->sa,
          sizeof(*xd->above_seg_context) * mi_width);
-  memcpy(xd->left_seg_context + (mi_row & MI_MASK), sl,
+  memcpy(xd->left_seg_context + (mi_row & MI_MASK), ctx->sl,
          sizeof(xd->left_seg_context[0]) * mi_height);
 #if CONFIG_VAR_TX
-  memcpy(xd->above_txfm_context, ta,
+  xd->above_txfm_context = ctx->p_ta;
+  xd->left_txfm_context = ctx->p_tl;
+  memcpy(xd->above_txfm_context, ctx->ta,
          sizeof(*xd->above_txfm_context) * mi_width);
-  memcpy(xd->left_txfm_context, tl,
+  memcpy(xd->left_txfm_context, ctx->tl,
          sizeof(*xd->left_txfm_context) * mi_height);
 #endif
 }
 
-static void save_context(MACROBLOCK *const x, int mi_row, int mi_col,
-                         ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
-                         ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
-                         PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
-#if CONFIG_VAR_TX
-                         TXFM_CONTEXT ta[8], TXFM_CONTEXT tl[8],
-#endif
-                         BLOCK_SIZE bsize) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
+static void save_context(const MACROBLOCK *x,
+                         RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+                         int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  const MACROBLOCKD *xd = &x->e_mbd;
   int p;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
@@ -1998,26 +2147,28 @@
   // buffer the above/left context information of the block in search.
   for (p = 0; p < MAX_MB_PLANE; ++p) {
     memcpy(
-        a + num_4x4_blocks_wide * p,
+        ctx->a + num_4x4_blocks_wide * p,
         xd->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
         xd->plane[p].subsampling_x);
     memcpy(
-        l + num_4x4_blocks_high * p,
+        ctx->l + num_4x4_blocks_high * p,
         xd->left_context[p]
             + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
         xd->plane[p].subsampling_y);
   }
-  memcpy(sa, xd->above_seg_context + mi_col,
+  memcpy(ctx->sa, xd->above_seg_context + mi_col,
          sizeof(*xd->above_seg_context) * mi_width);
-  memcpy(sl, xd->left_seg_context + (mi_row & MI_MASK),
+  memcpy(ctx->sl, xd->left_seg_context + (mi_row & MI_MASK),
          sizeof(xd->left_seg_context[0]) * mi_height);
 #if CONFIG_VAR_TX
-  memcpy(ta, xd->above_txfm_context,
+  memcpy(ctx->ta, xd->above_txfm_context,
          sizeof(*xd->above_txfm_context) * mi_width);
-  memcpy(tl, xd->left_txfm_context,
+  memcpy(ctx->tl, xd->left_txfm_context,
          sizeof(*xd->left_txfm_context) * mi_height);
+  ctx->p_ta = xd->above_txfm_context;
+  ctx->p_tl = xd->left_txfm_context;
 #endif
 }
 
@@ -2025,9 +2176,15 @@
                      ThreadData *td,
                      TOKENEXTRA **tp, int mi_row, int mi_col,
                      int output_enabled, BLOCK_SIZE bsize,
+#if CONFIG_EXT_PARTITION_TYPES
+                     PARTITION_TYPE partition,
+#endif
                      PICK_MODE_CONTEXT *ctx) {
   MACROBLOCK *const x = &td->mb;
   set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
+#if CONFIG_EXT_PARTITION_TYPES
+  x->e_mbd.mi[0]->mbmi.partition = partition;
+#endif
   update_state(cpi, td, ctx, mi_row, mi_col, bsize, output_enabled);
   encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
 
@@ -2053,6 +2210,9 @@
   int ctx;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize = bsize;
+#if CONFIG_EXT_PARTITION_TYPES
+  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
@@ -2066,6 +2226,10 @@
   }
 
   partition = partition_lookup[bsl][subsize];
+#if CONFIG_EXT_PARTITION_TYPES
+  if (bsize > BLOCK_8X8)
+    partition = pc_tree->partitioning;
+#endif
   if (output_enabled && bsize != BLOCK_4X4)
     td->counts->partition[ctx][partition]++;
 
@@ -2144,8 +2308,13 @@
         }
 #endif  // CONFIG_EXT_TX
       }
+#if CONFIG_EXT_PARTITION_TYPES
+      update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize,
+                                   partition);
+#else
       if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
         update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif
 #if CONFIG_VAR_TX
       set_txfm_ctx(xd->left_txfm_context, supertx_size, xd->n8_h);
       set_txfm_ctx(xd->above_txfm_context, supertx_size, mi_height);
@@ -2163,27 +2332,47 @@
   switch (partition) {
     case PARTITION_NONE:
       encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+               partition,
+#endif
                &pc_tree->none);
       break;
     case PARTITION_VERT:
       encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+               partition,
+#endif
                &pc_tree->vertical[0]);
       if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
         encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled,
-                 subsize, &pc_tree->vertical[1]);
+                 subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+                 partition,
+#endif
+                 &pc_tree->vertical[1]);
       }
       break;
     case PARTITION_HORZ:
       encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+               partition,
+#endif
                &pc_tree->horizontal[0]);
       if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
         encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled,
-                 subsize, &pc_tree->horizontal[1]);
+                 subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+                 partition,
+#endif
+                 &pc_tree->horizontal[1]);
       }
       break;
     case PARTITION_SPLIT:
       if (bsize == BLOCK_8X8) {
         encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+                 partition,
+#endif
                  pc_tree->leaf_split[0]);
       } else {
         encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
@@ -2196,13 +2385,52 @@
                   subsize, pc_tree->split[3]);
       }
       break;
+#if CONFIG_EXT_PARTITION_TYPES
+    case PARTITION_HORZ_A:
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, bsize2,
+               partition, &pc_tree->horizontala[0]);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, bsize2,
+               partition, &pc_tree->horizontala[1]);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, subsize,
+               partition, &pc_tree->horizontala[2]);
+      break;
+    case PARTITION_HORZ_B:
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+               partition, &pc_tree->horizontalb[0]);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, bsize2,
+               partition, &pc_tree->horizontalb[1]);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, output_enabled,
+               bsize2, partition, &pc_tree->horizontalb[2]);
+      break;
+    case PARTITION_VERT_A:
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, bsize2,
+               partition, &pc_tree->verticala[0]);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, bsize2,
+               partition, &pc_tree->verticala[1]);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, subsize,
+               partition, &pc_tree->verticala[2]);
+
+      break;
+    case PARTITION_VERT_B:
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+               partition, &pc_tree->verticalb[0]);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, bsize2,
+               partition, &pc_tree->verticalb[1]);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, output_enabled,
+               bsize2, partition, &pc_tree->verticalb[2]);
+      break;
+#endif  // CONFIG_EXT_PARTITION_TYPES
     default:
       assert(0 && "Invalid partition type.");
       break;
   }
 
+#if CONFIG_EXT_PARTITION_TYPES
+  update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+#else
   if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
 // Check to see if the given partition size is allowed for a specified number
@@ -2299,11 +2527,7 @@
   int i, pl;
   PARTITION_TYPE partition = PARTITION_NONE;
   BLOCK_SIZE subsize;
-  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
-  PARTITION_CONTEXT sl[8], sa[8];
-#if CONFIG_VAR_TX
-  TXFM_CONTEXT tl[8], ta[8];
-#endif
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
   RD_COST last_part_rdc, none_rdc, chosen_rdc;
   BLOCK_SIZE sub_subsize = BLOCK_4X4;
   int splits_below = 0;
@@ -2316,6 +2540,10 @@
   int chosen_rate_nocoef = INT_MAX;
 #endif
 
+#if CONFIG_EXT_PARTITION_TYPES
+  assert(0);
+#endif
+
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
@@ -2329,16 +2557,14 @@
   partition = partition_lookup[bsl][bs_type];
   subsize = get_subsize(bsize, partition);
 
+  pc_tree->partitioning = partition;
+
 #if CONFIG_VAR_TX
   xd->above_txfm_context = cm->above_txfm_context + mi_col;
   xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MI_MASK);
 #endif
-  pc_tree->partitioning = partition;
-  save_context(x, mi_row, mi_col, a, l, sa, sl,
-#if CONFIG_VAR_TX
-               ta, tl,
-#endif
-               bsize);
+
+  save_context(x, &x_ctx, mi_row, mi_col, bsize);
 
   if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) {
     set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
@@ -2371,6 +2597,9 @@
 #if CONFIG_SUPERTX
                        &none_rate_nocoef,
 #endif
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_NONE,
+#endif
                        bsize, ctx, INT64_MAX);
 
       pl = partition_plane_context(xd, mi_row, mi_col, bsize);
@@ -2384,11 +2613,8 @@
 #endif
       }
 
-      restore_context(x, mi_row, mi_col, a, l, sa, sl,
-#if CONFIG_VAR_TX
-                      ta, tl,
-#endif
-                      bsize);
+      restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
       mi_8x8[0]->mbmi.sb_type = bs_type;
       pc_tree->partitioning = partition;
     }
@@ -2400,6 +2626,9 @@
 #if CONFIG_SUPERTX
                        &last_part_rate_nocoef,
 #endif
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_NONE,
+#endif
                        bsize, ctx, INT64_MAX);
       break;
     case PARTITION_HORZ:
@@ -2407,6 +2636,9 @@
 #if CONFIG_SUPERTX
                        &last_part_rate_nocoef,
 #endif
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_HORZ,
+#endif
                        subsize, &pc_tree->horizontal[0],
                        INT64_MAX);
       if (last_part_rdc.rate != INT_MAX &&
@@ -2424,6 +2656,9 @@
 #if CONFIG_SUPERTX
                          &rt_nocoef,
 #endif
+#if CONFIG_EXT_PARTITION_TYPES
+                         PARTITION_HORZ,
+#endif
                          subsize, &pc_tree->horizontal[1], INT64_MAX);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           vp10_rd_cost_reset(&last_part_rdc);
@@ -2445,6 +2680,9 @@
 #if CONFIG_SUPERTX
                        &last_part_rate_nocoef,
 #endif
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_VERT,
+#endif
                        subsize, &pc_tree->vertical[0], INT64_MAX);
       if (last_part_rdc.rate != INT_MAX &&
           bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) {
@@ -2461,6 +2699,9 @@
 #if CONFIG_SUPERTX
                          &rt_nocoef,
 #endif
+#if CONFIG_EXT_PARTITION_TYPES
+                         PARTITION_VERT,
+#endif
                          subsize, &pc_tree->vertical[bsize > BLOCK_8X8],
                          INT64_MAX);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
@@ -2484,6 +2725,9 @@
 #if CONFIG_SUPERTX
                          &last_part_rate_nocoef,
 #endif
+#if CONFIG_EXT_PARTITION_TYPES
+                         PARTITION_SPLIT,
+#endif
                          subsize, pc_tree->leaf_split[0], INT64_MAX);
         break;
       }
@@ -2556,11 +2800,9 @@
 #if CONFIG_SUPERTX
     chosen_rate_nocoef = 0;
 #endif
-    restore_context(x, mi_row, mi_col, a, l, sa, sl,
-#if CONFIG_VAR_TX
-                    ta, tl,
-#endif
-                    bsize);
+
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
     pc_tree->partitioning = PARTITION_SPLIT;
 
     // Split partition.
@@ -2571,33 +2813,24 @@
 #if CONFIG_SUPERTX
       int rt_nocoef = 0;
 #endif
-      ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
-      PARTITION_CONTEXT sl[8], sa[8];
-#if CONFIG_VAR_TX
-      TXFM_CONTEXT tl[8], ta[8];
-#endif
+      RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
 
       if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
         continue;
 
-      save_context(x, mi_row, mi_col, a, l, sa, sl,
-#if CONFIG_VAR_TX
-                   ta, tl,
-#endif
-                   bsize);
+      save_context(x, &x_ctx, mi_row, mi_col, bsize);
       pc_tree->split[i]->partitioning = PARTITION_NONE;
       rd_pick_sb_modes(cpi, tile_data, x,
                        mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
 #if CONFIG_SUPERTX
                        &rt_nocoef,
 #endif
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_SPLIT,
+#endif
                        split_subsize, &pc_tree->split[i]->none, INT64_MAX);
 
-      restore_context(x, mi_row, mi_col, a, l, sa, sl,
-#if CONFIG_VAR_TX
-                      ta, tl,
-#endif
-                      bsize);
+      restore_context(x, &x_ctx, mi_row, mi_col, bsize);
 
       if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
         vp10_rd_cost_reset(&chosen_rdc);
@@ -2655,15 +2888,7 @@
 #endif
   }
 
-#if CONFIG_VAR_TX
-  xd->above_txfm_context = cm->above_txfm_context + mi_col;
-  xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MI_MASK);
-#endif
-  restore_context(x, mi_row, mi_col, a, l, sa, sl,
-#if CONFIG_VAR_TX
-                  ta, tl,
-#endif
-                  bsize);
+  restore_context(x, &x_ctx, mi_row, mi_col, bsize);
 
   // We must have chosen a partitioning and encoding or we'll fail later on.
   // No other opportunities for success.
@@ -2923,6 +3148,192 @@
 }
 #endif
 
+#if CONFIG_EXT_PARTITION_TYPES
+static void rd_test_partition3(VP10_COMP *cpi, ThreadData *td,
+                               TileDataEnc *tile_data,
+                               TOKENEXTRA **tp, PC_TREE *pc_tree,
+                               RD_COST *best_rdc, PICK_MODE_CONTEXT ctxs[3],
+                               PICK_MODE_CONTEXT *ctx,
+                               int mi_row, int mi_col, BLOCK_SIZE bsize,
+                               PARTITION_TYPE partition,
+#if CONFIG_SUPERTX
+                               int64_t best_rd, int *best_rate_nocoef,
+                               RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+#endif
+                               int mi_row0, int mi_col0, BLOCK_SIZE subsize0,
+                               int mi_row1, int mi_col1, BLOCK_SIZE subsize1,
+                               int mi_row2, int mi_col2, BLOCK_SIZE subsize2) {
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  RD_COST this_rdc, sum_rdc;
+#if CONFIG_SUPERTX
+  VP10_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  int this_rate_nocoef, sum_rate_nocoef;
+  int abort_flag;
+  PARTITION_TYPE best_partition;
+  int tmp_rate;
+  int64_t tmp_dist, tmp_rd;
+#endif
+  if (cpi->sf.adaptive_motion_search)
+    load_pred_mv(x, ctx);
+
+  rd_pick_sb_modes(cpi, tile_data, x, mi_row0, mi_col0, &sum_rdc,
+#if CONFIG_SUPERTX
+                   &sum_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+                   partition,
+#endif
+                   subsize0, &ctxs[0], best_rdc->rdcost);
+#if CONFIG_SUPERTX
+  abort_flag = sum_rdc.rdcost >= best_rd;
+#endif
+
+#if CONFIG_SUPERTX
+  if (sum_rdc.rdcost < INT64_MAX) {
+#else
+  if (sum_rdc.rdcost < best_rdc->rdcost) {
+#endif
+    PICK_MODE_CONTEXT *ctx_0 = &ctxs[0];
+    update_state(cpi, td, ctx_0, mi_row0, mi_col0, subsize0, 0);
+    encode_superblock(cpi, td, tp, 0, mi_row0, mi_col0, subsize0, ctx_0);
+
+    if (cpi->sf.adaptive_motion_search)
+      load_pred_mv(x, ctx_0);
+
+#if CONFIG_SUPERTX
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row1, mi_col1, &this_rdc,
+                     &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+                     partition,
+#endif
+                     subsize1, &ctxs[1], INT64_MAX - sum_rdc.rdcost);
+#else
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row1, mi_col1, &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+                     partition,
+#endif
+                     subsize1, &ctxs[1], best_rdc->rdcost - sum_rdc.rdcost);
+#endif  // CONFIG_SUPERTX
+
+    if (this_rdc.rate == INT_MAX) {
+      sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+      sum_rate_nocoef = INT_MAX;
+#endif
+    } else {
+      sum_rdc.rate += this_rdc.rate;
+      sum_rdc.dist += this_rdc.dist;
+      sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+      sum_rate_nocoef += this_rate_nocoef;
+#endif
+    }
+
+#if CONFIG_SUPERTX
+    if (sum_rdc.rdcost < INT64_MAX) {
+#else
+    if (sum_rdc.rdcost < best_rdc->rdcost) {
+#endif
+      PICK_MODE_CONTEXT *ctx_1 = &ctxs[1];
+      update_state(cpi, td, ctx_1, mi_row1, mi_col1, subsize1, 0);
+      encode_superblock(cpi, td, tp, 0, mi_row1, mi_col1, subsize1, ctx_1);
+
+      if (cpi->sf.adaptive_motion_search)
+        load_pred_mv(x, ctx_1);
+
+#if CONFIG_SUPERTX
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row2, mi_col2, &this_rdc,
+                       &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+                       partition,
+#endif
+                       subsize2, &ctxs[2], INT64_MAX - sum_rdc.rdcost);
+#else
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row2, mi_col2, &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+                       partition,
+#endif
+                       subsize2, &ctxs[2], best_rdc->rdcost - sum_rdc.rdcost);
+#endif  // CONFIG_SUPERTX
+
+      if (this_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+        sum_rate_nocoef = INT_MAX;
+#endif
+      } else {
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+        sum_rate_nocoef += this_rate_nocoef;
+#endif
+      }
+
+#if CONFIG_SUPERTX
+      if (cm->frame_type != KEY_FRAME && !abort_flag &&
+          sum_rdc.rdcost < INT64_MAX && bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+          !xd->lossless[0]) {
+        TX_SIZE supertx_size = max_txsize_lookup[bsize];
+        best_partition = pc_tree->partitioning;
+        pc_tree->partitioning = partition;
+        sum_rdc.rate += vp10_cost_bit(
+            cm->fc->supertx_prob
+            [partition_supertx_context_lookup[partition]][supertx_size],
+            0);
+        sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate,
+                                sum_rdc.dist);
+
+        if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+          TX_TYPE best_tx = DCT_DCT;
+
+          tmp_rate = sum_rate_nocoef;
+          tmp_dist = 0;
+          restore_context(x, x_ctx, mi_row, mi_col, bsize);
+          rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, &tmp_rate,
+                        &tmp_dist, &best_tx, pc_tree);
+
+          tmp_rate += vp10_cost_bit(
+              cm->fc->supertx_prob
+              [partition_supertx_context_lookup[partition]][supertx_size],
+              1);
+          tmp_rd = RDCOST(x->rdmult, x->rddiv, tmp_rate, tmp_dist);
+          if (tmp_rd < sum_rdc.rdcost) {
+            sum_rdc.rdcost = tmp_rd;
+            sum_rdc.rate = tmp_rate;
+            sum_rdc.dist = tmp_dist;
+            update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
+                                    supertx_size, pc_tree);
+          }
+        }
+        pc_tree->partitioning = best_partition;
+      }
+#endif  // CONFIG_SUPERTX
+
+      if (sum_rdc.rdcost < best_rdc->rdcost) {
+        int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+        sum_rdc.rate += cpi->partition_cost[pl][partition];
+        sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate,
+                                sum_rdc.dist);
+#if CONFIG_SUPERTX
+        sum_rate_nocoef += cpi->partition_cost[pl][partition];
+#endif
+        if (sum_rdc.rdcost < best_rdc->rdcost) {
+#if CONFIG_SUPERTX
+          *best_rate_nocoef = sum_rate_nocoef;
+          assert(*best_rate_nocoef >= 0);
+#endif
+          *best_rdc = sum_rdc;
+          pc_tree->partitioning = partition;
+        }
+      }
+    }
+  }
+}
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
 // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previous rate-distortion optimization
 // results, for encoding speed-up.
@@ -2939,11 +3350,7 @@
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int mi_step = num_8x8_blocks_wide_lookup[bsize] / 2;
-  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
-  PARTITION_CONTEXT sl[8], sa[8];
-#if CONFIG_VAR_TX
-  TXFM_CONTEXT tl[8], ta[8];
-#endif
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
   TOKENEXTRA *tp_orig = *tp;
   PICK_MODE_CONTEXT *ctx = &pc_tree->none;
   int i;
@@ -2959,6 +3366,9 @@
 #endif  // CONFIG_SUPERTX
   int do_split = bsize >= BLOCK_8X8;
   int do_rect = 1;
+#if CONFIG_EXT_PARTITION_TYPES
+  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
 
   // Override skipping rectangular partition operations for edge blocks
   const int force_horz_split = (mi_row + mi_step >= cm->mi_rows);
@@ -3020,11 +3430,10 @@
 #if CONFIG_VAR_TX
   xd->above_txfm_context = cm->above_txfm_context + mi_col;
   xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MI_MASK);
-  save_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
-#else
-  save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
 #endif
 
+  save_context(x, &x_ctx, mi_row, mi_col, bsize);
+
 #if CONFIG_FP_MB_STATS
   if (cpi->use_fp_mb_stats) {
     set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
@@ -3090,6 +3499,9 @@
 #if CONFIG_SUPERTX
                      &this_rate_nocoef,
 #endif
+#if CONFIG_EXT_PARTITION_TYPES
+                     PARTITION_NONE,
+#endif
                      bsize, ctx, best_rdc.rdcost);
     if (this_rdc.rate != INT_MAX) {
       if (bsize >= BLOCK_8X8) {
@@ -3178,13 +3590,8 @@
 #endif
       }
     }
-#if CONFIG_VAR_TX
-    xd->above_txfm_context = cm->above_txfm_context + mi_col;
-    xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MI_MASK);
-    restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
-#else
-    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
-#endif  // CONFIG_VAR_TX
+
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
   }
 
   // store estimated motion vector
@@ -3203,11 +3610,17 @@
             ctx->mic.mbmi.interp_filter;
 #if CONFIG_SUPERTX
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
-                       &sum_rate_nocoef, subsize, pc_tree->leaf_split[0],
-                       INT64_MAX);
+                       &sum_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_SPLIT,
+#endif
+                       subsize, pc_tree->leaf_split[0], INT64_MAX);
 #else
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
-                       pc_tree->leaf_split[0], best_rdc.rdcost);
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_SPLIT,
+#endif
+                       subsize, pc_tree->leaf_split[0], best_rdc.rdcost);
 #endif  // CONFIG_SUPERTX
       if (sum_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
@@ -3232,14 +3645,9 @@
           TX_TYPE best_tx = DCT_DCT;
           tmp_rate = sum_rate_nocoef;
           tmp_dist = 0;
-#if CONFIG_VAR_TX
-          xd->above_txfm_context = cm->above_txfm_context + mi_col;
-          xd->left_txfm_context =
-              xd->left_txfm_context_buffer + (mi_row & MI_MASK);
-          restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
-#else
-          restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
-#endif  // CONFIG_VAR_TX
+
+          restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
           rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
                         &tmp_rate, &tmp_dist,
                         &best_tx,
@@ -3326,14 +3734,9 @@
 
           tmp_rate = sum_rate_nocoef;
           tmp_dist = 0;
-#if CONFIG_VAR_TX
-          xd->above_txfm_context = cm->above_txfm_context + mi_col;
-          xd->left_txfm_context =
-              xd->left_txfm_context_buffer + (mi_row & MI_MASK);
-          restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
-#else
-          restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
-#endif  // CONFIG_VAR_TX
+
+          restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
           rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
                         &tmp_rate, &tmp_dist,
                         &best_tx,
@@ -3380,13 +3783,8 @@
       if (cpi->sf.less_rectangular_check)
         do_rect &= !partition_none_allowed;
     }
-#if CONFIG_VAR_TX
-    xd->above_txfm_context = cm->above_txfm_context + mi_col;
-    xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MI_MASK);
-    restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
-#else
-    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
-#endif
+
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
   }  // if (do_split)
 
   // PARTITION_HORZ
@@ -3403,6 +3801,9 @@
 #if CONFIG_SUPERTX
                      &sum_rate_nocoef,
 #endif  // CONFIG_SUPERTX
+#if CONFIG_EXT_PARTITION_TYPES
+                     PARTITION_HORZ,
+#endif
                      subsize, &pc_tree->horizontal[0], best_rdc.rdcost);
 
 #if CONFIG_SUPERTX
@@ -3427,11 +3828,18 @@
 #if CONFIG_SUPERTX
       rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col,
                        &this_rdc, &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_HORZ,
+#endif
                        subsize, &pc_tree->horizontal[1],
                        INT64_MAX);
 #else
       rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col,
-                       &this_rdc, subsize, &pc_tree->horizontal[1],
+                       &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_HORZ,
+#endif
+                       subsize, &pc_tree->horizontal[1],
                        best_rdc.rdcost - sum_rdc.rdcost);
 #endif  // CONFIG_SUPERTX
       if (this_rdc.rate == INT_MAX) {
@@ -3466,13 +3874,9 @@
         TX_TYPE best_tx = DCT_DCT;
         tmp_rate = sum_rate_nocoef;
         tmp_dist = 0;
-#if CONFIG_VAR_TX
-        xd->above_txfm_context = cm->above_txfm_context + mi_col;
-        xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
-        restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
-#else
-        restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
-#endif  // CONFIG_VAR_TX
+
+        restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
         rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
                       &tmp_rate, &tmp_dist,
                       &best_tx,
@@ -3511,13 +3915,8 @@
         pc_tree->partitioning = PARTITION_HORZ;
       }
     }
-#if CONFIG_VAR_TX
-    xd->above_txfm_context = cm->above_txfm_context + mi_col;
-    xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MI_MASK);
-    restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
-#else
-    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
-#endif
+
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
   }
   // PARTITION_VERT
   if (partition_vert_allowed &&
@@ -3534,6 +3933,9 @@
 #if CONFIG_SUPERTX
                      &sum_rate_nocoef,
 #endif  // CONFIG_SUPERTX
+#if CONFIG_EXT_PARTITION_TYPES
+                     PARTITION_VERT,
+#endif
                      subsize, &pc_tree->vertical[0], best_rdc.rdcost);
 #if CONFIG_SUPERTX
     abort_flag = (sum_rdc.rdcost >= best_rd && bsize > BLOCK_8X8) ||
@@ -3556,11 +3958,19 @@
             ctx->mic.mbmi.interp_filter;
 #if CONFIG_SUPERTX
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
-                       &this_rate_nocoef, subsize, &pc_tree->vertical[1],
+                       &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_VERT,
+#endif
+                       subsize, &pc_tree->vertical[1],
                        INT64_MAX - sum_rdc.rdcost);
 #else
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step,
-                       &this_rdc, subsize,
+                       &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_VERT,
+#endif
+                       subsize,
                        &pc_tree->vertical[1], best_rdc.rdcost - sum_rdc.rdcost);
 #endif  // CONFIG_SUPERTX
       if (this_rdc.rate == INT_MAX) {
@@ -3594,13 +4004,9 @@
 
         tmp_rate = sum_rate_nocoef;
         tmp_dist = 0;
-#if CONFIG_VAR_TX
-        xd->above_txfm_context = cm->above_txfm_context + mi_col;
-        xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
-        restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
-#else
-        restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
-#endif  // CONFIG_VAR_TX
+
+        restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
         rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
                       &tmp_rate, &tmp_dist,
                       &best_tx,
@@ -3640,15 +4046,72 @@
         pc_tree->partitioning = PARTITION_VERT;
       }
     }
-#if CONFIG_VAR_TX
-    xd->above_txfm_context = cm->above_txfm_context + mi_col;
-    xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MI_MASK);
-    restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
-#else
-    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
-#endif
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
   }
 
+#if CONFIG_EXT_PARTITION_TYPES
+  // PARTITION_HORZ_A
+  if (partition_horz_allowed && do_rect && bsize > BLOCK_8X8 &&
+      partition_none_allowed) {
+    subsize = get_subsize(bsize, PARTITION_HORZ_A);
+    rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+                       pc_tree->horizontala,
+                       ctx, mi_row, mi_col, bsize, PARTITION_HORZ_A,
+#if CONFIG_SUPERTX
+                       best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+                       mi_row, mi_col, bsize2,
+                       mi_row, mi_col + mi_step, bsize2,
+                       mi_row + mi_step, mi_col, subsize);
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+  }
+  // PARTITION_HORZ_B
+  if (partition_horz_allowed && do_rect && bsize > BLOCK_8X8 &&
+      partition_none_allowed) {
+    subsize = get_subsize(bsize, PARTITION_HORZ_B);
+    rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+                       pc_tree->horizontalb,
+                       ctx, mi_row, mi_col, bsize, PARTITION_HORZ_B,
+#if CONFIG_SUPERTX
+                       best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+                       mi_row, mi_col, subsize,
+                       mi_row + mi_step, mi_col, bsize2,
+                       mi_row + mi_step, mi_col + mi_step, bsize2);
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+  }
+  // PARTITION_VERT_A
+  if (partition_vert_allowed && do_rect && bsize > BLOCK_8X8 &&
+      partition_none_allowed) {
+    subsize = get_subsize(bsize, PARTITION_VERT_A);
+    rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+                       pc_tree->verticala,
+                       ctx, mi_row, mi_col, bsize, PARTITION_VERT_A,
+#if CONFIG_SUPERTX
+                       best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+                       mi_row, mi_col, bsize2,
+                       mi_row + mi_step, mi_col, bsize2,
+                       mi_row, mi_col + mi_step, subsize);
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+  }
+  // PARTITION_VERT_B
+  if (partition_vert_allowed && do_rect && bsize > BLOCK_8X8 &&
+      partition_none_allowed) {
+    subsize = get_subsize(bsize, PARTITION_VERT_B);
+    rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+                       pc_tree->verticalb,
+                       ctx, mi_row, mi_col, bsize, PARTITION_VERT_B,
+#if CONFIG_SUPERTX
+                       best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+                       mi_row, mi_col, subsize,
+                       mi_row, mi_col + mi_step, bsize2,
+                       mi_row + mi_step, mi_col + mi_step, bsize2);
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+  }
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
   // TODO(jbb): This code added so that we avoid static analysis
   // warning related to the fact that best_rd isn't used after this
   // point.  This code should be refactored so that the duplicate
@@ -3687,13 +4150,8 @@
   SPEED_FEATURES *const sf = &cpi->sf;
   int mi_col;
 
-  // Initialize the left context for the new SB row
-  memset(&xd->left_context, 0, sizeof(xd->left_context));
-  memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
-#if CONFIG_VAR_TX
-  memset(xd->left_txfm_context_buffer, 0,
-         sizeof(xd->left_txfm_context_buffer));
-#endif
+  vp10_zero_left_context(xd);
+
   // Code each SB in the row
   for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
        mi_col += MI_BLOCK_SIZE) {
@@ -3791,19 +4249,9 @@
   // Copy data over into macro block data structures.
   vp10_setup_src_planes(x, cpi->Source, 0, 0);
 
-  vp10_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+  vp10_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
 
-  // Note: this memset assumes above_context[0], [1] and [2]
-  // are allocated as part of the same buffer.
-  memset(xd->above_context[0], 0,
-         sizeof(*xd->above_context[0]) *
-         2 * aligned_mi_cols * MAX_MB_PLANE);
-  memset(xd->above_seg_context, 0,
-         sizeof(*xd->above_seg_context) * aligned_mi_cols);
-#if CONFIG_VAR_TX
-  memset(cm->above_txfm_context, 0,
-         sizeof(*xd->above_txfm_context) * aligned_mi_cols);
-#endif
+  vp10_zero_above_context(cm, 0, aligned_mi_cols);
 }
 
 static int check_dual_ref_flags(VP10_COMP *cpi) {
@@ -3971,7 +4419,6 @@
   vp10_zero(*td->counts);
   vp10_zero(rdc->coef_counts);
   vp10_zero(rdc->comp_pred_diff);
-  vp10_zero(rdc->filter_diff);
   rdc->m_search_count = 0;   // Count of motion search hits.
   rdc->ex_search_count = 0;  // Exhaustive mesh search hits.
 
@@ -4039,31 +4486,9 @@
   cpi->last_frame_distortion = cpi->frame_distortion;
 #endif
 }
-
-static INTERP_FILTER get_interp_filter(
-    const int64_t threshes[SWITCHABLE_FILTER_CONTEXTS], int is_alt_ref) {
-#if CONFIG_EXT_INTERP
-  if (!is_alt_ref &&
-      threshes[EIGHTTAP_SMOOTH2] > threshes[EIGHTTAP_SMOOTH] &&
-      threshes[EIGHTTAP_SMOOTH2] > threshes[EIGHTTAP_REGULAR] &&
-      threshes[EIGHTTAP_SMOOTH2] > threshes[MULTITAP_SHARP] &&
-      threshes[EIGHTTAP_SMOOTH2] > threshes[SWITCHABLE - 1]) {
-    return EIGHTTAP_SMOOTH2;
-  }
-#endif  // CONFIG_EXT_INTERP
-  if (!is_alt_ref &&
-      threshes[EIGHTTAP_SMOOTH] > threshes[EIGHTTAP_REGULAR] &&
-      threshes[EIGHTTAP_SMOOTH] > threshes[MULTITAP_SHARP] &&
-      threshes[EIGHTTAP_SMOOTH] > threshes[SWITCHABLE - 1]) {
-    return EIGHTTAP_SMOOTH;
-  } else if (threshes[MULTITAP_SHARP] > threshes[EIGHTTAP_REGULAR] &&
-             threshes[MULTITAP_SHARP] > threshes[SWITCHABLE - 1]) {
-    return MULTITAP_SHARP;
-  } else if (threshes[EIGHTTAP_REGULAR] > threshes[SWITCHABLE - 1]) {
-    return EIGHTTAP_REGULAR;
-  } else {
-    return SWITCHABLE;
-  }
+static INTERP_FILTER get_cm_interp_filter(VP10_COMP *cpi) {
+  (void)cpi;
+  return SWITCHABLE;
 }
 
 void vp10_encode_frame(VP10_COMP *cpi) {
@@ -4116,7 +4541,6 @@
     // INTRA/ALTREF/GOLDEN/LAST needs to be specified seperately.
     const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
     int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type];
-    int64_t *const filter_thrs = rd_opt->filter_threshes[frame_type];
     const int is_alt_ref = frame_type == ALTREF_FRAME;
 
     /* prediction (compound, single or hybrid) mode selection */
@@ -4134,7 +4558,7 @@
       cm->reference_mode = REFERENCE_MODE_SELECT;
 
     if (cm->interp_filter == SWITCHABLE) {
-      cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref);
+      cm->interp_filter = get_cm_interp_filter(cpi);
     }
 
     encode_frame_internal(cpi);
@@ -4142,9 +4566,6 @@
     for (i = 0; i < REFERENCE_MODES; ++i)
       mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;
 
-    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-      filter_thrs[i] = (filter_thrs[i] + rdc->filter_diff[i] / cm->MBs) / 2;
-
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
       int single_count_zero = 0;
       int comp_count_zero = 0;
@@ -4260,13 +4681,14 @@
                               FRAME_COUNTS *counts,
                               TX_SIZE tx_size, int blk_row, int blk_col) {
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  int tx_idx = (blk_row >> 1) * 8 + (blk_col >> 1);
+  const int tx_row = blk_row >> 1;
+  const int tx_col = blk_col >> 1;
   int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
   int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
-  int ctx = txfm_partition_context(xd->above_txfm_context + (blk_col >> 1),
-                                   xd->left_txfm_context + (blk_row >> 1),
+  int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
+                                   xd->left_txfm_context + tx_row,
                                    tx_size);
-  TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_idx];
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
 
   if (xd->mb_to_bottom_edge < 0)
     max_blocks_high += xd->mb_to_bottom_edge >> 5;
@@ -4279,8 +4701,8 @@
   if (tx_size == plane_tx_size) {
     ++counts->txfm_partition[ctx][0];
     mbmi->tx_size = tx_size;
-    txfm_partition_update(xd->above_txfm_context + (blk_col >> 1),
-                          xd->left_txfm_context + (blk_row >> 1), tx_size);
+    txfm_partition_update(xd->above_txfm_context + tx_col,
+                          xd->left_txfm_context + tx_row, tx_size);
   } else {
     BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
     int bh = num_4x4_blocks_high_lookup[bsize];
@@ -4288,10 +4710,10 @@
     ++counts->txfm_partition[ctx][1];
 
     if (tx_size == TX_8X8) {
-      mbmi->inter_tx_size[tx_idx] = TX_4X4;
+      mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4;
       mbmi->tx_size = TX_4X4;
-      txfm_partition_update(xd->above_txfm_context + (blk_col >> 1),
-                            xd->left_txfm_context + (blk_row >> 1), TX_4X4);
+      txfm_partition_update(xd->above_txfm_context + tx_col,
+                            xd->left_txfm_context + tx_row, TX_4X4);
       return;
     }
 
@@ -4327,10 +4749,11 @@
 static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size,
                              int blk_row, int blk_col) {
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  int tx_idx = (blk_row >> 1) * 8 + (blk_col >> 1);
+  const int tx_row = blk_row >> 1;
+  const int tx_col = blk_col >> 1;
   int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
   int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
-  TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_idx];
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
 
   if (xd->mb_to_bottom_edge < 0)
     max_blocks_high += xd->mb_to_bottom_edge >> 5;
@@ -4342,8 +4765,8 @@
 
   if (tx_size == plane_tx_size) {
     mbmi->tx_size = tx_size;
-    txfm_partition_update(xd->above_txfm_context + (blk_col >> 1),
-                          xd->left_txfm_context + (blk_row >> 1), tx_size);
+    txfm_partition_update(xd->above_txfm_context + tx_col,
+                          xd->left_txfm_context + tx_row, tx_size);
 
   } else {
     BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
@@ -4351,10 +4774,10 @@
     int i;
 
     if (tx_size == TX_8X8) {
-      mbmi->inter_tx_size[tx_idx] = TX_4X4;
+      mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4;
       mbmi->tx_size = TX_4X4;
-      txfm_partition_update(xd->above_txfm_context + (blk_col >> 1),
-                            xd->left_txfm_context + (blk_row >> 1), TX_4X4);
+      txfm_partition_update(xd->above_txfm_context + tx_col,
+                            xd->left_txfm_context + tx_row, TX_4X4);
       return;
     }
 
@@ -4520,9 +4943,9 @@
 #if CONFIG_VP9_HIGHBITDEPTH
       }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-      vp10_build_prediction_by_above_preds(cpi, xd, mi_row, mi_col, dst_buf1,
+      vp10_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
                                            dst_stride1);
-      vp10_build_prediction_by_left_preds(cpi, xd, mi_row, mi_col, dst_buf2,
+      vp10_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
                                           dst_stride2);
       vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm),
                             mi_row, mi_col);
@@ -4637,6 +5060,9 @@
   const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize = bsize;
+#if CONFIG_EXT_PARTITION_TYPES
+  int i;
+#endif
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return 1;
@@ -4647,6 +5073,10 @@
     subsize = BLOCK_4X4;
 
   partition = partition_lookup[bsl][subsize];
+#if CONFIG_EXT_PARTITION_TYPES
+  if (bsize > BLOCK_8X8)
+    partition = pc_tree->partitioning;
+#endif
 
   switch (partition) {
     case PARTITION_NONE:
@@ -4687,6 +5117,32 @@
           return 1;
       }
       break;
+#if CONFIG_EXT_PARTITION_TYPES
+    case PARTITION_HORZ_A:
+      for (i = 0; i < 3; i++) {
+        if (check_intra_b(&pc_tree->horizontala[i]))
+          return 1;
+      }
+      break;
+    case PARTITION_HORZ_B:
+      for (i = 0; i < 3; i++) {
+        if (check_intra_b(&pc_tree->horizontalb[i]))
+          return 1;
+      }
+      break;
+    case PARTITION_VERT_A:
+      for (i = 0; i < 3; i++) {
+        if (check_intra_b(&pc_tree->verticala[i]))
+          return 1;
+      }
+      break;
+    case PARTITION_VERT_B:
+      for (i = 0; i < 3; i++) {
+        if (check_intra_b(&pc_tree->verticalb[i]))
+          return 1;
+      }
+      break;
+#endif  // CONFIG_EXT_PARTITION_TYPES
     default:
       assert(0);
   }
@@ -4716,6 +5172,16 @@
         return check_supertx_b(supertx_size, pc_tree->leaf_split[0]);
       else
         return check_supertx_sb(subsize, supertx_size, pc_tree->split[0]);
+#if CONFIG_EXT_PARTITION_TYPES
+    case PARTITION_HORZ_A:
+      return check_supertx_b(supertx_size, &pc_tree->horizontala[0]);
+    case PARTITION_HORZ_B:
+      return check_supertx_b(supertx_size, &pc_tree->horizontalb[0]);
+    case PARTITION_VERT_A:
+      return check_supertx_b(supertx_size, &pc_tree->verticala[0]);
+    case PARTITION_VERT_B:
+      return check_supertx_b(supertx_size, &pc_tree->verticalb[0]);
+#endif  // CONFIG_EXT_PARTITION_TYPES
     default:
       assert(0);
       return 0;
@@ -4950,6 +5416,9 @@
   const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
+#if CONFIG_EXT_PARTITION_TYPES
+  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
 
   int i, ctx;
   uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
@@ -5000,6 +5469,10 @@
     subsize = BLOCK_4X4;
   }
   partition = partition_lookup[bsl][subsize];
+#if CONFIG_EXT_PARTITION_TYPES
+  if (bsize > BLOCK_8X8)
+    partition = pc_tree->partitioning;
+#endif
   if (output_enabled && bsize != BLOCK_4X4 && bsize < top_bsize)
       cm->counts.partition[ctx][partition]++;
 
@@ -5260,13 +5733,225 @@
           }
         }
         break;
+#if CONFIG_EXT_PARTITION_TYPES
+    case PARTITION_HORZ_A:
+      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride,
+                       bsize2, top_bsize, bsize2, output_enabled, 0, 0);
+      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+
+      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs,
+                       mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                       dst_buf1, dst_stride1, bsize2, top_bsize, bsize2,
+                       output_enabled, 0, 0);
+      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+
+      predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+                       mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+                       subsize, top_bsize, subsize, output_enabled, 0, 0);
+      if (bsize < top_bsize)
+        extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
+                   mi_row_top, mi_col_top, output_enabled,
+                   dst_buf2, dst_stride2);
+      else
+        extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
+                   mi_row_top, mi_col_top, output_enabled,
+                   dst_buf2, dst_stride2, 1);
+
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf[i];
+        xd->plane[i].dst.stride = dst_stride[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[i], dst_stride[i],
+                                                  dst_buf1[i], dst_stride1[i],
+                                                  &xd->plane[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_VERT, i);
+      }
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[i], dst_stride[i],
+                                                  dst_buf2[i], dst_stride2[i],
+                                                  &xd->plane[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_HORZ, i);
+      }
+
+      break;
+    case PARTITION_VERT_A:
+
+      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride,
+                       bsize2, top_bsize, bsize2, output_enabled, 0, 0);
+      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+
+      predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+                       mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                       bsize2, top_bsize, bsize2, output_enabled, 0, 0);
+      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+
+      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
+                       mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
+                       dst_stride2, subsize, top_bsize, subsize, output_enabled,
+                       0, 0);
+      if (bsize < top_bsize)
+        extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
+                   mi_row_top, mi_col_top, output_enabled,
+                   dst_buf2, dst_stride2);
+      else
+        extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
+                   mi_row_top, mi_col_top, output_enabled,
+                   dst_buf2, dst_stride2, 2);
+
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf[i];
+        xd->plane[i].dst.stride = dst_stride[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[i], dst_stride[i],
+                                                  dst_buf1[i], dst_stride1[i],
+                                                  &xd->plane[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_HORZ, i);
+      }
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[i], dst_stride[i],
+                                                  dst_buf2[i], dst_stride2[i],
+                                                  &xd->plane[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_VERT, i);
+      }
+      break;
+    case PARTITION_HORZ_B:
+
+      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride,
+                       subsize, top_bsize, subsize, output_enabled, 0, 0);
+      if (bsize < top_bsize)
+        extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                   mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+      else
+        extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                   mi_row_top, mi_col_top, output_enabled,
+                   dst_buf, dst_stride, 0);
+
+      predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+                       mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                       bsize2, top_bsize, bsize2, output_enabled, 0, 0);
+      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+
+      predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
+                       mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+                       dst_buf2, dst_stride2, bsize2, top_bsize, bsize2,
+                       output_enabled, 0, 0);
+      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs,
+                 mi_col + hbs, mi_row_top, mi_col_top, output_enabled, dst_buf2,
+                 dst_stride2);
+
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf1[i];
+        xd->plane[i].dst.stride = dst_stride1[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf1[i], dst_stride1[i],
+                                                  dst_buf2[i], dst_stride2[i],
+                                                  &xd->plane[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_VERT, i);
+      }
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf[i];
+        xd->plane[i].dst.stride = dst_stride[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[i], dst_stride[i],
+                                                  dst_buf1[i], dst_stride1[i],
+                                                  &xd->plane[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_HORZ, i);
+      }
+      break;
+    case PARTITION_VERT_B:
+
+      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride,
+                       subsize, top_bsize, subsize, output_enabled, 0, 0);
+      if (bsize < top_bsize)
+        extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                   mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+      else
+        extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                   mi_row_top, mi_col_top, output_enabled,
+                   dst_buf, dst_stride, 3);
+
+      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
+                       mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+                       dst_stride1, bsize2, top_bsize, bsize2, output_enabled,
+                       0, 0);
+      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+
+      predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
+                       mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+                       dst_buf2, dst_stride2, bsize2, top_bsize, bsize2,
+                       output_enabled, 0, 0);
+      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs,
+                 mi_col + hbs, mi_row_top, mi_col_top, output_enabled, dst_buf2,
+                 dst_stride2);
+
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf1[i];
+        xd->plane[i].dst.stride = dst_stride1[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf1[i], dst_stride1[i],
+                                                  dst_buf2[i], dst_stride2[i],
+                                                  &xd->plane[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_HORZ, i);
+      }
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf[i];
+        xd->plane[i].dst.stride = dst_stride[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[i], dst_stride[i],
+                                                  dst_buf1[i], dst_stride1[i],
+                                                  &xd->plane[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_VERT, i);
+      }
+      break;
+#endif  // CONFIG_EXT_PARTITION_TYPES
     default:
         assert(0);
   }
 
 
+#if CONFIG_EXT_PARTITION_TYPES
+  if (bsize < top_bsize)
+    update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+#else
   if (bsize < top_bsize && (partition != PARTITION_SPLIT || bsize == BLOCK_8X8))
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
 static void rd_supertx_sb(VP10_COMP *cpi, ThreadData *td,
@@ -5313,7 +5998,7 @@
   // to reuse distortion values from the RD estimation, so we reset these
   // flags here before evaluating RD for supertx coding.
   for (plane = 0 ; plane < MAX_MB_PLANE ; plane++)
-    x->skip_txfm[plane << 2] = SKIP_TXFM_NONE;
+    x->skip_txfm[plane][0] = SKIP_TXFM_NONE;
 
   mbmi = &xd->mi[0]->mbmi;
   best_tx_nostx = mbmi->tx_type;
@@ -5378,11 +6063,6 @@
 #if CONFIG_EXT_TX
     if (!ext_tx_used_inter[ext_tx_set][tx_type])
       continue;
-    if (ext_tx_set == 1 &&
-        tx_type >= DST_ADST && tx_type < IDTX && *best_tx == DCT_DCT) {
-      tx_type = IDTX - 1;
-      continue;
-    }
 #else
     if (tx_size >= TX_32X32 && tx_type != DCT_DCT)
       continue;
diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c
index 700088c..c42b7f1 100644
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c
@@ -449,7 +449,7 @@
 #endif
     if (x->quant_fp) {
       // Encoding process for rtc mode
-      if (x->skip_txfm[0] == SKIP_TXFM_AC_DC && plane == 0) {
+      if (x->skip_txfm[0][0] == SKIP_TXFM_AC_DC && plane == 0) {
         // skip forward transform
         p->eobs[block] = 0;
         *a = *l = 0;
@@ -460,12 +460,12 @@
       }
     } else {
       if (max_txsize_lookup[plane_bsize] == tx_size) {
-        int txfm_blk_index = (plane << 2) + (block >> (tx_size << 1));
-        if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_NONE) {
+        int blk_index = (block >> (tx_size << 1));
+        if (x->skip_txfm[plane][blk_index] == SKIP_TXFM_NONE) {
           // full forward transform and quantization
           vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
                            tx_size, VP10_XFORM_QUANT_B);
-        } else if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_AC_ONLY) {
+        } else if (x->skip_txfm[plane][blk_index] == SKIP_TXFM_AC_ONLY) {
           // fast path forward transform and quantization
           vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
                            tx_size, VP10_XFORM_QUANT_DC);
@@ -558,12 +558,11 @@
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  int blk_idx = (blk_row >> (1 - pd->subsampling_y)) * 8 +
-                (blk_col >> (1 - pd->subsampling_x));
-  TX_SIZE plane_tx_size = plane ?
-      get_uv_tx_size_impl(mbmi->inter_tx_size[blk_idx], bsize,
-                          0, 0) :
-      mbmi->inter_tx_size[blk_idx];
+  const int tx_row = blk_row >> (1 - pd->subsampling_y);
+  const int tx_col = blk_col >> (1 - pd->subsampling_x);
+  const TX_SIZE plane_tx_size = plane ?
+      get_uv_tx_size_impl(mbmi->inter_tx_size[tx_row][tx_col], bsize, 0, 0) :
+      mbmi->inter_tx_size[tx_row][tx_col];
 
   int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
   int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
diff --git a/vp10/encoder/encodemv.c b/vp10/encoder/encodemv.c
index 61429aa..7941363 100644
--- a/vp10/encoder/encodemv.c
+++ b/vp10/encoder/encodemv.c
@@ -282,9 +282,9 @@
       const MV diff = {mvs[i].as_mv.row - ref->row,
                        mvs[i].as_mv.col - ref->col};
 #if CONFIG_REF_MV
-    int nmv_ctx = vp10_nmv_ctx(mbmi_ext->ref_mv_count[mbmi->ref_frame[i]],
-                               mbmi_ext->ref_mv_stack[mbmi->ref_frame[i]]);
-    nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+      int nmv_ctx = vp10_nmv_ctx(mbmi_ext->ref_mv_count[mbmi->ref_frame[i]],
+                                 mbmi_ext->ref_mv_stack[mbmi->ref_frame[i]]);
+      nmv_context_counts *counts = &nmv_counts[nmv_ctx];
 #endif
       vp10_inc_mv(&diff, counts, vp10_use_mv_hp(ref));
     }
@@ -332,9 +332,9 @@
       const MV diff = {mvs[i].as_mv.row - ref->row,
                        mvs[i].as_mv.col - ref->col};
 #if CONFIG_REF_MV
-    int nmv_ctx = vp10_nmv_ctx(mbmi_ext->ref_mv_count[mbmi->ref_frame[i]],
-                               mbmi_ext->ref_mv_stack[mbmi->ref_frame[i]]);
-    nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+      int nmv_ctx = vp10_nmv_ctx(mbmi_ext->ref_mv_count[mbmi->ref_frame[i]],
+                                 mbmi_ext->ref_mv_stack[mbmi->ref_frame[i]]);
+      nmv_context_counts *counts = &nmv_counts[nmv_ctx];
 #endif
       vp10_inc_mv(&diff, counts, vp10_use_mv_hp(ref));
     }
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index afe3292..b2fe978 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -251,7 +251,6 @@
 typedef struct RD_COUNTS {
   vp10_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
   int64_t comp_pred_diff[REFERENCE_MODES];
-  int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   int m_search_count;
   int ex_search_count;
 } RD_COUNTS;
@@ -488,7 +487,6 @@
   int zeromv_mode_cost[ZEROMV_MODE_CONTEXTS][2];
   int refmv_mode_cost[REFMV_MODE_CONTEXTS][2];
   int drl_mode_cost0[DRL_MODE_CONTEXTS][2];
-  int drl_mode_cost1[DRL_MODE_CONTEXTS][2];
 #if CONFIG_EXT_INTER
   int new2mv_mode_cost[2];
 #endif  // CONFIG_EXT_INTER
@@ -505,7 +503,11 @@
   int intra_uv_mode_cost[INTRA_MODES][INTRA_MODES];
   int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
   int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+#if CONFIG_EXT_PARTITION_TYPES
+  int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+#else
   int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
+#endif
   int palette_y_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
   int palette_uv_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
   int palette_y_color_cost[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS]
diff --git a/vp10/encoder/ethread.c b/vp10/encoder/ethread.c
index 6cb9494..c586b9a 100644
--- a/vp10/encoder/ethread.c
+++ b/vp10/encoder/ethread.c
@@ -19,9 +19,6 @@
   for (i = 0; i < REFERENCE_MODES; i++)
     td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i];
 
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-    td->rd_counts.filter_diff[i] += td_t->rd_counts.filter_diff[i];
-
   for (i = 0; i < TX_SIZES; i++)
     for (j = 0; j < PLANE_TYPES; j++)
       for (k = 0; k < REF_TYPES; k++)
diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c
index faedb43..785fef0 100644
--- a/vp10/encoder/hybrid_fwd_txfm.c
+++ b/vp10/encoder/hybrid_fwd_txfm.c
@@ -54,17 +54,14 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-    case DST_DST:
-    case DCT_DST:
-    case DST_DCT:
-    case DST_ADST:
-    case ADST_DST:
-    case DST_FLIPADST:
-    case FLIPADST_DST:
       vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);
       break;
-    case H_DCT:
     case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
       vp10_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
       break;
     case IDTX:
@@ -96,17 +93,14 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-    case DST_DST:
-    case DCT_DST:
-    case DST_DCT:
-    case DST_ADST:
-    case ADST_DST:
-    case DST_FLIPADST:
-    case FLIPADST_DST:
       vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
       break;
-    case H_DCT:
     case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
       vp10_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
       break;
     case IDTX:
@@ -138,17 +132,14 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-    case DST_DST:
-    case DCT_DST:
-    case DST_DCT:
-    case DST_ADST:
-    case ADST_DST:
-    case DST_FLIPADST:
-    case FLIPADST_DST:
       vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
       break;
-    case H_DCT:
     case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
       vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
       break;
     case IDTX:
@@ -180,17 +171,14 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-    case DST_DST:
-    case DCT_DST:
-    case DST_DCT:
-    case DST_ADST:
-    case ADST_DST:
-    case DST_FLIPADST:
-    case FLIPADST_DST:
       vp10_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
       break;
-    case H_DCT:
     case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
       vp10_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
       break;
     case IDTX:
@@ -227,15 +215,12 @@
     case FLIPADST_ADST:
       vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
       break;
-    case DST_DST:
-    case DCT_DST:
-    case DST_DCT:
-    case DST_ADST:
-    case ADST_DST:
-    case DST_FLIPADST:
-    case FLIPADST_DST:
-    case H_DCT:
     case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
       vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
       break;
     case IDTX:
@@ -270,15 +255,12 @@
     case FLIPADST_ADST:
       vp10_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
       break;
-    case DST_DST:
-    case DCT_DST:
-    case DST_DCT:
-    case DST_ADST:
-    case ADST_DST:
-    case DST_FLIPADST:
-    case FLIPADST_DST:
-    case H_DCT:
     case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
       // Use C version since DST exists only in C
       vp10_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
       break;
@@ -314,15 +296,12 @@
     case FLIPADST_ADST:
       vp10_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
       break;
-    case DST_DST:
-    case DCT_DST:
-    case DST_DCT:
-    case DST_ADST:
-    case ADST_DST:
-    case DST_FLIPADST:
-    case FLIPADST_DST:
-    case H_DCT:
     case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
       // Use C version since DST exists only in C
       vp10_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
       break;
@@ -355,15 +334,12 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-    case DST_DST:
-    case DCT_DST:
-    case DST_DCT:
-    case DST_ADST:
-    case ADST_DST:
-    case DST_FLIPADST:
-    case FLIPADST_DST:
-    case H_DCT:
     case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
       vp10_highbd_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
       break;
     case IDTX:
diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c
index 1f147d7..23184ed 100644
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c
@@ -685,47 +685,6 @@
     {0, -1}, {0, 1}, {-1, 0}, {1, 0}
 };
 
-#if CONFIG_VP9_HIGHBITDEPTH
-// TODO(yunqing): Optimize the following 2 functions.
-static void highbd_comp_avg_upsampled_pred(uint16_t *comp_pred,
-                                           const uint8_t *pred8,
-                                           int width, int height,
-                                           const uint8_t *ref8,
-                                           int ref_stride) {
-  int i, j;
-  int stride = ref_stride << 3;
-
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      const int tmp = pred[j] + ref[(j << 3)];
-      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
-    }
-    comp_pred += width;
-    pred += width;
-    ref += stride;
-  }
-}
-
-static void highbd_upsampled_pred(uint16_t *comp_pred,
-                                  int width, int height,
-                                  const uint8_t *ref8,
-                                  int ref_stride) {
-  int i, j;
-  int stride = ref_stride << 3;
-
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      comp_pred[j] = ref[(j << 3)];
-    }
-    comp_pred += width;
-    ref += stride;
-  }
-}
-#endif
-
 static int upsampled_pref_error(const MACROBLOCKD *xd,
                                 const vp10_variance_fn_ptr_t *vfp,
                                 const uint8_t *const src, const int src_stride,
@@ -737,10 +696,10 @@
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     DECLARE_ALIGNED(16, uint16_t, pred16[64 * 64]);
     if (second_pred != NULL)
-      highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, y,
-                                     y_stride);
+      vpx_highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, y,
+                                         y_stride);
     else
-      highbd_upsampled_pred(pred16, w, h, y, y_stride);
+      vpx_highbd_upsampled_pred(pred16, w, h, y, y_stride);
 
     besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride,
                       sse);
diff --git a/vp10/encoder/rd.c b/vp10/encoder/rd.c
index 78e8e9a..7097dc1 100644
--- a/vp10/encoder/rd.c
+++ b/vp10/encoder/rd.c
@@ -393,9 +393,17 @@
 
     if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
         cm->frame_type == KEY_FRAME) {
+#if CONFIG_EXT_PARTITION_TYPES
+      vp10_cost_tokens(cpi->partition_cost[0], cm->fc->partition_prob[0],
+                       vp10_partition_tree);
+      for (i = 1; i < PARTITION_CONTEXTS; ++i)
+        vp10_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i],
+                         vp10_ext_partition_tree);
+#else
       for (i = 0; i < PARTITION_CONTEXTS; ++i)
         vp10_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i],
                          vp10_partition_tree);
+#endif  // CONFIG_EXT_PARTITION_TYPES
     }
 
     fill_mode_costs(cpi);
@@ -418,13 +426,8 @@
       }
 
       for (i = 0; i < DRL_MODE_CONTEXTS; ++i) {
-        cpi->drl_mode_cost0[i][0] = vp10_cost_bit(cm->fc->drl_prob0[i], 0);
-        cpi->drl_mode_cost0[i][1] = vp10_cost_bit(cm->fc->drl_prob0[i], 1);
-      }
-
-      for (i = 0; i < DRL_MODE_CONTEXTS; ++i) {
-        cpi->drl_mode_cost1[i][0] = vp10_cost_bit(cm->fc->drl_prob1[i], 0);
-        cpi->drl_mode_cost1[i][1] = vp10_cost_bit(cm->fc->drl_prob1[i], 1);
+        cpi->drl_mode_cost0[i][0] = vp10_cost_bit(cm->fc->drl_prob[i], 0);
+        cpi->drl_mode_cost0[i][1] = vp10_cost_bit(cm->fc->drl_prob[i], 1);
       }
 #if CONFIG_EXT_INTER
       cpi->new2mv_mode_cost[0] = vp10_cost_bit(cm->fc->new2mv_prob, 0);
diff --git a/vp10/encoder/rd.h b/vp10/encoder/rd.h
index 5a6a44a..61feabe 100644
--- a/vp10/encoder/rd.h
+++ b/vp10/encoder/rd.h
@@ -279,8 +279,6 @@
 
   int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
 
-  int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
-
   int RDMULT;
   int RDDIV;
 } RD_OPT;
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 63163d7..fb6e0c3 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -377,16 +377,35 @@
   unsigned int var[16];
   double total = 0;
   const int f_index = bsize - 6;
+
   if (f_index < 0) {
     int i, j, index;
     int w_shift = bw == 8 ? 1 : 2;
     int h_shift = bh == 8 ? 1 : 2;
-    for (i = 0; i < bh; ++i)
-      for (j = 0; j < bw; ++j) {
-        index = (j >> w_shift) + ((i >> h_shift) << 2);
-        esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) *
-                      (src[j + i * src_stride] - dst[j + i * dst_stride]);
-      }
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cpi->common.use_highbitdepth) {
+      uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+      uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+      for (i = 0; i < bh; ++i)
+        for (j = 0; j < bw; ++j) {
+          index = (j >> w_shift) + ((i >> h_shift) << 2);
+          esq[index] += (src16[j + i * src_stride] -
+                        dst16[j + i * dst_stride]) *
+                        (src16[j + i * src_stride] -
+                        dst16[j + i * dst_stride]);
+        }
+    } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+      for (i = 0; i < bh; ++i)
+        for (j = 0; j < bw; ++j) {
+          index = (j >> w_shift) + ((i >> h_shift) << 2);
+          esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) *
+                        (src[j + i * src_stride] - dst[j + i * dst_stride]);
+        }
+#if CONFIG_VP9_HIGHBITDEPTH
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   } else {
     var[0] = cpi->fn_ptr[f_index].vf(src, src_stride,
                                      dst, dst_stride, &esq[0]);
@@ -584,17 +603,6 @@
          dct_vs_dst(p->src_diff, bw, bw, bh, &hcorr, &vcorr);
 }
 
-static int prune_three_for_sby(const VP10_COMP *cpi,
-                               BLOCK_SIZE bsize,
-                               MACROBLOCK *x,
-                               MACROBLOCKD *xd) {
-  (void) cpi;
-  (void) bsize;
-  (void) x;
-  (void) xd;
-  return 0;
-}
-
 #endif  // CONFIG_EXT_TX
 
 // Performance drop: 0.3%, Speed improvement: 5%
@@ -625,9 +633,6 @@
     case PRUNE_TWO :
       return prune_two_for_sby(cpi, bsize, x, xd);
       break;
-    case PRUNE_THREE :
-      return prune_three_for_sby(cpi, bsize, x, xd);
-      break;
   #endif
   }
   assert(0);
@@ -745,18 +750,18 @@
 
         var = cpi->fn_ptr[unit_size].vf(src, p->src.stride,
                                         dst, pd->dst.stride, &sse);
-        x->bsse[(i << 2) + block_idx] = sse;
+        x->bsse[i][block_idx] = sse;
         sum_sse += sse;
 
-        x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_NONE;
+        x->skip_txfm[i][block_idx] = SKIP_TXFM_NONE;
         if (!x->select_tx_size) {
           // Check if all ac coefficients can be quantized to zero.
           if (var < ac_thr || var == 0) {
-            x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_AC_ONLY;
+            x->skip_txfm[i][block_idx] = SKIP_TXFM_AC_ONLY;
 
             // Check if dc coefficient can be quantized to zero.
             if (sse - var < dc_thr || sse == var) {
-              x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_AC_DC;
+              x->skip_txfm[i][block_idx] = SKIP_TXFM_AC_DC;
 
               if (!sse || (var < low_ac_thr && sse - var < low_dc_thr))
                 low_err_skip = 1;
@@ -1149,21 +1154,21 @@
       dist = (int64_t)tmp * 16;
     }
   } else if (max_txsize_lookup[plane_bsize] == tx_size) {
-    if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
+    if (x->skip_txfm[plane][block >> (tx_size << 1)] ==
         SKIP_TXFM_NONE) {
       // full forward transform and quantization
       vp10_xform_quant(x, plane, block, blk_row, blk_col,
                        plane_bsize, tx_size, VP10_XFORM_QUANT_B);
       dist_block(args->cpi, x, plane, block, blk_row, blk_col,
                  tx_size, &dist, &sse);
-    } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
+    } else if (x->skip_txfm[plane][block >> (tx_size << 1)] ==
                SKIP_TXFM_AC_ONLY) {
       // compute DC coefficient
       tran_low_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);
       tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
       vp10_xform_quant(x, plane, block, blk_row, blk_col,
                           plane_bsize, tx_size, VP10_XFORM_QUANT_DC);
-      sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
+      sse  = x->bsse[plane][block >> (tx_size << 1)] << 4;
       dist = sse;
       if (x->plane[plane].eobs[block]) {
         const int64_t orig_sse = (int64_t)coeff[0] * coeff[0];
@@ -1181,7 +1186,7 @@
       // SKIP_TXFM_AC_DC
       // skip forward transform
       x->plane[plane].eobs[block] = 0;
-      sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
+      sse  = x->bsse[plane][block >> (tx_size << 1)] << 4;
       dist = sse;
     }
   } else {
@@ -1306,6 +1311,179 @@
 }
 #endif  // CONFIG_SUPERTX
 
+static int64_t txfm_yrd(VP10_COMP *cpi, MACROBLOCK *x,
+                        int *r, int64_t *d, int *s, int64_t *sse,
+                        int64_t ref_best_rd,
+                        BLOCK_SIZE bs, TX_TYPE tx_type, int tx_size) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  int64_t rd = INT64_MAX;
+  vpx_prob skip_prob = vp10_get_skip_prob(cm, xd);
+  int s0, s1;
+  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+  const int tx_select = cm->tx_mode == TX_MODE_SELECT;
+  const int is_inter = is_inter_block(mbmi);
+  const int r_tx_size =
+      cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)][tx_size];
+#if CONFIG_EXT_TX
+  int ext_tx_set;
+#endif  // CONFIG_EXT_TX
+
+  assert(skip_prob > 0);
+  s0 = vp10_cost_bit(skip_prob, 0);
+  s1 = vp10_cost_bit(skip_prob, 1);
+
+  mbmi->tx_type = tx_type;
+  mbmi->tx_size = tx_size;
+  txfm_rd_in_plane(x,
+                   cpi,
+                   r, d, s,
+                   sse, ref_best_rd, 0, bs, tx_size,
+                   cpi->sf.use_fast_coef_costing);
+  if (*r == INT_MAX)
+    return INT64_MAX;
+#if CONFIG_EXT_TX
+  ext_tx_set = get_ext_tx_set(tx_size, bs, is_inter);
+  if (get_ext_tx_types(tx_size, bs, is_inter) > 1 &&
+      !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+    if (is_inter) {
+      if (ext_tx_set > 0)
+        *r += cpi->inter_tx_type_costs[ext_tx_set]
+                                      [mbmi->tx_size][mbmi->tx_type];
+    } else {
+      if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
+        *r += cpi->intra_tx_type_costs[ext_tx_set][mbmi->tx_size]
+                                      [mbmi->mode][mbmi->tx_type];
+    }
+  }
+
+#else
+  if (tx_size < TX_32X32 &&
+      !xd->lossless[xd->mi[0]->mbmi.segment_id] && !FIXED_TX_TYPE) {
+    if (is_inter) {
+      *r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
+    } else {
+      *r += cpi->intra_tx_type_costs[mbmi->tx_size]
+           [intra_mode_to_tx_type_context[mbmi->mode]]
+           [mbmi->tx_type];
+    }
+  }
+#endif  // CONFIG_EXT_TX
+
+  if (*s) {
+    if (is_inter) {
+      rd = RDCOST(x->rdmult, x->rddiv, s1, *sse);
+    } else {
+      rd =  RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size * tx_select, *sse);
+    }
+  } else {
+    rd = RDCOST(x->rdmult, x->rddiv, *r + s0 + r_tx_size * tx_select, *d);
+  }
+
+  if (tx_select && !(*s && is_inter))
+    *r += r_tx_size;
+
+  if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && !(*s))
+    rd = VPXMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, *sse));
+
+  return rd;
+}
+
+static int64_t choose_tx_size_fix_type(VP10_COMP *cpi, MACROBLOCK *x,
+                                       int *rate,
+                                       int64_t *distortion,
+                                       int *skip,
+                                       int64_t *psse,
+                                       int64_t ref_best_rd,
+                                       BLOCK_SIZE bs, TX_TYPE tx_type,
+                                       int prune) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  int r, s;
+  int64_t d, sse;
+  int64_t rd = INT64_MAX;
+  int n;
+  int start_tx, end_tx;
+  int64_t best_rd = INT64_MAX, last_rd = INT64_MAX;
+  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+  TX_SIZE best_tx = max_tx_size;
+  const int tx_select = cm->tx_mode == TX_MODE_SELECT;
+  const int is_inter = is_inter_block(mbmi);
+#if CONFIG_EXT_TX
+  int ext_tx_set;
+#endif  // CONFIG_EXT_TX
+
+  if (tx_select) {
+    start_tx = max_tx_size;
+    end_tx = 0;
+  } else {
+    const TX_SIZE chosen_tx_size =
+        VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[cm->tx_mode]);
+    start_tx = chosen_tx_size;
+    end_tx = chosen_tx_size;
+  }
+
+  *distortion = INT64_MAX;
+  *rate       = INT_MAX;
+  *skip       = 0;
+  *psse       = INT64_MAX;
+
+  mbmi->tx_type = tx_type;
+  last_rd = INT64_MAX;
+  for (n = start_tx; n >= end_tx; --n) {
+    if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, n))
+        continue;
+#if CONFIG_EXT_TX
+    ext_tx_set = get_ext_tx_set(n, bs, is_inter);
+    if (is_inter) {
+      if (!ext_tx_used_inter[ext_tx_set][tx_type])
+        continue;
+      if (cpi->sf.tx_type_search > 0) {
+        if (!do_tx_type_search(tx_type, prune))
+          continue;
+      }
+    } else {
+      if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
+        if (tx_type != intra_mode_to_tx_type_context[mbmi->mode])
+          continue;
+      }
+      if (!ext_tx_used_intra[ext_tx_set][tx_type])
+        continue;
+    }
+#else  // CONFIG_EXT_TX
+    if (n >= TX_32X32 && tx_type != DCT_DCT)
+      continue;
+    if (is_inter && cpi->sf.tx_type_search > 0 &&
+        !do_tx_type_search(tx_type, prune))
+        continue;
+#endif  // CONFIG_EXT_TX
+
+    rd = txfm_yrd(cpi, x, &r, &d, &s, &sse, ref_best_rd, bs, tx_type, n);
+
+    // Early termination in transform size search.
+    if (cpi->sf.tx_size_search_breakout &&
+        (rd == INT64_MAX ||
+         (s == 1 && tx_type != DCT_DCT && n < start_tx) ||
+         (n < (int) max_tx_size && rd > last_rd)))
+      break;
+
+    last_rd = rd;
+    if (rd < best_rd) {
+      best_tx = n;
+      best_rd = rd;
+      *distortion = d;
+      *rate       = r;
+      *skip       = s;
+      *psse       = sse;
+    }
+  }
+  mbmi->tx_size = best_tx;
+
+  return best_rd;
+}
+
 static void choose_largest_tx_size(VP10_COMP *cpi, MACROBLOCK *x,
                                    int *rate, int64_t *distortion,
                                    int *skip, int64_t *sse,
@@ -1344,11 +1522,6 @@
         if (cpi->sf.tx_type_search > 0) {
           if (!do_tx_type_search(tx_type, prune))
             continue;
-        } else if (ext_tx_set == 1 &&
-                   tx_type >= DST_ADST && tx_type < IDTX &&
-                   best_tx_type == DCT_DCT) {
-          tx_type = IDTX - 1;
-          continue;
         }
       } else {
         if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
@@ -1357,12 +1530,6 @@
         }
         if (!ext_tx_used_intra[ext_tx_set][tx_type])
           continue;
-        if (ext_tx_set == 1 &&
-            tx_type >= DST_ADST && tx_type < IDTX &&
-            best_tx_type == DCT_DCT) {
-          tx_type = IDTX - 1;
-          continue;
-        }
       }
 
       mbmi->tx_type = tx_type;
@@ -1470,166 +1637,36 @@
                                    int64_t *psse,
                                    int64_t ref_best_rd,
                                    BLOCK_SIZE bs) {
-  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
-  VP10_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  vpx_prob skip_prob = vp10_get_skip_prob(cm, xd);
   int r, s;
   int64_t d, sse;
   int64_t rd = INT64_MAX;
-  int n;
-  int s0, s1;
-  int64_t best_rd = INT64_MAX, last_rd = INT64_MAX;
-  TX_SIZE best_tx = max_tx_size;
-  int start_tx, end_tx;
-  const int tx_select = cm->tx_mode == TX_MODE_SELECT;
+  int64_t best_rd = INT64_MAX;
+  TX_SIZE best_tx = max_txsize_lookup[bs];
   const int is_inter = is_inter_block(mbmi);
   TX_TYPE tx_type, best_tx_type = DCT_DCT;
   int prune = 0;
-#if CONFIG_EXT_TX
-  int ext_tx_set;
-#endif  // CONFIG_EXT_TX
 
   if (is_inter && cpi->sf.tx_type_search > 0)
     prune = prune_tx_types(cpi, bs, x, xd);
 
-  assert(skip_prob > 0);
-  s0 = vp10_cost_bit(skip_prob, 0);
-  s1 = vp10_cost_bit(skip_prob, 1);
-
-  if (tx_select) {
-    start_tx = max_tx_size;
-    end_tx = 0;
-  } else {
-    const TX_SIZE chosen_tx_size =
-        VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[cm->tx_mode]);
-    start_tx = chosen_tx_size;
-    end_tx = chosen_tx_size;
-  }
-
   *distortion = INT64_MAX;
   *rate       = INT_MAX;
   *skip       = 0;
   *psse       = INT64_MAX;
 
   for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
-    last_rd = INT64_MAX;
-    for (n = start_tx; n >= end_tx; --n) {
-      const int r_tx_size =
-          cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)][n];
-      if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, n))
-          continue;
-#if CONFIG_EXT_TX
-      ext_tx_set = get_ext_tx_set(n, bs, is_inter);
-      if (is_inter) {
-        if (!ext_tx_used_inter[ext_tx_set][tx_type])
-          continue;
-        if (cpi->sf.tx_type_search > 0) {
-          if (!do_tx_type_search(tx_type, prune))
-            continue;
-        } else if (ext_tx_set == 1 &&
-                   tx_type >= DST_ADST && tx_type < IDTX &&
-                   best_tx_type == DCT_DCT) {
-          tx_type = IDTX - 1;
-          continue;
-        }
-      } else {
-        if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
-          if (tx_type != intra_mode_to_tx_type_context[mbmi->mode])
-            continue;
-        }
-        if (!ext_tx_used_intra[ext_tx_set][tx_type])
-          continue;
-        if (ext_tx_set == 1 &&
-            tx_type >= DST_ADST && tx_type < IDTX &&
-            best_tx_type == DCT_DCT) {
-          tx_type = IDTX - 1;
-          break;
-        }
-      }
-      mbmi->tx_type = tx_type;
-      txfm_rd_in_plane(x,
-                       cpi,
-                       &r, &d, &s,
-                       &sse, ref_best_rd, 0, bs, n,
-                       cpi->sf.use_fast_coef_costing);
-      if (get_ext_tx_types(n, bs, is_inter) > 1 &&
-          !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
-          r != INT_MAX) {
-        if (is_inter) {
-          if (ext_tx_set > 0)
-            r += cpi->inter_tx_type_costs[ext_tx_set]
-                                         [mbmi->tx_size][mbmi->tx_type];
-        } else {
-          if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
-            r += cpi->intra_tx_type_costs[ext_tx_set][mbmi->tx_size]
-                                         [mbmi->mode][mbmi->tx_type];
-        }
-      }
-#else  // CONFIG_EXT_TX
-      if (n >= TX_32X32 && tx_type != DCT_DCT) {
-        continue;
-      }
-      mbmi->tx_type = tx_type;
-      txfm_rd_in_plane(x,
-                       cpi,
-                       &r, &d, &s,
-                       &sse, ref_best_rd, 0, bs, n,
-                       cpi->sf.use_fast_coef_costing);
-      if (n < TX_32X32 &&
-          !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
-          r != INT_MAX && !FIXED_TX_TYPE) {
-        if (is_inter) {
-          r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
-          if (cpi->sf.tx_type_search > 0 && !do_tx_type_search(tx_type, prune))
-              continue;
-        } else {
-          r += cpi->intra_tx_type_costs[mbmi->tx_size]
-              [intra_mode_to_tx_type_context[mbmi->mode]]
-              [mbmi->tx_type];
-        }
-      }
-#endif  // CONFIG_EXT_TX
-
-      if (r == INT_MAX)
-        continue;
-
-      if (s) {
-        if (is_inter) {
-          rd = RDCOST(x->rdmult, x->rddiv, s1, sse);
-        } else {
-          rd =  RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size * tx_select, sse);
-        }
-      } else {
-        rd = RDCOST(x->rdmult, x->rddiv, r + s0 + r_tx_size * tx_select, d);
-      }
-
-      if (tx_select && !(s && is_inter))
-        r += r_tx_size;
-
-      if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && !s)
-        rd = VPXMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, sse));
-
-      // Early termination in transform size search.
-      if (cpi->sf.tx_size_search_breakout &&
-          (rd == INT64_MAX ||
-           (s == 1 && tx_type != DCT_DCT && n < start_tx) ||
-           (n < (int) max_tx_size && rd > last_rd)))
-        break;
-
-      last_rd = rd;
-      if (rd <
-          (is_inter && best_tx_type == DCT_DCT ? ext_tx_th : 1) *
-          best_rd) {
-        best_tx = n;
-        best_rd = rd;
-        *distortion = d;
-        *rate       = r;
-        *skip       = s;
-        *psse       = sse;
-        best_tx_type = mbmi->tx_type;
-      }
+    rd = choose_tx_size_fix_type(cpi, x, &r, &d, &s, &sse, ref_best_rd, bs,
+                                 tx_type, prune);
+    if (rd < (is_inter && best_tx_type == DCT_DCT ? ext_tx_th : 1) * best_rd) {
+      best_rd = rd;
+      *distortion = d;
+      *rate       = r;
+      *skip       = s;
+      *psse       = sse;
+      best_tx_type = tx_type;
+      best_tx = mbmi->tx_size;
     }
   }
 
@@ -2846,57 +2883,21 @@
   *bsse += tmp * 16;
 
   if (p->eobs[block] > 0) {
-    const int lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+    INV_TXFM_PARAM inv_txfm_param;
+    inv_txfm_param.tx_type = tx_type;
+    inv_txfm_param.tx_size = tx_size;
+    inv_txfm_param.eob = p->eobs[block];
+    inv_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      const int bd = xd->bd;
-      switch (tx_size) {
-        case TX_32X32:
-          vp10_highbd_inv_txfm_add_32x32(dqcoeff, rec_buffer, 32,
-                                         p->eobs[block], bd, tx_type);
-          break;
-        case TX_16X16:
-          vp10_highbd_inv_txfm_add_16x16(dqcoeff, rec_buffer, 32,
-                                         p->eobs[block], bd, tx_type);
-          break;
-        case TX_8X8:
-          vp10_highbd_inv_txfm_add_8x8(dqcoeff, rec_buffer, 32,
-                                       p->eobs[block], bd, tx_type);
-          break;
-        case TX_4X4:
-          vp10_highbd_inv_txfm_add_4x4(dqcoeff, rec_buffer, 32,
-                                       p->eobs[block], bd, tx_type, lossless);
-          break;
-        default:
-          assert(0 && "Invalid transform size");
-          break;
-      }
+      inv_txfm_param.bd = xd->bd;
+      highbd_inv_txfm_add(dqcoeff, rec_buffer, 32, &inv_txfm_param);
     } else {
-#else
-    {
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-      switch (tx_size) {
-        case TX_32X32:
-          vp10_inv_txfm_add_32x32(dqcoeff, rec_buffer, 32, p->eobs[block],
-                                  tx_type);
-          break;
-        case TX_16X16:
-          vp10_inv_txfm_add_16x16(dqcoeff, rec_buffer, 32, p->eobs[block],
-                                  tx_type);
-          break;
-        case TX_8X8:
-          vp10_inv_txfm_add_8x8(dqcoeff, rec_buffer, 32, p->eobs[block],
-                                tx_type);
-          break;
-        case TX_4X4:
-          vp10_inv_txfm_add_4x4(dqcoeff, rec_buffer, 32, p->eobs[block],
-                                tx_type, lossless);
-          break;
-        default:
-          assert(0 && "Invalid transform size");
-          break;
-      }
+      inv_txfm_add(dqcoeff, rec_buffer, 32, &inv_txfm_param);
     }
+#else  // CONFIG_VP9_HIGHBITDEPTH
+    inv_txfm_add(dqcoeff, rec_buffer, 32, &inv_txfm_param);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
     if ((bh >> 2) + blk_col > max_blocks_wide ||
         (bh >> 2) + blk_row > max_blocks_high) {
@@ -2938,8 +2939,10 @@
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  int tx_idx = (blk_row >> (1 - pd->subsampling_y)) * 8 +
-               (blk_col >> (1 - pd->subsampling_x));
+  const int tx_row = blk_row >> (1 - pd->subsampling_y);
+  const int tx_col = blk_col >> (1 - pd->subsampling_x);
+  TX_SIZE (*const inter_tx_size)[MI_BLOCK_SIZE] =
+    (TX_SIZE (*)[MI_BLOCK_SIZE])&mbmi->inter_tx_size[tx_row][tx_col];
   int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
   int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
   int64_t this_rd = INT64_MAX;
@@ -3002,7 +3005,7 @@
       x->token_costs[tx_size][pd->plane_type][1][0][0][coeff_ctx][EOB_TOKEN];
 
   if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) {
-    mbmi->inter_tx_size[tx_idx] = tx_size;
+    inter_tx_size[0][0] = tx_size;
     vp10_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
                        plane_bsize, coeff_ctx, rate, dist, bsse, skip);
 
@@ -3065,11 +3068,10 @@
       pta[i] = ptl[i] = !(tmp_eob == 0);
     txfm_partition_update(tx_above + (blk_col >> 1),
                           tx_left + (blk_row >> 1), tx_size);
-    mbmi->inter_tx_size[tx_idx] = tx_size;
-
+    inter_tx_size[0][0] = tx_size;
     for (idy = 0; idy < (1 << tx_size) / 2; ++idy)
       for (idx = 0; idx < (1 << tx_size) / 2; ++idx)
-        mbmi->inter_tx_size[tx_idx + (idy << 3) + idx] = tx_size;
+        inter_tx_size[idy][idx] = tx_size;
     mbmi->tx_size = tx_size;
     if (this_rd == INT64_MAX)
       *is_cost_valid = 0;
@@ -3154,22 +3156,76 @@
   }
 }
 
+static int64_t select_tx_size_fix_type(const VP10_COMP *cpi, MACROBLOCK *x,
+                                       int *rate, int64_t *dist,
+                                       int *skippable,
+                                       int64_t *sse, BLOCK_SIZE bsize,
+                                       int64_t ref_best_rd, TX_TYPE tx_type) {
+  const VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+  const int is_inter = is_inter_block(mbmi);
+#if CONFIG_EXT_TX
+  int ext_tx_set = get_ext_tx_set(max_tx_size, bsize, is_inter);
+#endif  // CONFIG_EXT_TX
+  vpx_prob skip_prob = vp10_get_skip_prob(cm, xd);
+  int s0 = vp10_cost_bit(skip_prob, 0);
+  int s1 = vp10_cost_bit(skip_prob, 1);
+  int64_t rd;
+
+  mbmi->tx_type = tx_type;
+  inter_block_yrd(cpi, x, rate, dist, skippable, sse, bsize, ref_best_rd);
+
+  if (*rate == INT_MAX)
+    return INT64_MAX;
+
+#if CONFIG_EXT_TX
+  if (get_ext_tx_types(max_tx_size, bsize, is_inter) > 1 &&
+      !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+    if (is_inter) {
+      if (ext_tx_set > 0)
+        *rate += cpi->inter_tx_type_costs[ext_tx_set]
+                                         [max_tx_size][mbmi->tx_type];
+    } else {
+      if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
+        *rate += cpi->intra_tx_type_costs[ext_tx_set][max_tx_size]
+                                         [mbmi->mode][mbmi->tx_type];
+    }
+  }
+#else  // CONFIG_EXT_TX
+  if (max_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+    if (is_inter)
+      *rate += cpi->inter_tx_type_costs[max_tx_size][mbmi->tx_type];
+    else
+      *rate += cpi->intra_tx_type_costs[max_tx_size]
+                 [intra_mode_to_tx_type_context[mbmi->mode]][mbmi->tx_type];
+  }
+#endif  // CONFIG_EXT_TX
+
+  if (*skippable)
+    rd = RDCOST(x->rdmult, x->rddiv, s1, *sse);
+  else
+    rd = RDCOST(x->rdmult, x->rddiv, *rate + s0, *dist);
+
+  if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && !(*skippable))
+    rd = VPXMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, *sse));
+
+  return rd;
+}
+
 static void select_tx_type_yrd(const VP10_COMP *cpi, MACROBLOCK *x,
                                int *rate, int64_t *distortion, int *skippable,
                                int64_t *sse, BLOCK_SIZE bsize,
                                int64_t ref_best_rd) {
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
-  const VP10_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   int64_t rd = INT64_MAX;
   int64_t best_rd = INT64_MAX;
   TX_TYPE tx_type, best_tx_type = DCT_DCT;
   const int is_inter = is_inter_block(mbmi);
-  vpx_prob skip_prob = vp10_get_skip_prob(cm, xd);
-  int s0 = vp10_cost_bit(skip_prob, 0);
-  int s1 = vp10_cost_bit(skip_prob, 1);
-  TX_SIZE best_tx_size[64];
+  TX_SIZE best_tx_size[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
   TX_SIZE best_tx = TX_SIZES;
   uint8_t best_blk_skip[256];
   const int n4 = 1 << (num_pels_log2_lookup[bsize] - 4);
@@ -3199,11 +3255,6 @@
       if (cpi->sf.tx_type_search > 0) {
         if (!do_tx_type_search(tx_type, prune))
           continue;
-      } else if (ext_tx_set == 1 &&
-                 tx_type >= DST_ADST && tx_type < IDTX &&
-                 best_tx_type == DCT_DCT) {
-        tx_type = IDTX - 1;
-        continue;
       }
     } else {
       if (!ALLOW_INTRA_EXT_TX && bsize >= BLOCK_8X8) {
@@ -3212,66 +3263,16 @@
       }
       if (!ext_tx_used_intra[ext_tx_set][tx_type])
         continue;
-      if (ext_tx_set == 1 &&
-          tx_type >= DST_ADST && tx_type < IDTX &&
-          best_tx_type == DCT_DCT) {
-        tx_type = IDTX - 1;
-        break;
-      }
-    }
-
-    mbmi->tx_type = tx_type;
-
-    inter_block_yrd(cpi, x, &this_rate, &this_dist, &this_skip, &this_sse,
-                    bsize, ref_best_rd);
-
-    if (get_ext_tx_types(max_tx_size, bsize, is_inter) > 1 &&
-        !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
-        this_rate != INT_MAX) {
-      if (is_inter) {
-        if (ext_tx_set > 0)
-          this_rate += cpi->inter_tx_type_costs[ext_tx_set]
-                                       [max_tx_size][mbmi->tx_type];
-      } else {
-        if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
-          this_rate += cpi->intra_tx_type_costs[ext_tx_set][max_tx_size]
-                                               [mbmi->mode][mbmi->tx_type];
-      }
     }
 #else  // CONFIG_EXT_TX
-      if (max_tx_size >= TX_32X32 && tx_type != DCT_DCT)
-        continue;
-
-      mbmi->tx_type = tx_type;
-
-      inter_block_yrd(cpi, x, &this_rate, &this_dist, &this_skip, &this_sse,
-                      bsize, ref_best_rd);
-
-      if (max_tx_size < TX_32X32 &&
-          !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
-          this_rate != INT_MAX) {
-        if (is_inter) {
-          this_rate += cpi->inter_tx_type_costs[max_tx_size][mbmi->tx_type];
-          if (cpi->sf.tx_type_search > 0 && !do_tx_type_search(tx_type, prune))
-              continue;
-        } else {
-          this_rate += cpi->intra_tx_type_costs[max_tx_size]
-              [intra_mode_to_tx_type_context[mbmi->mode]]
-              [mbmi->tx_type];
-        }
-      }
-#endif  // CONFIG_EXT_TX
-
-    if (this_rate == INT_MAX)
+    if (max_tx_size >= TX_32X32 && tx_type != DCT_DCT)
       continue;
-
-    if (this_skip)
-      rd = RDCOST(x->rdmult, x->rddiv, s1, this_sse);
-    else
-      rd = RDCOST(x->rdmult, x->rddiv, this_rate + s0, this_dist);
-
-    if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && !this_skip)
-      rd = VPXMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, this_sse));
+    if (is_inter && cpi->sf.tx_type_search > 0 &&
+        !do_tx_type_search(tx_type, prune))
+      continue;
+#endif  // CONFIG_EXT_TX
+    rd = select_tx_size_fix_type(cpi, x, &this_rate, &this_dist, &this_skip,
+                                 &this_sse, bsize, ref_best_rd, tx_type);
 
     if (rd < (is_inter && best_tx_type == DCT_DCT ? ext_tx_th : 1) * best_rd) {
       best_rd = rd;
@@ -3284,14 +3285,14 @@
       memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4);
       for (idy = 0; idy < xd->n8_h; ++idy)
         for (idx = 0; idx < xd->n8_w; ++idx)
-          best_tx_size[idy * 8 + idx] = mbmi->inter_tx_size[idy * 8 + idx];
+          best_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
     }
   }
 
   mbmi->tx_type = best_tx_type;
   for (idy = 0; idy < xd->n8_h; ++idy)
     for (idx = 0; idx < xd->n8_w; ++idx)
-      mbmi->inter_tx_size[idy * 8 + idx] = best_tx_size[idy * 8 + idx];
+      mbmi->inter_tx_size[idy][idx] = best_tx_size[idy][idx];
   mbmi->tx_size = best_tx;
   memcpy(x->blk_skip[0], best_blk_skip, sizeof(best_blk_skip[0]) * n4);
 }
@@ -3306,12 +3307,11 @@
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-  int tx_idx = (blk_row >> (1 - pd->subsampling_y)) * 8 +
-               (blk_col >> (1 - pd->subsampling_x));
-  TX_SIZE plane_tx_size = plane ?
-      get_uv_tx_size_impl(mbmi->inter_tx_size[tx_idx], bsize,
-                          0, 0) :
-      mbmi->inter_tx_size[tx_idx];
+  const int tx_row = blk_row >> (1 - pd->subsampling_y);
+  const int tx_col = blk_col >> (1 - pd->subsampling_x);
+  const TX_SIZE plane_tx_size = plane ?
+      get_uv_tx_size_impl(mbmi->inter_tx_size[tx_row][tx_col], bsize, 0, 0) :
+      mbmi->inter_tx_size[tx_row][tx_col];
 
   int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
   int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
@@ -3847,7 +3847,7 @@
   mbmi->angle_delta[1] = best_angle_delta;
   if (*rate_tokenonly != INT_MAX)
     super_block_uvrd(cpi, x, &this_rate_tokenonly,
-                     &this_distortion, &s, &this_sse, bsize, INT_MAX);
+                     &this_distortion, &s, &this_sse, bsize, INT64_MAX);
   return *rate_tokenonly != INT_MAX;
 }
 #endif  // CONFIG_EXT_INTRA
@@ -4826,6 +4826,10 @@
       int64_t best_rd = INT64_MAX;
       const int i = idy * 2 + idx;
       int ref;
+#if CONFIG_REF_MV
+        CANDIDATE_MV ref_mv_stack[2][MAX_REF_MV_STACK_SIZE];
+        uint8_t ref_mv_count[2];
+#endif
 #if CONFIG_EXT_INTER
       int mv_idx;
       int_mv ref_mvs_sub8x8[2][2];
@@ -4840,6 +4844,10 @@
 #endif  // CONFIG_EXT_INTER
         frame_mv[ZEROMV][frame].as_int = 0;
         vp10_append_sub8x8_mvs_for_idx(cm, xd, i, ref, mi_row, mi_col,
+#if CONFIG_REF_MV
+                                       ref_mv_stack[ref],
+                                       &ref_mv_count[ref],
+#endif
 #if CONFIG_EXT_INTER
                                        mv_ref_list,
 #endif  // CONFIG_EXT_INTER
@@ -5214,6 +5222,7 @@
                                     bsi->rdstat[i][mode_idx].tl,
                                     idy, idx,
                                     mi_row, mi_col);
+
         if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
           bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
                                             bsi->rdstat[i][mode_idx].brate, 0);
@@ -5457,7 +5466,6 @@
 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
                          int mode_index,
                          int64_t comp_pred_diff[REFERENCE_MODES],
-                         int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS],
                          int skippable) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
@@ -5471,9 +5479,6 @@
   ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
   ctx->comp_pred_diff   = (int)comp_pred_diff[COMPOUND_REFERENCE];
   ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
-
-  memcpy(ctx->best_filter_diff, best_filter_diff,
-         sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
 }
 
 static void setup_buffer_inter(
@@ -6041,9 +6046,7 @@
                                  INTERP_FILTER (*single_filter)[MAX_REF_FRAMES],
                                  int (*single_skippable)[MAX_REF_FRAMES],
                                  int64_t *psse,
-                                 const int64_t ref_best_rd,
-                                 int64_t *mask_filter,
-                                 int64_t filter_cache[]) {
+                                 const int64_t ref_best_rd) {
   VP10_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
@@ -6101,8 +6104,8 @@
   int orig_dst_stride[MAX_MB_PLANE];
   int rs = 0;
   INTERP_FILTER best_filter = SWITCHABLE;
-  uint8_t skip_txfm[MAX_MB_PLANE << 2] = {0};
-  int64_t bsse[MAX_MB_PLANE << 2] = {0};
+  uint8_t skip_txfm[MAX_MB_PLANE][4] = {{0}};
+  int64_t bsse[MAX_MB_PLANE][4] = {{0}};
 
   int skip_txfm_sb = 0;
   int64_t skip_sse_sb = INT64_MAX;
@@ -6386,11 +6389,6 @@
   if (is_comp_pred)
     intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);
 
-  // Search for best switchable filter by checking the variance of
-  // pred error irrespective of whether the filter will be used
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    filter_cache[i] = INT64_MAX;
-
   best_filter = predict_interp_filter(cpi, x, bsize, mi_row, mi_col,
                                       single_filter);
   if (cm->interp_filter != BILINEAR && best_filter == SWITCHABLE) {
@@ -6410,12 +6408,8 @@
 
       if (i > 0 && intpel_mv && IsInterpolatingFilter(i)) {
         rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
-        filter_cache[i] = rd;
-        filter_cache[SWITCHABLE_FILTERS] =
-            VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
         if (cm->interp_filter == SWITCHABLE)
           rd += rs_rd;
-        *mask_filter = VPXMAX(*mask_filter, rd);
       } else {
         int rate_sum = 0;
         int64_t dist_sum = 0;
@@ -6447,12 +6441,8 @@
                         &tmp_skip_sb, &tmp_skip_sse);
 
         rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
-        filter_cache[i] = rd;
-        filter_cache[SWITCHABLE_FILTERS] =
-            VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
         if (cm->interp_filter == SWITCHABLE)
           rd += rs_rd;
-        *mask_filter = VPXMAX(*mask_filter, rd);
 
         if (i == 0 && intpel_mv && IsInterpolatingFilter(i)) {
           tmp_rate_sum = rate_sum;
@@ -6964,7 +6954,7 @@
                       bsize, ref_best_rd);
       for (idy = 0; idy < xd->n8_h; ++idy)
         for (idx = 0; idx < xd->n8_w; ++idx)
-          mbmi->inter_tx_size[idy * 8 + idx] = mbmi->tx_size;
+          mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
     }
 #else
     super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
@@ -7396,8 +7386,6 @@
   int64_t best_rd = best_rd_so_far;
   int64_t best_pred_diff[REFERENCE_MODES];
   int64_t best_pred_rd[REFERENCE_MODES];
-  int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
-  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   MB_MODE_INFO best_mbmode;
   int best_mode_skippable = 0;
   int midx, best_mode_index = -1;
@@ -7435,8 +7423,6 @@
   int64_t mode_threshold[MAX_MODES];
   int *mode_map = tile_data->mode_map[bsize];
   const int mode_search_skip_flags = sf->mode_search_skip_flags;
-  int64_t mask_filter = 0;
-  int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
   int palette_ctx = 0;
   const int rows = 4 * num_4x4_blocks_high_lookup[bsize];
@@ -7492,16 +7478,11 @@
          sizeof(directional_mode_skip_mask[0]) * INTRA_MODES);
 #endif  // CONFIG_EXT_INTRA
 
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    filter_cache[i] = INT64_MAX;
-
   estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
                            &comp_mode_p);
 
   for (i = 0; i < REFERENCE_MODES; ++i)
     best_pred_rd[i] = INT64_MAX;
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-    best_filter_rd[i] = INT64_MAX;
   for (i = 0; i < TX_SIZES; i++)
     rate_uv_intra[i] = INT_MAX;
   for (i = 0; i < MAX_REF_FRAMES; ++i)
@@ -7555,9 +7536,9 @@
 #endif  // CONFIG_REF_MV
 
 #if CONFIG_OBMC
-  vp10_build_prediction_by_above_preds(cpi, xd, mi_row, mi_col, dst_buf1,
+  vp10_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
                                        dst_stride1);
-  vp10_build_prediction_by_left_preds(cpi, xd, mi_row, mi_col, dst_buf2,
+  vp10_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
                                       dst_stride2);
   vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
 #endif  // CONFIG_OBMC
@@ -8048,6 +8029,13 @@
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
     } else {
+#if CONFIG_REF_MV
+      int_mv backup_ref_mv[2];
+
+      backup_ref_mv[0] = mbmi_ext->ref_mvs[ref_frame][0];
+      if (comp_pred)
+        backup_ref_mv[1] = mbmi_ext->ref_mvs[second_ref_frame][0];
+#endif
 #if CONFIG_EXT_INTER
       if (second_ref_frame == INTRA_FRAME) {
         mbmi->interintra_mode = best_intra_mode;
@@ -8066,6 +8054,19 @@
 #if CONFIG_REF_MV
       mbmi->ref_mv_idx = 0;
       ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
+
+      if (this_mode == NEWMV &&
+          mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
+        int ref;
+        for (ref = 0; ref < 1 + comp_pred; ++ref) {
+          int_mv this_mv = (ref == 0) ?
+              mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv :
+              mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
+          clamp_mv_ref(&this_mv.as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
+          lower_mv_precision(&this_mv.as_mv, cm->allow_high_precision_mv);
+          mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv;
+        }
+      }
 #endif
       this_rd = handle_inter_mode(cpi, x, bsize,
                                   &rate2, &distortion2, &skippable,
@@ -8086,25 +8087,35 @@
 #endif  // CONFIG_EXT_INTER
                                   single_inter_filter,
                                   single_skippable,
-                                  &total_sse, best_rd,
-                                  &mask_filter, filter_cache);
+                                  &total_sse, best_rd);
 
 #if CONFIG_REF_MV
       // TODO(jingning): This needs some refactoring to improve code quality
       // and reduce redundant steps.
-      if (mbmi->mode == NEARMV &&
-          mbmi_ext->ref_mv_count[ref_frame_type] > 2) {
+      if ((mbmi->mode == NEARMV &&
+           mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
+          (mbmi->mode == NEWMV &&
+           mbmi_ext->ref_mv_count[ref_frame_type] > 1)) {
         int_mv backup_mv = frame_mv[NEARMV][ref_frame];
-        int_mv cur_mv = mbmi_ext->ref_mv_stack[ref_frame][2].this_mv;
         MB_MODE_INFO backup_mbmi = *mbmi;
-
+        int backup_skip = x->skip;
         int64_t tmp_ref_rd = this_rd;
         int ref_idx;
-        int ref_set = VPXMIN(2, mbmi_ext->ref_mv_count[ref_frame_type] - 2);
 
-        uint8_t drl0_ctx =
-            vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], 1);
-        rate2 += cpi->drl_mode_cost0[drl0_ctx][0];
+        // TODO(jingning): This should be deprecated shortly.
+        int idx_offset = (mbmi->mode == NEARMV) ? 1 : 0;
+        int ref_set =
+            VPXMIN(2, mbmi_ext->ref_mv_count[ref_frame_type] - 1 - idx_offset);
+
+        uint8_t drl_ctx = vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
+                                       idx_offset);
+        // Dummy
+        int_mv backup_fmv[2];
+        backup_fmv[0] = frame_mv[NEWMV][ref_frame];
+        if (comp_pred)
+          backup_fmv[1] = frame_mv[NEWMV][second_ref_frame];
+
+        rate2 += cpi->drl_mode_cost0[drl_ctx][0];
 
         if (this_rd < INT64_MAX) {
           if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
@@ -8129,18 +8140,33 @@
           int tmp_rate = 0, tmp_rate_y = 0, tmp_rate_uv = 0;
           int tmp_skip = 1;
           int64_t tmp_dist = 0, tmp_sse = 0;
+          int dummy_disable_skip = 0;
+          int ref;
+          int_mv cur_mv;
 
-          cur_mv = mbmi_ext->ref_mv_stack[ref_frame][2 + ref_idx].this_mv;
+          mbmi->ref_mv_idx = 1 + ref_idx;
+
+          for (ref = 0; ref < 1 + comp_pred; ++ref) {
+            int_mv this_mv = (ref == 0) ?
+                mbmi_ext->ref_mv_stack[ref_frame_type]
+                                      [mbmi->ref_mv_idx].this_mv :
+                mbmi_ext->ref_mv_stack[ref_frame_type]
+                                      [mbmi->ref_mv_idx].comp_mv;
+            clamp_mv_ref(&this_mv.as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
+            lower_mv_precision(&this_mv.as_mv, cm->allow_high_precision_mv);
+            mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv;
+          }
+
+          cur_mv = mbmi_ext->ref_mv_stack[ref_frame]
+                                 [mbmi->ref_mv_idx + idx_offset].this_mv;
           lower_mv_precision(&cur_mv.as_mv, cm->allow_high_precision_mv);
           clamp_mv2(&cur_mv.as_mv, xd);
 
           if (!mv_check_bounds(x, &cur_mv.as_mv)) {
-            int64_t dummy_filter_cache[SWITCHABLE_FILTER_CONTEXTS];
             INTERP_FILTER dummy_single_inter_filter[MB_MODE_COUNT]
                                                    [MAX_REF_FRAMES];
             int dummy_single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
             int dummy_disable_skip = 0;
-            int64_t dummy_mask_filter = 0;
 #if CONFIG_EXT_INTER
             int_mv dummy_single_newmvs[2][MAX_REF_FRAMES] =
                                           { { { 0 } },  { { 0 } } };
@@ -8151,7 +8177,6 @@
 #else
             int_mv dummy_single_newmv[MAX_REF_FRAMES] = { { 0 } };
 #endif
-            mbmi->ref_mv_idx = 1 + ref_idx;
 
             frame_mv[NEARMV][ref_frame] = cur_mv;
             tmp_alt_rd = handle_inter_mode(cpi, x, bsize,
@@ -8173,17 +8198,23 @@
 #endif
                                            dummy_single_inter_filter,
                                            dummy_single_skippable,
-                                           &tmp_sse, best_rd,
-                                           &dummy_mask_filter,
-                                           dummy_filter_cache);
+                                           &tmp_sse, best_rd);
           }
 
-          tmp_rate += cpi->drl_mode_cost0[drl0_ctx][1];
+          for (i = 0; i < mbmi->ref_mv_idx; ++i) {
+            uint8_t drl1_ctx = 0;
+            drl1_ctx = vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
+                                    i + idx_offset);
+            tmp_rate += cpi->drl_mode_cost0[drl1_ctx][1];
+          }
 
-          if (mbmi_ext->ref_mv_count[ref_frame_type] > 3) {
+          if (mbmi_ext->ref_mv_count[ref_frame_type] >
+              mbmi->ref_mv_idx + idx_offset + 1 &&
+              ref_idx < ref_set - 1) {
             uint8_t drl1_ctx =
-                vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], 2);
-            tmp_rate += cpi->drl_mode_cost1[drl1_ctx][ref_idx];
+                vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
+                             mbmi->ref_mv_idx + idx_offset);
+            tmp_rate += cpi->drl_mode_cost0[drl1_ctx][0];
           }
 
           if (tmp_alt_rd < INT64_MAX) {
@@ -8206,6 +8237,7 @@
 
           if (tmp_ref_rd > tmp_alt_rd) {
             rate2 = tmp_rate;
+            disable_skip = dummy_disable_skip;
             distortion2 = tmp_dist;
             skippable = tmp_skip;
             rate_y = tmp_rate_y;
@@ -8214,6 +8246,7 @@
             this_rd = tmp_alt_rd;
             tmp_ref_rd = tmp_alt_rd;
             backup_mbmi = *mbmi;
+            backup_skip = x->skip;
 #if CONFIG_VAR_TX
             for (i = 0; i < MAX_MB_PLANE; ++i)
               memcpy(x->blk_skip_drl[i], x->blk_skip[i],
@@ -8221,16 +8254,23 @@
 #endif
           } else {
             *mbmi = backup_mbmi;
+            x->skip = backup_skip;
           }
         }
 
         frame_mv[NEARMV][ref_frame] = backup_mv;
+        frame_mv[NEWMV][ref_frame] = backup_fmv[0];
+        if (comp_pred)
+          frame_mv[NEWMV][second_ref_frame] = backup_fmv[1];
 #if CONFIG_VAR_TX
         for (i = 0; i < MAX_MB_PLANE; ++i)
           memcpy(x->blk_skip[i], x->blk_skip_drl[i],
                  sizeof(uint8_t) * ctx->num_4x4_blk);
 #endif
       }
+      mbmi_ext->ref_mvs[ref_frame][0] = backup_ref_mv[0];
+      if (comp_pred)
+        mbmi_ext->ref_mvs[second_ref_frame][0] = backup_ref_mv[1];
 #endif  // CONFIG_REF_MV
 
       if (this_rd == INT64_MAX)
@@ -8317,8 +8357,6 @@
     if (!disable_skip && ref_frame == INTRA_FRAME) {
       for (i = 0; i < REFERENCE_MODES; ++i)
         best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd);
-      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-        best_filter_rd[i] = VPXMIN(best_filter_rd[i], this_rd);
     }
 
     // Did this mode help.. i.e. is it the new best mode
@@ -8338,11 +8376,12 @@
 
         rd_cost->rate = rate2;
 #if CONFIG_SUPERTX
-        *returnrate_nocoef = rate2 - rate_y - rate_uv;
-        if (!disable_skip) {
-          *returnrate_nocoef -= vp10_cost_bit(vp10_get_skip_prob(cm, xd),
-                                              skippable || this_skip2);
-        }
+        if (x->skip)
+          *returnrate_nocoef = rate2;
+        else
+          *returnrate_nocoef = rate2 - rate_y - rate_uv;
+        *returnrate_nocoef -= vp10_cost_bit(vp10_get_skip_prob(cm, xd),
+            disable_skip || skippable || this_skip2);
         *returnrate_nocoef -= vp10_cost_bit(vp10_get_intra_inter_prob(cm, xd),
                                             mbmi->ref_frame[0] != INTRA_FRAME);
 #if CONFIG_OBMC
@@ -8417,29 +8456,6 @@
       }
       if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
-
-      /* keep record of best filter type */
-      if (!mode_excluded && cm->interp_filter != BILINEAR) {
-        int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
-                              SWITCHABLE_FILTERS : cm->interp_filter];
-
-        for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
-          int64_t adj_rd;
-          if (ref == INT64_MAX)
-            adj_rd = 0;
-          else if (filter_cache[i] == INT64_MAX)
-            // when early termination is triggered, the encoder does not have
-            // access to the rate-distortion cost. it only knows that the cost
-            // should be above the maximum valid value. hence it takes the known
-            // maximum plus an arbitrary constant as the rate-distortion cost.
-            adj_rd = mask_filter - ref + 10;
-          else
-            adj_rd = filter_cache[i] - ref;
-
-          adj_rd += this_rd;
-          best_filter_rd[i] = VPXMIN(best_filter_rd[i], adj_rd);
-        }
-      }
     }
 
     if (early_term)
@@ -8453,6 +8469,9 @@
   if (cm->allow_screen_content_tools && !is_inter_mode(best_mbmode.mode)) {
     PREDICTION_MODE mode_selected;
     int rate2 = 0, rate_y = 0;
+#if CONFIG_SUPERTX
+    int best_rate_nocoef;
+#endif
     int64_t distortion2 = 0, distortion_y = 0, dummy_rd = best_rd, this_rd;
     int skippable = 0, rate_overhead = 0;
     TX_SIZE best_tx_size, uv_tx;
@@ -8522,8 +8541,14 @@
 
     if (skippable) {
       rate2 -= (rate_y + rate_uv_tokenonly[uv_tx]);
+#if CONFIG_SUPERTX
+      best_rate_nocoef = rate2;
+#endif
       rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
     } else {
+#if CONFIG_SUPERTX
+      best_rate_nocoef = rate2 - (rate_y + rate_uv_tokenonly[uv_tx]);
+#endif
       rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0);
     }
     this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
@@ -8533,6 +8558,9 @@
       mbmi->mv[0].as_int = 0;
       max_plane = 1;
       rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+      *returnrate_nocoef = best_rate_nocoef;
+#endif
       rd_cost->dist = distortion2;
       rd_cost->rdcost = this_rd;
       best_rd = this_rd;
@@ -8563,17 +8591,21 @@
 #if CONFIG_REF_MV
     const uint8_t rf_type = vp10_ref_frame_type(best_mbmode.ref_frame);
     if (!comp_pred_mode) {
-      if (best_mbmode.ref_mv_idx > 0 && refs[1] == NONE) {
-        int idx = best_mbmode.ref_mv_idx + 1;
-        int_mv cur_mv = mbmi_ext->ref_mv_stack[refs[0]][idx].this_mv;
+      int i;
+      int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2) ?
+          VPXMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2) : INT_MAX;
+
+      for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
+        int_mv cur_mv = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
         lower_mv_precision(&cur_mv.as_mv, cm->allow_high_precision_mv);
-        frame_mv[NEARMV][refs[0]] = cur_mv;
+        if (cur_mv.as_int == best_mbmode.mv[0].as_int) {
+          best_mbmode.mode = NEARMV;
+          best_mbmode.ref_mv_idx = i;
+        }
       }
 
       if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int)
         best_mbmode.mode = NEARESTMV;
-      else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int)
-        best_mbmode.mode = NEARMV;
       else if (best_mbmode.mv[0].as_int == 0)
         best_mbmode.mode = ZEROMV;
     } else {
@@ -8581,21 +8613,37 @@
       const int allow_hp = cm->allow_high_precision_mv;
       int_mv nearestmv[2] = { frame_mv[NEARESTMV][refs[0]],
                               frame_mv[NEARESTMV][refs[1]] };
-
       int_mv nearmv[2] = { frame_mv[NEARMV][refs[0]],
                            frame_mv[NEARMV][refs[1]] };
 
+#if CONFIG_EXT_INTER
+      if (mbmi_ext->ref_mv_count[rf_type] > 1) {
+         nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][1].this_mv;
+         nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][1].comp_mv;
+       }
+#else
+      int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2) ?
+          VPXMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2) : INT_MAX;
+
+      for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
+        nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
+        nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][i + 1].comp_mv;
+        lower_mv_precision(&nearmv[0].as_mv, allow_hp);
+        lower_mv_precision(&nearmv[1].as_mv, allow_hp);
+
+        if (nearmv[0].as_int == best_mbmode.mv[0].as_int &&
+            nearmv[1].as_int == best_mbmode.mv[1].as_int) {
+          best_mbmode.mode = NEARMV;
+          best_mbmode.ref_mv_idx = i;
+        }
+      }
+#endif
+
       if (mbmi_ext->ref_mv_count[rf_type] >= 1) {
         nearestmv[0] = mbmi_ext->ref_mv_stack[rf_type][0].this_mv;
         nearestmv[1] = mbmi_ext->ref_mv_stack[rf_type][0].comp_mv;
       }
 
-      if (mbmi_ext->ref_mv_count[rf_type] > 1) {
-        int ref_mv_idx = best_mbmode.ref_mv_idx + 1;
-        nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][ref_mv_idx].this_mv;
-        nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][ref_mv_idx].comp_mv;
-      }
-
       for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
         lower_mv_precision(&nearestmv[i].as_mv, allow_hp);
         lower_mv_precision(&nearmv[i].as_mv, allow_hp);
@@ -8615,9 +8663,6 @@
         best_mbmode.mode = ZERO_ZEROMV;
 #else
         best_mbmode.mode = NEARESTMV;
-      else if (nearmv[0].as_int == best_mbmode.mv[0].as_int &&
-          nearmv[1].as_int == best_mbmode.mv[1].as_int)
-        best_mbmode.mode = NEARMV;
       else if (best_mbmode.mv[0].as_int == 0 && best_mbmode.mv[1].as_int == 0)
         best_mbmode.mode = ZEROMV;
 #endif  // CONFIG_EXT_INTER
@@ -8734,21 +8779,6 @@
       best_pred_diff[i] = best_rd - best_pred_rd[i];
   }
 
-  if (!x->skip) {
-    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
-      if (best_filter_rd[i] == INT64_MAX)
-        best_filter_diff[i] = 0;
-      else
-        best_filter_diff[i] = best_rd - best_filter_rd[i];
-    }
-    if (cm->interp_filter == SWITCHABLE)
-      assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
-  } else {
-    vp10_zero(best_filter_diff);
-  }
-
-  // TODO(yunqingwang): Moving this line in front of the above best_filter_diff
-  // updating code causes PSNR loss. Need to figure out the confliction.
   x->skip |= best_mode_skippable;
 
   if (!x->skip && !x->select_tx_size) {
@@ -8772,7 +8802,7 @@
   assert(best_mode_index >= 0);
 
   store_coding_context(x, ctx, best_mode_index, best_pred_diff,
-                       best_filter_diff, best_mode_skippable);
+                       best_mode_skippable);
 
   if (cm->allow_screen_content_tools && pmi->palette_size[1] > 0) {
     restore_uv_color_map(cpi, x);
@@ -8793,7 +8823,6 @@
   const int comp_pred = 0;
   int i;
   int64_t best_pred_diff[REFERENCE_MODES];
-  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
   vpx_prob comp_mode_p;
   INTERP_FILTER best_filter = SWITCHABLE;
@@ -8878,12 +8907,11 @@
                             cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
 
   vp10_zero(best_pred_diff);
-  vp10_zero(best_filter_diff);
 
   if (!x->select_tx_size)
     swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE);
   store_coding_context(x, ctx, THR_ZEROMV,
-                       best_pred_diff, best_filter_diff, 0);
+                       best_pred_diff, 0);
 }
 
 void vp10_rd_pick_inter_mode_sub8x8(struct VP10_COMP *cpi,
@@ -8923,8 +8951,6 @@
   int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
   int64_t best_pred_diff[REFERENCE_MODES];
   int64_t best_pred_rd[REFERENCE_MODES];
-  int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
-  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   MB_MODE_INFO best_mbmode;
   int ref_index, best_ref_index = 0;
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
@@ -8944,8 +8970,6 @@
   b_mode_info best_bmodes[4];
   int best_skip2 = 0;
   int ref_frame_skip_mask[2] = { 0 };
-  int64_t mask_filter = 0;
-  int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
   int internal_active_edge =
     vp10_active_edge_sb(cpi, mi_row, mi_col) && vp10_internal_image_edge(cpi);
 
@@ -8969,9 +8993,6 @@
   mbmi->use_wedge_interintra = 0;
 #endif  // CONFIG_EXT_INTER
 
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    filter_cache[i] = INT64_MAX;
-
   for (i = 0; i < 4; i++) {
     int j;
 #if CONFIG_EXT_INTER
@@ -8991,8 +9012,6 @@
 
   for (i = 0; i < REFERENCE_MODES; ++i)
     best_pred_rd[i] = INT64_MAX;
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-    best_filter_rd[i] = INT64_MAX;
   rate_uv_intra = INT_MAX;
 
   rd_cost->rate = INT_MAX;
@@ -9187,7 +9206,7 @@
     }
 
 #if CONFIG_VAR_TX
-    mbmi->inter_tx_size[0] = mbmi->tx_size;
+    mbmi->inter_tx_size[0][0] = mbmi->tx_size;
 #endif
 
     if (ref_frame == INTRA_FRAME) {
@@ -9250,8 +9269,6 @@
 #endif  // CONFIG_EXT_REFS
       this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
           rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh;
-      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-        filter_cache[i] = INT64_MAX;
 
       // TODO(any): Add search of the tx_type to improve rd performance at the
       // expense of speed.
@@ -9295,14 +9312,9 @@
               continue;
             rs = vp10_get_switchable_rate(cpi, xd);
             rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
-            filter_cache[switchable_filter_index] = tmp_rd;
-            filter_cache[SWITCHABLE_FILTERS] =
-                VPXMIN(filter_cache[SWITCHABLE_FILTERS], tmp_rd + rs_rd);
             if (cm->interp_filter == SWITCHABLE)
               tmp_rd += rs_rd;
 
-            mask_filter = VPXMAX(mask_filter, tmp_rd);
-
             newbest = (tmp_rd < tmp_best_rd);
             if (newbest) {
               tmp_best_filter = mbmi->interp_filter;
@@ -9476,8 +9488,6 @@
     if (!disable_skip && ref_frame == INTRA_FRAME) {
       for (i = 0; i < REFERENCE_MODES; ++i)
         best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd);
-      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-        best_filter_rd[i] = VPXMIN(best_filter_rd[i], this_rd);
     }
 
     // Did this mode help.. i.e. is it the new best mode
@@ -9572,29 +9582,6 @@
         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
     }
 
-    /* keep record of best filter type */
-    if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
-        cm->interp_filter != BILINEAR) {
-      int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
-                              SWITCHABLE_FILTERS : cm->interp_filter];
-      int64_t adj_rd;
-      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
-        if (ref == INT64_MAX)
-          adj_rd = 0;
-        else if (filter_cache[i] == INT64_MAX)
-          // when early termination is triggered, the encoder does not have
-          // access to the rate-distortion cost. it only knows that the cost
-          // should be above the maximum valid value. hence it takes the known
-          // maximum plus an arbitrary constant as the rate-distortion cost.
-          adj_rd = mask_filter - ref + 10;
-        else
-          adj_rd = filter_cache[i] - ref;
-
-        adj_rd += this_rd;
-        best_filter_rd[i] = VPXMIN(best_filter_rd[i], adj_rd);
-      }
-    }
-
     if (early_term)
       break;
 
@@ -9666,226 +9653,6 @@
       best_pred_diff[i] = best_rd - best_pred_rd[i];
   }
 
-  if (!x->skip) {
-    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
-      if (best_filter_rd[i] == INT64_MAX)
-        best_filter_diff[i] = 0;
-      else
-        best_filter_diff[i] = best_rd - best_filter_rd[i];
-    }
-    if (cm->interp_filter == SWITCHABLE)
-      assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
-  } else {
-    vp10_zero(best_filter_diff);
-  }
-
   store_coding_context(x, ctx, best_ref_index,
-                       best_pred_diff, best_filter_diff, 0);
+                       best_pred_diff, 0);
 }
-
-#if CONFIG_OBMC
-void vp10_build_prediction_by_above_preds(VP10_COMP *cpi,
-                                          MACROBLOCKD *xd,
-                                          int mi_row, int mi_col,
-                                          uint8_t *tmp_buf[MAX_MB_PLANE],
-                                          int tmp_stride[MAX_MB_PLANE]) {
-  VP10_COMMON *const cm = &cpi->common;
-  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-  int i, j, mi_step, ref;
-
-  if (mi_row == 0)
-    return;
-
-  for (i = 0; i < VPXMIN(xd->n8_w, cm->mi_cols - mi_col); i += mi_step) {
-    int mi_row_offset = -1;
-    int mi_col_offset = i;
-    int mi_x, mi_y, bw, bh;
-    MODE_INFO *above_mi = xd->mi[mi_col_offset +
-                                 mi_row_offset * xd->mi_stride];
-    MB_MODE_INFO *above_mbmi = &above_mi->mbmi;
-
-    mi_step = VPXMIN(xd->n8_w,
-                     num_8x8_blocks_wide_lookup[above_mbmi->sb_type]);
-
-    if (!is_neighbor_overlappable(above_mbmi))
-      continue;
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      struct macroblockd_plane *const pd = &xd->plane[j];
-      setup_pred_plane(&pd->dst,
-                       tmp_buf[j], tmp_stride[j],
-                       0, i, NULL,
-                       pd->subsampling_x, pd->subsampling_y);
-    }
-    /*
-    set_ref_ptrs(cm, xd, above_mbmi->ref_frame[0], above_mbmi->ref_frame[1]);
-    for (ref = 0; ref < 1 + has_second_ref(above_mbmi); ++ref) {
-      YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(
-          cpi, above_mbmi->ref_frame[ref]);
-      assert(cfg != NULL);
-      vp10_setup_pre_planes(xd, ref, cfg, mi_row, mi_col + i,
-                            &xd->block_refs[ref]->sf);
-    }
-    */
-    for (ref = 0; ref < 1 + has_second_ref(above_mbmi); ++ref) {
-      MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
-      RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
-
-      xd->block_refs[ref] = ref_buf;
-      if ((!vp10_is_valid_scale(&ref_buf->sf)))
-        vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
-                           "Reference frame has invalid dimensions");
-      vp10_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col + i,
-                            &ref_buf->sf);
-    }
-
-    xd->mb_to_left_edge   = -(((mi_col + i) * MI_SIZE) * 8);
-    mi_x = (mi_col + i) << MI_SIZE_LOG2;
-    mi_y = mi_row << MI_SIZE_LOG2;
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      const struct macroblockd_plane *pd = &xd->plane[j];
-      bw = (mi_step * 8) >> pd->subsampling_x;
-      bh = VPXMAX((num_4x4_blocks_high_lookup[bsize] * 2) >> pd->subsampling_y,
-                  4);
-
-      if (above_mbmi->sb_type < BLOCK_8X8) {
-        const PARTITION_TYPE bp = BLOCK_8X8 - above_mbmi->sb_type;
-        const int have_vsplit = bp != PARTITION_HORZ;
-        const int have_hsplit = bp != PARTITION_VERT;
-        const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
-        const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
-        const int pw = 8 >> (have_vsplit | pd->subsampling_x);
-        int x, y;
-
-        for (y = 0; y < num_4x4_h; ++y)
-          for (x = 0; x < num_4x4_w; ++x) {
-            if ((bp == PARTITION_HORZ || bp == PARTITION_SPLIT)
-                && y == 0 && !pd->subsampling_y)
-              continue;
-
-            build_inter_predictors(xd, j, mi_col_offset, mi_row_offset,
-                                   y * 2 + x, bw, bh,
-                                   4 * x, 0, pw, bh,
-#if CONFIG_SUPERTX && CONFIG_EXT_INTER
-                                   0, 0,
-#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
-                                   mi_x, mi_y);
-          }
-      } else {
-        build_inter_predictors(xd, j, mi_col_offset, mi_row_offset,
-                               0, bw, bh, 0, 0, bw, bh,
-#if CONFIG_SUPERTX && CONFIG_EXT_INTER
-                               0, 0,
-#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
-                               mi_x, mi_y);
-      }
-    }
-  }
-  xd->mb_to_left_edge   = -((mi_col * MI_SIZE) * 8);
-}
-
-void vp10_build_prediction_by_left_preds(VP10_COMP *cpi,
-                                         MACROBLOCKD *xd,
-                                         int mi_row, int mi_col,
-                                         uint8_t *tmp_buf[MAX_MB_PLANE],
-                                         int tmp_stride[MAX_MB_PLANE]) {
-  VP10_COMMON *const cm = &cpi->common;
-  const TileInfo *const tile = &xd->tile;
-  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-  int i, j, mi_step, ref;
-
-  if (mi_col == 0 || (mi_col - 1 < tile->mi_col_start) ||
-      (mi_col - 1) >= tile->mi_col_end)
-    return;
-
-  for (i = 0; i < VPXMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
-    int mi_row_offset = i;
-    int mi_col_offset = -1;
-    int mi_x, mi_y, bw, bh;
-    MODE_INFO *left_mi = xd->mi[mi_col_offset +
-                                mi_row_offset * xd->mi_stride];
-    MB_MODE_INFO *left_mbmi = &left_mi->mbmi;
-    const int is_compound = has_second_ref(left_mbmi);
-
-    mi_step = VPXMIN(xd->n8_h,
-                     num_8x8_blocks_high_lookup[left_mbmi->sb_type]);
-
-    if (!is_neighbor_overlappable(left_mbmi))
-      continue;
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      struct macroblockd_plane *const pd = &xd->plane[j];
-      setup_pred_plane(&pd->dst,
-                       tmp_buf[j], tmp_stride[j],
-                       i, 0, NULL,
-                       pd->subsampling_x, pd->subsampling_y);
-    }
-    /*
-    set_ref_ptrs(cm, xd, left_mbmi->ref_frame[0], left_mbmi->ref_frame[1]);
-    for (ref = 0; ref < 1 + has_second_ref(left_mbmi); ++ref) {
-      YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi,
-                                                     left_mbmi->ref_frame[ref]);
-      assert(cfg != NULL);
-      vp10_setup_pre_planes(xd, ref, cfg, mi_row + i, mi_col,
-                            &xd->block_refs[ref]->sf);
-    }
-    */
-    for (ref = 0; ref < 1 + is_compound; ++ref) {
-      MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref];
-      RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
-
-      xd->block_refs[ref] = ref_buf;
-      if ((!vp10_is_valid_scale(&ref_buf->sf)))
-        vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
-                           "Reference frame has invalid dimensions");
-      vp10_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + i, mi_col,
-                            &ref_buf->sf);
-    }
-
-    xd->mb_to_top_edge    = -(((mi_row + i) * MI_SIZE) * 8);
-    mi_x = mi_col << MI_SIZE_LOG2;
-    mi_y = (mi_row + i) << MI_SIZE_LOG2;
-
-    for (j = 0; j < MAX_MB_PLANE; ++j) {
-      const struct macroblockd_plane *pd = &xd->plane[j];
-      bw = VPXMAX((num_4x4_blocks_wide_lookup[bsize] * 2) >> pd->subsampling_x,
-                  4);
-      bh = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y;
-
-      if (left_mbmi->sb_type < BLOCK_8X8) {
-        const PARTITION_TYPE bp = BLOCK_8X8 - left_mbmi->sb_type;
-        const int have_vsplit = bp != PARTITION_HORZ;
-        const int have_hsplit = bp != PARTITION_VERT;
-        const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
-        const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
-        const int ph = 8 >> (have_hsplit | pd->subsampling_y);
-        int x, y;
-
-        for (y = 0; y < num_4x4_h; ++y)
-          for (x = 0; x < num_4x4_w; ++x) {
-            if ((bp == PARTITION_VERT || bp == PARTITION_SPLIT)
-                && x == 0 && !pd->subsampling_x)
-              continue;
-
-            build_inter_predictors(xd, j, mi_col_offset, mi_row_offset,
-                                   y * 2 + x, bw, bh,
-                                   0, 4 * y, bw, ph,
-#if CONFIG_SUPERTX && CONFIG_EXT_INTER
-                                   0, 0,
-#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
-                                   mi_x, mi_y);
-          }
-      } else {
-        build_inter_predictors(xd, j, mi_col_offset, mi_row_offset, 0,
-                               bw, bh, 0, 0, bw, bh,
-#if CONFIG_SUPERTX && CONFIG_EXT_INTER
-                               0, 0,
-#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
-                               mi_x, mi_y);
-      }
-    }
-  }
-  xd->mb_to_top_edge    = -((mi_row * MI_SIZE) * 8);
-}
-#endif  // CONFIG_OBMC
diff --git a/vp10/encoder/rdopt.h b/vp10/encoder/rdopt.h
index 174ad4d..ab57c1e 100644
--- a/vp10/encoder/rdopt.h
+++ b/vp10/encoder/rdopt.h
@@ -90,19 +90,6 @@
                                    int use_fast_coef_casting);
 #endif  // CONFIG_SUPERTX
 
-#if CONFIG_OBMC
-void vp10_build_prediction_by_above_preds(VP10_COMP *cpi,
-                                          MACROBLOCKD *xd,
-                                          int mi_row, int mi_col,
-                                          uint8_t *tmp_buf[MAX_MB_PLANE],
-                                          int tmp_stride[MAX_MB_PLANE]);
-void vp10_build_prediction_by_left_preds(VP10_COMP *cpi,
-                                         MACROBLOCKD *xd,
-                                         int mi_row, int mi_col,
-                                         uint8_t *tmp_buf[MAX_MB_PLANE],
-                                         int tmp_stride[MAX_MB_PLANE]);
-#endif  // CONFIG_OBMC
-
 static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref(VP10_COMP *cpi,
                                                           const int ref) {
   // Use up-sampled reference frames.
diff --git a/vp10/encoder/segmentation.c b/vp10/encoder/segmentation.c
index 969b87f..477e32d 100644
--- a/vp10/encoder/segmentation.c
+++ b/vp10/encoder/segmentation.c
@@ -164,15 +164,106 @@
                           int mi_row, int mi_col,
                           BLOCK_SIZE bsize) {
   const int mis = cm->mi_stride;
-  int bw, bh;
   const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2;
+#if CONFIG_EXT_PARTITION_TYPES
+  PARTITION_TYPE partition;
+#else
+  const int bw = num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type];
+  const int bh = num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type];
+#endif  // CONFIG_EXT_PARTITION_TYPES
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  bw = num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type];
-  bh = num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type];
 
+#if CONFIG_EXT_PARTITION_TYPES
+  if (bsize == BLOCK_8X8)
+    partition = PARTITION_NONE;
+  else
+    partition = get_partition(cm->mi, cm->mi_stride, cm->mi_rows, cm->mi_cols,
+                              mi_row, mi_col, bsize);
+  switch (partition) {
+    case PARTITION_NONE:
+      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, bs, bs, mi_row, mi_col);
+      break;
+    case PARTITION_HORZ:
+      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
+      count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+                 temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
+                 mi_row + hbs, mi_col);
+      break;
+    case PARTITION_VERT:
+      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
+      count_segs(cm, xd, tile, mi + hbs,
+                 no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, hbs, bs, mi_row, mi_col + hbs);
+      break;
+    case PARTITION_HORZ_A:
+      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, hbs, hbs, mi_row, mi_col);
+      count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
+                 temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+                 mi_row, mi_col + hbs);
+      count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+                 temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
+                 mi_row + hbs, mi_col);
+      break;
+    case PARTITION_HORZ_B:
+      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
+      count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+                 temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+                 mi_row + hbs, mi_col);
+      count_segs(cm, xd, tile, mi + hbs + hbs * mis, no_pred_segcounts,
+                 temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+                 mi_row + hbs, mi_col + hbs);
+      break;
+    case PARTITION_VERT_A:
+      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, hbs, hbs, mi_row, mi_col);
+      count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+                 temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+                 mi_row + hbs, mi_col);
+      count_segs(cm, xd, tile, mi + hbs,
+                 no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, hbs, bs, mi_row, mi_col + hbs);
+      break;
+    case PARTITION_VERT_B:
+      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
+      count_segs(cm, xd, tile, mi + hbs,
+                 no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, hbs, hbs, mi_row, mi_col + hbs);
+      count_segs(cm, xd, tile, mi + hbs + hbs * mis,
+                 no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, hbs, hbs, mi_row + hbs, mi_col + hbs);
+      break;
+    case PARTITION_SPLIT:
+      {
+        const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
+        int n;
+
+        assert(num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type] < bs &&
+               num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type] < bs);
+
+        for (n = 0; n < 4; n++) {
+          const int mi_dc = hbs * (n & 1);
+          const int mi_dr = hbs * (n >> 1);
+
+          count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc],
+                        no_pred_segcounts, temporal_predictor_count,
+                        t_unpred_seg_counts,
+                        mi_row + mi_dr, mi_col + mi_dc, subsize);
+        }
+      }
+      break;
+    default:
+      assert(0);
+  }
+#else
   if (bw == bs && bh == bs) {
     count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
                t_unpred_seg_counts, bs, bs, mi_row, mi_col);
@@ -204,6 +295,7 @@
                     mi_row + mi_dr, mi_col + mi_dc, subsize);
     }
   }
+#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
 void vp10_choose_segmap_coding_method(VP10_COMMON *cm, MACROBLOCKD *xd) {
diff --git a/vp10/encoder/speed_features.c b/vp10/encoder/speed_features.c
index 169ae2c..c50b949 100644
--- a/vp10/encoder/speed_features.c
+++ b/vp10/encoder/speed_features.c
@@ -199,9 +199,6 @@
     sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
     sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
     sf->adaptive_interp_filter_search = 1;
-#if CONFIG_EXT_TX
-    sf->tx_type_search = PRUNE_THREE;
-#endif
   }
 
   if (speed >= 4) {
diff --git a/vp10/encoder/speed_features.h b/vp10/encoder/speed_features.h
index 02ee204..ea4df6e 100644
--- a/vp10/encoder/speed_features.h
+++ b/vp10/encoder/speed_features.h
@@ -188,8 +188,6 @@
 #if CONFIG_EXT_TX
   // eliminates two tx types in each direction
   PRUNE_TWO = 2,
-  // eliminates three tx types in each direction
-  PRUNE_THREE = 3,
 #endif
 } TX_TYPE_SEARCH;
 
diff --git a/vp10/encoder/tokenize.c b/vp10/encoder/tokenize.c
index c71c985..822ccc9 100644
--- a/vp10/encoder/tokenize.c
+++ b/vp10/encoder/tokenize.c
@@ -565,11 +565,11 @@
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-  int blk_idx = (blk_row >> (1 - pd->subsampling_y)) * 8 +
-                (blk_col >> (1 - pd->subsampling_x));
-  TX_SIZE plane_tx_size = plane ?
-      get_uv_tx_size_impl(mbmi->inter_tx_size[blk_idx], bsize, 0, 0) :
-      mbmi->inter_tx_size[blk_idx];
+  const int tx_row = blk_row >> (1 - pd->subsampling_y);
+  const int tx_col = blk_col >> (1 - pd->subsampling_x);
+  const TX_SIZE plane_tx_size = plane ?
+      get_uv_tx_size_impl(mbmi->inter_tx_size[tx_row][tx_col], bsize, 0, 0) :
+      mbmi->inter_tx_size[tx_row][tx_col];
 
   int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
   int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
@@ -693,14 +693,6 @@
       vp10_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b,
                                               &arg);
       (*t)->token = EOSB_TOKEN;
-#if CONFIG_ANS
-      // TODO(aconverse): clip the number of bits in tokenize_b
-      // Smuggle TX_SIZE in the unused extrabits field so the ANS encoder
-      // knows the maximum number of extrabits to write at the end of the block
-      // (where it starts).
-      (*t)->extra = (EXTRABIT)(plane ? get_uv_tx_size(mbmi, &xd->plane[plane])
-                                     : mbmi->tx_size);
-#endif  // CONFIG_ANS
       (*t)++;
     }
   } else {
diff --git a/vp10/encoder/x86/dct_sse2.c b/vp10/encoder/x86/dct_sse2.c
index 8ff7c9c..8a55425 100644
--- a/vp10/encoder/x86/dct_sse2.c
+++ b/vp10/encoder/x86/dct_sse2.c
@@ -172,42 +172,6 @@
   transpose_4x4(in);
 }
 
-#if CONFIG_EXT_TX
-static void fdst4_sse2(__m128i *in) {
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t) cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i u[4], v[4];
-
-  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
-  u[1] = _mm_unpacklo_epi16(in[3], in[2]);
-
-  v[0] = _mm_add_epi16(u[0], u[1]);
-  v[1] = _mm_sub_epi16(u[0], u[1]);
-
-  u[0] = _mm_madd_epi16(v[0], k__cospi_p24_p08);
-  u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
-  u[2] = _mm_madd_epi16(v[0], k__cospi_p08_m24);
-  u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
-  in[0] = _mm_packs_epi32(u[0], u[2]);
-  in[1] = _mm_packs_epi32(u[1], u[3]);
-  transpose_4x4(in);
-}
-#endif  // CONFIG_EXT_TX
-
 void vp10_fht4x4_sse2(const int16_t *input, tran_low_t *output,
                      int stride, int tx_type) {
   __m128i in[4];
@@ -265,48 +229,6 @@
       fadst4_sse2(in);
       write_buffer_4x4(output, in);
       break;
-    case DST_DST:
-      load_buffer_4x4(input, in, stride, 0, 0);
-      fdst4_sse2(in);
-      fdst4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-    case DCT_DST:
-      load_buffer_4x4(input, in, stride, 0, 0);
-      fdct4_sse2(in);
-      fdst4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-    case DST_DCT:
-      load_buffer_4x4(input, in, stride, 0, 0);
-      fdst4_sse2(in);
-      fdct4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-    case DST_ADST:
-      load_buffer_4x4(input, in, stride, 0, 0);
-      fdst4_sse2(in);
-      fadst4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-    case ADST_DST:
-      load_buffer_4x4(input, in, stride, 0, 0);
-      fadst4_sse2(in);
-      fdst4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-    case DST_FLIPADST:
-      load_buffer_4x4(input, in, stride, 0, 1);
-      fdst4_sse2(in);
-      fadst4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
-    case FLIPADST_DST:
-      load_buffer_4x4(input, in, stride, 1, 0);
-      fadst4_sse2(in);
-      fdst4_sse2(in);
-      write_buffer_4x4(output, in);
-      break;
 #endif  // CONFIG_EXT_TX
    default:
      assert(0);
@@ -1288,155 +1210,6 @@
   array_transpose_8x8(in, in);
 }
 
-#if CONFIG_EXT_TX
-static void fdst8_sse2(__m128i *in) {
-  // Constants
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t) cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t) -cospi_16_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i t0, t1, t2, t3, t4, t5, t6, t7;
-
-  s0 = _mm_sub_epi16(in[0], in[7]);
-  s1 = _mm_sub_epi16(in[1], in[6]);  // -s1
-  s2 = _mm_sub_epi16(in[2], in[5]);
-  s3 = _mm_sub_epi16(in[3], in[4]);  // -s3
-  s4 = _mm_add_epi16(in[3], in[4]);  // -s4
-  s5 = _mm_add_epi16(in[2], in[5]);
-  s6 = _mm_add_epi16(in[1], in[6]);  // -s6
-  s7 = _mm_add_epi16(in[0], in[7]);
-
-  x0 = _mm_sub_epi16(s0, s3);
-  x1 = _mm_sub_epi16(s1, s2);  // -x1
-  x2 = _mm_add_epi16(s1, s2);  // -x2
-  x3 = _mm_add_epi16(s0, s3);
-
-  // Interleave
-  t0 = _mm_unpacklo_epi16(x0, x1);
-  t1 = _mm_unpackhi_epi16(x0, x1);
-  t2 = _mm_unpacklo_epi16(x2, x3);
-  t3 = _mm_unpackhi_epi16(x2, x3);
-
-  // Perform butterfly multiplication/addition
-  x0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
-  x1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
-  x2 = _mm_madd_epi16(t0, k__cospi_p16_p16);
-  x3 = _mm_madd_epi16(t1, k__cospi_p16_p16);
-  x4 = _mm_madd_epi16(t2, k__cospi_m24_p08);
-  x5 = _mm_madd_epi16(t3, k__cospi_m24_p08);
-  x6 = _mm_madd_epi16(t2, k__cospi_p08_p24);
-  x7 = _mm_madd_epi16(t3, k__cospi_p08_p24);
-
-  // Rounding
-  t0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
-  t1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
-  t2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
-  t3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
-  t4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
-  t5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
-  t6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
-  t7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);
-  // Shift
-  x0 = _mm_srai_epi32(t0, DCT_CONST_BITS);
-  x1 = _mm_srai_epi32(t1, DCT_CONST_BITS);
-  x2 = _mm_srai_epi32(t2, DCT_CONST_BITS);
-  x3 = _mm_srai_epi32(t3, DCT_CONST_BITS);
-  x4 = _mm_srai_epi32(t4, DCT_CONST_BITS);
-  x5 = _mm_srai_epi32(t5, DCT_CONST_BITS);
-  x6 = _mm_srai_epi32(t6, DCT_CONST_BITS);
-  x7 = _mm_srai_epi32(t7, DCT_CONST_BITS);
-
-  // Pack 32b integer to 16b with signed saturation
-  in[7] = _mm_packs_epi32(x0, x1);
-  in[5] = _mm_packs_epi32(x4, x5);
-  in[3] = _mm_packs_epi32(x2, x3);
-  in[1] = _mm_packs_epi32(x6, x7);
-
-  // Interleave
-  s0 = _mm_unpacklo_epi16(s6, s5);
-  s1 = _mm_unpackhi_epi16(s6, s5);
-
-  // Perform butterfly multiplication/addition
-  x0 = _mm_madd_epi16(s0, k__cospi_m16_m16);
-  x1 = _mm_madd_epi16(s1, k__cospi_m16_m16);
-  x2 = _mm_madd_epi16(s0, k__cospi_m16_p16);
-  x3 = _mm_madd_epi16(s1, k__cospi_m16_p16);
-
-  // Rounding
-  t0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
-  t1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
-  t2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
-  t3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
-
-  // Shift
-  x0 = _mm_srai_epi32(t0, DCT_CONST_BITS);
-  x1 = _mm_srai_epi32(t1, DCT_CONST_BITS);
-  x2 = _mm_srai_epi32(t2, DCT_CONST_BITS);
-  x3 = _mm_srai_epi32(t3, DCT_CONST_BITS);
-
-  // Pack 32b integer to 16b with signed saturation
-  t2 = _mm_packs_epi32(x0, x1);
-  t3 = _mm_packs_epi32(x2, x3);
-
-  x0 = _mm_sub_epi16(t2, s4);
-  x1 = _mm_add_epi16(t2, s4);  // -x1
-  x2 = _mm_sub_epi16(s7, t3);
-  x3 = _mm_add_epi16(s7, t3);
-
-  s0 = _mm_unpacklo_epi16(x0, x3);
-  s1 = _mm_unpackhi_epi16(x0, x3);
-  s2 = _mm_unpacklo_epi16(x1, x2);
-  s3 = _mm_unpackhi_epi16(x1, x2);
-
-  t0 = _mm_madd_epi16(s0, k__cospi_p28_p04);
-  t1 = _mm_madd_epi16(s1, k__cospi_p28_p04);
-  t2 = _mm_madd_epi16(s2, k__cospi_m12_p20);
-  t3 = _mm_madd_epi16(s3, k__cospi_m12_p20);
-  t4 = _mm_madd_epi16(s2, k__cospi_p20_p12);
-  t5 = _mm_madd_epi16(s3, k__cospi_p20_p12);
-  t6 = _mm_madd_epi16(s0, k__cospi_m04_p28);
-  t7 = _mm_madd_epi16(s1, k__cospi_m04_p28);
-
-  // Rounding
-  x0 = _mm_add_epi32(t0, k__DCT_CONST_ROUNDING);
-  x1 = _mm_add_epi32(t1, k__DCT_CONST_ROUNDING);
-  x2 = _mm_add_epi32(t2, k__DCT_CONST_ROUNDING);
-  x3 = _mm_add_epi32(t3, k__DCT_CONST_ROUNDING);
-  x4 = _mm_add_epi32(t4, k__DCT_CONST_ROUNDING);
-  x5 = _mm_add_epi32(t5, k__DCT_CONST_ROUNDING);
-  x6 = _mm_add_epi32(t6, k__DCT_CONST_ROUNDING);
-  x7 = _mm_add_epi32(t7, k__DCT_CONST_ROUNDING);
-  // Shift
-  s0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
-  s1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
-  s2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
-  s3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
-  s4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
-  s5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
-  s6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
-  s7 = _mm_srai_epi32(x7, DCT_CONST_BITS);
-
-  in[6] = _mm_packs_epi32(s0, s1);
-  in[4] = _mm_packs_epi32(s4, s5);
-  in[2] = _mm_packs_epi32(s2, s3);
-  in[0] = _mm_packs_epi32(s6, s7);
-
-  // coeffs: [x3 x2 x1 x0, x7 x6 x5 x4]
-  // Transpose
-  array_transpose_8x8(in, in);
-}
-#endif  // CONFIG_EXT_TX
-
 void vp10_fht8x8_sse2(const int16_t *input, tran_low_t *output,
                      int stride, int tx_type) {
   __m128i in[8];
@@ -1502,55 +1275,6 @@
       right_shift_8x8(in, 1);
       write_buffer_8x8(output, in, 8);
       break;
-    case DST_DST:
-      load_buffer_8x8(input, in, stride, 0, 0);
-      fdst8_sse2(in);
-      fdst8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-    case DCT_DST:
-      load_buffer_8x8(input, in, stride, 0, 0);
-      fdct8_sse2(in);
-      fdst8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-    case DST_DCT:
-      load_buffer_8x8(input, in, stride, 0, 0);
-      fdst8_sse2(in);
-      fdct8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-    case DST_ADST:
-      load_buffer_8x8(input, in, stride, 0, 0);
-      fdst8_sse2(in);
-      fadst8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-    case ADST_DST:
-      load_buffer_8x8(input, in, stride, 0, 0);
-      fadst8_sse2(in);
-      fdst8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-    case DST_FLIPADST:
-      load_buffer_8x8(input, in, stride, 0, 1);
-      fdst8_sse2(in);
-      fadst8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
-    case FLIPADST_DST:
-      load_buffer_8x8(input, in, stride, 1, 0);
-      fadst8_sse2(in);
-      fdst8_sse2(in);
-      right_shift_8x8(in, 1);
-      write_buffer_8x8(output, in, 8);
-      break;
 #endif  // CONFIG_EXT_TX
     default:
       assert(0);
@@ -2420,351 +2144,6 @@
   in[15] = _mm_sub_epi16(kZero, s[1]);
 }
 
-#if CONFIG_EXT_TX
-static void fdst16_8col(__m128i *in) {
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t) cospi_16_64);
-  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
-  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t) -cospi_16_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-
-  const __m128i k__cospi_m08_m24 = pair_set_epi16(-cospi_8_64, -cospi_24_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-
-  const __m128i k__cospi_m30_p02 = pair_set_epi16(-cospi_30_64, cospi_2_64);
-  const __m128i k__cospi_m14_p18 = pair_set_epi16(-cospi_14_64, cospi_18_64);
-  const __m128i k__cospi_m22_p10 = pair_set_epi16(-cospi_22_64, cospi_10_64);
-  const __m128i k__cospi_m06_p26 = pair_set_epi16(-cospi_6_64, cospi_26_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
-  __m128i x0, x1, x2, x3, t0, t1, t2, t3;
-  __m128i y0, y1, y2, y3, y4, y5, y6, y7;
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7;
-
-  //  (1)
-  u0 = _mm_sub_epi16(in[0], in[15]);
-  v7 = _mm_add_epi16(in[0], in[15]);
-
-  u1 = _mm_sub_epi16(in[1], in[14]);  // -u1
-  v6 = _mm_add_epi16(in[1], in[14]);  // -v6
-
-  u2 = _mm_sub_epi16(in[2], in[13]);
-  v5 = _mm_add_epi16(in[2], in[13]);
-
-  u3 = _mm_sub_epi16(in[3], in[12]);  // -u3
-  v4 = _mm_add_epi16(in[3], in[12]);  // -v4
-
-  u4 = _mm_sub_epi16(in[4], in[11]);
-  v3 = _mm_add_epi16(in[4], in[11]);
-
-  u5 = _mm_sub_epi16(in[5], in[10]);  // -u5
-  v2 = _mm_add_epi16(in[5], in[10]);  // -v2
-
-  u6 = _mm_sub_epi16(in[6], in[9]);
-  v1 = _mm_add_epi16(in[6], in[9]);
-
-  u7 = _mm_sub_epi16(in[7], in[8]);   // -u7
-  v0 = _mm_add_epi16(in[7], in[8]);   // -v0
-
-  s0 = _mm_sub_epi16(u0, u7);
-  s1 = _mm_sub_epi16(u1, u6);  // -s1
-  s2 = _mm_sub_epi16(u2, u5);
-  s3 = _mm_sub_epi16(u3, u4);  // -s3
-  s4 = _mm_add_epi16(u3, u4);  // -s4
-  s5 = _mm_add_epi16(u2, u5);
-  s6 = _mm_add_epi16(u1, u6);  // -s6
-  s7 = _mm_add_epi16(u0, u7);
-
-  x0 = _mm_sub_epi16(s0, s3);
-  x1 = _mm_sub_epi16(s1, s2);  // -x1
-  x2 = _mm_add_epi16(s1, s2);  // -x2
-  x3 = _mm_add_epi16(s0, s3);
-
-  y0 = _mm_unpacklo_epi16(x0, x1);
-  y1 = _mm_unpackhi_epi16(x0, x1);
-  y2 = _mm_unpacklo_epi16(x2, x3);
-  y3 = _mm_unpackhi_epi16(x2, x3);
-
-  t0 = _mm_madd_epi16(y0, k__cospi_p16_m16);
-  t1 = _mm_madd_epi16(y1, k__cospi_p16_m16);
-  t2 = _mm_madd_epi16(y0, k__cospi_p16_p16);
-  t3 = _mm_madd_epi16(y1, k__cospi_p16_p16);
-  x0 = _mm_madd_epi16(y2, k__cospi_m24_p08);
-  x1 = _mm_madd_epi16(y3, k__cospi_m24_p08);
-  x2 = _mm_madd_epi16(y2, k__cospi_p08_p24);
-  x3 = _mm_madd_epi16(y3, k__cospi_p08_p24);
-
-  y0 = _mm_add_epi32(t0, k__DCT_CONST_ROUNDING);
-  y1 = _mm_add_epi32(t1, k__DCT_CONST_ROUNDING);
-  y2 = _mm_add_epi32(t2, k__DCT_CONST_ROUNDING);
-  y3 = _mm_add_epi32(t3, k__DCT_CONST_ROUNDING);
-  y4 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
-  y5 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
-  y6 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
-  y7 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
-
-  t0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
-  t1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
-  t2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
-  t3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
-  x0 = _mm_srai_epi32(y4, DCT_CONST_BITS);
-  x1 = _mm_srai_epi32(y5, DCT_CONST_BITS);
-  x2 = _mm_srai_epi32(y6, DCT_CONST_BITS);
-  x3 = _mm_srai_epi32(y7, DCT_CONST_BITS);
-
-  in[15] = _mm_packs_epi32(t0, t1);
-  in[11] = _mm_packs_epi32(x0, x1);
-  in[7] = _mm_packs_epi32(t2, t3);
-  in[3] = _mm_packs_epi32(x2, x3);
-
-  //  (2)
-  t0 = _mm_unpacklo_epi16(s6, s5);
-  t1 = _mm_unpackhi_epi16(s6, s5);
-
-  y0 = _mm_madd_epi16(t0, k__cospi_m16_m16);
-  y1 = _mm_madd_epi16(t1, k__cospi_m16_m16);
-  y2 = _mm_madd_epi16(t0, k__cospi_m16_p16);
-  y3 = _mm_madd_epi16(t1, k__cospi_m16_p16);
-
-  x0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
-  x1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
-  x2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
-  x3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
-
-  y4 = _mm_srai_epi32(x0, DCT_CONST_BITS);
-  y5 = _mm_srai_epi32(x1, DCT_CONST_BITS);
-  y6 = _mm_srai_epi32(x2, DCT_CONST_BITS);
-  y7 = _mm_srai_epi32(x3, DCT_CONST_BITS);
-
-  t2 = _mm_packs_epi32(y4, y5);
-  t3 = _mm_packs_epi32(y6, y7);
-
-  x0 = _mm_sub_epi16(s4, t2);  // -x0
-  x1 = _mm_add_epi16(s4, t2);  // -x1
-  x2 = _mm_sub_epi16(s7, t3);
-  x3 = _mm_add_epi16(s7, t3);
-
-  y0 = _mm_unpacklo_epi16(x0, x3);
-  y1 = _mm_unpackhi_epi16(x0, x3);
-  y2 = _mm_unpacklo_epi16(x1, x2);
-  y3 = _mm_unpackhi_epi16(x1, x2);
-
-  w0 = _mm_madd_epi16(y0, k__cospi_m28_p04);
-  w1 = _mm_madd_epi16(y1, k__cospi_m28_p04);
-  w2 = _mm_madd_epi16(y2, k__cospi_m12_p20);
-  w3 = _mm_madd_epi16(y3, k__cospi_m12_p20);
-  w4 = _mm_madd_epi16(y2, k__cospi_p20_p12);
-  w5 = _mm_madd_epi16(y3, k__cospi_p20_p12);
-  w6 = _mm_madd_epi16(y0, k__cospi_p04_p28);
-  w7 = _mm_madd_epi16(y1, k__cospi_p04_p28);
-
-  u0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
-  y0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  y1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  y2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  y3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  y4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  y5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  y6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  y7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  in[13] = _mm_packs_epi32(y0, y1);
-  in[9] = _mm_packs_epi32(y4, y5);
-  in[5] = _mm_packs_epi32(y2, y3);
-  in[1] = _mm_packs_epi32(y6, y7);
-
-  //  (3)
-  y0 = _mm_unpacklo_epi16(v5, v2);
-  y1 = _mm_unpackhi_epi16(v5, v2);
-  y2 = _mm_unpacklo_epi16(v4, v3);
-  y3 = _mm_unpackhi_epi16(v4, v3);
-
-  u0 = _mm_madd_epi16(y0, k__cospi_p16_p16);
-  u1 = _mm_madd_epi16(y1, k__cospi_p16_p16);
-  u2 = _mm_madd_epi16(y2, k__cospi_m16_m16);
-  u3 = _mm_madd_epi16(y3, k__cospi_m16_m16);
-  u4 = _mm_madd_epi16(y2, k__cospi_m16_p16);
-  u5 = _mm_madd_epi16(y3, k__cospi_m16_p16);
-  u6 = _mm_madd_epi16(y0, k__cospi_p16_m16);
-  u7 = _mm_madd_epi16(y1, k__cospi_p16_m16);
-
-  w0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-  w1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-  w2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-  w3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-  w4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-  w5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-  w6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-  w7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-
-  s0 = _mm_srai_epi32(w0, DCT_CONST_BITS);
-  s1 = _mm_srai_epi32(w1, DCT_CONST_BITS);
-  s2 = _mm_srai_epi32(w2, DCT_CONST_BITS);
-  s3 = _mm_srai_epi32(w3, DCT_CONST_BITS);
-  s4 = _mm_srai_epi32(w4, DCT_CONST_BITS);
-  s5 = _mm_srai_epi32(w5, DCT_CONST_BITS);
-  s6 = _mm_srai_epi32(w6, DCT_CONST_BITS);
-  s7 = _mm_srai_epi32(w7, DCT_CONST_BITS);
-
-  y2 = _mm_packs_epi32(s0, s1);
-  y3 = _mm_packs_epi32(s2, s3);
-  y4 = _mm_packs_epi32(s4, s5);
-  y5 = _mm_packs_epi32(s6, s7);
-
-  //  step 3
-  w0 = _mm_sub_epi16(v0, y3);  // -w0
-  w1 = _mm_add_epi16(v1, y2);
-  w2 = _mm_sub_epi16(v1, y2);
-  w3 = _mm_add_epi16(v0, y3);  // -w3
-  w4 = _mm_sub_epi16(v7, y4);
-  w5 = _mm_add_epi16(v6, y5);  // -w5
-  w6 = _mm_sub_epi16(v6, y5);  // -w6
-  w7 = _mm_add_epi16(v7, y4);
-
-  //  step 4
-  x0 = _mm_unpacklo_epi16(w1, w6);
-  x1 = _mm_unpackhi_epi16(w1, w6);
-  x2 = _mm_unpacklo_epi16(w2, w5);
-  x3 = _mm_unpackhi_epi16(w2, w5);
-
-  u0 = _mm_madd_epi16(x0, k__cospi_m08_m24);
-  u1 = _mm_madd_epi16(x1, k__cospi_m08_m24);
-  u2 = _mm_madd_epi16(x2, k__cospi_p24_m08);
-  u3 = _mm_madd_epi16(x3, k__cospi_p24_m08);
-  u4 = _mm_madd_epi16(x2, k__cospi_p08_p24);
-  u5 = _mm_madd_epi16(x3, k__cospi_p08_p24);
-  u6 = _mm_madd_epi16(x0, k__cospi_p24_m08);
-  u7 = _mm_madd_epi16(x1, k__cospi_p24_m08);
-
-  s0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-  s1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-  s2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-  s3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-  s4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-  s5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-  s6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-  s7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(s0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(s1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(s2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(s3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(s4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(s5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(s6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(s7, DCT_CONST_BITS);
-
-  y1 = _mm_packs_epi32(u0, u1);
-  y2 = _mm_packs_epi32(u2, u3);
-  y5 = _mm_packs_epi32(u4, u5);
-  y6 = _mm_packs_epi32(u6, u7);
-
-  //  step 5
-  v0 = _mm_sub_epi16(w0, y1);  // -v0
-  v1 = _mm_add_epi16(w0, y1);  // -v1
-  v2 = _mm_sub_epi16(w3, y2);  // -v2
-  v3 = _mm_add_epi16(w3, y2);  // -v3
-  v4 = _mm_sub_epi16(w4, y5);
-  v5 = _mm_add_epi16(w4, y5);
-  v6 = _mm_sub_epi16(w7, y6);
-  v7 = _mm_add_epi16(w7, y6);
-
-  u0 = _mm_unpacklo_epi16(v0, v7);
-  u1 = _mm_unpackhi_epi16(v0, v7);
-  u2 = _mm_unpacklo_epi16(v1, v6);
-  u3 = _mm_unpackhi_epi16(v1, v6);
-  u4 = _mm_unpacklo_epi16(v2, v5);
-  u5 = _mm_unpackhi_epi16(v2, v5);
-  u6 = _mm_unpacklo_epi16(v3, v4);
-  u7 = _mm_unpackhi_epi16(v3, v4);
-
-  s0 = _mm_madd_epi16(u0, k__cospi_m30_p02);  // x0
-  s1 = _mm_madd_epi16(u1, k__cospi_m30_p02);
-  s2 = _mm_madd_epi16(u2, k__cospi_m14_p18);  // x1
-  s3 = _mm_madd_epi16(u3, k__cospi_m14_p18);
-  s4 = _mm_madd_epi16(u4, k__cospi_m22_p10);  // x2
-  s5 = _mm_madd_epi16(u5, k__cospi_m22_p10);
-  s6 = _mm_madd_epi16(u6, k__cospi_m06_p26);  // x3
-  s7 = _mm_madd_epi16(u7, k__cospi_m06_p26);
-
-  w0 = _mm_madd_epi16(u6, k__cospi_p26_p06);  // x4
-  w1 = _mm_madd_epi16(u7, k__cospi_p26_p06);
-  w2 = _mm_madd_epi16(u4, k__cospi_p10_p22);  // x5
-  w3 = _mm_madd_epi16(u5, k__cospi_p10_p22);
-  w4 = _mm_madd_epi16(u2, k__cospi_p18_p14);  // x6
-  w5 = _mm_madd_epi16(u3, k__cospi_p18_p14);
-  w6 = _mm_madd_epi16(u0, k__cospi_p02_p30);  // x7
-  w7 = _mm_madd_epi16(u1, k__cospi_p02_p30);
-
-  v0 = _mm_add_epi32(s0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(s1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(s2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(s3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(s4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(s5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(s6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(s7, k__DCT_CONST_ROUNDING);
-
-  y0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  y1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  y2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  y3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  y4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  y5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  y6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  y7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  s0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
-  s1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
-  s2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
-  s3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
-  s4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
-  s5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
-  s6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
-  s7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
-
-  in[14] = _mm_packs_epi32(u0, u1);
-  in[6] = _mm_packs_epi32(u2, u3);
-  in[10] = _mm_packs_epi32(u4, u5);
-  in[2] = _mm_packs_epi32(u6, u7);
-  in[12] = _mm_packs_epi32(s0, s1);
-  in[4] = _mm_packs_epi32(s2, s3);
-  in[8] = _mm_packs_epi32(s4, s5);
-  in[0] = _mm_packs_epi32(s6, s7);
-}
-#endif  // CONFIG_EXT_TX
-
 static void fdct16_sse2(__m128i *in0, __m128i *in1) {
   fdct16_8col(in0);
   fdct16_8col(in1);
@@ -2777,14 +2156,6 @@
   array_transpose_16x16(in0, in1);
 }
 
-#if CONFIG_EXT_TX
-static void fdst16_sse2(__m128i *in0, __m128i *in1) {
-  fdst16_8col(in0);
-  fdst16_8col(in1);
-  array_transpose_16x16(in0, in1);
-}
-#endif  // CONFIG_EXT_TX
-
 void vp10_fht16x16_sse2(const int16_t *input, tran_low_t *output,
                        int stride, int tx_type) {
   __m128i in0[16], in1[16];
@@ -2850,55 +2221,6 @@
       fadst16_sse2(in0, in1);
       write_buffer_16x16(output, in0, in1, 16);
       break;
-    case DST_DST:
-      load_buffer_16x16(input, in0, in1, stride, 0, 0);
-      fdst16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fdst16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case DCT_DST:
-      load_buffer_16x16(input, in0, in1, stride, 0, 0);
-      fdct16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fdst16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case DST_DCT:
-      load_buffer_16x16(input, in0, in1, stride, 0, 0);
-      fdst16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fdct16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case DST_ADST:
-      load_buffer_16x16(input, in0, in1, stride, 0, 0);
-      fdst16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fadst16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case ADST_DST:
-      load_buffer_16x16(input, in0, in1, stride, 0, 0);
-      fadst16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fdst16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case DST_FLIPADST:
-      load_buffer_16x16(input, in0, in1, stride, 0, 1);
-      fdst16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fadst16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
-    case FLIPADST_DST:
-      load_buffer_16x16(input, in0, in1, stride, 1, 0);
-      fadst16_sse2(in0, in1);
-      right_shift_16x16(in0, in1);
-      fdst16_sse2(in0, in1);
-      write_buffer_16x16(output, in0, in1, 16);
-      break;
 #endif  // CONFIG_EXT_TX
     default:
       assert(0);
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index ee1e305..24f42df 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -651,6 +651,44 @@
     ref += ref_stride;
   }
 }
+
+void vpx_highbd_upsampled_pred_c(uint16_t *comp_pred,
+                                 int width, int height,
+                                 const uint8_t *ref8,
+                                 int ref_stride) {
+  int i, j;
+  int stride = ref_stride << 3;
+
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      comp_pred[j] = ref[(j << 3)];
+    }
+    comp_pred += width;
+    ref += stride;
+  }
+}
+
+void vpx_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred,
+                                          const uint8_t *pred8,
+                                          int width, int height,
+                                          const uint8_t *ref8,
+                                          int ref_stride) {
+  int i, j;
+  int stride = ref_stride << 3;
+
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      const int tmp = pred[j] + ref[(j << 3)];
+      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += stride;
+  }
+}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_VP10 && CONFIG_EXT_INTER
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index e5c002a..ced7009 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1238,6 +1238,13 @@
 add_proto qw/void vpx_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
 specialize qw/vpx_comp_avg_upsampled_pred sse2/;
 
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vpx_highbd_upsampled_pred/, "uint16_t *comp_pred, int width, int height, const uint8_t *ref8, int ref_stride";
+  specialize qw/vpx_highbd_upsampled_pred sse2/;
+  add_proto qw/void vpx_highbd_comp_avg_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+  specialize qw/vpx_highbd_comp_avg_upsampled_pred sse2/;
+}
+
 #
 # ...
 #
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index 81ec5db..e2b79bf 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -7,7 +7,11 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+
+#include <emmintrin.h>  // SSE2
+
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
 #include "vpx_ports/mem.h"
 
@@ -591,3 +595,136 @@
 #undef FNS
 #undef FN
 #endif  // CONFIG_USE_X86INC
+
+void vpx_highbd_upsampled_pred_sse2(uint16_t *comp_pred,
+                                    int width, int height,
+                                    const uint8_t *ref8,
+                                    int ref_stride) {
+  int i, j;
+  int stride = ref_stride << 3;
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+  if (width >= 8) {
+    // read 8 points at one time
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j+= 8) {
+        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
+        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
+        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
+        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
+        __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32));
+        __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40));
+        __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48));
+        __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56));
+        __m128i t0, t1, t2, t3;
+
+        t0 = _mm_unpacklo_epi16(s0, s1);
+        t1 = _mm_unpacklo_epi16(s2, s3);
+        t2 = _mm_unpacklo_epi16(s4, s5);
+        t3 = _mm_unpacklo_epi16(s6, s7);
+        t0 = _mm_unpacklo_epi32(t0, t1);
+        t2 = _mm_unpacklo_epi32(t2, t3);
+        t0 = _mm_unpacklo_epi64(t0, t2);
+
+        _mm_storeu_si128((__m128i *)(comp_pred), t0);
+        comp_pred += 8;
+        ref += 64;                            // 8 * 8;
+      }
+      ref += stride - (width << 3);
+    }
+  } else {
+    // read 4 points at one time
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j+= 4) {
+        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
+        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
+        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
+        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
+        __m128i t0, t1;
+
+        t0 = _mm_unpacklo_epi16(s0, s1);
+        t1 = _mm_unpacklo_epi16(s2, s3);
+        t0 = _mm_unpacklo_epi32(t0, t1);
+
+        _mm_storel_epi64((__m128i *)(comp_pred), t0);
+        comp_pred += 4;
+        ref += 4 * 8;
+      }
+      ref += stride - (width << 3);
+    }
+  }
+}
+
+void vpx_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred,
+                                             const uint8_t *pred8,
+                                             int width, int height,
+                                             const uint8_t *ref8,
+                                             int ref_stride) {
+  const __m128i one = _mm_set1_epi16(1);
+  int i, j;
+  int stride = ref_stride << 3;
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+  if (width >= 8) {
+    // read 8 points at one time
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j+= 8) {
+        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
+        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
+        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
+        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
+        __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32));
+        __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40));
+        __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48));
+        __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56));
+        __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
+        __m128i t0, t1, t2, t3;
+
+        t0 = _mm_unpacklo_epi16(s0, s1);
+        t1 = _mm_unpacklo_epi16(s2, s3);
+        t2 = _mm_unpacklo_epi16(s4, s5);
+        t3 = _mm_unpacklo_epi16(s6, s7);
+        t0 = _mm_unpacklo_epi32(t0, t1);
+        t2 = _mm_unpacklo_epi32(t2, t3);
+        t0 = _mm_unpacklo_epi64(t0, t2);
+
+        p0 = _mm_adds_epu16(t0, p0);
+        p0 = _mm_adds_epu16(p0, one);
+        p0 = _mm_srli_epi16(p0, 1);
+
+        _mm_storeu_si128((__m128i *)(comp_pred), p0);
+        comp_pred += 8;
+        pred += 8;
+        ref += 8 * 8;
+      }
+      ref += stride - (width << 3);
+    }
+  } else {
+    // read 4 points at one time
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j+= 4) {
+        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
+        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
+        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
+        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
+        __m128i p0 = _mm_loadl_epi64((const __m128i *)pred);
+        __m128i t0, t1;
+
+        t0 = _mm_unpacklo_epi16(s0, s1);
+        t1 = _mm_unpacklo_epi16(s2, s3);
+        t0 = _mm_unpacklo_epi32(t0, t1);
+
+        p0 = _mm_adds_epu16(t0, p0);
+        p0 = _mm_adds_epu16(p0, one);
+        p0 = _mm_srli_epi16(p0, 1);
+
+        _mm_storel_epi64((__m128i *)(comp_pred), p0);
+        comp_pred += 4;
+        pred += 4;
+        ref += 4 * 8;
+      }
+      ref += stride - (width << 3);
+    }
+  }
+}
diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c
index 63fc1e6..dc51173 100644
--- a/vpx_dsp/x86/variance_sse2.c
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -509,12 +509,11 @@
           s2 = _mm_unpacklo_epi8(t1, s3);
           s4 = _mm_unpacklo_epi8(t2, s5);
           s6 = _mm_unpacklo_epi8(t3, s7);
+          s0 = _mm_unpacklo_epi32(s0, s2);
+          s4 = _mm_unpacklo_epi32(s4, s6);
+          s0 = _mm_unpacklo_epi64(s0, s4);
 
-          *(int *)comp_pred = _mm_cvtsi128_si32(s0);
-          *(int *)(comp_pred + 4) = _mm_cvtsi128_si32(s2);
-          *(int *)(comp_pred + 8) = _mm_cvtsi128_si32(s4);
-          *(int *)(comp_pred + 12) = _mm_cvtsi128_si32(s6);
-
+          _mm_storeu_si128((__m128i *)(comp_pred), s0);
           comp_pred += 16;
           ref += 16 * 8;
         }
@@ -537,9 +536,9 @@
 
           s0 = _mm_unpacklo_epi8(t0, s1);
           s2 = _mm_unpacklo_epi8(t1, s3);
+          s0 = _mm_unpacklo_epi32(s0, s2);
 
-          *(int *)comp_pred = _mm_cvtsi128_si32(s0);
-          *(int *)(comp_pred + 4) = _mm_cvtsi128_si32(s2);
+          _mm_storel_epi64((__m128i *)(comp_pred), s0);
           comp_pred += 8;
           ref += 8 * 8;
         }
@@ -558,7 +557,6 @@
           s0 = _mm_unpacklo_epi8(t0, s1);
 
           *(int *)comp_pred = _mm_cvtsi128_si32(s0);
-
           comp_pred += 4;
           ref += 4 * 8;
         }
@@ -621,14 +619,7 @@
           p1 = _mm_srli_epi16(p1, 1);
           p0 = _mm_packus_epi16(p0, p1);
 
-          *(int *)comp_pred = _mm_cvtsi128_si32(p0);
-          p0 = _mm_srli_si128(p0, 4);
-          *(int *)(comp_pred + 4) = _mm_cvtsi128_si32(p0);
-          p0 = _mm_srli_si128(p0, 4);
-          *(int *)(comp_pred + 8) = _mm_cvtsi128_si32(p0);
-          p0 = _mm_srli_si128(p0, 4);
-          *(int *)(comp_pred + 12) = _mm_cvtsi128_si32(p0);
-
+          _mm_storeu_si128((__m128i *)(comp_pred), p0);
           comp_pred += 16;
           pred += 16;
           ref += 16 * 8;
@@ -662,10 +653,7 @@
           p0 = _mm_srli_epi16(p0, 1);
           p0 = _mm_packus_epi16(p0, zero);
 
-          *(int *)comp_pred = _mm_cvtsi128_si32(p0);
-          p0 = _mm_srli_si128(p0, 4);
-          *(int *)(comp_pred + 4) = _mm_cvtsi128_si32(p0);
-
+          _mm_storel_epi64((__m128i *)(comp_pred), p0);
           comp_pred += 8;
           pred += 8;
           ref += 8 * 8;
@@ -693,7 +681,6 @@
           p0 = _mm_packus_epi16(p0, zero);
 
           *(int *)comp_pred = _mm_cvtsi128_si32(p0);
-
           comp_pred += 4;
           pred += 4;
           ref += 4 * 8;