Merge branch 'master' into nextgenv2 Manually resolved the following conflicts: vp10/common/blockd.h vp10/common/entropy.h vp10/common/entropymode.c vp10/common/entropymode.h vp10/common/enums.h vp10/common/thread_common.c vp10/decoder/decodeframe.c vp10/decoder/decodemv.c vp10/encoder/bitstream.c vp10/encoder/encodeframe.c vp10/encoder/rd.c vp10/encoder/rdopt.c Change-Id: I15d20ce5292b70f0c2b4ba55c1f1318181481596

diff --git a/configure b/configure
index c3c0f40..29b1da8 100755
--- a/configure
+++ b/configure

@@ -272,7 +272,15 @@
     spatial_svc
     fp_mb_stats
     emulate_hardware
+    var_tx
+    ref_mv
+    ext_tx
     misc_fixes
+    ext_intra
+    ext_inter
+    ext_interp
+    ext_refs
+    supertx
 "
 CONFIG_LIST="
     dependency_tracking

diff --git a/test/test.mk b/test/test.mk
index 80b57e5..471f870 100644
--- a/test/test.mk
+++ b/test/test.mk

@@ -175,6 +175,11 @@
 endif
 
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_txfm_test.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_fwd_txfm1d_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm1d_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_fwd_txfm2d_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm2d_test.cc
 
 TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
 TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c

diff --git a/test/vp10_fwd_txfm1d_test.cc b/test/vp10_fwd_txfm1d_test.cc
new file mode 100644
index 0000000..a39e0ef
--- /dev/null
+++ b/test/vp10_fwd_txfm1d_test.cc

@@ -0,0 +1,130 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "test/vp10_txfm_test.h"
+#include "vp10/common/vp10_fwd_txfm1d.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+static int txfm_type_num = 2;  // number of entries in txfm_type_ls
+static TYPE_TXFM txfm_type_ls[2] = {TYPE_DCT, TYPE_ADST};
+
+static int txfm_size_num = 4;  // number of entries in txfm_size_ls
+static int txfm_size_ls[4] = {4, 8, 16, 32};
+
+static TxfmFunc fwd_txfm_func_ls[2][4] = {  // indexed [type index][size index]
+    {vp10_fdct4_new, vp10_fdct8_new, vp10_fdct16_new, vp10_fdct32_new},
+    {vp10_fadst4_new, vp10_fadst8_new, vp10_fadst16_new, vp10_fadst32_new}};
+
+// The maximum stage count of the fwd/inv 1-D DCT/ADST transforms is 12.
+static int8_t cos_bit[12] = {14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
+static int8_t range_bit[12] = {32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
+
+TEST(vp10_fwd_txfm1d, round_shift) {  // pins round_shift() for +/- inputs
+  EXPECT_EQ(round_shift(7, 1), 3);
+  EXPECT_EQ(round_shift(-7, 1), -3);
+
+  EXPECT_EQ(round_shift(7, 2), 2);
+  EXPECT_EQ(round_shift(-7, 2), -2);
+
+  EXPECT_EQ(round_shift(8, 2), 2);  // 8 is an exact multiple of 1 << 2
+  EXPECT_EQ(round_shift(-8, 2), -2);
+}
+
+TEST(vp10_fwd_txfm1d, get_max_bit) {  // get_max_bit(8) is expected to be 3
+  int max_bit = get_max_bit(8);
+  EXPECT_EQ(max_bit, 3);
+}
+
+TEST(vp10_fwd_txfm1d, half_btf) {  // half_btf vs a 64-bit sum of products
+  int32_t max = (1 << 15) - 1;  // 32767; 2 * max * max still fits in int32_t
+  int32_t w0 = max;
+  int32_t in0 = max;
+  int32_t w1 = max;
+  int32_t in1 = max;
+  int32_t result_32 = half_btf(w0, in0, w1, in1, 0);  // last argument is 0
+  int64_t result_64 = (int64_t)w0 * (int64_t)in0 + (int64_t)w1 * (int64_t)in1;
+  EXPECT_EQ(result_32, result_64);
+}
+
+TEST(vp10_fwd_txfm1d, cospi_arr) {  // table entries vs cos() recomputed here
+  for (int i = 0; i < 7; i++) {  // row i is scaled by 1 << (cos_bit_min + i)
+    for (int j = 0; j < 64; j++) {  // column j is the angle pi * j / 128
+      EXPECT_EQ(cospi_arr[i][j],
+                (int32_t)round(cos(M_PI * j / 128) * (1 << (cos_bit_min + i))));
+    }
+  }
+}
+
+TEST(vp10_fwd_txfm1d, clamp_block) {  // clamps a 3x3 window inside a 5x5 block
+  int16_t block[5][5] = {{7, -5, 6, -3, 9},
+                         {7, -5, 6, -3, 9},
+                         {7, -5, 6, -3, 9},
+                         {7, -5, 6, -3, 9},
+                         {7, -5, 6, -3, 9}};
+
+  int16_t ref_block[5][5] = {{7, -5, 6, -3, 9},  // rows 0-1 must stay untouched
+                             {7, -5, 6, -3, 9},
+                             {7, -4, 2, -3, 9},  // cols 1-3 limited to [-4, 2]
+                             {7, -4, 2, -3, 9},
+                             {7, -4, 2, -3, 9}};
+
+  int row = 2;
+  int col = 1;
+  int block_size = 3;
+  int stride = 5;
+  clamp_block(block[row] + col, block_size, stride, -4, 2);
+  for (int r = 0; r < stride; r++) {  // compare the whole 5x5 array
+    for (int c = 0; c < stride; c++) {
+      EXPECT_EQ(block[r][c], ref_block[r][c]);
+    }
+  }
+}
+
+TEST(vp10_fwd_txfm1d, accuracy) {  // 1-D forward txfm vs the double reference
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  for (int si = 0; si < txfm_size_num; ++si) {  // every transform size
+    int txfm_size = txfm_size_ls[si];
+    int32_t *input = new int32_t[txfm_size];
+    int32_t *output = new int32_t[txfm_size];
+    double *ref_input = new double[txfm_size];
+    double *ref_output = new double[txfm_size];
+
+    for (int ti = 0; ti < txfm_type_num; ++ti) {  // every transform type
+      TYPE_TXFM txfm_type = txfm_type_ls[ti];
+      TxfmFunc fwd_txfm_func = fwd_txfm_func_ls[ti][si];
+      int max_error = 7;  // allowed per-coefficient difference
+
+      const int count_test_block = 5000;
+      for (int ci = 0; ci < count_test_block; ++ci) {  // ci: does not shadow ti
+        for (int ni = 0; ni < txfm_size; ++ni) {
+          input[ni] = rnd.Rand16() % base - rnd.Rand16() % base;  // (-base, base)
+          ref_input[ni] = static_cast<double>(input[ni]);
+        }
+
+        fwd_txfm_func(input, output, cos_bit, range_bit);
+        reference_hybrid_1d(ref_input, ref_output, txfm_size, txfm_type);
+
+        for (int ni = 0; ni < txfm_size; ++ni) {
+          EXPECT_LE(
+              abs(output[ni] - static_cast<int32_t>(round(ref_output[ni]))),
+              max_error);
+        }
+      }
+    }
+
+    delete[] input;
+    delete[] output;
+    delete[] ref_input;
+    delete[] ref_output;
+  }
+}
+}  // namespace

diff --git a/test/vp10_fwd_txfm2d_test.cc b/test/vp10_fwd_txfm2d_test.cc
new file mode 100644
index 0000000..e6416cc
--- /dev/null
+++ b/test/vp10_fwd_txfm2d_test.cc

@@ -0,0 +1,104 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "test/vp10_txfm_test.h"
+#include "vp10/common/vp10_fwd_txfm2d.h"
+#include "vp10/common/vp10_fwd_txfm2d_cfg.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+
+const int txfm_size_num = 4;  // number of entries in txfm_size_ls
+const int txfm_size_ls[4] = {4, 8, 16, 32};
+const TXFM_2D_CFG fwd_txfm_cfg_ls[4][4] = {  // indexed [size index][type index]
+    {fwd_txfm_2d_cfg_dct_dct_4, fwd_txfm_2d_cfg_dct_adst_4,
+     fwd_txfm_2d_cfg_adst_adst_4, fwd_txfm_2d_cfg_adst_dct_4},
+    {fwd_txfm_2d_cfg_dct_dct_8, fwd_txfm_2d_cfg_dct_adst_8,
+     fwd_txfm_2d_cfg_adst_adst_8, fwd_txfm_2d_cfg_adst_dct_8},
+    {fwd_txfm_2d_cfg_dct_dct_16, fwd_txfm_2d_cfg_dct_adst_16,
+     fwd_txfm_2d_cfg_adst_adst_16, fwd_txfm_2d_cfg_adst_dct_16},
+    {fwd_txfm_2d_cfg_dct_dct_32, fwd_txfm_2d_cfg_dct_adst_32,
+     fwd_txfm_2d_cfg_adst_adst_32, fwd_txfm_2d_cfg_adst_dct_32}};
+
+const Fwd_Txfm2d_Func fwd_txfm_func_ls[4] = {  // indexed by size index
+    vp10_fwd_txfm2d_4x4, vp10_fwd_txfm2d_8x8, vp10_fwd_txfm2d_16x16,
+    vp10_fwd_txfm2d_32x32};
+
+const int txfm_type_num = 4;  // number of entries in type_ls_0 / type_ls_1
+const TYPE_TXFM type_ls_0[4] = {TYPE_DCT, TYPE_DCT, TYPE_ADST, TYPE_ADST};
+const TYPE_TXFM type_ls_1[4] = {TYPE_DCT, TYPE_ADST, TYPE_ADST, TYPE_DCT};
+
+TEST(vp10_fwd_txfm2d, accuracy) {  // 2-D forward txfm vs the double reference
+  for (int txfm_size_idx = 0; txfm_size_idx < txfm_size_num; ++txfm_size_idx) {
+    int txfm_size = txfm_size_ls[txfm_size_idx];
+    int sqr_txfm_size = txfm_size * txfm_size;  // element count of one block
+    int16_t* input = new int16_t[sqr_txfm_size];
+    int32_t* output = new int32_t[sqr_txfm_size];
+    double* ref_input = new double[sqr_txfm_size];
+    double* ref_output = new double[sqr_txfm_size];
+
+    for (int txfm_type_idx = 0; txfm_type_idx < txfm_type_num;
+         ++txfm_type_idx) {
+      TXFM_2D_CFG fwd_txfm_cfg = fwd_txfm_cfg_ls[txfm_size_idx][txfm_type_idx];
+      Fwd_Txfm2d_Func fwd_txfm_func = fwd_txfm_func_ls[txfm_size_idx];
+      TYPE_TXFM type0 = type_ls_0[txfm_type_idx];
+      TYPE_TXFM type1 = type_ls_1[txfm_type_idx];
+      int amplify_bit =  // sum of the three shift entries of the config
+          fwd_txfm_cfg.shift[0] + fwd_txfm_cfg.shift[1] + fwd_txfm_cfg.shift[2];
+      double amplify_factor =  // 2^amplify_bit, also for negative amplify_bit
+          amplify_bit >= 0 ? (1 << amplify_bit) : (1.0 / (1 << -amplify_bit));
+
+      ACMRandom rnd(ACMRandom::DeterministicSeed());  // reseeded per type
+      int count = 5000;
+      double avg_abs_error = 0;
+      for (int ci = 0; ci < count; ci++) {
+        for (int ni = 0; ni < sqr_txfm_size; ++ni) {
+          input[ni] = rnd.Rand16() % base;  // value in [0, base)
+          ref_input[ni] = static_cast<double>(input[ni]);
+          output[ni] = 0;
+          ref_output[ni] = 0;
+        }
+
+        fwd_txfm_func(input, output, txfm_size, &fwd_txfm_cfg, bd);
+        reference_hybrid_2d(ref_input, ref_output, txfm_size, type0, type1);
+
+        for (int ni = 0; ni < sqr_txfm_size; ++ni) {
+          ref_output[ni] = round(ref_output[ni] * amplify_factor);  // rescale ref
+          EXPECT_LE(fabs(output[ni] - ref_output[ni]) / amplify_factor, 30);
+        }
+        avg_abs_error += compute_avg_abs_error<int32_t, double>(
+            output, ref_output, sqr_txfm_size);
+      }
+
+      avg_abs_error /= amplify_factor;
+      avg_abs_error /= count;
+      // max_abs_avg_error comes from upper bound of avg_abs_error
+      // printf("type0: %d type1: %d txfm_size: %d accuracy_avg_abs_error:
+      // %f\n", type0, type1, txfm_size, avg_abs_error);
+      double max_abs_avg_error = 1.5;
+      EXPECT_LE(avg_abs_error, max_abs_avg_error);
+    }
+
+    delete[] input;
+    delete[] output;
+    delete[] ref_input;
+    delete[] ref_output;
+  }
+}
+
+}  // anonymous namespace

diff --git a/test/vp10_inv_txfm1d_test.cc b/test/vp10_inv_txfm1d_test.cc
new file mode 100644
index 0000000..3b716c8
--- /dev/null
+++ b/test/vp10_inv_txfm1d_test.cc

@@ -0,0 +1,69 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "test/vp10_txfm_test.h"
+#include "vp10/common/vp10_fwd_txfm1d.h"
+#include "vp10/common/vp10_inv_txfm1d.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+static int txfm_type_num = 2;  // first dimension of the function tables below
+static int txfm_size_num = 4;  // number of entries in txfm_size_ls
+static int txfm_size_ls[4] = {4, 8, 16, 32};
+
+static TxfmFunc fwd_txfm_func_ls[2][4] = {  // indexed [type index][size index]
+    {vp10_fdct4_new, vp10_fdct8_new, vp10_fdct16_new, vp10_fdct32_new},
+    {vp10_fadst4_new, vp10_fadst8_new, vp10_fadst16_new, vp10_fadst32_new}};
+
+static TxfmFunc inv_txfm_func_ls[2][4] = {  // same layout as fwd_txfm_func_ls
+    {vp10_idct4_new, vp10_idct8_new, vp10_idct16_new, vp10_idct32_new},
+    {vp10_iadst4_new, vp10_iadst8_new, vp10_iadst16_new, vp10_iadst32_new}};
+
+// The maximum stage count of the fwd/inv 1-D DCT/ADST transforms is 12.
+static int8_t cos_bit[12] = {14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
+static int8_t range_bit[12] = {32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
+
+TEST(vp10_inv_txfm1d, round_trip) {  // forward then inverse 1-D txfm
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  for (int si = 0; si < txfm_size_num; ++si) {  // every transform size
+    int txfm_size = txfm_size_ls[si];
+    int32_t *input = new int32_t[txfm_size];
+    int32_t *output = new int32_t[txfm_size];
+    int32_t *round_trip_output = new int32_t[txfm_size];
+
+    for (int ti = 0; ti < txfm_type_num; ++ti) {  // every transform type
+      TxfmFunc fwd_txfm_func = fwd_txfm_func_ls[ti][si];
+      TxfmFunc inv_txfm_func = inv_txfm_func_ls[ti][si];
+      int max_error = 2;  // allowed per-sample round-trip difference
+
+      const int count_test_block = 5000;
+      for (int ci = 0; ci < count_test_block; ++ci) {
+        for (int ni = 0; ni < txfm_size; ++ni) {
+          input[ni] = rnd.Rand16() % base - rnd.Rand16() % base;  // (-base, base)
+        }
+
+        fwd_txfm_func(input, output, cos_bit, range_bit);
+        inv_txfm_func(output, round_trip_output, cos_bit, range_bit);
+
+        for (int ni = 0; ni < txfm_size; ++ni) {
+          EXPECT_LE(abs(input[ni] - round_shift(round_trip_output[ni],  // rescaled
+                                                get_max_bit(txfm_size) - 1)),
+                    max_error);
+        }
+      }
+    }
+    delete[] input;
+    delete[] output;
+    delete[] round_trip_output;
+  }
+}
+
+}  // namespace

diff --git a/test/vp10_inv_txfm2d_test.cc b/test/vp10_inv_txfm2d_test.cc
new file mode 100644
index 0000000..603821e
--- /dev/null
+++ b/test/vp10_inv_txfm2d_test.cc

@@ -0,0 +1,115 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "test/vp10_txfm_test.h"
+#include "vp10/common/vp10_fwd_txfm2d.h"
+#include "vp10/common/vp10_fwd_txfm2d_cfg.h"
+#include "vp10/common/vp10_inv_txfm2d.h"
+#include "vp10/common/vp10_inv_txfm2d_cfg.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+
+const int txfm_size_num = 4;  // number of entries in txfm_size_ls
+const int txfm_size_ls[4] = {4, 8, 16, 32};
+const TXFM_2D_CFG fwd_txfm_cfg_ls[4][4] = {  // indexed [size index][type index]
+    {fwd_txfm_2d_cfg_dct_dct_4, fwd_txfm_2d_cfg_dct_adst_4,
+     fwd_txfm_2d_cfg_adst_adst_4, fwd_txfm_2d_cfg_adst_dct_4},
+    {fwd_txfm_2d_cfg_dct_dct_8, fwd_txfm_2d_cfg_dct_adst_8,
+     fwd_txfm_2d_cfg_adst_adst_8, fwd_txfm_2d_cfg_adst_dct_8},
+    {fwd_txfm_2d_cfg_dct_dct_16, fwd_txfm_2d_cfg_dct_adst_16,
+     fwd_txfm_2d_cfg_adst_adst_16, fwd_txfm_2d_cfg_adst_dct_16},
+    {fwd_txfm_2d_cfg_dct_dct_32, fwd_txfm_2d_cfg_dct_adst_32,
+     fwd_txfm_2d_cfg_adst_adst_32, fwd_txfm_2d_cfg_adst_dct_32}};
+
+const TXFM_2D_CFG inv_txfm_cfg_ls[4][4] = {  // same layout as fwd_txfm_cfg_ls
+    {inv_txfm_2d_cfg_dct_dct_4, inv_txfm_2d_cfg_dct_adst_4,
+     inv_txfm_2d_cfg_adst_adst_4, inv_txfm_2d_cfg_adst_dct_4},
+    {inv_txfm_2d_cfg_dct_dct_8, inv_txfm_2d_cfg_dct_adst_8,
+     inv_txfm_2d_cfg_adst_adst_8, inv_txfm_2d_cfg_adst_dct_8},
+    {inv_txfm_2d_cfg_dct_dct_16, inv_txfm_2d_cfg_dct_adst_16,
+     inv_txfm_2d_cfg_adst_adst_16, inv_txfm_2d_cfg_adst_dct_16},
+    {inv_txfm_2d_cfg_dct_dct_32, inv_txfm_2d_cfg_dct_adst_32,
+     inv_txfm_2d_cfg_adst_adst_32, inv_txfm_2d_cfg_adst_dct_32}};
+
+const Fwd_Txfm2d_Func fwd_txfm_func_ls[4] = {  // indexed by size index
+    vp10_fwd_txfm2d_4x4, vp10_fwd_txfm2d_8x8, vp10_fwd_txfm2d_16x16,
+    vp10_fwd_txfm2d_32x32};
+const Inv_Txfm2d_Func inv_txfm_func_ls[4] = {  // indexed by size index
+    vp10_inv_txfm2d_add_4x4, vp10_inv_txfm2d_add_8x8, vp10_inv_txfm2d_add_16x16,
+    vp10_inv_txfm2d_add_32x32};
+
+const int txfm_type_num = 4;  // second dimension of the cfg tables above
+
+TEST(vp10_inv_txfm2d, round_trip) {  // forward then inverse 2-D txfm
+  for (int txfm_size_idx = 0; txfm_size_idx < txfm_size_num; ++txfm_size_idx) {
+    const int txfm_size = txfm_size_ls[txfm_size_idx];
+    const int sqr_txfm_size = txfm_size * txfm_size;  // elements in one block
+    int16_t* input = new int16_t[sqr_txfm_size];
+    uint16_t* ref_input = new uint16_t[sqr_txfm_size];  // receives the inverse
+    int32_t* output = new int32_t[sqr_txfm_size];
+
+    for (int txfm_type_idx = 0; txfm_type_idx < txfm_type_num;
+         ++txfm_type_idx) {
+      const TXFM_2D_CFG fwd_txfm_cfg =
+          fwd_txfm_cfg_ls[txfm_size_idx][txfm_type_idx];
+      const TXFM_2D_CFG inv_txfm_cfg =
+          inv_txfm_cfg_ls[txfm_size_idx][txfm_type_idx];
+      const Fwd_Txfm2d_Func fwd_txfm_func = fwd_txfm_func_ls[txfm_size_idx];
+      const Inv_Txfm2d_Func inv_txfm_func = inv_txfm_func_ls[txfm_size_idx];
+      const int count = 5000;
+      double avg_abs_error = 0;
+      ACMRandom rnd(ACMRandom::DeterministicSeed());  // reseeded per type
+      for (int ci = 0; ci < count; ci++) {
+        for (int ni = 0; ni < sqr_txfm_size; ++ni) {
+          if (ci == 0) {  // first trial: every sample at the maximum value
+            int extreme_input = base - 1;
+            input[ni] = extreme_input;  // extreme case
+            ref_input[ni] = 0;
+          } else {
+            input[ni] = rnd.Rand16() % base;  // value in [0, base)
+            ref_input[ni] = 0;  // cleared before the *_add inverse is called
+          }
+        }
+
+        fwd_txfm_func(input, output, txfm_size, &fwd_txfm_cfg, bd);
+        inv_txfm_func(output, ref_input, txfm_size, &inv_txfm_cfg, bd);
+
+        for (int ni = 0; ni < sqr_txfm_size; ++ni) {
+          EXPECT_LE(abs(input[ni] - ref_input[ni]), 2);  // per-sample bound
+        }
+        avg_abs_error += compute_avg_abs_error<int16_t, uint16_t>(
+            input, ref_input, sqr_txfm_size);
+      }
+
+      avg_abs_error /= count;
+      // max_abs_avg_error comes from upper bound of avg_abs_error
+      // printf("txfm_size: %d accuracy_avg_abs_error: %f\n", txfm_size,
+      // avg_abs_error);
+      // TODO(angiebird): this upper bound is from adst_adst_8
+      const double max_abs_avg_error = 0.024;
+      EXPECT_LE(avg_abs_error, max_abs_avg_error);
+    }
+
+    delete[] input;
+    delete[] ref_input;
+    delete[] output;
+  }
+}
+
+}  // anonymous namespace

diff --git a/test/vp10_inv_txfm_test.cc b/test/vp10_inv_txfm_test.cc
index c49081e..6c0a3d2 100644
--- a/test/vp10_inv_txfm_test.cc
+++ b/test/vp10_inv_txfm_test.cc

@@ -203,7 +203,7 @@
       // quantization with maximum allowed step sizes
       test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336;
       for (int j = 1; j < last_nonzero_; ++j)
-        test_coef_block1[vp10_default_scan_orders[tx_size_].scan[j]]
+        test_coef_block1[get_scan(tx_size_, DCT_DCT, 0)->scan[j]]
                          = (output_ref_block[j] / 1828) * 1828;
     }
 
@@ -265,7 +265,7 @@
         max_energy_leftover = 0;
         coef = 0;
       }
-      test_coef_block1[vp10_default_scan_orders[tx_size_].scan[j]] = coef;
+      test_coef_block1[get_scan(tx_size_, DCT_DCT, 0)->scan[j]] = coef;
     }
 
     memcpy(test_coef_block2, test_coef_block1,

diff --git a/test/vp10_txfm_test.h b/test/vp10_txfm_test.h
new file mode 100644
index 0000000..967d38b
--- /dev/null
+++ b/test/vp10_txfm_test.h

@@ -0,0 +1,113 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_TXFM_TEST_H_
+#define VP10_TXFM_TEST_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef _MSC_VER
+#define _USE_MATH_DEFINES
+#endif
+#include <math.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "vp10/common/vp10_txfm.h"
+
+typedef enum {  // transform kind tags; reference_hybrid_1d tests for TYPE_DCT
+  TYPE_DCT = 0,
+  TYPE_ADST,
+  TYPE_IDCT,
+  TYPE_IADST,
+  TYPE_LAST
+} TYPE_TXFM;
+
+static const double invSqrt2 = 1 / pow(2, 0.5);  // 1/sqrt(2); read-only scale
+
+static void reference_dct_1d(const double* in, double* out, int size) {  // O(n^2)
+  for (int k = 0; k < size; ++k) {  // one output coefficient per k
+    out[k] = 0;
+    for (int n = 0; n < size; ++n) {
+      out[k] += in[n] * cos(M_PI * (2 * n + 1) * k / (2 * size));
+    }
+    if (k == 0) out[k] = out[k] * invSqrt2;  // only the k == 0 term is scaled
+  }
+}
+
+static void reference_adst_1d(const double* in, double* out, int size) {  // O(n^2)
+  for (int k = 0; k < size; ++k) {  // one output coefficient per k
+    out[k] = 0;
+    for (int n = 0; n < size; ++n) {  // sine basis, no extra scaling applied
+      out[k] += in[n] * sin(M_PI * (2 * n + 1) * (2 * k + 1) / (4 * size));
+    }
+  }
+}
+
+static void reference_hybrid_1d(double* in, double* out, int size, int type) {
+  if (type == TYPE_DCT)  // TYPE_DCT selects the DCT reference
+    reference_dct_1d(in, out, size);
+  else  // every other type value falls through to the ADST reference
+    reference_adst_1d(in, out, size);
+}
+
+static void reference_hybrid_2d(double* in, double* out, int size, int type0,
+                                int type1) {  // two 1-D passes with transposes
+  double* tempOut = new double[size * size];  // scratch for the transposed data
+
+  for (int r = 0; r < size; r++) {
+    // in -> tempOut (transposed)
+    for (int c = 0; c < size; c++) {
+      tempOut[r * size + c] = in[c * size + r];
+    }
+  }
+
+  // type0 transform on each row: tempOut -> out
+  for (int r = 0; r < size; r++) {
+    reference_hybrid_1d(tempOut + r * size, out + r * size, size, type0);
+  }
+
+  for (int r = 0; r < size; r++) {
+    // out -> tempOut (transposed)
+    for (int c = 0; c < size; c++) {
+      tempOut[r * size + c] = out[c * size + r];
+    }
+  }
+
+  for (int r = 0; r < size; r++) {  // type1 transform on each row: tempOut -> out
+    reference_hybrid_1d(tempOut + r * size, out + r * size, size, type1);
+  }
+  delete[] tempOut;
+}
+
+template <typename Type1, typename Type2>  // element types of the two arrays
+static double compute_avg_abs_error(const Type1* a, const Type2* b,
+                                    const int size) {  // mean of |a[i] - b[i]|
+  double error = 0;
+  for (int i = 0; i < size; i++) {  // both elements are widened to double first
+    error += fabs(static_cast<double>(a[i]) - static_cast<double>(b[i]));
+  }
+  error = error / size;  // no guard here: size == 0 divides by zero
+  return error;
+}
+
+typedef void (*TxfmFunc)(const int32_t* in, int32_t* out, const int8_t* cos_bit,
+                         const int8_t* range_bit);  // 1-D transform signature
+
+typedef void (*Fwd_Txfm2d_Func)(const int16_t*, int32_t*, const int,
+                                const TXFM_2D_CFG*, const int);  // 2-D forward
+typedef void (*Inv_Txfm2d_Func)(const int32_t*, uint16_t*, const int,
+                                const TXFM_2D_CFG*, const int);  // 2-D inverse
+
+static const int bd = 10;  // passed as the last argument of the 2-D transforms
+static const int base = (1 << bd);  // 1024; the tests draw inputs modulo base
+
+#endif  // VP10_TXFM_TEST_H_

diff --git a/test/vp9_arf_freq_test.cc b/test/vp9_arf_freq_test.cc
index 89200d4..670529c 100644
--- a/test/vp9_arf_freq_test.cc
+++ b/test/vp9_arf_freq_test.cc

@@ -78,19 +78,19 @@
     return !strcmp(dot, ".y4m");
 }
 
-class ArfFreqTest
+class ArfFreqTestLarge
     : public ::libvpx_test::EncoderTest,
       public ::libvpx_test::CodecTestWith3Params<TestVideoParam, \
                                                  TestEncodeParam, int> {
  protected:
-  ArfFreqTest()
+  ArfFreqTestLarge()
       : EncoderTest(GET_PARAM(0)),
         test_video_param_(GET_PARAM(1)),
         test_encode_param_(GET_PARAM(2)),
         min_arf_requested_(GET_PARAM(3)) {
   }
 
-  virtual ~ArfFreqTest() {}
+  virtual ~ArfFreqTestLarge() {}
 
   virtual void SetUp() {
     InitializeConfig();
@@ -190,7 +190,7 @@
   int run_of_visible_frames_;
 };
 
-TEST_P(ArfFreqTest, MinArfFreqTest) {
+TEST_P(ArfFreqTestLarge, MinArfFreqTest) {
   cfg_.rc_target_bitrate = kBitrate;
   cfg_.g_error_resilient = 0;
   cfg_.g_profile = test_video_param_.profile;
@@ -225,26 +225,26 @@
 }
 
 VP9_INSTANTIATE_TEST_CASE(
-    ArfFreqTest,
+    ArfFreqTestLarge,
     ::testing::ValuesIn(kTestVectors),
     ::testing::ValuesIn(kEncodeVectors),
     ::testing::ValuesIn(kMinArfVectors));
 
 #if CONFIG_VP9_HIGHBITDEPTH
-# if CONFIG_VP10_ENCODER
+#if CONFIG_VP10_ENCODER
 // TODO(angiebird): 25-29 fail in high bitdepth mode.
 INSTANTIATE_TEST_CASE_P(
-    DISABLED_VP10, ArfFreqTest,
+    DISABLED_VP10, ArfFreqTestLarge,
     ::testing::Combine(
         ::testing::Values(static_cast<const libvpx_test::CodecFactory *>(
             &libvpx_test::kVP10)),
         ::testing::ValuesIn(kTestVectors),
         ::testing::ValuesIn(kEncodeVectors),
         ::testing::ValuesIn(kMinArfVectors)));
-# endif  // CONFIG_VP10_ENCODER
+#endif  // CONFIG_VP10_ENCODER
 #else
 VP10_INSTANTIATE_TEST_CASE(
-    ArfFreqTest,
+    ArfFreqTestLarge,
     ::testing::ValuesIn(kTestVectors),
     ::testing::ValuesIn(kEncodeVectors),
     ::testing::ValuesIn(kMinArfVectors));

diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc
index 63f6dfe..8ac5c33 100644
--- a/test/vp9_ethread_test.cc
+++ b/test/vp9_ethread_test.cc

@@ -108,7 +108,7 @@
 TEST_P(VPxEncoderThreadTest, EncoderResultTest) {
   std::vector<std::string> single_thr_md5, multi_thr_md5;
 
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 15, 20);
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 15, 18);
 
   cfg_.rc_target_bitrate = 1000;
 
@@ -138,5 +138,5 @@
 VP10_INSTANTIATE_TEST_CASE(
     VPxEncoderThreadTest,
     ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood),
-    ::testing::Range(1, 3));
+    ::testing::Range(1, 2));
 }  // namespace

diff --git a/vp10/common/alloccommon.c b/vp10/common/alloccommon.c
index 9ca86e5..364afde 100644
--- a/vp10/common/alloccommon.c
+++ b/vp10/common/alloccommon.c

@@ -97,6 +97,10 @@
   cm->above_context = NULL;
   vpx_free(cm->above_seg_context);
   cm->above_seg_context = NULL;
+#if CONFIG_VAR_TX
+  vpx_free(cm->above_txfm_context);
+  cm->above_txfm_context = NULL;
+#endif
 }
 
 int vp10_alloc_context_buffers(VP10_COMMON *cm, int width, int height) {
@@ -128,6 +132,14 @@
     cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc(
         mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_seg_context));
     if (!cm->above_seg_context) goto fail;
+
+#if CONFIG_VAR_TX
+    vpx_free(cm->above_txfm_context);
+    cm->above_txfm_context = (TXFM_CONTEXT *)vpx_calloc(
+        mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_txfm_context));
+    if (!cm->above_txfm_context) goto fail;
+#endif
+
     cm->above_context_alloc_cols = cm->mi_cols;
   }
 

diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index fce1767..dd5c2d1 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h

@@ -38,6 +38,15 @@
   FRAME_TYPES,
 } FRAME_TYPE;
 
+#if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+#define IsInterpolatingFilter(filter) \
+    (vp10_filter_kernels[filter][0][SUBPEL_TAPS / 2 - 1] == 128)
+#else
+#define IsInterpolatingFilter(filter)  (1)
+#endif  // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+
+#define MAXTXLEN 32
+
 static INLINE int is_inter_mode(PREDICTION_MODE mode) {
   return mode >= NEARESTMV && mode <= NEWMV;
 }
@@ -58,34 +67,76 @@
 #define NONE           -1
 #define INTRA_FRAME     0
 #define LAST_FRAME      1
+#if CONFIG_EXT_REFS
+#define LAST2_FRAME     2
+#define LAST3_FRAME     3
+#define LAST4_FRAME     4
+#define GOLDEN_FRAME    5
+#define ALTREF_FRAME    6
+#define MAX_REF_FRAMES  7
+#define LAST_REF_FRAMES (LAST4_FRAME - LAST_FRAME + 1)
+#else
 #define GOLDEN_FRAME    2
 #define ALTREF_FRAME    3
 #define MAX_REF_FRAMES  4
+#endif  // CONFIG_EXT_REFS
+
 typedef int8_t MV_REFERENCE_FRAME;
 
+typedef struct {
+  // Number of base colors for Y (0) and UV (1)
+  uint8_t palette_size[2];
+  // Value of base colors for Y, U, and V
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint16_t palette_colors[3 * PALETTE_MAX_SIZE];
+#else
+  uint8_t palette_colors[3 * PALETTE_MAX_SIZE];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // Only used by encoder to store the color index of the top left pixel.
+  // TODO(huisu): move this to encoder
+  uint8_t palette_first_color_idx[2];
+} PALETTE_MODE_INFO;
+
+#if CONFIG_EXT_INTRA
+typedef struct {
+  // 1: an ext intra mode is used; 0: otherwise.
+  uint8_t use_ext_intra_mode[PLANE_TYPES];
+  EXT_INTRA_MODE ext_intra_mode[PLANE_TYPES];
+} EXT_INTRA_MODE_INFO;
+#endif  // CONFIG_EXT_INTRA
+
 // This structure now relates to 8x8 block regions.
 typedef struct {
   // Common for both INTER and INTRA blocks
   BLOCK_SIZE sb_type;
   PREDICTION_MODE mode;
   TX_SIZE tx_size;
-  int8_t skip;
-#if CONFIG_MISC_FIXES
-  int8_t has_no_coeffs;
+#if CONFIG_VAR_TX
+  // TODO(jingning): This effectively assigned 64 entries for each 8x8 block.
+  // Apparently it takes much more space than needed.
+  TX_SIZE inter_tx_size[64];
 #endif
+  int8_t skip;
+  int8_t has_no_coeffs;
   int8_t segment_id;
   int8_t seg_id_predicted;  // valid only when temporal_update is enabled
 
   // Only for INTRA blocks
   PREDICTION_MODE uv_mode;
+  PALETTE_MODE_INFO palette_mode_info;
 
   // Only for INTER blocks
   INTERP_FILTER interp_filter;
   MV_REFERENCE_FRAME ref_frame[2];
   TX_TYPE tx_type;
 
-  // TODO(slavarnway): Delete and use bmi[3].as_mv[] instead.
+#if CONFIG_EXT_INTRA
+  EXT_INTRA_MODE_INFO ext_intra_mode_info;
+  int8_t angle_delta[2];
+#endif  // CONFIG_EXT_INTRA
+
   int_mv mv[2];
+  int_mv pred_mv[2];
 } MB_MODE_INFO;
 
 typedef struct MODE_INFO {
@@ -122,7 +173,7 @@
   int stride;
 };
 
-struct macroblockd_plane {
+typedef struct macroblockd_plane {
   tran_low_t *dqcoeff;
   PLANE_TYPE plane_type;
   int subsampling_x;
@@ -141,7 +192,7 @@
 
   // encoder
   const int16_t *dequant;
-};
+} MACROBLOCKD_PLANE;
 
 #define BLOCK_OFFSET(x, i) ((x) + (i) * 16)
 
@@ -172,6 +223,8 @@
   int up_available;
   int left_available;
 
+  const vpx_prob (*partition_probs)[PARTITION_TYPES - 1];
+
   /* Distance of MB away from frame edges */
   int mb_to_left_edge;
   int mb_to_right_edge;
@@ -192,6 +245,23 @@
   PARTITION_CONTEXT *above_seg_context;
   PARTITION_CONTEXT left_seg_context[8];
 
+#if CONFIG_VAR_TX
+  TXFM_CONTEXT *above_txfm_context;
+  TXFM_CONTEXT *left_txfm_context;
+  TXFM_CONTEXT left_txfm_context_buffer[8];
+
+  TX_SIZE max_tx_size;
+#endif
+
+  // dimension in the unit of 8x8 block of the current block
+  uint8_t n8_w, n8_h;
+
+#if CONFIG_REF_MV
+  uint8_t ref_mv_count[MAX_REF_FRAMES];
+  CANDIDATE_MV ref_mv_stack[MAX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+  uint8_t is_sec_rect;
+#endif
+
 #if CONFIG_VP9_HIGHBITDEPTH
   /* Bit depth: 8, 10, 12 */
   int bd;
@@ -221,17 +291,177 @@
   ADST_ADST,  // TM
 };
 
-static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type, const MACROBLOCKD *xd,
-                                  int block_idx) {
+#if CONFIG_SUPERTX
+static INLINE int supertx_enabled(const MB_MODE_INFO *mbmi) {
+  const int w_log2 = b_width_log2_lookup[mbmi->sb_type];
+  const int h_log2 = b_height_log2_lookup[mbmi->sb_type];
+  return (int)mbmi->tx_size > VPXMIN(w_log2, h_log2);  // vs. smaller log2 dim
+}
+#endif  // CONFIG_SUPERTX
+
+#if CONFIG_EXT_TX
+#define ALLOW_INTRA_EXT_TX 1
+
+static const int num_ext_tx_set_inter[EXT_TX_SETS_INTER] = {
+  1, 17, 10, 2
+};
+static const int num_ext_tx_set_intra[EXT_TX_SETS_INTRA] = {
+  1, 17, 10
+};
+
+#define USE_IDTX_FOR_32X32 0
+// Maps a transform size and block size to an extended-transform set index.
+static INLINE int get_ext_tx_set(TX_SIZE tx_size, BLOCK_SIZE bs,
+                                 int is_inter) {
+  (void) is_inter;  // not read unless USE_IDTX_FOR_32X32 is enabled
+  if (bs < BLOCK_8X8 || tx_size > TX_32X32) return 0;
+#if USE_IDTX_FOR_32X32
+  if (tx_size == TX_32X32 && is_inter) return 3;
+#endif
+  if (tx_size == TX_32X32) return 0;
+  return (tx_size == TX_16X16) ? 2 : 1;
+}
+
+static INLINE int get_ext_tx_types(TX_SIZE tx_size, BLOCK_SIZE bs,
+                                   int is_inter) {
+  const int *const n = is_inter ? num_ext_tx_set_inter : num_ext_tx_set_intra;
+  return n[get_ext_tx_set(tx_size, bs, is_inter)];  // count for chosen set
+}
+
+static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA][TX_SIZES] = {
+  { 0, 0, 0, 0, },  // unused
+  { 1, 1, 0, 0, },
+  { 0, 0, 1, 0, },
+};
+
+static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER][TX_SIZES] = {
+  { 0, 0, 0, 0, },  // unused
+  { 1, 1, 0, 0, },
+  { 0, 0, 1, 0, },
+  { 0, 0, 0, USE_IDTX_FOR_32X32, },
+};
+
+// Transform types used in each intra set
+static const int ext_tx_used_intra[EXT_TX_SETS_INTRA][TX_TYPES] = {
+  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
+  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, },
+  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, },
+};
+
+// Transform types used in each inter set
+static const int ext_tx_used_inter[EXT_TX_SETS_INTER][TX_TYPES] = {
+  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
+  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, },
+  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, },
+  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, },
+};
+#endif  // CONFIG_EXT_TX
+
+#if CONFIG_EXT_INTRA
+#define ALLOW_FILTER_INTRA_MODES 1
+#define ANGLE_STEP 3
+#define MAX_ANGLE_DELTAS 3
+#define ANGLE_FAST_SEARCH 1
+#define ANGLE_SKIP_THRESH 0.10
+
+static const uint8_t mode_to_angle_map[INTRA_MODES] = {  // angle per mode
+    0, 90, 180, 45, 135, 111, 157, 203, 67, 0,  // read-only lookup table
+};
+
+static const TX_TYPE filter_intra_mode_to_tx_type_lookup[FILTER_INTRA_MODES] = {
+  DCT_DCT,    // FILTER_DC
+  ADST_DCT,   // FILTER_V
+  DCT_ADST,   // FILTER_H
+  DCT_DCT,    // FILTER_D45
+  ADST_ADST,  // FILTER_D135
+  ADST_DCT,   // FILTER_D117
+  DCT_ADST,   // FILTER_D153
+  DCT_ADST,   // FILTER_D207
+  ADST_DCT,   // FILTER_D63
+  ADST_ADST,  // FILTER_TM
+};
+#endif  // CONFIG_EXT_INTRA
+
+static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type,
+                                  const MACROBLOCKD *xd,
+                                  int block_idx, TX_SIZE tx_size) {
   const MODE_INFO *const mi = xd->mi[0];
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
 
+#if CONFIG_EXT_INTRA
+  if (!is_inter_block(mbmi)) {  // every path in this branch returns
+    const int use_ext_intra_mode_info =
+        mbmi->ext_intra_mode_info.use_ext_intra_mode[plane_type];
+    const EXT_INTRA_MODE ext_intra_mode =
+        mbmi->ext_intra_mode_info.ext_intra_mode[plane_type];
+    const PREDICTION_MODE mode = (plane_type == PLANE_TYPE_Y) ?
+        get_y_mode(mi, block_idx) : mbmi->uv_mode;
+
+    if (xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32)
+      return DCT_DCT;
+
+#if CONFIG_EXT_TX
+    if (mbmi->sb_type >= BLOCK_8X8 && plane_type == PLANE_TYPE_Y &&
+        ALLOW_INTRA_EXT_TX)
+      return mbmi->tx_type;
+#endif  // CONFIG_EXT_TX
+
+    if (use_ext_intra_mode_info)
+      return filter_intra_mode_to_tx_type_lookup[ext_intra_mode];
+
+    if (mode == DC_PRED) {
+      return DCT_DCT;
+    } else if (mode == TM_PRED) {
+      return ADST_ADST;
+    } else {
+      int angle = mode_to_angle_map[mode];
+      if (mbmi->sb_type >= BLOCK_8X8)
+        angle += mbmi->angle_delta[plane_type] * ANGLE_STEP;
+      assert(angle > 0 && angle < 270);
+      if (angle == 135)
+        return ADST_ADST;
+      else if (angle < 45 || angle > 225)
+        return DCT_DCT;
+      else if (angle < 135)
+        return ADST_DCT;
+      else
+        return DCT_ADST;
+    }
+  }
+#endif  // CONFIG_EXT_INTRA
+  // From here on: inter blocks, or any block when CONFIG_EXT_INTRA is off.
+#if CONFIG_EXT_TX
+#if USE_IDTX_FOR_32X32
+  if (xd->lossless[mbmi->segment_id] || tx_size > TX_32X32 ||
+      (tx_size >= TX_32X32 && !is_inter_block(mbmi)))
+#else
+  if (xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32)
+#endif
+    return DCT_DCT;
+  if (mbmi->sb_type >= BLOCK_8X8) {
+    if (plane_type == PLANE_TYPE_Y) {
+      if (is_inter_block(mbmi) || ALLOW_INTRA_EXT_TX)
+        return mbmi->tx_type;
+    }
+    if (is_inter_block(mbmi))
+      // UV Inter only
+      return (mbmi->tx_type == IDTX && tx_size == TX_32X32 ?
+              DCT_DCT : mbmi->tx_type);
+  }
+
+  // Sub8x8-Inter/Intra OR UV-Intra
+  if (is_inter_block(mbmi))  // Sub8x8-Inter
+    return DCT_DCT;
+  else  // Sub8x8 Intra OR UV-Intra
+    return intra_mode_to_tx_type_context[plane_type == PLANE_TYPE_Y ?
+        get_y_mode(mi, block_idx) : mbmi->uv_mode];
+#else
   (void) block_idx;
   if (plane_type != PLANE_TYPE_Y || xd->lossless[mbmi->segment_id] ||
-      mbmi->tx_size >= TX_32X32)
+      tx_size >= TX_32X32)
     return DCT_DCT;
-
   return mbmi->tx_type;
+#endif  // CONFIG_EXT_TX
 }
 
 void vp10_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y);
@@ -248,8 +478,18 @@
 
 static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi,
                                      const struct macroblockd_plane *pd) {
+#if CONFIG_SUPERTX
+  if (!supertx_enabled(mbmi)) {  // same call as the non-SUPERTX build below
+    return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type, pd->subsampling_x,
+                               pd->subsampling_y);
+  } else {  // table lookup keyed by tx_size and subsampling; sb_type unused
+    return uvsupertx_size_lookup[mbmi->tx_size][pd->subsampling_x]
+                                               [pd->subsampling_y];
+  }
+#else
   return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type, pd->subsampling_x,
                              pd->subsampling_y);
+#endif  // CONFIG_SUPERTX
 }
 
 static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
@@ -279,7 +519,6 @@
     const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
     foreach_transformed_block_visitor visit, void *arg);
 
-
 void vp10_foreach_transformed_block(
     const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
     foreach_transformed_block_visitor visit, void *arg);

diff --git a/vp10/common/common_data.h b/vp10/common/common_data.h
index 334489c..84476fa 100644
--- a/vp10/common/common_data.h
+++ b/vp10/common/common_data.h

@@ -170,6 +170,21 @@
   {0,  0 },  // 64X64 - {0b0000, 0b0000}
 };
 
+#if CONFIG_SUPERTX
+static const TX_SIZE uvsupertx_size_lookup[TX_SIZES][2][2] = {
+  //  ss_x == 0 ss_x == 0   ss_x == 1 ss_x == 1
+  //  ss_y == 0 ss_y == 1   ss_y == 0 ss_y == 1
+  {{TX_4X4,   TX_4X4},   {TX_4X4,   TX_4X4}},
+  {{TX_8X8,   TX_4X4},   {TX_4X4,   TX_4X4}},
+  {{TX_16X16, TX_8X8},   {TX_8X8,   TX_8X8}},
+  {{TX_32X32, TX_16X16}, {TX_16X16, TX_16X16}},
+};
+
+static const int partition_supertx_context_lookup[PARTITION_TYPES] = {
+  -1, 0, 0, 1
+};
+#endif  // CONFIG_SUPERTX
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif

diff --git a/vp10/common/entropy.h b/vp10/common/entropy.h
index 9a471c8..747d1ad 100644
--- a/vp10/common/entropy.h
+++ b/vp10/common/entropy.h

@@ -21,8 +21,8 @@
 extern "C" {
 #endif
 
-#define DIFF_UPDATE_PROB        252
-#define GROUP_DIFF_UPDATE_PROB  252
+#define DIFF_UPDATE_PROB       252
+#define GROUP_DIFF_UPDATE_PROB 252
 
 // Coefficient token alphabet
 #define ZERO_TOKEN      0   // 0     Extra Bits 0+0

diff --git a/vp10/common/entropymode.c b/vp10/common/entropymode.c
index 78f3650..1b4fd26 100644
--- a/vp10/common/entropymode.c
+++ b/vp10/common/entropymode.c

@@ -127,21 +127,6 @@
   }
 };
 
-#if !CONFIG_MISC_FIXES
-const vpx_prob vp10_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1] = {
-  { 144,  11,  54, 157, 195, 130,  46,  58, 108 },  // y = dc
-  { 118,  15, 123, 148, 131, 101,  44,  93, 131 },  // y = v
-  { 113,  12,  23, 188, 226, 142,  26,  32, 125 },  // y = h
-  { 120,  11,  50, 123, 163, 135,  64,  77, 103 },  // y = d45
-  { 113,   9,  36, 155, 111, 157,  32,  44, 161 },  // y = d135
-  { 116,   9,  55, 176,  76,  96,  37,  61, 149 },  // y = d117
-  { 115,   9,  28, 141, 161, 167,  21,  25, 193 },  // y = d153
-  { 120,  12,  32, 145, 195, 142,  32,  38,  86 },  // y = d207
-  { 116,  12,  64, 120, 140, 125,  49, 115, 121 },  // y = d63
-  { 102,  19,  66, 162, 182, 122,  35,  59, 128 }   // y = tm
-};
-#endif
-
 static const vpx_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = {
   {  65,  32,  18, 144, 162, 194,  41,  51,  98 },  // block_size < 8x8
   { 132,  68,  18, 165, 217, 196,  45,  40,  78 },  // block_size < 16x16
@@ -162,32 +147,6 @@
   { 101,  21, 107, 181, 192, 103,  19,  67, 125 }   // y = tm
 };
 
-#if !CONFIG_MISC_FIXES
-const vpx_prob vp10_kf_partition_probs[PARTITION_CONTEXTS]
-                                     [PARTITION_TYPES - 1] = {
-  // 8x8 -> 4x4
-  { 158,  97,  94 },  // a/l both not split
-  {  93,  24,  99 },  // a split, l not split
-  {  85, 119,  44 },  // l split, a not split
-  {  62,  59,  67 },  // a/l both split
-  // 16x16 -> 8x8
-  { 149,  53,  53 },  // a/l both not split
-  {  94,  20,  48 },  // a split, l not split
-  {  83,  53,  24 },  // l split, a not split
-  {  52,  18,  18 },  // a/l both split
-  // 32x32 -> 16x16
-  { 150,  40,  39 },  // a/l both not split
-  {  78,  12,  26 },  // a split, l not split
-  {  67,  33,  11 },  // l split, a not split
-  {  24,   7,   5 },  // a/l both split
-  // 64x64 -> 32x32
-  { 174,  35,  49 },  // a/l both not split
-  {  68,  11,  27 },  // a split, l not split
-  {  57,  15,   9 },  // l split, a not split
-  {  12,   3,   3 },  // a/l both split
-};
-#endif
-
 static const vpx_prob default_partition_probs[PARTITION_CONTEXTS]
                                              [PARTITION_TYPES - 1] = {
   // 8x8 -> 4x4
@@ -212,6 +171,20 @@
   {  10,   7,   6 },  // a/l both split
 };
 
+#if CONFIG_REF_MV
+static const vpx_prob default_newmv_prob[NEWMV_MODE_CONTEXTS] = {
+    200, 180, 150, 150, 110, 70, 60,
+};
+
+static const vpx_prob default_zeromv_prob[ZEROMV_MODE_CONTEXTS] = {
+    192, 64,
+};
+
+static const vpx_prob default_refmv_prob[REFMV_MODE_CONTEXTS] = {
+    220, 220, 200, 200, 180, 128, 30, 220, 30,
+};
+#endif
+
 static const vpx_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
                                               [INTER_MODES - 1] = {
   {2,       173,   34},  // 0 = both zero mv
@@ -256,16 +229,33 @@
   239, 183, 119,  96,  41
 };
 
-static const vpx_prob default_comp_ref_p[REF_CONTEXTS] = {
-  50, 126, 123, 221, 226
+static const vpx_prob default_comp_ref_p[REF_CONTEXTS][COMP_REFS - 1] = {
+#if CONFIG_EXT_REFS
+  // TODO(zoeliu): Adjust the initial prob values.
+  {  33,  16,  16,  16 },
+  {  77,  74,  74,  74 },
+  { 142, 142, 142, 142 },
+  { 172, 170, 170, 170 },
+  { 238, 247, 247, 247 }
+#else
+  { 50 }, { 126 }, { 123 }, { 221 }, { 226 }
+#endif  // CONFIG_EXT_REFS
 };
 
-static const vpx_prob default_single_ref_p[REF_CONTEXTS][2] = {
+static const vpx_prob default_single_ref_p[REF_CONTEXTS][SINGLE_REFS - 1] = {
+#if CONFIG_EXT_REFS
+  {  33,  16,  16,  16,  16 },
+  {  77,  74,  74,  74,  74 },
+  { 142, 142, 142, 142, 142 },
+  { 172, 170, 170, 170, 170 },
+  { 238, 247, 247, 247, 247 }
+#else
   {  33,  16 },
   {  77,  74 },
   { 142, 142 },
   { 172, 170 },
   { 238, 247 }
+#endif  // CONFIG_EXT_REFS
 };
 
 static const struct tx_probs default_tx_probs = {
@@ -279,6 +269,442 @@
     { 66  } }
 };
 
+const vpx_tree_index vp10_palette_size_tree[TREE_SIZE(PALETTE_SIZES)] = {
+    -TWO_COLORS, 2,
+    -THREE_COLORS, 4,
+    -FOUR_COLORS, 6,
+    -FIVE_COLORS, 8,
+    -SIX_COLORS, 10,
+    -SEVEN_COLORS, -EIGHT_COLORS,
+};
+
+// TODO(huisu): tune these probs
+const vpx_prob
+vp10_default_palette_y_size_prob[PALETTE_BLOCK_SIZES][PALETTE_SIZES - 1] = {
+    {  96,  89, 100,  64,  77, 130},
+    {  22,  15,  44,  16,  34,  82},
+    {  30,  19,  57,  18,  38,  86},
+    {  94,  36, 104,  23,  43,  92},
+    { 116,  76, 107,  46,  65, 105},
+    { 112,  82,  94,  40,  70, 112},
+    { 147, 124, 123,  58,  69, 103},
+    { 180, 113, 136,  49,  45, 114},
+    { 107,  70,  87,  49, 154, 156},
+    {  98, 105, 142,  63,  64, 152},
+};
+
+const vpx_prob
+vp10_default_palette_uv_size_prob[PALETTE_BLOCK_SIZES][PALETTE_SIZES - 1] = {
+    { 160, 196, 228, 213, 175, 230},
+    {  87, 148, 208, 141, 166, 163},
+    {  72, 151, 204, 139, 155, 161},
+    {  78, 135, 171, 104, 120, 173},
+    {  59,  92, 131,  78,  92, 142},
+    {  75, 118, 149,  84,  90, 128},
+    {  89,  87,  92,  66,  66, 128},
+    {  67,  53,  54,  55,  66,  93},
+    { 120, 130,  83, 171,  75, 214},
+    {  72,  55,  66,  68,  79, 107},
+};
+
+const vpx_prob
+vp10_default_palette_y_mode_prob[PALETTE_BLOCK_SIZES][PALETTE_Y_MODE_CONTEXTS]
+                                                      = {
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+};
+
+
+const vpx_prob default_uv_palette_mode_prob[2] = {
+    253, 229
+};
+
+const vpx_tree_index
+vp10_palette_color_tree[PALETTE_MAX_SIZE - 1][TREE_SIZE(PALETTE_COLORS)] = {
+    {  // 2 colors
+        -PALETTE_COLOR_ONE, -PALETTE_COLOR_TWO,
+    },
+    {  // 3 colors
+        -PALETTE_COLOR_ONE, 2,
+        -PALETTE_COLOR_TWO, -PALETTE_COLOR_THREE,
+    },
+    {  // 4 colors
+        -PALETTE_COLOR_ONE, 2,
+        -PALETTE_COLOR_TWO, 4,
+        -PALETTE_COLOR_THREE, -PALETTE_COLOR_FOUR,
+    },
+    {  // 5 colors
+        -PALETTE_COLOR_ONE, 2,
+        -PALETTE_COLOR_TWO, 4,
+        -PALETTE_COLOR_THREE, 6,
+        -PALETTE_COLOR_FOUR, -PALETTE_COLOR_FIVE,
+    },
+    {  // 6 colors
+        -PALETTE_COLOR_ONE, 2,
+        -PALETTE_COLOR_TWO, 4,
+        -PALETTE_COLOR_THREE, 6,
+        -PALETTE_COLOR_FOUR, 8,
+        -PALETTE_COLOR_FIVE, -PALETTE_COLOR_SIX,
+    },
+    {  // 7 colors
+        -PALETTE_COLOR_ONE, 2,
+        -PALETTE_COLOR_TWO, 4,
+        -PALETTE_COLOR_THREE, 6,
+        -PALETTE_COLOR_FOUR, 8,
+        -PALETTE_COLOR_FIVE, 10,
+        -PALETTE_COLOR_SIX, -PALETTE_COLOR_SEVEN,
+    },
+    {  // 8 colors
+        -PALETTE_COLOR_ONE, 2,
+        -PALETTE_COLOR_TWO, 4,
+        -PALETTE_COLOR_THREE, 6,
+        -PALETTE_COLOR_FOUR, 8,
+        -PALETTE_COLOR_FIVE, 10,
+        -PALETTE_COLOR_SIX, 12,
+        -PALETTE_COLOR_SEVEN, -PALETTE_COLOR_EIGHT,
+    },
+};
+
+const vpx_prob vp10_default_palette_y_color_prob
+[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] = {
+    {  // 2 colors
+        { 230, 255, 128, 128, 128, 128, 128 },
+        { 214, 255, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 240, 255, 128, 128, 128, 128, 128 },
+        {  73, 255, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 130, 255, 128, 128, 128, 128, 128 },
+        { 227, 255, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 188, 255, 128, 128, 128, 128, 128 },
+        {  75, 255, 128, 128, 128, 128, 128 },
+        { 250, 255, 128, 128, 128, 128, 128 },
+        { 223, 255, 128, 128, 128, 128, 128 },
+        { 252, 255, 128, 128, 128, 128, 128 },
+    }, {  // 3 colors
+        { 229, 137, 255, 128, 128, 128, 128 },
+        { 197, 120, 255, 128, 128, 128, 128 },
+        { 107, 195, 255, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        {  27, 151, 255, 128, 128, 128, 128 },
+        { 230, 130, 255, 128, 128, 128, 128 },
+        {  37, 230, 255, 128, 128, 128, 128 },
+        {  67, 221, 255, 128, 128, 128, 128 },
+        { 124, 230, 255, 128, 128, 128, 128 },
+        { 195, 109, 255, 128, 128, 128, 128 },
+        {  99, 122, 255, 128, 128, 128, 128 },
+        { 205, 208, 255, 128, 128, 128, 128 },
+        {  40, 235, 255, 128, 128, 128, 128 },
+        { 251, 132, 255, 128, 128, 128, 128 },
+        { 237, 186, 255, 128, 128, 128, 128 },
+        { 253, 112, 255, 128, 128, 128, 128 },
+    }, {  // 4 colors
+        { 195,  87, 128, 255, 128, 128, 128 },
+        { 143, 100, 123, 255, 128, 128, 128 },
+        {  94, 124, 119, 255, 128, 128, 128 },
+        {  77,  91, 130, 255, 128, 128, 128 },
+        {  39, 114, 178, 255, 128, 128, 128 },
+        { 222,  94, 125, 255, 128, 128, 128 },
+        {  44, 203, 132, 255, 128, 128, 128 },
+        {  68, 175, 122, 255, 128, 128, 128 },
+        { 110, 187, 124, 255, 128, 128, 128 },
+        { 152,  91, 128, 255, 128, 128, 128 },
+        {  70, 109, 181, 255, 128, 128, 128 },
+        { 133, 113, 164, 255, 128, 128, 128 },
+        {  47, 205, 133, 255, 128, 128, 128 },
+        { 247,  94, 136, 255, 128, 128, 128 },
+        { 205, 122, 146, 255, 128, 128, 128 },
+        { 251, 100, 141, 255, 128, 128, 128 },
+    }, {  // 5 colors
+        { 195,  65,  84, 125, 255, 128, 128 },
+        { 150,  76,  84, 121, 255, 128, 128 },
+        {  94, 110,  81, 117, 255, 128, 128 },
+        {  79,  85,  91, 139, 255, 128, 128 },
+        {  26, 102, 139, 127, 255, 128, 128 },
+        { 220,  73,  91, 119, 255, 128, 128 },
+        {  38, 203,  86, 127, 255, 128, 128 },
+        {  61, 186,  72, 124, 255, 128, 128 },
+        { 132, 199,  84, 128, 255, 128, 128 },
+        { 172,  52,  62, 120, 255, 128, 128 },
+        { 102,  89, 121, 122, 255, 128, 128 },
+        { 182,  48,  69, 186, 255, 128, 128 },
+        {  36, 206,  87, 126, 255, 128, 128 },
+        { 249,  55,  67, 122, 255, 128, 128 },
+        { 218,  88,  75, 122, 255, 128, 128 },
+        { 253,  64,  80, 119, 255, 128, 128 },
+    }, {  // 6 colors
+        { 182,  54,  64,  75, 118, 255, 128 },
+        { 126,  67,  70,  76, 116, 255, 128 },
+        {  79,  92,  67,  85, 120, 255, 128 },
+        {  63,  61,  81, 118, 132, 255, 128 },
+        {  21,  80, 105,  83, 119, 255, 128 },
+        { 215,  72,  74,  74, 111, 255, 128 },
+        {  50, 176,  63,  79, 120, 255, 128 },
+        {  72, 148,  66,  77, 120, 255, 128 },
+        { 105, 177,  57,  78, 130, 255, 128 },
+        { 150,  66,  66,  80, 127, 255, 128 },
+        {  81,  76, 109,  85, 116, 255, 128 },
+        { 113,  81,  62,  96, 148, 255, 128 },
+        {  54, 179,  69,  82, 121, 255, 128 },
+        { 244,  47,  48,  67, 118, 255, 128 },
+        { 198,  83,  53,  65, 121, 255, 128 },
+        { 250,  42,  51,  69, 110, 255, 128 },
+    }, {  // 7 colors
+        { 182,  45,  54,  62,  74, 113, 255 },
+        { 124,  63,  57,  62,  77, 114, 255 },
+        {  77,  80,  56,  66,  76, 117, 255 },
+        {  63,  57,  69,  98,  85, 131, 255 },
+        {  19,  81,  98,  63,  80, 116, 255 },
+        { 215,  56,  60,  63,  68, 105, 255 },
+        {  50, 174,  50,  60,  79, 118, 255 },
+        {  68, 151,  50,  58,  73, 117, 255 },
+        { 104, 182,  53,  57,  79, 127, 255 },
+        { 156,  50,  51,  63,  77, 111, 255 },
+        {  88,  67,  97,  59,  82, 120, 255 },
+        { 114,  81,  46,  65, 103, 132, 255 },
+        {  55, 166,  57,  66,  82, 120, 255 },
+        { 245,  34,  38,  43,  63, 114, 255 },
+        { 203,  68,  45,  47,  60, 118, 255 },
+        { 250,  35,  37,  47,  66, 110, 255 },
+    }, {  // 8 colors
+        { 180,  43,  46,  50,  56,  69, 109 },
+        { 116,  53,  51,  49,  57,  73, 115 },
+        {  79,  70,  49,  50,  59,  74, 117 },
+        {  60,  54,  57,  70,  62,  83, 129 },
+        {  20,  73,  85,  52,  66,  81, 119 },
+        { 213,  56,  52,  49,  53,  62, 104 },
+        {  48, 161,  41,  45,  56,  77, 116 },
+        {  68, 139,  40,  47,  54,  71, 116 },
+        { 123, 166,  42,  43,  52,  76, 130 },
+        { 153,  44,  44,  47,  54,  79, 129 },
+        {  87,  64,  83,  49,  60,  75, 127 },
+        { 131,  68,  43,  48,  73,  96, 130 },
+        {  55, 152,  45,  51,  64,  77, 113 },
+        { 243,  30,  28,  33,  41,  65, 114 },
+        { 202,  56,  35,  36,  42,  63, 123 },
+        { 249,  31,  29,  32,  45,  68, 111 },
+    }
+};
+
+const vpx_prob vp10_default_palette_uv_color_prob
+[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] = {
+    {  // 2 colors
+        { 228, 255, 128, 128, 128, 128, 128 },
+        { 195, 255, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 228, 255, 128, 128, 128, 128, 128 },
+        {  71, 255, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 129, 255, 128, 128, 128, 128, 128 },
+        { 206, 255, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 136, 255, 128, 128, 128, 128, 128 },
+        {  98, 255, 128, 128, 128, 128, 128 },
+        { 236, 255, 128, 128, 128, 128, 128 },
+        { 222, 255, 128, 128, 128, 128, 128 },
+        { 249, 255, 128, 128, 128, 128, 128 },
+    }, {  // 3 colors
+        { 198, 136, 255, 128, 128, 128, 128 },
+        { 178, 105, 255, 128, 128, 128, 128 },
+        { 100, 206, 255, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        {  12, 136, 255, 128, 128, 128, 128 },
+        { 219, 134, 255, 128, 128, 128, 128 },
+        {  50, 198, 255, 128, 128, 128, 128 },
+        {  61, 231, 255, 128, 128, 128, 128 },
+        { 110, 209, 255, 128, 128, 128, 128 },
+        { 173, 106, 255, 128, 128, 128, 128 },
+        { 145, 166, 255, 128, 128, 128, 128 },
+        { 156, 175, 255, 128, 128, 128, 128 },
+        {  69, 183, 255, 128, 128, 128, 128 },
+        { 241, 163, 255, 128, 128, 128, 128 },
+        { 224, 160, 255, 128, 128, 128, 128 },
+        { 246, 154, 255, 128, 128, 128, 128 },
+    }, {  // 4 colors
+        { 173,  88, 143, 255, 128, 128, 128 },
+        { 146,  81, 127, 255, 128, 128, 128 },
+        {  84, 134, 102, 255, 128, 128, 128 },
+        {  69, 138, 140, 255, 128, 128, 128 },
+        {  31, 103, 200, 255, 128, 128, 128 },
+        { 217, 101, 139, 255, 128, 128, 128 },
+        {  51, 174, 121, 255, 128, 128, 128 },
+        {  64, 177, 109, 255, 128, 128, 128 },
+        {  96, 179, 145, 255, 128, 128, 128 },
+        { 164,  77, 114, 255, 128, 128, 128 },
+        {  87,  94, 156, 255, 128, 128, 128 },
+        { 105,  57, 173, 255, 128, 128, 128 },
+        {  63, 158, 137, 255, 128, 128, 128 },
+        { 236, 102, 156, 255, 128, 128, 128 },
+        { 197, 115, 153, 255, 128, 128, 128 },
+        { 245, 106, 154, 255, 128, 128, 128 },
+    }, {  // 5 colors
+        { 179,  64,  97, 129, 255, 128, 128 },
+        { 137,  56,  88, 125, 255, 128, 128 },
+        {  82, 107,  61, 118, 255, 128, 128 },
+        {  59, 113,  86, 115, 255, 128, 128 },
+        {  23,  88, 118, 130, 255, 128, 128 },
+        { 213,  66,  90, 125, 255, 128, 128 },
+        {  37, 181, 103, 121, 255, 128, 128 },
+        {  47, 188,  61, 131, 255, 128, 128 },
+        { 104, 185, 103, 144, 255, 128, 128 },
+        { 163,  39,  76, 112, 255, 128, 128 },
+        {  94,  74, 131, 126, 255, 128, 128 },
+        { 142,  42, 103, 163, 255, 128, 128 },
+        {  53, 162,  99, 149, 255, 128, 128 },
+        { 239,  54,  84, 108, 255, 128, 128 },
+        { 203,  84, 110, 147, 255, 128, 128 },
+        { 248,  70, 105, 151, 255, 128, 128 },
+    }, {  // 6 colors
+        { 189,  50,  67,  90, 130, 255, 128 },
+        { 114,  50,  55,  90, 123, 255, 128 },
+        {  66,  76,  54,  82, 128, 255, 128 },
+        {  43,  69,  69,  80, 129, 255, 128 },
+        {  22,  59,  87,  88, 141, 255, 128 },
+        { 203,  49,  68,  87, 122, 255, 128 },
+        {  43, 157,  74, 104, 146, 255, 128 },
+        {  54, 138,  51,  95, 138, 255, 128 },
+        {  82, 171,  58, 102, 146, 255, 128 },
+        { 129,  38,  59,  64, 168, 255, 128 },
+        {  56,  67, 119,  92, 112, 255, 128 },
+        {  96,  62,  53, 132,  82, 255, 128 },
+        {  60, 147,  77, 108, 145, 255, 128 },
+        { 238,  76,  73,  93, 148, 255, 128 },
+        { 189,  86,  73, 103, 157, 255, 128 },
+        { 246,  62,  75,  83, 167, 255, 128 },
+    }, {  // 7 colors
+        { 179,  42,  51,  73,  99, 134, 255 },
+        { 119,  52,  52,  61,  64, 114, 255 },
+        {  53,  77,  35,  65,  71, 131, 255 },
+        {  38,  70,  51,  68,  89, 144, 255 },
+        {  23,  65, 128,  73,  97, 131, 255 },
+        { 210,  47,  52,  63,  81, 143, 255 },
+        {  42, 159,  57,  68,  98, 143, 255 },
+        {  49, 153,  45,  82,  93, 143, 255 },
+        {  81, 169,  52,  72, 113, 151, 255 },
+        { 136,  46,  35,  56,  75,  96, 255 },
+        {  57,  84, 109,  47, 107, 131, 255 },
+        { 128,  78,  57,  36, 128,  85, 255 },
+        {  54, 149,  68,  77,  94, 153, 255 },
+        { 243,  58,  50,  71,  81, 167, 255 },
+        { 189,  92,  64,  70, 121, 173, 255 },
+        { 248,  35,  38,  51,  82, 201, 255 },
+    }, {  // 8 colors
+        { 201,  40,  36,  42,  64,  92, 123 },
+        { 116,  43,  33,  43,  73, 102, 128 },
+        {  46,  77,  37,  69,  62,  78, 150 },
+        {  40,  65,  52,  50,  76,  89, 133 },
+        {  28,  48,  91,  17,  64,  77, 133 },
+        { 218,  43,  43,  37,  56,  72, 163 },
+        {  41, 155,  44,  83,  82, 129, 180 },
+        {  44, 141,  29,  55,  64,  89, 147 },
+        {  92, 166,  48,  45,  59, 126, 179 },
+        { 169,  35,  49,  41,  36,  99, 139 },
+        {  55,  77,  77,  56,  60,  75, 156 },
+        { 155,  81,  51,  64,  57, 182, 255 },
+        {  60, 134,  49,  49,  93, 128, 174 },
+        { 244,  98,  51,  46,  22,  73, 238 },
+        { 189,  70,  40,  87,  93,  79, 201 },
+        { 248,  54,  49,  40,  29,  42, 227 },
+    }
+};
+
+static const int palette_color_context_lookup[PALETTE_COLOR_CONTEXTS] = {
+    // (3, 0, 0, 0), (3, 2, 0, 0), (3, 3, 2, 0), (3, 3, 2, 2),
+    3993,  4235,  4378,  4380,
+    // (4, 3, 3, 0), (5, 0, 0, 0), (5, 3, 0, 0), (5, 3, 2, 0),
+    5720,  6655,  7018,  7040,
+    // (5, 5, 0, 0), (6, 2, 0, 0), (6, 2, 2, 0), (6, 4, 0, 0),
+    7260,  8228,  8250,  8470,
+    // (7, 3, 0, 0), (8, 0, 0, 0), (8, 2, 0, 0), (10, 0, 0, 0)
+    9680, 10648, 10890, 13310
+};
+
+int vp10_get_palette_color_context(const uint8_t *color_map, int cols,
+                                   int r, int c, int n, int *color_order) {
+  // Scores the palette indices found at the left, above-left, above and
+  // above-right neighbors of position (r, c), moves the (up to) four
+  // best-scoring indices to the front of color_order, and returns the index
+  // of the resulting score pattern in palette_color_context_lookup (0 when
+  // the pattern is not listed).  color_map holds `cols` entries per row, n
+  // is the number of palette colors, and color_order must have room for
+  // PALETTE_MAX_SIZE entries.
+  static const int weights[4] = {3, 2, 3, 2};  // one weight per neighbor
+  const int has_left = c - 1 >= 0;
+  const int has_above = r - 1 >= 0;
+  const int has_right = c + 1 <= cols - 1;
+  int scores[PALETTE_MAX_SIZE + 10];
+  int color_neighbors[4];
+  int hash = 0;
+  int i, j;
+
+  assert(n <= PALETTE_MAX_SIZE);
+
+  // -1 marks a neighbor that lies outside the color map.
+  color_neighbors[0] = has_left ? color_map[r * cols + c - 1] : -1;
+  color_neighbors[1] =
+      (has_left && has_above) ? color_map[(r - 1) * cols + c - 1] : -1;
+  color_neighbors[2] = has_above ? color_map[(r - 1) * cols + c] : -1;
+  color_neighbors[3] =
+      (has_above && has_right) ? color_map[(r - 1) * cols + c + 1] : -1;
+
+  for (i = 0; i < PALETTE_MAX_SIZE; ++i)
+    color_order[i] = i;
+
+  // Zero the whole array, slack entries included, so that no element is
+  // ever read while still indeterminate.
+  memset(scores, 0, sizeof(scores));
+  for (i = 0; i < 4; ++i) {
+    if (color_neighbors[i] >= 0)
+      scores[color_neighbors[i]] += weights[i];
+  }
+
+  // Partial selection sort: bring the largest scores among the first n
+  // entries into slots 0..3, permuting color_order the same way.
+  for (i = 0; i < 4; ++i) {
+    int max_idx = i;
+    for (j = i + 1; j < n; ++j) {
+      if (scores[j] > scores[max_idx])
+        max_idx = j;
+    }
+
+    if (max_idx != i) {
+      const int tmp_score = scores[i];
+      const int tmp_color = color_order[i];
+      scores[i] = scores[max_idx];
+      scores[max_idx] = tmp_score;
+      color_order[i] = color_order[max_idx];
+      color_order[max_idx] = tmp_color;
+    }
+  }
+
+  // Pack the four leading scores into a single base-11 key.
+  for (i = 0; i < 4; ++i)
+    hash = hash * 11 + scores[i];
+
+  // The key and the returned context live in separate variables, so a key
+  // with no table entry always yields context 0, even if the key itself is
+  // numerically smaller than PALETTE_COLOR_CONTEXTS.
+  for (i = 0; i < PALETTE_COLOR_CONTEXTS; ++i) {
+    if (hash == palette_color_context_lookup[i])
+      return i;
+  }
+
+  return 0;
+}
+
 void vp10_tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
                                       unsigned int (*ct_32x32p)[2]) {
   ct_32x32p[0][0] = tx_count_32x32p[TX_4X4];
@@ -306,10 +732,26 @@
   ct_8x8p[0][1] = tx_count_8x8p[TX_8X8];
 }
 
+#if CONFIG_VAR_TX
+static const vpx_prob default_txfm_partition_probs[TXFM_PARTITION_CONTEXTS] = {
+    192, 128, 64, 192, 128, 64, 192, 128, 64,
+};
+#endif
+
 static const vpx_prob default_skip_probs[SKIP_CONTEXTS] = {
   192, 128, 64
 };
 
+#if CONFIG_EXT_INTERP && SWITCHABLE_FILTERS == 4
+static const vpx_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+                                                    [SWITCHABLE_FILTERS - 1] = {
+  { 235, 192, 128},
+  { 36, 243, 208},
+  { 34, 16, 128},
+  { 36, 243, 48},
+  { 149, 160, 128},
+};
+#else
 static const vpx_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
                                                     [SWITCHABLE_FILTERS - 1] = {
   { 235, 162, },
@@ -317,15 +759,440 @@
   { 34, 3, },
   { 149, 144, },
 };
+#endif
 
-#if CONFIG_MISC_FIXES
+#if CONFIG_EXT_TX
+const vpx_tree_index vp10_ext_tx_inter_tree[EXT_TX_SETS_INTER]
+                                           [TREE_SIZE(TX_TYPES)] = {
+  { // ToDo(yaowu): remove unused entry 0.
+    -IDTX, 2,
+    -DCT_DCT, 4,
+    -DST_DST, 6,
+    8, 18,
+    10, 12,
+    -DST_DCT, -DCT_DST,
+    14, 16,
+    -ADST_DCT, -DCT_ADST,
+    -FLIPADST_DCT, -DCT_FLIPADST,
+    20, 26,
+    22, 24,
+    -DST_ADST, -ADST_DST,
+    -DST_FLIPADST, -FLIPADST_DST,
+    28, 30,
+    -ADST_ADST, -FLIPADST_FLIPADST,
+    -ADST_FLIPADST, -FLIPADST_ADST,
+  }, {  // set 1: all 17 transform types (same layout as entry 0)
+    -IDTX, 2,
+    -DCT_DCT, 4,
+    -DST_DST, 6,
+    8, 18,
+    10, 12,
+    -DST_DCT, -DCT_DST,
+    14, 16,
+    -ADST_DCT, -DCT_ADST,
+    -FLIPADST_DCT, -DCT_FLIPADST,
+    20, 26,
+    22, 24,
+    -DST_ADST, -ADST_DST,
+    -DST_FLIPADST, -FLIPADST_DST,
+    28, 30,
+    -ADST_ADST, -FLIPADST_FLIPADST,
+    -ADST_FLIPADST, -FLIPADST_ADST,
+  }, {  // set 2: 10 types (IDTX, DCT and (FLIP)ADST combinations)
+    -IDTX, 2,
+    -DCT_DCT, 4,
+    6, 12,
+    8, 10,
+    -ADST_DCT, -DCT_ADST,
+    -FLIPADST_DCT, -DCT_FLIPADST,
+    14, 16,
+    -ADST_ADST, -FLIPADST_FLIPADST,
+    -ADST_FLIPADST, -FLIPADST_ADST
+  }, {  // set 3: IDTX or DCT_DCT only
+    -IDTX, -DCT_DCT,
+  }
+};
+
+const vpx_tree_index vp10_ext_tx_intra_tree[EXT_TX_SETS_INTRA]
+                                           [TREE_SIZE(TX_TYPES)] = {
+  {  // ToDo(yaowu): remove unused entry 0.
+    -IDTX, 2,
+    -DCT_DCT, 4,
+    -DST_DST, 6,
+    8, 18,
+    10, 12,
+    -DST_DCT, -DCT_DST,
+    14, 16,
+    -ADST_DCT, -DCT_ADST,
+    -FLIPADST_DCT, -DCT_FLIPADST,
+    20, 26,
+    22, 24,
+    -DST_ADST, -ADST_DST,
+    -DST_FLIPADST, -FLIPADST_DST,
+    28, 30,
+    -ADST_ADST, -FLIPADST_FLIPADST,
+    -ADST_FLIPADST, -FLIPADST_ADST,
+  }, {  // set 1: all 17 transform types (same layout as entry 0)
+    -IDTX, 2,
+    -DCT_DCT, 4,
+    -DST_DST, 6,
+    8, 18,
+    10, 12,
+    -DST_DCT, -DCT_DST,
+    14, 16,
+    -ADST_DCT, -DCT_ADST,
+    -FLIPADST_DCT, -DCT_FLIPADST,
+    20, 26,
+    22, 24,
+    -DST_ADST, -ADST_DST,
+    -DST_FLIPADST, -FLIPADST_DST,
+    28, 30,
+    -ADST_ADST, -FLIPADST_FLIPADST,
+    -ADST_FLIPADST, -FLIPADST_ADST,
+  }, {  // set 2: 10 types (IDTX, DCT and (FLIP)ADST combinations)
+    -IDTX, 2,
+    -DCT_DCT, 4,
+    6, 12,
+    8, 10,
+    -ADST_DCT, -DCT_ADST,
+    -FLIPADST_DCT, -DCT_FLIPADST,
+    14, 16,
+    -ADST_ADST, -FLIPADST_FLIPADST,
+    -ADST_FLIPADST, -FLIPADST_ADST
+  }
+};
+
+static const vpx_prob  // indexed [set][tx size][prob]
+default_inter_ext_tx_prob[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES - 1] = {
+  { // ToDo(yaowu): remove unused entry 0.
+    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+    128 },
+    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+    128 },
+    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+    128 },
+#if EXT_TX_SIZES == 4
+    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+    128 },
+#endif  // EXT_TX_SIZES == 4
+  }, {  // set 1: 16 probs per tx size
+    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+      128 },
+    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+      128 },
+    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+      128 },
+#if EXT_TX_SIZES == 4
+    { 12, 112, 16, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+      128 },
+#endif  // EXT_TX_SIZES == 4
+  }, {  // set 2: 9 probs given; remaining entries are zero-initialized
+    { 12, 112, 128, 128, 128, 128, 128, 128, 128 },
+    { 12, 112, 128, 128, 128, 128, 128, 128, 128 },
+    { 12, 112, 128, 128, 128, 128, 128, 128, 128 },
+#if EXT_TX_SIZES == 4
+    { 12, 112, 128, 128, 128, 128, 128, 128, 128 },
+#endif  // EXT_TX_SIZES == 4
+  }, {  // set 3: 1 prob given; remaining entries are zero-initialized
+    { 12, },
+    { 12, },
+    { 12, },
+#if EXT_TX_SIZES == 4
+    { 12, },
+#endif  // EXT_TX_SIZES == 4
+  }
+};
+
+static const vpx_prob
+default_intra_ext_tx_prob[EXT_TX_SETS_INTRA][EXT_TX_SIZES]
+                         [INTRA_MODES][TX_TYPES - 1] = {  // [set][size][mode]
+  { // ToDo(yaowu): remove unused entry 0.
+    {
+      { 8, 11, 24, 112, 87, 137, 127, 134,
+      128, 86, 128, 124, 125, 133, 176, 123, },
+      { 10, 9, 39, 106, 73, 155, 163, 228,
+      35, 62, 129, 127, 133, 114, 213, 234, },
+      { 10, 9, 14, 88, 91, 127, 151, 51,
+      210, 89, 126, 58, 52, 116, 217, 24, },
+      { 9, 6, 29, 113, 98, 131, 149, 210,
+      119, 60, 124, 93, 90, 143, 170, 197, },
+      { 8, 8, 38, 101, 111, 166, 167, 141,
+      130, 105, 128, 75, 75, 118, 197, 117, },
+      { 7, 8, 39, 91, 101, 153, 166, 200,
+      99, 77, 123, 90, 83, 144, 224, 192, },
+      { 7, 10, 26, 86, 119, 154, 130, 101,
+      152, 91, 129, 75, 79, 137, 219, 77, },
+      { 10, 13, 20, 86, 102, 162, 112, 76,
+      171, 86, 134, 122, 106, 124, 196, 44, },
+      { 8, 9, 33, 108, 100, 144, 148, 215,
+      77, 60, 125, 125, 128, 126, 198, 220, },
+      { 3, 10, 29, 111, 69, 141, 204, 141,
+      139, 93, 120, 75, 77, 163, 242, 124, },
+    }, {
+      { 2, 53, 18, 147, 96, 98, 136, 133,
+      131, 120, 153, 163, 169, 137, 173, 124, },
+      { 4, 18, 34, 133, 54, 130, 179, 228,
+      28, 72, 153, 164, 168, 118, 227, 239, },
+      { 4, 18, 13, 125, 72, 110, 176, 36,
+      221, 104, 148, 75, 72, 117, 225, 19, },
+      { 8, 33, 24, 162, 113, 99, 147, 226,
+      103, 85, 153, 143, 153, 124, 155, 210, },
+      { 2, 15, 35, 107, 127, 158, 192, 128,
+      126, 116, 151, 95, 88, 182, 241, 119, },
+      { 3, 15, 36, 112, 100, 146, 194, 189,
+      90, 98, 152, 99, 100, 165, 235, 175, },
+      { 3, 16, 29, 109, 103, 140, 182, 76,
+      173, 104, 147, 82, 85, 159, 235, 70, },
+      { 9, 24, 14, 120, 86, 156, 161, 34,
+      177, 121, 142, 128, 128, 126, 185, 37, },
+      { 5, 24, 29, 152, 98, 99, 174, 228,
+      82, 76, 147, 149, 128, 132, 191, 225, },
+      { 2, 15, 29, 111, 77, 126, 200, 135,
+      117, 93, 152, 96, 84, 191, 245, 135, },
+    }, {
+      { 2, 69, 13, 173, 111, 69, 137, 159,
+      159, 146, 151, 193, 203, 131, 180, 123, },
+      { 1, 12, 33, 164, 32, 98, 204, 242,
+      23, 99, 149, 215, 232, 110, 239, 245, },
+      { 1, 17, 9, 136, 82, 83, 171, 28,
+      231, 128, 135, 76, 64, 118, 235, 17, },
+      { 4, 41, 17, 195, 131, 58, 161, 237,
+      141, 97, 153, 189, 191, 117, 182, 202, },
+      { 2, 17, 36, 104, 149, 137, 217, 139,
+      191, 119, 125, 107, 115, 223, 249, 110, },
+      { 2, 14, 24, 127, 91, 135, 219, 198,
+      113, 91, 164, 125, 173, 211, 250, 116, },
+      { 3, 19, 24, 120, 102, 130, 209, 81,
+      187, 95, 143, 102, 50, 190, 244, 56, },
+      { 4, 27, 10, 128, 91, 157, 181, 33,
+      181, 150, 141, 141, 166, 114, 215, 25, },
+      { 2, 34, 27, 187, 102, 77, 210, 245,
+      113, 107, 136, 184, 188, 121, 210, 234, },
+      { 1, 15, 22, 141, 59, 94, 208, 133,
+      154, 95, 152, 112, 105, 191, 242, 111, },
+#if EXT_TX_SIZES == 4
+    }, {
+      { 2, 69, 13, 173, 111, 69, 137, 159,
+      159, 146, 151, 193, 203, 131, 180, 123, },
+      { 1, 12, 33, 164, 32, 98, 204, 242,
+      23, 99, 149, 215, 232, 110, 239, 245, },
+      { 1, 17, 9, 136, 82, 83, 171, 28,
+      231, 128, 135, 76, 64, 118, 235, 17, },
+      { 4, 41, 17, 195, 131, 58, 161, 237,
+      141, 97, 153, 189, 191, 117, 182, 202, },
+      { 2, 17, 36, 104, 149, 137, 217, 139,
+      191, 119, 125, 107, 115, 223, 249, 110, },
+      { 2, 14, 24, 127, 91, 135, 219, 198,
+      113, 91, 164, 125, 173, 211, 250, 116, },
+      { 3, 19, 24, 120, 102, 130, 209, 81,
+      187, 95, 143, 102, 50, 190, 244, 56, },
+      { 4, 27, 10, 128, 91, 157, 181, 33,
+      181, 150, 141, 141, 166, 114, 215, 25, },
+      { 2, 34, 27, 187, 102, 77, 210, 245,
+      113, 107, 136, 184, 188, 121, 210, 234, },
+      { 1, 15, 22, 141, 59, 94, 208, 133,
+      154, 95, 152, 112, 105, 191, 242, 111, },
+#endif  // EXT_TX_SIZES == 4
+    },
+  }, {  // set 1
+    {
+      {   8,  11,  24, 112,  87, 137, 127, 134,
+        128,  86, 128, 124, 125, 133, 176, 123, },
+      {  10,   9,  39, 106,  73, 155, 163, 228,
+        35,  62, 129, 127, 133, 114, 213, 234, },
+      {  10,   9,  14,  88,  91, 127, 151,  51,
+        210,  89, 126,  58,  52, 116, 217,  24, },
+      {   9,   6,  29, 113,  98, 131, 149, 210,
+        119,  60, 124,  93,  90, 143, 170, 197, },
+      {   8,   8,  38, 101, 111, 166, 167, 141,
+        130, 105, 128,  75,  75, 118, 197, 117, },
+      {   7,   8,  39,  91, 101, 153, 166, 200,
+        99,  77, 123,  90,  83, 144, 224, 192, },
+      {   7,  10,  26,  86, 119, 154, 130, 101,
+        152,  91, 129,  75,  79, 137, 219,  77, },
+      {  10,  13,  20,  86, 102, 162, 112,  76,
+        171,  86, 134, 122, 106, 124, 196,  44, },
+      {   8,   9,  33, 108, 100, 144, 148, 215,
+        77,  60, 125, 125, 128, 126, 198, 220, },
+      {   3,  10,  29, 111,  69, 141, 204, 141,
+        139,  93, 120,  75,  77, 163, 242, 124, },
+    }, {
+      {   2,  53,  18, 147,  96,  98, 136, 133,
+        131, 120, 153, 163, 169, 137, 173, 124, },
+      {   4,  18,  34, 133,  54, 130, 179, 228,
+        28,  72, 153, 164, 168, 118, 227, 239, },
+      {   4,  18,  13, 125,  72, 110, 176,  36,
+        221, 104, 148,  75,  72, 117, 225,  19, },
+      {   8,  33,  24, 162, 113,  99, 147, 226,
+        103,  85, 153, 143, 153, 124, 155, 210, },
+      {   2,  15,  35, 107, 127, 158, 192, 128,
+        126, 116, 151,  95,  88, 182, 241, 119, },
+      {   3,  15,  36, 112, 100, 146, 194, 189,
+        90,  98, 152,  99, 100, 165, 235, 175, },
+      {   3,  16,  29, 109, 103, 140, 182,  76,
+        173, 104, 147,  82,  85, 159, 235,  70, },
+      {   9,  24,  14, 120,  86, 156, 161,  34,
+        177, 121, 142, 128, 128, 126, 185,  37, },
+      {   5,  24,  29, 152,  98,  99, 174, 228,
+        82,  76, 147, 149, 128, 132, 191, 225, },
+      {   2,  15,  29, 111,  77, 126, 200, 135,
+        117,  93, 152,  96,  84, 191, 245, 135, },
+    }, {
+      {   2,  69,  13, 173, 111,  69, 137, 159,
+        159, 146, 151, 193, 203, 131, 180, 123, },
+      {   1,  12,  33, 164,  32,  98, 204, 242,
+         23,  99, 149, 215, 232, 110, 239, 245, },
+      {   1,  17,   9, 136,  82,  83, 171,  28,
+        231, 128, 135,  76,  64, 118, 235,  17, },
+      {   4,  41,  17, 195, 131,  58, 161, 237,
+        141,  97, 153, 189, 191, 117, 182, 202, },
+      {   2,  17,  36, 104, 149, 137, 217, 139,
+        191, 119, 125, 107, 115, 223, 249, 110, },
+      {   2,  14,  24, 127,  91, 135, 219, 198,
+        113,  91, 164, 125, 173, 211, 250, 116, },
+      {   3,  19,  24, 120, 102, 130, 209,  81,
+        187,  95, 143, 102,  50, 190, 244,  56, },
+      {   4,  27,  10, 128,  91, 157, 181,  33,
+        181, 150, 141, 141, 166, 114, 215,  25, },
+      {   2,  34,  27, 187, 102,  77, 210, 245,
+        113, 107, 136, 184, 188, 121, 210, 234, },
+      {   1,  15,  22, 141,  59,  94, 208, 133,
+        154,  95, 152, 112, 105, 191, 242, 111, },
+#if EXT_TX_SIZES == 4
+    }, {
+      {   2,  69,  13, 173, 111,  69, 137, 159,
+        159, 146, 151, 193, 203, 131, 180, 123, },
+      {   1,  12,  33, 164,  32,  98, 204, 242,
+        23,  99, 149, 215, 232, 110, 239, 245, },
+      {   1,  17,   9, 136,  82,  83, 171,  28,
+        231, 128, 135,  76,  64, 118, 235,  17, },
+      {   4,  41,  17, 195, 131,  58, 161, 237,
+        141,  97, 153, 189, 191, 117, 182, 202, },
+      {   2,  17,  36, 104, 149, 137, 217, 139,
+        191, 119, 125, 107, 115, 223, 249, 110, },
+      {   2,  14,  24, 127,  91, 135, 219, 198,
+        113,  91, 164, 125, 173, 211, 250, 116, },
+      {   3,  19,  24, 120, 102, 130, 209,  81,
+        187,  95, 143, 102,  50, 190, 244,  56, },
+      {   4,  27,  10, 128,  91, 157, 181,  33,
+        181, 150, 141, 141, 166, 114, 215,  25, },
+      {   2,  34,  27, 187, 102,  77, 210, 245,
+        113, 107, 136, 184, 188, 121, 210, 234, },
+      {   1,  15,  22, 141,  59,  94, 208, 133,
+        154,  95, 152, 112, 105, 191, 242, 111, },
+#endif  // EXT_TX_SIZES == 4
+    },
+  }, {  // set 2
+    {
+      {   8, 176, 128, 128, 128, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {  10,  28, 176, 192, 208, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {  10,  28, 176, 192,  48, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   9, 160, 128, 128, 128, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   8,  28,  96, 128, 128, 128, 160, 192,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   7,  28, 160, 176, 192, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   7,  20, 160, 176,  64, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {  10,  23, 160, 176,  64, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   8,  29, 160, 176, 192, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   3,  20,  96, 128, 128, 128, 160, 192,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+    }, {
+      {   2, 176, 128, 128, 128, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   4,  28, 176, 192, 208, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   4,  28, 176, 192,  48, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   8, 160, 128, 128, 128, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   2,  28,  96, 128, 128, 128, 160, 192,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   3,  28, 160, 176, 192, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   3,  26, 160, 176,  64, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   9,  24, 160, 176,  64, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   5,  24, 160, 176, 192, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   2,  25,  96, 128, 128, 128, 160, 192,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+    }, {
+      {   2, 176, 128, 128, 128, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   1,  28, 176, 192, 208, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   1,  28, 176, 192,  48, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   4, 160, 128, 128, 128, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   2,  28,  96, 128, 128, 128, 160, 192,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   2,  28, 160, 176, 192, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   3,  29, 160, 176,  64, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   4,  27, 160, 176,  64, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   2,  34, 160, 176, 192, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   1,  25,  96, 128, 128, 128, 160, 192,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+#if EXT_TX_SIZES == 4
+    }, {
+      {   2, 176, 128, 128, 128, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   1,  12, 160, 176, 192, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   1,  17, 160, 176,  64, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   4,  41, 128, 128, 128, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   2,  17,  96, 128, 128, 128, 160, 192,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   2,  14, 160, 176, 192, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   3,  19, 160, 176,  64, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   4,  27, 160, 176,  64, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   2,  34, 160, 176, 192, 128, 128, 128,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+      {   1,  15,  96, 128, 128, 128, 160, 192,
+        128, 128, 128, 128, 128, 128, 128, 128, },
+#endif  // EXT_TX_SIZES == 4
+    },
+  },
+};
+#endif  // CONFIG_EXT_TX
+
+#if CONFIG_SUPERTX
+static const vpx_prob default_supertx_prob[PARTITION_SUPERTX_CONTEXTS]
+                                          [TX_SIZES] = {  // [context][tx size]
+  { 1, 160, 160, 170 },  // column 0 is skipped by the adaptation loop (j = 1..)
+  { 1, 200, 200, 210 },
+};
+#endif  // CONFIG_SUPERTX
+
 // FIXME(someone) need real defaults here
 static const struct segmentation_probs default_seg_probs = {
   { 128, 128, 128, 128, 128, 128, 128 },
   { 128, 128, 128 },
 };
-#endif
 
+#if CONFIG_EXT_INTRA  // Defaults for fc->ext_intra_probs[PLANE_TYPES].
+static const vpx_prob default_ext_intra_probs[PLANE_TYPES] = {230, 230};
+#endif  // CONFIG_EXT_INTRA
+
+#if !CONFIG_EXT_TX
 const vpx_tree_index vp10_ext_tx_tree[TREE_SIZE(TX_TYPES)] = {
   -DCT_DCT, 2,
   -ADST_ADST, 4,
@@ -345,6 +1212,7 @@
   {176, 85, 128},
   {192, 85, 128},
 };
+#endif
 
 static void init_mode_probs(FRAME_CONTEXT *fc) {
   vp10_copy(fc->uv_mode_prob, default_uv_probs);
@@ -356,21 +1224,42 @@
   vp10_copy(fc->comp_ref_prob, default_comp_ref_p);
   vp10_copy(fc->single_ref_prob, default_single_ref_p);
   fc->tx_probs = default_tx_probs;
+#if CONFIG_VAR_TX
+  vp10_copy(fc->txfm_partition_prob, default_txfm_partition_probs);
+#endif
   vp10_copy(fc->skip_probs, default_skip_probs);
+#if CONFIG_REF_MV
+  vp10_copy(fc->newmv_prob, default_newmv_prob);
+  vp10_copy(fc->zeromv_prob, default_zeromv_prob);
+  vp10_copy(fc->refmv_prob, default_refmv_prob);
+#endif
   vp10_copy(fc->inter_mode_probs, default_inter_mode_probs);
-#if CONFIG_MISC_FIXES
+#if CONFIG_SUPERTX
+  vp10_copy(fc->supertx_prob, default_supertx_prob);
+#endif  // CONFIG_SUPERTX
   vp10_copy(fc->seg.tree_probs, default_seg_probs.tree_probs);
   vp10_copy(fc->seg.pred_probs, default_seg_probs.pred_probs);
-#endif
-  vp10_copy(fc->intra_ext_tx_prob, default_intra_ext_tx_prob);
+#if CONFIG_EXT_INTRA
+  vp10_copy(fc->ext_intra_probs, default_ext_intra_probs);
+#endif  // CONFIG_EXT_INTRA
   vp10_copy(fc->inter_ext_tx_prob, default_inter_ext_tx_prob);
+  vp10_copy(fc->intra_ext_tx_prob, default_intra_ext_tx_prob);
 }
 
+#if CONFIG_EXT_INTERP && SWITCHABLE_FILTERS == 4
 const vpx_tree_index vp10_switchable_interp_tree
-                         [TREE_SIZE(SWITCHABLE_FILTERS)] = {
+[TREE_SIZE(SWITCHABLE_FILTERS)] = {
+  -EIGHTTAP, 2,
+  4, -EIGHTTAP_SHARP,
+  -EIGHTTAP_SMOOTH, -EIGHTTAP_SMOOTH2,
+};
+#else
+const vpx_tree_index vp10_switchable_interp_tree
+[TREE_SIZE(SWITCHABLE_FILTERS)] = {
   -EIGHTTAP, 2,
   -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
 };
+#endif  // CONFIG_EXT_INTERP
 
 void vp10_adapt_inter_frame_probs(VP10_COMMON *cm) {
   int i, j;
@@ -385,31 +1274,34 @@
     fc->comp_inter_prob[i] = mode_mv_merge_probs(pre_fc->comp_inter_prob[i],
                                                  counts->comp_inter[i]);
   for (i = 0; i < REF_CONTEXTS; i++)
-    fc->comp_ref_prob[i] = mode_mv_merge_probs(pre_fc->comp_ref_prob[i],
-                                               counts->comp_ref[i]);
+    for (j = 0; j < (COMP_REFS - 1); j++)
+      fc->comp_ref_prob[i][j] = mode_mv_merge_probs(pre_fc->comp_ref_prob[i][j],
+                                                    counts->comp_ref[i][j]);
   for (i = 0; i < REF_CONTEXTS; i++)
-    for (j = 0; j < 2; j++)
+    for (j = 0; j < (SINGLE_REFS - 1); j++)
       fc->single_ref_prob[i][j] = mode_mv_merge_probs(
           pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]);
 
+#if CONFIG_REF_MV
+  for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
+    fc->newmv_prob[i] = mode_mv_merge_probs(pre_fc->newmv_prob[i],
+                                            counts->newmv_mode[i]);
+  for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
+    fc->zeromv_prob[i] = mode_mv_merge_probs(pre_fc->zeromv_prob[i],
+                                             counts->zeromv_mode[i]);
+  for (i = 0; i < REFMV_MODE_CONTEXTS; ++i)
+    fc->refmv_prob[i] = mode_mv_merge_probs(pre_fc->refmv_prob[i],
+                                            counts->refmv_mode[i]);
+#else
   for (i = 0; i < INTER_MODE_CONTEXTS; i++)
     vpx_tree_merge_probs(vp10_inter_mode_tree, pre_fc->inter_mode_probs[i],
                 counts->inter_mode[i], fc->inter_mode_probs[i]);
+#endif
 
   for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
     vpx_tree_merge_probs(vp10_intra_mode_tree, pre_fc->y_mode_prob[i],
                 counts->y_mode[i], fc->y_mode_prob[i]);
 
-#if !CONFIG_MISC_FIXES
-  for (i = 0; i < INTRA_MODES; ++i)
-    vpx_tree_merge_probs(vp10_intra_mode_tree, pre_fc->uv_mode_prob[i],
-                         counts->uv_mode[i], fc->uv_mode_prob[i]);
-
-  for (i = 0; i < PARTITION_CONTEXTS; i++)
-    vpx_tree_merge_probs(vp10_partition_tree, pre_fc->partition_prob[i],
-                         counts->partition[i], fc->partition_prob[i]);
-#endif
-
   if (cm->interp_filter == SWITCHABLE) {
     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
       vpx_tree_merge_probs(vp10_switchable_interp_tree,
@@ -449,10 +1341,41 @@
     }
   }
 
+#if CONFIG_VAR_TX
+  if (cm->tx_mode == TX_MODE_SELECT)
+    for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i)
+      fc->txfm_partition_prob[i] =
+          mode_mv_merge_probs(pre_fc->txfm_partition_prob[i],
+                              counts->txfm_partition[i]);
+#endif
+
   for (i = 0; i < SKIP_CONTEXTS; ++i)
     fc->skip_probs[i] = mode_mv_merge_probs(
         pre_fc->skip_probs[i], counts->skip[i]);
 
+#if CONFIG_EXT_TX
+  for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+    int s;
+    for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+      if (use_inter_ext_tx_for_txsize[s][i]) {
+        vpx_tree_merge_probs(vp10_ext_tx_inter_tree[s],
+                             pre_fc->inter_ext_tx_prob[s][i],
+                             counts->inter_ext_tx[s][i],
+                             fc->inter_ext_tx_prob[s][i]);
+      }
+    }
+    for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+      if (use_intra_ext_tx_for_txsize[s][i]) {
+        int j;
+        for (j = 0; j < INTRA_MODES; ++j)
+          vpx_tree_merge_probs(vp10_ext_tx_intra_tree[s],
+                               pre_fc->intra_ext_tx_prob[s][i][j],
+                               counts->intra_ext_tx[s][i][j],
+                               fc->intra_ext_tx_prob[s][i][j]);
+      }
+    }
+  }
+#else
   for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
     int j;
     for (j = 0; j < TX_TYPES; ++j)
@@ -467,8 +1390,18 @@
                          counts->inter_ext_tx[i],
                          fc->inter_ext_tx_prob[i]);
   }
+#endif  // CONFIG_EXT_TX
 
-#if CONFIG_MISC_FIXES
+#if CONFIG_SUPERTX
+  for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
+    int j;
+    for (j = 1; j < TX_SIZES; ++j) {
+      fc->supertx_prob[i][j] = mode_mv_merge_probs(pre_fc->supertx_prob[i][j],
+                                                   counts->supertx[i][j]);
+    }
+  }
+#endif  // CONFIG_SUPERTX
+
   if (cm->seg.temporal_update) {
     for (i = 0; i < PREDICTION_PROBS; i++)
       fc->seg.pred_probs[i] = mode_mv_merge_probs(pre_fc->seg.pred_probs[i],
@@ -488,7 +1421,13 @@
   for (i = 0; i < PARTITION_CONTEXTS; i++)
     vpx_tree_merge_probs(vp10_partition_tree, pre_fc->partition_prob[i],
                          counts->partition[i], fc->partition_prob[i]);
-#endif
+
+#if CONFIG_EXT_INTRA
+  for (i = 0; i < PLANE_TYPES; ++i) {
+    fc->ext_intra_probs[i] = mode_mv_merge_probs(
+              pre_fc->ext_intra_probs[i], counts->ext_intra[i]);
+  }
+#endif  // CONFIG_EXT_INTRA
 }
 
 static void set_default_lf_deltas(struct loopfilter *lf) {
@@ -497,6 +1436,11 @@
 
   lf->ref_deltas[INTRA_FRAME] = 1;
   lf->ref_deltas[LAST_FRAME] = 0;
+#if CONFIG_EXT_REFS
+  lf->ref_deltas[LAST2_FRAME] = lf->ref_deltas[LAST_FRAME];
+  lf->ref_deltas[LAST3_FRAME] = lf->ref_deltas[LAST_FRAME];
+  lf->ref_deltas[LAST4_FRAME] = lf->ref_deltas[LAST_FRAME];
+#endif  // CONFIG_EXT_REFS
   lf->ref_deltas[GOLDEN_FRAME] = -1;
   lf->ref_deltas[ALTREF_FRAME] = -1;
 

diff --git a/vp10/common/entropymode.h b/vp10/common/entropymode.h
index 611d3ad..a1ad2c4 100644
--- a/vp10/common/entropymode.h
+++ b/vp10/common/entropymode.h

@@ -27,6 +27,11 @@
 
 #define INTER_OFFSET(mode) ((mode) - NEARESTMV)
 
+#define PALETTE_COLOR_CONTEXTS 16  // number of palette color contexts
+#define PALETTE_MAX_SIZE 8  // largest palette (cf. EIGHT_COLORS in enums.h)
+#define PALETTE_BLOCK_SIZES (BLOCK_64X64 - BLOCK_8X8 + 1)  // 8X8..64X64 incl.
+#define PALETTE_Y_MODE_CONTEXTS 3
+
 struct VP10Common;
 
 struct tx_probs {
@@ -55,20 +60,40 @@
   vp10_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES];
   vpx_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
                                  [SWITCHABLE_FILTERS - 1];
+
+#if CONFIG_REF_MV
+  vpx_prob newmv_prob[NEWMV_MODE_CONTEXTS];
+  vpx_prob zeromv_prob[ZEROMV_MODE_CONTEXTS];
+  vpx_prob refmv_prob[REFMV_MODE_CONTEXTS];
+#endif
+
   vpx_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
   vpx_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
   vpx_prob comp_inter_prob[COMP_INTER_CONTEXTS];
-  vpx_prob single_ref_prob[REF_CONTEXTS][2];
-  vpx_prob comp_ref_prob[REF_CONTEXTS];
+  vpx_prob single_ref_prob[REF_CONTEXTS][SINGLE_REFS-1];
+  vpx_prob comp_ref_prob[REF_CONTEXTS][COMP_REFS-1];
   struct tx_probs tx_probs;
+#if CONFIG_VAR_TX
+  vpx_prob txfm_partition_prob[TXFM_PARTITION_CONTEXTS];
+#endif
   vpx_prob skip_probs[SKIP_CONTEXTS];
   nmv_context nmvc;
-#if CONFIG_MISC_FIXES
-  struct segmentation_probs seg;
-#endif
+  int initialized;
+#if CONFIG_EXT_TX
+  vpx_prob inter_ext_tx_prob[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES - 1];
+  vpx_prob intra_ext_tx_prob[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+                            [TX_TYPES - 1];
+#else
   vpx_prob intra_ext_tx_prob[EXT_TX_SIZES][TX_TYPES][TX_TYPES - 1];
   vpx_prob inter_ext_tx_prob[EXT_TX_SIZES][TX_TYPES - 1];
-  int initialized;
+#endif  // CONFIG_EXT_TX
+#if CONFIG_SUPERTX
+  vpx_prob supertx_prob[PARTITION_SUPERTX_CONTEXTS][TX_SIZES];
+#endif  // CONFIG_SUPERTX
+  struct segmentation_probs seg;
+#if CONFIG_EXT_INTRA
+  vpx_prob ext_intra_probs[PLANE_TYPES];
+#endif  // CONFIG_EXT_INTRA
 } FRAME_CONTEXT;
 
 typedef struct FRAME_COUNTS {
@@ -81,34 +106,62 @@
                          [COEF_BANDS][COEFF_CONTEXTS];
   unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
                                 [SWITCHABLE_FILTERS];
+#if CONFIG_REF_MV
+  unsigned int newmv_mode[NEWMV_MODE_CONTEXTS][2];
+  unsigned int zeromv_mode[ZEROMV_MODE_CONTEXTS][2];
+  unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2];
+#endif
+
   unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES];
   unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
   unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
-  unsigned int single_ref[REF_CONTEXTS][2][2];
-  unsigned int comp_ref[REF_CONTEXTS][2];
+  unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS-1][2];
+  unsigned int comp_ref[REF_CONTEXTS][COMP_REFS-1][2];
   struct tx_counts tx;
+#if CONFIG_VAR_TX
+  unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2];
+#endif
   unsigned int skip[SKIP_CONTEXTS][2];
   nmv_context_counts mv;
-#if CONFIG_MISC_FIXES
-  struct seg_counts seg;
-#endif
+#if CONFIG_EXT_TX
+  unsigned int inter_ext_tx[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+  unsigned int intra_ext_tx[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+                           [TX_TYPES];
+#else
   unsigned int intra_ext_tx[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
   unsigned int inter_ext_tx[EXT_TX_SIZES][TX_TYPES];
+#endif  // CONFIG_EXT_TX
+#if CONFIG_SUPERTX
+  unsigned int supertx[PARTITION_SUPERTX_CONTEXTS][TX_SIZES][2];
+  unsigned int supertx_size[TX_SIZES];
+#endif  // CONFIG_SUPERTX
+  struct seg_counts seg;
+#if CONFIG_EXT_INTRA
+  unsigned int ext_intra[PLANE_TYPES][2];
+#endif  // CONFIG_EXT_INTRA
 } FRAME_COUNTS;
 
 extern const vpx_prob vp10_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
                                         [INTRA_MODES - 1];
-#if !CONFIG_MISC_FIXES
-extern const vpx_prob vp10_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
-extern const vpx_prob vp10_kf_partition_probs[PARTITION_CONTEXTS]
-                                            [PARTITION_TYPES - 1];
-#endif
+extern const vpx_prob
+vp10_default_palette_y_mode_prob[PALETTE_BLOCK_SIZES][PALETTE_Y_MODE_CONTEXTS];
+extern const vpx_prob
+vp10_default_palette_y_size_prob[PALETTE_BLOCK_SIZES][PALETTE_SIZES - 1];
+extern const vpx_prob
+vp10_default_palette_uv_size_prob[PALETTE_BLOCK_SIZES][PALETTE_SIZES - 1];
+extern const vpx_prob vp10_default_palette_y_color_prob
+[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1];
+extern const vpx_prob vp10_default_palette_uv_color_prob
+[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1];
 
 extern const vpx_tree_index vp10_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
 extern const vpx_tree_index vp10_inter_mode_tree[TREE_SIZE(INTER_MODES)];
 extern const vpx_tree_index vp10_partition_tree[TREE_SIZE(PARTITION_TYPES)];
 extern const vpx_tree_index vp10_switchable_interp_tree
                                 [TREE_SIZE(SWITCHABLE_FILTERS)];
+extern const vpx_tree_index vp10_palette_size_tree[TREE_SIZE(PALETTE_SIZES)];
+extern const vpx_tree_index
+vp10_palette_color_tree[PALETTE_MAX_SIZE - 1][TREE_SIZE(PALETTE_COLORS)];
 
 
 void vp10_setup_past_independence(struct VP10Common *cm);
@@ -123,8 +176,15 @@
 void vp10_tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
                                     unsigned int (*ct_8x8p)[2]);
 
+#if CONFIG_EXT_TX
+extern const vpx_tree_index
+    vp10_ext_tx_inter_tree[EXT_TX_SETS_INTER][TREE_SIZE(TX_TYPES)];
+extern const vpx_tree_index
+    vp10_ext_tx_intra_tree[EXT_TX_SETS_INTRA][TREE_SIZE(TX_TYPES)];
+#else
 extern const vpx_tree_index
     vp10_ext_tx_tree[TREE_SIZE(TX_TYPES)];
+#endif  // CONFIG_EXT_TX
 
 static INLINE int vp10_ceil_log2(int n) {
   int i = 1, p = 2;
@@ -135,6 +195,9 @@
   return i;
 }
 
+int vp10_get_palette_color_context(const uint8_t *color_map, int cols,
+                                   int r, int c, int n, int *color_order);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif

diff --git a/vp10/common/enums.h b/vp10/common/enums.h
index 18c7d16..f0d1ba2 100644
--- a/vp10/common/enums.h
+++ b/vp10/common/enums.h

@@ -94,15 +94,44 @@
   ADST_DCT  = 1,                      // ADST in vertical, DCT in horizontal
   DCT_ADST  = 2,                      // DCT  in vertical, ADST in horizontal
   ADST_ADST = 3,                      // ADST in both directions
-  TX_TYPES = 4
+#if CONFIG_EXT_TX
+  FLIPADST_DCT = 4,
+  DCT_FLIPADST = 5,
+  FLIPADST_FLIPADST = 6,
+  ADST_FLIPADST = 7,
+  FLIPADST_ADST = 8,
+  DST_DCT = 9,
+  DCT_DST = 10,
+  DST_ADST = 11,
+  ADST_DST = 12,
+  DST_FLIPADST = 13,
+  FLIPADST_DST = 14,
+  DST_DST = 15,
+  IDTX = 16,
+#endif  // CONFIG_EXT_TX
+  TX_TYPES,
 } TX_TYPE;
 
 #define EXT_TX_SIZES       3  // number of sizes that use extended transforms
 
+#if CONFIG_EXT_TX
+#define USE_DST2           1
+#define EXT_TX_SETS_INTER  4  // Sets of transform selections for INTER
+#define EXT_TX_SETS_INTRA  3  // Sets of transform selections for INTRA
+#endif  // CONFIG_EXT_TX
+
 typedef enum {
   VP9_LAST_FLAG = 1 << 0,
+#if CONFIG_EXT_REFS
+  VP9_LAST2_FLAG = 1 << 1,
+  VP9_LAST3_FLAG = 1 << 2,
+  VP9_LAST4_FLAG = 1 << 3,
+  VP9_GOLD_FLAG = 1 << 4,
+  VP9_ALT_FLAG = 1 << 5,
+#else
   VP9_GOLD_FLAG = 1 << 1,
   VP9_ALT_FLAG = 1 << 2,
+#endif  // CONFIG_EXT_REFS
 } VP9_REFFRAME;
 
 typedef enum {
@@ -111,6 +140,29 @@
   PLANE_TYPES
 } PLANE_TYPE;
 
+typedef enum {  // Palette size classes; going by the names, value = colors - 2.
+  TWO_COLORS,
+  THREE_COLORS,
+  FOUR_COLORS,
+  FIVE_COLORS,
+  SIX_COLORS,
+  SEVEN_COLORS,
+  EIGHT_COLORS,
+  PALETTE_SIZES  // count of the entries above (7); used as an array dimension
+} PALETTE_SIZE;
+
+typedef enum {  // Zero-based palette color indices.
+  PALETTE_COLOR_ONE,
+  PALETTE_COLOR_TWO,
+  PALETTE_COLOR_THREE,
+  PALETTE_COLOR_FOUR,
+  PALETTE_COLOR_FIVE,
+  PALETTE_COLOR_SIX,
+  PALETTE_COLOR_SEVEN,
+  PALETTE_COLOR_EIGHT,
+  PALETTE_COLORS  // count of the entries above (8); used as an array dimension
+} PALETTE_COLOR;
+
 #define DC_PRED    0       // Average of above and left pixels
 #define V_PRED     1       // Vertical
 #define H_PRED     2       // Horizontal
@@ -130,18 +182,77 @@
 
 #define INTRA_MODES (TM_PRED + 1)
 
+#if CONFIG_EXT_INTRA
+typedef enum {
+  FILTER_DC_PRED,
+  FILTER_V_PRED,
+  FILTER_H_PRED,
+  FILTER_D45_PRED,
+  FILTER_D135_PRED,
+  FILTER_D117_PRED,
+  FILTER_D153_PRED,
+  FILTER_D207_PRED,
+  FILTER_D63_PRED,
+  FILTER_TM_PRED,
+  EXT_INTRA_MODES,
+} EXT_INTRA_MODE;
+
+#define FILTER_INTRA_MODES (FILTER_TM_PRED + 1)
+#define DIRECTIONAL_MODES (INTRA_MODES - 2)
+#endif  // CONFIG_EXT_INTRA
+
 #define INTER_MODES (1 + NEWMV - NEARESTMV)
 
 #define SKIP_CONTEXTS 3
+
+#if CONFIG_REF_MV
+#define NEWMV_MODE_CONTEXTS  7
+#define ZEROMV_MODE_CONTEXTS 2
+#define REFMV_MODE_CONTEXTS  9
+
+#define ZEROMV_OFFSET 3
+#define REFMV_OFFSET  4
+
+#define NEWMV_CTX_MASK ((1 << ZEROMV_OFFSET) - 1)
+#define ZEROMV_CTX_MASK ((1 << (REFMV_OFFSET - ZEROMV_OFFSET)) - 1)
+#define REFMV_CTX_MASK ((1 << (8 - REFMV_OFFSET)) - 1)
+
+#define ALL_ZERO_FLAG_OFFSET   8
+#define SKIP_NEARESTMV_OFFSET  9
+#define SKIP_NEARMV_OFFSET    10
+#define SKIP_NEARESTMV_SUB8X8_OFFSET 11
+#endif
+
 #define INTER_MODE_CONTEXTS 7
 
 /* Segment Feature Masks */
 #define MAX_MV_REF_CANDIDATES 2
+#if CONFIG_REF_MV
+#define MAX_REF_MV_STACK_SIZE 16
+#endif
 
 #define INTRA_INTER_CONTEXTS 4
 #define COMP_INTER_CONTEXTS 5
 #define REF_CONTEXTS 5
 
+#if CONFIG_VAR_TX
+#define TXFM_PARTITION_CONTEXTS 9
+typedef TX_SIZE TXFM_CONTEXT;
+#endif
+
+#if CONFIG_EXT_REFS
+#define SINGLE_REFS 6
+#define COMP_REFS 5
+#else
+#define SINGLE_REFS 3
+#define COMP_REFS 2
+#endif  // CONFIG_EXT_REFS
+
+#if CONFIG_SUPERTX
+#define PARTITION_SUPERTX_CONTEXTS 2
+#define MAX_SUPERTX_BLOCK_SIZE BLOCK_32X32
+#endif  // CONFIG_SUPERTX
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif

diff --git a/vp10/common/filter.c b/vp10/common/filter.c
index dda279f..a9225b6 100644
--- a/vp10/common/filter.c
+++ b/vp10/common/filter.c

@@ -32,9 +32,28 @@
   { 0, 0, 0,   8, 120, 0, 0, 0 }
 };
 
-// Lagrangian interpolation filter
 DECLARE_ALIGNED(256, static const InterpKernel,
                 sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+#if CONFIG_EXT_INTERP
+  // intfilt 0.575
+  {0,   0,   0, 128,   0,   0,   0, 0},
+  {0,   1,  -5, 126,   8,  -3,   1, 0},
+  {-1,   3, -10, 123,  18,  -6,   2, -1},
+  {-1,   4, -14, 118,  27,  -9,   3, 0},
+  {-1,   5, -16, 112,  37, -12,   4, -1},
+  {-1,   5, -18, 105,  48, -14,   4, -1},
+  {-1,   6, -19,  97,  58, -17,   5, -1},
+  {-1,   6, -20,  88,  68, -18,   6, -1},
+  {-1,   6, -19,  78,  78, -19,   6, -1},
+  {-1,   6, -18,  68,  88, -20,   6, -1},
+  {-1,   5, -17,  58,  97, -19,   6, -1},
+  {-1,   4, -14,  48, 105, -18,   5, -1},
+  {-1,   4, -12,  37, 112, -16,   5, -1},
+  {0,   3,  -9,  27, 118, -14,   4, -1},
+  {-1,   2,  -6,  18, 123, -10,   3, -1},
+  {0,   1,  -3,   8, 126,  -5,   1, 0},
+#else
+  // Lagrangian interpolation filter
   { 0,   0,   0, 128,   0,   0,   0,  0},
   { 0,   1,  -5, 126,   8,  -3,   1,  0},
   { -1,   3, -10, 122,  18,  -6,   2,  0},
@@ -51,11 +70,31 @@
   { -1,   3,  -9,  27, 118, -13,   4, -1},
   { 0,   2,  -6,  18, 122, -10,   3, -1},
   { 0,   1,  -3,   8, 126,  -5,   1,  0}
+#endif  // CONFIG_EXT_INTERP
 };
 
-// DCT based filter
 DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_8s[SUBPEL_SHIFTS]) = {
+                sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
+#if CONFIG_EXT_INTERP
+  // intfilt 0.8
+  {0,   0,   0, 128,   0,   0,   0, 0},
+  {-1,   2,  -6, 127,   9,  -4,   2, -1},
+  {-2,   5, -12, 124,  18,  -7,   4, -2},
+  {-2,   7, -16, 119,  28, -11,   5, -2},
+  {-3,   8, -19, 114,  38, -14,   7, -3},
+  {-3,   9, -22, 107,  49, -17,   8, -3},
+  {-4,  10, -23,  99,  60, -20,  10, -4},
+  {-4,  11, -23,  90,  70, -22,  10, -4},
+  {-4,  11, -23,  80,  80, -23,  11, -4},
+  {-4,  10, -22,  70,  90, -23,  11, -4},
+  {-4,  10, -20,  60,  99, -23,  10, -4},
+  {-3,   8, -17,  49, 107, -22,   9, -3},
+  {-3,   7, -14,  38, 114, -19,   8, -3},
+  {-2,   5, -11,  28, 119, -16,   7, -2},
+  {-2,   4,  -7,  18, 124, -12,   5, -2},
+  {-1,   2,  -4,   9, 127,  -6,   2, -1},
+#else
+  // DCT based filter
   {0,   0,   0, 128,   0,   0,   0, 0},
   {-1,   3,  -7, 127,   8,  -3,   1, 0},
   {-2,   5, -13, 125,  17,  -6,   3, -1},
@@ -72,11 +111,58 @@
   {-2,   5, -10,  27, 121, -17,   7, -3},
   {-1,   3,  -6,  17, 125, -13,   5, -2},
   {0,   1,  -3,   8, 127,  -7,   3, -1}
+#endif  // CONFIG_EXT_INTERP
 };
 
-// freqmultiplier = 0.5
+#if CONFIG_EXT_INTERP && SWITCHABLE_FILTERS == 4
+
 DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_8lp[SUBPEL_SHIFTS]) = {
+                sub_pel_filters_8smooth2[SUBPEL_SHIFTS]) = {
+// freqmultiplier = 0.35
+  {0,  0,  0, 128,  0,  0,  0,  0},
+  {-1,  8, 31, 47, 34, 10,  0, -1},
+  {-1,  7, 29, 46, 36, 12,  0, -1},
+  {-1,  6, 28, 46, 37, 13,  0, -1},
+  {-1,  5, 26, 46, 38, 14,  1, -1},
+  {-1,  4, 25, 45, 39, 16,  1, -1},
+  {-1,  4, 23, 44, 41, 17,  1, -1},
+  {-1,  3, 21, 44, 42, 18,  2, -1},
+  {-1,  2, 20, 43, 43, 20,  2, -1},
+  {-1,  2, 18, 42, 44, 21,  3, -1},
+  {-1,  1, 17, 41, 44, 23,  4, -1},
+  {-1,  1, 16, 39, 45, 25,  4, -1},
+  {-1,  1, 14, 38, 46, 26,  5, -1},
+  {-1,  0, 13, 37, 46, 28,  6, -1},
+  {-1,  0, 12, 36, 46, 29,  7, -1},
+  {-1,  0, 10, 34, 47, 31,  8, -1},
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
+// freqmultiplier = 0.75
+  {0,  0,  0, 128,  0,  0,  0,  0},
+  {2, -10,  19,  95,  31, -11,   2, 0},
+  {2,  -9,  14,  94,  37, -12,   2, 0},
+  {2,  -8,   9,  92,  43, -12,   1, 1},
+  {2,  -7,   5,  90,  49, -12,   1, 0},
+  {2,  -5,   1,  86,  55, -12,   0, 1},
+  {1,  -4,  -2,  82,  61, -11,   0, 1},
+  {1, -3, -5, 77, 67, -9, -1, 1},
+  {1, -2, -7, 72, 72, -7, -2, 1},
+  {1, -1, -9, 67, 77, -5, -3, 1},
+  {1,   0, -11,  61,  82,  -2,  -4, 1},
+  {1,   0, -12,  55,  86,   1,  -5, 2},
+  {0,   1, -12,  49,  90,   5,  -7, 2},
+  {1,   1, -12,  43,  92,   9,  -8, 2},
+  {0,   2, -12,  37,  94,  14,  -9, 2},
+  {0,   2, -11,  31,  95,  19, -10, 2},
+};
+
+#else
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
+// freqmultiplier = 0.5
   { 0,  0,  0, 128,  0,  0,  0,  0},
   {-3, -1, 32,  64, 38,  1, -3,  0},
   {-2, -2, 29,  63, 41,  2, -3,  0},
@@ -95,10 +181,14 @@
   { 0, -3,  1,  38, 64, 32, -1, -3}
 };
 
+#endif  // CONFIG_EXT_INTERP && SWITCHABLE_FILTERS == 4
 
-const InterpKernel *vp10_filter_kernels[4] = {
+const InterpKernel *vp10_filter_kernels[SWITCHABLE_FILTERS + 1] = {
   sub_pel_filters_8,
-  sub_pel_filters_8lp,
-  sub_pel_filters_8s,
+  sub_pel_filters_8smooth,
+  sub_pel_filters_8sharp,
+#if CONFIG_EXT_INTERP && SWITCHABLE_FILTERS == 4
+  sub_pel_filters_8smooth2,
+#endif
   bilinear_filters
 };

diff --git a/vp10/common/filter.h b/vp10/common/filter.h
index 826cd03..de26b76 100644
--- a/vp10/common/filter.h
+++ b/vp10/common/filter.h

@@ -24,16 +24,24 @@
 #define EIGHTTAP            0
 #define EIGHTTAP_SMOOTH     1
 #define EIGHTTAP_SHARP      2
+
+#if CONFIG_EXT_INTERP
+#define SUPPORT_NONINTERPOLATING_FILTERS 0  /* turn it on for experimentation */
+#define EIGHTTAP_SMOOTH2    3
+#define SWITCHABLE_FILTERS  4 /* Number of switchable filters */
+#else
 #define SWITCHABLE_FILTERS  3 /* Number of switchable filters */
-#define BILINEAR            3
+#endif  // CONFIG_EXT_INTERP
 // The codec can operate in four possible inter prediction filter mode:
 // 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three.
+
+#define BILINEAR            (SWITCHABLE_FILTERS)
+#define SWITCHABLE          (SWITCHABLE_FILTERS + 1)  /* the last one */
 #define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
-#define SWITCHABLE 4 /* should be the last one */
 
 typedef uint8_t INTERP_FILTER;
 
-extern const InterpKernel *vp10_filter_kernels[4];
+extern const InterpKernel *vp10_filter_kernels[SWITCHABLE_FILTERS + 1];
 
 #ifdef __cplusplus
 }  // extern "C"

diff --git a/vp10/common/idct.c b/vp10/common/idct.c
index 5ee15c8..d42f5f5 100644
--- a/vp10/common/idct.c
+++ b/vp10/common/idct.c

@@ -13,107 +13,1138 @@
 #include "./vp10_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vp10/common/blockd.h"
+#include "vp10/common/enums.h"
 #include "vp10/common/idct.h"
 #include "vpx_dsp/inv_txfm.h"
 #include "vpx_ports/mem.h"
 
+#if CONFIG_EXT_TX
+void idst4_c(const tran_low_t *input, tran_low_t *output) {
+#if USE_DST2
+  tran_low_t step[4];
+  tran_high_t temp1, temp2;
+  // stage 1
+  temp1 = (input[3] + input[1]) * cospi_16_64;
+  temp2 = (input[3] - input[1]) * cospi_16_64;
+  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = input[2] * cospi_24_64 - input[0] * cospi_8_64;
+  temp2 = input[2] * cospi_8_64 + input[0] * cospi_24_64;
+  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  // stage 2
+  output[0] = WRAPLOW(step[0] + step[3], 8);
+  output[1] = WRAPLOW(-step[1] - step[2], 8);
+  output[2] = WRAPLOW(step[1] - step[2], 8);
+  output[3] = WRAPLOW(step[3] - step[0], 8);
+#else
+  // {sin(pi/5), sin(pi*2/5)} * sqrt(2/5) * sqrt(2)
+  static const int32_t sinvalue_lookup[] = {
+    141124871, 228344838,
+  };
+  int64_t sum;
+  int64_t s03 = (input[0] + input[3]);
+  int64_t d03 = (input[0] - input[3]);
+  int64_t s12 = (input[1] + input[2]);
+  int64_t d12 = (input[1] - input[2]);
+  sum = s03 * sinvalue_lookup[0] + s12 * sinvalue_lookup[1];
+  output[0] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = d03 * sinvalue_lookup[1] + d12 * sinvalue_lookup[0];
+  output[1] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = s03 * sinvalue_lookup[1] - s12 * sinvalue_lookup[0];
+  output[2] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = d03 * sinvalue_lookup[0] - d12 * sinvalue_lookup[1];
+  output[3] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+#endif  // USE_DST2
+}
+
+void idst8_c(const tran_low_t *input, tran_low_t *output) {
+#if USE_DST2
+  // vp9_igentx8(input, output, Tx8);
+  tran_low_t step1[8], step2[8];
+  tran_high_t temp1, temp2;
+  // stage 1
+  step1[0] = input[7];
+  step1[2] = input[3];
+  step1[1] = input[5];
+  step1[3] = input[1];
+  temp1 = input[6] * cospi_28_64 - input[0] * cospi_4_64;
+  temp2 = input[6] * cospi_4_64 + input[0] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = input[2] * cospi_12_64 - input[4] * cospi_20_64;
+  temp2 = input[2] * cospi_20_64 + input[4] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  // stage 2
+  temp1 = (step1[0] + step1[2]) * cospi_16_64;
+  temp2 = (step1[0] - step1[2]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+  // stage 3
+  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[7] = step2[7];
+
+  // stage 4
+  output[0] = WRAPLOW(step1[0] + step1[7], 8);
+  output[1] = WRAPLOW(-step1[1] - step1[6], 8);
+  output[2] = WRAPLOW(step1[2] + step1[5], 8);
+  output[3] = WRAPLOW(-step1[3] - step1[4], 8);
+  output[4] = WRAPLOW(step1[3] - step1[4], 8);
+  output[5] = WRAPLOW(-step1[2] + step1[5], 8);
+  output[6] = WRAPLOW(step1[1] - step1[6], 8);
+  output[7] = WRAPLOW(-step1[0] + step1[7], 8);
+#else
+  // {sin(pi/9), sin(pi*2/9), ..., sin(pi*4/9)} * sqrt(2/9) * 2
+  static const int32_t sinvalue_lookup[] = {
+    86559612, 162678858, 219176632, 249238470
+  };
+  int64_t sum;
+  int64_t s07 = (input[0] + input[7]);
+  int64_t d07 = (input[0] - input[7]);
+  int64_t s16 = (input[1] + input[6]);
+  int64_t d16 = (input[1] - input[6]);
+  int64_t s25 = (input[2] + input[5]);
+  int64_t d25 = (input[2] - input[5]);
+  int64_t s34 = (input[3] + input[4]);
+  int64_t d34 = (input[3] - input[4]);
+  sum = s07 * sinvalue_lookup[0] + s16 * sinvalue_lookup[1] +
+        s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[3];
+  output[0] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = d07 * sinvalue_lookup[1] + d16 * sinvalue_lookup[3] +
+        d25 * sinvalue_lookup[2] + d34 * sinvalue_lookup[0];
+  output[1] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = (s07 + s16 - s34)* sinvalue_lookup[2];
+  output[2] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = d07 * sinvalue_lookup[3] + d16 * sinvalue_lookup[0] -
+        d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[1];
+  output[3] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = s07 * sinvalue_lookup[3] - s16 * sinvalue_lookup[0] -
+        s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[1];
+  output[4] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = (d07 - d16 + d34)* sinvalue_lookup[2];
+  output[5] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = s07 * sinvalue_lookup[1] - s16 * sinvalue_lookup[3] +
+        s25 * sinvalue_lookup[2] - s34 * sinvalue_lookup[0];
+  output[6] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = d07 * sinvalue_lookup[0] - d16 * sinvalue_lookup[1] +
+        d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[3];
+  output[7] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+#endif  // USE_DST2
+}
+
+void idst16_c(const tran_low_t *input, tran_low_t *output) {
+#if USE_DST2
+  tran_low_t step1[16], step2[16];
+  tran_high_t temp1, temp2;
+
+  // stage 1
+  step1[0] = input[15];
+  step1[1] = input[7];
+  step1[2] = input[11];
+  step1[3] = input[3];
+  step1[4] = input[13];
+  step1[5] = input[5];
+  step1[6] = input[9];
+  step1[7] = input[1];
+  step1[8] = input[14];
+  step1[9] = input[6];
+  step1[10] = input[10];
+  step1[11] = input[2];
+  step1[12] = input[12];
+  step1[13] = input[4];
+  step1[14] = input[8];
+  step1[15] = input[0];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+
+  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
+  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
+  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
+  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
+  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
+  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
+  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
+  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5
+  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
+  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
+  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
+  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
+  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
+  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
+  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
+  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
+  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+
+  // stage 6
+  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
+  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
+  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
+  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
+  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
+  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
+  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
+  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  output[0] = WRAPLOW(step2[0] + step2[15], 8);
+  output[1] = WRAPLOW(-step2[1] - step2[14], 8);
+  output[2] = WRAPLOW(step2[2] + step2[13], 8);
+  output[3] = WRAPLOW(-step2[3] - step2[12], 8);
+  output[4] = WRAPLOW(step2[4] + step2[11], 8);
+  output[5] = WRAPLOW(-step2[5] - step2[10], 8);
+  output[6] = WRAPLOW(step2[6] + step2[9], 8);
+  output[7] = WRAPLOW(-step2[7] - step2[8], 8);
+  output[8] = WRAPLOW(step2[7] - step2[8], 8);
+  output[9] = WRAPLOW(-step2[6] + step2[9], 8);
+  output[10] = WRAPLOW(step2[5] - step2[10], 8);
+  output[11] = WRAPLOW(-step2[4] + step2[11], 8);
+  output[12] = WRAPLOW(step2[3] - step2[12], 8);
+  output[13] = WRAPLOW(-step2[2] + step2[13], 8);
+  output[14] = WRAPLOW(step2[1] - step2[14], 8);
+  output[15] = WRAPLOW(-step2[0] + step2[15], 8);
+#else
+  // {sin(pi/17), sin(pi*2/17), ..., sin(pi*8/17)} * sqrt(2/17) * 2 * sqrt(2)
+  static const int32_t sinvalue_lookup[] = {
+    47852167, 94074787, 137093803, 175444254,
+    207820161, 233119001, 250479254, 259309736
+  };
+  int64_t sum;
+  int64_t s015 = (input[0] + input[15]);
+  int64_t d015 = (input[0] - input[15]);
+  int64_t s114 = (input[1] + input[14]);
+  int64_t d114 = (input[1] - input[14]);
+  int64_t s213 = (input[2] + input[13]);
+  int64_t d213 = (input[2] - input[13]);
+  int64_t s312 = (input[3] + input[12]);
+  int64_t d312 = (input[3] - input[12]);
+  int64_t s411 = (input[4] + input[11]);
+  int64_t d411 = (input[4] - input[11]);
+  int64_t s510 = (input[5] + input[10]);
+  int64_t d510 = (input[5] - input[10]);
+  int64_t s69  = (input[6] + input[9]);
+  int64_t d69  = (input[6] - input[9]);
+  int64_t s78  = (input[7] + input[8]);
+  int64_t d78  = (input[7] - input[8]);
+  sum = s015 * sinvalue_lookup[0] + s114 * sinvalue_lookup[1] +
+        s213 * sinvalue_lookup[2] + s312 * sinvalue_lookup[3] +
+        s411 * sinvalue_lookup[4] + s510 * sinvalue_lookup[5] +
+        s69  * sinvalue_lookup[6] + s78  * sinvalue_lookup[7];
+  output[0]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = d015 * sinvalue_lookup[1] + d114 * sinvalue_lookup[3] +
+        d213 * sinvalue_lookup[5] + d312 * sinvalue_lookup[7] +
+        d411 * sinvalue_lookup[6] + d510 * sinvalue_lookup[4] +
+        d69  * sinvalue_lookup[2] + d78  * sinvalue_lookup[0];
+  output[1]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = s015 * sinvalue_lookup[2] + s114 * sinvalue_lookup[5] +
+        s213 * sinvalue_lookup[7] + s312 * sinvalue_lookup[4] +
+        s411 * sinvalue_lookup[1] - s510 * sinvalue_lookup[0] -
+        s69  * sinvalue_lookup[3] - s78  * sinvalue_lookup[6];
+  output[2]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = d015 * sinvalue_lookup[3] + d114 * sinvalue_lookup[7] +
+        d213 * sinvalue_lookup[4] + d312 * sinvalue_lookup[0] -
+        d411 * sinvalue_lookup[2] - d510 * sinvalue_lookup[6] -
+        d69  * sinvalue_lookup[5] - d78  * sinvalue_lookup[1];
+  output[3]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = s015 * sinvalue_lookup[4] + s114 * sinvalue_lookup[6] +
+        s213 * sinvalue_lookup[1] - s312 * sinvalue_lookup[2] -
+        s411 * sinvalue_lookup[7] - s510 * sinvalue_lookup[3] +
+        s69  * sinvalue_lookup[0] + s78  * sinvalue_lookup[5];
+  output[4]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = d015 * sinvalue_lookup[5] + d114 * sinvalue_lookup[4] -
+        d213 * sinvalue_lookup[0] - d312 * sinvalue_lookup[6] -
+        d411 * sinvalue_lookup[3] + d510 * sinvalue_lookup[1] +
+        d69  * sinvalue_lookup[7] + d78  * sinvalue_lookup[2];
+  output[5]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = s015 * sinvalue_lookup[6] + s114 * sinvalue_lookup[2] -
+        s213 * sinvalue_lookup[3] - s312 * sinvalue_lookup[5] +
+        s411 * sinvalue_lookup[0] + s510 * sinvalue_lookup[7] +
+        s69  * sinvalue_lookup[1] - s78  * sinvalue_lookup[4];
+  output[6]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = d015 * sinvalue_lookup[7] + d114 * sinvalue_lookup[0] -
+        d213 * sinvalue_lookup[6] - d312 * sinvalue_lookup[1] +
+        d411 * sinvalue_lookup[5] + d510 * sinvalue_lookup[2] -
+        d69  * sinvalue_lookup[4] - d78  * sinvalue_lookup[3];
+  output[7]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = s015 * sinvalue_lookup[7] - s114 * sinvalue_lookup[0] -
+        s213 * sinvalue_lookup[6] + s312 * sinvalue_lookup[1] +
+        s411 * sinvalue_lookup[5] - s510 * sinvalue_lookup[2] -
+        s69  * sinvalue_lookup[4] + s78  * sinvalue_lookup[3];
+  output[8]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = d015 * sinvalue_lookup[6] - d114 * sinvalue_lookup[2] -
+        d213 * sinvalue_lookup[3] + d312 * sinvalue_lookup[5] +
+        d411 * sinvalue_lookup[0] - d510 * sinvalue_lookup[7] +
+        d69  * sinvalue_lookup[1] + d78  * sinvalue_lookup[4];
+  output[9]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = s015 * sinvalue_lookup[5] - s114 * sinvalue_lookup[4] -
+        s213 * sinvalue_lookup[0] + s312 * sinvalue_lookup[6] -
+        s411 * sinvalue_lookup[3] - s510 * sinvalue_lookup[1] +
+        s69  * sinvalue_lookup[7] - s78  * sinvalue_lookup[2];
+  output[10] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = d015 * sinvalue_lookup[4] - d114 * sinvalue_lookup[6] +
+        d213 * sinvalue_lookup[1] + d312 * sinvalue_lookup[2] -
+        d411 * sinvalue_lookup[7] + d510 * sinvalue_lookup[3] +
+        d69  * sinvalue_lookup[0] - d78  * sinvalue_lookup[5];
+  output[11] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = s015 * sinvalue_lookup[3] - s114 * sinvalue_lookup[7] +
+        s213 * sinvalue_lookup[4] - s312 * sinvalue_lookup[0] -
+        s411 * sinvalue_lookup[2] + s510 * sinvalue_lookup[6] -
+        s69  * sinvalue_lookup[5] + s78  * sinvalue_lookup[1];
+  output[12] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = d015 * sinvalue_lookup[2] - d114 * sinvalue_lookup[5] +
+        d213 * sinvalue_lookup[7] - d312 * sinvalue_lookup[4] +
+        d411 * sinvalue_lookup[1] + d510 * sinvalue_lookup[0] -
+        d69  * sinvalue_lookup[3] + d78  * sinvalue_lookup[6];
+  output[13] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = s015 * sinvalue_lookup[1] - s114 * sinvalue_lookup[3] +
+        s213 * sinvalue_lookup[5] - s312 * sinvalue_lookup[7] +
+        s411 * sinvalue_lookup[6] - s510 * sinvalue_lookup[4] +
+        s69  * sinvalue_lookup[2] - s78  * sinvalue_lookup[0];
+  output[14] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+  sum = d015 * sinvalue_lookup[0] - d114 * sinvalue_lookup[1] +
+        d213 * sinvalue_lookup[2] - d312 * sinvalue_lookup[3] +
+        d411 * sinvalue_lookup[4] - d510 * sinvalue_lookup[5] +
+        d69  * sinvalue_lookup[6] - d78  * sinvalue_lookup[7];
+  output[15] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
+#endif  // USE_DST2
+}
+
+// Inverse identity transform and add.
+static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+                           int bs) {
+  int r, c;
+  const int shift = bs < 32 ? 3 : 2;
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c)
+      dest[c] = clip_pixel_add(dest[c], input[c] >> shift);
+    dest += stride;
+    input += bs;
+  }
+}
+
+#define FLIPUD_PTR(dest, stride, size) do {     \
+    (dest) = (dest) + ((size) - 1) * (stride);  \
+    (stride) = - (stride);                      \
+} while (0)
+
+static void maybe_flip_strides(uint8_t **dst, int *dstride,
+                               tran_low_t **src, int *sstride,
+                               int tx_type, int size) {
+  // Note that the transpose of src will be added to dst. In order to LR
+  // flip the addends (in dst coordinates), we UD flip the src. To UD flip
+  // the addends, we UD flip the dst.
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case DST_DST:
+    case DCT_DST:
+    case DST_DCT:
+    case DST_ADST:
+    case ADST_DST:
+      break;
+    case FLIPADST_DCT:
+    case FLIPADST_ADST:
+    case FLIPADST_DST:
+      // flip UD
+      FLIPUD_PTR(*dst, *dstride, size);
+      break;
+    case DCT_FLIPADST:
+    case ADST_FLIPADST:
+    case DST_FLIPADST:
+      // flip LR
+      FLIPUD_PTR(*src, *sstride, size);
+      break;
+    case FLIPADST_FLIPADST:
+      // flip UD
+      FLIPUD_PTR(*dst, *dstride, size);
+      // flip LR
+      FLIPUD_PTR(*src, *sstride, size);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void highbd_idst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
+#if USE_DST2
+  tran_low_t step[4];
+  tran_high_t temp1, temp2;
+  (void) bd;
+  // stage 1
+  temp1 = (input[3] + input[1]) * cospi_16_64;
+  temp2 = (input[3] - input[1]) * cospi_16_64;
+  step[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  temp1 = input[2] * cospi_24_64 - input[0] * cospi_8_64;
+  temp2 = input[2] * cospi_8_64 + input[0] * cospi_24_64;
+  step[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+
+  // stage 2
+  output[0] = WRAPLOW(step[0] + step[3], bd);
+  output[1] = WRAPLOW(-step[1] - step[2], bd);
+  output[2] = WRAPLOW(step[1] - step[2], bd);
+  output[3] = WRAPLOW(step[3] - step[0], bd);
+#else
+  // {sin(pi/5), sin(pi*2/5)} * sqrt(2/5) * sqrt(2)
+  static const int32_t sinvalue_lookup[] = {
+    141124871, 228344838,
+  };
+  int64_t sum;
+  int64_t s03 = (input[0] + input[3]);
+  int64_t d03 = (input[0] - input[3]);
+  int64_t s12 = (input[1] + input[2]);
+  int64_t d12 = (input[1] - input[2]);
+
+#if !CONFIG_EMULATE_HARDWARE
+  (void)bd;
+#endif
+
+  sum = s03 * sinvalue_lookup[0] + s12 * sinvalue_lookup[1];
+  output[0] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = d03 * sinvalue_lookup[1] + d12 * sinvalue_lookup[0];
+  output[1] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = s03 * sinvalue_lookup[1] - s12 * sinvalue_lookup[0];
+  output[2] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = d03 * sinvalue_lookup[0] - d12 * sinvalue_lookup[1];
+  output[3] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+#endif  // USE_DST2
+}
+
+void highbd_idst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
+#if USE_DST2  // butterfly form: four stages of rotations and add/subtracts
+  tran_low_t step1[8], step2[8];
+  tran_high_t temp1, temp2;
+  (void) bd;  // keeps bd referenced in case WRAPLOW() discards it
+  // stage 1: load odd inputs permuted; rotate input pairs (6, 0) and (2, 4)
+  step1[0] = input[7];
+  step1[2] = input[3];
+  step1[1] = input[5];
+  step1[3] = input[1];
+  temp1 = input[6] * cospi_28_64 - input[0] * cospi_4_64;
+  temp2 = input[6] * cospi_4_64 + input[0] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  temp1 = input[2] * cospi_12_64 - input[4] * cospi_20_64;
+  temp2 = input[2] * cospi_20_64 + input[4] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+
+  // stage 2: cospi butterflies into step2[0..3]; add/subtract into step2[4..7]
+  temp1 = (step1[0] + step1[2]) * cospi_16_64;
+  temp2 = (step1[0] - step1[2]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+  // stage 3: add/subtract step2[0..3]; cospi_16_64 butterfly on step2[5], [6]
+  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[7] = step2[7];
+
+  // stage 4: final add/subtract; odd-indexed outputs are negated
+  output[0] = WRAPLOW(step1[0] + step1[7], bd);
+  output[1] = WRAPLOW(-step1[1] - step1[6], bd);
+  output[2] = WRAPLOW(step1[2] + step1[5], bd);
+  output[3] = WRAPLOW(-step1[3] - step1[4], bd);
+  output[4] = WRAPLOW(step1[3] - step1[4], bd);
+  output[5] = WRAPLOW(-step1[2] + step1[5], bd);
+  output[6] = WRAPLOW(step1[1] - step1[6], bd);
+  output[7] = WRAPLOW(-step1[0] + step1[7], bd);
+#else  // direct form: each output is a dot product with a sine table
+  // {sin(pi/9), sin(pi*2/9), ..., sin(pi*4/9)} * sqrt(2/9) * 2, in fixed point
+  static const int32_t sinvalue_lookup[] = {  // scale removed by shifts below
+    86559612, 162678858, 219176632, 249238470
+  };
+  int64_t sum;  // 64-bit accumulator for the table products
+  int64_t s07 = (input[0] + input[7]);  // sNM = input[N] + input[M]
+  int64_t d07 = (input[0] - input[7]);  // dNM = input[N] - input[M]
+  int64_t s16 = (input[1] + input[6]);
+  int64_t d16 = (input[1] - input[6]);
+  int64_t s25 = (input[2] + input[5]);
+  int64_t d25 = (input[2] - input[5]);
+  int64_t s34 = (input[3] + input[4]);
+  int64_t d34 = (input[3] - input[4]);
+
+#if !CONFIG_EMULATE_HARDWARE
+  (void)bd;  // NOTE(review): presumably WRAPLOW() ignores bd here - confirm
+#endif
+
+  sum = s07 * sinvalue_lookup[0] + s16 * sinvalue_lookup[1] +
+        s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[3];
+  output[0] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = d07 * sinvalue_lookup[1] + d16 * sinvalue_lookup[3] +
+        d25 * sinvalue_lookup[2] + d34 * sinvalue_lookup[0];
+  output[1] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = (s07 + s16 - s34)* sinvalue_lookup[2];  // s25 does not contribute
+  output[2] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = d07 * sinvalue_lookup[3] + d16 * sinvalue_lookup[0] -
+        d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[1];
+  output[3] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = s07 * sinvalue_lookup[3] - s16 * sinvalue_lookup[0] -
+        s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[1];
+  output[4] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = (d07 - d16 + d34)* sinvalue_lookup[2];  // d25 does not contribute
+  output[5] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = s07 * sinvalue_lookup[1] - s16 * sinvalue_lookup[3] +
+        s25 * sinvalue_lookup[2] - s34 * sinvalue_lookup[0];
+  output[6] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = d07 * sinvalue_lookup[0] - d16 * sinvalue_lookup[1] +
+        d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[3];
+  output[7] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+#endif  // USE_DST2
+}
+
+void highbd_idst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
+#if USE_DST2  // butterfly form: seven stages of rotations and add/subtracts
+  // vp9_highbd_igentx16(input, output, bd, Tx16);
+  tran_low_t step1[16], step2[16];
+  tran_high_t temp1, temp2;
+  (void) bd;  // keeps bd referenced in case WRAPLOW() discards it
+
+  // stage 1: load the inputs in permuted order
+  step1[0] = input[15];
+  step1[1] = input[7];
+  step1[2] = input[11];
+  step1[3] = input[3];
+  step1[4] = input[13];
+  step1[5] = input[5];
+  step1[6] = input[9];
+  step1[7] = input[1];
+  step1[8] = input[14];
+  step1[9] = input[6];
+  step1[10] = input[10];
+  step1[11] = input[2];
+  step1[12] = input[12];
+  step1[13] = input[4];
+  step1[14] = input[8];
+  step1[15] = input[0];
+
+  // stage 2: copy 0..7; rotate pairs (8,15), (9,14), (10,13), (11,12)
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+
+  // stage 3: copy 0..3; rotate pairs (4,7), (5,6); add/subtract 8..15
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+
+  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+
+  // stage 4: butterflies on 0..3; add/subtract 4..7; rotate (9,14), (10,13)
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5: add/subtract 0..3 and 8..15; cospi_16_64 butterfly on (5,6)
+  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+
+  // stage 6: add/subtract 0..7; cospi_16_64 butterflies on (10,13), (11,12)
+  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7: final add/subtract; odd-indexed outputs are negated
+  output[0] = WRAPLOW(step2[0] + step2[15], bd);
+  output[1] = WRAPLOW(-step2[1] - step2[14], bd);
+  output[2] = WRAPLOW(step2[2] + step2[13], bd);
+  output[3] = WRAPLOW(-step2[3] - step2[12], bd);
+  output[4] = WRAPLOW(step2[4] + step2[11], bd);
+  output[5] = WRAPLOW(-step2[5] - step2[10], bd);
+  output[6] = WRAPLOW(step2[6] + step2[9], bd);
+  output[7] = WRAPLOW(-step2[7] - step2[8], bd);
+  output[8] = WRAPLOW(step2[7] - step2[8], bd);
+  output[9] = WRAPLOW(-step2[6] + step2[9], bd);
+  output[10] = WRAPLOW(step2[5] - step2[10], bd);
+  output[11] = WRAPLOW(-step2[4] + step2[11], bd);
+  output[12] = WRAPLOW(step2[3] - step2[12], bd);
+  output[13] = WRAPLOW(-step2[2] + step2[13], bd);
+  output[14] = WRAPLOW(step2[1] - step2[14], bd);
+  output[15] = WRAPLOW(-step2[0] + step2[15], bd);
+#else  // direct form: each output is a dot product with a sine table
+  // {sin(pi/17), sin(pi*2/17), ..., sin(pi*8/17)} * sqrt(2/17) * 2 * sqrt(2)
+  static const int32_t sinvalue_lookup[] = {  // scale removed by shifts below
+    47852167, 94074787, 137093803, 175444254,
+    207820161, 233119001, 250479254, 259309736
+  };
+  int64_t sum;  // 64-bit accumulator for the table products
+  int64_t s015 = (input[0] + input[15]);  // sAB = input[A] + input[B]
+  int64_t d015 = (input[0] - input[15]);  // dAB = input[A] - input[B]
+  int64_t s114 = (input[1] + input[14]);
+  int64_t d114 = (input[1] - input[14]);
+  int64_t s213 = (input[2] + input[13]);
+  int64_t d213 = (input[2] - input[13]);
+  int64_t s312 = (input[3] + input[12]);
+  int64_t d312 = (input[3] - input[12]);
+  int64_t s411 = (input[4] + input[11]);
+  int64_t d411 = (input[4] - input[11]);
+  int64_t s510 = (input[5] + input[10]);
+  int64_t d510 = (input[5] - input[10]);
+  int64_t s69  = (input[6] + input[9]);
+  int64_t d69  = (input[6] - input[9]);
+  int64_t s78  = (input[7] + input[8]);
+  int64_t d78  = (input[7] - input[8]);
+
+#if !CONFIG_EMULATE_HARDWARE
+  (void)bd;  // NOTE(review): presumably WRAPLOW() ignores bd here - confirm
+#endif
+
+  sum = s015 * sinvalue_lookup[0] + s114 * sinvalue_lookup[1] +
+        s213 * sinvalue_lookup[2] + s312 * sinvalue_lookup[3] +
+        s411 * sinvalue_lookup[4] + s510 * sinvalue_lookup[5] +
+        s69  * sinvalue_lookup[6] + s78  * sinvalue_lookup[7];
+  output[0]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = d015 * sinvalue_lookup[1] + d114 * sinvalue_lookup[3] +
+        d213 * sinvalue_lookup[5] + d312 * sinvalue_lookup[7] +
+        d411 * sinvalue_lookup[6] + d510 * sinvalue_lookup[4] +
+        d69  * sinvalue_lookup[2] + d78  * sinvalue_lookup[0];
+  output[1]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = s015 * sinvalue_lookup[2] + s114 * sinvalue_lookup[5] +
+        s213 * sinvalue_lookup[7] + s312 * sinvalue_lookup[4] +
+        s411 * sinvalue_lookup[1] - s510 * sinvalue_lookup[0] -
+        s69  * sinvalue_lookup[3] - s78  * sinvalue_lookup[6];
+  output[2]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = d015 * sinvalue_lookup[3] + d114 * sinvalue_lookup[7] +
+        d213 * sinvalue_lookup[4] + d312 * sinvalue_lookup[0] -
+        d411 * sinvalue_lookup[2] - d510 * sinvalue_lookup[6] -
+        d69  * sinvalue_lookup[5] - d78  * sinvalue_lookup[1];
+  output[3]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = s015 * sinvalue_lookup[4] + s114 * sinvalue_lookup[6] +
+        s213 * sinvalue_lookup[1] - s312 * sinvalue_lookup[2] -
+        s411 * sinvalue_lookup[7] - s510 * sinvalue_lookup[3] +
+        s69  * sinvalue_lookup[0] + s78  * sinvalue_lookup[5];
+  output[4]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = d015 * sinvalue_lookup[5] + d114 * sinvalue_lookup[4] -
+        d213 * sinvalue_lookup[0] - d312 * sinvalue_lookup[6] -
+        d411 * sinvalue_lookup[3] + d510 * sinvalue_lookup[1] +
+        d69  * sinvalue_lookup[7] + d78  * sinvalue_lookup[2];
+  output[5]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = s015 * sinvalue_lookup[6] + s114 * sinvalue_lookup[2] -
+        s213 * sinvalue_lookup[3] - s312 * sinvalue_lookup[5] +
+        s411 * sinvalue_lookup[0] + s510 * sinvalue_lookup[7] +
+        s69  * sinvalue_lookup[1] - s78  * sinvalue_lookup[4];
+  output[6]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = d015 * sinvalue_lookup[7] + d114 * sinvalue_lookup[0] -
+        d213 * sinvalue_lookup[6] - d312 * sinvalue_lookup[1] +
+        d411 * sinvalue_lookup[5] + d510 * sinvalue_lookup[2] -
+        d69  * sinvalue_lookup[4] - d78  * sinvalue_lookup[3];
+  output[7]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = s015 * sinvalue_lookup[7] - s114 * sinvalue_lookup[0] -
+        s213 * sinvalue_lookup[6] + s312 * sinvalue_lookup[1] +
+        s411 * sinvalue_lookup[5] - s510 * sinvalue_lookup[2] -
+        s69  * sinvalue_lookup[4] + s78  * sinvalue_lookup[3];
+  output[8]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = d015 * sinvalue_lookup[6] - d114 * sinvalue_lookup[2] -
+        d213 * sinvalue_lookup[3] + d312 * sinvalue_lookup[5] +
+        d411 * sinvalue_lookup[0] - d510 * sinvalue_lookup[7] +
+        d69  * sinvalue_lookup[1] + d78  * sinvalue_lookup[4];
+  output[9]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = s015 * sinvalue_lookup[5] - s114 * sinvalue_lookup[4] -
+        s213 * sinvalue_lookup[0] + s312 * sinvalue_lookup[6] -
+        s411 * sinvalue_lookup[3] - s510 * sinvalue_lookup[1] +
+        s69  * sinvalue_lookup[7] - s78  * sinvalue_lookup[2];
+  output[10] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = d015 * sinvalue_lookup[4] - d114 * sinvalue_lookup[6] +
+        d213 * sinvalue_lookup[1] + d312 * sinvalue_lookup[2] -
+        d411 * sinvalue_lookup[7] + d510 * sinvalue_lookup[3] +
+        d69  * sinvalue_lookup[0] - d78  * sinvalue_lookup[5];
+  output[11] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = s015 * sinvalue_lookup[3] - s114 * sinvalue_lookup[7] +
+        s213 * sinvalue_lookup[4] - s312 * sinvalue_lookup[0] -
+        s411 * sinvalue_lookup[2] + s510 * sinvalue_lookup[6] -
+        s69  * sinvalue_lookup[5] + s78  * sinvalue_lookup[1];
+  output[12] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = d015 * sinvalue_lookup[2] - d114 * sinvalue_lookup[5] +
+        d213 * sinvalue_lookup[7] - d312 * sinvalue_lookup[4] +
+        d411 * sinvalue_lookup[1] + d510 * sinvalue_lookup[0] -
+        d69  * sinvalue_lookup[3] + d78  * sinvalue_lookup[6];
+  output[13] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = s015 * sinvalue_lookup[1] - s114 * sinvalue_lookup[3] +
+        s213 * sinvalue_lookup[5] - s312 * sinvalue_lookup[7] +
+        s411 * sinvalue_lookup[6] - s510 * sinvalue_lookup[4] +
+        s69  * sinvalue_lookup[2] - s78  * sinvalue_lookup[0];
+  output[14] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+  sum = d015 * sinvalue_lookup[0] - d114 * sinvalue_lookup[1] +
+        d213 * sinvalue_lookup[2] - d312 * sinvalue_lookup[3] +
+        d411 * sinvalue_lookup[4] - d510 * sinvalue_lookup[5] +
+        d69  * sinvalue_lookup[6] - d78  * sinvalue_lookup[7];
+  output[15] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
+#endif  // USE_DST2
+}
+
+static void highbd_inv_idtx_add_c(const tran_low_t *input, uint8_t *dest8,
+                                  int stride, int bs, int bd) {
+  int r, c;  // adds the down-shifted bs x bs input to dest, row by row
+  const int shift = bs < 32 ? 3 : 2;  // right shift: 3 when bs < 32, else 2
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // 16-bit sample pointer
+
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c)
+      dest[c] = highbd_clip_pixel_add(dest[c], input[c] >> shift, bd);
+    dest += stride;  // next destination row
+    input += bs;  // input rows are packed, bs coefficients each
+  }
+}
+
+static void maybe_flip_strides16(uint16_t **dst, int *dstride,
+                                 tran_low_t **src, int *sstride,
+                                 int tx_type, int size) {
+  // Note that the transpose of src will be added to dst. In order to LR
+  // flip the addends (in dst coordinates), we UD flip the src. To UD flip
+  // the addends, we UD flip the dst.
+  switch (tx_type) {  // dst/src and their strides are updated through pointers
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case DST_DST:
+    case DCT_DST:
+    case DST_DCT:
+    case DST_ADST:
+    case ADST_DST:
+      break;  // no flip for these types
+    case FLIPADST_DCT:
+    case FLIPADST_ADST:
+    case FLIPADST_DST:
+      // flip UD: done by UD flipping the dst
+      FLIPUD_PTR(*dst, *dstride, size);
+      break;
+    case DCT_FLIPADST:
+    case ADST_FLIPADST:
+    case DST_FLIPADST:
+      // flip LR: done by UD flipping the src, which is added transposed
+      FLIPUD_PTR(*src, *sstride, size);
+      break;
+    case FLIPADST_FLIPADST:
+      // flip UD: done by UD flipping the dst
+      FLIPUD_PTR(*dst, *dstride, size);
+      // flip LR: done by UD flipping the src, which is added transposed
+      FLIPUD_PTR(*src, *sstride, size);
+      break;
+    default:  // any tx_type not listed above
+      assert(0);
+      break;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_EXT_TX
+
 void vp10_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                         int tx_type) {
-  const transform_2d IHT_4[] = {
-    { idct4_c, idct4_c  },  // DCT_DCT  = 0
-    { iadst4_c, idct4_c  },   // ADST_DCT = 1
-    { idct4_c, iadst4_c },    // DCT_ADST = 2
-    { iadst4_c, iadst4_c }      // ADST_ADST = 3
+                          int tx_type) {
+  static const transform_2d IHT_4[] = {  // indexed by tx_type: { cols, rows }
+    { idct4_c,  idct4_c  },  // DCT_DCT           = 0,
+    { iadst4_c, idct4_c  },  // ADST_DCT          = 1,
+    { idct4_c,  iadst4_c },  // DCT_ADST          = 2,
+    { iadst4_c, iadst4_c },  // ADST_ADST         = 3,
+#if CONFIG_EXT_TX
+    { iadst4_c, idct4_c  },  // FLIPADST_DCT      = 4,
+    { idct4_c,  iadst4_c },  // DCT_FLIPADST      = 5,
+    { iadst4_c, iadst4_c },  // FLIPADST_FLIPADST = 6,
+    { iadst4_c, iadst4_c },  // ADST_FLIPADST     = 7,
+    { iadst4_c, iadst4_c },  // FLIPADST_ADST     = 8,
+    { idst4_c,  idct4_c  },  // DST_DCT           = 9,
+    { idct4_c,  idst4_c  },  // DCT_DST           = 10,
+    { idst4_c,  iadst4_c },  // DST_ADST          = 11,
+    { iadst4_c, idst4_c  },  // ADST_DST          = 12,
+    { idst4_c,  iadst4_c },  // DST_FLIPADST      = 13,
+    { iadst4_c, idst4_c  },  // FLIPADST_DST      = 14,
+    { idst4_c,  idst4_c  },  // DST_DST           = 15
+#endif  // CONFIG_EXT_TX
   };
 
   int i, j;
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[4], temp_out[4];
+  tran_low_t tmp;  // swap temporary for the transpose
+  tran_low_t out[4][4];
+  tran_low_t *outp = &out[0][0];  // flat view of out, read by the final sum
+  int outstride = 4;  // passed by address to maybe_flip_strides below
 
   // inverse transform row vectors
   for (i = 0; i < 4; ++i) {
-    IHT_4[tx_type].rows(input, outptr);
+    IHT_4[tx_type].rows(input, out[i]);
     input  += 4;
-    outptr += 4;
+  }
+
+  // transpose in place so that each out[i] holds one column
+  for (i = 1 ; i < 4; i++) {
+    for (j = 0; j < i; j++) {
+            tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
   }
 
   // inverse transform column vectors
   for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = out[j * 4 + i];
-    IHT_4[tx_type].cols(temp_in, temp_out);
+    IHT_4[tx_type].cols(out[i], out[i]);  // transformed in place
+  }
+
+#if CONFIG_EXT_TX  // NOTE(review): presumably applies the FLIPADST flips
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4);
+#endif
+
+  // Sum with the destination; outp is read transposed (s swaps i and j)
+  for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
     }
   }
 }
 
-static const transform_2d IHT_8[] = {
-  { idct8_c,  idct8_c  },  // DCT_DCT  = 0
-  { iadst8_c, idct8_c  },  // ADST_DCT = 1
-  { idct8_c,  iadst8_c },  // DCT_ADST = 2
-  { iadst8_c, iadst8_c }   // ADST_ADST = 3
-};
-
 void vp10_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
+  static const transform_2d IHT_8[] = {  // indexed by tx_type: { cols, rows }
+    { idct8_c,  idct8_c  },  // DCT_DCT           = 0,
+    { iadst8_c, idct8_c  },  // ADST_DCT          = 1,
+    { idct8_c,  iadst8_c },  // DCT_ADST          = 2,
+    { iadst8_c, iadst8_c },  // ADST_ADST         = 3,
+#if CONFIG_EXT_TX
+    { iadst8_c, idct8_c  },  // FLIPADST_DCT      = 4,
+    { idct8_c,  iadst8_c },  // DCT_FLIPADST      = 5,
+    { iadst8_c, iadst8_c },  // FLIPADST_FLIPADST = 6,
+    { iadst8_c, iadst8_c },  // ADST_FLIPADST     = 7,
+    { iadst8_c, iadst8_c },  // FLIPADST_ADST     = 8,
+    { idst8_c,  idct8_c  },  // DST_DCT           = 9,
+    { idct8_c,  idst8_c  },  // DCT_DST           = 10,
+    { idst8_c,  iadst8_c },  // DST_ADST          = 11,
+    { iadst8_c, idst8_c  },  // ADST_DST          = 12,
+    { idst8_c,  iadst8_c },  // DST_FLIPADST      = 13,
+    { iadst8_c, idst8_c  },  // FLIPADST_DST      = 14,
+    { idst8_c,  idst8_c  },  // DST_DST           = 15
+#endif  // CONFIG_EXT_TX
+  };
+
   int i, j;
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[8], temp_out[8];
-  const transform_2d ht = IHT_8[tx_type];
+  tran_low_t tmp;  // swap temporary for the transpose
+  tran_low_t out[8][8];
+  tran_low_t *outp = &out[0][0];  // flat view of out, read by the final sum
+  int outstride = 8;  // passed by address to maybe_flip_strides below
 
   // inverse transform row vectors
   for (i = 0; i < 8; ++i) {
-    ht.rows(input, outptr);
-    input += 8;
-    outptr += 8;
+    IHT_8[tx_type].rows(input, out[i]);
+    input  += 8;
+  }
+
+  // transpose in place so that each out[i] holds one column
+  for (i = 1 ; i < 8; i++) {
+    for (j = 0; j < i; j++) {
+            tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
   }
 
   // inverse transform column vectors
   for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j * 8 + i];
-    ht.cols(temp_in, temp_out);
+    IHT_8[tx_type].cols(out[i], out[i]);  // transformed in place
+  }
+
+#if CONFIG_EXT_TX  // NOTE(review): presumably applies the FLIPADST flips
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 8);
+#endif
+
+  // Sum with the destination; outp is read transposed (s swaps i and j)
+  for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
     }
   }
 }
 
-static const transform_2d IHT_16[] = {
-  { idct16_c,  idct16_c  },  // DCT_DCT  = 0
-  { iadst16_c, idct16_c  },  // ADST_DCT = 1
-  { idct16_c,  iadst16_c },  // DCT_ADST = 2
-  { iadst16_c, iadst16_c }   // ADST_ADST = 3
-};
-
 void vp10_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
-  int i, j;
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[16], temp_out[16];
-  const transform_2d ht = IHT_16[tx_type];
+  static const transform_2d IHT_16[] = {  // indexed by tx_type: { cols, rows }
+    { idct16_c,  idct16_c  },  // DCT_DCT           = 0,
+    { iadst16_c, idct16_c  },  // ADST_DCT          = 1,
+    { idct16_c,  iadst16_c },  // DCT_ADST          = 2,
+    { iadst16_c, iadst16_c },  // ADST_ADST         = 3,
+#if CONFIG_EXT_TX
+    { iadst16_c, idct16_c  },  // FLIPADST_DCT      = 4,
+    { idct16_c,  iadst16_c },  // DCT_FLIPADST      = 5,
+    { iadst16_c, iadst16_c },  // FLIPADST_FLIPADST = 6,
+    { iadst16_c, iadst16_c },  // ADST_FLIPADST     = 7,
+    { iadst16_c, iadst16_c },  // FLIPADST_ADST     = 8,
+    { idst16_c,  idct16_c  },  // DST_DCT           = 9,
+    { idct16_c,  idst16_c  },  // DCT_DST           = 10,
+    { idst16_c,  iadst16_c },  // DST_ADST          = 11,
+    { iadst16_c, idst16_c  },  // ADST_DST          = 12,
+    { idst16_c,  iadst16_c },  // DST_FLIPADST      = 13,
+    { iadst16_c, idst16_c  },  // FLIPADST_DST      = 14,
+    { idst16_c,  idst16_c  },  // DST_DST           = 15
+#endif  // CONFIG_EXT_TX
+  };
 
-  // Rows
+  int i, j;
+  tran_low_t tmp;  // swap temporary for the transpose
+  tran_low_t out[16][16];
+  tran_low_t *outp = &out[0][0];  // flat view of out, read by the final sum
+  int outstride = 16;  // passed by address to maybe_flip_strides below
+
+  // inverse transform row vectors
   for (i = 0; i < 16; ++i) {
-    ht.rows(input, outptr);
-    input += 16;
-    outptr += 16;
+    IHT_16[tx_type].rows(input, out[i]);
+    input  += 16;
   }
 
-  // Columns
+  // transpose in place so that each out[i] holds one column
+  for (i = 1 ; i < 16; i++) {
+    for (j = 0; j < i; j++) {
+            tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
   for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j * 16 + i];
-    ht.cols(temp_in, temp_out);
+    IHT_16[tx_type].cols(out[i], out[i]);  // transformed in place
+  }
+
+#if CONFIG_EXT_TX  // NOTE(review): presumably applies the FLIPADST flips
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 16);
+#endif
+
+  // Sum with the destination; outp is read transposed (s swaps i and j)
+  for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
     }
   }
 }
@@ -183,20 +1214,43 @@
   if (lossless) {
     assert(tx_type == DCT_DCT);
     vp10_iwht4x4_add(input, dest, stride, eob);
-  } else {
-    switch (tx_type) {
-      case DCT_DCT:
-        vp10_idct4x4_add(input, dest, stride, eob);
-        break;
-      case ADST_DCT:
-      case DCT_ADST:
-      case ADST_ADST:
-        vp10_iht4x4_16_add(input, dest, stride, tx_type);
-        break;
-      default:
-        assert(0);
-        break;
-    }
+    return;
+  }
+
+  switch (tx_type) {
+    case DCT_DCT:
+      vp10_idct4x4_add(input, dest, stride, eob);
+      break;
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_iht4x4_16_add(input, dest, stride, tx_type);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_iht4x4_16_add(input, dest, stride, tx_type);
+      break;
+    case DST_DST:
+    case DST_DCT:
+    case DCT_DST:
+    case DST_ADST:
+    case ADST_DST:
+    case FLIPADST_DST:
+    case DST_FLIPADST:
+      // Use C version since DST only exists in C code
+      vp10_iht4x4_16_add_c(input, dest, stride, tx_type);
+      break;
+    case IDTX:
+      inv_idtx_add_c(input, dest, stride, 4);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(0);
+      break;
   }
 }
 
@@ -211,6 +1265,28 @@
     case ADST_ADST:
       vp10_iht8x8_64_add(input, dest, stride, tx_type);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_iht8x8_64_add(input, dest, stride, tx_type);
+      break;
+    case DST_DST:
+    case DST_DCT:
+    case DCT_DST:
+    case DST_ADST:
+    case ADST_DST:
+    case FLIPADST_DST:
+    case DST_FLIPADST:
+      // Use C version since DST only exists in C code
+      vp10_iht8x8_64_add_c(input, dest, stride, tx_type);
+      break;
+    case IDTX:
+      inv_idtx_add_c(input, dest, stride, 8);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -228,6 +1304,28 @@
     case ADST_ADST:
       vp10_iht16x16_256_add(input, dest, stride, tx_type);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_iht16x16_256_add(input, dest, stride, tx_type);
+      break;
+    case DST_DST:
+    case DST_DCT:
+    case DCT_DST:
+    case DST_ADST:
+    case ADST_DST:
+    case FLIPADST_DST:
+    case DST_FLIPADST:
+      // Use C version since DST only exists in C code
+      vp10_iht16x16_256_add_c(input, dest, stride, tx_type);
+      break;
+    case IDTX:
+      inv_idtx_add_c(input, dest, stride, 16);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -240,6 +1338,11 @@
     case DCT_DCT:
       vp10_idct32x32_add(input, dest, stride, eob);
       break;
+#if CONFIG_EXT_TX
+    case IDTX:
+      inv_idtx_add_c(input, dest, stride, 32);
+      break;
+#endif  // CONFIG_EXT_TX
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
@@ -254,104 +1357,198 @@
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp10_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int tx_type, int bd) {
-  const highbd_transform_2d IHT_4[] = {
-    { vpx_highbd_idct4_c, vpx_highbd_idct4_c  },    // DCT_DCT  = 0
-    { vpx_highbd_iadst4_c, vpx_highbd_idct4_c },    // ADST_DCT = 1
-    { vpx_highbd_idct4_c, vpx_highbd_iadst4_c },    // DCT_ADST = 2
-    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c }    // ADST_ADST = 3
+  static const highbd_transform_2d HIGH_IHT_4[] = {
+    { vpx_highbd_idct4_c,  vpx_highbd_idct4_c  },  // DCT_DCT           = 0,
+    { vpx_highbd_iadst4_c, vpx_highbd_idct4_c  },  // ADST_DCT          = 1,
+    { vpx_highbd_idct4_c,  vpx_highbd_iadst4_c },  // DCT_ADST          = 2,
+    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c },  // ADST_ADST         = 3,
+#if CONFIG_EXT_TX
+    { vpx_highbd_iadst4_c, vpx_highbd_idct4_c  },  // FLIPADST_DCT      = 4,
+    { vpx_highbd_idct4_c,  vpx_highbd_iadst4_c },  // DCT_FLIPADST      = 5,
+    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c },  // FLIPADST_FLIPADST = 6,
+    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c },  // ADST_FLIPADST     = 7,
+    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c },  // FLIPADST_ADST     = 8,
+    {     highbd_idst4_c,  vpx_highbd_idct4_c  },  // DST_DCT           = 9,
+    { vpx_highbd_idct4_c,      highbd_idst4_c  },  // DCT_DST           = 10,
+    {     highbd_idst4_c,  vpx_highbd_iadst4_c },  // DST_ADST          = 11,
+    { vpx_highbd_iadst4_c,     highbd_idst4_c  },  // ADST_DST          = 12,
+    {     highbd_idst4_c,  vpx_highbd_iadst4_c },  // DST_FLIPADST      = 13,
+    { vpx_highbd_iadst4_c,     highbd_idst4_c  },  // FLIPADST_DST      = 14,
+    {     highbd_idst4_c,      highbd_idst4_c  },  // DST_DST           = 15
+#endif  // CONFIG_EXT_TX
   };
+
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   int i, j;
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[4], temp_out[4];
+  tran_low_t tmp;
+  tran_low_t out[4][4];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 4;
 
-  // Inverse transform row vectors.
+  // inverse transform row vectors
   for (i = 0; i < 4; ++i) {
-    IHT_4[tx_type].rows(input, outptr, bd);
+    HIGH_IHT_4[tx_type].rows(input, out[i], bd);
     input  += 4;
-    outptr += 4;
   }
 
-  // Inverse transform column vectors.
+  // transpose
+  for (i = 1 ; i < 4; i++) {
+    for (j = 0; j < i; j++) {
+            tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
   for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = out[j * 4 + i];
-    IHT_4[tx_type].cols(temp_in, temp_out, bd);
+    HIGH_IHT_4[tx_type].cols(out[i], out[i], bd);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 4);
+#endif
+
+  // Sum with the destination
+  for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 4), bd);
     }
   }
 }
 
-static const highbd_transform_2d HIGH_IHT_8[] = {
-  { vpx_highbd_idct8_c,  vpx_highbd_idct8_c  },  // DCT_DCT  = 0
-  { vpx_highbd_iadst8_c, vpx_highbd_idct8_c  },  // ADST_DCT = 1
-  { vpx_highbd_idct8_c,  vpx_highbd_iadst8_c },  // DCT_ADST = 2
-  { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c }   // ADST_ADST = 3
-};
-
 void vp10_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int tx_type, int bd) {
-  int i, j;
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[8], temp_out[8];
-  const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
+  static const highbd_transform_2d HIGH_IHT_8[] = {
+    { vpx_highbd_idct8_c,  vpx_highbd_idct8_c  },  // DCT_DCT           = 0,
+    { vpx_highbd_iadst8_c, vpx_highbd_idct8_c  },  // ADST_DCT          = 1,
+    { vpx_highbd_idct8_c,  vpx_highbd_iadst8_c },  // DCT_ADST          = 2,
+    { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c },  // ADST_ADST         = 3,
+#if CONFIG_EXT_TX
+    { vpx_highbd_iadst8_c, vpx_highbd_idct8_c  },  // FLIPADST_DCT      = 4,
+    { vpx_highbd_idct8_c,  vpx_highbd_iadst8_c },  // DCT_FLIPADST      = 5,
+    { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c },  // FLIPADST_FLIPADST = 6,
+    { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c },  // ADST_FLIPADST     = 7,
+    { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c },  // FLIPADST_ADST     = 8,
+    {     highbd_idst8_c,  vpx_highbd_idct8_c  },  // DST_DCT           = 9,
+    { vpx_highbd_idct8_c,      highbd_idst8_c  },  // DCT_DST           = 10,
+    {     highbd_idst8_c,  vpx_highbd_iadst8_c },  // DST_ADST          = 11,
+    { vpx_highbd_iadst8_c,     highbd_idst8_c  },  // ADST_DST          = 12,
+    {     highbd_idst8_c,  vpx_highbd_iadst8_c },  // DST_FLIPADST      = 13,
+    { vpx_highbd_iadst8_c,     highbd_idst8_c  },  // FLIPADST_DST      = 14,
+    {     highbd_idst8_c,      highbd_idst8_c  },  // DST_DST           = 15
+#endif  // CONFIG_EXT_TX
+  };
+
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  // Inverse transform row vectors.
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[8][8];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 8;
+
+  // inverse transform row vectors
   for (i = 0; i < 8; ++i) {
-    ht.rows(input, outptr, bd);
-    input += 8;
-    outptr += 8;
+    HIGH_IHT_8[tx_type].rows(input, out[i], bd);
+    input  += 8;
   }
 
-  // Inverse transform column vectors.
+  // transpose
+  for (i = 1 ; i < 8; i++) {
+    for (j = 0; j < i; j++) {
+            tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
   for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j * 8 + i];
-    ht.cols(temp_in, temp_out, bd);
+    HIGH_IHT_8[tx_type].cols(out[i], out[i], bd);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 8);
+#endif
+
+  // Sum with the destination
+  for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 5), bd);
     }
   }
 }
 
-static const highbd_transform_2d HIGH_IHT_16[] = {
-  { vpx_highbd_idct16_c,  vpx_highbd_idct16_c  },  // DCT_DCT  = 0
-  { vpx_highbd_iadst16_c, vpx_highbd_idct16_c  },  // ADST_DCT = 1
-  { vpx_highbd_idct16_c,  vpx_highbd_iadst16_c },  // DCT_ADST = 2
-  { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c }   // ADST_ADST = 3
-};
-
 void vp10_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int tx_type, int bd) {
-  int i, j;
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[16], temp_out[16];
-  const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
+  static const highbd_transform_2d HIGH_IHT_16[] = {
+    { vpx_highbd_idct16_c,  vpx_highbd_idct16_c  },  // DCT_DCT           = 0,
+    { vpx_highbd_iadst16_c, vpx_highbd_idct16_c  },  // ADST_DCT          = 1,
+    { vpx_highbd_idct16_c,  vpx_highbd_iadst16_c },  // DCT_ADST          = 2,
+    { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c },  // ADST_ADST         = 3,
+#if CONFIG_EXT_TX
+    { vpx_highbd_iadst16_c, vpx_highbd_idct16_c  },  // FLIPADST_DCT      = 4,
+    { vpx_highbd_idct16_c,  vpx_highbd_iadst16_c },  // DCT_FLIPADST      = 5,
+    { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c },  // FLIPADST_FLIPADST = 6,
+    { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c },  // ADST_FLIPADST     = 7,
+    { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c },  // FLIPADST_ADST     = 8,
+    {     highbd_idst16_c,  vpx_highbd_idct16_c  },  // DST_DCT           = 9,
+    { vpx_highbd_idct16_c,      highbd_idst16_c  },  // DCT_DST           = 10,
+    {     highbd_idst16_c,  vpx_highbd_iadst16_c },  // DST_ADST          = 11,
+    { vpx_highbd_iadst16_c,     highbd_idst16_c  },  // ADST_DST          = 12,
+    {     highbd_idst16_c,  vpx_highbd_iadst16_c },  // DST_FLIPADST      = 13,
+    { vpx_highbd_iadst16_c,     highbd_idst16_c  },  // FLIPADST_DST      = 14,
+    {     highbd_idst16_c,      highbd_idst16_c  },  // DST_DST           = 15
+#endif  // CONFIG_EXT_TX
+  };
+
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  // Rows
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[16][16];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 16;
+
+  // inverse transform row vectors
   for (i = 0; i < 16; ++i) {
-    ht.rows(input, outptr, bd);
-    input += 16;
-    outptr += 16;
+    HIGH_IHT_16[tx_type].rows(input, out[i], bd);
+    input  += 16;
   }
 
-  // Columns
+  // transpose
+  for (i = 1 ; i < 16; i++) {
+    for (j = 0; j < i; j++) {
+            tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
   for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j * 16 + i];
-    ht.cols(temp_in, temp_out, bd);
+    HIGH_IHT_16[tx_type].cols(out[i], out[i], bd);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 16);
+#endif
+
+  // Sum with the destination
+  for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 6), bd);
     }
   }
 }
@@ -425,20 +1622,43 @@
   if (lossless) {
     assert(tx_type == DCT_DCT);
     vp10_highbd_iwht4x4_add(input, dest, stride, eob, bd);
-  } else {
-    switch (tx_type) {
-      case DCT_DCT:
-        vp10_highbd_idct4x4_add(input, dest, stride, eob, bd);
-        break;
-      case ADST_DCT:
-      case DCT_ADST:
-      case ADST_ADST:
-         vp10_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
-         break;
-      default:
-        assert(0);
-        break;
-    }
+    return;
+  }
+
+  switch (tx_type) {
+    case DCT_DCT:
+      vp10_highbd_idct4x4_add(input, dest, stride, eob, bd);
+      break;
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
+      break;
+    case DST_DST:
+    case DST_DCT:
+    case DCT_DST:
+    case DST_ADST:
+    case ADST_DST:
+    case FLIPADST_DST:
+    case DST_FLIPADST:
+      // Use C version since DST only exists in C code
+      vp10_highbd_iht4x4_16_add_c(input, dest, stride, tx_type, bd);
+      break;
+    case IDTX:
+      highbd_inv_idtx_add_c(input, dest, stride, 4, bd);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(0);
+      break;
   }
 }
 
@@ -454,6 +1674,28 @@
     case ADST_ADST:
       vp10_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
+      break;
+    case DST_DST:
+    case DST_DCT:
+    case DCT_DST:
+    case DST_ADST:
+    case ADST_DST:
+    case FLIPADST_DST:
+    case DST_FLIPADST:
+      // Use C version since DST only exists in C code
+      vp10_highbd_iht8x8_64_add_c(input, dest, stride, tx_type, bd);
+      break;
+    case IDTX:
+      highbd_inv_idtx_add_c(input, dest, stride, 8, bd);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -472,6 +1714,28 @@
     case ADST_ADST:
       vp10_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
+      break;
+    case DST_DST:
+    case DST_DCT:
+    case DCT_DST:
+    case DST_ADST:
+    case ADST_DST:
+    case FLIPADST_DST:
+    case DST_FLIPADST:
+      // Use C version since DST only exists in C code
+      vp10_highbd_iht16x16_256_add_c(input, dest, stride, tx_type, bd);
+      break;
+    case IDTX:
+      highbd_inv_idtx_add_c(input, dest, stride, 16, bd);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -485,6 +1749,11 @@
     case DCT_DCT:
       vp10_highbd_idct32x32_add(input, dest, stride, eob, bd);
       break;
+#if CONFIG_EXT_TX
+    case IDTX:
+      highbd_inv_idtx_add_c(input, dest, stride, 32, bd);
+      break;
+#endif  // CONFIG_EXT_TX
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
@@ -496,3 +1765,66 @@
   }
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
+                  INV_TXFM_PARAM *inv_txfm_param) {
+  const TX_TYPE tx_type = inv_txfm_param->tx_type;
+  const TX_SIZE tx_size = inv_txfm_param->tx_size;
+  const int eob = inv_txfm_param->eob;
+  const int lossless = inv_txfm_param->lossless;
+
+  switch (tx_size) {
+    case TX_32X32:
+      vp10_inv_txfm_add_32x32(input, dest, stride, eob, tx_type);
+      break;
+    case TX_16X16:
+      vp10_inv_txfm_add_16x16(input, dest, stride, eob, tx_type);
+      break;
+    case TX_8X8:
+      vp10_inv_txfm_add_8x8(input, dest, stride, eob, tx_type);
+      break;
+    case TX_4X4:
+      // This is like vp10_short_idct4x4 but has a special case around
+      // eob <= 1 which is significant (not just an optimization) for the
+      // lossless case.
+      vp10_inv_txfm_add_4x4(input, dest, stride, eob, tx_type,
+                            lossless);
+      break;
+    default:
+      assert(0 && "Invalid transform size");
+      break;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
+                         INV_TXFM_PARAM *inv_txfm_param) {
+  const TX_TYPE tx_type = inv_txfm_param->tx_type;
+  const TX_SIZE tx_size = inv_txfm_param->tx_size;
+  const int eob = inv_txfm_param->eob;
+  const int bd = inv_txfm_param->bd;
+  const int lossless = inv_txfm_param->lossless;
+
+  switch (tx_size) {
+    case TX_32X32:
+      vp10_highbd_inv_txfm_add_32x32(input, dest, stride, eob, bd, tx_type);
+      break;
+    case TX_16X16:
+      vp10_highbd_inv_txfm_add_16x16(input, dest, stride, eob, bd, tx_type);
+      break;
+    case TX_8X8:
+      vp10_highbd_inv_txfm_add_8x8(input, dest, stride, eob, bd, tx_type);
+      break;
+    case TX_4X4:
+      // This is like vp10_short_idct4x4 but has a special case around
+      // eob <= 1 which is significant (not just an optimization) for the
+      // lossless case.
+      vp10_highbd_inv_txfm_add_4x4(input, dest, stride, eob, bd, tx_type,
+                                   lossless);
+      break;
+    default:
+      assert(0 && "Invalid transform size");
+      break;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH

diff --git a/vp10/common/idct.h b/vp10/common/idct.h
index 0883398..31b26b8 100644
--- a/vp10/common/idct.h
+++ b/vp10/common/idct.h

@@ -24,6 +24,16 @@
 extern "C" {
 #endif
 
+typedef struct INV_TXFM_PARAM {
+  TX_TYPE tx_type;
+  TX_SIZE tx_size;
+  int eob;
+  int lossless;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int bd;
+#endif
+} INV_TXFM_PARAM;
+
 typedef void (*transform_1d)(const tran_low_t*, tran_low_t*);
 
 typedef struct {
@@ -51,7 +61,8 @@
                              int stride, int eob, TX_TYPE tx_type);
 void vp10_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
                              int stride, int eob, TX_TYPE tx_type);
-
+void inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
+                  INV_TXFM_PARAM *inv_txfm_param);
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp10_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                             int eob, int bd);
@@ -74,6 +85,8 @@
 void vp10_highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
                                     int stride, int eob, int bd,
                                     TX_TYPE tx_type);
+void highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
+                         INV_TXFM_PARAM *inv_txfm_param);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #ifdef __cplusplus
 }  // extern "C"

diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index a1925de..20d724d 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c

@@ -719,11 +719,7 @@
   uint64_t *const int_4x4_y = &lfm->int_4x4_y;
   uint16_t *const left_uv = &lfm->left_uv[tx_size_uv];
   uint16_t *const above_uv = &lfm->above_uv[tx_size_uv];
-#if CONFIG_MISC_FIXES
   uint16_t *const int_4x4_uv = &lfm->left_int_4x4_uv;
-#else
-  uint16_t *const int_4x4_uv = &lfm->int_4x4_uv;
-#endif
   int i;
 
   // If filter level is 0 we don't loop filter.
@@ -758,13 +754,8 @@
 
   // If the block has no coefficients and is not intra we skip applying
   // the loop filter on block edges.
-#if CONFIG_MISC_FIXES
   if ((mbmi->skip || mbmi->has_no_coeffs) && is_inter_block(mbmi))
     return;
-#else
-  if (mbmi->skip && is_inter_block(mbmi))
-    return;
-#endif
 
   // Here we are adding a mask for the transform size. The transform
   // size mask is set to be correct for a 64x64 prediction block size. We
@@ -796,10 +787,18 @@
 // we only update u and v masks on the first block.
 static void build_y_mask(const loop_filter_info_n *const lfi_n,
                          const MODE_INFO *mi, const int shift_y,
+#if CONFIG_SUPERTX
+                         int supertx_enabled,
+#endif  // CONFIG_SUPERTX
                          LOOP_FILTER_MASK *lfm) {
   const MB_MODE_INFO *mbmi = &mi->mbmi;
-  const BLOCK_SIZE block_size = mbmi->sb_type;
   const TX_SIZE tx_size_y = mbmi->tx_size;
+#if CONFIG_SUPERTX
+  const BLOCK_SIZE block_size =
+      supertx_enabled ? (BLOCK_SIZE)(3 * tx_size_y) : mbmi->sb_type;
+#else
+  const BLOCK_SIZE block_size = mbmi->sb_type;
+#endif
   const int filter_level = get_filter_level(lfi_n, mbmi);
   uint64_t *const left_y = &lfm->left_y[tx_size_y];
   uint64_t *const above_y = &lfm->above_y[tx_size_y];
@@ -821,13 +820,8 @@
   *above_y |= above_prediction_mask[block_size] << shift_y;
   *left_y |= left_prediction_mask[block_size] << shift_y;
 
-#if CONFIG_MISC_FIXES
   if ((mbmi->skip || mbmi->has_no_coeffs) && is_inter_block(mbmi))
     return;
-#else
-  if (mbmi->skip && is_inter_block(mbmi))
-    return;
-#endif
 
   *above_y |= (size_mask[block_size] &
                above_64x64_txform_mask[tx_size_y]) << shift_y;
@@ -913,6 +907,10 @@
             break;
           case BLOCK_32X16:
             build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+#if CONFIG_SUPERTX
+            if (supertx_enabled(&mip[0]->mbmi))
+              break;
+#endif
             if (mi_32_row_offset + 2 >= max_rows)
               continue;
             mip2 = mip + mode_info_stride * 2;
@@ -920,12 +918,22 @@
             break;
           case BLOCK_16X32:
             build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+#if CONFIG_SUPERTX
+            if (supertx_enabled(&mip[0]->mbmi))
+              break;
+#endif
             if (mi_32_col_offset + 2 >= max_cols)
               continue;
             mip2 = mip + 2;
             build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm);
             break;
           default:
+#if CONFIG_SUPERTX
+            if (mip[0]->mbmi.tx_size == TX_32X32) {
+              build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+              break;
+            }
+#endif
             for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
               const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16];
               const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16];
@@ -942,23 +950,45 @@
                   build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
                   break;
                 case BLOCK_16X8:
+#if CONFIG_SUPERTX
+                  if (supertx_enabled(&mip[0]->mbmi))
+                    break;
+#endif
                   build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
                   if (mi_16_row_offset + 1 >= max_rows)
                     continue;
                   mip2 = mip + mode_info_stride;
-                  build_y_mask(lfi_n, mip2[0], shift_y+8, lfm);
+                  build_y_mask(lfi_n, mip2[0], shift_y+8,
+#if CONFIG_SUPERTX
+                               0,
+#endif
+                               lfm);
                   break;
                 case BLOCK_8X16:
+#if CONFIG_SUPERTX
+                  if (supertx_enabled(&mip[0]->mbmi))
+                    break;
+#endif
                   build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
                   if (mi_16_col_offset +1 >= max_cols)
                     continue;
                   mip2 = mip + 1;
-                  build_y_mask(lfi_n, mip2[0], shift_y+1, lfm);
+                  build_y_mask(lfi_n, mip2[0], shift_y+1,
+#if CONFIG_SUPERTX
+                               0,
+#endif
+                               lfm);
                   break;
                 default: {
                   const int shift_y = shift_32_y[idx_32] +
                                       shift_16_y[idx_16] +
                                       shift_8_y[0];
+#if CONFIG_SUPERTX
+                  if (mip[0]->mbmi.tx_size == TX_16X16) {
+                    build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+                    break;
+                  }
+#endif
                   build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
                   mip += offset[0];
                   for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
@@ -973,7 +1003,11 @@
                     if (mi_8_col_offset >= max_cols ||
                         mi_8_row_offset >= max_rows)
                       continue;
-                    build_y_mask(lfi_n, mip[0], shift_y, lfm);
+                    build_y_mask(lfi_n, mip[0], shift_y,
+#if CONFIG_SUPERTX
+                                 supertx_enabled(&mip[0]->mbmi),
+#endif
+                                 lfm);
                   }
                   break;
                 }
@@ -1019,11 +1053,7 @@
       lfm->above_uv[i] &= mask_uv;
     }
     lfm->int_4x4_y &= mask_y;
-#if CONFIG_MISC_FIXES
     lfm->above_int_4x4_uv = lfm->left_int_4x4_uv & mask_uv;
-#else
-    lfm->int_4x4_uv &= mask_uv;
-#endif
 
     // We don't apply a wide loop filter on the last uv block row. If set
     // apply the shorter one instead.
@@ -1057,11 +1087,7 @@
       lfm->above_uv[i] &= mask_uv;
     }
     lfm->int_4x4_y &= mask_y;
-#if CONFIG_MISC_FIXES
     lfm->left_int_4x4_uv &= mask_uv_int;
-#else
-    lfm->int_4x4_uv &= mask_uv_int;
-#endif
 
     // We don't apply a wide loop filter on the last uv column. If set
     // apply the shorter one instead.
@@ -1091,11 +1117,7 @@
   assert(!(lfm->left_uv[TX_16X16]&lfm->left_uv[TX_8X8]));
   assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4]));
   assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4]));
-#if CONFIG_MISC_FIXES
   assert(!(lfm->left_int_4x4_uv & lfm->left_uv[TX_16X16]));
-#else
-  assert(!(lfm->int_4x4_uv & lfm->left_uv[TX_16X16]));
-#endif
   assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8]));
   assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4]));
   assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4]));
@@ -1103,11 +1125,7 @@
   assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8]));
   assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4]));
   assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4]));
-#if CONFIG_MISC_FIXES
   assert(!(lfm->above_int_4x4_uv & lfm->above_uv[TX_16X16]));
-#else
-  assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16]));
-#endif
 }
 
 static void filter_selectively_vert(uint8_t *s, int pitch,
@@ -1183,9 +1201,9 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 void vp10_filter_block_plane_non420(VP10_COMMON *cm,
-                                   struct macroblockd_plane *plane,
-                                   MODE_INFO **mi_8x8,
-                                   int mi_row, int mi_col) {
+                                    struct macroblockd_plane *plane,
+                                    MODE_INFO **mi_8x8,
+                                    int mi_row, int mi_col) {
   const int ss_x = plane->subsampling_x;
   const int ss_y = plane->subsampling_y;
   const int row_step = 1 << ss_y;
@@ -1209,49 +1227,103 @@
     // Determine the vertical edges that need filtering
     for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
       const MODE_INFO *mi = mi_8x8[c];
-      const BLOCK_SIZE sb_type = mi[0].mbmi.sb_type;
-      const int skip_this = mi[0].mbmi.skip && is_inter_block(&mi[0].mbmi);
+      const MB_MODE_INFO *mbmi = &mi[0].mbmi;
+      const BLOCK_SIZE sb_type = mbmi->sb_type;
+      const int skip_this = mbmi->skip && is_inter_block(mbmi);
+      const int blk_row = r & (num_8x8_blocks_high_lookup[sb_type] - 1);
+      const int blk_col = c & (num_8x8_blocks_wide_lookup[sb_type] - 1);
+
       // left edge of current unit is block/partition edge -> no skip
       const int block_edge_left = (num_4x4_blocks_wide_lookup[sb_type] > 1) ?
-          !(c & (num_8x8_blocks_wide_lookup[sb_type] - 1)) : 1;
+          !blk_col : 1;
       const int skip_this_c = skip_this && !block_edge_left;
       // top edge of current unit is block/partition edge -> no skip
       const int block_edge_above = (num_4x4_blocks_high_lookup[sb_type] > 1) ?
-          !(r & (num_8x8_blocks_high_lookup[sb_type] - 1)) : 1;
+          !blk_row : 1;
       const int skip_this_r = skip_this && !block_edge_above;
+
+#if CONFIG_VAR_TX
+      TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
+          ? get_uv_tx_size(mbmi, plane) : mbmi->tx_size;
+#else
       const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
-                            ? get_uv_tx_size(&mi[0].mbmi, plane)
-                            : mi[0].mbmi.tx_size;
+                            ? get_uv_tx_size(mbmi, plane)
+                            : mbmi->tx_size;
+#endif
+
       const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
       const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
 
+      TX_SIZE tx_size_c = tx_size;
+      TX_SIZE tx_size_r = tx_size;
+
+      int tx_size_mask = 0;
       // Filter level can vary per MI
       if (!(lfl[(r << 3) + (c >> ss_x)] =
-            get_filter_level(&cm->lf_info, &mi[0].mbmi)))
+            get_filter_level(&cm->lf_info, mbmi)))
         continue;
 
+      if (tx_size == TX_32X32)
+        tx_size_mask = 3;
+      else if (tx_size == TX_16X16)
+        tx_size_mask = 1;
+      else
+        tx_size_mask = 0;
+
+#if CONFIG_VAR_TX
+      if (is_inter_block(mbmi) && !mbmi->skip)
+        tx_size = (plane->plane_type == PLANE_TYPE_UV) ?
+            get_uv_tx_size_impl(mbmi->inter_tx_size[blk_row * 8 + blk_col],
+                                sb_type, ss_x, ss_y) :
+            mbmi->inter_tx_size[blk_row * 8 + blk_col];
+
+      tx_size_r = VPXMIN(tx_size, cm->above_txfm_context[mi_col + c]);
+      tx_size_c = VPXMIN(tx_size, cm->left_txfm_context[(mi_row + r) & 0x07]);
+
+      cm->above_txfm_context[mi_col + c] = tx_size;
+      cm->left_txfm_context[(mi_row + r) & 0x07] = tx_size;
+#endif
+
       // Build masks based on the transform size of each block
-      if (tx_size == TX_32X32) {
-        if (!skip_this_c && ((c >> ss_x) & 3) == 0) {
+      // handle vertical mask
+      if (tx_size_c == TX_32X32) {
+        if (!skip_this_c && ((c >> ss_x) & tx_size_mask) == 0) {
           if (!skip_border_4x4_c)
             mask_16x16_c |= 1 << (c >> ss_x);
           else
             mask_8x8_c |= 1 << (c >> ss_x);
         }
-        if (!skip_this_r && ((r >> ss_y) & 3) == 0) {
+      } else if (tx_size_c == TX_16X16) {
+        if (!skip_this_c && ((c >> ss_x) & tx_size_mask) == 0) {
+          if (!skip_border_4x4_c)
+            mask_16x16_c |= 1 << (c >> ss_x);
+          else
+            mask_8x8_c |= 1 << (c >> ss_x);
+        }
+      } else {
+        // force 8x8 filtering on 32x32 boundaries
+        if (!skip_this_c && ((c >> ss_x) & tx_size_mask) == 0) {
+          if (tx_size_c == TX_8X8 || ((c >> ss_x) & 3) == 0)
+            mask_8x8_c |= 1 << (c >> ss_x);
+          else
+            mask_4x4_c |= 1 << (c >> ss_x);
+        }
+
+        if (!skip_this && tx_size_c < TX_8X8 && !skip_border_4x4_c &&
+            ((c >> ss_x) & tx_size_mask) == 0)
+          mask_4x4_int[r] |= 1 << (c >> ss_x);
+      }
+
+      // set horizontal mask
+      if (tx_size_r == TX_32X32) {
+        if (!skip_this_r && ((r >> ss_y) & tx_size_mask) == 0) {
           if (!skip_border_4x4_r)
             mask_16x16[r] |= 1 << (c >> ss_x);
           else
             mask_8x8[r] |= 1 << (c >> ss_x);
         }
-      } else if (tx_size == TX_16X16) {
-        if (!skip_this_c && ((c >> ss_x) & 1) == 0) {
-          if (!skip_border_4x4_c)
-            mask_16x16_c |= 1 << (c >> ss_x);
-          else
-            mask_8x8_c |= 1 << (c >> ss_x);
-        }
-        if (!skip_this_r && ((r >> ss_y) & 1) == 0) {
+      } else if (tx_size_r == TX_16X16) {
+        if (!skip_this_r && ((r >> ss_y) & tx_size_mask) == 0) {
           if (!skip_border_4x4_r)
             mask_16x16[r] |= 1 << (c >> ss_x);
           else
@@ -1259,21 +1331,15 @@
         }
       } else {
         // force 8x8 filtering on 32x32 boundaries
-        if (!skip_this_c) {
-          if (tx_size == TX_8X8 || ((c >> ss_x) & 3) == 0)
-            mask_8x8_c |= 1 << (c >> ss_x);
-          else
-            mask_4x4_c |= 1 << (c >> ss_x);
-        }
-
-        if (!skip_this_r) {
-          if (tx_size == TX_8X8 || ((r >> ss_y) & 3) == 0)
+        if (!skip_this_r && ((r >> ss_y) & tx_size_mask) == 0) {
+          if (tx_size_r == TX_8X8 || ((r >> ss_y) & 3) == 0)
             mask_8x8[r] |= 1 << (c >> ss_x);
           else
             mask_4x4[r] |= 1 << (c >> ss_x);
         }
 
-        if (!skip_this && tx_size < TX_8X8 && !skip_border_4x4_c)
+        if (!skip_this && tx_size_r < TX_8X8 && !skip_border_4x4_c &&
+            ((r >> ss_y) & tx_size_mask) == 0)
           mask_4x4_int[r] |= 1 << (c >> ss_x);
       }
     }
@@ -1462,11 +1528,7 @@
   uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
   uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
   uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
-#if CONFIG_MISC_FIXES
   uint16_t mask_4x4_int = lfm->left_int_4x4_uv;
-#else
-  uint16_t mask_4x4_int = lfm->int_4x4_uv;
-#endif
 
   assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
 
@@ -1518,11 +1580,7 @@
   mask_16x16 = lfm->above_uv[TX_16X16];
   mask_8x8 = lfm->above_uv[TX_8X8];
   mask_4x4 = lfm->above_uv[TX_4X4];
-#if CONFIG_MISC_FIXES
   mask_4x4_int = lfm->above_int_4x4_uv;
-#else
-  mask_4x4_int = lfm->int_4x4_uv;
-#endif
 
   for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
     const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
@@ -1568,13 +1626,14 @@
 }
 
 void vp10_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
-                          VP10_COMMON *cm,
-                          struct macroblockd_plane planes[MAX_MB_PLANE],
-                          int start, int stop, int y_only) {
+                           VP10_COMMON *cm,
+                           struct macroblockd_plane planes[MAX_MB_PLANE],
+                           int start, int stop, int y_only) {
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
+  int mi_row, mi_col;
+#if !CONFIG_VAR_TX
   enum lf_path path;
   LOOP_FILTER_MASK lfm;
-  int mi_row, mi_col;
 
   if (y_only)
     path = LF_PATH_444;
@@ -1584,19 +1643,29 @@
     path = LF_PATH_444;
   else
     path = LF_PATH_SLOW;
+#endif
 
+#if CONFIG_VAR_TX
+  memset(cm->above_txfm_context, TX_SIZES, cm->mi_cols);
+#endif
   for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
     MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
-
+#if CONFIG_VAR_TX
+    memset(cm->left_txfm_context, TX_SIZES, 8);
+#endif
     for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
       int plane;
 
       vp10_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
 
+#if CONFIG_VAR_TX
+      for (plane = 0; plane < num_planes; ++plane)
+        vp10_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
+                                       mi_row, mi_col);
+#else
       // TODO(JBB): Make setup_mask work for non 420.
       vp10_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
                      &lfm);
-
       vp10_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
       for (plane = 1; plane < num_planes; ++plane) {
         switch (path) {
@@ -1612,6 +1681,7 @@
             break;
         }
       }
+#endif
     }
   }
 }

diff --git a/vp10/common/loopfilter.h b/vp10/common/loopfilter.h
index 8db705a..3d76439 100644
--- a/vp10/common/loopfilter.h
+++ b/vp10/common/loopfilter.h

@@ -43,7 +43,8 @@
   uint8_t mode_ref_delta_enabled;
   uint8_t mode_ref_delta_update;
 
-  // 0 = Intra, Last, GF, ARF
+  // 0 = Intra, Last, Last2+Last3+Last4 (CONFIG_EXT_REFS),
+  // GF, ARF
   signed char ref_deltas[MAX_REF_FRAMES];
   signed char last_ref_deltas[MAX_REF_FRAMES];
 
@@ -80,12 +81,8 @@
   uint64_t int_4x4_y;
   uint16_t left_uv[TX_SIZES];
   uint16_t above_uv[TX_SIZES];
-#if CONFIG_MISC_FIXES
   uint16_t left_int_4x4_uv;
   uint16_t above_int_4x4_uv;
-#else
-  uint16_t int_4x4_uv;
-#endif
   uint8_t lfl_y[64];
   uint8_t lfl_uv[16];
 } LOOP_FILTER_MASK;

diff --git a/vp10/common/mv.h b/vp10/common/mv.h
index b4971a5..289c591 100644
--- a/vp10/common/mv.h
+++ b/vp10/common/mv.h

@@ -34,6 +34,13 @@
   int32_t col;
 } MV32;
 
+#if CONFIG_REF_MV
+typedef struct candidate_mv {
+  int_mv this_mv;
+  int weight;
+} CANDIDATE_MV;
+#endif
+
 static INLINE int is_zero_mv(const MV *mv) {
   return *((const uint32_t *)mv) == 0;
 }
@@ -48,6 +55,9 @@
   mv->row = clamp(mv->row, min_row, max_row);
 }
 
+static INLINE int mv_has_subpel(const MV *mv) {
+  return (mv->row & 0x0F) || (mv->col & 0x0F);
+}
 #ifdef __cplusplus
 }  // extern "C"
 #endif

diff --git a/vp10/common/mvref_common.c b/vp10/common/mvref_common.c
index 1ef80c2..319ef4a 100644
--- a/vp10/common/mvref_common.c
+++ b/vp10/common/mvref_common.c

@@ -11,6 +11,427 @@
 
 #include "vp10/common/mvref_common.h"
 
+#if CONFIG_REF_MV
+static uint8_t scan_row_mbmi(const VP10_COMMON *cm, const MACROBLOCKD *xd,
+                             const int mi_row, const int mi_col, int block,
+                             const MV_REFERENCE_FRAME ref_frame,
+                             int row_offset,
+                             CANDIDATE_MV *ref_mv_stack,
+                             uint8_t *refmv_count) {
+  const TileInfo *const tile = &xd->tile;
+  int i;
+  uint8_t newmv_count = 0;
+
+  for (i = 0; i < xd->n8_w && *refmv_count < MAX_REF_MV_STACK_SIZE;) {
+    POSITION mi_pos;
+    mi_pos.row = row_offset;
+    mi_pos.col = i;
+
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) {
+      const MODE_INFO *const candidate_mi =
+          xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
+      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+      const int len = VPXMIN(xd->n8_w,
+                             num_8x8_blocks_wide_lookup[candidate->sb_type]);
+      const int weight = len;
+      int index = 0, ref;
+
+      for (ref = 0; ref < 2; ++ref) {
+        if (candidate->ref_frame[ref] == ref_frame) {
+          int_mv this_refmv =
+              get_sub_block_mv(candidate_mi, ref, mi_pos.col, block);
+          for (index = 0; index < *refmv_count; ++index)
+            if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int)
+              break;
+
+          if (index < *refmv_count)
+            ref_mv_stack[index].weight += weight;
+
+          // Add a new item to the list.
+          if (index == *refmv_count) {
+            ref_mv_stack[index].this_mv = this_refmv;
+            ref_mv_stack[index].weight = weight;
+            ++(*refmv_count);
+
+            if (candidate->mode == NEWMV)
+              ++newmv_count;
+          }
+        }
+      }
+      i += len;
+    } else {
+      ++i;
+    }
+  }
+
+  return newmv_count;
+}
+
+static uint8_t scan_col_mbmi(const VP10_COMMON *cm, const MACROBLOCKD *xd,
+                             const int mi_row, const int mi_col, int block,
+                             const MV_REFERENCE_FRAME ref_frame,
+                             int col_offset,
+                             CANDIDATE_MV *ref_mv_stack,
+                             uint8_t *refmv_count) {
+  const TileInfo *const tile = &xd->tile;
+  int i;
+  uint8_t newmv_count = 0;
+
+  for (i = 0; i < xd->n8_h && *refmv_count < MAX_REF_MV_STACK_SIZE;) {
+    POSITION mi_pos;
+    mi_pos.row = i;
+    mi_pos.col = col_offset;
+
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) {
+      const MODE_INFO *const candidate_mi =
+          xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
+      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+      const int len = VPXMIN(xd->n8_h,
+                       num_8x8_blocks_high_lookup[candidate->sb_type]);
+      const int weight = len;
+      int index = 0, ref;
+
+      for (ref = 0; ref < 2; ++ref) {
+        if (candidate->ref_frame[ref] == ref_frame) {
+          int_mv this_refmv =
+              get_sub_block_mv(candidate_mi, ref, mi_pos.col, block);
+          for (index = 0; index < *refmv_count; ++index)
+            if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int)
+              break;
+
+          if (index < *refmv_count)
+            ref_mv_stack[index].weight += weight;
+
+          if (index == *refmv_count) {
+            ref_mv_stack[index].this_mv = this_refmv;
+            ref_mv_stack[index].weight = weight;
+            ++(*refmv_count);
+
+            if (candidate->mode == NEWMV)
+              ++newmv_count;
+          }
+        }
+      }
+      i += len;
+    } else {
+      ++i;
+    }
+  }
+
+  return newmv_count;
+}
+
+static uint8_t scan_blk_mbmi(const VP10_COMMON *cm, const MACROBLOCKD *xd,
+                             const int mi_row, const int mi_col, int block,
+                             const MV_REFERENCE_FRAME ref_frame,
+                             int row_offset, int col_offset,
+                             CANDIDATE_MV *ref_mv_stack,
+                             uint8_t *refmv_count) {
+  const TileInfo *const tile = &xd->tile;
+  POSITION mi_pos;
+  uint8_t newmv_count = 0;
+
+  mi_pos.row = row_offset;
+  mi_pos.col = col_offset;
+
+  if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &mi_pos) &&
+      *refmv_count < MAX_REF_MV_STACK_SIZE) {
+    const MODE_INFO *const candidate_mi =
+        xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
+    const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+    const int len = 1;
+    const int weight = len;
+    int index = 0, ref;
+
+    for (ref = 0; ref < 2; ++ref) {
+      if (candidate->ref_frame[ref] == ref_frame) {
+        int_mv this_refmv =
+            get_sub_block_mv(candidate_mi, ref, mi_pos.col, block);
+        for (index = 0; index < *refmv_count; ++index)
+          if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int)
+            break;
+
+        if (index < *refmv_count)
+          ref_mv_stack[index].weight += weight;
+
+        if (index == *refmv_count) {
+          ref_mv_stack[index].this_mv = this_refmv;
+          ref_mv_stack[index].weight = weight;
+          ++(*refmv_count);
+
+          if (candidate->mode == NEWMV)
+            ++newmv_count;
+        }
+
+        if (candidate_mi->mbmi.sb_type < BLOCK_8X8 && block >= 0) {
+          int alt_block = 3 - block;
+          this_refmv =
+              get_sub_block_mv(candidate_mi, ref, mi_pos.col, alt_block);
+          for (index = 0; index < *refmv_count; ++index)
+            if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int)
+              break;
+
+          if (index < *refmv_count)
+            ref_mv_stack[index].weight += weight;
+
+          // Add a new item to the list.
+          if (index == *refmv_count) {
+            ref_mv_stack[index].this_mv = this_refmv;
+            ref_mv_stack[index].weight = weight;
+            ++(*refmv_count);
+          }
+        }
+      }
+    }
+  }  // Analyze a single 8x8 block motion information.
+  return newmv_count;
+}
+
+static int has_top_right(const MACROBLOCKD *xd,
+                         int mi_row, int mi_col, int bs) {
+  int has_tr = !((mi_row & bs) & (bs * 2 - 1)) ||
+               !((mi_col & bs) & (bs * 2 - 1));
+
+  // Filter out partial right-most boundaries
+  if ((mi_col & bs) & (bs * 2 - 1)) {
+    if (((mi_col & (2 * bs)) & (bs * 4 - 1)) &&
+        ((mi_row & (2 * bs)) & (bs * 4 - 1)))
+      has_tr = 0;
+  }
+
+  if (has_tr)
+    if (((mi_col + xd->n8_w) & 0x07) == 0)
+      if ((mi_row & 0x07) > 0)
+        has_tr = 0;
+
+  if (xd->n8_w < xd->n8_h)
+    if (!xd->is_sec_rect)
+      has_tr = 1;
+
+  if (xd->n8_w > xd->n8_h)
+    if (xd->is_sec_rect)
+      has_tr = 0;
+
+  return has_tr;
+}
+
+static void handle_sec_rect_block(const MB_MODE_INFO * const candidate,
+                                  uint8_t refmv_count,
+                                  CANDIDATE_MV *ref_mv_stack,
+                                  MV_REFERENCE_FRAME ref_frame,
+                                  int16_t *mode_context) {
+  int rf, idx;
+
+  for (rf = 0; rf < 2; ++rf) {
+    if (candidate->ref_frame[rf] == ref_frame) {
+      const int list_range = VPXMIN(refmv_count, MAX_MV_REF_CANDIDATES);
+
+      const int_mv pred_mv = candidate->mv[rf];
+      for (idx = 0; idx < list_range; ++idx)
+        if (pred_mv.as_int == ref_mv_stack[idx].this_mv.as_int)
+          break;
+
+      if (idx < list_range) {
+        if (idx == 0)
+          mode_context[ref_frame] |= (1 << SKIP_NEARESTMV_OFFSET);
+        else if (idx == 1)
+          mode_context[ref_frame] |= (1 << SKIP_NEARMV_OFFSET);
+      }
+    }
+  }
+}
+
+static void setup_ref_mv_list(const VP10_COMMON *cm, const MACROBLOCKD *xd,
+                              MV_REFERENCE_FRAME ref_frame,
+                              uint8_t *refmv_count,
+                              CANDIDATE_MV *ref_mv_stack,
+                              int_mv *mv_ref_list,
+                              int block, int mi_row, int mi_col,
+                              int16_t *mode_context) {
+  int idx, nearest_refmv_count = 0;
+  uint8_t newmv_count = 0;
+
+  CANDIDATE_MV tmp_mv;
+  int len, nr_len;
+
+  const MV_REF *const prev_frame_mvs_base = cm->use_prev_frame_mvs ?
+      cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL;
+
+  int bs = VPXMAX(xd->n8_w, xd->n8_h);
+  int has_tr = has_top_right(xd, mi_row, mi_col, bs);
+
+  mode_context[ref_frame] = 0;
+  *refmv_count = 0;
+
+  // Scan the first above row mode info.
+  newmv_count = scan_row_mbmi(cm, xd, mi_row, mi_col, block, ref_frame,
+                              -1, ref_mv_stack, refmv_count);
+  // Scan the first left column mode info.
+  newmv_count += scan_col_mbmi(cm, xd, mi_row, mi_col, block, ref_frame,
+                               -1, ref_mv_stack, refmv_count);
+
+  // Check top-right boundary
+  if (has_tr)
+    newmv_count += scan_blk_mbmi(cm, xd, mi_row, mi_col, block, ref_frame,
+                                 -1, 1, ref_mv_stack, refmv_count);
+
+  nearest_refmv_count = *refmv_count;
+
+  if (prev_frame_mvs_base && cm->show_frame && cm->last_show_frame) {
+    int ref;
+    int blk_row, blk_col;
+
+    for (blk_row = 0; blk_row < xd->n8_h; ++blk_row) {
+      for (blk_col = 0; blk_col < xd->n8_w; ++blk_col) {
+        const MV_REF *prev_frame_mvs =
+            prev_frame_mvs_base + blk_row * cm->mi_cols + blk_col;
+
+        POSITION mi_pos;
+        mi_pos.row = blk_row;
+        mi_pos.col = blk_col;
+
+        if (!is_inside(&xd->tile, mi_col, mi_row, cm->mi_rows, &mi_pos))
+          continue;
+
+        for (ref = 0; ref < 2; ++ref) {
+          if (prev_frame_mvs->ref_frame[ref] == ref_frame) {
+            for (idx = 0; idx < *refmv_count; ++idx)
+              if (prev_frame_mvs->mv[ref].as_int ==
+                  ref_mv_stack[idx].this_mv.as_int)
+                break;
+
+            if (idx < *refmv_count)
+              ref_mv_stack[idx].weight += 1;
+
+            if (idx == *refmv_count &&
+                *refmv_count < MAX_REF_MV_STACK_SIZE) {
+              ref_mv_stack[idx].this_mv.as_int = prev_frame_mvs->mv[ref].as_int;
+              ref_mv_stack[idx].weight = 1;
+              ++(*refmv_count);
+
+              if (abs(ref_mv_stack[idx].this_mv.as_mv.row) >= 8 ||
+                  abs(ref_mv_stack[idx].this_mv.as_mv.col) >= 8)
+                mode_context[ref_frame] |= (1 << ZEROMV_OFFSET);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  if (*refmv_count == nearest_refmv_count)
+    mode_context[ref_frame] |= (1 << ZEROMV_OFFSET);
+
+  // Analyze the top-left corner block mode info.
+//  scan_blk_mbmi(cm, xd, mi_row, mi_col, block, ref_frame,
+//                -1, -1, ref_mv_stack, refmv_count);
+
+  // Scan the second outer area.
+  scan_row_mbmi(cm, xd, mi_row, mi_col, block, ref_frame,
+                -2, ref_mv_stack, refmv_count);
+  scan_col_mbmi(cm, xd, mi_row, mi_col, block, ref_frame,
+                -2, ref_mv_stack, refmv_count);
+
+  // Scan the third outer area.
+  scan_row_mbmi(cm, xd, mi_row, mi_col, block, ref_frame,
+                -3, ref_mv_stack, refmv_count);
+  scan_col_mbmi(cm, xd, mi_row, mi_col, block, ref_frame,
+                -3, ref_mv_stack, refmv_count);
+
+  // Scan the fourth outer area.
+  scan_row_mbmi(cm, xd, mi_row, mi_col, block, ref_frame,
+                -4, ref_mv_stack, refmv_count);
+  // Scan the fourth left column mode info.
+  scan_col_mbmi(cm, xd, mi_row, mi_col, block, ref_frame,
+                -4, ref_mv_stack, refmv_count);
+
+  switch (nearest_refmv_count) {
+    case 0:
+      mode_context[ref_frame] |= 0;
+      if (*refmv_count >= 1)
+        mode_context[ref_frame] |= 1;
+
+      if (*refmv_count == 1)
+        mode_context[ref_frame] |= (1 << REFMV_OFFSET);
+      else if (*refmv_count >= 2)
+        mode_context[ref_frame] |= (2 << REFMV_OFFSET);
+      break;
+    case 1:
+      mode_context[ref_frame] |= (newmv_count > 0) ? 2 : 3;
+
+      if (*refmv_count == 1)
+        mode_context[ref_frame] |= (3 << REFMV_OFFSET);
+      else if (*refmv_count >= 2)
+        mode_context[ref_frame] |= (4 << REFMV_OFFSET);
+      break;
+
+    case 2:
+    default:
+      if (newmv_count >= 2)
+        mode_context[ref_frame] |= 4;
+      else if (newmv_count == 1)
+        mode_context[ref_frame] |= 5;
+      else
+        mode_context[ref_frame] |= 6;
+
+      mode_context[ref_frame] |= (5 << REFMV_OFFSET);
+      break;
+  }
+
+  // Rank the likelihood and assign nearest and near mvs.
+  len = nearest_refmv_count;
+  while (len > 0) {
+    nr_len = 0;
+    for (idx = 1; idx < len; ++idx) {
+      if (ref_mv_stack[idx - 1].weight < ref_mv_stack[idx].weight) {
+        tmp_mv = ref_mv_stack[idx - 1];
+        ref_mv_stack[idx - 1] = ref_mv_stack[idx];
+        ref_mv_stack[idx] = tmp_mv;
+        nr_len = idx;
+      }
+    }
+    len = nr_len;
+  }
+
+  len = *refmv_count;
+  while (len > nearest_refmv_count) {
+    nr_len = nearest_refmv_count;
+    for (idx = nearest_refmv_count + 1; idx < len; ++idx) {
+      if (ref_mv_stack[idx - 1].weight < ref_mv_stack[idx].weight) {
+        tmp_mv = ref_mv_stack[idx - 1];
+        ref_mv_stack[idx - 1] = ref_mv_stack[idx];
+        ref_mv_stack[idx] = tmp_mv;
+        nr_len = idx;
+      }
+    }
+    len = nr_len;
+  }
+
+  // TODO(jingning): Clean-up needed.
+  if (xd->is_sec_rect) {
+    if (xd->n8_w < xd->n8_h) {
+      const MODE_INFO *const candidate_mi = xd->mi[-1];
+      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+      handle_sec_rect_block(candidate, nearest_refmv_count, ref_mv_stack,
+                            ref_frame, mode_context);
+    }
+
+    if (xd->n8_w > xd->n8_h) {
+      const MODE_INFO *const candidate_mi = xd->mi[-xd->mi_stride];
+      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+      handle_sec_rect_block(candidate, nearest_refmv_count, ref_mv_stack,
+                            ref_frame, mode_context);
+    }
+  }
+
+  for (idx = 0; idx < VPXMIN(MAX_MV_REF_CANDIDATES, *refmv_count); ++idx) {
+    mv_ref_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int;
+    clamp_mv_ref(&mv_ref_list[idx].as_mv,
+                 xd->n8_w << 3, xd->n8_h << 3, xd);
+  }
+}
+#endif
+
 // This function searches the neighbourhood of a given MB/SB
 // to try and find candidate reference vectors.
 static void find_mv_refs_idx(const VP10_COMMON *cm, const MACROBLOCKD *xd,
@@ -18,7 +439,7 @@
                              int_mv *mv_ref_list,
                              int block, int mi_row, int mi_col,
                              find_mv_refs_sync sync, void *const data,
-                             uint8_t *mode_context) {
+                             int16_t *mode_context) {
   const int *ref_sign_bias = cm->ref_frame_sign_bias;
   int i, refmv_count = 0;
   const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
@@ -30,11 +451,6 @@
   const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type] << 3;
   const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type] << 3;
 
-#if !CONFIG_MISC_FIXES
-  // Blank the reference vector list
-  memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
-#endif
-
   // The nearest 2 blocks are treated differently
   // if the size < 8x8 we get the mv from the bmi substructure,
   // and we also need to keep a mode count.
@@ -133,9 +549,6 @@
     }
 
     if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME &&
-#if !CONFIG_MISC_FIXES
-        prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int &&
-#endif
         prev_frame_mvs->ref_frame[1] != ref_frame) {
       int_mv mv = prev_frame_mvs->mv[1];
       if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] !=
@@ -147,28 +560,40 @@
     }
   }
 
- Done:
-
-  mode_context[ref_frame] = counter_to_context[context_counter];
-
-#if CONFIG_MISC_FIXES
+Done:
+  if (mode_context)
+    mode_context[ref_frame] = counter_to_context[context_counter];
   for (i = refmv_count; i < MAX_MV_REF_CANDIDATES; ++i)
       mv_ref_list[i].as_int = 0;
-#else
-  // Clamp vectors
-  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i)
-    clamp_mv_ref(&mv_ref_list[i].as_mv, bw, bh, xd);
-#endif
 }
 
 void vp10_find_mv_refs(const VP10_COMMON *cm, const MACROBLOCKD *xd,
                       MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+#if CONFIG_REF_MV
+                      uint8_t *ref_mv_count,
+                      CANDIDATE_MV *ref_mv_stack,
+#endif
                       int_mv *mv_ref_list,
                       int mi_row, int mi_col,
                       find_mv_refs_sync sync, void *const data,
-                      uint8_t *mode_context) {
+                      int16_t *mode_context) {
+#if CONFIG_REF_MV
+  int idx, all_zero = 1;
+#endif
   find_mv_refs_idx(cm, xd, mi, ref_frame, mv_ref_list, -1,
                    mi_row, mi_col, sync, data, mode_context);
+
+#if CONFIG_REF_MV
+  setup_ref_mv_list(cm, xd, ref_frame, ref_mv_count, ref_mv_stack,
+                    mv_ref_list, -1, mi_row, mi_col, mode_context);
+
+  for (idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx)
+    if (mv_ref_list[idx].as_int != 0)
+      all_zero = 0;
+
+  if (all_zero)
+    mode_context[ref_frame] |= (1 << ALL_ZERO_FLAG_OFFSET);
+#endif
 }
 
 static void lower_mv_precision(MV *mv, int allow_hp) {
@@ -194,18 +619,45 @@
 }
 
 void vp10_append_sub8x8_mvs_for_idx(VP10_COMMON *cm, MACROBLOCKD *xd,
-                                   int block, int ref, int mi_row, int mi_col,
-                                   int_mv *nearest_mv, int_mv *near_mv,
-                                   uint8_t *mode_context) {
+                                    int block, int ref, int mi_row, int mi_col,
+                                    int_mv *nearest_mv, int_mv *near_mv) {
   int_mv mv_list[MAX_MV_REF_CANDIDATES];
   MODE_INFO *const mi = xd->mi[0];
   b_mode_info *bmi = mi->bmi;
   int n;
+#if CONFIG_REF_MV
+  CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE];
+  CANDIDATE_MV tmp_mv;
+  uint8_t ref_mv_count = 0, idx;
+  uint8_t above_count = 0, left_count = 0;
+#endif
 
   assert(MAX_MV_REF_CANDIDATES == 2);
 
   find_mv_refs_idx(cm, xd, mi, mi->mbmi.ref_frame[ref], mv_list, block,
-                   mi_row, mi_col, NULL, NULL, mode_context);
+                   mi_row, mi_col, NULL, NULL, NULL);
+
+#if CONFIG_REF_MV
+  scan_blk_mbmi(cm, xd, mi_row, mi_col, block, mi->mbmi.ref_frame[ref],
+                -1, 0, ref_mv_stack, &ref_mv_count);
+  above_count = ref_mv_count;
+
+  scan_blk_mbmi(cm, xd, mi_row, mi_col, block, mi->mbmi.ref_frame[ref],
+                0, -1, ref_mv_stack, &ref_mv_count);
+  left_count = ref_mv_count - above_count;
+
+  if (above_count > 1 && left_count > 0) {
+    tmp_mv = ref_mv_stack[1];
+    ref_mv_stack[1] = ref_mv_stack[above_count];
+    ref_mv_stack[above_count] = tmp_mv;
+  }
+
+  for (idx = 0; idx < VPXMIN(MAX_MV_REF_CANDIDATES, ref_mv_count); ++idx) {
+    mv_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int;
+    clamp_mv_ref(&mv_list[idx].as_mv,
+                 xd->n8_w << 3, xd->n8_h << 3, xd);
+  }
+#endif
 
   near_mv->as_int = 0;
   switch (block) {

diff --git a/vp10/common/mvref_common.h b/vp10/common/mvref_common.h
index 0a98866..3968469 100644
--- a/vp10/common/mvref_common.h
+++ b/vp10/common/mvref_common.h

@@ -119,26 +119,13 @@
 };
 
 // clamp_mv_ref
-#if CONFIG_MISC_FIXES
 #define MV_BORDER (8 << 3)  // Allow 8 pels in 1/8th pel units
-#else
-#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
-#endif
 
 static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) {
-#if CONFIG_MISC_FIXES
   clamp_mv(mv, xd->mb_to_left_edge - bw * 8 - MV_BORDER,
                xd->mb_to_right_edge + bw * 8 + MV_BORDER,
                xd->mb_to_top_edge - bh * 8 - MV_BORDER,
                xd->mb_to_bottom_edge + bh * 8 + MV_BORDER);
-#else
-  (void) bw;
-  (void) bh;
-  clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
-               xd->mb_to_right_edge + MV_BORDER,
-               xd->mb_to_top_edge - MV_BORDER,
-               xd->mb_to_bottom_edge + MV_BORDER);
-#endif
 }
 
 // This function returns either the appropriate sub block or block's mv
@@ -164,11 +151,7 @@
   return mv;
 }
 
-#if CONFIG_MISC_FIXES
 #define CLIP_IN_ADD(mv, bw, bh, xd) clamp_mv_ref(mv, bw, bh, xd)
-#else
-#define CLIP_IN_ADD(mv, bw, bh, xd) do {} while (0)
-#endif
 
 // This macro is used to add a motion vector mv_ref list if it isn't
 // already in the list.  If it's the second motion vector it will also
@@ -194,8 +177,6 @@
         ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \
                         refmv_count, mv_ref_list, bw, bh, xd, Done); \
       if (has_second_ref(mbmi) && \
-          (CONFIG_MISC_FIXES || \
-           (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) && \
           (mbmi)->ref_frame[1] != ref_frame) \
         ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \
                         refmv_count, mv_ref_list, bw, bh, xd, Done); \
@@ -214,12 +195,39 @@
            mi_col + mi_pos->col >= tile->mi_col_end);
 }
 
+#if CONFIG_REF_MV
+static int16_t vp10_mode_context_analyzer(const int16_t *const mode_context,
+                                          const MV_REFERENCE_FRAME *const rf,
+                                          BLOCK_SIZE bsize, int block) {
+  int16_t mode_ctx = 0;
+  if (block >= 0) {
+    mode_ctx = mode_context[rf[0]] & 0x00ff;
+
+    if (block > 0 && bsize < BLOCK_8X8 && bsize > BLOCK_4X4)
+      mode_ctx |= (1 << SKIP_NEARESTMV_SUB8X8_OFFSET);
+
+    return mode_ctx;
+  }
+
+  if (rf[1] > INTRA_FRAME)
+    return mode_context[rf[0]] & (mode_context[rf[1]] | 0x00ff);
+  else if (rf[0] != ALTREF_FRAME)
+    return mode_context[rf[0]] & ~(mode_context[ALTREF_FRAME] & 0xfe00);
+  else
+    return mode_context[rf[0]];
+}
+#endif
+
 typedef void (*find_mv_refs_sync)(void *const data, int mi_row);
 void vp10_find_mv_refs(const VP10_COMMON *cm, const MACROBLOCKD *xd,
-                      MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
-                      int_mv *mv_ref_list, int mi_row, int mi_col,
-                      find_mv_refs_sync sync, void *const data,
-                      uint8_t *mode_context);
+                       MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+#if CONFIG_REF_MV
+                       uint8_t *ref_mv_count,
+                       CANDIDATE_MV *ref_mv_stack,
+#endif
+                       int_mv *mv_ref_list, int mi_row, int mi_col,
+                       find_mv_refs_sync sync, void *const data,
+                       int16_t *mode_context);
 
 // check a list of motion vectors by sad score using a number rows of pixels
 // above and a number cols of pixels in the left to select the one with best
@@ -228,9 +236,8 @@
                            int_mv *mvlist, int_mv *nearest_mv, int_mv *near_mv);
 
 void vp10_append_sub8x8_mvs_for_idx(VP10_COMMON *cm, MACROBLOCKD *xd,
-                                   int block, int ref, int mi_row, int mi_col,
-                                   int_mv *nearest_mv, int_mv *near_mv,
-                                   uint8_t *mode_context);
+                                    int block, int ref, int mi_row, int mi_col,
+                                    int_mv *nearest_mv, int_mv *near_mv);
 
 #ifdef __cplusplus
 }  // extern "C"

diff --git a/vp10/common/onyxc_int.h b/vp10/common/onyxc_int.h
index ffef733..9b7a729 100644
--- a/vp10/common/onyxc_int.h
+++ b/vp10/common/onyxc_int.h

@@ -20,6 +20,7 @@
 #include "vp10/common/entropymv.h"
 #include "vp10/common/entropy.h"
 #include "vp10/common/entropymode.h"
+#include "vp10/common/mv.h"
 #include "vp10/common/frame_buffers.h"
 #include "vp10/common/quant_common.h"
 #include "vp10/common/tile_common.h"
@@ -173,6 +174,12 @@
 #endif
 
   FRAME_TYPE last_frame_type;  /* last frame's frame type for motion search.*/
+#if CONFIG_EXT_REFS
+  // frame type of the frame before last frame
+  FRAME_TYPE last2_frame_type;
+  // frame type of the frame two frames before last frame
+  FRAME_TYPE last3_frame_type;
+#endif  // CONFIG_EXT_REFS
   FRAME_TYPE frame_type;
 
   int show_frame;
@@ -185,6 +192,8 @@
 
   int allow_high_precision_mv;
 
+  int allow_screen_content_tools;
+
   // Flag signaling which frame contexts should be reset to default values.
   RESET_FRAME_CONTEXT_MODE reset_frame_context;
 
@@ -252,15 +261,12 @@
 
   struct loopfilter lf;
   struct segmentation seg;
-#if !CONFIG_MISC_FIXES
-  struct segmentation_probs segp;
-#endif
 
   int frame_parallel_decode;  // frame-based threading.
 
   // Context probabilities for reference frame prediction
   MV_REFERENCE_FRAME comp_fixed_ref;
-  MV_REFERENCE_FRAME comp_var_ref[2];
+  MV_REFERENCE_FRAME comp_var_ref[COMP_REFS];
   REFERENCE_MODE reference_mode;
 
   FRAME_CONTEXT *fc;  /* this frame entropy */
@@ -299,6 +305,10 @@
 
   PARTITION_CONTEXT *above_seg_context;
   ENTROPY_CONTEXT *above_context;
+#if CONFIG_VAR_TX
+  TXFM_CONTEXT *above_txfm_context;
+  TXFM_CONTEXT left_txfm_context[8];
+#endif
   int above_context_alloc_cols;
 
   // scratch memory for intraonly/keyframe forward updates from default tables
@@ -395,6 +405,9 @@
   }
 
   xd->above_seg_context = cm->above_seg_context;
+#if CONFIG_VAR_TX
+  xd->above_txfm_context = cm->above_txfm_context;
+#endif
   xd->mi_stride = cm->mi_stride;
   xd->error_info = &cm->error;
 }
@@ -444,6 +457,19 @@
     xd->left_mi = NULL;
     xd->left_mbmi = NULL;
   }
+
+  xd->n8_h = bh;
+  xd->n8_w = bw;
+#if CONFIG_REF_MV
+  xd->is_sec_rect = 0;
+  if (xd->n8_w < xd->n8_h)
+    if (mi_col & (xd->n8_h - 1))
+      xd->is_sec_rect = 1;
+
+  if (xd->n8_w > xd->n8_h)
+    if (mi_row & (xd->n8_w - 1))
+      xd->is_sec_rect = 1;
+#endif
 }
 
 static INLINE const vpx_prob *get_y_mode_probs(const VP10_COMMON *cm,
@@ -487,6 +513,84 @@
   return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
 }
 
+#if CONFIG_VAR_TX
+static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx,
+                                TX_SIZE tx_size,
+                                int len) {
+  int i;
+  for (i = 0; i < len; ++i)
+    txfm_ctx[i] = tx_size;
+}
+
+static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
+                                         TXFM_CONTEXT *left_ctx,
+                                         TX_SIZE tx_size) {
+  BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+  int bs = num_8x8_blocks_high_lookup[bsize];
+  int i;
+  for (i = 0; i < bs; ++i) {
+    above_ctx[i] = tx_size;
+    left_ctx[i] = tx_size;
+  }
+}
+
+static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx,
+                                         TXFM_CONTEXT *left_ctx,
+                                         TX_SIZE tx_size) {
+  int above = *above_ctx < tx_size;
+  int left = *left_ctx < tx_size;
+  return (tx_size - 1) * 3 + above + left;
+}
+#endif
+
+#if CONFIG_EXT_INTERP
+static INLINE int vp10_is_interp_needed(const MACROBLOCKD *const xd) {
+  MODE_INFO *const mi = xd->mi[0];
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int is_compound = has_second_ref(mbmi);
+  int intpel_mv;
+
+#if SUPPORT_NONINTERPOLATING_FILTERS
+  // TODO(debargha): This is currently only for experimentation
+  // with non-interpolating filters. Remove later.
+  // If any of the filters are non-interpolating, then indicate the
+  // interpolation filter always.
+  int i;
+  for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+    if (!IsInterpolatingFilter(i)) return 1;
+  }
+#endif
+
+  // For scaled references, interpolation filter is indicated all the time.
+  if (vp10_is_scaled(&xd->block_refs[0]->sf))
+    return 1;
+  if (is_compound && vp10_is_scaled(&xd->block_refs[1]->sf))
+    return 1;
+
+  if (bsize < BLOCK_8X8) {
+    intpel_mv =
+        !mv_has_subpel(&mi->bmi[0].as_mv[0].as_mv) &&
+        !mv_has_subpel(&mi->bmi[1].as_mv[0].as_mv) &&
+        !mv_has_subpel(&mi->bmi[2].as_mv[0].as_mv) &&
+        !mv_has_subpel(&mi->bmi[3].as_mv[0].as_mv);
+    if (is_compound && intpel_mv) {
+      intpel_mv &=
+          !mv_has_subpel(&mi->bmi[0].as_mv[1].as_mv) &&
+          !mv_has_subpel(&mi->bmi[1].as_mv[1].as_mv) &&
+          !mv_has_subpel(&mi->bmi[2].as_mv[1].as_mv) &&
+          !mv_has_subpel(&mi->bmi[3].as_mv[1].as_mv);
+    }
+  } else {
+    intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
+    if (is_compound && intpel_mv) {
+      intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);
+    }
+  }
+  return !intpel_mv;
+}
+#endif  // CONFIG_EXT_INTERP
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif

diff --git a/vp10/common/pred_common.c b/vp10/common/pred_common.c
index 2e79e0d..9c42794 100644
--- a/vp10/common/pred_common.c
+++ b/vp10/common/pred_common.c

@@ -103,9 +103,424 @@
   return ctx;
 }
 
+#if CONFIG_EXT_REFS
+
+// TODO(zoeliu): Future work will be conducted to optimize the context design
+// for the coding of the reference frames.
+
+#define CHECK_LAST_OR_LAST2(ref_frame) \
+  ((ref_frame == LAST_FRAME) || (ref_frame == LAST2_FRAME))
+
+#define CHECK_GOLDEN_LAST3_LAST4(ref_frame) \
+  ((ref_frame == GOLDEN_FRAME) || (ref_frame == LAST3_FRAME) || \
+  (ref_frame == LAST4_FRAME))
+
+// Returns a context number in [0, REF_CONTEXTS) for the signal that tells
+// whether the first reference frame of a compound mode is one of
+// GOLDEN/LAST3/LAST4 or one of LAST/LAST2.
+//
+// NOTE(zoeliu): Models the probability that ref_frame[0] is one of
+//               GOLDEN_FRAME/LAST3_FRAME/LAST4_FRAME.
+int vp10_get_pred_context_comp_ref_p(const VP10_COMMON *cm,
+                                     const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int above_in_image = xd->up_available;
+  const int left_in_image = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialised to 0.
+  const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+  const int var_ref_idx = !fix_ref_idx;
+
+  if (above_in_image && left_in_image) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra (2)
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+      if (!has_second_ref(edge_mbmi))  // single pred (1/3)
+        pred_context = 1 +
+            2 * (!CHECK_GOLDEN_LAST3_LAST4(edge_mbmi->ref_frame[0]));
+      else  // comp pred (1/3)
+        pred_context = 1 +
+            2 * (!CHECK_GOLDEN_LAST3_LAST4(edge_mbmi->ref_frame[var_ref_idx]));
+    } else {  // inter/inter
+      const int l_sg = !has_second_ref(left_mbmi);
+      const int a_sg = !has_second_ref(above_mbmi);
+      const MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0]
+                                           : above_mbmi->ref_frame[var_ref_idx];
+      const MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0]
+                                           : left_mbmi->ref_frame[var_ref_idx];
+
+      if (vrfa == vrfl && CHECK_GOLDEN_LAST3_LAST4(vrfa)) {
+        pred_context = 0;
+      } else if (l_sg && a_sg) {  // single/single
+        if ((vrfa == ALTREF_FRAME && CHECK_LAST_OR_LAST2(vrfl)) ||
+            (vrfl == ALTREF_FRAME && CHECK_LAST_OR_LAST2(vrfa))) {
+          pred_context = 4;
+        } else if (vrfa == vrfl || (CHECK_LAST_OR_LAST2(vrfa) &&
+                                    CHECK_LAST_OR_LAST2(vrfl))) {
+          pred_context = 3;
+        } else {  // Either vrfa or vrfl is GOLDEN / LAST3 / LAST4
+          // NOTE(zoeliu): Following assert may be removed once confirmed.
+          assert(CHECK_GOLDEN_LAST3_LAST4(vrfa) ||
+                 CHECK_GOLDEN_LAST3_LAST4(vrfl));
+          pred_context = 1;
+        }
+      } else if (l_sg || a_sg) {  // single/comp
+        const MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl;
+        const MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl;
+
+        if (CHECK_GOLDEN_LAST3_LAST4(vrfc) && !CHECK_GOLDEN_LAST3_LAST4(rfs))
+          pred_context = 1;
+        else if (CHECK_GOLDEN_LAST3_LAST4(rfs) &&
+                 !CHECK_GOLDEN_LAST3_LAST4(vrfc))
+          pred_context = 2;
+        else
+          pred_context = 4;
+      } else {  // comp/comp
+        if ((CHECK_LAST_OR_LAST2(vrfa) && CHECK_LAST_OR_LAST2(vrfl))) {
+          pred_context = 4;
+        } else {
+          // NOTE(zoeliu): Following assert may be removed once confirmed.
+          assert(CHECK_GOLDEN_LAST3_LAST4(vrfa) ||
+                 CHECK_GOLDEN_LAST3_LAST4(vrfl));
+          pred_context = 2;
+        }
+      }
+    }
+  } else if (above_in_image || left_in_image) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi)) {
+      pred_context = 2;
+    } else {
+      if (has_second_ref(edge_mbmi))
+        pred_context =
+            4 * (!CHECK_GOLDEN_LAST3_LAST4(edge_mbmi->ref_frame[var_ref_idx]));
+      else
+        pred_context = 3 * (!CHECK_GOLDEN_LAST3_LAST4(edge_mbmi->ref_frame[0]));
+    }
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+
+  return pred_context;
+}
+
+// Returns a context number in [0, REF_CONTEXTS) for the signal that tells
+// whether the first reference frame of a compound mode is LAST, given that
+// it is already known to be either LAST or LAST2.
+//
+// NOTE(zoeliu): Models the probability that ref_frame[0] is LAST_FRAME,
+// given that it is either LAST_FRAME or LAST2_FRAME.
+int vp10_get_pred_context_comp_ref_p1(const VP10_COMMON *cm,
+                                      const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int above_in_image = xd->up_available;
+  const int left_in_image = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialised to 0.
+  const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+  const int var_ref_idx = !fix_ref_idx;
+
+  if (above_in_image && left_in_image) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra (2)
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+      if (!has_second_ref(edge_mbmi))  // single pred (1/3)
+        pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != LAST_FRAME);
+      else  // comp pred (1/3)
+        pred_context = 1 + 2 * (edge_mbmi->ref_frame[var_ref_idx]
+                                != LAST_FRAME);
+    } else {  // inter/inter
+      const int l_sg = !has_second_ref(left_mbmi);
+      const int a_sg = !has_second_ref(above_mbmi);
+      const MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0]
+                                           : above_mbmi->ref_frame[var_ref_idx];
+      const MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0]
+                                           : left_mbmi->ref_frame[var_ref_idx];
+
+      if (vrfa == vrfl && vrfa == LAST_FRAME)
+        pred_context = 0;
+      else if (l_sg && a_sg) {  // single/single
+        if (vrfa == LAST_FRAME || vrfl == LAST_FRAME)
+          pred_context = 1;
+        else if (CHECK_GOLDEN_LAST3_LAST4(vrfa) ||
+                 CHECK_GOLDEN_LAST3_LAST4(vrfl))
+          pred_context = 2 + (vrfa != vrfl);
+        else if (vrfa == vrfl)
+          pred_context = 3;
+        else
+          pred_context = 4;
+      } else if (l_sg || a_sg) {  // single/comp
+        const MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl;
+        const MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl;
+
+        if (vrfc == LAST_FRAME && rfs != LAST_FRAME)
+          pred_context = 1;
+        else if (rfs == LAST_FRAME && vrfc != LAST_FRAME)
+          pred_context = 2;
+        else
+          pred_context = 3 +
+              (vrfc == LAST2_FRAME || CHECK_GOLDEN_LAST3_LAST4(rfs));
+      } else {  // comp/comp
+        if (vrfa == LAST_FRAME || vrfl == LAST_FRAME)
+          pred_context = 2;
+        else
+          pred_context = 3 + (CHECK_GOLDEN_LAST3_LAST4(vrfa) ||
+                              CHECK_GOLDEN_LAST3_LAST4(vrfl));
+      }
+    }
+  } else if (above_in_image || left_in_image) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi)) {
+      pred_context = 2;
+    } else {
+      if (has_second_ref(edge_mbmi)) {
+        pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx] != LAST_FRAME);
+      } else {
+        if (edge_mbmi->ref_frame[0] == LAST_FRAME)
+          pred_context = 0;
+        else
+          pred_context = 2 + CHECK_GOLDEN_LAST3_LAST4(edge_mbmi->ref_frame[0]);
+      }
+    }
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+
+  return pred_context;
+}
+
+#define CHECK_LAST3_OR_LAST4(ref_frame) \
+  ((ref_frame == LAST3_FRAME) || (ref_frame == LAST4_FRAME))
+
+// Returns a context number in [0, REF_CONTEXTS) for the signal that tells
+// whether the first reference frame of a compound mode is GOLDEN, given
+// that it is already known to be one of GOLDEN/LAST3/LAST4.
+//
+// NOTE(zoeliu): Models the probability that ref_frame[0] is GOLDEN_FRAME,
+// given that it is one of GOLDEN / LAST3 / LAST4.
+int vp10_get_pred_context_comp_ref_p2(const VP10_COMMON *cm,
+                                      const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int above_in_image = xd->up_available;
+  const int left_in_image = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialised to 0.
+  const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+  const int var_ref_idx = !fix_ref_idx;
+
+  if (above_in_image && left_in_image) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra (2)
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+      if (!has_second_ref(edge_mbmi))  // single pred (1/3)
+        pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != GOLDEN_FRAME);
+      else  // comp pred (1/3)
+        pred_context = 1 +
+            2 * (edge_mbmi->ref_frame[var_ref_idx] != GOLDEN_FRAME);
+    } else {  // inter/inter
+      const int l_sg = !has_second_ref(left_mbmi);
+      const int a_sg = !has_second_ref(above_mbmi);
+      const MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0]
+                                           : above_mbmi->ref_frame[var_ref_idx];
+      const MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0]
+                                           : left_mbmi->ref_frame[var_ref_idx];
+
+      if (vrfa == vrfl && vrfa == GOLDEN_FRAME)
+        pred_context = 0;
+      else if (l_sg && a_sg) {  // single/single
+        if (vrfa == GOLDEN_FRAME || vrfl == GOLDEN_FRAME)
+          pred_context = 1;
+        else if (CHECK_LAST_OR_LAST2(vrfa) || CHECK_LAST_OR_LAST2(vrfl))
+          pred_context = 2 + (vrfa != vrfl);
+        else if (vrfa == vrfl)
+          pred_context = 3;
+        else
+          pred_context = 4;
+      } else if (l_sg || a_sg) {  // single/comp
+        const MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl;
+        const MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl;
+
+        if (vrfc == GOLDEN_FRAME && rfs != GOLDEN_FRAME)
+          pred_context = 1;
+        else if (rfs == GOLDEN_FRAME && vrfc != GOLDEN_FRAME)
+          pred_context = 2;
+        else
+          pred_context = 3 +
+              (CHECK_LAST3_OR_LAST4(vrfc) || CHECK_LAST_OR_LAST2(rfs));
+      } else {  // comp/comp
+        if (vrfa == GOLDEN_FRAME || vrfl == GOLDEN_FRAME)
+          pred_context = 2;
+        else
+          pred_context = 3 +
+              (CHECK_LAST_OR_LAST2(vrfa) || CHECK_LAST_OR_LAST2(vrfl));
+      }
+    }
+  } else if (above_in_image || left_in_image) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi)) {
+      pred_context = 2;
+    } else {
+      if (has_second_ref(edge_mbmi)) {
+        pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx] != GOLDEN_FRAME);
+      } else {
+        if (edge_mbmi->ref_frame[0] == GOLDEN_FRAME)
+          pred_context = 0;
+        else
+          pred_context = 2 + CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]);
+      }
+    }
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+
+  return pred_context;
+}
+
+#define CHECK_LAST_LAST2_GOLDEN(ref_frame) \
+  ((ref_frame == LAST_FRAME) || (ref_frame == LAST2_FRAME) || \
+  (ref_frame == GOLDEN_FRAME))
+
+// Returns a context number in [0, REF_CONTEXTS) for the signal that tells
+// whether the first reference frame of a compound mode is LAST3, given
+// that it is already known to be either LAST3 or LAST4.
+//
+// NOTE(zoeliu): Models the probability that ref_frame[0] is LAST3_FRAME,
+// given that it is either LAST3 or LAST4.
+int vp10_get_pred_context_comp_ref_p3(const VP10_COMMON *cm,
+                                      const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int above_in_image = xd->up_available;
+  const int left_in_image = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialised to 0.
+  const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+  const int var_ref_idx = !fix_ref_idx;
+
+  if (above_in_image && left_in_image) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra (2)
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+      if (!has_second_ref(edge_mbmi))  // single pred (1/3)
+        pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != LAST3_FRAME);
+      else  // comp pred (1/3)
+        pred_context = 1 +
+            2 * (edge_mbmi->ref_frame[var_ref_idx] != LAST3_FRAME);
+    } else {  // inter/inter
+      const int l_sg = !has_second_ref(left_mbmi);
+      const int a_sg = !has_second_ref(above_mbmi);
+      const MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0]
+                                           : above_mbmi->ref_frame[var_ref_idx];
+      const MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0]
+                                           : left_mbmi->ref_frame[var_ref_idx];
+
+      if (vrfa == vrfl && vrfa == LAST3_FRAME)
+        pred_context = 0;
+      else if (l_sg && a_sg) {  // single/single
+        if (vrfa == LAST3_FRAME || vrfl == LAST3_FRAME)
+          pred_context = 1;
+        else if (CHECK_LAST_LAST2_GOLDEN(vrfa) || CHECK_LAST_LAST2_GOLDEN(vrfl))
+          pred_context = 2 + (vrfa != vrfl);
+        else if (vrfa == vrfl)
+          pred_context = 3;
+        else
+          pred_context = 4;
+      } else if (l_sg || a_sg) {  // single/comp
+        const MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl;
+        const MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl;
+
+        if (vrfc == LAST3_FRAME && rfs != LAST3_FRAME)
+          pred_context = 1;
+        else if (rfs == LAST3_FRAME && vrfc != LAST3_FRAME)
+          pred_context = 2;
+        else
+          pred_context = 3 +
+              (vrfc == LAST4_FRAME || CHECK_LAST_LAST2_GOLDEN(rfs));
+      } else {  // comp/comp
+        if (vrfa == LAST3_FRAME || vrfl == LAST3_FRAME)
+          pred_context = 2;
+        else
+          pred_context = 3 +
+              (CHECK_LAST_LAST2_GOLDEN(vrfa) || CHECK_LAST_LAST2_GOLDEN(vrfl));
+      }
+    }
+  } else if (above_in_image || left_in_image) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi)) {
+      pred_context = 2;
+    } else {
+      if (has_second_ref(edge_mbmi)) {
+        pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx] != LAST3_FRAME);
+      } else {
+        if (edge_mbmi->ref_frame[0] == LAST3_FRAME)
+          pred_context = 0;
+        else
+          pred_context = 2 + CHECK_LAST_LAST2_GOLDEN(edge_mbmi->ref_frame[0]);
+      }
+    }
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+
+  return pred_context;
+}
+
+#else  // CONFIG_EXT_REFS
+
 // Returns a context number for the given MB prediction signal
 int vp10_get_pred_context_comp_ref_p(const VP10_COMMON *cm,
-                                    const MACROBLOCKD *xd) {
+                                     const MACROBLOCKD *xd) {
   int pred_context;
   const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
   const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
@@ -186,6 +601,472 @@
   return pred_context;
 }
 
+#endif  // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+
+#define CHECK_GOLDEN_OR_ALTREF(ref_frame) \
+  ((ref_frame == GOLDEN_FRAME) || (ref_frame == ALTREF_FRAME))
+
+// Returns a context number in [0, REF_CONTEXTS) for the bit that signals
+// whether the single reference is one of ALTREF_FRAME/GOLDEN_FRAME.
+//
+// NOTE(zoeliu): Models the probability that ref_frame[0] is ALTREF/GOLDEN.
+int vp10_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialised to 0.
+  if (has_above && has_left) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+      if (!has_second_ref(edge_mbmi))
+        pred_context = 4 * (!CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[0]));
+      else
+        pred_context = 1 + (!CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[0]) ||
+                            !CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[1]));
+    } else {  // inter/inter
+      const int above_has_second = has_second_ref(above_mbmi);
+      const int left_has_second  = has_second_ref(left_mbmi);
+
+      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+
+      if (above_has_second && left_has_second) {
+        pred_context = 1 + (!CHECK_GOLDEN_OR_ALTREF(above0) ||
+                            !CHECK_GOLDEN_OR_ALTREF(above1) ||
+                            !CHECK_GOLDEN_OR_ALTREF(left0) ||
+                            !CHECK_GOLDEN_OR_ALTREF(left1));
+      } else if (above_has_second || left_has_second) {
+        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+
+        if (!CHECK_GOLDEN_OR_ALTREF(rfs))
+          pred_context = 3 + (!CHECK_GOLDEN_OR_ALTREF(crf1) ||
+                              !CHECK_GOLDEN_OR_ALTREF(crf2));
+        else
+          pred_context = !CHECK_GOLDEN_OR_ALTREF(crf1) ||
+                         !CHECK_GOLDEN_OR_ALTREF(crf2);
+      } else {
+        pred_context = 2 * (!CHECK_GOLDEN_OR_ALTREF(above0)) +
+                       2 * (!CHECK_GOLDEN_OR_ALTREF(left0));
+      }
+    }
+  } else if (has_above || has_left) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+    if (!is_inter_block(edge_mbmi)) {  // intra
+      pred_context = 2;
+    } else {  // inter
+      if (!has_second_ref(edge_mbmi))
+        pred_context = 4 * (!CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[0]));
+      else
+        pred_context = 1 + (!CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[0]) ||
+                            !CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[1]));
+    }
+  } else {  // no edges available
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
+
+// Returns the context for the bit that signals whether the single reference
+// is ALTREF_FRAME or GOLDEN_FRAME, given that it is one of these two frames.
+//
+// NOTE(zoeliu): Models the probability that ref_frame[0] is ALTREF_FRAME,
+// given that it is either ALTREF_FRAME or GOLDEN_FRAME.
+int vp10_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialised to 0.
+  if (has_above && has_left) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+      if (!has_second_ref(edge_mbmi)) {
+        if (!CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[0]))
+          pred_context = 3;
+        else
+          pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
+      } else {
+        pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
+                                edge_mbmi->ref_frame[1] == GOLDEN_FRAME);
+      }
+    } else {  // inter/inter
+      const int above_has_second = has_second_ref(above_mbmi);
+      const int left_has_second  = has_second_ref(left_mbmi);
+      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+
+      if (above_has_second && left_has_second) {
+        if (above0 == left0 && above1 == left1)
+          pred_context = 3 * (above0 == GOLDEN_FRAME ||
+                              above1 == GOLDEN_FRAME ||
+                              left0 == GOLDEN_FRAME ||
+                              left1 == GOLDEN_FRAME);
+        else
+          pred_context = 2;
+      } else if (above_has_second || left_has_second) {
+        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+
+        if (rfs == GOLDEN_FRAME)
+          pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
+        else if (rfs == ALTREF_FRAME)
+          pred_context = (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
+        else
+          pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
+      } else {
+        if (!CHECK_GOLDEN_OR_ALTREF(above0) && !CHECK_GOLDEN_OR_ALTREF(left0)) {
+          pred_context = 2 + (above0 == left0);
+        } else if (!CHECK_GOLDEN_OR_ALTREF(above0) ||
+                   !CHECK_GOLDEN_OR_ALTREF(left0)) {
+          const MV_REFERENCE_FRAME edge0 =
+              !CHECK_GOLDEN_OR_ALTREF(above0) ? left0 : above0;
+          pred_context = 4 * (edge0 == GOLDEN_FRAME);
+        } else {
+          pred_context = 2 * (above0 == GOLDEN_FRAME) +
+                         2 * (left0  == GOLDEN_FRAME);
+        }
+      }
+    }
+  } else if (has_above || has_left) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi) ||
+        (!CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[0]) &&
+         !has_second_ref(edge_mbmi)))
+      pred_context = 2;
+    else if (!has_second_ref(edge_mbmi))
+      pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
+    else
+      pred_context = 3 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
+                          edge_mbmi->ref_frame[1] == GOLDEN_FRAME);
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
+
+// Returns the context for the bit that signals whether the single reference
+// is LAST3/LAST4 or LAST2/LAST, given that it is one of these four frames.
+//
+// NOTE(zoeliu): Models the probability that ref_frame[0] is LAST3/LAST4,
+// given that it is one of LAST3/LAST4/LAST2/LAST.
+int vp10_get_pred_context_single_ref_p3(const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialised to 0.
+  if (has_above && has_left) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+      if (!has_second_ref(edge_mbmi)) {
+        if (CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[0]))
+          pred_context = 3;
+        else
+          pred_context = 4 * CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]);
+      } else {
+        pred_context = 1 +
+            2 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) ||
+                 CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[1]));
+      }
+    } else {  // inter/inter
+      const int above_has_second = has_second_ref(above_mbmi);
+      const int left_has_second  = has_second_ref(left_mbmi);
+      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+
+      if (above_has_second && left_has_second) {
+        if (above0 == left0 && above1 == left1)
+          pred_context = 3 * (CHECK_LAST_OR_LAST2(above0) ||
+                              CHECK_LAST_OR_LAST2(above1) ||
+                              CHECK_LAST_OR_LAST2(left0) ||
+                              CHECK_LAST_OR_LAST2(left1));
+        else
+          pred_context = 2;
+      } else if (above_has_second || left_has_second) {
+        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+
+        if (CHECK_LAST_OR_LAST2(rfs))
+          pred_context = 3 + (CHECK_LAST_OR_LAST2(crf1) ||
+                              CHECK_LAST_OR_LAST2(crf2));
+        else if (rfs == LAST3_FRAME || rfs == LAST4_FRAME)
+          pred_context = (CHECK_LAST_OR_LAST2(crf1) ||
+                          CHECK_LAST_OR_LAST2(crf2));
+        else
+          pred_context = 1 + 2 * (CHECK_LAST_OR_LAST2(crf1) ||
+                                  CHECK_LAST_OR_LAST2(crf2));
+      } else {
+        if (CHECK_GOLDEN_OR_ALTREF(above0) && CHECK_GOLDEN_OR_ALTREF(left0)) {
+          pred_context = 2 + (above0 == left0);
+        } else if (CHECK_GOLDEN_OR_ALTREF(above0) ||
+                   CHECK_GOLDEN_OR_ALTREF(left0)) {
+          const MV_REFERENCE_FRAME edge0 =
+              CHECK_GOLDEN_OR_ALTREF(above0) ? left0 : above0;
+          pred_context = 4 * CHECK_LAST_OR_LAST2(edge0);
+        } else {
+          pred_context = 2 * CHECK_LAST_OR_LAST2(above0) +
+                         2 * CHECK_LAST_OR_LAST2(left0);
+        }
+      }
+    }
+  } else if (has_above || has_left) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi) ||
+        (CHECK_GOLDEN_OR_ALTREF(edge_mbmi->ref_frame[0]) &&
+         !has_second_ref(edge_mbmi)))
+      pred_context = 2;
+    else if (!has_second_ref(edge_mbmi))
+      pred_context = 4 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]));
+    else
+      pred_context = 3 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) ||
+                          CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[1]));
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
+
+// For the bit to signal whether the single reference is LAST2_FRAME or
+// LAST_FRAME, knowing that it shall be either of these 2 choices.
+//
+// NOTE(zoeliu): The probability of ref_frame[0] is LAST2_FRAME, conditioning
+// on it is either LAST2_FRAME/LAST_FRAME.
+int vp10_get_pred_context_single_ref_p4(const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialised to 0.
+  if (has_above && has_left) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+      if (!has_second_ref(edge_mbmi)) {
+        if (!CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]))
+          pred_context = 3;
+        else
+          pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
+      } else {
+        pred_context = 1 +
+            2 * (edge_mbmi->ref_frame[0] == LAST_FRAME ||
+                 edge_mbmi->ref_frame[1] == LAST_FRAME);
+      }
+    } else {  // inter/inter
+      const int above_has_second = has_second_ref(above_mbmi);
+      const int left_has_second  = has_second_ref(left_mbmi);
+      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+
+      if (above_has_second && left_has_second) {
+        if (above0 == left0 && above1 == left1)
+          pred_context = 3 * (above0 == LAST_FRAME || above1 == LAST_FRAME ||
+                              left0 == LAST_FRAME || left1 == LAST_FRAME);
+        else
+          pred_context = 2;
+      } else if (above_has_second || left_has_second) {
+        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+
+        if (rfs == LAST_FRAME)
+          pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+        else if (rfs == LAST2_FRAME)
+          pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+        else
+          pred_context = 1 + 2 * (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+      } else {
+        if (!CHECK_LAST_OR_LAST2(above0) &&
+            !CHECK_LAST_OR_LAST2(left0)) {
+          pred_context = 2 + (above0 == left0);
+        } else if (!CHECK_LAST_OR_LAST2(above0) ||
+                   !CHECK_LAST_OR_LAST2(left0)) {
+          const MV_REFERENCE_FRAME edge0 =
+              !CHECK_LAST_OR_LAST2(above0) ? left0 : above0;
+          pred_context = 4 * (edge0 == LAST_FRAME);
+        } else {
+          pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME);
+        }
+      }
+    }
+  } else if (has_above || has_left) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi) ||
+        (!CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) &&
+         !has_second_ref(edge_mbmi)))
+      pred_context = 2;
+    else if (!has_second_ref(edge_mbmi))
+      pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
+    else
+      pred_context = 3 * (edge_mbmi->ref_frame[0] == LAST_FRAME ||
+                          edge_mbmi->ref_frame[1] == LAST_FRAME);
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
+
+// Returns the context (in [0, REF_CONTEXTS)) for the bit that signals whether
+// a single reference is LAST4_FRAME or LAST3_FRAME, given it is one of them.
+// The context is derived from the above/left neighbors' reference frames.
+// NOTE(zoeliu): The probability that ref_frame[0] is LAST4_FRAME, conditioned
+// on it being either LAST4_FRAME or LAST3_FRAME.
+int vp10_get_pred_context_single_ref_p5(const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialised to 0.
+  if (has_above && has_left) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+      if (!has_second_ref(edge_mbmi)) {
+        if (!CHECK_LAST3_OR_LAST4(edge_mbmi->ref_frame[0]))
+          pred_context = 3;
+        else
+          pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST3_FRAME);
+      } else {
+        pred_context = 1 +
+            2 * (edge_mbmi->ref_frame[0] == LAST3_FRAME ||
+                 edge_mbmi->ref_frame[1] == LAST3_FRAME);
+      }
+    } else {  // inter/inter
+      const int above_has_second = has_second_ref(above_mbmi);
+      const int left_has_second  = has_second_ref(left_mbmi);
+      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+
+      if (above_has_second && left_has_second) {
+        if (above0 == left0 && above1 == left1)
+          pred_context = 3 * (above0 == LAST3_FRAME || above1 == LAST3_FRAME ||
+                              left0 == LAST3_FRAME || left1 == LAST3_FRAME);
+        else
+          pred_context = 2;
+      } else if (above_has_second || left_has_second) {
+        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+
+        if (rfs == LAST3_FRAME)
+          pred_context = 3 + (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
+        else if (rfs == LAST4_FRAME)
+          pred_context = (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
+        else
+          pred_context = 1 + 2 * (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
+      } else {
+        if (!CHECK_LAST3_OR_LAST4(above0) &&
+            !CHECK_LAST3_OR_LAST4(left0)) {
+          pred_context = 2 + (above0 == left0);
+        } else if (!CHECK_LAST3_OR_LAST4(above0) ||
+                   !CHECK_LAST3_OR_LAST4(left0)) {
+          const MV_REFERENCE_FRAME edge0 =
+              !CHECK_LAST3_OR_LAST4(above0) ? left0 : above0;
+          pred_context = 4 * (edge0 == LAST3_FRAME);
+        } else {
+          pred_context = 2 * (above0 == LAST3_FRAME) +
+                         2 * (left0 == LAST3_FRAME);
+        }
+      }
+    }
+  } else if (has_above || has_left) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi) ||
+        (!CHECK_LAST3_OR_LAST4(edge_mbmi->ref_frame[0]) &&
+         !has_second_ref(edge_mbmi)))
+      pred_context = 2;
+    else if (!has_second_ref(edge_mbmi))
+      pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST3_FRAME);
+    else
+      pred_context = 3 * (edge_mbmi->ref_frame[0] == LAST3_FRAME ||
+                          edge_mbmi->ref_frame[1] == LAST3_FRAME);
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
+
+#else  // CONFIG_EXT_REFS
+
 int vp10_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
   int pred_context;
   const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
@@ -337,3 +1218,5 @@
   assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
   return pred_context;
 }
+
+#endif  // CONFIG_EXT_REFS

diff --git a/vp10/common/pred_common.h b/vp10/common/pred_common.h
index d6d7146..4ebfcdb 100644
--- a/vp10/common/pred_common.h
+++ b/vp10/common/pred_common.h

@@ -87,25 +87,77 @@
                                     const MACROBLOCKD *xd);
 
 static INLINE vpx_prob vp10_get_pred_prob_comp_ref_p(const VP10_COMMON *cm,
-                                                    const MACROBLOCKD *xd) {
+                                                     const MACROBLOCKD *xd) {
   const int pred_context = vp10_get_pred_context_comp_ref_p(cm, xd);
-  return cm->fc->comp_ref_prob[pred_context];
+  return cm->fc->comp_ref_prob[pred_context][0];
 }
 
+#if CONFIG_EXT_REFS
+int vp10_get_pred_context_comp_ref_p1(const VP10_COMMON *cm,
+                                      const MACROBLOCKD *xd);
+
+static INLINE vpx_prob vp10_get_pred_prob_comp_ref_p1(const VP10_COMMON *cm,
+                                                     const MACROBLOCKD *xd) {
+  const int pred_context = vp10_get_pred_context_comp_ref_p1(cm, xd);
+  return cm->fc->comp_ref_prob[pred_context][1];
+}
+
+int vp10_get_pred_context_comp_ref_p2(const VP10_COMMON *cm,
+                                      const MACROBLOCKD *xd);
+
+static INLINE vpx_prob vp10_get_pred_prob_comp_ref_p2(const VP10_COMMON *cm,
+                                                     const MACROBLOCKD *xd) {
+  const int pred_context = vp10_get_pred_context_comp_ref_p2(cm, xd);
+  return cm->fc->comp_ref_prob[pred_context][2];
+}
+
+int vp10_get_pred_context_comp_ref_p3(const VP10_COMMON *cm,
+                                      const MACROBLOCKD *xd);
+
+static INLINE vpx_prob vp10_get_pred_prob_comp_ref_p3(const VP10_COMMON *cm,
+                                                     const MACROBLOCKD *xd) {
+  const int pred_context = vp10_get_pred_context_comp_ref_p3(cm, xd);
+  return cm->fc->comp_ref_prob[pred_context][3];
+}
+#endif  // CONFIG_EXT_REFS
+
 int vp10_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
 
 static INLINE vpx_prob vp10_get_pred_prob_single_ref_p1(const VP10_COMMON *cm,
-                                                       const MACROBLOCKD *xd) {
+                                                        const MACROBLOCKD *xd) {
   return cm->fc->single_ref_prob[vp10_get_pred_context_single_ref_p1(xd)][0];
 }
 
 int vp10_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
 
 static INLINE vpx_prob vp10_get_pred_prob_single_ref_p2(const VP10_COMMON *cm,
-                                                       const MACROBLOCKD *xd) {
+                                                        const MACROBLOCKD *xd) {
   return cm->fc->single_ref_prob[vp10_get_pred_context_single_ref_p2(xd)][1];
 }
 
+#if CONFIG_EXT_REFS
+int vp10_get_pred_context_single_ref_p3(const MACROBLOCKD *xd);
+
+static INLINE vpx_prob vp10_get_pred_prob_single_ref_p3(const VP10_COMMON *cm,
+                                                        const MACROBLOCKD *xd) {
+  return cm->fc->single_ref_prob[vp10_get_pred_context_single_ref_p3(xd)][2];
+}
+
+int vp10_get_pred_context_single_ref_p4(const MACROBLOCKD *xd);
+
+static INLINE vpx_prob vp10_get_pred_prob_single_ref_p4(const VP10_COMMON *cm,
+                                                        const MACROBLOCKD *xd) {
+  return cm->fc->single_ref_prob[vp10_get_pred_context_single_ref_p4(xd)][3];
+}
+
+int vp10_get_pred_context_single_ref_p5(const MACROBLOCKD *xd);
+
+static INLINE vpx_prob vp10_get_pred_prob_single_ref_p5(const VP10_COMMON *cm,
+                                                        const MACROBLOCKD *xd) {
+  return cm->fc->single_ref_prob[vp10_get_pred_context_single_ref_p5(xd)][4];
+}
+#endif  // CONFIG_EXT_REFS
+
 // Returns a context number for the given MB prediction signal
 // The mode info data structure has a one element border above and to the
 // left of the entries corresponding to real blocks.
@@ -165,6 +217,71 @@
   }
 }
 
+#if CONFIG_VAR_TX
+static void update_tx_counts(VP10_COMMON *cm, MACROBLOCKD *xd,
+                             MB_MODE_INFO *mbmi, BLOCK_SIZE plane_bsize,
+                             TX_SIZE tx_size, int blk_row, int blk_col,
+                             TX_SIZE max_tx_size, int ctx,
+                             struct tx_counts *tx_counts) {
+  const struct macroblockd_plane *const pd = &xd->plane[0];
+  const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+  int tx_idx = (blk_row >> (1 - pd->subsampling_y)) * 8 +
+               (blk_col >> (1 - pd->subsampling_x));
+  TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_idx];
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  if (tx_size == plane_tx_size) {
+    ++get_tx_counts(max_tx_size, ctx, tx_counts)[tx_size];
+    mbmi->tx_size = tx_size;
+  } else {
+    int bsl = b_width_log2_lookup[bsize];
+    int i;
+
+    assert(bsl > 0);
+    --bsl;
+
+    for (i = 0; i < 4; ++i) {
+      const int offsetr = blk_row + ((i >> 1) << bsl);
+      const int offsetc = blk_col + ((i & 0x01) << bsl);
+
+      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide)
+        continue;
+      update_tx_counts(cm, xd, mbmi, plane_bsize,
+                       tx_size - 1, offsetr, offsetc,
+                       max_tx_size, ctx, tx_counts);
+    }
+  }
+}
+
+static INLINE void inter_block_tx_count_update(VP10_COMMON *cm,
+                                               MACROBLOCKD *xd,
+                                               MB_MODE_INFO *mbmi,
+                                               BLOCK_SIZE plane_bsize,
+                                               int ctx,
+                                               struct tx_counts *tx_counts) {
+  const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+  const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+  TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+  BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+  int bh = num_4x4_blocks_wide_lookup[txb_size];
+  int idx, idy;
+
+  for (idy = 0; idy < mi_height; idy += bh)
+    for (idx = 0; idx < mi_width; idx += bh)
+      update_tx_counts(cm, xd, mbmi, plane_bsize, max_tx_size, idy, idx,
+                       max_tx_size, ctx, tx_counts);
+}
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif

diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c
index fdcb967..241b9aa 100644
--- a/vp10/common/reconinter.c
+++ b/vp10/common/reconinter.c

@@ -64,9 +64,9 @@
 }
 
 void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
-                                   int bw, int bh,
-                                   int x, int y, int w, int h,
-                                   int mi_x, int mi_y) {
+                            int bw, int bh,
+                            int x, int y, int w, int h,
+                            int mi_x, int mi_y) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const MODE_INFO *mi = xd->mi[0];
   const int is_compound = has_second_ref(&mi->mbmi);
@@ -264,3 +264,227 @@
     }
   }
 }
+
+#if CONFIG_SUPERTX
+static const uint8_t mask_8[8] = {
+  64, 64, 62, 52, 12,  2,  0,  0
+};
+
+static const uint8_t mask_16[16] = {
+  63, 62, 60, 58, 55, 50, 43, 36, 28, 21, 14, 9, 6, 4, 2, 1
+};
+
+static const uint8_t mask_32[32] = {
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63, 61, 57, 52, 45, 36,
+  28, 19, 12,  7,  3,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+};
+
+static const uint8_t mask_8_uv[8] = {
+  64, 64, 62, 52,  12,  2,  0,  0
+};
+
+static const uint8_t mask_16_uv[16] = {
+  64, 64, 64, 64, 61, 53, 45, 36, 28, 19, 11, 3, 0,  0,  0,  0
+};
+
+static const uint8_t mask_32_uv[32] = {
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 60, 54, 46, 36,
+  28, 18, 10,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+};
+
+static void generate_1dmask(int length, uint8_t *mask, int plane) {
+  switch (length) {
+    case 8:
+      memcpy(mask, plane ? mask_8_uv : mask_8, length);
+      break;
+    case 16:
+      memcpy(mask, plane ? mask_16_uv : mask_16, length);
+      break;
+    case 32:
+      memcpy(mask, plane ? mask_32_uv : mask_32, length);
+      break;
+    default:
+      assert(0);
+  }
+}
+
+
+void vp10_build_masked_inter_predictor_complex(
+    MACROBLOCKD *xd,
+    uint8_t *dst, int dst_stride, uint8_t *dst2, int dst2_stride,
+    const struct macroblockd_plane *pd, int mi_row, int mi_col,
+    int mi_row_ori, int mi_col_ori, BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+    PARTITION_TYPE partition, int plane) {
+  int i, j;
+  uint8_t mask[MAXTXLEN];
+  int top_w = 4 << b_width_log2_lookup[top_bsize],
+      top_h = 4 << b_height_log2_lookup[top_bsize];
+  int w = 4 << b_width_log2_lookup[bsize], h = 4 << b_height_log2_lookup[bsize];
+  int w_offset = (mi_col - mi_col_ori) << 3,
+      h_offset = (mi_row - mi_row_ori) << 3;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint16_t *dst16= CONVERT_TO_SHORTPTR(dst);
+  uint16_t *dst216 = CONVERT_TO_SHORTPTR(dst2);
+  int b_hdb = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  top_w >>= pd->subsampling_x;
+  top_h >>= pd->subsampling_y;
+  w >>= pd->subsampling_x;
+  h >>= pd->subsampling_y;
+  w_offset >>= pd->subsampling_x;
+  h_offset >>= pd->subsampling_y;
+
+  switch (partition) {
+    case PARTITION_HORZ:
+    {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (b_hdb) {
+        uint16_t *dst_tmp = dst16 + h_offset * dst_stride;
+        uint16_t *dst2_tmp = dst216 + h_offset * dst2_stride;
+        generate_1dmask(h, mask + h_offset,
+                        plane && xd->plane[plane].subsampling_y);
+
+        for (i = h_offset; i < h_offset + h; i++) {
+          for (j = 0; j < top_w; j++) {
+            const int m = mask[i];  assert(m >= 0 && m <= 64);
+            if (m == 64)
+              continue;
+
+            if (m == 0)
+              dst_tmp[j] = dst2_tmp[j];
+            else
+              dst_tmp[j] = (dst_tmp[j] * m + dst2_tmp[j] * (64 - m) + 32) >> 6;
+          }
+          dst_tmp += dst_stride;
+          dst2_tmp += dst2_stride;
+        }
+
+        for (; i < top_h; i ++) {
+          memcpy(dst_tmp, dst2_tmp, top_w * sizeof(uint16_t));
+          dst_tmp += dst_stride;
+          dst2_tmp += dst2_stride;
+        }
+      } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        uint8_t *dst_tmp = dst + h_offset * dst_stride;
+        uint8_t *dst2_tmp = dst2 + h_offset * dst2_stride;
+        generate_1dmask(h, mask + h_offset,
+                        plane && xd->plane[plane].subsampling_y);
+
+        for (i = h_offset; i < h_offset + h; i++) {
+          for (j = 0; j < top_w; j++) {
+            const int m = mask[i];  assert(m >= 0 && m <= 64);
+            if (m == 64)
+              continue;
+
+            if (m == 0)
+              dst_tmp[j] = dst2_tmp[j];
+            else
+              dst_tmp[j] = (dst_tmp[j] * m + dst2_tmp[j] * (64 - m) + 32) >> 6;
+          }
+          dst_tmp += dst_stride;
+          dst2_tmp += dst2_stride;
+        }
+
+        for (; i < top_h; i ++) {
+          memcpy(dst_tmp, dst2_tmp, top_w * sizeof(uint8_t));
+          dst_tmp += dst_stride;
+          dst2_tmp += dst2_stride;
+        }
+#if CONFIG_VP9_HIGHBITDEPTH
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+
+      break;
+    case PARTITION_VERT:
+    {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (b_hdb) {
+        uint16_t *dst_tmp = dst16;
+        uint16_t *dst2_tmp = dst216;
+        generate_1dmask(w, mask + w_offset,
+                        plane && xd->plane[plane].subsampling_x);
+
+        for (i = 0; i < top_h; i++) {
+          for (j = w_offset; j < w_offset + w; j++) {
+            const int m = mask[j];   assert(m >= 0 && m <= 64);
+            if (m == 64)
+              continue;
+
+            if (m == 0)
+              dst_tmp[j] = dst2_tmp[j];
+            else
+              dst_tmp[j] = (dst_tmp[j] * m + dst2_tmp[j] * (64 - m) + 32) >> 6;
+          }
+          memcpy(dst_tmp + j, dst2_tmp + j,
+                     (top_w - w_offset - w) * sizeof(uint16_t));
+          dst_tmp += dst_stride;
+          dst2_tmp += dst2_stride;
+        }
+      } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        uint8_t *dst_tmp = dst;
+        uint8_t *dst2_tmp = dst2;
+        generate_1dmask(w, mask + w_offset,
+                        plane && xd->plane[plane].subsampling_x);
+
+        for (i = 0; i < top_h; i++) {
+          for (j = w_offset; j < w_offset + w; j++) {
+            const int m = mask[j];   assert(m >= 0 && m <= 64);
+            if (m == 64)
+              continue;
+
+            if (m == 0)
+              dst_tmp[j] = dst2_tmp[j];
+            else
+              dst_tmp[j] = (dst_tmp[j] * m + dst2_tmp[j] * (64 - m) + 32) >> 6;
+          }
+            memcpy(dst_tmp + j, dst2_tmp + j,
+                       (top_w - w_offset - w) * sizeof(uint8_t));
+          dst_tmp += dst_stride;
+          dst2_tmp += dst2_stride;
+        }
+#if CONFIG_VP9_HIGHBITDEPTH
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+      break;
+    default:
+      assert(0);
+  }
+  (void) xd;
+}
+
+void vp10_build_inter_predictors_sb_sub8x8(MACROBLOCKD *xd,
+                                           int mi_row, int mi_col,
+                                           BLOCK_SIZE bsize, int block) {
+  // Prediction function used in supertx:
+  // Uses the mv of the current block (which is smaller than 8x8) to predict
+  // the block located at (mi_row, mi_col) with size bsize.
+  // Note that bsize can be larger than 8x8.
+  // block (0-3): the sub8x8 location of the current block.
+  int plane;
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+
+  // For sub8x8 uv:
+  // Only the first block (block == 0) predicts uv; others predict luma only.
+  int max_plane = block ? 1 : MAX_MB_PLANE;
+
+  for (plane = 0; plane < max_plane; plane++) {
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
+                                                        &xd->plane[plane]);
+    const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+    const int bw = 4 * num_4x4_w;
+    const int bh = 4 * num_4x4_h;
+
+    build_inter_predictors(xd, plane, block, bw, bh,
+                           0, 0, bw, bh,
+                           mi_x, mi_y);
+  }
+}
+#endif  // CONFIG_SUPERTX

diff --git a/vp10/common/reconinter.h b/vp10/common/reconinter.h
index 5678f47..bc2df9e 100644
--- a/vp10/common/reconinter.h
+++ b/vp10/common/reconinter.h

@@ -28,9 +28,22 @@
                                    int w, int h, int ref,
                                    const InterpKernel *kernel,
                                    int xs, int ys) {
+#if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+  if (kernel[0][SUBPEL_TAPS / 2 - 1] == 128) {
+    // Interpolating filter
+    sf->predict[subpel_x != 0][subpel_y != 0][ref](
+        src, src_stride, dst, dst_stride,
+        kernel[subpel_x], xs, kernel[subpel_y], ys, w, h);
+  } else {
+    sf->predict_ni[subpel_x != 0][subpel_y != 0][ref](
+        src, src_stride, dst, dst_stride,
+        kernel[subpel_x], xs, kernel[subpel_y], ys, w, h);
+  }
+#else
   sf->predict[subpel_x != 0][subpel_y != 0][ref](
       src, src_stride, dst, dst_stride,
       kernel[subpel_x], xs, kernel[subpel_y], ys, w, h);
+#endif  // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -42,9 +55,22 @@
                                         int w, int h, int ref,
                                         const InterpKernel *kernel,
                                         int xs, int ys, int bd) {
+#if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+  if (kernel[0][SUBPEL_TAPS / 2 - 1] == 128) {
+    // Interpolating filter
+    sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
+        src, src_stride, dst, dst_stride,
+        kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
+  } else {
+    sf->highbd_predict_ni[subpel_x != 0][subpel_y != 0][ref](
+        src, src_stride, dst, dst_stride,
+        kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
+  }
+#else
   sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
       src, src_stride, dst, dst_stride,
       kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
+#endif  // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -127,25 +153,39 @@
 }
 
 void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
-                                   int bw, int bh,
-                                   int x, int y, int w, int h,
-                                   int mi_x, int mi_y);
+                            int bw, int bh,
+                            int x, int y, int w, int h,
+                            int mi_x, int mi_y);
 
 void vp10_build_inter_predictor_sub8x8(MACROBLOCKD *xd, int plane,
                                        int i, int ir, int ic,
                                        int mi_row, int mi_col);
 
 void vp10_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                    BLOCK_SIZE bsize);
-
-void vp10_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                    BLOCK_SIZE bsize, int plane);
-
-void vp10_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
                                      BLOCK_SIZE bsize);
 
+void vp10_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                     BLOCK_SIZE bsize, int plane);
+
+void vp10_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                      BLOCK_SIZE bsize);
+
 void vp10_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                   BLOCK_SIZE bsize);
+                                    BLOCK_SIZE bsize);
+
+#if CONFIG_SUPERTX
+void vp10_build_inter_predictors_sb_sub8x8(MACROBLOCKD *xd,
+                                           int mi_row, int mi_col,
+                                           BLOCK_SIZE bsize, int block);
+struct macroblockd_plane;
+void vp10_build_masked_inter_predictor_complex(
+    MACROBLOCKD *xd,
+    uint8_t *dst, int dst_stride, uint8_t *dst2, int dst2_stride,
+    const struct macroblockd_plane *pd, int mi_row, int mi_col,
+    int mi_row_ori, int mi_col_ori, BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+    PARTITION_TYPE partition, int plane);
+
+#endif  // CONFIG_SUPERTX
 
 void vp10_build_inter_predictor(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
@@ -192,7 +232,6 @@
 void vp10_setup_pre_planes(MACROBLOCKD *xd, int idx,
                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
                           const struct scale_factors *sf);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif

diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c
index e9e3949..4be5394 100644
--- a/vp10/common/reconintra.c
+++ b/vp10/common/reconintra.c

@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <math.h>
+
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 
@@ -21,7 +23,6 @@
 #include "vp10/common/reconintra.h"
 #include "vp10/common/onyxc_int.h"
 
-#if CONFIG_MISC_FIXES
 enum {
   NEED_LEFT = 1 << 1,
   NEED_ABOVE = 1 << 2,
@@ -42,28 +43,7 @@
   NEED_ABOVE | NEED_ABOVERIGHT,             // D63
   NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT,  // TM
 };
-#else
-enum {
-  NEED_LEFT = 1 << 1,
-  NEED_ABOVE = 1 << 2,
-  NEED_ABOVERIGHT = 1 << 3,
-};
 
-static const uint8_t extend_modes[INTRA_MODES] = {
-  NEED_ABOVE | NEED_LEFT,       // DC
-  NEED_ABOVE,                   // V
-  NEED_LEFT,                    // H
-  NEED_ABOVERIGHT,              // D45
-  NEED_LEFT | NEED_ABOVE,       // D135
-  NEED_LEFT | NEED_ABOVE,       // D117
-  NEED_LEFT | NEED_ABOVE,       // D153
-  NEED_LEFT,                    // D207
-  NEED_ABOVERIGHT,              // D63
-  NEED_LEFT | NEED_ABOVE,       // TM
-};
-#endif
-
-#if CONFIG_MISC_FIXES
 static const uint8_t orders_64x64[1] = { 0 };
 static const uint8_t orders_64x32[2] = { 0, 1 };
 static const uint8_t orders_32x64[2] = { 0, 1 };
@@ -123,11 +103,12 @@
 static int vp10_has_right(BLOCK_SIZE bsize, int mi_row, int mi_col,
                           int right_available,
                           TX_SIZE txsz, int y, int x, int ss_x) {
+  const int wl = mi_width_log2_lookup[bsize];
+  const int w = VPXMAX(num_4x4_blocks_wide_lookup[bsize] >> ss_x, 1);
+  const int step = 1 << txsz;
+
   if (y == 0) {
-    int wl = mi_width_log2_lookup[bsize];
-    int hl = mi_height_log2_lookup[bsize];
-    int w = 1 << (wl + 1 - ss_x);
-    int step = 1 << txsz;
+    const int hl = mi_height_log2_lookup[bsize];
     const uint8_t *order = orders[bsize];
     int my_order, tr_order;
 
@@ -148,10 +129,6 @@
 
     return my_order > tr_order && right_available;
   } else {
-    int wl = mi_width_log2_lookup[bsize];
-    int w = 1 << (wl + 1 - ss_x);
-    int step = 1 << txsz;
-
     return x + step < w;
   }
 }
@@ -160,10 +137,10 @@
                            int bottom_available, TX_SIZE txsz,
                            int y, int x, int ss_y) {
   if (x == 0) {
-    int wl = mi_width_log2_lookup[bsize];
-    int hl = mi_height_log2_lookup[bsize];
-    int h = 1 << (hl + 1 - ss_y);
-    int step = 1 << txsz;
+    const int wl = mi_width_log2_lookup[bsize];
+    const int hl = mi_height_log2_lookup[bsize];
+    const int h = 1 << (hl + 1 - ss_y);
+    const int step = 1 << txsz;
     const uint8_t *order = orders[bsize];
     int my_order, bl_order;
 
@@ -188,7 +165,6 @@
     return 0;
   }
 }
-#endif
 
 typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left);
@@ -216,15 +192,9 @@
 
   INIT_ALL_SIZES(pred[V_PRED], v);
   INIT_ALL_SIZES(pred[H_PRED], h);
-#if CONFIG_MISC_FIXES
   INIT_ALL_SIZES(pred[D207_PRED], d207e);
   INIT_ALL_SIZES(pred[D45_PRED], d45e);
   INIT_ALL_SIZES(pred[D63_PRED], d63e);
-#else
-  INIT_ALL_SIZES(pred[D207_PRED], d207);
-  INIT_ALL_SIZES(pred[D45_PRED], d45);
-  INIT_ALL_SIZES(pred[D63_PRED], d63);
-#endif
   INIT_ALL_SIZES(pred[D117_PRED], d117);
   INIT_ALL_SIZES(pred[D135_PRED], d135);
   INIT_ALL_SIZES(pred[D153_PRED], d153);
@@ -238,15 +208,9 @@
 #if CONFIG_VP9_HIGHBITDEPTH
   INIT_ALL_SIZES(pred_high[V_PRED], highbd_v);
   INIT_ALL_SIZES(pred_high[H_PRED], highbd_h);
-#if CONFIG_MISC_FIXES
   INIT_ALL_SIZES(pred_high[D207_PRED], highbd_d207e);
   INIT_ALL_SIZES(pred_high[D45_PRED], highbd_d45e);
-  INIT_ALL_SIZES(pred_high[D63_PRED], highbd_d63);
-#else
-  INIT_ALL_SIZES(pred_high[D207_PRED], highbd_d207);
-  INIT_ALL_SIZES(pred_high[D45_PRED], highbd_d45);
-  INIT_ALL_SIZES(pred_high[D63_PRED], highbd_d63);
-#endif
+  INIT_ALL_SIZES(pred_high[D63_PRED], highbd_d63e);
   INIT_ALL_SIZES(pred_high[D117_PRED], highbd_d117);
   INIT_ALL_SIZES(pred_high[D135_PRED], highbd_d135);
   INIT_ALL_SIZES(pred_high[D153_PRED], highbd_d153);
@@ -261,12 +225,598 @@
 #undef intra_pred_allsizes
 }
 
-#if CONFIG_MISC_FIXES
 static INLINE void memset16(uint16_t *dst, int val, int n) {
   while (n--)
     *dst++ = val;
 }
-#endif
+
+#if CONFIG_EXT_INTRA
+#define PI 3.14159265
+#define FILTER_INTRA_PREC_BITS 10
+#define FILTER_INTRA_ROUND_VAL 511
+
+static const uint8_t ext_intra_extend_modes[FILTER_INTRA_MODES] = {
+  NEED_LEFT | NEED_ABOVE,      // FILTER_DC
+  NEED_LEFT | NEED_ABOVE,      // FILTER_V
+  NEED_LEFT | NEED_ABOVE,      // FILTER_H
+  NEED_LEFT | NEED_ABOVE,      // FILTER_D45
+  NEED_LEFT | NEED_ABOVE,      // FILTER_D135
+  NEED_LEFT | NEED_ABOVE,      // FILTER_D117
+  NEED_LEFT | NEED_ABOVE,      // FILTER_D153
+  NEED_LEFT | NEED_ABOVE,      // FILTER_D207
+  NEED_LEFT | NEED_ABOVE,      // FILTER_D63
+  NEED_LEFT | NEED_ABOVE,      // FILTER_TM
+};  // NOTE(review): sibling tables use EXT_INTRA_MODES; confirm sizes agree.
+
+// Directional prediction, zone 1 (0 < angle < 90): above row only.
+static void dr_prediction_z1(uint8_t *dst, ptrdiff_t stride, int bs,
+                             const uint8_t *above, const uint8_t *left,
+                             int dx, int dy) {
+  int r, c, x, y, base, shift, val;
+
+  (void)left;
+  (void)dy;
+  assert(dy == 1);
+  assert(dx < 0);
+
+  for (r = 0; r < bs; ++r) {
+    y = r + 1;  // distance of this row from the above[] reference row
+    for (c = 0; c < bs; ++c) {
+      x = c * 256 - y * dx;  // position along above[], in 1/256 sample units
+      base = x >> 8;  // integer sample index
+      shift = x - base * 256;  // fractional part, 0..255
+      if (base < 2 * bs - 1) {
+        val =
+            (above[base] * (256 - shift) + above[base + 1] * shift + 128) >> 8;
+        dst[c] = clip_pixel(val);  // val: rounded linear blend of 2 samples
+      } else {
+        dst[c] = above[2 * bs - 1];  // past the end: reuse above[2 * bs - 1]
+      }
+    }
+    dst += stride;
+  }
+}
+
+// Directional prediction, zone 2 (90 < angle < 180): above row and left col.
+static void dr_prediction_z2(uint8_t *dst, ptrdiff_t stride, int bs,
+                             const uint8_t *above, const uint8_t *left,
+                             int dx, int dy) {
+  int r, c, x, y, val1, val2, shift, val, base;
+
+  assert(dx > 0);
+  assert(dy > 0);
+
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) {
+      y = r + 1;
+      x = c * 256 - y * dx;  // position along above[], in 1/256 sample units
+      if (x >= -256) {  // lands at or to the right of above[-1]
+        if (x <= 0) {  // between above[-1] and above[0]
+          val1 = above[-1];
+          val2 = above[0];
+          shift = x + 256;
+        } else {
+          base = x >> 8;
+          val1 = above[base];
+          val2 = above[base + 1];
+          shift = x - base * 256;
+        }
+      } else {  // left of above[-1]: take the sample from left[] instead
+        x = c + 1;
+        y = r * 256 - x * dy;  // position along left[], in 1/256 sample units
+        base = y >> 8;
+        if (base >= 0) {
+          val1 = left[base];
+          val2 = left[base + 1];
+          shift = y - base * 256;
+        } else {
+          val1 = val2 = left[0];  // negative index: fall back to left[0]
+          shift = 0;
+        }
+      }
+      val = (val1 * (256 - shift) + val2 * shift + 128) >> 8;  // rounded blend
+      dst[c] = clip_pixel(val);
+    }
+    dst += stride;
+  }
+}
+
+// Directional prediction, zone 3 (180 < angle < 270): left column only.
+static void dr_prediction_z3(uint8_t *dst, ptrdiff_t stride, int bs,
+                             const uint8_t *above, const uint8_t *left,
+                             int dx, int dy) {
+  int r, c, x, y, base, shift, val;
+
+  (void)above;
+  (void)dx;
+  assert(dx == 1);
+  assert(dy < 0);
+
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) {
+      x = c + 1;  // distance of this column from the left[] reference column
+      y = r * 256 - x * dy;  // position along left[], in 1/256 sample units
+      base = y >> 8;  // integer sample index
+      shift = y - base * 256;  // fractional part, 0..255
+      if (base < 2 * bs - 1) {
+        val =
+            (left[base] * (256 - shift) + left[base + 1] * shift + 128) >> 8;
+        dst[c] = clip_pixel(val);  // val: rounded linear blend of 2 samples
+      } else {
+        dst[c] = left[2 * bs - 1];  // past the end: reuse the last left sample
+      }
+    }
+    dst += stride;
+  }
+}
+
+static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
+                         const uint8_t *above, const uint8_t *left, int angle) {
+  double t = 0;  // tan(angle); left at 0 for the 90 and 180 degree cases
+  int dx, dy;
+  int bs = 4 << tx_size;  // block edge length in samples
+
+  if (angle != 90 && angle != 180)
+    t = tan(angle * PI / 180.0);  // NOTE(review): float math in a predictor
+  if (angle > 0 && angle < 90) {
+    dx = -((int)(256 / t));  // negative: z1 asserts dx < 0
+    dy = 1;  // z1 only asserts this value
+    dr_prediction_z1(dst, stride, bs, above, left, dx, dy);
+  } else if (angle > 90 && angle < 180) {
+    t = -t;  // tan is negative in this range; make it positive
+    dx = (int)(256 / t);
+    dy = (int)(256 * t);
+    dr_prediction_z2(dst, stride, bs, above, left, dx, dy);
+  } else if (angle > 180 && angle < 270) {
+    dx = 1;  // z3 only asserts this value
+    dy = -((int)(256 * t));  // negative: z3 asserts dy < 0
+    dr_prediction_z3(dst, stride, bs, above, left, dx, dy);
+  } else if (angle == 90) {  // exactly vertical: regular V_PRED predictor
+    pred[V_PRED][tx_size](dst, stride, above, left);
+  } else if (angle == 180) {  // exactly horizontal: regular H_PRED predictor
+    pred[H_PRED][tx_size](dst, stride, above, left);
+  }  // any other angle leaves dst untouched
+}
+
+static const int filter_intra_taps_4[TX_SIZES][INTRA_MODES][4] = {  // Q10
+    {  // tx_size index 0
+        {735, 881, -537, -54},
+        {1005, 519, -488, -11},
+        {383, 990, -343, -6},
+        {442, 805, -542, 319},
+        {658, 616, -133, -116},
+        {875, 442, -141, -151},
+        {386, 741, -23, -80},
+        {390, 1027, -446, 51},
+        {679, 606, -523, 262},
+        {903, 922, -778, -23},
+    },
+    {  // tx_size index 1
+        {648, 803, -444, 16},
+        {972, 620, -576, 7},
+        {561, 967, -499, -5},
+        {585, 762, -468, 144},
+        {596, 619, -182, -9},
+        {895, 459, -176, -153},
+        {557, 722, -126, -129},
+        {601, 839, -523, 105},
+        {562, 709, -499, 251},
+        {803, 872, -695, 43},
+    },
+    {  // tx_size index 2
+        {423, 728, -347, 111},
+        {963, 685, -665, 23},
+        {281, 1024, -480, 216},
+        {640, 596, -437, 78},
+        {429, 669, -259, 99},
+        {740, 646, -415, 23},
+        {568, 771, -346, 40},
+        {404, 833, -486, 209},
+        {398, 712, -423, 307},
+        {939, 935, -887, 17},
+    },
+    {  // tx_size index 3
+        {477, 737, -393, 150},
+        {881, 630, -546, 67},
+        {506, 984, -443, -20},
+        {114, 459, -270, 528},
+        {433, 528, 14, 3},
+        {837, 470, -301, -30},
+        {181, 777, 89, -107},
+        {-29, 716, -232, 259},
+        {589, 646, -495, 255},
+        {740, 884, -728, 77},
+    },
+};  // rows within each group are indexed by the prediction mode
+
+static void filter_intra_predictors_4tap(uint8_t *dst, ptrdiff_t stride, int bs,
+                                         const uint8_t *above,
+                                         const uint8_t *left,
+                                         int mode) {  // mode: taps row index
+  int k, r, c;
+  int pred[33][65];  // row 0 / column 0 hold the edges; shadows global pred
+  int mean, ipred;
+  const TX_SIZE tx_size = (bs == 32) ? TX_32X32 :
+      ((bs == 16) ? TX_16X16 : ((bs == 8) ? TX_8X8 : (TX_4X4)));
+  const int c0 = filter_intra_taps_4[tx_size][mode][0];  // above
+  const int c1 = filter_intra_taps_4[tx_size][mode][1];  // left
+  const int c2 = filter_intra_taps_4[tx_size][mode][2];  // above-left
+  const int c3 = filter_intra_taps_4[tx_size][mode][3];  // above-right
+
+  k = 0;
+  mean = 0;
+  while (k < bs) {
+    mean = mean + (int)left[k];
+    mean = mean + (int)above[k];
+    k++;
+  }
+  mean = (mean + bs) / (2 * bs);  // rounded mean of the 2 * bs edge samples
+
+  for (r = 0; r < bs; ++r)
+    pred[r + 1][0] = (int)left[r] - mean;  // column 0: mean-removed left edge
+
+  for (c = 0; c < 2 * bs + 1; ++c)
+    pred[0][c] = (int)above[c - 1] - mean;  // row 0; c == 0 reads above[-1]
+
+  for (r = 1; r < bs + 1; ++r)
+    for (c = 1; c < 2 * bs + 1 - r; ++c) {  // each row is one entry shorter
+      ipred = c0 * pred[r - 1][c] + c1 * pred[r][c - 1] +
+          c2 * pred[r - 1][c - 1] + c3 * pred[r - 1][c + 1];
+      pred[r][c] = ipred < 0 ?  // round the magnitude, then restore the sign
+          -((-ipred + FILTER_INTRA_ROUND_VAL) >> FILTER_INTRA_PREC_BITS) :
+          ((ipred + FILTER_INTRA_ROUND_VAL) >> FILTER_INTRA_PREC_BITS);
+    }
+
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) {
+      ipred = pred[r + 1][c + 1] + mean;  // add the mean back
+      dst[c] = clip_pixel(ipred);
+    }
+    dst += stride;
+  }
+}
+
+static void dc_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                               const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, DC_PRED);
+}
+
+static void v_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                               const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, V_PRED);
+}
+
+static void h_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                               const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, H_PRED);
+}
+
+static void d45_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                 const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, D45_PRED);
+}
+
+static void d135_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, D135_PRED);
+}
+
+static void d117_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, D117_PRED);
+}
+
+static void d153_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, D153_PRED);
+}
+
+static void d207_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, D207_PRED);
+}
+
+static void d63_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                 const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, D63_PRED);
+}
+
+static void tm_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, TM_PRED);
+}
+
+static void (*const filter_intra_predictors[EXT_INTRA_MODES])(uint8_t *dst,
+    ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) = {
+        dc_filter_predictor, v_filter_predictor, h_filter_predictor,
+        d45_filter_predictor, d135_filter_predictor, d117_filter_predictor,
+        d153_filter_predictor, d207_filter_predictor, d63_filter_predictor,
+        tm_filter_predictor,  // order: DC, V, H, D45, D135, D117, D153, D207,
+};                            // D63, TM; the table is never written to
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Directional prediction, zone 1 (0 < angle < 90): above row only.
+static void highbd_dr_prediction_z1(uint16_t *dst, ptrdiff_t stride, int bs,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int dx, int dy, int bd) {
+  int r, c, x, y, base, shift, val;
+
+  (void)left;
+  (void)dy;
+  assert(dy == 1);
+  assert(dx < 0);
+
+  for (r = 0; r < bs; ++r) {
+    y = r + 1;  // distance of this row from the above[] reference row
+    for (c = 0; c < bs; ++c) {
+      x = c * 256 - y * dx;  // position along above[], in 1/256 sample units
+      base = x >> 8;  // integer sample index
+      shift = x - base * 256;  // fractional part, 0..255
+      if (base < 2 * bs - 1) {
+        val =
+            (above[base] * (256 - shift) + above[base + 1] * shift + 128) >> 8;
+        dst[c] = clip_pixel_highbd(val, bd);  // val: rounded linear blend
+      } else {
+        dst[c] = above[2 * bs - 1];  // past the end: reuse above[2 * bs - 1]
+      }
+    }
+    dst += stride;
+  }
+}
+
+// Directional prediction, zone 2 (90 < angle < 180): above row and left col.
+static void highbd_dr_prediction_z2(uint16_t *dst, ptrdiff_t stride, int bs,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int dx, int dy, int bd) {
+  int r, c, x, y, val1, val2, shift, val, base;
+
+  assert(dx > 0);
+  assert(dy > 0);
+
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) {
+      y = r + 1;
+      x = c * 256 - y * dx;  // position along above[], in 1/256 sample units
+      if (x >= -256) {  // lands at or to the right of above[-1]
+        if (x <= 0) {  // between above[-1] and above[0]
+          val1 = above[-1];
+          val2 = above[0];
+          shift = x + 256;
+        } else {
+          base = x >> 8;
+          val1 = above[base];
+          val2 = above[base + 1];
+          shift = x - base * 256;
+        }
+      } else {  // left of above[-1]: take the sample from left[] instead
+        x = c + 1;
+        y = r * 256 - x * dy;  // position along left[], in 1/256 sample units
+        base = y >> 8;
+        if (base >= 0) {
+          val1 = left[base];
+          val2 = left[base + 1];
+          shift = y - base * 256;
+        } else {
+          val1 = val2 = left[0];  // negative index: fall back to left[0]
+          shift = 0;
+        }
+      }
+      val = (val1 * (256 - shift) + val2 * shift + 128) >> 8;  // rounded blend
+      dst[c] = clip_pixel_highbd(val, bd);
+    }
+    dst += stride;
+  }
+}
+
+// Directional prediction, zone 3 (180 < angle < 270): left column only.
+static void highbd_dr_prediction_z3(uint16_t *dst, ptrdiff_t stride, int bs,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int dx, int dy, int bd) {
+  int r, c, x, y, base, shift, val;
+
+  (void)above;
+  (void)dx;
+  assert(dx == 1);
+  assert(dy < 0);
+
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) {
+      x = c + 1;  // distance of this column from the left[] reference column
+      y = r * 256 - x * dy;  // position along left[], in 1/256 sample units
+      base = y >> 8;  // integer sample index
+      shift = y - base * 256;  // fractional part, 0..255
+      if (base < 2 * bs - 1) {
+        val =
+            (left[base] * (256 - shift) + left[base + 1] * shift + 128) >> 8;
+        dst[c] = clip_pixel_highbd(val, bd);  // val: rounded linear blend
+      } else {
+        dst[c] = left[2 * bs - 1];  // past the end: reuse the last left sample
+      }
+    }
+    dst += stride;
+  }
+}
+
+static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride,
+                                      int bs, const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  int r;  // vertical prediction: every row is a copy of above[0 .. bs - 1]
+  (void) left;
+  (void) bd;
+  for (r = 0; r < bs; r++) {
+    memcpy(dst, above, bs * sizeof(uint16_t));
+    dst += stride;  // stride is counted in uint16_t elements
+  }
+}
+
+static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride,
+                                      int bs, const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  int r;  // horizontal prediction: row r is filled from left[r]
+  (void) above;
+  (void) bd;
+  for (r = 0; r < bs; r++) {
+    vpx_memset16(dst, left[r], bs);  // presumably sets bs samples to left[r]
+    dst += stride;  // stride is counted in uint16_t elements
+  }
+}
+
+static void highbd_dr_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+                                const uint16_t *above, const uint16_t *left,
+                                int angle, int bd) {
+  double t = 0;  // tan(angle); left at 0 for the 90 and 180 degree cases
+  int dx, dy;
+
+  if (angle != 90 && angle != 180)
+    t = tan(angle * PI / 180.0);  // NOTE(review): float math in a predictor
+  if (angle > 0 && angle < 90) {
+    dx = -((int)(256 / t));  // negative: z1 asserts dx < 0
+    dy = 1;  // z1 only asserts this value
+    highbd_dr_prediction_z1(dst, stride, bs, above, left, dx, dy, bd);
+  } else if (angle > 90 && angle < 180) {
+    t = -t;  // tan is negative in this range; make it positive
+    dx = (int)(256 / t);
+    dy = (int)(256 * t);
+    highbd_dr_prediction_z2(dst, stride, bs, above, left, dx, dy, bd);
+  } else if (angle > 180 && angle < 270) {
+    dx = 1;  // z3 only asserts this value
+    dy = -((int)(256 * t));  // negative: z3 asserts dy < 0
+    highbd_dr_prediction_z3(dst, stride, bs, above, left, dx, dy, bd);
+  } else if (angle == 90) {  // exactly vertical: copy the above row
+    highbd_v_predictor(dst, stride, bs, above, left, bd);
+  } else if (angle == 180) {  // exactly horizontal: replicate the left column
+    highbd_h_predictor(dst, stride, bs, above, left, bd);
+  }  // any other angle leaves dst untouched
+}
+
+static void highbd_filter_intra_predictors_4tap(uint16_t *dst, ptrdiff_t stride,
+                                                int bs, const uint16_t *above,
+                                                const uint16_t *left, int mode,
+                                                int bd) {  // mode: taps row
+  int k, r, c;
+  int pred[33][65];  // row 0 / column 0 hold the edges; shadows global pred
+  int mean, ipred;
+  const TX_SIZE tx_size = (bs == 32) ? TX_32X32 :
+      ((bs == 16) ? TX_16X16 : ((bs == 8) ? TX_8X8 : (TX_4X4)));
+  const int c0 = filter_intra_taps_4[tx_size][mode][0];  // above
+  const int c1 = filter_intra_taps_4[tx_size][mode][1];  // left
+  const int c2 = filter_intra_taps_4[tx_size][mode][2];  // above-left
+  const int c3 = filter_intra_taps_4[tx_size][mode][3];  // above-right
+
+  k = 0;
+  mean = 0;
+  while (k < bs) {
+    mean = mean + (int)left[k];
+    mean = mean + (int)above[k];
+    k++;
+  }
+  mean = (mean + bs) / (2 * bs);  // rounded mean of the 2 * bs edge samples
+
+  for (r = 0; r < bs; ++r)
+    pred[r + 1][0] = (int)left[r] - mean;  // column 0: mean-removed left edge
+
+  for (c = 0; c < 2 * bs + 1; ++c)
+    pred[0][c] = (int)above[c - 1] - mean;  // row 0; c == 0 reads above[-1]
+
+  for (r = 1; r < bs + 1; ++r)
+    for (c = 1; c < 2 * bs + 1 - r; ++c) {  // each row is one entry shorter
+      ipred = c0 * pred[r - 1][c] + c1 * pred[r][c - 1] +
+          c2 * pred[r - 1][c - 1] + c3 * pred[r - 1][c + 1];
+      pred[r][c] = ipred < 0 ?  // round the magnitude, then restore the sign
+          -((-ipred + FILTER_INTRA_ROUND_VAL) >> FILTER_INTRA_PREC_BITS) :
+          ((ipred + FILTER_INTRA_ROUND_VAL) >> FILTER_INTRA_PREC_BITS);
+    }
+
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) {
+      ipred = pred[r + 1][c + 1] + mean;  // add the mean back
+      dst[c] = clip_pixel_highbd(ipred, bd);
+    }
+    dst += stride;
+  }
+}
+
+static void highbd_dc_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                       int bs, const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, DC_PRED,
+                                      bd);
+}
+
+static void highbd_v_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                      int bs, const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, V_PRED,
+                                      bd);
+}
+
+static void highbd_h_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                      int bs, const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, H_PRED,
+                                      bd);
+}
+
+static void highbd_d45_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                        int bs, const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D45_PRED,
+                                      bd);
+}
+
+static void highbd_d135_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D135_PRED,
+                                      bd);
+}
+
+static void highbd_d117_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D117_PRED,
+                                      bd);
+}
+
+static void highbd_d153_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D153_PRED,
+                                      bd);
+}
+
+static void highbd_d207_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D207_PRED,
+                                      bd);
+}
+
+static void highbd_d63_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                        int bs, const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D63_PRED,
+                                      bd);
+}
+
+static void highbd_tm_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                       int bs, const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, TM_PRED,
+                                      bd);
+}
+
+static void (*const highbd_filter_intra_predictors[EXT_INTRA_MODES])(
+    uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above,
+    const uint16_t *left, int bd) = {
+        highbd_dc_filter_predictor, highbd_v_filter_predictor,
+        highbd_h_filter_predictor, highbd_d45_filter_predictor,
+        highbd_d135_filter_predictor, highbd_d117_filter_predictor,
+        highbd_d153_filter_predictor, highbd_d207_filter_predictor,
+        highbd_d63_filter_predictor, highbd_tm_filter_predictor,
+};  // order: DC, V, H, D45, D135, D117, D153, D207, D63, TM; never written
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_EXT_INTRA
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static void build_intra_predictors_high(const MACROBLOCKD *xd,
@@ -276,56 +826,71 @@
                                         int dst_stride,
                                         PREDICTION_MODE mode,
                                         TX_SIZE tx_size,
-#if CONFIG_MISC_FIXES
                                         int n_top_px, int n_topright_px,
                                         int n_left_px, int n_bottomleft_px,
-#else
-                                        int up_available,
-                                        int left_available,
-                                        int right_available,
-#endif
-                                        int x, int y,
-                                        int plane, int bd) {
+                                        int plane) {
   int i;
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-#if CONFIG_MISC_FIXES
-  DECLARE_ALIGNED(16, uint16_t, left_col[32]);
-#else
   DECLARE_ALIGNED(16, uint16_t, left_col[64]);
-#endif
   DECLARE_ALIGNED(16, uint16_t, above_data[64 + 16]);
   uint16_t *above_row = above_data + 16;
   const uint16_t *const_above_row = above_row;
   const int bs = 4 << tx_size;
-#if CONFIG_MISC_FIXES
+  int need_left = extend_modes[mode] & NEED_LEFT;
+  int need_above = extend_modes[mode] & NEED_ABOVE;
   const uint16_t *above_ref = ref - ref_stride;
-#else
-  int frame_width, frame_height;
-  int x0, y0;
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-#endif
-  const int need_left = extend_modes[mode] & NEED_LEFT;
-  const int need_above = extend_modes[mode] & NEED_ABOVE;
-  const int need_aboveright = extend_modes[mode] & NEED_ABOVERIGHT;
-  int base = 128 << (bd - 8);
+  int base = 128 << (xd->bd - 8);
   // 127 127 127 .. 127 127 127 127 127 127
   // 129  A   B  ..  Y   Z
   // 129  C   D  ..  W   X
   // 129  E   F  ..  U   V
   // 129  G   H  ..  S   T   T   T   T   T
 
-#if CONFIG_MISC_FIXES
-  (void) x;
-  (void) y;
+#if CONFIG_EXT_INTRA
+  const EXT_INTRA_MODE_INFO *ext_intra_mode_info =
+      &xd->mi[0]->mbmi.ext_intra_mode_info;
+  const EXT_INTRA_MODE ext_intra_mode =
+      ext_intra_mode_info->ext_intra_mode[plane != 0];
+  int p_angle = 0;
+
+  if (mode != DC_PRED && mode != TM_PRED &&
+      xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+    p_angle = mode_to_angle_map[mode] +
+        xd->mi[0]->mbmi.angle_delta[plane != 0] * ANGLE_STEP;
+    if (p_angle <= 90)
+      need_above = 1, need_left = 0;
+    else if (p_angle < 180)
+      need_above = 1, need_left = 1;
+    else
+      need_above = 0, need_left = 1;
+  }
+
+  if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
+    EXT_INTRA_MODE ext_intra_mode =
+        ext_intra_mode_info->ext_intra_mode[plane != 0];
+    need_left = ext_intra_extend_modes[ext_intra_mode] & NEED_LEFT;
+    need_above = ext_intra_extend_modes[ext_intra_mode] & NEED_ABOVE;
+  }
+#endif  // CONFIG_EXT_INTRA
+
   (void) plane;
-  (void) need_left;
-  (void) need_above;
-  (void) need_aboveright;
 
   // NEED_LEFT
-  if (extend_modes[mode] & NEED_LEFT) {
+  if (need_left) {
+#if CONFIG_EXT_INTRA
+    int need_bottom;
+    if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
+        need_bottom = 0;
+    } else if (mode != DC_PRED && mode != TM_PRED &&
+        xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+        need_bottom = p_angle > 180;
+    } else {
+      need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+    }
+#else
     const int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+#endif  // CONFIG_EXT_INTRA
     i = 0;
     if (n_left_px > 0) {
       for (; i < n_left_px; i++)
@@ -343,8 +908,20 @@
   }
 
   // NEED_ABOVE
-  if (extend_modes[mode] & NEED_ABOVE) {
+  if (need_above) {
+#if CONFIG_EXT_INTRA
+    int need_right;
+    if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
+      need_right = 1;
+    } else if (mode != DC_PRED && mode != TM_PRED &&
+        xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+      need_right = p_angle < 90;
+    } else {
+      need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+    }
+#else
     const int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+#endif  // CONFIG_EXT_INTRA
     if (n_top_px > 0) {
       memcpy(above_row, above_ref, n_top_px * 2);
       i = n_top_px;
@@ -360,142 +937,41 @@
     }
   }
 
-  if (extend_modes[mode] & NEED_ABOVELEFT) {
+#if CONFIG_EXT_INTRA
+  if (ext_intra_mode_info->use_ext_intra_mode[plane != 0] ||
+      (extend_modes[mode] & NEED_ABOVELEFT) ||
+      (mode != DC_PRED && mode != TM_PRED &&
+        xd->mi[0]->mbmi.sb_type >= BLOCK_8X8)) {
     above_row[-1] = n_top_px > 0 ?
         (n_left_px > 0 ? above_ref[-1] : base + 1) : base - 1;
   }
 #else
-  // Get current frame pointer, width and height.
-  if (plane == 0) {
-    frame_width = xd->cur_buf->y_width;
-    frame_height = xd->cur_buf->y_height;
-  } else {
-    frame_width = xd->cur_buf->uv_width;
-    frame_height = xd->cur_buf->uv_height;
+  if ((extend_modes[mode] & NEED_ABOVELEFT)) {
+    above_row[-1] = n_top_px > 0 ?
+        (n_left_px > 0 ? above_ref[-1] : base + 1) : base - 1;
+  }
+#endif  // CONFIG_EXT_INTRA
+
+#if CONFIG_EXT_INTRA
+  if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
+    highbd_filter_intra_predictors[ext_intra_mode](dst, dst_stride, bs,
+        const_above_row, left_col, xd->bd);
+    return;
   }
 
-  // Get block position in current frame.
-  x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
-  y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
-
-  // NEED_LEFT
-  if (need_left) {
-    if (left_available) {
-      if (xd->mb_to_bottom_edge < 0) {
-        /* slower path if the block needs border extension */
-        if (y0 + bs <= frame_height) {
-          for (i = 0; i < bs; ++i)
-            left_col[i] = ref[i * ref_stride - 1];
-        } else {
-          const int extend_bottom = frame_height - y0;
-          for (i = 0; i < extend_bottom; ++i)
-            left_col[i] = ref[i * ref_stride - 1];
-          for (; i < bs; ++i)
-            left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
-        }
-      } else {
-        /* faster path if the block does not need extension */
-        for (i = 0; i < bs; ++i)
-          left_col[i] = ref[i * ref_stride - 1];
-      }
-    } else {
-      // TODO(Peter): this value should probably change for high bitdepth
-      vpx_memset16(left_col, base + 1, bs);
-    }
+  if (mode != DC_PRED && mode != TM_PRED &&
+      xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+    highbd_dr_predictor(dst, dst_stride, bs, const_above_row, left_col,
+                        p_angle, xd->bd);
+    return;
   }
-
-  // NEED_ABOVE
-  if (need_above) {
-    if (up_available) {
-      const uint16_t *above_ref = ref - ref_stride;
-      if (xd->mb_to_right_edge < 0) {
-        /* slower path if the block needs border extension */
-        if (x0 + bs <= frame_width) {
-          memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
-        } else if (x0 <= frame_width) {
-          const int r = frame_width - x0;
-          memcpy(above_row, above_ref, r * sizeof(above_row[0]));
-          vpx_memset16(above_row + r, above_row[r - 1], x0 + bs - frame_width);
-        }
-      } else {
-        /* faster path if the block does not need extension */
-        if (bs == 4 && right_available && left_available) {
-          const_above_row = above_ref;
-        } else {
-          memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
-        }
-      }
-      above_row[-1] = left_available ? above_ref[-1] : (base + 1);
-    } else {
-      vpx_memset16(above_row, base - 1, bs);
-      above_row[-1] = base - 1;
-    }
-  }
-
-  // NEED_ABOVERIGHT
-  if (need_aboveright) {
-    if (up_available) {
-      const uint16_t *above_ref = ref - ref_stride;
-      if (xd->mb_to_right_edge < 0) {
-        /* slower path if the block needs border extension */
-        if (x0 + 2 * bs <= frame_width) {
-          if (right_available && bs == 4) {
-            memcpy(above_row, above_ref, 2 * bs * sizeof(above_row[0]));
-          } else {
-            memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
-            vpx_memset16(above_row + bs, above_row[bs - 1], bs);
-          }
-        } else if (x0 + bs <= frame_width) {
-          const int r = frame_width - x0;
-          if (right_available && bs == 4) {
-            memcpy(above_row, above_ref, r * sizeof(above_row[0]));
-            vpx_memset16(above_row + r, above_row[r - 1],
-                         x0 + 2 * bs - frame_width);
-          } else {
-            memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
-            vpx_memset16(above_row + bs, above_row[bs - 1], bs);
-          }
-        } else if (x0 <= frame_width) {
-          const int r = frame_width - x0;
-          memcpy(above_row, above_ref, r * sizeof(above_row[0]));
-          vpx_memset16(above_row + r, above_row[r - 1],
-                       x0 + 2 * bs - frame_width);
-        }
-        // TODO(Peter) this value should probably change for high bitdepth
-        above_row[-1] = left_available ? above_ref[-1] : (base + 1);
-      } else {
-        /* faster path if the block does not need extension */
-        if (bs == 4 && right_available && left_available) {
-          const_above_row = above_ref;
-        } else {
-          memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
-          if (bs == 4 && right_available)
-            memcpy(above_row + bs, above_ref + bs, bs * sizeof(above_row[0]));
-          else
-            vpx_memset16(above_row + bs, above_row[bs - 1], bs);
-          // TODO(Peter): this value should probably change for high bitdepth
-          above_row[-1] = left_available ? above_ref[-1] : (base + 1);
-        }
-      }
-    } else {
-      vpx_memset16(above_row, base - 1, bs * 2);
-      // TODO(Peter): this value should probably change for high bitdepth
-      above_row[-1] = base - 1;
-    }
-  }
-#endif
+#endif  // CONFIG_EXT_INTRA
 
   // predict
   if (mode == DC_PRED) {
-#if CONFIG_MISC_FIXES
     dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride,
                                                        const_above_row,
                                                        left_col, xd->bd);
-#else
-    dc_pred_high[left_available][up_available][tx_size](dst, dst_stride,
-                                                        const_above_row,
-                                                        left_col, xd->bd);
-#endif
   } else {
     pred_high[mode][tx_size](dst, dst_stride, const_above_row, left_col,
                              xd->bd);
@@ -506,28 +982,44 @@
 static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
                                    int ref_stride, uint8_t *dst, int dst_stride,
                                    PREDICTION_MODE mode, TX_SIZE tx_size,
-#if CONFIG_MISC_FIXES
                                    int n_top_px, int n_topright_px,
                                    int n_left_px, int n_bottomleft_px,
-#else
-                                   int up_available, int left_available,
-                                   int right_available,
-#endif
-                                   int x, int y, int plane) {
+                                   int plane) {
   int i;
-#if CONFIG_MISC_FIXES
   DECLARE_ALIGNED(16, uint8_t, left_col[64]);
   const uint8_t *above_ref = ref - ref_stride;
-#else
-  DECLARE_ALIGNED(16, uint8_t, left_col[32]);
-  int frame_width, frame_height;
-  int x0, y0;
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-#endif
   DECLARE_ALIGNED(16, uint8_t, above_data[64 + 16]);
   uint8_t *above_row = above_data + 16;
   const uint8_t *const_above_row = above_row;
   const int bs = 4 << tx_size;
+  int need_left = extend_modes[mode] & NEED_LEFT;
+  int need_above = extend_modes[mode] & NEED_ABOVE;
+#if CONFIG_EXT_INTRA
+  const EXT_INTRA_MODE_INFO *ext_intra_mode_info =
+      &xd->mi[0]->mbmi.ext_intra_mode_info;
+  const EXT_INTRA_MODE ext_intra_mode =
+      ext_intra_mode_info->ext_intra_mode[plane != 0];
+  int p_angle = 0;
+
+  if (mode != DC_PRED && mode != TM_PRED &&
+      xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+    p_angle = mode_to_angle_map[mode] +
+        xd->mi[0]->mbmi.angle_delta[plane != 0] * ANGLE_STEP;
+    if (p_angle <= 90)
+      need_above = 1, need_left = 0;
+    else if (p_angle < 180)
+      need_above = 1, need_left = 1;
+    else
+      need_above = 0, need_left = 1;
+  }
+
+  if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
+    // Reuse the function-scope ext_intra_mode read above; redeclaring it
+    // here would only shadow the same value.
+    need_left = ext_intra_extend_modes[ext_intra_mode] & NEED_LEFT;
+    need_above = ext_intra_extend_modes[ext_intra_mode] & NEED_ABOVE;
+  }
+#endif  // CONFIG_EXT_INTRA
 
   // 127 127 127 .. 127 127 127 127 127 127
   // 129  A   B  ..  Y   Z
@@ -536,34 +1028,28 @@
   // 129  G   H  ..  S   T   T   T   T   T
   // ..
 
-#if CONFIG_MISC_FIXES
   (void) xd;
-  (void) x;
-  (void) y;
   (void) plane;
   assert(n_top_px >= 0);
   assert(n_topright_px >= 0);
   assert(n_left_px >= 0);
   assert(n_bottomleft_px >= 0);
-#else
-  // Get current frame pointer, width and height.
-  if (plane == 0) {
-    frame_width = xd->cur_buf->y_width;
-    frame_height = xd->cur_buf->y_height;
-  } else {
-    frame_width = xd->cur_buf->uv_width;
-    frame_height = xd->cur_buf->uv_height;
-  }
-
-  // Get block position in current frame.
-  x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
-  y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
-#endif
 
   // NEED_LEFT
-  if (extend_modes[mode] & NEED_LEFT) {
-#if CONFIG_MISC_FIXES
+  if (need_left) {
+#if CONFIG_EXT_INTRA
+    int need_bottom;
+    if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
+      need_bottom = 0;
+    } else if (mode != DC_PRED && mode != TM_PRED &&
+        xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+      need_bottom = p_angle > 180;
+    } else {
+      need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+    }
+#else
     const int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+#endif  // CONFIG_EXT_INTRA
     i = 0;
     if (n_left_px > 0) {
       for (; i < n_left_px; i++)
@@ -578,35 +1064,23 @@
     } else {
       memset(left_col, 129, bs << need_bottom);
     }
-#else
-    if (left_available) {
-      if (xd->mb_to_bottom_edge < 0) {
-        /* slower path if the block needs border extension */
-        if (y0 + bs <= frame_height) {
-          for (i = 0; i < bs; ++i)
-            left_col[i] = ref[i * ref_stride - 1];
-        } else {
-          const int extend_bottom = frame_height - y0;
-          for (i = 0; i < extend_bottom; ++i)
-            left_col[i] = ref[i * ref_stride - 1];
-          for (; i < bs; ++i)
-            left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
-        }
-      } else {
-        /* faster path if the block does not need extension */
-        for (i = 0; i < bs; ++i)
-          left_col[i] = ref[i * ref_stride - 1];
-      }
-    } else {
-      memset(left_col, 129, bs);
-    }
-#endif
   }
 
   // NEED_ABOVE
-  if (extend_modes[mode] & NEED_ABOVE) {
-#if CONFIG_MISC_FIXES
+  if (need_above) {
+#if CONFIG_EXT_INTRA
+    int need_right;
+    if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
+      need_right = 1;
+    } else if (mode != DC_PRED && mode != TM_PRED &&
+        xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+      need_right = p_angle < 90;
+    } else {
+      need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+    }
+#else
     const int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+#endif  // CONFIG_EXT_INTRA
     if (n_top_px > 0) {
       memcpy(above_row, above_ref, n_top_px);
       i = n_top_px;
@@ -620,115 +1094,58 @@
     } else {
       memset(above_row, 127, bs << need_right);
     }
-#else
-    if (up_available) {
-      const uint8_t *above_ref = ref - ref_stride;
-      if (xd->mb_to_right_edge < 0) {
-        /* slower path if the block needs border extension */
-        if (x0 + bs <= frame_width) {
-          memcpy(above_row, above_ref, bs);
-        } else if (x0 <= frame_width) {
-          const int r = frame_width - x0;
-          memcpy(above_row, above_ref, r);
-          memset(above_row + r, above_row[r - 1], x0 + bs - frame_width);
-        }
-      } else {
-        /* faster path if the block does not need extension */
-        if (bs == 4 && right_available && left_available) {
-          const_above_row = above_ref;
-        } else {
-          memcpy(above_row, above_ref, bs);
-        }
-      }
-      above_row[-1] = left_available ? above_ref[-1] : 129;
-    } else {
-      memset(above_row, 127, bs);
-      above_row[-1] = 127;
-    }
-#endif
   }
 
-#if CONFIG_MISC_FIXES
-  if (extend_modes[mode] & NEED_ABOVELEFT) {
+#if CONFIG_EXT_INTRA
+  if (ext_intra_mode_info->use_ext_intra_mode[plane != 0] ||
+      (extend_modes[mode] & NEED_ABOVELEFT) ||
+      (mode != DC_PRED && mode != TM_PRED &&
+          xd->mi[0]->mbmi.sb_type >= BLOCK_8X8)) {
     above_row[-1] = n_top_px > 0 ? (n_left_px > 0 ? above_ref[-1] : 129) : 127;
   }
 #else
-  // NEED_ABOVERIGHT
-  if (extend_modes[mode] & NEED_ABOVERIGHT) {
-    if (up_available) {
-      const uint8_t *above_ref = ref - ref_stride;
-      if (xd->mb_to_right_edge < 0) {
-        /* slower path if the block needs border extension */
-        if (x0 + 2 * bs <= frame_width) {
-          if (right_available && bs == 4) {
-            memcpy(above_row, above_ref, 2 * bs);
-          } else {
-            memcpy(above_row, above_ref, bs);
-            memset(above_row + bs, above_row[bs - 1], bs);
-          }
-        } else if (x0 + bs <= frame_width) {
-          const int r = frame_width - x0;
-          if (right_available && bs == 4) {
-            memcpy(above_row, above_ref, r);
-            memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width);
-          } else {
-            memcpy(above_row, above_ref, bs);
-            memset(above_row + bs, above_row[bs - 1], bs);
-          }
-        } else if (x0 <= frame_width) {
-          const int r = frame_width - x0;
-          memcpy(above_row, above_ref, r);
-          memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width);
-        }
-      } else {
-        /* faster path if the block does not need extension */
-        if (bs == 4 && right_available && left_available) {
-          const_above_row = above_ref;
-        } else {
-          memcpy(above_row, above_ref, bs);
-          if (bs == 4 && right_available)
-            memcpy(above_row + bs, above_ref + bs, bs);
-          else
-            memset(above_row + bs, above_row[bs - 1], bs);
-        }
-      }
-      above_row[-1] = left_available ? above_ref[-1] : 129;
-    } else {
-      memset(above_row, 127, bs * 2);
-      above_row[-1] = 127;
-    }
+  if ((extend_modes[mode] & NEED_ABOVELEFT)) {
+    above_row[-1] = n_top_px > 0 ? (n_left_px > 0 ? above_ref[-1] : 129) : 127;
   }
-#endif
+#endif  // CONFIG_EXT_INTRA
+
+#if CONFIG_EXT_INTRA
+  if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
+    filter_intra_predictors[ext_intra_mode](dst, dst_stride, bs,
+        const_above_row, left_col);
+    return;
+  }
+
+  if (mode != DC_PRED && mode != TM_PRED &&
+      xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+    dr_predictor(dst, dst_stride, tx_size, const_above_row, left_col, p_angle);
+    return;
+  }
+#endif  // CONFIG_EXT_INTRA
 
   // predict
   if (mode == DC_PRED) {
-#if CONFIG_MISC_FIXES
     dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride,
                                                   const_above_row, left_col);
-#else
-    dc_pred[left_available][up_available][tx_size](dst, dst_stride,
-                                                   const_above_row, left_col);
-#endif
   } else {
     pred[mode][tx_size](dst, dst_stride, const_above_row, left_col);
   }
 }
 
 void vp10_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, int bhl_in,
-                             TX_SIZE tx_size, PREDICTION_MODE mode,
-                             const uint8_t *ref, int ref_stride,
-                             uint8_t *dst, int dst_stride,
-                             int aoff, int loff, int plane) {
+                              TX_SIZE tx_size, PREDICTION_MODE mode,
+                              const uint8_t *ref, int ref_stride,
+                              uint8_t *dst, int dst_stride,
+                              int aoff, int loff, int plane) {
   const int txw = (1 << tx_size);
   const int have_top = loff || xd->up_available;
   const int have_left = aoff || xd->left_available;
   const int x = aoff * 4;
   const int y = loff * 4;
-#if CONFIG_MISC_FIXES
   const int bw = VPXMAX(2, 1 << bwl_in);
   const int bh = VPXMAX(2, 1 << bhl_in);
-  const int mi_row = -xd->mb_to_top_edge >> 6;
-  const int mi_col = -xd->mb_to_left_edge >> 6;
+  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
   const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const int right_available =
@@ -744,16 +1161,49 @@
   const int wpx = 4 * bw;
   const int hpx = 4 * bh;
   const int txpx = 4 * txw;
+  // Distance from the right edge of this prediction block to
+  // the frame right edge
+  const int xr = (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) +
+      (wpx - x - txpx);
+  // Distance from the bottom edge of this prediction block to
+  // the frame bottom edge
+  const int yd = (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) +
+      (hpx - y - txpx);
 
-  int xr = (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + (wpx - x - txpx);
-  int yd =
-      (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + (hpx - y - txpx);
+  if (xd->mi[0]->mbmi.palette_mode_info.palette_size[plane != 0] > 0) {
+    const int bs = 4 * (1 << tx_size);
+    const int stride = 4 * (1 << bwl_in);
+    int r, c;
+    uint8_t *map = NULL;
+#if CONFIG_VP9_HIGHBITDEPTH
+    uint16_t *palette = xd->mi[0]->mbmi.palette_mode_info.palette_colors +
+        plane * PALETTE_MAX_SIZE;
 #else
-  const int bw = (1 << bwl_in);
-  const int have_right = (aoff + txw) < bw;
-#endif  // CONFIG_MISC_FIXES
+    uint8_t *palette = xd->mi[0]->mbmi.palette_mode_info.palette_colors +
+        plane * PALETTE_MAX_SIZE;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if CONFIG_MISC_FIXES
+    map = xd->plane[plane != 0].color_index_map;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+      for (r = 0; r < bs; ++r)
+        for (c = 0; c < bs; ++c)
+          dst16[r * dst_stride + c] =
+              palette[map[(r + y) * stride + c + x]];
+    } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      for (r = 0; r < bs; ++r)
+        for (c = 0; c < bs; ++c)
+          dst[r * dst_stride + c] = palette[map[(r + y) * stride + c + x]];
+#if CONFIG_VP9_HIGHBITDEPTH
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    return;
+  }
+
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     build_intra_predictors_high(xd, ref, ref_stride, dst, dst_stride, mode,
@@ -762,7 +1212,7 @@
                                 have_top && have_right ? VPXMIN(txpx, xr) : 0,
                                 have_left ? VPXMIN(txpx, yd + txpx) : 0,
                                 have_bottom && have_left ? VPXMIN(txpx, yd) : 0,
-                                x, y, plane, xd->bd);
+                                plane);
     return;
   }
 #endif
@@ -772,20 +1222,7 @@
                          have_top && have_right ? VPXMIN(txpx, xr) : 0,
                          have_left ? VPXMIN(txpx, yd + txpx) : 0,
                          have_bottom && have_left ? VPXMIN(txpx, yd) : 0,
-                         x, y, plane);
-#else  // CONFIG_MISC_FIXES
-  (void) bhl_in;
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    build_intra_predictors_high(xd, ref, ref_stride, dst, dst_stride, mode,
-                                tx_size, have_top, have_left, have_right,
-                                x, y, plane, xd->bd);
-    return;
-  }
-#endif
-  build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size,
-                         have_top, have_left, have_right, x, y, plane);
-#endif  // CONFIG_MISC_FIXES
+                         plane);
 }
 
 void vp10_init_intra_predictors(void) {

diff --git a/vp10/common/scale.c b/vp10/common/scale.c
index ce6062c..65e14a9 100644
--- a/vp10/common/scale.c
+++ b/vp10/common/scale.c

@@ -46,15 +46,15 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp10_setup_scale_factors_for_frame(struct scale_factors *sf,
-                                       int other_w, int other_h,
-                                       int this_w, int this_h,
-                                       int use_highbd) {
+                                        int other_w, int other_h,
+                                        int this_w, int this_h,
+                                        int use_highbd) {
 #else
 void vp10_setup_scale_factors_for_frame(struct scale_factors *sf,
-                                       int other_w, int other_h,
-                                       int this_w, int this_h) {
+                                        int other_w, int other_h,
+                                        int this_w, int this_h) {
 #endif
-  if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
+    if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
     sf->x_scale_fp = REF_INVALID_SCALE;
     sf->y_scale_fp = REF_INVALID_SCALE;
     return;
@@ -79,6 +79,16 @@
   // applied in one direction only, and not at all for 0,0, seems to give the
   // best quality, but it may be worth trying an additional mode that does
   // do the filtering on full-pel.
+#if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+  sf->predict_ni[0][0][0] = vpx_convolve8_c;
+  sf->predict_ni[0][0][1] = vpx_convolve8_avg_c;
+  sf->predict_ni[0][1][0] = vpx_convolve8_c;
+  sf->predict_ni[0][1][1] = vpx_convolve8_avg_c;
+  sf->predict_ni[1][0][0] = vpx_convolve8_c;
+  sf->predict_ni[1][0][1] = vpx_convolve8_avg_c;
+  sf->predict_ni[1][1][0] = vpx_convolve8;
+  sf->predict_ni[1][1][1] = vpx_convolve8_avg;
+#endif  // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
   if (sf->x_step_q4 == 16) {
     if (sf->y_step_q4 == 16) {
       // No scaling in either direction.
@@ -119,8 +129,19 @@
   // 2D subpel motion always gets filtered in both directions
   sf->predict[1][1][0] = vpx_convolve8;
   sf->predict[1][1][1] = vpx_convolve8_avg;
+
 #if CONFIG_VP9_HIGHBITDEPTH
   if (use_highbd) {
+#if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+    sf->highbd_predict_ni[0][0][0] = vpx_highbd_convolve8_c;
+    sf->highbd_predict_ni[0][0][1] = vpx_highbd_convolve8_avg_c;
+    sf->highbd_predict_ni[0][1][0] = vpx_highbd_convolve8_c;
+    sf->highbd_predict_ni[0][1][1] = vpx_highbd_convolve8_avg_c;
+    sf->highbd_predict_ni[1][0][0] = vpx_highbd_convolve8_c;
+    sf->highbd_predict_ni[1][0][1] = vpx_highbd_convolve8_avg_c;
+    sf->highbd_predict_ni[1][1][0] = vpx_highbd_convolve8;
+    sf->highbd_predict_ni[1][1][1] = vpx_highbd_convolve8_avg;
+#endif  // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
     if (sf->x_step_q4 == 16) {
       if (sf->y_step_q4 == 16) {
         // No scaling in either direction.
@@ -162,5 +183,5 @@
     sf->highbd_predict[1][1][0] = vpx_highbd_convolve8;
     sf->highbd_predict[1][1][1] = vpx_highbd_convolve8_avg;
   }
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }

diff --git a/vp10/common/scale.h b/vp10/common/scale.h
index 833f6c4..604b9d2 100644
--- a/vp10/common/scale.h
+++ b/vp10/common/scale.h

@@ -34,7 +34,15 @@
   convolve_fn_t predict[2][2][2];  // horiz, vert, avg
 #if CONFIG_VP9_HIGHBITDEPTH
   highbd_convolve_fn_t highbd_predict[2][2][2];  // horiz, vert, avg
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Functions for non-interpolating filters (those that filter zero offsets)
+#if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+  convolve_fn_t predict_ni[2][2][2];  // horiz, vert, avg
+#if CONFIG_VP9_HIGHBITDEPTH
+  highbd_convolve_fn_t highbd_predict_ni[2][2][2];  // horiz, vert, avg
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
 };
 
 MV32 vp10_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf);
@@ -48,7 +56,7 @@
 void vp10_setup_scale_factors_for_frame(struct scale_factors *sf,
                                        int other_w, int other_h,
                                        int this_w, int this_h);
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 static INLINE int vp10_is_valid_scale(const struct scale_factors *sf) {
   return sf->x_scale_fp != REF_INVALID_SCALE &&

diff --git a/vp10/common/scan.c b/vp10/common/scan.c
index 7217f6d..23a7b98 100644
--- a/vp10/common/scan.c
+++ b/vp10/common/scan.c

@@ -702,7 +702,228 @@
   {default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
 };
 
-const scan_order vp10_scan_orders[TX_SIZES][TX_TYPES] = {
+#if CONFIG_EXT_TX
+const scan_order vp10_intra_scan_orders[TX_SIZES][TX_TYPES] = {
+  {  // TX_4X4
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {row_scan_4x4,     vp10_row_iscan_4x4,     row_scan_4x4_neighbors},
+    {col_scan_4x4,     vp10_col_iscan_4x4,     col_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+  }, {  // TX_8X8
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {row_scan_8x8,     vp10_row_iscan_8x8,     row_scan_8x8_neighbors},
+    {col_scan_8x8,     vp10_col_iscan_8x8,     col_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+  }, {  // TX_16X16
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {row_scan_16x16,     vp10_row_iscan_16x16,     row_scan_16x16_neighbors},
+    {col_scan_16x16,     vp10_col_iscan_16x16,     col_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+  }, {  // TX_32X32
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+  }
+};
+
+const scan_order vp10_inter_scan_orders[TX_SIZES][TX_TYPES] = {
+  {  // TX_4X4
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+  }, {  // TX_8X8
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+  }, {  // TX_16X16
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+  }, {  // TX_32X32
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+  }
+};
+
+#else   // CONFIG_EXT_TX
+
+const scan_order vp10_intra_scan_orders[TX_SIZES][TX_TYPES] = {
   {  // TX_4X4
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
     {row_scan_4x4,     vp10_row_iscan_4x4,     row_scan_4x4_neighbors},
@@ -714,14 +935,21 @@
     {col_scan_8x8,     vp10_col_iscan_8x8,     col_scan_8x8_neighbors},
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors}
   }, {  // TX_16X16
-    {default_scan_16x16, vp10_default_iscan_16x16, default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
     {row_scan_16x16,     vp10_row_iscan_16x16,     row_scan_16x16_neighbors},
     {col_scan_16x16,     vp10_col_iscan_16x16,     col_scan_16x16_neighbors},
-    {default_scan_16x16, vp10_default_iscan_16x16, default_scan_16x16_neighbors}
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors}
   }, {  // TX_32X32
-    {default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
-    {default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
-    {default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
-    {default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
   }
 };
+#endif  // CONFIG_EXT_TX

diff --git a/vp10/common/scan.h b/vp10/common/scan.h
index f5a020f..aadae40 100644
--- a/vp10/common/scan.h
+++ b/vp10/common/scan.h

@@ -30,7 +30,7 @@
 } scan_order;
 
 extern const scan_order vp10_default_scan_orders[TX_SIZES];
-extern const scan_order vp10_scan_orders[TX_SIZES][TX_TYPES];
+extern const scan_order vp10_intra_scan_orders[TX_SIZES][TX_TYPES];
 
 static INLINE int get_coef_context(const int16_t *neighbors,
                                    const uint8_t *token_cache, int c) {
@@ -38,8 +38,31 @@
           token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
 }
 
-static INLINE const scan_order *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) {
-  return &vp10_scan_orders[tx_size][tx_type];
+static INLINE const scan_order *get_intra_scan(TX_SIZE tx_size,
+                                               TX_TYPE tx_type) {
+  return vp10_intra_scan_orders[tx_size] + tx_type;  // == &table[size][type]
+}
+
+#if CONFIG_EXT_TX
+extern const scan_order vp10_inter_scan_orders[TX_SIZES][TX_TYPES];
+
+static INLINE const scan_order *get_inter_scan(TX_SIZE tx_size,
+                                               TX_TYPE tx_type) {
+  return vp10_inter_scan_orders[tx_size] + tx_type;  // == &table[size][type]
+}
+#endif  // CONFIG_EXT_TX
+
+static INLINE const scan_order *get_scan(TX_SIZE tx_size,
+                                         TX_TYPE tx_type,
+                                         int is_inter) {
+#if CONFIG_EXT_TX
+  if (is_inter)  // inter blocks read the separate inter table
+    return &vp10_inter_scan_orders[tx_size][tx_type];
+  return &vp10_intra_scan_orders[tx_size][tx_type];
+#else
+  (void) is_inter;  // only the intra table is used in this configuration
+  return &vp10_intra_scan_orders[tx_size][tx_type];
+#endif  // CONFIG_EXT_TX
 }
 
 #ifdef __cplusplus

diff --git a/vp10/common/thread_common.c b/vp10/common/thread_common.c
index 0c7a1c2..a1f17e9 100644
--- a/vp10/common/thread_common.c
+++ b/vp10/common/thread_common.c

@@ -366,6 +366,20 @@
     for (j = 0; j < SWITCHABLE_FILTERS; j++)
       cm->counts.switchable_interp[i][j] += counts->switchable_interp[i][j];
 
+#if CONFIG_REF_MV
+  for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
+    for (j = 0; j < 2; ++j)
+      cm->counts.newmv_mode[i][j] += counts->newmv_mode[i][j];
+
+  for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
+    for (j = 0; j < 2; ++j)
+      cm->counts.zeromv_mode[i][j] += counts->zeromv_mode[i][j];
+
+  for (i = 0; i < REFMV_MODE_CONTEXTS; ++i)
+    for (j = 0; j < 2; ++j)
+      cm->counts.refmv_mode[i][j] += counts->refmv_mode[i][j];
+#endif
+
   for (i = 0; i < INTER_MODE_CONTEXTS; i++)
     for (j = 0; j < INTER_MODES; j++)
       cm->counts.inter_mode[i][j] += counts->inter_mode[i][j];
@@ -379,13 +393,14 @@
       cm->counts.comp_inter[i][j] += counts->comp_inter[i][j];
 
   for (i = 0; i < REF_CONTEXTS; i++)
-    for (j = 0; j < 2; j++)
+    for (j = 0; j < (SINGLE_REFS - 1); j++)
       for (k = 0; k < 2; k++)
-      cm->counts.single_ref[i][j][k] += counts->single_ref[i][j][k];
+        cm->counts.single_ref[i][j][k] += counts->single_ref[i][j][k];
 
   for (i = 0; i < REF_CONTEXTS; i++)
-    for (j = 0; j < 2; j++)
-      cm->counts.comp_ref[i][j] += counts->comp_ref[i][j];
+    for (j = 0; j < (COMP_REFS - 1); j++)
+      for (k = 0; k < 2; k++)
+        cm->counts.comp_ref[i][j][k] += counts->comp_ref[i][j][k];
 
   for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
     for (j = 0; j < TX_SIZES; j++)
@@ -401,6 +416,12 @@
   for (i = 0; i < TX_SIZES; i++)
     cm->counts.tx.tx_totals[i] += counts->tx.tx_totals[i];
 
+#if CONFIG_VAR_TX
+  for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i)
+    for (j = 0; j < 2; ++j)
+      cm->counts.txfm_partition[i][j] += counts->txfm_partition[i][j];
+#endif
+
   for (i = 0; i < SKIP_CONTEXTS; i++)
     for (j = 0; j < 2; j++)
       cm->counts.skip[i][j] += counts->skip[i][j];
@@ -435,6 +456,26 @@
       comps->fp[i] += comps_t->fp[i];
   }
 
+#if CONFIG_EXT_TX
+  for (i = 0; i < EXT_TX_SIZES; i++) {
+    int s, k;
+    for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+      if (use_inter_ext_tx_for_txsize[s][i]) {
+        for (k = 0; k < TX_TYPES; k++)
+          cm->counts.inter_ext_tx[s][i][k] += counts->inter_ext_tx[s][i][k];
+      }
+    }
+    for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+      if (use_intra_ext_tx_for_txsize[s][i]) {
+        int j;
+        for (j = 0; j < INTRA_MODES; ++j)
+          for (k = 0; k < TX_TYPES; k++)
+            cm->counts.intra_ext_tx[s][i][j][k] +=
+                counts->intra_ext_tx[s][i][j][k];
+      }
+    }
+  }
+#else
   for (i = 0; i < EXT_TX_SIZES; i++) {
     int j;
     for (j = 0; j < TX_TYPES; ++j)
@@ -445,8 +486,17 @@
     for (k = 0; k < TX_TYPES; k++)
       cm->counts.inter_ext_tx[i][k] += counts->inter_ext_tx[i][k];
   }
+#endif  // CONFIG_EXT_TX
 
-#if CONFIG_MISC_FIXES
+#if CONFIG_SUPERTX
+  for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; i++)
+    for (j = 0; j < TX_SIZES; j++)
+      for (k = 0; k < 2; k++)
+        cm->counts.supertx[i][j][k] += counts->supertx[i][j][k];
+  for (i = 0; i < TX_SIZES; i++)
+    cm->counts.supertx_size[i] += counts->supertx_size[i];
+#endif  // CONFIG_SUPERTX
+
   for (i = 0; i < PREDICTION_PROBS; i++)
     for (j = 0; j < 2; j++)
       cm->counts.seg.pred[i][j] += counts->seg.pred[i][j];
@@ -455,5 +505,10 @@
     cm->counts.seg.tree_total[i] += counts->seg.tree_total[i];
     cm->counts.seg.tree_mispred[i] += counts->seg.tree_mispred[i];
   }
-#endif
+
+#if CONFIG_EXT_INTRA
+  for (i = 0; i < PLANE_TYPES; ++i)
+    for (j = 0; j < 2; j++)
+      cm->counts.ext_intra[i][j] += counts->ext_intra[i][j];
+#endif  // CONFIG_EXT_INTRA
 }

diff --git a/vp10/common/vp10_fwd_txfm1d.c b/vp10/common/vp10_fwd_txfm1d.c
new file mode 100644
index 0000000..6e19e27
--- /dev/null
+++ b/vp10/common/vp10_fwd_txfm1d.c

@@ -0,0 +1,1530 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/common/vp10_fwd_txfm1d.h"
+#if CONFIG_COEFFICIENT_RANGE_CHECKING  // report and abort on stage overflow
+#define range_check(stage, input, buf, size, bit)                         \
+  {                                                                       \
+    int i, j;                                                             \
+    for (i = 0; i < size; ++i) {                                          \
+      int buf_bit = get_max_bit(abs(buf[i])) + 1;                         \
+      if (buf_bit > bit) {                                                \
+        printf("======== %s overflow ========\n", __func__);              \
+        printf("stage: %d node: %d\n", stage, i);                         \
+        printf("bit: %d buf_bit: %d buf[i]: %d\n", bit, buf_bit, buf[i]); \
+        printf("input:\n");                                               \
+        for (j = 0; j < size; j++) {                                      \
+          printf("%d,", input[j]);                                        \
+        }                                                                 \
+        printf("\n");                                                     \
+        assert(0 && "vp10_fwd_txfm1d.c: range_check overflow");           \
+      }                                                                   \
+    }                                                                     \
+  }
+#else  // range checking disabled: arguments are only cast to void
+#define range_check(stage, input, buf, size, bit) \
+  {                                               \
+    (void) stage;                                 \
+    (void) input;                                 \
+    (void) buf;                                   \
+    (void) size;                                  \
+    (void) bit;                                   \
+  }
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+
+void vp10_fdct4_new(const int32_t *input, int32_t *output,
+                    const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 4;  // element count passed to range_check
+  const int32_t *cospi;  // cospi_arr row selected from cos_bit[stage]
+
+  int32_t stage = 0;  // indexes cos_bit[] and stage_range[]
+  int32_t *bf0, *bf1;  // bf0 = stage source, bf1 = stage destination
+  int32_t step[4];  // scratch; stages alternate between output and step
+
+  // stage 0: range_check the raw input against stage_range[0]
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1: input -> output; sums/differences of mirrored pairs (i, 3 - i)
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0] + input[3];
+  bf1[1] = input[1] + input[2];
+  bf1[2] = -input[2] + input[1];
+  bf1[3] = -input[3] + input[0];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2: output -> step; half_btf on pairs (0,1) and (2,3)
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3: step -> output; pure reordering (0, 2, 1, 3), no arithmetic
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[2];
+  bf1[2] = bf0[1];
+  bf1[3] = bf0[3];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void vp10_fdct8_new(const int32_t *input, int32_t *output,
+                    const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 8;  // element count passed to range_check
+  const int32_t *cospi;  // cospi_arr row selected from cos_bit[stage]
+
+  int32_t stage = 0;  // indexes cos_bit[] and stage_range[]
+  int32_t *bf0, *bf1;  // bf0 = stage source, bf1 = stage destination
+  int32_t step[8];  // scratch; stages alternate between output and step
+
+  // stage 0: range_check the raw input against stage_range[0]
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1: input -> output; sums/differences of mirrored pairs (i, 7 - i)
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0] + input[7];
+  bf1[1] = input[1] + input[6];
+  bf1[2] = input[2] + input[5];
+  bf1[3] = input[3] + input[4];
+  bf1[4] = -input[4] + input[3];
+  bf1[5] = -input[5] + input[2];
+  bf1[6] = -input[6] + input[1];
+  bf1[7] = -input[7] + input[0];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2: output -> step; half_btf on 5 and 6
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = -bf0[2] + bf0[1];
+  bf1[3] = -bf0[3] + bf0[0];
+  bf1[4] = bf0[4];
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+  bf1[7] = bf0[7];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3: step -> output; half_btf on 0..3
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+  bf1[4] = bf0[4] + bf0[5];
+  bf1[5] = -bf0[5] + bf0[4];
+  bf1[6] = -bf0[6] + bf0[7];
+  bf1[7] = bf0[7] + bf0[6];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4: output -> step; half_btf on 4..7, 0..3 copied
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5: step -> output; pure reordering, no arithmetic
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[4];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[6];
+  bf1[4] = bf0[1];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[3];
+  bf1[7] = bf0[7];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void vp10_fdct16_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 16;  // element count passed to range_check
+  const int32_t *cospi;  // cospi_arr row selected from cos_bit[stage]
+
+  int32_t stage = 0;  // indexes cos_bit[] and stage_range[]
+  int32_t *bf0, *bf1;  // bf0 = stage source, bf1 = stage destination
+  int32_t step[16];  // scratch; stages alternate between output and step
+
+  // stage 0: range_check the raw input against stage_range[0]
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1: input -> output; sums/differences of mirrored pairs (i, 15 - i)
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0] + input[15];
+  bf1[1] = input[1] + input[14];
+  bf1[2] = input[2] + input[13];
+  bf1[3] = input[3] + input[12];
+  bf1[4] = input[4] + input[11];
+  bf1[5] = input[5] + input[10];
+  bf1[6] = input[6] + input[9];
+  bf1[7] = input[7] + input[8];
+  bf1[8] = -input[8] + input[7];
+  bf1[9] = -input[9] + input[6];
+  bf1[10] = -input[10] + input[5];
+  bf1[11] = -input[11] + input[4];
+  bf1[12] = -input[12] + input[3];
+  bf1[13] = -input[13] + input[2];
+  bf1[14] = -input[14] + input[1];
+  bf1[15] = -input[15] + input[0];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2: output -> step; half_btf on 10..13
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[7];
+  bf1[1] = bf0[1] + bf0[6];
+  bf1[2] = bf0[2] + bf0[5];
+  bf1[3] = bf0[3] + bf0[4];
+  bf1[4] = -bf0[4] + bf0[3];
+  bf1[5] = -bf0[5] + bf0[2];
+  bf1[6] = -bf0[6] + bf0[1];
+  bf1[7] = -bf0[7] + bf0[0];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3: step -> output; half_btf on 5 and 6
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = -bf0[2] + bf0[1];
+  bf1[3] = -bf0[3] + bf0[0];
+  bf1[4] = bf0[4];
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8] + bf0[11];
+  bf1[9] = bf0[9] + bf0[10];
+  bf1[10] = -bf0[10] + bf0[9];
+  bf1[11] = -bf0[11] + bf0[8];
+  bf1[12] = -bf0[12] + bf0[15];
+  bf1[13] = -bf0[13] + bf0[14];
+  bf1[14] = bf0[14] + bf0[13];
+  bf1[15] = bf0[15] + bf0[12];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4: output -> step; half_btf on 0..3, 9, 10, 13 and 14
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+  bf1[4] = bf0[4] + bf0[5];
+  bf1[5] = -bf0[5] + bf0[4];
+  bf1[6] = -bf0[6] + bf0[7];
+  bf1[7] = bf0[7] + bf0[6];
+  bf1[8] = bf0[8];
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
+  bf1[15] = bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5: step -> output; half_btf on 4..7, 0..3 copied
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+  bf1[8] = bf0[8] + bf0[9];
+  bf1[9] = -bf0[9] + bf0[8];
+  bf1[10] = -bf0[10] + bf0[11];
+  bf1[11] = bf0[11] + bf0[10];
+  bf1[12] = bf0[12] + bf0[13];
+  bf1[13] = -bf0[13] + bf0[12];
+  bf1[14] = -bf0[14] + bf0[15];
+  bf1[15] = bf0[15] + bf0[14];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6: output -> step; half_btf on 8..15, 0..7 copied
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7: step -> output; pure reordering, no arithmetic
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[8];
+  bf1[2] = bf0[4];
+  bf1[3] = bf0[12];
+  bf1[4] = bf0[2];
+  bf1[5] = bf0[10];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[14];
+  bf1[8] = bf0[1];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[5];
+  bf1[11] = bf0[13];
+  bf1[12] = bf0[3];
+  bf1[13] = bf0[11];
+  bf1[14] = bf0[7];
+  bf1[15] = bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void vp10_fdct32_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 32;  // element count passed to range_check
+  const int32_t *cospi;  // cospi_arr row selected from cos_bit[stage]
+
+  int32_t stage = 0;  // indexes cos_bit[] and stage_range[]
+  int32_t *bf0, *bf1;  // bf0 = stage source, bf1 = stage destination
+  int32_t step[32];  // scratch; stages alternate between output and step
+
+  // stage 0: range_check the raw input against stage_range[0]
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1: input -> output; sums/differences of mirrored pairs (i, 31 - i)
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0] + input[31];
+  bf1[1] = input[1] + input[30];
+  bf1[2] = input[2] + input[29];
+  bf1[3] = input[3] + input[28];
+  bf1[4] = input[4] + input[27];
+  bf1[5] = input[5] + input[26];
+  bf1[6] = input[6] + input[25];
+  bf1[7] = input[7] + input[24];
+  bf1[8] = input[8] + input[23];
+  bf1[9] = input[9] + input[22];
+  bf1[10] = input[10] + input[21];
+  bf1[11] = input[11] + input[20];
+  bf1[12] = input[12] + input[19];
+  bf1[13] = input[13] + input[18];
+  bf1[14] = input[14] + input[17];
+  bf1[15] = input[15] + input[16];
+  bf1[16] = -input[16] + input[15];
+  bf1[17] = -input[17] + input[14];
+  bf1[18] = -input[18] + input[13];
+  bf1[19] = -input[19] + input[12];
+  bf1[20] = -input[20] + input[11];
+  bf1[21] = -input[21] + input[10];
+  bf1[22] = -input[22] + input[9];
+  bf1[23] = -input[23] + input[8];
+  bf1[24] = -input[24] + input[7];
+  bf1[25] = -input[25] + input[6];
+  bf1[26] = -input[26] + input[5];
+  bf1[27] = -input[27] + input[4];
+  bf1[28] = -input[28] + input[3];
+  bf1[29] = -input[29] + input[2];
+  bf1[30] = -input[30] + input[1];
+  bf1[31] = -input[31] + input[0];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2: output -> step; half_btf on 20..27
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[15];
+  bf1[1] = bf0[1] + bf0[14];
+  bf1[2] = bf0[2] + bf0[13];
+  bf1[3] = bf0[3] + bf0[12];
+  bf1[4] = bf0[4] + bf0[11];
+  bf1[5] = bf0[5] + bf0[10];
+  bf1[6] = bf0[6] + bf0[9];
+  bf1[7] = bf0[7] + bf0[8];
+  bf1[8] = -bf0[8] + bf0[7];
+  bf1[9] = -bf0[9] + bf0[6];
+  bf1[10] = -bf0[10] + bf0[5];
+  bf1[11] = -bf0[11] + bf0[4];
+  bf1[12] = -bf0[12] + bf0[3];
+  bf1[13] = -bf0[13] + bf0[2];
+  bf1[14] = -bf0[14] + bf0[1];
+  bf1[15] = -bf0[15] + bf0[0];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit[stage]);
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3: step -> output; half_btf on 10..13
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[7];
+  bf1[1] = bf0[1] + bf0[6];
+  bf1[2] = bf0[2] + bf0[5];
+  bf1[3] = bf0[3] + bf0[4];
+  bf1[4] = -bf0[4] + bf0[3];
+  bf1[5] = -bf0[5] + bf0[2];
+  bf1[6] = -bf0[6] + bf0[1];
+  bf1[7] = -bf0[7] + bf0[0];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[23];
+  bf1[17] = bf0[17] + bf0[22];
+  bf1[18] = bf0[18] + bf0[21];
+  bf1[19] = bf0[19] + bf0[20];
+  bf1[20] = -bf0[20] + bf0[19];
+  bf1[21] = -bf0[21] + bf0[18];
+  bf1[22] = -bf0[22] + bf0[17];
+  bf1[23] = -bf0[23] + bf0[16];
+  bf1[24] = -bf0[24] + bf0[31];
+  bf1[25] = -bf0[25] + bf0[30];
+  bf1[26] = -bf0[26] + bf0[29];
+  bf1[27] = -bf0[27] + bf0[28];
+  bf1[28] = bf0[28] + bf0[27];
+  bf1[29] = bf0[29] + bf0[26];
+  bf1[30] = bf0[30] + bf0[25];
+  bf1[31] = bf0[31] + bf0[24];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4: output -> step; half_btf on 5, 6, 18..21 and 26..29
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = -bf0[2] + bf0[1];
+  bf1[3] = -bf0[3] + bf0[0];
+  bf1[4] = bf0[4];
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8] + bf0[11];
+  bf1[9] = bf0[9] + bf0[10];
+  bf1[10] = -bf0[10] + bf0[9];
+  bf1[11] = -bf0[11] + bf0[8];
+  bf1[12] = -bf0[12] + bf0[15];
+  bf1[13] = -bf0[13] + bf0[14];
+  bf1[14] = bf0[14] + bf0[13];
+  bf1[15] = bf0[15] + bf0[12];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit[stage]);
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5: step -> output; half_btf on 0..3, 9, 10, 13 and 14
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+  bf1[4] = bf0[4] + bf0[5];
+  bf1[5] = -bf0[5] + bf0[4];
+  bf1[6] = -bf0[6] + bf0[7];
+  bf1[7] = bf0[7] + bf0[6];
+  bf1[8] = bf0[8];
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[19];
+  bf1[17] = bf0[17] + bf0[18];
+  bf1[18] = -bf0[18] + bf0[17];
+  bf1[19] = -bf0[19] + bf0[16];
+  bf1[20] = -bf0[20] + bf0[23];
+  bf1[21] = -bf0[21] + bf0[22];
+  bf1[22] = bf0[22] + bf0[21];
+  bf1[23] = bf0[23] + bf0[20];
+  bf1[24] = bf0[24] + bf0[27];
+  bf1[25] = bf0[25] + bf0[26];
+  bf1[26] = -bf0[26] + bf0[25];
+  bf1[27] = -bf0[27] + bf0[24];
+  bf1[28] = -bf0[28] + bf0[31];
+  bf1[29] = -bf0[29] + bf0[30];
+  bf1[30] = bf0[30] + bf0[29];
+  bf1[31] = bf0[31] + bf0[28];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6: output -> step; half_btf on 4..7, 17, 18, 21, 22, 25, 26, 29, 30
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+  bf1[8] = bf0[8] + bf0[9];
+  bf1[9] = -bf0[9] + bf0[8];
+  bf1[10] = -bf0[10] + bf0[11];
+  bf1[11] = bf0[11] + bf0[10];
+  bf1[12] = bf0[12] + bf0[13];
+  bf1[13] = -bf0[13] + bf0[12];
+  bf1[14] = -bf0[14] + bf0[15];
+  bf1[15] = bf0[15] + bf0[14];
+  bf1[16] = bf0[16];
+  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit[stage]);
+  bf1[27] = bf0[27];
+  bf1[28] = bf0[28];
+  bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit[stage]);
+  bf1[31] = bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7: step -> output; half_btf on 8..15, 0..7 copied
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
+  bf1[16] = bf0[16] + bf0[17];
+  bf1[17] = -bf0[17] + bf0[16];
+  bf1[18] = -bf0[18] + bf0[19];
+  bf1[19] = bf0[19] + bf0[18];
+  bf1[20] = bf0[20] + bf0[21];
+  bf1[21] = -bf0[21] + bf0[20];
+  bf1[22] = -bf0[22] + bf0[23];
+  bf1[23] = bf0[23] + bf0[22];
+  bf1[24] = bf0[24] + bf0[25];
+  bf1[25] = -bf0[25] + bf0[24];
+  bf1[26] = -bf0[26] + bf0[27];
+  bf1[27] = bf0[27] + bf0[26];
+  bf1[28] = bf0[28] + bf0[29];
+  bf1[29] = -bf0[29] + bf0[28];
+  bf1[30] = -bf0[30] + bf0[31];
+  bf1[31] = bf0[31] + bf0[30];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 8: output -> step; half_btf on 16..31, 0..15 copied
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit[stage]);
+  bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 9: step -> output; pure reordering, no arithmetic
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[16];
+  bf1[2] = bf0[8];
+  bf1[3] = bf0[24];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[20];
+  bf1[6] = bf0[12];
+  bf1[7] = bf0[28];
+  bf1[8] = bf0[2];
+  bf1[9] = bf0[18];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[26];
+  bf1[12] = bf0[6];
+  bf1[13] = bf0[22];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[30];
+  bf1[16] = bf0[1];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[9];
+  bf1[19] = bf0[25];
+  bf1[20] = bf0[5];
+  bf1[21] = bf0[21];
+  bf1[22] = bf0[13];
+  bf1[23] = bf0[29];
+  bf1[24] = bf0[3];
+  bf1[25] = bf0[19];
+  bf1[26] = bf0[11];
+  bf1[27] = bf0[27];
+  bf1[28] = bf0[7];
+  bf1[29] = bf0[23];
+  bf1[30] = bf0[15];
+  bf1[31] = bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void vp10_fadst4_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 4;  // element count passed to range_check
+  const int32_t *cospi;  // cospi_arr row selected from cos_bit[stage]
+
+  int32_t stage = 0;  // indexes cos_bit[] and stage_range[]
+  int32_t *bf0, *bf1;  // bf0 = stage source, bf1 = stage destination
+  int32_t step[4];  // scratch; stages alternate between output and step
+
+  // stage 0: range_check the raw input against stage_range[0]
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1: input -> output; pure reordering (3, 0, 1, 2), no arithmetic
+  stage++;
+  bf1 = output;
+  bf1[0] = input[3];
+  bf1[1] = input[0];
+  bf1[2] = input[1];
+  bf1[3] = input[2];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2: output -> step; half_btf on pairs (0,1) and (2,3)
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[8], bf0[0], cospi[56], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(-cospi[8], bf0[1], cospi[56], bf0[0], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[40], bf0[2], cospi[24], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(-cospi[40], bf0[3], cospi[24], bf0[2], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3: step -> output; add/sub on pairs (0,2) and (1,3)
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];  // loaded, unused this stage
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[2];
+  bf1[1] = bf0[1] + bf0[3];
+  bf1[2] = -bf0[2] + bf0[0];
+  bf1[3] = -bf0[3] + bf0[1];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4: output -> step; half_btf on pair (2,3), 0 and 1 copied
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5: step -> output; reorder (0, 2, 3, 1) and negate outputs 1 and 3
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = -bf0[2];
+  bf1[2] = bf0[3];
+  bf1[3] = -bf0[1];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void vp10_fadst8_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 8;  // transform length: input/output hold 8 values
+  const int32_t *cospi;    // row of cospi_arr selected by cos_bit[stage]
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;  // bf0: source of the current stage, bf1: destination
+  int32_t step[8];     // scratch; stages alternate between step and output
+
+  // stage 0: range_check the untouched input
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1: permute input into output (input must not alias output)
+  stage++;
+  bf1 = output;
+  bf1[0] = input[7];
+  bf1[1] = input[0];
+  bf1[2] = input[5];
+  bf1[3] = input[2];
+  bf1[4] = input[3];
+  bf1[5] = input[4];
+  bf1[6] = input[1];
+  bf1[7] = input[6];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2: half_btf on every adjacent pair, output -> step
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(-cospi[4], bf0[1], cospi[60], bf0[0], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(-cospi[20], bf0[3], cospi[44], bf0[2], cos_bit[stage]);
+  bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[36], bf0[5], cospi[28], bf0[4], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(-cospi[52], bf0[7], cospi[12], bf0[6], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3: add/sub between entries 4 apart, step -> output
+  stage++;
+  // no cospi lookup here: this stage only adds and subtracts
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[4];
+  bf1[1] = bf0[1] + bf0[5];
+  bf1[2] = bf0[2] + bf0[6];
+  bf1[3] = bf0[3] + bf0[7];
+  bf1[4] = -bf0[4] + bf0[0];
+  bf1[5] = -bf0[5] + bf0[1];
+  bf1[6] = -bf0[6] + bf0[2];
+  bf1[7] = -bf0[7] + bf0[3];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4: pass 0..3 through; half_btf on pairs (4,5) and (6,7)
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
+  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5: add/sub between entries 2 apart within each group of 4
+  stage++;
+  // no cospi lookup here: this stage only adds and subtracts
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[2];
+  bf1[1] = bf0[1] + bf0[3];
+  bf1[2] = -bf0[2] + bf0[0];
+  bf1[3] = -bf0[3] + bf0[1];
+  bf1[4] = bf0[4] + bf0[6];
+  bf1[5] = bf0[5] + bf0[7];
+  bf1[6] = -bf0[6] + bf0[4];
+  bf1[7] = -bf0[7] + bf0[5];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6: cospi[32] half_btf on pairs (2,3) and (6,7); rest pass through
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7: final reorder into output; odd-indexed outputs are negated
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = -bf0[4];
+  bf1[2] = bf0[6];
+  bf1[3] = -bf0[2];
+  bf1[4] = bf0[3];
+  bf1[5] = -bf0[7];
+  bf1[6] = bf0[5];
+  bf1[7] = -bf0[1];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void vp10_fadst16_new(const int32_t *input, int32_t *output,
+                      const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 16;  // transform length: input/output hold 16 values
+  const int32_t *cospi;     // row of cospi_arr selected by cos_bit[stage]
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;  // bf0: source of the current stage, bf1: destination
+  int32_t step[16];    // scratch; stages alternate between step and output
+
+  // stage 0: range_check the untouched input
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1: permute input into output (input must not alias output)
+  stage++;
+  bf1 = output;
+  bf1[0] = input[15];
+  bf1[1] = input[0];
+  bf1[2] = input[13];
+  bf1[3] = input[2];
+  bf1[4] = input[11];
+  bf1[5] = input[4];
+  bf1[6] = input[9];
+  bf1[7] = input[6];
+  bf1[8] = input[7];
+  bf1[9] = input[8];
+  bf1[10] = input[5];
+  bf1[11] = input[10];
+  bf1[12] = input[3];
+  bf1[13] = input[12];
+  bf1[14] = input[1];
+  bf1[15] = input[14];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2: half_btf on every adjacent pair, output -> step
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(-cospi[2], bf0[1], cospi[62], bf0[0], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(-cospi[10], bf0[3], cospi[54], bf0[2], cos_bit[stage]);
+  bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[18], bf0[5], cospi[46], bf0[4], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(-cospi[26], bf0[7], cospi[38], bf0[6], cos_bit[stage]);
+  bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit[stage]);
+  bf1[9] = half_btf(-cospi[34], bf0[9], cospi[30], bf0[8], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[42], bf0[11], cospi[22], bf0[10], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(-cospi[50], bf0[13], cospi[14], bf0[12], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(-cospi[58], bf0[15], cospi[6], bf0[14], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3: add/sub between entries 8 apart, step -> output
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[8];
+  bf1[1] = bf0[1] + bf0[9];
+  bf1[2] = bf0[2] + bf0[10];
+  bf1[3] = bf0[3] + bf0[11];
+  bf1[4] = bf0[4] + bf0[12];
+  bf1[5] = bf0[5] + bf0[13];
+  bf1[6] = bf0[6] + bf0[14];
+  bf1[7] = bf0[7] + bf0[15];
+  bf1[8] = -bf0[8] + bf0[0];
+  bf1[9] = -bf0[9] + bf0[1];
+  bf1[10] = -bf0[10] + bf0[2];
+  bf1[11] = -bf0[11] + bf0[3];
+  bf1[12] = -bf0[12] + bf0[4];
+  bf1[13] = -bf0[13] + bf0[5];
+  bf1[14] = -bf0[14] + bf0[6];
+  bf1[15] = -bf0[15] + bf0[7];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4: pass 0..7 through; half_btf on pairs within 8..15
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
+  bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit[stage]);
+  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit[stage]);
+  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5: add/sub between entries 4 apart within each group of 8
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[4];
+  bf1[1] = bf0[1] + bf0[5];
+  bf1[2] = bf0[2] + bf0[6];
+  bf1[3] = bf0[3] + bf0[7];
+  bf1[4] = -bf0[4] + bf0[0];
+  bf1[5] = -bf0[5] + bf0[1];
+  bf1[6] = -bf0[6] + bf0[2];
+  bf1[7] = -bf0[7] + bf0[3];
+  bf1[8] = bf0[8] + bf0[12];
+  bf1[9] = bf0[9] + bf0[13];
+  bf1[10] = bf0[10] + bf0[14];
+  bf1[11] = bf0[11] + bf0[15];
+  bf1[12] = -bf0[12] + bf0[8];
+  bf1[13] = -bf0[13] + bf0[9];
+  bf1[14] = -bf0[14] + bf0[10];
+  bf1[15] = -bf0[15] + bf0[11];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6: half_btf on pairs within 4..7 and 12..15; rest pass through
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
+  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit[stage]);
+  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7: add/sub between entries 2 apart within each group of 4
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[2];
+  bf1[1] = bf0[1] + bf0[3];
+  bf1[2] = -bf0[2] + bf0[0];
+  bf1[3] = -bf0[3] + bf0[1];
+  bf1[4] = bf0[4] + bf0[6];
+  bf1[5] = bf0[5] + bf0[7];
+  bf1[6] = -bf0[6] + bf0[4];
+  bf1[7] = -bf0[7] + bf0[5];
+  bf1[8] = bf0[8] + bf0[10];
+  bf1[9] = bf0[9] + bf0[11];
+  bf1[10] = -bf0[10] + bf0[8];
+  bf1[11] = -bf0[11] + bf0[9];
+  bf1[12] = bf0[12] + bf0[14];
+  bf1[13] = bf0[13] + bf0[15];
+  bf1[14] = -bf0[14] + bf0[12];
+  bf1[15] = -bf0[15] + bf0[13];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 8: cospi[32] half_btf on entries 2,3 of each group of 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[10], cos_bit[stage]);
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 9: final reorder into output; odd-indexed outputs are negated
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = -bf0[8];
+  bf1[2] = bf0[12];
+  bf1[3] = -bf0[4];
+  bf1[4] = bf0[6];
+  bf1[5] = -bf0[14];
+  bf1[6] = bf0[10];
+  bf1[7] = -bf0[2];
+  bf1[8] = bf0[3];
+  bf1[9] = -bf0[11];
+  bf1[10] = bf0[15];
+  bf1[11] = -bf0[7];
+  bf1[12] = bf0[5];
+  bf1[13] = -bf0[13];
+  bf1[14] = bf0[9];
+  bf1[15] = -bf0[1];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+}
+
+void vp10_fadst32_new(const int32_t *input, int32_t *output,
+                      const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 32;  // transform length: input/output hold 32 values
+  const int32_t *cospi;     // row of cospi_arr selected by cos_bit[stage]
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;  // bf0: source of the current stage, bf1: destination
+  int32_t step[32];    // scratch; stages alternate between step and output
+
+  // stage 0: range_check the untouched input
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1: permute input into output (input must not alias output)
+  stage++;
+  bf1 = output;
+  bf1[0] = input[31];
+  bf1[1] = input[0];
+  bf1[2] = input[29];
+  bf1[3] = input[2];
+  bf1[4] = input[27];
+  bf1[5] = input[4];
+  bf1[6] = input[25];
+  bf1[7] = input[6];
+  bf1[8] = input[23];
+  bf1[9] = input[8];
+  bf1[10] = input[21];
+  bf1[11] = input[10];
+  bf1[12] = input[19];
+  bf1[13] = input[12];
+  bf1[14] = input[17];
+  bf1[15] = input[14];
+  bf1[16] = input[15];
+  bf1[17] = input[16];
+  bf1[18] = input[13];
+  bf1[19] = input[18];
+  bf1[20] = input[11];
+  bf1[21] = input[20];
+  bf1[22] = input[9];
+  bf1[23] = input[22];
+  bf1[24] = input[7];
+  bf1[25] = input[24];
+  bf1[26] = input[5];
+  bf1[27] = input[26];
+  bf1[28] = input[3];
+  bf1[29] = input[28];
+  bf1[30] = input[1];
+  bf1[31] = input[30];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2: half_btf on every adjacent pair, output -> step
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[1], bf0[0], cospi[63], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(-cospi[1], bf0[1], cospi[63], bf0[0], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[5], bf0[2], cospi[59], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(-cospi[5], bf0[3], cospi[59], bf0[2], cos_bit[stage]);
+  bf1[4] = half_btf(cospi[9], bf0[4], cospi[55], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[9], bf0[5], cospi[55], bf0[4], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[13], bf0[6], cospi[51], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(-cospi[13], bf0[7], cospi[51], bf0[6], cos_bit[stage]);
+  bf1[8] = half_btf(cospi[17], bf0[8], cospi[47], bf0[9], cos_bit[stage]);
+  bf1[9] = half_btf(-cospi[17], bf0[9], cospi[47], bf0[8], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[21], bf0[10], cospi[43], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[21], bf0[11], cospi[43], bf0[10], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[25], bf0[12], cospi[39], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(-cospi[25], bf0[13], cospi[39], bf0[12], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[29], bf0[14], cospi[35], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(-cospi[29], bf0[15], cospi[35], bf0[14], cos_bit[stage]);
+  bf1[16] = half_btf(cospi[33], bf0[16], cospi[31], bf0[17], cos_bit[stage]);
+  bf1[17] = half_btf(-cospi[33], bf0[17], cospi[31], bf0[16], cos_bit[stage]);
+  bf1[18] = half_btf(cospi[37], bf0[18], cospi[27], bf0[19], cos_bit[stage]);
+  bf1[19] = half_btf(-cospi[37], bf0[19], cospi[27], bf0[18], cos_bit[stage]);
+  bf1[20] = half_btf(cospi[41], bf0[20], cospi[23], bf0[21], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[41], bf0[21], cospi[23], bf0[20], cos_bit[stage]);
+  bf1[22] = half_btf(cospi[45], bf0[22], cospi[19], bf0[23], cos_bit[stage]);
+  bf1[23] = half_btf(-cospi[45], bf0[23], cospi[19], bf0[22], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[49], bf0[24], cospi[15], bf0[25], cos_bit[stage]);
+  bf1[25] = half_btf(-cospi[49], bf0[25], cospi[15], bf0[24], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[53], bf0[26], cospi[11], bf0[27], cos_bit[stage]);
+  bf1[27] = half_btf(-cospi[53], bf0[27], cospi[11], bf0[26], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[57], bf0[28], cospi[7], bf0[29], cos_bit[stage]);
+  bf1[29] = half_btf(-cospi[57], bf0[29], cospi[7], bf0[28], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[61], bf0[30], cospi[3], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(-cospi[61], bf0[31], cospi[3], bf0[30], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3: add/sub between entries 16 apart, step -> output
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[16];
+  bf1[1] = bf0[1] + bf0[17];
+  bf1[2] = bf0[2] + bf0[18];
+  bf1[3] = bf0[3] + bf0[19];
+  bf1[4] = bf0[4] + bf0[20];
+  bf1[5] = bf0[5] + bf0[21];
+  bf1[6] = bf0[6] + bf0[22];
+  bf1[7] = bf0[7] + bf0[23];
+  bf1[8] = bf0[8] + bf0[24];
+  bf1[9] = bf0[9] + bf0[25];
+  bf1[10] = bf0[10] + bf0[26];
+  bf1[11] = bf0[11] + bf0[27];
+  bf1[12] = bf0[12] + bf0[28];
+  bf1[13] = bf0[13] + bf0[29];
+  bf1[14] = bf0[14] + bf0[30];
+  bf1[15] = bf0[15] + bf0[31];
+  bf1[16] = -bf0[16] + bf0[0];
+  bf1[17] = -bf0[17] + bf0[1];
+  bf1[18] = -bf0[18] + bf0[2];
+  bf1[19] = -bf0[19] + bf0[3];
+  bf1[20] = -bf0[20] + bf0[4];
+  bf1[21] = -bf0[21] + bf0[5];
+  bf1[22] = -bf0[22] + bf0[6];
+  bf1[23] = -bf0[23] + bf0[7];
+  bf1[24] = -bf0[24] + bf0[8];
+  bf1[25] = -bf0[25] + bf0[9];
+  bf1[26] = -bf0[26] + bf0[10];
+  bf1[27] = -bf0[27] + bf0[11];
+  bf1[28] = -bf0[28] + bf0[12];
+  bf1[29] = -bf0[29] + bf0[13];
+  bf1[30] = -bf0[30] + bf0[14];
+  bf1[31] = -bf0[31] + bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4: pass 0..15 through; half_btf on pairs within 16..31
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = half_btf(cospi[4], bf0[16], cospi[60], bf0[17], cos_bit[stage]);
+  bf1[17] = half_btf(-cospi[4], bf0[17], cospi[60], bf0[16], cos_bit[stage]);
+  bf1[18] = half_btf(cospi[20], bf0[18], cospi[44], bf0[19], cos_bit[stage]);
+  bf1[19] = half_btf(-cospi[20], bf0[19], cospi[44], bf0[18], cos_bit[stage]);
+  bf1[20] = half_btf(cospi[36], bf0[20], cospi[28], bf0[21], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[36], bf0[21], cospi[28], bf0[20], cos_bit[stage]);
+  bf1[22] = half_btf(cospi[52], bf0[22], cospi[12], bf0[23], cos_bit[stage]);
+  bf1[23] = half_btf(-cospi[52], bf0[23], cospi[12], bf0[22], cos_bit[stage]);
+  bf1[24] = half_btf(-cospi[60], bf0[24], cospi[4], bf0[25], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[60], bf0[25], cospi[4], bf0[24], cos_bit[stage]);
+  bf1[26] = half_btf(-cospi[44], bf0[26], cospi[20], bf0[27], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[44], bf0[27], cospi[20], bf0[26], cos_bit[stage]);
+  bf1[28] = half_btf(-cospi[28], bf0[28], cospi[36], bf0[29], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[28], bf0[29], cospi[36], bf0[28], cos_bit[stage]);
+  bf1[30] = half_btf(-cospi[12], bf0[30], cospi[52], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[12], bf0[31], cospi[52], bf0[30], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5: add/sub between entries 8 apart within each group of 16
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[8];
+  bf1[1] = bf0[1] + bf0[9];
+  bf1[2] = bf0[2] + bf0[10];
+  bf1[3] = bf0[3] + bf0[11];
+  bf1[4] = bf0[4] + bf0[12];
+  bf1[5] = bf0[5] + bf0[13];
+  bf1[6] = bf0[6] + bf0[14];
+  bf1[7] = bf0[7] + bf0[15];
+  bf1[8] = -bf0[8] + bf0[0];
+  bf1[9] = -bf0[9] + bf0[1];
+  bf1[10] = -bf0[10] + bf0[2];
+  bf1[11] = -bf0[11] + bf0[3];
+  bf1[12] = -bf0[12] + bf0[4];
+  bf1[13] = -bf0[13] + bf0[5];
+  bf1[14] = -bf0[14] + bf0[6];
+  bf1[15] = -bf0[15] + bf0[7];
+  bf1[16] = bf0[16] + bf0[24];
+  bf1[17] = bf0[17] + bf0[25];
+  bf1[18] = bf0[18] + bf0[26];
+  bf1[19] = bf0[19] + bf0[27];
+  bf1[20] = bf0[20] + bf0[28];
+  bf1[21] = bf0[21] + bf0[29];
+  bf1[22] = bf0[22] + bf0[30];
+  bf1[23] = bf0[23] + bf0[31];
+  bf1[24] = -bf0[24] + bf0[16];
+  bf1[25] = -bf0[25] + bf0[17];
+  bf1[26] = -bf0[26] + bf0[18];
+  bf1[27] = -bf0[27] + bf0[19];
+  bf1[28] = -bf0[28] + bf0[20];
+  bf1[29] = -bf0[29] + bf0[21];
+  bf1[30] = -bf0[30] + bf0[22];
+  bf1[31] = -bf0[31] + bf0[23];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6: half_btf on entries 8..15 of each group of 16; rest pass through
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
+  bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit[stage]);
+  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit[stage]);
+  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit[stage]);
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = bf0[21];
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = half_btf(cospi[8], bf0[24], cospi[56], bf0[25], cos_bit[stage]);
+  bf1[25] = half_btf(-cospi[8], bf0[25], cospi[56], bf0[24], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[27], cos_bit[stage]);
+  bf1[27] = half_btf(-cospi[40], bf0[27], cospi[24], bf0[26], cos_bit[stage]);
+  bf1[28] = half_btf(-cospi[56], bf0[28], cospi[8], bf0[29], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[56], bf0[29], cospi[8], bf0[28], cos_bit[stage]);
+  bf1[30] = half_btf(-cospi[24], bf0[30], cospi[40], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[24], bf0[31], cospi[40], bf0[30], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7: add/sub between entries 4 apart within each group of 8
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[4];
+  bf1[1] = bf0[1] + bf0[5];
+  bf1[2] = bf0[2] + bf0[6];
+  bf1[3] = bf0[3] + bf0[7];
+  bf1[4] = -bf0[4] + bf0[0];
+  bf1[5] = -bf0[5] + bf0[1];
+  bf1[6] = -bf0[6] + bf0[2];
+  bf1[7] = -bf0[7] + bf0[3];
+  bf1[8] = bf0[8] + bf0[12];
+  bf1[9] = bf0[9] + bf0[13];
+  bf1[10] = bf0[10] + bf0[14];
+  bf1[11] = bf0[11] + bf0[15];
+  bf1[12] = -bf0[12] + bf0[8];
+  bf1[13] = -bf0[13] + bf0[9];
+  bf1[14] = -bf0[14] + bf0[10];
+  bf1[15] = -bf0[15] + bf0[11];
+  bf1[16] = bf0[16] + bf0[20];
+  bf1[17] = bf0[17] + bf0[21];
+  bf1[18] = bf0[18] + bf0[22];
+  bf1[19] = bf0[19] + bf0[23];
+  bf1[20] = -bf0[20] + bf0[16];
+  bf1[21] = -bf0[21] + bf0[17];
+  bf1[22] = -bf0[22] + bf0[18];
+  bf1[23] = -bf0[23] + bf0[19];
+  bf1[24] = bf0[24] + bf0[28];
+  bf1[25] = bf0[25] + bf0[29];
+  bf1[26] = bf0[26] + bf0[30];
+  bf1[27] = bf0[27] + bf0[31];
+  bf1[28] = -bf0[28] + bf0[24];
+  bf1[29] = -bf0[29] + bf0[25];
+  bf1[30] = -bf0[30] + bf0[26];
+  bf1[31] = -bf0[31] + bf0[27];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 8: half_btf on entries 4..7 of each group of 8; rest pass through
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
+  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit[stage]);
+  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit[stage]);
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = half_btf(cospi[16], bf0[20], cospi[48], bf0[21], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[20], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[48], bf0[22], cospi[16], bf0[23], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[48], bf0[23], cospi[16], bf0[22], cos_bit[stage]);
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = bf0[26];
+  bf1[27] = bf0[27];
+  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[29], cos_bit[stage]);
+  bf1[29] = half_btf(-cospi[16], bf0[29], cospi[48], bf0[28], cos_bit[stage]);
+  bf1[30] = half_btf(-cospi[48], bf0[30], cospi[16], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[48], bf0[31], cospi[16], bf0[30], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 9: add/sub between entries 2 apart within each group of 4
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[2];
+  bf1[1] = bf0[1] + bf0[3];
+  bf1[2] = -bf0[2] + bf0[0];
+  bf1[3] = -bf0[3] + bf0[1];
+  bf1[4] = bf0[4] + bf0[6];
+  bf1[5] = bf0[5] + bf0[7];
+  bf1[6] = -bf0[6] + bf0[4];
+  bf1[7] = -bf0[7] + bf0[5];
+  bf1[8] = bf0[8] + bf0[10];
+  bf1[9] = bf0[9] + bf0[11];
+  bf1[10] = -bf0[10] + bf0[8];
+  bf1[11] = -bf0[11] + bf0[9];
+  bf1[12] = bf0[12] + bf0[14];
+  bf1[13] = bf0[13] + bf0[15];
+  bf1[14] = -bf0[14] + bf0[12];
+  bf1[15] = -bf0[15] + bf0[13];
+  bf1[16] = bf0[16] + bf0[18];
+  bf1[17] = bf0[17] + bf0[19];
+  bf1[18] = -bf0[18] + bf0[16];
+  bf1[19] = -bf0[19] + bf0[17];
+  bf1[20] = bf0[20] + bf0[22];
+  bf1[21] = bf0[21] + bf0[23];
+  bf1[22] = -bf0[22] + bf0[20];
+  bf1[23] = -bf0[23] + bf0[21];
+  bf1[24] = bf0[24] + bf0[26];
+  bf1[25] = bf0[25] + bf0[27];
+  bf1[26] = -bf0[26] + bf0[24];
+  bf1[27] = -bf0[27] + bf0[25];
+  bf1[28] = bf0[28] + bf0[30];
+  bf1[29] = bf0[29] + bf0[31];
+  bf1[30] = -bf0[30] + bf0[28];
+  bf1[31] = -bf0[31] + bf0[29];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 10: cospi[32] half_btf on entries 2,3 of each group of 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[10], cos_bit[stage]);
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit[stage]);
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = half_btf(cospi[32], bf0[18], cospi[32], bf0[19], cos_bit[stage]);
+  bf1[19] = half_btf(-cospi[32], bf0[19], cospi[32], bf0[18], cos_bit[stage]);
+  bf1[20] = bf0[20];
+  bf1[21] = bf0[21];
+  bf1[22] = half_btf(cospi[32], bf0[22], cospi[32], bf0[23], cos_bit[stage]);
+  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[22], cos_bit[stage]);
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[27] = half_btf(-cospi[32], bf0[27], cospi[32], bf0[26], cos_bit[stage]);
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = half_btf(cospi[32], bf0[30], cospi[32], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(-cospi[32], bf0[31], cospi[32], bf0[30], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 11: final reorder into output; odd-indexed outputs are negated
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = -bf0[16];
+  bf1[2] = bf0[24];
+  bf1[3] = -bf0[8];
+  bf1[4] = bf0[12];
+  bf1[5] = -bf0[28];
+  bf1[6] = bf0[20];
+  bf1[7] = -bf0[4];
+  bf1[8] = bf0[6];
+  bf1[9] = -bf0[22];
+  bf1[10] = bf0[30];
+  bf1[11] = -bf0[14];
+  bf1[12] = bf0[10];
+  bf1[13] = -bf0[26];
+  bf1[14] = bf0[18];
+  bf1[15] = -bf0[2];
+  bf1[16] = bf0[3];
+  bf1[17] = -bf0[19];
+  bf1[18] = bf0[27];
+  bf1[19] = -bf0[11];
+  bf1[20] = bf0[15];
+  bf1[21] = -bf0[31];
+  bf1[22] = bf0[23];
+  bf1[23] = -bf0[7];
+  bf1[24] = bf0[5];
+  bf1[25] = -bf0[21];
+  bf1[26] = bf0[29];
+  bf1[27] = -bf0[13];
+  bf1[28] = bf0[9];
+  bf1[29] = -bf0[25];
+  bf1[30] = bf0[17];
+  bf1[31] = -bf0[1];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}

diff --git a/vp10/common/vp10_fwd_txfm1d.h b/vp10/common/vp10_fwd_txfm1d.h
new file mode 100644
index 0000000..d5b9f40
--- /dev/null
+++ b/vp10/common/vp10_fwd_txfm1d.h

@@ -0,0 +1,42 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_FWD_TXFM1D_H_
+#define VP10_FWD_TXFM1D_H_
+
+#include "vp10/common/vp10_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp10_fdct4_new(const int32_t *input, int32_t *output,
+                    const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_fdct8_new(const int32_t *input, int32_t *output,
+                    const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_fdct16_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_fdct32_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
+// ADST variants: same signature; cos_bit/stage_range are indexed by stage.
+void vp10_fadst4_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_fadst8_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_fadst16_new(const int32_t *input, int32_t *output,
+                      const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_fadst32_new(const int32_t *input, int32_t *output,
+                      const int8_t *cos_bit, const int8_t *stage_range);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // VP10_FWD_TXFM1D_H_

diff --git a/vp10/common/vp10_fwd_txfm2d.c b/vp10/common/vp10_fwd_txfm2d.c
new file mode 100644
index 0000000..67449ec
--- /dev/null
+++ b/vp10/common/vp10_fwd_txfm2d.c

@@ -0,0 +1,84 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/common/vp10_txfm.h"
+
+static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
+                                const int stride, const TXFM_2D_CFG *cfg,
+                                int32_t *txfm_buf) {
+  int i, j;
+  const int txfm_size = cfg->txfm_size;
+  const int8_t *shift = cfg->shift;
+  const int8_t *stage_range_col = cfg->stage_range_col;
+  const int8_t *stage_range_row = cfg->stage_range_row;
+  const int8_t *cos_bit_col = cfg->cos_bit_col;
+  const int8_t *cos_bit_row = cfg->cos_bit_row;
+  const TxfmFunc txfm_func_col = cfg->txfm_func_col;
+  const TxfmFunc txfm_func_row = cfg->txfm_func_row;
+
+  // txfm_buf must hold txfm_size * txfm_size + 2 * txfm_size values: two 1-D
+  // line buffers (temp_in, temp_out) followed by the 2-D column-pass result.
+  int32_t *temp_in = txfm_buf;
+  int32_t *temp_out = temp_in + txfm_size;
+  int32_t *buf = temp_out + txfm_size;
+
+  // Columns: gather column i (rows are `stride` apart), transform, store in buf
+  for (i = 0; i < txfm_size; ++i) {
+    for (j = 0; j < txfm_size; ++j)
+      temp_in[j] = input[j * stride + i];
+    round_shift_array(temp_in, txfm_size, -shift[0]);  // shift[] is negated
+    txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
+    round_shift_array(temp_out, txfm_size, -shift[1]);
+    for (j = 0; j < txfm_size; ++j)
+      buf[j * txfm_size + i] = temp_out[j];
+  }
+
+  // Rows: transform each row of buf and store it packed (txfm_size per row)
+  for (i = 0; i < txfm_size; ++i) {
+    for (j = 0; j < txfm_size; ++j)
+      temp_in[j] = buf[j + i * txfm_size];
+    txfm_func_row(temp_in, temp_out, cos_bit_row, stage_range_row);
+    round_shift_array(temp_out, txfm_size, -shift[2]);
+    for (j = 0; j < txfm_size; ++j)
+      output[j + i * txfm_size] = (int32_t)temp_out[j];
+  }
+}
+
+void vp10_fwd_txfm2d_4x4(const int16_t *input, int32_t *output,
+                         const int stride, const TXFM_2D_CFG *cfg,
+                         const int bd) {
+  int32_t txfm_buf[4 * 4 + 4 + 4];  // 4x4 block + two 4-entry line buffers
+  (void)bd;  // bd is not used by this implementation
+  fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
+}
+
+void vp10_fwd_txfm2d_8x8(const int16_t *input, int32_t *output,
+                         const int stride, const TXFM_2D_CFG *cfg,
+                         const int bd) {
+  int32_t txfm_buf[8 * 8 + 8 + 8];  // 8x8 block + two 8-entry line buffers
+  (void)bd;  // bd is not used by this implementation
+  fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
+}
+
+void vp10_fwd_txfm2d_16x16(const int16_t *input, int32_t *output,
+                           const int stride, const TXFM_2D_CFG *cfg,
+                           const int bd) {
+  int32_t txfm_buf[16 * 16 + 16 + 16];  // 16x16 block + two 16-entry lines
+  (void)bd;  // bd is not used by this implementation
+  fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
+}
+
+void vp10_fwd_txfm2d_32x32(const int16_t *input, int32_t *output,
+                           const int stride, const TXFM_2D_CFG *cfg,
+                           const int bd) {
+  int32_t txfm_buf[32 * 32 + 32 + 32];  // 32x32 block + two 32-entry lines
+  (void)bd;  // bd is not used by this implementation
+  fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
+}

diff --git a/vp10/common/vp10_fwd_txfm2d.h b/vp10/common/vp10_fwd_txfm2d.h
new file mode 100644
index 0000000..64e6f56
--- /dev/null
+++ b/vp10/common/vp10_fwd_txfm2d.h

@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_FWD_TXFM2D_H_
+#define VP10_FWD_TXFM2D_H_
+
+#include "vp10/common/vp10_txfm.h"
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+void vp10_fwd_txfm2d_4x4(const int16_t *input, int32_t *output,
+                         const int stride, const TXFM_2D_CFG *cfg,
+                         const int bd);  // bd is ignored by the C version
+void vp10_fwd_txfm2d_8x8(const int16_t *input, int32_t *output,
+                         const int stride, const TXFM_2D_CFG *cfg,
+                         const int bd);  // bd is ignored by the C version
+void vp10_fwd_txfm2d_16x16(const int16_t *input, int32_t *output,
+                           const int stride, const TXFM_2D_CFG *cfg,
+                           const int bd);  // bd is ignored by the C version
+void vp10_fwd_txfm2d_32x32(const int16_t *input, int32_t *output,
+                           const int stride, const TXFM_2D_CFG *cfg,
+                           const int bd);  // bd is ignored by the C version
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // VP10_FWD_TXFM2D_H_

diff --git a/vp10/common/vp10_fwd_txfm2d_cfg.h b/vp10/common/vp10_fwd_txfm2d_cfg.h
new file mode 100644
index 0000000..93fee6f
--- /dev/null
+++ b/vp10/common/vp10_fwd_txfm2d_cfg.h

@@ -0,0 +1,367 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_FWD_TXFM2D_CFG_H_
+#define VP10_FWD_TXFM2D_CFG_H_
+#include "vp10/common/vp10_fwd_txfm1d.h"
+
+//  -- config fwd_dct_dct_4: col vp10_fdct4_new, row vp10_fdct4_new --
+static int8_t fwd_shift_dct_dct_4[3] = {4, 0, -2};
+static int8_t fwd_stage_range_col_dct_dct_4[4] = {15, 16, 17, 17};
+static int8_t fwd_stage_range_row_dct_dct_4[4] = {17, 18, 18, 18};
+static int8_t fwd_cos_bit_col_dct_dct_4[4] = {15, 15, 15, 15};
+static int8_t fwd_cos_bit_row_dct_dct_4[4] = {15, 14, 14, 14};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_4 = {
+    .txfm_size = 4,
+    .stage_num_col = 4,
+    .stage_num_row = 4,
+    // shift has 3 entries; the col/row tables have one entry per stage.
+    .shift = fwd_shift_dct_dct_4,
+    .stage_range_col = fwd_stage_range_col_dct_dct_4,
+    .stage_range_row = fwd_stage_range_row_dct_dct_4,
+    .cos_bit_col = fwd_cos_bit_col_dct_dct_4,
+    .cos_bit_row = fwd_cos_bit_row_dct_dct_4,
+    .txfm_func_col = vp10_fdct4_new,
+    .txfm_func_row = vp10_fdct4_new};
+
+//  -- config fwd_dct_dct_8: col vp10_fdct8_new, row vp10_fdct8_new --
+static int8_t fwd_shift_dct_dct_8[3] = {5, -3, -1};
+static int8_t fwd_stage_range_col_dct_dct_8[6] = {16, 17, 18, 19, 19, 19};
+static int8_t fwd_stage_range_row_dct_dct_8[6] = {16, 17, 18, 18, 18, 18};
+static int8_t fwd_cos_bit_col_dct_dct_8[6] = {15, 15, 14, 13, 13, 13};
+static int8_t fwd_cos_bit_row_dct_dct_8[6] = {15, 15, 14, 14, 14, 14};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_8 = {
+    .txfm_size = 8,
+    .stage_num_col = 6,
+    .stage_num_row = 6,
+    // shift has 3 entries; the col/row tables have one entry per stage.
+    .shift = fwd_shift_dct_dct_8,
+    .stage_range_col = fwd_stage_range_col_dct_dct_8,
+    .stage_range_row = fwd_stage_range_row_dct_dct_8,
+    .cos_bit_col = fwd_cos_bit_col_dct_dct_8,
+    .cos_bit_row = fwd_cos_bit_row_dct_dct_8,
+    .txfm_func_col = vp10_fdct8_new,
+    .txfm_func_row = vp10_fdct8_new};
+
+//  -- config fwd_dct_dct_16: col vp10_fdct16_new, row vp10_fdct16_new --
+static int8_t fwd_shift_dct_dct_16[3] = {4, -3, -1};
+static int8_t fwd_stage_range_col_dct_dct_16[8] = {15, 16, 17, 18,
+                                                   19, 19, 19, 19};
+static int8_t fwd_stage_range_row_dct_dct_16[8] = {16, 17, 18, 19,
+                                                   19, 19, 19, 19};
+static int8_t fwd_cos_bit_col_dct_dct_16[8] = {15, 15, 15, 14, 13, 13, 13, 13};
+static int8_t fwd_cos_bit_row_dct_dct_16[8] = {15, 15, 14, 13, 13, 13, 13, 13};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_16 = {
+    .txfm_size = 16,
+    .stage_num_col = 8,
+    .stage_num_row = 8,
+    // shift has 3 entries; the col/row tables have one entry per stage.
+    .shift = fwd_shift_dct_dct_16,
+    .stage_range_col = fwd_stage_range_col_dct_dct_16,
+    .stage_range_row = fwd_stage_range_row_dct_dct_16,
+    .cos_bit_col = fwd_cos_bit_col_dct_dct_16,
+    .cos_bit_row = fwd_cos_bit_row_dct_dct_16,
+    .txfm_func_col = vp10_fdct16_new,
+    .txfm_func_row = vp10_fdct16_new};
+
+//  -- config fwd_dct_dct_32: col vp10_fdct32_new, row vp10_fdct32_new --
+static int8_t fwd_shift_dct_dct_32[3] = {3, -3, -1};
+static int8_t fwd_stage_range_col_dct_dct_32[10] = {14, 15, 16, 17, 18,
+                                                    19, 19, 19, 19, 19};
+static int8_t fwd_stage_range_row_dct_dct_32[10] = {16, 17, 18, 19, 20,
+                                                    20, 20, 20, 20, 20};
+static int8_t fwd_cos_bit_col_dct_dct_32[10] = {15, 15, 15, 15, 14,
+                                                13, 13, 13, 13, 13};
+static int8_t fwd_cos_bit_row_dct_dct_32[10] = {15, 15, 14, 13, 12,
+                                                12, 12, 12, 12, 12};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_32 = {
+    .txfm_size = 32,
+    .stage_num_col = 10,
+    .stage_num_row = 10,
+    // shift has 3 entries; the col/row tables have one entry per stage.
+    .shift = fwd_shift_dct_dct_32,
+    .stage_range_col = fwd_stage_range_col_dct_dct_32,
+    .stage_range_row = fwd_stage_range_row_dct_dct_32,
+    .cos_bit_col = fwd_cos_bit_col_dct_dct_32,
+    .cos_bit_row = fwd_cos_bit_row_dct_dct_32,
+    .txfm_func_col = vp10_fdct32_new,
+    .txfm_func_row = vp10_fdct32_new};
+
+//  -- config fwd_dct_adst_4: col vp10_fdct4_new, row vp10_fadst4_new --
+static int8_t fwd_shift_dct_adst_4[3] = {5, -2, -1};
+static int8_t fwd_stage_range_col_dct_adst_4[4] = {16, 17, 18, 18};
+static int8_t fwd_stage_range_row_dct_adst_4[6] = {16, 16, 16, 17, 17, 17};
+static int8_t fwd_cos_bit_col_dct_adst_4[4] = {15, 15, 14, 14};
+static int8_t fwd_cos_bit_row_dct_adst_4[6] = {15, 15, 15, 15, 15, 15};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_adst_4 = {
+    .txfm_size = 4,
+    .stage_num_col = 4,
+    .stage_num_row = 6,
+    // shift has 3 entries; the col/row tables have one entry per stage.
+    .shift = fwd_shift_dct_adst_4,
+    .stage_range_col = fwd_stage_range_col_dct_adst_4,
+    .stage_range_row = fwd_stage_range_row_dct_adst_4,
+    .cos_bit_col = fwd_cos_bit_col_dct_adst_4,
+    .cos_bit_row = fwd_cos_bit_row_dct_adst_4,
+    .txfm_func_col = vp10_fdct4_new,
+    .txfm_func_row = vp10_fadst4_new};
+
+//  -- config fwd_dct_adst_8: col vp10_fdct8_new, row vp10_fadst8_new --
+static int8_t fwd_shift_dct_adst_8[3] = {7, -3, -3};
+static int8_t fwd_stage_range_col_dct_adst_8[6] = {18, 19, 20, 21, 21, 21};
+static int8_t fwd_stage_range_row_dct_adst_8[8] = {18, 18, 18, 19,
+                                                   19, 20, 20, 20};
+static int8_t fwd_cos_bit_col_dct_adst_8[6] = {14, 13, 12, 11, 11, 11};
+static int8_t fwd_cos_bit_row_dct_adst_8[8] = {14, 14, 14, 13, 13, 12, 12, 12};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_adst_8 = {
+    .txfm_size = 8,
+    .stage_num_col = 6,
+    .stage_num_row = 8,
+    // shift has 3 entries; the col/row tables have one entry per stage.
+    .shift = fwd_shift_dct_adst_8,
+    .stage_range_col = fwd_stage_range_col_dct_adst_8,
+    .stage_range_row = fwd_stage_range_row_dct_adst_8,
+    .cos_bit_col = fwd_cos_bit_col_dct_adst_8,
+    .cos_bit_row = fwd_cos_bit_row_dct_adst_8,
+    .txfm_func_col = vp10_fdct8_new,
+    .txfm_func_row = vp10_fadst8_new};
+
+//  -- config fwd_dct_adst_16: col vp10_fdct16_new, row vp10_fadst16_new --
+static int8_t fwd_shift_dct_adst_16[3] = {4, -1, -3};
+static int8_t fwd_stage_range_col_dct_adst_16[8] = {15, 16, 17, 18,
+                                                    19, 19, 19, 19};
+static int8_t fwd_stage_range_row_dct_adst_16[10] = {18, 18, 18, 19, 19,
+                                                     20, 20, 21, 21, 21};
+static int8_t fwd_cos_bit_col_dct_adst_16[8] = {15, 15, 15, 14, 13, 13, 13, 13};
+static int8_t fwd_cos_bit_row_dct_adst_16[10] = {14, 14, 14, 13, 13,
+                                                 12, 12, 11, 11, 11};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_adst_16 = {
+    .txfm_size = 16,
+    .stage_num_col = 8,
+    .stage_num_row = 10,
+    // shift has 3 entries; the col/row tables have one entry per stage.
+    .shift = fwd_shift_dct_adst_16,
+    .stage_range_col = fwd_stage_range_col_dct_adst_16,
+    .stage_range_row = fwd_stage_range_row_dct_adst_16,
+    .cos_bit_col = fwd_cos_bit_col_dct_adst_16,
+    .cos_bit_row = fwd_cos_bit_row_dct_adst_16,
+    .txfm_func_col = vp10_fdct16_new,
+    .txfm_func_row = vp10_fadst16_new};
+
+//  -- config fwd_dct_adst_32: col vp10_fdct32_new, row vp10_fadst32_new --
+static int8_t fwd_shift_dct_adst_32[3] = {3, -1, -3};
+static int8_t fwd_stage_range_col_dct_adst_32[10] = {14, 15, 16, 17, 18,
+                                                     19, 19, 19, 19, 19};
+static int8_t fwd_stage_range_row_dct_adst_32[12] = {18, 18, 18, 19, 19, 20,
+                                                     20, 21, 21, 22, 22, 22};
+static int8_t fwd_cos_bit_col_dct_adst_32[10] = {15, 15, 15, 15, 14,
+                                                 13, 13, 13, 13, 13};
+static int8_t fwd_cos_bit_row_dct_adst_32[12] = {14, 14, 14, 13, 13, 12,
+                                                 12, 11, 11, 10, 10, 10};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_adst_32 = {
+    .txfm_size = 32,
+    .stage_num_col = 10,
+    .stage_num_row = 12,
+    // shift has 3 entries; the col/row tables have one entry per stage.
+    .shift = fwd_shift_dct_adst_32,
+    .stage_range_col = fwd_stage_range_col_dct_adst_32,
+    .stage_range_row = fwd_stage_range_row_dct_adst_32,
+    .cos_bit_col = fwd_cos_bit_col_dct_adst_32,
+    .cos_bit_row = fwd_cos_bit_row_dct_adst_32,
+    .txfm_func_col = vp10_fdct32_new,
+    .txfm_func_row = vp10_fadst32_new};
+
+//  -- config fwd_adst_adst_4: col vp10_fadst4_new, row vp10_fadst4_new --
+static int8_t fwd_shift_adst_adst_4[3] = {6, 1, -5};
+static int8_t fwd_stage_range_col_adst_adst_4[6] = {17, 17, 18, 19, 19, 19};
+static int8_t fwd_stage_range_row_adst_adst_4[6] = {20, 20, 20, 21, 21, 21};
+static int8_t fwd_cos_bit_col_adst_adst_4[6] = {15, 15, 14, 13, 13, 13};
+static int8_t fwd_cos_bit_row_adst_adst_4[6] = {12, 12, 12, 11, 11, 11};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_adst_4 = {
+    .txfm_size = 4,
+    .stage_num_col = 6,
+    .stage_num_row = 6,
+    // shift has 3 entries; the col/row tables have one entry per stage.
+    .shift = fwd_shift_adst_adst_4,
+    .stage_range_col = fwd_stage_range_col_adst_adst_4,
+    .stage_range_row = fwd_stage_range_row_adst_adst_4,
+    .cos_bit_col = fwd_cos_bit_col_adst_adst_4,
+    .cos_bit_row = fwd_cos_bit_row_adst_adst_4,
+    .txfm_func_col = vp10_fadst4_new,
+    .txfm_func_row = vp10_fadst4_new};
+
+//  -- config fwd_adst_adst_8: col vp10_fadst8_new, row vp10_fadst8_new --
+static int8_t fwd_shift_adst_adst_8[3] = {3, -1, -1};
+static int8_t fwd_stage_range_col_adst_adst_8[8] = {14, 14, 15, 16,
+                                                    16, 17, 17, 17};
+static int8_t fwd_stage_range_row_adst_adst_8[8] = {16, 16, 16, 17,
+                                                    17, 18, 18, 18};
+static int8_t fwd_cos_bit_col_adst_adst_8[8] = {15, 15, 15, 15, 15, 15, 15, 15};
+static int8_t fwd_cos_bit_row_adst_adst_8[8] = {15, 15, 15, 15, 15, 14, 14, 14};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_adst_8 = {
+    .txfm_size = 8,
+    .stage_num_col = 8,
+    .stage_num_row = 8,
+    // shift has 3 entries; the col/row tables have one entry per stage.
+    .shift = fwd_shift_adst_adst_8,
+    .stage_range_col = fwd_stage_range_col_adst_adst_8,
+    .stage_range_row = fwd_stage_range_row_adst_adst_8,
+    .cos_bit_col = fwd_cos_bit_col_adst_adst_8,
+    .cos_bit_row = fwd_cos_bit_row_adst_adst_8,
+    .txfm_func_col = vp10_fadst8_new,
+    .txfm_func_row = vp10_fadst8_new};
+
+//  -- config fwd_adst_adst_16: col vp10_fadst16_new, row vp10_fadst16_new --
+static int8_t fwd_shift_adst_adst_16[3] = {2, 0, -2};
+static int8_t fwd_stage_range_col_adst_adst_16[10] = {13, 13, 14, 15, 15,
+                                                      16, 16, 17, 17, 17};
+static int8_t fwd_stage_range_row_adst_adst_16[10] = {17, 17, 17, 18, 18,
+                                                      19, 19, 20, 20, 20};
+static int8_t fwd_cos_bit_col_adst_adst_16[10] = {15, 15, 15, 15, 15,
+                                                  15, 15, 15, 15, 15};
+static int8_t fwd_cos_bit_row_adst_adst_16[10] = {15, 15, 15, 14, 14,
+                                                  13, 13, 12, 12, 12};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_adst_16 = {
+    .txfm_size = 16,
+    .stage_num_col = 10,
+    .stage_num_row = 10,
+    // shift has 3 entries; the col/row tables have one entry per stage.
+    .shift = fwd_shift_adst_adst_16,
+    .stage_range_col = fwd_stage_range_col_adst_adst_16,
+    .stage_range_row = fwd_stage_range_row_adst_adst_16,
+    .cos_bit_col = fwd_cos_bit_col_adst_adst_16,
+    .cos_bit_row = fwd_cos_bit_row_adst_adst_16,
+    .txfm_func_col = vp10_fadst16_new,
+    .txfm_func_row = vp10_fadst16_new};
+
+//  -- config fwd_adst_adst_32: col vp10_fadst32_new, row vp10_fadst32_new --
+static int8_t fwd_shift_adst_adst_32[3] = {4, -2, -3};
+static int8_t fwd_stage_range_col_adst_adst_32[12] = {15, 15, 16, 17, 17, 18,
+                                                      18, 19, 19, 20, 20, 20};
+static int8_t fwd_stage_range_row_adst_adst_32[12] = {18, 18, 18, 19, 19, 20,
+                                                      20, 21, 21, 22, 22, 22};
+static int8_t fwd_cos_bit_col_adst_adst_32[12] = {15, 15, 15, 15, 15, 14,
+                                                  14, 13, 13, 12, 12, 12};
+static int8_t fwd_cos_bit_row_adst_adst_32[12] = {14, 14, 14, 13, 13, 12,
+                                                  12, 11, 11, 10, 10, 10};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_adst_32 = {
+    .txfm_size = 32,
+    .stage_num_col = 12,
+    .stage_num_row = 12,
+    // shift has 3 entries; the col/row tables have one entry per stage.
+    .shift = fwd_shift_adst_adst_32,
+    .stage_range_col = fwd_stage_range_col_adst_adst_32,
+    .stage_range_row = fwd_stage_range_row_adst_adst_32,
+    .cos_bit_col = fwd_cos_bit_col_adst_adst_32,
+    .cos_bit_row = fwd_cos_bit_row_adst_adst_32,
+    .txfm_func_col = vp10_fadst32_new,
+    .txfm_func_row = vp10_fadst32_new};
+
+//  -- config fwd_adst_dct_4: col vp10_fadst4_new, row vp10_fdct4_new --
+static int8_t fwd_shift_adst_dct_4[3] = {5, -4, 1};
+static int8_t fwd_stage_range_col_adst_dct_4[6] = {16, 16, 17, 18, 18, 18};
+static int8_t fwd_stage_range_row_adst_dct_4[4] = {14, 15, 15, 15};
+static int8_t fwd_cos_bit_col_adst_dct_4[6] = {15, 15, 15, 14, 14, 14};
+static int8_t fwd_cos_bit_row_adst_dct_4[4] = {15, 15, 15, 15};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_dct_4 = {
+    .txfm_size = 4,
+    .stage_num_col = 6,
+    .stage_num_row = 4,
+    // shift has 3 entries; the col/row tables have one entry per stage.
+    .shift = fwd_shift_adst_dct_4,
+    .stage_range_col = fwd_stage_range_col_adst_dct_4,
+    .stage_range_row = fwd_stage_range_row_adst_dct_4,
+    .cos_bit_col = fwd_cos_bit_col_adst_dct_4,
+    .cos_bit_row = fwd_cos_bit_row_adst_dct_4,
+    .txfm_func_col = vp10_fadst4_new,
+    .txfm_func_row = vp10_fdct4_new};
+
+//  -- config fwd_adst_dct_8: col vp10_fadst8_new, row vp10_fdct8_new --
+static int8_t fwd_shift_adst_dct_8[3] = {5, 1, -5};
+static int8_t fwd_stage_range_col_adst_dct_8[8] = {16, 16, 17, 18,
+                                                   18, 19, 19, 19};
+static int8_t fwd_stage_range_row_adst_dct_8[6] = {20, 21, 22, 22, 22, 22};
+static int8_t fwd_cos_bit_col_adst_dct_8[8] = {15, 15, 15, 14, 14, 13, 13, 13};
+static int8_t fwd_cos_bit_row_adst_dct_8[6] = {12, 11, 10, 10, 10, 10};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_dct_8 = {
+    .txfm_size = 8,
+    .stage_num_col = 8,
+    .stage_num_row = 6,
+    // shift has 3 entries; the col/row tables have one entry per stage.
+    .shift = fwd_shift_adst_dct_8,
+    .stage_range_col = fwd_stage_range_col_adst_dct_8,
+    .stage_range_row = fwd_stage_range_row_adst_dct_8,
+    .cos_bit_col = fwd_cos_bit_col_adst_dct_8,
+    .cos_bit_row = fwd_cos_bit_row_adst_dct_8,
+    .txfm_func_col = vp10_fadst8_new,
+    .txfm_func_row = vp10_fdct8_new};
+
+//  -- config fwd_adst_dct_16: col vp10_fadst16_new, row vp10_fdct16_new --
+static int8_t fwd_shift_adst_dct_16[3] = {4, -3, -1};
+static int8_t fwd_stage_range_col_adst_dct_16[10] = {15, 15, 16, 17, 17,
+                                                     18, 18, 19, 19, 19};
+static int8_t fwd_stage_range_row_adst_dct_16[8] = {16, 17, 18, 19,
+                                                    19, 19, 19, 19};
+static int8_t fwd_cos_bit_col_adst_dct_16[10] = {15, 15, 15, 15, 15,
+                                                 14, 14, 13, 13, 13};
+static int8_t fwd_cos_bit_row_adst_dct_16[8] = {15, 15, 14, 13, 13, 13, 13, 13};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_dct_16 = {
+    .txfm_size = 16,
+    .stage_num_col = 10,
+    .stage_num_row = 8,
+    // shift has 3 entries; the col/row tables have one entry per stage.
+    .shift = fwd_shift_adst_dct_16,
+    .stage_range_col = fwd_stage_range_col_adst_dct_16,
+    .stage_range_row = fwd_stage_range_row_adst_dct_16,
+    .cos_bit_col = fwd_cos_bit_col_adst_dct_16,
+    .cos_bit_row = fwd_cos_bit_row_adst_dct_16,
+    .txfm_func_col = vp10_fadst16_new,
+    .txfm_func_row = vp10_fdct16_new};
+
+//  -- config fwd_adst_dct_32: col vp10_fadst32_new, row vp10_fdct32_new --
+static int8_t fwd_shift_adst_dct_32[3] = {5, -4, -2};
+static int8_t fwd_stage_range_col_adst_dct_32[12] = {16, 16, 17, 18, 18, 19,
+                                                     19, 20, 20, 21, 21, 21};
+static int8_t fwd_stage_range_row_adst_dct_32[10] = {17, 18, 19, 20, 21,
+                                                     21, 21, 21, 21, 21};
+static int8_t fwd_cos_bit_col_adst_dct_32[12] = {15, 15, 15, 14, 14, 13,
+                                                 13, 12, 12, 11, 11, 11};
+static int8_t fwd_cos_bit_row_adst_dct_32[10] = {15, 14, 13, 12, 11,
+                                                 11, 11, 11, 11, 11};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_dct_32 = {
+    .txfm_size = 32,
+    .stage_num_col = 12,
+    .stage_num_row = 10,
+    // shift has 3 entries; the col/row tables have one entry per stage.
+    .shift = fwd_shift_adst_dct_32,
+    .stage_range_col = fwd_stage_range_col_adst_dct_32,
+    .stage_range_row = fwd_stage_range_row_adst_dct_32,
+    .cos_bit_col = fwd_cos_bit_col_adst_dct_32,
+    .cos_bit_row = fwd_cos_bit_row_adst_dct_32,
+    .txfm_func_col = vp10_fadst32_new,
+    .txfm_func_row = vp10_fdct32_new};
+
+#endif  // VP10_FWD_TXFM2D_CFG_H_

diff --git a/vp10/common/vp10_inv_txfm1d.c b/vp10/common/vp10_inv_txfm1d.c
new file mode 100644
index 0000000..b64b601
--- /dev/null
+++ b/vp10/common/vp10_inv_txfm1d.c

@@ -0,0 +1,1536 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/common/vp10_inv_txfm1d.h"
+#if CONFIG_COEFFICIENT_RANGE_CHECKING  // checked build: report + assert
+#define range_check(stage, input, buf, size, bit)                         \
+  { /* report and assert when some buf_bit > bit */                       \
+    int i, j;                                                             \
+    for (i = 0; i < size; ++i) {                                          \
+      int buf_bit = get_max_bit(abs(buf[i])) + 1;                         \
+      if (buf_bit > bit) {                                                \
+        printf("======== %s overflow ========\n", __func__);              \
+        printf("stage: %d node: %d\n", stage, i);                         \
+        printf("bit: %d buf_bit: %d buf[i]: %d\n", bit, buf_bit, buf[i]); \
+        printf("input:\n");                                               \
+        for (j = 0; j < size; j++) {                                      \
+          printf("%d,", input[j]);                                        \
+        }                                                                 \
+        printf("\n");                                                     \
+        assert(0 && "vp10_inv_txfm1d.c: range_check overflow");           \
+      }                                                                   \
+    }                                                                     \
+  }
+#else  // unchecked build: the arguments are only cast to void
+#define range_check(stage, input, buf, size, bit) \
+  {                                               \
+    (void) stage;                                 \
+    (void) input;                                 \
+    (void) buf;                                   \
+    (void) size;                                  \
+    (void) bit;                                   \
+  }
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+
+void vp10_idct4_new(const int32_t *input, int32_t *output,
+                    const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 4;
+  const int32_t *cospi;
+  // 4-point kernel: reads input[0..3]; the last stage writes output[0..3].
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[4];
+
+  // stage 0: only range-check the untouched input
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1: copy input into output in bit-reversed index order (0, 2, 1, 3)
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = input[2];
+  bf1[2] = input[1];
+  bf1[3] = input[3];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2: half_btf on pairs (0,1) and (2,3) with cos_bit[stage]; output -> step
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3: add/subtract mirrored pairs (0,3) and (1,2); step -> output
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = bf0[1] - bf0[2];
+  bf1[3] = bf0[0] - bf0[3];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void vp10_idct8_new(const int32_t *input, int32_t *output,
+                    const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 8;
+  const int32_t *cospi;
+  // 8-point kernel: reads input[0..7]; the last stage writes output[0..7].
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[8];
+
+  // stage 0: only range-check the untouched input
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1: copy input into output in bit-reversed index order
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = input[4];
+  bf1[2] = input[2];
+  bf1[3] = input[6];
+  bf1[4] = input[1];
+  bf1[5] = input[5];
+  bf1[6] = input[3];
+  bf1[7] = input[7];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2: pass 0..3 through, half_btf on 4..7; output -> step
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3: half_btf on 0..3, add/subtract within 4..7; step -> output
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+  bf1[4] = bf0[4] + bf0[5];
+  bf1[5] = bf0[4] - bf0[5];
+  bf1[6] = -bf0[6] + bf0[7];
+  bf1[7] = bf0[6] + bf0[7];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4: add/subtract within 0..3, half_btf on (5,6); output -> step
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = bf0[1] - bf0[2];
+  bf1[3] = bf0[0] - bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[7] = bf0[7];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5: add/subtract mirrored pairs (k, 7 - k); step -> output
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[7];
+  bf1[1] = bf0[1] + bf0[6];
+  bf1[2] = bf0[2] + bf0[5];
+  bf1[3] = bf0[3] + bf0[4];
+  bf1[4] = bf0[3] - bf0[4];
+  bf1[5] = bf0[2] - bf0[5];
+  bf1[6] = bf0[1] - bf0[6];
+  bf1[7] = bf0[0] - bf0[7];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void vp10_idct16_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 16;
+  const int32_t *cospi;
+  // 16-point kernel: reads input[0..15]; the last stage writes output[0..15].
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[16];
+
+  // stage 0: only range-check the untouched input
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1: copy input into output in bit-reversed index order
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = input[8];
+  bf1[2] = input[4];
+  bf1[3] = input[12];
+  bf1[4] = input[2];
+  bf1[5] = input[10];
+  bf1[6] = input[6];
+  bf1[7] = input[14];
+  bf1[8] = input[1];
+  bf1[9] = input[9];
+  bf1[10] = input[5];
+  bf1[11] = input[13];
+  bf1[12] = input[3];
+  bf1[13] = input[11];
+  bf1[14] = input[7];
+  bf1[15] = input[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2: pass 0..7 through, half_btf on 8..15; output -> step
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3: pass 0..3, half_btf on 4..7, add/subtract in 8..15; step -> output
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
+  bf1[8] = bf0[8] + bf0[9];
+  bf1[9] = bf0[8] - bf0[9];
+  bf1[10] = -bf0[10] + bf0[11];
+  bf1[11] = bf0[10] + bf0[11];
+  bf1[12] = bf0[12] + bf0[13];
+  bf1[13] = bf0[12] - bf0[13];
+  bf1[14] = -bf0[14] + bf0[15];
+  bf1[15] = bf0[14] + bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4: half_btf on 0..3 and on 9, 10, 13, 14; add/sub 4..7; output -> step
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+  bf1[4] = bf0[4] + bf0[5];
+  bf1[5] = bf0[4] - bf0[5];
+  bf1[6] = -bf0[6] + bf0[7];
+  bf1[7] = bf0[6] + bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]);
+  bf1[15] = bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5: add/sub in 0..3 and 8..15, half_btf on (5,6); step -> output
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = bf0[1] - bf0[2];
+  bf1[3] = bf0[0] - bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8] + bf0[11];
+  bf1[9] = bf0[9] + bf0[10];
+  bf1[10] = bf0[9] - bf0[10];
+  bf1[11] = bf0[8] - bf0[11];
+  bf1[12] = -bf0[12] + bf0[15];
+  bf1[13] = -bf0[13] + bf0[14];
+  bf1[14] = bf0[13] + bf0[14];
+  bf1[15] = bf0[12] + bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6: add/sub mirrored pairs in 0..7, half_btf on 10..13; output -> step
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[7];
+  bf1[1] = bf0[1] + bf0[6];
+  bf1[2] = bf0[2] + bf0[5];
+  bf1[3] = bf0[3] + bf0[4];
+  bf1[4] = bf0[3] - bf0[4];
+  bf1[5] = bf0[2] - bf0[5];
+  bf1[6] = bf0[1] - bf0[6];
+  bf1[7] = bf0[0] - bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7: add/subtract mirrored pairs (k, 15 - k); step -> output
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[15];
+  bf1[1] = bf0[1] + bf0[14];
+  bf1[2] = bf0[2] + bf0[13];
+  bf1[3] = bf0[3] + bf0[12];
+  bf1[4] = bf0[4] + bf0[11];
+  bf1[5] = bf0[5] + bf0[10];
+  bf1[6] = bf0[6] + bf0[9];
+  bf1[7] = bf0[7] + bf0[8];
+  bf1[8] = bf0[7] - bf0[8];
+  bf1[9] = bf0[6] - bf0[9];
+  bf1[10] = bf0[5] - bf0[10];
+  bf1[11] = bf0[4] - bf0[11];
+  bf1[12] = bf0[3] - bf0[12];
+  bf1[13] = bf0[2] - bf0[13];
+  bf1[14] = bf0[1] - bf0[14];
+  bf1[15] = bf0[0] - bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void vp10_idct32_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 32;  // element count passed to every range_check call
+  const int32_t *cospi;  // weights for half_btf (helper defined elsewhere)
+
+  int32_t stage = 0;  // indexes cos_bit[] and stage_range[]; bumped per stage
+  int32_t *bf0, *bf1;  // bf0 = buffer a stage reads, bf1 = buffer it writes
+  int32_t step[32];  // scratch; stages alternate between step and output
+
+  // stage 0: hand the untouched input to range_check with stage_range[0].
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1: copy input to output in bit-reversed order (must not alias).
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = input[16];
+  bf1[2] = input[8];
+  bf1[3] = input[24];
+  bf1[4] = input[4];
+  bf1[5] = input[20];
+  bf1[6] = input[12];
+  bf1[7] = input[28];
+  bf1[8] = input[2];
+  bf1[9] = input[18];
+  bf1[10] = input[10];
+  bf1[11] = input[26];
+  bf1[12] = input[6];
+  bf1[13] = input[22];
+  bf1[14] = input[14];
+  bf1[15] = input[30];
+  bf1[16] = input[1];
+  bf1[17] = input[17];
+  bf1[18] = input[9];
+  bf1[19] = input[25];
+  bf1[20] = input[5];
+  bf1[21] = input[21];
+  bf1[22] = input[13];
+  bf1[23] = input[29];
+  bf1[24] = input[3];
+  bf1[25] = input[19];
+  bf1[26] = input[11];
+  bf1[27] = input[27];
+  bf1[28] = input[7];
+  bf1[29] = input[23];
+  bf1[30] = input[15];
+  bf1[31] = input[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2: 0..15 pass through; 16..31 are paired (i, 47-i) via half_btf.
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit[stage]);
+  bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3: 0..7 pass; 8..15 paired (i, 23-i) via half_btf; 16..31 add/sub.
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]);
+  bf1[16] = bf0[16] + bf0[17];
+  bf1[17] = bf0[16] - bf0[17];
+  bf1[18] = -bf0[18] + bf0[19];
+  bf1[19] = bf0[18] + bf0[19];
+  bf1[20] = bf0[20] + bf0[21];
+  bf1[21] = bf0[20] - bf0[21];
+  bf1[22] = -bf0[22] + bf0[23];
+  bf1[23] = bf0[22] + bf0[23];
+  bf1[24] = bf0[24] + bf0[25];
+  bf1[25] = bf0[24] - bf0[25];
+  bf1[26] = -bf0[26] + bf0[27];
+  bf1[27] = bf0[26] + bf0[27];
+  bf1[28] = bf0[28] + bf0[29];
+  bf1[29] = bf0[28] - bf0[29];
+  bf1[30] = -bf0[30] + bf0[31];
+  bf1[31] = bf0[30] + bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4: 0..3 pass; 4..7 half_btf; 8..15 add/sub; 16..31 half_btf or pass.
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
+  bf1[8] = bf0[8] + bf0[9];
+  bf1[9] = bf0[8] - bf0[9];
+  bf1[10] = -bf0[10] + bf0[11];
+  bf1[11] = bf0[10] + bf0[11];
+  bf1[12] = bf0[12] + bf0[13];
+  bf1[13] = bf0[12] - bf0[13];
+  bf1[14] = -bf0[14] + bf0[15];
+  bf1[15] = bf0[14] + bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit[stage]);
+  bf1[27] = bf0[27];
+  bf1[28] = bf0[28];
+  bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit[stage]);
+  bf1[31] = bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5: 0..3 half_btf; 4..7 add/sub; 8..15 half_btf or pass; 16..31 add/sub.
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+  bf1[4] = bf0[4] + bf0[5];
+  bf1[5] = bf0[4] - bf0[5];
+  bf1[6] = -bf0[6] + bf0[7];
+  bf1[7] = bf0[6] + bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]);
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[19];
+  bf1[17] = bf0[17] + bf0[18];
+  bf1[18] = bf0[17] - bf0[18];
+  bf1[19] = bf0[16] - bf0[19];
+  bf1[20] = -bf0[20] + bf0[23];
+  bf1[21] = -bf0[21] + bf0[22];
+  bf1[22] = bf0[21] + bf0[22];
+  bf1[23] = bf0[20] + bf0[23];
+  bf1[24] = bf0[24] + bf0[27];
+  bf1[25] = bf0[25] + bf0[26];
+  bf1[26] = bf0[25] - bf0[26];
+  bf1[27] = bf0[24] - bf0[27];
+  bf1[28] = -bf0[28] + bf0[31];
+  bf1[29] = -bf0[29] + bf0[30];
+  bf1[30] = bf0[29] + bf0[30];
+  bf1[31] = bf0[28] + bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6: add/sub in 0..3 and 8..15; half_btf on 5,6 and 18..21 with 26..29.
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = bf0[1] - bf0[2];
+  bf1[3] = bf0[0] - bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8] + bf0[11];
+  bf1[9] = bf0[9] + bf0[10];
+  bf1[10] = bf0[9] - bf0[10];
+  bf1[11] = bf0[8] - bf0[11];
+  bf1[12] = -bf0[12] + bf0[15];
+  bf1[13] = -bf0[13] + bf0[14];
+  bf1[14] = bf0[13] + bf0[14];
+  bf1[15] = bf0[12] + bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit[stage]);
+  bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit[stage]);
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7: 0..7 add/sub; 10..13 half_btf with cospi[32]; 16..31 add/sub.
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[7];
+  bf1[1] = bf0[1] + bf0[6];
+  bf1[2] = bf0[2] + bf0[5];
+  bf1[3] = bf0[3] + bf0[4];
+  bf1[4] = bf0[3] - bf0[4];
+  bf1[5] = bf0[2] - bf0[5];
+  bf1[6] = bf0[1] - bf0[6];
+  bf1[7] = bf0[0] - bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[23];
+  bf1[17] = bf0[17] + bf0[22];
+  bf1[18] = bf0[18] + bf0[21];
+  bf1[19] = bf0[19] + bf0[20];
+  bf1[20] = bf0[19] - bf0[20];
+  bf1[21] = bf0[18] - bf0[21];
+  bf1[22] = bf0[17] - bf0[22];
+  bf1[23] = bf0[16] - bf0[23];
+  bf1[24] = -bf0[24] + bf0[31];
+  bf1[25] = -bf0[25] + bf0[30];
+  bf1[26] = -bf0[26] + bf0[29];
+  bf1[27] = -bf0[27] + bf0[28];
+  bf1[28] = bf0[27] + bf0[28];
+  bf1[29] = bf0[26] + bf0[29];
+  bf1[30] = bf0[25] + bf0[30];
+  bf1[31] = bf0[24] + bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 8: 0..15 add/sub; 20..27 half_btf with cospi[32]; others pass through.
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[15];
+  bf1[1] = bf0[1] + bf0[14];
+  bf1[2] = bf0[2] + bf0[13];
+  bf1[3] = bf0[3] + bf0[12];
+  bf1[4] = bf0[4] + bf0[11];
+  bf1[5] = bf0[5] + bf0[10];
+  bf1[6] = bf0[6] + bf0[9];
+  bf1[7] = bf0[7] + bf0[8];
+  bf1[8] = bf0[7] - bf0[8];
+  bf1[9] = bf0[6] - bf0[9];
+  bf1[10] = bf0[5] - bf0[10];
+  bf1[11] = bf0[4] - bf0[11];
+  bf1[12] = bf0[3] - bf0[12];
+  bf1[13] = bf0[2] - bf0[13];
+  bf1[14] = bf0[1] - bf0[14];
+  bf1[15] = bf0[0] - bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 9: add/sub of pairs (i, 31-i) from step; final result lands in output.
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[31];
+  bf1[1] = bf0[1] + bf0[30];
+  bf1[2] = bf0[2] + bf0[29];
+  bf1[3] = bf0[3] + bf0[28];
+  bf1[4] = bf0[4] + bf0[27];
+  bf1[5] = bf0[5] + bf0[26];
+  bf1[6] = bf0[6] + bf0[25];
+  bf1[7] = bf0[7] + bf0[24];
+  bf1[8] = bf0[8] + bf0[23];
+  bf1[9] = bf0[9] + bf0[22];
+  bf1[10] = bf0[10] + bf0[21];
+  bf1[11] = bf0[11] + bf0[20];
+  bf1[12] = bf0[12] + bf0[19];
+  bf1[13] = bf0[13] + bf0[18];
+  bf1[14] = bf0[14] + bf0[17];
+  bf1[15] = bf0[15] + bf0[16];
+  bf1[16] = bf0[15] - bf0[16];
+  bf1[17] = bf0[14] - bf0[17];
+  bf1[18] = bf0[13] - bf0[18];
+  bf1[19] = bf0[12] - bf0[19];
+  bf1[20] = bf0[11] - bf0[20];
+  bf1[21] = bf0[10] - bf0[21];
+  bf1[22] = bf0[9] - bf0[22];
+  bf1[23] = bf0[8] - bf0[23];
+  bf1[24] = bf0[7] - bf0[24];
+  bf1[25] = bf0[6] - bf0[25];
+  bf1[26] = bf0[5] - bf0[26];
+  bf1[27] = bf0[4] - bf0[27];
+  bf1[28] = bf0[3] - bf0[28];
+  bf1[29] = bf0[2] - bf0[29];
+  bf1[30] = bf0[1] - bf0[30];
+  bf1[31] = bf0[0] - bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void vp10_iadst4_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 4;  // element count passed to every range_check call
+  const int32_t *cospi;  // weights for half_btf (helper defined elsewhere)
+
+  int32_t stage = 0;  // indexes cos_bit[] and stage_range[]; bumped per stage
+  int32_t *bf0, *bf1;  // bf0 = buffer a stage reads, bf1 = buffer it writes
+  int32_t step[4];  // scratch; stages alternate between step and output
+
+  // stage 0: hand the untouched input to range_check with stage_range[0].
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1: reorder with sign flips; input is read after output is written.
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = -input[3];
+  bf1[2] = -input[1];
+  bf1[3] = input[2];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2: 0,1 pass through; pair (2,3) combined by half_btf with cospi[32].
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3: add/sub of (0,2) and (1,3); cospi is loaded but not used here.
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[2];
+  bf1[1] = bf0[1] + bf0[3];
+  bf1[2] = bf0[0] - bf0[2];
+  bf1[3] = bf0[1] - bf0[3];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4: half_btf on (0,1) with cospi[8]/[56] and on (2,3) with [40]/[24].
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[8], bf0[0], cospi[56], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(cospi[56], bf0[0], -cospi[8], bf0[1], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[40], bf0[2], cospi[24], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[24], bf0[2], -cospi[40], bf0[3], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5: final permutation of step into output; no arithmetic.
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[1];
+  bf1[1] = bf0[2];
+  bf1[2] = bf0[3];
+  bf1[3] = bf0[0];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void vp10_iadst8_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 8;  // element count passed to every range_check call
+  const int32_t *cospi;  // weights for half_btf (helper defined elsewhere)
+
+  int32_t stage = 0;  // indexes cos_bit[] and stage_range[]; bumped per stage
+  int32_t *bf0, *bf1;  // bf0 = buffer a stage reads, bf1 = buffer it writes
+  int32_t step[8];  // scratch; stages alternate between step and output
+
+  // stage 0: hand the untouched input to range_check with stage_range[0].
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1: permute input into output, negating inputs 7, 3, 1 and 5.
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = -input[7];
+  bf1[2] = -input[3];
+  bf1[3] = input[4];
+  bf1[4] = -input[1];
+  bf1[5] = input[6];
+  bf1[6] = input[2];
+  bf1[7] = -input[5];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2: pairs (2,3) and (6,7) via half_btf with cospi[32]; rest pass.
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3: add/sub at distance 2 within each group of 4; cospi is unused.
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[2];
+  bf1[1] = bf0[1] + bf0[3];
+  bf1[2] = bf0[0] - bf0[2];
+  bf1[3] = bf0[1] - bf0[3];
+  bf1[4] = bf0[4] + bf0[6];
+  bf1[5] = bf0[5] + bf0[7];
+  bf1[6] = bf0[4] - bf0[6];
+  bf1[7] = bf0[5] - bf0[7];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4: 0..3 pass; (4,5) and (6,7) via half_btf with cospi[16]/[48].
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]);
+  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5: add/sub at distance 4; cospi is loaded but not used here.
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[4];
+  bf1[1] = bf0[1] + bf0[5];
+  bf1[2] = bf0[2] + bf0[6];
+  bf1[3] = bf0[3] + bf0[7];
+  bf1[4] = bf0[0] - bf0[4];
+  bf1[5] = bf0[1] - bf0[5];
+  bf1[6] = bf0[2] - bf0[6];
+  bf1[7] = bf0[3] - bf0[7];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6: every adjacent pair goes through half_btf with its own weights.
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit[stage]);
+  bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7: final permutation of step into output; no arithmetic.
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[1];
+  bf1[1] = bf0[6];
+  bf1[2] = bf0[3];
+  bf1[3] = bf0[4];
+  bf1[4] = bf0[5];
+  bf1[5] = bf0[2];
+  bf1[6] = bf0[7];
+  bf1[7] = bf0[0];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void vp10_iadst16_new(const int32_t *input, int32_t *output,
+                      const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 16;  // element count passed to every range_check call
+  const int32_t *cospi;  // weights for half_btf (helper defined elsewhere)
+
+  int32_t stage = 0;  // indexes cos_bit[] and stage_range[]; bumped per stage
+  int32_t *bf0, *bf1;  // bf0 = buffer a stage reads, bf1 = buffer it writes
+  int32_t step[16];  // scratch; stages alternate between step and output
+
+  // stage 0: hand the untouched input to range_check with stage_range[0].
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1: permute input into output, negating 15, 7, 3, 11, 1, 9, 13, 5.
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = -input[15];
+  bf1[2] = -input[7];
+  bf1[3] = input[8];
+  bf1[4] = -input[3];
+  bf1[5] = input[12];
+  bf1[6] = input[4];
+  bf1[7] = -input[11];
+  bf1[8] = -input[1];
+  bf1[9] = input[14];
+  bf1[10] = input[6];
+  bf1[11] = -input[9];
+  bf1[12] = input[2];
+  bf1[13] = -input[13];
+  bf1[14] = -input[5];
+  bf1[15] = input[10];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2: pairs (2,3),(6,7),(10,11),(14,15) via half_btf; the rest pass.
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit[stage]);
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3: add/sub at distance 2 within each group of 4; cospi is unused.
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[2];
+  bf1[1] = bf0[1] + bf0[3];
+  bf1[2] = bf0[0] - bf0[2];
+  bf1[3] = bf0[1] - bf0[3];
+  bf1[4] = bf0[4] + bf0[6];
+  bf1[5] = bf0[5] + bf0[7];
+  bf1[6] = bf0[4] - bf0[6];
+  bf1[7] = bf0[5] - bf0[7];
+  bf1[8] = bf0[8] + bf0[10];
+  bf1[9] = bf0[9] + bf0[11];
+  bf1[10] = bf0[8] - bf0[10];
+  bf1[11] = bf0[9] - bf0[11];
+  bf1[12] = bf0[12] + bf0[14];
+  bf1[13] = bf0[13] + bf0[15];
+  bf1[14] = bf0[12] - bf0[14];
+  bf1[15] = bf0[13] - bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4: 0..3 and 8..11 pass; pairs in 4..7 and 12..15 via half_btf.
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]);
+  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5: add/sub at distance 4 within each group of 8; cospi is unused.
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[4];
+  bf1[1] = bf0[1] + bf0[5];
+  bf1[2] = bf0[2] + bf0[6];
+  bf1[3] = bf0[3] + bf0[7];
+  bf1[4] = bf0[0] - bf0[4];
+  bf1[5] = bf0[1] - bf0[5];
+  bf1[6] = bf0[2] - bf0[6];
+  bf1[7] = bf0[3] - bf0[7];
+  bf1[8] = bf0[8] + bf0[12];
+  bf1[9] = bf0[9] + bf0[13];
+  bf1[10] = bf0[10] + bf0[14];
+  bf1[11] = bf0[11] + bf0[15];
+  bf1[12] = bf0[8] - bf0[12];
+  bf1[13] = bf0[9] - bf0[13];
+  bf1[14] = bf0[10] - bf0[14];
+  bf1[15] = bf0[11] - bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6: 0..7 pass through; adjacent pairs in 8..15 go through half_btf.
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit[stage]);
+  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7: add/sub at distance 8; cospi is loaded but not used here.
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[8];
+  bf1[1] = bf0[1] + bf0[9];
+  bf1[2] = bf0[2] + bf0[10];
+  bf1[3] = bf0[3] + bf0[11];
+  bf1[4] = bf0[4] + bf0[12];
+  bf1[5] = bf0[5] + bf0[13];
+  bf1[6] = bf0[6] + bf0[14];
+  bf1[7] = bf0[7] + bf0[15];
+  bf1[8] = bf0[0] - bf0[8];
+  bf1[9] = bf0[1] - bf0[9];
+  bf1[10] = bf0[2] - bf0[10];
+  bf1[11] = bf0[3] - bf0[11];
+  bf1[12] = bf0[4] - bf0[12];
+  bf1[13] = bf0[5] - bf0[13];
+  bf1[14] = bf0[6] - bf0[14];
+  bf1[15] = bf0[7] - bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 8: every adjacent pair goes through half_btf with its own weights.
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit[stage]);
+  bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit[stage]);
+  bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 9: final permutation of step into output; no arithmetic.
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[1];
+  bf1[1] = bf0[14];
+  bf1[2] = bf0[3];
+  bf1[3] = bf0[12];
+  bf1[4] = bf0[5];
+  bf1[5] = bf0[10];
+  bf1[6] = bf0[7];
+  bf1[7] = bf0[8];
+  bf1[8] = bf0[9];
+  bf1[9] = bf0[6];
+  bf1[10] = bf0[11];
+  bf1[11] = bf0[4];
+  bf1[12] = bf0[13];
+  bf1[13] = bf0[2];
+  bf1[14] = bf0[15];
+  bf1[15] = bf0[0];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void vp10_iadst32_new(const int32_t *input, int32_t *output,
+                      const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 32;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[32];
+
+  // stage 0;
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1;
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = -input[31];
+  bf1[2] = -input[15];
+  bf1[3] = input[16];
+  bf1[4] = -input[7];
+  bf1[5] = input[24];
+  bf1[6] = input[8];
+  bf1[7] = -input[23];
+  bf1[8] = -input[3];
+  bf1[9] = input[28];
+  bf1[10] = input[12];
+  bf1[11] = -input[19];
+  bf1[12] = input[4];
+  bf1[13] = -input[27];
+  bf1[14] = -input[11];
+  bf1[15] = input[20];
+  bf1[16] = -input[1];
+  bf1[17] = input[30];
+  bf1[18] = input[14];
+  bf1[19] = -input[17];
+  bf1[20] = input[6];
+  bf1[21] = -input[25];
+  bf1[22] = -input[9];
+  bf1[23] = input[22];
+  bf1[24] = input[2];
+  bf1[25] = -input[29];
+  bf1[26] = -input[13];
+  bf1[27] = input[18];
+  bf1[28] = -input[5];
+  bf1[29] = input[26];
+  bf1[30] = input[10];
+  bf1[31] = -input[21];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit[stage]);
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit[stage]);
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = half_btf(cospi[32], bf0[18], cospi[32], bf0[19], cos_bit[stage]);
+  bf1[19] = half_btf(cospi[32], bf0[18], -cospi[32], bf0[19], cos_bit[stage]);
+  bf1[20] = bf0[20];
+  bf1[21] = bf0[21];
+  bf1[22] = half_btf(cospi[32], bf0[22], cospi[32], bf0[23], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[32], bf0[22], -cospi[32], bf0[23], cos_bit[stage]);
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[32], bf0[26], -cospi[32], bf0[27], cos_bit[stage]);
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = half_btf(cospi[32], bf0[30], cospi[32], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[32], bf0[30], -cospi[32], bf0[31], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[2];
+  bf1[1] = bf0[1] + bf0[3];
+  bf1[2] = bf0[0] - bf0[2];
+  bf1[3] = bf0[1] - bf0[3];
+  bf1[4] = bf0[4] + bf0[6];
+  bf1[5] = bf0[5] + bf0[7];
+  bf1[6] = bf0[4] - bf0[6];
+  bf1[7] = bf0[5] - bf0[7];
+  bf1[8] = bf0[8] + bf0[10];
+  bf1[9] = bf0[9] + bf0[11];
+  bf1[10] = bf0[8] - bf0[10];
+  bf1[11] = bf0[9] - bf0[11];
+  bf1[12] = bf0[12] + bf0[14];
+  bf1[13] = bf0[13] + bf0[15];
+  bf1[14] = bf0[12] - bf0[14];
+  bf1[15] = bf0[13] - bf0[15];
+  bf1[16] = bf0[16] + bf0[18];
+  bf1[17] = bf0[17] + bf0[19];
+  bf1[18] = bf0[16] - bf0[18];
+  bf1[19] = bf0[17] - bf0[19];
+  bf1[20] = bf0[20] + bf0[22];
+  bf1[21] = bf0[21] + bf0[23];
+  bf1[22] = bf0[20] - bf0[22];
+  bf1[23] = bf0[21] - bf0[23];
+  bf1[24] = bf0[24] + bf0[26];
+  bf1[25] = bf0[25] + bf0[27];
+  bf1[26] = bf0[24] - bf0[26];
+  bf1[27] = bf0[25] - bf0[27];
+  bf1[28] = bf0[28] + bf0[30];
+  bf1[29] = bf0[29] + bf0[31];
+  bf1[30] = bf0[28] - bf0[30];
+  bf1[31] = bf0[29] - bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]);
+  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit[stage]);
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = half_btf(cospi[16], bf0[20], cospi[48], bf0[21], cos_bit[stage]);
+  bf1[21] = half_btf(cospi[48], bf0[20], -cospi[16], bf0[21], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[48], bf0[22], cospi[16], bf0[23], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[16], bf0[22], cospi[48], bf0[23], cos_bit[stage]);
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = bf0[26];
+  bf1[27] = bf0[27];
+  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[29], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[48], bf0[28], -cospi[16], bf0[29], cos_bit[stage]);
+  bf1[30] = half_btf(-cospi[48], bf0[30], cospi[16], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[16], bf0[30], cospi[48], bf0[31], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[4];
+  bf1[1] = bf0[1] + bf0[5];
+  bf1[2] = bf0[2] + bf0[6];
+  bf1[3] = bf0[3] + bf0[7];
+  bf1[4] = bf0[0] - bf0[4];
+  bf1[5] = bf0[1] - bf0[5];
+  bf1[6] = bf0[2] - bf0[6];
+  bf1[7] = bf0[3] - bf0[7];
+  bf1[8] = bf0[8] + bf0[12];
+  bf1[9] = bf0[9] + bf0[13];
+  bf1[10] = bf0[10] + bf0[14];
+  bf1[11] = bf0[11] + bf0[15];
+  bf1[12] = bf0[8] - bf0[12];
+  bf1[13] = bf0[9] - bf0[13];
+  bf1[14] = bf0[10] - bf0[14];
+  bf1[15] = bf0[11] - bf0[15];
+  bf1[16] = bf0[16] + bf0[20];
+  bf1[17] = bf0[17] + bf0[21];
+  bf1[18] = bf0[18] + bf0[22];
+  bf1[19] = bf0[19] + bf0[23];
+  bf1[20] = bf0[16] - bf0[20];
+  bf1[21] = bf0[17] - bf0[21];
+  bf1[22] = bf0[18] - bf0[22];
+  bf1[23] = bf0[19] - bf0[23];
+  bf1[24] = bf0[24] + bf0[28];
+  bf1[25] = bf0[25] + bf0[29];
+  bf1[26] = bf0[26] + bf0[30];
+  bf1[27] = bf0[27] + bf0[31];
+  bf1[28] = bf0[24] - bf0[28];
+  bf1[29] = bf0[25] - bf0[29];
+  bf1[30] = bf0[26] - bf0[30];
+  bf1[31] = bf0[27] - bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit[stage]);
+  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit[stage]);
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = bf0[21];
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = half_btf(cospi[8], bf0[24], cospi[56], bf0[25], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[56], bf0[24], -cospi[8], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[27], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[24], bf0[26], -cospi[40], bf0[27], cos_bit[stage]);
+  bf1[28] = half_btf(-cospi[56], bf0[28], cospi[8], bf0[29], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[8], bf0[28], cospi[56], bf0[29], cos_bit[stage]);
+  bf1[30] = half_btf(-cospi[24], bf0[30], cospi[40], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[40], bf0[30], cospi[24], bf0[31], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[8];
+  bf1[1] = bf0[1] + bf0[9];
+  bf1[2] = bf0[2] + bf0[10];
+  bf1[3] = bf0[3] + bf0[11];
+  bf1[4] = bf0[4] + bf0[12];
+  bf1[5] = bf0[5] + bf0[13];
+  bf1[6] = bf0[6] + bf0[14];
+  bf1[7] = bf0[7] + bf0[15];
+  bf1[8] = bf0[0] - bf0[8];
+  bf1[9] = bf0[1] - bf0[9];
+  bf1[10] = bf0[2] - bf0[10];
+  bf1[11] = bf0[3] - bf0[11];
+  bf1[12] = bf0[4] - bf0[12];
+  bf1[13] = bf0[5] - bf0[13];
+  bf1[14] = bf0[6] - bf0[14];
+  bf1[15] = bf0[7] - bf0[15];
+  bf1[16] = bf0[16] + bf0[24];
+  bf1[17] = bf0[17] + bf0[25];
+  bf1[18] = bf0[18] + bf0[26];
+  bf1[19] = bf0[19] + bf0[27];
+  bf1[20] = bf0[20] + bf0[28];
+  bf1[21] = bf0[21] + bf0[29];
+  bf1[22] = bf0[22] + bf0[30];
+  bf1[23] = bf0[23] + bf0[31];
+  bf1[24] = bf0[16] - bf0[24];
+  bf1[25] = bf0[17] - bf0[25];
+  bf1[26] = bf0[18] - bf0[26];
+  bf1[27] = bf0[19] - bf0[27];
+  bf1[28] = bf0[20] - bf0[28];
+  bf1[29] = bf0[21] - bf0[29];
+  bf1[30] = bf0[22] - bf0[30];
+  bf1[31] = bf0[23] - bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 8
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = half_btf(cospi[4], bf0[16], cospi[60], bf0[17], cos_bit[stage]);
+  bf1[17] = half_btf(cospi[60], bf0[16], -cospi[4], bf0[17], cos_bit[stage]);
+  bf1[18] = half_btf(cospi[20], bf0[18], cospi[44], bf0[19], cos_bit[stage]);
+  bf1[19] = half_btf(cospi[44], bf0[18], -cospi[20], bf0[19], cos_bit[stage]);
+  bf1[20] = half_btf(cospi[36], bf0[20], cospi[28], bf0[21], cos_bit[stage]);
+  bf1[21] = half_btf(cospi[28], bf0[20], -cospi[36], bf0[21], cos_bit[stage]);
+  bf1[22] = half_btf(cospi[52], bf0[22], cospi[12], bf0[23], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[12], bf0[22], -cospi[52], bf0[23], cos_bit[stage]);
+  bf1[24] = half_btf(-cospi[60], bf0[24], cospi[4], bf0[25], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[4], bf0[24], cospi[60], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(-cospi[44], bf0[26], cospi[20], bf0[27], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[20], bf0[26], cospi[44], bf0[27], cos_bit[stage]);
+  bf1[28] = half_btf(-cospi[28], bf0[28], cospi[36], bf0[29], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[36], bf0[28], cospi[28], bf0[29], cos_bit[stage]);
+  bf1[30] = half_btf(-cospi[12], bf0[30], cospi[52], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[52], bf0[30], cospi[12], bf0[31], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 9
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[16];
+  bf1[1] = bf0[1] + bf0[17];
+  bf1[2] = bf0[2] + bf0[18];
+  bf1[3] = bf0[3] + bf0[19];
+  bf1[4] = bf0[4] + bf0[20];
+  bf1[5] = bf0[5] + bf0[21];
+  bf1[6] = bf0[6] + bf0[22];
+  bf1[7] = bf0[7] + bf0[23];
+  bf1[8] = bf0[8] + bf0[24];
+  bf1[9] = bf0[9] + bf0[25];
+  bf1[10] = bf0[10] + bf0[26];
+  bf1[11] = bf0[11] + bf0[27];
+  bf1[12] = bf0[12] + bf0[28];
+  bf1[13] = bf0[13] + bf0[29];
+  bf1[14] = bf0[14] + bf0[30];
+  bf1[15] = bf0[15] + bf0[31];
+  bf1[16] = bf0[0] - bf0[16];
+  bf1[17] = bf0[1] - bf0[17];
+  bf1[18] = bf0[2] - bf0[18];
+  bf1[19] = bf0[3] - bf0[19];
+  bf1[20] = bf0[4] - bf0[20];
+  bf1[21] = bf0[5] - bf0[21];
+  bf1[22] = bf0[6] - bf0[22];
+  bf1[23] = bf0[7] - bf0[23];
+  bf1[24] = bf0[8] - bf0[24];
+  bf1[25] = bf0[9] - bf0[25];
+  bf1[26] = bf0[10] - bf0[26];
+  bf1[27] = bf0[11] - bf0[27];
+  bf1[28] = bf0[12] - bf0[28];
+  bf1[29] = bf0[13] - bf0[29];
+  bf1[30] = bf0[14] - bf0[30];
+  bf1[31] = bf0[15] - bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 10
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[1], bf0[0], cospi[63], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(cospi[63], bf0[0], -cospi[1], bf0[1], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[5], bf0[2], cospi[59], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[59], bf0[2], -cospi[5], bf0[3], cos_bit[stage]);
+  bf1[4] = half_btf(cospi[9], bf0[4], cospi[55], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[55], bf0[4], -cospi[9], bf0[5], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[13], bf0[6], cospi[51], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[51], bf0[6], -cospi[13], bf0[7], cos_bit[stage]);
+  bf1[8] = half_btf(cospi[17], bf0[8], cospi[47], bf0[9], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[47], bf0[8], -cospi[17], bf0[9], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[21], bf0[10], cospi[43], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[43], bf0[10], -cospi[21], bf0[11], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[25], bf0[12], cospi[39], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[39], bf0[12], -cospi[25], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[29], bf0[14], cospi[35], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[35], bf0[14], -cospi[29], bf0[15], cos_bit[stage]);
+  bf1[16] = half_btf(cospi[33], bf0[16], cospi[31], bf0[17], cos_bit[stage]);
+  bf1[17] = half_btf(cospi[31], bf0[16], -cospi[33], bf0[17], cos_bit[stage]);
+  bf1[18] = half_btf(cospi[37], bf0[18], cospi[27], bf0[19], cos_bit[stage]);
+  bf1[19] = half_btf(cospi[27], bf0[18], -cospi[37], bf0[19], cos_bit[stage]);
+  bf1[20] = half_btf(cospi[41], bf0[20], cospi[23], bf0[21], cos_bit[stage]);
+  bf1[21] = half_btf(cospi[23], bf0[20], -cospi[41], bf0[21], cos_bit[stage]);
+  bf1[22] = half_btf(cospi[45], bf0[22], cospi[19], bf0[23], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[19], bf0[22], -cospi[45], bf0[23], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[49], bf0[24], cospi[15], bf0[25], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[15], bf0[24], -cospi[49], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[53], bf0[26], cospi[11], bf0[27], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[11], bf0[26], -cospi[53], bf0[27], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[57], bf0[28], cospi[7], bf0[29], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[7], bf0[28], -cospi[57], bf0[29], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[61], bf0[30], cospi[3], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[3], bf0[30], -cospi[61], bf0[31], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 11
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[1];
+  bf1[1] = bf0[30];
+  bf1[2] = bf0[3];
+  bf1[3] = bf0[28];
+  bf1[4] = bf0[5];
+  bf1[5] = bf0[26];
+  bf1[6] = bf0[7];
+  bf1[7] = bf0[24];
+  bf1[8] = bf0[9];
+  bf1[9] = bf0[22];
+  bf1[10] = bf0[11];
+  bf1[11] = bf0[20];
+  bf1[12] = bf0[13];
+  bf1[13] = bf0[18];
+  bf1[14] = bf0[15];
+  bf1[15] = bf0[16];
+  bf1[16] = bf0[17];
+  bf1[17] = bf0[14];
+  bf1[18] = bf0[19];
+  bf1[19] = bf0[12];
+  bf1[20] = bf0[21];
+  bf1[21] = bf0[10];
+  bf1[22] = bf0[23];
+  bf1[23] = bf0[8];
+  bf1[24] = bf0[25];
+  bf1[25] = bf0[6];
+  bf1[26] = bf0[27];
+  bf1[27] = bf0[4];
+  bf1[28] = bf0[29];
+  bf1[29] = bf0[2];
+  bf1[30] = bf0[31];
+  bf1[31] = bf0[0];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}

diff --git a/vp10/common/vp10_inv_txfm1d.h b/vp10/common/vp10_inv_txfm1d.h
new file mode 100644
index 0000000..0609b65
--- /dev/null
+++ b/vp10/common/vp10_inv_txfm1d.h

@@ -0,0 +1,42 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_INV_TXFM1D_H_
+#define VP10_INV_TXFM1D_H_
+
+#include "vp10/common/vp10_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Inverse DCT kernels; cos_bit/stage_range are presumably per-stage arrays.
+void vp10_idct4_new(const int32_t *input, int32_t *output,
+                    const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_idct8_new(const int32_t *input, int32_t *output,
+                    const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_idct16_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_idct32_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
+// Inverse ADST kernels; same parameter list as the DCT kernels above.
+void vp10_iadst4_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_iadst8_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_iadst16_new(const int32_t *input, int32_t *output,
+                      const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_iadst32_new(const int32_t *input, int32_t *output,
+                      const int8_t *cos_bit, const int8_t *stage_range);
+
+#ifdef __cplusplus
+}
+#endif  // __cplusplus
+
+#endif  // VP10_INV_TXFM1D_H_

diff --git a/vp10/common/vp10_inv_txfm2d.c b/vp10/common/vp10_inv_txfm2d.c
new file mode 100644
index 0000000..c894a42
--- /dev/null
+++ b/vp10/common/vp10_inv_txfm2d.c

@@ -0,0 +1,98 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/common/vp10_txfm.h"
+
+static INLINE void inv_txfm2d_add_c(const int32_t *input, int16_t *output,
+                                    int stride, const TXFM_2D_CFG *cfg,
+                                    int32_t *txfm_buf) {
+  // Inverse 2D transform of the n x n block at input (n = cfg->txfm_size):
+  // the row kernel runs on each input row, the column kernel runs on each
+  // column of that result, and every finished column is added onto output,
+  // whose rows are stride elements apart.
+  const int n = cfg->txfm_size;
+  const TxfmFunc row_fn = cfg->txfm_func_row;
+  const TxfmFunc col_fn = cfg->txfm_func_col;
+  const int8_t *const shifts = cfg->shift;
+
+  // txfm_buf supplies n * n + 2 * n entries of scratch space, split as:
+  //   [0, n)    the column gathered for the column kernel
+  //   [n, 2n)   that kernel's output
+  //   [2n, ..)  the n x n row-pass result, stored row by row
+  int32_t *const col_src = txfm_buf;
+  int32_t *const col_dst = txfm_buf + n;
+  int32_t *const rows_out = txfm_buf + 2 * n;
+  int r, c;
+
+  // Row pass: transform input row r into row r of rows_out, then rescale it
+  // with the negated shift[0].
+  for (r = 0; r < n; ++r) {
+    int32_t *const dst = rows_out + r * n;
+    row_fn(input + r * n, dst, cfg->cos_bit_row, cfg->stage_range_row);
+    round_shift_array(dst, n, -shifts[0]);
+  }
+
+  // Column pass: gather column c, transform it, rescale with the negated
+  // shift[1], and add it onto column c of output.
+  for (c = 0; c < n; ++c) {
+    for (r = 0; r < n; ++r) col_src[r] = rows_out[r * n + c];
+    col_fn(col_src, col_dst, cfg->cos_bit_col, cfg->stage_range_col);
+    round_shift_array(col_dst, n, -shifts[1]);
+    for (r = 0; r < n; ++r) output[r * stride + c] += col_dst[r];
+  }
+}
+
+void vp10_inv_txfm2d_add_4x4(const int32_t *input, uint16_t *output,
+                             const int stride, const TXFM_2D_CFG *cfg,
+                             const int bd) {
+  // Adds the inverse transform of input to the prediction held in output
+  // (rows stride elements apart), then clamps it to [0, (1 << bd) - 1].
+  // Assumes bd < 15 so samples fit in int16_t; output is viewed as int16_t*.
+  // The scratch buffer is int32_t to match inv_txfm2d_add_c's int32_t* param.
+  int32_t txfm_buf[4 * 4 + 4 + 4];
+  inv_txfm2d_add_c(input, (int16_t *)output, stride, cfg, txfm_buf);
+  clamp_block((int16_t *)output, 4, stride, 0, (1 << bd) - 1);
+}
+
+void vp10_inv_txfm2d_add_8x8(const int32_t *input, uint16_t *output,
+                             const int stride, const TXFM_2D_CFG *cfg,
+                             const int bd) {
+  // Adds the inverse transform of input to the prediction held in output
+  // (rows stride elements apart), then clamps it to [0, (1 << bd) - 1].
+  // Assumes bd < 15 so samples fit in int16_t; output is viewed as int16_t*.
+  // The scratch buffer is int32_t to match inv_txfm2d_add_c's int32_t* param.
+  int32_t txfm_buf[8 * 8 + 8 + 8];
+  inv_txfm2d_add_c(input, (int16_t *)output, stride, cfg, txfm_buf);
+  clamp_block((int16_t *)output, 8, stride, 0, (1 << bd) - 1);
+}
+
+void vp10_inv_txfm2d_add_16x16(const int32_t *input, uint16_t *output,
+                               const int stride, const TXFM_2D_CFG *cfg,
+                               const int bd) {
+  // Adds the inverse transform of input to the prediction held in output
+  // (rows stride elements apart), then clamps it to [0, (1 << bd) - 1].
+  // Assumes bd < 15 so samples fit in int16_t; output is viewed as int16_t*.
+  // The scratch buffer is int32_t to match inv_txfm2d_add_c's int32_t* param.
+  int32_t txfm_buf[16 * 16 + 16 + 16];
+  inv_txfm2d_add_c(input, (int16_t *)output, stride, cfg, txfm_buf);
+  clamp_block((int16_t *)output, 16, stride, 0, (1 << bd) - 1);
+}
+
+void vp10_inv_txfm2d_add_32x32(const int32_t *input, uint16_t *output,
+                               const int stride, const TXFM_2D_CFG *cfg,
+                               const int bd) {
+  // Adds the inverse transform of input to the prediction held in output
+  // (rows stride elements apart), then clamps it to [0, (1 << bd) - 1].
+  // Assumes bd < 15 so samples fit in int16_t; output is viewed as int16_t*.
+  // The scratch buffer is int32_t to match inv_txfm2d_add_c's int32_t* param.
+  int32_t txfm_buf[32 * 32 + 32 + 32];
+  inv_txfm2d_add_c(input, (int16_t *)output, stride, cfg, txfm_buf);
+  clamp_block((int16_t *)output, 32, stride, 0, (1 << bd) - 1);
+}

diff --git a/vp10/common/vp10_inv_txfm2d.h b/vp10/common/vp10_inv_txfm2d.h
new file mode 100644
index 0000000..1b570ef
--- /dev/null
+++ b/vp10/common/vp10_inv_txfm2d.h

@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_INV_TXFM2D_C_H_
+#define VP10_INV_TXFM2D_C_H_
+// Inverse 2D transforms that add their result into a uint16_t sample block.
+#include "vp10/common/vp10_inv_txfm2d_cfg.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+void vp10_inv_txfm2d_add_4x4(const int32_t *input, uint16_t *output,
+                             const int stride, const TXFM_2D_CFG *cfg,
+                             const int bd);  // clamps to [0, (1 << bd) - 1]
+void vp10_inv_txfm2d_add_8x8(const int32_t *input, uint16_t *output,
+                             const int stride, const TXFM_2D_CFG *cfg,
+                             const int bd);  // clamps to [0, (1 << bd) - 1]
+void vp10_inv_txfm2d_add_16x16(const int32_t *input, uint16_t *output,
+                               const int stride, const TXFM_2D_CFG *cfg,
+                               const int bd);  // clamps to [0, (1 << bd) - 1]
+void vp10_inv_txfm2d_add_32x32(const int32_t *input, uint16_t *output,
+                               const int stride, const TXFM_2D_CFG *cfg,
+                               const int bd);  // clamps to [0, (1 << bd) - 1]
+#ifdef __cplusplus
+}
+#endif  // __cplusplus
+#endif  // VP10_INV_TXFM2D_C_H_

diff --git a/vp10/common/vp10_inv_txfm2d_cfg.h b/vp10/common/vp10_inv_txfm2d_cfg.h
new file mode 100644
index 0000000..8cd76b5
--- /dev/null
+++ b/vp10/common/vp10_inv_txfm2d_cfg.h

@@ -0,0 +1,377 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_INV_TXFM2D_CFG_H_
+#define VP10_INV_TXFM2D_CFG_H_
+#include "vp10/common/vp10_inv_txfm1d.h"
+
+//  -------- config inv_dct_dct_4 (col: DCT, row: DCT) --------
+static const int8_t inv_shift_dct_dct_4[2] = {1, -5};  // {row, col} pass
+static const int8_t inv_stage_range_col_dct_dct_4[4] = {17, 17, 16, 16};
+static const int8_t inv_stage_range_row_dct_dct_4[4] = {16, 16, 16, 16};
+static const int8_t inv_cos_bit_col_dct_dct_4[4] = {15, 15, 15, 15};
+static const int8_t inv_cos_bit_row_dct_dct_4[4] = {15, 15, 15, 15};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_4 = {
+    .txfm_size = 4,
+    .stage_num_col = 4,
+    .stage_num_row = 4,
+    // shift: 2 entries; range/cos_bit tables: one entry per stage.
+    .shift = inv_shift_dct_dct_4,
+    .stage_range_col = inv_stage_range_col_dct_dct_4,
+    .stage_range_row = inv_stage_range_row_dct_dct_4,
+    .cos_bit_col = inv_cos_bit_col_dct_dct_4,
+    .cos_bit_row = inv_cos_bit_row_dct_dct_4,
+    .txfm_func_col = vp10_idct4_new,
+    .txfm_func_row = vp10_idct4_new};
+
+//  -------- config inv_dct_dct_8 (col: DCT, row: DCT) --------
+static const int8_t inv_shift_dct_dct_8[2] = {0, -5};  // {row, col} pass
+static const int8_t inv_stage_range_col_dct_dct_8[6] = {17, 17, 17, 17, 16, 16};
+static const int8_t inv_stage_range_row_dct_dct_8[6] = {17, 17, 17, 17, 17, 17};
+static const int8_t inv_cos_bit_col_dct_dct_8[6] = {15, 15, 15, 15, 15, 15};
+static const int8_t inv_cos_bit_row_dct_dct_8[6] = {15, 15, 15, 15, 15, 15};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_8 = {
+    .txfm_size = 8,
+    .stage_num_col = 6,
+    .stage_num_row = 6,
+    // shift: 2 entries; range/cos_bit tables: one entry per stage.
+    .shift = inv_shift_dct_dct_8,
+    .stage_range_col = inv_stage_range_col_dct_dct_8,
+    .stage_range_row = inv_stage_range_row_dct_dct_8,
+    .cos_bit_col = inv_cos_bit_col_dct_dct_8,
+    .cos_bit_row = inv_cos_bit_row_dct_dct_8,
+    .txfm_func_col = vp10_idct8_new,
+    .txfm_func_row = vp10_idct8_new};
+
+//  -------- config inv_dct_dct_16 (col: DCT, row: DCT) --------
+static const int8_t inv_shift_dct_dct_16[2] = {0, -6};  // {row, col} pass
+static const int8_t inv_stage_range_col_dct_dct_16[8] = {18, 18, 18, 18,
+                                                         18, 18, 17, 17};
+static const int8_t inv_stage_range_row_dct_dct_16[8] = {18, 18, 18, 18,
+                                                         18, 18, 18, 18};
+static const int8_t inv_cos_bit_col_dct_dct_16[8] = {14, 14, 14, 14,
+                                                     14, 14, 14, 15};
+static const int8_t inv_cos_bit_row_dct_dct_16[8] = {14, 14, 14, 14,
+                                                     14, 14, 14, 14};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_16 = {
+    .txfm_size = 16,
+    .stage_num_col = 8,
+    .stage_num_row = 8,
+    // shift: 2 entries; range/cos_bit tables: one entry per stage.
+    .shift = inv_shift_dct_dct_16,
+    .stage_range_col = inv_stage_range_col_dct_dct_16,
+    .stage_range_row = inv_stage_range_row_dct_dct_16,
+    .cos_bit_col = inv_cos_bit_col_dct_dct_16,
+    .cos_bit_row = inv_cos_bit_row_dct_dct_16,
+    .txfm_func_col = vp10_idct16_new,
+    .txfm_func_row = vp10_idct16_new};
+
+//  -------- config inv_dct_dct_32 (col: DCT, row: DCT) --------
+static const int8_t inv_shift_dct_dct_32[2] = {-1, -6};  // {row, col} pass
+static const int8_t inv_stage_range_col_dct_dct_32[10] = {18, 18, 18, 18, 18,
+                                                          18, 18, 18, 17, 17};
+static const int8_t inv_stage_range_row_dct_dct_32[10] = {19, 19, 19, 19, 19,
+                                                          19, 19, 19, 19, 19};
+static const int8_t inv_cos_bit_col_dct_dct_32[10] = {14, 14, 14, 14, 14,
+                                                      14, 14, 14, 14, 15};
+static const int8_t inv_cos_bit_row_dct_dct_32[10] = {13, 13, 13, 13, 13,
+                                                      13, 13, 13, 13, 13};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_32 = {
+    .txfm_size = 32,
+    .stage_num_col = 10,
+    .stage_num_row = 10,
+    // shift: 2 entries; range/cos_bit tables: one entry per stage.
+    .shift = inv_shift_dct_dct_32,
+    .stage_range_col = inv_stage_range_col_dct_dct_32,
+    .stage_range_row = inv_stage_range_row_dct_dct_32,
+    .cos_bit_col = inv_cos_bit_col_dct_dct_32,
+    .cos_bit_row = inv_cos_bit_row_dct_dct_32,
+    .txfm_func_col = vp10_idct32_new,
+    .txfm_func_row = vp10_idct32_new};
+
+//  -------- config inv_dct_adst_4 (col: DCT, row: ADST) --------
+static const int8_t inv_shift_dct_adst_4[2] = {1, -5};  // {row, col} pass
+static const int8_t inv_stage_range_col_dct_adst_4[4] = {17, 17, 16, 16};
+static const int8_t inv_stage_range_row_dct_adst_4[6] = {16, 16, 16,
+                                                         16, 16, 16};
+static const int8_t inv_cos_bit_col_dct_adst_4[4] = {15, 15, 15, 15};
+static const int8_t inv_cos_bit_row_dct_adst_4[6] = {15, 15, 15, 15, 15, 15};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_4 = {
+    .txfm_size = 4,
+    .stage_num_col = 4,
+    .stage_num_row = 6,
+    // shift: 2 entries; range/cos_bit tables: one entry per stage.
+    .shift = inv_shift_dct_adst_4,
+    .stage_range_col = inv_stage_range_col_dct_adst_4,
+    .stage_range_row = inv_stage_range_row_dct_adst_4,
+    .cos_bit_col = inv_cos_bit_col_dct_adst_4,
+    .cos_bit_row = inv_cos_bit_row_dct_adst_4,
+    .txfm_func_col = vp10_idct4_new,
+    .txfm_func_row = vp10_iadst4_new};
+
+//  -------- config inv_dct_adst_8 (col: DCT, row: ADST) --------
+static const int8_t inv_shift_dct_adst_8[2] = {-1, -4};  // {row, col} pass
+static const int8_t inv_stage_range_col_dct_adst_8[6] = {16, 16, 16,
+                                                         16, 15, 15};
+static const int8_t inv_stage_range_row_dct_adst_8[8] = {17, 17, 17, 17,
+                                                         17, 17, 17, 17};
+static const int8_t inv_cos_bit_col_dct_adst_8[6] = {15, 15, 15, 15, 15, 15};
+static const int8_t inv_cos_bit_row_dct_adst_8[8] = {15, 15, 15, 15,
+                                                     15, 15, 15, 15};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_8 = {
+    .txfm_size = 8,
+    .stage_num_col = 6,
+    .stage_num_row = 8,
+    // shift: 2 entries; range/cos_bit tables: one entry per stage.
+    .shift = inv_shift_dct_adst_8,
+    .stage_range_col = inv_stage_range_col_dct_adst_8,
+    .stage_range_row = inv_stage_range_row_dct_adst_8,
+    .cos_bit_col = inv_cos_bit_col_dct_adst_8,
+    .cos_bit_row = inv_cos_bit_row_dct_adst_8,
+    .txfm_func_col = vp10_idct8_new,
+    .txfm_func_row = vp10_iadst8_new};
+
+//  -------- config inv_dct_adst_16 (col: DCT, row: ADST) --------
+static const int8_t inv_shift_dct_adst_16[2] = {1, -7};  // {row, col} pass
+static const int8_t inv_stage_range_col_dct_adst_16[8] = {19, 19, 19, 19,
+                                                          19, 19, 18, 18};
+static const int8_t inv_stage_range_row_dct_adst_16[10] = {18, 18, 18, 18, 18,
+                                                           18, 18, 18, 18, 18};
+static const int8_t inv_cos_bit_col_dct_adst_16[8] = {13, 13, 13, 13,
+                                                      13, 13, 13, 14};
+static const int8_t inv_cos_bit_row_dct_adst_16[10] = {14, 14, 14, 14, 14,
+                                                       14, 14, 14, 14, 14};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_16 = {
+    .txfm_size = 16,
+    .stage_num_col = 8,
+    .stage_num_row = 10,
+    // shift: 2 entries; range/cos_bit tables: one entry per stage.
+    .shift = inv_shift_dct_adst_16,
+    .stage_range_col = inv_stage_range_col_dct_adst_16,
+    .stage_range_row = inv_stage_range_row_dct_adst_16,
+    .cos_bit_col = inv_cos_bit_col_dct_adst_16,
+    .cos_bit_row = inv_cos_bit_row_dct_adst_16,
+    .txfm_func_col = vp10_idct16_new,
+    .txfm_func_row = vp10_iadst16_new};
+
+//  -------- config inv_dct_adst_32 (col: DCT, row: ADST) --------
+static const int8_t inv_shift_dct_adst_32[2] = {-1, -6};  // {row, col} pass
+static const int8_t inv_stage_range_col_dct_adst_32[10] = {18, 18, 18, 18, 18,
+                                                           18, 18, 18, 17, 17};
+static const int8_t inv_stage_range_row_dct_adst_32[12] = {
+    19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
+static const int8_t inv_cos_bit_col_dct_adst_32[10] = {14, 14, 14, 14, 14,
+                                                       14, 14, 14, 14, 15};
+static const int8_t inv_cos_bit_row_dct_adst_32[12] = {13, 13, 13, 13, 13, 13,
+                                                       13, 13, 13, 13, 13, 13};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_32 = {
+    .txfm_size = 32,
+    .stage_num_col = 10,
+    .stage_num_row = 12,
+    // shift: 2 entries; range/cos_bit tables: one entry per stage.
+    .shift = inv_shift_dct_adst_32,
+    .stage_range_col = inv_stage_range_col_dct_adst_32,
+    .stage_range_row = inv_stage_range_row_dct_adst_32,
+    .cos_bit_col = inv_cos_bit_col_dct_adst_32,
+    .cos_bit_row = inv_cos_bit_row_dct_adst_32,
+    .txfm_func_col = vp10_idct32_new,
+    .txfm_func_row = vp10_iadst32_new};
+
+//  -------- config inv_adst_adst_4 (col: ADST, row: ADST) --------
+static const int8_t inv_shift_adst_adst_4[2] = {0, -4};  // {row, col} pass
+static const int8_t inv_stage_range_col_adst_adst_4[6] = {16, 16, 16,
+                                                          16, 15, 15};
+static const int8_t inv_stage_range_row_adst_adst_4[6] = {16, 16, 16,
+                                                          16, 16, 16};
+static const int8_t inv_cos_bit_col_adst_adst_4[6] = {15, 15, 15, 15, 15, 15};
+static const int8_t inv_cos_bit_row_adst_adst_4[6] = {15, 15, 15, 15, 15, 15};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_4 = {
+    .txfm_size = 4,
+    .stage_num_col = 6,
+    .stage_num_row = 6,
+    // shift: 2 entries; range/cos_bit tables: one entry per stage.
+    .shift = inv_shift_adst_adst_4,
+    .stage_range_col = inv_stage_range_col_adst_adst_4,
+    .stage_range_row = inv_stage_range_row_adst_adst_4,
+    .cos_bit_col = inv_cos_bit_col_adst_adst_4,
+    .cos_bit_row = inv_cos_bit_row_adst_adst_4,
+    .txfm_func_col = vp10_iadst4_new,
+    .txfm_func_row = vp10_iadst4_new};
+
+//  -------- config inv_adst_adst_8 (col: ADST, row: ADST) --------
+static const int8_t inv_shift_adst_adst_8[2] = {-1, -4};  // {row, col} pass
+static const int8_t inv_stage_range_col_adst_adst_8[8] = {16, 16, 16, 16,
+                                                          16, 16, 15, 15};
+static const int8_t inv_stage_range_row_adst_adst_8[8] = {17, 17, 17, 17,
+                                                          17, 17, 17, 17};
+static const int8_t inv_cos_bit_col_adst_adst_8[8] = {15, 15, 15, 15,
+                                                      15, 15, 15, 15};
+static const int8_t inv_cos_bit_row_adst_adst_8[8] = {15, 15, 15, 15,
+                                                      15, 15, 15, 15};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_8 = {
+    .txfm_size = 8,
+    .stage_num_col = 8,
+    .stage_num_row = 8,
+    // shift: 2 entries; range/cos_bit tables: one entry per stage.
+    .shift = inv_shift_adst_adst_8,
+    .stage_range_col = inv_stage_range_col_adst_adst_8,
+    .stage_range_row = inv_stage_range_row_adst_adst_8,
+    .cos_bit_col = inv_cos_bit_col_adst_adst_8,
+    .cos_bit_row = inv_cos_bit_row_adst_adst_8,
+    .txfm_func_col = vp10_iadst8_new,
+    .txfm_func_row = vp10_iadst8_new};
+
+//  -------- config inv_adst_adst_16 (col: ADST, row: ADST) --------
+static const int8_t inv_shift_adst_adst_16[2] = {0, -6};  // {row, col} pass
+static const int8_t inv_stage_range_col_adst_adst_16[10] = {18, 18, 18, 18, 18,
+                                                            18, 18, 18, 17, 17};
+static const int8_t inv_stage_range_row_adst_adst_16[10] = {18, 18, 18, 18, 18,
+                                                            18, 18, 18, 18, 18};
+static const int8_t inv_cos_bit_col_adst_adst_16[10] = {14, 14, 14, 14, 14,
+                                                        14, 14, 14, 14, 15};
+static const int8_t inv_cos_bit_row_adst_adst_16[10] = {14, 14, 14, 14, 14,
+                                                        14, 14, 14, 14, 14};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_16 = {
+    .txfm_size = 16,
+    .stage_num_col = 10,
+    .stage_num_row = 10,
+    // shift: 2 entries; range/cos_bit tables: one entry per stage.
+    .shift = inv_shift_adst_adst_16,
+    .stage_range_col = inv_stage_range_col_adst_adst_16,
+    .stage_range_row = inv_stage_range_row_adst_adst_16,
+    .cos_bit_col = inv_cos_bit_col_adst_adst_16,
+    .cos_bit_row = inv_cos_bit_row_adst_adst_16,
+    .txfm_func_col = vp10_iadst16_new,
+    .txfm_func_row = vp10_iadst16_new};
+
+//  -------- config inv_adst_adst_32 (col: ADST, row: ADST) --------
+static const int8_t inv_shift_adst_adst_32[2] = {-1, -6};  // {row, col} pass
+static const int8_t inv_stage_range_col_adst_adst_32[12] = {
+    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17};
+static const int8_t inv_stage_range_row_adst_adst_32[12] = {
+    19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
+static const int8_t inv_cos_bit_col_adst_adst_32[12] = {14, 14, 14, 14, 14, 14,
+                                                        14, 14, 14, 14, 14, 15};
+static const int8_t inv_cos_bit_row_adst_adst_32[12] = {13, 13, 13, 13, 13, 13,
+                                                        13, 13, 13, 13, 13, 13};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_32 = {
+    .txfm_size = 32,
+    .stage_num_col = 12,
+    .stage_num_row = 12,
+    // shift: 2 entries; range/cos_bit tables: one entry per stage.
+    .shift = inv_shift_adst_adst_32,
+    .stage_range_col = inv_stage_range_col_adst_adst_32,
+    .stage_range_row = inv_stage_range_row_adst_adst_32,
+    .cos_bit_col = inv_cos_bit_col_adst_adst_32,
+    .cos_bit_row = inv_cos_bit_row_adst_adst_32,
+    .txfm_func_col = vp10_iadst32_new,
+    .txfm_func_row = vp10_iadst32_new};
+
+//  -------- config inv_adst_dct_4 (col: ADST, row: DCT) --------
+static const int8_t inv_shift_adst_dct_4[2] = {1, -5};  // {row, col} pass
+static const int8_t inv_stage_range_col_adst_dct_4[6] = {17, 17, 17, 17, 16, 16};
+static const int8_t inv_stage_range_row_adst_dct_4[4] = {16, 16, 16, 16};
+static const int8_t inv_cos_bit_col_adst_dct_4[6] = {15, 15, 15, 15, 15, 15};
+static const int8_t inv_cos_bit_row_adst_dct_4[4] = {15, 15, 15, 15};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_4 = {
+    .txfm_size = 4,
+    .stage_num_col = 6,
+    .stage_num_row = 4,
+    // shift: 2 entries; range/cos_bit tables: one entry per stage.
+    .shift = inv_shift_adst_dct_4,
+    .stage_range_col = inv_stage_range_col_adst_dct_4,
+    .stage_range_row = inv_stage_range_row_adst_dct_4,
+    .cos_bit_col = inv_cos_bit_col_adst_dct_4,
+    .cos_bit_row = inv_cos_bit_row_adst_dct_4,
+    .txfm_func_col = vp10_iadst4_new,
+    .txfm_func_row = vp10_idct4_new};
+
+//  -------- config inv_adst_dct_8 (col: ADST, row: DCT) --------
+static const int8_t inv_shift_adst_dct_8[2] = {-1, -4};  // {row, col} pass
+static const int8_t inv_stage_range_col_adst_dct_8[8] = {16, 16, 16, 16,
+                                                   16, 16, 15, 15};
+static const int8_t inv_stage_range_row_adst_dct_8[6] = {17, 17, 17, 17, 17, 17};
+static const int8_t inv_cos_bit_col_adst_dct_8[8] = {15, 15, 15, 15, 15, 15, 15, 15};
+static const int8_t inv_cos_bit_row_adst_dct_8[6] = {15, 15, 15, 15, 15, 15};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_8 = {
+    .txfm_size = 8,
+    .stage_num_col = 8,
+    .stage_num_row = 6,
+    // shift: 2 entries; range/cos_bit tables: one entry per stage.
+    .shift = inv_shift_adst_dct_8,
+    .stage_range_col = inv_stage_range_col_adst_dct_8,
+    .stage_range_row = inv_stage_range_row_adst_dct_8,
+    .cos_bit_col = inv_cos_bit_col_adst_dct_8,
+    .cos_bit_row = inv_cos_bit_row_adst_dct_8,
+    .txfm_func_col = vp10_iadst8_new,
+    .txfm_func_row = vp10_idct8_new};
+
+//  -------- config inv_adst_dct_16 (col: ADST, row: DCT) --------
+static const int8_t inv_shift_adst_dct_16[2] = {-1, -5};  // {row, col} pass
+static const int8_t inv_stage_range_col_adst_dct_16[10] = {17, 17, 17, 17, 17,
+                                                     17, 17, 17, 16, 16};
+static const int8_t inv_stage_range_row_adst_dct_16[8] = {18, 18, 18, 18,
+                                                    18, 18, 18, 18};
+static const int8_t inv_cos_bit_col_adst_dct_16[10] = {15, 15, 15, 15, 15,
+                                                 15, 15, 15, 15, 15};
+static const int8_t inv_cos_bit_row_adst_dct_16[8] = {14, 14, 14, 14, 14, 14, 14, 14};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_16 = {
+    .txfm_size = 16,
+    .stage_num_col = 10,
+    .stage_num_row = 8,
+    // shift: 2 entries; range/cos_bit tables: one entry per stage.
+    .shift = inv_shift_adst_dct_16,
+    .stage_range_col = inv_stage_range_col_adst_dct_16,
+    .stage_range_row = inv_stage_range_row_adst_dct_16,
+    .cos_bit_col = inv_cos_bit_col_adst_dct_16,
+    .cos_bit_row = inv_cos_bit_row_adst_dct_16,
+    .txfm_func_col = vp10_iadst16_new,
+    .txfm_func_row = vp10_idct16_new};
+
+//  -------- config inv_adst_dct_32 (col: ADST, row: DCT) --------
+static const int8_t inv_shift_adst_dct_32[2] = {-1, -6};  // {row, col} pass
+static const int8_t inv_stage_range_col_adst_dct_32[12] = {18, 18, 18, 18, 18, 18,
+                                                     18, 18, 18, 18, 17, 17};
+static const int8_t inv_stage_range_row_adst_dct_32[10] = {19, 19, 19, 19, 19,
+                                                     19, 19, 19, 19, 19};
+static const int8_t inv_cos_bit_col_adst_dct_32[12] = {14, 14, 14, 14, 14, 14,
+                                                 14, 14, 14, 14, 14, 15};
+static const int8_t inv_cos_bit_row_adst_dct_32[10] = {13, 13, 13, 13, 13,
+                                                 13, 13, 13, 13, 13};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_32 = {
+    .txfm_size = 32,
+    .stage_num_col = 12,
+    .stage_num_row = 10,
+    // shift: 2 entries; range/cos_bit tables: one entry per stage.
+    .shift = inv_shift_adst_dct_32,
+    .stage_range_col = inv_stage_range_col_adst_dct_32,
+    .stage_range_row = inv_stage_range_row_adst_dct_32,
+    .cos_bit_col = inv_cos_bit_col_adst_dct_32,
+    .cos_bit_row = inv_cos_bit_row_adst_dct_32,
+    .txfm_func_col = vp10_iadst32_new,
+    .txfm_func_row = vp10_idct32_new};
+
+#endif  // VP10_INV_TXFM2D_CFG_H_

diff --git a/vp10/common/vp10_txfm.h b/vp10/common/vp10_txfm.h
new file mode 100644
index 0000000..b4fd753
--- /dev/null
+++ b/vp10/common/vp10_txfm.h

@@ -0,0 +1,167 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_TXFM_H_
+#define VP10_TXFM_H_
+
+#include <stdio.h>
+#include <math.h>
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+static const int cos_bit_min = 10;  // cospi_arr[0] is scaled by 1 << 10
+static const int cos_bit_max = 16;  // cospi_arr[6] is scaled by 1 << 16
+
+// cospi_arr[i][j] = (int)round(cos(M_PI*j/128) * (1<<(cos_bit_min+i)));
+static const int32_t cospi_arr[7][64] =
+  {{ 1024,  1024,  1023,  1021,  1019,  1016,  1013,  1009,
+     1004,   999,   993,   987,   980,   972,   964,   955,
+      946,   936,   926,   915,   903,   891,   878,   865,
+      851,   837,   822,   807,   792,   775,   759,   742,
+      724,   706,   688,   669,   650,   630,   610,   590,
+      569,   548,   526,   505,   483,   460,   438,   415,
+      392,   369,   345,   321,   297,   273,   249,   224,
+      200,   175,   150,   125,   100,    75,    50,    25},
+  {  2048,  2047,  2046,  2042,  2038,  2033,  2026,  2018,
+     2009,  1998,  1987,  1974,  1960,  1945,  1928,  1911,
+     1892,  1872,  1851,  1829,  1806,  1782,  1757,  1730,
+     1703,  1674,  1645,  1615,  1583,  1551,  1517,  1483,
+     1448,  1412,  1375,  1338,  1299,  1260,  1220,  1179,
+     1138,  1096,  1053,  1009,   965,   921,   876,   830,
+      784,   737,   690,   642,   595,   546,   498,   449,
+      400,   350,   301,   251,   201,   151,   100,    50},
+  {  4096,  4095,  4091,  4085,  4076,  4065,  4052,  4036,
+     4017,  3996,  3973,  3948,  3920,  3889,  3857,  3822,
+     3784,  3745,  3703,  3659,  3612,  3564,  3513,  3461,
+     3406,  3349,  3290,  3229,  3166,  3102,  3035,  2967,
+     2896,  2824,  2751,  2675,  2598,  2520,  2440,  2359,
+     2276,  2191,  2106,  2019,  1931,  1842,  1751,  1660,
+     1567,  1474,  1380,  1285,  1189,  1092,   995,   897,
+      799,   700,   601,   501,   401,   301,   201,   101},
+  {  8192,  8190,  8182,  8170,  8153,  8130,  8103,  8071,
+     8035,  7993,  7946,  7895,  7839,  7779,  7713,  7643,
+     7568,  7489,  7405,  7317,  7225,  7128,  7027,  6921,
+     6811,  6698,  6580,  6458,  6333,  6203,  6070,  5933,
+     5793,  5649,  5501,  5351,  5197,  5040,  4880,  4717,
+     4551,  4383,  4212,  4038,  3862,  3683,  3503,  3320,
+     3135,  2948,  2760,  2570,  2378,  2185,  1990,  1795,
+     1598,  1401,  1202,  1003,   803,   603,   402,   201},
+  { 16384, 16379, 16364, 16340, 16305, 16261, 16207, 16143,
+    16069, 15986, 15893, 15791, 15679, 15557, 15426, 15286,
+    15137, 14978, 14811, 14635, 14449, 14256, 14053, 13842,
+    13623, 13395, 13160, 12916, 12665, 12406, 12140, 11866,
+    11585, 11297, 11003, 10702, 10394, 10080,  9760,  9434,
+     9102,  8765,  8423,  8076,  7723,  7366,  7005,  6639,
+     6270,  5897,  5520,  5139,  4756,  4370,  3981,  3590,
+     3196,  2801,  2404,  2006,  1606,  1205,   804,   402},
+  { 32768, 32758, 32729, 32679, 32610, 32522, 32413, 32286,
+    32138, 31972, 31786, 31581, 31357, 31114, 30853, 30572,
+    30274, 29957, 29622, 29269, 28899, 28511, 28106, 27684,
+    27246, 26791, 26320, 25833, 25330, 24812, 24279, 23732,
+    23170, 22595, 22006, 21403, 20788, 20160, 19520, 18868,
+    18205, 17531, 16846, 16151, 15447, 14733, 14010, 13279,
+    12540, 11793, 11039, 10279,  9512,  8740,  7962,  7180,
+     6393,  5602,  4808,  4011,  3212,  2411,  1608,   804},
+  { 65536, 65516, 65457, 65358, 65220, 65043, 64827, 64571,
+    64277, 63944, 63572, 63162, 62714, 62228, 61705, 61145,
+    60547, 59914, 59244, 58538, 57798, 57022, 56212, 55368,
+    54491, 53581, 52639, 51665, 50660, 49624, 48559, 47464,
+    46341, 45190, 44011, 42806, 41576, 40320, 39040, 37736,
+    36410, 35062, 33692, 32303, 30893, 29466, 28020, 26558,
+    25080, 23586, 22078, 20557, 19024, 17479, 15924, 14359,
+    12785, 11204,  9616,  8022,  6424,  4821,  3216,  1608}};
+
+static INLINE int32_t round_shift(int32_t value, int bit) {
+  // Returns value / 2^bit rounded to nearest (bit > 0), else value * 2^-bit.
+  // For value >= 0 there are two versions of rounding:
+  // 1) (value + (1 << (bit - 1)) - 1) >> bit
+  // 2) (value + (1 << (bit - 1))) >> bit
+  // Both methods are nearly unbiased;
+  // however, the first version has a slight advantage because
+  // it rounds ties toward zero.
+  // For value < 0, we also choose the version that rounds ties
+  // toward zero.
+  if (bit > 0) {
+    if (value >= 0)
+      return (value + (1 << (bit - 1)) - 1) >> bit;
+    else
+      return ((value - (1 << (bit - 1))) >> bit) + 1;
+  } else {
+    return value * (1 << (-bit));  // '<<' on a negative value is undefined
+  }
+}
+
+static INLINE void round_shift_array(int32_t *arr, int size, int bit) {
+  int i;
+  if (bit == 0) {
+    return;  // a zero shift leaves every element unchanged
+  } else {
+    for (i = 0; i < size; i++) {
+      arr[i] = round_shift(arr[i], bit);  // in place; bit < 0 scales up
+    }
+  }
+}
+
+static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
+                               int bit) {  // round_shift(w0*in0 + w1*in1, bit)
+  int32_t result_32 = w0 * in0 + w1 * in1;  // sum formed in 32 bits
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  int64_t result_64 = (int64_t)w0 * (int64_t)in0 + (int64_t)w1 * (int64_t)in1;
+  if (result_32 != result_64) {  // the 32-bit sum did not hold the exact value
+    printf(
+        "%s overflow result_32: %d result_64: %lld w0: %d in0: %d w1: %d in1: "
+        "%d\n",
+        __func__, result_32, (long long)result_64, w0, in0, w1, in1);
+    assert(0 && "half_btf overflow");
+  }
+#endif
+  return round_shift(result_32, bit);
+}
+
+static INLINE int get_max_bit(int x) {
+  int max_bit = -1;  // value returned when x == 0 (no bit set)
+  while (x) {  // x must be >= 0: '>>' may never clear a negative x
+    x = x >> 1;
+    max_bit++;
+  }
+  return max_bit;  // index of the highest set bit for x > 0
+}
+
+// TODO(angiebird): implement SSE
+static INLINE void clamp_block(int16_t *block, int block_size, int stride,
+                               int low, int high) {
+  int i, j;
+  for (i = 0; i < block_size; ++i) {  // rows, 'stride' elements apart
+    for (j = 0; j < block_size; ++j) {  // block_size samples per row
+      block[i * stride + j] = clamp(block[i * stride + j], low, high);
+    }
+  }
+}
+
+typedef void (*TxfmFunc)(const int32_t *input, int32_t *output,
+                         const int8_t *cos_bit, const int8_t *stage_range);
+
+typedef struct TXFM_2D_CFG {
+  const int txfm_size;  // transform size, e.g. 16 or 32
+  const int stage_num_col;  // number of entries in the *_col tables
+  const int stage_num_row;  // number of entries in the *_row tables
+
+  const int8_t *shift;  // shift table, e.g. inv_shift_adst_dct_32[2]
+  const int8_t *stage_range_col;
+  const int8_t *stage_range_row;
+  const int8_t *cos_bit_col;
+  const int8_t *cos_bit_row;
+  const TxfmFunc txfm_func_col;  // kernel for the column pass
+  const TxfmFunc txfm_func_row;  // kernel for the row pass
+} TXFM_2D_CFG;
+
+#endif  // VP10_TXFM_H_

diff --git a/vp10/common/x86/idct_intrin_sse2.c b/vp10/common/x86/idct_intrin_sse2.c
index a2c674b..900f091 100644
--- a/vp10/common/x86/idct_intrin_sse2.c
+++ b/vp10/common/x86/idct_intrin_sse2.c

@@ -11,6 +11,54 @@
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 #include "vpx_ports/mem.h"
+#include "vp10/common/enums.h"
+
+#if CONFIG_EXT_TX
+// Reverse the order of the eight 16-bit words in an __m128i.
+static INLINE __m128i mm_reverse_epi16(const __m128i x) {
+  const __m128i a = _mm_shufflelo_epi16(x, 0x1b);  // reverse words 0..3
+  const __m128i b = _mm_shufflehi_epi16(a, 0x1b);  // reverse words 4..7
+  return _mm_shuffle_epi32(b, 0x4e);  // swap the two 64-bit halves
+}
+
+static INLINE void fliplr_4x4(__m128i in[2]) {
+  in[0] = _mm_shufflelo_epi16(in[0], 0x1b);  // reverse the low four words
+  in[0] = _mm_shufflehi_epi16(in[0], 0x1b);  // reverse the high four words
+  in[1] = _mm_shufflelo_epi16(in[1], 0x1b);  // same for the second register
+  in[1] = _mm_shufflehi_epi16(in[1], 0x1b);  // halves are never exchanged
+}
+
+static INLINE void fliplr_8x8(__m128i in[8]) {
+  in[0] = mm_reverse_epi16(in[0]);  // each register: 8 words reversed in place
+  in[1] = mm_reverse_epi16(in[1]);
+  in[2] = mm_reverse_epi16(in[2]);
+  in[3] = mm_reverse_epi16(in[3]);
+
+  in[4] = mm_reverse_epi16(in[4]);
+  in[5] = mm_reverse_epi16(in[5]);
+  in[6] = mm_reverse_epi16(in[6]);
+  in[7] = mm_reverse_epi16(in[7]);  // register order itself is unchanged
+}
+
+static INLINE void fliplr_16x8(__m128i in[16]) {
+  fliplr_8x8(&in[0]);  // registers 0..7
+  fliplr_8x8(&in[8]);  // registers 8..15
+}
+
+#define FLIPLR_16x16(in0, in1) do {             \
+  __m128i *tmp;                                 \
+  fliplr_16x8(in0);                             \
+  fliplr_16x8(in1);                             \
+  tmp = (in0);                                  \
+  (in0) = (in1);                                \
+  (in1) = tmp;                                  \
+} while (0)  /* reverses both halves, then swaps the in0/in1 pointers */
+
+#define FLIPUD_PTR(dest, stride, size) do {     \
+    (dest) = (dest) + ((size) - 1) * (stride);  \
+    (stride) = - (stride);                      \
+} while (0)  /* dest moves to row size-1 and stride is negated (in place) */
+#endif
 
 void vp10_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
                              int tx_type) {
@@ -22,22 +70,50 @@
   in[1] = load_input_data(input + 8);
 
   switch (tx_type) {
-    case 0:  // DCT_DCT
+    case DCT_DCT:
       idct4_sse2(in);
       idct4_sse2(in);
       break;
-    case 1:  // ADST_DCT
+    case ADST_DCT:
       idct4_sse2(in);
       iadst4_sse2(in);
       break;
-    case 2:  // DCT_ADST
+    case DCT_ADST:
       iadst4_sse2(in);
       idct4_sse2(in);
       break;
-    case 3:  // ADST_ADST
+    case ADST_ADST:
       iadst4_sse2(in);
       iadst4_sse2(in);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      idct4_sse2(in);
+      iadst4_sse2(in);
+      FLIPUD_PTR(dest, stride, 4);
+      break;
+    case DCT_FLIPADST:
+      iadst4_sse2(in);
+      idct4_sse2(in);
+      fliplr_4x4(in);
+      break;
+    case FLIPADST_FLIPADST:
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      FLIPUD_PTR(dest, stride, 4);
+      fliplr_4x4(in);
+      break;
+    case ADST_FLIPADST:
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      fliplr_4x4(in);
+      break;
+    case FLIPADST_ADST:
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      FLIPUD_PTR(dest, stride, 4);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -52,12 +128,12 @@
 
   // Reconstruction and Store
   {
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
+    __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
-    d0 = _mm_unpacklo_epi32(d0,
-                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
-    d2 = _mm_unpacklo_epi32(
-        d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
+    __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+    d0 = _mm_unpacklo_epi32(d0, d1);
+    d2 = _mm_unpacklo_epi32(d2, d3);
     d0 = _mm_unpacklo_epi8(d0, zero);
     d2 = _mm_unpacklo_epi8(d2, zero);
     d0 = _mm_add_epi16(d0, in[0]);
@@ -94,22 +170,50 @@
   in[7] = load_input_data(input + 8 * 7);
 
   switch (tx_type) {
-    case 0:  // DCT_DCT
+    case DCT_DCT:
       idct8_sse2(in);
       idct8_sse2(in);
       break;
-    case 1:  // ADST_DCT
+    case ADST_DCT:
       idct8_sse2(in);
       iadst8_sse2(in);
       break;
-    case 2:  // DCT_ADST
+    case DCT_ADST:
       iadst8_sse2(in);
       idct8_sse2(in);
       break;
-    case 3:  // ADST_ADST
+    case ADST_ADST:
       iadst8_sse2(in);
       iadst8_sse2(in);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      idct8_sse2(in);
+      iadst8_sse2(in);
+      FLIPUD_PTR(dest, stride, 8);
+      break;
+    case DCT_FLIPADST:
+      iadst8_sse2(in);
+      idct8_sse2(in);
+      fliplr_8x8(in);
+      break;
+    case FLIPADST_FLIPADST:
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      FLIPUD_PTR(dest, stride, 8);
+      fliplr_8x8(in);
+      break;
+    case ADST_FLIPADST:
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      fliplr_8x8(in);
+      break;
+    case FLIPADST_ADST:
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      FLIPUD_PTR(dest, stride, 8);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -146,29 +250,59 @@
 
 void vp10_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
                                 int stride, int tx_type) {
-  __m128i in0[16], in1[16];
+  __m128i in[32];
+  __m128i *in0 = &in[0];
+  __m128i *in1 = &in[16];
 
   load_buffer_8x16(input, in0);
   input += 8;
   load_buffer_8x16(input, in1);
 
   switch (tx_type) {
-    case 0:  // DCT_DCT
+    case DCT_DCT:
       idct16_sse2(in0, in1);
       idct16_sse2(in0, in1);
       break;
-    case 1:  // ADST_DCT
+    case ADST_DCT:
       idct16_sse2(in0, in1);
       iadst16_sse2(in0, in1);
       break;
-    case 2:  // DCT_ADST
+    case DCT_ADST:
       iadst16_sse2(in0, in1);
       idct16_sse2(in0, in1);
       break;
-    case 3:  // ADST_ADST
+    case ADST_ADST:
       iadst16_sse2(in0, in1);
       iadst16_sse2(in0, in1);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      idct16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
+      break;
+    case DCT_FLIPADST:
+      iadst16_sse2(in0, in1);
+      idct16_sse2(in0, in1);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case FLIPADST_FLIPADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case ADST_FLIPADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case FLIPADST_ADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;

diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index 1c3f182..1bb569d 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c

@@ -61,13 +61,26 @@
           cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
     cm->comp_fixed_ref = ALTREF_FRAME;
     cm->comp_var_ref[0] = LAST_FRAME;
+#if CONFIG_EXT_REFS
+    cm->comp_var_ref[1] = LAST2_FRAME;
+    cm->comp_var_ref[2] = LAST3_FRAME;
+    cm->comp_var_ref[3] = LAST4_FRAME;
+    cm->comp_var_ref[4] = GOLDEN_FRAME;
+#else
     cm->comp_var_ref[1] = GOLDEN_FRAME;
+#endif  // CONFIG_EXT_REFS
   } else if (cm->ref_frame_sign_bias[LAST_FRAME] ==
                  cm->ref_frame_sign_bias[ALTREF_FRAME]) {
+#if CONFIG_EXT_REFS
+    assert(0);
+#endif  // CONFIG_EXT_REFS
     cm->comp_fixed_ref = GOLDEN_FRAME;
     cm->comp_var_ref[0] = LAST_FRAME;
     cm->comp_var_ref[1] = ALTREF_FRAME;
   } else {
+#if CONFIG_EXT_REFS
+    assert(0);
+#endif  // CONFIG_EXT_REFS
     cm->comp_fixed_ref = LAST_FRAME;
     cm->comp_var_ref[0] = GOLDEN_FRAME;
     cm->comp_var_ref[1] = ALTREF_FRAME;
@@ -83,18 +96,9 @@
   return data > max ? max : data;
 }
 
-#if CONFIG_MISC_FIXES
 static TX_MODE read_tx_mode(struct vpx_read_bit_buffer *rb) {
   return vpx_rb_read_bit(rb) ? TX_MODE_SELECT : vpx_rb_read_literal(rb, 2);
 }
-#else
-static TX_MODE read_tx_mode(vpx_reader *r) {
-  TX_MODE tx_mode = vpx_read_literal(r, 2);
-  if (tx_mode == ALLOW_32X32)
-    tx_mode += vpx_read_bit(r);
-  return tx_mode;
-}
-#endif
 
 static void read_tx_mode_probs(struct tx_probs *tx_probs, vpx_reader *r) {
   int i, j;
@@ -120,13 +124,22 @@
 }
 
 static void read_inter_mode_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
-  int i, j;
+  int i;
+#if CONFIG_REF_MV
+  for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
+    vp10_diff_update_prob(r, &fc->newmv_prob[i]);
+  for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
+    vp10_diff_update_prob(r, &fc->zeromv_prob[i]);
+  for (i = 0; i < REFMV_MODE_CONTEXTS; ++i)
+    vp10_diff_update_prob(r, &fc->refmv_prob[i]);
+#else
+  int j;
   for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
     for (j = 0; j < INTER_MODES - 1; ++j)
       vp10_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
+#endif
 }
 
-#if CONFIG_MISC_FIXES
 static REFERENCE_MODE read_frame_reference_mode(const VP10_COMMON *cm,
     struct vpx_read_bit_buffer *rb) {
   if (is_compound_reference_allowed(cm)) {
@@ -137,47 +150,36 @@
     return SINGLE_REFERENCE;
   }
 }
-#else
-static REFERENCE_MODE read_frame_reference_mode(const VP10_COMMON *cm,
-                                                vpx_reader *r) {
-  if (is_compound_reference_allowed(cm)) {
-    return vpx_read_bit(r) ? (vpx_read_bit(r) ? REFERENCE_MODE_SELECT
-                                              : COMPOUND_REFERENCE)
-                           : SINGLE_REFERENCE;
-  } else {
-    return SINGLE_REFERENCE;
-  }
-}
-#endif
 
 static void read_frame_reference_mode_probs(VP10_COMMON *cm, vpx_reader *r) {
   FRAME_CONTEXT *const fc = cm->fc;
-  int i;
+  int i, j;
 
   if (cm->reference_mode == REFERENCE_MODE_SELECT)
     for (i = 0; i < COMP_INTER_CONTEXTS; ++i)
       vp10_diff_update_prob(r, &fc->comp_inter_prob[i]);
 
-  if (cm->reference_mode != COMPOUND_REFERENCE)
+  if (cm->reference_mode != COMPOUND_REFERENCE) {
     for (i = 0; i < REF_CONTEXTS; ++i) {
-      vp10_diff_update_prob(r, &fc->single_ref_prob[i][0]);
-      vp10_diff_update_prob(r, &fc->single_ref_prob[i][1]);
+      for (j = 0; j < (SINGLE_REFS - 1); ++j) {
+        vp10_diff_update_prob(r, &fc->single_ref_prob[i][j]);
+      }
     }
+  }
 
-  if (cm->reference_mode != SINGLE_REFERENCE)
-    for (i = 0; i < REF_CONTEXTS; ++i)
-      vp10_diff_update_prob(r, &fc->comp_ref_prob[i]);
+  if (cm->reference_mode != SINGLE_REFERENCE) {
+    for (i = 0; i < REF_CONTEXTS; ++i) {
+      for (j = 0; j < (COMP_REFS - 1); ++j) {
+        vp10_diff_update_prob(r, &fc->comp_ref_prob[i][j]);
+      }
+    }
+  }
 }
 
 static void update_mv_probs(vpx_prob *p, int n, vpx_reader *r) {
   int i;
   for (i = 0; i < n; ++i)
-#if CONFIG_MISC_FIXES
     vp10_diff_update_prob(r, &p[i]);
-#else
-    if (vpx_read(r, MV_UPDATE_PROB))
-      p[i] = (vpx_read_literal(r, 7) << 1) | 1;
-#endif
 }
 
 static void read_mv_probs(nmv_context *ctx, int allow_hp, vpx_reader *r) {
@@ -214,7 +216,7 @@
                                           uint8_t *dst, int stride,
                                           int eob, int block) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  TX_TYPE tx_type = get_tx_type(pd->plane_type, xd, block);
+  TX_TYPE tx_type = get_tx_type(pd->plane_type, xd, block, tx_size);
   const int seg_id = xd->mi[0]->mbmi.segment_id;
   if (eob > 0) {
     tran_low_t *const dqcoeff = pd->dqcoeff;
@@ -369,8 +371,8 @@
                           col, row, plane);
 
   if (!mbmi->skip) {
-    TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx);
-    const scan_order *sc = get_scan(tx_size, tx_type);
+    TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, tx_size);
+    const scan_order *sc = get_scan(tx_size, tx_type, 0);
     const int eob = vp10_decode_block_tokens(xd, plane, sc, col, row, tx_size,
                                              r, mbmi->segment_id);
     inverse_transform_block_intra(xd, plane, tx_type, tx_size,
@@ -378,14 +380,71 @@
   }
 }
 
+#if CONFIG_VAR_TX
+static void decode_reconstruct_tx(MACROBLOCKD *const xd, vpx_reader *r,
+                                  MB_MODE_INFO *const mbmi,
+                                  int plane, BLOCK_SIZE plane_bsize,
+                                  int block, int blk_row, int blk_col,
+                                  TX_SIZE tx_size, int *eob_total) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+  int tx_idx = (blk_row >> (1 - pd->subsampling_y)) * 8 +
+               (blk_col >> (1 - pd->subsampling_x));
+  TX_SIZE plane_tx_size = plane ?
+      get_uv_tx_size_impl(mbmi->inter_tx_size[tx_idx], bsize, 0, 0) :
+      mbmi->inter_tx_size[tx_idx];
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+
+  if (xd->mb_to_bottom_edge < 0)  // negative edge value lowers the row limit
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)  // likewise for the column limit
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;  // position lies at or beyond the adjusted limits
+
+  if (tx_size == plane_tx_size) {  // leaf: decode tokens and reconstruct
+    PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+    TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+    const scan_order *sc = get_scan(tx_size, tx_type, 1);
+    const int eob = vp10_decode_block_tokens(xd, plane, sc,
+                                             blk_col, blk_row, tx_size,
+                                             r, mbmi->segment_id);
+    inverse_transform_block_inter(xd, plane, tx_size,
+        &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col],
+        pd->dst.stride, eob, block);
+    *eob_total += eob;  // add this block's eob to the caller's total
+  } else {  // otherwise visit the four quadrants at tx_size - 1
+    int bsl = b_width_log2_lookup[bsize];
+    int i;
+
+    assert(bsl > 0);
+    --bsl;
+
+    for (i = 0; i < 4; ++i) {  // (i >> 1) picks the row, (i & 1) the column
+      const int offsetr = blk_row + ((i >> 1) << bsl);
+      const int offsetc = blk_col + ((i & 0x01) << bsl);
+      int step = 1 << (2 * (tx_size - 1));
+
+      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide)
+        continue;  // quadrant starts outside the adjusted limits
+
+      decode_reconstruct_tx(xd, r, mbmi, plane, plane_bsize, block + i * step,
+                            offsetr, offsetc, tx_size - 1, eob_total);
+    }
+  }
+}
+#endif  // CONFIG_VAR_TX
+
 static int reconstruct_inter_block(MACROBLOCKD *const xd, vpx_reader *r,
                                    MB_MODE_INFO *const mbmi, int plane,
                                    int row, int col, TX_SIZE tx_size) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
   int block_idx = (row << 1) + col;
-  TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx);
-  const scan_order *sc = get_scan(tx_size, tx_type);
+  TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, tx_size);
+  const scan_order *sc = get_scan(tx_size, tx_type, 1);
   const int eob = vp10_decode_block_tokens(xd, plane, sc, col, row, tx_size, r,
                                           mbmi->segment_id);
 
@@ -517,7 +576,9 @@
                     subpel_y, sf, w, h, ref, kernel, xs, ys);
   }
 }
+
 #else
+
 static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
                                int x0, int y0, int b_w, int b_h,
                                int frame_width, int frame_height,
@@ -555,6 +616,9 @@
   int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height,
       buf_stride, subpel_x, subpel_y;
   uint8_t *ref_frame, *buf_ptr;
+#if CONFIG_EXT_INTERP
+  const int i_filter = IsInterpolatingFilter(xd->mi[0]->mbmi.interp_filter);
+#endif  // CONFIG_EXT_INTERP
 
   // Get reference frame pointer, width and height.
   if (plane == 0) {
@@ -624,6 +688,9 @@
   // Do border extension if there is motion or the
   // width/height is not a multiple of 8 pixels.
   if (is_scaled || scaled_mv.col || scaled_mv.row ||
+#if CONFIG_EXT_INTERP
+      !i_filter ||
+#endif
       (frame_width & 0x7) || (frame_height & 0x7)) {
     int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
 
@@ -631,13 +698,21 @@
     int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
     int x_pad = 0, y_pad = 0;
 
-    if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
+    if (subpel_x ||
+#if CONFIG_EXT_INTERP
+        !i_filter ||
+#endif
+        (sf->x_step_q4 != SUBPEL_SHIFTS)) {
       x0 -= VP9_INTERP_EXTEND - 1;
       x1 += VP9_INTERP_EXTEND;
       x_pad = 1;
     }
 
-    if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
+    if (subpel_y ||
+#if CONFIG_EXT_INTERP
+        !i_filter ||
+#endif
+        (sf->y_step_q4 != SUBPEL_SHIFTS)) {
       y0 -= VP9_INTERP_EXTEND - 1;
       y1 += VP9_INTERP_EXTEND;
       y_pad = 1;
@@ -750,6 +825,49 @@
   }
 }
 
+static void dec_build_inter_predictors_sb_sub8x8(VP10Decoder *const pbi,
+                                                 MACROBLOCKD *xd,
+                                                 int mi_row, int mi_col,
+                                                 int block) {
+  // Inter prediction helper used by supertx: predicts with the motion
+  // vector of sub-8x8 block `block`, obtained via average_split_mvs().
+  int plane;
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+  const MODE_INFO *mi = xd->mi[0];
+  const InterpKernel *kernel = vp10_filter_kernels[mi->mbmi.interp_filter];
+  const int is_compound = has_second_ref(&mi->mbmi);
+
+  // The uv planes are predicted only for the first sub-block (block == 0);
+  // for any other block the plane loop stops after plane 0.
+  int max_plane = block ? 1 : MAX_MB_PLANE;
+
+  for (plane = 0; plane < max_plane; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    struct buf_2d *const dst_buf = &pd->dst;
+    const int num_4x4_w = pd->n4_w;
+    const int num_4x4_h = pd->n4_h;
+
+    const int n4w_x4 = 4 * num_4x4_w;
+    const int n4h_x4 = 4 * num_4x4_h;
+    int ref;
+
+    for (ref = 0; ref < 1 + is_compound; ++ref) {  // one or two references
+      const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+      struct buf_2d *const pre_buf = &pd->pre[ref];
+      const int idx = xd->block_refs[ref]->idx;
+      BufferPool *const pool = pbi->common.buffer_pool;
+      RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
+      const int is_scaled = vp10_is_scaled(sf);
+      const MV mv = average_split_mvs(pd, mi, ref, block);
+      dec_build_inter_predictors(pbi, xd, plane, n4w_x4, n4h_x4,
+                                 0, 0, n4w_x4, n4h_x4, mi_x, mi_y, kernel,
+                                 sf, pre_buf, dst_buf, &mv, ref_frame_buf,
+                                 is_scaled, ref);
+    }
+  }
+}
+
 static INLINE TX_SIZE dec_get_uv_tx_size(const MB_MODE_INFO *mbmi,
                                          int n4_wl, int n4_hl) {
   // get minimum log2 num4x4s dimension
@@ -799,6 +917,11 @@
 
   set_skip_context(xd, mi_row, mi_col);
 
+
+#if CONFIG_VAR_TX
+  xd->max_tx_size = max_txsize_lookup[bsize];
+#endif
+
   // Distance of Mb to the various image edges. These are specified to 8th pel
   // as they are always compared to values that are in 1/8th pel units
   set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
@@ -807,7 +930,588 @@
   return &xd->mi[0]->mbmi;
 }
 
+#if CONFIG_SUPERTX
+static MB_MODE_INFO *set_offsets_extend(VP10_COMMON *const cm,
+                                        MACROBLOCKD *const xd,
+                                        const TileInfo *const tile,
+                                        BLOCK_SIZE bsize_pred,
+                                        int mi_row_pred, int mi_col_pred,
+                                        int mi_row_ori, int mi_col_ori) {
+  // Supertx helper. Mode info is taken from (mi_row_ori, mi_col_ori), the
+  // location that supplies the mv, while set_mi_row_col() and set_plane_n4()
+  // use the region to predict: (mi_row_pred, mi_col_pred, bsize_pred).
+  const int bw = num_8x8_blocks_wide_lookup[bsize_pred];
+  const int bh = num_8x8_blocks_high_lookup[bsize_pred];
+  const int offset = mi_row_ori * cm->mi_stride + mi_col_ori;
+  const int bwl = b_width_log2_lookup[bsize_pred];
+  const int bhl = b_height_log2_lookup[bsize_pred];
+  xd->mi = cm->mi_grid_visible + offset;
+  xd->mi[0] = cm->mi + offset;
+  set_mi_row_col(xd, tile, mi_row_pred, bh, mi_col_pred, bw,
+                 cm->mi_rows, cm->mi_cols);
+  // Availability flags are derived from the original (mv) location.
+  xd->up_available    = (mi_row_ori != 0);
+  xd->left_available  = (mi_col_ori > tile->mi_col_start);
+
+  set_plane_n4(xd, bw, bh, bwl, bhl);
+
+  return &xd->mi[0]->mbmi;
+}
+
+static MB_MODE_INFO *set_mb_offsets(VP10_COMMON *const cm,
+                                    MACROBLOCKD *const xd,
+                                    BLOCK_SIZE bsize,
+                                    int mi_row, int mi_col,
+                                    int bw, int bh,
+                                    int x_mis, int y_mis) {
+  const int offset = mi_row * cm->mi_stride + mi_col;
+  const TileInfo *const tile = &xd->tile;
+  int x, y;
+  // Anchor xd->mi at (mi_row, mi_col) and record the block size there.
+  xd->mi = cm->mi_grid_visible + offset;
+  xd->mi[0] = cm->mi + offset;
+  xd->mi[0]->mbmi.sb_type = bsize;
+  for (y = 0; y < y_mis; ++y)
+    for (x = !y; x < x_mis; ++x)  // x = !y skips entry [0], set above
+      xd->mi[y * cm->mi_stride + x] = xd->mi[0];
+  // All x_mis * y_mis grid entries now alias the same MODE_INFO.
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+  return &xd->mi[0]->mbmi;
+}
+
+static void set_offsets_topblock(VP10_COMMON *const cm, MACROBLOCKD *const xd,
+                                 const TileInfo *const tile,
+                                 BLOCK_SIZE bsize, int mi_row, int mi_col) {
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  const int offset = mi_row * cm->mi_stride + mi_col;
+  const int bwl = b_width_log2_lookup[bsize];
+  const int bhl = b_height_log2_lookup[bsize];
+  // Anchor xd->mi at the mode info entry for (mi_row, mi_col).
+  xd->mi = cm->mi_grid_visible + offset;
+  xd->mi[0] = cm->mi + offset;
+
+  set_plane_n4(xd, bw, bh, bwl, bhl);
+
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+  // Destination planes are set up from get_frame_new_buffer(cm) at this spot.
+  vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
+}
+
+static void set_param_topblock(VP10_COMMON *const cm,  MACROBLOCKD *const xd,
+                               BLOCK_SIZE bsize, int mi_row, int mi_col,
+#if CONFIG_EXT_TX
+                               int txfm,
+#endif
+                               int skip) {
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);
+  const int offset = mi_row * cm->mi_stride + mi_col;
+  int x, y;
+  // x_mis/y_mis clamp the block extent to the frame's mi_cols/mi_rows.
+  xd->mi = cm->mi_grid_visible + offset;
+  xd->mi[0] = cm->mi + offset;
+  // Write skip (and tx_type under CONFIG_EXT_TX) to every covered entry.
+  for (y = 0; y < y_mis; ++y)
+    for (x = 0; x < x_mis; ++x) {
+      xd->mi[y * cm->mi_stride + x]->mbmi.skip = skip;
+#if CONFIG_EXT_TX
+      xd->mi[y * cm->mi_stride + x]->mbmi.tx_type = txfm;
+#endif
+    }
+}
+
+static void set_ref(VP10_COMMON *const cm, MACROBLOCKD *const xd,
+                    int idx, int mi_row, int mi_col) {
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  RefBuffer *ref_buffer = &cm->frame_refs[mbmi->ref_frame[idx] - LAST_FRAME];
+  xd->block_refs[idx] = ref_buffer;  // read back by the inter predictors
+  if (!vp10_is_valid_scale(&ref_buffer->sf))  // unsupported-bitstream error
+    vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                       "Invalid scale factors");
+  vp10_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col,
+                        &ref_buffer->sf);
+  xd->corrupted |= ref_buffer->buf->corrupted;  // propagate ref corruption
+}
+
+static void dec_predict_b_extend(
+    VP10Decoder *const pbi, MACROBLOCKD *const xd,
+    const TileInfo *const tile, int block,
+    int mi_row_ori, int mi_col_ori,
+    int mi_row_pred, int mi_col_pred,
+    int mi_row_top, int mi_col_top,
+    uint8_t * dst_buf[3], int dst_stride[3],
+    BLOCK_SIZE bsize_top,
+    BLOCK_SIZE bsize_pred,
+    int b_sub8x8, int bextend) {
+  // Supertx: inter-predict one region of the top block into dst_buf.
+  // (mi_row_ori, mi_col_ori): location whose mode info supplies the mv
+  // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+  // (mi_row_top, mi_col_top, bsize_top): region of the top partition size
+  // block: sub location of sub8x8 blocks
+  // b_sub8x8: 1: ori is sub8x8; 0: ori is not sub8x8
+  // bextend: 1: region to predict is an extension of ori; 0: not
+  int r = (mi_row_pred - mi_row_top) * MI_SIZE;
+  int c = (mi_col_pred - mi_col_top) * MI_SIZE;
+  const int mi_width_top = num_8x8_blocks_wide_lookup[bsize_top];
+  const int mi_height_top = num_8x8_blocks_high_lookup[bsize_top];
+  MB_MODE_INFO *mbmi;
+  VP10_COMMON *const cm = &pbi->common;
+  // Nothing to do when the region starts outside the top block or the frame.
+  if (mi_row_pred < mi_row_top || mi_col_pred < mi_col_top ||
+      mi_row_pred >= mi_row_top + mi_height_top ||
+      mi_col_pred >= mi_col_top + mi_width_top ||
+      mi_row_pred >= cm->mi_rows || mi_col_pred >= cm->mi_cols)
+    return;
+
+  mbmi = set_offsets_extend(cm, xd, tile, bsize_pred,
+                            mi_row_pred, mi_col_pred,
+                            mi_row_ori, mi_col_ori);
+  set_ref(cm, xd, 0, mi_row_pred, mi_col_pred);
+  if (has_second_ref(&xd->mi[0]->mbmi))
+    set_ref(cm, xd, 1, mi_row_pred, mi_col_pred);
+  // Only a non-extension prediction overwrites the block's tx_size.
+  if (!bextend) {
+    mbmi->tx_size = b_width_log2_lookup[bsize_top];
+  }
+  // Redirect dst to the caller's buffers at (r, c), scaled by subsampling.
+  xd->plane[0].dst.stride = dst_stride[0];
+  xd->plane[1].dst.stride = dst_stride[1];
+  xd->plane[2].dst.stride = dst_stride[2];
+  xd->plane[0].dst.buf = dst_buf[0] +
+                         (r >> xd->plane[0].subsampling_y) * dst_stride[0] +
+                         (c >> xd->plane[0].subsampling_x);
+  xd->plane[1].dst.buf = dst_buf[1] +
+                         (r >> xd->plane[1].subsampling_y) * dst_stride[1] +
+                         (c >> xd->plane[1].subsampling_x);
+  xd->plane[2].dst.buf = dst_buf[2] +
+                         (r >> xd->plane[2].subsampling_y) * dst_stride[2] +
+                         (c >> xd->plane[2].subsampling_x);
+  // Sub8x8 sources use the per-sub-block predictor selected by 'block'.
+  if (!b_sub8x8)
+    dec_build_inter_predictors_sb(pbi, xd, mi_row_pred, mi_col_pred);
+  else
+    dec_build_inter_predictors_sb_sub8x8(pbi, xd, mi_row_pred, mi_col_pred,
+                                         block);
+}
+
+static void dec_extend_dir(VP10Decoder *const pbi, MACROBLOCKD *const xd,
+                           const TileInfo *const tile, int block,
+                           BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+                           int mi_row, int mi_col,
+                           int mi_row_top, int mi_col_top,
+                           uint8_t * dst_buf[3], int dst_stride[3], int dir) {
+  // dir selects the neighbouring region: 0-lower, 1-upper, 2-left, 3-right,
+  //      4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  int xss = xd->plane[1].subsampling_x;
+  int yss = xd->plane[1].subsampling_y;
+  int b_sub8x8 = (bsize < BLOCK_8X8) ? 1 : 0;
+  BLOCK_SIZE extend_bsize;
+  int unit, mi_row_pred, mi_col_pred;
+  // Lower/upper: predict a row of extend_bsize units along the block width.
+  if (dir == 0 || dir == 1) {
+    extend_bsize = (mi_width == 1 || bsize < BLOCK_8X8 || xss < yss) ?
+                    BLOCK_8X8 : BLOCK_16X8;
+    unit = num_8x8_blocks_wide_lookup[extend_bsize];
+    mi_row_pred = mi_row + ((dir == 0) ? mi_height : -1);
+    mi_col_pred = mi_col;
+
+    dec_predict_b_extend(pbi, xd, tile, block, mi_row, mi_col,
+                         mi_row_pred, mi_col_pred,
+                         mi_row_top, mi_col_top,
+                         dst_buf, dst_stride,
+                         top_bsize, extend_bsize, b_sub8x8, 1);
+    // Remaining units across the width, if the block is wider than one.
+    if (mi_width > unit) {
+      int i;
+      assert(!b_sub8x8);
+      for (i = 0; i < mi_width/unit - 1; i++) {
+        mi_col_pred += unit;
+        dec_predict_b_extend(pbi, xd, tile, block, mi_row, mi_col,
+                             mi_row_pred, mi_col_pred,
+                             mi_row_top, mi_col_top,
+                             dst_buf, dst_stride,
+                             top_bsize, extend_bsize, b_sub8x8, 1);
+      }
+    }
+  } else if (dir == 2 || dir == 3) {  // left/right column
+    extend_bsize = (mi_height == 1 || bsize < BLOCK_8X8 || yss < xss) ?
+                    BLOCK_8X8 : BLOCK_8X16;
+    unit = num_8x8_blocks_high_lookup[extend_bsize];
+    mi_row_pred = mi_row;
+    mi_col_pred = mi_col + ((dir == 3) ? mi_width : -1);
+
+    dec_predict_b_extend(pbi, xd, tile, block, mi_row, mi_col,
+                         mi_row_pred, mi_col_pred,
+                         mi_row_top, mi_col_top,
+                         dst_buf, dst_stride,
+                         top_bsize, extend_bsize, b_sub8x8, 1);
+    // Remaining units down the height, if the block is taller than one.
+    if (mi_height > unit) {
+      int i;
+      for (i = 0; i < mi_height/unit - 1; i++) {
+        mi_row_pred += unit;
+        dec_predict_b_extend(pbi, xd, tile, block, mi_row, mi_col,
+                             mi_row_pred, mi_col_pred,
+                             mi_row_top, mi_col_top,
+                             dst_buf, dst_stride,
+                             top_bsize, extend_bsize, b_sub8x8, 1);
+      }
+    }
+  } else {
+    extend_bsize = BLOCK_8X8;  // corners: a single 8x8 region
+    mi_row_pred = mi_row + ((dir == 4 || dir == 6) ? mi_height : -1);
+    mi_col_pred = mi_col + ((dir == 6 || dir == 7) ? mi_width : -1);
+    dec_predict_b_extend(pbi, xd, tile, block, mi_row, mi_col,
+                         mi_row_pred, mi_col_pred,
+                         mi_row_top, mi_col_top,
+                         dst_buf, dst_stride,
+                         top_bsize, extend_bsize, b_sub8x8, 1);
+  }
+}
+
+static void dec_extend_all(VP10Decoder *const pbi, MACROBLOCKD *const xd,
+                           const TileInfo *const tile, int block,
+                           BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+                           int mi_row, int mi_col,
+                           int mi_row_top, int mi_col_top,
+                           uint8_t * dst_buf[3], int dst_stride[3]) {
+  dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 0);  // lower
+  dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 1);  // upper
+  dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 2);  // left
+  dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 3);  // right
+  dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 4);  // lowerleft
+  dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 5);  // upperleft
+  dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 6);  // lowerright
+  dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 7);  // upperright
+}
+
+static void dec_predict_sb_complex(VP10Decoder *const pbi,
+                                   MACROBLOCKD *const xd,
+                                   const TileInfo *const tile,
+                                   int mi_row, int mi_col,
+                                   int mi_row_top, int mi_col_top,
+                                   BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+                                   uint8_t *dst_buf[3], int dst_stride[3]) {
+  VP10_COMMON *const cm = &pbi->common;
+  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize;
+  MB_MODE_INFO *mbmi;
+  int i, offset = mi_row * cm->mi_stride + mi_col;
+  uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
+  // Scratch planes for the 2nd, 3rd and 4th sub-predictions before blending.
+  DECLARE_ALIGNED(16, uint8_t,
+                  tmp_buf1[MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * 2]);
+  DECLARE_ALIGNED(16, uint8_t,
+                  tmp_buf2[MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * 2]);
+  DECLARE_ALIGNED(16, uint8_t,
+                  tmp_buf3[MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * 2]);
+  int dst_stride1[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN};
+  int dst_stride2[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN};
+  int dst_stride3[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN};
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    int len = sizeof(uint16_t);
+    dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
+    dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAXTXLEN * MAXTXLEN * len);
+    dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAXTXLEN * MAXTXLEN * len);
+    dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
+    dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAXTXLEN * MAXTXLEN * len);
+    dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAXTXLEN * MAXTXLEN * len);
+    dst_buf3[0] = CONVERT_TO_BYTEPTR(tmp_buf3);
+    dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAXTXLEN * MAXTXLEN * len);
+    dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAXTXLEN * MAXTXLEN * len);
+  } else {
+#endif
+    dst_buf1[0] = tmp_buf1;
+    dst_buf1[1] = tmp_buf1 + MAXTXLEN * MAXTXLEN;
+    dst_buf1[2] = tmp_buf1 + 2 * MAXTXLEN * MAXTXLEN;
+    dst_buf2[0] = tmp_buf2;
+    dst_buf2[1] = tmp_buf2 + MAXTXLEN * MAXTXLEN;
+    dst_buf2[2] = tmp_buf2 + 2 * MAXTXLEN * MAXTXLEN;
+    dst_buf3[0] = tmp_buf3;
+    dst_buf3[1] = tmp_buf3 + MAXTXLEN * MAXTXLEN;
+    dst_buf3[2] = tmp_buf3 + 2 * MAXTXLEN * MAXTXLEN;
+#if CONFIG_VP9_HIGHBITDEPTH
+  }
+#endif
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+  // Recover the partition type from the sb_type stored at this position.
+  xd->mi = cm->mi_grid_visible + offset;
+  xd->mi[0] = cm->mi + offset;
+  mbmi = &xd->mi[0]->mbmi;
+  partition = partition_lookup[bsl][mbmi->sb_type];
+  subsize = get_subsize(bsize, partition);
+
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    xd->plane[i].dst.buf = dst_buf[i];
+    xd->plane[i].dst.stride = dst_stride[i];
+  }
+
+  switch (partition) {
+    case PARTITION_NONE:
+      assert(bsize < top_bsize);
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, dst_buf, dst_stride,
+                           top_bsize, bsize, 0, 0);
+      dec_extend_all(pbi, xd, tile, 0, bsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, dst_buf, dst_stride);
+      break;
+    case PARTITION_HORZ:
+      if (bsize == BLOCK_8X8) {
+        // For sub8x8, predict in 8x8 unit
+        // First half
+        dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf, dst_stride,
+                             top_bsize, BLOCK_8X8, 1, 0);
+        if (bsize < top_bsize)
+          dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+        // Second half
+        dec_predict_b_extend(pbi, xd, tile, 2, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                             top_bsize, BLOCK_8X8, 1, 1);
+        if (bsize < top_bsize)
+          dec_extend_all(pbi, xd, tile, 2, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+        // weighted average to smooth the boundary
+        xd->plane[0].dst.buf = dst_buf[0];
+        xd->plane[0].dst.stride = dst_stride[0];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[0], dst_stride[0],
+                                                  dst_buf1[0], dst_stride1[0],
+                                                  &xd->plane[0],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_HORZ, 0);
+      } else {
+        // First half
+        dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf, dst_stride,
+                             top_bsize, subsize, 0, 0);
+        if (bsize < top_bsize)
+          dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride);
+        else
+          dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride, 0);
+
+        if (mi_row + hbs < cm->mi_rows) {
+          // Second half
+          dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col,
+                               mi_row + hbs, mi_col,
+                               mi_row_top, mi_col_top,
+                               dst_buf1, dst_stride1,
+                               top_bsize, subsize, 0, 0);
+          if (bsize < top_bsize)
+            dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize,
+                           mi_row + hbs, mi_col,
+                           mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1);
+          else
+            dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize,
+                           mi_row + hbs, mi_col,
+                           mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, 1);
+
+          // weighted average to smooth the boundary
+          for (i = 0; i < MAX_MB_PLANE; i++) {
+            xd->plane[i].dst.buf = dst_buf[i];
+            xd->plane[i].dst.stride = dst_stride[i];
+            vp10_build_masked_inter_predictor_complex(
+                xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+                &xd->plane[i], mi_row, mi_col, mi_row_top, mi_col_top,
+                bsize, top_bsize, PARTITION_HORZ, i);
+          }
+        }
+      }
+      break;
+    case PARTITION_VERT:
+      if (bsize == BLOCK_8X8) {
+        // First half
+        dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf, dst_stride,
+                             top_bsize, BLOCK_8X8, 1, 0);
+        if (bsize < top_bsize)
+          dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+        // Second half
+        dec_predict_b_extend(pbi, xd, tile, 1, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                             top_bsize, BLOCK_8X8, 1, 1);
+        if (bsize < top_bsize)
+          dec_extend_all(pbi, xd, tile, 1, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+        // weighted average to smooth the boundary
+        xd->plane[0].dst.buf = dst_buf[0];
+        xd->plane[0].dst.stride = dst_stride[0];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[0], dst_stride[0],
+                                                  dst_buf1[0], dst_stride1[0],
+                                                  &xd->plane[0],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_VERT, 0);
+      } else {
+        // First half
+        dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf, dst_stride,
+                             top_bsize, subsize, 0, 0);
+        if (bsize < top_bsize)
+          dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride);
+        else
+          dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride, 3);
+
+        // Second half
+        if (mi_col + hbs < cm->mi_cols) {
+          dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs,
+                               mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                               dst_buf1, dst_stride1, top_bsize, subsize, 0, 0);
+          if (bsize < top_bsize)
+            dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize,
+                           mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1);
+          else
+            dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize,
+                           mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, 2);
+
+          // weighted average to smooth the boundary
+          for (i = 0; i < MAX_MB_PLANE; i++) {
+            xd->plane[i].dst.buf = dst_buf[i];
+            xd->plane[i].dst.stride = dst_stride[i];
+            vp10_build_masked_inter_predictor_complex(
+                xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+                &xd->plane[i], mi_row, mi_col, mi_row_top, mi_col_top,
+                bsize, top_bsize, PARTITION_VERT, i);
+          }
+        }
+      }
+      break;
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_8X8) {
+        dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf, dst_stride,
+                             top_bsize, BLOCK_8X8, 1, 0);
+        dec_predict_b_extend(pbi, xd, tile, 1, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                             top_bsize, BLOCK_8X8, 1, 1);
+        dec_predict_b_extend(pbi, xd, tile, 2, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+                             top_bsize, BLOCK_8X8, 1, 1);
+        dec_predict_b_extend(pbi, xd, tile, 3, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf3, dst_stride3,
+                             top_bsize, BLOCK_8X8, 1, 1);
+        if (bsize < top_bsize) {
+          dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride);
+          dec_extend_all(pbi, xd, tile, 1, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+          dec_extend_all(pbi, xd, tile, 2, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf2, dst_stride2);
+          dec_extend_all(pbi, xd, tile, 3, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf3, dst_stride3);
+        }
+      } else {
+        dec_predict_sb_complex(pbi, xd, tile, mi_row, mi_col,
+                               mi_row_top, mi_col_top, subsize, top_bsize,
+                               dst_buf, dst_stride);
+        if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+          dec_predict_sb_complex(pbi, xd, tile, mi_row, mi_col + hbs,
+                                 mi_row_top, mi_col_top, subsize, top_bsize,
+                                 dst_buf1, dst_stride1);
+        if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
+          dec_predict_sb_complex(pbi, xd, tile, mi_row + hbs, mi_col,
+                                 mi_row_top, mi_col_top, subsize, top_bsize,
+                                 dst_buf2, dst_stride2);
+        if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+          dec_predict_sb_complex(pbi, xd, tile, mi_row + hbs, mi_col + hbs,
+                                 mi_row_top, mi_col_top, subsize, top_bsize,
+                                 dst_buf3, dst_stride3);
+      }
+        for (i = 0; i < MAX_MB_PLANE; i++) {
+          if (bsize == BLOCK_8X8 && i != 0)
+            continue;  // Skip <4x4 chroma smoothing
+          if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
+            vp10_build_masked_inter_predictor_complex(xd,
+                                                      dst_buf[i], dst_stride[i],
+                                                      dst_buf1[i],
+                                                      dst_stride1[i],
+                                                      &xd->plane[i],
+                                                      mi_row, mi_col,
+                                                      mi_row_top, mi_col_top,
+                                                      bsize, top_bsize,
+                                                      PARTITION_VERT, i);
+            if (mi_row + hbs < cm->mi_rows) {
+              vp10_build_masked_inter_predictor_complex(xd,
+                                                        dst_buf2[i],
+                                                        dst_stride2[i],
+                                                        dst_buf3[i],
+                                                        dst_stride3[i],
+                                                        &xd->plane[i],
+                                                        mi_row, mi_col,
+                                                        mi_row_top, mi_col_top,
+                                                        bsize, top_bsize,
+                                                        PARTITION_VERT, i);
+              vp10_build_masked_inter_predictor_complex(xd,
+                                                        dst_buf[i],
+                                                        dst_stride[i],
+                                                        dst_buf2[i],
+                                                        dst_stride2[i],
+                                                        &xd->plane[i],
+                                                        mi_row, mi_col,
+                                                        mi_row_top, mi_col_top,
+                                                        bsize, top_bsize,
+                                                        PARTITION_HORZ, i);
+            }
+          } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) {
+            vp10_build_masked_inter_predictor_complex(xd,
+                                                      dst_buf[i],
+                                                      dst_stride[i],
+                                                      dst_buf2[i],
+                                                      dst_stride2[i],
+                                                      &xd->plane[i],
+                                                      mi_row, mi_col,
+                                                      mi_row_top, mi_col_top,
+                                                      bsize, top_bsize,
+                                                      PARTITION_HORZ, i);
+          }
+        }
+      break;
+    default:
+      assert(0);
+  }
+}
+#endif  // CONFIG_SUPERTX
+
 static void decode_block(VP10Decoder *const pbi, MACROBLOCKD *const xd,
+#if CONFIG_SUPERTX
+                         int supertx_enabled,
+#endif  // CONFIG_SUPERTX
                          int mi_row, int mi_col,
                          vpx_reader *r, BLOCK_SIZE bsize,
                          int bwl, int bhl) {
@@ -818,8 +1522,22 @@
   const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
   const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);
 
+#if CONFIG_SUPERTX
+  MB_MODE_INFO *mbmi;
+  if (supertx_enabled) {
+    mbmi = set_mb_offsets(cm, xd, bsize, mi_row, mi_col,
+                          bw, bh, x_mis, y_mis);
+  } else {
+    mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col,
+                       bw, bh, x_mis, y_mis, bwl, bhl);
+  }
+  vp10_read_mode_info(pbi, xd, supertx_enabled,
+                      mi_row, mi_col, r, x_mis, y_mis);
+#else
   MB_MODE_INFO *mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col,
                                    bw, bh, x_mis, y_mis, bwl, bhl);
+  vp10_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
+#endif  // CONFIG_SUPERTX
 
   if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) {
     const BLOCK_SIZE uv_subsize =
@@ -829,70 +1547,92 @@
                          VPX_CODEC_CORRUPT_FRAME, "Invalid block size.");
   }
 
-  vp10_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
-
-  if (mbmi->skip) {
-    dec_reset_skip_context(xd);
-  }
-
-  if (!is_inter_block(mbmi)) {
-    int plane;
-    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const TX_SIZE tx_size =
-          plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
-                  : mbmi->tx_size;
-      const int num_4x4_w = pd->n4_w;
-      const int num_4x4_h = pd->n4_h;
-      const int step = (1 << tx_size);
-      int row, col;
-      const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ?
-          0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-      const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
-          0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-
-      for (row = 0; row < max_blocks_high; row += step)
-        for (col = 0; col < max_blocks_wide; col += step)
-          predict_and_reconstruct_intra_block(xd, r, mbmi, plane,
-                                              row, col, tx_size);
+#if CONFIG_SUPERTX
+  if (!supertx_enabled) {
+#endif
+    if (mbmi->skip) {
+      dec_reset_skip_context(xd);
     }
-  } else {
-    // Prediction
-    dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col);
-
-    // Reconstruction
-    if (!mbmi->skip) {
-      int eobtotal = 0;
+    if (!is_inter_block(mbmi)) {
       int plane;
-
       for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
         const struct macroblockd_plane *const pd = &xd->plane[plane];
         const TX_SIZE tx_size =
             plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
-                    : mbmi->tx_size;
+            : mbmi->tx_size;
         const int num_4x4_w = pd->n4_w;
         const int num_4x4_h = pd->n4_h;
         const int step = (1 << tx_size);
         int row, col;
-        const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ?
-            0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-        const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
-            0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+        const int max_blocks_wide = num_4x4_w +
+            (xd->mb_to_right_edge >= 0 ?
+             0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+        const int max_blocks_high = num_4x4_h +
+            (xd->mb_to_bottom_edge >= 0 ?
+             0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
 
         for (row = 0; row < max_blocks_high; row += step)
           for (col = 0; col < max_blocks_wide; col += step)
-            eobtotal += reconstruct_inter_block(xd, r, mbmi, plane, row, col,
-                                                tx_size);
+            predict_and_reconstruct_intra_block(xd, r, mbmi, plane,
+                                                row, col, tx_size);
       }
+    } else {
+      // Prediction
+      dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col);
 
-      if (!less8x8 && eobtotal == 0)
-#if CONFIG_MISC_FIXES
-        mbmi->has_no_coeffs = 1;  // skip loopfilter
+      // Reconstruction
+      if (!mbmi->skip) {
+        int eobtotal = 0;
+        int plane;
+
+        for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+          const struct macroblockd_plane *const pd = &xd->plane[plane];
+          const int num_4x4_w = pd->n4_w;
+          const int num_4x4_h = pd->n4_h;
+          int row, col;
+#if CONFIG_VAR_TX
+          // TODO(jingning): This can be simplified for decoder performance.
+          const BLOCK_SIZE plane_bsize =
+              get_plane_block_size(VPXMAX(bsize, BLOCK_8X8), pd);
+          const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+          const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+          int bw = num_4x4_blocks_wide_lookup[txb_size];
+          int block = 0;
+          const int step = 1 << (max_tx_size << 1);
+
+          for (row = 0; row < num_4x4_h; row += bw) {
+            for (col = 0; col < num_4x4_w; col += bw) {
+              decode_reconstruct_tx(xd, r, mbmi, plane, plane_bsize,
+                                    block, row, col, max_tx_size, &eobtotal);
+              block += step;
+            }
+          }
 #else
-        mbmi->skip = 1;  // skip loopfilter
+          const TX_SIZE tx_size =
+              plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
+              : mbmi->tx_size;
+          const int step = (1 << tx_size);
+          const int max_blocks_wide = num_4x4_w +
+              (xd->mb_to_right_edge >= 0 ?
+               0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+          const int max_blocks_high = num_4x4_h +
+              (xd->mb_to_bottom_edge >= 0 ?
+               0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+          for (row = 0; row < max_blocks_high; row += step)
+            for (col = 0; col < max_blocks_wide; col += step)
+              eobtotal += reconstruct_inter_block(xd, r, mbmi, plane, row, col,
+                                                  tx_size);
 #endif
+        }
+
+        if (!less8x8 && eobtotal == 0)
+          mbmi->has_no_coeffs = 1;  // skip loopfilter
+      }
     }
+#if CONFIG_SUPERTX
   }
+#endif  // CONFIG_SUPERTX
 
   xd->corrupted |= vpx_reader_has_error(r);
 }
@@ -946,8 +1686,23 @@
   return p;
 }
 
+#if CONFIG_SUPERTX
+static int read_skip_without_seg(VP10_COMMON *cm, const MACROBLOCKD *xd,
+                                 vpx_reader *r) {  // no segment lookup here
+  const int ctx = vp10_get_skip_context(xd);  // picks the skip prob
+  const int skip = vpx_read(r, cm->fc->skip_probs[ctx]);
+  FRAME_COUNTS *counts = xd->counts;
+  if (counts)  // tally only when a counts buffer is attached
+    ++counts->skip[ctx][skip];
+  return skip;  // the bit just read
+}
+#endif  // CONFIG_SUPERTX
+
 // TODO(slavarnway): eliminate bsize and subsize in future commits
 static void decode_partition(VP10Decoder *const pbi, MACROBLOCKD *const xd,
+#if CONFIG_SUPERTX
+                             int supertx_enabled,
+#endif
                              int mi_row, int mi_col,
                              vpx_reader* r, BLOCK_SIZE bsize, int n4x4_l2) {
   VP10_COMMON *const cm = &pbi->common;
@@ -958,6 +1713,15 @@
   BLOCK_SIZE subsize;
   const int has_rows = (mi_row + hbs) < cm->mi_rows;
   const int has_cols = (mi_col + hbs) < cm->mi_cols;
+#if CONFIG_SUPERTX
+  const int read_token = !supertx_enabled;
+  int skip = 0;
+  TX_SIZE supertx_size = b_width_log2_lookup[bsize];
+  const TileInfo *const tile = &xd->tile;
+#if CONFIG_EXT_TX
+  int txfm = DCT_DCT;
+#endif  // CONFIG_EXT_TX
+#endif  // CONFIG_SUPERTX
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
@@ -965,40 +1729,169 @@
   partition = read_partition(cm, xd, mi_row, mi_col, r, has_rows, has_cols,
                              n8x8_l2);
   subsize = subsize_lookup[partition][bsize];  // get_subsize(bsize, partition);
+#if CONFIG_SUPERTX
+  if (!frame_is_intra_only(cm) &&
+      partition != PARTITION_NONE &&
+      bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+      !supertx_enabled &&
+      !xd->lossless[0]) {
+    const int supertx_context =
+        partition_supertx_context_lookup[partition];
+    supertx_enabled = vpx_read(
+        r, cm->fc->supertx_prob[supertx_context][supertx_size]);
+    if (xd->counts)
+      xd->counts->supertx[supertx_context][supertx_size][supertx_enabled]++;
+  }
+  if (supertx_enabled && read_token) {
+    int offset = mi_row * cm->mi_stride + mi_col;
+    xd->mi = cm->mi_grid_visible + offset;
+    xd->mi[0] = cm->mi + offset;
+    set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[bsize],
+                   mi_col, num_8x8_blocks_wide_lookup[bsize],
+                   cm->mi_rows, cm->mi_cols);
+    set_skip_context(xd, mi_row, mi_col);
+    // Here skip is read without using any segment level feature
+    skip = read_skip_without_seg(cm, xd, r);
+    if (skip)
+      reset_skip_context(xd, bsize);
+#if CONFIG_EXT_TX
+    if (!skip) {
+      if (get_ext_tx_types(supertx_size, bsize, 1) > 1) {
+        int eset = get_ext_tx_set(supertx_size, bsize, 1);
+        if (eset > 0) {
+          txfm = vpx_read_tree(r, vp10_ext_tx_inter_tree[eset],
+                               cm->fc->inter_ext_tx_prob[eset][supertx_size]);
+          if (xd->counts)
+            ++xd->counts->inter_ext_tx[eset][supertx_size][txfm];
+        }
+      }
+    }
+#endif  // CONFIG_EXT_TX
+  }
+#endif  // CONFIG_SUPERTX
   if (!hbs) {
     // calculate bmode block dimensions (log 2)
     xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
     xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
-    decode_block(pbi, xd, mi_row, mi_col, r, subsize, 1, 1);
+    decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                 supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                 mi_row, mi_col, r, subsize, 1, 1);
   } else {
     switch (partition) {
       case PARTITION_NONE:
-        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n4x4_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                     mi_row, mi_col, r, subsize, n4x4_l2, n4x4_l2);
         break;
       case PARTITION_HORZ:
-        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n8x8_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                     mi_row, mi_col, r, subsize, n4x4_l2, n8x8_l2);
         if (has_rows)
-          decode_block(pbi, xd, mi_row + hbs, mi_col, r, subsize, n4x4_l2,
-                       n8x8_l2);
+          decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                       supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                       mi_row + hbs, mi_col, r, subsize, n4x4_l2, n8x8_l2);
         break;
       case PARTITION_VERT:
-        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2, n4x4_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                     mi_row, mi_col, r, subsize, n8x8_l2, n4x4_l2);
         if (has_cols)
-          decode_block(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2,
-                       n4x4_l2);
+          decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                       supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                       mi_row, mi_col + hbs, r, subsize, n8x8_l2, n4x4_l2);
         break;
       case PARTITION_SPLIT:
-        decode_partition(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2);
-        decode_partition(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2);
-        decode_partition(pbi, xd, mi_row + hbs, mi_col, r, subsize, n8x8_l2);
-        decode_partition(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize,
-                         n8x8_l2);
+        decode_partition(pbi, xd,
+#if CONFIG_SUPERTX
+                         supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                         mi_row, mi_col, r, subsize, n8x8_l2);
+        decode_partition(pbi, xd,
+#if CONFIG_SUPERTX
+                         supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                         mi_row, mi_col + hbs, r, subsize, n8x8_l2);
+        decode_partition(pbi, xd,
+#if CONFIG_SUPERTX
+                         supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                         mi_row + hbs, mi_col, r, subsize, n8x8_l2);
+        decode_partition(pbi, xd,
+#if CONFIG_SUPERTX
+                         supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                         mi_row + hbs, mi_col + hbs, r, subsize, n8x8_l2);
         break;
       default:
         assert(0 && "Invalid partition type");
     }
   }
 
+#if CONFIG_SUPERTX
+  if (supertx_enabled && read_token) {
+    uint8_t *dst_buf[3];
+    int dst_stride[3], i;
+
+    vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
+    for (i = 0; i < MAX_MB_PLANE; i++) {
+      dst_buf[i] = xd->plane[i].dst.buf;
+      dst_stride[i] = xd->plane[i].dst.stride;
+    }
+    dec_predict_sb_complex(pbi, xd, tile, mi_row, mi_col, mi_row, mi_col,
+                           bsize, bsize, dst_buf, dst_stride);
+
+    if (!skip) {
+      int eobtotal = 0;
+      MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+      set_offsets_topblock(cm, xd, tile, bsize, mi_row, mi_col);
+#if CONFIG_EXT_TX
+      xd->mi[0]->mbmi.tx_type = txfm;
+#endif
+      for (i = 0; i < MAX_MB_PLANE; ++i) {
+        const struct macroblockd_plane *const pd = &xd->plane[i];
+        const int num_4x4_w = pd->n4_w;
+        const int num_4x4_h = pd->n4_h;
+        int row, col;
+        const TX_SIZE tx_size =
+            i ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
+            : mbmi->tx_size;
+        const int step = (1 << tx_size);
+        const int max_blocks_wide = num_4x4_w +
+            (xd->mb_to_right_edge >= 0 ?
+             0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+        const int max_blocks_high = num_4x4_h +
+            (xd->mb_to_bottom_edge >= 0 ?
+             0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+        for (row = 0; row < max_blocks_high; row += step)
+          for (col = 0; col < max_blocks_wide; col += step)
+            eobtotal += reconstruct_inter_block(xd, r, mbmi, i, row, col,
+                                                tx_size);
+      }
+      if (!(subsize < BLOCK_8X8) && eobtotal == 0)
+        skip = 1;
+    }
+    set_param_topblock(cm, xd, bsize, mi_row, mi_col,
+#if CONFIG_EXT_TX
+                       txfm,
+#endif
+                       skip);
+  }
+#endif  // CONFIG_SUPERTX
+
   // update partition context
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
@@ -1048,9 +1941,6 @@
 static void setup_segmentation(VP10_COMMON *const cm,
                                struct vpx_read_bit_buffer *rb) {
   struct segmentation *const seg = &cm->seg;
-#if !CONFIG_MISC_FIXES
-  struct segmentation_probs *const segp = &cm->segp;
-#endif
   int i, j;
 
   seg->update_map = 0;
@@ -1067,26 +1957,11 @@
     seg->update_map = vpx_rb_read_bit(rb);
   }
   if (seg->update_map) {
-#if !CONFIG_MISC_FIXES
-    for (i = 0; i < SEG_TREE_PROBS; i++)
-      segp->tree_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8)
-                                                : MAX_PROB;
-#endif
     if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
       seg->temporal_update = 0;
     } else {
       seg->temporal_update = vpx_rb_read_bit(rb);
     }
-#if !CONFIG_MISC_FIXES
-    if (seg->temporal_update) {
-      for (i = 0; i < PREDICTION_PROBS; i++)
-        segp->pred_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8)
-                                                  : MAX_PROB;
-    } else {
-      for (i = 0; i < PREDICTION_PROBS; i++)
-        segp->pred_probs[i] = MAX_PROB;
-    }
-#endif
   }
 
   // Segmentation data update
@@ -1140,7 +2015,7 @@
 
 static INLINE int read_delta_q(struct vpx_read_bit_buffer *rb) {
   return vpx_rb_read_bit(rb) ?
-      vpx_rb_read_inv_signed_literal(rb, CONFIG_MISC_FIXES ? 6 : 4) : 0;
+      vpx_rb_read_inv_signed_literal(rb, 6) : 0;
 }
 
 static void setup_quantization(VP10_COMMON *const cm,
@@ -1180,7 +2055,8 @@
 }
 
 static INTERP_FILTER read_interp_filter(struct vpx_read_bit_buffer *rb) {
-  return vpx_rb_read_bit(rb) ? SWITCHABLE : vpx_rb_read_literal(rb, 2);
+  return vpx_rb_read_bit(rb) ?
+      SWITCHABLE : vpx_rb_read_literal(rb, 2 + CONFIG_EXT_INTERP);
 }
 
 static void setup_render_size(VP10_COMMON *cm,
@@ -1283,10 +2159,8 @@
       YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
       width = buf->y_crop_width;
       height = buf->y_crop_height;
-#if CONFIG_MISC_FIXES
       cm->render_width = buf->render_width;
       cm->render_height = buf->render_height;
-#endif
       found = 1;
       break;
     }
@@ -1294,9 +2168,7 @@
 
   if (!found) {
     vp10_read_frame_size(rb, &width, &height);
-#if CONFIG_MISC_FIXES
     setup_render_size(cm, rb);
-#endif
   }
 
   if (width <= 0 || height <= 0)
@@ -1328,9 +2200,6 @@
   }
 
   resize_context_buffers(cm, width, height);
-#if !CONFIG_MISC_FIXES
-  setup_render_size(cm, rb);
-#endif
 
   lock_buffer_pool(pool);
   if (vpx_realloc_frame_buffer(
@@ -1377,14 +2246,10 @@
   if (cm->log2_tile_rows)
     cm->log2_tile_rows += vpx_rb_read_bit(rb);
 
-#if CONFIG_MISC_FIXES
   // tile size magnitude
   if (cm->log2_tile_rows > 0 || cm->log2_tile_cols > 0) {
     cm->tile_sz_mag = vpx_rb_read_literal(rb, 2);
   }
-#else
-  cm->tile_sz_mag = 3;
-#endif
 }
 
 typedef struct TileBuffer {
@@ -1428,9 +2293,9 @@
     if (decrypt_cb) {
       uint8_t be_data[4];
       decrypt_cb(decrypt_state, *data, be_data, tile_sz_mag + 1);
-      size = mem_get_varsize(be_data, tile_sz_mag) + CONFIG_MISC_FIXES;
+      size = mem_get_varsize(be_data, tile_sz_mag) + 1;
     } else {
-      size = mem_get_varsize(*data, tile_sz_mag) + CONFIG_MISC_FIXES;
+      size = mem_get_varsize(*data, tile_sz_mag) + 1;
     }
     *data += tile_sz_mag + 1;
 
@@ -1508,6 +2373,11 @@
   memset(cm->above_seg_context, 0,
          sizeof(*cm->above_seg_context) * aligned_cols);
 
+#if CONFIG_VAR_TX
+  memset(cm->above_txfm_context, 0,
+         sizeof(*cm->above_txfm_context) * aligned_cols);
+#endif
+
   get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
 
   if (pbi->tile_data == NULL ||
@@ -1554,16 +2424,24 @@
         vp10_tile_set_col(&tile, tile_data->cm, col);
         vp10_zero(tile_data->xd.left_context);
         vp10_zero(tile_data->xd.left_seg_context);
+#if CONFIG_VAR_TX
+        vp10_zero(tile_data->xd.left_txfm_context_buffer);
+#endif
         for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
              mi_col += MI_BLOCK_SIZE) {
-          decode_partition(pbi, &tile_data->xd, mi_row,
-                           mi_col, &tile_data->bit_reader, BLOCK_64X64, 4);
+          decode_partition(pbi, &tile_data->xd,
+#if CONFIG_SUPERTX
+                           0,
+#endif
+                           mi_row, mi_col, &tile_data->bit_reader,
+                           BLOCK_64X64, 4);
         }
         pbi->mb.corrupted |= tile_data->xd.corrupted;
         if (pbi->mb.corrupted)
             vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                                "Failed to decode tile data");
       }
+#if !CONFIG_VAR_TX
       // Loopfilter one row.
       if (cm->lf.filter_level && !cm->skip_loop_filter) {
         const int lf_start = mi_row - MI_BLOCK_SIZE;
@@ -1590,10 +2468,15 @@
       if (cm->frame_parallel_decode)
         vp10_frameworker_broadcast(pbi->cur_buf,
                                   mi_row << MI_BLOCK_SIZE_LOG2);
+#endif
     }
   }
 
   // Loopfilter remaining rows in the frame.
+#if CONFIG_VAR_TX
+  vp10_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb,
+                         cm->lf.filter_level, 0, 0);
+#else
   if (cm->lf.filter_level && !cm->skip_loop_filter) {
     LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
     winterface->sync(&pbi->lf_worker);
@@ -1601,6 +2484,7 @@
     lf_data->stop = cm->mi_rows;
     winterface->execute(&pbi->lf_worker);
   }
+#endif
 
   // Get last tile data.
   tile_data = pbi->tile_data + tile_cols * tile_rows - 1;
@@ -1627,9 +2511,15 @@
        mi_row += MI_BLOCK_SIZE) {
     vp10_zero(tile_data->xd.left_context);
     vp10_zero(tile_data->xd.left_seg_context);
+#if CONFIG_VAR_TX
+    vp10_zero(tile_data->xd.left_txfm_context_buffer);
+#endif
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE) {
       decode_partition(tile_data->pbi, &tile_data->xd,
+#if CONFIG_SUPERTX
+                       0,
+#endif
                        mi_row, mi_col, &tile_data->bit_reader,
                        BLOCK_64X64, 4);
     }
@@ -1704,7 +2594,10 @@
          sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
   memset(cm->above_seg_context, 0,
          sizeof(*cm->above_seg_context) * aligned_mi_cols);
-
+#if CONFIG_VAR_TX
+  memset(cm->above_txfm_context, 0,
+         sizeof(*cm->above_txfm_context) * aligned_mi_cols);
+#endif
   // Load tile data into tile_buffers
   get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
 
@@ -1865,6 +2758,10 @@
   int i, mask, ref_index = 0;
   size_t sz;
 
+#if CONFIG_EXT_REFS
+  cm->last3_frame_type = cm->last2_frame_type;
+  cm->last2_frame_type = cm->last_frame_type;
+#endif  // CONFIG_EXT_REFS
   cm->last_frame_type = cm->frame_type;
   cm->last_intra_only = cm->intra_only;
 
@@ -1930,13 +2827,14 @@
       memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
       pbi->need_resync = 0;
     }
+    if (frame_is_intra_only(cm))
+      cm->allow_screen_content_tools = vpx_rb_read_bit(rb);
   } else {
     cm->intra_only = cm->show_frame ? 0 : vpx_rb_read_bit(rb);
 
     if (cm->error_resilient_mode) {
         cm->reset_frame_context = RESET_FRAME_CONTEXT_ALL;
     } else {
-#if CONFIG_MISC_FIXES
       if (cm->intra_only) {
           cm->reset_frame_context =
               vpx_rb_read_bit(rb) ? RESET_FRAME_CONTEXT_ALL
@@ -1950,40 +2848,14 @@
                   vpx_rb_read_bit(rb) ? RESET_FRAME_CONTEXT_ALL
                                       : RESET_FRAME_CONTEXT_CURRENT;
       }
-#else
-      static const RESET_FRAME_CONTEXT_MODE reset_frame_context_conv_tbl[4] = {
-        RESET_FRAME_CONTEXT_NONE, RESET_FRAME_CONTEXT_NONE,
-        RESET_FRAME_CONTEXT_CURRENT, RESET_FRAME_CONTEXT_ALL
-      };
-
-      cm->reset_frame_context =
-          reset_frame_context_conv_tbl[vpx_rb_read_literal(rb, 2)];
-#endif
     }
 
     if (cm->intra_only) {
       if (!vp10_read_sync_code(rb))
         vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                            "Invalid frame sync code");
-#if CONFIG_MISC_FIXES
+
       read_bitdepth_colorspace_sampling(cm, rb);
-#else
-      if (cm->profile > PROFILE_0) {
-        read_bitdepth_colorspace_sampling(cm, rb);
-      } else {
-        // NOTE: The intra-only frame header does not include the specification
-        // of either the color format or color sub-sampling in profile 0. VP9
-        // specifies that the default color format should be YUV 4:2:0 in this
-        // case (normative).
-        cm->color_space = VPX_CS_BT_601;
-        cm->color_range = 0;
-        cm->subsampling_y = cm->subsampling_x = 1;
-        cm->bit_depth = VPX_BITS_8;
-#if CONFIG_VP9_HIGHBITDEPTH
-        cm->use_highbitdepth = 0;
-#endif
-      }
-#endif
 
       pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES);
       setup_frame_size(cm, rb);
@@ -2046,10 +2918,6 @@
         cm->refresh_frame_context =
             vpx_rb_read_bit(rb) ? REFRESH_FRAME_CONTEXT_FORWARD
                                 : REFRESH_FRAME_CONTEXT_BACKWARD;
-#if !CONFIG_MISC_FIXES
-    } else {
-      vpx_rb_read_bit(rb);  // parallel decoding mode flag
-#endif
     }
   } else {
     cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_OFF;
@@ -2076,6 +2944,7 @@
 
   for (; ref_index < REF_FRAMES; ++ref_index) {
     cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+
     // Current thread holds the reference frame.
     if (cm->ref_frame_map[ref_index] >= 0)
       ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
@@ -2108,11 +2977,9 @@
   }
 
   setup_segmentation_dequant(cm);
-#if CONFIG_MISC_FIXES
   cm->tx_mode = (!cm->seg.enabled && xd->lossless[0]) ? ONLY_4X4
                                                       : read_tx_mode(rb);
   cm->reference_mode = read_frame_reference_mode(cm, rb);
-#endif
 
   setup_tile_info(cm, rb);
   sz = vpx_rb_read_literal(rb, 16);
@@ -2124,6 +2991,32 @@
   return sz;
 }
 
+#if CONFIG_EXT_TX
+static void read_ext_tx_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
+  int i, j, k;
+  int s;  // transform-set index; both loops start at set 1
+  for (s = 1; s < EXT_TX_SETS_INTER; ++s) {  // inter probabilities
+    if (vpx_read(r, GROUP_DIFF_UPDATE_PROB)) {  // per-set update flag
+      for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+        if (!use_inter_ext_tx_for_txsize[s][i]) continue;
+        for (j = 0; j < num_ext_tx_set_inter[s] - 1; ++j)
+          vp10_diff_update_prob(r, &fc->inter_ext_tx_prob[s][i][j]);
+      }
+    }
+  }
+
+  for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {  // intra probabilities
+    if (vpx_read(r, GROUP_DIFF_UPDATE_PROB)) {  // per-set update flag
+      for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+        if (!use_intra_ext_tx_for_txsize[s][i]) continue;
+        for (j = 0; j < INTRA_MODES; ++j)  // one table per intra mode
+          for (k = 0; k < num_ext_tx_set_intra[s] - 1; ++k)
+            vp10_diff_update_prob(r, &fc->intra_ext_tx_prob[s][i][j][k]);
+      }
+    }
+  }
+}
+#else
 static void read_ext_tx_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
   int i, j, k;
   if (vpx_read(r, GROUP_DIFF_UPDATE_PROB)) {
@@ -2141,10 +3034,25 @@
   }
 }
 
+#endif  // CONFIG_EXT_TX
+
+#if CONFIG_SUPERTX
+static void read_supertx_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
+  int i, j;  // i: partition supertx context, j: second table index
+  if (vpx_read(r, GROUP_DIFF_UPDATE_PROB)) {  // one flag gates the group
+    for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
+      for (j = 1; j < TX_SIZES; ++j) {  // index 0 is left untouched
+        vp10_diff_update_prob(r, &fc->supertx_prob[i][j]);
+      }
+    }
+  }
+}
+#endif  // CONFIG_SUPERTX
+
 static int read_compressed_header(VP10Decoder *pbi, const uint8_t *data,
                                   size_t partition_size) {
   VP10_COMMON *const cm = &pbi->common;
-#if !CONFIG_MISC_FIXES
+#if CONFIG_SUPERTX
   MACROBLOCKD *const xd = &pbi->mb;
 #endif
   FRAME_CONTEXT *const fc = cm->fc;
@@ -2156,17 +3064,18 @@
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder 0");
 
-#if !CONFIG_MISC_FIXES
-  cm->tx_mode = xd->lossless[0] ? ONLY_4X4 : read_tx_mode(&r);
-#endif
   if (cm->tx_mode == TX_MODE_SELECT)
     read_tx_mode_probs(&fc->tx_probs, &r);
   read_coef_probs(fc, cm->tx_mode, &r);
 
+#if CONFIG_VAR_TX
+  for (k = 0; k < TXFM_PARTITION_CONTEXTS; ++k)
+    vp10_diff_update_prob(&r, &fc->txfm_partition_prob[k]);
+#endif
+
   for (k = 0; k < SKIP_CONTEXTS; ++k)
     vp10_diff_update_prob(&r, &fc->skip_probs[k]);
 
-#if CONFIG_MISC_FIXES
   if (cm->seg.enabled) {
     if (cm->seg.temporal_update) {
       for (k = 0; k < PREDICTION_PROBS; k++)
@@ -2183,16 +3092,13 @@
   for (j = 0; j < PARTITION_CONTEXTS; ++j)
     for (i = 0; i < PARTITION_TYPES - 1; ++i)
       vp10_diff_update_prob(&r, &fc->partition_prob[j][i]);
-#endif
 
   if (frame_is_intra_only(cm)) {
     vp10_copy(cm->kf_y_prob, vp10_kf_y_mode_prob);
-#if CONFIG_MISC_FIXES
     for (k = 0; k < INTRA_MODES; k++)
       for (j = 0; j < INTRA_MODES; j++)
         for (i = 0; i < INTRA_MODES - 1; ++i)
           vp10_diff_update_prob(&r, &cm->kf_y_prob[k][j][i]);
-#endif
   } else {
     nmv_context *const nmvc = &fc->nmvc;
 
@@ -2204,9 +3110,6 @@
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
       vp10_diff_update_prob(&r, &fc->intra_inter_prob[i]);
 
-#if !CONFIG_MISC_FIXES
-    cm->reference_mode = read_frame_reference_mode(cm, &r);
-#endif
     if (cm->reference_mode != SINGLE_REFERENCE)
       setup_compound_reference_mode(cm);
     read_frame_reference_mode_probs(cm, &r);
@@ -2215,14 +3118,12 @@
       for (i = 0; i < INTRA_MODES - 1; ++i)
         vp10_diff_update_prob(&r, &fc->y_mode_prob[j][i]);
 
-#if !CONFIG_MISC_FIXES
-    for (j = 0; j < PARTITION_CONTEXTS; ++j)
-      for (i = 0; i < PARTITION_TYPES - 1; ++i)
-        vp10_diff_update_prob(&r, &fc->partition_prob[j][i]);
-#endif
-
     read_mv_probs(nmvc, cm->allow_high_precision_mv, &r);
     read_ext_tx_probs(fc, &r);
+#if CONFIG_SUPERTX
+    if (!xd->lossless[0])
+      read_supertx_probs(fc, &r);
+#endif
   }
 
   return vpx_reader_has_error(&r);
@@ -2263,10 +3164,15 @@
   assert(!memcmp(&cm->counts.tx, &zero_counts.tx, sizeof(cm->counts.tx)));
   assert(!memcmp(cm->counts.skip, zero_counts.skip, sizeof(cm->counts.skip)));
   assert(!memcmp(&cm->counts.mv, &zero_counts.mv, sizeof(cm->counts.mv)));
-  assert(!memcmp(cm->counts.intra_ext_tx, zero_counts.intra_ext_tx,
-                 sizeof(cm->counts.intra_ext_tx)));
+#if CONFIG_EXT_TX
   assert(!memcmp(cm->counts.inter_ext_tx, zero_counts.inter_ext_tx,
                  sizeof(cm->counts.inter_ext_tx)));
+  assert(!memcmp(cm->counts.intra_ext_tx, zero_counts.intra_ext_tx,
+                 sizeof(cm->counts.intra_ext_tx)));
+#else  // !CONFIG_EXT_TX: only the intra counts are checked
+  assert(!memcmp(cm->counts.intra_ext_tx, zero_counts.intra_ext_tx,
+                 sizeof(cm->counts.intra_ext_tx)));
+#endif  // CONFIG_EXT_TX
 }
 #endif  // NDEBUG
 
@@ -2407,14 +3313,9 @@
   if (!xd->corrupted) {
     if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
       vp10_adapt_coef_probs(cm);
-#if CONFIG_MISC_FIXES
       vp10_adapt_intra_frame_probs(cm);
-#endif
 
       if (!frame_is_intra_only(cm)) {
-#if !CONFIG_MISC_FIXES
-        vp10_adapt_intra_frame_probs(cm);
-#endif
         vp10_adapt_inter_frame_probs(cm);
         vp10_adapt_mv_probs(cm, cm->allow_high_precision_mv);
       }

diff --git a/vp10/decoder/decodemv.c b/vp10/decoder/decodemv.c
index a28ae55..ec93453 100644
--- a/vp10/decoder/decodemv.c
+++ b/vp10/decoder/decodemv.c

@@ -63,7 +63,59 @@
 }
 
 static PREDICTION_MODE read_inter_mode(VP10_COMMON *cm, MACROBLOCKD *xd,
-                                       vpx_reader *r, int ctx) {
+                                       vpx_reader *r, int16_t ctx) {
+#if CONFIG_REF_MV
+  FRAME_COUNTS *counts = xd->counts;
+  int16_t mode_ctx = ctx & NEWMV_CTX_MASK;
+  vpx_prob mode_prob = cm->fc->newmv_prob[mode_ctx];
+
+  if (vpx_read(r, mode_prob) == 0) {
+    if (counts)
+      ++counts->newmv_mode[mode_ctx][0];
+    return NEWMV;
+  }
+  if (counts)
+    ++counts->newmv_mode[mode_ctx][1];
+
+  if (ctx & (1 << ALL_ZERO_FLAG_OFFSET))
+    return ZEROMV;
+
+  mode_ctx = (ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
+
+  mode_prob = cm->fc->zeromv_prob[mode_ctx];
+  if (vpx_read(r, mode_prob) == 0) {
+    if (counts)
+      ++counts->zeromv_mode[mode_ctx][0];
+    return ZEROMV;
+  }
+  if (counts)
+    ++counts->zeromv_mode[mode_ctx][1];
+
+  mode_ctx = (ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+
+  if (ctx & (1 << SKIP_NEARESTMV_OFFSET))
+    mode_ctx = 6;
+  if (ctx & (1 << SKIP_NEARMV_OFFSET))
+    mode_ctx = 7;
+  if (ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET))
+    mode_ctx = 8;
+
+  mode_prob = cm->fc->refmv_prob[mode_ctx];
+
+  if (vpx_read(r, mode_prob) == 0) {
+    if (counts)
+      ++counts->refmv_mode[mode_ctx][0];
+
+    return NEARESTMV;
+  } else {
+    if (counts)
+      ++counts->refmv_mode[mode_ctx][1];
+    return NEARMV;
+  }
+
+  // Invalid prediction mode.
+  assert(0);
+#else
   const int mode = vpx_read_tree(r, vp10_inter_mode_tree,
                                  cm->fc->inter_mode_probs[ctx]);
   FRAME_COUNTS *counts = xd->counts;
@@ -71,6 +123,7 @@
     ++counts->inter_mode[ctx][mode];
 
   return NEARESTMV + mode;
+#endif
 }
 
 static int read_segment_id(vpx_reader *r,
@@ -78,6 +131,68 @@
   return vpx_read_tree(r, vp10_segment_tree, segp->tree_probs);
 }
 
+#if CONFIG_VAR_TX
+static void read_tx_size_inter(VP10_COMMON *cm, MACROBLOCKD *xd,
+                               MB_MODE_INFO *mbmi, FRAME_COUNTS *counts,
+                               TX_SIZE tx_size, int blk_row, int blk_col,
+                               vpx_reader *r) {
+  int is_split = 0;  // set from the bitstream once the block is in range
+  const int tx_idx = (blk_row >> 1) * 8 + (blk_col >> 1);  // 8 entries per row
+  int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  int ctx = txfm_partition_context(xd->above_txfm_context + (blk_col >> 1),
+                                   xd->left_txfm_context + (blk_row >> 1),
+                                   tx_size);
+
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> 5;
+  if (xd->mb_to_right_edge < 0)
+     max_blocks_wide += xd->mb_to_right_edge >> 5;
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+     return;  // outside the clipped block: no bit is read
+
+  is_split = vpx_read(r, cm->fc->txfm_partition_prob[ctx]);  // 1 = split
+
+  if (is_split) {
+    BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int bsl = b_width_log2_lookup[bsize];
+    int i;
+
+    if (counts)
+      ++counts->txfm_partition[ctx][1];
+
+    if (tx_size == TX_8X8) {  // 8x8 splits straight to 4x4, no recursion
+      mbmi->inter_tx_size[tx_idx] = TX_4X4;
+      mbmi->tx_size = mbmi->inter_tx_size[tx_idx];
+      txfm_partition_update(xd->above_txfm_context + (blk_col >> 1),
+                            xd->left_txfm_context + (blk_row >> 1), TX_4X4);
+      return;
+    }
+
+    assert(bsl > 0);
+    --bsl;  // shift that places each of the four children
+    for (i = 0; i < 4; ++i) {
+      int offsetr = blk_row + ((i >> 1) << bsl);
+      int offsetc = blk_col + ((i & 0x01) << bsl);
+      read_tx_size_inter(cm, xd, mbmi, counts,
+                         tx_size - 1, offsetr, offsetc, r);
+    }
+  } else {
+    int idx, idy;
+    mbmi->inter_tx_size[tx_idx] = tx_size;
+    for (idy = 0; idy < (1 << tx_size) / 2; ++idy)  // mark covered slots
+      for (idx = 0; idx < (1 << tx_size) / 2; ++idx)
+        mbmi->inter_tx_size[tx_idx + (idy << 3) + idx] = tx_size;
+    mbmi->tx_size = mbmi->inter_tx_size[tx_idx];
+    if (counts)
+      ++counts->txfm_partition[ctx][0];
+    txfm_partition_update(xd->above_txfm_context + (blk_col >> 1),
+                          xd->left_txfm_context + (blk_row >> 1), tx_size);
+  }
+}
+#endif
+
 static TX_SIZE read_selected_tx_size(VP10_COMMON *cm, MACROBLOCKD *xd,
                                      TX_SIZE max_tx_size, vpx_reader *r) {
   FRAME_COUNTS *counts = xd->counts;
@@ -136,28 +251,18 @@
                                  int mi_offset, int x_mis, int y_mis,
                                  vpx_reader *r) {
   struct segmentation *const seg = &cm->seg;
-#if CONFIG_MISC_FIXES
   FRAME_COUNTS *counts = xd->counts;
   struct segmentation_probs *const segp = &cm->fc->seg;
-#else
-  struct segmentation_probs *const segp = &cm->segp;
-#endif
   int segment_id;
 
-#if !CONFIG_MISC_FIXES
-  (void) xd;
-#endif
-
   if (!seg->enabled)
     return 0;  // Default for disabled segmentation
 
   assert(seg->update_map && !seg->temporal_update);
 
   segment_id = read_segment_id(r, segp);
-#if CONFIG_MISC_FIXES
   if (counts)
     ++counts->seg.tree_total[segment_id];
-#endif
   set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
   return segment_id;
 }
@@ -177,12 +282,8 @@
 static int read_inter_segment_id(VP10_COMMON *const cm, MACROBLOCKD *const xd,
                                  int mi_row, int mi_col, vpx_reader *r) {
   struct segmentation *const seg = &cm->seg;
-#if CONFIG_MISC_FIXES
   FRAME_COUNTS *counts = xd->counts;
   struct segmentation_probs *const segp = &cm->fc->seg;
-#else
-  struct segmentation_probs *const segp = &cm->segp;
-#endif
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   int predicted_segment_id, segment_id;
   const int mi_offset = mi_row * cm->mi_cols + mi_col;
@@ -210,25 +311,19 @@
     const int ctx = vp10_get_pred_context_seg_id(xd);
     const vpx_prob pred_prob = segp->pred_probs[ctx];
     mbmi->seg_id_predicted = vpx_read(r, pred_prob);
-#if CONFIG_MISC_FIXES
     if (counts)
       ++counts->seg.pred[ctx][mbmi->seg_id_predicted];
-#endif
     if (mbmi->seg_id_predicted) {
       segment_id = predicted_segment_id;
     } else {
       segment_id = read_segment_id(r, segp);
-#if CONFIG_MISC_FIXES
       if (counts)
         ++counts->seg.tree_mispred[segment_id];
-#endif
     }
   } else {
     segment_id = read_segment_id(r, segp);
-#if CONFIG_MISC_FIXES
     if (counts)
       ++counts->seg.tree_total[segment_id];
-#endif
   }
   set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
   return segment_id;
@@ -248,6 +343,71 @@
   }
 }
 
+static void read_palette_mode_info(VP10_COMMON *const cm,
+                                   MACROBLOCKD *const xd,
+                                   vpx_reader *r) {  // Y-plane palette only
+  MODE_INFO *const mi = xd->mi[0];
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MODE_INFO *above_mi = xd->above_mi;
+  const MODE_INFO *left_mi  = xd->left_mi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;  // used as bsize - BLOCK_8X8
+  int i, palette_ctx = 0;  // ctx = number of neighbors with a palette
+
+  if (above_mi)  // neighbor may be absent
+    palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+  if (left_mi)
+    palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+  if (vpx_read(r, vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
+                                                   [palette_ctx])) {
+    int n;  // palette size decoded below
+    PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
+
+    pmi->palette_size[0] =
+        vpx_read_tree(r, vp10_palette_size_tree,
+                      vp10_default_palette_y_size_prob[bsize - BLOCK_8X8]) + 2;
+    n = pmi->palette_size[0];
+
+    for (i = 0; i < n; ++i)  // one literal per palette entry
+      pmi->palette_colors[i] = vpx_read_literal(r, cm->bit_depth);
+
+    xd->plane[0].color_index_map[0] = read_uniform(r, n);
+    assert(xd->plane[0].color_index_map[0] < n);
+  }
+}
+
+#if CONFIG_EXT_INTRA
+static void read_ext_intra_mode_info(VP10_COMMON *const cm,
+                                     MACROBLOCKD *const xd, vpx_reader *r) {
+  MODE_INFO *const mi = xd->mi[0];
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  FRAME_COUNTS *counts = xd->counts;
+
+#if !ALLOW_FILTER_INTRA_MODES
+  return;  // filter-intra modes disabled: nothing is read
+#endif
+  if (mbmi->mode == DC_PRED) {  // slot 0 is tied to mbmi->mode
+    mbmi->ext_intra_mode_info.use_ext_intra_mode[0] =
+        vpx_read(r, cm->fc->ext_intra_probs[0]);
+    if (mbmi->ext_intra_mode_info.use_ext_intra_mode[0]) {
+      mbmi->ext_intra_mode_info.ext_intra_mode[0] =
+          read_uniform(r, FILTER_INTRA_MODES);
+    }
+    if (counts)  // tally the use flag, not the chosen mode
+      ++counts->ext_intra[0][mbmi->ext_intra_mode_info.use_ext_intra_mode[0]];
+  }
+  if (mbmi->uv_mode == DC_PRED) {  // slot 1 is tied to mbmi->uv_mode
+    mbmi->ext_intra_mode_info.use_ext_intra_mode[1] =
+        vpx_read(r, cm->fc->ext_intra_probs[1]);
+    if (mbmi->ext_intra_mode_info.use_ext_intra_mode[1]) {
+      mbmi->ext_intra_mode_info.ext_intra_mode[1] =
+          read_uniform(r, FILTER_INTRA_MODES);
+    }
+    if (counts)  // tally the use flag, not the chosen mode
+      ++counts->ext_intra[1][mbmi->ext_intra_mode_info.use_ext_intra_mode[1]];
+  }
+}
+#endif  // CONFIG_EXT_INTRA
+
 static void read_intra_frame_mode_info(VP10_COMMON *const cm,
                                        MACROBLOCKD *const xd,
                                        int mi_row, int mi_col, vpx_reader *r) {
@@ -293,10 +453,46 @@
     default:
       mbmi->mode = read_intra_mode(r,
           get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
+#if CONFIG_EXT_INTRA
+      if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
+        mbmi->angle_delta[0] =
+            read_uniform(r, 2 * MAX_ANGLE_DELTAS + 1) - MAX_ANGLE_DELTAS;
+#endif  // CONFIG_EXT_INTRA
   }
 
   mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
+#if CONFIG_EXT_INTRA
+  if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED &&
+      bsize >= BLOCK_8X8)
+    mbmi->angle_delta[1] =
+        read_uniform(r, 2 * MAX_ANGLE_DELTAS + 1) - MAX_ANGLE_DELTAS;
+#endif
 
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+  if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools &&
+      mbmi->mode == DC_PRED)
+    read_palette_mode_info(cm, xd, r);
+
+#if CONFIG_EXT_TX
+    if (get_ext_tx_types(mbmi->tx_size, mbmi->sb_type, 0) > 1 &&
+        cm->base_qindex > 0 && !mbmi->skip &&
+        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
+        ALLOW_INTRA_EXT_TX) {
+      FRAME_COUNTS *counts = xd->counts;
+      int eset = get_ext_tx_set(mbmi->tx_size, mbmi->sb_type, 0);
+      if (eset > 0) {
+        mbmi->tx_type = vpx_read_tree(
+            r, vp10_ext_tx_intra_tree[eset],
+            cm->fc->intra_ext_tx_prob[eset][mbmi->tx_size][mbmi->mode]);
+        if (counts)
+          ++counts->intra_ext_tx[eset][mbmi->tx_size][mbmi->mode]
+                                [mbmi->tx_type];
+      }
+    } else {
+      mbmi->tx_type = DCT_DCT;
+    }
+#else
   if (mbmi->tx_size < TX_32X32 &&
       cm->base_qindex > 0 && !mbmi->skip &&
       !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
@@ -310,6 +506,14 @@
   } else {
     mbmi->tx_type = DCT_DCT;
   }
+#endif  // CONFIG_EXT_TX
+
+#if CONFIG_EXT_INTRA
+    mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+    mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
+    if (bsize >= BLOCK_8X8)
+      read_ext_intra_mode_info(cm, xd, r);
+#endif  // CONFIG_EXT_INTRA
 }
 
 static int read_mv_component(vpx_reader *r,
@@ -399,12 +603,68 @@
     if (mode == COMPOUND_REFERENCE) {
       const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
       const int ctx = vp10_get_pred_context_comp_ref_p(cm, xd);
-      const int bit = vpx_read(r, fc->comp_ref_prob[ctx]);
+      const int bit = vpx_read(r, fc->comp_ref_prob[ctx][0]);
       if (counts)
-        ++counts->comp_ref[ctx][bit];
+        ++counts->comp_ref[ctx][0][bit];
       ref_frame[idx] = cm->comp_fixed_ref;
+
+#if CONFIG_EXT_REFS
+      if (!bit) {
+        const int ctx1 = vp10_get_pred_context_comp_ref_p1(cm, xd);
+        const int bit1 = vpx_read(r, fc->comp_ref_prob[ctx1][1]);
+        if (counts)
+          ++counts->comp_ref[ctx1][1][bit1];
+        ref_frame[!idx] = cm->comp_var_ref[bit1 ? 0 : 1];
+      } else {
+        const int ctx2 = vp10_get_pred_context_comp_ref_p2(cm, xd);
+        const int bit2 = vpx_read(r, fc->comp_ref_prob[ctx2][2]);
+        if (counts)
+          ++counts->comp_ref[ctx2][2][bit2];
+        if (!bit2) {
+          const int ctx3 = vp10_get_pred_context_comp_ref_p3(cm, xd);
+          const int bit3 = vpx_read(r, fc->comp_ref_prob[ctx3][3]);
+          if (counts)
+            ++counts->comp_ref[ctx3][3][bit3];
+          ref_frame[!idx] = cm->comp_var_ref[bit3 ? 2 : 3];
+        } else {
+          ref_frame[!idx] = cm->comp_var_ref[4];
+        }
+      }
+#else
       ref_frame[!idx] = cm->comp_var_ref[bit];
+#endif  // CONFIG_EXT_REFS
     } else if (mode == SINGLE_REFERENCE) {
+#if CONFIG_EXT_REFS
+      const int ctx0 = vp10_get_pred_context_single_ref_p1(xd);
+      const int bit0 = vpx_read(r, fc->single_ref_prob[ctx0][0]);
+      if (counts)
+        ++counts->single_ref[ctx0][0][bit0];
+      if (bit0) {
+        const int ctx1 = vp10_get_pred_context_single_ref_p2(xd);
+        const int bit1 = vpx_read(r, fc->single_ref_prob[ctx1][1]);
+        if (counts)
+          ++counts->single_ref[ctx1][1][bit1];
+        ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
+      } else {
+        const int ctx2 = vp10_get_pred_context_single_ref_p3(xd);
+        const int bit2 = vpx_read(r, fc->single_ref_prob[ctx2][2]);
+        if (counts)
+          ++counts->single_ref[ctx2][2][bit2];
+        if (bit2) {
+          const int ctx4 = vp10_get_pred_context_single_ref_p5(xd);
+          const int bit4 = vpx_read(r, fc->single_ref_prob[ctx4][4]);
+          if (counts)
+            ++counts->single_ref[ctx4][4][bit4];
+          ref_frame[0] = bit4 ? LAST4_FRAME : LAST3_FRAME;
+        } else {
+          const int ctx3 = vp10_get_pred_context_single_ref_p4(xd);
+          const int bit3 = vpx_read(r, fc->single_ref_prob[ctx3][3]);
+          if (counts)
+            ++counts->single_ref[ctx3][3][bit3];
+          ref_frame[0] = bit3 ? LAST2_FRAME : LAST_FRAME;
+        }
+      }
+#else
       const int ctx0 = vp10_get_pred_context_single_ref_p1(xd);
       const int bit0 = vpx_read(r, fc->single_ref_prob[ctx0][0]);
       if (counts)
@@ -418,6 +678,7 @@
       } else {
         ref_frame[0] = LAST_FRAME;
       }
+#endif  // CONFIG_EXT_REFS
 
       ref_frame[1] = NONE;
     } else {
@@ -431,10 +692,13 @@
     VP10_COMMON *const cm, MACROBLOCKD *const xd,
     vpx_reader *r) {
   const int ctx = vp10_get_pred_context_switchable_interp(xd);
-  const INTERP_FILTER type =
-      (INTERP_FILTER)vpx_read_tree(r, vp10_switchable_interp_tree,
-                                   cm->fc->switchable_interp_prob[ctx]);
   FRAME_COUNTS *counts = xd->counts;
+  INTERP_FILTER type;
+#if CONFIG_EXT_INTERP
+  if (!vp10_is_interp_needed(xd)) return EIGHTTAP;
+#endif
+  type = (INTERP_FILTER)vpx_read_tree(r, vp10_switchable_interp_tree,
+                                      cm->fc->switchable_interp_prob[ctx]);
   if (counts)
     ++counts->switchable_interp[ctx][type];
   return type;
@@ -470,9 +734,30 @@
       break;
     default:
       mbmi->mode = read_intra_mode_y(cm, xd, r, size_group_lookup[bsize]);
+#if CONFIG_EXT_INTRA
+      mbmi->angle_delta[0] = 0;
+      if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
+        mbmi->angle_delta[0] =
+            read_uniform(r, 2 * MAX_ANGLE_DELTAS + 1) - MAX_ANGLE_DELTAS;
+#endif  // CONFIG_EXT_INTRA
   }
 
   mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
+#if CONFIG_EXT_INTRA
+  if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED &&
+      bsize >= BLOCK_8X8)
+    mbmi->angle_delta[1] =
+        read_uniform(r, 2 * MAX_ANGLE_DELTAS + 1) - MAX_ANGLE_DELTAS;
+#endif  // CONFIG_EXT_INTRA
+
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+#if CONFIG_EXT_INTRA
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
+  if (bsize >= BLOCK_8X8)
+    read_ext_intra_mode_info(cm, xd, r);
+#endif  // CONFIG_EXT_INTRA
 }
 
 static INLINE int is_mv_valid(const MV *mv) {
@@ -555,13 +840,15 @@
   int_mv nearestmv[2], nearmv[2];
   int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
   int ref, is_compound;
-  uint8_t inter_mode_ctx[MAX_REF_FRAMES];
+  int16_t inter_mode_ctx[MAX_REF_FRAMES];
+  int16_t mode_ctx = 0;
+  MV_REFERENCE_FRAME ref_frame;
 
   read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
   is_compound = has_second_ref(mbmi);
 
   for (ref = 0; ref < 1 + is_compound; ++ref) {
-    const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+    MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
     RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
 
     xd->block_refs[ref] = ref_buf;
@@ -570,10 +857,25 @@
                          "Reference frame has invalid dimensions");
     vp10_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col,
                          &ref_buf->sf);
-    vp10_find_mv_refs(cm, xd, mi, frame, ref_mvs[frame],
-                     mi_row, mi_col, fpm_sync, (void *)pbi, inter_mode_ctx);
   }
 
+  for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) {
+    vp10_find_mv_refs(cm, xd, mi, ref_frame,
+#if CONFIG_REF_MV
+                      &xd->ref_mv_count[ref_frame],
+                      xd->ref_mv_stack[ref_frame],
+#endif
+                      ref_mvs[ref_frame],
+                      mi_row, mi_col, fpm_sync, (void *)pbi, inter_mode_ctx);
+  }
+
+  mode_ctx = inter_mode_ctx[mbmi->ref_frame[0]];
+
+#if CONFIG_REF_MV
+  mode_ctx = vp10_mode_context_analyzer(inter_mode_ctx,
+                                        mbmi->ref_frame, bsize, -1);
+#endif
+
   if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     mbmi->mode = ZEROMV;
     if (bsize < BLOCK_8X8) {
@@ -583,8 +885,7 @@
     }
   } else {
     if (bsize >= BLOCK_8X8)
-      mbmi->mode = read_inter_mode(cm, xd, r,
-                                   inter_mode_ctx[mbmi->ref_frame[0]]);
+      mbmi->mode = read_inter_mode(cm, xd, r, mode_ctx);
   }
 
   if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
@@ -594,9 +895,11 @@
     }
   }
 
+#if !CONFIG_EXT_INTERP
   mbmi->interp_filter = (cm->interp_filter == SWITCHABLE)
-                      ? read_switchable_interp_filter(cm, xd, r)
-                      : cm->interp_filter;
+                        ? read_switchable_interp_filter(cm, xd, r)
+                        : cm->interp_filter;
+#endif  // !CONFIG_EXT_INTERP
 
   if (bsize < BLOCK_8X8) {
     const int num_4x4_w = 1 << xd->bmode_blocks_wl;
@@ -608,15 +911,17 @@
       for (idx = 0; idx < 2; idx += num_4x4_w) {
         int_mv block[2];
         const int j = idy * 2 + idx;
-        b_mode = read_inter_mode(cm, xd, r, inter_mode_ctx[mbmi->ref_frame[0]]);
+#if CONFIG_REF_MV
+        mode_ctx = vp10_mode_context_analyzer(inter_mode_ctx,  mbmi->ref_frame,
+                                              bsize, j);
+#endif
+        b_mode = read_inter_mode(cm, xd, r, mode_ctx);
 
         if (b_mode == NEARESTMV || b_mode == NEARMV) {
-          uint8_t dummy_mode_ctx[MAX_REF_FRAMES];
           for (ref = 0; ref < 1 + is_compound; ++ref)
             vp10_append_sub8x8_mvs_for_idx(cm, xd, j, ref, mi_row, mi_col,
                                           &nearest_sub8x8[ref],
-                                          &near_sub8x8[ref],
-                                          dummy_mode_ctx);
+                                          &near_sub8x8[ref]);
         }
 
         if (!assign_mv(cm, xd, b_mode, block, nearestmv,
@@ -645,28 +950,118 @@
     xd->corrupted |= !assign_mv(cm, xd, mbmi->mode, mbmi->mv, nearestmv,
                                 nearestmv, nearmv, is_compound, allow_hp, r);
   }
+#if CONFIG_EXT_INTERP
+  mbmi->interp_filter = (cm->interp_filter == SWITCHABLE)
+                        ? read_switchable_interp_filter(cm, xd, r)
+                        : cm->interp_filter;
+#endif  // CONFIG_EXT_INTERP
 }
 
 static void read_inter_frame_mode_info(VP10Decoder *const pbi,
                                        MACROBLOCKD *const xd,
+#if CONFIG_SUPERTX
+                                       int supertx_enabled,
+#endif  // CONFIG_SUPERTX
                                        int mi_row, int mi_col, vpx_reader *r) {
   VP10_COMMON *const cm = &pbi->common;
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
-  int inter_block;
+  int inter_block = 1;
+#if CONFIG_VAR_TX
+  BLOCK_SIZE bsize = mbmi->sb_type;
+#endif  // CONFIG_VAR_TX
+#if CONFIG_SUPERTX
+  (void) supertx_enabled;
+#endif  // CONFIG_SUPERTX
 
   mbmi->mv[0].as_int = 0;
   mbmi->mv[1].as_int = 0;
   mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
-  mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
-  inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
-  mbmi->tx_size = read_tx_size(cm, xd, !mbmi->skip || !inter_block, r);
+#if CONFIG_SUPERTX
+  if (!supertx_enabled) {
+#endif  // CONFIG_SUPERTX
+    mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
+    inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
+
+#if CONFIG_VAR_TX
+    xd->above_txfm_context = cm->above_txfm_context + mi_col;
+    xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+    if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
+        !mbmi->skip && inter_block) {
+      const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+      const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+      const int bs = num_4x4_blocks_wide_lookup[txb_size];
+      const int width  = num_4x4_blocks_wide_lookup[bsize];
+      const int height = num_4x4_blocks_high_lookup[bsize];
+      int idx, idy;
+      for (idy = 0; idy < height; idy += bs)
+        for (idx = 0; idx < width; idx += bs)
+          read_tx_size_inter(cm, xd, mbmi, xd->counts, max_tx_size,
+                             idy, idx, r);
+      if (xd->counts) {
+        const int ctx = get_tx_size_context(xd);
+        ++get_tx_counts(max_tx_size, ctx, &xd->counts->tx)[mbmi->tx_size];
+      }
+    } else {
+      mbmi->tx_size = read_tx_size(cm, xd, !mbmi->skip || !inter_block, r);
+      if (inter_block) {
+        const int width  = num_4x4_blocks_wide_lookup[bsize];
+        const int height = num_4x4_blocks_high_lookup[bsize];
+        int idx, idy;
+        for (idy = 0; idy < height; ++idy)
+          for (idx = 0; idx < width; ++idx)
+            mbmi->inter_tx_size[(idy >> 1) * 8 + (idx >> 1)] = mbmi->tx_size;
+      }
+
+      set_txfm_ctx(xd->left_txfm_context, mbmi->tx_size, xd->n8_h);
+      set_txfm_ctx(xd->above_txfm_context, mbmi->tx_size, xd->n8_w);
+    }
+#else
+    mbmi->tx_size = read_tx_size(cm, xd, !mbmi->skip || !inter_block, r);
+#endif  // CONFIG_VAR_TX
+#if CONFIG_SUPERTX
+  }
+#endif  // CONFIG_SUPERTX
 
   if (inter_block)
-    read_inter_block_mode_info(pbi, xd, mi, mi_row, mi_col, r);
+    read_inter_block_mode_info(pbi, xd,
+                               mi, mi_row, mi_col, r);
   else
     read_intra_block_mode_info(cm, xd, mi, r);
 
+#if CONFIG_EXT_TX
+  if (get_ext_tx_types(mbmi->tx_size, mbmi->sb_type, inter_block) > 1 &&
+      cm->base_qindex > 0 && !mbmi->skip &&
+#if CONFIG_SUPERTX
+      !supertx_enabled &&
+#endif  // CONFIG_SUPERTX
+      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+    int eset = get_ext_tx_set(mbmi->tx_size, mbmi->sb_type,
+                              inter_block);
+    FRAME_COUNTS *counts = xd->counts;
+
+    if (inter_block) {
+      if (eset > 0) {
+        mbmi->tx_type =
+            vpx_read_tree(r, vp10_ext_tx_inter_tree[eset],
+                          cm->fc->inter_ext_tx_prob[eset][mbmi->tx_size]);
+        if (counts)
+          ++counts->inter_ext_tx[eset][mbmi->tx_size][mbmi->tx_type];
+      }
+    } else if (ALLOW_INTRA_EXT_TX) {
+      if (eset > 0) {
+        mbmi->tx_type = vpx_read_tree(r, vp10_ext_tx_intra_tree[eset],
+                                      cm->fc->intra_ext_tx_prob[eset]
+                                        [mbmi->tx_size][mbmi->mode]);
+        if (counts)
+          ++counts->intra_ext_tx[eset][mbmi->tx_size]
+                                [mbmi->mode][mbmi->tx_type];
+      }
+    }
+  } else {
+    mbmi->tx_type = DCT_DCT;
+  }
+#else
   if (mbmi->tx_size < TX_32X32 &&
       cm->base_qindex > 0 && !mbmi->skip &&
       !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
@@ -688,11 +1083,15 @@
   } else {
     mbmi->tx_type = DCT_DCT;
   }
+#endif  // CONFIG_EXT_TX
 }
 
 void vp10_read_mode_info(VP10Decoder *const pbi, MACROBLOCKD *xd,
-                        int mi_row, int mi_col, vpx_reader *r,
-                        int x_mis, int y_mis) {
+#if CONFIG_SUPERTX
+                         int supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                         int mi_row, int mi_col, vpx_reader *r,
+                         int x_mis, int y_mis) {
   VP10_COMMON *const cm = &pbi->common;
   MODE_INFO *const mi = xd->mi[0];
   MV_REF* frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
@@ -700,9 +1099,22 @@
 
   if (frame_is_intra_only(cm)) {
     read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
+#if CONFIG_REF_MV
+    for (h = 0; h < y_mis; ++h) {
+      MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+      for (w = 0; w < x_mis; ++w) {
+        MV_REF *const mv = frame_mv + w;
+        mv->ref_frame[0] = NONE;
+        mv->ref_frame[1] = NONE;
+      }
+    }
+#endif
   } else {
-    read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r);
-
+    read_inter_frame_mode_info(pbi, xd,
+#if CONFIG_SUPERTX
+                               supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                               mi_row, mi_col, r);
     for (h = 0; h < y_mis; ++h) {
       MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
       for (w = 0; w < x_mis; ++w) {

diff --git a/vp10/decoder/decodemv.h b/vp10/decoder/decodemv.h
index 6653be5..959a001 100644
--- a/vp10/decoder/decodemv.h
+++ b/vp10/decoder/decodemv.h

@@ -20,8 +20,12 @@
 #endif
 
 void vp10_read_mode_info(VP10Decoder *const pbi, MACROBLOCKD *xd,
-                        int mi_row, int mi_col, vpx_reader *r,
-                        int x_mis, int y_mis);
+#if CONFIG_SUPERTX
+                         int supertx_enabled,
+#endif  // CONFIG_SUPERTX
+
+                         int mi_row, int mi_col, vpx_reader *r,
+                         int x_mis, int y_mis);
 
 #ifdef __cplusplus
 }  // extern "C"

diff --git a/vp10/decoder/decoder.c b/vp10/decoder/decoder.c
index d8864d2..2dbadb3 100644
--- a/vp10/decoder/decoder.c
+++ b/vp10/decoder/decoder.c

@@ -196,10 +196,23 @@
   // later commit that adds VP9-specific controls for this functionality.
   if (ref_frame_flag == VP9_LAST_FLAG) {
     ref_buf = &cm->frame_refs[0];
+#if CONFIG_EXT_REFS
+  } else if (ref_frame_flag == VP9_LAST2_FLAG) {
+    ref_buf = &cm->frame_refs[1];
+  } else if (ref_frame_flag == VP9_LAST3_FLAG) {
+    ref_buf = &cm->frame_refs[2];
+  } else if (ref_frame_flag == VP9_LAST4_FLAG) {
+    ref_buf = &cm->frame_refs[3];
+  } else if (ref_frame_flag == VP9_GOLD_FLAG) {
+    ref_buf = &cm->frame_refs[4];
+  } else if (ref_frame_flag == VP9_ALT_FLAG) {
+    ref_buf = &cm->frame_refs[5];
+#else
   } else if (ref_frame_flag == VP9_GOLD_FLAG) {
     ref_buf = &cm->frame_refs[1];
   } else if (ref_frame_flag == VP9_ALT_FLAG) {
     ref_buf = &cm->frame_refs[2];
+#endif  // CONFIG_EXT_REFS
   } else {
     vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                        "Invalid reference frame");
@@ -243,10 +256,10 @@
     // Current thread releases the holding of reference frame.
     decrease_ref_count(old_idx, frame_bufs, pool);
 
-    // Release the reference frame in reference map.
-    if ((mask & 1) && old_idx >= 0) {
+    // Release the reference frame holding in the reference map for the decoding
+    // of the next frame.
+    if (mask & 1)
       decrease_ref_count(old_idx, frame_bufs, pool);
-    }
     cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
     ++ref_index;
   }
@@ -268,7 +281,7 @@
   }
 
   // Invalidate these references until the next frame starts.
-  for (ref_index = 0; ref_index < 3; ref_index++)
+  for (ref_index = 0; ref_index < REFS_PER_FRAME; ref_index++)
     cm->frame_refs[ref_index].idx = -1;
 }
 
@@ -326,7 +339,6 @@
     pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
   }
 
-
   if (setjmp(cm->error.jmp)) {
     const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
     int i;
@@ -350,10 +362,10 @@
         // Current thread releases the holding of reference frame.
         decrease_ref_count(old_idx, frame_bufs, pool);
 
-        // Release the reference frame in reference map.
-        if ((mask & 1) && old_idx >= 0) {
+       // Release the reference frame holding in the reference map for the
+       // decoding of the next frame.
+       if (mask & 1)
           decrease_ref_count(old_idx, frame_bufs, pool);
-        }
         ++ref_index;
       }
 
@@ -459,9 +471,7 @@
   // an invalid bitstream and need to return an error.
 
   uint8_t marker;
-#if CONFIG_MISC_FIXES
   size_t frame_sz_sum = 0;
-#endif
 
   assert(data_sz);
   marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1);
@@ -470,7 +480,7 @@
   if ((marker & 0xe0) == 0xc0) {
     const uint32_t frames = (marker & 0x7) + 1;
     const uint32_t mag = ((marker >> 3) & 0x3) + 1;
-    const size_t index_sz = 2 + mag * (frames - CONFIG_MISC_FIXES);
+    const size_t index_sz = 2 + mag * (frames - 1);
 
     // This chunk is marked as having a superframe index but doesn't have
     // enough data for it, thus it's an invalid superframe index.
@@ -501,20 +511,16 @@
         x = clear_buffer;
       }
 
-      for (i = 0; i < frames - CONFIG_MISC_FIXES; ++i) {
+      for (i = 0; i < frames - 1; ++i) {
         uint32_t this_sz = 0;
 
         for (j = 0; j < mag; ++j)
           this_sz |= (*x++) << (j * 8);
-        this_sz += CONFIG_MISC_FIXES;
+        this_sz += 1;
         sizes[i] = this_sz;
-#if CONFIG_MISC_FIXES
         frame_sz_sum += this_sz;
-#endif
       }
-#if CONFIG_MISC_FIXES
       sizes[i] = data_sz - index_sz - frame_sz_sum;
-#endif
       *count = frames;
     }
   }

diff --git a/vp10/decoder/detokenize.c b/vp10/decoder/detokenize.c
index d39e3dc..011c45a 100644
--- a/vp10/decoder/detokenize.c
+++ b/vp10/decoder/detokenize.c

@@ -164,11 +164,7 @@
           val = CAT5_MIN_VAL + read_coeff(cat5_prob, 5, r);
           break;
         case CATEGORY6_TOKEN: {
-#if CONFIG_MISC_FIXES
           const int skip_bits = TX_SIZES - 1 - tx_size;
-#else
-          const int skip_bits = 0;
-#endif
           const uint8_t *cat6p = cat6_prob + skip_bits;
 #if CONFIG_VP9_HIGHBITDEPTH
           switch (xd->bd) {
@@ -257,6 +253,33 @@
   }
 }
 
+void vp10_decode_palette_tokens(MACROBLOCKD *const xd, int plane,
+                                vpx_reader *r) {  // Decodes the color map.
+  MODE_INFO *const mi = xd->mi[0];
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  int rows = 4 * num_4x4_blocks_high_lookup[bsize];  // 4 rows per 4x4 unit
+  int cols = 4 * num_4x4_blocks_wide_lookup[bsize];  // 4 cols per 4x4 unit
+  int color_idx, color_ctx, color_order[PALETTE_MAX_SIZE];
+  int n = mbmi->palette_mode_info.palette_size[plane != 0];  // palette size
+  int i, j;
+  uint8_t *color_map = xd->plane[plane].color_index_map;
+  const vpx_prob (* prob)[PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] =
+      plane ? vp10_default_palette_uv_color_prob :
+          vp10_default_palette_y_color_prob;
+  // Entry (0, 0) is not read here: the first row starts at column 1.
+  for (i = 0; i < rows; ++i) {
+    for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
+      color_ctx = vp10_get_palette_color_context(color_map, cols, i, j, n,
+                                                 color_order);
+      color_idx = vpx_read_tree(r, vp10_palette_color_tree[n - 2],
+                                prob[n - 2][color_ctx]);  // assumes n >= 2
+      assert(color_idx >= 0 && color_idx < n);  // debug-only check
+      color_map[i * cols + j] = color_order[color_idx];  // row-major store
+    }
+  }
+}
+
 int vp10_decode_block_tokens(MACROBLOCKD *xd,
                             int plane, const scan_order *sc,
                             int x, int y,

diff --git a/vp10/decoder/detokenize.h b/vp10/decoder/detokenize.h
index c3fd90a..d2677f6 100644
--- a/vp10/decoder/detokenize.h
+++ b/vp10/decoder/detokenize.h

@@ -20,6 +20,8 @@
 extern "C" {
 #endif
 
+void vp10_decode_palette_tokens(MACROBLOCKD *const xd, int plane,
+                                vpx_reader *r);
 int vp10_decode_block_tokens(MACROBLOCKD *xd,
                             int plane, const scan_order *sc,
                             int x, int y,

diff --git a/vp10/decoder/dsubexp.c b/vp10/decoder/dsubexp.c
index 36c1917..7d2872e 100644
--- a/vp10/decoder/dsubexp.c
+++ b/vp10/decoder/dsubexp.c

@@ -23,13 +23,13 @@
 
 static int decode_uniform(vpx_reader *r) {
   const int l = 8;
-  const int m = (1 << l) - 191 + CONFIG_MISC_FIXES;
+  const int m = (1 << l) - 190;  // 66: values below m use l - 1 bits
   const int v = vpx_read_literal(r, l - 1);
   return v < m ?  v : (v << 1) - m + vpx_read_bit(r);
 }
 
 static int inv_remap_prob(int v, int m) {
-  static uint8_t inv_map_table[MAX_PROB - CONFIG_MISC_FIXES] = {
+  static uint8_t inv_map_table[MAX_PROB - 1] = {
       7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176, 189,
     202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,  10,  11,
      12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,  25,  26,  27,
@@ -47,9 +47,6 @@
     207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221, 222,
     223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238,
     239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253,
-#if !CONFIG_MISC_FIXES
-    253
-#endif
   };
   assert(v < (int)(sizeof(inv_map_table) / sizeof(inv_map_table[0])));
   v = inv_map_table[v];

diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index ede8bb3..344f63d 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c

@@ -38,12 +38,33 @@
 static const struct vp10_token intra_mode_encodings[INTRA_MODES] = {
   {0, 1}, {6, 3}, {28, 5}, {30, 5}, {58, 6}, {59, 6}, {126, 7}, {127, 7},
   {62, 6}, {2, 2}};
+#if CONFIG_EXT_INTERP && SWITCHABLE_FILTERS == 4
+static const struct vp10_token switchable_interp_encodings[SWITCHABLE_FILTERS] =
+  {{0, 1}, {4, 3}, {3, 2}, {5, 3}};
+#else
 static const struct vp10_token switchable_interp_encodings[SWITCHABLE_FILTERS] =
   {{0, 1}, {2, 2}, {3, 2}};
+#endif  // CONFIG_EXT_INTERP && SWITCHABLE_FILTERS == 4
 static const struct vp10_token partition_encodings[PARTITION_TYPES] =
   {{0, 1}, {2, 2}, {6, 3}, {7, 3}};
+#if !CONFIG_REF_MV
 static const struct vp10_token inter_mode_encodings[INTER_MODES] =
   {{2, 2}, {6, 3}, {0, 1}, {7, 3}};
+#endif
+static const struct vp10_token palette_size_encodings[] = {  // 7 entries
+    {0, 1}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {63, 6},
+};
+static const struct vp10_token  // row i holds the tokens for i + 2 colors
+palette_color_encodings[PALETTE_MAX_SIZE - 1][PALETTE_MAX_SIZE] = {
+    {{0, 1}, {1, 1}},  // 2 colors
+    {{0, 1}, {2, 2}, {3, 2}},  // 3 colors
+    {{0, 1}, {2, 2}, {6, 3}, {7, 3}},  // 4 colors
+    {{0, 1}, {2, 2}, {6, 3}, {14, 4}, {15, 4}},  // 5 colors
+    {{0, 1}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, {31, 5}},  // 6 colors
+    {{0, 1}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {63, 6}},  // 7 colors
+    {{0, 1}, {2, 2}, {6, 3}, {14, 4},
+        {30, 5}, {62, 6}, {126, 7}, {127, 7}},  // 8 colors
+};
 
 static INLINE void write_uniform(vpx_writer *w, int n, int v) {
   int l = get_unsigned_bits(n);
@@ -58,22 +79,82 @@
   }
 }
 
+#if CONFIG_EXT_TX
+static struct vp10_token ext_tx_inter_encodings[EXT_TX_SETS_INTER][TX_TYPES];
+static struct vp10_token ext_tx_intra_encodings[EXT_TX_SETS_INTRA][TX_TYPES];
+#else
 static struct vp10_token ext_tx_encodings[TX_TYPES];
+#endif  // CONFIG_EXT_TX
 
 void vp10_encode_token_init() {
+#if CONFIG_EXT_TX
+  int s;  // set index; both loops start at 1, so set 0 gets no token table
+  for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+    vp10_tokens_from_tree(ext_tx_inter_encodings[s], vp10_ext_tx_inter_tree[s]);
+  }
+  for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+    vp10_tokens_from_tree(ext_tx_intra_encodings[s], vp10_ext_tx_intra_tree[s]);
+  }
+#else
   vp10_tokens_from_tree(ext_tx_encodings, vp10_ext_tx_tree);
+#endif  // CONFIG_EXT_TX
 }
 
+#if CONFIG_SUPERTX  // Helper testing the MODE_INFO at (mi_row, mi_col).
+static int vp10_check_supertx(VP10_COMMON *cm, int mi_row, int mi_col,
+                              BLOCK_SIZE bsize) {  // nonzero iff both hold
+  MODE_INFO *mi;  // entry of cm->mi for (mi_row, mi_col); rows use mi_stride
+  mi = cm->mi + (mi_row * cm->mi_stride + mi_col);
+  return mi[0].mbmi.tx_size == max_txsize_lookup[bsize] &&  // table tx size
+         mi[0].mbmi.sb_type < bsize;  // and an sb_type value below bsize
+}
+#endif  // CONFIG_SUPERTX
+
 static void write_intra_mode(vpx_writer *w, PREDICTION_MODE mode,
                              const vpx_prob *probs) {
   vp10_write_token(w, vp10_intra_mode_tree, probs, &intra_mode_encodings[mode]);
 }
 
-static void write_inter_mode(vpx_writer *w, PREDICTION_MODE mode,
-                             const vpx_prob *probs) {
+static void write_inter_mode(VP10_COMMON *cm,
+                             vpx_writer *w, PREDICTION_MODE mode,
+                             const int16_t mode_ctx) {  // packed context bits
+#if CONFIG_REF_MV  // Mode is sent as up to three binary decisions.
+  const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
+  const vpx_prob newmv_prob = cm->fc->newmv_prob[newmv_ctx];
+  vpx_write(w, mode != NEWMV, newmv_prob);  // 0 means NEWMV
+
+  if (mode != NEWMV) {
+    const int16_t zeromv_ctx = (mode_ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
+    const vpx_prob zeromv_prob = cm->fc->zeromv_prob[zeromv_ctx];
+    // With the all-zero flag set, ZEROMV is implied and nothing more is sent.
+    if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) {
+      assert(mode == ZEROMV);
+      return;
+    }
+
+    vpx_write(w, mode != ZEROMV, zeromv_prob);  // 0 means ZEROMV
+
+    if (mode != ZEROMV) {
+      int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+      vpx_prob refmv_prob;
+      // The checks run in order, so a later skip flag overrides an earlier one.
+      if (mode_ctx & (1 << SKIP_NEARESTMV_OFFSET))
+        refmv_ctx = 6;
+      if (mode_ctx & (1 << SKIP_NEARMV_OFFSET))
+        refmv_ctx = 7;
+      if (mode_ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET))
+        refmv_ctx = 8;
+
+      refmv_prob = cm->fc->refmv_prob[refmv_ctx];
+      vpx_write(w, mode != NEARESTMV, refmv_prob);  // 0 means NEARESTMV
+    }
+  }
+#else
+  const vpx_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx];
   assert(is_inter_mode(mode));
-  vp10_write_token(w, vp10_inter_mode_tree, probs,
+  vp10_write_token(w, vp10_inter_mode_tree, inter_probs,
                   &inter_mode_encodings[INTER_OFFSET(mode)]);
+#endif  // CONFIG_REF_MV
 }
 
 static void encode_unsigned_max(struct vpx_write_bit_buffer *wb,
@@ -114,6 +195,62 @@
   return savings;
 }
 
+#if CONFIG_VAR_TX
+static void write_tx_size_inter(const VP10_COMMON *cm,
+                                const MACROBLOCKD *xd,
+                                const MB_MODE_INFO *mbmi,
+                                TX_SIZE tx_size, int blk_row, int blk_col,
+                                vpx_writer *w) {  // Recursive split writer.
+  const int tx_idx = (blk_row >> 1) * 8 + (blk_col >> 1);  // rows of 8 entries
+  int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  int ctx = txfm_partition_context(xd->above_txfm_context + (blk_col >> 1),
+                                   xd->left_txfm_context + (blk_row >> 1),
+                                   tx_size);
+  // A negative edge offset shrinks the extent (>> 5; units not shown here).
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> 5;
+  if (xd->mb_to_right_edge < 0)
+     max_blocks_wide += xd->mb_to_right_edge >> 5;
+  // Positions outside the clipped extent write nothing.
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+     return;
+  // 0: this unit keeps tx_size; 1: it is split (recursing unless TX_8X8).
+  if (tx_size == mbmi->inter_tx_size[tx_idx]) {
+    vpx_write(w, 0, cm->fc->txfm_partition_prob[ctx]);
+    txfm_partition_update(xd->above_txfm_context + (blk_col >> 1),
+                          xd->left_txfm_context + (blk_row >> 1), tx_size);
+  } else {
+    const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int bsl = b_width_log2_lookup[bsize];
+    int i;
+    vpx_write(w, 1, cm->fc->txfm_partition_prob[ctx]);
+    // A split TX_8X8 is recorded as TX_4X4 and ends the recursion.
+    if (tx_size == TX_8X8) {
+      txfm_partition_update(xd->above_txfm_context + (blk_col >> 1),
+                            xd->left_txfm_context + (blk_row >> 1), TX_4X4);
+      return;
+    }
+    // Halve the step and visit the four quadrants with tx_size - 1.
+    assert(bsl > 0);
+    --bsl;
+    for (i = 0; i < 4; ++i) {
+      int offsetr = blk_row + ((i >> 1) << bsl);
+      int offsetc = blk_col + ((i & 0x01) << bsl);
+      write_tx_size_inter(cm, xd, mbmi, tx_size - 1, offsetr, offsetc, w);
+    }
+  }
+}
+
+static void update_txfm_partition_probs(VP10_COMMON *cm, vpx_writer *w,
+                                        FRAME_COUNTS *counts) {
+  int k;  // context index: one vp10_cond_prob_diff_update() call per context
+  for (k = 0; k < TXFM_PARTITION_CONTEXTS; ++k)
+    vp10_cond_prob_diff_update(w, &cm->fc->txfm_partition_prob[k],
+                               counts->txfm_partition[k]);
+}
+#endif
+
 static void write_selected_tx_size(const VP10_COMMON *cm,
                                    const MACROBLOCKD *xd, vpx_writer *w) {
   TX_SIZE tx_size = xd->mi[0]->mbmi.tx_size;
@@ -129,6 +266,22 @@
   }
 }
 
+#if CONFIG_REF_MV
+static void update_inter_mode_probs(VP10_COMMON *cm, vpx_writer *w,
+                                    FRAME_COUNTS *counts) {
+  int ctx;  // context index, reused by each of the three passes below
+  for (ctx = 0; ctx < NEWMV_MODE_CONTEXTS; ++ctx)  // newmv_prob first
+    vp10_cond_prob_diff_update(w, &cm->fc->newmv_prob[ctx],
+                               counts->newmv_mode[ctx]);
+  for (ctx = 0; ctx < ZEROMV_MODE_CONTEXTS; ++ctx)  // then zeromv_prob
+    vp10_cond_prob_diff_update(w, &cm->fc->zeromv_prob[ctx],
+                               counts->zeromv_mode[ctx]);
+  for (ctx = 0; ctx < REFMV_MODE_CONTEXTS; ++ctx)  // refmv_prob last
+    vp10_cond_prob_diff_update(w, &cm->fc->refmv_prob[ctx],
+                               counts->refmv_mode[ctx]);
+}
+#endif
+
 static int write_skip(const VP10_COMMON *cm, const MACROBLOCKD *xd,
                       int segment_id, const MODE_INFO *mi, vpx_writer *w) {
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
@@ -157,6 +310,60 @@
                      counts->switchable_interp[j], SWITCHABLE_FILTERS, w);
 }
 
+
+#if CONFIG_EXT_TX
+// For each ext-tx set (inter, then intra), writes a flag and optional updates.
+static void update_ext_tx_probs(VP10_COMMON *cm, vpx_writer *w) {
+  const int savings_thresh = vp10_cost_one(GROUP_DIFF_UPDATE_PROB) -
+                             vp10_cost_zero(GROUP_DIFF_UPDATE_PROB);
+  int s, tx, mode;
+  for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+    int savings = 0, do_update;
+    // Sum prob_diff_update_savings() over each tx size enabled for set s.
+    for (tx = TX_4X4; tx < EXT_TX_SIZES; ++tx) {
+      if (!use_inter_ext_tx_for_txsize[s][tx]) continue;
+      savings += prob_diff_update_savings(
+          vp10_ext_tx_inter_tree[s], cm->fc->inter_ext_tx_prob[s][tx],
+          cm->counts.inter_ext_tx[s][tx], num_ext_tx_set_inter[s]);
+    }
+    do_update = savings > savings_thresh;
+    vpx_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+    if (do_update) {
+      for (tx = TX_4X4; tx < EXT_TX_SIZES; ++tx) {
+        if (!use_inter_ext_tx_for_txsize[s][tx]) continue;
+        prob_diff_update(vp10_ext_tx_inter_tree[s],
+                         cm->fc->inter_ext_tx_prob[s][tx],
+                         cm->counts.inter_ext_tx[s][tx],
+                         num_ext_tx_set_inter[s], w);
+      }
+    }
+  }
+
+  for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+    int savings = 0, do_update;
+    // The intra tables carry one extra dimension, indexed up to INTRA_MODES.
+    for (tx = TX_4X4; tx < EXT_TX_SIZES; ++tx) {
+      if (!use_intra_ext_tx_for_txsize[s][tx]) continue;
+      for (mode = 0; mode < INTRA_MODES; ++mode)
+        savings += prob_diff_update_savings(
+            vp10_ext_tx_intra_tree[s], cm->fc->intra_ext_tx_prob[s][tx][mode],
+            cm->counts.intra_ext_tx[s][tx][mode], num_ext_tx_set_intra[s]);
+    }
+    do_update = savings > savings_thresh;
+    vpx_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+    if (do_update) {
+      for (tx = TX_4X4; tx < EXT_TX_SIZES; ++tx) {
+        if (!use_intra_ext_tx_for_txsize[s][tx]) continue;
+        for (mode = 0; mode < INTRA_MODES; ++mode)
+          prob_diff_update(vp10_ext_tx_intra_tree[s],
+                           cm->fc->intra_ext_tx_prob[s][tx][mode],
+                           cm->counts.intra_ext_tx[s][tx][mode],
+                           num_ext_tx_set_intra[s], w);
+      }
+    }
+  }
+}
+#else
 static void update_ext_tx_probs(VP10_COMMON *cm, vpx_writer *w) {
   const int savings_thresh = vp10_cost_one(GROUP_DIFF_UPDATE_PROB) -
                              vp10_cost_zero(GROUP_DIFF_UPDATE_PROB);
@@ -199,19 +406,62 @@
     }
   }
 }
+#endif  // CONFIG_EXT_TX
+
+// Writes rows * cols - 1 palette color tokens from *tp, then advances *tp.
+static void pack_palette_tokens(vpx_writer *w, TOKENEXTRA **tp,
+                                BLOCK_SIZE bsize, int n) {
+  const int rows = 4 * num_4x4_blocks_high_lookup[bsize];
+  const int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
+  // One token fewer than rows * cols is consumed from the token stream.
+  const TOKENEXTRA *const last = *tp + (rows * cols - 1);
+  TOKENEXTRA *tok;
+
+  for (tok = *tp; tok < last; ++tok)
+    vp10_write_token(w, vp10_palette_color_tree[n - 2], tok->context_tree,
+                     &palette_color_encodings[n - 2][tok->token]);
+
+  *tp = tok;
+}
+
+#if CONFIG_SUPERTX
+// Writes one group-update flag and, if it is set, every supertx prob update.
+static void update_supertx_probs(VP10_COMMON *cm, vpx_writer *w) {
+  const int savings_thresh = vp10_cost_one(GROUP_DIFF_UPDATE_PROB) -
+                             vp10_cost_zero(GROUP_DIFF_UPDATE_PROB);
+  int ctx, tx;
+  int savings = 0, do_update;
+
+  // Index 0 of the tx-size dimension is never visited (tx starts at 1).
+  for (ctx = 0; ctx < PARTITION_SUPERTX_CONTEXTS; ++ctx)
+    for (tx = 1; tx < TX_SIZES; ++tx)
+      savings += vp10_cond_prob_diff_update_savings(
+          &cm->fc->supertx_prob[ctx][tx], cm->counts.supertx[ctx][tx]);
+
+  // A single flag gates the update pass over the whole table.
+  do_update = savings > savings_thresh;
+  vpx_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+  if (do_update) {
+    for (ctx = 0; ctx < PARTITION_SUPERTX_CONTEXTS; ++ctx)
+      for (tx = 1; tx < TX_SIZES; ++tx)
+        vp10_cond_prob_diff_update(w, &cm->fc->supertx_prob[ctx][tx],
+                                   cm->counts.supertx[ctx][tx]);
+  }
+}
+#endif  // CONFIG_SUPERTX
 
 static void pack_mb_tokens(vpx_writer *w,
                            TOKENEXTRA **tp, const TOKENEXTRA *const stop,
                            vpx_bit_depth_t bit_depth, const TX_SIZE tx) {
   TOKENEXTRA *p = *tp;
-#if !CONFIG_MISC_FIXES
-  (void) tx;
+#if CONFIG_VAR_TX
+  int count = 0;
+  const int seg_eob = 16 << (tx << 1);
 #endif
 
   while (p < stop && p->token != EOSB_TOKEN) {
     const int t = p->token;
     const struct vp10_token *const a = &vp10_coef_encodings[t];
-    int i = 0;
     int v = a->value;
     int n = a->len;
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -228,38 +478,30 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
     /* skip one or two nodes */
-    if (p->skip_eob_node) {
+    if (p->skip_eob_node)
       n -= p->skip_eob_node;
-      i = 2 * p->skip_eob_node;
-    }
+    else
+      vpx_write(w, t != EOB_TOKEN, p->context_tree[0]);
 
-    // TODO(jbb): expanding this can lead to big gains.  It allows
-    // much better branch prediction and would enable us to avoid numerous
-    // lookups and compares.
+    if (t != EOB_TOKEN) {
+      vpx_write(w, t != ZERO_TOKEN, p->context_tree[1]);
 
-    // If we have a token that's in the constrained set, the coefficient tree
-    // is split into two treed writes.  The first treed write takes care of the
-    // unconstrained nodes.  The second treed write takes care of the
-    // constrained nodes.
-    if (t >= TWO_TOKEN && t < EOB_TOKEN) {
-      int len = UNCONSTRAINED_NODES - p->skip_eob_node;
-      int bits = v >> (n - len);
-      vp10_write_tree(w, vp10_coef_tree, p->context_tree, bits, len, i);
-      vp10_write_tree(w, vp10_coef_con_tree,
-                     vp10_pareto8_full[p->context_tree[PIVOT_NODE] - 1],
-                     v, n - len, 0);
-    } else {
-      vp10_write_tree(w, vp10_coef_tree, p->context_tree, v, n, i);
+      if (t != ZERO_TOKEN) {
+        vpx_write(w, t != ONE_TOKEN, p->context_tree[2]);
+
+        if (t != ONE_TOKEN) {
+          int len = UNCONSTRAINED_NODES - p->skip_eob_node;
+          vp10_write_tree(w, vp10_coef_con_tree,
+                          vp10_pareto8_full[p->context_tree[PIVOT_NODE] - 1],
+                          v, n - len, 0);
+        }
+      }
     }
 
     if (b->base_val) {
       const int e = p->extra, l = b->len;
-#if CONFIG_MISC_FIXES
       int skip_bits =
           (b->base_val == CAT6_MIN_VAL) ? TX_SIZES - 1 - tx : 0;
-#else
-      int skip_bits = 0;
-#endif
 
       if (l) {
         const unsigned char *pb = b->prob;
@@ -282,11 +524,68 @@
       vpx_write_bit(w, e & 1);
     }
     ++p;
+
+#if CONFIG_VAR_TX
+    ++count;
+    if (t == EOB_TOKEN || count == seg_eob)
+      break;
+#endif
   }
 
   *tp = p;
 }
 
+#if CONFIG_VAR_TX
+static void pack_txb_tokens(vpx_writer *w,
+                           TOKENEXTRA **tp, const TOKENEXTRA *const tok_end,
+                           MACROBLOCKD *xd, MB_MODE_INFO *mbmi, int plane,
+                           BLOCK_SIZE plane_bsize,
+                           vpx_bit_depth_t bit_depth,
+                           int block,
+                           int blk_row, int blk_col, TX_SIZE tx_size) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+  const int tx_idx = (blk_row >> (1 - pd->subsampling_y)) * 8 +
+                     (blk_col >> (1 - pd->subsampling_x));
+  const TX_SIZE plane_tx_size = plane ?
+      get_uv_tx_size_impl(mbmi->inter_tx_size[tx_idx], bsize, 0, 0) :
+      mbmi->inter_tx_size[tx_idx];
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+  int bsl, i;
+
+  // Tighten the limits when mb_to_bottom_edge / mb_to_right_edge is negative.
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  // Sizes match: this position is a leaf, so its tokens are written here.
+  if (tx_size == plane_tx_size) {
+    pack_mb_tokens(w, tp, tok_end, bit_depth, tx_size);
+    return;
+  }
+
+  // Otherwise recurse into the four quadrants, row by row, at tx_size - 1.
+  bsl = b_width_log2_lookup[bsize];
+  assert(bsl > 0);
+  --bsl;
+  for (i = 0; i < 4; ++i) {
+    const int offsetr = blk_row + ((i >> 1) << bsl);
+    const int offsetc = blk_col + ((i & 0x01) << bsl);
+    const int step = 1 << (2 * (tx_size - 1));
+
+    if (offsetr < max_blocks_high && offsetc < max_blocks_wide)
+      pack_txb_tokens(w, tp, tok_end, xd, mbmi, plane,
+                      plane_bsize, bit_depth, block + i * step,
+                      offsetr, offsetc, tx_size - 1);
+  }
+}
+#endif
+
 static void write_segment_id(vpx_writer *w, const struct segmentation *seg,
                              const struct segmentation_probs *segp,
                              int segment_id) {
@@ -317,31 +616,121 @@
     }
 
     if (is_compound) {
-      vpx_write(w, mbmi->ref_frame[0] == GOLDEN_FRAME,
-                vp10_get_pred_prob_comp_ref_p(cm, xd));
+#if CONFIG_EXT_REFS
+      const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME ||
+                       mbmi->ref_frame[0] == LAST3_FRAME ||
+                       mbmi->ref_frame[0] == LAST4_FRAME);
+#else
+      const int bit = mbmi->ref_frame[0] == GOLDEN_FRAME;
+#endif  // CONFIG_EXT_REFS
+      vpx_write(w, bit, vp10_get_pred_prob_comp_ref_p(cm, xd));
+
+#if CONFIG_EXT_REFS
+      if (!bit) {
+        const int bit1 = mbmi->ref_frame[0] == LAST_FRAME;
+        vpx_write(w, bit1, vp10_get_pred_prob_comp_ref_p1(cm, xd));
+      } else {
+        const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME;
+        vpx_write(w, bit2, vp10_get_pred_prob_comp_ref_p2(cm, xd));
+        if (!bit2) {
+          const int bit3 = mbmi->ref_frame[0] == LAST3_FRAME;
+          vpx_write(w, bit3, vp10_get_pred_prob_comp_ref_p3(cm, xd));
+        }
+      }
+#endif  // CONFIG_EXT_REFS
     } else {
+#if CONFIG_EXT_REFS
+      const int bit0 = (mbmi->ref_frame[0] == GOLDEN_FRAME ||
+                        mbmi->ref_frame[0] == ALTREF_FRAME);
+      vpx_write(w, bit0, vp10_get_pred_prob_single_ref_p1(cm, xd));
+
+      if (bit0) {
+        const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME;
+        vpx_write(w, bit1, vp10_get_pred_prob_single_ref_p2(cm, xd));
+      } else {
+        const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME ||
+                          mbmi->ref_frame[0] == LAST4_FRAME);
+        vpx_write(w, bit2, vp10_get_pred_prob_single_ref_p3(cm, xd));
+
+        if (!bit2) {
+          const int bit3 = mbmi->ref_frame[0] != LAST_FRAME;
+          vpx_write(w, bit3, vp10_get_pred_prob_single_ref_p4(cm, xd));
+        } else {
+          const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME;
+          vpx_write(w, bit4, vp10_get_pred_prob_single_ref_p5(cm, xd));
+        }
+      }
+#else
       const int bit0 = mbmi->ref_frame[0] != LAST_FRAME;
       vpx_write(w, bit0, vp10_get_pred_prob_single_ref_p1(cm, xd));
       if (bit0) {
         const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME;
         vpx_write(w, bit1, vp10_get_pred_prob_single_ref_p2(cm, xd));
       }
+#endif  // CONFIG_EXT_REFS
     }
   }
 }
 
+#if CONFIG_EXT_INTRA
+// Codes the ext-intra flag and mode for entry [0] (mode) and [1] (uv_mode).
+static void write_ext_intra_mode_info(const VP10_COMMON *const cm,
+                                      const MB_MODE_INFO *const mbmi,
+                                      vpx_writer *w) {
+#if !ALLOW_FILTER_INTRA_MODES
+  return;  // nothing is written when ALLOW_FILTER_INTRA_MODES is 0
+#endif
+  if (mbmi->mode == DC_PRED) {  // entry [0] is coded only for DC_PRED
+    const int use_ext = mbmi->ext_intra_mode_info.use_ext_intra_mode[0];
+    vpx_write(w, use_ext, cm->fc->ext_intra_probs[0]);
+    if (use_ext)
+      write_uniform(w, FILTER_INTRA_MODES,
+                    mbmi->ext_intra_mode_info.ext_intra_mode[0]);
+  }
+
+  if (mbmi->uv_mode == DC_PRED) {  // entry [1] follows the same pattern
+    const int use_ext = mbmi->ext_intra_mode_info.use_ext_intra_mode[1];
+    vpx_write(w, use_ext, cm->fc->ext_intra_probs[1]);
+    if (use_ext)
+      write_uniform(w, FILTER_INTRA_MODES,
+                    mbmi->ext_intra_mode_info.ext_intra_mode[1]);
+  }
+}
+#endif  // CONFIG_EXT_INTRA
+
+// Codes mbmi->interp_filter; a no-op unless the frame filter is SWITCHABLE.
+static void write_switchable_interp_filter(VP10_COMP *cpi,
+                                           const MACROBLOCKD *xd,
+                                           vpx_writer *w) {
+  VP10_COMMON *const cm = &cpi->common;
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  int ctx;
+  if (cm->interp_filter != SWITCHABLE) return;
+  ctx = vp10_get_pred_context_switchable_interp(xd);
+#if CONFIG_EXT_INTERP
+  if (!vp10_is_interp_needed(xd)) {
+    // Nothing is written in this case; the block is expected to use EIGHTTAP.
+    assert(mbmi->interp_filter == EIGHTTAP);
+    return;
+  }
+#endif
+  vp10_write_token(w, vp10_switchable_interp_tree,
+                   cm->fc->switchable_interp_prob[ctx],
+                   &switchable_interp_encodings[mbmi->interp_filter]);
+  ++cpi->interp_filter_selected[0][mbmi->interp_filter];
+}
+
 static void pack_inter_mode_mvs(VP10_COMP *cpi, const MODE_INFO *mi,
+#if CONFIG_SUPERTX
+                                int supertx_enabled,
+#endif
                                 vpx_writer *w) {
   VP10_COMMON *const cm = &cpi->common;
   const nmv_context *nmvc = &cm->fc->nmvc;
-  const MACROBLOCK *const x = &cpi->td.mb;
-  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MACROBLOCK *x = &cpi->td.mb;
+  const MACROBLOCKD *xd = &x->e_mbd;
   const struct segmentation *const seg = &cm->seg;
-#if CONFIG_MISC_FIXES
   const struct segmentation_probs *const segp = &cm->fc->seg;
-#else
-  const struct segmentation_probs *const segp = &cm->segp;
-#endif
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
   const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const PREDICTION_MODE mode = mbmi->mode;
@@ -364,19 +753,60 @@
     }
   }
 
+#if CONFIG_SUPERTX
+  if (supertx_enabled)
+    skip = mbmi->skip;
+  else
+    skip = write_skip(cm, xd, segment_id, mi, w);
+#else
   skip = write_skip(cm, xd, segment_id, mi, w);
+#endif  // CONFIG_SUPERTX
 
-  if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
-    vpx_write(w, is_inter, vp10_get_intra_inter_prob(cm, xd));
+#if CONFIG_SUPERTX
+  if (!supertx_enabled)
+#endif  // CONFIG_SUPERTX
+    if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+      vpx_write(w, is_inter, vp10_get_intra_inter_prob(cm, xd));
 
   if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
+#if CONFIG_SUPERTX
+      !supertx_enabled &&
+#endif  // CONFIG_SUPERTX
       !(is_inter && skip) && !xd->lossless[segment_id]) {
-    write_selected_tx_size(cm, xd, w);
+#if CONFIG_VAR_TX
+    if (is_inter) {  // This implies skip flag is 0.
+      const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+      const int txb_size = txsize_to_bsize[max_tx_size];
+      const int bs = num_4x4_blocks_wide_lookup[txb_size];
+      const int width  = num_4x4_blocks_wide_lookup[bsize];
+      const int height = num_4x4_blocks_high_lookup[bsize];
+      int idx, idy;
+      for (idy = 0; idy < height; idy += bs)
+        for (idx = 0; idx < width; idx += bs)
+          write_tx_size_inter(cm, xd, mbmi, max_tx_size, idy, idx, w);
+    } else {
+      set_txfm_ctx(xd->left_txfm_context, mbmi->tx_size, xd->n8_h);
+      set_txfm_ctx(xd->above_txfm_context, mbmi->tx_size, xd->n8_w);
+
+      write_selected_tx_size(cm, xd, w);
+    }
+  } else {
+    set_txfm_ctx(xd->left_txfm_context, mbmi->tx_size, xd->n8_h);
+    set_txfm_ctx(xd->above_txfm_context, mbmi->tx_size, xd->n8_w);
+#else
+  write_selected_tx_size(cm, xd, w);
+#endif
   }
 
   if (!is_inter) {
     if (bsize >= BLOCK_8X8) {
       write_intra_mode(w, mode, cm->fc->y_mode_prob[size_group_lookup[bsize]]);
+#if CONFIG_EXT_INTRA
+      if (mode != DC_PRED && mode != TM_PRED) {
+        write_uniform(w, 2 * MAX_ANGLE_DELTAS + 1,
+                      MAX_ANGLE_DELTAS + mbmi->angle_delta[0]);
+      }
+#endif  // CONFIG_EXT_INTRA
     } else {
       int idx, idy;
       const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
@@ -389,27 +819,34 @@
       }
     }
     write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mode]);
+#if CONFIG_EXT_INTRA
+    if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED &&
+        bsize >= BLOCK_8X8)
+      write_uniform(w, 2 * MAX_ANGLE_DELTAS + 1,
+                    MAX_ANGLE_DELTAS + mbmi->angle_delta[1]);
+
+    if (bsize >= BLOCK_8X8)
+      write_ext_intra_mode_info(cm, mbmi, w);
+#endif  // CONFIG_EXT_INTRA
   } else {
-    const int mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
-    const vpx_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx];
+    int16_t mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
     write_ref_frames(cm, xd, w);
 
+#if CONFIG_REF_MV
+    mode_ctx = vp10_mode_context_analyzer(mbmi_ext->mode_context,
+                                          mbmi->ref_frame, bsize, -1);
+#endif
+
     // If segment skip is not enabled code the mode.
     if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
       if (bsize >= BLOCK_8X8) {
-        write_inter_mode(w, mode, inter_probs);
+        write_inter_mode(cm, w, mode, mode_ctx);
       }
     }
 
-    if (cm->interp_filter == SWITCHABLE) {
-      const int ctx = vp10_get_pred_context_switchable_interp(xd);
-      vp10_write_token(w, vp10_switchable_interp_tree,
-                      cm->fc->switchable_interp_prob[ctx],
-                      &switchable_interp_encodings[mbmi->interp_filter]);
-      ++cpi->interp_filter_selected[0][mbmi->interp_filter];
-    } else {
-      assert(mbmi->interp_filter == cm->interp_filter);
-    }
+#if !CONFIG_EXT_INTERP
+    write_switchable_interp_filter(cpi, xd, w);
+#endif  // !CONFIG_EXT_INTERP
 
     if (bsize < BLOCK_8X8) {
       const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
@@ -419,7 +856,11 @@
         for (idx = 0; idx < 2; idx += num_4x4_w) {
           const int j = idy * 2 + idx;
           const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
-          write_inter_mode(w, b_mode, inter_probs);
+#if CONFIG_REF_MV
+          mode_ctx = vp10_mode_context_analyzer(mbmi_ext->mode_context,
+                                                mbmi->ref_frame, bsize, j);
+#endif
+          write_inter_mode(cm, w, b_mode, mode_ctx);
           if (b_mode == NEWMV) {
             for (ref = 0; ref < 1 + is_compound; ++ref)
               vp10_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
@@ -436,9 +877,38 @@
                         allow_hp);
       }
     }
+#if CONFIG_EXT_INTERP
+    write_switchable_interp_filter(cpi, xd, w);
+#endif  // CONFIG_EXT_INTERP
   }
+
+#if CONFIG_EXT_TX
+  if (get_ext_tx_types(mbmi->tx_size, bsize, is_inter) > 1 &&
+      cm->base_qindex > 0 && !mbmi->skip &&
+#if CONFIG_SUPERTX
+      !supertx_enabled &&
+#endif  // CONFIG_SUPERTX
+      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+    int eset = get_ext_tx_set(mbmi->tx_size, bsize, is_inter);
+    if (is_inter) {
+      if (eset > 0)
+        vp10_write_token(w, vp10_ext_tx_inter_tree[eset],
+                         cm->fc->inter_ext_tx_prob[eset][mbmi->tx_size],
+                         &ext_tx_inter_encodings[eset][mbmi->tx_type]);
+    } else if (ALLOW_INTRA_EXT_TX) {
+      if (eset > 0)
+        vp10_write_token(
+            w, vp10_ext_tx_intra_tree[eset],
+            cm->fc->intra_ext_tx_prob[eset][mbmi->tx_size][mbmi->mode],
+            &ext_tx_intra_encodings[eset][mbmi->tx_type]);
+    }
+  }
+#else
   if (mbmi->tx_size < TX_32X32 &&
       cm->base_qindex > 0 && !mbmi->skip &&
+#if CONFIG_SUPERTX
+      !supertx_enabled &&
+#endif  // CONFIG_SUPERTX
       !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     if (is_inter) {
       vp10_write_token(
@@ -456,16 +926,43 @@
     if (!mbmi->skip)
       assert(mbmi->tx_type == DCT_DCT);
   }
+#endif  // CONFIG_EXT_TX
+}
+
+// Writes the palette on/off bit; when on, also the size, colors, first index.
+static void write_palette_mode_info(const VP10_COMMON *cm,
+                                    const MACROBLOCKD *xd,
+                                    const MODE_INFO *const mi,
+                                    vpx_writer *w) {
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const MODE_INFO *const left_mi = xd->left_mi;
+  const int bsize_ctx = mbmi->sb_type - BLOCK_8X8;
+  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const int n = pmi->palette_size[0];
+  int palette_ctx = 0;  // counts neighbours (above, left) with palette_size > 0
+  int i;
+
+  if (above_mi && above_mi->mbmi.palette_mode_info.palette_size[0] > 0)
+    ++palette_ctx;
+  if (left_mi && left_mi->mbmi.palette_mode_info.palette_size[0] > 0)
+    ++palette_ctx;
+  vpx_write(w, n > 0,
+            vp10_default_palette_y_mode_prob[bsize_ctx][palette_ctx]);
+  if (n > 0) {
+    vp10_write_token(w, vp10_palette_size_tree,
+                     vp10_default_palette_y_size_prob[bsize_ctx],
+                     &palette_size_encodings[n - 2]);
+    for (i = 0; i < n; ++i)
+      vpx_write_literal(w, pmi->palette_colors[i], cm->bit_depth);
+    write_uniform(w, n, pmi->palette_first_color_idx[0]);
+  }
 }
 
 static void write_mb_modes_kf(const VP10_COMMON *cm, const MACROBLOCKD *xd,
                               MODE_INFO **mi_8x8, vpx_writer *w) {
   const struct segmentation *const seg = &cm->seg;
-#if CONFIG_MISC_FIXES
   const struct segmentation_probs *const segp = &cm->fc->seg;
-#else
-  const struct segmentation_probs *const segp = &cm->segp;
-#endif
   const MODE_INFO *const mi = mi_8x8[0];
   const MODE_INFO *const above_mi = xd->above_mi;
   const MODE_INFO *const left_mi = xd->left_mi;
@@ -484,6 +981,11 @@
   if (bsize >= BLOCK_8X8) {
     write_intra_mode(w, mbmi->mode,
                      get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
+#if CONFIG_EXT_INTRA
+    if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
+      write_uniform(w, 2 * MAX_ANGLE_DELTAS + 1,
+                    MAX_ANGLE_DELTAS + mbmi->angle_delta[0]);
+#endif  // CONFIG_EXT_INTRA
   } else {
     const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
     const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
@@ -499,7 +1001,31 @@
   }
 
   write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mbmi->mode]);
+#if CONFIG_EXT_INTRA
+  if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED &&
+      bsize >= BLOCK_8X8)
+    write_uniform(w, 2 * MAX_ANGLE_DELTAS + 1,
+                  MAX_ANGLE_DELTAS + mbmi->angle_delta[1]);
+#endif  // CONFIG_EXT_INTRA
 
+  if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools &&
+      mbmi->mode == DC_PRED)
+    write_palette_mode_info(cm, xd, mi, w);
+
+
+#if CONFIG_EXT_TX
+  if (get_ext_tx_types(mbmi->tx_size, bsize, 0) > 1 &&
+      cm->base_qindex > 0 && !mbmi->skip &&
+      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
+      ALLOW_INTRA_EXT_TX) {
+    int eset = get_ext_tx_set(mbmi->tx_size, bsize, 0);
+    if (eset > 0)
+      vp10_write_token(
+          w, vp10_ext_tx_intra_tree[eset],
+          cm->fc->intra_ext_tx_prob[eset][mbmi->tx_size][mbmi->mode],
+          &ext_tx_intra_encodings[eset][mbmi->tx_type]);
+  }
+#else
   if (mbmi->tx_size < TX_32X32 &&
       cm->base_qindex > 0 && !mbmi->skip &&
       !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
@@ -509,11 +1035,20 @@
                                  [intra_mode_to_tx_type_context[mbmi->mode]],
         &ext_tx_encodings[mbmi->tx_type]);
   }
+#endif  // CONFIG_EXT_TX
+
+#if CONFIG_EXT_INTRA
+  if (bsize >= BLOCK_8X8)
+      write_ext_intra_mode_info(cm, mbmi, w);
+#endif  // CONFIG_EXT_INTRA
 }
 
 static void write_modes_b(VP10_COMP *cpi, const TileInfo *const tile,
                           vpx_writer *w, TOKENEXTRA **tok,
                           const TOKENEXTRA *const tok_end,
+#if CONFIG_SUPERTX
+                          int supertx_enabled,
+#endif
                           int mi_row, int mi_col) {
   const VP10_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
@@ -532,15 +1067,70 @@
   if (frame_is_intra_only(cm)) {
     write_mb_modes_kf(cm, xd, xd->mi, w);
   } else {
-    pack_inter_mode_mvs(cpi, m, w);
+#if CONFIG_VAR_TX
+    xd->above_txfm_context = cm->above_txfm_context + mi_col;
+    xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+#endif
+    pack_inter_mode_mvs(cpi, m,
+#if CONFIG_SUPERTX
+                        supertx_enabled,
+#endif
+                        w);
   }
 
+  if (m->mbmi.palette_mode_info.palette_size[0] > 0) {
+    assert(*tok < tok_end);
+    pack_palette_tokens(w, tok, m->mbmi.sb_type,
+                        m->mbmi.palette_mode_info.palette_size[0]);
+    assert(*tok < tok_end);
+  }
+
+#if CONFIG_SUPERTX
+  if (supertx_enabled) return;
+#endif  // CONFIG_SUPERTX
+
   if (!m->mbmi.skip) {
     assert(*tok < tok_end);
     for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_VAR_TX
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      MB_MODE_INFO *mbmi = &m->mbmi;
+      BLOCK_SIZE bsize = mbmi->sb_type;
+      const BLOCK_SIZE plane_bsize =
+          get_plane_block_size(VPXMAX(bsize, BLOCK_8X8), pd);
+
+      const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+      const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+      int row, col;
+
+      if (is_inter_block(mbmi)) {
+        const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+        const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+        int bw = num_4x4_blocks_wide_lookup[txb_size];
+        int block = 0;
+        const int step = 1 << (max_tx_size << 1);
+        for (row = 0; row < num_4x4_h; row += bw) {
+          for (col = 0; col < num_4x4_w; col += bw) {
+            pack_txb_tokens(w, tok, tok_end, xd, mbmi, plane, plane_bsize,
+                            cm->bit_depth, block, row, col, max_tx_size);
+            block += step;
+          }
+        }
+      } else {
+        TX_SIZE tx = plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane])
+                           : m->mbmi.tx_size;
+        BLOCK_SIZE txb_size = txsize_to_bsize[tx];
+        int bw = num_4x4_blocks_wide_lookup[txb_size];
+
+        for (row = 0; row < num_4x4_h; row += bw)
+          for (col = 0; col < num_4x4_w; col += bw)
+            pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
+      }
+#else
       TX_SIZE tx = plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane])
                          : m->mbmi.tx_size;
       pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
+#endif  // CONFIG_VAR_TX
       assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
       (*tok)++;
     }
@@ -572,6 +1162,9 @@
 static void write_modes_sb(VP10_COMP *cpi,
                            const TileInfo *const tile, vpx_writer *w,
                            TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
+#if CONFIG_SUPERTX
+                           int supertx_enabled,
+#endif
                            int mi_row, int mi_col, BLOCK_SIZE bsize) {
   const VP10_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
@@ -580,7 +1173,12 @@
   const int bs = (1 << bsl) / 4;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
-  const MODE_INFO *m = NULL;
+  MODE_INFO *m = NULL;
+#if CONFIG_SUPERTX
+  const int pack_token = !supertx_enabled;
+  TX_SIZE supertx_size;
+  int plane;
+#endif
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
@@ -590,36 +1188,118 @@
   partition = partition_lookup[bsl][m->mbmi.sb_type];
   write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
   subsize = get_subsize(bsize, partition);
+#if CONFIG_SUPERTX
+  xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
+  set_mi_row_col(xd, tile,
+                 mi_row, num_8x8_blocks_high_lookup[bsize],
+                 mi_col, num_8x8_blocks_wide_lookup[bsize],
+                 cm->mi_rows, cm->mi_cols);
+  if (!supertx_enabled &&
+      !frame_is_intra_only(cm) &&
+      partition != PARTITION_NONE && bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+      !xd->lossless[0]) {
+    vpx_prob prob;
+    supertx_size = max_txsize_lookup[bsize];
+    prob = cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
+                               [supertx_size];
+    supertx_enabled = (xd->mi[0]->mbmi.tx_size == supertx_size);
+    vpx_write(w, supertx_enabled, prob);
+    if (supertx_enabled) {
+      vpx_write(w, xd->mi[0]->mbmi.skip, vp10_get_skip_prob(cm, xd));
+#if CONFIG_EXT_TX
+      if (supertx_size <= TX_16X16 && !xd->mi[0]->mbmi.skip) {
+        int eset = get_ext_tx_set(supertx_size, bsize, 1);
+        if (eset > 0) {
+          vp10_write_token(
+              w, vp10_ext_tx_inter_tree[eset],
+              cm->fc->inter_ext_tx_prob[eset][supertx_size],
+              &ext_tx_inter_encodings[eset][xd->mi[0]->mbmi.tx_type]);
+        }
+      }
+#endif  // CONFIG_EXT_TX
+    }
+  }
+#endif  // CONFIG_SUPERTX
   if (subsize < BLOCK_8X8) {
-    write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+    write_modes_b(cpi, tile, w, tok, tok_end,
+#if CONFIG_SUPERTX
+                  supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                  mi_row, mi_col);
   } else {
     switch (partition) {
       case PARTITION_NONE:
-        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+        write_modes_b(cpi, tile, w, tok, tok_end,
+#if CONFIG_SUPERTX
+                      supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                      mi_row, mi_col);
         break;
       case PARTITION_HORZ:
-        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+        write_modes_b(cpi, tile, w, tok, tok_end,
+#if CONFIG_SUPERTX
+                      supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                      mi_row, mi_col);
         if (mi_row + bs < cm->mi_rows)
-          write_modes_b(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col);
+          write_modes_b(cpi, tile, w, tok, tok_end,
+#if CONFIG_SUPERTX
+                        supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                        mi_row + bs, mi_col);
         break;
       case PARTITION_VERT:
-        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+        write_modes_b(cpi, tile, w, tok, tok_end,
+#if CONFIG_SUPERTX
+                      supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                      mi_row, mi_col);
         if (mi_col + bs < cm->mi_cols)
-          write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs);
+          write_modes_b(cpi, tile, w, tok, tok_end,
+#if CONFIG_SUPERTX
+                        supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                        mi_row, mi_col + bs);
         break;
       case PARTITION_SPLIT:
-        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
-        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs,
-                       subsize);
-        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col,
-                       subsize);
-        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col + bs,
-                       subsize);
+        write_modes_sb(cpi, tile, w, tok, tok_end,
+#if CONFIG_SUPERTX
+                       supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                       mi_row, mi_col, subsize);
+        write_modes_sb(cpi, tile, w, tok, tok_end,
+#if CONFIG_SUPERTX
+                       supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                       mi_row, mi_col + bs, subsize);
+        write_modes_sb(cpi, tile, w, tok, tok_end,
+#if CONFIG_SUPERTX
+                       supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                       mi_row + bs, mi_col, subsize);
+        write_modes_sb(cpi, tile, w, tok, tok_end,
+#if CONFIG_SUPERTX
+                       supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                       mi_row + bs, mi_col + bs, subsize);
         break;
       default:
         assert(0);
     }
   }
+#if CONFIG_SUPERTX
+  if (partition != PARTITION_NONE && supertx_enabled && pack_token &&
+      !m->mbmi.skip) {
+    assert(*tok < tok_end);
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      TX_SIZE tx = plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane])
+                         : m->mbmi.tx_size;
+      pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
+      assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+      (*tok)++;
+    }
+  }
+#endif  // CONFIG_SUPERTX
 
   // update partition context
   if (bsize >= BLOCK_8X8 &&
@@ -636,10 +1316,16 @@
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
        mi_row += MI_BLOCK_SIZE) {
     vp10_zero(xd->left_seg_context);
+#if CONFIG_VAR_TX
+    vp10_zero(xd->left_txfm_context_buffer);
+#endif
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE)
-      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col,
-                     BLOCK_64X64);
+      write_modes_sb(cpi, tile, w, tok, tok_end,
+#if CONFIG_SUPERTX
+                     0,
+#endif
+                     mi_row, mi_col, BLOCK_64X64);
   }
 }
 
@@ -714,7 +1400,6 @@
         }
       }
 
-      // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
       /* Is coef updated at all */
       if (update[1] == 0 || savings < 0) {
         vpx_write_bit(bc, 0);
@@ -875,7 +1560,7 @@
 static void write_delta_q(struct vpx_write_bit_buffer *wb, int delta_q) {
   if (delta_q != 0) {
     vpx_wb_write_bit(wb, 1);
-    vpx_wb_write_inv_signed_literal(wb, delta_q, CONFIG_MISC_FIXES ? 6 : 4);
+    vpx_wb_write_inv_signed_literal(wb, delta_q, 6);
   } else {
     vpx_wb_write_bit(wb, 0);
   }
@@ -892,11 +1577,7 @@
 static void encode_segmentation(VP10_COMMON *cm, MACROBLOCKD *xd,
                                 struct vpx_write_bit_buffer *wb) {
   int i, j;
-
   const struct segmentation *seg = &cm->seg;
-#if !CONFIG_MISC_FIXES
-  const struct segmentation_probs *segp = &cm->segp;
-#endif
 
   vpx_wb_write_bit(wb, seg->enabled);
   if (!seg->enabled)
@@ -911,16 +1592,6 @@
   if (seg->update_map) {
     // Select the coding strategy (temporal or spatial)
     vp10_choose_segmap_coding_method(cm, xd);
-#if !CONFIG_MISC_FIXES
-    // Write out probabilities used to decode unpredicted  macro-block segments
-    for (i = 0; i < SEG_TREE_PROBS; i++) {
-      const int prob = segp->tree_probs[i];
-      const int update = prob != MAX_PROB;
-      vpx_wb_write_bit(wb, update);
-      if (update)
-        vpx_wb_write_literal(wb, prob, 8);
-    }
-#endif
 
     // Write out the chosen coding method.
     if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) {
@@ -928,18 +1599,6 @@
     } else {
       assert(seg->temporal_update == 0);
     }
-
-#if !CONFIG_MISC_FIXES
-    if (seg->temporal_update) {
-      for (i = 0; i < PREDICTION_PROBS; i++) {
-        const int prob = segp->pred_probs[i];
-        const int update = prob != MAX_PROB;
-        vpx_wb_write_bit(wb, update);
-        if (update)
-          vpx_wb_write_literal(wb, prob, 8);
-      }
-    }
-#endif
   }
 
   // Segmentation data
@@ -967,7 +1626,6 @@
   }
 }
 
-#if CONFIG_MISC_FIXES
 static void update_seg_probs(VP10_COMP *cpi, vpx_writer *w) {
   VP10_COMMON *cm = &cpi->common;
 
@@ -994,18 +1652,10 @@
   if (mode != TX_MODE_SELECT)
     vpx_wb_write_literal(wb, mode, 2);
 }
-#else
-static void write_txfm_mode(TX_MODE mode, struct vpx_writer *wb) {
-  vpx_write_literal(wb, VPXMIN(mode, ALLOW_32X32), 2);
-  if (mode >= ALLOW_32X32)
-    vpx_write_bit(wb, mode == TX_MODE_SELECT);
-}
-#endif
 
 
 static void update_txfm_probs(VP10_COMMON *cm, vpx_writer *w,
                               FRAME_COUNTS *counts) {
-
   if (cm->tx_mode == TX_MODE_SELECT) {
     int i, j;
     unsigned int ct_8x8p[TX_SIZES - 3][2];
@@ -1039,7 +1689,7 @@
                                 struct vpx_write_bit_buffer *wb) {
   vpx_wb_write_bit(wb, filter == SWITCHABLE);
   if (filter != SWITCHABLE)
-    vpx_wb_write_literal(wb, filter, 2);
+    vpx_wb_write_literal(wb, filter, 2 + CONFIG_EXT_INTERP);
 }
 
 static void fix_interp_filter(VP10_COMMON *cm, FRAME_COUNTS *counts) {
@@ -1085,6 +1735,17 @@
 }
 
 static int get_refresh_mask(VP10_COMP *cpi) {
+  int refresh_mask = 0;
+#if CONFIG_EXT_REFS
+  int ref_frame;
+  for (ref_frame = LAST_FRAME; ref_frame <= LAST4_FRAME; ++ref_frame) {
+    refresh_mask |= (cpi->refresh_last_frames[ref_frame - LAST_FRAME] <<
+                     cpi->lst_fb_idxes[ref_frame - LAST_FRAME]);
+  }
+#else
+  refresh_mask = cpi->refresh_last_frame << cpi->lst_fb_idx;
+#endif  // CONFIG_EXT_REFS
+
   if (vp10_preserve_existing_gf(cpi)) {
     // We have decided to preserve the previously existing golden frame as our
     // new ARF frame. However, in the short term we leave it in the GF slot and,
@@ -1096,15 +1757,14 @@
     // Note: This is highly specific to the use of ARF as a forward reference,
     // and this needs to be generalized as other uses are implemented
     // (like RTC/temporal scalability).
-    return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
-           (cpi->refresh_golden_frame << cpi->alt_fb_idx);
+    return refresh_mask | (cpi->refresh_golden_frame << cpi->alt_fb_idx);
   } else {
     int arf_idx = cpi->alt_fb_idx;
     if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
       const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
       arf_idx = gf_group->arf_update_idx[gf_group->index];
     }
-    return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
+    return refresh_mask |
            (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
            (cpi->refresh_alt_ref_frame << arf_idx);
   }
@@ -1123,6 +1783,10 @@
 
   memset(cm->above_seg_context, 0,
          sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
+#if CONFIG_VAR_TX
+  memset(cm->above_txfm_context, 0,
+         sizeof(*cm->above_txfm_context) * mi_cols_aligned_to_sb(cm->mi_cols));
+#endif
 
   for (tile_row = 0; tile_row < tile_rows; tile_row++) {
     for (tile_col = 0; tile_col < tile_cols; tile_col++) {
@@ -1146,7 +1810,7 @@
 
         // size of this tile
         assert(residual_bc.pos > 0);
-        tile_sz = residual_bc.pos - CONFIG_MISC_FIXES;
+        tile_sz = residual_bc.pos - 1;
         mem_put_le32(data_ptr + total_size, tile_sz);
         max_tile = max_tile > tile_sz ? max_tile : tile_sz;
         total_size += 4;
@@ -1191,10 +1855,8 @@
     if (cfg != NULL) {
       found = cm->width == cfg->y_crop_width &&
               cm->height == cfg->y_crop_height;
-#if CONFIG_MISC_FIXES
       found &= cm->render_width == cfg->render_width &&
                cm->render_height == cfg->render_height;
-#endif
     }
     vpx_wb_write_bit(wb, found);
     if (found) {
@@ -1205,15 +1867,8 @@
   if (!found) {
     vpx_wb_write_literal(wb, cm->width - 1, 16);
     vpx_wb_write_literal(wb, cm->height - 1, 16);
-
-#if CONFIG_MISC_FIXES
     write_render_size(cm, wb);
-#endif
   }
-
-#if !CONFIG_MISC_FIXES
-  write_render_size(cm, wb);
-#endif
 }
 
 static void write_sync_code(struct vpx_write_bit_buffer *wb) {
@@ -1284,12 +1939,13 @@
     write_sync_code(wb);
     write_bitdepth_colorspace_sampling(cm, wb);
     write_frame_size(cm, wb);
+    if (frame_is_intra_only(cm))
+      vpx_wb_write_bit(wb, cm->allow_screen_content_tools);
   } else {
     if (!cm->show_frame)
       vpx_wb_write_bit(wb, cm->intra_only);
 
     if (!cm->error_resilient_mode) {
-#if CONFIG_MISC_FIXES
       if (cm->intra_only) {
         vpx_wb_write_bit(wb,
                          cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
@@ -1300,25 +1956,11 @@
           vpx_wb_write_bit(wb,
                            cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
       }
-#else
-      static const int reset_frame_context_conv_tbl[3] = { 0, 2, 3 };
-
-      vpx_wb_write_literal(wb,
-          reset_frame_context_conv_tbl[cm->reset_frame_context], 2);
-#endif
     }
 
     if (cm->intra_only) {
       write_sync_code(wb);
-
-#if CONFIG_MISC_FIXES
       write_bitdepth_colorspace_sampling(cm, wb);
-#else
-      // Note for profile 0, 420 8bpp is assumed.
-      if (cm->profile > PROFILE_0) {
-        write_bitdepth_colorspace_sampling(cm, wb);
-      }
-#endif
 
       vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
       write_frame_size(cm, wb);
@@ -1344,11 +1986,9 @@
   if (!cm->error_resilient_mode) {
     vpx_wb_write_bit(wb,
                      cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF);
-#if CONFIG_MISC_FIXES
     if (cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF)
-#endif
       vpx_wb_write_bit(wb, cm->refresh_frame_context !=
-                               REFRESH_FRAME_CONTEXT_BACKWARD);
+          REFRESH_FRAME_CONTEXT_BACKWARD);
   }
 
   vpx_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
@@ -1356,7 +1996,6 @@
   encode_loopfilter(&cm->lf, wb);
   encode_quantization(cm, wb);
   encode_segmentation(cm, xd, wb);
-#if CONFIG_MISC_FIXES
   if (!cm->seg.enabled && xd->lossless[0])
     cm->tx_mode = TX_4X4;
   else
@@ -1369,36 +2008,29 @@
     if (!use_hybrid_pred)
       vpx_wb_write_bit(wb, use_compound_pred);
   }
-#endif
 
   write_tile_info(cm, wb);
 }
 
 static size_t write_compressed_header(VP10_COMP *cpi, uint8_t *data) {
   VP10_COMMON *const cm = &cpi->common;
+#if CONFIG_SUPERTX
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+#endif  // CONFIG_SUPERTX
   FRAME_CONTEXT *const fc = cm->fc;
   FRAME_COUNTS *counts = cpi->td.counts;
   vpx_writer header_bc;
-  int i;
-#if CONFIG_MISC_FIXES
-  int j;
-#endif
+  int i, j;
 
   vpx_start_encode(&header_bc, data);
-
-#if !CONFIG_MISC_FIXES
-  if (cpi->td.mb.e_mbd.lossless[0]) {
-    cm->tx_mode = TX_4X4;
-  } else {
-    write_txfm_mode(cm->tx_mode, &header_bc);
-    update_txfm_probs(cm, &header_bc, counts);
-  }
-#else
   update_txfm_probs(cm, &header_bc, counts);
-#endif
   update_coef_probs(cpi, &header_bc);
+
+#if CONFIG_VAR_TX
+  update_txfm_partition_probs(cm, &header_bc, counts);
+#endif
+
   update_skip_probs(cm, &header_bc, counts);
-#if CONFIG_MISC_FIXES
   update_seg_probs(cpi, &header_bc);
 
   for (i = 0; i < INTRA_MODES; ++i)
@@ -1408,20 +2040,21 @@
   for (i = 0; i < PARTITION_CONTEXTS; ++i)
     prob_diff_update(vp10_partition_tree, fc->partition_prob[i],
                      counts->partition[i], PARTITION_TYPES, &header_bc);
-#endif
 
   if (frame_is_intra_only(cm)) {
     vp10_copy(cm->kf_y_prob, vp10_kf_y_mode_prob);
-#if CONFIG_MISC_FIXES
     for (i = 0; i < INTRA_MODES; ++i)
       for (j = 0; j < INTRA_MODES; ++j)
         prob_diff_update(vp10_intra_mode_tree, cm->kf_y_prob[i][j],
                          counts->kf_y_mode[i][j], INTRA_MODES, &header_bc);
-#endif
   } else {
+#if CONFIG_REF_MV
+    update_inter_mode_probs(cm, &header_bc, counts);
+#else
     for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
       prob_diff_update(vp10_inter_mode_tree, cm->fc->inter_mode_probs[i],
                        counts->inter_mode[i], INTER_MODES, &header_bc);
+#endif
 
     if (cm->interp_filter == SWITCHABLE)
       update_switchable_interp_probs(cm, &header_bc, counts);
@@ -1432,52 +2065,43 @@
 
     if (cpi->allow_comp_inter_inter) {
       const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
-#if !CONFIG_MISC_FIXES
-      const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
-
-      vpx_write_bit(&header_bc, use_compound_pred);
-      if (use_compound_pred) {
-        vpx_write_bit(&header_bc, use_hybrid_pred);
-        if (use_hybrid_pred)
-          for (i = 0; i < COMP_INTER_CONTEXTS; i++)
-            vp10_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
-                                      counts->comp_inter[i]);
-      }
-#else
       if (use_hybrid_pred)
         for (i = 0; i < COMP_INTER_CONTEXTS; i++)
           vp10_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
                                      counts->comp_inter[i]);
-#endif
     }
 
     if (cm->reference_mode != COMPOUND_REFERENCE) {
       for (i = 0; i < REF_CONTEXTS; i++) {
-        vp10_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
-                                  counts->single_ref[i][0]);
-        vp10_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1],
-                                  counts->single_ref[i][1]);
+        for (j = 0; j < (SINGLE_REFS - 1); j ++) {
+          vp10_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][j],
+                                     counts->single_ref[i][j]);
+        }
       }
     }
 
-    if (cm->reference_mode != SINGLE_REFERENCE)
-      for (i = 0; i < REF_CONTEXTS; i++)
-        vp10_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
-                                  counts->comp_ref[i]);
+    if (cm->reference_mode != SINGLE_REFERENCE) {
+      for (i = 0; i < REF_CONTEXTS; i++) {
+        for (j = 0; j < (COMP_REFS - 1); j ++) {
+          vp10_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i][j],
+                                     counts->comp_ref[i][j]);
+        }
+      }
+    }
 
     for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
       prob_diff_update(vp10_intra_mode_tree, cm->fc->y_mode_prob[i],
                        counts->y_mode[i], INTRA_MODES, &header_bc);
 
-#if !CONFIG_MISC_FIXES
-    for (i = 0; i < PARTITION_CONTEXTS; ++i)
-      prob_diff_update(vp10_partition_tree, fc->partition_prob[i],
-                       counts->partition[i], PARTITION_TYPES, &header_bc);
-#endif
-
     vp10_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc,
                         &counts->mv);
+#if CONFIG_EXT_TX
     update_ext_tx_probs(cm, &header_bc);
+#endif  // CONFIG_EXT_TX
+#if CONFIG_SUPERTX
+    if (!xd->lossless[0])
+      update_supertx_probs(cm, &header_bc);
+#endif  // CONFIG_SUPERTX
   }
 
   vpx_stop_encode(&header_bc);
@@ -1486,7 +2110,6 @@
   return header_bc.pos;
 }
 
-#if CONFIG_MISC_FIXES
 static int remux_tiles(uint8_t *dest, const int sz,
                        const int n_tiles, const int mag) {
   int rpos = 0, wpos = 0, n;
@@ -1526,7 +2149,6 @@
 
   return wpos;
 }
-#endif
 
 void vp10_pack_bitstream(VP10_COMP *const cpi, uint8_t *dest, size_t *size) {
   uint8_t *data = dest;
@@ -1534,14 +2156,9 @@
   struct vpx_write_bit_buffer wb = {data, 0};
   struct vpx_write_bit_buffer saved_wb;
   unsigned int max_tile;
-#if CONFIG_MISC_FIXES
   VP10_COMMON *const cm = &cpi->common;
   const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols;
   const int have_tiles = n_log2_tiles > 0;
-#else
-  const int have_tiles = 0;  // we have tiles, but we don't want to write a
-                             // tile size marker in the header
-#endif
 
   write_uncompressed_header(cpi, &wb);
   saved_wb = wb;
@@ -1557,7 +2174,6 @@
   data += first_part_size;
 
   data_sz = encode_tiles(cpi, data, &max_tile);
-#if CONFIG_MISC_FIXES
   if (max_tile > 0) {
     int mag;
     unsigned int mask;
@@ -1576,7 +2192,6 @@
   } else {
     assert(n_log2_tiles == 0);
   }
-#endif
   data += data_sz;
 
   // TODO(jbb): Figure out what to do if first_part_size > 16 bits.

diff --git a/vp10/encoder/bitstream.h b/vp10/encoder/bitstream.h
index b1da89f..9df03da 100644
--- a/vp10/encoder/bitstream.h
+++ b/vp10/encoder/bitstream.h

@@ -21,6 +21,8 @@
 void vp10_encode_token_init();
 void vp10_pack_bitstream(VP10_COMP *const cpi, uint8_t *dest, size_t *size);
 
+void vp10_encode_token_init();
+
 static INLINE int vp10_preserve_existing_gf(VP10_COMP *cpi) {
   return !cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
          cpi->rc.is_src_frame_alt_ref;

diff --git a/vp10/encoder/block.h b/vp10/encoder/block.h
index ab0252b..1383c19 100644
--- a/vp10/encoder/block.h
+++ b/vp10/encoder/block.h

@@ -13,6 +13,9 @@
 
 #include "vp10/common/entropymv.h"
 #include "vp10/common/entropy.h"
+#if CONFIG_REF_MV
+#include "vp10/common/mvref_common.h"
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -24,7 +27,7 @@
   unsigned int var;
 } diff;
 
-struct macroblock_plane {
+typedef struct macroblock_plane {
   DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
   tran_low_t *qcoeff;
   tran_low_t *coeff;
@@ -40,7 +43,7 @@
   int16_t *round;
 
   int64_t quant_thred[2];
-};
+} MACROBLOCK_PLANE;
 
 /* The [2] dimension is for whether we skip the EOB node (i.e. if previous
  * coefficient in this block was zero) or not. */
@@ -49,9 +52,20 @@
 
 typedef struct {
   int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
-  uint8_t mode_context[MAX_REF_FRAMES];
+  int16_t mode_context[MAX_REF_FRAMES];
+#if CONFIG_REF_MV
+  uint8_t ref_mv_count[MAX_REF_FRAMES];
+  CANDIDATE_MV ref_mv_stack[MAX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+#endif
 } MB_MODE_INFO_EXT;
 
+typedef struct {  // NOTE(review): 4096 presumably = 64 * 64; confirm
+  uint8_t best_palette_color_map[4096];
+  double kmeans_data_buf[4096];
+  uint8_t kmeans_indices_buf[4096];
+  uint8_t kmeans_pre_indices_buf[4096];
+} PALETTE_BUFFER;
+
 typedef struct macroblock MACROBLOCK;
 struct macroblock {
   struct macroblock_plane plane[MAX_MB_PLANE];
@@ -94,6 +108,8 @@
   int *nmvsadcost_hp[2];
   int **mvsadcost;
 
+  PALETTE_BUFFER *palette_buffer;
+
   // These define limits to motion vector components to prevent them
   // from extending outside the UMV borders
   int mv_col_min;
@@ -104,6 +120,9 @@
   // Notes transform blocks where no coefficents are coded.
   // Set during mode selection. Read during block encoding.
   uint8_t zcoeff_blk[TX_SIZES][256];
+#if CONFIG_VAR_TX
+  uint8_t blk_skip[MAX_MB_PLANE][256];
+#endif
 
   int skip;
 

diff --git a/vp10/encoder/context_tree.c b/vp10/encoder/context_tree.c
index 6c056d2..1ac802f 100644
--- a/vp10/encoder/context_tree.c
+++ b/vp10/encoder/context_tree.c

@@ -28,6 +28,10 @@
   CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
                   vpx_calloc(num_blk, sizeof(uint8_t)));
   for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_VAR_TX
+    CHECK_MEM_ERROR(cm, ctx->blk_skip[i],
+                    vpx_calloc(num_blk, sizeof(uint8_t)));
+#endif
     for (k = 0; k < 3; ++k) {
       CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
                       vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
@@ -50,6 +54,10 @@
   vpx_free(ctx->zcoeff_blk);
   ctx->zcoeff_blk = 0;
   for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_VAR_TX
+    vpx_free(ctx->blk_skip[i]);
+    ctx->blk_skip[i] = 0;
+#endif
     for (k = 0; k < 3; ++k) {
       vpx_free(ctx->coeff[i][k]);
       ctx->coeff[i][k] = 0;

diff --git a/vp10/encoder/context_tree.h b/vp10/encoder/context_tree.h
index 2a0fffb..55ae471 100644
--- a/vp10/encoder/context_tree.h
+++ b/vp10/encoder/context_tree.h

@@ -28,6 +28,9 @@
   MB_MODE_INFO_EXT mbmi_ext;
   uint8_t *zcoeff_blk;
   uint8_t *color_index_map[2];
+#if CONFIG_VAR_TX
+  uint8_t *blk_skip[MAX_MB_PLANE];
+#endif
   tran_low_t *coeff[MAX_MB_PLANE][3];
   tran_low_t *qcoeff[MAX_MB_PLANE][3];
   tran_low_t *dqcoeff[MAX_MB_PLANE][3];

diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c
index 132a141..5602753 100644
--- a/vp10/encoder/dct.c
+++ b/vp10/encoder/dct.c

@@ -37,6 +37,382 @@
 #endif
 }
 
+#if CONFIG_EXT_TX
+void fdst4(const tran_low_t *input, tran_low_t *output) {  // 4 in -> 4 out
+#if USE_DST2  // variant built on the cospi_* constants
+  tran_high_t step[4];
+  tran_high_t temp1, temp2;
+
+  step[0] = input[0] - input[3];
+  step[1] = -input[1] + input[2];
+  step[2] = -input[1] - input[2];
+  step[3] = input[0] + input[3];
+
+  temp1 = (step[0] + step[1]) * cospi_16_64;
+  temp2 = (step[0] - step[1]) * cospi_16_64;
+  output[3] = fdct_round_shift(temp1);
+  output[1] = fdct_round_shift(temp2);
+  temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+  temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
+  output[2] = fdct_round_shift(temp1);
+  output[0] = fdct_round_shift(temp2);
+#else  // sums of products against sinvalue_lookup
+  // Integer-scaled {sin(pi/5), sin(pi*2/5)} * sqrt(2/5) * sqrt(2)
+  static const int32_t sinvalue_lookup[] = {
+    141124871, 228344838,
+  };
+  int64_t sum;  // 64-bit accumulator for the table products
+  int64_t s03 = (input[0] + input[3]);  // sAB/dAB: input[A] +/- input[B]
+  int64_t d03 = (input[0] - input[3]);
+  int64_t s12 = (input[1] + input[2]);
+  int64_t d12 = (input[1] - input[2]);
+  sum = s03 * sinvalue_lookup[0] + s12 * sinvalue_lookup[1];
+  output[0] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = d03 * sinvalue_lookup[1] + d12 * sinvalue_lookup[0];
+  output[1] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = s03 * sinvalue_lookup[1] - s12 * sinvalue_lookup[0];
+  output[2] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = d03 * sinvalue_lookup[0] - d12 * sinvalue_lookup[1];
+  output[3] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+#endif  // USE_DST2
+}
+
+void fdst8(const tran_low_t *input, tran_low_t *output) {  // 8 in -> 8 out
+#if USE_DST2  // variant built on the cospi_* constants
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+  tran_high_t t0, t1, t2, t3;                  // needs32
+  tran_high_t x0, x1, x2, x3;                  // canbe16
+
+  // stage 1
+  s0 = input[0] - input[7];
+  s1 = -input[1] + input[6];
+  s2 = input[2] - input[5];
+  s3 = -input[3] + input[4];
+  s4 = -input[3] - input[4];
+  s5 = input[2] + input[5];
+  s6 = -input[1] - input[6];
+  s7 = input[0] + input[7];
+
+  x0 = s0 + s3;
+  x1 = s1 + s2;
+  x2 = s1 - s2;
+  x3 = s0 - s3;
+  t0 = (x0 + x1) * cospi_16_64;
+  t1 = (x0 - x1) * cospi_16_64;
+  t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
+  t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
+  output[7] = fdct_round_shift(t0);
+  output[5] = fdct_round_shift(t2);
+  output[3] = fdct_round_shift(t1);
+  output[1] = fdct_round_shift(t3);
+
+  // Stage 2
+  t0 = (s6 - s5) * cospi_16_64;
+  t1 = (s6 + s5) * cospi_16_64;
+  t2 = fdct_round_shift(t0);
+  t3 = fdct_round_shift(t1);
+
+  // Stage 3
+  x0 = s4 + t2;
+  x1 = s4 - t2;
+  x2 = s7 - t3;
+  x3 = s7 + t3;
+
+  // Stage 4
+  t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+  t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+  t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+  t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+  output[6] = fdct_round_shift(t0);
+  output[4] = fdct_round_shift(t2);
+  output[2] = fdct_round_shift(t1);
+  output[0] = fdct_round_shift(t3);
+#else  // sums of products against sinvalue_lookup
+  // Integer-scaled {sin(pi/9), sin(pi*2/9), ..., sin(pi*4/9)} * sqrt(2/9) * 2
+  static const int sinvalue_lookup[] = {
+    86559612, 162678858, 219176632, 249238470
+  };
+  int64_t sum;  // 64-bit accumulator for the table products
+  int64_t s07 = (input[0] + input[7]);  // sAB/dAB: input[A] +/- input[B]
+  int64_t d07 = (input[0] - input[7]);
+  int64_t s16 = (input[1] + input[6]);
+  int64_t d16 = (input[1] - input[6]);
+  int64_t s25 = (input[2] + input[5]);
+  int64_t d25 = (input[2] - input[5]);
+  int64_t s34 = (input[3] + input[4]);
+  int64_t d34 = (input[3] - input[4]);
+  sum = s07 * sinvalue_lookup[0] + s16 * sinvalue_lookup[1] +
+        s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[3];
+  output[0] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = d07 * sinvalue_lookup[1] + d16 * sinvalue_lookup[3] +
+        d25 * sinvalue_lookup[2] + d34 * sinvalue_lookup[0];
+  output[1] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = (s07 + s16 - s34)* sinvalue_lookup[2];  // s25 does not contribute
+  output[2] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = d07 * sinvalue_lookup[3] + d16 * sinvalue_lookup[0] -
+        d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[1];
+  output[3] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = s07 * sinvalue_lookup[3] - s16 * sinvalue_lookup[0] -
+        s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[1];
+  output[4] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = (d07 - d16 + d34)* sinvalue_lookup[2];  // d25 does not contribute
+  output[5] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = s07 * sinvalue_lookup[1] - s16 * sinvalue_lookup[3] +
+        s25 * sinvalue_lookup[2] - s34 * sinvalue_lookup[0];
+  output[6] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = d07 * sinvalue_lookup[0] - d16 * sinvalue_lookup[1] +
+        d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[3];
+  output[7] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+#endif  // USE_DST2
+}
+
+void fdst16(const tran_low_t *input, tran_low_t *output) {  // 16 in -> 16 out
+#if USE_DST2  // variant built on the cospi_* constants
+  tran_high_t step1[8];      // canbe16
+  tran_high_t step2[8];      // canbe16
+  tran_high_t step3[8];      // canbe16
+  tran_high_t in[8];         // canbe16
+  tran_high_t temp1, temp2;  // needs32
+
+  // step 1
+  in[0] = input[0] - input[15];
+  in[1] = -input[1] + input[14];
+  in[2] = input[2] - input[13];
+  in[3] = -input[3] + input[12];
+  in[4] = input[4] - input[11];
+  in[5] = -input[5] + input[10];
+  in[6] = input[6] - input[ 9];
+  in[7] = -input[7] + input[ 8];
+
+  step1[0] = -input[7] - input[ 8];
+  step1[1] = input[6] + input[ 9];
+  step1[2] = -input[5] - input[10];
+  step1[3] = input[4] + input[11];
+  step1[4] = -input[3] - input[12];
+  step1[5] = input[2] + input[13];
+  step1[6] = -input[1] - input[14];
+  step1[7] = input[0] + input[15];
+
+  // fdct8(step, step); inlined over in[], writing the odd-indexed outputs
+  {
+    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+    tran_high_t t0, t1, t2, t3;                  // needs32
+    tran_high_t x0, x1, x2, x3;                  // canbe16
+
+    // stage 1
+    s0 = in[0] + in[7];
+    s1 = in[1] + in[6];
+    s2 = in[2] + in[5];
+    s3 = in[3] + in[4];
+    s4 = in[3] - in[4];
+    s5 = in[2] - in[5];
+    s6 = in[1] - in[6];
+    s7 = in[0] - in[7];
+
+    // fdct4(step, step);
+    x0 = s0 + s3;
+    x1 = s1 + s2;
+    x2 = s1 - s2;
+    x3 = s0 - s3;
+    t0 = (x0 + x1) * cospi_16_64;
+    t1 = (x0 - x1) * cospi_16_64;
+    t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
+    t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
+    output[15] = fdct_round_shift(t0);
+    output[11] = fdct_round_shift(t2);
+    output[7] = fdct_round_shift(t1);
+    output[3] = fdct_round_shift(t3);
+
+    // Stage 2
+    t0 = (s6 - s5) * cospi_16_64;
+    t1 = (s6 + s5) * cospi_16_64;
+    t2 = fdct_round_shift(t0);
+    t3 = fdct_round_shift(t1);
+
+    // Stage 3
+    x0 = s4 + t2;
+    x1 = s4 - t2;
+    x2 = s7 - t3;
+    x3 = s7 + t3;
+
+    // Stage 4
+    t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+    t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+    t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+    output[13] = fdct_round_shift(t0);
+    output[9] = fdct_round_shift(t2);
+    output[5] = fdct_round_shift(t1);
+    output[1] = fdct_round_shift(t3);
+  }
+
+  // step 2
+  temp1 = (step1[5] - step1[2]) * cospi_16_64;
+  temp2 = (step1[4] - step1[3]) * cospi_16_64;
+  step2[2] = fdct_round_shift(temp1);
+  step2[3] = fdct_round_shift(temp2);
+  temp1 = (step1[4] + step1[3]) * cospi_16_64;
+  temp2 = (step1[5] + step1[2]) * cospi_16_64;
+  step2[4] = fdct_round_shift(temp1);
+  step2[5] = fdct_round_shift(temp2);
+
+  // step 3
+  step3[0] = step1[0] + step2[3];
+  step3[1] = step1[1] + step2[2];
+  step3[2] = step1[1] - step2[2];
+  step3[3] = step1[0] - step2[3];
+  step3[4] = step1[7] - step2[4];
+  step3[5] = step1[6] - step2[5];
+  step3[6] = step1[6] + step2[5];
+  step3[7] = step1[7] + step2[4];
+
+  // step 4
+  temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
+  temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
+  step2[1] = fdct_round_shift(temp1);
+  step2[2] = fdct_round_shift(temp2);
+  temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
+  temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
+  step2[5] = fdct_round_shift(temp1);
+  step2[6] = fdct_round_shift(temp2);
+
+  // step 5
+  step1[0] = step3[0] + step2[1];
+  step1[1] = step3[0] - step2[1];
+  step1[2] = step3[3] + step2[2];
+  step1[3] = step3[3] - step2[2];
+  step1[4] = step3[4] - step2[5];
+  step1[5] = step3[4] + step2[5];
+  step1[6] = step3[7] - step2[6];
+  step1[7] = step3[7] + step2[6];
+
+  // step 6
+  temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
+  temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+  output[14] = fdct_round_shift(temp1);
+  output[6] = fdct_round_shift(temp2);
+
+  temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+  temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
+  output[10] = fdct_round_shift(temp1);
+  output[2] = fdct_round_shift(temp2);
+
+  temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
+  temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+  output[12] = fdct_round_shift(temp1);
+  output[4] = fdct_round_shift(temp2);
+
+  temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+  temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
+  output[8] = fdct_round_shift(temp1);
+  output[0] = fdct_round_shift(temp2);
+#else  // sums of products against sinvalue_lookup
+  // {sin(pi/17), sin(pi*2/17), ..., sin(pi*8/17)} * sqrt(2/17) * 2 * sqrt(2)
+  static const int sinvalue_lookup[] = {
+    47852167, 94074787, 137093803, 175444254,
+    207820161, 233119001, 250479254, 259309736
+  };
+  int64_t sum;  // 64-bit accumulator for the table products
+  int64_t s015 = (input[0] + input[15]);  // sAB/dAB: input[A] +/- input[B]
+  int64_t d015 = (input[0] - input[15]);
+  int64_t s114 = (input[1] + input[14]);
+  int64_t d114 = (input[1] - input[14]);
+  int64_t s213 = (input[2] + input[13]);
+  int64_t d213 = (input[2] - input[13]);
+  int64_t s312 = (input[3] + input[12]);
+  int64_t d312 = (input[3] - input[12]);
+  int64_t s411 = (input[4] + input[11]);
+  int64_t d411 = (input[4] - input[11]);
+  int64_t s510 = (input[5] + input[10]);
+  int64_t d510 = (input[5] - input[10]);
+  int64_t s69  = (input[6] + input[9]);
+  int64_t d69  = (input[6] - input[9]);
+  int64_t s78  = (input[7] + input[8]);
+  int64_t d78  = (input[7] - input[8]);
+  sum = s015 * sinvalue_lookup[0] + s114 * sinvalue_lookup[1] +
+        s213 * sinvalue_lookup[2] + s312 * sinvalue_lookup[3] +
+        s411 * sinvalue_lookup[4] + s510 * sinvalue_lookup[5] +
+        s69  * sinvalue_lookup[6] + s78  * sinvalue_lookup[7];
+  output[0]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = d015 * sinvalue_lookup[1] + d114 * sinvalue_lookup[3] +
+        d213 * sinvalue_lookup[5] + d312 * sinvalue_lookup[7] +
+        d411 * sinvalue_lookup[6] + d510 * sinvalue_lookup[4] +
+        d69  * sinvalue_lookup[2] + d78  * sinvalue_lookup[0];
+  output[1]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = s015 * sinvalue_lookup[2] + s114 * sinvalue_lookup[5] +
+        s213 * sinvalue_lookup[7] + s312 * sinvalue_lookup[4] +
+        s411 * sinvalue_lookup[1] - s510 * sinvalue_lookup[0] -
+        s69  * sinvalue_lookup[3] - s78  * sinvalue_lookup[6];
+  output[2]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = d015 * sinvalue_lookup[3] + d114 * sinvalue_lookup[7] +
+        d213 * sinvalue_lookup[4] + d312 * sinvalue_lookup[0] -
+        d411 * sinvalue_lookup[2] - d510 * sinvalue_lookup[6] -
+        d69  * sinvalue_lookup[5] - d78  * sinvalue_lookup[1];
+  output[3]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = s015 * sinvalue_lookup[4] + s114 * sinvalue_lookup[6] +
+        s213 * sinvalue_lookup[1] - s312 * sinvalue_lookup[2] -
+        s411 * sinvalue_lookup[7] - s510 * sinvalue_lookup[3] +
+        s69  * sinvalue_lookup[0] + s78  * sinvalue_lookup[5];
+  output[4]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = d015 * sinvalue_lookup[5] + d114 * sinvalue_lookup[4] -
+        d213 * sinvalue_lookup[0] - d312 * sinvalue_lookup[6] -
+        d411 * sinvalue_lookup[3] + d510 * sinvalue_lookup[1] +
+        d69  * sinvalue_lookup[7] + d78  * sinvalue_lookup[2];
+  output[5]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = s015 * sinvalue_lookup[6] + s114 * sinvalue_lookup[2] -
+        s213 * sinvalue_lookup[3] - s312 * sinvalue_lookup[5] +
+        s411 * sinvalue_lookup[0] + s510 * sinvalue_lookup[7] +
+        s69  * sinvalue_lookup[1] - s78  * sinvalue_lookup[4];
+  output[6]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = d015 * sinvalue_lookup[7] + d114 * sinvalue_lookup[0] -
+        d213 * sinvalue_lookup[6] - d312 * sinvalue_lookup[1] +
+        d411 * sinvalue_lookup[5] + d510 * sinvalue_lookup[2] -
+        d69  * sinvalue_lookup[4] - d78  * sinvalue_lookup[3];
+  output[7]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = s015 * sinvalue_lookup[7] - s114 * sinvalue_lookup[0] -
+        s213 * sinvalue_lookup[6] + s312 * sinvalue_lookup[1] +
+        s411 * sinvalue_lookup[5] - s510 * sinvalue_lookup[2] -
+        s69  * sinvalue_lookup[4] + s78  * sinvalue_lookup[3];
+  output[8]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = d015 * sinvalue_lookup[6] - d114 * sinvalue_lookup[2] -
+        d213 * sinvalue_lookup[3] + d312 * sinvalue_lookup[5] +
+        d411 * sinvalue_lookup[0] - d510 * sinvalue_lookup[7] +
+        d69  * sinvalue_lookup[1] + d78  * sinvalue_lookup[4];
+  output[9]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = s015 * sinvalue_lookup[5] - s114 * sinvalue_lookup[4] -
+        s213 * sinvalue_lookup[0] + s312 * sinvalue_lookup[6] -
+        s411 * sinvalue_lookup[3] - s510 * sinvalue_lookup[1] +
+        s69  * sinvalue_lookup[7] - s78  * sinvalue_lookup[2];
+  output[10] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = d015 * sinvalue_lookup[4] - d114 * sinvalue_lookup[6] +
+        d213 * sinvalue_lookup[1] + d312 * sinvalue_lookup[2] -
+        d411 * sinvalue_lookup[7] + d510 * sinvalue_lookup[3] +
+        d69  * sinvalue_lookup[0] - d78  * sinvalue_lookup[5];
+  output[11] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = s015 * sinvalue_lookup[3] - s114 * sinvalue_lookup[7] +
+        s213 * sinvalue_lookup[4] - s312 * sinvalue_lookup[0] -
+        s411 * sinvalue_lookup[2] + s510 * sinvalue_lookup[6] -
+        s69  * sinvalue_lookup[5] + s78  * sinvalue_lookup[1];
+  output[12] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = d015 * sinvalue_lookup[2] - d114 * sinvalue_lookup[5] +
+        d213 * sinvalue_lookup[7] - d312 * sinvalue_lookup[4] +
+        d411 * sinvalue_lookup[1] + d510 * sinvalue_lookup[0] -
+        d69  * sinvalue_lookup[3] + d78  * sinvalue_lookup[6];
+  output[13] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = s015 * sinvalue_lookup[1] - s114 * sinvalue_lookup[3] +
+        s213 * sinvalue_lookup[5] - s312 * sinvalue_lookup[7] +
+        s411 * sinvalue_lookup[6] - s510 * sinvalue_lookup[4] +
+        s69  * sinvalue_lookup[2] - s78  * sinvalue_lookup[0];
+  output[14] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+  sum = d015 * sinvalue_lookup[0] - d114 * sinvalue_lookup[1] +
+        d213 * sinvalue_lookup[2] - d312 * sinvalue_lookup[3] +
+        d411 * sinvalue_lookup[4] - d510 * sinvalue_lookup[5] +
+        d69  * sinvalue_lookup[6] - d78  * sinvalue_lookup[7];
+  output[15] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
+#endif  // USE_DST2
+}
+#endif  // CONFIG_EXT_TX
+
 static void fdct4(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
   tran_low_t step[4];
@@ -999,29 +1375,171 @@
   output[15] = (tran_low_t)-x1;
 }
 
+#if CONFIG_EXT_TX
+// Copies an l x l block of samples from |src| to |dest|, one row at a time.
+static void copy_block(const int16_t *src, int src_stride, int l,
+                       int16_t *dest, int dest_stride) {
+  const size_t row_bytes = l * sizeof(*src);
+  int row;
+  for (row = 0; row < l; ++row)
+    memcpy(&dest[row * dest_stride], &src[row * src_stride], row_bytes);
+}
+
+// Mirrors every row of the l x l block left-to-right, in place.
+static void fliplr(int16_t *dest, int stride, int l) {
+  int row, lo, hi;
+  for (row = 0; row < l; ++row)
+    for (lo = 0, hi = l - 1; lo < hi; ++lo, --hi) {
+      const int16_t held = dest[row * stride + lo];
+      dest[row * stride + lo] = dest[row * stride + hi];
+      dest[row * stride + hi] = held;
+    }
+}
+
+// Reverses the order of the l rows of the block (top <-> bottom), in place.
+static void flipud(int16_t *dest, int stride, int l) {
+  int top, bot, col;
+  for (top = 0, bot = l - 1; top < bot; ++top, --bot)
+    for (col = 0; col < l; ++col) {
+      const int16_t held = dest[top * stride + col];
+      dest[top * stride + col] = dest[bot * stride + col];
+      dest[bot * stride + col] = held;
+    }
+}
+
+// 180-degree in-place rotation; for odd l the middle row is left as it is.
+static void fliplrud(int16_t *dest, int stride, int l) {
+  int top, bot, col;
+  for (top = 0, bot = l - 1; top < l / 2; ++top, --bot)
+    for (col = 0; col < l; ++col) {
+      const int16_t held = dest[top * stride + col];
+      dest[top * stride + col] = dest[bot * stride + l - 1 - col];
+      dest[bot * stride + l - 1 - col] = held;
+    }
+}
+
+static void copy_fliplr(const int16_t *src, int src_stride, int l,
+                        int16_t *dest, int dest_stride) {
+  copy_block(src, src_stride, l, dest, dest_stride);  // dest := src
+  fliplr(dest, dest_stride, l);  // mirror the copy; src is not modified
+}
+
+static void copy_flipud(const int16_t *src, int src_stride, int l,
+                        int16_t *dest, int dest_stride) {
+  copy_block(src, src_stride, l, dest, dest_stride);  // dest := src
+  flipud(dest, dest_stride, l);  // reverse row order of the copy only
+}
+
+static void copy_fliplrud(const int16_t *src, int src_stride, int l,
+                          int16_t *dest, int dest_stride) {
+  copy_block(src, src_stride, l, dest, dest_stride);  // dest := src
+  fliplrud(dest, dest_stride, l);  // flip the copy in both directions
+}
+
+// Points *src at a flipped copy in |buff| when tx_type has a FLIPADST part.
+static void maybe_flip_input(const int16_t **src, int *src_stride, int l,
+                             int16_t *buff, int tx_type) {
+  void (*flip)(int16_t *, int, int);
+  switch (tx_type) {
+    case FLIPADST_DCT:
+    case FLIPADST_ADST:
+    case FLIPADST_DST:
+      flip = flipud;
+      break;
+    case DCT_FLIPADST:
+    case ADST_FLIPADST:
+    case DST_FLIPADST:
+      flip = fliplr;
+      break;
+    case FLIPADST_FLIPADST:
+      flip = fliplrud;
+      break;
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case DST_DST:
+    case DCT_DST:
+    case DST_DCT:
+    case DST_ADST:
+    case ADST_DST:
+      return;  // no flip: *src and *src_stride are left untouched
+    default:
+      assert(0);
+      return;
+  }
+  copy_block(*src, *src_stride, l, buff, l);  // |buff| is packed: stride l
+  flip(buff, l, l);
+  *src = buff;
+  *src_stride = l;
+}
+#endif  // CONFIG_EXT_TX
+
 static const transform_2d FHT_4[] = {
-  { fdct4,  fdct4  },  // DCT_DCT  = 0
-  { fadst4, fdct4  },  // ADST_DCT = 1
-  { fdct4,  fadst4 },  // DCT_ADST = 2
-  { fadst4, fadst4 }   // ADST_ADST = 3
+  { fdct4,  fdct4  },  // DCT_DCT           = 0,
+  { fadst4, fdct4  },  // ADST_DCT          = 1,
+  { fdct4,  fadst4 },  // DCT_ADST          = 2,
+  { fadst4, fadst4 },  // ADST_ADST         = 3,
+#if CONFIG_EXT_TX
+  { fadst4, fdct4  },  // FLIPADST_DCT      = 4,
+  { fdct4,  fadst4 },  // DCT_FLIPADST      = 5,
+  { fadst4, fadst4 },  // FLIPADST_FLIPADST = 6,
+  { fadst4, fadst4 },  // ADST_FLIPADST     = 7,
+  { fadst4, fadst4 },  // FLIPADST_ADST     = 8,
+  { fdst4,  fdct4  },  // DST_DCT           = 9,
+  { fdct4,  fdst4  },  // DCT_DST           = 10,
+  { fdst4,  fadst4 },  // DST_ADST          = 11,
+  { fadst4, fdst4  },  // ADST_DST          = 12,
+  { fdst4,  fadst4 },  // DST_FLIPADST      = 13,
+  { fadst4, fdst4  },  // FLIPADST_DST      = 14,
+  { fdst4,  fdst4  },  // DST_DST           = 15
+#endif  // CONFIG_EXT_TX
 };
 
 static const transform_2d FHT_8[] = {
-  { fdct8,  fdct8  },  // DCT_DCT  = 0
-  { fadst8, fdct8  },  // ADST_DCT = 1
-  { fdct8,  fadst8 },  // DCT_ADST = 2
-  { fadst8, fadst8 }   // ADST_ADST = 3
+  { fdct8,  fdct8  },  // DCT_DCT           = 0,
+  { fadst8, fdct8  },  // ADST_DCT          = 1,
+  { fdct8,  fadst8 },  // DCT_ADST          = 2,
+  { fadst8, fadst8 },  // ADST_ADST         = 3,
+#if CONFIG_EXT_TX
+  { fadst8, fdct8  },  // FLIPADST_DCT      = 4,
+  { fdct8,  fadst8 },  // DCT_FLIPADST      = 5,
+  { fadst8, fadst8 },  // FLIPADST_FLIPADST = 6,
+  { fadst8, fadst8 },  // ADST_FLIPADST     = 7,
+  { fadst8, fadst8 },  // FLIPADST_ADST     = 8,
+  { fdst8,  fdct8  },  // DST_DCT           = 9,
+  { fdct8,  fdst8  },  // DCT_DST           = 10,
+  { fdst8,  fadst8 },  // DST_ADST          = 11,
+  { fadst8, fdst8  },  // ADST_DST          = 12,
+  { fdst8,  fadst8 },  // DST_FLIPADST      = 13,
+  { fadst8, fdst8  },  // FLIPADST_DST      = 14,
+  { fdst8,  fdst8  },  // DST_DST           = 15
+#endif  // CONFIG_EXT_TX
 };
 
 static const transform_2d FHT_16[] = {
-  { fdct16,  fdct16  },  // DCT_DCT  = 0
-  { fadst16, fdct16  },  // ADST_DCT = 1
-  { fdct16,  fadst16 },  // DCT_ADST = 2
-  { fadst16, fadst16 }   // ADST_ADST = 3
+  { fdct16,  fdct16  },  // DCT_DCT           = 0,
+  { fadst16, fdct16  },  // ADST_DCT          = 1,
+  { fdct16,  fadst16 },  // DCT_ADST          = 2,
+  { fadst16, fadst16 },  // ADST_ADST         = 3,
+#if CONFIG_EXT_TX
+  { fadst16, fdct16  },  // FLIPADST_DCT      = 4,
+  { fdct16,  fadst16 },  // DCT_FLIPADST      = 5,
+  { fadst16, fadst16 },  // FLIPADST_FLIPADST = 6,
+  { fadst16, fadst16 },  // ADST_FLIPADST     = 7,
+  { fadst16, fadst16 },  // FLIPADST_ADST     = 8,
+  { fdst16,  fdct16  },  // DST_DCT           = 9,
+  { fdct16,  fdst16  },  // DCT_DST           = 10,
+  { fdst16,  fadst16 },  // DST_ADST          = 11,
+  { fadst16, fdst16  },  // ADST_DST          = 12,
+  { fdst16,  fadst16 },  // DST_FLIPADST      = 13,
+  { fadst16, fdst16  },  // FLIPADST_DST      = 14,
+  { fdst16,  fdst16  },  // DST_DST           = 15
+#endif  // CONFIG_EXT_TX
 };
 
 void vp10_fht4x4_c(const int16_t *input, tran_low_t *output,
-                  int stride, int tx_type) {
+                   int stride, int tx_type) {
   if (tx_type == DCT_DCT) {
     vpx_fdct4x4_c(input, output, stride);
   } else {
@@ -1030,6 +1548,11 @@
     tran_low_t temp_in[4], temp_out[4];
     const transform_2d ht = FHT_4[tx_type];
 
+#if CONFIG_EXT_TX
+    int16_t flipped_input[4 * 4];
+    maybe_flip_input(&input, &stride, 4, flipped_input, tx_type);
+#endif
+
     // Columns
     for (i = 0; i < 4; ++i) {
       for (j = 0; j < 4; ++j)
@@ -1053,15 +1576,15 @@
 }
 
 void vp10_fdct8x8_quant_c(const int16_t *input, int stride,
-                         tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         int skip_block,
-                         const int16_t *zbin_ptr, const int16_t *round_ptr,
-                         const int16_t *quant_ptr,
-                         const int16_t *quant_shift_ptr,
-                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                         const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr,
-                         const int16_t *scan, const int16_t *iscan) {
+                          tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          int skip_block,
+                          const int16_t *zbin_ptr, const int16_t *round_ptr,
+                          const int16_t *quant_ptr,
+                          const int16_t *quant_shift_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr,
+                          uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
   int eob = -1;
 
   int i, j;
@@ -1165,7 +1688,7 @@
 }
 
 void vp10_fht8x8_c(const int16_t *input, tran_low_t *output,
-                  int stride, int tx_type) {
+                   int stride, int tx_type) {
   if (tx_type == DCT_DCT) {
     vpx_fdct8x8_c(input, output, stride);
   } else {
@@ -1174,6 +1697,11 @@
     tran_low_t temp_in[8], temp_out[8];
     const transform_2d ht = FHT_8[tx_type];
 
+#if CONFIG_EXT_TX
+    int16_t flipped_input[8 * 8];
+    maybe_flip_input(&input, &stride, 8, flipped_input, tx_type);
+#endif
+
     // Columns
     for (i = 0; i < 8; ++i) {
       for (j = 0; j < 8; ++j)
@@ -1251,7 +1779,7 @@
 }
 
 void vp10_fht16x16_c(const int16_t *input, tran_low_t *output,
-                    int stride, int tx_type) {
+                     int stride, int tx_type) {
   if (tx_type == DCT_DCT) {
     vpx_fdct16x16_c(input, output, stride);
   } else {
@@ -1260,6 +1788,11 @@
     tran_low_t temp_in[16], temp_out[16];
     const transform_2d ht = FHT_16[tx_type];
 
+#if CONFIG_EXT_TX
+    int16_t flipped_input[16 * 16];
+    maybe_flip_input(&input, &stride, 16, flipped_input, tx_type);
+#endif
+
     // Columns
     for (i = 0; i < 16; ++i) {
       for (j = 0; j < 16; ++j)

diff --git a/vp10/encoder/denoiser.c b/vp10/encoder/denoiser.c
index e5d8157..43647b0 100644
--- a/vp10/encoder/denoiser.c
+++ b/vp10/encoder/denoiser.c

@@ -377,9 +377,17 @@
 void vp10_denoiser_update_frame_info(VP9_DENOISER *denoiser,
                                     YV12_BUFFER_CONFIG src,
                                     FRAME_TYPE frame_type,
+#if CONFIG_EXT_REFS
+                                    int refresh_last_frames[LAST_REF_FRAMES],
+#else
+                                    int refresh_last_frame,
+#endif  // CONFIG_EXT_REFS
                                     int refresh_alt_ref_frame,
-                                    int refresh_golden_frame,
-                                    int refresh_last_frame) {
+                                    int refresh_golden_frame) {
+#if CONFIG_EXT_REFS
+  int ref_frame;
+#endif  // CONFIG_EXT_REFS
+
   if (frame_type == KEY_FRAME) {
     int i;
     // Start at 1 so as not to overwrite the INTRA_FRAME
@@ -397,10 +405,19 @@
     swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME],
                       &denoiser->running_avg_y[INTRA_FRAME]);
   }
+#if CONFIG_EXT_REFS
+  for (ref_frame = LAST_FRAME; ref_frame <= LAST4_FRAME; ++ref_frame) {
+    if (refresh_last_frames[ref_frame - LAST_FRAME]) {
+      swap_frame_buffer(&denoiser->running_avg_y[ref_frame],
+                        &denoiser->running_avg_y[INTRA_FRAME]);
+    }
+  }
+#else
   if (refresh_last_frame) {
     swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME],
                       &denoiser->running_avg_y[INTRA_FRAME]);
   }
+#endif  // CONFIG_EXT_REFS
 }
 
 void vp10_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) {

diff --git a/vp10/encoder/denoiser.h b/vp10/encoder/denoiser.h
index e543fb0..f48cbb0 100644
--- a/vp10/encoder/denoiser.h
+++ b/vp10/encoder/denoiser.h

@@ -35,9 +35,13 @@
 void vp10_denoiser_update_frame_info(VP9_DENOISER *denoiser,
                                     YV12_BUFFER_CONFIG src,
                                     FRAME_TYPE frame_type,
+#if CONFIG_EXT_REFS
+                                    int refresh_last_frames[LAST_REF_FRAMES],
+#else
+                                    int refresh_last_frame,
+#endif  // CONFIG_EXT_REFS
                                     int refresh_alt_ref_frame,
-                                    int refresh_golden_frame,
-                                    int refresh_last_frame);
+                                    int refresh_golden_frame);
 
 void vp10_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
                           int mi_row, int mi_col, BLOCK_SIZE bs,

diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index 26ce5a1..c1301f9 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c

@@ -36,6 +36,9 @@
 #include "vp10/encoder/aq_complexity.h"
 #include "vp10/encoder/aq_cyclicrefresh.h"
 #include "vp10/encoder/aq_variance.h"
+#if CONFIG_SUPERTX
+#include "vp10/encoder/cost.h"
+#endif
 #include "vp10/encoder/encodeframe.h"
 #include "vp10/encoder/encodemb.h"
 #include "vp10/encoder/encodemv.h"
@@ -51,6 +54,40 @@
                               int mi_row, int mi_col, BLOCK_SIZE bsize,
                               PICK_MODE_CONTEXT *ctx);
 
+#if CONFIG_SUPERTX  // Forward declarations of the static supertx helpers.
+static int check_intra_b(PICK_MODE_CONTEXT *ctx);
+
+static int check_intra_sb(VP10_COMP *cpi, const TileInfo *const tile,
+                          int mi_row, int mi_col, BLOCK_SIZE bsize,
+                          PC_TREE *pc_tree);
+static void predict_superblock(VP10_COMP *cpi, ThreadData *td,
+                               int mi_row_pred, int mi_col_pred,
+                               BLOCK_SIZE bsize_pred, int b_sub8x8, int block);
+static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
+                            PC_TREE *pc_tree);
+static void predict_sb_complex(VP10_COMP *cpi, ThreadData *td,
+                               const TileInfo *const tile,
+                               int mi_row, int mi_col,
+                               int mi_row_ori, int mi_col_ori,
+                               int output_enabled, BLOCK_SIZE bsize,
+                               BLOCK_SIZE top_bsize,
+                               uint8_t *dst_buf[3], int dst_stride[3],
+                               PC_TREE *pc_tree);
+static void update_state_sb_supertx(VP10_COMP *cpi, ThreadData *td,
+                                    const TileInfo *const tile,
+                                    int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize,
+                                    int output_enabled, PC_TREE *pc_tree);
+static void rd_supertx_sb(VP10_COMP *cpi, ThreadData *td,
+                          const TileInfo *const tile,
+                          int mi_row, int mi_col, BLOCK_SIZE bsize,
+                          int *tmp_rate, int64_t *tmp_dist,
+#if CONFIG_EXT_TX
+                          TX_TYPE *best_tx,
+#endif
+                          PC_TREE *pc_tree);
+#endif  // CONFIG_SUPERTX
+
 // This is used as a reference when computing the source variance for the
 //  purposes of activity masking.
 // Eventually this should be replaced by custom no-reference routines,
@@ -170,11 +207,11 @@
 
 // Lighter version of set_offsets that only sets the mode info
 // pointers.
-static INLINE void set_mode_info_offsets(VP10_COMP *const cpi,
-                                         MACROBLOCK *const x,
-                                         MACROBLOCKD *const xd,
-                                         int mi_row,
-                                         int mi_col) {
+static void set_mode_info_offsets(VP10_COMP *const cpi,
+                                  MACROBLOCK *const x,
+                                  MACROBLOCKD *const xd,
+                                  int mi_row,
+                                  int mi_col) {
   VP10_COMMON *const cm = &cpi->common;
   const int idx_str = xd->mi_stride * mi_row + mi_col;
   xd->mi = cm->mi_grid_visible + idx_str;
@@ -196,6 +233,12 @@
 
   set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
 
+#if CONFIG_VAR_TX
+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+  xd->max_tx_size = max_txsize_lookup[bsize];
+#endif
+
   mbmi = &xd->mi[0]->mbmi;
 
   // Set up destination pointers.
@@ -239,6 +282,80 @@
   xd->tile = *tile;
 }
 
+#if CONFIG_SUPERTX
+// Sets the mode-info pointers and frame-edge distances for a supertx block.
+static void set_offsets_supertx(VP10_COMP *cpi, ThreadData *td,
+                                const TileInfo *const tile,
+                                int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  MACROBLOCK *const mb = &td->mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  const VP10_COMMON *const cm = &cpi->common;
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+
+  set_mode_info_offsets(cpi, mb, mbd, mi_row, mi_col);
+
+  // Distances to the frame edges (1/8th pel); position must be block-aligned.
+  assert((mi_row & (bh - 1)) == 0 && (mi_col & (bw - 1)) == 0);
+  set_mi_row_col(mbd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+}
+
+// Supertx variant of set_offsets(): mode info, neighbour availability and the
+// segment id follow the original block (*_ori), while the MV limits and the
+// frame-edge distances follow the region being predicted (*_pred).
+static void set_offsets_extend(VP10_COMP *cpi, ThreadData *td,
+                               const TileInfo *const tile,
+                               int mi_row_pred, int mi_col_pred,
+                               int mi_row_ori, int mi_col_ori,
+                               BLOCK_SIZE bsize_pred, BLOCK_SIZE bsize_ori) {
+  MACROBLOCK *const mb = &td->mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  VP10_COMMON *const cm = &cpi->common;
+  const struct segmentation *const seg = &cm->seg;
+  const int pred_w = num_8x8_blocks_wide_lookup[bsize_pred];
+  const int pred_h = num_8x8_blocks_high_lookup[bsize_pred];
+  MB_MODE_INFO *mbmi;
+
+  set_mode_info_offsets(cpi, mb, mbd, mi_row_ori, mi_col_ori);
+  mbmi = &mbd->mi[0]->mbmi;
+
+  // MV limits: vectors outside this range cannot produce a new/different
+  // prediction block.
+  mb->mv_row_min = -((mi_row_pred + pred_h) * MI_SIZE + VP9_INTERP_EXTEND);
+  mb->mv_row_max = (cm->mi_rows - mi_row_pred) * MI_SIZE + VP9_INTERP_EXTEND;
+  mb->mv_col_min = -((mi_col_pred + pred_w) * MI_SIZE + VP9_INTERP_EXTEND);
+  mb->mv_col_max = (cm->mi_cols - mi_col_pred) * MI_SIZE + VP9_INTERP_EXTEND;
+
+  // Set up distance of MB to edge of frame in 1/8th pel units.
+  assert(!(mi_col_pred & (pred_w - 1)) && !(mi_row_pred & (pred_h - 1)));
+  set_mi_row_col(mbd, tile, mi_row_pred, pred_h, mi_col_pred, pred_w,
+                 cm->mi_rows, cm->mi_cols);
+  mbd->up_available = mi_row_ori != 0;
+  mbd->left_available = mi_col_ori > tile->mi_col_start;
+
+  // R/D setup.
+  mb->rddiv = cpi->rd.RDDIV;
+  mb->rdmult = cpi->rd.RDMULT;
+
+  // Segment id and the encode_breakout value that goes with it.
+  if (!seg->enabled) {
+    mbmi->segment_id = 0;
+    mb->encode_breakout = cpi->encode_breakout;
+  } else {
+    // With VARIANCE_AQ the segment id already stored in mbmi is kept.
+    if (cpi->oxcf.aq_mode != VARIANCE_AQ) {
+      const uint8_t *const seg_map =
+          seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+      mbmi->segment_id =
+          get_segment_id(cm, seg_map, bsize_ori, mi_row_ori, mi_col_ori);
+    }
+    vp10_init_plane_quantizers(cpi, mb);
+
+    mb->encode_breakout = cpi->segment_encode_breakout[mbmi->segment_id];
+  }
+}
+#endif  // CONFIG_SUPERTX
+
 static void set_block_size(VP10_COMP * const cpi,
                            MACROBLOCK *const x,
                            MACROBLOCKD *const xd,
@@ -967,7 +1084,9 @@
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
   int max_plane;
 
+#if !CONFIG_SUPERTX
   assert(mi->mbmi.sb_type == bsize);
+#endif
 
   *mi_addr = *mi;
   *x->mbmi_ext = ctx->mbmi_ext;
@@ -985,8 +1104,8 @@
     // and then update the quantizer.
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
       vp10_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row,
-                                        mi_col, bsize, ctx->rate, ctx->dist,
-                                        x->skip);
+                                         mi_col, bsize, ctx->rate, ctx->dist,
+                                         x->skip);
     }
   }
 
@@ -1026,8 +1145,15 @@
   }
 
   x->skip = ctx->skip;
+
+#if CONFIG_VAR_TX
+  for (i = 0; i < MAX_MB_PLANE; ++i)
+    memcpy(x->blk_skip[i], ctx->blk_skip[i],
+           sizeof(uint8_t) * ctx->num_4x4_blk);
+#else
   memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk,
          sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+#endif
 
   if (!output_enabled)
     return;
@@ -1056,7 +1182,11 @@
     if (is_inter_block(mbmi)) {
       vp10_update_mv_count(td);
 
-      if (cm->interp_filter == SWITCHABLE) {
+      if (cm->interp_filter == SWITCHABLE
+#if CONFIG_EXT_INTERP
+          && vp10_is_interp_needed(xd)
+#endif
+          ) {
         const int ctx = vp10_get_pred_context_switchable_interp(xd);
         ++td->counts->switchable_interp[ctx][mbmi->interp_filter];
       }
@@ -1082,6 +1212,285 @@
   }
 }
 
+#if CONFIG_SUPERTX
+static void update_state_supertx(VP10_COMP *cpi, ThreadData *td,
+                                 PICK_MODE_CONTEXT *ctx,
+                                 int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                 int output_enabled) {
+  int i, y, x_idx;
+  VP10_COMMON *const cm = &cpi->common;
+  RD_COUNTS *const rdc = &td->rd_counts;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *mi = &ctx->mic;  // mode info held by the pick context
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *mi_addr = xd->mi[0];  // entry that xd->mi[0] points to
+  const struct segmentation *const seg = &cm->seg;
+  const int mis = cm->mi_stride;
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int x_mis = VPXMIN(mi_width, cm->mi_cols - mi_col);  // clipped width
+  const int y_mis = VPXMIN(mi_height, cm->mi_rows - mi_row);  // clipped height
+  MV_REF *const frame_mvs =
+      cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+  int w, h;
+  // Commit the mode stored in ctx to the mi grid and to the encoder state.
+  *mi_addr = *mi;
+  *x->mbmi_ext = ctx->mbmi_ext;
+  assert(is_inter_block(mbmi));  // callers must pass an inter block
+
+  // If segmentation in use
+  if (seg->enabled && output_enabled) {
+    // For in frame complexity AQ copy the segment id from the segment map.
+    if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+      const uint8_t *const map = seg->update_map ? cpi->segmentation_map
+                                                 : cm->last_frame_seg_map;
+      mi_addr->mbmi.segment_id =
+        get_segment_id(cm, map, bsize, mi_row, mi_col);
+    } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+      // Else for cyclic refresh mode update the segment map, set the segment id
+      // and then update the quantizer.
+      vp10_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi,
+                                         mi_row, mi_col, bsize,
+                                         ctx->rate, ctx->dist, 1);
+      vp10_init_plane_quantizers(cpi, x);
+    }
+  }
+
+  // Point every mi grid position of the block that passes the frame-edge test
+  // at mi_addr, i.e. at the mode info that was in place when it was picked.
+  for (y = 0; y < mi_height; y++)
+    for (x_idx = 0; x_idx < mi_width; x_idx++)
+      if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx
+        && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
+        xd->mi[x_idx + y * mis] = mi_addr;
+      }
+
+  if (cpi->oxcf.aq_mode)  // any AQ mode: (re)initialise the plane quantizers
+    vp10_init_plane_quantizers(cpi, x);
+
+  if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) {
+    mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;  // block MV := bmi[3]
+    mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+  }
+
+  x->skip = ctx->skip;
+  memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk,
+         sizeof(uint8_t) * ctx->num_4x4_blk);
+
+  if (!output_enabled)
+    return;  // everything below updates statistics and the frame MV buffer
+
+  if (!frame_is_intra_only(cm)) {
+    vp10_update_mv_count(td);
+
+    if (cm->interp_filter == SWITCHABLE
+#if CONFIG_EXT_INTERP
+        && vp10_is_interp_needed(xd)
+#endif
+        ) {  // NOTE: 'ctx' below shadows the parameter inside this block
+      const int ctx = vp10_get_pred_context_switchable_interp(xd);
+      ++td->counts->switchable_interp[ctx][mbmi->interp_filter];
+    }
+
+    rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+    rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+    rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
+
+    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+      rdc->filter_diff[i] += ctx->best_filter_diff[i];
+  }
+
+  for (h = 0; h < y_mis; ++h) {  // store ref frames/MVs for each covered mi
+    MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+    for (w = 0; w < x_mis; ++w) {
+      MV_REF *const mv = frame_mv + w;
+      mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+      mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+      mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+      mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+    }
+  }
+}
+
+static void update_state_sb_supertx(VP10_COMP *cpi, ThreadData *td,
+                                    const TileInfo *const tile,
+                                    int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize,
+                                    int output_enabled, PC_TREE *pc_tree) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+  PARTITION_TYPE partition = pc_tree->partitioning;
+  BLOCK_SIZE subsize = get_subsize(bsize, partition);
+  int i;
+  // Walk the partition tree and commit the picked mode of every leaf block.
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;  // block starts outside the frame
+
+  switch (partition) {
+    case PARTITION_NONE:
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+      update_state_supertx(cpi, td, &pc_tree->none, mi_row, mi_col,
+                           subsize, output_enabled);
+      break;
+    case PARTITION_VERT:  // second half starts hbs mi units to the right
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+      update_state_supertx(cpi, td, &pc_tree->vertical[0], mi_row, mi_col,
+                           subsize, output_enabled);
+      if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
+        set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
+        update_state_supertx(cpi, td, &pc_tree->vertical[1],
+                             mi_row, mi_col + hbs, subsize, output_enabled);
+      }
+      break;
+    case PARTITION_HORZ:  // second half starts hbs mi units further down
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+      update_state_supertx(cpi, td, &pc_tree->horizontal[0], mi_row, mi_col,
+                           subsize, output_enabled);
+      if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
+        set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
+        update_state_supertx(cpi, td, &pc_tree->horizontal[1], mi_row + hbs,
+                             mi_col, subsize, output_enabled);
+      }
+      break;
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_8X8) {  // 8x8 split: only leaf_split[0] is committed
+        set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+        update_state_supertx(cpi, td, pc_tree->leaf_split[0], mi_row, mi_col,
+                             subsize, output_enabled);
+      } else {  // recurse into the four quadrants
+        set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+        update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, subsize,
+                                output_enabled, pc_tree->split[0]);
+        set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
+        update_state_sb_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize,
+                                output_enabled, pc_tree->split[1]);
+        set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
+        update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize,
+                                output_enabled, pc_tree->split[2]);
+        set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, subsize);
+        update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs,
+                                subsize, output_enabled, pc_tree->split[3]);
+      }
+      break;
+    default:
+      assert(0);  // unexpected partition type
+  }
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {  // use pc_tree->none's buffer set 1
+    p[i].coeff = (&pc_tree->none)->coeff_pbuf[i][1];
+    p[i].qcoeff = (&pc_tree->none)->qcoeff_pbuf[i][1];
+    pd[i].dqcoeff = (&pc_tree->none)->dqcoeff_pbuf[i][1];
+    p[i].eobs = (&pc_tree->none)->eobs_pbuf[i][1];
+  }
+}
+
+// Records the supertx size, skip state and (EXT_TX) tx type in |ctx|.
+static void update_supertx_param(ThreadData *td,
+                                 PICK_MODE_CONTEXT *ctx,
+#if CONFIG_EXT_TX
+                                 int best_tx,
+#endif
+                                 TX_SIZE supertx_size) {
+  MB_MODE_INFO *const mode = &ctx->mic.mbmi;
+#if CONFIG_EXT_TX
+  mode->tx_type = best_tx;
+#endif  // CONFIG_EXT_TX
+  mode->tx_size = supertx_size;
+  ctx->skip = td->mb.skip;
+  memcpy(ctx->zcoeff_blk, td->mb.zcoeff_blk[supertx_size],
+         sizeof(uint8_t) * ctx->num_4x4_blk);
+}
+
+static void update_supertx_param_sb(VP10_COMP *cpi, ThreadData *td,
+                                    int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize,
+#if CONFIG_EXT_TX
+                                    int best_tx,
+#endif
+                                    TX_SIZE supertx_size, PC_TREE *pc_tree) {
+  VP10_COMMON *const cm = &cpi->common;
+  int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+  PARTITION_TYPE partition = pc_tree->partitioning;
+  BLOCK_SIZE subsize = get_subsize(bsize, partition);
+  // Recursively apply update_supertx_param() to the leaf contexts of pc_tree.
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;  // block starts outside the frame
+
+  switch (partition) {
+    case PARTITION_NONE:
+      update_supertx_param(td, &pc_tree->none,
+#if CONFIG_EXT_TX
+                           best_tx,
+#endif
+                           supertx_size);
+      break;
+    case PARTITION_VERT:  // second half only if its start column is in frame
+      update_supertx_param(td, &pc_tree->vertical[0],
+#if CONFIG_EXT_TX
+                           best_tx,
+#endif
+                           supertx_size);
+      if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8)
+        update_supertx_param(td, &pc_tree->vertical[1],
+#if CONFIG_EXT_TX
+                             best_tx,
+#endif
+                             supertx_size);
+      break;
+    case PARTITION_HORZ:  // second half only if its start row is in frame
+      update_supertx_param(td, &pc_tree->horizontal[0],
+#if CONFIG_EXT_TX
+                           best_tx,
+#endif
+                           supertx_size);
+      if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8)
+        update_supertx_param(td, &pc_tree->horizontal[1],
+#if CONFIG_EXT_TX
+                             best_tx,
+#endif
+                             supertx_size);
+      break;
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_8X8) {  // 8x8 split: only leaf_split[0] is updated
+        update_supertx_param(td, pc_tree->leaf_split[0],
+#if CONFIG_EXT_TX
+                             best_tx,
+#endif
+                             supertx_size);
+      } else {  // recurse into the four quadrants
+        update_supertx_param_sb(cpi, td, mi_row, mi_col, subsize,
+#if CONFIG_EXT_TX
+                                best_tx,
+#endif
+                                supertx_size, pc_tree->split[0]);
+        update_supertx_param_sb(cpi, td, mi_row, mi_col + hbs, subsize,
+#if CONFIG_EXT_TX
+                                best_tx,
+#endif
+                                supertx_size, pc_tree->split[1]);
+        update_supertx_param_sb(cpi, td, mi_row + hbs, mi_col, subsize,
+#if CONFIG_EXT_TX
+                                best_tx,
+#endif
+                                supertx_size, pc_tree->split[2]);
+        update_supertx_param_sb(cpi, td, mi_row + hbs, mi_col + hbs, subsize,
+#if CONFIG_EXT_TX
+                                best_tx,
+#endif
+                                supertx_size, pc_tree->split[3]);
+      }
+      break;
+    default:
+      assert(0);  // unexpected partition type
+  }
+}
+#endif  // CONFIG_SUPERTX
+
 void vp10_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
                           int mi_row, int mi_col) {
   uint8_t *const buffers[3] = {src->y_buffer, src->u_buffer, src->v_buffer };
@@ -1113,6 +1522,9 @@
                              TileDataEnc *tile_data,
                              MACROBLOCK *const x,
                              int mi_row, int mi_col, RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+                             int *totalrate_nocoef,
+#endif
                              BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
                              int64_t best_rd) {
   VP10_COMMON *const cm = &cpi->common;
@@ -1140,6 +1552,15 @@
     p[i].eobs = ctx->eobs_pbuf[i][0];
   }
 
+  if (cm->current_video_frame == 0 && cm->allow_screen_content_tools) {
+    for (i = 0; i < 2; ++i) {
+      if (ctx->color_index_map[i] == 0) {
+        CHECK_MEM_ERROR(cm, ctx->color_index_map[i],
+                        vpx_memalign(16, (ctx->num_4x4_blk << 4) *
+                                     sizeof(*ctx->color_index_map[i])));
+      }
+    }
+  }
   for (i = 0; i < 2; ++i)
     pd[i].color_index_map = ctx->color_index_map[i];
 
@@ -1196,17 +1617,30 @@
   // as a predictor for MBs that follow in the SB
   if (frame_is_intra_only(cm)) {
     vp10_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+    *totalrate_nocoef = 0;
+#endif  // CONFIG_SUPERTX
   } else {
     if (bsize >= BLOCK_8X8) {
-      if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+      if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
         vp10_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize,
                                            ctx, best_rd);
-      else
-        vp10_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col,
-                                  rd_cost, bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+        *totalrate_nocoef = rd_cost->rate;
+#endif  // CONFIG_SUPERTX
+      } else {
+        vp10_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+#if CONFIG_SUPERTX
+                                   totalrate_nocoef,
+#endif  // CONFIG_SUPERTX
+                                   bsize, ctx, best_rd);
+      }
     } else {
-      vp10_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col,
-                                    rd_cost, bsize, ctx, best_rd);
+      vp10_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+#if CONFIG_SUPERTX
+                                     totalrate_nocoef,
+#endif  // CONFIG_SUPERTX
+                                     bsize, ctx, best_rd);
     }
   }
 
@@ -1231,7 +1665,47 @@
   ctx->dist = rd_cost->dist;
 }
 
-static void update_stats(VP10_COMMON *cm, ThreadData *td) {
+#if CONFIG_REF_MV
+static void update_inter_mode_stats(FRAME_COUNTS *counts,
+                                    PREDICTION_MODE mode,
+                                    int16_t mode_context) {
+  // Tally the cascaded inter-mode decisions (NEWMV, then ZEROMV, then
+  // NEARESTMV vs. the rest) under the contexts packed into mode_context.
+  const int16_t newmv_ctx = mode_context & NEWMV_CTX_MASK;
+  const int16_t zeromv_ctx =
+      (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
+  int16_t refmv_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+
+  // First decision: NEWMV (bin 0) or anything else (bin 1).
+  ++counts->newmv_mode[newmv_ctx][mode != NEWMV];
+  if (mode == NEWMV)
+    return;
+
+  // Nothing further is counted once the all-zero flag is set.
+  if (mode_context & (1 << ALL_ZERO_FLAG_OFFSET))
+    return;
+
+  // Second decision: ZEROMV (bin 0) or anything else (bin 1).
+  ++counts->zeromv_mode[zeromv_ctx][mode != ZEROMV];
+  if (mode == ZEROMV)
+    return;
+
+  // The skip flags replace the derived context; later checks take priority.
+  if (mode_context & (1 << SKIP_NEARESTMV_OFFSET))
+    refmv_ctx = 6;
+  if (mode_context & (1 << SKIP_NEARMV_OFFSET))
+    refmv_ctx = 7;
+  if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET))
+    refmv_ctx = 8;
+  ++counts->refmv_mode[refmv_ctx][mode != NEARESTMV];
+}
+#endif
+
+static void update_stats(VP10_COMMON *cm, ThreadData *td
+#if CONFIG_SUPERTX
+                         , int supertx_enabled
+#endif
+                         ) {
   const MACROBLOCK *x = &td->mb;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const MODE_INFO *const mi = xd->mi[0];
@@ -1245,6 +1719,9 @@
     const int seg_ref_active = segfeature_active(&cm->seg, mbmi->segment_id,
                                                  SEG_LVL_REF_FRAME);
     if (!seg_ref_active) {
+#if CONFIG_SUPERTX
+      if (!supertx_enabled)
+#endif
       counts->intra_inter[vp10_get_intra_inter_context(xd)][inter_block]++;
       // If the segment reference feature is enabled we have only a single
       // reference frame allowed for the segment so exclude it from
@@ -1256,23 +1733,66 @@
                             [has_second_ref(mbmi)]++;
 
         if (has_second_ref(mbmi)) {
-          counts->comp_ref[vp10_get_pred_context_comp_ref_p(cm, xd)]
+#if CONFIG_EXT_REFS
+          const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME ||
+                           ref0 == LAST4_FRAME);
+          counts->comp_ref[vp10_get_pred_context_comp_ref_p(cm, xd)][0][bit]++;
+          if (!bit) {
+            counts->comp_ref[vp10_get_pred_context_comp_ref_p1(cm, xd)][1]
+                            [ref0 == LAST_FRAME]++;
+          } else {
+            counts->comp_ref[vp10_get_pred_context_comp_ref_p2(cm, xd)][2]
+                            [ref0 == GOLDEN_FRAME]++;
+            if (ref0 != GOLDEN_FRAME) {
+              counts->comp_ref[vp10_get_pred_context_comp_ref_p3(cm, xd)][3]
+                              [ref0 == LAST3_FRAME]++;
+            }
+          }
+#else
+          counts->comp_ref[vp10_get_pred_context_comp_ref_p(cm, xd)][0]
                           [ref0 == GOLDEN_FRAME]++;
+#endif  // CONFIG_EXT_REFS
         } else {
+#if CONFIG_EXT_REFS
+          const int bit = (ref0 == ALTREF_FRAME || ref0 == GOLDEN_FRAME);
+          counts->single_ref[vp10_get_pred_context_single_ref_p1(xd)][0][bit]++;
+          if (bit) {
+            counts->single_ref[vp10_get_pred_context_single_ref_p2(xd)][1]
+                              [ref0 != GOLDEN_FRAME]++;
+          } else {
+            const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME);
+            counts->single_ref[vp10_get_pred_context_single_ref_p3(xd)][2]
+                              [bit1]++;
+            if (!bit1) {
+              counts->single_ref[vp10_get_pred_context_single_ref_p4(xd)][3]
+                                [ref0 != LAST_FRAME]++;
+            } else {
+              counts->single_ref[vp10_get_pred_context_single_ref_p5(xd)][4]
+                                [ref0 != LAST3_FRAME]++;
+            }
+          }
+#else
           counts->single_ref[vp10_get_pred_context_single_ref_p1(xd)][0]
                             [ref0 != LAST_FRAME]++;
           if (ref0 != LAST_FRAME)
             counts->single_ref[vp10_get_pred_context_single_ref_p2(xd)][1]
                               [ref0 != GOLDEN_FRAME]++;
+#endif  // CONFIG_EXT_REFS
         }
       }
     }
     if (inter_block &&
         !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-      const int mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
+      int16_t mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
       if (bsize >= BLOCK_8X8) {
         const PREDICTION_MODE mode = mbmi->mode;
+#if CONFIG_REF_MV
+        mode_ctx = vp10_mode_context_analyzer(mbmi_ext->mode_context,
+                                              mbmi->ref_frame, bsize, -1);
+        update_inter_mode_stats(counts, mode, mode_ctx);
+#else
         ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
+#endif
       } else {
         const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
         const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
@@ -1281,7 +1801,13 @@
           for (idx = 0; idx < 2; idx += num_4x4_w) {
             const int j = idy * 2 + idx;
             const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+#if CONFIG_REF_MV
+            mode_ctx = vp10_mode_context_analyzer(mbmi_ext->mode_context,
+                                                  mbmi->ref_frame, bsize, j);
+            update_inter_mode_stats(counts, b_mode, mode_ctx);
+#else
             ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
+#endif
           }
         }
       }
@@ -1293,6 +1819,9 @@
                             ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
                             ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
                             PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
+#if CONFIG_VAR_TX
+                            TXFM_CONTEXT ta[8], TXFM_CONTEXT tl[8],
+#endif
                             BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   int p;
@@ -1317,12 +1846,21 @@
          sizeof(*xd->above_seg_context) * mi_width);
   memcpy(xd->left_seg_context + (mi_row & MI_MASK), sl,
          sizeof(xd->left_seg_context[0]) * mi_height);
+#if CONFIG_VAR_TX
+  memcpy(xd->above_txfm_context, ta,
+         sizeof(*xd->above_txfm_context) * mi_width);
+  memcpy(xd->left_txfm_context, tl,
+         sizeof(*xd->left_txfm_context) * mi_height);
+#endif
 }
 
 static void save_context(MACROBLOCK *const x, int mi_row, int mi_col,
                          ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
                          ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
                          PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
+#if CONFIG_VAR_TX
+                         TXFM_CONTEXT ta[8], TXFM_CONTEXT tl[8],
+#endif
                          BLOCK_SIZE bsize) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   int p;
@@ -1349,6 +1887,12 @@
          sizeof(*xd->above_seg_context) * mi_width);
   memcpy(sl, xd->left_seg_context + (mi_row & MI_MASK),
          sizeof(xd->left_seg_context[0]) * mi_height);
+#if CONFIG_VAR_TX
+  memcpy(ta, xd->above_txfm_context,
+         sizeof(*xd->above_txfm_context) * mi_width);
+  memcpy(tl, xd->left_txfm_context,
+         sizeof(*xd->left_txfm_context) * mi_height);
+#endif
 }
 
 static void encode_b(VP10_COMP *cpi, const TileInfo *const tile,
@@ -1362,7 +1906,11 @@
   encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
 
   if (output_enabled) {
+#if CONFIG_SUPERTX
+    update_stats(&cpi->common, td, 0);
+#else
     update_stats(&cpi->common, td);
+#endif
   }
 }
 
@@ -1395,6 +1943,82 @@
   if (output_enabled && bsize != BLOCK_4X4)
     td->counts->partition[ctx][partition]++;
 
+#if CONFIG_SUPERTX
+  if (!frame_is_intra_only(cm) &&
+      bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+      partition != PARTITION_NONE &&
+      !xd->lossless[0]) {
+    int supertx_enabled;
+    TX_SIZE supertx_size = max_txsize_lookup[bsize];
+    supertx_enabled = check_supertx_sb(bsize, supertx_size, pc_tree);
+    if (supertx_enabled) {
+      const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+      const int mi_height = num_8x8_blocks_high_lookup[bsize];
+      int x_idx, y_idx, i;
+      uint8_t *dst_buf[3];
+      int dst_stride[3];
+      set_skip_context(xd, mi_row, mi_col);
+      set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+      update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize,
+                              output_enabled, pc_tree);
+
+      vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm),
+                           mi_row, mi_col);
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        dst_buf[i] = xd->plane[i].dst.buf;
+        dst_stride[i] = xd->plane[i].dst.stride;
+      }
+      predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col,
+                         output_enabled, bsize, bsize,
+                         dst_buf, dst_stride, pc_tree);
+
+      set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
+      if (!x->skip) {
+        xd->mi[0]->mbmi.skip = 1;
+        vp10_encode_sb_supertx(x, bsize);
+        vp10_tokenize_sb_supertx(cpi, td, tp, !output_enabled, bsize);
+      } else {
+        xd->mi[0]->mbmi.skip = 1;
+        if (output_enabled)
+          td->counts->skip[vp10_get_skip_context(xd)][1]++;
+        reset_skip_context(xd, bsize);
+      }
+      if (output_enabled) {
+        for (y_idx = 0; y_idx < mi_height; y_idx++)
+          for (x_idx = 0; x_idx < mi_width; x_idx++) {
+            if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx
+                && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height
+                    > y_idx) {
+              xd->mi[x_idx + y_idx * cm->mi_stride]->mbmi.skip =
+                  xd->mi[0]->mbmi.skip;
+            }
+          }
+        td->counts->supertx
+            [partition_supertx_context_lookup[partition]][supertx_size][1]++;
+        td->counts->supertx_size[supertx_size]++;
+#if CONFIG_EXT_TX
+        if (get_ext_tx_types(supertx_size, bsize, 1) > 1 &&
+            !xd->mi[0]->mbmi.skip) {
+          int eset = get_ext_tx_set(supertx_size, bsize, 1);
+          if (eset > 0) {
+            ++td->counts->inter_ext_tx[eset][supertx_size]
+                                      [xd->mi[0]->mbmi.tx_type];
+          }
+        }
+#endif  // CONFIG_EXT_TX
+      }
+      if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+        update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+      return;
+    } else {
+      if (output_enabled) {
+        td->counts->supertx
+            [partition_supertx_context_lookup[partition]][supertx_size][0]++;
+      }
+    }
+  }
+#endif  // CONFIG_SUPERTX
+
   switch (partition) {
     case PARTITION_NONE:
       encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
@@ -1519,6 +2143,9 @@
                              int mi_row, int mi_col,
                              BLOCK_SIZE bsize,
                              int *rate, int64_t *dist,
+#if CONFIG_SUPERTX
+                             int *rate_nocoef,
+#endif
                              int do_recon, PC_TREE *pc_tree) {
   VP10_COMMON *const cm = &cpi->common;
   TileInfo *const tile_info = &tile_data->tile_info;
@@ -1533,12 +2160,20 @@
   BLOCK_SIZE subsize;
   ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
   PARTITION_CONTEXT sl[8], sa[8];
+#if CONFIG_VAR_TX
+  TXFM_CONTEXT tl[8], ta[8];
+#endif
   RD_COST last_part_rdc, none_rdc, chosen_rdc;
   BLOCK_SIZE sub_subsize = BLOCK_4X4;
   int splits_below = 0;
   BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type;
   int do_partition_search = 1;
   PICK_MODE_CONTEXT *ctx = &pc_tree->none;
+#if CONFIG_SUPERTX
+  int last_part_rate_nocoef = INT_MAX;
+  int none_rate_nocoef = INT_MAX;
+  int chosen_rate_nocoef = INT_MAX;
+#endif
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
@@ -1553,8 +2188,16 @@
   partition = partition_lookup[bsl][bs_type];
   subsize = get_subsize(bsize, partition);
 
+#if CONFIG_VAR_TX
+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+#endif
   pc_tree->partitioning = partition;
-  save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+  save_context(x, mi_row, mi_col, a, l, sa, sl,
+#if CONFIG_VAR_TX
+               ta, tl,
+#endif
+               bsize);
 
   if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) {
     set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
@@ -1583,8 +2226,11 @@
         mi_row + (mi_step >> 1) < cm->mi_rows &&
         mi_col + (mi_step >> 1) < cm->mi_cols) {
       pc_tree->partitioning = PARTITION_NONE;
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
-                       ctx, INT64_MAX);
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
+#if CONFIG_SUPERTX
+                       &none_rate_nocoef,
+#endif
+                       bsize, ctx, INT64_MAX);
 
       pl = partition_plane_context(xd, mi_row, mi_col, bsize);
 
@@ -1592,9 +2238,16 @@
         none_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
         none_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, none_rdc.rate,
                                  none_rdc.dist);
+#if CONFIG_SUPERTX
+        none_rate_nocoef += cpi->partition_cost[pl][PARTITION_NONE];
+#endif
       }
 
-      restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+      restore_context(x, mi_row, mi_col, a, l, sa, sl,
+#if CONFIG_VAR_TX
+                      ta, tl,
+#endif
+                      bsize);
       mi_8x8[0]->mbmi.sb_type = bs_type;
       pc_tree->partitioning = partition;
     }
@@ -1603,68 +2256,110 @@
   switch (partition) {
     case PARTITION_NONE:
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+                       &last_part_rate_nocoef,
+#endif
                        bsize, ctx, INT64_MAX);
       break;
     case PARTITION_HORZ:
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+                       &last_part_rate_nocoef,
+#endif
                        subsize, &pc_tree->horizontal[0],
                        INT64_MAX);
       if (last_part_rdc.rate != INT_MAX &&
           bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) {
         RD_COST tmp_rdc;
+#if CONFIG_SUPERTX
+        int rt_nocoef = 0;
+#endif
         PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
         vp10_rd_cost_init(&tmp_rdc);
         update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
         encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
         rd_pick_sb_modes(cpi, tile_data, x,
                          mi_row + (mi_step >> 1), mi_col, &tmp_rdc,
+#if CONFIG_SUPERTX
+                         &rt_nocoef,
+#endif
                          subsize, &pc_tree->horizontal[1], INT64_MAX);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           vp10_rd_cost_reset(&last_part_rdc);
+#if CONFIG_SUPERTX
+          last_part_rate_nocoef = INT_MAX;
+#endif
           break;
         }
         last_part_rdc.rate += tmp_rdc.rate;
         last_part_rdc.dist += tmp_rdc.dist;
         last_part_rdc.rdcost += tmp_rdc.rdcost;
+#if CONFIG_SUPERTX
+        last_part_rate_nocoef += rt_nocoef;
+#endif
       }
       break;
     case PARTITION_VERT:
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+                       &last_part_rate_nocoef,
+#endif
                        subsize, &pc_tree->vertical[0], INT64_MAX);
       if (last_part_rdc.rate != INT_MAX &&
           bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) {
         RD_COST tmp_rdc;
+#if CONFIG_SUPERTX
+        int rt_nocoef = 0;
+#endif
         PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0];
         vp10_rd_cost_init(&tmp_rdc);
         update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
         encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
         rd_pick_sb_modes(cpi, tile_data, x,
                          mi_row, mi_col + (mi_step >> 1), &tmp_rdc,
+#if CONFIG_SUPERTX
+                         &rt_nocoef,
+#endif
                          subsize, &pc_tree->vertical[bsize > BLOCK_8X8],
                          INT64_MAX);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           vp10_rd_cost_reset(&last_part_rdc);
+#if CONFIG_SUPERTX
+          last_part_rate_nocoef = INT_MAX;
+#endif
           break;
         }
         last_part_rdc.rate += tmp_rdc.rate;
         last_part_rdc.dist += tmp_rdc.dist;
         last_part_rdc.rdcost += tmp_rdc.rdcost;
+#if CONFIG_SUPERTX
+        last_part_rate_nocoef += rt_nocoef;
+#endif
       }
       break;
     case PARTITION_SPLIT:
       if (bsize == BLOCK_8X8) {
         rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+                         &last_part_rate_nocoef,
+#endif
                          subsize, pc_tree->leaf_split[0], INT64_MAX);
         break;
       }
       last_part_rdc.rate = 0;
       last_part_rdc.dist = 0;
       last_part_rdc.rdcost = 0;
+#if CONFIG_SUPERTX
+      last_part_rate_nocoef = 0;
+#endif
       for (i = 0; i < 4; i++) {
         int x_idx = (i & 1) * (mi_step >> 1);
         int y_idx = (i >> 1) * (mi_step >> 1);
         int jj = i >> 1, ii = i & 0x01;
         RD_COST tmp_rdc;
+#if CONFIG_SUPERTX
+        int rt_nocoef;
+#endif
         if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
           continue;
 
@@ -1673,13 +2368,22 @@
                          mi_8x8 + jj * bss * mis + ii * bss, tp,
                          mi_row + y_idx, mi_col + x_idx, subsize,
                          &tmp_rdc.rate, &tmp_rdc.dist,
+#if CONFIG_SUPERTX
+                         &rt_nocoef,
+#endif
                          i != 3, pc_tree->split[i]);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           vp10_rd_cost_reset(&last_part_rdc);
+#if CONFIG_SUPERTX
+          last_part_rate_nocoef = INT_MAX;
+#endif
           break;
         }
         last_part_rdc.rate += tmp_rdc.rate;
         last_part_rdc.dist += tmp_rdc.dist;
+#if CONFIG_SUPERTX
+        last_part_rate_nocoef += rt_nocoef;
+#endif
       }
       break;
     default:
@@ -1692,6 +2396,9 @@
     last_part_rdc.rate += cpi->partition_cost[pl][partition];
     last_part_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                                   last_part_rdc.rate, last_part_rdc.dist);
+#if CONFIG_SUPERTX
+    last_part_rate_nocoef += cpi->partition_cost[pl][partition];
+#endif
   }
 
   if (do_partition_search
@@ -1705,7 +2412,14 @@
     BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
     chosen_rdc.rate = 0;
     chosen_rdc.dist = 0;
-    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+#if CONFIG_SUPERTX
+    chosen_rate_nocoef = 0;
+#endif
+    restore_context(x, mi_row, mi_col, a, l, sa, sl,
+#if CONFIG_VAR_TX
+                    ta, tl,
+#endif
+                    bsize);
     pc_tree->partitioning = PARTITION_SPLIT;
 
     // Split partition.
@@ -1713,27 +2427,50 @@
       int x_idx = (i & 1) * (mi_step >> 1);
       int y_idx = (i >> 1) * (mi_step >> 1);
       RD_COST tmp_rdc;
+#if CONFIG_SUPERTX
+      int rt_nocoef = 0;
+#endif
       ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
       PARTITION_CONTEXT sl[8], sa[8];
+#if CONFIG_VAR_TX
+      TXFM_CONTEXT tl[8], ta[8];
+#endif
 
       if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
         continue;
 
-      save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+      save_context(x, mi_row, mi_col, a, l, sa, sl,
+#if CONFIG_VAR_TX
+                   ta, tl,
+#endif
+                   bsize);
       pc_tree->split[i]->partitioning = PARTITION_NONE;
       rd_pick_sb_modes(cpi, tile_data, x,
                        mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
+#if CONFIG_SUPERTX
+                       &rt_nocoef,
+#endif
                        split_subsize, &pc_tree->split[i]->none, INT64_MAX);
 
-      restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+      restore_context(x, mi_row, mi_col, a, l, sa, sl,
+#if CONFIG_VAR_TX
+                      ta, tl,
+#endif
+                      bsize);
 
       if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
         vp10_rd_cost_reset(&chosen_rdc);
+#if CONFIG_SUPERTX
+        chosen_rate_nocoef = INT_MAX;
+#endif
         break;
       }
 
       chosen_rdc.rate += tmp_rdc.rate;
       chosen_rdc.dist += tmp_rdc.dist;
+#if CONFIG_SUPERTX
+      chosen_rate_nocoef += rt_nocoef;
+#endif
 
       if (i != 3)
         encode_sb(cpi, td, tile_info, tp,  mi_row + y_idx, mi_col + x_idx, 0,
@@ -1742,12 +2479,18 @@
       pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
                                    split_subsize);
       chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+#if CONFIG_SUPERTX  // Same per-sub-block cost as chosen_rdc.rate above.
+      chosen_rate_nocoef += cpi->partition_cost[pl][PARTITION_NONE];
+#endif
     }
     pl = partition_plane_context(xd, mi_row, mi_col, bsize);
     if (chosen_rdc.rate < INT_MAX) {
       chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
       chosen_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                                  chosen_rdc.rate, chosen_rdc.dist);
+#if CONFIG_SUPERTX  // Same parent-level split cost as chosen_rdc.rate above.
+      chosen_rate_nocoef += cpi->partition_cost[pl][PARTITION_SPLIT];
+#endif
     }
   }
 
@@ -1757,15 +2500,29 @@
     if (bsize >= BLOCK_8X8)
       pc_tree->partitioning = partition;
     chosen_rdc = last_part_rdc;
+#if CONFIG_SUPERTX
+    chosen_rate_nocoef = last_part_rate_nocoef;
+#endif
   }
   // If none was better set the partitioning to that.
   if (none_rdc.rdcost < chosen_rdc.rdcost) {
     if (bsize >= BLOCK_8X8)
       pc_tree->partitioning = PARTITION_NONE;
     chosen_rdc = none_rdc;
+#if CONFIG_SUPERTX
+    chosen_rate_nocoef = none_rate_nocoef;
+#endif
   }
 
-  restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+#if CONFIG_VAR_TX
+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+#endif
+  restore_context(x, mi_row, mi_col, a, l, sa, sl,
+#if CONFIG_VAR_TX
+                  ta, tl,
+#endif
+                  bsize);
 
   // We must have chosen a partitioning and encoding or we'll fail later on.
   // No other opportunities for success.
@@ -1780,6 +2537,9 @@
 
   *rate = chosen_rdc.rate;
   *dist = chosen_rdc.dist;
+#if CONFIG_SUPERTX
+  *rate_nocoef = chosen_rate_nocoef;
+#endif
 }
 
 static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = {
@@ -2029,6 +2789,9 @@
                               TileDataEnc *tile_data,
                               TOKENEXTRA **tp, int mi_row, int mi_col,
                               BLOCK_SIZE bsize, RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+                              int *rate_nocoef,
+#endif
                               int64_t best_rd, PC_TREE *pc_tree) {
   VP10_COMMON *const cm = &cpi->common;
   TileInfo *const tile_info = &tile_data->tile_info;
@@ -2037,11 +2800,21 @@
   const int mi_step = num_8x8_blocks_wide_lookup[bsize] / 2;
   ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
   PARTITION_CONTEXT sl[8], sa[8];
+#if CONFIG_VAR_TX
+  TXFM_CONTEXT tl[8], ta[8];
+#endif
   TOKENEXTRA *tp_orig = *tp;
   PICK_MODE_CONTEXT *ctx = &pc_tree->none;
   int i, pl;
   BLOCK_SIZE subsize;
   RD_COST this_rdc, sum_rdc, best_rdc;
+#if CONFIG_SUPERTX
+  int this_rate_nocoef, sum_rate_nocoef = 0, best_rate_nocoef = INT_MAX;
+  int tmp_rate;
+  int abort_flag;
+  int64_t tmp_dist, tmp_rd;
+  PARTITION_TYPE best_partition;
+#endif  // CONFIG_SUPERTX
   int do_split = bsize >= BLOCK_8X8;
   int do_rect = 1;
 
@@ -2102,7 +2875,13 @@
     partition_vert_allowed &= force_vert_split;
   }
 
+#if CONFIG_VAR_TX
+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+  save_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
+#else
   save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+#endif
 
 #if CONFIG_FP_MB_STATS
   if (cpi->use_fp_mb_stats) {
@@ -2165,14 +2944,20 @@
 
   // PARTITION_NONE
   if (partition_none_allowed) {
-    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col,
-                     &this_rdc, bsize, ctx, best_rdc.rdcost);
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+#if CONFIG_SUPERTX
+                     &this_rate_nocoef,
+#endif
+                     bsize, ctx, best_rdc.rdcost);
     if (this_rdc.rate != INT_MAX) {
       if (bsize >= BLOCK_8X8) {
         pl = partition_plane_context(xd, mi_row, mi_col, bsize);
         this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
         this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                                  this_rdc.rate, this_rdc.dist);
+#if CONFIG_SUPERTX
+        this_rate_nocoef += cpi->partition_cost[pl][PARTITION_NONE];
+#endif
       }
 
       if (this_rdc.rdcost < best_rdc.rdcost) {
@@ -2180,6 +2965,10 @@
         int rate_breakout_thr = cpi->sf.partition_search_breakout_rate_thr;
 
         best_rdc = this_rdc;
+#if CONFIG_SUPERTX
+        best_rate_nocoef = this_rate_nocoef;
+        assert(best_rate_nocoef >= 0);
+#endif
         if (bsize >= BLOCK_8X8)
           pc_tree->partitioning = PARTITION_NONE;
 
@@ -2248,7 +3037,13 @@
 #endif
       }
     }
+#if CONFIG_VAR_TX
+    xd->above_txfm_context = cm->above_txfm_context + mi_col;
+    xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
+#else
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+#endif  // CONFIG_VAR_TX
   }
 
   // store estimated motion vector
@@ -2265,14 +3060,82 @@
       if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
         pc_tree->leaf_split[0]->pred_interp_filter =
             ctx->mic.mbmi.interp_filter;
+#if CONFIG_SUPERTX
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+                       &sum_rate_nocoef, subsize, pc_tree->leaf_split[0],
+                       INT64_MAX);
+#else
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
                        pc_tree->leaf_split[0], best_rdc.rdcost);
-      if (sum_rdc.rate == INT_MAX)
+#endif  // CONFIG_SUPERTX
+      if (sum_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+        sum_rate_nocoef = INT_MAX;
+#endif
+      }
+#if CONFIG_SUPERTX
+      if (!frame_is_intra_only(cm) && sum_rdc.rdcost < INT64_MAX &&
+          !xd->lossless[0]) {
+        TX_SIZE supertx_size = max_txsize_lookup[bsize];
+        best_partition = pc_tree->partitioning;
+        pc_tree->partitioning = PARTITION_SPLIT;
+
+        sum_rdc.rate += vp10_cost_bit(
+            cm->fc->supertx_prob
+            [partition_supertx_context_lookup[PARTITION_SPLIT]][supertx_size],
+            0);
+        sum_rdc.rdcost =
+            RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+        if (is_inter_mode(pc_tree->leaf_split[0]->mic.mbmi.mode)) {
+#if CONFIG_EXT_TX
+          TX_TYPE best_tx = DCT_DCT;
+#endif
+
+          tmp_rate = sum_rate_nocoef;
+          tmp_dist = 0;
+#if CONFIG_VAR_TX
+          xd->above_txfm_context = cm->above_txfm_context + mi_col;
+          xd->left_txfm_context =
+              xd->left_txfm_context_buffer + (mi_row & 0x07);
+          restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
+#else
+          restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+#endif  // CONFIG_VAR_TX
+          rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
+                        &tmp_rate, &tmp_dist,
+#if CONFIG_EXT_TX
+                        &best_tx,
+#endif
+                        pc_tree);
+
+          tmp_rate += vp10_cost_bit(
+              cm->fc->supertx_prob
+              [partition_supertx_context_lookup[PARTITION_SPLIT]][supertx_size],
+              1);
+          tmp_rd = RDCOST(x->rdmult, x->rddiv, tmp_rate, tmp_dist);
+          if (tmp_rd < sum_rdc.rdcost) {
+            sum_rdc.rdcost = tmp_rd;
+            sum_rdc.rate = tmp_rate;
+            sum_rdc.dist = tmp_dist;
+            update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize,
+#if CONFIG_EXT_TX
+                                    best_tx,
+#endif
+                                    supertx_size, pc_tree);
+          }
+        }
+        pc_tree->partitioning = best_partition;
+      }
+#endif  // CONFIG_SUPERTX
     } else {
+#if CONFIG_SUPERTX
+      for (i = 0; i < 4 && sum_rdc.rdcost < INT64_MAX; ++i) {
+#else
       for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
-      const int x_idx = (i & 1) * mi_step;
-      const int y_idx = (i >> 1) * mi_step;
+#endif  // CONFIG_SUPERTX
+        const int x_idx = (i & 1) * mi_step;
+        const int y_idx = (i >> 1) * mi_step;
 
         if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
           continue;
@@ -2281,20 +3144,90 @@
           load_pred_mv(x, ctx);
 
         pc_tree->split[i]->index = i;
+#if CONFIG_SUPERTX
+        rd_pick_partition(cpi, td, tile_data, tp,
+                          mi_row + y_idx, mi_col + x_idx,
+                          subsize, &this_rdc, &this_rate_nocoef,
+                          INT64_MAX - sum_rdc.rdcost, pc_tree->split[i]);
+#else
         rd_pick_partition(cpi, td, tile_data, tp,
                           mi_row + y_idx, mi_col + x_idx,
                           subsize, &this_rdc,
                           best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
+#endif  // CONFIG_SUPERTX
 
         if (this_rdc.rate == INT_MAX) {
           sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+          sum_rate_nocoef = INT_MAX;
+#endif  // CONFIG_SUPERTX
           break;
         } else {
           sum_rdc.rate += this_rdc.rate;
           sum_rdc.dist += this_rdc.dist;
           sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+          sum_rate_nocoef += this_rate_nocoef;
+#endif  // CONFIG_SUPERTX
         }
       }
+#if CONFIG_SUPERTX
+      if (!frame_is_intra_only(cm) &&
+          sum_rdc.rdcost < INT64_MAX &&
+          i == 4 && bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+          !xd->lossless[0]) {
+        TX_SIZE supertx_size = max_txsize_lookup[bsize];
+        best_partition = pc_tree->partitioning;
+        pc_tree->partitioning = PARTITION_SPLIT;
+
+        sum_rdc.rate += vp10_cost_bit(
+            cm->fc->supertx_prob
+            [partition_supertx_context_lookup[PARTITION_SPLIT]][supertx_size],
+            0);
+        sum_rdc.rdcost =
+            RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+        if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+#if CONFIG_EXT_TX
+          TX_TYPE best_tx = DCT_DCT;
+#endif
+
+          tmp_rate = sum_rate_nocoef;
+          tmp_dist = 0;
+#if CONFIG_VAR_TX
+          xd->above_txfm_context = cm->above_txfm_context + mi_col;
+          xd->left_txfm_context =
+              xd->left_txfm_context_buffer + (mi_row & 0x07);
+          restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
+#else
+          restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+#endif  // CONFIG_VAR_TX
+          rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
+                        &tmp_rate, &tmp_dist,
+#if CONFIG_EXT_TX
+                        &best_tx,
+#endif
+                        pc_tree);
+
+          tmp_rate += vp10_cost_bit(
+              cm->fc->supertx_prob
+              [partition_supertx_context_lookup[PARTITION_SPLIT]][supertx_size],
+              1);
+          tmp_rd = RDCOST(x->rdmult, x->rddiv, tmp_rate, tmp_dist);
+          if (tmp_rd < sum_rdc.rdcost) {
+            sum_rdc.rdcost = tmp_rd;
+            sum_rdc.rate = tmp_rate;
+            sum_rdc.dist = tmp_dist;
+            update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize,
+#if CONFIG_EXT_TX
+                                    best_tx,
+#endif
+                                    supertx_size, pc_tree);
+          }
+        }
+        pc_tree->partitioning = best_partition;
+      }
+#endif  // CONFIG_SUPERTX
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost && i == 4) {
@@ -2302,9 +3235,16 @@
       sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
       sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                               sum_rdc.rate, sum_rdc.dist);
+#if CONFIG_SUPERTX
+      sum_rate_nocoef += cpi->partition_cost[pl][PARTITION_SPLIT];
+#endif  // CONFIG_SUPERTX
 
       if (sum_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = sum_rdc;
+#if CONFIG_SUPERTX
+        best_rate_nocoef = sum_rate_nocoef;
+        assert(best_rate_nocoef >= 0);
+#endif  // CONFIG_SUPERTX
         pc_tree->partitioning = PARTITION_SPLIT;
       }
     } else {
@@ -2313,23 +3253,39 @@
       if (cpi->sf.less_rectangular_check)
         do_rect &= !partition_none_allowed;
     }
+#if CONFIG_VAR_TX
+    xd->above_txfm_context = cm->above_txfm_context + mi_col;
+    xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
+#else
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
-  }
+#endif
+  }  // if (do_split)
 
   // PARTITION_HORZ
   if (partition_horz_allowed &&
       (do_rect || vp10_active_h_edge(cpi, mi_row, mi_step))) {
-      subsize = get_subsize(bsize, PARTITION_HORZ);
+    subsize = get_subsize(bsize, PARTITION_HORZ);
     if (cpi->sf.adaptive_motion_search)
       load_pred_mv(x, ctx);
     if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
         partition_none_allowed)
       pc_tree->horizontal[0].pred_interp_filter =
           ctx->mic.mbmi.interp_filter;
-    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
-                     &pc_tree->horizontal[0], best_rdc.rdcost);
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+#if CONFIG_SUPERTX
+                     &sum_rate_nocoef,
+#endif  // CONFIG_SUPERTX
+                     subsize, &pc_tree->horizontal[0], best_rdc.rdcost);
 
-    if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows &&
+#if CONFIG_SUPERTX
+    abort_flag = (sum_rdc.rdcost >= best_rd && bsize > BLOCK_8X8) ||
+        (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8);
+    if (sum_rdc.rdcost < INT64_MAX &&
+#else
+    if (sum_rdc.rdcost < best_rdc.rdcost &&
+#endif  // CONFIG_SUPERTX
+        mi_row + mi_step < cm->mi_rows &&
         bsize > BLOCK_8X8) {
       PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
       update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
@@ -2341,33 +3297,113 @@
           partition_none_allowed)
         pc_tree->horizontal[1].pred_interp_filter =
             ctx->mic.mbmi.interp_filter;
+#if CONFIG_SUPERTX
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col,
+                       &this_rdc, &this_rate_nocoef,
+                       subsize, &pc_tree->horizontal[1],
+                       INT64_MAX);
+#else
       rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col,
                        &this_rdc, subsize, &pc_tree->horizontal[1],
                        best_rdc.rdcost - sum_rdc.rdcost);
+#endif  // CONFIG_SUPERTX
       if (this_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+        sum_rate_nocoef = INT_MAX;
+#endif  // CONFIG_SUPERTX
       } else {
         sum_rdc.rate += this_rdc.rate;
         sum_rdc.dist += this_rdc.dist;
         sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+        sum_rate_nocoef += this_rate_nocoef;
+#endif  // CONFIG_SUPERTX
       }
     }
 
+#if CONFIG_SUPERTX
+    if (!frame_is_intra_only(cm) && !abort_flag &&
+        sum_rdc.rdcost < INT64_MAX && bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+        !xd->lossless[0]) {
+      TX_SIZE supertx_size = max_txsize_lookup[bsize];
+      best_partition = pc_tree->partitioning;
+      pc_tree->partitioning = PARTITION_HORZ;
+
+      sum_rdc.rate += vp10_cost_bit(
+          cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]]
+          [supertx_size], 0);
+      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+      if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+#if CONFIG_EXT_TX
+        TX_TYPE best_tx = DCT_DCT;
+#endif
+
+        tmp_rate = sum_rate_nocoef;
+        tmp_dist = 0;
+#if CONFIG_VAR_TX
+        xd->above_txfm_context = cm->above_txfm_context + mi_col;
+        xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+        restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
+#else
+        restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+#endif  // CONFIG_VAR_TX
+        rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
+                      &tmp_rate, &tmp_dist,
+#if CONFIG_EXT_TX
+                      &best_tx,
+#endif
+                      pc_tree);
+
+        tmp_rate += vp10_cost_bit(
+            cm->fc->supertx_prob
+            [partition_supertx_context_lookup[PARTITION_HORZ]][supertx_size],
+            1);
+        tmp_rd = RDCOST(x->rdmult, x->rddiv, tmp_rate, tmp_dist);
+        if (tmp_rd < sum_rdc.rdcost) {
+          sum_rdc.rdcost = tmp_rd;
+          sum_rdc.rate = tmp_rate;
+          sum_rdc.dist = tmp_dist;
+          update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize,
+#if CONFIG_EXT_TX
+                                  best_tx,
+#endif
+                                  supertx_size, pc_tree);
+        }
+      }
+      pc_tree->partitioning = best_partition;
+    }
+#endif  // CONFIG_SUPERTX
+
     if (sum_rdc.rdcost < best_rdc.rdcost) {
       pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
       sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+#if CONFIG_SUPERTX
+      sum_rate_nocoef += cpi->partition_cost[pl][PARTITION_HORZ];
+#endif  // CONFIG_SUPERTX
       if (sum_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = sum_rdc;
+#if CONFIG_SUPERTX
+        best_rate_nocoef = sum_rate_nocoef;
+        assert(best_rate_nocoef >= 0);
+#endif  // CONFIG_SUPERTX
         pc_tree->partitioning = PARTITION_HORZ;
       }
     }
+#if CONFIG_VAR_TX
+    xd->above_txfm_context = cm->above_txfm_context + mi_col;
+    xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
+#else
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+#endif
   }
   // PARTITION_VERT
   if (partition_vert_allowed &&
       (do_rect || vp10_active_v_edge(cpi, mi_col, mi_step))) {
-      subsize = get_subsize(bsize, PARTITION_VERT);
+    subsize = get_subsize(bsize, PARTITION_VERT);
 
     if (cpi->sf.adaptive_motion_search)
       load_pred_mv(x, ctx);
@@ -2375,9 +3411,19 @@
         partition_none_allowed)
       pc_tree->vertical[0].pred_interp_filter =
           ctx->mic.mbmi.interp_filter;
-    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
-                     &pc_tree->vertical[0], best_rdc.rdcost);
-    if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + mi_step < cm->mi_cols &&
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+#if CONFIG_SUPERTX
+                     &sum_rate_nocoef,
+#endif  // CONFIG_SUPERTX
+                     subsize, &pc_tree->vertical[0], best_rdc.rdcost);
+#if CONFIG_SUPERTX
+    abort_flag = (sum_rdc.rdcost >= best_rd && bsize > BLOCK_8X8) ||
+                 (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8);
+    if (sum_rdc.rdcost < INT64_MAX &&
+#else
+    if (sum_rdc.rdcost < best_rdc.rdcost &&
+#endif  // CONFIG_SUPERTX
+        mi_col + mi_step < cm->mi_cols &&
         bsize > BLOCK_8X8) {
       update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0);
       encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize,
@@ -2389,29 +3435,106 @@
           partition_none_allowed)
         pc_tree->vertical[1].pred_interp_filter =
             ctx->mic.mbmi.interp_filter;
+#if CONFIG_SUPERTX
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
+                       &this_rate_nocoef, subsize, &pc_tree->vertical[1],
+                       INT64_MAX - sum_rdc.rdcost);
+#else
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step,
                        &this_rdc, subsize,
                        &pc_tree->vertical[1], best_rdc.rdcost - sum_rdc.rdcost);
+#endif  // CONFIG_SUPERTX
       if (this_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+        sum_rate_nocoef = INT_MAX;
+#endif  // CONFIG_SUPERTX
       } else {
         sum_rdc.rate += this_rdc.rate;
         sum_rdc.dist += this_rdc.dist;
         sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+        sum_rate_nocoef += this_rate_nocoef;
+#endif  // CONFIG_SUPERTX
       }
     }
+#if CONFIG_SUPERTX
+    if (!frame_is_intra_only(cm) && !abort_flag &&
+        sum_rdc.rdcost < INT64_MAX && bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+        !xd->lossless[0]) {
+      TX_SIZE supertx_size = max_txsize_lookup[bsize];
+      best_partition = pc_tree->partitioning;
+      pc_tree->partitioning = PARTITION_VERT;
+      sum_rdc.rate += vp10_cost_bit(
+          cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]]
+                              [supertx_size], 0);
+      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+      if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+#if CONFIG_EXT_TX
+        TX_TYPE best_tx = DCT_DCT;
+#endif
+
+        tmp_rate = sum_rate_nocoef;
+        tmp_dist = 0;
+#if CONFIG_VAR_TX
+        xd->above_txfm_context = cm->above_txfm_context + mi_col;
+        xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+        restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
+#else
+        restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+#endif  // CONFIG_VAR_TX
+        rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
+                      &tmp_rate, &tmp_dist,
+#if CONFIG_EXT_TX
+                      &best_tx,
+#endif
+                      pc_tree);
+
+        tmp_rate += vp10_cost_bit(
+            cm->fc->supertx_prob
+            [partition_supertx_context_lookup[PARTITION_VERT]][supertx_size],
+            1);
+        tmp_rd = RDCOST(x->rdmult, x->rddiv, tmp_rate, tmp_dist);
+        if (tmp_rd < sum_rdc.rdcost) {
+          sum_rdc.rdcost = tmp_rd;
+          sum_rdc.rate = tmp_rate;
+          sum_rdc.dist = tmp_dist;
+          update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize,
+#if CONFIG_EXT_TX
+                                  best_tx,
+#endif
+                                  supertx_size, pc_tree);
+        }
+      }
+      pc_tree->partitioning = best_partition;
+    }
+#endif  // CONFIG_SUPERTX
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
       pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
       sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                               sum_rdc.rate, sum_rdc.dist);
+#if CONFIG_SUPERTX
+      sum_rate_nocoef += cpi->partition_cost[pl][PARTITION_VERT];
+#endif  // CONFIG_SUPERTX
       if (sum_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = sum_rdc;
+#if CONFIG_SUPERTX
+        best_rate_nocoef = sum_rate_nocoef;
+        assert(best_rate_nocoef >= 0);
+#endif  // CONFIG_SUPERTX
         pc_tree->partitioning = PARTITION_VERT;
       }
     }
+#if CONFIG_VAR_TX
+    xd->above_txfm_context = cm->above_txfm_context + mi_col;
+    xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+    restore_context(x, mi_row, mi_col, a, l, sa, sl, ta, tl, bsize);
+#else
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+#endif
   }
 
   // TODO(jbb): This code added so that we avoid static analysis
@@ -2420,7 +3543,9 @@
   // checks occur in some sub function and thus are used...
   (void) best_rd;
   *rd_cost = best_rdc;
-
+#if CONFIG_SUPERTX
+  *rate_nocoef = best_rate_nocoef;
+#endif  // CONFIG_SUPERTX
 
   if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
       pc_tree->index != 3) {
@@ -2453,7 +3578,10 @@
   // Initialize the left context for the new SB row
   memset(&xd->left_context, 0, sizeof(xd->left_context));
   memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
-
+#if CONFIG_VAR_TX
+  memset(xd->left_txfm_context_buffer, 0,
+         sizeof(xd->left_txfm_context_buffer));
+#endif
   // Code each SB in the row
   for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
        mi_col += MI_BLOCK_SIZE) {
@@ -2461,6 +3589,9 @@
     int dummy_rate;
     int64_t dummy_dist;
     RD_COST dummy_rdc;
+#if CONFIG_SUPERTX
+    int dummy_rate_nocoef;
+#endif  // CONFIG_SUPERTX
     int i;
     int seg_skip = 0;
 
@@ -2496,19 +3627,31 @@
       set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
       set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
-                       BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
+                       BLOCK_64X64, &dummy_rate, &dummy_dist,
+#if CONFIG_SUPERTX
+                       &dummy_rate_nocoef,
+#endif  // CONFIG_SUPERTX
+                       1, td->pc_root);
     } else if (cpi->partition_search_skippable_frame) {
       BLOCK_SIZE bsize;
       set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
       bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
       set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
-                       BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
+                       BLOCK_64X64, &dummy_rate, &dummy_dist,
+#if CONFIG_SUPERTX
+                       &dummy_rate_nocoef,
+#endif  // CONFIG_SUPERTX
+                       1, td->pc_root);
     } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
                cm->frame_type != KEY_FRAME) {
       choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
-                       BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
+                       BLOCK_64X64, &dummy_rate, &dummy_dist,
+#if CONFIG_SUPERTX
+                       &dummy_rate_nocoef,
+#endif  // CONFIG_SUPERTX
+                       1, td->pc_root);
     } else {
       // If required set upper and lower partition size limits
       if (sf->auto_min_max_partition_size) {
@@ -2518,7 +3661,11 @@
                                 &x->max_partition_size);
       }
       rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64,
-                        &dummy_rdc, INT64_MAX, td->pc_root);
+                        &dummy_rdc,
+#if CONFIG_SUPERTX
+                        &dummy_rate_nocoef,
+#endif  // CONFIG_SUPERTX
+                        INT64_MAX, td->pc_root);
     }
   }
 }
@@ -2541,6 +3688,10 @@
          2 * aligned_mi_cols * MAX_MB_PLANE);
   memset(xd->above_seg_context, 0,
          sizeof(*xd->above_seg_context) * aligned_mi_cols);
+#if CONFIG_VAR_TX
+  memset(cm->above_txfm_context, 0,
+         sizeof(*xd->above_txfm_context) * aligned_mi_cols);
+#endif
 }
 
 static int check_dual_ref_flags(VP10_COMP *cpi) {
@@ -2549,8 +3700,14 @@
   if (segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) {
     return 0;
   } else {
-    return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG)
-        + !!(ref_flags & VP9_ALT_FLAG)) >= 2;
+    return (!!(ref_flags & VP9_GOLD_FLAG) +
+            !!(ref_flags & VP9_LAST_FLAG) +
+#if CONFIG_EXT_REFS
+            !!(ref_flags & VP9_LAST2_FLAG) +
+            !!(ref_flags & VP9_LAST3_FLAG) +
+            !!(ref_flags & VP9_LAST4_FLAG) +
+#endif  // CONFIG_EXT_REFS
+            !!(ref_flags & VP9_ALT_FLAG)) >= 2;
   }
 }
 
@@ -2575,6 +3732,8 @@
   else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
     return GOLDEN_FRAME;
   else
+    // TODO(zoeliu): Investigate whether a frame_type other than
+    // INTRA/ALTREF/GOLDEN/LAST needs to be specified separately.
     return LAST_FRAME;
 }
 
@@ -2734,6 +3893,9 @@
 
   x->quant_fp = cpi->sf.use_quant_fp;
   vp10_zero(x->skip_txfm);
+#if CONFIG_VAR_TX
+  vp10_zero(x->blk_skip);
+#endif
 
   {
     struct vpx_usec_timer emr_timer;
@@ -2798,7 +3960,14 @@
       cpi->allow_comp_inter_inter = 1;
       cm->comp_fixed_ref = ALTREF_FRAME;
       cm->comp_var_ref[0] = LAST_FRAME;
+#if CONFIG_EXT_REFS
+      cm->comp_var_ref[1] = LAST2_FRAME;
+      cm->comp_var_ref[2] = LAST3_FRAME;
+      cm->comp_var_ref[3] = LAST4_FRAME;
+      cm->comp_var_ref[4] = GOLDEN_FRAME;
+#else
       cm->comp_var_ref[1] = GOLDEN_FRAME;
+#endif  // CONFIG_EXT_REFS
     }
   } else {
     cpi->allow_comp_inter_inter = 0;
@@ -2814,9 +3983,12 @@
     // either compound, single or hybrid prediction as per whatever has
     // worked best for that type of frame in the past.
     // It also predicts whether another coding mode would have worked
-    // better that this coding mode. If that is the case, it remembers
+    // better than this coding mode. If that is the case, it remembers
     // that for subsequent frames.
     // It does the same analysis for transform size selection also.
+    //
+    // TODO(zoeliu): Investigate whether a frame_type other than
+    // INTRA/ALTREF/GOLDEN/LAST needs to be specified separately.
     const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
     int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type];
     int64_t *const filter_thrs = rd_opt->filter_threshes[frame_type];
@@ -2836,8 +4008,9 @@
     else
       cm->reference_mode = REFERENCE_MODE_SELECT;
 
-    if (cm->interp_filter == SWITCHABLE)
+    if (cm->interp_filter == SWITCHABLE) {
       cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref);
+    }
 
     encode_frame_internal(cpi);
 
@@ -2865,6 +4038,7 @@
       }
     }
 
+#if !CONFIG_VAR_TX
     if (cm->tx_mode == TX_MODE_SELECT) {
       int count4x4 = 0;
       int count8x8_lp = 0, count8x8_8x8p = 0;
@@ -2885,20 +4059,36 @@
         count32x32 += counts->tx.p32x32[i][TX_32X32];
       }
       if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
+#if CONFIG_SUPERTX
+          cm->counts.supertx_size[TX_16X16] == 0 &&
+          cm->counts.supertx_size[TX_32X32] == 0 &&
+#endif  // CONFIG_SUPERTX
           count32x32 == 0) {
         cm->tx_mode = ALLOW_8X8;
         reset_skip_tx_size(cm, TX_8X8);
       } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
-                 count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
+                 count8x8_lp == 0 && count16x16_lp == 0 &&
+#if CONFIG_SUPERTX
+                 cm->counts.supertx_size[TX_8X8] == 0 &&
+                 cm->counts.supertx_size[TX_16X16] == 0 &&
+                 cm->counts.supertx_size[TX_32X32] == 0 &&
+#endif  // CONFIG_SUPERTX
+                 count32x32 == 0) {
         cm->tx_mode = ONLY_4X4;
         reset_skip_tx_size(cm, TX_4X4);
-      } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
+      } else if (count8x8_lp == 0 && count16x16_lp == 0 &&
+                 count4x4 == 0) {
         cm->tx_mode = ALLOW_32X32;
-      } else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) {
+      } else if (count32x32 == 0 && count8x8_lp == 0 &&
+#if CONFIG_SUPERTX
+                 cm->counts.supertx_size[TX_32X32] == 0 &&
+#endif  // CONFIG_SUPERTX
+                 count4x4 == 0) {
         cm->tx_mode = ALLOW_16X16;
         reset_skip_tx_size(cm, TX_16X16);
       }
     }
+#endif
   } else {
     cm->reference_mode = SINGLE_REFERENCE;
     encode_frame_internal(cpi);
@@ -2941,6 +4131,140 @@
   ++counts->uv_mode[y_mode][uv_mode];
 }
 
+#if CONFIG_VAR_TX
+static void update_txfm_count(MACROBLOCKD *xd,
+                              FRAME_COUNTS *counts,
+                              TX_SIZE tx_size, int blk_row, int blk_col) {
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  int tx_idx = (blk_row >> 1) * 8 + (blk_col >> 1);
+  int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  int ctx = txfm_partition_context(xd->above_txfm_context + (blk_col >> 1),
+                                   xd->left_txfm_context + (blk_row >> 1),
+                                   tx_size);
+  TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_idx];
+
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> 5;
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> 5;
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  if (tx_size == plane_tx_size) {
+    ++counts->txfm_partition[ctx][0];
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + (blk_col >> 1),
+                          xd->left_txfm_context + (blk_row >> 1), tx_size);
+  } else {
+    BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int bh = num_4x4_blocks_high_lookup[bsize];
+    int i;
+    ++counts->txfm_partition[ctx][1];
+
+    if (tx_size == TX_8X8) {
+      mbmi->inter_tx_size[tx_idx] = TX_4X4;
+      mbmi->tx_size = TX_4X4;
+      txfm_partition_update(xd->above_txfm_context + (blk_col >> 1),
+                            xd->left_txfm_context + (blk_row >> 1), TX_4X4);
+      return;
+    }
+
+    for (i = 0; i < 4; ++i) {
+      int offsetr = (i >> 1) * bh / 2;
+      int offsetc = (i & 0x01) * bh / 2;
+      update_txfm_count(xd, counts, tx_size - 1,
+                        blk_row + offsetr, blk_col + offsetc);
+    }
+  }
+}
+
+static void tx_partition_count_update(VP10_COMMON *cm,
+                                      MACROBLOCKD *xd,
+                                      BLOCK_SIZE plane_bsize,
+                                      int mi_row, int mi_col,
+                                      FRAME_COUNTS *td_counts) {
+  const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+  const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+  TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+  BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+  int bh = num_4x4_blocks_wide_lookup[txb_size];
+  int idx, idy;
+
+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+
+  for (idy = 0; idy < mi_height; idy += bh)
+    for (idx = 0; idx < mi_width; idx += bh)
+      update_txfm_count(xd, td_counts, max_tx_size, idy, idx);
+}
+
+static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size,
+                             int blk_row, int blk_col) {
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  int tx_idx = (blk_row >> 1) * 8 + (blk_col >> 1);
+  int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_idx];
+
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> 5;
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> 5;
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  if (tx_size == plane_tx_size) {
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + (blk_col >> 1),
+                          xd->left_txfm_context + (blk_row >> 1), tx_size);
+
+  } else {
+    BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int bsl = b_width_log2_lookup[bsize];
+    int i;
+
+    if (tx_size == TX_8X8) {
+      mbmi->inter_tx_size[tx_idx] = TX_4X4;
+      mbmi->tx_size = TX_4X4;
+      txfm_partition_update(xd->above_txfm_context + (blk_col >> 1),
+                            xd->left_txfm_context + (blk_row >> 1), TX_4X4);
+      return;
+    }
+
+    assert(bsl > 0);
+    --bsl;
+    for (i = 0; i < 4; ++i) {
+      int offsetr = (i >> 1) << bsl;
+      int offsetc = (i & 0x01) << bsl;
+      set_txfm_context(xd, tx_size - 1,
+                       blk_row + offsetr, blk_col + offsetc);
+    }
+  }
+}
+
+static void tx_partition_set_contexts(VP10_COMMON *cm,
+                                      MACROBLOCKD *xd,
+                                      BLOCK_SIZE plane_bsize,
+                                      int mi_row, int mi_col) {
+  const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+  const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+  TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+  BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+  int bh = num_4x4_blocks_wide_lookup[txb_size];
+  int idx, idy;
+
+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & 0x07);
+
+  for (idy = 0; idy < mi_height; idy += bh)
+    for (idx = 0; idx < mi_width; idx += bh)
+      set_txfm_context(xd, max_tx_size, idy, idx);
+}
+#endif
+
 static void encode_superblock(VP10_COMP *cpi, ThreadData *td,
                               TOKENEXTRA **t, int output_enabled,
                               int mi_row, int mi_col, BLOCK_SIZE bsize,
@@ -2977,6 +4301,16 @@
     if (output_enabled)
       sum_intra_stats(td->counts, mi, xd->above_mi, xd->left_mi,
                       frame_is_intra_only(cm));
+
+    if (bsize >= BLOCK_8X8 && output_enabled) {
+      if (mbmi->palette_mode_info.palette_size[0] > 0) {
+        mbmi->palette_mode_info.palette_first_color_idx[0] =
+            xd->plane[0].color_index_map[0];
+        // TODO(huisu): this increases the use of the token buffer. Needs a
+        // stress test to verify.
+        vp10_tokenize_palette_sb(td, bsize, 0, t);
+      }
+    }
     vp10_tokenize_sb(cpi, td, t, !output_enabled, VPXMAX(bsize, BLOCK_8X8));
   } else {
     int ref;
@@ -2997,25 +4331,33 @@
                                      VPXMAX(bsize, BLOCK_8X8));
 
     vp10_encode_sb(x, VPXMAX(bsize, BLOCK_8X8));
+#if CONFIG_VAR_TX
+    vp10_tokenize_sb_inter(cpi, td, t, !output_enabled,
+                           mi_row, mi_col, VPXMAX(bsize, BLOCK_8X8));
+#else
     vp10_tokenize_sb(cpi, td, t, !output_enabled, VPXMAX(bsize, BLOCK_8X8));
+#endif
   }
 
   if (output_enabled) {
     if (cm->tx_mode == TX_MODE_SELECT &&
         mbmi->sb_type >= BLOCK_8X8  &&
         !(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) {
+#if CONFIG_VAR_TX
+      if (is_inter_block(mbmi))
+        tx_partition_count_update(cm, xd, bsize, mi_row, mi_col, td->counts);
+#endif
       ++get_tx_counts(max_txsize_lookup[bsize], get_tx_size_context(xd),
                       &td->counts->tx)[mbmi->tx_size];
     } else {
       int x, y;
       TX_SIZE tx_size;
       // The new intra coding scheme requires no change of transform size
-      if (is_inter_block(&mi->mbmi)) {
+      if (is_inter_block(&mi->mbmi))
         tx_size = VPXMIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
                          max_txsize_lookup[bsize]);
-      } else {
+      else
         tx_size = (bsize >= BLOCK_8X8) ? mbmi->tx_size : TX_4X4;
-      }
 
       for (y = 0; y < mi_height; y++)
         for (x = 0; x < mi_width; x++)
@@ -3024,6 +4366,22 @@
     }
     ++td->counts->tx.tx_totals[mbmi->tx_size];
     ++td->counts->tx.tx_totals[get_uv_tx_size(mbmi, &xd->plane[1])];
+#if CONFIG_EXT_TX
+    if (get_ext_tx_types(mbmi->tx_size, bsize, is_inter_block(mbmi)) > 1 &&
+        cm->base_qindex > 0 && !mbmi->skip &&
+        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      int eset = get_ext_tx_set(mbmi->tx_size, bsize,
+                                is_inter_block(mbmi));
+      if (eset > 0) {
+        if (is_inter_block(mbmi)) {
+          ++td->counts->inter_ext_tx[eset][mbmi->tx_size][mbmi->tx_type];
+        } else {
+          ++td->counts->intra_ext_tx[eset][mbmi->tx_size][mbmi->mode]
+              [mbmi->tx_type];
+        }
+      }
+    }
+#else
     if (mbmi->tx_size < TX_32X32 &&
         cm->base_qindex > 0 && !mbmi->skip &&
         !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
@@ -3035,5 +4393,822 @@
                                   [mbmi->tx_type];
       }
     }
+#endif  // CONFIG_EXT_TX
+#if CONFIG_EXT_INTRA
+    if (bsize >= BLOCK_8X8 && !is_inter_block(mbmi)) {
+      if (mbmi->mode == DC_PRED)
+        ++td->counts->ext_intra[0]
+                              [mbmi->ext_intra_mode_info.use_ext_intra_mode[0]];
+      if (mbmi->uv_mode == DC_PRED)
+        ++td->counts->ext_intra[1]
+                              [mbmi->ext_intra_mode_info.use_ext_intra_mode[1]];
+    }
+#endif  // CONFIG_EXT_INTRA
+  }
+
+#if CONFIG_VAR_TX
+  if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 &&
+      is_inter_block(mbmi) && !(mbmi->skip || seg_skip)) {
+    if (!output_enabled)
+      tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
+  } else {
+    TX_SIZE tx_size;
+    // The new intra coding scheme requires no change of transform size
+    if (is_inter_block(mbmi))
+      tx_size = VPXMIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
+                       max_txsize_lookup[bsize]);
+    else
+      tx_size = (bsize >= BLOCK_8X8) ? mbmi->tx_size : TX_4X4;
+    mbmi->tx_size = tx_size;
+    set_txfm_ctx(xd->left_txfm_context, tx_size, xd->n8_h);
+    set_txfm_ctx(xd->above_txfm_context, tx_size, xd->n8_w);
+  }
+#endif
+}
+
+#if CONFIG_SUPERTX
+static int check_intra_b(PICK_MODE_CONTEXT *ctx) {
+  return !is_inter_mode((&ctx->mic)->mbmi.mode);
+}
+
+static int check_intra_sb(VP10_COMP *cpi, const TileInfo *const tile,
+                          int mi_row, int mi_col, BLOCK_SIZE bsize,
+                          PC_TREE *pc_tree) {
+  VP10_COMMON *const cm = &cpi->common;
+
+  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize = bsize;
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return 1;
+
+  if (bsize >= BLOCK_8X8)
+    subsize = get_subsize(bsize, pc_tree->partitioning);
+  else
+    subsize = BLOCK_4X4;
+
+  partition = partition_lookup[bsl][subsize];
+
+  switch (partition) {
+    case PARTITION_NONE:
+      return check_intra_b(&pc_tree->none);
+      break;
+    case PARTITION_VERT:
+      if (check_intra_b(&pc_tree->vertical[0]))
+        return 1;
+      if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
+        if (check_intra_b(&pc_tree->vertical[1]))
+          return 1;
+      }
+      break;
+    case PARTITION_HORZ:
+      if (check_intra_b(&pc_tree->horizontal[0]))
+        return 1;
+      if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
+        if (check_intra_b(&pc_tree->horizontal[1]))
+          return 1;
+      }
+      break;
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_8X8) {
+        if (check_intra_b(pc_tree->leaf_split[0]))
+          return 1;
+      } else {
+        if (check_intra_sb(cpi, tile, mi_row, mi_col, subsize,
+                           pc_tree->split[0]))
+          return 1;
+        if (check_intra_sb(cpi, tile, mi_row, mi_col + hbs, subsize,
+                           pc_tree->split[1]))
+          return 1;
+        if (check_intra_sb(cpi, tile, mi_row + hbs, mi_col, subsize,
+                           pc_tree->split[2]))
+          return 1;
+        if (check_intra_sb(cpi, tile, mi_row + hbs, mi_col + hbs, subsize,
+                           pc_tree->split[3]))
+          return 1;
+      }
+      break;
+    default:
+      assert(0);
+  }
+  return 0;
+}
+
+static int check_supertx_b(TX_SIZE supertx_size, PICK_MODE_CONTEXT *ctx) {
+  return ctx->mic.mbmi.tx_size == supertx_size;
+}
+
+static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
+                            PC_TREE *pc_tree) {
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize;
+
+  partition = pc_tree->partitioning;
+  subsize = get_subsize(bsize, partition);
+  switch (partition) {
+    case PARTITION_NONE:
+      return check_supertx_b(supertx_size, &pc_tree->none);
+    case PARTITION_VERT:
+      return check_supertx_b(supertx_size, &pc_tree->vertical[0]);
+    case PARTITION_HORZ:
+      return check_supertx_b(supertx_size, &pc_tree->horizontal[0]);
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_8X8)
+        return check_supertx_b(supertx_size, pc_tree->leaf_split[0]);
+      else
+        return check_supertx_sb(subsize, supertx_size, pc_tree->split[0]);
+    default:
+      assert(0);
+      return 0;
   }
 }
+
+static void predict_superblock(VP10_COMP *cpi, ThreadData *td,
+                               int mi_row_pred, int mi_col_pred,
+                               BLOCK_SIZE bsize_pred, int b_sub8x8, int block) {
+  // Used in supertx
+  // The reference frames are taken from xd->mi[0]->mbmi
+  // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *mi_8x8 = xd->mi[0];
+  MODE_INFO *mi = mi_8x8;
+  MB_MODE_INFO *mbmi = &mi->mbmi;
+  int ref;
+  const int is_compound = has_second_ref(mbmi);
+
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+  for (ref = 0; ref < 1 + is_compound; ++ref) {
+    YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi,
+                                                   mbmi->ref_frame[ref]);
+    vp10_setup_pre_planes(xd, ref, cfg, mi_row_pred, mi_col_pred,
+                         &xd->block_refs[ref]->sf);
+  }
+
+  if (!b_sub8x8)
+    vp10_build_inter_predictors_sb(xd, mi_row_pred, mi_col_pred, bsize_pred);
+  else
+    vp10_build_inter_predictors_sb_sub8x8(xd, mi_row_pred, mi_col_pred,
+                                          bsize_pred, block);
+}
+
+static void predict_b_extend(VP10_COMP *cpi, ThreadData *td,
+                             const TileInfo *const tile,
+                             int block,
+                             int mi_row_ori, int mi_col_ori,
+                             int mi_row_pred, int mi_col_pred,
+                             int mi_row_top, int mi_col_top,
+                             uint8_t * dst_buf[3], int dst_stride[3],
+                             BLOCK_SIZE bsize_ori, BLOCK_SIZE bsize_top,
+                             BLOCK_SIZE bsize_pred, int output_enabled,
+                             int b_sub8x8, int bextend) {
+  // Used in supertx
+  // (mi_row_ori, mi_col_ori): location for mv
+  // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+  // (mi_row_top, mi_col_top, bsize_top): region of the top partition size
+  // block: sub location of sub8x8 blocks
+  // b_sub8x8: 1: ori is sub8x8; 0: ori is not sub8x8
+  // bextend: 1: region to predict is an extension of ori; 0: not
+
+  MACROBLOCK *const x = &td->mb;
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int r = (mi_row_pred - mi_row_top) * MI_SIZE;
+  int c = (mi_col_pred - mi_col_top) * MI_SIZE;
+  const int mi_width_top = num_8x8_blocks_wide_lookup[bsize_top];
+  const int mi_height_top = num_8x8_blocks_high_lookup[bsize_top];
+
+  if (mi_row_pred < mi_row_top || mi_col_pred < mi_col_top ||
+      mi_row_pred >= mi_row_top + mi_height_top ||
+      mi_col_pred >= mi_col_top + mi_width_top ||
+      mi_row_pred >= cm->mi_rows || mi_col_pred >= cm->mi_cols)
+    return;
+
+  set_offsets_extend(cpi, td, tile, mi_row_pred, mi_col_pred,
+                     mi_row_ori, mi_col_ori, bsize_pred, bsize_ori);
+  xd->plane[0].dst.stride = dst_stride[0];
+  xd->plane[1].dst.stride = dst_stride[1];
+  xd->plane[2].dst.stride = dst_stride[2];
+  xd->plane[0].dst.buf = dst_buf[0] +
+                         (r >> xd->plane[0].subsampling_y) * dst_stride[0] +
+                         (c >> xd->plane[0].subsampling_x);
+  xd->plane[1].dst.buf = dst_buf[1] +
+                         (r >> xd->plane[1].subsampling_y) * dst_stride[1] +
+                         (c >> xd->plane[1].subsampling_x);
+  xd->plane[2].dst.buf = dst_buf[2] +
+                         (r >> xd->plane[2].subsampling_y) * dst_stride[2] +
+                         (c >> xd->plane[2].subsampling_x);
+
+  predict_superblock(cpi, td,
+                     mi_row_pred, mi_col_pred, bsize_pred,
+                     b_sub8x8, block);
+
+  if (output_enabled && !bextend)
+    update_stats(&cpi->common, td, 1);
+}
+
+static void extend_dir(VP10_COMP *cpi, ThreadData *td,
+                       const TileInfo *const tile,
+                       int block, BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+                       int mi_row, int mi_col,
+                       int mi_row_top, int mi_col_top,
+                       int output_enabled,
+                       uint8_t * dst_buf[3], int dst_stride[3], int dir) {
+  // dir: 0-lower, 1-upper, 2-left, 3-right
+  //      4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright
+  MACROBLOCKD *xd = &td->mb.e_mbd;
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  int xss = xd->plane[1].subsampling_x;
+  int yss = xd->plane[1].subsampling_y;
+  int b_sub8x8 = (bsize < BLOCK_8X8) ? 1 : 0;
+
+  BLOCK_SIZE extend_bsize;
+  int unit, mi_row_pred, mi_col_pred;
+
+  if (dir == 0 || dir == 1) {  // lower and upper
+    extend_bsize = (mi_width == 1 || bsize < BLOCK_8X8 || xss < yss) ?
+                   BLOCK_8X8 : BLOCK_16X8;
+    unit = num_8x8_blocks_wide_lookup[extend_bsize];
+    mi_row_pred = mi_row + ((dir == 0) ? mi_height : -1);
+    mi_col_pred = mi_col;
+
+    predict_b_extend(cpi, td, tile, block, mi_row, mi_col,
+                     mi_row_pred, mi_col_pred,
+                     mi_row_top, mi_col_top, dst_buf, dst_stride,
+                     bsize, top_bsize, extend_bsize,
+                     output_enabled, b_sub8x8, 1);
+
+    if (mi_width > unit) {
+      int i;
+      for (i = 0; i < mi_width/unit - 1; i++) {
+        mi_col_pred += unit;
+        predict_b_extend(cpi, td, tile, block, mi_row, mi_col,
+                         mi_row_pred, mi_col_pred, mi_row_top, mi_col_top,
+                         dst_buf, dst_stride, bsize, top_bsize, extend_bsize,
+                         output_enabled, b_sub8x8, 1);
+      }
+    }
+  } else if (dir == 2 || dir == 3) {  // left and right
+    extend_bsize = (mi_height == 1 || bsize < BLOCK_8X8 || yss < xss) ?
+                   BLOCK_8X8 : BLOCK_8X16;
+    unit = num_8x8_blocks_high_lookup[extend_bsize];
+    mi_row_pred = mi_row;
+    mi_col_pred = mi_col + ((dir == 3) ? mi_width : -1);
+
+    predict_b_extend(cpi, td, tile, block, mi_row, mi_col,
+                     mi_row_pred, mi_col_pred, mi_row_top, mi_col_top,
+                     dst_buf, dst_stride, bsize, top_bsize, extend_bsize,
+                     output_enabled, b_sub8x8, 1);
+
+    if (mi_height > unit) {
+      int i;
+      for (i = 0; i < mi_height/unit - 1; i++) {
+        mi_row_pred += unit;
+        predict_b_extend(cpi, td, tile, block, mi_row, mi_col,
+                         mi_row_pred, mi_col_pred, mi_row_top, mi_col_top,
+                         dst_buf, dst_stride, bsize, top_bsize, extend_bsize,
+                         output_enabled, b_sub8x8, 1);
+      }
+    }
+  } else {
+    extend_bsize = BLOCK_8X8;
+    mi_row_pred = mi_row + ((dir == 4 || dir == 6) ? mi_height : -1);
+    mi_col_pred = mi_col + ((dir == 6 || dir == 7) ? mi_width : -1);
+
+    predict_b_extend(cpi, td, tile, block, mi_row, mi_col,
+                     mi_row_pred, mi_col_pred, mi_row_top, mi_col_top,
+                     dst_buf, dst_stride, bsize, top_bsize, extend_bsize,
+                     output_enabled, b_sub8x8, 1);
+  }
+}
+
+static void extend_all(VP10_COMP *cpi, ThreadData *td,
+                       const TileInfo *const tile,
+                       int block,
+                       BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+                       int mi_row, int mi_col,
+                       int mi_row_top, int mi_col_top,
+                       int output_enabled,
+                       uint8_t * dst_buf[3], int dst_stride[3]) {
+  assert(block >= 0 && block < 4);
+  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 0);
+  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 1);
+  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 2);
+  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 3);
+  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 4);
+  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 5);
+  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 6);
+  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 7);
+}
+
+
+// This function generates the prediction for multiple blocks; discontinuities
+// at the boundaries between them are reduced by smoothing masks. The basic
+// smoothing mask is a soft step function along the horz/vert direction. In the
+// more complicated case where a block is split into 4 subblocks, the basic mask
+// is first applied to neighboring subblocks (2 pairs) in the horizontal
+// direction, then to the 2 resulting masked predictions in the vertical one.
+// If the block is split over more than one level, the masked prediction of
+// each stage is stored in the dst_buf[] passed down from the higher level.
+static void predict_sb_complex(VP10_COMP *cpi, ThreadData *td,
+                               const TileInfo *const tile,
+                               int mi_row, int mi_col,
+                               int mi_row_top, int mi_col_top,
+                               int output_enabled, BLOCK_SIZE bsize,
+                               BLOCK_SIZE top_bsize,
+                               uint8_t *dst_buf[3], int dst_stride[3],
+                               PC_TREE *pc_tree) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize;
+
+  int i, ctx;
+  uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
+  DECLARE_ALIGNED(16, uint8_t,
+                  tmp_buf1[MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * 2]);
+  DECLARE_ALIGNED(16, uint8_t,
+                  tmp_buf2[MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * 2]);
+  DECLARE_ALIGNED(16, uint8_t,
+                  tmp_buf3[MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * 2]);
+  int dst_stride1[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN};
+  int dst_stride2[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN};
+  int dst_stride3[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN};
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    int len = sizeof(uint16_t);
+    dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
+    dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAXTXLEN * MAXTXLEN * len);
+    dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAXTXLEN * MAXTXLEN * len);
+    dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
+    dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAXTXLEN * MAXTXLEN * len);
+    dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAXTXLEN * MAXTXLEN * len);
+    dst_buf3[0] = CONVERT_TO_BYTEPTR(tmp_buf3);
+    dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAXTXLEN * MAXTXLEN * len);
+    dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAXTXLEN * MAXTXLEN * len);
+  } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    dst_buf1[0] = tmp_buf1;
+    dst_buf1[1] = tmp_buf1 + MAXTXLEN * MAXTXLEN;
+    dst_buf1[2] = tmp_buf1 + 2 * MAXTXLEN * MAXTXLEN;
+    dst_buf2[0] = tmp_buf2;
+    dst_buf2[1] = tmp_buf2 + MAXTXLEN * MAXTXLEN;
+    dst_buf2[2] = tmp_buf2 + 2 * MAXTXLEN * MAXTXLEN;
+    dst_buf3[0] = tmp_buf3;
+    dst_buf3[1] = tmp_buf3 + MAXTXLEN * MAXTXLEN;
+    dst_buf3[2] = tmp_buf3 + 2 * MAXTXLEN * MAXTXLEN;
+#if CONFIG_VP9_HIGHBITDEPTH
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  if (bsize >= BLOCK_8X8) {
+    ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+    subsize = get_subsize(bsize, pc_tree->partitioning);
+  } else {
+    ctx = 0;
+    subsize = BLOCK_4X4;
+  }
+  partition = partition_lookup[bsl][subsize];
+  if (output_enabled && bsize != BLOCK_4X4 && bsize < top_bsize)
+      cm->counts.partition[ctx][partition]++;
+
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    xd->plane[i].dst.buf = dst_buf[i];
+    xd->plane[i].dst.stride = dst_stride[i];
+  }
+
+  switch (partition) {
+    case PARTITION_NONE:
+      assert(bsize < top_bsize);
+      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride,
+                       bsize, top_bsize, bsize, output_enabled, 0, 0);
+      extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+      break;
+    case PARTITION_HORZ:
+      if (bsize == BLOCK_8X8) {
+        // First half
+        predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride,
+                         subsize, top_bsize, BLOCK_8X8, output_enabled, 1, 0);
+        if (bsize < top_bsize)
+          extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride);
+
+        // Second half
+        predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                         subsize, top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+        if (bsize < top_bsize)
+          extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf1, dst_stride1);
+
+        // Smooth
+        xd->plane[0].dst.buf = dst_buf[0];
+        xd->plane[0].dst.stride = dst_stride[0];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[0], dst_stride[0],
+                                                  dst_buf1[0], dst_stride1[0],
+                                                  &xd->plane[0],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_HORZ, 0);
+      }  else {
+        // First half
+        predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride,
+                         subsize, top_bsize, subsize, output_enabled, 0, 0);
+        if (bsize < top_bsize)
+          extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride);
+        else
+          extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride, 0);
+
+        if (mi_row + hbs < cm->mi_rows) {
+          // Second half
+          predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col,
+                           mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, subsize, top_bsize, subsize,
+                           output_enabled, 0, 0);
+          if (bsize < top_bsize)
+            extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
+                       mi_col, mi_row_top, mi_col_top, output_enabled,
+                       dst_buf1, dst_stride1);
+          else
+            extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
+                       mi_col, mi_row_top, mi_col_top, output_enabled,
+                       dst_buf1, dst_stride1, 1);
+
+          // Smooth
+          for (i = 0; i < MAX_MB_PLANE; i++) {
+            xd->plane[i].dst.buf = dst_buf[i];
+            xd->plane[i].dst.stride = dst_stride[i];
+            vp10_build_masked_inter_predictor_complex(
+                xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+                &xd->plane[i], mi_row, mi_col, mi_row_top, mi_col_top,
+                bsize, top_bsize, PARTITION_HORZ, i);
+          }
+        }
+      }
+      break;
+    case PARTITION_VERT:
+      if (bsize == BLOCK_8X8) {
+        // First half
+        predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride,
+                         subsize, top_bsize, BLOCK_8X8, output_enabled, 1, 0);
+        if (bsize < top_bsize)
+          extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride);
+
+        // Second half
+        predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                         subsize, top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+        if (bsize < top_bsize)
+          extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf1, dst_stride1);
+
+        // Smooth
+        xd->plane[0].dst.buf = dst_buf[0];
+        xd->plane[0].dst.stride = dst_stride[0];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[0], dst_stride[0],
+                                                  dst_buf1[0], dst_stride1[0],
+                                                  &xd->plane[0],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_VERT, 0);
+      } else {
+        // First half
+        predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride,
+                         subsize, top_bsize, subsize, output_enabled, 0, 0);
+        if (bsize < top_bsize)
+          extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride);
+        else
+          extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride, 3);
+
+
+        if (mi_col + hbs < cm->mi_cols) {
+          predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs,
+                           mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, subsize, top_bsize, subsize,
+                           output_enabled, 0, 0);
+          if (bsize < top_bsize)
+            extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row,
+                       mi_col + hbs, mi_row_top, mi_col_top, output_enabled,
+                       dst_buf1, dst_stride1);
+          else
+            extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row,
+                       mi_col + hbs, mi_row_top, mi_col_top, output_enabled,
+                       dst_buf1, dst_stride1, 2);
+
+          for (i = 0; i < MAX_MB_PLANE; i++) {
+            xd->plane[i].dst.buf = dst_buf[i];
+            xd->plane[i].dst.stride = dst_stride[i];
+            vp10_build_masked_inter_predictor_complex(
+                xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+                &xd->plane[i], mi_row, mi_col, mi_row_top, mi_col_top,
+                bsize, top_bsize, PARTITION_VERT, i);
+          }
+        }
+      }
+      break;
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_8X8) {
+        predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride,
+                         subsize, top_bsize, BLOCK_8X8, output_enabled, 1, 0);
+        predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                         subsize, top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+        predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+                         subsize, top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+        predict_b_extend(cpi, td, tile, 3, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf3, dst_stride3,
+                         subsize, top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+
+        if (bsize < top_bsize) {
+          extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride);
+          extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf1, dst_stride1);
+          extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf2, dst_stride2);
+          extend_all(cpi, td, tile, 3, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf3, dst_stride3);
+        }
+      } else {
+        predict_sb_complex(cpi, td, tile, mi_row, mi_col,
+                           mi_row_top, mi_col_top, output_enabled, subsize,
+                           top_bsize, dst_buf, dst_stride,
+                           pc_tree->split[0]);
+        if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+          predict_sb_complex(cpi, td, tile, mi_row, mi_col + hbs,
+                             mi_row_top, mi_col_top, output_enabled, subsize,
+                             top_bsize, dst_buf1, dst_stride1,
+                             pc_tree->split[1]);
+        if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
+          predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col,
+                             mi_row_top, mi_col_top, output_enabled, subsize,
+                             top_bsize, dst_buf2, dst_stride2,
+                             pc_tree->split[2]);
+        if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+          predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col + hbs,
+                             mi_row_top, mi_col_top, output_enabled, subsize,
+                             top_bsize, dst_buf3, dst_stride3,
+                             pc_tree->split[3]);
+      }
+        for (i = 0; i < MAX_MB_PLANE; i++) {
+          if (bsize == BLOCK_8X8 && i != 0)
+            continue;  // Skip <4x4 chroma smoothing
+          if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
+            vp10_build_masked_inter_predictor_complex(xd,
+                                                      dst_buf[i],
+                                                      dst_stride[i],
+                                                      dst_buf1[i],
+                                                      dst_stride1[i],
+                                                      &xd->plane[i],
+                                                      mi_row, mi_col,
+                                                      mi_row_top, mi_col_top,
+                                                      bsize, top_bsize,
+                                                      PARTITION_VERT, i);
+            if (mi_row + hbs < cm->mi_rows) {
+              vp10_build_masked_inter_predictor_complex(xd,
+                                                        dst_buf2[i],
+                                                        dst_stride2[i],
+                                                        dst_buf3[i],
+                                                        dst_stride3[i],
+                                                        &xd->plane[i],
+                                                        mi_row, mi_col,
+                                                        mi_row_top, mi_col_top,
+                                                        bsize, top_bsize,
+                                                        PARTITION_VERT, i);
+              vp10_build_masked_inter_predictor_complex(xd,
+                                                        dst_buf[i],
+                                                        dst_stride[i],
+                                                       dst_buf2[i],
+                                                       dst_stride2[i],
+                                                       &xd->plane[i],
+                                                       mi_row, mi_col,
+                                                       mi_row_top, mi_col_top,
+                                                       bsize, top_bsize,
+                                                       PARTITION_HORZ, i);
+            }
+          } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) {
+            vp10_build_masked_inter_predictor_complex(xd,
+                                                      dst_buf[i],
+                                                      dst_stride[i],
+                                                      dst_buf2[i],
+                                                      dst_stride2[i],
+                                                      &xd->plane[i],
+                                                      mi_row, mi_col,
+                                                      mi_row_top, mi_col_top,
+                                                      bsize, top_bsize,
+                                                      PARTITION_HORZ, i);
+          }
+        }
+        break;
+    default:
+        assert(0);
+  }
+
+
+  if (bsize < top_bsize && (partition != PARTITION_SPLIT || bsize == BLOCK_8X8))
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+}
+
+// Rate/distortion of coding the bsize block at (mi_row, mi_col) with a single
+// max-size transform per plane; *tmp_rate holds the base rate on entry.
+static void rd_supertx_sb(VP10_COMP *cpi, ThreadData *td,
+                          const TileInfo *const tile,
+                          int mi_row, int mi_col, BLOCK_SIZE bsize,
+                          int *tmp_rate, int64_t *tmp_dist,
+#if CONFIG_EXT_TX
+                          TX_TYPE *best_tx,
+#endif  // CONFIG_EXT_TX
+                          PC_TREE *pc_tree) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int plane, pnskip, skippable, skippable_uv, rate_uv, this_rate,
+      base_rate = *tmp_rate;
+  int64_t sse, pnsse, sse_uv, this_dist, dist_uv;
+  uint8_t *dst_buf[3];
+  int dst_stride[3];
+  TX_SIZE tx_size;
+#if CONFIG_EXT_TX
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  TX_TYPE tx_type, best_tx_nostx = xd->mi[0]->mbmi.tx_type;
+  int ext_tx_set;
+  int tmp_rate_tx = 0, skip_tx = 0;
+  int64_t tmp_dist_tx = 0, rd_tx, bestrd_tx = INT64_MAX;
+  uint8_t tmp_zcoeff_blk = 0;
+#endif  // CONFIG_EXT_TX
+
+  update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, 0, pc_tree);
+  vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
+  for (plane = 0; plane < MAX_MB_PLANE; plane++) {
+    dst_buf[plane] = xd->plane[plane].dst.buf;
+    dst_stride[plane] = xd->plane[plane].dst.stride;
+  }
+  predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col,
+                     0, bsize, bsize, dst_buf, dst_stride, pc_tree);
+
+  set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
+#if CONFIG_EXT_TX
+  *best_tx = DCT_DCT;
+#endif  // CONFIG_EXT_TX
+
+  // chroma: sum rate, distortion and SSE of both chroma planes
+  skippable_uv = 1;
+  rate_uv = 0;
+  dist_uv = 0;
+  sse_uv = 0;
+  for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+    tx_size = get_uv_tx_size_impl(max_txsize_lookup[bsize], bsize,
+                                  cm->subsampling_x, cm->subsampling_y);
+    vp10_subtract_plane(x, bsize, plane);
+    vp10_txfm_rd_in_plane_supertx(x,
+#if CONFIG_VAR_TX
+                                  cpi,
+#endif  // CONFIG_VAR_TX
+                                  &this_rate, &this_dist, &pnskip, &pnsse,
+                                  INT64_MAX, plane, bsize, tx_size, 0);
+    rate_uv += this_rate;
+    dist_uv += this_dist;
+    sse_uv += pnsse;
+    skippable_uv &= pnskip;
+  }
+
+  // luma: with EXT_TX, try each permitted tx_type and keep the best RD cost
+  tx_size = max_txsize_lookup[bsize];
+  vp10_subtract_plane(x, bsize, 0);
+#if CONFIG_EXT_TX
+  ext_tx_set = get_ext_tx_set(tx_size, bsize, 1);
+  for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+    if (!ext_tx_used_inter[ext_tx_set][tx_type])
+      continue;
+    mbmi->tx_type = tx_type;
+    if (ext_tx_set == 1 &&
+        mbmi->tx_type >= DST_ADST && mbmi->tx_type < IDTX &&
+        *best_tx == DCT_DCT) {
+      tx_type = IDTX - 1;  // loop increment resumes the search at IDTX
+      continue;
+    }
+    vp10_txfm_rd_in_plane_supertx(x,
+#if CONFIG_VAR_TX
+                                  cpi,
+#endif  // CONFIG_VAR_TX
+                                  &this_rate, &this_dist, &pnskip,
+                                  &pnsse, INT64_MAX, 0, bsize, tx_size, 0);
+    if (get_ext_tx_types(tx_size, bsize, 1) > 1 &&
+        !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
+        this_rate != INT_MAX) {
+      if (ext_tx_set > 0)
+        this_rate += cpi->inter_tx_type_costs[ext_tx_set]
+            [mbmi->tx_size][mbmi->tx_type];
+    }
+    *tmp_rate = rate_uv + this_rate;
+    *tmp_dist = dist_uv + this_dist;
+    sse = sse_uv + pnsse;
+    skippable = skippable_uv && pnskip;
+    if (skippable) {
+      *tmp_rate = vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+      x->skip = 1;
+    } else {
+      if (RDCOST(x->rdmult, x->rddiv, *tmp_rate, *tmp_dist)
+          < RDCOST(x->rdmult, x->rddiv, 0, sse)) {
+        *tmp_rate += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0);
+        x->skip = 0;
+      } else {
+        *tmp_dist = sse;  // dropping the residual is cheaper in RD terms
+        *tmp_rate = vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+        x->skip = 1;
+      }
+    }
+    *tmp_rate += base_rate;
+    rd_tx = RDCOST(x->rdmult, x->rddiv, *tmp_rate, *tmp_dist);
+    if (rd_tx < bestrd_tx * 0.99 || tx_type == DCT_DCT) {
+      *best_tx = tx_type;  // DCT_DCT is the baseline; others need a >1% gain
+      bestrd_tx = rd_tx;
+      tmp_rate_tx = *tmp_rate;
+      tmp_dist_tx = *tmp_dist;
+      skip_tx = x->skip;
+      tmp_zcoeff_blk = x->zcoeff_blk[tx_size][0];
+    }
+  }
+  x->zcoeff_blk[tx_size][0] = tmp_zcoeff_blk;
+  *tmp_rate = tmp_rate_tx;
+  *tmp_dist = tmp_dist_tx;
+  x->skip = skip_tx;
+  xd->mi[0]->mbmi.tx_type = best_tx_nostx;  // undo the search's writes
+
+#else   // CONFIG_EXT_TX
+
+  vp10_txfm_rd_in_plane_supertx(x,
+#if CONFIG_VAR_TX
+                                cpi,
+#endif  // CONFIG_VAR_TX
+                                &this_rate, &this_dist, &pnskip, &pnsse,
+                                INT64_MAX, 0, bsize, tx_size, 0);
+  *tmp_rate = rate_uv + this_rate;
+  *tmp_dist = dist_uv + this_dist;
+  sse = sse_uv + pnsse;
+  skippable = skippable_uv && pnskip;
+  if (skippable) {
+    *tmp_rate = vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+    x->skip = 1;
+  } else {
+    if (RDCOST(x->rdmult, x->rddiv, *tmp_rate, *tmp_dist)
+        < RDCOST(x->rdmult, x->rddiv, 0, sse)) {
+      *tmp_rate += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0);
+      x->skip = 0;
+    } else {
+      *tmp_dist = sse;  // dropping the residual is cheaper in RD terms
+      *tmp_rate = vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+      x->skip = 1;
+    }
+  }
+  *tmp_rate += base_rate;
+#endif  // CONFIG_EXT_TX
+}
+#endif  // CONFIG_SUPERTX

diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c
index 92ba4dd..68cf932 100644
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c

@@ -23,6 +23,8 @@
 #include "vp10/common/scan.h"
 
 #include "vp10/encoder/encodemb.h"
+#include "vp10/encoder/hybrid_fwd_txfm.h"
+#include "vp10/encoder/quantize.h"
 #include "vp10/encoder/rd.h"
 #include "vp10/encoder/tokenize.h"
 
@@ -104,8 +106,9 @@
   const int mul = 1 + (tx_size == TX_32X32);
   const int16_t *dequant_ptr = pd->dequant;
   const uint8_t *const band_translate = get_band_translate(tx_size);
-  TX_TYPE tx_type = get_tx_type(type, xd, block);
-  const scan_order *const so = get_scan(tx_size, tx_type);
+  TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size);
+  const scan_order *const so =
+      get_scan(tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
   const int16_t *const scan = so->scan;
   const int16_t *const nb = so->neighbors;
   int next = eob, sz = 0;
@@ -301,451 +304,107 @@
   final_eob++;
 
   mb->plane[plane].eobs[block] = final_eob;
+  assert(final_eob <= default_eob);
   return final_eob;
 }
 
-static INLINE void fdct32x32(int rd_transform,
-                             const int16_t *src, tran_low_t *dst,
-                             int src_stride) {
-  if (rd_transform)
-    vpx_fdct32x32_rd(src, dst, src_stride);
-  else
-    vpx_fdct32x32(src, dst, src_stride);
-}
-
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE void highbd_fdct32x32(int rd_transform, const int16_t *src,
-                                    tran_low_t *dst, int src_stride) {
-  if (rd_transform)
-    vpx_highbd_fdct32x32_rd(src, dst, src_stride);
-  else
-    vpx_highbd_fdct32x32(src, dst, src_stride);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+typedef enum QUANT_FUNC {  // second index into quant_func_list
+  QUANT_FUNC_LOWBD = 0,      // used for every tx_size except TX_32X32
+  QUANT_FUNC_LOWBD_32 = 1,   // used when tx_size == TX_32X32
+  QUANT_FUNC_HIGHBD = 2,     // YV12_FLAG_HIGHBITDEPTH buffers, not TX_32X32
+  QUANT_FUNC_HIGHBD_32 = 3,  // YV12_FLAG_HIGHBITDEPTH buffers, TX_32X32
+  QUANT_FUNC_LAST = 4        // table width, not a valid index
+} QUANT_FUNC;
 
-void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
-                       int diff_stride, TX_TYPE tx_type, int lossless) {
-  if (lossless) {
-    vp10_fwht4x4(src_diff, coeff, diff_stride);
-  } else {
-    switch (tx_type) {
-      case DCT_DCT:
-        vpx_fdct4x4(src_diff, coeff, diff_stride);
-        break;
-      case ADST_DCT:
-      case DCT_ADST:
-      case ADST_ADST:
-        vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);
-        break;
-      default:
-        assert(0);
-        break;
-    }
-  }
-}
+static VP10_QUANT_FACADE  // dispatch table: [VP10_XFORM_QUANT][QUANT_FUNC]
+    quant_func_list[VP10_XFORM_QUANT_LAST][QUANT_FUNC_LAST] = {
+        {vp10_quantize_fp_facade, vp10_quantize_fp_32x32_facade,  // fp row
+         vp10_highbd_quantize_fp_facade, vp10_highbd_quantize_fp_32x32_facade},
+        {vp10_quantize_b_facade, vp10_quantize_b_32x32_facade,  // b row
+         vp10_highbd_quantize_b_facade, vp10_highbd_quantize_b_32x32_facade},
+        {vp10_quantize_dc_facade, vp10_quantize_dc_32x32_facade,  // dc row
+         vp10_highbd_quantize_dc_facade, vp10_highbd_quantize_dc_32x32_facade},
+        {NULL, NULL, NULL, NULL}};  // presumably the SKIP_QUANT row; not called
 
-static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
-                         int diff_stride, TX_TYPE tx_type) {
-  switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-}
+#else  // !CONFIG_VP9_HIGHBITDEPTH
+typedef enum QUANT_FUNC {  // second index into quant_func_list
+  QUANT_FUNC_LOWBD = 0,     // used for every tx_size except TX_32X32
+  QUANT_FUNC_LOWBD_32 = 1,  // used when tx_size == TX_32X32
+  QUANT_FUNC_LAST = 2       // table width, not a valid index
+} QUANT_FUNC;
 
-static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
-                           int diff_stride, TX_TYPE tx_type) {
-  switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-}
+static VP10_QUANT_FACADE  // dispatch table: [VP10_XFORM_QUANT][QUANT_FUNC]
+    quant_func_list[VP10_XFORM_QUANT_LAST][QUANT_FUNC_LAST] = {
+        {vp10_quantize_fp_facade, vp10_quantize_fp_32x32_facade},  // fp row
+        {vp10_quantize_b_facade, vp10_quantize_b_32x32_facade},    // b row
+        {vp10_quantize_dc_facade, vp10_quantize_dc_32x32_facade},  // dc row
+        {NULL, NULL}};  // presumably the SKIP_QUANT row; never called
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-static void fwd_txfm_32x32(int rd_transform, const int16_t *src_diff,
-                           tran_low_t *coeff, int diff_stride,
-                           TX_TYPE tx_type) {
-  switch (tx_type) {
-    case DCT_DCT:
-      fdct32x32(rd_transform, src_diff, coeff, diff_stride);
-      break;
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      assert(0);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-}
+static FWD_TXFM_OPT fwd_txfm_opt_list[VP10_XFORM_QUANT_LAST] = {  // per mode
+    FWD_TXFM_OPT_NORMAL, FWD_TXFM_OPT_NORMAL, FWD_TXFM_OPT_DC,  // third: DC opt
+    FWD_TXFM_OPT_NORMAL};  // indexed by xform_quant_idx in vp10_xform_quant
 
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
-                              int diff_stride, TX_TYPE tx_type, int lossless) {
-  if (lossless) {
-    assert(tx_type == DCT_DCT);
-    vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
-  } else {
-    switch (tx_type) {
-      case DCT_DCT:
-        vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
-        break;
-      case ADST_DCT:
-      case DCT_ADST:
-      case ADST_ADST:
-        vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
-        break;
-      default:
-        assert(0);
-        break;
-    }
-  }
-}
-
-static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
-                         int diff_stride, TX_TYPE tx_type) {
-  switch (tx_type) {
-    case DCT_DCT:
-      vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
-      break;
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      vp10_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-}
-
-static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
-                           int diff_stride, TX_TYPE tx_type) {
-  switch (tx_type) {
-    case DCT_DCT:
-      vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
-      break;
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      vp10_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-}
-
-static void highbd_fwd_txfm_32x32(int rd_transform, const int16_t *src_diff,
-                                  tran_low_t *coeff, int diff_stride,
-                                  TX_TYPE tx_type) {
-  switch (tx_type) {
-    case DCT_DCT:
-      highbd_fdct32x32(rd_transform, src_diff, coeff, diff_stride);
-      break;
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      assert(0);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-void vp10_xform_quant_fp(MACROBLOCK *x, int plane, int block,
-                         int blk_row, int blk_col,
-                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+void vp10_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
+                      int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                      VP10_XFORM_QUANT xform_quant_idx) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
-  TX_TYPE tx_type = get_tx_type(plane_type, xd, block);
-  const scan_order *const scan_order = get_scan(tx_size, tx_type);
+  TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+  const scan_order *const scan_order =
+      get_scan(tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int16_t *src_diff;
+  const int tx1d_size = get_tx1d_size(tx_size);
+  const int tx2d_size = tx1d_size * tx1d_size;  // count passed to quantizers
+
+  FWD_TXFM_PARAM fwd_txfm_param;
+  fwd_txfm_param.tx_type = tx_type;  // computed above; no second lookup needed
+  fwd_txfm_param.tx_size = tx_size;
+  fwd_txfm_param.fwd_txfm_opt = fwd_txfm_opt_list[xform_quant_idx];
+  fwd_txfm_param.rd_transform = x->use_lp32x32fdct;
+  fwd_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+
   src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    switch (tx_size) {
-      case TX_32X32:
-        highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-        vp10_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
-                                     p->round_fp, p->quant_fp, p->quant_shift,
-                                     qcoeff, dqcoeff, pd->dequant,
-                                     eob, scan_order->scan,
-                                     scan_order->iscan);
-        break;
-      case TX_16X16:
-        vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
-        vp10_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
-                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                               pd->dequant, eob,
-                               scan_order->scan, scan_order->iscan);
-        break;
-      case TX_8X8:
-        vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
-        vp10_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
-                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                               pd->dequant, eob,
-                               scan_order->scan, scan_order->iscan);
-        break;
-      case TX_4X4:
-        if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-          vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
-        } else {
-          vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
-        }
-        vp10_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
-                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                               pd->dequant, eob,
-                               scan_order->scan, scan_order->iscan);
-        break;
-      default:
-        assert(0);
-    }
-    return;
-  }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-  switch (tx_size) {
-    case TX_32X32:
-      fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-      vp10_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
-                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                            pd->dequant, eob, scan_order->scan,
-                            scan_order->iscan);
-      break;
-    case TX_16X16:
-      vpx_fdct16x16(src_diff, coeff, diff_stride);
-      vp10_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
-                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                      pd->dequant, eob,
-                      scan_order->scan, scan_order->iscan);
-      break;
-    case TX_8X8:
-      vp10_fdct8x8_quant(src_diff, diff_stride, coeff, 64,
-                        x->skip_block, p->zbin, p->round_fp,
-                        p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                        pd->dequant, eob,
-                        scan_order->scan, scan_order->iscan);
-      break;
-    case TX_4X4:
-      if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-        vp10_fwht4x4(src_diff, coeff, diff_stride);
+    highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+    if (xform_quant_idx != VP10_XFORM_QUANT_SKIP_QUANT) {  // else coeff only
+      if (x->skip_block) {  // no quantizer from the table is run
+        vp10_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
       } else {
-        vpx_fdct4x4(src_diff, coeff, diff_stride);
+        if (tx_size == TX_32X32)  // 32x32 has its own table column
+          quant_func_list[xform_quant_idx][QUANT_FUNC_HIGHBD_32](
+              coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order);
+        else
+          quant_func_list[xform_quant_idx][QUANT_FUNC_HIGHBD](
+              coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order);
       }
-      vp10_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
-                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                      pd->dequant, eob,
-                      scan_order->scan, scan_order->iscan);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-}
-
-void vp10_xform_quant_dc(MACROBLOCK *x, int plane, int block,
-                         int blk_row, int blk_col,
-                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const struct macroblock_plane *const p = &x->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  uint16_t *const eob = &p->eobs[block];
-  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
-  const int16_t *src_diff;
-  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
-
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    switch (tx_size) {
-      case TX_32X32:
-        vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round,
-                                     p->quant_fp[0], qcoeff, dqcoeff,
-                                     pd->dequant[0], eob);
-        break;
-      case TX_16X16:
-        vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc(coeff, 256, x->skip_block, p->round,
-                               p->quant_fp[0], qcoeff, dqcoeff,
-                               pd->dequant[0], eob);
-        break;
-      case TX_8X8:
-        vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc(coeff, 64, x->skip_block, p->round,
-                               p->quant_fp[0], qcoeff, dqcoeff,
-                               pd->dequant[0], eob);
-        break;
-      case TX_4X4:
-        if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-          vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
-        } else {
-          vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
-        }
-        vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
-                               p->quant_fp[0], qcoeff, dqcoeff,
-                               pd->dequant[0], eob);
-        break;
-      default:
-        assert(0);
     }
     return;
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-  switch (tx_size) {
-    case TX_32X32:
-      vpx_fdct32x32_1(src_diff, coeff, diff_stride);
-      vpx_quantize_dc_32x32(coeff, x->skip_block, p->round,
-                            p->quant_fp[0], qcoeff, dqcoeff,
-                            pd->dequant[0], eob);
-      break;
-    case TX_16X16:
-      vpx_fdct16x16_1(src_diff, coeff, diff_stride);
-      vpx_quantize_dc(coeff, 256, x->skip_block, p->round,
-                     p->quant_fp[0], qcoeff, dqcoeff,
-                     pd->dequant[0], eob);
-      break;
-    case TX_8X8:
-      vpx_fdct8x8_1(src_diff, coeff, diff_stride);
-      vpx_quantize_dc(coeff, 64, x->skip_block, p->round,
-                      p->quant_fp[0], qcoeff, dqcoeff,
-                      pd->dequant[0], eob);
-      break;
-    case TX_4X4:
-      if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-        vp10_fwht4x4(src_diff, coeff, diff_stride);
-      } else {
-        vpx_fdct4x4(src_diff, coeff, diff_stride);
-      }
-      vpx_quantize_dc(coeff, 16, x->skip_block, p->round,
-                      p->quant_fp[0], qcoeff, dqcoeff,
-                      pd->dequant[0], eob);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-}
-
-
-
-void vp10_xform_quant(MACROBLOCK *x, int plane, int block,
-                      int blk_row, int blk_col,
-                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const struct macroblock_plane *const p = &x->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
-  TX_TYPE tx_type = get_tx_type(plane_type, xd, block);
-  const scan_order *const scan_order = get_scan(tx_size, tx_type);
-  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  uint16_t *const eob = &p->eobs[block];
-  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
-  const int16_t *src_diff;
-  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
-
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-     switch (tx_size) {
-      case TX_32X32:
-        highbd_fwd_txfm_32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride,
-                         tx_type);
-        vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
-                                    p->round, p->quant, p->quant_shift, qcoeff,
-                                    dqcoeff, pd->dequant, eob,
-                                    scan_order->scan, scan_order->iscan);
-        break;
-      case TX_16X16:
-        highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type);
-        vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
-                              p->quant, p->quant_shift, qcoeff, dqcoeff,
-                              pd->dequant, eob,
-                              scan_order->scan, scan_order->iscan);
-        break;
-      case TX_8X8:
-        highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type);
-        vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
-                              p->quant, p->quant_shift, qcoeff, dqcoeff,
-                              pd->dequant, eob,
-                              scan_order->scan, scan_order->iscan);
-        break;
-      case TX_4X4:
-        vp10_highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type,
-                                 xd->lossless[xd->mi[0]->mbmi.segment_id]);
-        vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
-                              p->quant, p->quant_shift, qcoeff, dqcoeff,
-                              pd->dequant, eob,
-                              scan_order->scan, scan_order->iscan);
-        break;
-      default:
-        assert(0);
+  fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+  if (xform_quant_idx != VP10_XFORM_QUANT_SKIP_QUANT) {  // else coeff only
+    if (x->skip_block) {  // no quantizer from the table is run
+      vp10_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
+    } else {
+      if (tx_size == TX_32X32)  // 32x32 has its own table column
+        quant_func_list[xform_quant_idx][QUANT_FUNC_LOWBD_32](
+            coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order);
+      else
+        quant_func_list[xform_quant_idx][QUANT_FUNC_LOWBD](
+            coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order);
     }
-    return;
-  }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-  switch (tx_size) {
-    case TX_32X32:
-      fwd_txfm_32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride, tx_type);
-      vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
-                           p->quant, p->quant_shift, qcoeff, dqcoeff,
-                           pd->dequant, eob, scan_order->scan,
-                           scan_order->iscan);
-      break;
-    case TX_16X16:
-      fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type);
-      vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
-                     p->quant, p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, eob,
-                     scan_order->scan, scan_order->iscan);
-      break;
-    case TX_8X8:
-      fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type);
-      vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
-                     p->quant, p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, eob,
-                     scan_order->scan, scan_order->iscan);
-      break;
-    case TX_4X4:
-      vp10_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type,
-                        xd->lossless[xd->mi[0]->mbmi.segment_id]);
-      vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
-                     p->quant, p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, eob,
-                     scan_order->scan, scan_order->iscan);
-      break;
-    default:
-      assert(0);
-      break;
   }
 }
 
@@ -761,20 +420,31 @@
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint8_t *dst;
   ENTROPY_CONTEXT *a, *l;
-  TX_TYPE tx_type = get_tx_type(pd->plane_type, xd, block);
+  INV_TXFM_PARAM inv_txfm_param;
+#if CONFIG_VAR_TX
+  int i;
+  const int bwl = b_width_log2_lookup[plane_bsize];
+#endif
   dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
   a = &ctx->ta[plane][blk_col];
   l = &ctx->tl[plane][blk_row];
 
   // TODO(jingning): per transformed block zero forcing only enabled for
   // luma component. will integrate chroma components as well.
-  if (x->zcoeff_blk[tx_size][block] && plane == 0) {
-    p->eobs[block] = 0;
-    *a = *l = 0;
-    return;
-  }
+  // Turn this back on when the rate-distortion loop is synchronized with
+  // the recursive transform block coding.
+//  if (x->zcoeff_blk[tx_size][block] && plane == 0) {
+//    p->eobs[block] = 0;
+//    *a = *l = 0;
+//    return;
+//  }
 
+#if CONFIG_VAR_TX
+  if (!x->skip_recode &&
+      x->blk_skip[plane][(blk_row << bwl) + blk_col] == 0) {
+#else
   if (!x->skip_recode) {
+#endif
     if (x->quant_fp) {
       // Encoding process for rtc mode
       if (x->skip_txfm[0] == SKIP_TXFM_AC_DC && plane == 0) {
@@ -783,102 +453,151 @@
         *a = *l = 0;
         return;
       } else {
-        vp10_xform_quant_fp(x, plane, block, blk_row, blk_col,
-                            plane_bsize, tx_size);
+        vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
+                         tx_size, VP10_XFORM_QUANT_FP);
       }
     } else {
       if (max_txsize_lookup[plane_bsize] == tx_size) {
         int txfm_blk_index = (plane << 2) + (block >> (tx_size << 1));
         if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_NONE) {
           // full forward transform and quantization
-          vp10_xform_quant(x, plane, block, blk_row, blk_col,
-                           plane_bsize, tx_size);
+          vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
+                           tx_size, VP10_XFORM_QUANT_B);
         } else if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_AC_ONLY) {
           // fast path forward transform and quantization
-          vp10_xform_quant_dc(x, plane, block, blk_row, blk_col,
-                              plane_bsize, tx_size);
+          vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
+                           tx_size, VP10_XFORM_QUANT_DC);
         } else {
           // skip forward transform
           p->eobs[block] = 0;
           *a = *l = 0;
+#if !CONFIG_VAR_TX
           return;
+#endif
         }
       } else {
-        vp10_xform_quant(x, plane, block, blk_row, blk_col,
-                         plane_bsize, tx_size);
+        vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
+                         tx_size, VP10_XFORM_QUANT_B);
       }
     }
   }
+#if CONFIG_VAR_TX
+  else {
+    if (!x->skip_recode)
+      p->eobs[block] = 0;
+  }
+#endif
 
   if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
-    const int ctx = combine_entropy_contexts(*a, *l);
+    int ctx;
+#if CONFIG_VAR_TX
+    switch (tx_size) {
+      case TX_4X4:
+        break;
+      case TX_8X8:
+        a[0] = !!*(const uint16_t *)&a[0];
+        l[0] = !!*(const uint16_t *)&l[0];
+        break;
+      case TX_16X16:
+        a[0] = !!*(const uint32_t *)&a[0];
+        l[0] = !!*(const uint32_t *)&l[0];
+        break;
+      case TX_32X32:
+        a[0] = !!*(const uint64_t *)&a[0];
+        l[0] = !!*(const uint64_t *)&l[0];
+        break;
+      default:
+        assert(0 && "Invalid transform size.");
+        break;
+    }
+#endif
+    ctx = combine_entropy_contexts(*a, *l);
     *a = *l = optimize_b(x, plane, block, tx_size, ctx) > 0;
   } else {
     *a = *l = p->eobs[block] > 0;
   }
 
+#if CONFIG_VAR_TX
+  for (i = 0; i < (1 << tx_size); ++i) {
+    a[i] = a[0];
+    l[i] = l[0];
+  }
+#endif
+
   if (p->eobs[block])
     *(args->skip) = 0;
 
   if (p->eobs[block] == 0)
     return;
+
+  // inverse transform parameters
+  inv_txfm_param.tx_type = get_tx_type(pd->plane_type, xd, block, tx_size);
+  inv_txfm_param.tx_size = tx_size;
+  inv_txfm_param.eob = p->eobs[block];
+  inv_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    switch (tx_size) {
-      case TX_32X32:
-        vp10_highbd_inv_txfm_add_32x32(dqcoeff, dst, pd->dst.stride,
-                                       p->eobs[block], xd->bd, tx_type);
-        break;
-      case TX_16X16:
-        vp10_highbd_inv_txfm_add_16x16(dqcoeff, dst, pd->dst.stride,
-                                       p->eobs[block], xd->bd, tx_type);
-        break;
-      case TX_8X8:
-        vp10_highbd_inv_txfm_add_8x8(dqcoeff, dst, pd->dst.stride,
-                                     p->eobs[block], xd->bd, tx_type);
-        break;
-      case TX_4X4:
-        // this is like vp10_short_idct4x4 but has a special case around eob<=1
-        // which is significant (not just an optimization) for the lossless
-        // case.
-        vp10_highbd_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride,
-                                     p->eobs[block], xd->bd, tx_type,
-                                     xd->lossless[xd->mi[0]->mbmi.segment_id]);
-        break;
-      default:
-        assert(0 && "Invalid transform size");
-        break;
-    }
-
+    inv_txfm_param.bd = xd->bd;
+    highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &inv_txfm_param);
     return;
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+  inv_txfm_add(dqcoeff, dst, pd->dst.stride, &inv_txfm_param);
+}
 
-  switch (tx_size) {
-    case TX_32X32:
-      vp10_inv_txfm_add_32x32(dqcoeff, dst, pd->dst.stride, p->eobs[block],
-                              tx_type);
-      break;
-    case TX_16X16:
-      vp10_inv_txfm_add_16x16(dqcoeff, dst, pd->dst.stride, p->eobs[block],
-                              tx_type);
-      break;
-    case TX_8X8:
-      vp10_inv_txfm_add_8x8(dqcoeff, dst, pd->dst.stride, p->eobs[block],
-                            tx_type);
-      break;
-    case TX_4X4:
-      // this is like vp10_short_idct4x4 but has a special case around eob<=1
-      // which is significant (not just an optimization) for the lossless
-      // case.
-      vp10_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride, p->eobs[block],
-                            tx_type, xd->lossless[xd->mi[0]->mbmi.segment_id]);
-      break;
-    default:
-      assert(0 && "Invalid transform size");
-      break;
+#if CONFIG_VAR_TX
+static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
+                               BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                               void *arg) {  // recurses down to inter_tx_size
+  struct encode_b_args *const args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  int blk_idx = (blk_row >> (1 - pd->subsampling_y)) * 8 +
+                (blk_col >> (1 - pd->subsampling_x));
+  TX_SIZE plane_tx_size = plane ?
+      get_uv_tx_size_impl(mbmi->inter_tx_size[blk_idx], bsize,
+                          0, 0) :
+      mbmi->inter_tx_size[blk_idx];
+  // Extent in 4x4 units; shrunk below when mb_to_*_edge is negative.
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+  // Nothing to encode for a transform block that starts beyond that extent.
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+  // Encode when tx_size matches inter_tx_size; otherwise recurse per quadrant.
+  if (tx_size == plane_tx_size) {
+    encode_block(plane, block, blk_row, blk_col, plane_bsize,
+                 tx_size, arg);
+  } else {
+    int bsl = b_width_log2_lookup[bsize];
+    int i;
+
+    assert(bsl > 0);
+    --bsl;  // quadrants sit 1 << bsl rows/columns apart
+
+    for (i = 0; i < 4; ++i) {
+      const int offsetr = blk_row + ((i >> 1) << bsl);
+      const int offsetc = blk_col + ((i & 0x01) << bsl);
+      int step = 1 << (2 * (tx_size - 1));  // block-index stride per quadrant
+
+      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide)
+        continue;
+
+      encode_block_inter(plane, block + i * step, offsetr, offsetc,
+                         plane_bsize, tx_size - 1, arg);
+    }
   }
 }
+#endif
 
 static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
                                BLOCK_SIZE plane_bsize,
@@ -891,12 +610,13 @@
   uint8_t *dst;
   dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
 
-  vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
+  vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
+                   tx_size, VP10_XFORM_QUANT_B);
 
   if (p->eobs[block] > 0) {
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      if (xd->lossless[0]) {
+      if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
         vp10_highbd_iwht4x4_add(dqcoeff, dst, pd->dst.stride,
                                 p->eobs[block], xd->bd);
       } else {
@@ -906,7 +626,7 @@
       return;
     }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    if (xd->lossless[0]) {
+    if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
       vp10_iwht4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
     } else {
       vp10_idct4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
@@ -917,7 +637,7 @@
 void vp10_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
   vp10_subtract_plane(x, bsize, 0);
   vp10_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
-                                         encode_block_pass1, x);
+                                          encode_block_pass1, x);
 }
 
 void vp10_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
@@ -933,20 +653,72 @@
     return;
 
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_VAR_TX
+    // TODO(jingning): Clean this up.
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+    const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+    const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+    const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+    const int bh = num_4x4_blocks_wide_lookup[txb_size];
+    int idx, idy;
+    int block = 0;
+    int step = 1 << (max_tx_size * 2);
+#endif
     if (!x->skip_recode)
       vp10_subtract_plane(x, bsize, plane);
 
     if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
+#if CONFIG_VAR_TX
+      vp10_get_entropy_contexts(bsize, TX_4X4, pd,
+                                ctx.ta[plane], ctx.tl[plane]);
+#else
       const struct macroblockd_plane* const pd = &xd->plane[plane];
       const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
       vp10_get_entropy_contexts(bsize, tx_size, pd,
-                               ctx.ta[plane], ctx.tl[plane]);
+                                ctx.ta[plane], ctx.tl[plane]);
+#endif
     }
 
+#if CONFIG_VAR_TX
+    for (idy = 0; idy < mi_height; idy += bh) {
+      for (idx = 0; idx < mi_width; idx += bh) {
+        encode_block_inter(plane, block, idy, idx, plane_bsize,
+                           max_tx_size, &arg);
+        block += step;
+      }
+    }
+#else
+    vp10_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
+                                            &arg);
+#endif
+  }
+}
+
+#if CONFIG_SUPERTX
+void vp10_encode_sb_supertx(MACROBLOCK *x, BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct optimize_ctx ctx;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  struct encode_b_args arg = {x, &ctx, &mbmi->skip};
+  int plane;
+
+  mbmi->skip = 1;
+  if (x->skip)
+    return;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const struct macroblockd_plane* const pd = &xd->plane[plane];
+    const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
+    vp10_subtract_plane(x, bsize, plane);
+    vp10_get_entropy_contexts(bsize, tx_size, pd,
+                              ctx.ta[plane], ctx.tl[plane]);
     vp10_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
                                            &arg);
   }
 }
+#endif  // CONFIG_SUPERTX
 
 void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
                              BLOCK_SIZE plane_bsize,
@@ -957,12 +729,9 @@
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
-  tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
-  TX_TYPE tx_type = get_tx_type(plane_type, xd, block);
-  const scan_order *const scan_order = get_scan(tx_size, tx_type);
+  const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
   PREDICTION_MODE mode;
   const int bwl = b_width_log2_lookup[plane_bsize];
   const int bhl = b_height_log2_lookup[plane_bsize];
@@ -972,156 +741,57 @@
   uint16_t *eob = &p->eobs[block];
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
+
+  const int tx1d_size = get_tx1d_size(tx_size);
+
+  INV_TXFM_PARAM inv_txfm_param;
+
   dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
   src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
   src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
 
   mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
-  vp10_predict_intra_block(xd, bwl, bhl, tx_size, mode, dst, dst_stride,
-                          dst, dst_stride, blk_col, blk_row, plane);
-
+  vp10_predict_intra_block(xd, bwl, bhl, tx_size, mode, dst, dst_stride, dst,
+                           dst_stride, blk_col, blk_row, plane);
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    switch (tx_size) {
-      case TX_32X32:
-        if (!x->skip_recode) {
-          vpx_highbd_subtract_block(32, 32, src_diff, diff_stride,
-                                    src, src_stride, dst, dst_stride, xd->bd);
-          highbd_fwd_txfm_32x32(x->use_lp32x32fdct, src_diff, coeff,
-                                diff_stride, tx_type);
-          vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
-                                      p->round, p->quant, p->quant_shift,
-                                      qcoeff, dqcoeff, pd->dequant, eob,
-                                      scan_order->scan, scan_order->iscan);
-        }
-        if (*eob)
-          vp10_highbd_inv_txfm_add_32x32(dqcoeff, dst, dst_stride, *eob, xd->bd,
-                                         tx_type);
-        break;
-      case TX_16X16:
-        if (!x->skip_recode) {
-          vpx_highbd_subtract_block(16, 16, src_diff, diff_stride,
-                                    src, src_stride, dst, dst_stride, xd->bd);
-          highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type);
-          vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
-                                p->quant, p->quant_shift, qcoeff, dqcoeff,
-                                pd->dequant, eob,
-                                scan_order->scan, scan_order->iscan);
-        }
-        if (*eob)
-          vp10_highbd_inv_txfm_add_16x16(dqcoeff, dst, dst_stride, *eob, xd->bd,
-                                         tx_type);
-        break;
-      case TX_8X8:
-        if (!x->skip_recode) {
-          vpx_highbd_subtract_block(8, 8, src_diff, diff_stride,
-                                    src, src_stride, dst, dst_stride, xd->bd);
-          highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type);
-          vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
-                                p->quant, p->quant_shift, qcoeff, dqcoeff,
-                                pd->dequant, eob,
-                                scan_order->scan, scan_order->iscan);
-        }
-        if (*eob)
-          vp10_highbd_inv_txfm_add_8x8(dqcoeff, dst, dst_stride, *eob, xd->bd,
-                                       tx_type);
-        break;
-      case TX_4X4:
-        if (!x->skip_recode) {
-          vpx_highbd_subtract_block(4, 4, src_diff, diff_stride,
-                                    src, src_stride, dst, dst_stride, xd->bd);
-          vp10_highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type,
-                                   xd->lossless[mbmi->segment_id]);
-          vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
-                                p->quant, p->quant_shift, qcoeff, dqcoeff,
-                                pd->dequant, eob,
-                                scan_order->scan, scan_order->iscan);
-        }
-
-        if (*eob)
-          // this is like vp10_short_idct4x4 but has a special case around
-          // eob<=1 which is significant (not just an optimization) for the
-          // lossless case.
-          vp10_highbd_inv_txfm_add_4x4(dqcoeff, dst, dst_stride, *eob, xd->bd,
-                                       tx_type, xd->lossless[mbmi->segment_id]);
-        break;
-      default:
-        assert(0);
-        return;
-    }
-    if (*eob)
-      *(args->skip) = 0;
-    return;
+    vpx_highbd_subtract_block(tx1d_size, tx1d_size, src_diff, diff_stride, src,
+                              src_stride, dst, dst_stride, xd->bd);
+  } else {
+    vpx_subtract_block(tx1d_size, tx1d_size, src_diff, diff_stride, src,
+                       src_stride, dst, dst_stride);
   }
+#else
+  vpx_subtract_block(tx1d_size, tx1d_size, src_diff, diff_stride, src,
+                     src_stride, dst, dst_stride);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-  switch (tx_size) {
-    case TX_32X32:
-      if (!x->skip_recode) {
-        vpx_subtract_block(32, 32, src_diff, diff_stride,
-                           src, src_stride, dst, dst_stride);
-        fwd_txfm_32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride,
-                       tx_type);
-        vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
-                             p->quant, p->quant_shift, qcoeff, dqcoeff,
-                             pd->dequant, eob, scan_order->scan,
-                             scan_order->iscan);
-      }
-      if (*eob)
-        vp10_inv_txfm_add_32x32(dqcoeff, dst, dst_stride, *eob, tx_type);
-      break;
-    case TX_16X16:
-      if (!x->skip_recode) {
-        vpx_subtract_block(16, 16, src_diff, diff_stride,
-                           src, src_stride, dst, dst_stride);
-        fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type);
-        vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
-                       p->quant, p->quant_shift, qcoeff, dqcoeff,
-                       pd->dequant, eob, scan_order->scan,
-                       scan_order->iscan);
-      }
-      if (*eob)
-        vp10_inv_txfm_add_16x16(dqcoeff, dst, dst_stride, *eob, tx_type);
-      break;
-    case TX_8X8:
-      if (!x->skip_recode) {
-        vpx_subtract_block(8, 8, src_diff, diff_stride,
-                           src, src_stride, dst, dst_stride);
-        fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type);
-        vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
-                       p->quant_shift, qcoeff, dqcoeff,
-                       pd->dequant, eob, scan_order->scan,
-                       scan_order->iscan);
-      }
-      if (*eob)
-        vp10_inv_txfm_add_8x8(dqcoeff, dst, dst_stride, *eob, tx_type);
-      break;
-    case TX_4X4:
-      if (!x->skip_recode) {
-        vpx_subtract_block(4, 4, src_diff, diff_stride,
-                           src, src_stride, dst, dst_stride);
-        vp10_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type,
-                          xd->lossless[mbmi->segment_id]);
-        vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
-                       p->quant_shift, qcoeff, dqcoeff,
-                       pd->dequant, eob, scan_order->scan,
-                       scan_order->iscan);
-      }
+  if (!x->skip_recode)
+    vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                     VP10_XFORM_QUANT_B);
+  else
+    vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                     VP10_XFORM_QUANT_SKIP_QUANT);
 
-      if (*eob) {
-        // this is like vp10_short_idct4x4 but has a special case around eob<=1
-        // which is significant (not just an optimization) for the lossless
-        // case.
-        vp10_inv_txfm_add_4x4(dqcoeff, dst, dst_stride, *eob, tx_type,
-                              xd->lossless[mbmi->segment_id]);
-      }
-      break;
-    default:
-      assert(0);
-      break;
-  }
-  if (*eob)
+  if (*eob) {
+    // inverse transform
+    inv_txfm_param.tx_type = tx_type;
+    inv_txfm_param.tx_size = tx_size;
+    inv_txfm_param.eob = *eob;
+    inv_txfm_param.lossless = xd->lossless[mbmi->segment_id];
+#if CONFIG_VP9_HIGHBITDEPTH
+    inv_txfm_param.bd = xd->bd;
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      highbd_inv_txfm_add(dqcoeff, dst, dst_stride, &inv_txfm_param);
+    } else {
+      inv_txfm_add(dqcoeff, dst, dst_stride, &inv_txfm_param);
+    }
+#else
+    inv_txfm_add(dqcoeff, dst, dst_stride, &inv_txfm_param);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
     *(args->skip) = 0;
+  }
 }
 
 void vp10_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {

diff --git a/vp10/encoder/encodemb.h b/vp10/encoder/encodemb.h
index 2e6516e..e208c88 100644
--- a/vp10/encoder/encodemb.h
+++ b/vp10/encoder/encodemb.h

@@ -23,17 +23,24 @@
   struct optimize_ctx *ctx;
   int8_t *skip;
 };
+
+typedef enum VP10_XFORM_QUANT {  // Mode selector passed to vp10_xform_quant().
+  VP10_XFORM_QUANT_FP = 0,  // presumably the old vp10_xform_quant_fp() path
+  VP10_XFORM_QUANT_B = 1,
+  VP10_XFORM_QUANT_DC = 2,  // presumably the old vp10_xform_quant_dc() path
+  VP10_XFORM_QUANT_SKIP_QUANT = 3,  // chosen when x->skip_recode is set
+  VP10_XFORM_QUANT_LAST = 4  // NOTE(review): looks like a count sentinel
+} VP10_XFORM_QUANT;
+
 void vp10_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
+#if CONFIG_SUPERTX
+void vp10_encode_sb_supertx(MACROBLOCK *x, BLOCK_SIZE bsize);
+#endif  // CONFIG_SUPERTX
 void vp10_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
-void vp10_xform_quant_fp(MACROBLOCK *x, int plane, int block,
-                         int blk_row, int blk_col,
-                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
-void vp10_xform_quant_dc(MACROBLOCK *x, int plane, int block,
-                         int blk_row, int blk_col,
-                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 void vp10_xform_quant(MACROBLOCK *x, int plane, int block,
                       int blk_row, int blk_col,
-                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                      VP10_XFORM_QUANT xform_quant_idx);
 
 void vp10_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
@@ -43,14 +50,6 @@
 
 void vp10_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
-void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
-                       int diff_stride, TX_TYPE tx_type, int lossless);
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
-                              int diff_stride, TX_TYPE tx_type, int lossless);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif

diff --git a/vp10/encoder/encodemv.c b/vp10/encoder/encodemv.c
index 0736c65..623e6f6 100644
--- a/vp10/encoder/encodemv.c
+++ b/vp10/encoder/encodemv.c

@@ -137,19 +137,8 @@
 
 static void update_mv(vpx_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
                       vpx_prob upd_p) {
-#if CONFIG_MISC_FIXES
   (void) upd_p;
   vp10_cond_prob_diff_update(w, cur_p, ct);
-#else
-  const vpx_prob new_p = get_binary_prob(ct[0], ct[1]) | 1;
-  const int update = cost_branch256(ct, *cur_p) + vp10_cost_zero(upd_p) >
-                     cost_branch256(ct, new_p) + vp10_cost_one(upd_p) + 7 * 256;
-  vpx_write(w, update, upd_p);
-  if (update) {
-    *cur_p = new_p;
-    vpx_write_literal(w, new_p >> 1, 7);
-  }
-#endif
 }
 
 static void write_mv_update(const vpx_tree_index *tree,

diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index 9e3bec4..a90d0c9 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c

@@ -391,6 +391,9 @@
 
   vp10_free_pc_tree(&cpi->td);
 
+  vpx_free(cpi->td.mb.palette_buffer);  // NULL-safe; flag may have changed.
+  cpi->td.mb.palette_buffer = NULL;
+
   if (cpi->source_diff_var != NULL) {
     vpx_free(cpi->source_diff_var);
     cpi->source_diff_var = NULL;
@@ -416,10 +419,6 @@
   memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1],
          MV_VALS * sizeof(*cpi->nmvcosts_hp[1]));
 
-#if !CONFIG_MISC_FIXES
-  vp10_copy(cc->segment_pred_probs, cm->segp.pred_probs);
-#endif
-
   memcpy(cpi->coding_context.last_frame_seg_map_copy,
          cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
 
@@ -444,10 +443,6 @@
   memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1],
          MV_VALS * sizeof(*cc->nmvcosts_hp[1]));
 
-#if !CONFIG_MISC_FIXES
-  vp10_copy(cm->segp.pred_probs, cc->segment_pred_probs);
-#endif
-
   memcpy(cm->last_frame_seg_map,
          cpi->coding_context.last_frame_seg_map_copy,
          (cm->mi_rows * cm->mi_cols));
@@ -722,9 +717,17 @@
 }
 
 static void init_buffer_indices(VP10_COMP *cpi) {
+#if CONFIG_EXT_REFS
+  int fb_idx;
+  for (fb_idx = 0; fb_idx < LAST_REF_FRAMES; ++fb_idx)
+    cpi->lst_fb_idxes[fb_idx] = fb_idx;
+  cpi->gld_fb_idx = LAST_REF_FRAMES;
+  cpi->alt_fb_idx = cpi->gld_fb_idx + 1;
+#else
   cpi->lst_fb_idx = 0;
   cpi->gld_fb_idx = 1;
   cpi->alt_fb_idx = 2;
+#endif  // CONFIG_EXT_REFS
 }
 
 static void init_config(struct VP10_COMP *cpi, VP10EncoderConfig *oxcf) {
@@ -749,6 +752,10 @@
   cpi->td.counts = &cm->counts;
 
   // change includes all joint functionality
+#if CONFIG_EXT_REFS
+  cpi->last_ref_to_refresh = LAST_FRAME;
+#endif  // CONFIG_EXT_REFS
+
   vp10_change_config(cpi, oxcf);
 
   cpi->static_mb_pct = 0;
@@ -1406,6 +1413,9 @@
 void vp10_change_config(struct VP10_COMP *cpi, const VP10EncoderConfig *oxcf) {
   VP10_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+#if CONFIG_EXT_REFS
+  int ref_frame;
+#endif  // CONFIG_EXT_REFS
 
   if (cm->profile != oxcf->profile)
     cm->profile = oxcf->profile;
@@ -1430,13 +1440,33 @@
   }
 
   cpi->refresh_golden_frame = 0;
+
+#if CONFIG_EXT_REFS
+  for (ref_frame = LAST_FRAME; ref_frame <= LAST4_FRAME; ++ref_frame) {
+    if (ref_frame == cpi->last_ref_to_refresh)
+      cpi->refresh_last_frames[ref_frame - LAST_FRAME] = 1;
+    else
+      cpi->refresh_last_frames[ref_frame - LAST_FRAME] = 0;
+  }
+#else
   cpi->refresh_last_frame = 1;
+#endif  // CONFIG_EXT_REFS
+
   cm->refresh_frame_context =
       oxcf->error_resilient_mode ? REFRESH_FRAME_CONTEXT_OFF :
           oxcf->frame_parallel_decoding_mode ? REFRESH_FRAME_CONTEXT_FORWARD
                                              : REFRESH_FRAME_CONTEXT_BACKWARD;
   cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
 
+  cm->allow_screen_content_tools = (cpi->oxcf.content == VP9E_CONTENT_SCREEN);
+  if (cm->allow_screen_content_tools) {
+    MACROBLOCK *x = &cpi->td.mb;
+    if (x->palette_buffer == 0) {
+      CHECK_MEM_ERROR(cm, x->palette_buffer,
+                      vpx_memalign(16, sizeof(*x->palette_buffer)));
+    }
+  }
+
   vp10_reset_segment_features(cm);
   vp10_set_high_precision_mv(cpi, 0);
 
@@ -1942,6 +1972,8 @@
 
     // Deallocate allocated thread data.
     if (t < cpi->num_workers - 1) {
+      if (cpi->common.allow_screen_content_tools)
+        vpx_free(thread_data->td->mb.palette_buffer);
       vpx_free(thread_data->td->counts);
       vp10_free_pc_tree(thread_data->td);
       vpx_free(thread_data->td);
@@ -2277,7 +2309,7 @@
 }
 
 int vp10_use_as_reference(VP10_COMP *cpi, int ref_frame_flags) {
-  if (ref_frame_flags > 7)
+  if (ref_frame_flags > ((1 << REFS_PER_FRAME) - 1))
     return -1;
 
   cpi->ref_frame_flags = ref_frame_flags;
@@ -2287,7 +2319,14 @@
 void vp10_update_reference(VP10_COMP *cpi, int ref_frame_flags) {
   cpi->ext_refresh_golden_frame = (ref_frame_flags & VP9_GOLD_FLAG) != 0;
   cpi->ext_refresh_alt_ref_frame = (ref_frame_flags & VP9_ALT_FLAG) != 0;
+#if CONFIG_EXT_REFS
+  cpi->ext_refresh_last_frames[0] = (ref_frame_flags & VP9_LAST_FLAG) != 0;
+  cpi->ext_refresh_last_frames[1] = (ref_frame_flags & VP9_LAST2_FLAG) != 0;
+  cpi->ext_refresh_last_frames[2] = (ref_frame_flags & VP9_LAST3_FLAG) != 0;
+  cpi->ext_refresh_last_frames[3] = (ref_frame_flags & VP9_LAST4_FLAG) != 0;
+#else
   cpi->ext_refresh_last_frame = (ref_frame_flags & VP9_LAST_FLAG) != 0;
+#endif  // CONFIG_EXT_REFS
   cpi->ext_refresh_frame_flags_pending = 1;
 }
 
@@ -2296,6 +2335,14 @@
   MV_REFERENCE_FRAME ref_frame = NONE;
   if (ref_frame_flag == VP9_LAST_FLAG)
     ref_frame = LAST_FRAME;
+#if CONFIG_EXT_REFS
+  else if (ref_frame_flag == VP9_LAST2_FLAG)
+    ref_frame = LAST2_FRAME;
+  else if (ref_frame_flag == VP9_LAST3_FLAG)
+    ref_frame = LAST3_FRAME;
+  else if (ref_frame_flag == VP9_LAST4_FLAG)
+    ref_frame = LAST4_FRAME;
+#endif  // CONFIG_EXT_REFS
   else if (ref_frame_flag == VP9_GOLD_FLAG)
     ref_frame = GOLDEN_FRAME;
   else if (ref_frame_flag == VP9_ALT_FLAG)
@@ -2577,6 +2624,9 @@
 void vp10_update_reference_frames(VP10_COMP *cpi) {
   VP10_COMMON * const cm = &cpi->common;
   BufferPool *const pool = cm->buffer_pool;
+#if CONFIG_EXT_REFS
+  int ref_frame;
+#endif  // CONFIG_EXT_REFS
 
   // At this point the new frame has been encoded.
   // If any buffer copy / swapping is signaled it should be done here.
@@ -2631,22 +2681,49 @@
     }
   }
 
+#if CONFIG_EXT_REFS
+  for (ref_frame = LAST_FRAME; ref_frame <= LAST4_FRAME; ++ref_frame) {
+    const int ref_idx = ref_frame - LAST_FRAME;
+    if (cpi->refresh_last_frames[ref_idx]) {
+      ref_cnt_fb(pool->frame_bufs,
+                 &cm->ref_frame_map[cpi->lst_fb_idxes[ref_idx]],
+                 cm->new_fb_idx);
+      if (!cpi->rc.is_src_frame_alt_ref) {
+        memcpy(cpi->interp_filter_selected[ref_frame],
+               cpi->interp_filter_selected[0],
+               sizeof(cpi->interp_filter_selected[0]));
+      }
+    }
+  }
+  // NOTE: The 4 last reference frames are refreshed round-robin, in the order
+  // LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME -> LAST4_FRAME -> LAST_FRAME
+  cpi->last_ref_to_refresh += 1;
+  if (cpi->last_ref_to_refresh > LAST4_FRAME)  // wrap only after LAST4_FRAME
+    cpi->last_ref_to_refresh = LAST_FRAME;
+#else
   if (cpi->refresh_last_frame) {
     ref_cnt_fb(pool->frame_bufs,
                &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
-    if (!cpi->rc.is_src_frame_alt_ref)
+    if (!cpi->rc.is_src_frame_alt_ref) {
       memcpy(cpi->interp_filter_selected[LAST_FRAME],
              cpi->interp_filter_selected[0],
              sizeof(cpi->interp_filter_selected[0]));
+    }
   }
+#endif  // CONFIG_EXT_REFS
+
 #if CONFIG_VP9_TEMPORAL_DENOISING
   if (cpi->oxcf.noise_sensitivity > 0) {
     vp10_denoiser_update_frame_info(&cpi->denoiser,
                                    *cpi->Source,
                                    cpi->common.frame_type,
+#if CONFIG_EXT_REFS
+                                   cpi->refresh_last_frames,
+#else
+                                   cpi->refresh_last_frame,
+#endif  // CONFIG_EXT_REFS
                                    cpi->refresh_alt_ref_frame,
-                                   cpi->refresh_golden_frame,
-                                   cpi->refresh_last_frame);
+                                   cpi->refresh_golden_frame);
   }
 #endif
 }
@@ -2670,6 +2747,9 @@
   }
 
   if (lf->filter_level > 0) {
+#if CONFIG_VAR_TX
+    vp10_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
+#else
     if (cpi->num_workers > 1)
       vp10_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane,
                                lf->filter_level, 0, 0,
@@ -2677,6 +2757,7 @@
                                &cpi->lf_row_sync);
     else
       vp10_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
+#endif
   }
 
   vpx_extend_frame_inner_borders(cm->frame_to_show);
@@ -2700,7 +2781,16 @@
 void vp10_scale_references(VP10_COMP *cpi) {
   VP10_COMMON *cm = &cpi->common;
   MV_REFERENCE_FRAME ref_frame;
-  const VP9_REFFRAME ref_mask[3] = {VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG};
+  const VP9_REFFRAME ref_mask[REFS_PER_FRAME] = {
+    VP9_LAST_FLAG,
+#if CONFIG_EXT_REFS
+    VP9_LAST2_FLAG,
+    VP9_LAST3_FLAG,
+    VP9_LAST4_FLAG,
+#endif  // CONFIG_EXT_REFS
+    VP9_GOLD_FLAG,
+    VP9_ALT_FLAG
+  };
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     // Need to convert from VP9_REFFRAME to index into ref_mask (subtract 1).
@@ -2785,10 +2875,18 @@
   if (cpi->oxcf.pass == 0) {
     // Only release scaled references under certain conditions:
     // if reference will be updated, or if scaled reference has same resolution.
-    int refresh[3];
+    int refresh[REFS_PER_FRAME];
+#if CONFIG_EXT_REFS
+    for (i = LAST_FRAME; i <= LAST4_FRAME; ++i)
+      refresh[i - LAST_FRAME] =
+          (cpi->refresh_last_frames[i - LAST_FRAME]) ? 1 : 0;
+    refresh[4] = (cpi->refresh_golden_frame) ? 1 : 0;
+    refresh[5] = (cpi->refresh_alt_ref_frame) ? 1 : 0;
+#else
     refresh[0] = (cpi->refresh_last_frame) ? 1 : 0;
     refresh[1] = (cpi->refresh_golden_frame) ? 1 : 0;
     refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0;
+#endif  // CONFIG_EXT_REFS
     for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
       const int idx = cpi->scaled_ref_idx[i - 1];
       RefCntBuffer *const buf = idx != INVALID_IDX ?
@@ -3060,7 +3158,7 @@
   init_motion_estimation(cpi);
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
+    RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - LAST_FRAME];
     const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
 
     ref_buf->idx = buf_idx;
@@ -3242,14 +3340,12 @@
     // update_base_skip_probs(cpi);
 
     vpx_clear_system_state();
-
     // Dummy pack of the bitstream using up to date stats to get an
     // accurate estimate of output frame size to determine if we need
     // to recode.
     if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
       save_coding_context(cpi);
       vp10_pack_bitstream(cpi, dest, size);
-
       rc->projected_frame_size = (int)(*size) << 3;
       restore_coding_context(cpi);
 
@@ -3422,12 +3518,45 @@
 
 static int get_ref_frame_flags(const VP10_COMP *cpi) {
   const int *const map = cpi->common.ref_frame_map;
-  const int gold_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx];
-  const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx];
-  const int gold_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
-  int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
 
-  if (gold_is_last)
+#if CONFIG_EXT_REFS
+  const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[0]];
+  const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[0]];
+
+  const int last2_is_last =
+      map[cpi->lst_fb_idxes[1]] == map[cpi->lst_fb_idxes[0]];
+  const int gld_is_last2 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[1]];
+  const int alt_is_last2 = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[1]];
+
+  const int last3_is_last =
+      map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[0]];
+  const int last3_is_last2 =
+      map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[1]];
+  const int gld_is_last3 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[2]];
+  const int alt_is_last3 = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[2]];
+
+  const int last4_is_last =
+      map[cpi->lst_fb_idxes[3]] == map[cpi->lst_fb_idxes[0]];
+  const int last4_is_last2 =
+      map[cpi->lst_fb_idxes[3]] == map[cpi->lst_fb_idxes[1]];
+  const int last4_is_last3 =
+      map[cpi->lst_fb_idxes[3]] == map[cpi->lst_fb_idxes[2]];
+  const int gld_is_last4 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[3]];
+  const int alt_is_last4 = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[3]];
+#else
+  const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx];
+  const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx];
+#endif  // CONFIG_EXT_REFS
+  const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
+
+  int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
+#if CONFIG_EXT_REFS
+  flags |= VP9_LAST2_FLAG;
+  flags |= VP9_LAST3_FLAG;
+  flags |= VP9_LAST4_FLAG;
+#endif  // CONFIG_EXT_REFS
+
+  if (gld_is_last)
     flags &= ~VP9_GOLD_FLAG;
 
   if (cpi->rc.frames_till_gf_update_due == INT_MAX)
@@ -3436,9 +3565,26 @@
   if (alt_is_last)
     flags &= ~VP9_ALT_FLAG;
 
-  if (gold_is_alt)
+  if (gld_is_alt)
     flags &= ~VP9_ALT_FLAG;
 
+#if CONFIG_EXT_REFS
+  if (last4_is_last || last4_is_last2 || last4_is_last3)
+    flags &= ~VP9_LAST4_FLAG;
+
+  if (last3_is_last || last3_is_last2)
+    flags &= ~VP9_LAST3_FLAG;
+
+  if (last2_is_last)
+    flags &= ~VP9_LAST2_FLAG;
+
+  if (gld_is_last4 || gld_is_last3 || gld_is_last2)
+    flags &= ~VP9_GOLD_FLAG;
+
+  if (alt_is_last4 || alt_is_last3 || alt_is_last2)
+    flags &= ~VP9_ALT_FLAG;
+#endif  // CONFIG_EXT_REFS
+
   return flags;
 }
 
@@ -3452,7 +3598,15 @@
     cpi->ext_refresh_frame_context_pending = 0;
   }
   if (cpi->ext_refresh_frame_flags_pending) {
+#if CONFIG_EXT_REFS
+    int ref_frame;
+    for (ref_frame = LAST_FRAME; ref_frame <= LAST4_FRAME; ++ref_frame) {
+      cpi->refresh_last_frames[ref_frame - LAST_FRAME] =
+          cpi->ext_refresh_last_frames[ref_frame - LAST_FRAME];
+    }
+#else
     cpi->refresh_last_frame = cpi->ext_refresh_last_frame;
+#endif  // CONFIG_EXT_REFS
     cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame;
     cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame;
     cpi->ext_refresh_frame_flags_pending = 0;
@@ -3521,6 +3675,17 @@
   for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter) {
     if ((ref_total[LAST_FRAME] &&
         cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) &&
+#if CONFIG_EXT_REFS
+        (ref_total[LAST2_FRAME] == 0 ||
+         cpi->interp_filter_selected[LAST2_FRAME][ifilter] * 50
+         < ref_total[LAST2_FRAME]) &&
+        (ref_total[LAST3_FRAME] == 0 ||
+         cpi->interp_filter_selected[LAST3_FRAME][ifilter] * 50
+         < ref_total[LAST3_FRAME]) &&
+        (ref_total[LAST4_FRAME] == 0 ||
+         cpi->interp_filter_selected[LAST4_FRAME][ifilter] * 50
+         < ref_total[LAST4_FRAME]) &&
+#endif  // CONFIG_EXT_REFS
         (ref_total[GOLDEN_FRAME] == 0 ||
          cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50
            < ref_total[GOLDEN_FRAME]) &&
@@ -3637,8 +3802,16 @@
   }
 
   // If the encoder forced a KEY_FRAME decision
-  if (cm->frame_type == KEY_FRAME)
+  if (cm->frame_type == KEY_FRAME) {
+#if CONFIG_EXT_REFS
+    int ref_frame;
+    for (ref_frame = LAST_FRAME; ref_frame <= LAST4_FRAME; ++ref_frame)
+      cpi->refresh_last_frames[ref_frame - LAST_FRAME] = 1;
+    cpi->last_ref_to_refresh = LAST_FRAME;
+#else
     cpi->refresh_last_frame = 1;
+#endif  // CONFIG_EXT_REFS
+  }
 
   cm->frame_to_show = get_frame_new_buffer(cm);
   cm->frame_to_show->color_space = cm->color_space;
@@ -3666,12 +3839,7 @@
 
   if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
     vp10_adapt_coef_probs(cm);
-#if CONFIG_MISC_FIXES
     vp10_adapt_intra_frame_probs(cm);
-#else
-    if (!frame_is_intra_only(cm))
-      vp10_adapt_intra_frame_probs(cm);
-#endif
   }
 
   if (!frame_is_intra_only(cm)) {
@@ -3693,6 +3861,10 @@
 
   cpi->ref_frame_flags = get_ref_frame_flags(cpi);
 
+#if CONFIG_EXT_REFS
+  cm->last3_frame_type = cm->last2_frame_type;
+  cm->last2_frame_type = cm->last_frame_type;
+#endif  // CONFIG_EXT_REFS
   cm->last_frame_type = cm->frame_type;
 
   vp10_rc_postencode_update(cpi, *size);
@@ -3855,7 +4027,14 @@
   const VP10_COMMON *cm = &cpi->common;
 
   return cm->frame_type == KEY_FRAME ||
+#if CONFIG_EXT_REFS
+         cpi->refresh_last_frames[LAST_FRAME - LAST_FRAME] ||
+         cpi->refresh_last_frames[LAST2_FRAME - LAST_FRAME] ||
+         cpi->refresh_last_frames[LAST3_FRAME - LAST_FRAME] ||
+         cpi->refresh_last_frames[LAST4_FRAME - LAST_FRAME] ||
+#else
          cpi->refresh_last_frame ||
+#endif  // CONFIG_EXT_REFS
          cpi->refresh_golden_frame ||
          cpi->refresh_alt_ref_frame ||
          cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF ||
@@ -3935,12 +4114,21 @@
   }
 
   if (rc->is_src_frame_alt_ref) {
+#if CONFIG_EXT_REFS
+    int ref_frame;
+#endif  // CONFIG_EXT_REFS
+
     // Current frame is an ARF overlay frame.
     cpi->alt_ref_source = NULL;
 
     // Don't refresh the last buffer for an ARF overlay frame. It will
     // become the GF so preserve last as an alternative prediction option.
+#if CONFIG_EXT_REFS
+    for (ref_frame = LAST_FRAME; ref_frame <= LAST4_FRAME; ++ref_frame)
+      cpi->refresh_last_frames[ref_frame - LAST_FRAME] = 0;
+#else
     cpi->refresh_last_frame = 0;
+#endif  // CONFIG_EXT_REFS
   }
 }
 
@@ -3991,7 +4179,16 @@
           oxcf->frame_parallel_decoding_mode ? REFRESH_FRAME_CONTEXT_FORWARD
                                              : REFRESH_FRAME_CONTEXT_BACKWARD;
 
+#if CONFIG_EXT_REFS
+  for (i = LAST_FRAME; i <= LAST4_FRAME; ++i) {
+    if (i == cpi->last_ref_to_refresh)
+      cpi->refresh_last_frames[i - LAST_FRAME] = 1;
+    else
+      cpi->refresh_last_frames[i - LAST_FRAME] = 0;
+  }
+#else
   cpi->refresh_last_frame = 1;
+#endif  // CONFIG_EXT_REFS
   cpi->refresh_golden_frame = 0;
   cpi->refresh_alt_ref_frame = 0;
 
@@ -4015,12 +4212,15 @@
       cm->intra_only = 0;
       cpi->refresh_alt_ref_frame = 1;
       cpi->refresh_golden_frame = 0;
+#if CONFIG_EXT_REFS
+      for (i = LAST_FRAME; i <= LAST4_FRAME; ++i)
+        cpi->refresh_last_frames[i - LAST_FRAME] = 0;
+#else
       cpi->refresh_last_frame = 0;
+#endif  // CONFIG_EXT_REFS
       rc->is_src_frame_alt_ref = 0;
-      rc->source_alt_ref_pending = 0;
-    } else {
-      rc->source_alt_ref_pending = 0;
     }
+    rc->source_alt_ref_pending = 0;
   }
 
   if (!source) {

diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index bd6a009..707255d 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h

@@ -55,10 +55,6 @@
   int nmvcosts[2][MV_VALS];
   int nmvcosts_hp[2][MV_VALS];
 
-#if !CONFIG_MISC_FIXES
-  vpx_prob segment_pred_probs[PREDICTION_PROBS];
-#endif
-
   unsigned char *last_frame_seg_map_copy;
 
   // 0 = Intra, Last, GF, ARF
@@ -308,17 +304,33 @@
   // For a still frame, this flag is set to 1 to skip partition search.
   int partition_search_skippable_frame;
 
+#if CONFIG_EXT_REFS
+  int last_ref_to_refresh;
+#endif  // CONFIG_EXT_REFS
+
   int scaled_ref_idx[MAX_REF_FRAMES];
+#if CONFIG_EXT_REFS
+  int lst_fb_idxes[LAST_REF_FRAMES];
+#else
   int lst_fb_idx;
+#endif  // CONFIG_EXT_REFS
   int gld_fb_idx;
   int alt_fb_idx;
 
+#if CONFIG_EXT_REFS
+  int refresh_last_frames[LAST_REF_FRAMES];
+#else
   int refresh_last_frame;
+#endif  // CONFIG_EXT_REFS
   int refresh_golden_frame;
   int refresh_alt_ref_frame;
 
   int ext_refresh_frame_flags_pending;
+#if CONFIG_EXT_REFS
+  int ext_refresh_last_frames[LAST_REF_FRAMES];
+#else
   int ext_refresh_last_frame;
+#endif  // CONFIG_EXT_REFS
   int ext_refresh_golden_frame;
   int ext_refresh_alt_ref_frame;
 
@@ -456,19 +468,36 @@
 
   search_site_config ss_cfg;
 
-  int mbmode_cost[INTRA_MODES];
+  int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES];
+#if CONFIG_REF_MV
+  int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2];
+  int zeromv_mode_cost[ZEROMV_MODE_CONTEXTS][2];
+  int refmv_mode_cost[REFMV_MODE_CONTEXTS][2];
+#endif
+
   unsigned int inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
   int intra_uv_mode_cost[INTRA_MODES][INTRA_MODES];
   int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
   int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
   int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
+  int palette_y_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
+  int palette_uv_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
+  int palette_y_color_cost[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS]
+                                                 [PALETTE_COLORS];
+  int palette_uv_color_cost[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS]
+                                                  [PALETTE_COLORS];
 
   int multi_arf_allowed;
   int multi_arf_enabled;
   int multi_arf_last_grp_enabled;
-
+#if CONFIG_EXT_TX
+  int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+  int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+                                                          [TX_TYPES];
+#else
   int intra_tx_type_costs[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
   int inter_tx_type_costs[EXT_TX_SIZES][TX_TYPES];
+#endif  // CONFIG_EXT_TX
 #if CONFIG_VP9_TEMPORAL_DENOISING
   VP9_DENOISER denoiser;
 #endif
@@ -549,13 +578,17 @@
 
 static INLINE int get_ref_frame_map_idx(const VP10_COMP *cpi,
                                         MV_REFERENCE_FRAME ref_frame) {
-  if (ref_frame == LAST_FRAME) {
+#if CONFIG_EXT_REFS
+  if (ref_frame >= LAST_FRAME && ref_frame <= LAST4_FRAME)  // any LASTn ref
+    return cpi->lst_fb_idxes[ref_frame - LAST_FRAME];  // slot 0 is LAST_FRAME
+#else
+  if (ref_frame == LAST_FRAME)
     return cpi->lst_fb_idx;
-  } else if (ref_frame == GOLDEN_FRAME) {
+#endif  // CONFIG_EXT_REFS
+  else if (ref_frame == GOLDEN_FRAME)
     return cpi->gld_fb_idx;
-  } else {
+  else
     return cpi->alt_fb_idx;
-  }
 }
 
 static INLINE int get_ref_frame_buf_idx(const VP10_COMP *const cpi,

diff --git a/vp10/encoder/ethread.c b/vp10/encoder/ethread.c
index ad47ccf..6cb9494 100644
--- a/vp10/encoder/ethread.c
+++ b/vp10/encoder/ethread.c

@@ -133,6 +133,13 @@
       memcpy(thread_data->td->counts, &cpi->common.counts,
              sizeof(cpi->common.counts));
     }
+
+    // Allocate buffers used by palette coding mode.
+    if (cpi->common.allow_screen_content_tools && i < num_workers - 1) {
+        MACROBLOCK *x = &thread_data->td->mb;
+        CHECK_MEM_ERROR(cm, x->palette_buffer,
+                        vpx_memalign(16, sizeof(*x->palette_buffer)));
+    }
   }
 
   // Encode a frame

diff --git a/vp10/encoder/firstpass.c b/vp10/encoder/firstpass.c
index c41fa3e..c865408 100644
--- a/vp10/encoder/firstpass.c
+++ b/vp10/encoder/firstpass.c

@@ -1044,8 +1044,13 @@
        ((twopass->this_frame_stats.intra_error /
          DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
     if (gld_yv12 != NULL) {
+#if CONFIG_EXT_REFS
+      ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+                 cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]]);
+#else
       ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
                  cm->ref_frame_map[cpi->lst_fb_idx]);
+#endif  // CONFIG_EXT_REFS
     }
     twopass->sr_update_lag = 1;
   } else {
@@ -1055,14 +1060,25 @@
   vpx_extend_frame_borders(new_yv12);
 
   // The frame we just compressed now becomes the last frame.
+#if CONFIG_EXT_REFS
+  ref_cnt_fb(pool->frame_bufs,
+             &cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]],
+             cm->new_fb_idx);
+#else
   ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
              cm->new_fb_idx);
+#endif  // CONFIG_EXT_REFS
 
   // Special case for the first frame. Copy into the GF buffer as a second
   // reference.
   if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX) {
+#if CONFIG_EXT_REFS
+    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+               cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]]);
+#else
     ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
                cm->ref_frame_map[cpi->lst_fb_idx]);
+#endif  // CONFIG_EXT_REFS
   }
 
   // Use this to see what the first pass reconstruction looks like.
@@ -2382,28 +2398,48 @@
   cpi->rc.is_src_frame_alt_ref = 0;
   switch (twopass->gf_group.update_type[twopass->gf_group.index]) {
     case KF_UPDATE:
+#if CONFIG_EXT_REFS
+      cpi->refresh_last_frames[LAST_FRAME - LAST_FRAME] = 1;
+#else
       cpi->refresh_last_frame = 1;
+#endif  // CONFIG_EXT_REFS
       cpi->refresh_golden_frame = 1;
       cpi->refresh_alt_ref_frame = 1;
       break;
     case LF_UPDATE:
+#if CONFIG_EXT_REFS
+      cpi->refresh_last_frames[LAST_FRAME - LAST_FRAME] = 1;
+#else
       cpi->refresh_last_frame = 1;
+#endif  // CONFIG_EXT_REFS
       cpi->refresh_golden_frame = 0;
       cpi->refresh_alt_ref_frame = 0;
       break;
     case GF_UPDATE:
+#if CONFIG_EXT_REFS
+      cpi->refresh_last_frames[LAST_FRAME - LAST_FRAME] = 1;
+#else
       cpi->refresh_last_frame = 1;
+#endif  // CONFIG_EXT_REFS
       cpi->refresh_golden_frame = 1;
       cpi->refresh_alt_ref_frame = 0;
       break;
     case OVERLAY_UPDATE:
+#if CONFIG_EXT_REFS
+      cpi->refresh_last_frames[LAST_FRAME - LAST_FRAME] = 0;
+#else
       cpi->refresh_last_frame = 0;
+#endif  // CONFIG_EXT_REFS
       cpi->refresh_golden_frame = 1;
       cpi->refresh_alt_ref_frame = 0;
       cpi->rc.is_src_frame_alt_ref = 1;
       break;
     case ARF_UPDATE:
+#if CONFIG_EXT_REFS
+      cpi->refresh_last_frames[LAST_FRAME - LAST_FRAME] = 0;
+#else
       cpi->refresh_last_frame = 0;
+#endif  // CONFIG_EXT_REFS
       cpi->refresh_golden_frame = 0;
       cpi->refresh_alt_ref_frame = 1;
       break;

diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c
new file mode 100644
index 0000000..8b48276
--- /dev/null
+++ b/vp10/encoder/hybrid_fwd_txfm.c

@@ -0,0 +1,406 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp10_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vp10/common/idct.h"
+#include "vp10/encoder/hybrid_fwd_txfm.h"
+
+// 32x32 forward DCT; a non-zero rd_transform selects the _rd kernel.
+static INLINE void fdct32x32(int rd_transform, const int16_t *src,
+                             tran_low_t *dst, int src_stride) {
+  void (*const fdct)(const int16_t *, tran_low_t *, int) =
+      rd_transform ? vpx_fdct32x32_rd : vpx_fdct32x32;
+  fdct(src, dst, src_stride);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// High-bitdepth 32x32 forward DCT; non-zero rd_transform picks the _rd kernel.
+static INLINE void highbd_fdct32x32(int rd_transform, const int16_t *src,
+                                    tran_low_t *dst, int src_stride) {
+  void (*const fdct)(const int16_t *, tran_low_t *, int) =
+      rd_transform ? vpx_highbd_fdct32x32_rd : vpx_highbd_fdct32x32;
+  fdct(src, dst, src_stride);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_EXT_TX
+// Forward identity transform: scale the block by 8 (by 4 when bs >= 32).
+static void fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,
+                       int bs) {
+  int r, c;
+  // Multiply instead of '<<': left-shifting a negative residual is undefined.
+  const int scale = bs < 32 ? 8 : 4;
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] * scale;
+    src_diff += stride;
+    coeff += bs;
+  }
+}
+#endif  // CONFIG_EXT_TX
+
+void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+                       int diff_stride, TX_TYPE tx_type, int lossless) {
+  if (lossless) {  // lossless blocks bypass the switch and use vp10_fwht4x4
+    assert(tx_type == DCT_DCT);
+    vp10_fwht4x4(src_diff, coeff, diff_stride);
+    return;
+  }
+  // Otherwise pick the 4x4 kernel that matches tx_type.
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case DST_DST:
+    case DCT_DST:
+    case DST_DCT:
+    case DST_ADST:
+    case ADST_DST:
+    case DST_FLIPADST:
+    case FLIPADST_DST:
+      // DST has only a C implementation, so call the _c kernel directly.
+      vp10_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case IDTX:
+      fwd_idtx_c(src_diff, coeff, diff_stride, 4);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:  // tx_type with no 4x4 kernel listed above
+      assert(0);
+      break;
+  }
+}
+
+static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
+                         int diff_stride, TX_TYPE tx_type,
+                         FWD_TXFM_OPT fwd_txfm_opt) {
+  switch (tx_type) {  // pick the 8x8 kernel that matches tx_type
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      if (fwd_txfm_opt == FWD_TXFM_OPT_NORMAL)
+        vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
+      else  // FWD_TXFM_OPT_DC: vpx_fdct8x8_1 replaces the full transform
+        vpx_fdct8x8_1(src_diff, coeff, diff_stride);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:  // fwd_txfm_opt is not consulted for these types
+      vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case DST_DST:
+    case DCT_DST:
+    case DST_DCT:
+    case DST_ADST:
+    case ADST_DST:
+    case DST_FLIPADST:
+    case FLIPADST_DST:
+      // DST has only a C implementation, so call the _c kernel directly.
+      vp10_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case IDTX:
+      fwd_idtx_c(src_diff, coeff, diff_stride, 8);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:  // tx_type with no 8x8 kernel listed above
+      assert(0);
+      break;
+  }
+}
+
+static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
+                           int diff_stride, TX_TYPE tx_type,
+                           FWD_TXFM_OPT fwd_txfm_opt) {
+  switch (tx_type) {  // pick the 16x16 kernel that matches tx_type
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      if (fwd_txfm_opt == FWD_TXFM_OPT_NORMAL)
+        vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
+      else  // FWD_TXFM_OPT_DC: vpx_fdct16x16_1 replaces the full transform
+        vpx_fdct16x16_1(src_diff, coeff, diff_stride);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:  // fwd_txfm_opt is not consulted for these types
+      vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case DST_DST:
+    case DCT_DST:
+    case DST_DCT:
+    case DST_ADST:
+    case ADST_DST:
+    case DST_FLIPADST:
+    case FLIPADST_DST:
+      // DST has only a C implementation, so call the _c kernel directly.
+      vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case IDTX:
+      fwd_idtx_c(src_diff, coeff, diff_stride, 16);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:  // tx_type with no 16x16 kernel listed above
+      assert(0);
+      break;
+  }
+}
+
+static void fwd_txfm_32x32(int rd_transform, const int16_t *src_diff,
+                           tran_low_t *coeff, int diff_stride, TX_TYPE tx_type,
+                           FWD_TXFM_OPT fwd_txfm_opt) {
+  switch (tx_type) {  // only DCT_DCT (and IDTX with EXT_TX) run at 32x32
+    case DCT_DCT:
+      if (fwd_txfm_opt == FWD_TXFM_OPT_NORMAL)
+        fdct32x32(rd_transform, src_diff, coeff, diff_stride);
+      else  // FWD_TXFM_OPT_DC: vpx_fdct32x32_1 replaces the full transform
+        vpx_fdct32x32_1(src_diff, coeff, diff_stride);
+      break;
+#if CONFIG_EXT_TX
+    case IDTX:
+      fwd_idtx_c(src_diff, coeff, diff_stride, 32);
+      break;
+#endif  // CONFIG_EXT_TX
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:  // ADST combinations are rejected at this size
+      assert(0);
+      break;
+    default:  // any other tx_type is rejected as well
+      assert(0);
+      break;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+                              int diff_stride, TX_TYPE tx_type, int lossless) {
+  if (lossless) {  // lossless blocks bypass the switch, use vp10_highbd_fwht4x4
+    assert(tx_type == DCT_DCT);
+    vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
+    return;
+  }
+  // Otherwise pick the high-bitdepth 4x4 kernel that matches tx_type.
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case DST_DST:
+    case DCT_DST:
+    case DST_DCT:
+    case DST_ADST:
+    case ADST_DST:
+    case DST_FLIPADST:
+    case FLIPADST_DST:
+      // DST has only a C implementation, so call the _c kernel directly.
+      vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case IDTX:
+      fwd_idtx_c(src_diff, coeff, diff_stride, 4);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:  // tx_type with no 4x4 kernel listed above
+      assert(0);
+      break;
+  }
+}
+
+static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
+                                int diff_stride, TX_TYPE tx_type,
+                                FWD_TXFM_OPT fwd_txfm_opt) {
+  (void)fwd_txfm_opt;  // unused: every type runs its full transform here
+  switch (tx_type) {  // pick the high-bitdepth 8x8 kernel for tx_type
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case DST_DST:
+    case DCT_DST:
+    case DST_DCT:
+    case DST_ADST:
+    case ADST_DST:
+    case DST_FLIPADST:
+    case FLIPADST_DST:
+      // DST has only a C implementation, so call the _c kernel directly.
+      vp10_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case IDTX:
+      fwd_idtx_c(src_diff, coeff, diff_stride, 8);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:  // tx_type with no 8x8 kernel listed above
+      assert(0);
+      break;
+  }
+}
+
+static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TX_TYPE tx_type,
+                                  FWD_TXFM_OPT fwd_txfm_opt) {
+  (void)fwd_txfm_opt;  // unused: every type runs its full transform here
+  switch (tx_type) {  // pick the high-bitdepth 16x16 kernel for tx_type
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case DST_DST:
+    case DCT_DST:
+    case DST_DCT:
+    case DST_ADST:
+    case ADST_DST:
+    case DST_FLIPADST:
+    case FLIPADST_DST:
+      // DST has only a C implementation, so call the _c kernel directly.
+      vp10_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case IDTX:
+      fwd_idtx_c(src_diff, coeff, diff_stride, 16);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:  // tx_type with no 16x16 kernel listed above
+      assert(0);
+      break;
+  }
+}
+
+static void highbd_fwd_txfm_32x32(int rd_transform, const int16_t *src_diff,
+                                  tran_low_t *coeff, int diff_stride,
+                                  TX_TYPE tx_type, FWD_TXFM_OPT fwd_txfm_opt) {
+  switch (tx_type) {  // only DCT_DCT (and IDTX with EXT_TX) run at 32x32
+    case DCT_DCT:
+      if (fwd_txfm_opt == FWD_TXFM_OPT_NORMAL)
+        highbd_fdct32x32(rd_transform, src_diff, coeff, diff_stride);
+      else  // FWD_TXFM_OPT_DC: vpx_highbd_fdct32x32_1 replaces the full one
+        vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
+      break;
+#if CONFIG_EXT_TX
+    case IDTX:
+      fwd_idtx_c(src_diff, coeff, diff_stride, 32);
+      break;
+#endif  // CONFIG_EXT_TX
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:  // ADST combinations are rejected at this size
+      assert(0);
+      break;
+    default:  // any other tx_type is rejected as well
+      assert(0);
+      break;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
+              FWD_TXFM_PARAM *fwd_txfm_param) {
+  const FWD_TXFM_OPT fwd_txfm_opt = fwd_txfm_param->fwd_txfm_opt;
+  const TX_TYPE tx_type = fwd_txfm_param->tx_type;
+  const TX_SIZE tx_size = fwd_txfm_param->tx_size;
+  const int rd_transform = fwd_txfm_param->rd_transform;  // TX_32X32 only
+  const int lossless = fwd_txfm_param->lossless;  // TX_4X4 only
+  switch (tx_size) {  // hand off to the dispatcher for this transform size
+    case TX_32X32:
+      fwd_txfm_32x32(rd_transform, src_diff, coeff, diff_stride, tx_type,
+                     fwd_txfm_opt);
+      break;
+    case TX_16X16:
+      fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+      break;
+    case TX_8X8:
+      fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+      break;
+    case TX_4X4:  // the 4x4 path takes lossless instead of fwd_txfm_opt
+      vp10_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless);
+      break;
+    default:  // tx_size outside the four sizes handled above
+      assert(0);
+      break;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
+                     int diff_stride, FWD_TXFM_PARAM *fwd_txfm_param) {
+  const FWD_TXFM_OPT fwd_txfm_opt = fwd_txfm_param->fwd_txfm_opt;
+  const TX_TYPE tx_type = fwd_txfm_param->tx_type;
+  const TX_SIZE tx_size = fwd_txfm_param->tx_size;
+  const int rd_transform = fwd_txfm_param->rd_transform;  // TX_32X32 only
+  const int lossless = fwd_txfm_param->lossless;  // TX_4X4 only
+  switch (tx_size) {  // hand off to the dispatcher for this transform size
+    case TX_32X32:
+      highbd_fwd_txfm_32x32(rd_transform, src_diff, coeff, diff_stride, tx_type,
+                            fwd_txfm_opt);
+      break;
+    case TX_16X16:
+      highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type,
+                            fwd_txfm_opt);
+      break;
+    case TX_8X8:
+      highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+      break;
+    case TX_4X4:  // the 4x4 path takes lossless instead of fwd_txfm_opt
+      vp10_highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless);
+      break;
+    default:  // tx_size outside the four sizes handled above
+      assert(0);
+      break;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH

diff --git a/vp10/encoder/hybrid_fwd_txfm.h b/vp10/encoder/hybrid_fwd_txfm.h
new file mode 100644
index 0000000..62b8d5a
--- /dev/null
+++ b/vp10/encoder/hybrid_fwd_txfm.h

@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_HYBRID_FWD_TXFM_H_
+#define VP10_ENCODER_HYBRID_FWD_TXFM_H_
+
+#include "./vpx_config.h"
+
+typedef enum FWD_TXFM_OPT { FWD_TXFM_OPT_NORMAL, FWD_TXFM_OPT_DC } FWD_TXFM_OPT;
+
+typedef struct FWD_TXFM_PARAM {
+  TX_TYPE tx_type;            // transform type handed to the size dispatcher
+  TX_SIZE tx_size;            // selects the 4x4/8x8/16x16/32x32 path
+  FWD_TXFM_OPT fwd_txfm_opt;  // FWD_TXFM_OPT_NORMAL or FWD_TXFM_OPT_DC
+  int rd_transform;           // non-zero: 32x32 DCT uses the _rd kernel
+  int lossless;               // non-zero: 4x4 goes through the fwht4x4 path
+} FWD_TXFM_PARAM;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
+              FWD_TXFM_PARAM *fwd_txfm_param);
+void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+                       int diff_stride, TX_TYPE tx_type, int lossless);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
+                     int diff_stride, FWD_TXFM_PARAM *fwd_txfm_param);
+void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+                              int diff_stride, TX_TYPE tx_type, int lossless);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE int get_tx1d_size(TX_SIZE tx_size) {
+  switch (tx_size) {  // map the 2-D size enum to its edge length
+    case TX_4X4:
+      return 4;
+    case TX_8X8:
+      return 8;
+    case TX_16X16:
+      return 16;
+    case TX_32X32:
+      return 32;
+    default:  // unknown size: trips the assert, otherwise yields -1
+      assert(0);
+      return -1;
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_HYBRID_FWD_TXFM_H_

diff --git a/vp10/encoder/palette.c b/vp10/encoder/palette.c
new file mode 100644
index 0000000..522e185
--- /dev/null
+++ b/vp10/encoder/palette.c

@@ -0,0 +1,194 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include "vp10/encoder/palette.h"
+
+static double calc_dist(const double *p1, const double *p2, int dim) {
+  double dist = 0;  // running sum of squared differences
+  int i;
+  for (i = 0; i < dim; ++i) {
+    const double diff = p1[i] - round(p2[i]);  // p2 is rounded, p1 is not
+    dist += diff * diff;
+  }
+  return dist;
+}
+
+// Maps each of the n dim-wide points to its closest centroid (lowest on ties).
+void vp10_calc_indices(const double *data, const double *centroids,
+                       uint8_t *indices, int n, int k, int dim) {
+  int i, j;
+  for (i = 0; i < n; ++i) {
+    double best_dist = calc_dist(data + i * dim, centroids, dim);
+    int best = 0;
+    for (j = 1; j < k; ++j) {
+      const double d = calc_dist(data + i * dim, centroids + j * dim, dim);
+      if (d < best_dist) {
+        best_dist = d;
+        best = j;
+      }
+    }
+    indices[i] = best;
+  }
+}
+
+// Advances the LCG held in *state and returns a value in [0, 32768).
+static unsigned int lcg_rand16(unsigned int *state) {
+  *state = *state * 1103515245 + 12345;
+  return (*state >> 16) & 0x7fff;
+}
+
+static void calc_centroids(const double *data, double *centroids,
+                           const uint8_t *indices, int n, int k, int dim) {
+  int i, j, index;
+  int count[PALETTE_MAX_SIZE];
+  unsigned int rand_state = (unsigned int)data[0];
+  // rand_state (seeded from data[0]) drives the re-seeding of empty clusters.
+  assert(n <= 32768);  // lcg_rand16() % n below can only reach 32767
+
+  memset(count, 0, sizeof(count[0]) * k);
+  memset(centroids, 0, sizeof(centroids[0]) * k * dim);
+  // Accumulate per-cluster coordinate sums and member counts.
+  for (i = 0; i < n; ++i) {
+    index = indices[i];
+    assert(index < k);
+    ++count[index];
+    for (j = 0; j < dim; ++j) {
+      centroids[index * dim + j] += data[i * dim + j];
+    }
+  }
+  // Turn sums into means; an empty cluster copies a pseudo-random data point.
+  for (i = 0; i < k; ++i) {
+    if (count[i] == 0) {
+      memcpy(centroids + i * dim, data + (lcg_rand16(&rand_state) % n) * dim,
+                 sizeof(centroids[0]) * dim);
+    } else {
+      const double norm = 1.0 / count[i];
+      for (j = 0; j < dim; ++j)
+        centroids[i * dim + j] *= norm;
+    }
+  }
+}
+
+// Sums calc_dist() between each point and the centroid it is assigned to.
+static double calc_total_dist(const double *data, const double *centroids,
+                              const uint8_t *indices, int n, int k, int dim) {
+  double total = 0;
+  const double *point = data;
+  int i;
+  (void)k;  // k is not needed here
+  for (i = 0; i < n; ++i, point += dim)
+    total += calc_dist(point, centroids + indices[i] * dim, dim);
+  return total;
+}
+
+int vp10_k_means(const double *data, double *centroids, uint8_t *indices,
+                 uint8_t *pre_indices, int n, int k, int dim, int max_itr) {
+  int i = 0;
+  double pre_dist, this_dist;
+  double pre_centroids[2 * PALETTE_MAX_SIZE];  // receives k * dim values
+  assert(k * dim <= (int)(sizeof(pre_centroids) / sizeof(pre_centroids[0])));
+  vp10_calc_indices(data, centroids, indices, n, k, dim);
+  pre_dist = calc_total_dist(data, centroids, indices, n, k, dim);
+  memcpy(pre_centroids, centroids, sizeof(pre_centroids[0]) * k * dim);
+  memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
+  while (i < max_itr) {  // alternate centroid and index updates
+    calc_centroids(data, centroids, indices, n, k, dim);
+    vp10_calc_indices(data, centroids, indices, n, k, dim);
+    this_dist = calc_total_dist(data, centroids, indices, n, k, dim);
+    // A pass that increased the total distance is rolled back and ends the loop.
+    if (this_dist > pre_dist) {
+      memcpy(centroids, pre_centroids, sizeof(pre_centroids[0]) * k * dim);
+      memcpy(indices, pre_indices, sizeof(pre_indices[0]) * n);
+      break;
+    }
+    if (!memcmp(centroids, pre_centroids, sizeof(pre_centroids[0]) * k * dim))
+      break;
+    // Otherwise remember this state as the baseline for the next pass.
+    memcpy(pre_centroids, centroids, sizeof(pre_centroids[0]) * k * dim);
+    memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
+    pre_dist = this_dist;
+    ++i;
+  }
+  // i counts the passes that completed without hitting a break.
+  return i;
+}
+
+void vp10_insertion_sort(double *data, int n) {
+  int i, j, k;
+  double val;
+  // In-place ascending sort of data[0..n); nothing to do for n <= 1.
+  if (n <= 1)
+    return;
+
+  for (i = 1; i < n; ++i) {
+    val = data[i];
+    j = 0;
+    while (j < i && val > data[j])  // bound check first, then compare
+      ++j;
+    // data[0..i) is sorted; val already sits in the right place when j == i.
+    if (j == i)
+      continue;
+    // Shift data[j..i) up by one slot and drop val into the gap.
+    for (k = i; k > j; --k)
+      data[k] = data[k - 1];
+    data[j] = val;
+  }
+}
+
+// Counts the distinct sample values in the rows x cols block at src.
+int vp10_count_colors(const uint8_t *src, int stride, int rows, int cols) {
+  int seen[256];
+  int r, c;
+  int n_colors = 0;
+
+  memset(seen, 0, sizeof(seen));
+  for (r = 0; r < rows; ++r) {
+    for (c = 0; c < cols; ++c) {
+      const uint8_t val = src[r * stride + c];
+      // First sighting of this value: count it once.
+      if (!seen[val]) {
+        seen[val] = 1;
+        ++n_colors;
+      }
+    }
+  }
+
+  return n_colors;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int vp10_count_colors_highbd(const uint8_t *src8, int stride, int rows,
+                             int cols, int bit_depth) {
+  int n = 0, r, c, i;
+  uint16_t val;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);  // samples are read as uint16_t
+  int val_count[1 << 12];
+  // The histogram has 1 << 12 bins, hence the bit_depth limit asserted below.
+  assert(bit_depth <= 12);
+  memset(val_count, 0, (1 << 12) * sizeof(val_count[0]));
+  for (r = 0; r < rows; ++r) {
+    for (c = 0; c < cols; ++c) {
+      val = src[r * stride + c];
+      ++val_count[val];  // NOTE(review): assumes val < 4096 - confirm callers
+    }
+  }
+  // Count the bins below 1 << bit_depth that were hit at least once.
+  for (i = 0; i < (1 << bit_depth); ++i) {
+    if (val_count[i]) {
+      ++n;
+    }
+  }
+
+  return n;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+

diff --git a/vp10/encoder/palette.h b/vp10/encoder/palette.h
new file mode 100644
index 0000000..124cf74
--- /dev/null
+++ b/vp10/encoder/palette.h

@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_PALETTE_H_
+#define VP10_ENCODER_PALETTE_H_
+
+#include "vp10/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp10_insertion_sort(double *data, int n);
+void vp10_calc_indices(const double *data, const double *centroids,
+                       uint8_t *indices, int n, int k, int dim);
+int vp10_k_means(const double *data, double *centroids, uint8_t *indices,
+                 uint8_t *pre_indices, int n, int k, int dim, int max_itr);
+int vp10_count_colors(const uint8_t *src, int stride, int rows, int cols);
+#if CONFIG_VP9_HIGHBITDEPTH
+int vp10_count_colors_highbd(const uint8_t *src8, int stride, int rows,
+                             int cols, int bit_depth);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif /* VP10_ENCODER_PALETTE_H_ */

diff --git a/vp10/encoder/picklpf.c b/vp10/encoder/picklpf.c
index 045e03d..1f5711d 100644
--- a/vp10/encoder/picklpf.c
+++ b/vp10/encoder/picklpf.c

@@ -41,6 +41,10 @@
   VP10_COMMON *const cm = &cpi->common;
   int64_t filt_err;
 
+#if CONFIG_VAR_TX
+  vp10_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
+                        1, partial_frame);
+#else
   if (cpi->num_workers > 1)
     vp10_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
                              filt_level, 1, partial_frame,
@@ -48,6 +52,7 @@
   else
     vp10_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
                           1, partial_frame);
+#endif
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (cm->use_highbitdepth) {

diff --git a/vp10/encoder/quantize.c b/vp10/encoder/quantize.c
index 86b324f..739a06d 100644
--- a/vp10/encoder/quantize.c
+++ b/vp10/encoder/quantize.c

@@ -10,16 +10,194 @@
 
 #include <math.h>
 #include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/quantize.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 
 #include "vp10/common/quant_common.h"
+#include "vp10/common/scan.h"
 #include "vp10/common/seg_common.h"
 
 #include "vp10/encoder/encoder.h"
 #include "vp10/encoder/quantize.h"
 #include "vp10/encoder/rd.h"
 
+void vp10_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+                        tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  *eob_ptr = 0;  // Both output buffers were just zeroed; report eob as 0.
+}
+
+void vp10_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+                             const MACROBLOCKD_PLANE *pd,
+                             tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                             const scan_order *sc) {
+  // skip_block is obsolete at this level; the kernel is always passed 0.
+  const int skip_block = 0;
+
+  vp10_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
+                   p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                   pd->dequant, eob_ptr, sc->scan, sc->iscan);
+}
+
+void vp10_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+                            const MACROBLOCKD_PLANE *pd,
+                            tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                            const scan_order *sc) {
+  // skip_block is obsolete at this level; the kernel is always passed 0.
+  const int skip_block = 0;
+
+  vpx_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, p->quant,
+                 p->quant_shift, qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+                 sc->scan, sc->iscan);
+}
+
+void vp10_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+                             const MACROBLOCKD_PLANE *pd,
+                             tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                             const scan_order *sc) {
+  // skip_block is obsolete at this level; the kernel is always passed 0.
+  const int skip_block = 0;
+
+  (void)sc;  // Not forwarded: the DC kernel call takes no scan tables.
+
+  vpx_quantize_dc(coeff_ptr, n_coeffs, skip_block, p->round, p->quant_fp[0],
+                  qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_quantize_fp_facade(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc) {
+  // skip_block is obsolete at this level; the kernel is always passed 0.
+  const int skip_block = 0;
+
+  vp10_highbd_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
+                          p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                          pd->dequant, eob_ptr, sc->scan, sc->iscan);
+}
+
+void vp10_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
+                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+                                   tran_low_t *qcoeff_ptr,
+                                   const MACROBLOCKD_PLANE *pd,
+                                   tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                   const scan_order *sc) {
+  // skip_block is obsolete at this level; the kernel is always passed 0.
+  const int skip_block = 0;
+
+  vpx_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+                        p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                        pd->dequant, eob_ptr, sc->scan, sc->iscan);
+}
+
+void vp10_highbd_quantize_dc_facade(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc) {
+  // skip_block is obsolete at this level; the kernel is always passed 0.
+  const int skip_block = 0;
+
+  (void)sc;  // Not forwarded: the DC kernel call takes no scan tables.
+
+  vpx_highbd_quantize_dc(coeff_ptr, n_coeffs, skip_block, p->round,
+                         p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr,
+                         pd->dequant[0], eob_ptr);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp10_quantize_fp_32x32_facade(const tran_low_t *coeff_ptr,
+                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+                                   tran_low_t *qcoeff_ptr,
+                                   const MACROBLOCKD_PLANE *pd,
+                                   tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                   const scan_order *sc) {
+  // skip_block is obsolete at this level; the kernel is always passed 0.
+  const int skip_block = 0;
+
+  vp10_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
+                         p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                         pd->dequant, eob_ptr, sc->scan, sc->iscan);
+}
+
+void vp10_quantize_b_32x32_facade(const tran_low_t *coeff_ptr,
+                                  intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+                                  tran_low_t *qcoeff_ptr,
+                                  const MACROBLOCKD_PLANE *pd,
+                                  tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                  const scan_order *sc) {
+  // skip_block is obsolete at this level; the kernel is always passed 0.
+  const int skip_block = 0;
+
+  vpx_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+                       p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                       pd->dequant, eob_ptr, sc->scan, sc->iscan);
+}
+
+void vp10_quantize_dc_32x32_facade(const tran_low_t *coeff_ptr,
+                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+                                   tran_low_t *qcoeff_ptr,
+                                   const MACROBLOCKD_PLANE *pd,
+                                   tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                   const scan_order *sc) {
+  // skip_block is obsolete at this level; the kernel is always passed 0.
+  const int skip_block = 0;
+
+  (void)sc;  // Not forwarded: the DC kernel call takes no scan tables.
+  (void)n_coeffs;  // Not forwarded: the 32x32 DC kernel call takes no count.
+
+  vpx_quantize_dc_32x32(coeff_ptr, skip_block, p->round, p->quant_fp[0],
+                        qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_quantize_fp_32x32_facade(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc) {
+  // skip_block is obsolete at this level; the kernel is always passed 0.
+  const int skip_block = 0;
+
+  vp10_highbd_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
+                                p->round_fp, p->quant_fp, p->quant_shift,
+                                qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+                                sc->scan, sc->iscan);
+}
+
+void vp10_highbd_quantize_b_32x32_facade(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc) {
+  // skip_block is obsolete at this level; the kernel is always passed 0.
+  const int skip_block = 0;
+
+  vpx_highbd_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
+                              p->round, p->quant, p->quant_shift, qcoeff_ptr,
+                              dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan,
+                              sc->iscan);
+}
+
+void vp10_highbd_quantize_dc_32x32_facade(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc) {
+  // skip_block is obsolete at this level; the kernel is always passed 0.
+  const int skip_block = 0;
+
+  (void)sc;  // Not forwarded: the DC kernel call takes no scan tables.
+  (void)n_coeffs;  // Not forwarded: the 32x32 DC kernel call takes no count.
+
+  vpx_highbd_quantize_dc_32x32(coeff_ptr, skip_block, p->round, p->quant_fp[0],
+                               qcoeff_ptr, dqcoeff_ptr, pd->dequant[0],
+                               eob_ptr);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 void vp10_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        int skip_block,
                        const int16_t *zbin_ptr, const int16_t *round_ptr,

diff --git a/vp10/encoder/quantize.h b/vp10/encoder/quantize.h
index b44088e..9c0ab3f 100644
--- a/vp10/encoder/quantize.h
+++ b/vp10/encoder/quantize.h

@@ -12,12 +12,20 @@
 #define VP10_ENCODER_QUANTIZE_H_
 
 #include "./vpx_config.h"
+#include "vp10/common/scan.h"
 #include "vp10/encoder/block.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+typedef void (*VP10_QUANT_FACADE)(const tran_low_t *coeff_ptr,
+                                  intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+                                  tran_low_t *qcoeff_ptr,
+                                  const MACROBLOCKD_PLANE *pd,
+                                  tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                  const scan_order *sc);
+
 typedef struct {
   DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
   DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
@@ -38,7 +46,7 @@
 } QUANTS;
 
 void vp10_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
-                                const int16_t *scan, const int16_t *iscan);
+                                 const int16_t *scan, const int16_t *iscan);
 
 struct VP10_COMP;
 struct VP10Common;
@@ -55,6 +63,81 @@
 
 int vp10_qindex_to_quantizer(int qindex);
 
+void vp10_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+                        tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
+
+void vp10_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+                             const MACROBLOCKD_PLANE *pd,
+                             tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                             const scan_order *sc);
+
+void vp10_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+                            const MACROBLOCKD_PLANE *pd,
+                            tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                            const scan_order *sc);
+
+void vp10_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+                             const MACROBLOCKD_PLANE *pd,
+                             tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                             const scan_order *sc);
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_quantize_fp_facade(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc);
+
+void vp10_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
+                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+                                   tran_low_t *qcoeff_ptr,
+                                   const MACROBLOCKD_PLANE *pd,
+                                   tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                   const scan_order *sc);
+
+void vp10_highbd_quantize_dc_facade(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp10_quantize_fp_32x32_facade(const tran_low_t *coeff_ptr,
+                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+                                   tran_low_t *qcoeff_ptr,
+                                   const MACROBLOCKD_PLANE *pd,
+                                   tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                   const scan_order *sc);
+
+void vp10_quantize_b_32x32_facade(const tran_low_t *coeff_ptr,
+                                  intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+                                  tran_low_t *qcoeff_ptr,
+                                  const MACROBLOCKD_PLANE *pd,
+                                  tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                  const scan_order *sc);
+
+void vp10_quantize_dc_32x32_facade(const tran_low_t *coeff_ptr,
+                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+                                   tran_low_t *qcoeff_ptr,
+                                   const MACROBLOCKD_PLANE *pd,
+                                   tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                   const scan_order *sc);
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_quantize_fp_32x32_facade(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc);
+
+void vp10_highbd_quantize_b_32x32_facade(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc);
+
+void vp10_highbd_quantize_dc_32x32_facade(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #ifdef __cplusplus
 }  // extern "C"
 #endif

diff --git a/vp10/encoder/rd.c b/vp10/encoder/rd.c
index f4fdb24..5dcfa55 100644
--- a/vp10/encoder/rd.c
+++ b/vp10/encoder/rd.c

@@ -75,7 +75,10 @@
       vp10_cost_tokens(cpi->y_mode_costs[i][j], vp10_kf_y_mode_prob[i][j],
                       vp10_intra_mode_tree);
 
-  vp10_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp10_intra_mode_tree);
+  for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+    vp10_cost_tokens(cpi->mbmode_cost[i], fc->y_mode_prob[i],
+                     vp10_intra_mode_tree);
+
   for (i = 0; i < INTRA_MODES; ++i)
     vp10_cost_tokens(cpi->intra_uv_mode_cost[i],
                      fc->uv_mode_prob[i], vp10_intra_mode_tree);
@@ -84,6 +87,44 @@
     vp10_cost_tokens(cpi->switchable_interp_costs[i],
                     fc->switchable_interp_prob[i], vp10_switchable_interp_tree);
 
+  for (i = 0; i < PALETTE_BLOCK_SIZES; ++i) {
+    vp10_cost_tokens(cpi->palette_y_size_cost[i],
+                     vp10_default_palette_y_size_prob[i],
+                     vp10_palette_size_tree);
+    vp10_cost_tokens(cpi->palette_uv_size_cost[i],
+                     vp10_default_palette_uv_size_prob[i],
+                     vp10_palette_size_tree);
+  }
+
+  for (i = 0; i < PALETTE_MAX_SIZE - 1; ++i)
+    for (j = 0; j < PALETTE_COLOR_CONTEXTS; ++j) {
+      vp10_cost_tokens(cpi->palette_y_color_cost[i][j],
+                       vp10_default_palette_y_color_prob[i][j],
+                       vp10_palette_color_tree[i]);
+      vp10_cost_tokens(cpi->palette_uv_color_cost[i][j],
+                       vp10_default_palette_uv_color_prob[i][j],
+                       vp10_palette_color_tree[i]);
+    }
+#if CONFIG_EXT_TX
+  for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+    int s;
+    for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+      if (use_inter_ext_tx_for_txsize[s][i]) {
+        vp10_cost_tokens(cpi->inter_tx_type_costs[s][i],
+                         fc->inter_ext_tx_prob[s][i],
+                         vp10_ext_tx_inter_tree[s]);
+      }
+    }
+    for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+      if (use_intra_ext_tx_for_txsize[s][i]) {
+        for (j = 0; j < INTRA_MODES; ++j)
+          vp10_cost_tokens(cpi->intra_tx_type_costs[s][i][j],
+                           fc->intra_ext_tx_prob[s][i][j],
+                           vp10_ext_tx_intra_tree[s]);
+      }
+    }
+  }
+#else
   for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
     for (j = 0; j < TX_TYPES; ++j)
       vp10_cost_tokens(cpi->intra_tx_type_costs[i][j],
@@ -95,6 +136,7 @@
                      fc->inter_ext_tx_prob[i],
                      vp10_ext_tx_tree);
   }
+#endif  // CONFIG_EXT_TX
 }
 
 static void fill_token_costs(vp10_coeff_cost *c,
@@ -311,10 +353,26 @@
                              cm->allow_high_precision_mv ? x->nmvcost_hp
                                                          : x->nmvcost,
                              &cm->fc->nmvc, cm->allow_high_precision_mv);
+#if CONFIG_REF_MV
+    for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) {
+      cpi->newmv_mode_cost[i][0] = vp10_cost_bit(cm->fc->newmv_prob[i], 0);
+      cpi->newmv_mode_cost[i][1] = vp10_cost_bit(cm->fc->newmv_prob[i], 1);
+    }
 
+    for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i) {
+      cpi->zeromv_mode_cost[i][0] = vp10_cost_bit(cm->fc->zeromv_prob[i], 0);
+      cpi->zeromv_mode_cost[i][1] = vp10_cost_bit(cm->fc->zeromv_prob[i], 1);
+    }
+
+    for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) {
+      cpi->refmv_mode_cost[i][0] = vp10_cost_bit(cm->fc->refmv_prob[i], 0);
+      cpi->refmv_mode_cost[i][1] = vp10_cost_bit(cm->fc->refmv_prob[i], 1);
+    }
+#else
     for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
       vp10_cost_tokens((int *)cpi->inter_mode_cost[i],
                       cm->fc->inter_mode_probs[i], vp10_inter_mode_tree);
+#endif
   }
 }
 
@@ -563,8 +621,11 @@
                              const MACROBLOCKD *const xd) {
   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const int ctx = vp10_get_pred_context_switchable_interp(xd);
+#if CONFIG_EXT_INTERP
+  if (!vp10_is_interp_needed(xd)) return 0;
+#endif  // CONFIG_EXT_INTERP
   return SWITCHABLE_INTERP_RATE_FACTOR *
-             cpi->switchable_interp_costs[ctx][mbmi->interp_filter];
+      cpi->switchable_interp_costs[ctx][mbmi->interp_filter];
 }
 
 void vp10_set_rd_speed_thresholds(VP10_COMP *cpi) {
@@ -578,10 +639,20 @@
 
   if (sf->adaptive_rd_thresh) {
     rd->thresh_mult[THR_NEARESTMV] = 300;
+#if CONFIG_EXT_REFS
+    rd->thresh_mult[THR_NEARESTL2] = 300;
+    rd->thresh_mult[THR_NEARESTL3] = 300;
+    rd->thresh_mult[THR_NEARESTL4] = 300;
+#endif  // CONFIG_EXT_REFS
     rd->thresh_mult[THR_NEARESTG] = 300;
     rd->thresh_mult[THR_NEARESTA] = 300;
   } else {
     rd->thresh_mult[THR_NEARESTMV] = 0;
+#if CONFIG_EXT_REFS
+    rd->thresh_mult[THR_NEARESTL2] = 0;
+    rd->thresh_mult[THR_NEARESTL3] = 0;
+    rd->thresh_mult[THR_NEARESTL4] = 0;
+#endif  // CONFIG_EXT_REFS
     rd->thresh_mult[THR_NEARESTG] = 0;
     rd->thresh_mult[THR_NEARESTA] = 0;
   }
@@ -589,26 +660,61 @@
   rd->thresh_mult[THR_DC] += 1000;
 
   rd->thresh_mult[THR_NEWMV] += 1000;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_NEWL2] += 1000;
+  rd->thresh_mult[THR_NEWL3] += 1000;
+  rd->thresh_mult[THR_NEWL4] += 1000;
+#endif  // CONFIG_EXT_REFS
   rd->thresh_mult[THR_NEWA] += 1000;
   rd->thresh_mult[THR_NEWG] += 1000;
 
   rd->thresh_mult[THR_NEARMV] += 1000;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_NEARL2] += 1000;
+  rd->thresh_mult[THR_NEARL3] += 1000;
+  rd->thresh_mult[THR_NEARL4] += 1000;
+#endif  // CONFIG_EXT_REFS
   rd->thresh_mult[THR_NEARA] += 1000;
-  rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
-  rd->thresh_mult[THR_COMP_NEARESTGA] += 1000;
+  rd->thresh_mult[THR_NEARG] += 1000;
+
+  rd->thresh_mult[THR_ZEROMV] += 2000;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_ZEROL2] += 2000;
+  rd->thresh_mult[THR_ZEROL3] += 2000;
+  rd->thresh_mult[THR_ZEROL4] += 2000;
+#endif  // CONFIG_EXT_REFS
+  rd->thresh_mult[THR_ZEROG] += 2000;
+  rd->thresh_mult[THR_ZEROA] += 2000;
 
   rd->thresh_mult[THR_TM] += 1000;
 
+  rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_NEARESTL2A] += 1000;
+  rd->thresh_mult[THR_COMP_NEARESTL3A] += 1000;
+  rd->thresh_mult[THR_COMP_NEARESTL4A] += 1000;
+#endif  // CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_NEARESTGA] += 1000;
+
   rd->thresh_mult[THR_COMP_NEARLA] += 1500;
   rd->thresh_mult[THR_COMP_NEWLA] += 2000;
-  rd->thresh_mult[THR_NEARG] += 1000;
   rd->thresh_mult[THR_COMP_NEARGA] += 1500;
   rd->thresh_mult[THR_COMP_NEWGA] += 2000;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_NEARL2A] += 1500;
+  rd->thresh_mult[THR_COMP_NEWL2A] += 2000;
+  rd->thresh_mult[THR_COMP_NEARL3A] += 1500;
+  rd->thresh_mult[THR_COMP_NEWL3A] += 2000;
+  rd->thresh_mult[THR_COMP_NEARL4A] += 1500;
+  rd->thresh_mult[THR_COMP_NEWL4A] += 2000;
+#endif  // CONFIG_EXT_REFS
 
-  rd->thresh_mult[THR_ZEROMV] += 2000;
-  rd->thresh_mult[THR_ZEROG] += 2000;
-  rd->thresh_mult[THR_ZEROA] += 2000;
   rd->thresh_mult[THR_COMP_ZEROLA] += 2500;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_ZEROL2A] += 2500;
+  rd->thresh_mult[THR_COMP_ZEROL3A] += 2500;
+  rd->thresh_mult[THR_COMP_ZEROL4A] += 2500;
+#endif  // CONFIG_EXT_REFS
   rd->thresh_mult[THR_COMP_ZEROGA] += 2500;
 
   rd->thresh_mult[THR_H_PRED] += 2000;
@@ -622,9 +728,15 @@
 }
 
 void vp10_set_rd_speed_thresholds_sub8x8(VP10_COMP *cpi) {
-  static const int thresh_mult[2][MAX_REFS] =
-      {{2500, 2500, 2500, 4500, 4500, 2500},
-       {2000, 2000, 2000, 4000, 4000, 2000}};
+  static const int thresh_mult[2][MAX_REFS] = {
+#if CONFIG_EXT_REFS
+    {2500, 2500, 2500, 2500, 2500, 2500, 4500, 4500, 4500, 4500, 4500, 2500},
+    {2000, 2000, 2000, 2000, 2000, 2000, 4000, 4000, 4000, 4000, 4000, 2000}
+#else
+    {2500, 2500, 2500, 4500, 4500, 2500},
+    {2000, 2000, 2000, 4000, 4000, 2000}
+#endif  // CONFIG_EXT_REFS
+  };
   RD_OPT *const rd = &cpi->rd;
   const int idx = cpi->oxcf.mode == BEST;
   memcpy(rd->thresh_mult_sub8x8, thresh_mult[idx], sizeof(thresh_mult[idx]));

diff --git a/vp10/encoder/rd.h b/vp10/encoder/rd.h
index cd58bf8..42261ac 100644
--- a/vp10/encoder/rd.h
+++ b/vp10/encoder/rd.h

@@ -33,8 +33,17 @@
 
 #define INVALID_MV 0x80008000
 
+#if CONFIG_EXT_REFS
+#define MAX_MODES 54
+#else
 #define MAX_MODES 30
+#endif  // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+#define MAX_REFS  12
+#else
 #define MAX_REFS  6
+#endif  // CONFIG_EXT_REFS
 
 #define RD_THRESH_MAX_FACT 64
 #define RD_THRESH_INC      1
@@ -43,34 +52,72 @@
 // const MODE_DEFINITION vp10_mode_order[MAX_MODES] used in the rd code.
 typedef enum {
   THR_NEARESTMV,
+#if CONFIG_EXT_REFS
+  THR_NEARESTL2,
+  THR_NEARESTL3,
+  THR_NEARESTL4,
+#endif  // CONFIG_EXT_REFS
   THR_NEARESTA,
   THR_NEARESTG,
 
   THR_DC,
 
   THR_NEWMV,
+#if CONFIG_EXT_REFS
+  THR_NEWL2,
+  THR_NEWL3,
+  THR_NEWL4,
+#endif  // CONFIG_EXT_REFS
   THR_NEWA,
   THR_NEWG,
 
   THR_NEARMV,
+#if CONFIG_EXT_REFS
+  THR_NEARL2,
+  THR_NEARL3,
+  THR_NEARL4,
+#endif  // CONFIG_EXT_REFS
   THR_NEARA,
   THR_NEARG,
 
   THR_ZEROMV,
+#if CONFIG_EXT_REFS
+  THR_ZEROL2,
+  THR_ZEROL3,
+  THR_ZEROL4,
+#endif  // CONFIG_EXT_REFS
   THR_ZEROG,
   THR_ZEROA,
 
   THR_COMP_NEARESTLA,
+#if CONFIG_EXT_REFS
+  THR_COMP_NEARESTL2A,
+  THR_COMP_NEARESTL3A,
+  THR_COMP_NEARESTL4A,
+#endif  // CONFIG_EXT_REFS
   THR_COMP_NEARESTGA,
 
   THR_TM,
 
   THR_COMP_NEARLA,
   THR_COMP_NEWLA,
+#if CONFIG_EXT_REFS
+  THR_COMP_NEARL2A,
+  THR_COMP_NEWL2A,
+  THR_COMP_NEARL3A,
+  THR_COMP_NEWL3A,
+  THR_COMP_NEARL4A,
+  THR_COMP_NEWL4A,
+#endif  // CONFIG_EXT_REFS
   THR_COMP_NEARGA,
   THR_COMP_NEWGA,
 
   THR_COMP_ZEROLA,
+#if CONFIG_EXT_REFS
+  THR_COMP_ZEROL2A,
+  THR_COMP_ZEROL3A,
+  THR_COMP_ZEROL4A,
+#endif  // CONFIG_EXT_REFS
   THR_COMP_ZEROGA,
 
   THR_H_PRED,
@@ -85,9 +132,19 @@
 
 typedef enum {
   THR_LAST,
+#if CONFIG_EXT_REFS
+  THR_LAST2,
+  THR_LAST3,
+  THR_LAST4,
+#endif  // CONFIG_EXT_REFS
   THR_GOLD,
   THR_ALTR,
   THR_COMP_LA,
+#if CONFIG_EXT_REFS
+  THR_COMP_L2A,
+  THR_COMP_L3A,
+  THR_COMP_L4A,
+#endif  // CONFIG_EXT_REFS
   THR_COMP_GA,
   THR_INTRA,
 } THR_MODES_SUB8X8;

diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 90a716d..ca978ba 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c

@@ -35,13 +35,38 @@
 #include "vp10/encoder/encodemb.h"
 #include "vp10/encoder/encodemv.h"
 #include "vp10/encoder/encoder.h"
+#include "vp10/encoder/hybrid_fwd_txfm.h"
 #include "vp10/encoder/mcomp.h"
+#include "vp10/encoder/palette.h"
 #include "vp10/encoder/quantize.h"
 #include "vp10/encoder/ratectrl.h"
 #include "vp10/encoder/rd.h"
 #include "vp10/encoder/rdopt.h"
 #include "vp10/encoder/aq_variance.h"
 
+#if CONFIG_EXT_REFS
+
+#define LAST_FRAME_MODE_MASK    ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
+                                 (1 << LAST2_FRAME) | (1 << INTRA_FRAME) | \
+                                 (1 << LAST3_FRAME) | (1 << LAST4_FRAME))
+#define LAST2_FRAME_MODE_MASK   ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
+                                 (1 << LAST_FRAME) | (1 << INTRA_FRAME) | \
+                                 (1 << LAST3_FRAME) | (1 << LAST4_FRAME))
+#define LAST3_FRAME_MODE_MASK   ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
+                                 (1 << LAST_FRAME) | (1 << INTRA_FRAME) | \
+                                 (1 << LAST2_FRAME) | (1 << LAST4_FRAME))
+#define LAST4_FRAME_MODE_MASK   ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
+                                 (1 << LAST_FRAME) | (1 << INTRA_FRAME) | \
+                                 (1 << LAST2_FRAME) | (1 << LAST3_FRAME))
+#define GOLDEN_FRAME_MODE_MASK  ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | \
+                                 (1 << LAST2_FRAME) | (1 << INTRA_FRAME) | \
+                                 (1 << LAST3_FRAME) | (1 << LAST4_FRAME))
+#define ALT_REF_MODE_MASK       ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | \
+                                 (1 << LAST2_FRAME) | (1 << INTRA_FRAME) | \
+                                 (1 << LAST3_FRAME) | (1 << LAST4_FRAME))
+
+#else
+
 #define LAST_FRAME_MODE_MASK    ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
                                  (1 << INTRA_FRAME))
 #define GOLDEN_FRAME_MODE_MASK  ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | \
@@ -49,12 +74,19 @@
 #define ALT_REF_MODE_MASK       ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | \
                                  (1 << INTRA_FRAME))
 
+#endif  // CONFIG_EXT_REFS
+
 #define SECOND_REF_FRAME_MASK   ((1 << ALTREF_FRAME) | 0x01)
 
 #define MIN_EARLY_TERM_INDEX    3
 #define NEW_MV_DISCOUNT_FACTOR  8
 
+#if CONFIG_EXT_TX
+const double ext_tx_th = 0.98;
+#else
 const double ext_tx_th = 0.99;
+#endif
+
 
 typedef struct {
   PREDICTION_MODE mode;
@@ -66,6 +98,9 @@
 } REF_DEFINITION;
 
 struct rdcost_block_args {
+#if CONFIG_VAR_TX
+  const VP10_COMP *cpi;
+#endif
   MACROBLOCK *x;
   ENTROPY_CONTEXT t_above[16];
   ENTROPY_CONTEXT t_left[16];
@@ -83,34 +118,72 @@
 #define LAST_NEW_MV_INDEX 6
 static const MODE_DEFINITION vp10_mode_order[MAX_MODES] = {
   {NEARESTMV, {LAST_FRAME,   NONE}},
+#if CONFIG_EXT_REFS
+  {NEARESTMV, {LAST2_FRAME,  NONE}},
+  {NEARESTMV, {LAST3_FRAME,  NONE}},
+  {NEARESTMV, {LAST4_FRAME,  NONE}},
+#endif  // CONFIG_EXT_REFS
   {NEARESTMV, {ALTREF_FRAME, NONE}},
   {NEARESTMV, {GOLDEN_FRAME, NONE}},
 
   {DC_PRED,   {INTRA_FRAME,  NONE}},
 
   {NEWMV,     {LAST_FRAME,   NONE}},
+#if CONFIG_EXT_REFS
+  {NEWMV,     {LAST2_FRAME,  NONE}},
+  {NEWMV,     {LAST3_FRAME,  NONE}},
+  {NEWMV,     {LAST4_FRAME,  NONE}},
+#endif  // CONFIG_EXT_REFS
   {NEWMV,     {ALTREF_FRAME, NONE}},
   {NEWMV,     {GOLDEN_FRAME, NONE}},
 
   {NEARMV,    {LAST_FRAME,   NONE}},
+#if CONFIG_EXT_REFS
+  {NEARMV,    {LAST2_FRAME,  NONE}},
+  {NEARMV,    {LAST3_FRAME,  NONE}},
+  {NEARMV,    {LAST4_FRAME,  NONE}},
+#endif  // CONFIG_EXT_REFS
   {NEARMV,    {ALTREF_FRAME, NONE}},
   {NEARMV,    {GOLDEN_FRAME, NONE}},
 
   {ZEROMV,    {LAST_FRAME,   NONE}},
+#if CONFIG_EXT_REFS
+  {ZEROMV,    {LAST2_FRAME,  NONE}},
+  {ZEROMV,    {LAST3_FRAME,  NONE}},
+  {ZEROMV,    {LAST4_FRAME,  NONE}},
+#endif  // CONFIG_EXT_REFS
   {ZEROMV,    {GOLDEN_FRAME, NONE}},
   {ZEROMV,    {ALTREF_FRAME, NONE}},
 
   {NEARESTMV, {LAST_FRAME,   ALTREF_FRAME}},
+#if CONFIG_EXT_REFS
+  {NEARESTMV, {LAST2_FRAME,  ALTREF_FRAME}},
+  {NEARESTMV, {LAST3_FRAME,  ALTREF_FRAME}},
+  {NEARESTMV, {LAST4_FRAME,  ALTREF_FRAME}},
+#endif  // CONFIG_EXT_REFS
   {NEARESTMV, {GOLDEN_FRAME, ALTREF_FRAME}},
 
   {TM_PRED,   {INTRA_FRAME,  NONE}},
 
   {NEARMV,    {LAST_FRAME,   ALTREF_FRAME}},
   {NEWMV,     {LAST_FRAME,   ALTREF_FRAME}},
+#if CONFIG_EXT_REFS
+  {NEARMV,    {LAST2_FRAME,  ALTREF_FRAME}},
+  {NEWMV,     {LAST2_FRAME,  ALTREF_FRAME}},
+  {NEARMV,    {LAST3_FRAME,  ALTREF_FRAME}},
+  {NEWMV,     {LAST3_FRAME,  ALTREF_FRAME}},
+  {NEARMV,    {LAST4_FRAME,  ALTREF_FRAME}},
+  {NEWMV,     {LAST4_FRAME,  ALTREF_FRAME}},
+#endif  // CONFIG_EXT_REFS
   {NEARMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
   {NEWMV,     {GOLDEN_FRAME, ALTREF_FRAME}},
 
   {ZEROMV,    {LAST_FRAME,   ALTREF_FRAME}},
+#if CONFIG_EXT_REFS  // Row order mirrors THR_COMP_ZEROL{2,3,4}A in rd.h.
+  {ZEROMV,    {LAST2_FRAME,  ALTREF_FRAME}},
+  {ZEROMV,    {LAST3_FRAME,  ALTREF_FRAME}},
+  {ZEROMV,    {LAST4_FRAME,  ALTREF_FRAME}},
+#endif  // CONFIG_EXT_REFS
   {ZEROMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
 
   {H_PRED,    {INTRA_FRAME,  NONE}},
@@ -125,9 +198,19 @@
 
 static const REF_DEFINITION vp10_ref_order[MAX_REFS] = {
   {{LAST_FRAME,   NONE}},
+#if CONFIG_EXT_REFS
+  {{LAST2_FRAME,  NONE}},
+  {{LAST3_FRAME,  NONE}},
+  {{LAST4_FRAME,  NONE}},
+#endif  // CONFIG_EXT_REFS
   {{GOLDEN_FRAME, NONE}},
   {{ALTREF_FRAME, NONE}},
   {{LAST_FRAME,   ALTREF_FRAME}},
+#if CONFIG_EXT_REFS
+  {{LAST2_FRAME,  ALTREF_FRAME}},
+  {{LAST3_FRAME,  ALTREF_FRAME}},
+  {{LAST4_FRAME,  ALTREF_FRAME}},
+#endif  // CONFIG_EXT_REFS
   {{GOLDEN_FRAME, ALTREF_FRAME}},
   {{INTRA_FRAME,  NONE}},
 };
@@ -346,7 +429,11 @@
 };
 static int cost_coeffs(MACROBLOCK *x,
                        int plane, int block,
+#if CONFIG_VAR_TX
+                       int coeff_ctx,
+#else
                        ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
+#endif
                        TX_SIZE tx_size,
                        const int16_t *scan, const int16_t *nb,
                        int use_fast_coef_costing) {
@@ -361,7 +448,11 @@
   unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
                    x->token_costs[tx_size][type][is_inter_block(mbmi)];
   uint8_t token_cache[32 * 32];
+#if CONFIG_VAR_TX
+  int pt = coeff_ctx;
+#else
   int pt = combine_entropy_contexts(*A, *L);
+#endif
   int c, cost;
 #if CONFIG_VP9_HIGHBITDEPTH
   const int16_t *cat6_high_cost = vp10_get_high_cost_table(xd->bd);
@@ -369,9 +460,11 @@
   const int16_t *cat6_high_cost = vp10_get_high_cost_table(8);
 #endif
 
+#if !CONFIG_VAR_TX && !CONFIG_SUPERTX
   // Check for consistency of tx_size with mode info
   assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
                               : get_uv_tx_size(mbmi, pd) == tx_size);
+#endif  // !CONFIG_VAR_TX && !CONFIG_SUPERTX
 
   if (eob == 0) {
     // single eob token
@@ -425,8 +518,10 @@
     }
   }
 
+#if !CONFIG_VAR_TX
   // is eob first coefficient;
   *A = *L = (c > 0);
+#endif
 
   return cost;
 }
@@ -454,10 +549,23 @@
 
 static int rate_block(int plane, int block, int blk_row, int blk_col,
                       TX_SIZE tx_size, struct rdcost_block_args* args) {
-  return cost_coeffs(args->x, plane, block, args->t_above + blk_col,
-                     args->t_left + blk_row, tx_size,
-                     args->so->scan, args->so->neighbors,
+#if CONFIG_VAR_TX
+  int coeff_ctx = combine_entropy_contexts(*(args->t_above + blk_col),
+                                           *(args->t_left + blk_row));
+  int coeff_cost = cost_coeffs(args->x, plane, block, coeff_ctx,
+                               tx_size, args->so->scan, args->so->neighbors,
+                               args->use_fast_coef_costing);
+  const struct macroblock_plane *p = &args->x->plane[plane];
+  *(args->t_above + blk_col) = !(p->eobs[block] == 0);
+  *(args->t_left  + blk_row) = !(p->eobs[block] == 0);
+  return coeff_cost;
+#else
+  return cost_coeffs(args->x, plane, block,
+                     args->t_above + blk_col,
+                     args->t_left + blk_row,
+                     tx_size, args->so->scan, args->so->neighbors,
                      args->use_fast_coef_costing);
+#endif
 }
 
 static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
@@ -476,24 +584,55 @@
     return;
 
   if (!is_inter_block(mbmi)) {
+#if CONFIG_VAR_TX
+    struct encode_b_args arg = {x, NULL, &mbmi->skip};
+#if CONFIG_VP9_HIGHBITDEPTH
+    vp10_encode_block_intra(plane, block, blk_row, blk_col,
+                            plane_bsize, tx_size, &arg);
+    dist_block(x, plane, block, tx_size, &dist, &sse);
+#else
+    uint8_t *dst, *src;
+    int src_stride = x->plane[plane].src.stride;
+    int dst_stride = xd->plane[plane].dst.stride;
+    unsigned int tmp_sse;
+    PREDICTION_MODE mode = (plane == 0) ?
+        get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
+
+    src = &x->plane[plane].src.buf[4 * (blk_row * src_stride + blk_col)];
+    dst = &xd->plane[plane].dst.buf[4 * (blk_row * dst_stride + blk_col)];
+    vp10_predict_intra_block(xd, b_width_log2_lookup[plane_bsize],
+                             b_height_log2_lookup[plane_bsize],
+                             tx_size, mode, dst, dst_stride,
+                             dst, dst_stride, blk_col, blk_row, plane);
+    args->cpi->fn_ptr[txsize_to_bsize[tx_size]].vf(src, src_stride,
+                                                   dst, dst_stride, &tmp_sse);
+    sse = (int64_t)tmp_sse * 16;
+    vp10_encode_block_intra(plane, block, blk_row, blk_col,
+                            plane_bsize, tx_size, &arg);
+    args->cpi->fn_ptr[txsize_to_bsize[tx_size]].vf(src, src_stride,
+                                                   dst, dst_stride, &tmp_sse);
+    dist = (int64_t)tmp_sse * 16;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#else
     struct encode_b_args arg = {x, NULL, &mbmi->skip};
     vp10_encode_block_intra(plane, block, blk_row, blk_col,
                             plane_bsize, tx_size, &arg);
     dist_block(x, plane, block, tx_size, &dist, &sse);
+#endif
   } else if (max_txsize_lookup[plane_bsize] == tx_size) {
     if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
         SKIP_TXFM_NONE) {
       // full forward transform and quantization
       vp10_xform_quant(x, plane, block, blk_row, blk_col,
-                       plane_bsize, tx_size);
+                       plane_bsize, tx_size, VP10_XFORM_QUANT_B);
       dist_block(x, plane, block, tx_size, &dist, &sse);
     } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
                SKIP_TXFM_AC_ONLY) {
       // compute DC coefficient
       tran_low_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);
       tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
-      vp10_xform_quant_dc(x, plane, block, blk_row, blk_col,
-                          plane_bsize, tx_size);
+      vp10_xform_quant(x, plane, block, blk_row, blk_col,
+                          plane_bsize, tx_size, VP10_XFORM_QUANT_DC);
       sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
       dist = sse;
       if (x->plane[plane].eobs[block]) {
@@ -517,7 +656,8 @@
     }
   } else {
     // full forward transform and quantization
-    vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
+    vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                     VP10_XFORM_QUANT_B);
     dist_block(x, plane, block, tx_size, &dist, &sse);
   }
 
@@ -551,6 +691,9 @@
 }
 
 static void txfm_rd_in_plane(MACROBLOCK *x,
+#if CONFIG_VAR_TX
+                             const VP10_COMP *cpi,
+#endif
                              int *rate, int64_t *distortion,
                              int *skippable, int64_t *sse,
                              int64_t ref_best_rd, int plane,
@@ -562,6 +705,9 @@
   struct rdcost_block_args args;
   vp10_zero(args);
   args.x = x;
+#if CONFIG_VAR_TX
+  args.cpi = cpi;
+#endif
   args.best_rd = ref_best_rd;
   args.use_fast_coef_costing = use_fast_coef_casting;
   args.skippable = 1;
@@ -571,11 +717,11 @@
 
   vp10_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
 
-  tx_type = get_tx_type(pd->plane_type, xd, 0);
-  args.so = get_scan(tx_size, tx_type);
+  tx_type = get_tx_type(pd->plane_type, xd, 0, tx_size);
+  args.so = get_scan(tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
 
   vp10_foreach_transformed_block_in_plane(xd, bsize, plane,
-                                         block_rd_txfm, &args);
+                                          block_rd_txfm, &args);
   if (args.exit_early) {
     *rate       = INT_MAX;
     *distortion = INT64_MAX;
@@ -589,6 +735,54 @@
   }
 }
 
+#if CONFIG_SUPERTX
+void vp10_txfm_rd_in_plane_supertx(MACROBLOCK *x,
+#if CONFIG_VAR_TX
+                                   const VP10_COMP *cpi,
+#endif
+                                   int *rate, int64_t *distortion,
+                                   int *skippable, int64_t *sse,
+                                   int64_t ref_best_rd, int plane,
+                                   BLOCK_SIZE bsize, TX_SIZE tx_size,
+                                   int use_fast_coef_casting) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  struct rdcost_block_args args;
+  TX_TYPE tx_type;
+
+  vp10_zero(args);
+  args.x = x;
+#if CONFIG_VAR_TX
+  args.cpi = cpi;
+#endif
+  args.best_rd = ref_best_rd;
+  args.use_fast_coef_costing = use_fast_coef_casting;
+
+  if (plane == 0)
+    xd->mi[0]->mbmi.tx_size = tx_size;
+
+  vp10_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
+
+  tx_type = get_tx_type(pd->plane_type, xd, 0, tx_size);
+  args.so = get_scan(tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
+
+  block_rd_txfm(plane, 0, 0, 0, get_plane_block_size(bsize, pd),
+                tx_size, &args);
+
+  if (args.exit_early) {
+    *rate       = INT_MAX;
+    *distortion = INT64_MAX;
+    *sse        = INT64_MAX;
+    *skippable  = 0;
+  } else {
+    *distortion = args.this_dist;
+    *rate       = args.this_rate;
+    *sse        = args.this_sse;
+    *skippable  = !x->plane[plane].eobs[0];
+  }
+}
+#endif  // CONFIG_SUPERTX
+
 static void choose_largest_tx_size(VP10_COMP *cpi, MACROBLOCK *x,
                                    int *rate, int64_t *distortion,
                                    int *skip, int64_t *sse,
@@ -599,21 +793,92 @@
   const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-
   TX_TYPE tx_type, best_tx_type = DCT_DCT;
   int r, s;
   int64_t d, psse, this_rd, best_rd = INT64_MAX;
   vpx_prob skip_prob = vp10_get_skip_prob(cm, xd);
   int  s0 = vp10_cost_bit(skip_prob, 0);
   int  s1 = vp10_cost_bit(skip_prob, 1);
+#if CONFIG_EXT_TX
+  int ext_tx_set;
+#endif  // CONFIG_EXT_TX
   const int is_inter = is_inter_block(mbmi);
 
   mbmi->tx_size = VPXMIN(max_tx_size, largest_tx_size);
+
+#if CONFIG_EXT_TX
+  ext_tx_set = get_ext_tx_set(mbmi->tx_size, bs, is_inter);
+
+  if (is_inter &&
+      get_ext_tx_types(mbmi->tx_size, bs, is_inter) > 1 &&
+      !xd->lossless[mbmi->segment_id]) {
+    for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+      if (is_inter) {
+        if (!ext_tx_used_inter[ext_tx_set][tx_type])
+          continue;
+      } else {
+        if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
+          if (tx_type != intra_mode_to_tx_type_context[mbmi->mode])
+            continue;
+        }
+        if (!ext_tx_used_intra[ext_tx_set][tx_type])
+          continue;
+      }
+
+      mbmi->tx_type = tx_type;
+      if (ext_tx_set == 1 &&
+          mbmi->tx_type >= DST_ADST && mbmi->tx_type < IDTX &&
+          best_tx_type == DCT_DCT) {
+        tx_type = IDTX - 1;
+        continue;
+      }
+
+      txfm_rd_in_plane(x,
+#if CONFIG_VAR_TX
+                       cpi,
+#endif
+                       &r, &d, &s,
+                       &psse, ref_best_rd, 0, bs, mbmi->tx_size,
+                       cpi->sf.use_fast_coef_costing);
+
+      if (r == INT_MAX)
+        continue;
+      if (get_ext_tx_types(mbmi->tx_size, bs, is_inter) > 1) {
+        if (is_inter) {
+          if (ext_tx_set > 0)
+            r += cpi->inter_tx_type_costs[ext_tx_set]
+                                         [mbmi->tx_size][mbmi->tx_type];
+        } else {
+          if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
+            r += cpi->intra_tx_type_costs[ext_tx_set][mbmi->tx_size]
+                                         [mbmi->mode][mbmi->tx_type];
+        }
+      }
+
+      if (s)
+        this_rd = RDCOST(x->rdmult, x->rddiv, s1, psse);
+      else
+        this_rd = RDCOST(x->rdmult, x->rddiv, r + s0, d);
+      if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] && !s)
+        this_rd = VPXMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, psse));
+
+      if (this_rd < ((best_tx_type == DCT_DCT) ? ext_tx_th : 1) * best_rd) {
+        best_rd = this_rd;
+        best_tx_type = mbmi->tx_type;
+      }
+    }
+  }
+
+#else  // CONFIG_EXT_TX
   if (mbmi->tx_size < TX_32X32 &&
       !xd->lossless[mbmi->segment_id]) {
     for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
       mbmi->tx_type = tx_type;
-      txfm_rd_in_plane(x, &r, &d, &s,
+      txfm_rd_in_plane(x,
+#if CONFIG_VAR_TX
+                       cpi,
+#endif
+                       &r, &d, &s,
                        &psse, ref_best_rd, 0, bs, mbmi->tx_size,
                        cpi->sf.use_fast_coef_costing);
       if (r == INT_MAX)
@@ -637,10 +902,33 @@
       }
     }
   }
+#endif  // CONFIG_EXT_TX
   mbmi->tx_type = best_tx_type;
-  txfm_rd_in_plane(x, rate, distortion, skip,
+
+  txfm_rd_in_plane(x,
+#if CONFIG_VAR_TX
+                   cpi,
+#endif
+                   rate, distortion, skip,
                    sse, ref_best_rd, 0, bs,
                    mbmi->tx_size, cpi->sf.use_fast_coef_costing);
+
+#if CONFIG_EXT_TX
+  if (get_ext_tx_types(mbmi->tx_size, bs, is_inter) > 1 &&
+      !xd->lossless[mbmi->segment_id] && *rate != INT_MAX) {
+    int ext_tx_set = get_ext_tx_set(mbmi->tx_size, bs, is_inter);
+    if (is_inter) {
+      if (ext_tx_set > 0)
+        *rate += cpi->inter_tx_type_costs[ext_tx_set][mbmi->tx_size]
+                                                      [mbmi->tx_type];
+    } else {
+      if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
+        *rate +=
+            cpi->intra_tx_type_costs[ext_tx_set][mbmi->tx_size]
+                                                 [mbmi->mode][mbmi->tx_type];
+    }
+  }
+#else
   if (mbmi->tx_size < TX_32X32 && !xd->lossless[mbmi->segment_id] &&
       *rate != INT_MAX) {
     if (is_inter)
@@ -650,6 +938,7 @@
           [intra_mode_to_tx_type_context[mbmi->mode]]
           [mbmi->tx_type];
   }
+#endif  // CONFIG_EXT_TX
 }
 
 static void choose_smallest_tx_size(VP10_COMP *cpi, MACROBLOCK *x,
@@ -662,7 +951,11 @@
 
   mbmi->tx_size = TX_4X4;
 
-  txfm_rd_in_plane(x, rate, distortion, skip,
+  txfm_rd_in_plane(x,
+#if CONFIG_VAR_TX
+                   cpi,
+#endif
+                   rate, distortion, skip,
                    sse, ref_best_rd, 0, bs,
                    mbmi->tx_size, cpi->sf.use_fast_coef_costing);
 }
@@ -690,6 +983,9 @@
   const int tx_select = cm->tx_mode == TX_MODE_SELECT;
   TX_TYPE tx_type, best_tx_type = DCT_DCT;
   const int is_inter = is_inter_block(mbmi);
+#if CONFIG_EXT_TX
+  int ext_tx_set;
+#endif  // CONFIG_EXT_TX
 
   const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
   assert(skip_prob > 0);
@@ -722,11 +1018,56 @@
           r_tx_size += vp10_cost_one(tx_probs[m]);
       }
 
+#if CONFIG_EXT_TX
+      ext_tx_set = get_ext_tx_set(n, bs, is_inter);
+      if (is_inter) {
+        if (!ext_tx_used_inter[ext_tx_set][tx_type])
+          continue;
+      } else {
+        if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
+          if (tx_type != intra_mode_to_tx_type_context[mbmi->mode])
+            continue;
+        }
+        if (!ext_tx_used_intra[ext_tx_set][tx_type])
+          continue;
+      }
+      mbmi->tx_type = tx_type;
+      if (ext_tx_set == 1 &&
+          mbmi->tx_type >= DST_ADST && mbmi->tx_type < IDTX &&
+          best_tx_type == DCT_DCT) {
+        tx_type = IDTX - 1;
+        break;
+      }
+      txfm_rd_in_plane(x,
+#if CONFIG_VAR_TX
+                       cpi,
+#endif
+                       &r, &d, &s,
+                       &sse, ref_best_rd, 0, bs, n,
+                       cpi->sf.use_fast_coef_costing);
+      if (get_ext_tx_types(n, bs, is_inter) > 1 &&
+          !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
+          r != INT_MAX) {
+        if (is_inter) {
+          if (ext_tx_set > 0)
+            r += cpi->inter_tx_type_costs[ext_tx_set]
+                                         [mbmi->tx_size][mbmi->tx_type];
+        } else {
+          if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
+            r += cpi->intra_tx_type_costs[ext_tx_set][mbmi->tx_size]
+                                         [mbmi->mode][mbmi->tx_type];
+        }
+      }
+#else  // CONFIG_EXT_TX
       if (n >= TX_32X32 && tx_type != DCT_DCT) {
         continue;
       }
       mbmi->tx_type = tx_type;
-      txfm_rd_in_plane(x, &r, &d, &s,
+      txfm_rd_in_plane(x,
+#if CONFIG_VAR_TX
+                       cpi,
+#endif
+                       &r, &d, &s,
                        &sse, ref_best_rd, 0, bs, n,
                        cpi->sf.use_fast_coef_costing);
       if (n < TX_32X32 &&
@@ -739,6 +1080,7 @@
               [intra_mode_to_tx_type_context[mbmi->mode]]
               [mbmi->tx_type];
       }
+#endif  // CONFIG_EXT_TX
 
       if (r == INT_MAX)
         continue;
@@ -783,9 +1125,11 @@
 
   mbmi->tx_size = best_tx;
   mbmi->tx_type = best_tx_type;
-  if (mbmi->tx_size >= TX_32X32)
-    assert(mbmi->tx_type == DCT_DCT);
-  txfm_rd_in_plane(x, &r, &d, &s,
+  txfm_rd_in_plane(x,
+#if CONFIG_VAR_TX
+                   cpi,
+#endif
+                   &r, &d, &s,
                    &sse, ref_best_rd, 0, bs, best_tx,
                    cpi->sf.use_fast_coef_costing);
 }
@@ -800,11 +1144,10 @@
 
   assert(bs == xd->mi[0]->mbmi.sb_type);
 
-  if (CONFIG_MISC_FIXES && xd->lossless[0]) {
+  if (xd->lossless[0]) {
     choose_smallest_tx_size(cpi, x, rate, distortion, skip, ret_sse,
                             ref_best_rd, bs);
-  } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL ||
-             xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+  } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
     choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd,
                            bs);
   } else {
@@ -834,6 +1177,163 @@
   return 0;
 }
 
+void rd_pick_palette_intra_sby(VP10_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                               int palette_ctx, int dc_mode_cost,
+                               PALETTE_MODE_INFO *palette_mode_info,
+                               uint8_t *best_palette_color_map,
+                               TX_SIZE *best_tx, PREDICTION_MODE *mode_selected,
+                               int64_t *best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mic = xd->mi[0];
+  int rows = 4 * num_4x4_blocks_high_lookup[bsize];
+  int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
+  int this_rate, this_rate_tokenonly, s;
+  int64_t this_distortion, this_rd;
+  int colors, n;
+  int src_stride = x->plane[0].src.stride;
+  uint8_t *src = x->plane[0].src.buf;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cpi->common.use_highbitdepth)
+    colors = vp10_count_colors_highbd(src, src_stride, rows, cols,
+                                      cpi->common.bit_depth);
+  else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    colors = vp10_count_colors(src, src_stride, rows, cols);
+  palette_mode_info->palette_size[0] = 0;
+
+  if (colors > 1 && colors <= 64 && cpi->common.allow_screen_content_tools) {
+    int r, c, i, j, k;
+    int max_itr = 50;
+    int color_ctx, color_idx = 0;
+    int color_order[PALETTE_MAX_SIZE];
+    double *data = x->palette_buffer->kmeans_data_buf;
+    uint8_t *indices = x->palette_buffer->kmeans_indices_buf;
+    uint8_t *pre_indices = x->palette_buffer->kmeans_pre_indices_buf;
+    double centroids[PALETTE_MAX_SIZE];
+    uint8_t *color_map;
+    double lb, ub, val;
+    PALETTE_MODE_INFO *pmi = &mic->mbmi.palette_mode_info;
+#if CONFIG_VP9_HIGHBITDEPTH
+    uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+    if (cpi->common.use_highbitdepth)
+      lb = ub = src16[0];
+    else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      lb = ub = src[0];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cpi->common.use_highbitdepth) {
+      for (r = 0; r < rows; ++r) {
+        for (c = 0; c < cols; ++c) {
+          val = src16[r * src_stride + c];
+          data[r * cols + c] = val;
+          if (val < lb)
+            lb = val;
+          else if (val > ub)
+            ub = val;
+        }
+      }
+    } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      for (r = 0; r < rows; ++r) {
+        for (c = 0; c < cols; ++c) {
+          val = src[r * src_stride + c];
+          data[r * cols + c] = val;
+          if (val < lb)
+            lb = val;
+          else if (val > ub)
+            ub = val;
+        }
+      }
+#if CONFIG_VP9_HIGHBITDEPTH
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    mic->mbmi.mode = DC_PRED;
+
+    for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors;
+        n >= 2; --n) {
+      for (i = 0; i < n; ++i)
+        centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
+      vp10_k_means(data, centroids, indices, pre_indices, rows * cols,
+                   n, 1, max_itr);
+      vp10_insertion_sort(centroids, n);
+      for (i = 0; i < n; ++i)
+        centroids[i] = round(centroids[i]);
+      // remove duplicates
+      i = 1;
+      k = n;
+      while (i < k) {
+        if (centroids[i] == centroids[i - 1]) {
+          j = i;
+          while (j < k - 1) {
+            centroids[j] = centroids[j + 1];
+            ++j;
+          }
+          --k;
+        } else {
+          ++i;
+        }
+      }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cpi->common.use_highbitdepth)
+        for (i = 0; i < k; ++i)
+          mic->mbmi.palette_mode_info.palette_colors[i] =
+              clip_pixel_highbd(round(centroids[i]), cpi->common.bit_depth);
+      else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        for (i = 0; i < k; ++i)
+          pmi->palette_colors[i] = clip_pixel((int)round(centroids[i]));
+      pmi->palette_size[0] = k;
+
+      vp10_calc_indices(data, centroids, indices, rows * cols, k, 1);
+      for (r = 0; r < rows; ++r)
+        for (c = 0; c < cols; ++c)
+          xd->plane[0].color_index_map[r * cols + c] = indices[r * cols + c];
+
+      super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+                      &s, NULL, bsize, *best_rd);
+      if (this_rate_tokenonly == INT_MAX)
+        continue;
+
+      this_rate = this_rate_tokenonly + dc_mode_cost +
+          cpi->common.bit_depth * k * vp10_cost_bit(128, 0) +
+          cpi->palette_y_size_cost[bsize - BLOCK_8X8][k - 2];
+      this_rate +=
+          vp10_cost_bit(vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
+                                                         [palette_ctx], 1);
+      color_map = xd->plane[0].color_index_map;
+      this_rate +=  write_uniform_cost(k, xd->plane[0].color_index_map[0]);
+      for (i = 0; i < rows; ++i) {
+        for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
+          color_ctx = vp10_get_palette_color_context(color_map, cols, i, j,
+                                                     k, color_order);
+          for (r = 0; r < k; ++r)
+            if (color_map[i * cols + j] == color_order[r]) {
+              color_idx = r;
+              break;
+            }
+          assert(color_idx < k);
+          this_rate +=
+              cpi->palette_y_color_cost[k - 2][color_ctx][color_idx];
+        }
+      }
+      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+      if (this_rd < *best_rd) {
+        *best_rd = this_rd;
+        *palette_mode_info = mic->mbmi.palette_mode_info;
+        memcpy(best_palette_color_map, xd->plane[0].color_index_map,
+               rows * cols * sizeof(xd->plane[0].color_index_map[0]));
+        *mode_selected = DC_PRED;
+        *best_tx = mic->mbmi.tx_size;
+      }
+    }
+  }
+}
+
 static int64_t rd_pick_intra4x4block(VP10_COMP *cpi, MACROBLOCK *x,
                                      int row, int col,
                                      PREDICTION_MODE *best_mode,
@@ -864,6 +1364,7 @@
   memcpy(ta, a, sizeof(ta));
   memcpy(tl, l, sizeof(tl));
   xd->mi[0]->mbmi.tx_size = TX_4X4;
+  xd->mi[0]->mbmi.palette_mode_info.palette_size[0] = 0;
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -880,7 +1381,7 @@
       // one of the neighboring directional modes
       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
         if (conditional_skipintra(mode, *best_mode))
-            continue;
+          continue;
       }
 
       memcpy(tempa, ta, sizeof(ta));
@@ -892,8 +1393,8 @@
           const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
           uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
           int16_t *const src_diff = vp10_raster_block_offset_int16(BLOCK_8X8,
-                                                                  block,
-                                                                  p->src_diff);
+                                                                   block,
+                                                                   p->src_diff);
           tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
           xd->mi[0]->bmi[block].as_mode = mode;
           vp10_predict_intra_block(xd, 1, 1, TX_4X4, mode, dst, dst_stride,
@@ -902,11 +1403,21 @@
           vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride,
                                     dst, dst_stride, xd->bd);
           if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-            TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
-            const scan_order *so = get_scan(TX_4X4, tx_type);
+            TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, TX_4X4);
+            const scan_order *so = get_scan(TX_4X4, tx_type, 0);
+#if CONFIG_VAR_TX
+            const int coeff_ctx = combine_entropy_contexts(*(tempa + idx),
+                                                           *(templ + idy));
+#endif
             vp10_highbd_fwd_txfm_4x4(src_diff, coeff, 8, DCT_DCT, 1);
             vp10_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
-            ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+            ratey += cost_coeffs(x, 0, block,
+#if CONFIG_VAR_TX
+                                 coeff_ctx,
+#else
+                                 tempa + idx, templ + idy,
+#endif
+                                 TX_4X4,
                                  so->scan, so->neighbors,
                                  cpi->sf.use_fast_coef_costing);
             if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
@@ -916,11 +1427,21 @@
                                          xd->bd, DCT_DCT, 1);
           } else {
             int64_t unused;
-            TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
-            const scan_order *so = get_scan(TX_4X4, tx_type);
+            TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, TX_4X4);
+            const scan_order *so = get_scan(TX_4X4, tx_type, 0);
+#if CONFIG_VAR_TX
+            const int coeff_ctx = combine_entropy_contexts(*(tempa + idx),
+                                                           *(templ + idy));
+#endif
             vp10_highbd_fwd_txfm_4x4(src_diff, coeff, 8, tx_type, 0);
             vp10_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
-            ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+            ratey += cost_coeffs(x, 0, block,
+#if CONFIG_VAR_TX
+                                 coeff_ctx,
+#else
+                                 tempa + idx, templ + idy,
+#endif
+                                 TX_4X4,
                                  so->scan, so->neighbors,
                                  cpi->sf.use_fast_coef_costing);
             distortion += vp10_highbd_block_error(
@@ -952,7 +1473,7 @@
                  num_4x4_blocks_wide * 4 * sizeof(uint16_t));
         }
       }
-    next_highbd:
+next_highbd:
       {}
     }
     if (best_rd >= rd_thresh)
@@ -981,7 +1502,7 @@
     // one of the neighboring directional modes
     if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
       if (conditional_skipintra(mode, *best_mode))
-          continue;
+        continue;
     }
 
     memcpy(tempa, ta, sizeof(ta));
@@ -1001,26 +1522,49 @@
         vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
 
         if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-          TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
-          const scan_order *so = get_scan(TX_4X4, tx_type);
+          TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, TX_4X4);
+          const scan_order *so = get_scan(TX_4X4, tx_type, 0);
+#if CONFIG_VAR_TX
+          int coeff_ctx = combine_entropy_contexts(*(tempa + idx),
+                                                   *(templ + idy));
+#endif
           vp10_fwd_txfm_4x4(src_diff, coeff, 8, DCT_DCT, 1);
           vp10_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
-          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+#if CONFIG_VAR_TX
+          ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
+                               so->neighbors, cpi->sf.use_fast_coef_costing);
+          *(tempa + idx) = !(p->eobs[block] == 0);
+          *(templ + idy) = !(p->eobs[block] == 0);
+#else
+          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy,
+                               TX_4X4,
                                so->scan, so->neighbors,
                                cpi->sf.use_fast_coef_costing);
+#endif
           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
             goto next;
           vp10_inv_txfm_add_4x4(BLOCK_OFFSET(pd->dqcoeff, block),
                                 dst, dst_stride, p->eobs[block], DCT_DCT, 1);
         } else {
           int64_t unused;
-          TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
-          const scan_order *so = get_scan(TX_4X4, tx_type);
+          TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, TX_4X4);
+          const scan_order *so = get_scan(TX_4X4, tx_type, 0);
+#if CONFIG_VAR_TX
+          int coeff_ctx = combine_entropy_contexts(*(tempa + idx),
+                                                   *(templ + idy));
+#endif
           vp10_fwd_txfm_4x4(src_diff, coeff, 8, tx_type, 0);
           vp10_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
-          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
-                             so->scan, so->neighbors,
-                             cpi->sf.use_fast_coef_costing);
+#if CONFIG_VAR_TX
+          ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
+                               so->neighbors, cpi->sf.use_fast_coef_costing);
+          *(tempa + idx) = !(p->eobs[block] == 0);
+          *(templ + idy) = !(p->eobs[block] == 0);
+#else
+          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy,
+                               TX_4X4, so->scan, so->neighbors,
+                               cpi->sf.use_fast_coef_costing);
+#endif
           distortion += vp10_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
                                         16, &unused) >> 2;
           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
@@ -1078,11 +1622,15 @@
   int tot_rate_y = 0;
   int64_t total_rd = 0;
   ENTROPY_CONTEXT t_above[4], t_left[4];
-  const int *bmode_costs = cpi->mbmode_cost;
+  const int *bmode_costs = cpi->mbmode_cost[0];
 
   memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
   memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
 
+#if CONFIG_EXT_INTRA
+  mic->mbmi.ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+#endif  // CONFIG_EXT_INTRA
+
   // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
@@ -1127,6 +1675,294 @@
   return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
 }
 
+#if CONFIG_EXT_INTRA
+// Tries each filter-intra mode (signalled on top of DC_PRED) for the luma
+// plane. Returns 1 and updates the outputs and mbmi when one lowers *best_rd;
+// returns 0 with the outputs untouched (mbmi keeps its trial state) if not.
+static int rd_pick_ext_intra_sby(VP10_COMP *cpi, MACROBLOCK *x,
+                                 int *rate, int *rate_tokenonly,
+                                 int64_t *distortion, int *skippable,
+                                 BLOCK_SIZE bsize, int mode_cost,
+                                 int64_t *best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mic = xd->mi[0];
+  MB_MODE_INFO *mbmi = &mic->mbmi;
+  EXT_INTRA_MODE_INFO winner_info;
+  TX_SIZE winner_tx_size = TX_4X4;
+#if CONFIG_EXT_TX
+  TX_TYPE winner_tx_type;
+#endif  // CONFIG_EXT_TX
+  EXT_INTRA_MODE mode;
+  int found = 0;
+
+  vp10_zero(winner_info);
+  mbmi->mode = DC_PRED;
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 1;
+
+  for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+    int cand_rate, cand_rate_tokenonly, cand_skip;
+    int64_t cand_dist, cand_rd;
+    mbmi->ext_intra_mode_info.ext_intra_mode[0] = mode;
+    super_block_yrd(cpi, x, &cand_rate_tokenonly, &cand_dist, &cand_skip,
+                    NULL, bsize, *best_rd);
+    if (cand_rate_tokenonly == INT_MAX)
+      continue;
+    // Total rate: tokens + "ext intra used" bit + mode index + mode_cost.
+    cand_rate = cand_rate_tokenonly +
+        vp10_cost_bit(cpi->common.fc->ext_intra_probs[0], 1) +
+        write_uniform_cost(FILTER_INTRA_MODES, mode) + mode_cost;
+    cand_rd = RDCOST(x->rdmult, x->rddiv, cand_rate, cand_dist);
+    if (cand_rd >= *best_rd)
+      continue;
+    found = 1;
+    *best_rd = cand_rd;
+    *rate = cand_rate;
+    *rate_tokenonly = cand_rate_tokenonly;
+    *distortion = cand_dist;
+    *skippable = cand_skip;
+    winner_tx_size = mbmi->tx_size;
+    winner_info = mbmi->ext_intra_mode_info;
+#if CONFIG_EXT_TX
+    winner_tx_type = mbmi->tx_type;
+#endif  // CONFIG_EXT_TX
+  }
+
+  if (!found)
+    return 0;
+
+  mbmi->mode = DC_PRED;
+  mbmi->tx_size = winner_tx_size;
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[0] =
+      winner_info.use_ext_intra_mode[0];
+  mbmi->ext_intra_mode_info.ext_intra_mode[0] =
+      winner_info.ext_intra_mode[0];
+#if CONFIG_EXT_TX
+  mbmi->tx_type = winner_tx_type;
+#endif  // CONFIG_EXT_TX
+  return 1;
+}
+
+// Searches angle_delta for the directional luma mode already set in mbmi.
+// A candidate wins if its RD cost (token rate + rate_overhead) beats best_rd;
+// outputs and mbmi then hold the winner. Returns the best RD cost seen.
+static int64_t rd_pick_intra_angle_sby(VP10_COMP *cpi, MACROBLOCK *x,
+                                       int *rate, int *rate_tokenonly,
+                                       int64_t *distortion, int *skippable,
+                                       BLOCK_SIZE bsize, int rate_overhead,
+                                       int64_t best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mic = xd->mi[0];
+  MB_MODE_INFO *mbmi = &mic->mbmi;
+  int this_rate, this_rate_tokenonly, s, angle_delta, best_angle_delta = 0;
+  const double rd_adjust = 1.2;
+  int64_t this_distortion, this_rd, sse_dummy;
+  TX_SIZE best_tx_size = mic->mbmi.tx_size;
+#if CONFIG_EXT_TX
+  TX_TYPE best_tx_type = mbmi->tx_type;
+#endif  // CONFIG_EXT_TX
+
+  if (ANGLE_FAST_SEARCH) {
+    // Coarse pass over {0, -2, 2}, then refine around the coarse winner.
+    int deltas_level1[3] = {0, -2, 2};
+    int deltas_level2[3][2] = { {-1, 1}, {-3, -1}, {1, 3} };
+    const int level1 = 3, level2 = 2;
+    int i, j, best_i = -1;
+
+    for (i = 0; i < level1; ++i) {
+      // Delta 0 gets a relaxed budget. The product is cast back explicitly:
+      // an uncast "c ? int64 * double : int64" promotes both arms to double,
+      // so best_rd == INT64_MAX would overflow when converted to int64_t.
+      const int64_t rd_thresh = (i == 0 && best_rd < INT64_MAX) ?
+          (int64_t)(best_rd * rd_adjust) : best_rd;
+      mic->mbmi.angle_delta[0] = deltas_level1[i];
+      super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+                      &s, NULL, bsize, rd_thresh);
+      if (this_rate_tokenonly == INT_MAX) {
+        if (i == 0)
+          break;  // Delta 0 produced no result: skip the whole fast search.
+        continue;
+      }
+      this_rate = this_rate_tokenonly + rate_overhead;
+      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+      if (i == 0 && best_rd < INT64_MAX && this_rd > best_rd * rd_adjust)
+        break;
+      if (this_rd < best_rd) {
+        best_i              = i;
+        best_rd             = this_rd;
+        best_angle_delta    = mbmi->angle_delta[0];
+        best_tx_size        = mbmi->tx_size;
+#if CONFIG_EXT_TX
+        best_tx_type        = mbmi->tx_type;
+#endif  // CONFIG_EXT_TX
+        *rate               = this_rate;
+        *rate_tokenonly     = this_rate_tokenonly;
+        *distortion         = this_distortion;
+        *skippable          = s;
+      }
+    }
+
+    if (best_i >= 0) {
+      for (j = 0; j < level2; ++j) {
+        mic->mbmi.angle_delta[0] = deltas_level2[best_i][j];
+        super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+                        &s, NULL, bsize, best_rd);
+        if (this_rate_tokenonly == INT_MAX)
+          continue;
+        this_rate = this_rate_tokenonly + rate_overhead;
+        this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+        if (this_rd < best_rd) {
+          best_rd             = this_rd;
+          best_angle_delta    = mbmi->angle_delta[0];
+          best_tx_size        = mbmi->tx_size;
+#if CONFIG_EXT_TX
+          best_tx_type        = mbmi->tx_type;
+#endif  // CONFIG_EXT_TX
+          *rate               = this_rate;
+          *rate_tokenonly     = this_rate_tokenonly;
+          *distortion         = this_distortion;
+          *skippable          = s;
+        }
+      }
+    }
+  } else {  // Exhaustive search over every allowed delta.
+    for (angle_delta = -MAX_ANGLE_DELTAS; angle_delta <= MAX_ANGLE_DELTAS;
+        ++angle_delta) {
+      mic->mbmi.angle_delta[0] = angle_delta;
+      super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+                      &s, NULL, bsize, best_rd);
+      if (this_rate_tokenonly == INT_MAX)
+        continue;
+      this_rate = this_rate_tokenonly + rate_overhead;
+      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+      if (this_rd < best_rd) {
+        best_rd             = this_rd;
+        best_angle_delta    = mbmi->angle_delta[0];
+        best_tx_size        = mbmi->tx_size;
+#if CONFIG_EXT_TX
+        best_tx_type        = mbmi->tx_type;
+#endif  // CONFIG_EXT_TX
+        *rate               = this_rate;
+        *rate_tokenonly     = this_rate_tokenonly;
+        *distortion         = this_distortion;
+        *skippable          = s;
+      }
+    }
+  }
+
+  mbmi->tx_size = best_tx_size;
+  mbmi->angle_delta[0] = best_angle_delta;
+#if CONFIG_EXT_TX
+  mbmi->tx_type = best_tx_type;
+#endif  // CONFIG_EXT_TX
+
+  if (*rate_tokenonly < INT_MAX) {  // Re-run the winner; results discarded.
+    txfm_rd_in_plane(x,
+#if CONFIG_VAR_TX
+                     cpi,
+#endif
+                     &this_rate_tokenonly, &this_distortion, &s,
+                     &sse_dummy, INT64_MAX, 0, bsize, mbmi->tx_size,
+                     cpi->sf.use_fast_coef_costing);
+  }
+
+  return best_rd;
+}
+
+// Maps an angle in degrees to its 22.5-degree bin; 45 degrees is bin 0.
+static inline int get_angle_index(double angle) {
+  return (int)round((angle - 45.0) / 22.5);
+}
+
+// Fills hist[0..DIRECTIONAL_MODES) with the fraction of squared gradient
+// magnitude that falls into each angle bin of the rows x cols source block.
+static void angle_estimation(const uint8_t *src, int src_stride,
+                             int rows, int cols, double *hist) {
+  const double pi = 3.1415;  // Truncated pi; kept so results do not change.
+  double total = 0;
+  int row, col, bin;
+
+  for (bin = 0; bin < DIRECTIONAL_MODES; ++bin)
+    hist[bin] = 0;
+
+  // Start at row 1 / column 1: each sample needs its left and top neighbour.
+  for (row = 1, src += src_stride; row < rows; ++row, src += src_stride) {
+    for (col = 1; col < cols; ++col) {
+      const double dx = src[col] - src[col - 1];
+      const double dy = src[col] - src[col - src_stride];
+      const double energy = dx * dx + dy * dy;
+      const double angle = (dy == 0) ? 90 : atan(dx / dy) * 180 / pi;
+      assert(angle >= -90 && angle <= 90);
+      // angle + 180 addresses the upper bins; only in-range ones are kept.
+      bin = get_angle_index(angle + 180);
+      if (bin < DIRECTIONAL_MODES) {
+        hist[bin] += energy;
+        total += energy;
+      }
+      // Positive angles additionally vote for the bin of the angle itself.
+      if (angle > 0) {
+        bin = get_angle_index(angle);
+        if (bin >= 0) {
+          hist[bin] += energy;
+          total += energy;
+        }
+      }
+    }
+  }
+
+  // Normalise; the divisor is raised to at least 1 (flat blocks sum to 0).
+  if (total < 1)
+    total = 1;
+  for (bin = 0; bin < DIRECTIONAL_MODES; ++bin)
+    hist[bin] /= total;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Fills hist[0..DIRECTIONAL_MODES) with each angle bin's share of squared
+// gradient magnitude; the block is read as 16-bit samples behind src8.
+static void highbd_angle_estimation(const uint8_t *src8, int src_stride,
+                                    int rows, int cols, double *hist) {
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const double pi = 3.1415;  // Truncated pi; kept so results do not change.
+  double total = 0;
+  int row, col, bin;
+
+  for (bin = 0; bin < DIRECTIONAL_MODES; ++bin)
+    hist[bin] = 0;
+
+  // Start at row 1 / column 1: each sample needs its left and top neighbour.
+  for (row = 1, src += src_stride; row < rows; ++row, src += src_stride) {
+    for (col = 1; col < cols; ++col) {
+      const double dx = src[col] - src[col - 1];
+      const double dy = src[col] - src[col - src_stride];
+      const double energy = dx * dx + dy * dy;
+      const double angle = (dy == 0) ? 90 : atan(dx / dy) * 180 / pi;
+      assert(angle >= -90 && angle <= 90);
+      // angle + 180 addresses the upper bins; only in-range ones are kept.
+      bin = get_angle_index(angle + 180);
+      if (bin < DIRECTIONAL_MODES) {
+        hist[bin] += energy;
+        total += energy;
+      }
+      // Positive angles additionally vote for the bin of the angle itself.
+      if (angle > 0) {
+        bin = get_angle_index(angle);
+        if (bin >= 0) {
+          hist[bin] += energy;
+          total += energy;
+        }
+      }
+    }
+  }
+
+  // Normalise; the divisor is raised to at least 1 (flat blocks sum to 0).
+  if (total < 1)
+    total = 1;
+  for (bin = 0; bin < DIRECTIONAL_MODES; ++bin)
+    hist[bin] /= total;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_EXT_INTRA
+
 // This function is used only for intra_only frames
 static int64_t rd_pick_intra_sby_mode(VP10_COMP *cpi, MACROBLOCK *x,
                                       int *rate, int *rate_tokenonly,
@@ -1140,33 +1976,117 @@
   int this_rate, this_rate_tokenonly, s;
   int64_t this_distortion, this_rd;
   TX_SIZE best_tx = TX_4X4;
+#if CONFIG_EXT_INTRA
+  EXT_INTRA_MODE_INFO ext_intra_mode_info;
+  int is_directional_mode, rate_overhead, best_angle_delta = 0;
+  uint8_t directional_mode_skip_mask[INTRA_MODES];
+  const int src_stride = x->plane[0].src.stride;
+  const uint8_t *src = x->plane[0].src.buf;
+  double hist[DIRECTIONAL_MODES];
+#endif  // CONFIG_EXT_INTRA
   TX_TYPE best_tx_type = DCT_DCT;
   int *bmode_costs;
+  PALETTE_MODE_INFO palette_mode_info;
+  uint8_t *best_palette_color_map = cpi->common.allow_screen_content_tools ?
+      x->palette_buffer->best_palette_color_map : NULL;
+  const int rows = 4 * num_4x4_blocks_high_lookup[bsize];
+  const int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
+  int palette_ctx = 0;
   const MODE_INFO *above_mi = xd->above_mi;
   const MODE_INFO *left_mi = xd->left_mi;
   const PREDICTION_MODE A = vp10_above_block_mode(mic, above_mi, 0);
   const PREDICTION_MODE L = vp10_left_block_mode(mic, left_mi, 0);
   bmode_costs = cpi->y_mode_costs[A][L];
 
+#if CONFIG_EXT_INTRA
+  ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+  mic->mbmi.ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+  mic->mbmi.angle_delta[0] = 0;
+  memset(directional_mode_skip_mask, 0,
+         sizeof(directional_mode_skip_mask[0]) * INTRA_MODES);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    highbd_angle_estimation(src, src_stride, rows, cols, hist);
+  else
+#endif
+    angle_estimation(src, src_stride, rows, cols, hist);
+
+  for (mode = 0; mode < INTRA_MODES; ++mode) {
+    if (mode != DC_PRED && mode != TM_PRED) {
+      int index = get_angle_index((double)mode_to_angle_map[mode]);
+      double score, weight = 1.0;
+      score = hist[index];
+      if (index > 0) {
+        score += hist[index - 1] * 0.5;
+        weight += 0.5;
+      }
+      if (index < DIRECTIONAL_MODES - 1) {
+        score += hist[index + 1] * 0.5;
+        weight += 0.5;
+      }
+      score /= weight;
+      if (score < ANGLE_SKIP_THRESH)
+        directional_mode_skip_mask[mode] = 1;
+    }
+  }
+#endif  // CONFIG_EXT_INTRA
   memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+  palette_mode_info.palette_size[0] = 0;
+  mic->mbmi.palette_mode_info.palette_size[0] = 0;
+  if (above_mi)
+    palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+  if (left_mi)
+    palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
 
   /* Y Search for intra prediction mode */
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     mic->mbmi.mode = mode;
-
+#if CONFIG_EXT_INTRA
+    is_directional_mode = (mode != DC_PRED && mode != TM_PRED);
+    if (is_directional_mode && directional_mode_skip_mask[mode])
+      continue;
+    if (is_directional_mode) {
+      rate_overhead = bmode_costs[mode] +
+          write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1, 0);
+      this_rate_tokenonly = INT_MAX;
+      this_rd =
+          rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rate_tokenonly,
+                                  &this_distortion, &s, bsize, rate_overhead,
+                                  best_rd);
+    } else {
+      mic->mbmi.angle_delta[0] = 0;
+      super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+                      &s, NULL, bsize, best_rd);
+    }
+#endif  // CONFIG_EXT_INTRA
     super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
-        &s, NULL, bsize, best_rd);
+                    &s, NULL, bsize, best_rd);
 
     if (this_rate_tokenonly == INT_MAX)
       continue;
 
     this_rate = this_rate_tokenonly + bmode_costs[mode];
+    if (cpi->common.allow_screen_content_tools && mode == DC_PRED)
+      this_rate +=
+          vp10_cost_bit(vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
+                                                         [palette_ctx], 0);
+#if CONFIG_EXT_INTRA
+    if (mode == DC_PRED && ALLOW_FILTER_INTRA_MODES)
+      this_rate += vp10_cost_bit(cpi->common.fc->ext_intra_probs[0], 0);
+    if (is_directional_mode)
+      this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1,
+                                      MAX_ANGLE_DELTAS +
+                                      mic->mbmi.angle_delta[0]);
+#endif  // CONFIG_EXT_INTRA
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
     if (this_rd < best_rd) {
       mode_selected   = mode;
       best_rd         = this_rd;
       best_tx         = mic->mbmi.tx_size;
+#if CONFIG_EXT_INTRA
+      best_angle_delta = mic->mbmi.angle_delta[0];
+#endif  // CONFIG_EXT_INTRA
       best_tx_type    = mic->mbmi.tx_type;
       *rate           = this_rate;
       *rate_tokenonly = this_rate_tokenonly;
@@ -1175,13 +2095,685 @@
     }
   }
 
+  if (cpi->common.allow_screen_content_tools)
+    rd_pick_palette_intra_sby(cpi, x, bsize, palette_ctx, bmode_costs[DC_PRED],
+                              &palette_mode_info, best_palette_color_map,
+                              &best_tx, &mode_selected, &best_rd);
+
+#if CONFIG_EXT_INTRA
+  if (!palette_mode_info.palette_size[0] > 0 && ALLOW_FILTER_INTRA_MODES) {
+    if (rd_pick_ext_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
+                              skippable, bsize, bmode_costs[DC_PRED],
+                              &best_rd)) {
+      mode_selected       = mic->mbmi.mode;
+      best_tx             = mic->mbmi.tx_size;
+      ext_intra_mode_info = mic->mbmi.ext_intra_mode_info;
+      best_tx_type        = mic->mbmi.tx_type;
+    }
+  }
+
+  mic->mbmi.ext_intra_mode_info.use_ext_intra_mode[0] =
+      ext_intra_mode_info.use_ext_intra_mode[0];
+  if (ext_intra_mode_info.use_ext_intra_mode[0]) {
+    mic->mbmi.ext_intra_mode_info.ext_intra_mode[0] =
+        ext_intra_mode_info.ext_intra_mode[0];
+  }
+#endif  // CONFIG_EXT_INTRA
+
   mic->mbmi.mode = mode_selected;
   mic->mbmi.tx_size = best_tx;
+#if CONFIG_EXT_INTRA
+  mic->mbmi.angle_delta[0] = best_angle_delta;
+#endif  // CONFIG_EXT_INTRA
   mic->mbmi.tx_type = best_tx_type;
+  mic->mbmi.palette_mode_info.palette_size[0] =
+      palette_mode_info.palette_size[0];
+  if (palette_mode_info.palette_size[0] > 0) {
+    memcpy(mic->mbmi.palette_mode_info.palette_colors,
+           palette_mode_info.palette_colors,
+           PALETTE_MAX_SIZE * sizeof(palette_mode_info.palette_colors[0]));
+    memcpy(xd->plane[0].color_index_map, best_palette_color_map,
+           rows * cols * sizeof(best_palette_color_map[0]));
+  }
 
   return best_rd;
 }
 
+#if CONFIG_VAR_TX
+static void tx_block_rd_b(const VP10_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
+                          int blk_row, int blk_col, int plane, int block,
+                          int plane_bsize, int coeff_ctx,
+                          int *rate, int64_t *dist, int64_t *bsse, int *skip) {
+  MACROBLOCKD *xd = &x->e_mbd;  // Adds one tx block's rate/dist/sse to outputs.
+  const struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int ss_txfrm_size = tx_size << 1;
+  int64_t this_sse;
+  int shift = tx_size == TX_32X32 ? 0 : 2;  // Applied to the highbd error terms.
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+#endif
+  unsigned int tmp_sse = 0;
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+  TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+  const scan_order *const scan_order =
+      get_scan(tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
+
+  BLOCK_SIZE txm_bsize = txsize_to_bsize[tx_size];
+  int bh = 4 * num_4x4_blocks_wide_lookup[txm_bsize];  // Uses the width table.
+  int src_stride = p->src.stride;
+  uint8_t *src = &p->src.buf[4 * blk_row * src_stride + 4 * blk_col];
+  uint8_t *dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, rec_buffer_alloc_16[32 * 32]);
+  uint8_t *rec_buffer;
+#else
+  DECLARE_ALIGNED(16, uint8_t, rec_buffer[32 * 32]);
+#endif
+
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+
+  if (xd->mb_to_bottom_edge < 0)  // NOTE(review): presumably past frame edge.
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                   VP10_XFORM_QUANT_B);
+
+#if CONFIG_VP9_HIGHBITDEPTH  // Copy the dst block into rec_buffer (stride 32).
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    rec_buffer = CONVERT_TO_BYTEPTR(rec_buffer_alloc_16);
+    vpx_highbd_convolve_copy(dst, pd->dst.stride, rec_buffer, 32,
+                             NULL, 0, NULL, 0, bh, bh, xd->bd);
+  } else {
+    rec_buffer = (uint8_t *)rec_buffer_alloc_16;
+    vpx_convolve_copy(dst, pd->dst.stride, rec_buffer, 32,
+                      NULL, 0, NULL, 0, bh, bh);
+  }
+#else
+  vpx_convolve_copy(dst, pd->dst.stride, rec_buffer, 32,
+                    NULL, 0, NULL, 0, bh, bh);
+#endif
+
+  if (blk_row + (bh >> 2) > max_blocks_high ||  // Block exceeds the limits:
+      blk_col + (bh >> 2) > max_blocks_wide) {  // sum 8x8 SSEs inside them.
+    int idx, idy;
+    unsigned int this_sse;  // Shadows the int64_t this_sse in highbd builds.
+    int blocks_height = VPXMIN(bh >> 2, max_blocks_high - blk_row);
+    int blocks_width  = VPXMIN(bh >> 2, max_blocks_wide - blk_col);
+    for (idy = 0; idy < blocks_height; idy += 2) {
+      for (idx = 0; idx < blocks_width; idx += 2) {
+        cpi->fn_ptr[BLOCK_8X8].vf(src + 4 * idy * src_stride + 4 * idx,
+                                  src_stride,
+                                  rec_buffer + 4 * idy * 32 + 4 * idx,
+                                  32, &this_sse);
+        tmp_sse += this_sse;
+      }
+    }
+  } else {
+    cpi->fn_ptr[txm_bsize].vf(src, src_stride, rec_buffer, 32, &tmp_sse);
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH  // Highbd: dist/sse come from coefficient error.
+  *dist += vp10_highbd_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
+                                   &this_sse, xd->bd) >> shift;
+  *bsse += this_sse >> shift;
+#else
+  *bsse += (int64_t)tmp_sse * 16;  // SSE of src vs. rec_buffer before recon.
+
+  if (p->eobs[block] > 0) {  // Nonzero coeffs: add inverse txfm, re-measure.
+    switch (tx_size) {
+      case TX_32X32:
+        vp10_inv_txfm_add_32x32(dqcoeff, rec_buffer, 32, p->eobs[block],
+                                tx_type);
+        break;
+      case TX_16X16:
+        vp10_inv_txfm_add_16x16(dqcoeff, rec_buffer, 32, p->eobs[block],
+                                tx_type);
+        break;
+      case TX_8X8:
+        vp10_inv_txfm_add_8x8(dqcoeff, rec_buffer, 32, p->eobs[block],
+                              tx_type);
+        break;
+      case TX_4X4:
+        vp10_inv_txfm_add_4x4(dqcoeff, rec_buffer, 32, p->eobs[block],
+                              tx_type,
+                              xd->lossless[xd->mi[0]->mbmi.segment_id]);
+        break;
+      default:
+        assert(0 && "Invalid transform size");
+        break;
+    }
+
+    if ((bh >> 2) + blk_col > max_blocks_wide ||
+        (bh >> 2) + blk_row > max_blocks_high) {
+      int idx, idy;
+      unsigned int this_sse;
+      int blocks_height = VPXMIN(bh >> 2, max_blocks_high - blk_row);
+      int blocks_width  = VPXMIN(bh >> 2, max_blocks_wide - blk_col);
+      tmp_sse = 0;
+      for (idy = 0; idy < blocks_height; idy += 2) {
+        for (idx = 0; idx < blocks_width; idx += 2) {
+          cpi->fn_ptr[BLOCK_8X8].vf(src + 4 * idy * src_stride + 4 * idx,
+                                    src_stride,
+                                    rec_buffer + 4 * idy * 32 + 4 * idx,
+                                    32, &this_sse);
+          tmp_sse += this_sse;
+        }
+      }
+    } else {
+      cpi->fn_ptr[txm_bsize].vf(src, src_stride,
+                                rec_buffer, 32, &tmp_sse);
+    }
+  }
+  *dist += (int64_t)tmp_sse * 16;  // Equals the bsse term when eob == 0.
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  *rate += cost_coeffs(x, plane, block, coeff_ctx, tx_size,
+                       scan_order->scan, scan_order->neighbors, 0);
+  *skip &= (p->eobs[block] == 0);  // Stays 1 only if no coefficients remain.
+}
+
+static void select_tx_block(const VP10_COMP *cpi, MACROBLOCK *x,
+                            int blk_row, int blk_col, int plane, int block,
+                            TX_SIZE tx_size, BLOCK_SIZE plane_bsize,
+                            ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl,
+                            TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+                            int *rate, int64_t *dist,
+                            int64_t *bsse, int *skip,
+                            int64_t ref_best_rd, int *is_cost_valid) {
+  MACROBLOCKD *const xd = &x->e_mbd;  // Recursive choice: one txfm vs. 4-way split.
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  int tx_idx = (blk_row >> (1 - pd->subsampling_y)) * 8 +
+               (blk_col >> (1 - pd->subsampling_x));
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+  int64_t this_rd = INT64_MAX;  // RD cost of the unsplit candidate.
+  ENTROPY_CONTEXT *pta = ta + blk_col;
+  ENTROPY_CONTEXT *ptl = tl + blk_row;
+  ENTROPY_CONTEXT stxa = 0, stxl = 0;
+  int coeff_ctx, i;
+  int ctx = txfm_partition_context(tx_above + (blk_col >> 1),
+                                   tx_left + (blk_row >> 1), tx_size);
+
+  int64_t sum_dist = 0, sum_bsse = 0;
+  int64_t sum_rd = INT64_MAX;  // RD cost of the split candidate.
+  int sum_rate = vp10_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1);
+  int all_skip = 1;
+  int tmp_eob = 0;
+  int zero_blk_rate;
+
+  if (ref_best_rd < 0) {  // Budget already exhausted: report failure.
+    *is_cost_valid = 0;
+    return;
+  }
+
+  switch (tx_size) {  // Fold the contexts this transform spans into one flag.
+    case TX_4X4:
+      stxa = pta[0];
+      stxl = ptl[0];
+      break;
+    case TX_8X8:
+      stxa = !!*(const uint16_t *)&pta[0];
+      stxl = !!*(const uint16_t *)&ptl[0];
+      break;
+    case TX_16X16:
+      stxa = !!*(const uint32_t *)&pta[0];
+      stxl = !!*(const uint32_t *)&ptl[0];
+      break;
+    case TX_32X32:
+      stxa = !!*(const uint64_t *)&pta[0];
+      stxl = !!*(const uint64_t *)&ptl[0];
+      break;
+    default:
+      assert(0 && "Invalid transform size.");
+      break;
+  }
+  coeff_ctx = combine_entropy_contexts(stxa, stxl);
+
+  if (xd->mb_to_bottom_edge < 0)  // NOTE(review): presumably past frame edge.
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  *rate = 0;
+  *dist = 0;
+  *bsse = 0;
+  *skip = 1;
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;  // Outside the limits: keep the zero cost set above.
+
+  zero_blk_rate =  // Cost of the EOB token in this context (all-zero block).
+      x->token_costs[tx_size][pd->plane_type][1][0][0][coeff_ctx][EOB_TOKEN];
+
+  if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) {
+    mbmi->inter_tx_size[tx_idx] = tx_size;  // Candidate 1: single transform.
+    tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
+                  plane_bsize, coeff_ctx, rate, dist, bsse, skip);
+
+    if ((RDCOST(x->rdmult, x->rddiv, *rate, *dist) >=
+         RDCOST(x->rdmult, x->rddiv, zero_blk_rate, *bsse) || *skip == 1) &&
+        !xd->lossless[mbmi->segment_id]) {  // Zeroing is no worse: force skip.
+      *rate = zero_blk_rate;
+      *dist = *bsse;
+      *skip = 1;
+      x->blk_skip[plane][blk_row * max_blocks_wide + blk_col] = 1;
+      p->eobs[block] = 0;
+    } else {
+      x->blk_skip[plane][blk_row * max_blocks_wide + blk_col] = 0;
+      *skip = 0;
+    }
+
+    if (tx_size > TX_4X4)  // Add the cost of the "no split" flag.
+      *rate += vp10_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0);
+    this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *dist);
+    tmp_eob = p->eobs[block];
+  }
+
+  if (tx_size > TX_4X4) {  // Candidate 2: four blocks of tx_size - 1.
+    BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int bsl = b_height_log2_lookup[bsize];
+    int sub_step = 1 << (2 * (tx_size - 1));
+    int i;  // NOTE: shadows the function-scope i.
+    int this_rate;
+    int64_t this_dist;
+    int64_t this_bsse;
+    int this_skip;
+    int this_cost_valid = 1;
+    int64_t tmp_rd = 0;
+
+    --bsl;
+    for (i = 0; i < 4 && this_cost_valid; ++i) {
+      int offsetr = (i >> 1) << bsl;
+      int offsetc = (i & 0x01) << bsl;
+      select_tx_block(cpi, x, blk_row + offsetr, blk_col + offsetc,
+                      plane, block + i * sub_step, tx_size - 1,
+                      plane_bsize, ta, tl, tx_above, tx_left,
+                      &this_rate, &this_dist,
+                      &this_bsse, &this_skip,
+                      ref_best_rd - tmp_rd, &this_cost_valid);
+      sum_rate += this_rate;
+      sum_dist += this_dist;
+      sum_bsse += this_bsse;
+      all_skip &= this_skip;
+      tmp_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+      if (this_rd < tmp_rd)  // Split already costs more than candidate 1.
+        break;
+    }
+    if (this_cost_valid)
+      sum_rd = tmp_rd;
+  }
+
+  if (this_rd < sum_rd) {  // Keep the single transform; rewrite the contexts.
+    int idx, idy;
+    for (i = 0; i < (1 << tx_size); ++i)
+      pta[i] = ptl[i] = !(tmp_eob == 0);
+    txfm_partition_update(tx_above + (blk_col >> 1),
+                          tx_left + (blk_row >> 1), tx_size);
+    mbmi->inter_tx_size[tx_idx] = tx_size;
+
+    for (idy = 0; idy < (1 << tx_size) / 2; ++idy)
+      for (idx = 0; idx < (1 << tx_size) / 2; ++idx)
+        mbmi->inter_tx_size[tx_idx + (idy << 3) + idx] = tx_size;
+    mbmi->tx_size = tx_size;
+    if (this_rd == INT64_MAX)
+      *is_cost_valid = 0;
+    x->blk_skip[plane][blk_row * max_blocks_wide + blk_col] = *skip;
+  } else {  // Keep the split: report the summed sub-block results.
+    *rate = sum_rate;
+    *dist = sum_dist;
+    *bsse = sum_bsse;
+    *skip = all_skip;
+    if (sum_rd == INT64_MAX)
+      *is_cost_valid = 0;
+  }
+}
+
+// Luma RD estimate for a block coded with recursive transform partitioning.
+// Steps through the block in units of the largest transform for its size,
+// sums what select_tx_block() reports, and marks the outputs invalid
+// (INT_MAX / INT64_MAX) when the total exceeds ref_best_rd or a unit fails.
+static void inter_block_yrd(const VP10_COMP *cpi, MACROBLOCK *x,
+                            int *rate, int64_t *distortion, int *skippable,
+                            int64_t *sse, BLOCK_SIZE bsize,
+                            int64_t ref_best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int cost_ok = (ref_best_rd >= 0);
+  int64_t rd_so_far = 0;
+
+  *rate = 0;
+  *distortion = 0;
+  *sse = 0;
+  *skippable = 1;
+
+  if (cost_ok) {
+    const struct macroblockd_plane *const pd = &xd->plane[0];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+    const TX_SIZE max_tx = max_txsize_lookup[plane_bsize];
+    const int blocks_w = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int blocks_h = num_4x4_blocks_high_lookup[plane_bsize];
+    // Row/column step between units, read from the width table.
+    const int tx_span = num_4x4_blocks_wide_lookup[txsize_to_bsize[max_tx]];
+    const int blk_step = 1 << (max_tx * 2);  // Block-index stride per unit.
+    ENTROPY_CONTEXT ctxa[16], ctxl[16];
+    TXFM_CONTEXT tx_above[8], tx_left[8];
+    int unit_rate = 0, unit_skip = 1;
+    int64_t unit_dist = 0, unit_sse = 0;
+    int row, col, block = 0;
+
+    // The search works on local copies of the contexts.
+    vp10_get_entropy_contexts(bsize, TX_4X4, pd, ctxa, ctxl);
+    memcpy(tx_above, xd->above_txfm_context,
+           sizeof(TXFM_CONTEXT) * (blocks_w >> 1));
+    memcpy(tx_left, xd->left_txfm_context,
+           sizeof(TXFM_CONTEXT) * (blocks_h >> 1));
+
+    for (row = 0; row < blocks_h; row += tx_span) {
+      for (col = 0; col < blocks_w; col += tx_span) {
+        select_tx_block(cpi, x, row, col, 0, block, max_tx, plane_bsize,
+                        ctxa, ctxl, tx_above, tx_left,
+                        &unit_rate, &unit_dist, &unit_sse, &unit_skip,
+                        ref_best_rd - rd_so_far, &cost_ok);
+        *rate += unit_rate;
+        *distortion += unit_dist;
+        *sse += unit_sse;
+        *skippable &= unit_skip;
+        rd_so_far += VPXMIN(RDCOST(x->rdmult, x->rddiv, unit_rate, unit_dist),
+                            RDCOST(x->rdmult, x->rddiv, 0, unit_sse));
+        block += blk_step;
+      }
+    }
+  }
+
+  rd_so_far = VPXMIN(RDCOST(x->rdmult, x->rddiv, *rate, *distortion),
+                     RDCOST(x->rdmult, x->rddiv, 0, *sse));
+  if (rd_so_far > ref_best_rd)
+    cost_ok = 0;
+
+  if (!cost_ok) {
+    // reset cost value
+    *rate = INT_MAX;
+    *distortion = INT64_MAX;
+    *sse = INT64_MAX;
+    *skippable = 0;
+  }
+}
+
+#if CONFIG_EXT_TX
+// Chooses the luma transform type for a block coded with recursive transform
+// partitioning: each permitted tx_type (subject to pruning) is evaluated with
+// inter_block_yrd() and the cheapest is kept. The winner's tx_type, tx_size,
+// inter_tx_size[] and blk_skip[] are written back before returning.
+static void select_tx_type_yrd(const VP10_COMP *cpi, MACROBLOCK *x,
+                               int *rate, int64_t *distortion, int *skippable,
+                               int64_t *sse, BLOCK_SIZE bsize,
+                               int64_t ref_best_rd) {
+  const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+  const VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  int64_t rd = INT64_MAX, best_rd = INT64_MAX;
+  TX_TYPE tx_type, best_tx_type = DCT_DCT;
+  const int is_inter = is_inter_block(mbmi);
+  vpx_prob skip_prob = vp10_get_skip_prob(cm, xd);
+  int s0 = vp10_cost_bit(skip_prob, 0);
+  int s1 = vp10_cost_bit(skip_prob, 1);
+  TX_SIZE best_tx_size[64] = { 0 };    // Zeroed: both are copied back even
+  uint8_t best_blk_skip[256] = { 0 };  // if no candidate is ever accepted.
+  TX_SIZE best_tx = TX_SIZES;
+  const int n4 = 1 << (num_pels_log2_lookup[bsize] - 4);
+  int ext_tx_set, idx, idy;
+
+  *distortion = INT64_MAX;
+  *rate       = INT_MAX;
+  *skippable  = 0;
+  *sse        = INT64_MAX;
+
+  ext_tx_set = get_ext_tx_set(max_tx_size, bsize, is_inter);
+
+  for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+    int this_rate = 0, this_skip = 1;
+    int64_t this_dist = 0, this_sse = 0;
+
+    if (is_inter) {
+      if (!ext_tx_used_inter[ext_tx_set][tx_type])
+        continue;
+    } else {
+      if (!ALLOW_INTRA_EXT_TX && bsize >= BLOCK_8X8 &&
+          tx_type != intra_mode_to_tx_type_context[mbmi->mode])
+        continue;
+      if (!ext_tx_used_intra[ext_tx_set][tx_type])
+        continue;
+    }
+
+    mbmi->tx_type = tx_type;
+
+    // Pruning for set 1: if nothing has beaten DCT_DCT once the range
+    // DST_ADST..IDTX-1 is reached, skip that range. "continue" (not "break")
+    // so the loop increment resumes at IDTX and later types are still tried.
+    if (ext_tx_set == 1 &&
+        mbmi->tx_type >= DST_ADST && mbmi->tx_type < IDTX &&
+        best_tx_type == DCT_DCT) {
+      tx_type = IDTX - 1;
+      continue;
+    }
+
+    inter_block_yrd(cpi, x, &this_rate, &this_dist, &this_skip, &this_sse,
+                    bsize, ref_best_rd);
+
+    if (get_ext_tx_types(max_tx_size, bsize, is_inter) > 1 &&
+        !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
+        this_rate != INT_MAX) {
+      if (is_inter) {
+        if (ext_tx_set > 0)
+          this_rate += cpi->inter_tx_type_costs[ext_tx_set]
+                                       [max_tx_size][mbmi->tx_type];
+      } else {
+        if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
+          this_rate += cpi->intra_tx_type_costs[ext_tx_set][max_tx_size]
+                                       [mbmi->mode][mbmi->tx_type];
+      }
+    }
+
+    if (this_rate == INT_MAX)
+      continue;
+
+    if (this_skip)
+      rd = RDCOST(x->rdmult, x->rddiv, s1, this_sse);
+    else
+      rd = RDCOST(x->rdmult, x->rddiv, this_rate + s0, this_dist);
+
+    if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && !this_skip)
+      rd = VPXMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, this_sse));
+
+    if (rd < (is_inter && best_tx_type == DCT_DCT ? ext_tx_th : 1) * best_rd) {
+      best_rd = rd;
+      *distortion = this_dist;
+      *rate       = this_rate;
+      *skippable  = this_skip;
+      *sse        = this_sse;
+      best_tx_type = mbmi->tx_type;
+      best_tx = mbmi->tx_size;
+      memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4);
+      for (idy = 0; idy < xd->n8_h; ++idy)
+        for (idx = 0; idx < xd->n8_w; ++idx)
+          best_tx_size[idy * 8 + idx] = mbmi->inter_tx_size[idy * 8 + idx];
+    }
+  }
+
+  mbmi->tx_type = best_tx_type;
+  for (idy = 0; idy < xd->n8_h; ++idy)
+    for (idx = 0; idx < xd->n8_w; ++idx)
+      mbmi->inter_tx_size[idy * 8 + idx] = best_tx_size[idy * 8 + idx];
+  mbmi->tx_size = best_tx;
+  memcpy(x->blk_skip[0], best_blk_skip, sizeof(best_blk_skip[0]) * n4);
+}
+#endif
+
+static void tx_block_rd(const VP10_COMP *cpi, MACROBLOCK *x,
+                        int blk_row, int blk_col, int plane, int block,
+                        TX_SIZE tx_size, BLOCK_SIZE plane_bsize,
+                        ENTROPY_CONTEXT *above_ctx, ENTROPY_CONTEXT *left_ctx,
+                        int *rate, int64_t *dist, int64_t *bsse, int *skip) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+  int tx_idx = (blk_row >> (1 - pd->subsampling_y)) * 8 +
+               (blk_col >> (1 - pd->subsampling_x));  // inter_tx_size index
+  TX_SIZE plane_tx_size = plane ?
+      get_uv_tx_size_impl(mbmi->inter_tx_size[tx_idx], bsize,
+                          0, 0) :
+      mbmi->inter_tx_size[tx_idx];  // plane 0 uses the stored size directly
+  // Overview: recurse down the transform partition; tx_block_rd_b runs on leaves.
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+  // A negative mb_to_*_edge shrinks the limits taken from the 4x4 lookups.
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+  // Nothing to do for positions at or beyond the (possibly reduced) limits.
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+  // Leaf when tx_size equals the size stored for this position; else split in 4.
+  if (tx_size == plane_tx_size) {
+    int coeff_ctx, i;
+    ENTROPY_CONTEXT *ta = above_ctx + blk_col;
+    ENTROPY_CONTEXT *tl = left_ctx  + blk_row;
+    switch (tx_size) {  // collapse the contexts read at ta/tl into ta[0]/tl[0]
+      case TX_4X4:
+        break;
+      case TX_8X8:
+        ta[0] = !!*(const uint16_t *)&ta[0];  // 1 if any of the 16 bits is set
+        tl[0] = !!*(const uint16_t *)&tl[0];
+        break;
+      case TX_16X16:
+        ta[0] = !!*(const uint32_t *)&ta[0];  // same test over 32 bits
+        tl[0] = !!*(const uint32_t *)&tl[0];
+        break;
+      case TX_32X32:
+        ta[0] = !!*(const uint64_t *)&ta[0];  // same test over 64 bits
+        tl[0] = !!*(const uint64_t *)&tl[0];
+        break;
+      default:
+        assert(0 && "Invalid transform size.");
+        break;
+    }
+    coeff_ctx = combine_entropy_contexts(ta[0], tl[0]);
+    tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
+                  plane_bsize, coeff_ctx, rate, dist, bsse, skip);
+    for (i = 0; i < (1 << tx_size); ++i) {  // write back "eob != 0" per entry
+      ta[i] = !(p->eobs[block] == 0);
+      tl[i] = !(p->eobs[block] == 0);
+    }
+  } else {
+    int bsl = b_width_log2_lookup[bsize];
+    int step = 1 << (2 * (tx_size - 1));  // block index stride between children
+    int i;
+
+    assert(bsl > 0);
+    --bsl;
+
+    for (i = 0; i < 4; ++i) {
+      int offsetr = (i >> 1) << bsl;  // children 2 and 3 get the row offset
+      int offsetc = (i & 0x01) << bsl;  // children 1 and 3 get the column offset
+      tx_block_rd(cpi, x, blk_row + offsetr, blk_col + offsetc, plane,
+                  block + i * step, tx_size - 1, plane_bsize,
+                  above_ctx, left_ctx, rate, dist, bsse, skip);
+    }
+  }
+}
+
+// Sums rd stats over planes 1..MAX_MB_PLANE-1. Returns 0 on early termination
+// (outputs reset to INT_MAX/INT64_MAX, *skippable = 0), 1 when costs are valid.
+static int inter_block_uvrd(const VP10_COMP *cpi, MACROBLOCK *x,
+                            int *rate, int64_t *distortion, int *skippable,
+                            int64_t *sse, BLOCK_SIZE bsize,
+                            int64_t ref_best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  int plane;
+  int is_cost_valid = 1;
+  int64_t this_rd;
+
+  if (ref_best_rd < 0)
+    is_cost_valid = 0;  // NOTE(review): the plane loop below is not skipped
+
+  if (is_inter_block(mbmi) && is_cost_valid) {
+    int plane;  // NOTE(review): shadows the outer `plane`
+    for (plane = 1; plane < MAX_MB_PLANE; ++plane)
+      vp10_subtract_plane(x, bsize, plane);
+  }
+
+  *rate = 0;
+  *distortion = 0;
+  *sse = 0;
+  *skippable = 1;
+
+  for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+    const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+    BLOCK_SIZE txb_size = txsize_to_bsize[max_txsize_lookup[plane_bsize]];
+    int bh = num_4x4_blocks_wide_lookup[txb_size];  // step for rows and columns
+    int idx, idy;
+    int block = 0;
+    int step = 1 << (max_txsize_lookup[plane_bsize] * 2);  // block index advance
+    int pnrate = 0, pnskip = 1;
+    int64_t pndist = 0, pnsse = 0;
+    ENTROPY_CONTEXT ta[16], tl[16];
+
+    vp10_get_entropy_contexts(bsize, TX_4X4, pd, ta, tl);
+    // Visit the plane in steps of its largest transform size.
+    for (idy = 0; idy < mi_height; idy += bh) {
+      for (idx = 0; idx < mi_width; idx += bh) {
+        tx_block_rd(cpi, x, idy, idx, plane, block,
+                    max_txsize_lookup[plane_bsize], plane_bsize, ta, tl,
+                    &pnrate, &pndist, &pnsse, &pnskip);
+        block += step;
+      }
+    }
+
+    if (pnrate == INT_MAX) {  // this plane produced no valid rate
+      is_cost_valid = 0;
+      break;
+    }
+
+    *rate += pnrate;
+    *distortion += pndist;
+    *sse += pnsse;
+    *skippable &= pnskip;
+    // Cheaper of: the accumulated rate/distortion, or zero rate with sse.
+    this_rd = VPXMIN(RDCOST(x->rdmult, x->rddiv, *rate, *distortion),
+                     RDCOST(x->rdmult, x->rddiv, 0, *sse));
+
+    if (this_rd > ref_best_rd) {  // already worse than the caller's bound
+      is_cost_valid = 0;
+      break;
+    }
+  }
+
+  if (!is_cost_valid) {
+    // reset cost value
+    *rate = INT_MAX;
+    *distortion = INT64_MAX;
+    *sse = INT64_MAX;
+    *skippable = 0;
+  }
+
+  return is_cost_valid;
+}
+#endif
+
 // Return value 0: early termination triggered, no valid rd cost available;
 //              1: rd cost values are valid.
 static int super_block_uvrd(const VP10_COMP *cpi, MACROBLOCK *x,
@@ -1211,7 +2803,11 @@
   *skippable = 1;
 
   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-    txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
+    txfm_rd_in_plane(x,
+#if CONFIG_VAR_TX
+                     cpi,
+#endif
+                     &pnrate, &pndist, &pnskip, &pnsse,
                      ref_best_rd, plane, bsize, uv_tx_size,
                      cpi->sf.use_fast_coef_costing);
     if (pnrate == INT_MAX) {
@@ -1235,34 +2831,221 @@
   return is_cost_valid;
 }
 
+#if CONFIG_EXT_INTRA
+// Tries every chroma ext intra mode; returns 1 if one beats *best_rd, else 0.
+static int rd_pick_ext_intra_sbuv(VP10_COMP *cpi, MACROBLOCK *x,
+                                  PICK_MODE_CONTEXT *ctx,
+                                  int *rate, int *rate_tokenonly,
+                                  int64_t *distortion, int *skippable,
+                                  BLOCK_SIZE bsize, int64_t *best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  int ext_intra_selected_flag = 0;
+  int this_rate_tokenonly, this_rate, s;
+  int64_t this_distortion, this_sse, this_rd;
+  EXT_INTRA_MODE mode;
+  EXT_INTRA_MODE_INFO ext_intra_mode_info;  // snapshot of the best candidate
+
+  vp10_zero(ext_intra_mode_info);
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 1;
+  mbmi->uv_mode = DC_PRED;
+  // Each candidate is evaluated with uv_mode fixed to DC_PRED.
+  for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+    mbmi->ext_intra_mode_info.ext_intra_mode[1] = mode;
+    if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
+                          &this_distortion, &s, &this_sse, bsize, *best_rd))
+      continue;  // no valid cost within *best_rd for this mode
+
+    this_rate = this_rate_tokenonly +
+        vp10_cost_bit(cpi->common.fc->ext_intra_probs[1], 1) +
+        cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] +
+        write_uniform_cost(FILTER_INTRA_MODES, mode);
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+    if (this_rd < *best_rd) {  // new winner: publish outputs, tighten the bound
+      *best_rd        = this_rd;
+      *rate           = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion     = this_distortion;
+      *skippable      = s;
+      ext_intra_mode_info = mbmi->ext_intra_mode_info;
+      ext_intra_selected_flag = 1;
+      if (!x->select_tx_size)
+        swap_block_ptr(x, ctx, 2, 0, 1, MAX_MB_PLANE);
+    }
+  }
+
+  // The loop leaves mbmi at the last mode tried; write the winner back.
+  if (ext_intra_selected_flag) {
+    mbmi->uv_mode = DC_PRED;
+    mbmi->ext_intra_mode_info.use_ext_intra_mode[1] =
+        ext_intra_mode_info.use_ext_intra_mode[1];
+    mbmi->ext_intra_mode_info.ext_intra_mode[1] =
+        ext_intra_mode_info.ext_intra_mode[1];
+    return 1;
+  } else {
+    return 0;  // mbmi's uv_mode/ext_intra fields stay as the search left them
+  }
+}
+
+static int rd_pick_intra_angle_sbuv(VP10_COMP *cpi, MACROBLOCK *x,
+                                    PICK_MODE_CONTEXT *ctx,
+                                    int *rate, int *rate_tokenonly,
+                                    int64_t *distortion, int *skippable,
+                                    BLOCK_SIZE bsize, int rate_overhead,
+                                    int64_t best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  int this_rate_tokenonly, this_rate, s;
+  int64_t this_distortion, this_sse, this_rd;
+  int angle_delta, best_angle_delta = 0;
+  const double rd_adjust = 1.2;  // slack applied to best_rd for delta 0 only
+  // Picks the angle_delta[1] with the lowest chroma RD below best_rd; 1 if found.
+  (void)ctx;
+  *rate_tokenonly = INT_MAX;  // doubles as the "nothing found" marker
+  if (ANGLE_FAST_SEARCH) {
+    int deltas_level1[3] = {0, -2, 2};  // coarse pass
+    int deltas_level2[3][2] = {  // refinements, indexed by the level-1 winner
+        {-1, 1}, {-3, -1}, {1, 3},
+    };
+    const int level1 = 3, level2 = 2;
+    int i, j, best_i = -1;
+
+    for (i = 0; i < level1; ++i) {
+      mbmi->angle_delta[1] = deltas_level1[i];
+      if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
+                            &this_distortion, &s, &this_sse, bsize,
+                            (i == 0 && best_rd < INT64_MAX) ?
+                                best_rd * rd_adjust : best_rd)) {
+        if (i == 0)
+          break;  // delta 0 failed even with the relaxed cap: stop searching
+        else
+          continue;
+      }
+      this_rate = this_rate_tokenonly + rate_overhead;
+      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+      if (i == 0 && best_rd < INT64_MAX && this_rd > best_rd * rd_adjust)
+        break;  // delta 0 is too far from best_rd: skip the other deltas
+      if (this_rd < best_rd) {
+        best_i           = i;
+        best_rd          = this_rd;
+        best_angle_delta = mbmi->angle_delta[1];
+        *rate            = this_rate;
+        *rate_tokenonly  = this_rate_tokenonly;
+        *distortion      = this_distortion;
+        *skippable       = s;
+      }
+    }
+
+    if (best_i >= 0) {  // refine only around a level-1 delta that improved
+      for (j = 0; j < level2; ++j) {
+        mbmi->angle_delta[1] = deltas_level2[best_i][j];
+        if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
+                              &this_distortion, &s, &this_sse, bsize, best_rd))
+          continue;
+        this_rate = this_rate_tokenonly + rate_overhead;
+        this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+        if (this_rd < best_rd) {
+          best_rd          = this_rd;
+          best_angle_delta = mbmi->angle_delta[1];
+          *rate            = this_rate;
+          *rate_tokenonly  = this_rate_tokenonly;
+          *distortion      = this_distortion;
+          *skippable       = s;
+        }
+      }
+    }
+  } else {  // exhaustive search over every delta in [-MAX, MAX]
+    for (angle_delta = -MAX_ANGLE_DELTAS; angle_delta <= MAX_ANGLE_DELTAS;
+        ++angle_delta) {
+      mbmi->angle_delta[1] = angle_delta;
+      if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
+                            &this_distortion, &s, &this_sse, bsize, best_rd))
+        continue;
+      this_rate = this_rate_tokenonly + rate_overhead;
+      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+      if (this_rd < best_rd) {
+        best_rd          = this_rd;
+        best_angle_delta = mbmi->angle_delta[1];
+        *rate            = this_rate;
+        *rate_tokenonly  = this_rate_tokenonly;
+        *distortion      = this_distortion;
+        *skippable       = s;
+      }
+    }
+  }
+  // Re-evaluate the winner uncapped: ref_best_rd is int64_t, so use INT64_MAX.
+  mbmi->angle_delta[1] = best_angle_delta;
+  if (*rate_tokenonly != INT_MAX)
+    super_block_uvrd(cpi, x, &this_rate_tokenonly,
+                     &this_distortion, &s, &this_sse, bsize, INT64_MAX);
+  return *rate_tokenonly != INT_MAX;
+}
+#endif  // CONFIG_EXT_INTRA
+
 static int64_t rd_pick_intra_sbuv_mode(VP10_COMP *cpi, MACROBLOCK *x,
                                        PICK_MODE_CONTEXT *ctx,
                                        int *rate, int *rate_tokenonly,
                                        int64_t *distortion, int *skippable,
                                        BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
   MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   PREDICTION_MODE mode;
   PREDICTION_MODE mode_selected = DC_PRED;
   int64_t best_rd = INT64_MAX, this_rd;
   int this_rate_tokenonly, this_rate, s;
   int64_t this_distortion, this_sse;
+#if CONFIG_EXT_INTRA
+  int is_directional_mode, rate_overhead, best_angle_delta = 0;
+  EXT_INTRA_MODE_INFO ext_intra_mode_info;
 
+  ext_intra_mode_info.use_ext_intra_mode[1] = 0;
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
+#endif  // CONFIG_EXT_INTRA
   memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+  xd->mi[0]->mbmi.palette_mode_info.palette_size[1] = 0;
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
       continue;
 
-    xd->mi[0]->mbmi.uv_mode = mode;
-
+    mbmi->uv_mode = mode;
+#if CONFIG_EXT_INTRA
+    is_directional_mode = (mode != DC_PRED && mode != TM_PRED);
+    rate_overhead = cpi->intra_uv_mode_cost[mbmi->mode][mode] +
+        write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1, 0);
+    mbmi->angle_delta[1] = 0;
+    if (mbmi->sb_type >= BLOCK_8X8 && is_directional_mode) {
+      if (!rd_pick_intra_angle_sbuv(cpi, x, ctx, &this_rate,
+                                    &this_rate_tokenonly, &this_distortion, &s,
+                                    bsize, rate_overhead, best_rd))
+        continue;
+    } else {
+      if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
+                            &this_distortion, &s, &this_sse, bsize, best_rd))
+        continue;
+    }
+    this_rate = this_rate_tokenonly +
+        cpi->intra_uv_mode_cost[mbmi->mode][mode];
+    if (mbmi->sb_type >= BLOCK_8X8 && is_directional_mode)
+      this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1,
+                                      MAX_ANGLE_DELTAS +
+                                      mbmi->angle_delta[1]);
+    if (mode == DC_PRED && 0)
+      this_rate += vp10_cost_bit(cpi->common.fc->ext_intra_probs[1], 0);
+#else
     if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
                           &this_distortion, &s, &this_sse, bsize, best_rd))
       continue;
     this_rate = this_rate_tokenonly +
         cpi->intra_uv_mode_cost[xd->mi[0]->mbmi.mode][mode];
+#endif  // CONFIG_EXT_INTRA
+
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
     if (this_rd < best_rd) {
       mode_selected   = mode;
+#if CONFIG_EXT_INTRA
+      best_angle_delta = mbmi->angle_delta[1];
+#endif  // CONFIG_EXT_INTRA
       best_rd         = this_rd;
       *rate           = this_rate;
       *rate_tokenonly = this_rate_tokenonly;
@@ -1273,7 +3056,23 @@
     }
   }
 
-  xd->mi[0]->mbmi.uv_mode = mode_selected;
+#if CONFIG_EXT_INTRA
+  if (mbmi->sb_type >= BLOCK_8X8 && ALLOW_FILTER_INTRA_MODES) {
+    if (rd_pick_ext_intra_sbuv(cpi, x, ctx, rate, rate_tokenonly, distortion,
+                               skippable, bsize, &best_rd)) {
+      mode_selected   = mbmi->uv_mode;
+      ext_intra_mode_info = mbmi->ext_intra_mode_info;
+    }
+  }
+
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[1] =
+      ext_intra_mode_info.use_ext_intra_mode[1];
+  if (ext_intra_mode_info.use_ext_intra_mode[1])
+    mbmi->ext_intra_mode_info.ext_intra_mode[1] =
+        ext_intra_mode_info.ext_intra_mode[1];
+  mbmi->angle_delta[1] = best_angle_delta;
+#endif  // CONFIG_EXT_INTRA
+  mbmi->uv_mode = mode_selected;
   return best_rd;
 }
 
@@ -1314,9 +3113,46 @@
 }
 
 static int cost_mv_ref(const VP10_COMP *cpi, PREDICTION_MODE mode,
-                       int mode_context) {
+                       int16_t mode_context) {
+#if CONFIG_REF_MV
+  int mode_cost = 0;
+  int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
+  int16_t is_all_zero_mv = mode_context & (1 << ALL_ZERO_FLAG_OFFSET);
+
+  assert(is_inter_mode(mode));
+  // Costs are summed stage by stage: NEWMV?, then ZEROMV?, then NEAREST/NEAR.
+  if (mode == NEWMV) {
+    mode_cost = cpi->newmv_mode_cost[mode_ctx][0];
+    return mode_cost;
+  } else {
+    mode_cost = cpi->newmv_mode_cost[mode_ctx][1];
+    mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
+
+    if (is_all_zero_mv)
+      return mode_cost;  // flag set: no ZEROMV/REFMV stage cost is added
+
+    if (mode == ZEROMV) {
+      mode_cost += cpi->zeromv_mode_cost[mode_ctx][0];
+      return mode_cost;
+    } else {
+      mode_cost += cpi->zeromv_mode_cost[mode_ctx][1];
+      mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+      // Skip flags replace the context with fixed indices; the last match wins.
+      if (mode_context & (1 << SKIP_NEARESTMV_OFFSET))
+        mode_ctx = 6;
+      if (mode_context & (1 << SKIP_NEARMV_OFFSET))
+        mode_ctx = 7;
+      if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET))
+        mode_ctx = 8;
+
+      mode_cost += cpi->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
+      return mode_cost;
+    }
+  }
+#else
   assert(is_inter_mode(mode));
   return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
+#endif
 }
 
 static int set_and_cost_bmi_mvs(VP10_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
@@ -1334,6 +3170,7 @@
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
   const int is_compound = has_second_ref(mbmi);
+  int mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
 
   switch (mode) {
     case NEWMV:
@@ -1371,8 +3208,11 @@
     for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
       memmove(&mic->bmi[i + idy * 2 + idx], &mic->bmi[i], sizeof(mic->bmi[i]));
 
-  return cost_mv_ref(cpi, mode, mbmi_ext->mode_context[mbmi->ref_frame[0]]) +
-            thismvcost;
+#if CONFIG_REF_MV
+  mode_ctx = vp10_mode_context_analyzer(mbmi_ext->mode_context,
+                                        mbmi->ref_frame, mbmi->sb_type, i);
+#endif
+  return cost_mv_ref(cpi, mode, mode_ctx) + thismvcost;
 }
 
 static int64_t encode_inter_mb_segment(VP10_COMP *cpi,
@@ -1402,8 +3242,8 @@
                                                             pd->dst.stride)];
   int64_t thisdistortion = 0, thissse = 0;
   int thisrate = 0;
-  TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, i);
-  const scan_order *so = get_scan(TX_4X4, tx_type);
+  TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, i, TX_4X4);
+  const scan_order *so = get_scan(TX_4X4, tx_type, 1);
 
   vp10_build_inter_predictor_sub8x8(xd, 0, i, ir, ic, mi_row, mi_col);
 
@@ -1439,8 +3279,14 @@
     for (idx = 0; idx < width / 4; ++idx) {
       int64_t ssz, rd, rd1, rd2;
       tran_low_t* coeff;
-
+#if CONFIG_VAR_TX
+      int coeff_ctx;
+#endif
       k += (idy * 2 + idx);
+#if CONFIG_VAR_TX
+      coeff_ctx = combine_entropy_contexts(*(ta + (k & 1)),
+                                           *(tl + (k >> 1)));
+#endif
       coeff = BLOCK_OFFSET(p->coeff, k);
       fwd_txm4x4(vp10_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
                  coeff, 8);
@@ -1459,9 +3305,19 @@
                                         16, &ssz);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       thissse += ssz;
-      thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
+#if CONFIG_VAR_TX
+      thisrate += cost_coeffs(x, 0, k, coeff_ctx,
+                              TX_4X4,
                               so->scan, so->neighbors,
                               cpi->sf.use_fast_coef_costing);
+      *(ta + (k & 1)) = !(p->eobs[k] == 0);
+      *(tl + (k >> 1)) = !(p->eobs[k] == 0);
+#else
+      thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1),
+                              TX_4X4,
+                              so->scan, so->neighbors,
+                              cpi->sf.use_fast_coef_costing);
+#endif
       rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
       rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
       rd = VPXMIN(rd1, rd2);
@@ -1534,25 +3390,32 @@
     x->e_mbd.plane[0].pre[1] = orig_pre[1];
 }
 
-static INLINE int mv_has_subpel(const MV *mv) {
-  return (mv->row & 0x0F) || (mv->col & 0x0F);
-}
-
 // Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way encode zero motion.
 // TODO(aconverse): Find out if this is still productive then clean up or remove
 static int check_best_zero_mv(
-    const VP10_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
+    const VP10_COMP *cpi, const int16_t mode_context[MAX_REF_FRAMES],
     int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int this_mode,
-    const MV_REFERENCE_FRAME ref_frames[2]) {
+    const MV_REFERENCE_FRAME ref_frames[2],
+    const BLOCK_SIZE bsize, int block) {
   if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
       frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
       (ref_frames[1] == NONE ||
        frame_mv[this_mode][ref_frames[1]].as_int == 0)) {
-    int rfc = mode_context[ref_frames[0]];
+#if CONFIG_REF_MV
+    int16_t rfc = vp10_mode_context_analyzer(mode_context,
+                                             ref_frames, bsize, block);
+#else
+    int16_t rfc = mode_context[ref_frames[0]];
+#endif
     int c1 = cost_mv_ref(cpi, NEARMV, rfc);
     int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
     int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
 
+#if !CONFIG_REF_MV
+    (void)bsize;
+    (void)block;
+#endif
+
     if (this_mode == NEARMV) {
       if (c1 > c3) return 0;
     } else if (this_mode == NEARESTMV) {
@@ -1630,11 +3493,11 @@
   // frame we must use a unit scaling factor during mode selection.
 #if CONFIG_VP9_HIGHBITDEPTH
   vp10_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
-                                    cm->width, cm->height,
-                                    cm->use_highbitdepth);
+                                     cm->width, cm->height,
+                                     cm->use_highbitdepth);
 #else
   vp10_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
-                                    cm->width, cm->height);
+                                     cm->width, cm->height);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Allow joint search multiple times iteratively for each reference frame
@@ -1832,8 +3695,7 @@
         frame_mv[ZEROMV][frame].as_int = 0;
         vp10_append_sub8x8_mvs_for_idx(cm, xd, i, ref, mi_row, mi_col,
                                       &frame_mv[NEARESTMV][frame],
-                                      &frame_mv[NEARMV][frame],
-                                      mbmi_ext->mode_context);
+                                      &frame_mv[NEARMV][frame]);
       }
 
       // search for the best motion vector on this segment
@@ -1847,7 +3709,7 @@
           continue;
 
         if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv,
-                                this_mode, mbmi->ref_frame))
+                                this_mode, mbmi->ref_frame, bsize, i))
           continue;
 
         memcpy(orig_pre, pd->pre, sizeof(orig_pre));
@@ -2151,34 +4013,108 @@
     if (cm->reference_mode != COMPOUND_REFERENCE) {
       vpx_prob ref_single_p1 = vp10_get_pred_prob_single_ref_p1(cm, xd);
       vpx_prob ref_single_p2 = vp10_get_pred_prob_single_ref_p2(cm, xd);
+#if CONFIG_EXT_REFS
+      vpx_prob ref_single_p3 = vp10_get_pred_prob_single_ref_p3(cm, xd);
+      vpx_prob ref_single_p4 = vp10_get_pred_prob_single_ref_p4(cm, xd);
+      vpx_prob ref_single_p5 = vp10_get_pred_prob_single_ref_p5(cm, xd);
+#endif  // CONFIG_EXT_REFS
       unsigned int base_cost = vp10_cost_bit(intra_inter_p, 1);
 
       if (cm->reference_mode == REFERENCE_MODE_SELECT)
         base_cost += vp10_cost_bit(comp_inter_p, 0);
 
-      ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
+      ref_costs_single[LAST_FRAME] =
+#if CONFIG_EXT_REFS
+          ref_costs_single[LAST2_FRAME] =
+          ref_costs_single[LAST3_FRAME] =
+          ref_costs_single[LAST4_FRAME] =
+#endif  // CONFIG_EXT_REFS
+          ref_costs_single[GOLDEN_FRAME] =
           ref_costs_single[ALTREF_FRAME] = base_cost;
+
+#if CONFIG_EXT_REFS
+      ref_costs_single[LAST_FRAME]   += vp10_cost_bit(ref_single_p1, 0);
+      ref_costs_single[LAST2_FRAME]  += vp10_cost_bit(ref_single_p1, 0);
+      ref_costs_single[LAST3_FRAME]  += vp10_cost_bit(ref_single_p1, 0);
+      ref_costs_single[LAST4_FRAME]  += vp10_cost_bit(ref_single_p1, 0);
+      ref_costs_single[GOLDEN_FRAME] += vp10_cost_bit(ref_single_p1, 1);
+      ref_costs_single[ALTREF_FRAME] += vp10_cost_bit(ref_single_p1, 1);
+
+      ref_costs_single[LAST_FRAME]   += vp10_cost_bit(ref_single_p3, 0);
+      ref_costs_single[LAST2_FRAME]  += vp10_cost_bit(ref_single_p3, 0);
+      ref_costs_single[LAST3_FRAME]  += vp10_cost_bit(ref_single_p3, 1);
+      ref_costs_single[LAST4_FRAME]  += vp10_cost_bit(ref_single_p3, 1);
+      ref_costs_single[GOLDEN_FRAME] += vp10_cost_bit(ref_single_p2, 0);
+      ref_costs_single[ALTREF_FRAME] += vp10_cost_bit(ref_single_p2, 1);
+
+      ref_costs_single[LAST_FRAME]   += vp10_cost_bit(ref_single_p4, 0);
+      ref_costs_single[LAST2_FRAME]  += vp10_cost_bit(ref_single_p4, 1);
+      ref_costs_single[LAST3_FRAME]  += vp10_cost_bit(ref_single_p5, 0);
+      ref_costs_single[LAST4_FRAME]  += vp10_cost_bit(ref_single_p5, 1);
+#else
       ref_costs_single[LAST_FRAME]   += vp10_cost_bit(ref_single_p1, 0);
       ref_costs_single[GOLDEN_FRAME] += vp10_cost_bit(ref_single_p1, 1);
       ref_costs_single[ALTREF_FRAME] += vp10_cost_bit(ref_single_p1, 1);
       ref_costs_single[GOLDEN_FRAME] += vp10_cost_bit(ref_single_p2, 0);
       ref_costs_single[ALTREF_FRAME] += vp10_cost_bit(ref_single_p2, 1);
+#endif  // CONFIG_EXT_REFS
     } else {
       ref_costs_single[LAST_FRAME]   = 512;
+#if CONFIG_EXT_REFS
+      ref_costs_single[LAST2_FRAME]  = 512;
+      ref_costs_single[LAST3_FRAME]  = 512;
+      ref_costs_single[LAST4_FRAME]  = 512;
+#endif  // CONFIG_EXT_REFS
       ref_costs_single[GOLDEN_FRAME] = 512;
       ref_costs_single[ALTREF_FRAME] = 512;
     }
+
     if (cm->reference_mode != SINGLE_REFERENCE) {
       vpx_prob ref_comp_p = vp10_get_pred_prob_comp_ref_p(cm, xd);
+#if CONFIG_EXT_REFS
+      vpx_prob ref_comp_p1 = vp10_get_pred_prob_comp_ref_p1(cm, xd);
+      vpx_prob ref_comp_p2 = vp10_get_pred_prob_comp_ref_p2(cm, xd);
+      vpx_prob ref_comp_p3 = vp10_get_pred_prob_comp_ref_p3(cm, xd);
+#endif  // CONFIG_EXT_REFS
       unsigned int base_cost = vp10_cost_bit(intra_inter_p, 1);
 
       if (cm->reference_mode == REFERENCE_MODE_SELECT)
         base_cost += vp10_cost_bit(comp_inter_p, 1);
 
-      ref_costs_comp[LAST_FRAME]   = base_cost + vp10_cost_bit(ref_comp_p, 0);
-      ref_costs_comp[GOLDEN_FRAME] = base_cost + vp10_cost_bit(ref_comp_p, 1);
+      ref_costs_comp[LAST_FRAME] =
+#if CONFIG_EXT_REFS
+          ref_costs_comp[LAST2_FRAME] =
+          ref_costs_comp[LAST3_FRAME] =
+          ref_costs_comp[LAST4_FRAME] =
+#endif  // CONFIG_EXT_REFS
+          ref_costs_comp[GOLDEN_FRAME] = base_cost;
+
+#if CONFIG_EXT_REFS
+      ref_costs_comp[LAST_FRAME]   += vp10_cost_bit(ref_comp_p, 0);
+      ref_costs_comp[LAST2_FRAME]  += vp10_cost_bit(ref_comp_p, 0);
+      ref_costs_comp[LAST3_FRAME]  += vp10_cost_bit(ref_comp_p, 1);
+      ref_costs_comp[LAST4_FRAME]  += vp10_cost_bit(ref_comp_p, 1);
+      ref_costs_comp[GOLDEN_FRAME] += vp10_cost_bit(ref_comp_p, 1);
+
+      ref_costs_comp[LAST_FRAME]   += vp10_cost_bit(ref_comp_p1, 1);
+      ref_costs_comp[LAST2_FRAME]  += vp10_cost_bit(ref_comp_p1, 0);
+      ref_costs_comp[LAST3_FRAME]  += vp10_cost_bit(ref_comp_p2, 0);
+      ref_costs_comp[LAST4_FRAME]  += vp10_cost_bit(ref_comp_p2, 0);
+      ref_costs_comp[GOLDEN_FRAME] += vp10_cost_bit(ref_comp_p2, 1);
+
+      ref_costs_comp[LAST3_FRAME]  += vp10_cost_bit(ref_comp_p3, 1);
+      ref_costs_comp[LAST4_FRAME]  += vp10_cost_bit(ref_comp_p3, 0);
+#else
+      ref_costs_comp[LAST_FRAME]   += vp10_cost_bit(ref_comp_p, 0);
+      ref_costs_comp[GOLDEN_FRAME] += vp10_cost_bit(ref_comp_p, 1);
+#endif  // CONFIG_EXT_REFS
     } else {
       ref_costs_comp[LAST_FRAME]   = 512;
+#if CONFIG_EXT_REFS
+      ref_costs_comp[LAST2_FRAME]  = 512;
+      ref_costs_comp[LAST3_FRAME]  = 512;
+      ref_costs_comp[LAST4_FRAME]  = 512;
+#endif  // CONFIG_EXT_REFS
       ref_costs_comp[GOLDEN_FRAME] = 512;
     }
   }
@@ -2206,13 +4142,14 @@
          sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
 }
 
-static void setup_buffer_inter(VP10_COMP *cpi, MACROBLOCK *x,
-                               MV_REFERENCE_FRAME ref_frame,
-                               BLOCK_SIZE block_size,
-                               int mi_row, int mi_col,
-                               int_mv frame_nearest_mv[MAX_REF_FRAMES],
-                               int_mv frame_near_mv[MAX_REF_FRAMES],
-                               struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
+static void setup_buffer_inter(
+    VP10_COMP *cpi, MACROBLOCK *x,
+    MV_REFERENCE_FRAME ref_frame,
+    BLOCK_SIZE block_size,
+    int mi_row, int mi_col,
+    int_mv frame_nearest_mv[MAX_REF_FRAMES],
+    int_mv frame_near_mv[MAX_REF_FRAMES],
+    struct buf_2d yv12_mb[MAX_REF_FRAMES][MAX_MB_PLANE]) {
   const VP10_COMMON *cm = &cpi->common;
   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -2228,8 +4165,13 @@
   vp10_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
 
   // Gets an initial list of candidate vectors from neighbours and orders them
-  vp10_find_mv_refs(cm, xd, mi, ref_frame, candidates, mi_row, mi_col,
-                   NULL, NULL, mbmi_ext->mode_context);
+  vp10_find_mv_refs(cm, xd, mi, ref_frame,
+#if CONFIG_REF_MV
+                    &mbmi_ext->ref_mv_count[ref_frame],
+                    mbmi_ext->ref_mv_stack[ref_frame],
+#endif
+                    candidates, mi_row, mi_col,
+                    NULL, NULL, mbmi_ext->mode_context);
 
   // Candidate refinement carried out at encoder and decoder
   vp10_find_best_ref_mvs(cm->allow_high_precision_mv, candidates,
@@ -2372,8 +4314,6 @@
   }
 }
 
-
-
 static INLINE void restore_dst_buf(MACROBLOCKD *xd,
                                    uint8_t *orig_dst[MAX_MB_PLANE],
                                    int orig_dst_stride[MAX_MB_PLANE]) {
@@ -2468,6 +4408,12 @@
   int skip_txfm_sb = 0;
   int64_t skip_sse_sb = INT64_MAX;
   int64_t distortion_y = 0, distortion_uv = 0;
+  int16_t mode_ctx = mbmi_ext->mode_context[refs[0]];
+
+#if CONFIG_REF_MV
+  mode_ctx = vp10_mode_context_analyzer(mbmi_ext->mode_context,
+                                        mbmi->ref_frame, bsize, -1);
+#endif
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -2572,12 +4518,10 @@
   // initiation of a motion field.
   if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]],
                           mode_mv, refs[0])) {
-    *rate2 += VPXMIN(cost_mv_ref(cpi, this_mode,
-                                 mbmi_ext->mode_context[refs[0]]),
-                     cost_mv_ref(cpi, NEARESTMV,
-                                 mbmi_ext->mode_context[refs[0]]));
+    *rate2 += VPXMIN(cost_mv_ref(cpi, this_mode, mode_ctx),
+                     cost_mv_ref(cpi, NEARESTMV, mode_ctx));
   } else {
-    *rate2 += cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]);
+    *rate2 += cost_mv_ref(cpi, this_mode, mode_ctx);
   }
 
   if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
@@ -2598,6 +4542,10 @@
   if (cm->interp_filter != BILINEAR) {
     if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
       best_filter = EIGHTTAP;
+#if CONFIG_EXT_INTERP
+    } else if (!vp10_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE) {
+      best_filter = EIGHTTAP;
+#endif
     } else if (best_filter == SWITCHABLE) {
       int newbest;
       int tmp_rate_sum = 0;
@@ -2613,7 +4561,7 @@
         rs = vp10_get_switchable_rate(cpi, xd);
         rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
 
-        if (i > 0 && intpel_mv) {
+        if (i > 0 && intpel_mv && IsInterpolatingFilter(i)) {
           rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
           filter_cache[i] = rd;
           filter_cache[SWITCHABLE_FILTERS] =
@@ -2635,7 +4583,7 @@
                (!i || best_needs_copy)) ||
               (cm->interp_filter != SWITCHABLE &&
                (cm->interp_filter == mbmi->interp_filter ||
-                (i == 0 && intpel_mv)))) {
+                (i == 0 && intpel_mv && IsInterpolatingFilter(i))))) {
             restore_dst_buf(xd, orig_dst, orig_dst_stride);
           } else {
             for (j = 0; j < MAX_MB_PLANE; j++) {
@@ -2655,7 +4603,7 @@
             rd += rs_rd;
           *mask_filter = VPXMAX(*mask_filter, rd);
 
-          if (i == 0 && intpel_mv) {
+          if (i == 0 && intpel_mv && IsInterpolatingFilter(i)) {
             tmp_rate_sum = rate_sum;
             tmp_dist_sum = dist_sum;
           }
@@ -2672,7 +4620,8 @@
         if (newbest) {
           best_rd = rd;
           best_filter = mbmi->interp_filter;
-          if (cm->interp_filter == SWITCHABLE && i && !intpel_mv)
+          if (cm->interp_filter == SWITCHABLE && i &&
+              !(intpel_mv && IsInterpolatingFilter(i)))
             best_needs_copy = !best_needs_copy;
         }
 
@@ -2691,6 +4640,7 @@
       restore_dst_buf(xd, orig_dst, orig_dst_stride);
     }
   }
+
   // Set the appropriate filter
   mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
       cm->interp_filter : best_filter;
@@ -2750,8 +4700,27 @@
 
     // Y cost and distortion
     vp10_subtract_plane(x, bsize, 0);
+#if CONFIG_VAR_TX
+    if (cm->tx_mode == TX_MODE_SELECT || xd->lossless[mbmi->segment_id]) {
+#if CONFIG_EXT_TX
+      select_tx_type_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
+                         bsize, ref_best_rd);
+#else
+      inter_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
+                      bsize, ref_best_rd);
+#endif
+    } else {
+      int idx, idy;
+      super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
+                      bsize, ref_best_rd);
+      for (idy = 0; idy < xd->n8_h; ++idy)
+        for (idx = 0; idx < xd->n8_w; ++idx)
+          mbmi->inter_tx_size[idy * 8 + idx] = mbmi->tx_size;
+    }
+#else
     super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
                     bsize, ref_best_rd);
+#endif
 
     if (*rate_y == INT_MAX) {
       *rate2 = INT_MAX;
@@ -2766,8 +4735,13 @@
     rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
     rdcosty = VPXMIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
 
+#if CONFIG_VAR_TX
+    if (!inter_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
+                          &sseuv, bsize, ref_best_rd - rdcosty)) {
+#else
     if (!super_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
                           &sseuv, bsize, ref_best_rd - rdcosty)) {
+#endif
       *rate2 = INT_MAX;
       *distortion = INT64_MAX;
       restore_dst_buf(xd, orig_dst, orig_dst_stride);
@@ -2980,7 +4954,11 @@
                                 TileDataEnc *tile_data,
                                 MACROBLOCK *x,
                                 int mi_row, int mi_col,
-                                RD_COST *rd_cost, BLOCK_SIZE bsize,
+                                RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+                                int *returnrate_nocoef,
+#endif  // CONFIG_SUPERTX
+                                BLOCK_SIZE bsize,
                                 PICK_MODE_CONTEXT *ctx,
                                 int64_t best_rd_so_far) {
   VP10_COMMON *const cm = &cpi->common;
@@ -2995,12 +4973,21 @@
   unsigned char segment_id = mbmi->segment_id;
   int comp_pred, i, k;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  struct buf_2d yv12_mb[MAX_REF_FRAMES][MAX_MB_PLANE];
   int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
   INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES];
   int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
+  static const int flag_list[REFS_PER_FRAME + 1] = {
+    0,
+    VP9_LAST_FLAG,
+#if CONFIG_EXT_REFS
+    VP9_LAST2_FLAG,
+    VP9_LAST3_FLAG,
+    VP9_LAST4_FLAG,
+#endif  // CONFIG_EXT_REFS
+    VP9_GOLD_FLAG,
+    VP9_ALT_FLAG
+  };
   int64_t best_rd = best_rd_so_far;
   int64_t best_pred_diff[REFERENCE_MODES];
   int64_t best_pred_rd[REFERENCE_MODES];
@@ -3018,8 +5005,17 @@
   int64_t dist_uv[TX_SIZES];
   int skip_uv[TX_SIZES];
   PREDICTION_MODE mode_uv[TX_SIZES];
+#if CONFIG_EXT_INTRA
+  EXT_INTRA_MODE_INFO ext_intra_mode_info_uv[TX_SIZES];
+  int8_t uv_angle_delta[TX_SIZES];
+  int is_directional_mode, angle_stats_ready = 0;
+  int rate_overhead, rate_dummy;
+  uint8_t directional_mode_skip_mask[INTRA_MODES];
+#endif  // CONFIG_EXT_INTRA
   const int intra_cost_penalty = vp10_get_intra_cost_penalty(
       cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+  const int * const intra_mode_cost =
+      cpi->mbmode_cost[size_group_lookup[bsize]];
   int best_skip2 = 0;
   uint8_t ref_frame_skip_mask[2] = { 0 };
   uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
@@ -3034,6 +5030,11 @@
 
   vp10_zero(best_mbmode);
 
+#if CONFIG_EXT_INTRA
+  memset(directional_mode_skip_mask, 0,
+         sizeof(directional_mode_skip_mask[0]) * INTRA_MODES);
+#endif  // CONFIG_EXT_INTRA
+
   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
     filter_cache[i] = INT64_MAX;
 
@@ -3056,9 +5057,13 @@
   }
 
   rd_cost->rate = INT_MAX;
+#if CONFIG_SUPERTX
+  *returnrate_nocoef = INT_MAX;
+#endif  // CONFIG_SUPERTX
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
+    x->mbmi_ext->mode_context[ref_frame] = 0;
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
       setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
@@ -3102,7 +5107,14 @@
     // an unfiltered alternative. We allow near/nearest as well
     // because they may result in zero-zero MVs but be cheaper.
     if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-      ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << GOLDEN_FRAME);
+      ref_frame_skip_mask[0] =
+          (1 << LAST_FRAME) |
+#if CONFIG_EXT_REFS
+          (1 << LAST2_FRAME) |
+          (1 << LAST3_FRAME) |
+          (1 << LAST4_FRAME) |
+#endif  // CONFIG_EXT_REFS
+          (1 << GOLDEN_FRAME);
       ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
       mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
       if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
@@ -3159,6 +5171,8 @@
     midx = end_pos;
   }
 
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
   for (midx = 0; midx < MAX_MODES; ++midx) {
     int mode_index = mode_map[midx];
     int mode_excluded = 0;
@@ -3186,6 +5200,20 @@
           ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK;
           ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
           break;
+#if CONFIG_EXT_REFS
+        case LAST2_FRAME:
+          ref_frame_skip_mask[0] |= LAST2_FRAME_MODE_MASK;
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+          break;
+        case LAST3_FRAME:
+          ref_frame_skip_mask[0] |= LAST3_FRAME_MODE_MASK;
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+          break;
+        case LAST4_FRAME:
+          ref_frame_skip_mask[0] |= LAST4_FRAME_MODE_MASK;
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+          break;
+#endif  // CONFIG_EXT_REFS
         case GOLDEN_FRAME:
           ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
           ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
@@ -3267,7 +5295,7 @@
     } else {
       const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
       if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv,
-                              this_mode, ref_frames))
+                              this_mode, ref_frames, bsize, -1))
         continue;
     }
 
@@ -3275,6 +5303,10 @@
     mbmi->uv_mode = DC_PRED;
     mbmi->ref_frame[0] = ref_frame;
     mbmi->ref_frame[1] = second_ref_frame;
+#if CONFIG_EXT_INTRA
+    mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+    mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
+#endif  // CONFIG_EXT_INTRA
     // Evaluate all sub-pel filters irrespective of whether we can use
     // them for this frame.
     mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
@@ -3295,25 +5327,127 @@
       TX_SIZE uv_tx;
       struct macroblockd_plane *const pd = &xd->plane[1];
       memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+
+#if CONFIG_EXT_INTRA
+      is_directional_mode = (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED);
+      if (is_directional_mode) {
+        if (!angle_stats_ready) {
+          const int src_stride = x->plane[0].src.stride;
+          const uint8_t *src = x->plane[0].src.buf;
+          const int rows = 4 * num_4x4_blocks_high_lookup[bsize];
+          const int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
+          double hist[DIRECTIONAL_MODES];
+          PREDICTION_MODE mode;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+            highbd_angle_estimation(src, src_stride, rows, cols, hist);
+          else
+#endif
+            angle_estimation(src, src_stride, rows, cols, hist);
+          for (mode = 0; mode < INTRA_MODES; ++mode) {
+            if (mode != DC_PRED && mode != TM_PRED) {
+              int index = get_angle_index((double)mode_to_angle_map[mode]);
+              double score, weight = 1.0;
+              score = hist[index];
+              if (index > 0) {
+                score += hist[index - 1] * 0.5;
+                weight += 0.5;
+              }
+              if (index < DIRECTIONAL_MODES - 1) {
+                score += hist[index + 1] * 0.5;
+                weight += 0.5;
+              }
+              score /= weight;
+              if (score < ANGLE_SKIP_THRESH)
+                directional_mode_skip_mask[mode] = 1;
+            }
+          }
+          angle_stats_ready = 1;
+        }
+        if (directional_mode_skip_mask[mbmi->mode])
+          continue;
+        rate_overhead = write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1, 0) +
+            intra_mode_cost[mbmi->mode];
+        rate_y = INT_MAX;
+        this_rd =
+            rd_pick_intra_angle_sby(cpi, x, &rate_dummy, &rate_y, &distortion_y,
+                                    &skippable, bsize, rate_overhead, best_rd);
+      } else {
+        mbmi->angle_delta[0] = 0;
+        super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+                        NULL, bsize, best_rd);
+      }
+
+      // TODO(huisu): ext-intra is turned off in lossless mode for now to
+      // avoid a unit test failure
+      if (mbmi->mode == DC_PRED && !xd->lossless[mbmi->segment_id] &&
+          ALLOW_FILTER_INTRA_MODES) {
+        MB_MODE_INFO mbmi_copy = *mbmi;
+
+        if (rate_y != INT_MAX) {
+          int this_rate = rate_y + intra_mode_cost[mbmi->mode] +
+              vp10_cost_bit(cm->fc->ext_intra_probs[0], 0);
+          this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, distortion_y);
+        } else {
+          this_rd = best_rd;
+        }
+
+        if (!rd_pick_ext_intra_sby(cpi, x, &rate_dummy, &rate_y, &distortion_y,
+                                   &skippable, bsize,
+                                   intra_mode_cost[mbmi->mode], &this_rd))
+          *mbmi = mbmi_copy;
+      }
+#else
       super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
                       NULL, bsize, best_rd);
+#endif  // CONFIG_EXT_INTRA
+
       if (rate_y == INT_MAX)
         continue;
-
       uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd->subsampling_x,
                                   pd->subsampling_y);
       if (rate_uv_intra[uv_tx] == INT_MAX) {
         choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx,
                              &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
                              &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
+#if CONFIG_EXT_INTRA
+        ext_intra_mode_info_uv[uv_tx] = mbmi->ext_intra_mode_info;
+        uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
+#endif  // CONFIG_EXT_INTRA
       }
 
       rate_uv = rate_uv_tokenonly[uv_tx];
       distortion_uv = dist_uv[uv_tx];
       skippable = skippable && skip_uv[uv_tx];
       mbmi->uv_mode = mode_uv[uv_tx];
+#if CONFIG_EXT_INTRA
+      mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
+      mbmi->ext_intra_mode_info.use_ext_intra_mode[1] =
+          ext_intra_mode_info_uv[uv_tx].use_ext_intra_mode[1];
+      if (ext_intra_mode_info_uv[uv_tx].use_ext_intra_mode[1]) {
+        mbmi->ext_intra_mode_info.ext_intra_mode[1] =
+            ext_intra_mode_info_uv[uv_tx].ext_intra_mode[1];
+      }
+#endif  // CONFIG_EXT_INTRA
 
-      rate2 = rate_y + cpi->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
+      rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
+#if CONFIG_EXT_INTRA
+      if (is_directional_mode)
+        rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1,
+                                    MAX_ANGLE_DELTAS +
+                                    mbmi->angle_delta[0]);
+
+      if (mbmi->mode == DC_PRED && ALLOW_FILTER_INTRA_MODES) {
+        rate2 += vp10_cost_bit(cm->fc->ext_intra_probs[0],
+                               mbmi->ext_intra_mode_info.use_ext_intra_mode[0]);
+        if (mbmi->ext_intra_mode_info.use_ext_intra_mode[0]) {
+          EXT_INTRA_MODE ext_intra_mode =
+              mbmi->ext_intra_mode_info.ext_intra_mode[0];
+          rate2 += write_uniform_cost(FILTER_INTRA_MODES, ext_intra_mode);
+        }
+      }
+#endif  // CONFIG_EXT_INTRA
       if (this_mode != DC_PRED && this_mode != TM_PRED)
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
@@ -3326,6 +5460,7 @@
                                   single_newmv, single_inter_filter,
                                   single_skippable, &total_sse, best_rd,
                                   &mask_filter, filter_cache);
+
       if (this_rd == INT64_MAX)
         continue;
 
@@ -3347,9 +5482,11 @@
       if (skippable) {
         // Back out the coefficient coding costs
         rate2 -= (rate_y + rate_uv);
-
+        rate_y = 0;
+        rate_uv = 0;
         // Cost the skip mb case
         rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+
       } else if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) {
         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
@@ -3362,6 +5499,8 @@
           assert(total_sse >= 0);
           rate2 -= (rate_y + rate_uv);
           this_skip2 = 1;
+          rate_y = 0;
+          rate_uv = 0;
         }
       } else {
         // Add in the cost of the no skip flag.
@@ -3408,6 +5547,15 @@
         }
 
         rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+        *returnrate_nocoef = rate2 - rate_y - rate_uv;
+        if (!disable_skip) {
+          *returnrate_nocoef -= vp10_cost_bit(vp10_get_skip_prob(cm, xd),
+                                              skippable || this_skip2);
+        }
+        *returnrate_nocoef -= vp10_cost_bit(vp10_get_intra_inter_prob(cm, xd),
+                                            mbmi->ref_frame[0] != INTRA_FRAME);
+#endif  // CONFIG_SUPERTX
         rd_cost->dist = distortion2;
         rd_cost->rdcost = this_rd;
         best_rd = this_rd;
@@ -3417,8 +5565,15 @@
 
         if (!x->select_tx_size)
           swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
+
+#if CONFIG_VAR_TX
+        for (i = 0; i < MAX_MB_PLANE; ++i)
+          memcpy(ctx->blk_skip[i], x->blk_skip[i],
+                 sizeof(uint8_t) * ctx->num_4x4_blk);
+#else
         memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
                sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+#endif
 
         // TODO(debargha): enhance this test with a better distortion prediction
         // based on qp, activity mask and history
@@ -3522,6 +5677,19 @@
       best_mbmode.mode = ZEROMV;
   }
 
+#if CONFIG_REF_MV
+  if (best_mbmode.ref_frame[0] > INTRA_FRAME &&
+      best_mbmode.mv[0].as_int == 0 &&
+      (best_mbmode.ref_frame[1] == NONE || best_mbmode.mv[1].as_int == 0)) {
+    int16_t mode_ctx = mbmi_ext->mode_context[best_mbmode.ref_frame[0]];
+    if (best_mbmode.ref_frame[1] > NONE)
+      mode_ctx &= (mbmi_ext->mode_context[best_mbmode.ref_frame[1]] | 0x00ff);
+
+    if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET))
+      best_mbmode.mode = ZEROMV;
+  }
+#endif
+
   if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
     rd_cost->rate = INT_MAX;
     rd_cost->rdcost = INT64_MAX;
@@ -3638,6 +5806,12 @@
 
   assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
 
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+#if CONFIG_EXT_INTRA
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
+#endif  // CONFIG_EXT_INTRA
   mbmi->mode = ZEROMV;
   mbmi->uv_mode = DC_PRED;
   mbmi->ref_frame[0] = LAST_FRAME;
@@ -3648,6 +5822,9 @@
   if (cm->interp_filter != BILINEAR) {
     best_filter = EIGHTTAP;
     if (cm->interp_filter == SWITCHABLE &&
+#if CONFIG_EXT_INTERP
+        vp10_is_interp_needed(xd) &&
+#endif  // CONFIG_EXT_INTERP
         x->source_variance >= cpi->sf.disable_filter_search_var_thresh) {
       int rs;
       int best_rs = INT_MAX;
@@ -3702,14 +5879,17 @@
                        best_pred_diff, best_filter_diff, 0);
 }
 
-void vp10_rd_pick_inter_mode_sub8x8(VP10_COMP *cpi,
-                                   TileDataEnc *tile_data,
-                                   MACROBLOCK *x,
-                                   int mi_row, int mi_col,
-                                   RD_COST *rd_cost,
-                                   BLOCK_SIZE bsize,
-                                   PICK_MODE_CONTEXT *ctx,
-                                   int64_t best_rd_so_far) {
+void vp10_rd_pick_inter_mode_sub8x8(struct VP10_COMP *cpi,
+                                    TileDataEnc *tile_data,
+                                    struct macroblock *x,
+                                    int mi_row, int mi_col,
+                                    struct RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+                                    int *returnrate_nocoef,
+#endif  // CONFIG_SUPERTX
+                                    BLOCK_SIZE bsize,
+                                    PICK_MODE_CONTEXT *ctx,
+                                    int64_t best_rd_so_far) {
   VP10_COMMON *const cm = &cpi->common;
   RD_OPT *const rd_opt = &cpi->rd;
   SPEED_FEATURES *const sf = &cpi->sf;
@@ -3720,9 +5900,18 @@
   unsigned char segment_id = mbmi->segment_id;
   int comp_pred, i;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
+  struct buf_2d yv12_mb[MAX_REF_FRAMES][MAX_MB_PLANE];
+  static const int flag_list[REFS_PER_FRAME + 1] = {
+    0,
+    VP9_LAST_FLAG,
+#if CONFIG_EXT_REFS
+    VP9_LAST2_FLAG,
+    VP9_LAST3_FLAG,
+    VP9_LAST4_FLAG,
+#endif  // CONFIG_EXT_REFS
+    VP9_GOLD_FLAG,
+    VP9_ALT_FLAG
+  };
   int64_t best_rd = best_rd_so_far;
   int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
   int64_t best_pred_diff[REFERENCE_MODES];
@@ -3749,9 +5938,19 @@
   int internal_active_edge =
     vp10_active_edge_sb(cpi, mi_row, mi_col) && vp10_internal_image_edge(cpi);
 
+#if CONFIG_SUPERTX
+  best_rd_so_far = INT64_MAX;
+  best_rd = best_rd_so_far;
+  best_yrd = best_rd_so_far;
+#endif  // CONFIG_SUPERTX
   memset(x->zcoeff_blk[TX_4X4], 0, 4);
   vp10_zero(best_mbmode);
 
+#if CONFIG_EXT_INTRA
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
+#endif  // CONFIG_EXT_INTRA
+
   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
     filter_cache[i] = INT64_MAX;
 
@@ -3771,8 +5970,12 @@
   rate_uv_intra = INT_MAX;
 
   rd_cost->rate = INT_MAX;
+#if CONFIG_SUPERTX
+  *returnrate_nocoef = INT_MAX;
+#endif
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    x->mbmi_ext->mode_context[ref_frame] = 0;
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
                          frame_mv[NEARESTMV], frame_mv[NEARMV],
@@ -3785,6 +5988,9 @@
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+
   for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) {
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
@@ -3809,15 +6015,59 @@
           case INTRA_FRAME:
             break;
           case LAST_FRAME:
-            ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME);
+            ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) |
+#if CONFIG_EXT_REFS
+                                      (1 << LAST2_FRAME) |
+                                      (1 << LAST3_FRAME) |
+                                      (1 << LAST4_FRAME) |
+#endif  // CONFIG_EXT_REFS
+                                      (1 << ALTREF_FRAME);
             ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
             break;
+#if CONFIG_EXT_REFS
+          case LAST2_FRAME:
+            ref_frame_skip_mask[0] |= (1 << LAST_FRAME) |
+                                      (1 << LAST3_FRAME) |
+                                      (1 << LAST4_FRAME) |
+                                      (1 << GOLDEN_FRAME) |
+                                      (1 << ALTREF_FRAME);
+            ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+            break;
+          case LAST3_FRAME:
+            ref_frame_skip_mask[0] |= (1 << LAST_FRAME) |
+                                      (1 << LAST2_FRAME) |
+                                      (1 << LAST4_FRAME) |
+                                      (1 << GOLDEN_FRAME) |
+                                      (1 << ALTREF_FRAME);
+            ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+            break;
+          case LAST4_FRAME:
+            ref_frame_skip_mask[0] |= (1 << LAST_FRAME) |
+                                      (1 << LAST2_FRAME) |
+                                      (1 << LAST3_FRAME) |
+                                      (1 << GOLDEN_FRAME) |
+                                      (1 << ALTREF_FRAME);
+            ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+            break;
+#endif  // CONFIG_EXT_REFS
           case GOLDEN_FRAME:
-            ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << ALTREF_FRAME);
+            ref_frame_skip_mask[0] |= (1 << LAST_FRAME) |
+#if CONFIG_EXT_REFS
+                                      (1 << LAST2_FRAME) |
+                                      (1 << LAST3_FRAME) |
+                                      (1 << LAST4_FRAME) |
+#endif  // CONFIG_EXT_REFS
+                                      (1 << ALTREF_FRAME);
             ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
             break;
           case ALTREF_FRAME:
-            ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << LAST_FRAME);
+            ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) |
+#if CONFIG_EXT_REFS
+                                      (1 << LAST2_FRAME) |
+                                      (1 << LAST3_FRAME) |
+                                      (1 << LAST4_FRAME) |
+#endif  // CONFIG_EXT_REFS
+                                      (1 << LAST_FRAME);
             break;
           case NONE:
           case MAX_REF_FRAMES:
@@ -3904,6 +6154,10 @@
         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
     }
 
+#if CONFIG_VAR_TX
+    mbmi->inter_tx_size[0] = mbmi->tx_size;
+#endif
+
     if (ref_frame == INTRA_FRAME) {
       int rate;
       if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
@@ -3945,8 +6199,16 @@
       this_rd_thresh = (ref_frame == LAST_FRAME) ?
           rd_opt->threshes[segment_id][bsize][THR_LAST] :
           rd_opt->threshes[segment_id][bsize][THR_ALTR];
+#if CONFIG_EXT_REFS
+      this_rd_thresh = (ref_frame == LAST2_FRAME) ?
+          rd_opt->threshes[segment_id][bsize][THR_LAST2] : this_rd_thresh;
+      this_rd_thresh = (ref_frame == LAST3_FRAME) ?
+          rd_opt->threshes[segment_id][bsize][THR_LAST3] : this_rd_thresh;
+      this_rd_thresh = (ref_frame == LAST4_FRAME) ?
+          rd_opt->threshes[segment_id][bsize][THR_LAST4] : this_rd_thresh;
+#endif  // CONFIG_EXT_REFS
       this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
-      rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh;
+          rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh;
       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
         filter_cache[i] = INT64_MAX;
 
@@ -3976,7 +6238,11 @@
                                               (int) this_rd_thresh, seg_mvs,
                                               bsi, switchable_filter_index,
                                               mi_row, mi_col);
-
+#if CONFIG_EXT_INTERP
+            if (!vp10_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE &&
+                mbmi->interp_filter != EIGHTTAP)  // invalid configuration
+              continue;
+#endif  // CONFIG_EXT_INTERP
             if (tmp_rd == INT64_MAX)
               continue;
             rs = vp10_get_switchable_rate(cpi, xd);
@@ -4030,15 +6296,30 @@
 
       mbmi->interp_filter = (cm->interp_filter == SWITCHABLE ?
                              tmp_best_filter : cm->interp_filter);
+
+
       if (!pred_exists) {
         // Handles the special case when a filter that is not in the
-        // switchable list (bilinear, 6-tap) is indicated at the frame level
+        // switchable list (bilinear) is indicated at the frame level
         tmp_rd = rd_pick_best_sub8x8_mode(cpi, x,
                                           &x->mbmi_ext->ref_mvs[ref_frame][0],
                                           second_ref, best_yrd, &rate, &rate_y,
                                           &distortion, &skippable, &total_sse,
                                           (int) this_rd_thresh, seg_mvs, bsi, 0,
                                           mi_row, mi_col);
+#if CONFIG_EXT_INTERP
+        if (!vp10_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE &&
+            mbmi->interp_filter != EIGHTTAP) {
+          mbmi->interp_filter = EIGHTTAP;
+          tmp_rd = rd_pick_best_sub8x8_mode(
+              cpi, x,
+              &x->mbmi_ext->ref_mvs[ref_frame][0],
+              second_ref, best_yrd, &rate, &rate_y,
+              &distortion, &skippable, &total_sse,
+              (int) this_rd_thresh, seg_mvs, bsi, 0,
+              mi_row, mi_col);
+        }
+#endif  // CONFIG_EXT_INTERP
         if (tmp_rd == INT64_MAX)
           continue;
       } else {
@@ -4074,10 +6355,15 @@
         vp10_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
                                         BLOCK_8X8);
         memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+#if CONFIG_VAR_TX
+        if (!inter_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
+                              &uv_sse, BLOCK_8X8, tmp_best_rdu))
+          continue;
+#else
         if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
                               &uv_sse, BLOCK_8X8, tmp_best_rdu))
           continue;
-
+#endif
         rate2 += rate_uv;
         distortion2 += distortion_uv;
         skippable = skippable && uv_skippable;
@@ -4145,6 +6431,15 @@
         }
 
         rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+        *returnrate_nocoef = rate2 - rate_y - rate_uv;
+        if (!disable_skip)
+          *returnrate_nocoef -= vp10_cost_bit(vp10_get_skip_prob(cm, xd),
+                                              this_skip2);
+        *returnrate_nocoef -= vp10_cost_bit(vp10_get_intra_inter_prob(cm, xd),
+                                            mbmi->ref_frame[0] != INTRA_FRAME);
+        assert(*returnrate_nocoef > 0);
+#endif  // CONFIG_SUPERTX
         rd_cost->dist = distortion2;
         rd_cost->rdcost = this_rd;
         best_rd = this_rd;
@@ -4154,8 +6449,14 @@
         best_skip2 = this_skip2;
         if (!x->select_tx_size)
           swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
+
+#if CONFIG_VAR_TX
+        for (i = 0; i < MAX_MB_PLANE; ++i)
+          memset(ctx->blk_skip[i], 0, sizeof(uint8_t) * ctx->num_4x4_blk);
+#else
         memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
                sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+#endif
 
         for (i = 0; i < 4; i++)
           best_bmodes[i] = xd->mi[0]->bmi[i];
@@ -4241,6 +6542,9 @@
   if (best_rd >= best_rd_so_far) {
     rd_cost->rate = INT_MAX;
     rd_cost->rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+    *returnrate_nocoef = INT_MAX;
+#endif  // CONFIG_SUPERTX
     return;
   }
 
@@ -4261,6 +6565,9 @@
     rd_cost->rate = INT_MAX;
     rd_cost->dist = INT64_MAX;
     rd_cost->rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+    *returnrate_nocoef = INT_MAX;
+#endif  // CONFIG_SUPERTX
     return;
   }
 

diff --git a/vp10/encoder/rdopt.h b/vp10/encoder/rdopt.h
index b1a8036..62b0aea 100644
--- a/vp10/encoder/rdopt.h
+++ b/vp10/encoder/rdopt.h

@@ -43,6 +43,9 @@
                                struct macroblock *x,
                                int mi_row, int mi_col,
                                struct RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+                               int *returnrate_nocoef,
+#endif  // CONFIG_SUPERTX
                                BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
                                int64_t best_rd_so_far);
 
@@ -60,12 +63,27 @@
 int vp10_active_edge_sb(struct VP10_COMP *cpi, int mi_row, int mi_col);
 
 void vp10_rd_pick_inter_mode_sub8x8(struct VP10_COMP *cpi,
-                                   struct TileDataEnc *tile_data,
-                                   struct macroblock *x,
-                                   int mi_row, int mi_col,
-                                   struct RD_COST *rd_cost,
-                                   BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                                   int64_t best_rd_so_far);
+                                    struct TileDataEnc *tile_data,
+                                    struct macroblock *x,
+                                    int mi_row, int mi_col,
+                                    struct RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+                                    int *returnrate_nocoef,
+#endif  // CONFIG_SUPERTX
+                                    BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                                    int64_t best_rd_so_far);
+
+#if CONFIG_SUPERTX
+void vp10_txfm_rd_in_plane_supertx(MACROBLOCK *x,
+#if CONFIG_VAR_TX
+                                   const VP10_COMP *cpi,
+#endif  // CONFIG_VAR_TX
+                                   int *rate, int64_t *distortion,
+                                   int *skippable, int64_t *sse,
+                                   int64_t ref_best_rd, int plane,
+                                   BLOCK_SIZE bsize, TX_SIZE tx_size,
+                                   int use_fast_coef_casting);
+#endif  // CONFIG_SUPERTX
 
 #ifdef __cplusplus
 }  // extern "C"

diff --git a/vp10/encoder/segmentation.c b/vp10/encoder/segmentation.c
index 6a20ee4..e936775 100644
--- a/vp10/encoder/segmentation.c
+++ b/vp10/encoder/segmentation.c

@@ -58,9 +58,7 @@
     segcounts[4] + segcounts[5], segcounts[6] + segcounts[7]
   };
   const unsigned ccc[2] = { cc[0] + cc[1], cc[2] + cc[3] };
-#if CONFIG_MISC_FIXES
   int i;
-#endif
 
   segment_tree_probs[0] = get_binary_prob(ccc[0], ccc[1]);
   segment_tree_probs[1] = get_binary_prob(cc[0], cc[1]);
@@ -70,16 +68,12 @@
   segment_tree_probs[5] = get_binary_prob(segcounts[4], segcounts[5]);
   segment_tree_probs[6] = get_binary_prob(segcounts[6], segcounts[7]);
 
-#if CONFIG_MISC_FIXES
   for (i = 0; i < 7; i++) {
     const unsigned *ct = i == 0 ? ccc : i < 3 ? cc + (i & 2)
         : segcounts + (i - 3) * 2;
     vp10_prob_diff_update_savings_search(ct,
         cur_tree_probs[i], &segment_tree_probs[i], DIFF_UPDATE_PROB);
   }
-#else
-  (void) cur_tree_probs;
-#endif
 }
 
 // Based on set of segment counts and probabilities calculate a cost estimate
@@ -214,39 +208,22 @@
 
 void vp10_choose_segmap_coding_method(VP10_COMMON *cm, MACROBLOCKD *xd) {
   struct segmentation *seg = &cm->seg;
-#if CONFIG_MISC_FIXES
   struct segmentation_probs *segp = &cm->fc->seg;
-#else
-  struct segmentation_probs *segp = &cm->segp;
-#endif
 
   int no_pred_cost;
   int t_pred_cost = INT_MAX;
 
   int i, tile_col, mi_row, mi_col;
 
-#if CONFIG_MISC_FIXES
   unsigned (*temporal_predictor_count)[2] = cm->counts.seg.pred;
   unsigned *no_pred_segcounts = cm->counts.seg.tree_total;
   unsigned *t_unpred_seg_counts = cm->counts.seg.tree_mispred;
-#else
-  unsigned temporal_predictor_count[PREDICTION_PROBS][2] = { { 0 } };
-  unsigned no_pred_segcounts[MAX_SEGMENTS] = { 0 };
-  unsigned t_unpred_seg_counts[MAX_SEGMENTS] = { 0 };
-#endif
 
   vpx_prob no_pred_tree[SEG_TREE_PROBS];
   vpx_prob t_pred_tree[SEG_TREE_PROBS];
   vpx_prob t_nopred_prob[PREDICTION_PROBS];
 
-#if CONFIG_MISC_FIXES
   (void) xd;
-#else
-  // Set default state for the segment tree probabilities and the
-  // temporal coding probabilities
-  memset(segp->tree_probs, 255, sizeof(segp->tree_probs));
-  memset(segp->pred_probs, 255, sizeof(segp->pred_probs));
-#endif
 
   // First of all generate stats regarding how well the last segment map
   // predicts this one
@@ -284,13 +261,9 @@
       const int count0 = temporal_predictor_count[i][0];
       const int count1 = temporal_predictor_count[i][1];
 
-#if CONFIG_MISC_FIXES
       vp10_prob_diff_update_savings_search(temporal_predictor_count[i],
                                            segp->pred_probs[i],
                                            &t_nopred_prob[i], DIFF_UPDATE_PROB);
-#else
-      t_nopred_prob[i] = get_binary_prob(count0, count1);
-#endif
 
       // Add in the predictor signaling cost
       t_pred_cost += count0 * vp10_cost_zero(t_nopred_prob[i]) +
@@ -301,30 +274,17 @@
   // Now choose which coding method to use.
   if (t_pred_cost < no_pred_cost) {
     seg->temporal_update = 1;
-#if !CONFIG_MISC_FIXES
-    memcpy(segp->tree_probs, t_pred_tree, sizeof(t_pred_tree));
-    memcpy(segp->pred_probs, t_nopred_prob, sizeof(t_nopred_prob));
-#endif
   } else {
     seg->temporal_update = 0;
-#if !CONFIG_MISC_FIXES
-    memcpy(segp->tree_probs, no_pred_tree, sizeof(no_pred_tree));
-#endif
   }
 }
 
 void vp10_reset_segment_features(VP10_COMMON *cm) {
   struct segmentation *seg = &cm->seg;
-#if !CONFIG_MISC_FIXES
-  struct segmentation_probs *segp = &cm->segp;
-#endif
 
   // Set up default state for MB feature flags
   seg->enabled = 0;
   seg->update_map = 0;
   seg->update_data = 0;
-#if !CONFIG_MISC_FIXES
-  memset(segp->tree_probs, 255, sizeof(segp->tree_probs));
-#endif
   vp10_clearall_segfeatures(seg);
 }

diff --git a/vp10/encoder/subexp.c b/vp10/encoder/subexp.c
index d407477..8d279b1 100644
--- a/vp10/encoder/subexp.c
+++ b/vp10/encoder/subexp.c

@@ -25,8 +25,7 @@
   10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
   10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
   10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-  10, 11 - CONFIG_MISC_FIXES,
-          11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
   11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
   11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
   11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
@@ -86,7 +85,7 @@
 
 static void encode_uniform(vpx_writer *w, int v) {
   const int l = 8;
-  const int m = (1 << l) - 191 + CONFIG_MISC_FIXES;
+  const int m = (1 << l) - 190;
   if (v < m) {
     vpx_write_literal(w, v, l - 1);
   } else {

diff --git a/vp10/encoder/subexp.h b/vp10/encoder/subexp.h
index 091334f..64eb275 100644
--- a/vp10/encoder/subexp.h
+++ b/vp10/encoder/subexp.h

@@ -36,7 +36,6 @@
                                               vpx_prob *bestp,
                                               vpx_prob upd,
                                               int stepsize);
-
 int vp10_cond_prob_diff_update_savings(vpx_prob *oldp,
                                        const unsigned int ct[2]);
 #ifdef __cplusplus

diff --git a/vp10/encoder/temporal_filter.c b/vp10/encoder/temporal_filter.c
index 5278d3b..4dc2122 100644
--- a/vp10/encoder/temporal_filter.c
+++ b/vp10/encoder/temporal_filter.c

@@ -135,15 +135,38 @@
 
   for (i = 0, k = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++, k++) {
-      int src_byte = frame1[byte];
-      int pixel_value = *frame2++;
+      int pixel_value = *frame2;
 
-      modifier   = src_byte - pixel_value;
-      // This is an integer approximation of:
-      // float coeff = (3.0 * modifer * modifier) / pow(2, strength);
-      // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
-      modifier  *= modifier;
-      modifier  *= 3;
+      // non-local mean approach
+      int diff_sse[9] = { 0 };
+      int idx, idy, index = 0;
+
+      for (idy = -1; idy <= 1; ++idy) {
+        for (idx = -1; idx <= 1; ++idx) {
+          int row = i + idy;
+          int col = j + idx;
+
+          if (row >= 0 && row < (int)block_height &&
+              col >= 0 && col < (int)block_width) {
+            int diff = frame1[byte + idy * (int)stride + idx] -
+                frame2[idy * (int)block_width + idx];
+            diff_sse[index] = diff * diff;
+            ++index;
+          }
+        }
+      }
+
+      assert(index > 0);
+
+      modifier = 0;
+      for (idx = 0; idx < 9; ++idx)
+        modifier += diff_sse[idx];
+
+      modifier *= 3;
+      modifier /= index;
+
+      ++frame2;
+
       modifier  += rounding;
       modifier >>= strength;
 
@@ -182,15 +205,38 @@
 
   for (i = 0, k = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++, k++) {
-      int src_byte = frame1[byte];
-      int pixel_value = *frame2++;
+      int pixel_value = *frame2;
 
-      modifier   = src_byte - pixel_value;
-      // This is an integer approximation of:
-      // float coeff = (3.0 * modifer * modifier) / pow(2, strength);
-      // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
-      modifier *= modifier;
+      // non-local mean approach
+      int diff_sse[9] = { 0 };
+      int idx, idy, index = 0;
+
+      for (idy = -1; idy <= 1; ++idy) {
+        for (idx = -1; idx <= 1; ++idx) {
+          int row = i + idy;
+          int col = j + idx;
+
+          if (row >= 0 && row < (int)block_height &&
+              col >= 0 && col < (int)block_width) {
+            int diff = frame1[byte + idy * (int)stride + idx] -
+                frame2[idy * (int)block_width + idx];
+            diff_sse[index] = diff * diff;
+            ++index;
+          }
+        }
+      }
+
+      assert(index > 0);
+
+      modifier = 0;
+      for (idx = 0; idx < 9; ++idx)
+        modifier += diff_sse[idx];
+
       modifier *= 3;
+      modifier /= index;
+
+      ++frame2;
+
       modifier += rounding;
       modifier >>= strength;
 
@@ -382,50 +428,50 @@
             int adj_strength = strength + 2 * (mbd->bd - 8);
             // Apply the filter (YUV)
             vp10_highbd_temporal_filter_apply(f->y_buffer + mb_y_offset,
-                                             f->y_stride,
-                                             predictor, 16, 16, adj_strength,
-                                             filter_weight,
-                                             accumulator, count);
+                                              f->y_stride,
+                                              predictor, 16, 16, adj_strength,
+                                              filter_weight,
+                                              accumulator, count);
             vp10_highbd_temporal_filter_apply(f->u_buffer + mb_uv_offset,
-                                             f->uv_stride, predictor + 256,
-                                             mb_uv_width, mb_uv_height,
-                                             adj_strength,
-                                             filter_weight, accumulator + 256,
-                                             count + 256);
+                                              f->uv_stride, predictor + 256,
+                                              mb_uv_width, mb_uv_height,
+                                              adj_strength,
+                                              filter_weight, accumulator + 256,
+                                              count + 256);
             vp10_highbd_temporal_filter_apply(f->v_buffer + mb_uv_offset,
-                                             f->uv_stride, predictor + 512,
-                                             mb_uv_width, mb_uv_height,
-                                             adj_strength, filter_weight,
-                                             accumulator + 512, count + 512);
+                                              f->uv_stride, predictor + 512,
+                                              mb_uv_width, mb_uv_height,
+                                              adj_strength, filter_weight,
+                                              accumulator + 512, count + 512);
           } else {
             // Apply the filter (YUV)
-            vp10_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
-                                      predictor, 16, 16,
-                                      strength, filter_weight,
-                                      accumulator, count);
-            vp10_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
-                                      predictor + 256,
-                                      mb_uv_width, mb_uv_height, strength,
-                                      filter_weight, accumulator + 256,
-                                      count + 256);
-            vp10_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
-                                      predictor + 512,
-                                      mb_uv_width, mb_uv_height, strength,
-                                      filter_weight, accumulator + 512,
-                                      count + 512);
+            vp10_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
+                                         predictor, 16, 16,
+                                         strength, filter_weight,
+                                         accumulator, count);
+            vp10_temporal_filter_apply_c(f->u_buffer + mb_uv_offset,
+                                         f->uv_stride, predictor + 256,
+                                         mb_uv_width, mb_uv_height, strength,
+                                         filter_weight, accumulator + 256,
+                                         count + 256);
+            vp10_temporal_filter_apply_c(f->v_buffer + mb_uv_offset,
+                                         f->uv_stride, predictor + 512,
+                                         mb_uv_width, mb_uv_height, strength,
+                                         filter_weight, accumulator + 512,
+                                         count + 512);
           }
 #else
           // Apply the filter (YUV)
-          vp10_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
+          vp10_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
                                     predictor, 16, 16,
                                     strength, filter_weight,
                                     accumulator, count);
-          vp10_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
+          vp10_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
                                     predictor + 256,
                                     mb_uv_width, mb_uv_height, strength,
                                     filter_weight, accumulator + 256,
                                     count + 256);
-          vp10_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
+          vp10_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
                                     predictor + 512,
                                     mb_uv_width, mb_uv_height, strength,
                                     filter_weight, accumulator + 512,

diff --git a/vp10/encoder/tokenize.c b/vp10/encoder/tokenize.c
index 1b94190..64211a9 100644
--- a/vp10/encoder/tokenize.c
+++ b/vp10/encoder/tokenize.c

@@ -487,6 +487,39 @@
   return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }
 
+void vp10_tokenize_palette_sb(struct ThreadData *const td,
+                              BLOCK_SIZE bsize, int plane,
+                              TOKENEXTRA **t) {  // one token per pixel into *t
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  uint8_t *color_map = xd->plane[0].color_index_map;  // always plane 0
+  PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
+  int n = pmi->palette_size[plane != 0];  // [0] for plane 0, else [1]
+  int i, j, k;
+  int color_new_idx = -1, color_ctx, color_order[PALETTE_MAX_SIZE];
+  int rows = 4 * num_4x4_blocks_high_lookup[bsize];  // 4 * (4x4 rows)
+  int cols = 4 * num_4x4_blocks_wide_lookup[bsize];  // 4 * (4x4 columns)
+
+  for (i = 0; i < rows; ++i) {
+    for (j = (i == 0 ? 1 : 0); j < cols; ++j) {  // pixel (0, 0) gets no token
+      color_ctx = vp10_get_palette_color_context(color_map, cols, i, j, n,
+                                                 color_order);
+      color_new_idx = -1;  // reset so the assert below checks this pixel
+      for (k = 0; k < n; ++k)  // find this pixel's index within color_order
+        if (color_map[i * cols + j] == color_order[k]) {
+          color_new_idx = k;
+          break;
+        }
+      assert(color_new_idx >= 0 && color_new_idx < n);
+      (*t)->token = color_new_idx;
+      (*t)->context_tree = vp10_default_palette_y_color_prob[n - 2][color_ctx];
+      (*t)->skip_eob_node = 0;
+      ++(*t);  // NOTE(review): the y prob table above is used for every plane
+    }
+  }
+}
+
 static void tokenize_b(int plane, int block, int blk_row, int blk_col,
                        BLOCK_SIZE plane_bsize,
                        TX_SIZE tx_size, void *arg) {
@@ -508,8 +541,8 @@
   const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   const int segment_id = mbmi->segment_id;
   const int16_t *scan, *nb;
-  const TX_TYPE tx_type = get_tx_type(type, xd, block);
-  const scan_order *const so = get_scan(tx_size, tx_type);
+  const TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size);
+  const scan_order *const so = get_scan(tx_size, tx_type, is_inter_block(mbmi));
   const int ref = is_inter_block(mbmi);
   unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
       td->rd_counts.coef_counts[tx_size][type][ref];
@@ -612,6 +645,118 @@
   return result;
 }
 
+#if CONFIG_VAR_TX
+void tokenize_tx(ThreadData *td, TOKENEXTRA **t,  // NOTE(review): not static
+                 int dry_run, TX_SIZE tx_size, BLOCK_SIZE plane_bsize,
+                 int blk_row, int blk_col, int block, int plane,
+                 void *arg) {  // arg is forwarded unchanged to the callbacks
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+  int blk_idx = (blk_row >> (1 - pd->subsampling_y)) * 8 +  // row stride is 8
+                (blk_col >> (1 - pd->subsampling_x));  // inter_tx_size[] index
+  TX_SIZE plane_tx_size = plane ?  // planes != 0 map the stored size for uv
+      get_uv_tx_size_impl(mbmi->inter_tx_size[blk_idx], bsize, 0, 0) :
+      mbmi->inter_tx_size[blk_idx];
+
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+  if (xd->mb_to_bottom_edge < 0)  // negative edge distance lowers the limit
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)  // same for the column limit
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;  // position is outside the limits computed above
+
+  if (tx_size == plane_tx_size) {  // reached the size recorded for this spot
+    const struct macroblockd_plane *const pd = &xd->plane[plane];  // shadows pd
+    BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->sb_type, pd);  // shadow
+    if (!dry_run)  // real pass goes through tokenize_b
+      tokenize_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
+    else  // dry run only calls set_entropy_context_b
+      set_entropy_context_b(plane, block, blk_row, blk_col,
+                            plane_bsize, tx_size, arg);
+  } else {  // otherwise recurse with tx_size - 1 on four sub-positions
+    int bsl = b_width_log2_lookup[bsize];
+    int i;
+
+    assert(bsl > 0);
+    --bsl;  // shift that turns the quadrant bits of i into row/col offsets
+
+    for (i = 0; i < 4; ++i) {  // bit 1 of i selects the row, bit 0 the column
+      const int offsetr = blk_row + ((i >> 1) << bsl);
+      const int offsetc = blk_col + ((i & 0x01) << bsl);
+      int step = 1 << (2 * (tx_size - 1));  // block index stride per quadrant
+
+      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide)
+        continue;  // skip quadrants outside the limits
+
+      tokenize_tx(td, t, dry_run, tx_size - 1, plane_bsize,
+                  offsetr, offsetc, block + i * step, plane, arg);
+    }
+  }
+}
+
+void vp10_tokenize_sb_inter(VP10_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+                            int dry_run, int mi_row, int mi_col,
+                            BLOCK_SIZE bsize) {  // per-plane tokenize_tx driver
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  TOKENEXTRA *t_backup = *t;  // token pointer on entry, restored on dry runs
+  const int ctx = vp10_get_skip_context(xd);
+  const int skip_inc = !segfeature_active(&cm->seg, mbmi->segment_id,
+                                          SEG_LVL_SKIP);  // 0 if feature is on
+  struct tokenize_b_args arg = {cpi, td, t};
+  int plane;
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;  // position is beyond cm->mi_rows / cm->mi_cols
+
+  if (mbmi->skip) {  // skipped block: no tokenize_tx calls are made
+    if (!dry_run)
+      td->counts->skip[ctx][1] += skip_inc;
+    reset_skip_context(xd, bsize);
+    if (dry_run)
+      *t = t_backup;
+    return;
+  }
+
+  if (!dry_run)
+    td->counts->skip[ctx][0] += skip_inc;
+  else
+    *t = t_backup;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+    const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];  // 4x4 units
+    const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];  // 4x4 units
+    const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+    const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+    int bh = num_4x4_blocks_wide_lookup[txb_size];  // row and column stride
+    int idx, idy;
+    int block = 0;
+    int step = 1 << (max_tx_size * 2);  // block advance per max-size unit
+    for (idy = 0; idy < mi_height; idy += bh) {
+      for (idx = 0; idx < mi_width; idx += bh) {
+        tokenize_tx(td, t, dry_run, max_tx_size, plane_bsize, idy, idx,
+                    block, plane, &arg);
+        block += step;
+      }
+    }
+
+    if (!dry_run) {  // one EOSB_TOKEN is appended after each plane
+      (*t)->token = EOSB_TOKEN;
+      (*t)++;
+    }
+  }
+}
+#endif
+
 void vp10_tokenize_sb(VP10_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
                      int dry_run, BLOCK_SIZE bsize) {
   VP10_COMMON *const cm = &cpi->common;
@@ -644,3 +789,40 @@
     vp10_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
   }
 }
+
+#if CONFIG_SUPERTX
+void vp10_tokenize_sb_supertx(VP10_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+                              int dry_run, BLOCK_SIZE bsize) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &td->mb.e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  TOKENEXTRA *t_backup = *t;  // token pointer on entry, restored on dry runs
+  const int ctx = vp10_get_skip_context(xd);
+  const int skip_inc = !segfeature_active(&cm->seg, mbmi->segment_id,
+                                          SEG_LVL_SKIP);  // 0 if feature is on
+  struct tokenize_b_args arg = {cpi, td, t};
+  if (mbmi->skip) {  // skipped block: count it, reset contexts, emit nothing
+    if (!dry_run)
+      td->counts->skip[ctx][1] += skip_inc;
+    reset_skip_context(xd, bsize);
+    if (dry_run)
+      *t = t_backup;
+    return;
+  }
+
+  if (!dry_run) {  // real pass: tokenize_b per plane, then an EOSB_TOKEN
+    int plane;
+    td->counts->skip[ctx][0] += skip_inc;
+
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      vp10_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b,
+                                              &arg);
+      (*t)->token = EOSB_TOKEN;
+      (*t)++;
+    }
+  } else {  // dry run: set_entropy_context_b only, then restore *t
+    vp10_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
+    *t = t_backup;
+  }
+}
+#endif  // CONFIG_SUPERTX

diff --git a/vp10/encoder/tokenize.h b/vp10/encoder/tokenize.h
index 5bad415..c68e6f2 100644
--- a/vp10/encoder/tokenize.h
+++ b/vp10/encoder/tokenize.h

@@ -51,8 +51,21 @@
 struct VP10_COMP;
 struct ThreadData;
 
+#if CONFIG_VAR_TX
+void vp10_tokenize_sb_inter(struct VP10_COMP *cpi, struct ThreadData *td,
+                            TOKENEXTRA **t, int dry_run, int mi_row, int mi_col,
+                            BLOCK_SIZE bsize);
+#endif
+
+void vp10_tokenize_palette_sb(struct ThreadData *const td,
+                              BLOCK_SIZE bsize, int plane,
+                              TOKENEXTRA **t);
 void vp10_tokenize_sb(struct VP10_COMP *cpi, struct ThreadData *td,
                      TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
+#if CONFIG_SUPERTX
+void vp10_tokenize_sb_supertx(struct VP10_COMP *cpi, struct ThreadData *td,
+                              TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
+#endif
 
 extern const int16_t *vp10_dct_value_cost_ptr;
 /* TODO: The Token field should be broken out into a separate char array to

diff --git a/vp10/encoder/x86/dct_sse2.c b/vp10/encoder/x86/dct_sse2.c
index e111157..976fe45 100644
--- a/vp10/encoder/x86/dct_sse2.c
+++ b/vp10/encoder/x86/dct_sse2.c

@@ -18,16 +18,37 @@
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 #include "vpx_ports/mem.h"
 
+// Reverse the order of the eight 16-bit words in a __m128i.
+static INLINE __m128i mm_reverse_epi16(const __m128i x) {
+  const __m128i a = _mm_shufflelo_epi16(x, 0x1b);  // reverse low four words
+  const __m128i b = _mm_shufflehi_epi16(a, 0x1b);  // reverse high four words
+  return _mm_shuffle_epi32(b, 0x4e);  // swap the two 64-bit halves
+}
+
 static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
-                                   int stride) {
+                                   int stride, int flipud, int fliplr) {
   const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
   const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
   __m128i mask;
 
-  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
-  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
-  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
-  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+  if (!flipud) {
+    in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+    in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+    in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+    in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+  } else {
+    in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+    in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+    in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+    in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+  }
+
+  if (fliplr) {
+    in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+    in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+    in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+    in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+  }
 
   in[0] = _mm_slli_epi16(in[0], 4);
   in[1] = _mm_slli_epi16(in[1], 4);
@@ -160,23 +181,55 @@
       vpx_fdct4x4_sse2(input, output, stride);
       break;
     case ADST_DCT:
-      load_buffer_4x4(input, in, stride);
+      load_buffer_4x4(input, in, stride, 0, 0);
       fadst4_sse2(in);
       fdct4_sse2(in);
       write_buffer_4x4(output, in);
       break;
     case DCT_ADST:
-      load_buffer_4x4(input, in, stride);
+      load_buffer_4x4(input, in, stride, 0, 0);
       fdct4_sse2(in);
       fadst4_sse2(in);
       write_buffer_4x4(output, in);
       break;
     case ADST_ADST:
-      load_buffer_4x4(input, in, stride);
+      load_buffer_4x4(input, in, stride, 0, 0);
       fadst4_sse2(in);
       fadst4_sse2(in);
       write_buffer_4x4(output, in);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      load_buffer_4x4(input, in, stride, 1, 0);
+      fadst4_sse2(in);
+      fdct4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_4x4(input, in, stride, 0, 1);
+      fdct4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_4x4(input, in, stride, 1, 1);
+      fadst4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_4x4(input, in, stride, 0, 1);
+      fadst4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_4x4(input, in, stride, 1, 0);
+      fadst4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+#endif  // CONFIG_EXT_TX
    default:
      assert(0);
      break;
@@ -627,15 +680,37 @@
 
 // load 8x8 array
 static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
-                                   int stride) {
-  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * stride));
-  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
-  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
-  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
-  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
-  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
-  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
-  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+                                   int stride, int flipud, int fliplr) {
+  if (!flipud) {
+    in[0]  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+    in[1]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+    in[2]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+    in[3]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+    in[4]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+    in[5]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+    in[6]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+    in[7]  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  } else {
+    in[0]  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+    in[1]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+    in[2]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+    in[3]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+    in[4]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+    in[5]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+    in[6]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+    in[7]  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  }
+
+  if (fliplr) {
+    in[0] = mm_reverse_epi16(in[0]);
+    in[1] = mm_reverse_epi16(in[1]);
+    in[2] = mm_reverse_epi16(in[2]);
+    in[3] = mm_reverse_epi16(in[3]);
+    in[4] = mm_reverse_epi16(in[4]);
+    in[5] = mm_reverse_epi16(in[5]);
+    in[6] = mm_reverse_epi16(in[6]);
+    in[7] = mm_reverse_epi16(in[7]);
+  }
 
   in[0] = _mm_slli_epi16(in[0], 2);
   in[1] = _mm_slli_epi16(in[1], 2);
@@ -1144,26 +1219,63 @@
       vpx_fdct8x8_sse2(input, output, stride);
       break;
     case ADST_DCT:
-      load_buffer_8x8(input, in, stride);
+      load_buffer_8x8(input, in, stride, 0, 0);
       fadst8_sse2(in);
       fdct8_sse2(in);
       right_shift_8x8(in, 1);
       write_buffer_8x8(output, in, 8);
       break;
     case DCT_ADST:
-      load_buffer_8x8(input, in, stride);
+      load_buffer_8x8(input, in, stride, 0, 0);
       fdct8_sse2(in);
       fadst8_sse2(in);
       right_shift_8x8(in, 1);
       write_buffer_8x8(output, in, 8);
       break;
     case ADST_ADST:
-      load_buffer_8x8(input, in, stride);
+      load_buffer_8x8(input, in, stride, 0, 0);
       fadst8_sse2(in);
       fadst8_sse2(in);
       right_shift_8x8(in, 1);
       write_buffer_8x8(output, in, 8);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      load_buffer_8x8(input, in, stride, 1, 0);
+      fadst8_sse2(in);
+      fdct8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_8x8(input, in, stride, 0, 1);
+      fdct8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_8x8(input, in, stride, 1, 1);
+      fadst8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_8x8(input, in, stride, 0, 1);
+      fadst8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_8x8(input, in, stride, 1, 0);
+      fadst8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -1171,15 +1283,37 @@
 }
 
 static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
-                                     __m128i *in1, int stride) {
-  // load first 8 columns
-  load_buffer_8x8(input, in0, stride);
-  load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
+                                     __m128i *in1, int stride,
+                                     int flipud, int fliplr) {
+  // Load 4 8x8 blocks
+  const int16_t *topL = input;
+  const int16_t *topR = input + 8;
+  const int16_t *botL = input + 8 * stride;
+  const int16_t *botR = input + 8 * stride + 8;
 
-  input += 8;
+  const int16_t *tmp;
+
+  if (flipud) {
+    // Swap top and bottom blocks of the left column
+    tmp = topL; topL = botL; botL = tmp;
+    // Swap top and bottom blocks of the right column
+    tmp = topR; topR = botR; botR = tmp;
+  }
+
+  if (fliplr) {
+    // Swap left and right blocks of the top row
+    tmp = topL; topL = topR; topR = tmp;
+    // Swap left and right blocks of the bottom row
+    tmp = botL; botL = botR; botR = tmp;
+  }
+
+  // load first 8 columns
+  load_buffer_8x8(topL, in0,     stride, flipud, fliplr);
+  load_buffer_8x8(botL, in0 + 8, stride, flipud, fliplr);
+
   // load second 8 columns
-  load_buffer_8x8(input, in1, stride);
-  load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
+  load_buffer_8x8(topR, in1,     stride, flipud, fliplr);
+  load_buffer_8x8(botR, in1 + 8, stride, flipud, fliplr);
 }
 
 static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
@@ -2031,26 +2165,63 @@
       vpx_fdct16x16_sse2(input, output, stride);
       break;
     case ADST_DCT:
-      load_buffer_16x16(input, in0, in1, stride);
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
       fadst16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
       fdct16_sse2(in0, in1);
       write_buffer_16x16(output, in0, in1, 16);
       break;
     case DCT_ADST:
-      load_buffer_16x16(input, in0, in1, stride);
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
       fdct16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
       fadst16_sse2(in0, in1);
       write_buffer_16x16(output, in0, in1, 16);
       break;
     case ADST_ADST:
-      load_buffer_16x16(input, in0, in1, stride);
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
       fadst16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
       fadst16_sse2(in0, in1);
       write_buffer_16x16(output, in0, in1, 16);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      load_buffer_16x16(input, in0, in1, stride, 1, 0);
+      fadst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fdct16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_16x16(input, in0, in1, stride, 0, 1);
+      fdct16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_16x16(input, in0, in1, stride, 1, 1);
+      fadst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_16x16(input, in0, in1, stride, 0, 1);
+      fadst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_16x16(input, in0, in1, stride, 1, 0);
+      fadst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;

diff --git a/vp10/vp10_common.mk b/vp10/vp10_common.mk
index 2eb3488..f8c2112 100644
--- a/vp10/vp10_common.mk
+++ b/vp10/vp10_common.mk

@@ -63,6 +63,17 @@
 VP10_COMMON_SRCS-yes += common/scan.h
 VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm.h
 VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm.c
+VP10_COMMON_SRCS-yes += common/vp10_txfm.h
+VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm1d.h
+VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm1d.c
+VP10_COMMON_SRCS-yes += common/vp10_inv_txfm1d.h
+VP10_COMMON_SRCS-yes += common/vp10_inv_txfm1d.c
+VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm2d.h
+VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm2d.c
+VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm2d_cfg.h
+VP10_COMMON_SRCS-yes += common/vp10_inv_txfm2d.h
+VP10_COMMON_SRCS-yes += common/vp10_inv_txfm2d.c
+VP10_COMMON_SRCS-yes += common/vp10_inv_txfm2d_cfg.h
 
 VP10_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/postproc.h
 VP10_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/postproc.c

diff --git a/vp10/vp10_cx_iface.c b/vp10/vp10_cx_iface.c
index 21c9c03..c90b936 100644
--- a/vp10/vp10_cx_iface.c
+++ b/vp10/vp10_cx_iface.c

@@ -91,9 +91,6 @@
   size_t                  pending_cx_data_sz;
   int                     pending_frame_count;
   size_t                  pending_frame_sizes[8];
-#if !CONFIG_MISC_FIXES
-  size_t                  pending_frame_magnitude;
-#endif
   vpx_image_t             preview_img;
   vpx_enc_frame_flags_t   next_frame_flags;
   vp8_postproc_cfg_t      preview_ppcfg;
@@ -783,39 +780,30 @@
   uint8_t marker = 0xc0;
   unsigned int mask;
   int mag, index_sz;
-#if CONFIG_MISC_FIXES
   int i;
   size_t max_frame_sz = 0;
-#endif
 
   assert(ctx->pending_frame_count);
   assert(ctx->pending_frame_count <= 8);
 
   // Add the number of frames to the marker byte
   marker |= ctx->pending_frame_count - 1;
-#if CONFIG_MISC_FIXES
   for (i = 0; i < ctx->pending_frame_count - 1; i++) {
     const size_t frame_sz = (unsigned int) ctx->pending_frame_sizes[i] - 1;
     max_frame_sz = frame_sz > max_frame_sz ? frame_sz : max_frame_sz;
   }
-#endif
 
   // Choose the magnitude
   for (mag = 0, mask = 0xff; mag < 4; mag++) {
-#if CONFIG_MISC_FIXES
     if (max_frame_sz <= mask)
       break;
-#else
-    if (ctx->pending_frame_magnitude < mask)
-      break;
-#endif
     mask <<= 8;
     mask |= 0xff;
   }
   marker |= mag << 3;
 
   // Write the index
-  index_sz = 2 + (mag + 1) * (ctx->pending_frame_count - CONFIG_MISC_FIXES);
+  index_sz = 2 + (mag + 1) * (ctx->pending_frame_count - 1);
   if (ctx->pending_cx_data_sz + index_sz < ctx->cx_data_sz) {
     uint8_t *x = ctx->pending_cx_data + ctx->pending_cx_data_sz;
     int i, j;
@@ -835,11 +823,11 @@
 #endif
 
     *x++ = marker;
-    for (i = 0; i < ctx->pending_frame_count - CONFIG_MISC_FIXES; i++) {
+    for (i = 0; i < ctx->pending_frame_count - 1; i++) {
       unsigned int this_sz;
 
       assert(ctx->pending_frame_sizes[i] > 0);
-      this_sz = (unsigned int)ctx->pending_frame_sizes[i] - CONFIG_MISC_FIXES;
+      this_sz = (unsigned int)ctx->pending_frame_sizes[i] - 1;
       for (j = 0; j <= mag; j++) {
         *x++ = this_sz & 0xff;
         this_sz >>= 8;
@@ -993,9 +981,6 @@
             ctx->pending_cx_data = cx_data;
           ctx->pending_cx_data_sz += size;
           ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
-#if !CONFIG_MISC_FIXES
-          ctx->pending_frame_magnitude |= size;
-#endif
           cx_data += size;
           cx_data_sz -= size;
 
@@ -1012,9 +997,6 @@
             ctx->pending_cx_data = NULL;
             ctx->pending_cx_data_sz = 0;
             ctx->pending_frame_count = 0;
-#if !CONFIG_MISC_FIXES
-            ctx->pending_frame_magnitude = 0;
-#endif
             ctx->output_cx_pkt_cb.output_cx_pkt(
                 &pkt, ctx->output_cx_pkt_cb.user_priv);
           }
@@ -1031,9 +1013,6 @@
 
         if (ctx->pending_cx_data) {
           ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
-#if !CONFIG_MISC_FIXES
-          ctx->pending_frame_magnitude |= size;
-#endif
           ctx->pending_cx_data_sz += size;
           // write the superframe only for the case when
           if (!ctx->output_cx_pkt_cb.output_cx_pkt)
@@ -1043,9 +1022,6 @@
           ctx->pending_cx_data = NULL;
           ctx->pending_cx_data_sz = 0;
           ctx->pending_frame_count = 0;
-#if !CONFIG_MISC_FIXES
-          ctx->pending_frame_magnitude = 0;
-#endif
         } else {
           pkt.data.frame.buf = cx_data;
           pkt.data.frame.sz  = size;

diff --git a/vp10/vp10cx.mk b/vp10/vp10cx.mk
index dc3b271..7ae2fb2 100644
--- a/vp10/vp10cx.mk
+++ b/vp10/vp10cx.mk

@@ -23,6 +23,8 @@
 VP10_CX_SRCS-yes += encoder/cost.h
 VP10_CX_SRCS-yes += encoder/cost.c
 VP10_CX_SRCS-yes += encoder/dct.c
+VP10_CX_SRCS-yes += encoder/hybrid_fwd_txfm.c
+VP10_CX_SRCS-yes += encoder/hybrid_fwd_txfm.h
 VP10_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/denoiser.c
 VP10_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/denoiser.h
 VP10_CX_SRCS-yes += encoder/encodeframe.c
@@ -51,6 +53,8 @@
 VP10_CX_SRCS-yes += encoder/treewriter.h
 VP10_CX_SRCS-yes += encoder/mcomp.c
 VP10_CX_SRCS-yes += encoder/encoder.c
+VP10_CX_SRCS-yes += encoder/palette.h
+VP10_CX_SRCS-yes += encoder/palette.c
 VP10_CX_SRCS-yes += encoder/picklpf.c
 VP10_CX_SRCS-yes += encoder/picklpf.h
 VP10_CX_SRCS-yes += encoder/quantize.c

diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 61eb591..41994dc 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h

@@ -98,8 +98,13 @@
   return mbmi->ref_frame[0] > INTRA_FRAME;
 }
 
+static INLINE int is_compound_ref(const MV_REFERENCE_FRAME *ref_frame) {
+  assert(ref_frame != NULL);
+  return ref_frame[1] > INTRA_FRAME;
+}
+
 static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
-  return mbmi->ref_frame[1] > INTRA_FRAME;
+  return is_compound_ref(mbmi->ref_frame);
 }
 
 PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi,

diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c
index 1f16325..e4c349c 100644
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c

@@ -192,6 +192,7 @@
   const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
   const int has_above = xd->up_available;
   const int has_left = xd->left_available;
+
   // Note:
   // The mode info data structure has a one element border above and to the
   // left of the entries correpsonding to real macroblocks.

diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c
index db78d6b..033326d 100644
--- a/vp9/common/vp9_thread_common.c
+++ b/vp9/common/vp9_thread_common.c

@@ -379,11 +379,11 @@
   for (i = 0; i < REF_CONTEXTS; i++)
     for (j = 0; j < 2; j++)
       for (k = 0; k < 2; k++)
-      accum->single_ref[i][j][k] += counts->single_ref[i][j][k];
+        accum->single_ref[i][j][k] += counts->single_ref[i][j][k];
 
   for (i = 0; i < REF_CONTEXTS; i++)
     for (j = 0; j < 2; j++)
-      accum->comp_ref[i][j] += counts->comp_ref[i][j];
+        accum->comp_ref[i][j] += counts->comp_ref[i][j];
 
   for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
     for (j = 0; j < TX_SIZES; j++)

diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index f5da07e..ed49a69 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c

@@ -237,15 +237,16 @@
   RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
 
   lock_buffer_pool(pool);
+
   for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
     const int old_idx = cm->ref_frame_map[ref_index];
     // Current thread releases the holding of reference frame.
     decrease_ref_count(old_idx, frame_bufs, pool);
 
     // Release the reference frame in reference map.
-    if (mask & 1) {
+    if (mask & 1)
       decrease_ref_count(old_idx, frame_bufs, pool);
-    }
+
     cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
     ++ref_index;
   }
@@ -267,7 +268,7 @@
   }
 
   // Invalidate these references until the next frame starts.
-  for (ref_index = 0; ref_index < 3; ref_index++)
+  for (ref_index = 0; ref_index < REFS_PER_FRAME; ref_index++)
     cm->frame_refs[ref_index].idx = -1;
 }
 
@@ -325,7 +326,6 @@
     pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
   }
 
-
   if (setjmp(cm->error.jmp)) {
     const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
     int i;
@@ -350,9 +350,8 @@
         decrease_ref_count(old_idx, frame_bufs, pool);
 
         // Release the reference frame in reference map.
-        if (mask & 1) {
+        if (mask & 1)
           decrease_ref_count(old_idx, frame_bufs, pool);
-        }
         ++ref_index;
       }
 

diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 63db214..c85e4b0 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c

@@ -20,7 +20,6 @@
 
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_segmentation.h"
-
 CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
   size_t last_coded_q_map_size;
   size_t consec_zero_mv_size;

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index c07eee9..7f94e19 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -4072,7 +4072,7 @@
     // either compound, single or hybrid prediction as per whatever has
     // worked best for that type of frame in the past.
     // It also predicts whether another coding mode would have worked
-    // better that this coding mode. If that is the case, it remembers
+    // better than this coding mode. If that is the case, it remembers
     // that for subsequent frames.
     // It does the same analysis for transform size selection also.
     const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);

diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index eda7743..21e57c4 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c

@@ -590,6 +590,12 @@
 
   rd->thresh_mult[THR_NEARMV] += 1000;
   rd->thresh_mult[THR_NEARA] += 1000;
+  rd->thresh_mult[THR_NEARG] += 1000;
+
+  rd->thresh_mult[THR_ZEROMV] += 2000;
+  rd->thresh_mult[THR_ZEROG] += 2000;
+  rd->thresh_mult[THR_ZEROA] += 2000;
+
   rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
   rd->thresh_mult[THR_COMP_NEARESTGA] += 1000;
 
@@ -597,13 +603,9 @@
 
   rd->thresh_mult[THR_COMP_NEARLA] += 1500;
   rd->thresh_mult[THR_COMP_NEWLA] += 2000;
-  rd->thresh_mult[THR_NEARG] += 1000;
   rd->thresh_mult[THR_COMP_NEARGA] += 1500;
   rd->thresh_mult[THR_COMP_NEWGA] += 2000;
 
-  rd->thresh_mult[THR_ZEROMV] += 2000;
-  rd->thresh_mult[THR_ZEROG] += 2000;
-  rd->thresh_mult[THR_ZEROA] += 2000;
   rd->thresh_mult[THR_COMP_ZEROLA] += 2500;
   rd->thresh_mult[THR_COMP_ZEROGA] += 2500;
 
@@ -618,9 +620,10 @@
 }
 
 void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
-  static const int thresh_mult[2][MAX_REFS] =
-      {{2500, 2500, 2500, 4500, 4500, 2500},
-       {2000, 2000, 2000, 4000, 4000, 2000}};
+  static const int thresh_mult[2][MAX_REFS] = {
+    {2500, 2500, 2500, 4500, 4500, 2500},
+    {2000, 2000, 2000, 4000, 4000, 2000}
+  };
   RD_OPT *const rd = &cpi->rd;
   const int idx = cpi->oxcf.mode == BEST;
   memcpy(rd->thresh_mult_sub8x8, thresh_mult[idx], sizeof(thresh_mult[idx]));

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index b8d1720..973d8f5 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -1559,8 +1559,8 @@
   const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  const int refs[2] = {mbmi->ref_frame[0],
-                       mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]};
+  const int refs[2] = { mbmi->ref_frame[0],
+                        mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
   int_mv ref_mv[2];
   int ite, ref;
   const InterpKernel *kernel = vp9_filter_kernels[mbmi->interp_filter];
@@ -2401,8 +2401,8 @@
   const int this_mode = mbmi->mode;
   int_mv *frame_mv = mode_mv[this_mode];
   int i;
-  int refs[2] = { mbmi->ref_frame[0],
-    (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+  const int refs[2] = { mbmi->ref_frame[0],
+                        mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
   int_mv cur_mv[2];
 #if CONFIG_VP9_HIGHBITDEPTH
   DECLARE_ALIGNED(16, uint16_t, tmp_buf16[MAX_MB_PLANE * 64 * 64]);
@@ -3135,10 +3135,14 @@
     int this_skip2 = 0;
     int64_t total_sse = INT64_MAX;
     int early_term = 0;
+    const MV_REFERENCE_FRAME refs[2] = {
+      vp9_mode_order[mode_index].ref_frame[0],
+      vp9_mode_order[mode_index].ref_frame[1]
+    };
 
     this_mode = vp9_mode_order[mode_index].mode;
-    ref_frame = vp9_mode_order[mode_index].ref_frame[0];
-    second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
+    ref_frame = refs[0];
+    second_ref_frame = refs[1];
 
     // Look at the reference frame of the best mode so far and set the
     // skip mask to look at a subset of the remaining modes.
@@ -3227,7 +3231,7 @@
           continue;
     }
 
-    comp_pred = second_ref_frame > INTRA_FRAME;
+    comp_pred = is_compound_ref(refs);
     if (comp_pred) {
       if (!cpi->allow_comp_inter_inter)
         continue;
@@ -3520,7 +3524,7 @@
   if (best_mbmode.mode == NEWMV) {
     const MV_REFERENCE_FRAME refs[2] = {best_mbmode.ref_frame[0],
         best_mbmode.ref_frame[1]};
-    int comp_pred_mode = refs[1] > INTRA_FRAME;
+    int comp_pred_mode = is_compound_ref(refs);
 
     if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
         ((comp_pred_mode && frame_mv[NEARESTMV][refs[1]].as_int ==
@@ -3814,9 +3818,13 @@
     int64_t total_sse = INT_MAX;
     int early_term = 0;
     struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+    const MV_REFERENCE_FRAME refs[2] = {
+      vp9_ref_order[ref_index].ref_frame[0],
+      vp9_ref_order[ref_index].ref_frame[1]
+    };
 
-    ref_frame = vp9_ref_order[ref_index].ref_frame[0];
-    second_ref_frame = vp9_ref_order[ref_index].ref_frame[1];
+    ref_frame = refs[0];
+    second_ref_frame = refs[1];
 
 #if CONFIG_BETTER_HW_COMPATIBILITY
     // forbid 8X4 and 4X8 partitions if any reference frame is scaled.
@@ -3865,7 +3873,7 @@
                             tile_data->thresh_freq_fact[bsize][ref_index]))
       continue;
 
-    comp_pred = second_ref_frame > INTRA_FRAME;
+    comp_pred = is_compound_ref(refs);
     if (comp_pred) {
       if (!cpi->allow_comp_inter_inter)
         continue;
@@ -3982,7 +3990,7 @@
           rd_opt->threshes[segment_id][bsize][THR_LAST] :
           rd_opt->threshes[segment_id][bsize][THR_ALTR];
       this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
-      rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh;
+          rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh;
       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
         filter_cache[i] = INT64_MAX;
 
@@ -4004,6 +4012,7 @@
             int64_t rs_rd;
             MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
             mbmi->interp_filter = switchable_filter_index;
+
             tmp_rd = rd_pick_best_sub8x8_mode(cpi, x,
                                               &mbmi_ext->ref_mvs[ref_frame][0],
                                               second_ref, best_yrd, &rate,
@@ -4143,7 +4152,7 @@
 
     // Estimate the reference frame signaling cost and add it
     // to the rolling cost variable.
-    if (second_ref_frame > INTRA_FRAME) {
+    if (is_compound_ref(mbmi->ref_frame)) {
       rate2 += ref_costs_comp[ref_frame];
     } else {
       rate2 += ref_costs_single[ref_frame];

diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index bd99c6d..7504c0e 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h

@@ -806,9 +806,12 @@
 VPX_CTRL_USE_TYPE(VP9E_SET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *)
 #define VPX_CTRL_VP9E_SET_SVC_REF_FRAME_CONFIG
 
-VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *)
+/*!\brief
+ *
+ * TODO(rbultje): add support for this control in ffmpeg
+ */
 #define VPX_CTRL_VP9E_SET_RENDER_SIZE
-
+VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *)
 /*!\endcond */
 /*! @} - end defgroup vp8_encoder */
 #ifdef __cplusplus

diff --git a/vpx_dsp/bitreader_buffer.c b/vpx_dsp/bitreader_buffer.c
index d7b55cf..595b9bb 100644
--- a/vpx_dsp/bitreader_buffer.c
+++ b/vpx_dsp/bitreader_buffer.c

@@ -43,11 +43,7 @@
 
 int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb,
                                    int bits) {
-#if CONFIG_MISC_FIXES
   const int nbits = sizeof(unsigned) * 8 - bits - 1;
   const unsigned value = (unsigned)vpx_rb_read_literal(rb, bits + 1) << nbits;
   return ((int) value) >> nbits;
-#else
-  return vpx_rb_read_signed_literal(rb, bits);
-#endif
 }

diff --git a/vpx_dsp/bitwriter_buffer.c b/vpx_dsp/bitwriter_buffer.c
index 6182a72..8633372 100644
--- a/vpx_dsp/bitwriter_buffer.c
+++ b/vpx_dsp/bitwriter_buffer.c

@@ -39,10 +39,5 @@
 
 void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb,
                                      int data, int bits) {
-#if CONFIG_MISC_FIXES
   vpx_wb_write_literal(wb, data, bits + 1);
-#else
-  vpx_wb_write_literal(wb, abs(data), bits);
-  vpx_wb_write_bit(wb, data < 0);
-#endif
 }

diff --git a/vpx_dsp/intrapred.c b/vpx_dsp/intrapred.c
index a9669e5..18bcd87 100644
--- a/vpx_dsp/intrapred.c
+++ b/vpx_dsp/intrapred.c

@@ -832,11 +832,9 @@
 intra_pred_no_4x4(d207)
 intra_pred_no_4x4(d63)
 intra_pred_no_4x4(d45)
-#if CONFIG_MISC_FIXES
 intra_pred_allsizes(d207e)
 intra_pred_allsizes(d63e)
 intra_pred_no_4x4(d45e)
-#endif
 intra_pred_no_4x4(d117)
 intra_pred_no_4x4(d135)
 intra_pred_no_4x4(d153)