Merge "add vp10 ActiveMap/ActiveMapRefreshTest" into nextgenv2
diff --git a/test/assertion_helpers.h b/test/assertion_helpers.h
new file mode 100644
index 0000000..108c40a
--- /dev/null
+++ b/test/assertion_helpers.h
@@ -0,0 +1,287 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_ASSERTION_HELPERS_H_
+#define TEST_ASSERTION_HELPERS_H_
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace libvpx_test {
+namespace assertion_helpers {
+
+// Arrays (1D) are element-wise equal
+template<typename E, size_t n>
+::testing::AssertionResult ArraysEq(const E (&a)[n],
+                                    const E (&b)[n]) {
+  for (size_t i = 0; i < n; i++) {
+    const E &va = a[i];
+    const E &vb = b[i];
+    if (va != vb) {
+      return ::testing::AssertionFailure()
+        << "Arrays do not equal at index "
+        << "[" << i << "]"
+        << " values are: " << va << " vs " << vb;
+    }
+  }
+
+  return ::testing::AssertionSuccess();
+}
+
+// Arrays (1D) are element-wise equal
+// within the index interval [lo, hi)
+template<typename E, size_t n>
+::testing::AssertionResult ArraysEqWithin(const E (&a)[n],
+                                          const E (&b)[n],
+                                          const size_t lo,
+                                          const size_t hi) {
+  assert(hi > lo);
+  assert(hi <= n);
+
+  for (size_t i = lo; i < hi; i++) {
+    const E &va = a[i];
+    const E &vb = b[i];
+    if (va != vb) {
+      return ::testing::AssertionFailure()
+        << "Arrays do not equal at index "
+        << "[" << i << "]"
+        << " values are: " << va << " vs " << vb;
+    }
+  }
+
+  return ::testing::AssertionSuccess();
+}
+
+// Arrays (1D) are element-wise equal
+// outside the index interval [lo, hi)
+template<typename E, size_t n>
+::testing::AssertionResult ArraysEqOutside(const E (&a)[n],
+                                           const E (&b)[n],
+                                           const size_t lo,
+                                           const size_t hi) {
+  assert(hi > lo);
+  assert(hi <= n);
+
+  for (size_t i = 0; i < n; i++) {
+    if (lo <= i && i < hi)
+      continue;
+
+    const E &va = a[i];
+    const E &vb = b[i];
+    if (va != vb) {
+      return ::testing::AssertionFailure()
+        << "Arrays do not equal at index "
+        << "[" << i << "]"
+        << " values are: " << va << " vs " << vb;
+    }
+  }
+
+  return ::testing::AssertionSuccess();
+}
+
+// Arrays (2D) are element-wise equal
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult ArraysEq(const E (&a)[n][m],
+                                    const E (&b)[n][m]) {
+  for (size_t i = 0; i < n; i++) {
+    for (size_t j = 0; j < m; j++) {
+      const E &va = a[i][j];
+      const E &vb = b[i][j];
+      if (va != vb) {
+        return ::testing::AssertionFailure()
+          << "Arrays do not equal at index "
+          << "[" << i << "][" << j << "]"
+          << " values are: " << va << " vs " << vb;
+      }
+    }
+  }
+
+  return ::testing::AssertionSuccess();
+}
+
+// Arrays (2D) are element-wise equal
+// within the index interval [lo0, hi0) x [lo1, hi1) (Cartesian product)
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult ArraysEqWithin(const E (&a)[n][m],
+                                          const E (&b)[n][m],
+                                          const size_t lo0,
+                                          const size_t hi0,
+                                          const size_t lo1,
+                                          const size_t hi1) {
+  assert(hi0 > lo0);
+  assert(hi0 <= n);
+  assert(hi1 > lo1);
+  assert(hi1 <= m);
+
+  for (size_t i = lo0; i < hi0; i++) {
+    for (size_t j = lo1; j < hi1; j++) {
+      const E &va = a[i][j];
+      const E &vb = b[i][j];
+      if (va != vb) {
+        return ::testing::AssertionFailure()
+          << "Arrays do not equal at index "
+          << "[" << i << "][" << j << "]"
+          << " values are: " << va << " vs " << vb;
+      }
+    }
+  }
+
+  return ::testing::AssertionSuccess();
+}
+
+// Arrays (2D) are element-wise equal at every position whose row index is
+// outside [lo0, hi0) and whose column index is outside [lo1, hi1)
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult ArraysEqOutside(const E (&a)[n][m],
+                                           const E (&b)[n][m],
+                                           const size_t lo0,
+                                           const size_t hi0,
+                                           const size_t lo1,
+                                           const size_t hi1) {
+  assert(hi0 > lo0);
+  assert(hi0 <= n);
+  assert(hi1 > lo1);
+  assert(hi1 <= m);
+
+  for (size_t i = 0; i < n; i++) {
+    if (lo0 <= i && i < hi0)
+      continue;
+
+    for (size_t j = 0; j < m; j++) {
+      if (lo1 <= j && j < hi1)
+        continue;
+
+      const E &va = a[i][j];
+      const E &vb = b[i][j];
+      if (va != vb) {
+        return ::testing::AssertionFailure()
+          << "Arrays do not equal at index "
+          << "[" << i << "][" << j << "]"
+          << " values are: " << va << " vs " << vb;
+      }
+    }
+  }
+
+  return ::testing::AssertionSuccess();
+}
+
+// Non-contiguous 2D array buffers are element-wise equal over the region
+// of rows x cols elements addressed by the given strides and offsets
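+// Example: stridea = 8, offseta = 3, rows = 2, cols = 4 compares the
+// elements of 'a' at linear indices [3..6] and [11..14].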
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult BuffersEqWithin(const E (&a)[n][m],
+                                           const E (&b)[n][m],
+                                           const size_t stridea,
+                                           const size_t strideb,
+                                           const size_t offseta,
+                                           const size_t offsetb,
+                                           const size_t rows,
+                                           const size_t cols) {
+  assert(rows <= n);
+  assert(cols <= m);
+  assert(stridea <= m);
+  assert(strideb <= m);
+  assert(cols <= stridea);
+  assert(cols <= strideb);
+  assert(offseta < n * m);
+  assert(offsetb < n * m);
+  assert(offseta + (rows - 1) * stridea + (cols - 1) < n * m);
+  assert(offsetb + (rows - 1) * strideb + (cols - 1) < n * m);
+
+  const E *pa = &a[0][0] + offseta;
+  const E *pb = &b[0][0] + offsetb;
+
+  for (size_t r = 0; r < rows; r++) {
+    for (size_t c = 0; c < cols; c++) {
+      const E &va = pa[c];
+      const E &vb = pb[c];
+      if (va != vb) {
+        return ::testing::AssertionFailure()
+          << "Arrays do not equal at linear index "
+          << "[" << pa - &a[0][0]  << "] vs [" << pb - &b[0][0]  << "]"
+          << " row=" << r << " col=" << c
+          << " values are: " << va << " vs " << vb;
+      }
+    }
+    pa += stridea;
+    pb += strideb;
+  }
+
+  return ::testing::AssertionSuccess();
+}
+
+// Non-contiguous 2D array buffers are element-wise equal everywhere
+// except over the region of rows x cols elements addressed by the given
+// stride and offset.
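+// Example: stride = 8, offset = 3, rows = 2, cols = 4 skips linear indices
+// [3..6] and [11..14]; every other element must match.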
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult BuffersEqOutside(const E (&a)[n][m],
+                                            const E (&b)[n][m],
+                                            const size_t stride,
+                                            const size_t offset,
+                                            const size_t rows,
+                                            const size_t cols ) {
+  assert(rows <= n);
+  assert(cols <= m);
+  assert(stride <= m);
+  assert(cols <= stride);
+  assert(offset < n * m);
+  assert(offset + (rows - 1) * stride + (cols - 1) < n * m);
+
+  const E *const pa = &a[0][0];
+  const E *const pb = &b[0][0];
+
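+  // Walk both buffers linearly. 'end' marks the start of the next excluded
+  // row: everything before it is compared, then 'cols' elements are skipped.
+  // After the last of the 'rows' excluded rows has been passed, 'end' moves
+  // to the end of the buffer so that the remaining tail is compared too.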
+  size_t idx = 0;
+  size_t r = 0;
+  size_t end = offset;  // beginning of first row
+
+  while (idx < n * m) {
+    while (idx < end) {   // until beginning of row or end of buffer
+      const E &va = pa[idx];
+      const E &vb = pb[idx];
+      if (va != vb) {
+        return ::testing::AssertionFailure()
+          << "Arrays do not equal at index "
+          << "[" << idx / m << "][" << idx % m << "]"
+          << " values are: " << va << " vs " << vb;
+      }
+
+      idx++;
+    }
+
+    // Move past row end
+    idx += cols;
+
+    if (++r < rows) {
+      // Move to next row
+      end += stride;
+    } else {
+      // Move to end of buffer
+      end = n * m;
+    }
+  }
+
+  // Sanity check (idx == n * m when the excluded region ends at the buffer)
+  assert(idx == n * m + cols || idx == n * m);
+
+  return ::testing::AssertionSuccess();
+}
+
+}   // namespace assertion_helpers
+}   // namespace libvpx_test
+
+#endif  // TEST_ASSERTION_HELPERS_H_
diff --git a/test/blend_mask6_test.cc b/test/blend_mask6_test.cc
new file mode 100644
index 0000000..d737ddd
--- /dev/null
+++ b/test/blend_mask6_test.cc
@@ -0,0 +1,312 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/randomise.h"
+#include "test/snapshot.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#include "./vp10_rtcd.h"
+
+#include "test/assertion_helpers.h"
+#include "vp10/common/enums.h"
+
+using libvpx_test::assertion_helpers::BuffersEqWithin;
+using libvpx_test::assertion_helpers::BuffersEqOutside;
+using libvpx_test::assertion_helpers::ArraysEq;
+using libvpx_test::FunctionEquivalenceTest;
+using libvpx_test::Snapshot;
+using libvpx_test::Randomise;
+using std::tr1::make_tuple;
+
+namespace {
+
+template<typename F, typename T>
+class BlendMask6Test : public FunctionEquivalenceTest<F> {
+ protected:
+  virtual ~BlendMask6Test() {}
+
+  virtual void Execute(T *p_src0, T *p_src1) = 0;
+
+  void Common() {
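+    // Pick power-of-two block dimensions between 4 and MAX_SB_SIZE.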
+    w = 1 << randomise.uniform<int>(2, MAX_SB_SIZE_LOG2 + 1);
+    h = 1 << randomise.uniform<int>(2, MAX_SB_SIZE_LOG2 + 1);
+
+    randomise(subx);
+    randomise(suby);
+
+    randomise(dst_offset, 0, 32);
+    randomise(dst_stride, w, MAX_SB_SIZE * 5 + 1);
+
+    randomise(src0_offset, 0, 32);
+    randomise(src0_stride, w, MAX_SB_SIZE * 5 + 1);
+
+    randomise(src1_offset, 0, 32);
+    randomise(src1_stride, w, MAX_SB_SIZE * 5 + 1);
+
+    randomise(mask_stride, w * (subx ? 2 : 1), 2 * MAX_SB_SIZE + 1);
+
+    T *p_src0;
+    T *p_src1;
+
+    switch (randomise.uniform<int>(3)) {
+      case 0:   // Separate sources
+        p_src0 = &src0[0][0];
+        p_src1 = &src1[0][0];
+        break;
+      case 1:   // src0 == dst
+        p_src0 = &dst_tst[0][0];
+        src0_stride = dst_stride;
+        src0_offset = dst_offset;
+        p_src1 = &src1[0][0];
+        break;
+      case 2:   // src1 == dst
+        p_src0 = &src0[0][0];
+        p_src1 = &dst_tst[0][0];
+        src1_stride = dst_stride;
+        src1_offset = dst_offset;
+        break;
+      default:
+        FAIL();
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // Prepare
+    //////////////////////////////////////////////////////////////////////////
+
+    snapshot(dst_ref);
+    snapshot(dst_tst);
+
+    snapshot(src0);
+    snapshot(src1);
+
+    snapshot(mask);
+
+    //////////////////////////////////////////////////////////////////////////
+    // Execute
+    //////////////////////////////////////////////////////////////////////////
+
+    Execute(p_src0, p_src1);
+
+    //////////////////////////////////////////////////////////////////////////
+    // Check
+    //////////////////////////////////////////////////////////////////////////
+
+    ASSERT_TRUE(BuffersEqWithin(dst_ref, dst_tst,
+                                dst_stride, dst_stride,
+                                dst_offset, dst_offset,
+                                h, w));
+
+    ASSERT_TRUE(ArraysEq(snapshot.get(src0), src0));
+    ASSERT_TRUE(ArraysEq(snapshot.get(src1), src1));
+    ASSERT_TRUE(ArraysEq(snapshot.get(mask), mask));
+
+    ASSERT_TRUE(BuffersEqOutside(snapshot.get(dst_ref), dst_ref,
+                                 dst_stride,
+                                 dst_offset,
+                                 h, w));
+
+    ASSERT_TRUE(BuffersEqOutside(snapshot.get(dst_tst), dst_tst,
+                                 dst_stride,
+                                 dst_offset,
+                                 h, w));
+  }
+
+  Snapshot snapshot;
+  Randomise randomise;
+
+  T dst_ref[MAX_SB_SIZE][MAX_SB_SIZE * 5];
+  T dst_tst[MAX_SB_SIZE][MAX_SB_SIZE * 5];
+  size_t dst_stride;
+  size_t dst_offset;
+
+  T src0[MAX_SB_SIZE][MAX_SB_SIZE * 5];
+  size_t src0_stride;
+  size_t src0_offset;
+
+  T src1[MAX_SB_SIZE][MAX_SB_SIZE * 5];
+  size_t src1_stride;
+  size_t src1_offset;
+
+  uint8_t mask[2 * MAX_SB_SIZE][2 * MAX_SB_SIZE];
+  size_t mask_stride;
+
+  int w;
+  int h;
+
+  bool suby;
+  bool subx;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride,
+                    uint8_t *src0, uint32_t src0_stride,
+                    uint8_t *src1, uint32_t src1_stride,
+                    const uint8_t *mask, uint32_t mask_stride,
+                    int h, int w, int suby, int subx);
+
+class BlendMask6Test8B : public BlendMask6Test<F8B, uint8_t> {
+ protected:
+  void Execute(uint8_t *p_src0, uint8_t *p_src1) {
+    ref_func_(&dst_ref[0][dst_offset], dst_stride,
+              p_src0 + src0_offset, src0_stride,
+              p_src1 + src1_offset, src1_stride,
+              &mask[0][0], sizeof(mask[0]),
+              h, w, suby, subx);
+
+    ASM_REGISTER_STATE_CHECK(
+      tst_func_(&dst_tst[0][dst_offset], dst_stride,
+                p_src0 + src0_offset, src0_stride,
+                p_src1 + src1_offset, src1_stride,
+                &mask[0][0], sizeof(mask[0]),
+                h, w, suby, subx));
+  }
+};
+
+TEST_P(BlendMask6Test8B, RandomValues) {
+  for (int i = 0; i < 10000 && !HasFatalFailure(); i++) {
+    //////////////////////////////////////////////////////////////////////////
+    // Randomise
+    //////////////////////////////////////////////////////////////////////////
+
+    randomise(dst_ref);
+    randomise(dst_tst);
+
+    randomise(src0);
+    randomise(src1);
+
+    randomise(mask, 65);
+
+    Common();
+  }
+}
+
+TEST_P(BlendMask6Test8B, ExtremeValues) {
+  for (int i = 0; i < 1000 && !HasFatalFailure(); i++) {
+    //////////////////////////////////////////////////////////////////////////
+    // Randomise
+    //////////////////////////////////////////////////////////////////////////
+
+    randomise(dst_ref, 254, 256);
+    randomise(dst_tst, 254, 256);
+
+    randomise(src0, 254, 256);
+    randomise(src1, 254, 256);
+
+    randomise(mask, 63, 65);
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+  SSE4_1_C_COMPARE, BlendMask6Test8B,
+  ::testing::Values(make_tuple(&vpx_blend_mask6_c, &vpx_blend_mask6_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride,
+                     uint8_t *src0, uint32_t src0_stride,
+                     uint8_t *src1, uint32_t src1_stride,
+                     const uint8_t *mask, uint32_t mask_stride,
+                     int h, int w, int suby, int subx, int bd);
+
+class BlendMask6TestHBD : public BlendMask6Test<FHBD, uint16_t> {
+ protected:
+  void Execute(uint16_t *p_src0, uint16_t *p_src1) {
+    ref_func_(CONVERT_TO_BYTEPTR(&dst_ref[0][dst_offset]), dst_stride,
+              CONVERT_TO_BYTEPTR(p_src0 + src0_offset), src0_stride,
+              CONVERT_TO_BYTEPTR(p_src1 + src1_offset), src1_stride,
+              &mask[0][0], sizeof(mask[0]),
+              h, w, suby, subx, bit_depth);
+
+    ASM_REGISTER_STATE_CHECK(
+      tst_func_(CONVERT_TO_BYTEPTR(&dst_tst[0][dst_offset]), dst_stride,
+                CONVERT_TO_BYTEPTR(p_src0 + src0_offset), src0_stride,
+                CONVERT_TO_BYTEPTR(p_src1 + src1_offset), src1_stride,
+                &mask[0][0], sizeof(mask[0]),
+                h, w, suby, subx, bit_depth));
+  }
+
+  int bit_depth;
+};
+
+TEST_P(BlendMask6TestHBD, RandomValues) {
+  for (int i = 0; i < 10000 && !HasFatalFailure(); i++) {
+    //////////////////////////////////////////////////////////////////////////
+    // Randomise
+    //////////////////////////////////////////////////////////////////////////
+
+    bit_depth = randomise.choice(8, 10, 12);
+
+    const int hi = 1 << bit_depth;
+
+    randomise(dst_ref, hi);
+    randomise(dst_tst, hi);
+
+    randomise(src0, hi);
+    randomise(src1, hi);
+
+    randomise(mask, 65);
+
+    Common();
+  }
+}
+
+TEST_P(BlendMask6TestHBD, ExtremeValues) {
+  for (int i = 0; i < 1000 && !HasFatalFailure(); i++) {
+    //////////////////////////////////////////////////////////////////////////
+    // Randomise
+    //////////////////////////////////////////////////////////////////////////
+
+    bit_depth = randomise.choice(8, 10, 12);
+
+    const int hi = 1 << bit_depth;
+    const int lo = hi - 2;
+
+    randomise(dst_ref, lo, hi);
+    randomise(dst_tst, lo, hi);
+
+    randomise(src0, lo, hi);
+    randomise(src1, lo, hi);
+
+    randomise(mask, 63, 65);
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+  SSE4_1_C_COMPARE, BlendMask6TestHBD,
+  ::testing::Values(make_tuple(&vpx_highbd_blend_mask6_c,
+                               &vpx_highbd_blend_mask6_sse4_1)));
+#endif  // HAVE_SSE4_1
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index ad861c3..7fb3e37 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@@ -18,6 +18,7 @@
 namespace libvpx_test {
 
 const char kVP8Name[] = "WebM Project VP8";
+const char kVP10Name[] = "WebM Project VP10";
 
 vpx_codec_err_t Decoder::PeekStream(const uint8_t *cxdata, size_t size,
                                     vpx_codec_stream_info_t *stream_info) {
@@ -46,6 +47,11 @@
   return strncmp(kVP8Name, codec_name, sizeof(kVP8Name) - 1) == 0;
 }
 
+bool Decoder::IsVP10() const {
+  const char *codec_name = GetDecoderName();
+  return strncmp(kVP10Name, codec_name, sizeof(kVP10Name) - 1) == 0;
+}
+
 void DecoderTest::HandlePeekResult(Decoder *const decoder,
                                    CompressedVideoSource *video,
                                    const vpx_codec_err_t res_peek) {
diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index f566c53..1492c5a 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -107,6 +107,8 @@
 
   bool IsVP8() const;
 
+  bool IsVP10() const;
+
   vpx_codec_ctx_t * GetDecoder() {
     return &decoder_;
   }
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index e24c9bf..f4c4c4b 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -58,8 +58,10 @@
 #endif
     {
 #if CONFIG_VP8_ENCODER
-      ASSERT_EQ(&vpx_codec_vp8_cx_algo, CodecInterface())
-          << "Unknown Codec Interface";
+      if (CodecInterface() == &vpx_codec_vp8_cx_algo) {
+        ASSERT_EQ(&vpx_codec_vp8_cx_algo, CodecInterface())
+            << "Unknown Codec Interface";
+      }
 #endif
     }
   }
@@ -261,12 +263,6 @@
 void EncoderTest::RunLoop(VideoSource *video) {
   vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
 
-#if CONFIG_EXT_TILE
-  // Decode all tiles.
-  dec_cfg.tile_col = -1;
-  dec_cfg.tile_row = -1;
-#endif  // CONFIG_EXT_TILE
-
   stats_.Reset();
 
   ASSERT_TRUE(passes_ == 1 || passes_ == 2);
@@ -295,6 +291,15 @@
     if (init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION)
       dec_init_flags |= VPX_CODEC_USE_INPUT_FRAGMENTS;
     Decoder* const decoder = codec_->CreateDecoder(dec_cfg, dec_init_flags, 0);
+#if CONFIG_VP10 && CONFIG_EXT_TILE
+    if (decoder->IsVP10()) {
+      // Set the decode-tile row and column controls to -1 so that the
+      // whole frame is decoded.
+      decoder->Control(VP10_SET_DECODE_TILE_ROW, -1);
+      decoder->Control(VP10_SET_DECODE_TILE_COL, -1);
+    }
+#endif  // CONFIG_VP10 && CONFIG_EXT_TILE
+
     bool again;
     for (again = true; again; video->Next()) {
       again = (video->img() != NULL);
diff --git a/test/function_equivalence_test.h b/test/function_equivalence_test.h
new file mode 100644
index 0000000..50ad4c5
--- /dev/null
+++ b/test/function_equivalence_test.h
@@ -0,0 +1,50 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_FUNCTION_EQUIVALENCE_TEST_H_
+#define TEST_FUNCTION_EQUIVALENCE_TEST_H_
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/clear_system_state.h"
+#include "test/util.h"
+
+namespace libvpx_test {
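+
+// Base fixture for tests that compare a reference and an optimised
+// implementation of the same function. A minimal sketch of a derived test
+// (FooFunc, foo_c and foo_opt are illustrative names, not real functions):
+//
+//   typedef int (*FooFunc)(int x);
+//   class FooTest : public FunctionEquivalenceTest<FooFunc> {};
+//   TEST_P(FooTest, Match) { EXPECT_EQ(ref_func_(7), tst_func_(7)); }
+//   INSTANTIATE_TEST_CASE_P(C_VS_OPT, FooTest,
+//       ::testing::Values(std::tr1::make_tuple(&foo_c, &foo_opt)));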
+template <typename T>
+class FunctionEquivalenceTest :
+  public ::testing::TestWithParam< std::tr1::tuple< T, T > > {
+ public:
+  virtual ~FunctionEquivalenceTest() {}
+
+  virtual void SetUp() {
+    ref_func_ = std::tr1::get<0>(this->GetParam());
+    tst_func_ = std::tr1::get<1>(this->GetParam());
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  T ref_func_;
+  T tst_func_;
+};
+
+}   // namespace libvpx_test
+#endif  // TEST_FUNCTION_EQUIVALENCE_TEST_H_
diff --git a/test/randomise.h b/test/randomise.h
new file mode 100644
index 0000000..fbf419c
--- /dev/null
+++ b/test/randomise.h
@@ -0,0 +1,218 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_RANDOMISE_H_
+#define TEST_RANDOMISE_H_
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <limits>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+
+namespace libvpx_test {
+
+// TODO(any): Replace this when built with C++11
+#define STATIC_ASSERT_INTEGER_TYPE_(T) \
+  GTEST_COMPILE_ASSERT_(std::numeric_limits<T>::is_integer, \
+    integer_type_required);
+
+/**
+ * Deterministic random number generator with various convenience methods.
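+ *
+ * Usage sketch (all 'uniform' ranges are half-open, i.e. [lo, hi)):
+ *
+ *   Randomise randomise;
+ *   int dice = randomise.uniform<int>(1, 7);  // uniform in [1, 7)
+ *   bool coin;
+ *   randomise(coin);                          // true or false
+ *   uint8_t image[16][16];
+ *   randomise(image, 256);                    // every element in [0, 256)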
+ */
+class Randomise {
+ public:
+  Randomise() {
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  virtual ~Randomise() { }
+
+  // Uniformly distributed random number from the range
+  // [std::numeric_limits<R>::min(), std::numeric_limits<R>::max()]
+  template<typename R>
+  R uniform() {
+    STATIC_ASSERT_INTEGER_TYPE_(R);
+  }
+
+  // Random number from the range [0, hi); approximately uniform (the
+  // modulo reduction leaves a small bias when hi is not a power of two)
+  template<typename R, typename H>
+  R uniform(H hi) {
+    assert(hi > 0);
+    R v = uniform<R>();
+    if (std::numeric_limits<R>::is_signed && v < 0)
+      return -v % hi;
+    else
+      return v % hi;
+  }
+
+  // Uniformly distributed random number from the range
+  // [lo, hi)
+  template<typename R, typename L, typename H>
+  R uniform(L lo, H hi) {
+    assert(hi > lo);
+    return uniform<R, H>(hi - lo) + lo;
+  }
+
+  // Randomly pick and return one of the arguments
+  template<typename T>
+  T choice(T v0, T v1) {
+    switch (uniform<int>(2)) {
+      case 0: return v0;
+      default: return v1;
+    }
+  }
+
+  // Randomly pick and return one of the arguments
+  template<typename T>
+  T choice(T v0, T v1, T v2) {
+    switch (uniform<int>(3)) {
+      case 0: return v0;
+      case 1: return v1;
+      default: return v2;
+    }
+  }
+
+  template<typename T>
+  void operator()(T &e) {  // NOLINT
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    e = uniform<T>();
+  }
+
+  template<typename T, typename H>
+  void operator()(T &e, H hi) {  // NOLINT
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    e = uniform<T, H>(hi);
+  }
+
+  template<typename T, typename L, typename H>
+  void operator()(T &e, L lo, H hi) {  // NOLINT
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    e = uniform<T, L, H>(lo, hi);
+  }
+
+  template<typename T, size_t n>
+  void operator()(T (&arr)[n]) {
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    for (size_t i = 0; i < n; i++) {
+      arr[i] = uniform<T>();
+    }
+  }
+
+  template<typename T, size_t n, typename H>
+  void operator()(T (&arr)[n], H hi) {
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    for (size_t i = 0; i < n; i++) {
+      arr[i] = uniform<T, H>(hi);
+    }
+  }
+
+  template<typename T, size_t n, typename L, typename H>
+  void operator()(T (&arr)[n], L lo, H hi) {
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    for (size_t i = 0; i < n; i++) {
+      arr[i] = uniform<T, L, H>(lo, hi);
+    }
+  }
+
+  template<typename T, size_t n, size_t m>
+  void operator()(T (&arr)[n][m]) {
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    for (size_t i = 0; i < n; i++) {
+      for (size_t j = 0; j < m; j++) {
+        arr[i][j] = uniform<T>();
+      }
+    }
+  }
+
+  template<typename T, size_t n, size_t m, typename H>
+  void operator()(T (&arr)[n][m], H hi) {
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    for (size_t i = 0; i < n; i++) {
+      for (size_t j = 0; j < m; j++) {
+        arr[i][j] = uniform<T, H>(hi);
+      }
+    }
+  }
+
+  template<typename T, size_t n, size_t m, typename L, typename H>
+  void operator()(T (&arr)[n][m], L lo, H hi) {
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    for (size_t i = 0; i < n; i++) {
+      for (size_t j = 0; j < m; j++) {
+        arr[i][j] = uniform<T, L, H>(lo, hi);
+      }
+    }
+  }
+
+ private:
+  libvpx_test::ACMRandom rnd_;
+};
+
+// Add further specialisations as necessary
+
+template<>
+inline bool Randomise::uniform<bool>() {
+  return rnd_.Rand8() & 1 ? true : false;
+}
+
+template<>
+inline uint8_t Randomise::uniform<uint8_t>() {
+  return rnd_.Rand8();
+}
+
+template<>
+inline uint16_t Randomise::uniform<uint16_t>() {
+  return rnd_.Rand16();
+}
+
+template<>
+inline uint32_t Randomise::uniform<uint32_t>() {
+  const uint32_t l = uniform<uint16_t>();
+  const uint32_t h = uniform<uint16_t>();
+  return h << 16 | l;
+}
+
+template<>
+inline uint64_t Randomise::uniform<uint64_t>() {
+  const uint64_t l = uniform<uint32_t>();
+  const uint64_t h = uniform<uint32_t>();
+  return h << 32 | l;
+}
+
+template<>
+inline int8_t Randomise::uniform<int8_t>() { return uniform<uint8_t>(); }
+
+template<>
+inline int16_t Randomise::uniform<int16_t>() { return uniform<uint16_t>(); }
+
+template<>
+inline int32_t Randomise::uniform<int32_t>() { return uniform<uint32_t>(); }
+
+template<>
+inline int64_t Randomise::uniform<int64_t>() { return uniform<uint64_t>(); }
+
+}  // namespace libvpx_test
+
+#endif  // TEST_RANDOMISE_H_
diff --git a/test/snapshot.h b/test/snapshot.h
new file mode 100644
index 0000000..b67edde
--- /dev/null
+++ b/test/snapshot.h
@@ -0,0 +1,109 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_SNAPSHOT_H_
+#define TEST_SNAPSHOT_H_
+
+#include <assert.h>
+#include <string.h>
+
+#include <map>
+
+namespace libvpx_test {
+
+/**
+ * Allows capturing and retrieving snapshots of arbitrary blobs of memory.
+ * The blob size is derived from compile-time type information.
+ *
+ * Usage:
+ * void example() {
+ *   Snapshot snapshot;
+ *
+ *   int foo = 4;
+ *
+ *   snapshot(foo);
+ *
+ *   foo = 10;
+ *
+ *   assert(snapshot.get(foo) == 4);     // Pass
+ *   assert(snapshot.get(foo) == foo);   // Fail (4 != 10)
+ *
+ *   char bar[10][10];
+ *   memset(bar, 3, sizeof(bar));
+ *
+ *   snapshot(bar);
+ *
+ *   memset(bar, 8, sizeof(bar));
+ *
+ *   assert(sum(bar) == 800);                 // Pass
+ *   assert(sum(snapshot.get(bar)) == 300);   // Pass
+ * }
+ */
+class Snapshot {
+ public:
+  virtual ~Snapshot() {
+    for (snapshot_map_t::iterator it = snapshots_.begin();
+         it != snapshots_.end(); it++) {
+      delete[] it->second;
+    }
+  }
+
+  /**
+   * Take new snapshot for object
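+   * (a shallow, byte-wise copy of the object; memory reachable through
+   * pointer members is not copied)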
+   */
+  template<typename E>
+  void take(const E &e) {
+    const void *const key = reinterpret_cast<const void*>(&e);
+
+    snapshot_map_t::iterator it = snapshots_.find(key);
+
+    if (it != snapshots_.end())
+      delete[] it->second;
+
+    char *const buf = new char[sizeof(E)];
+
+    memcpy(buf, &e, sizeof(E));
+
+    snapshots_[key] = buf;
+  }
+
+  /**
+   * Same as 'take'
+   */
+  template<typename E>
+  void operator()(const E &e) {
+    take(e);
+  }
+
+  /**
+   * Retrieve last snapshot for object
+   */
+  template<typename E>
+  const E& get(const E &e) const {
+    const void *const key = reinterpret_cast<const void*>(&e);
+
+    snapshot_map_t::const_iterator it = snapshots_.find(key);
+
+    assert(it != snapshots_.end());
+
+    return *reinterpret_cast<const E*>(it->second);
+  }
+
+ private:
+  typedef std::map<const void*, const char*> snapshot_map_t;
+
+  snapshot_map_t snapshots_;
+};
+
+}   // namespace libvpx_test
+
+#endif  // TEST_SNAPSHOT_H_
diff --git a/test/svc_test.cc b/test/svc_test.cc
index e573e10..1ad17be 100644
--- a/test/svc_test.cc
+++ b/test/svc_test.cc
@@ -61,12 +61,14 @@
     codec_enc_.kf_max_dist = 100;
 
     vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
-#if CONFIG_EXT_TILE
-    dec_cfg.tile_col = -1;
-    dec_cfg.tile_row = -1;
-#endif  // CONFIG_EXT_TILE
     VP9CodecFactory codec_factory;
     decoder_ = codec_factory.CreateDecoder(dec_cfg, 0);
+#if CONFIG_VP10 && CONFIG_EXT_TILE
+    if (decoder_->IsVP10()) {
+      decoder_->Control(VP10_SET_DECODE_TILE_ROW, -1);
+      decoder_->Control(VP10_SET_DECODE_TILE_COL, -1);
+    }
+#endif  // CONFIG_VP10 && CONFIG_EXT_TILE
 
     tile_columns_ = 0;
     tile_rows_ = 0;
diff --git a/test/test.mk b/test/test.mk
index 8682a88..8eda2dd 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -174,6 +174,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_fht8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_fht16x16_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ANS)          += vp10_ans_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_EXT_TILE)     += vp10_ext_tile_test.cc
 
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += sum_squares_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += subtract_test.cc
@@ -181,7 +182,12 @@
 ifeq ($(CONFIG_EXT_INTER),yes)
 LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc
 LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_sad_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_mask6_test.cc
 endif
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+LIBVPX_TEST_SRCS-$(HAVE_SSE4_1) += vp10_highbd_iht_test.cc
+endif # CONFIG_VP9_HIGHBITDEPTH
 endif # VP10
 
 ## Multi-codec / unconditional whitebox tests.
@@ -193,6 +199,7 @@
 endif
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_txfm_test.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_txfm_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_fwd_txfm1d_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm1d_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_fwd_txfm2d_test.cc
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index 9a049bf..dc31d06 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -35,13 +35,18 @@
     cfg.w = 704;
     cfg.h = 144;
     cfg.threads = 1;
-#if CONFIG_EXT_TILE
-    cfg.tile_col = -1;
-    cfg.tile_row = -1;
-#endif  // CONFIG_EXT_TILE
     fw_dec_ = codec_->CreateDecoder(cfg, 0);
     inv_dec_ = codec_->CreateDecoder(cfg, 0);
     inv_dec_->Control(VP9_INVERT_TILE_DECODE_ORDER, 1);
+
+#if CONFIG_VP10 && CONFIG_EXT_TILE
+    if (fw_dec_->IsVP10() && inv_dec_->IsVP10()) {
+      fw_dec_->Control(VP10_SET_DECODE_TILE_ROW, -1);
+      fw_dec_->Control(VP10_SET_DECODE_TILE_COL, -1);
+      inv_dec_->Control(VP10_SET_DECODE_TILE_ROW, -1);
+      inv_dec_->Control(VP10_SET_DECODE_TILE_COL, -1);
+    }
+#endif  // CONFIG_VP10 && CONFIG_EXT_TILE
   }
 
   virtual ~TileIndependenceTest() {
diff --git a/test/vp10_ext_tile_test.cc b/test/vp10_ext_tile_test.cc
new file mode 100644
index 0000000..ad04eeb
--- /dev/null
+++ b/test/vp10_ext_tile_test.cc
@@ -0,0 +1,203 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+
+namespace {
+// The number of frames to be encoded/decoded
+const int kLimit = 8;
+// Skip the frame at pts == kSkip to check frame decoding independence.
+const int kSkip = 5;
+const int kTileSize = 1;
+const int kTileSizeInPixels = (kTileSize << 6);
+// Use a frame width and height that are multiples of the tile size.
+const int kImgWidth = 704;
+const int kImgHeight = 576;
+
+class VP10ExtTileTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ protected:
+  VP10ExtTileTest()
+      : EncoderTest(GET_PARAM(0)),
+        encoding_mode_(GET_PARAM(1)),
+        set_cpu_used_(GET_PARAM(2)) {
+    init_flags_ = VPX_CODEC_USE_PSNR;
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+    cfg.w = kImgWidth;
+    cfg.h = kImgHeight;
+
+    decoder_ = codec_->CreateDecoder(cfg, 0);
+    decoder_->Control(VP10_SET_DECODE_TILE_ROW, -1);
+    decoder_->Control(VP10_SET_DECODE_TILE_COL, -1);
+
+    // Allocate buffer to store tile image.
+    vpx_img_alloc(&tile_img_, VPX_IMG_FMT_I420, kImgWidth, kImgHeight, 32);
+
+    md5_.clear();
+    tile_md5_.clear();
+  }
+
+  virtual ~VP10ExtTileTest() {
+    vpx_img_free(&tile_img_);
+    delete decoder_;
+  }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_end_usage = VPX_VBR;
+    cfg_.g_error_resilient = 1;
+
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_min_quantizer = 0;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      // Encode setting
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 0);
+      encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
+
+      // The tile size is 64x64.
+      encoder->Control(VP9E_SET_TILE_COLUMNS, kTileSize);
+      encoder->Control(VP9E_SET_TILE_ROWS, kTileSize);
+#if CONFIG_EXT_PARTITION
+      // Always use 64x64 max partition.
+      encoder->Control(VP10E_SET_SUPERBLOCK_SIZE, VPX_SUPERBLOCK_SIZE_64X64);
+#endif  // CONFIG_EXT_PARTITION
+    }
+
+    if (video->frame() == 1) {
+      frame_flags_ = VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF |
+          VP8_EFLAG_NO_UPD_ARF;
+    }
+  }
+
+  virtual void DecompressedFrameHook(const vpx_image_t &img,
+                                     vpx_codec_pts_t pts) {
+    // Skip the frame that FramePktHook() below does not decode, so that
+    // md5_ and tile_md5_ stay aligned.
+    if (pts == (vpx_codec_pts_t)kSkip)
+      return;
+
+    // Calculate MD5 as the reference.
+    ::libvpx_test::MD5 md5_res;
+    md5_res.Add(&img);
+    md5_.push_back(md5_res.Get());
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    // Skip decoding 1 frame.
+    if (pkt->data.frame.pts == (vpx_codec_pts_t)kSkip)
+      return;
+
+    bool IsLastFrame = (pkt->data.frame.pts == (vpx_codec_pts_t)(kLimit - 1));
+
+    // Decode the first (kLimit - 1) frames as whole frames, and decode the
+    // last frame tile by tile.
+    for (int r = 0; r < kImgHeight / kTileSizeInPixels; ++r) {
+      for (int c = 0; c < kImgWidth / kTileSizeInPixels; ++c) {
+        if (!IsLastFrame) {
+          decoder_->Control(VP10_SET_DECODE_TILE_ROW, -1);
+          decoder_->Control(VP10_SET_DECODE_TILE_COL, -1);
+        } else {
+          decoder_->Control(VP10_SET_DECODE_TILE_ROW, r);
+          decoder_->Control(VP10_SET_DECODE_TILE_COL, c);
+        }
+
+        const vpx_codec_err_t res = decoder_->DecodeFrame(
+            reinterpret_cast<uint8_t*>(pkt->data.frame.buf),
+            pkt->data.frame.sz);
+        if (res != VPX_CODEC_OK) {
+          abort_ = true;
+          ASSERT_EQ(VPX_CODEC_OK, res);
+        }
+        const vpx_image_t *img = decoder_->GetDxData().Next();
+
+        if (!IsLastFrame) {
+          if (img) {
+            ::libvpx_test::MD5 md5_res;
+            md5_res.Add(img);
+            tile_md5_.push_back(md5_res.Get());
+          }
+          break;
+        }
+
+        const int kMaxMBPlane = 3;
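+        // Copy the decoded tile into its position in the full-frame
+        // tile_img_; for I420, chroma planes are subsampled by 2.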
+        for (int plane = 0; plane < kMaxMBPlane; ++plane) {
+          const int shift = (plane == 0) ? 0 : 1;
+          int tile_height = kTileSizeInPixels >> shift;
+          int tile_width = kTileSizeInPixels >> shift;
+
+          for (int tr = 0; tr < tile_height; ++tr) {
+            memcpy(tile_img_.planes[plane] +
+                   tile_img_.stride[plane] * (r * tile_height + tr) +
+                   c * tile_width,
+                   img->planes[plane] + img->stride[plane] * tr, tile_width);
+          }
+        }
+      }
+
+      if (!IsLastFrame)
+        break;
+    }
+
+    if (IsLastFrame) {
+      ::libvpx_test::MD5 md5_res;
+      md5_res.Add(&tile_img_);
+      tile_md5_.push_back(md5_res.Get());
+    }
+  }
+
+  ::libvpx_test::TestMode encoding_mode_;
+  int set_cpu_used_;
+  ::libvpx_test::Decoder *decoder_;
+  vpx_image_t tile_img_;
+  std::vector<std::string> md5_;
+  std::vector<std::string> tile_md5_;
+};
+
+TEST_P(VP10ExtTileTest, DecoderResultTest) {
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv",
+                                       kImgWidth, kImgHeight, 30, 1, 0, kLimit);
+  cfg_.rc_target_bitrate = 500;
+  cfg_.g_error_resilient = VPX_ERROR_RESILIENT_DEFAULT;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_threads = 1;
+
+  // Tile encoding
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Compare to check if two vectors are equal.
+  ASSERT_EQ(md5_, tile_md5_);
+}
+
+VP10_INSTANTIATE_TEST_CASE(
+    // Now only test 2-pass mode.
+    VP10ExtTileTest,
+    ::testing::Values(::libvpx_test::kTwoPassGood),
+    ::testing::Range(0, 4));
+}  // namespace
diff --git a/test/vp10_fht16x16_test.cc b/test/vp10_fht16x16_test.cc
index 8833250..deccc81 100644
--- a/test/vp10_fht16x16_test.cc
+++ b/test/vp10_fht16x16_test.cc
@@ -132,7 +132,7 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   int i, j;
   const int stride = 16;
-  const int num_tests = 200000;
+  const int num_tests = 1000;
 
   for (i = 0; i < num_tests; ++i) {
     for (j = 0; j < num_coeffs_; ++j) {
@@ -207,7 +207,19 @@
     make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 2, 10),
     make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 2, 12),
     make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 3, 10),
-    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 3, 12)
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 3, 12),
+#if CONFIG_EXT_TX
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 4, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 4, 12),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 5, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 5, 12),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 6, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 6, 12),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 7, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 7, 12),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 8, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 8, 12),
+#endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(
     SSE4_1, VP10HighbdTrans16x16HT,
diff --git a/test/vp10_fht4x4_test.cc b/test/vp10_fht4x4_test.cc
index 5b81095..c5a4382 100644
--- a/test/vp10_fht4x4_test.cc
+++ b/test/vp10_fht4x4_test.cc
@@ -38,8 +38,10 @@
 typedef void (*IhighbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                               int tx_type, int bd);
 typedef void (*HBDFhtFunc)(const int16_t *input, int32_t *output, int stride,
-                        int tx_type, int bd);
-// Target optimized function, tx_type, bit depth
+                           int tx_type, int bd);
+
+// HighbdHt4x4Param argument list:
+// <Target optimized function, tx_type, bit depth>
 typedef tuple<HBDFhtFunc, int, int> HighbdHt4x4Param;
 
 void highbe_fht4x4_ref(const int16_t *in, int32_t *out, int stride,
@@ -96,12 +98,12 @@
     mask_ = (1 << bit_depth_) - 1;
     num_coeffs_ = 16;
 
-    input_ = reinterpret_cast<int16_t *>
-       (vpx_memalign(16, sizeof(int16_t) * num_coeffs_));
-    output_ = reinterpret_cast<int32_t *>
-        (vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
-    output_ref_ = reinterpret_cast<int32_t *>
-        (vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
+    input_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, sizeof(int16_t) * num_coeffs_));
+    output_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
+    output_ref_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
   }
 
   virtual void TearDown() {
@@ -130,7 +132,7 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   int i, j;
   const int stride = 4;
-  const int num_tests = 200000;
+  const int num_tests = 1000;
   const int num_coeffs = 16;
 
   for (i = 0; i < num_tests; ++i) {
@@ -197,9 +199,7 @@
 #endif  // HAVE_SSE2
 
 #if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    SSE4_1, VP10HighbdTrans4x4HT,
-    ::testing::Values(
+const HighbdHt4x4Param kArrayHighbdHt4x4Param[] = {
          make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 0, 10),
          make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 0, 12),
          make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 1, 10),
@@ -207,7 +207,25 @@
          make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 2, 10),
          make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 2, 12),
          make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 3, 10),
-         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 3, 12)));
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 3, 12),
+#if CONFIG_EXT_TX
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 4, 10),
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 4, 12),
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 5, 10),
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 5, 12),
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 6, 10),
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 6, 12),
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 7, 10),
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 7, 12),
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 8, 10),
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 8, 12),
+#endif  // CONFIG_EXT_TX
+};
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VP10HighbdTrans4x4HT,
+    ::testing::ValuesIn(kArrayHighbdHt4x4Param));
+
 #endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
 
 }  // namespace
diff --git a/test/vp10_fht8x8_test.cc b/test/vp10_fht8x8_test.cc
index aadd77d..da278c4 100644
--- a/test/vp10_fht8x8_test.cc
+++ b/test/vp10_fht8x8_test.cc
@@ -131,7 +131,7 @@
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   int i, j;
   const int stride = 8;
-  const int num_tests = 200000;
+  const int num_tests = 1000;
   const int num_coeffs = 64;
 
   for (i = 0; i < num_tests; ++i) {
@@ -207,7 +207,19 @@
     make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 2, 10),
     make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 2, 12),
     make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 3, 10),
-    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 3, 12)
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 3, 12),
+#if CONFIG_EXT_TX
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 4, 10),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 4, 12),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 5, 10),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 5, 12),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 6, 10),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 6, 12),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 7, 10),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 7, 12),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 8, 10),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 8, 12),
+#endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(
     SSE4_1, VP10HighbdTrans8x8HT,
diff --git a/test/vp10_fwd_txfm2d_test.cc b/test/vp10_fwd_txfm2d_test.cc
index 8e90dc2..0e35782 100644
--- a/test/vp10_fwd_txfm2d_test.cc
+++ b/test/vp10_fwd_txfm2d_test.cc
@@ -13,8 +13,9 @@
 #include <stdlib.h>
 
 #include "test/acm_random.h"
+#include "test/util.h"
 #include "test/vp10_txfm_test.h"
-#include "vp10/common/vp10_fwd_txfm2d_cfg.h"
+#include "vp10/common/vp10_txfm.h"
 #include "./vp10_rtcd.h"
 
 using libvpx_test::ACMRandom;
@@ -23,95 +24,158 @@
 using libvpx_test::compute_avg_abs_error;
 using libvpx_test::Fwd_Txfm2d_Func;
 using libvpx_test::TYPE_TXFM;
-using libvpx_test::TYPE_DCT;
-using libvpx_test::TYPE_ADST;
 
 namespace {
-
 #if CONFIG_VP9_HIGHBITDEPTH
-const int txfm_size_num = 5;
-const int txfm_size_ls[5] = {4, 8, 16, 32, 64};
-const TXFM_2D_CFG* fwd_txfm_cfg_ls[5][4] = {
-    {&fwd_txfm_2d_cfg_dct_dct_4, &fwd_txfm_2d_cfg_dct_adst_4,
-     &fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_dct_4},
-    {&fwd_txfm_2d_cfg_dct_dct_8, &fwd_txfm_2d_cfg_dct_adst_8,
-     &fwd_txfm_2d_cfg_adst_adst_8, &fwd_txfm_2d_cfg_adst_dct_8},
-    {&fwd_txfm_2d_cfg_dct_dct_16, &fwd_txfm_2d_cfg_dct_adst_16,
-     &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_dct_16},
-    {&fwd_txfm_2d_cfg_dct_dct_32, &fwd_txfm_2d_cfg_dct_adst_32,
-     &fwd_txfm_2d_cfg_adst_adst_32, &fwd_txfm_2d_cfg_adst_dct_32},
-    {&fwd_txfm_2d_cfg_dct_dct_64, NULL, NULL, NULL}};
+// tx_type_, tx_size_, max_error_, max_avg_error_
+typedef std::tr1::tuple<TX_TYPE, TX_SIZE, double, double> VP10FwdTxfm2dParam;
 
-const Fwd_Txfm2d_Func fwd_txfm_func_ls[5] = {
-    vp10_fwd_txfm2d_4x4_c, vp10_fwd_txfm2d_8x8_c, vp10_fwd_txfm2d_16x16_c,
-    vp10_fwd_txfm2d_32x32_c, vp10_fwd_txfm2d_64x64_c};
+class VP10FwdTxfm2d : public ::testing::TestWithParam<VP10FwdTxfm2dParam> {
+ public:
+  virtual void SetUp() {
+    tx_type_ = GET_PARAM(0);
+    tx_size_ = GET_PARAM(1);
+    max_error_ = GET_PARAM(2);
+    max_avg_error_ = GET_PARAM(3);
+    count_ = 500;
+    TXFM_2D_FLIP_CFG fwd_txfm_flip_cfg =
+        vp10_get_fwd_txfm_cfg(tx_type_, tx_size_);
+    const TXFM_2D_CFG *fwd_txfm_cfg = fwd_txfm_flip_cfg.cfg;
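+    // The per-stage shifts sum to the net scaling of the forward transform:
+    // the output is amplified by 2^amplify_bit (attenuated when negative).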
+    int amplify_bit = fwd_txfm_cfg->shift[0] + fwd_txfm_cfg->shift[1] +
+                      fwd_txfm_cfg->shift[2];
+    ud_flip_ = fwd_txfm_flip_cfg.ud_flip;
+    lr_flip_ = fwd_txfm_flip_cfg.lr_flip;
+    amplify_factor_ =
+        amplify_bit >= 0 ? (1 << amplify_bit) : (1.0 / (1 << -amplify_bit));
 
-const int txfm_type_num = 4;
-const TYPE_TXFM type_ls_0[4] = {TYPE_DCT, TYPE_DCT, TYPE_ADST, TYPE_ADST};
-const TYPE_TXFM type_ls_1[4] = {TYPE_DCT, TYPE_ADST, TYPE_ADST, TYPE_DCT};
+    fwd_txfm_ = libvpx_test::fwd_txfm_func_ls[tx_size_];
+    txfm1d_size_ = libvpx_test::get_txfm1d_size(tx_size_);
+    txfm2d_size_ = txfm1d_size_ * txfm1d_size_;
+    get_txfm1d_type(tx_type_, &type0_, &type1_);
+    input_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, sizeof(int16_t) * txfm2d_size_));
+    output_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(16, sizeof(int32_t) * txfm2d_size_));
+    ref_input_ = reinterpret_cast<double *>(
+        vpx_memalign(16, sizeof(double) * txfm2d_size_));
+    ref_output_ = reinterpret_cast<double *>(
+        vpx_memalign(16, sizeof(double) * txfm2d_size_));
+  }
 
-TEST(vp10_fwd_txfm2d, accuracy) {
-  for (int txfm_size_idx = 0; txfm_size_idx < txfm_size_num; ++txfm_size_idx) {
-    int txfm_size = txfm_size_ls[txfm_size_idx];
-    int sqr_txfm_size = txfm_size * txfm_size;
-    int16_t* input = new int16_t[sqr_txfm_size];
-    int32_t* output = new int32_t[sqr_txfm_size];
-    double* ref_input = new double[sqr_txfm_size];
-    double* ref_output = new double[sqr_txfm_size];
-
-    for (int txfm_type_idx = 0; txfm_type_idx < txfm_type_num;
-         ++txfm_type_idx) {
-      const TXFM_2D_CFG* fwd_txfm_cfg =
-          fwd_txfm_cfg_ls[txfm_size_idx][txfm_type_idx];
-      if (fwd_txfm_cfg != NULL) {
-        Fwd_Txfm2d_Func fwd_txfm_func = fwd_txfm_func_ls[txfm_size_idx];
-        TYPE_TXFM type0 = type_ls_0[txfm_type_idx];
-        TYPE_TXFM type1 = type_ls_1[txfm_type_idx];
-        int amplify_bit = fwd_txfm_cfg->shift[0] + fwd_txfm_cfg->shift[1] +
-                          fwd_txfm_cfg->shift[2];
-        double amplify_factor =
-            amplify_bit >= 0 ? (1 << amplify_bit) : (1.0 / (1 << -amplify_bit));
-        int tx_type = libvpx_test::get_tx_type(fwd_txfm_cfg);
-
-        ACMRandom rnd(ACMRandom::DeterministicSeed());
-        int count = 500;
-        double avg_abs_error = 0;
-        for (int ci = 0; ci < count; ci++) {
-          for (int ni = 0; ni < sqr_txfm_size; ++ni) {
-            input[ni] = rnd.Rand16() % input_base;
-            ref_input[ni] = static_cast<double>(input[ni]);
-            output[ni] = 0;
-            ref_output[ni] = 0;
-          }
-
-          fwd_txfm_func(input, output, txfm_size, tx_type, bd);
-          reference_hybrid_2d(ref_input, ref_output, txfm_size, type0, type1);
-
-          for (int ni = 0; ni < sqr_txfm_size; ++ni) {
-            ref_output[ni] = round(ref_output[ni] * amplify_factor);
-            EXPECT_LE(fabs(output[ni] - ref_output[ni]) / amplify_factor, 70);
-          }
-          avg_abs_error += compute_avg_abs_error<int32_t, double>(
-              output, ref_output, sqr_txfm_size);
-        }
-
-        avg_abs_error /= amplify_factor;
-        avg_abs_error /= count;
-        // max_abs_avg_error comes from upper bound of avg_abs_error
-        // printf("type0: %d type1: %d txfm_size: %d accuracy_avg_abs_error:
-        // %f\n",
-        // type0, type1, txfm_size, avg_abs_error);
-        double max_abs_avg_error = 7;
-        EXPECT_LE(avg_abs_error, max_abs_avg_error);
+  void RunFwdAccuracyCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    double avg_abs_error = 0;
+    for (int ci = 0; ci < count_; ci++) {
+      for (int ni = 0; ni < txfm2d_size_; ++ni) {
+        input_[ni] = rnd.Rand16() % input_base;
+        ref_input_[ni] = static_cast<double>(input_[ni]);
+        output_[ni] = 0;
+        ref_output_[ni] = 0;
       }
+
+      fwd_txfm_(input_, output_, txfm1d_size_, tx_type_, bd);
+
+      if (lr_flip_ && ud_flip_)
+        libvpx_test::fliplrud(ref_input_, txfm1d_size_, txfm1d_size_);
+      else if (lr_flip_)
+        libvpx_test::fliplr(ref_input_, txfm1d_size_, txfm1d_size_);
+      else if (ud_flip_)
+        libvpx_test::flipud(ref_input_, txfm1d_size_, txfm1d_size_);
+
+      reference_hybrid_2d(ref_input_, ref_output_, txfm1d_size_,
+                          type0_, type1_);
+
+      for (int ni = 0; ni < txfm2d_size_; ++ni) {
+        ref_output_[ni] = round(ref_output_[ni] * amplify_factor_);
+        EXPECT_GE(max_error_,
+                  fabs(output_[ni] - ref_output_[ni]) / amplify_factor_);
+      }
+      avg_abs_error += compute_avg_abs_error<int32_t, double>(
+          output_, ref_output_, txfm2d_size_);
     }
 
-    delete[] input;
-    delete[] output;
-    delete[] ref_input;
-    delete[] ref_output;
+    avg_abs_error /= amplify_factor_;
+    avg_abs_error /= count_;
+    // max_abs_avg_error comes from upper bound of avg_abs_error
+    // printf("type0: %d type1: %d txfm_size: %d accuracy_avg_abs_error:
+    // %f\n", type0_, type1_, txfm1d_size_, avg_abs_error);
+    EXPECT_GE(max_avg_error_, avg_abs_error);
   }
+
+  virtual void TearDown() {
+    vpx_free(input_);
+    vpx_free(output_);
+    vpx_free(ref_input_);
+    vpx_free(ref_output_);
+  }
+
+ private:
+  double max_error_;
+  double max_avg_error_;
+  int count_;
+  double amplify_factor_;
+  TX_TYPE tx_type_;
+  TX_SIZE tx_size_;
+  int txfm1d_size_;
+  int txfm2d_size_;
+  Fwd_Txfm2d_Func fwd_txfm_;
+  TYPE_TXFM type0_;
+  TYPE_TXFM type1_;
+  int16_t* input_;
+  int32_t* output_;
+  double* ref_input_;
+  double* ref_output_;
+  int ud_flip_;  // flip upside down
+  int lr_flip_;  // flip left to right
+};
+
+TEST_P(VP10FwdTxfm2d, RunFwdAccuracyCheck) {
+  RunFwdAccuracyCheck();
 }
+
+INSTANTIATE_TEST_CASE_P(
+    C, VP10FwdTxfm2d,
+    ::testing::Values(
+#if CONFIG_EXT_TX
+        VP10FwdTxfm2dParam(FLIPADST_DCT,  TX_4X4, 2, 0.2),
+        VP10FwdTxfm2dParam(DCT_FLIPADST,  TX_4X4, 2, 0.2),
+        VP10FwdTxfm2dParam(FLIPADST_FLIPADST, TX_4X4, 2, 0.2),
+        VP10FwdTxfm2dParam(ADST_FLIPADST, TX_4X4, 2, 0.2),
+        VP10FwdTxfm2dParam(FLIPADST_ADST, TX_4X4, 2, 0.2),
+        VP10FwdTxfm2dParam(FLIPADST_DCT,  TX_8X8, 5, 0.6),
+        VP10FwdTxfm2dParam(DCT_FLIPADST,  TX_8X8, 5, 0.6),
+        VP10FwdTxfm2dParam(FLIPADST_FLIPADST, TX_8X8, 5, 0.6),
+        VP10FwdTxfm2dParam(ADST_FLIPADST, TX_8X8, 5, 0.6),
+        VP10FwdTxfm2dParam(FLIPADST_ADST, TX_8X8, 5, 0.6),
+        VP10FwdTxfm2dParam(FLIPADST_DCT,  TX_16X16, 11, 1.5),
+        VP10FwdTxfm2dParam(DCT_FLIPADST,  TX_16X16, 11, 1.5),
+        VP10FwdTxfm2dParam(FLIPADST_FLIPADST, TX_16X16, 11, 1.5),
+        VP10FwdTxfm2dParam(ADST_FLIPADST, TX_16X16, 11, 1.5),
+        VP10FwdTxfm2dParam(FLIPADST_ADST, TX_16X16, 11, 1.5),
+        VP10FwdTxfm2dParam(FLIPADST_DCT,  TX_32X32, 70, 7),
+        VP10FwdTxfm2dParam(DCT_FLIPADST,  TX_32X32, 70, 7),
+        VP10FwdTxfm2dParam(FLIPADST_FLIPADST, TX_32X32, 70, 7),
+        VP10FwdTxfm2dParam(ADST_FLIPADST, TX_32X32, 70, 7),
+        VP10FwdTxfm2dParam(FLIPADST_ADST, TX_32X32, 70, 7),
+#endif  // CONFIG_EXT_TX
+        VP10FwdTxfm2dParam(DCT_DCT,   TX_4X4, 2, 0.2),
+        VP10FwdTxfm2dParam(ADST_DCT,  TX_4X4, 2, 0.2),
+        VP10FwdTxfm2dParam(DCT_ADST,  TX_4X4, 2, 0.2),
+        VP10FwdTxfm2dParam(ADST_ADST, TX_4X4, 2, 0.2),
+        VP10FwdTxfm2dParam(DCT_DCT,   TX_8X8, 5, 0.6),
+        VP10FwdTxfm2dParam(ADST_DCT,  TX_8X8, 5, 0.6),
+        VP10FwdTxfm2dParam(DCT_ADST,  TX_8X8, 5, 0.6),
+        VP10FwdTxfm2dParam(ADST_ADST, TX_8X8, 5, 0.6),
+        VP10FwdTxfm2dParam(DCT_DCT,   TX_16X16, 11, 1.5),
+        VP10FwdTxfm2dParam(ADST_DCT,  TX_16X16, 11, 1.5),
+        VP10FwdTxfm2dParam(DCT_ADST,  TX_16X16, 11, 1.5),
+        VP10FwdTxfm2dParam(ADST_ADST, TX_16X16, 11, 1.5),
+        VP10FwdTxfm2dParam(DCT_DCT,   TX_32X32, 70, 7),
+        VP10FwdTxfm2dParam(ADST_DCT,  TX_32X32, 70, 7),
+        VP10FwdTxfm2dParam(DCT_ADST,  TX_32X32, 70, 7),
+        VP10FwdTxfm2dParam(ADST_ADST, TX_32X32, 70, 7)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 }  // namespace
diff --git a/test/vp10_highbd_iht_test.cc b/test/vp10_highbd_iht_test.cc
new file mode 100644
index 0000000..0b7597d
--- /dev/null
+++ b/test/vp10_highbd_iht_test.cc
@@ -0,0 +1,186 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp10_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+namespace {
+
+using std::tr1::tuple;
+using libvpx_test::ACMRandom;
+
+typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
+                          int tx_type, int bd);
+
+typedef void (*IHbdHtFunc)(const int32_t *coeff, uint16_t *output, int stride,
+                           int tx_type, int bd);
+
+// Test parameter argument list:
+//   <transform reference function,
+//    optimized inverse transform function,
+//    inverse transform reference function,
+//    num_coeffs,
+//    tx_type,
+//    bit_depth>
+typedef tuple<HbdHtFunc, IHbdHtFunc, IHbdHtFunc, int, int, int> IHbdHtParam;
+
+class VP10HighbdInvHTNxN : public ::testing::TestWithParam<IHbdHtParam> {
+ public:
+  virtual ~VP10HighbdInvHTNxN() {}
+
+  virtual void SetUp() {
+    txfm_ref_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    inv_txfm_ref_ = GET_PARAM(2);
+    num_coeffs_ = GET_PARAM(3);
+    tx_type_ = GET_PARAM(4);
+    bit_depth_ = GET_PARAM(5);
+
+    input_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, sizeof(input_[0]) * num_coeffs_));
+
+    // Note:
+    // The inverse transform input buffer is 32-byte aligned, matching the
+    // allocation done by alloc_mode_context() in
+    // <root>/vp10/encoder/context_tree.c.
+    coeffs_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(32, sizeof(coeffs_[0]) * num_coeffs_));
+    output_ = reinterpret_cast<uint16_t *>(
+        vpx_memalign(32, sizeof(output_[0]) * num_coeffs_));
+    output_ref_ = reinterpret_cast<uint16_t *>(
+        vpx_memalign(32, sizeof(output_ref_[0]) * num_coeffs_));
+  }
+
+  virtual void TearDown() {
+    vpx_free(input_);
+    vpx_free(coeffs_);
+    vpx_free(output_);
+    vpx_free(output_ref_);
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void RunBitexactCheck();
+
+ private:
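+  // The tests operate on square blocks, so the stride is simply the block
+  // width, i.e. sqrt(num_coeffs_); 0 flags an unsupported size.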
+  int GetStride() const {
+    if (16 == num_coeffs_) {
+      return 4;
+    } else if (64 == num_coeffs_) {
+      return 8;
+    } else if (256 == num_coeffs_) {
+      return 16;
+    } else {
+      return 0;
+    }
+  }
+
+  HbdHtFunc txfm_ref_;
+  IHbdHtFunc inv_txfm_;
+  IHbdHtFunc inv_txfm_ref_;
+  int num_coeffs_;
+  int tx_type_;
+  int bit_depth_;
+
+  int16_t *input_;
+  int32_t *coeffs_;
+  uint16_t *output_;
+  uint16_t *output_ref_;
+};
+
+void VP10HighbdInvHTNxN::RunBitexactCheck() {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int stride = GetStride();
+  const int num_tests = 20000;
+  const uint16_t mask = (1 << bit_depth_) - 1;
+
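+  // Each input sample models a prediction residual: the difference of two
+  // masked bd-bit values, which lies in [-(2^bd - 1), 2^bd - 1].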
+  for (int i = 0; i < num_tests; ++i) {
+    for (int j = 0; j < num_coeffs_; ++j) {
+      input_[j] = (rnd.Rand16() & mask) - (rnd.Rand16() & mask);
+      output_ref_[j] = rnd.Rand16() & mask;
+      output_[j] = output_ref_[j];
+    }
+
+    txfm_ref_(input_, coeffs_, stride, tx_type_, bit_depth_);
+    inv_txfm_ref_(coeffs_, output_ref_, stride, tx_type_, bit_depth_);
+    ASM_REGISTER_STATE_CHECK(inv_txfm_(coeffs_, output_, stride, tx_type_,
+                                       bit_depth_));
+
+    for (int j = 0; j < num_coeffs_; ++j) {
+      EXPECT_EQ(output_ref_[j], output_[j])
+          << "Not bit-exact result at index: " << j
+          << " At test block: " << i;
+    }
+  }
+}
+
+TEST_P(VP10HighbdInvHTNxN, InvTransResultCheck) {
+  RunBitexactCheck();
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+#define PARAM_LIST_4X4 &vp10_fwd_txfm2d_4x4_c, \
+             &vp10_inv_txfm2d_add_4x4_sse4_1,  \
+             &vp10_inv_txfm2d_add_4x4_c, 16
+
+#define PARAM_LIST_8X8 &vp10_fwd_txfm2d_8x8_c, \
+             &vp10_inv_txfm2d_add_8x8_sse4_1,  \
+             &vp10_inv_txfm2d_add_8x8_c, 64
+
+#define PARAM_LIST_16X16 &vp10_fwd_txfm2d_16x16_c, \
+             &vp10_inv_txfm2d_add_16x16_sse4_1,    \
+             &vp10_inv_txfm2d_add_16x16_c, 256
+
+const IHbdHtParam kArrayIhtParam[] = {
+  // 16x16
+  make_tuple(PARAM_LIST_16X16, 0, 10),
+  make_tuple(PARAM_LIST_16X16, 0, 12),
+  make_tuple(PARAM_LIST_16X16, 1, 10),
+  make_tuple(PARAM_LIST_16X16, 1, 12),
+  make_tuple(PARAM_LIST_16X16, 2, 10),
+  make_tuple(PARAM_LIST_16X16, 2, 12),
+  make_tuple(PARAM_LIST_16X16, 3, 10),
+  make_tuple(PARAM_LIST_16X16, 3, 12),
+  // 8x8
+  make_tuple(PARAM_LIST_8X8, 0, 10),
+  make_tuple(PARAM_LIST_8X8, 0, 12),
+  make_tuple(PARAM_LIST_8X8, 1, 10),
+  make_tuple(PARAM_LIST_8X8, 1, 12),
+  make_tuple(PARAM_LIST_8X8, 2, 10),
+  make_tuple(PARAM_LIST_8X8, 2, 12),
+  make_tuple(PARAM_LIST_8X8, 3, 10),
+  make_tuple(PARAM_LIST_8X8, 3, 12),
+  // 4x4
+  make_tuple(PARAM_LIST_4X4, 0, 10),
+  make_tuple(PARAM_LIST_4X4, 0, 12),
+  make_tuple(PARAM_LIST_4X4, 1, 10),
+  make_tuple(PARAM_LIST_4X4, 1, 12),
+  make_tuple(PARAM_LIST_4X4, 2, 10),
+  make_tuple(PARAM_LIST_4X4, 2, 12),
+  make_tuple(PARAM_LIST_4X4, 3, 10),
+  make_tuple(PARAM_LIST_4X4, 3, 12),
+};
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VP10HighbdInvHTNxN,
+    ::testing::ValuesIn(kArrayIhtParam));
+#endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+
+}  // namespace
diff --git a/test/vp10_inv_txfm2d_test.cc b/test/vp10_inv_txfm2d_test.cc
index 80ac78b..fef4629 100644
--- a/test/vp10_inv_txfm2d_test.cc
+++ b/test/vp10_inv_txfm2d_test.cc
@@ -14,8 +14,8 @@
 
 #include "./vp10_rtcd.h"
 #include "test/acm_random.h"
+#include "test/util.h"
 #include "test/vp10_txfm_test.h"
-#include "vp10/common/vp10_fwd_txfm2d_cfg.h"
 #include "vp10/common/vp10_inv_txfm2d_cfg.h"
 
 using libvpx_test::ACMRandom;
@@ -28,86 +28,131 @@
 namespace {
 
 #if CONFIG_VP9_HIGHBITDEPTH
-const int txfm_size_num = 5;
-const int txfm_size_ls[5] = {4, 8, 16, 32, 64};
-const int txfm_type[4] = {DCT_DCT, DCT_ADST, ADST_ADST, ADST_DCT};
-const TXFM_2D_CFG* inv_txfm_cfg_ls[5][4] = {
-    {&inv_txfm_2d_cfg_dct_dct_4, &inv_txfm_2d_cfg_dct_adst_4,
-     &inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_dct_4},
-    {&inv_txfm_2d_cfg_dct_dct_8, &inv_txfm_2d_cfg_dct_adst_8,
-     &inv_txfm_2d_cfg_adst_adst_8, &inv_txfm_2d_cfg_adst_dct_8},
-    {&inv_txfm_2d_cfg_dct_dct_16, &inv_txfm_2d_cfg_dct_adst_16,
-     &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_dct_16},
-    {&inv_txfm_2d_cfg_dct_dct_32, &inv_txfm_2d_cfg_dct_adst_32,
-     &inv_txfm_2d_cfg_adst_adst_32, &inv_txfm_2d_cfg_adst_dct_32},
-    {&inv_txfm_2d_cfg_dct_dct_64, NULL, NULL, NULL}};
+// VP10InvTxfm2dParam argument list:
+// tx_type_, tx_size_, max_error_, max_avg_error_
+typedef std::tr1::tuple<TX_TYPE, TX_SIZE, double, double> VP10InvTxfm2dParam;
 
-const Fwd_Txfm2d_Func fwd_txfm_func_ls[5] = {
-    vp10_fwd_txfm2d_4x4_c, vp10_fwd_txfm2d_8x8_c, vp10_fwd_txfm2d_16x16_c,
-    vp10_fwd_txfm2d_32x32_c, vp10_fwd_txfm2d_64x64_c};
-const Inv_Txfm2d_Func inv_txfm_func_ls[5] = {
-    vp10_inv_txfm2d_add_4x4_c, vp10_inv_txfm2d_add_8x8_c,
-    vp10_inv_txfm2d_add_16x16_c, vp10_inv_txfm2d_add_32x32_c,
-    vp10_inv_txfm2d_add_64x64_c};
+class VP10InvTxfm2d : public ::testing::TestWithParam<VP10InvTxfm2dParam> {
+ public:
+  virtual void SetUp() {
+    tx_type_ = GET_PARAM(0);
+    tx_size_ = GET_PARAM(1);
+    max_error_ = GET_PARAM(2);
+    max_avg_error_ = GET_PARAM(3);
+    txfm1d_size_ = libvpx_test::get_txfm1d_size(tx_size_);
+    txfm2d_size_ = txfm1d_size_ * txfm1d_size_;
+    count_ = 500;
 
-const int txfm_type_num = 4;
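+    // vpx_memalign/vpx_free are used (rather than new/delete) so the buffers
+    // carry the 16-byte alignment that vectorized transform implementations
+    // may assume.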
+    input_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, sizeof(int16_t) * txfm2d_size_));
+    ref_input_ = reinterpret_cast<uint16_t *>(
+        vpx_memalign(16, sizeof(uint16_t) * txfm2d_size_));
+    output_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(16, sizeof(int32_t) * txfm2d_size_));
+  }
 
-TEST(vp10_inv_txfm2d, round_trip) {
-  for (int txfm_size_idx = 0; txfm_size_idx < txfm_size_num; ++txfm_size_idx) {
-    const int txfm_size = txfm_size_ls[txfm_size_idx];
-    const int sqr_txfm_size = txfm_size * txfm_size;
-    int16_t* input = new int16_t[sqr_txfm_size];
-    uint16_t* ref_input = new uint16_t[sqr_txfm_size];
-    int32_t* output = new int32_t[sqr_txfm_size];
-
-    for (int txfm_type_idx = 0; txfm_type_idx < txfm_type_num;
-         ++txfm_type_idx) {
-      const TXFM_2D_CFG* inv_txfm_cfg =
-          inv_txfm_cfg_ls[txfm_size_idx][txfm_type_idx];
-      if (inv_txfm_cfg != NULL) {
-        int tx_type = txfm_type[txfm_type_idx];
-        const Fwd_Txfm2d_Func fwd_txfm_func = fwd_txfm_func_ls[txfm_size_idx];
-        const Inv_Txfm2d_Func inv_txfm_func = inv_txfm_func_ls[txfm_size_idx];
-        const int count = 1000;
-        double avg_abs_error = 0;
-        ACMRandom rnd(ACMRandom::DeterministicSeed());
-        for (int ci = 0; ci < count; ci++) {
-          for (int ni = 0; ni < sqr_txfm_size; ++ni) {
-            if (ci == 0) {
-              int extreme_input = input_base - 1;
-              input[ni] = extreme_input;  // extreme case
-              ref_input[ni] = 0;
-            } else {
-              input[ni] = rnd.Rand16() % input_base;
-              ref_input[ni] = 0;
-            }
-          }
-
-          fwd_txfm_func(input, output, txfm_size, tx_type, bd);
-          inv_txfm_func(output, ref_input, txfm_size, tx_type, bd);
-
-          for (int ni = 0; ni < sqr_txfm_size; ++ni) {
-            EXPECT_LE(abs(input[ni] - ref_input[ni]), 4);
-          }
-          avg_abs_error += compute_avg_abs_error<int16_t, uint16_t>(
-              input, ref_input, sqr_txfm_size);
+  void RunRoundtripCheck() {
+    const Fwd_Txfm2d_Func fwd_txfm_func =
+        libvpx_test::fwd_txfm_func_ls[tx_size_];
+    const Inv_Txfm2d_Func inv_txfm_func =
+        libvpx_test::inv_txfm_func_ls[tx_size_];
+    double avg_abs_error = 0;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
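+    // The first iteration feeds the largest representable input,
+    // input_base - 1 == 2^bd - 1, to exercise the full dynamic range;
+    // subsequent iterations use uniform random input.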
+    for (int ci = 0; ci < count_; ci++) {
+      for (int ni = 0; ni < txfm2d_size_; ++ni) {
+        if (ci == 0) {
+          int extreme_input = input_base - 1;
+          input_[ni] = extreme_input;  // extreme case
+          ref_input_[ni] = 0;
+        } else {
+          input_[ni] = rnd.Rand16() % input_base;
+          ref_input_[ni] = 0;
         }
-
-        avg_abs_error /= count;
-        // max_abs_avg_error comes from upper bound of
-        // printf("txfm_size: %d accuracy_avg_abs_error: %f\n",
-        // txfm_size, avg_abs_error);
-        // TODO(angiebird): this upper bound is from adst_adst_8
-        const double max_abs_avg_error = 0.4;
-        EXPECT_LE(avg_abs_error, max_abs_avg_error);
       }
+
+      fwd_txfm_func(input_, output_, txfm1d_size_, tx_type_, bd);
+      inv_txfm_func(output_, ref_input_, txfm1d_size_, tx_type_, bd);
+
+      for (int ni = 0; ni < txfm2d_size_; ++ni) {
+        EXPECT_GE(max_error_, abs(input_[ni] - ref_input_[ni]));
+      }
+      avg_abs_error += compute_avg_abs_error<int16_t, uint16_t>(
+          input_, ref_input_, txfm2d_size_);
     }
 
-    delete[] input;
-    delete[] ref_input;
-    delete[] output;
+    avg_abs_error /= count_;
+    // max_avg_error_ is an empirical upper bound on avg_abs_error, observed
+    // by logging:
+    // printf("txfm1d_size: %d accuracy_avg_abs_error: %f\n",
+    //        txfm1d_size_, avg_abs_error);
+    EXPECT_GE(max_avg_error_, avg_abs_error);
   }
-}
+
+  virtual void TearDown() {
+    vpx_free(input_);
+    vpx_free(output_);
+    vpx_free(ref_input_);
+  }
+
+ private:
+  int count_;
+  double max_error_;
+  double max_avg_error_;
+  TX_TYPE tx_type_;
+  TX_SIZE tx_size_;
+  int txfm1d_size_;
+  int txfm2d_size_;
+  int16_t* input_;
+  uint16_t* ref_input_;
+  int32_t* output_;
+};
+
+TEST_P(VP10InvTxfm2d, RunRoundtripCheck) { RunRoundtripCheck(); }
+
+INSTANTIATE_TEST_CASE_P(
+    C, VP10InvTxfm2d,
+    ::testing::Values(
+#if CONFIG_EXT_TX
+        VP10InvTxfm2dParam(FLIPADST_DCT, TX_4X4, 2, 0.002),
+        VP10InvTxfm2dParam(DCT_FLIPADST, TX_4X4, 2, 0.002),
+        VP10InvTxfm2dParam(FLIPADST_FLIPADST, TX_4X4, 2, 0.002),
+        VP10InvTxfm2dParam(ADST_FLIPADST, TX_4X4, 2, 0.002),
+        VP10InvTxfm2dParam(FLIPADST_ADST, TX_4X4, 2, 0.002),
+        VP10InvTxfm2dParam(FLIPADST_DCT, TX_8X8, 2, 0.02),
+        VP10InvTxfm2dParam(DCT_FLIPADST, TX_8X8, 2, 0.02),
+        VP10InvTxfm2dParam(FLIPADST_FLIPADST, TX_8X8, 2, 0.02),
+        VP10InvTxfm2dParam(ADST_FLIPADST, TX_8X8, 2, 0.02),
+        VP10InvTxfm2dParam(FLIPADST_ADST, TX_8X8, 2, 0.02),
+        VP10InvTxfm2dParam(FLIPADST_DCT, TX_16X16, 2, 0.04),
+        VP10InvTxfm2dParam(DCT_FLIPADST, TX_16X16, 2, 0.04),
+        VP10InvTxfm2dParam(FLIPADST_FLIPADST, TX_16X16, 11, 0.04),
+        VP10InvTxfm2dParam(ADST_FLIPADST, TX_16X16, 2, 0.04),
+        VP10InvTxfm2dParam(FLIPADST_ADST, TX_16X16, 2, 0.04),
+        VP10InvTxfm2dParam(FLIPADST_DCT, TX_32X32, 4, 0.4),
+        VP10InvTxfm2dParam(DCT_FLIPADST, TX_32X32, 4, 0.4),
+        VP10InvTxfm2dParam(FLIPADST_FLIPADST, TX_32X32, 4, 0.4),
+        VP10InvTxfm2dParam(ADST_FLIPADST, TX_32X32, 4, 0.4),
+        VP10InvTxfm2dParam(FLIPADST_ADST, TX_32X32, 4, 0.4),
+#endif
+        VP10InvTxfm2dParam(DCT_DCT, TX_4X4, 2, 0.002),
+        VP10InvTxfm2dParam(ADST_DCT, TX_4X4, 2, 0.002),
+        VP10InvTxfm2dParam(DCT_ADST, TX_4X4, 2, 0.002),
+        VP10InvTxfm2dParam(ADST_ADST, TX_4X4, 2, 0.002),
+        VP10InvTxfm2dParam(DCT_DCT, TX_8X8, 2, 0.02),
+        VP10InvTxfm2dParam(ADST_DCT, TX_8X8, 2, 0.02),
+        VP10InvTxfm2dParam(DCT_ADST, TX_8X8, 2, 0.02),
+        VP10InvTxfm2dParam(ADST_ADST, TX_8X8, 2, 0.02),
+        VP10InvTxfm2dParam(DCT_DCT, TX_16X16, 2, 0.04),
+        VP10InvTxfm2dParam(ADST_DCT, TX_16X16, 2, 0.04),
+        VP10InvTxfm2dParam(DCT_ADST, TX_16X16, 2, 0.04),
+        VP10InvTxfm2dParam(ADST_ADST, TX_16X16, 2, 0.04),
+        VP10InvTxfm2dParam(DCT_DCT, TX_32X32, 4, 0.4),
+        VP10InvTxfm2dParam(ADST_DCT, TX_32X32, 4, 0.4),
+        VP10InvTxfm2dParam(DCT_ADST, TX_32X32, 4, 0.4),
+        VP10InvTxfm2dParam(ADST_ADST, TX_32X32, 4, 0.4)));
+
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 }  // namespace
diff --git a/test/vp10_txfm_test.cc b/test/vp10_txfm_test.cc
new file mode 100644
index 0000000..6b36126
--- /dev/null
+++ b/test/vp10_txfm_test.cc
@@ -0,0 +1,166 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "test/vp10_txfm_test.h"
+
+namespace libvpx_test {
+
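+// TX_SIZE enumerates square transform sizes in ascending order starting at
+// TX_4X4 == 0, so 1 << (tx_size + 2) yields 4, 8, 16, 32.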
+int get_txfm1d_size(TX_SIZE tx_size) {
+  return 1 << (tx_size + 2);
+}
+
+void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM* type0,
+                     TYPE_TXFM* type1) {
+  switch (txfm2d_type) {
+    case DCT_DCT:
+      *type0 = TYPE_DCT;
+      *type1 = TYPE_DCT;
+      break;
+    case ADST_DCT:
+      *type0 = TYPE_ADST;
+      *type1 = TYPE_DCT;
+      break;
+    case DCT_ADST:
+      *type0 = TYPE_DCT;
+      *type1 = TYPE_ADST;
+      break;
+    case ADST_ADST:
+      *type0 = TYPE_ADST;
+      *type1 = TYPE_ADST;
+      break;
+#if CONFIG_EXT_TX
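+    // The FLIPADST variants reuse the plain ADST kernel here; the flip
+    // itself is applied to the data separately (see fliplr()/flipud() below).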
+    case FLIPADST_DCT:
+      *type0 = TYPE_ADST;
+      *type1 = TYPE_DCT;
+      break;
+    case DCT_FLIPADST:
+      *type0 = TYPE_DCT;
+      *type1 = TYPE_ADST;
+      break;
+    case FLIPADST_FLIPADST:
+      *type0 = TYPE_ADST;
+      *type1 = TYPE_ADST;
+      break;
+    case ADST_FLIPADST:
+      *type0 = TYPE_ADST;
+      *type1 = TYPE_ADST;
+      break;
+    case FLIPADST_ADST:
+      *type0 = TYPE_ADST;
+      *type1 = TYPE_ADST;
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      *type0 = TYPE_DCT;
+      *type1 = TYPE_DCT;
+      assert(0);
+      break;
+  }
+}
+
+double invSqrt2 = 1 / pow(2, 0.5);
+
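+// Unnormalized DCT-II reference:
+//   out[k] = sum_n in[n] * cos(pi * (2n + 1) * k / (2 * size)),
+// with the DC term (k == 0) scaled by 1/sqrt(2).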
+void reference_dct_1d(const double* in, double* out, int size) {
+  for (int k = 0; k < size; ++k) {
+    out[k] = 0;
+    for (int n = 0; n < size; ++n) {
+      out[k] += in[n] * cos(M_PI * (2 * n + 1) * k / (2 * size));
+    }
+    if (k == 0) out[k] = out[k] * invSqrt2;
+  }
+}
+
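+// ADST reference:
+//   out[k] = sum_n in[n] * sin(pi * (2n + 1) * (2k + 1) / (4 * size)).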
+void reference_adst_1d(const double* in, double* out, int size) {
+  for (int k = 0; k < size; ++k) {
+    out[k] = 0;
+    for (int n = 0; n < size; ++n) {
+      out[k] += in[n] * sin(M_PI * (2 * n + 1) * (2 * k + 1) / (4 * size));
+    }
+  }
+}
+
+void reference_hybrid_1d(double* in, double* out, int size, int type) {
+  if (type == TYPE_DCT)
+    reference_dct_1d(in, out, size);
+  else
+    reference_adst_1d(in, out, size);
+}
+
+void reference_hybrid_2d(double* in, double* out, int size,
+                         int type0, int type1) {
+  double* tempOut = new double[size * size];
+
+  for (int r = 0; r < size; r++) {
+    // transpose: in -> tempOut
+    for (int c = 0; c < size; c++) {
+      tempOut[r * size + c] = in[c * size + r];
+    }
+  }
+
+  // type0 transform on each row of tempOut (i.e. each column of in): -> out
+  for (int r = 0; r < size; r++) {
+    reference_hybrid_1d(tempOut + r * size, out + r * size, size, type0);
+  }
+
+  for (int r = 0; r < size; r++) {
+    // transpose: out -> tempOut
+    for (int c = 0; c < size; c++) {
+      tempOut[r * size + c] = out[c * size + r];
+    }
+  }
+
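+  // type1 transform on each row: tempOut -> out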
+  for (int r = 0; r < size; r++) {
+    reference_hybrid_1d(tempOut + r * size, out + r * size, size, type1);
+  }
+  delete[] tempOut;
+}
+
+template<typename Type>
+void fliplr(Type *dest, int stride, int length) {
+  int i, j;
+  for (i = 0; i < length; ++i) {
+    for (j = 0; j < length / 2; ++j) {
+      const Type tmp = dest[i * stride + j];
+      dest[i * stride + j] = dest[i * stride + length - 1 - j];
+      dest[i * stride + length - 1 - j] = tmp;
+    }
+  }
+}
+
+template<typename Type>
+void flipud(Type *dest, int stride, int length) {
+  int i, j;
+  for (j = 0; j < length; ++j) {
+    for (i = 0; i < length / 2; ++i) {
+      const Type tmp = dest[i * stride + j];
+      dest[i * stride + j] = dest[(length - 1 - i) * stride + j];
+      dest[(length - 1 - i) * stride + j] = tmp;
+    }
+  }
+}
+
+template<typename Type>
+void fliplrud(Type *dest, int stride, int length) {
+  int i, j;
+  for (i = 0; i < length / 2; ++i) {
+    for (j = 0; j < length; ++j) {
+      const Type tmp = dest[i * stride + j];
+      dest[i * stride + j] = dest[(length - 1 - i) * stride + length - 1 - j];
+      dest[(length - 1 - i) * stride + length - 1 - j] = tmp;
+    }
+  }
+}
+
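+// Only the double versions are used by the transform tests, so instantiate
+// them explicitly here and keep just the declarations in the header.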
+template void fliplr<double>(double *dest, int stride, int length);
+template void flipud<double>(double *dest, int stride, int length);
+template void fliplrud<double>(double *dest, int stride, int length);
+
+}  // namespace libvpx_test
diff --git a/test/vp10_txfm_test.h b/test/vp10_txfm_test.h
index c4d03ce..fb9e12e 100644
--- a/test/vp10_txfm_test.h
+++ b/test/vp10_txfm_test.h
@@ -23,6 +23,7 @@
 #include "test/acm_random.h"
 #include "vp10/common/enums.h"
 #include "vp10/common/vp10_txfm.h"
+#include "./vp10_rtcd.h"
 
 namespace libvpx_test {
 typedef enum {
@@ -33,63 +34,19 @@
   TYPE_LAST
 } TYPE_TXFM;
 
-static double invSqrt2 = 1 / pow(2, 0.5);
+int get_txfm1d_size(TX_SIZE tx_size);
 
-static void reference_dct_1d(const double* in, double* out, int size) {
-  for (int k = 0; k < size; ++k) {
-    out[k] = 0;
-    for (int n = 0; n < size; ++n) {
-      out[k] += in[n] * cos(M_PI * (2 * n + 1) * k / (2 * size));
-    }
-    if (k == 0) out[k] = out[k] * invSqrt2;
-  }
-}
+void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM* type0,
+                     TYPE_TXFM* type1);
 
-static void reference_adst_1d(const double* in, double* out, int size) {
-  for (int k = 0; k < size; ++k) {
-    out[k] = 0;
-    for (int n = 0; n < size; ++n) {
-      out[k] += in[n] * sin(M_PI * (2 * n + 1) * (2 * k + 1) / (4 * size));
-    }
-  }
-}
+void reference_dct_1d(const double* in, double* out, int size);
 
-static void reference_hybrid_1d(double* in, double* out, int size, int type) {
-  if (type == TYPE_DCT)
-    reference_dct_1d(in, out, size);
-  else
-    reference_adst_1d(in, out, size);
-}
+void reference_adst_1d(const double* in, double* out, int size);
 
-static INLINE void reference_hybrid_2d(double* in, double* out, int size,
-                                       int type0, int type1) {
-  double* tempOut = new double[size * size];
+void reference_hybrid_1d(double* in, double* out, int size, int type);
 
-  for (int r = 0; r < size; r++) {
-    // out ->tempOut
-    for (int c = 0; c < size; c++) {
-      tempOut[r * size + c] = in[c * size + r];
-    }
-  }
-
-  // dct each row: in -> out
-  for (int r = 0; r < size; r++) {
-    reference_hybrid_1d(tempOut + r * size, out + r * size, size, type0);
-  }
-
-  for (int r = 0; r < size; r++) {
-    // out ->tempOut
-    for (int c = 0; c < size; c++) {
-      tempOut[r * size + c] = out[c * size + r];
-    }
-  }
-
-  for (int r = 0; r < size; r++) {
-    reference_hybrid_1d(tempOut + r * size, out + r * size, size, type1);
-  }
-  delete[] tempOut;
-}
-
+void reference_hybrid_2d(double* in, double* out, int size,
+                         int type0, int type1);
+
 template <typename Type1, typename Type2>
 static double compute_avg_abs_error(const Type1* a, const Type2* b,
                                     const int size) {
@@ -101,6 +58,15 @@
   return error;
 }
 
+template<typename Type>
+void fliplr(Type *dest, int stride, int length);
+
+template<typename Type>
+void flipud(Type *dest, int stride, int length);
+
+template<typename Type>
+void fliplrud(Type *dest, int stride, int length);
+
 typedef void (*TxfmFunc)(const int32_t* in, int32_t* out, const int8_t* cos_bit,
                          const int8_t* range_bit);
 
@@ -110,22 +76,15 @@
 static const int bd = 10;
 static const int input_base = (1 << bd);
 
-static INLINE int get_tx_type(const TXFM_2D_CFG *cfg) {
-  int tx_type;
-  if (cfg->txfm_type_col <= TXFM_TYPE_DCT64) {
-    if (cfg->txfm_type_row <= TXFM_TYPE_DCT64) {
-      tx_type = DCT_DCT;
-    } else {
-      tx_type = DCT_ADST;
-    }
-  } else {
-    if (cfg->txfm_type_row <= TXFM_TYPE_DCT64) {
-      tx_type = ADST_DCT;
-    } else {
-      tx_type = ADST_ADST;
-    }
-  }
-  return tx_type;
-}
+#if CONFIG_VP9_HIGHBITDEPTH
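+// C reference implementations of the 2-D transforms, indexed by TX_SIZE
+// (TX_4X4 through TX_32X32).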
+static const Fwd_Txfm2d_Func fwd_txfm_func_ls[TX_SIZES] = {
+    vp10_fwd_txfm2d_4x4_c, vp10_fwd_txfm2d_8x8_c, vp10_fwd_txfm2d_16x16_c,
+    vp10_fwd_txfm2d_32x32_c};
+
+static const Inv_Txfm2d_Func inv_txfm_func_ls[TX_SIZES] = {
+    vp10_inv_txfm2d_add_4x4_c, vp10_inv_txfm2d_add_8x8_c,
+    vp10_inv_txfm2d_add_16x16_c, vp10_inv_txfm2d_add_32x32_c};
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 }  // namespace libvpx_test
 #endif  // VP10_TXFM_TEST_H_
diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc
index d6b6951..35a6619 100644
--- a/test/vp9_ethread_test.cc
+++ b/test/vp9_ethread_test.cc
@@ -33,11 +33,13 @@
     vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
     cfg.w = 1280;
     cfg.h = 720;
-#if CONFIG_EXT_TILE
-    cfg.tile_col = -1;
-    cfg.tile_row = -1;
-#endif  // CONFIG_EXT_TILE
     decoder_ = codec_->CreateDecoder(cfg, 0);
+#if CONFIG_VP10 && CONFIG_EXT_TILE
+    if (decoder_->IsVP10()) {
+      decoder_->Control(VP10_SET_DECODE_TILE_ROW, -1);
+      decoder_->Control(VP10_SET_DECODE_TILE_COL, -1);
+    }
+#endif
 
     size_enc_.clear();
     md5_dec_.clear();
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index 21147af..87e5d1c 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -53,43 +53,6 @@
 }
 
 #if CONFIG_EXT_INTER
-#define WEDGE_BITS_SML    2
-#define WEDGE_BITS_MED    3
-#define WEDGE_BITS_BIG    4
-#define WEDGE_NONE       -1
-#define WEDGE_WEIGHT_BITS 6
-
-static const int get_wedge_bits_lookup[BLOCK_SIZES] = {
-  0,
-  0,
-  0,
-  WEDGE_BITS_SML,
-  WEDGE_BITS_MED,
-  WEDGE_BITS_MED,
-  WEDGE_BITS_MED,
-  WEDGE_BITS_MED,
-  WEDGE_BITS_MED,
-  WEDGE_BITS_MED,
-  WEDGE_BITS_BIG,
-  WEDGE_BITS_BIG,
-  WEDGE_BITS_BIG,
-#if CONFIG_EXT_PARTITION
-  WEDGE_BITS_BIG,
-  WEDGE_BITS_BIG,
-  WEDGE_BITS_BIG,
-#endif  // CONFIG_EXT_PARTITION
-};
-
-static INLINE int is_interinter_wedge_used(BLOCK_SIZE sb_type) {
-  (void) sb_type;
-  return get_wedge_bits_lookup[sb_type] > 0;
-}
-
-static INLINE int is_interintra_wedge_used(BLOCK_SIZE sb_type) {
-  (void) sb_type;
-  return get_wedge_bits_lookup[sb_type] > 0;
-}
-
 static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) {
   return mode >= NEARESTMV && mode <= NEWFROMNEARMV;
 }
@@ -133,7 +96,9 @@
 #define NONE           -1
 #define INTRA_FRAME     0
 #define LAST_FRAME      1
+
 #if CONFIG_EXT_REFS
+
 #define LAST2_FRAME     2
 #define LAST3_FRAME     3
 #define LAST4_FRAME     4
@@ -141,10 +106,24 @@
 #define ALTREF_FRAME    6
 #define MAX_REF_FRAMES  7
 #define LAST_REF_FRAMES (LAST4_FRAME - LAST_FRAME + 1)
-#else
+
+#else  // CONFIG_EXT_REFS
+
+#if CONFIG_BIDIR_PRED
+
+#define GOLDEN_FRAME    2
+#define BWDREF_FRAME    3
+#define ALTREF_FRAME    4
+#define MAX_REF_FRAMES  5
+
+#else  // CONFIG_BIDIR_PRED
+
 #define GOLDEN_FRAME    2
 #define ALTREF_FRAME    3
 #define MAX_REF_FRAMES  4
+
+#endif  // CONFIG_BIDIR_PRED
+
 #endif  // CONFIG_EXT_REFS
 
 typedef int8_t MV_REFERENCE_FRAME;
diff --git a/vp10/common/divide.c b/vp10/common/divide.c
index 00b43a0..3f144d7 100644
--- a/vp10/common/divide.c
+++ b/vp10/common/divide.c
@@ -26,68 +26,68 @@
 }
 */
 const struct fastdiv_elem vp10_fastdiv_tab[256] = {
-    {0, 0},          {0, 0},          {0, 1},          {1431655766, 2},
-    {0, 2},          {2576980378, 3}, {1431655766, 3}, {613566757, 3},
-    {0, 3},          {3340530120, 4}, {2576980378, 4}, {1952257862, 4},
-    {1431655766, 4}, {991146300, 4},  {613566757, 4},  {286331154, 4},
-    {0, 4},          {3789677026, 5}, {3340530120, 5}, {2938661835, 5},
-    {2576980378, 5}, {2249744775, 5}, {1952257862, 5}, {1680639377, 5},
-    {1431655766, 5}, {1202590843, 5}, {991146300, 5},  {795364315, 5},
-    {613566757, 5},  {444306962, 5},  {286331154, 5},  {138547333, 5},
-    {0, 5},          {4034666248, 6}, {3789677026, 6}, {3558687189, 6},
-    {3340530120, 6}, {3134165325, 6}, {2938661835, 6}, {2753184165, 6},
-    {2576980378, 6}, {2409371898, 6}, {2249744775, 6}, {2097542168, 6},
-    {1952257862, 6}, {1813430637, 6}, {1680639377, 6}, {1553498810, 6},
-    {1431655766, 6}, {1314785907, 6}, {1202590843, 6}, {1094795586, 6},
-    {991146300, 6},  {891408307, 6},  {795364315, 6},  {702812831, 6},
-    {613566757, 6},  {527452125, 6},  {444306962, 6},  {363980280, 6},
-    {286331154, 6},  {211227900, 6},  {138547333, 6},  {68174085, 6},
-    {0, 6},          {4162814457, 7}, {4034666248, 7}, {3910343360, 7},
-    {3789677026, 7}, {3672508268, 7}, {3558687189, 7}, {3448072337, 7},
-    {3340530120, 7}, {3235934265, 7}, {3134165325, 7}, {3035110223, 7},
-    {2938661835, 7}, {2844718599, 7}, {2753184165, 7}, {2663967058, 7},
-    {2576980378, 7}, {2492141518, 7}, {2409371898, 7}, {2328596727, 7},
-    {2249744775, 7}, {2172748162, 7}, {2097542168, 7}, {2024065048, 7},
-    {1952257862, 7}, {1882064321, 7}, {1813430637, 7}, {1746305385, 7},
-    {1680639377, 7}, {1616385542, 7}, {1553498810, 7}, {1491936009, 7},
-    {1431655766, 7}, {1372618415, 7}, {1314785907, 7}, {1258121734, 7},
-    {1202590843, 7}, {1148159575, 7}, {1094795586, 7}, {1042467791, 7},
-    {991146300, 7},  {940802361, 7},  {891408307, 7},  {842937507, 7},
-    {795364315, 7},  {748664025, 7},  {702812831, 7},  {657787785, 7},
-    {613566757, 7},  {570128403, 7},  {527452125, 7},  {485518043, 7},
-    {444306962, 7},  {403800345, 7},  {363980280, 7},  {324829460, 7},
-    {286331154, 7},  {248469183, 7},  {211227900, 7},  {174592167, 7},
-    {138547333, 7},  {103079216, 7},  {68174085, 7},   {33818641, 7},
-    {0, 7},          {4228378656, 8}, {4162814457, 8}, {4098251237, 8},
-    {4034666248, 8}, {3972037425, 8}, {3910343360, 8}, {3849563281, 8},
-    {3789677026, 8}, {3730665024, 8}, {3672508268, 8}, {3615188300, 8},
-    {3558687189, 8}, {3502987511, 8}, {3448072337, 8}, {3393925206, 8},
-    {3340530120, 8}, {3287871517, 8}, {3235934265, 8}, {3184703642, 8},
-    {3134165325, 8}, {3084305374, 8}, {3035110223, 8}, {2986566663, 8},
-    {2938661835, 8}, {2891383213, 8}, {2844718599, 8}, {2798656110, 8},
-    {2753184165, 8}, {2708291480, 8}, {2663967058, 8}, {2620200175, 8},
-    {2576980378, 8}, {2534297473, 8}, {2492141518, 8}, {2450502814, 8},
-    {2409371898, 8}, {2368739540, 8}, {2328596727, 8}, {2288934667, 8},
-    {2249744775, 8}, {2211018668, 8}, {2172748162, 8}, {2134925265, 8},
-    {2097542168, 8}, {2060591247, 8}, {2024065048, 8}, {1987956292, 8},
-    {1952257862, 8}, {1916962805, 8}, {1882064321, 8}, {1847555765, 8},
-    {1813430637, 8}, {1779682582, 8}, {1746305385, 8}, {1713292966, 8},
-    {1680639377, 8}, {1648338801, 8}, {1616385542, 8}, {1584774030, 8},
-    {1553498810, 8}, {1522554545, 8}, {1491936009, 8}, {1461638086, 8},
-    {1431655766, 8}, {1401984144, 8}, {1372618415, 8}, {1343553873, 8},
-    {1314785907, 8}, {1286310003, 8}, {1258121734, 8}, {1230216764, 8},
-    {1202590843, 8}, {1175239808, 8}, {1148159575, 8}, {1121346142, 8},
-    {1094795586, 8}, {1068504060, 8}, {1042467791, 8}, {1016683080, 8},
-    {991146300, 8},  {965853890, 8},  {940802361, 8},  {915988286, 8},
-    {891408307, 8},  {867059126, 8},  {842937507, 8},  {819040276, 8},
-    {795364315, 8},  {771906565, 8},  {748664025, 8},  {725633745, 8},
-    {702812831, 8},  {680198441, 8},  {657787785, 8},  {635578121, 8},
-    {613566757, 8},  {591751050, 8},  {570128403, 8},  {548696263, 8},
-    {527452125, 8},  {506393524, 8},  {485518043, 8},  {464823301, 8},
-    {444306962, 8},  {423966729, 8},  {403800345, 8},  {383805589, 8},
-    {363980280, 8},  {344322273, 8},  {324829460, 8},  {305499766, 8},
-    {286331154, 8},  {267321616, 8},  {248469183, 8},  {229771913, 8},
-    {211227900, 8},  {192835267, 8},  {174592167, 8},  {156496785, 8},
-    {138547333, 8},  {120742053, 8},  {103079216, 8},  {85557118, 8},
-    {68174085, 8},   {50928466, 8},   {33818641, 8},   {16843010, 8},
+    {0, 0},           {0, 0},           {0, 1},           {1431655766, 2},
+    {0, 2},           {2576980378u, 3}, {1431655766, 3},  {613566757, 3},
+    {0, 3},           {3340530120u, 4}, {2576980378u, 4}, {1952257862, 4},
+    {1431655766, 4},  {991146300, 4},   {613566757, 4},   {286331154u, 4},
+    {0, 4},           {3789677026u, 5}, {3340530120u, 5}, {2938661835u, 5},
+    {2576980378u, 5}, {2249744775u, 5}, {1952257862, 5},  {1680639377, 5},
+    {1431655766, 5},  {1202590843, 5},  {991146300, 5},   {795364315, 5},
+    {613566757, 5},   {444306962, 5},   {286331154, 5},   {138547333, 5},
+    {0, 5},           {4034666248u, 6}, {3789677026u, 6}, {3558687189u, 6},
+    {3340530120u, 6}, {3134165325u, 6}, {2938661835u, 6}, {2753184165u, 6},
+    {2576980378u, 6}, {2409371898u, 6}, {2249744775u, 6}, {2097542168u, 6},
+    {1952257862, 6},  {1813430637, 6},  {1680639377, 6},  {1553498810, 6},
+    {1431655766, 6},  {1314785907, 6},  {1202590843, 6},  {1094795586, 6},
+    {991146300, 6},   {891408307, 6},   {795364315, 6},   {702812831, 6},
+    {613566757, 6},   {527452125, 6},   {444306962, 6},   {363980280, 6},
+    {286331154, 6},   {211227900, 6},   {138547333, 6},   {68174085, 6},
+    {0, 6},           {4162814457u, 7}, {4034666248u, 7}, {3910343360u, 7},
+    {3789677026u, 7}, {3672508268u, 7}, {3558687189u, 7}, {3448072337u, 7},
+    {3340530120u, 7}, {3235934265u, 7}, {3134165325u, 7}, {3035110223u, 7},
+    {2938661835u, 7}, {2844718599u, 7}, {2753184165u, 7}, {2663967058u, 7},
+    {2576980378u, 7}, {2492141518u, 7}, {2409371898u, 7}, {2328596727u, 7},
+    {2249744775u, 7}, {2172748162u, 7}, {2097542168, 7},  {2024065048, 7},
+    {1952257862, 7},  {1882064321, 7},  {1813430637, 7},  {1746305385, 7},
+    {1680639377, 7},  {1616385542, 7},  {1553498810, 7},  {1491936009, 7},
+    {1431655766, 7},  {1372618415, 7},  {1314785907, 7},  {1258121734, 7},
+    {1202590843, 7},  {1148159575, 7},  {1094795586, 7},  {1042467791, 7},
+    {991146300, 7},   {940802361, 7},   {891408307, 7},   {842937507, 7},
+    {795364315, 7},   {748664025, 7},   {702812831, 7},   {657787785, 7},
+    {613566757, 7},   {570128403, 7},   {527452125, 7},   {485518043, 7},
+    {444306962, 7},   {403800345, 7},   {363980280, 7},   {324829460, 7},
+    {286331154, 7},   {248469183, 7},   {211227900, 7},   {174592167, 7},
+    {138547333, 7},   {103079216, 7},   {68174085, 7},    {33818641, 7},
+    {0, 7},           {4228378656u, 8}, {4162814457u, 8}, {4098251237u, 8},
+    {4034666248u, 8}, {3972037425u, 8}, {3910343360u, 8}, {3849563281u, 8},
+    {3789677026u, 8}, {3730665024u, 8}, {3672508268u, 8}, {3615188300u, 8},
+    {3558687189u, 8}, {3502987511u, 8}, {3448072337u, 8}, {3393925206u, 8},
+    {3340530120u, 8}, {3287871517u, 8}, {3235934265u, 8}, {3184703642u, 8},
+    {3134165325u, 8}, {3084305374u, 8}, {3035110223u, 8}, {2986566663u, 8},
+    {2938661835u, 8}, {2891383213u, 8}, {2844718599u, 8}, {2798656110u, 8},
+    {2753184165u, 8}, {2708291480u, 8}, {2663967058u, 8}, {2620200175u, 8},
+    {2576980378u, 8}, {2534297473u, 8}, {2492141518u, 8}, {2450502814u, 8},
+    {2409371898u, 8}, {2368739540u, 8}, {2328596727u, 8}, {2288934667u, 8},
+    {2249744775u, 8}, {2211018668u, 8}, {2172748162u, 8}, {2134925265u, 8},
+    {2097542168, 8},  {2060591247, 8},  {2024065048, 8},  {1987956292, 8},
+    {1952257862, 8},  {1916962805, 8},  {1882064321, 8},  {1847555765, 8},
+    {1813430637, 8},  {1779682582, 8},  {1746305385, 8},  {1713292966, 8},
+    {1680639377, 8},  {1648338801, 8},  {1616385542, 8},  {1584774030, 8},
+    {1553498810, 8},  {1522554545, 8},  {1491936009, 8},  {1461638086, 8},
+    {1431655766, 8},  {1401984144, 8},  {1372618415, 8},  {1343553873, 8},
+    {1314785907, 8},  {1286310003, 8},  {1258121734, 8},  {1230216764, 8},
+    {1202590843, 8},  {1175239808, 8},  {1148159575, 8},  {1121346142, 8},
+    {1094795586, 8},  {1068504060, 8},  {1042467791, 8},  {1016683080, 8},
+    {991146300, 8},   {965853890, 8},   {940802361, 8},   {915988286, 8},
+    {891408307, 8},   {867059126, 8},   {842937507, 8},   {819040276, 8},
+    {795364315, 8},   {771906565, 8},   {748664025, 8},   {725633745, 8},
+    {702812831, 8},   {680198441, 8},   {657787785, 8},   {635578121, 8},
+    {613566757, 8},   {591751050, 8},   {570128403, 8},   {548696263, 8},
+    {527452125, 8},   {506393524, 8},   {485518043, 8},   {464823301, 8},
+    {444306962, 8},   {423966729, 8},   {403800345, 8},   {383805589, 8},
+    {363980280, 8},   {344322273, 8},   {324829460, 8},   {305499766, 8},
+    {286331154, 8},   {267321616, 8},   {248469183, 8},   {229771913, 8},
+    {211227900, 8},   {192835267, 8},   {174592167, 8},   {156496785, 8},
+    {138547333, 8},   {120742053, 8},   {103079216, 8},   {85557118, 8},
+    {68174085, 8},    {50928466, 8},    {33818641, 8},    {16843010, 8},
 };
diff --git a/vp10/common/entropymode.c b/vp10/common/entropymode.c
index 0ae2572..29fb27e 100644
--- a/vp10/common/entropymode.c
+++ b/vp10/common/entropymode.c
@@ -281,16 +281,16 @@
 };
 
 static const vpx_prob default_wedge_interintra_prob[BLOCK_SIZES] = {
-  208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208,
+  208, 208, 208, 208, 208, 208, 216, 216, 216, 224, 224, 224, 240,
 #if CONFIG_EXT_PARTITION
   208, 208, 208
 #endif  // CONFIG_EXT_PARTITION
 };
 
 static const vpx_prob default_wedge_interinter_prob[BLOCK_SIZES] = {
-  208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208, 208,
+  208, 208, 208, 208, 208, 208, 216, 216, 216, 224, 224, 224, 240,
 #if CONFIG_EXT_PARTITION
-  208, 208, 208
+  255, 255, 255
 #endif  // CONFIG_EXT_PARTITION
 };
 #endif  // CONFIG_EXT_INTER
@@ -383,19 +383,36 @@
   239, 183, 119,  96,  41
 };
 
-static const vpx_prob default_comp_ref_p[REF_CONTEXTS][COMP_REFS - 1] = {
 #if CONFIG_EXT_REFS
+static const vpx_prob default_comp_ref_p[REF_CONTEXTS][COMP_REFS - 1] = {
   // TODO(zoeliu): To adjust the initial prob values.
   {  33,  16,  16,  16 },
   {  77,  74,  74,  74 },
   { 142, 142, 142, 142 },
   { 172, 170, 170, 170 },
   { 238, 247, 247, 247 }
-#else
-  { 50 }, { 126 }, { 123 }, { 221 }, { 226 }
-#endif  // CONFIG_EXT_REFS
 };
 
+#else  // CONFIG_EXT_REFS
+
+#if CONFIG_BIDIR_PRED
+// TODO(zoeliu): To adjust the initial prob values.
+static const vpx_prob default_comp_ref_p[REF_CONTEXTS][FWD_REFS - 1] = {
+//  { 50 }, { 126 }, { 123 }, { 221 }, { 226 }
+  { 33 }, { 77 }, { 142 }, { 172 }, { 238 }
+};
+static const vpx_prob default_comp_bwdref_p[REF_CONTEXTS][BWD_REFS - 1] = {
+  { 16 }, { 74 }, { 142 }, { 170 }, { 247 }
+};
+#else  // CONFIG_BIDIR_PRED
+static const vpx_prob default_comp_ref_p[REF_CONTEXTS][COMP_REFS - 1] = {
+  { 50 }, { 126 }, { 123 }, { 221 }, { 226 }
+};
+#endif  // CONFIG_BIDIR_PRED
+
+#endif  // CONFIG_EXT_REFS
+
 static const vpx_prob default_single_ref_p[REF_CONTEXTS][SINGLE_REFS - 1] = {
 #if CONFIG_EXT_REFS
   {  33,  16,  16,  16,  16 },
@@ -403,12 +420,20 @@
   { 142, 142, 142, 142, 142 },
   { 172, 170, 170, 170, 170 },
   { 238, 247, 247, 247, 247 }
-#else
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  {  33,  16,  16 },
+  {  77,  74,  74 },
+  { 142, 142, 142 },
+  { 172, 170, 170 },
+  { 238, 247, 247 }
+#else  // CONFIG_BIDIR_PRED
   {  33,  16 },
   {  77,  74 },
   { 142, 142 },
   { 172, 170 },
   { 238, 247 }
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
 };
 
@@ -904,14 +929,68 @@
 #if CONFIG_EXT_INTERP
 static const vpx_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
                                                     [SWITCHABLE_FILTERS - 1] = {
+#if CONFIG_DUAL_FILTER
   { 235, 192, 128, 128},
   { 36, 243, 208, 128},
   { 34, 16, 128, 128},
   { 36, 243, 48, 128},
   { 34, 16, 128, 128},
   { 149, 160, 128, 128},
+
+  { 235, 192, 128, 128},
+  { 36, 243, 208, 128},
+  { 34, 16, 128, 128},
+  { 36, 243, 48, 128},
+  { 34, 16, 128, 128},
+  { 149, 160, 128, 128},
+
+  { 235, 192, 128, 128},
+  { 36, 243, 208, 128},
+  { 34, 16, 128, 128},
+  { 36, 243, 48, 128},
+  { 34, 16, 128, 128},
+  { 149, 160, 128, 128},
+
+  { 235, 192, 128, 128},
+  { 36, 243, 208, 128},
+  { 34, 16, 128, 128},
+  { 36, 243, 48, 128},
+  { 34, 16, 128, 128},
+  { 149, 160, 128, 128},
+#else
+  { 235, 192, 128, 128},
+  { 36, 243, 208, 128},
+  { 34, 16, 128, 128},
+  { 36, 243, 48, 128},
+  { 34, 16, 128, 128},
+  { 149, 160, 128, 128},
+#endif
 };
 #else  // CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+static const vpx_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+                                                    [SWITCHABLE_FILTERS - 1] = {
+  { 235, 162, },
+  { 36, 255, },
+  { 34, 3, },
+  { 149, 144, },
+
+  { 235, 162, },
+  { 36, 255, },
+  { 34, 3, },
+  { 10, 3, },
+
+  { 235, 162, },
+  { 36, 255, },
+  { 34, 3, },
+  { 149, 144, },
+
+  { 235, 162, },
+  { 36, 255, },
+  { 34, 3, },
+  { 10, 3, },
+};
+#else
 static const vpx_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
                                                     [SWITCHABLE_FILTERS - 1] = {
   { 235, 162, },
@@ -919,6 +998,7 @@
   { 34, 3, },
   { 149, 144, },
 };
+#endif
 #endif  // CONFIG_EXT_INTERP
 
 #if CONFIG_EXT_TX
@@ -1188,6 +1268,9 @@
   vp10_copy(fc->intra_inter_prob, default_intra_inter_p);
   vp10_copy(fc->comp_inter_prob, default_comp_inter_p);
   vp10_copy(fc->comp_ref_prob, default_comp_ref_p);
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  vp10_copy(fc->comp_bwdref_prob, default_comp_bwdref_p);
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   vp10_copy(fc->single_ref_prob, default_single_ref_p);
   vp10_copy(fc->tx_size_probs, default_tx_size_prob);
 #if CONFIG_VAR_TX
@@ -1255,10 +1338,23 @@
   for (i = 0; i < COMP_INTER_CONTEXTS; i++)
     fc->comp_inter_prob[i] = vp10_mode_mv_merge_probs(
         pre_fc->comp_inter_prob[i], counts->comp_inter[i]);
+
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  for (i = 0; i < REF_CONTEXTS; i++)
+    for (j = 0; j < (FWD_REFS - 1); j++)
+      fc->comp_ref_prob[i][j] = mode_mv_merge_probs(
+          pre_fc->comp_ref_prob[i][j], counts->comp_ref[i][j]);
+  for (i = 0; i < REF_CONTEXTS; i++)
+    for (j = 0; j < (BWD_REFS - 1); j++)
+      fc->comp_bwdref_prob[i][j] = mode_mv_merge_probs(
+          pre_fc->comp_bwdref_prob[i][j], counts->comp_bwdref[i][j]);
+#else
   for (i = 0; i < REF_CONTEXTS; i++)
     for (j = 0; j < (COMP_REFS - 1); j++)
-      fc->comp_ref_prob[i][j] = vp10_mode_mv_merge_probs(
+      fc->comp_ref_prob[i][j] = mode_mv_merge_probs(
           pre_fc->comp_ref_prob[i][j], counts->comp_ref[i][j]);
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
   for (i = 0; i < REF_CONTEXTS; i++)
     for (j = 0; j < (SINGLE_REFS - 1); j++)
       fc->single_ref_prob[i][j] = vp10_mode_mv_merge_probs(
diff --git a/vp10/common/entropymode.h b/vp10/common/entropymode.h
index 4a6ccae..42b93d6 100644
--- a/vp10/common/entropymode.h
+++ b/vp10/common/entropymode.h
@@ -85,7 +85,12 @@
   vpx_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
   vpx_prob comp_inter_prob[COMP_INTER_CONTEXTS];
   vpx_prob single_ref_prob[REF_CONTEXTS][SINGLE_REFS-1];
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  vpx_prob comp_ref_prob[REF_CONTEXTS][FWD_REFS-1];
+  vpx_prob comp_bwdref_prob[REF_CONTEXTS][BWD_REFS-1];
+#else
   vpx_prob comp_ref_prob[REF_CONTEXTS][COMP_REFS-1];
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   vpx_prob tx_size_probs[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES - 1];
 #if CONFIG_VAR_TX
   vpx_prob txfm_partition_prob[TXFM_PARTITION_CONTEXTS];
@@ -155,7 +160,12 @@
   unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
   unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
   unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS-1][2];
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  unsigned int comp_ref[REF_CONTEXTS][FWD_REFS-1][2];
+  unsigned int comp_bwdref[REF_CONTEXTS][BWD_REFS-1][2];
+#else
   unsigned int comp_ref[REF_CONTEXTS][COMP_REFS-1][2];
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   unsigned int tx_size_totals[TX_SIZES];
   unsigned int tx_size[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
 #if CONFIG_VAR_TX
diff --git a/vp10/common/enums.h b/vp10/common/enums.h
index 3b2ef29..cdebc69 100644
--- a/vp10/common/enums.h
+++ b/vp10/common/enums.h
@@ -204,10 +204,17 @@
   VP9_GOLD_FLAG = 1 << 4,
   VP9_ALT_FLAG = 1 << 5,
   VP9_REFFRAME_ALL = (1 << 6) - 1
-#else
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  VP9_GOLD_FLAG = 1 << 1,
+  VP9_BWD_FLAG = 1 << 2,
+  VP9_ALT_FLAG = 1 << 3,
+  VP9_REFFRAME_ALL = (1 << 4) - 1
+#else  // CONFIG_BIDIR_PRED
   VP9_GOLD_FLAG = 1 << 1,
   VP9_ALT_FLAG = 1 << 2,
   VP9_REFFRAME_ALL = (1 << 3) - 1
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
 } VP9_REFFRAME;
 
@@ -367,11 +374,24 @@
 #endif
 
 #if CONFIG_EXT_REFS
+
 #define SINGLE_REFS 6
 #define COMP_REFS 5
-#else
+
+#else  // CONFIG_EXT_REFS
+
+#if CONFIG_BIDIR_PRED
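+// Two forward references (LAST, GOLDEN) and two backward references
+// (BWDREF, ALTREF); a compound prediction pairs one forward with one
+// backward reference, giving FWD_REFS * BWD_REFS combinations.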
+#define FWD_REFS 2
+#define BWD_REFS 2
+#define SINGLE_REFS (FWD_REFS + BWD_REFS)
+#define COMP_REFS (FWD_REFS * BWD_REFS)
+
+#else  // CONFIG_BIDIR_PRED
+
 #define SINGLE_REFS 3
 #define COMP_REFS 2
+#endif  // CONFIG_BIDIR_PRED
+
 #endif  // CONFIG_EXT_REFS
 
 #if CONFIG_SUPERTX
diff --git a/vp10/common/filter.h b/vp10/common/filter.h
index f70d0cc..a51e2d0 100644
--- a/vp10/common/filter.h
+++ b/vp10/common/filter.h
@@ -47,7 +47,13 @@
 
 #define BILINEAR            (SWITCHABLE_FILTERS)
 #define SWITCHABLE          (SWITCHABLE_FILTERS + 1)  /* the last one */
+#if CONFIG_DUAL_FILTER
+#define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4)
+#define INTER_FILTER_COMP_OFFSET   (SWITCHABLE_FILTERS + 1)
+#define INTER_FILTER_DIR_OFFSET    ((SWITCHABLE_FILTERS + 1) * 2)
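+// Four context groups of (SWITCHABLE_FILTERS + 1): single vs. compound
+// prediction (COMP_OFFSET) crossed with filter direction (DIR_OFFSET).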
+#else
 #define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
+#endif
 
 typedef uint8_t INTERP_FILTER;
 
diff --git a/vp10/common/idct.c b/vp10/common/idct.c
index a5d50bb..717c914 100644
--- a/vp10/common/idct.c
+++ b/vp10/common/idct.c
@@ -1297,7 +1297,8 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
+      vp10_inv_txfm2d_add_4x4_c(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                tx_type, bd);
       break;
     case V_DCT:
     case H_DCT:
@@ -1336,7 +1337,8 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
+      vp10_inv_txfm2d_add_8x8_c(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                tx_type, bd);
       break;
     case V_DCT:
     case H_DCT:
@@ -1375,7 +1377,8 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
+      vp10_inv_txfm2d_add_16x16_c(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                  tx_type, bd);
       break;
     case V_DCT:
     case H_DCT:
diff --git a/vp10/common/loopfilter.h b/vp10/common/loopfilter.h
index 2a88003..8fb5ef3 100644
--- a/vp10/common/loopfilter.h
+++ b/vp10/common/loopfilter.h
@@ -45,7 +45,7 @@
   uint8_t mode_ref_delta_update;
 
   // 0 = Intra, Last, Last2+Last3+LAST4(CONFIG_EXT_REFS),
-  // GF, ARF
+  // GF, BRF(CONFIG_BIDIR_PRED), ARF
   signed char ref_deltas[MAX_REF_FRAMES];
   signed char last_ref_deltas[MAX_REF_FRAMES];
 
diff --git a/vp10/common/onyxc_int.h b/vp10/common/onyxc_int.h
index 455ca2d..d3b407a 100644
--- a/vp10/common/onyxc_int.h
+++ b/vp10/common/onyxc_int.h
@@ -186,6 +186,9 @@
   int show_frame;
   int last_show_frame;
   int show_existing_frame;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  int is_reference_frame;  // A frame used as a reference
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
 
   // Flag signaling that the frame is encoded using only INTRA modes.
   uint8_t intra_only;
@@ -270,8 +273,13 @@
   int frame_parallel_decode;  // frame-based threading.
 
   // Context probabilities for reference frame prediction
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  MV_REFERENCE_FRAME comp_fwd_ref[FWD_REFS];
+  MV_REFERENCE_FRAME comp_bwd_ref[BWD_REFS];
+#else
   MV_REFERENCE_FRAME comp_fixed_ref;
   MV_REFERENCE_FRAME comp_var_ref[COMP_REFS];
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   REFERENCE_MODE reference_mode;
 
   FRAME_CONTEXT *fc;  /* this frame entropy */
diff --git a/vp10/common/pred_common.c b/vp10/common/pred_common.c
index 0c698a7..37ae288 100644
--- a/vp10/common/pred_common.c
+++ b/vp10/common/pred_common.c
@@ -11,45 +11,61 @@
 
 #include "vp10/common/common.h"
 #include "vp10/common/pred_common.h"
+#include "vp10/common/reconinter.h"
 #include "vp10/common/seg_common.h"
 
 // Returns a context number for the given MB prediction signal
 #if CONFIG_DUAL_FILTER
+static INTERP_FILTER get_ref_filter_type(const MODE_INFO *mi,
+                                         const MACROBLOCKD *xd,
+                                         int dir,
+                                         MV_REFERENCE_FRAME ref_frame) {
+  INTERP_FILTER ref_type = SWITCHABLE_FILTERS;
+  const MB_MODE_INFO *ref_mbmi = &mi->mbmi;
+  int use_subpel[2] = {
+      has_subpel_mv_component(mi, xd, dir),
+      has_subpel_mv_component(mi, xd, dir + 2),
+  };
+
+  if (ref_mbmi->ref_frame[0] == ref_frame && use_subpel[0])
+    ref_type = ref_mbmi->interp_filter[(dir & 0x01)];
+  else if (ref_mbmi->ref_frame[1] == ref_frame && use_subpel[1])
+    ref_type = ref_mbmi->interp_filter[(dir & 0x01) + 2];
+
+  return ref_type;
+}
+
 int vp10_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) {
   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const int ctx_offset =
+      (mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET;
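+  // The final context is composed of three parts: a compound-prediction
+  // offset, a direction (horizontal/vertical) offset, and the filter type
+  // inferred from the left/above neighbors.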
   MV_REFERENCE_FRAME ref_frame = (dir < 2) ?
       mbmi->ref_frame[0] : mbmi->ref_frame[1];
   // Note:
   // The mode info data structure has a one element border above and to the
   // left of the entries corresponding to real macroblocks.
   // The prediction flags in these dummy entries are initialized to 0.
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  int filter_type_ctx = ctx_offset + (dir & 0x01) * INTER_FILTER_DIR_OFFSET;
   int left_type = SWITCHABLE_FILTERS;
   int above_type = SWITCHABLE_FILTERS;
 
-  if (xd->left_available) {
-    if (left_mbmi->ref_frame[0] == ref_frame)
-      left_type = left_mbmi->interp_filter[(dir & 0x01)];
-    else if (left_mbmi->ref_frame[1] == ref_frame)
-      left_type = left_mbmi->interp_filter[(dir & 0x01) + 2];
-  }
+  if (xd->left_available)
+    left_type = get_ref_filter_type(xd->mi[-1], xd, dir, ref_frame);
 
-  if (xd->up_available) {
-    if (above_mbmi->ref_frame[0] == ref_frame)
-      above_type = above_mbmi->interp_filter[(dir & 0x01)];
-    else if (above_mbmi->ref_frame[1] == ref_frame)
-      above_type = above_mbmi->interp_filter[(dir & 0x01) + 2];
-  }
+  if (xd->up_available)
+    above_type = get_ref_filter_type(xd->mi[-xd->mi_stride], xd,
+                                     dir, ref_frame);
 
   if (left_type == above_type)
-    return left_type;
+    filter_type_ctx += left_type;
   else if (left_type == SWITCHABLE_FILTERS && above_type != SWITCHABLE_FILTERS)
-    return above_type;
+    filter_type_ctx += above_type;
   else if (left_type != SWITCHABLE_FILTERS && above_type == SWITCHABLE_FILTERS)
-    return left_type;
+    filter_type_ctx += left_type;
   else
-    return SWITCHABLE_FILTERS;
+    filter_type_ctx += SWITCHABLE_FILTERS;
+
+  return filter_type_ctx;
 }
 #else
 int vp10_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
@@ -161,8 +177,57 @@
   }
 }
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
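+// True iff ref_frame is one of the two backward references
+// (cm->comp_bwd_ref[] holds BWDREF_FRAME and ALTREF_FRAME).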
+#define CHECK_COMP_BWD_REF(ref_frame) \
+  (((ref_frame) == cm->comp_bwd_ref[0]) || ((ref_frame) == cm->comp_bwd_ref[1]))
+
 int vp10_get_reference_mode_context(const VP10_COMMON *cm,
-                                   const MACROBLOCKD *xd) {
+                                    const MACROBLOCKD *xd) {
+  int ctx;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  if (has_above && has_left) {  // both edges available
+    if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi))
+      // neither edge uses comp pred (0/1)
+      ctx = CHECK_COMP_BWD_REF(above_mbmi->ref_frame[0]) ^
+            CHECK_COMP_BWD_REF(left_mbmi->ref_frame[0]);
+    else if (!has_second_ref(above_mbmi))
+      // one of two edges uses comp pred (2/3)
+      ctx = 2 + (CHECK_COMP_BWD_REF(above_mbmi->ref_frame[0]) ||
+                 !is_inter_block(above_mbmi));
+    else if (!has_second_ref(left_mbmi))
+      // one of two edges uses comp pred (2/3)
+      ctx = 2 + (CHECK_COMP_BWD_REF(left_mbmi->ref_frame[0]) ||
+                 !is_inter_block(left_mbmi));
+    else  // both edges use comp pred (4)
+      ctx = 4;
+  } else if (has_above || has_left) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+    if (!has_second_ref(edge_mbmi))
+      // edge does not use comp pred (0/1)
+      ctx = CHECK_COMP_BWD_REF(edge_mbmi->ref_frame[0]);
+    else
+      // edge uses comp pred (3)
+      ctx = 3;
+  } else {  // no edges available (1)
+    ctx = 1;
+  }
+  assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS);
+  return ctx;
+}
+
+#else  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
+int vp10_get_reference_mode_context(const VP10_COMMON *cm,
+                                    const MACROBLOCKD *xd) {
   int ctx;
   const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
   const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
@@ -203,6 +268,8 @@
   return ctx;
 }
 
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
 #if CONFIG_EXT_REFS
 
 // TODO(zoeliu): Future work will be conducted to optimize the context design
@@ -618,6 +685,200 @@
 
 #else  // CONFIG_EXT_REFS
 
+#if CONFIG_BIDIR_PRED
+
+// Returns a context number for the given MB prediction signal
+int vp10_get_pred_context_comp_ref_p(const VP10_COMMON *cm,
+                                     const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int above_in_image = xd->up_available;
+  const int left_in_image = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  const int bwd_ref_sign_idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]];
+  const int fwd_ref_sign_idx = !bwd_ref_sign_idx;
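+  // bwd_ref_sign_idx / fwd_ref_sign_idx select which entry of ref_frame[]
+  // holds the backward / forward reference in a compound block.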
+
+  if (above_in_image && left_in_image) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra (2)
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+      if (!has_second_ref(edge_mbmi))  // single pred (1/3)
+        pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != cm->comp_fwd_ref[1]);
+      else  // comp pred (1/3)
+        pred_context = 1 + 2 * (edge_mbmi->ref_frame[fwd_ref_sign_idx]
+                                != cm->comp_fwd_ref[1]);
+    } else {  // inter/inter
+      const int l_sg = !has_second_ref(left_mbmi);
+      const int a_sg = !has_second_ref(above_mbmi);
+      const MV_REFERENCE_FRAME frfa = a_sg ?
+          above_mbmi->ref_frame[0] : above_mbmi->ref_frame[fwd_ref_sign_idx];
+      const MV_REFERENCE_FRAME frfl = l_sg ?
+          left_mbmi->ref_frame[0] : left_mbmi->ref_frame[fwd_ref_sign_idx];
+
+      if (frfa == frfl && frfa == cm->comp_fwd_ref[1]) {
+        pred_context = 0;
+      } else if (l_sg && a_sg) {  // single/single
+        if ((frfa != frfl) &&
+            (frfa != cm->comp_fwd_ref[1]) && (frfl != cm->comp_fwd_ref[1]))
+          pred_context = 4;
+        else if (frfa == frfl)
+          pred_context = 3;
+        else
+          pred_context = 1;
+      } else if (l_sg || a_sg) {  // single/comp
+        const MV_REFERENCE_FRAME frfc = l_sg ? frfa : frfl;
+        const MV_REFERENCE_FRAME rfs = a_sg ? frfa : frfl;
+        if (frfc == cm->comp_fwd_ref[1] && rfs != cm->comp_fwd_ref[1])
+          pred_context = 1;
+        else if (rfs == cm->comp_fwd_ref[1] && frfc != cm->comp_fwd_ref[1])
+          pred_context = 2;
+        else
+          pred_context = 4;
+      } else if (frfa == frfl) {  // comp/comp
+        pred_context = 4;
+      } else {
+        pred_context = 2;
+      }
+    }
+  } else if (above_in_image || left_in_image) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi)) {
+      pred_context = 2;
+    } else {
+      if (has_second_ref(edge_mbmi))
+        pred_context = 4 * (edge_mbmi->ref_frame[fwd_ref_sign_idx]
+                            != cm->comp_fwd_ref[1]);
+      else
+        pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_fwd_ref[1]);
+    }
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+
+  return pred_context;
+}
+
+// Returns a context number for the given MB prediction signal
+int vp10_get_pred_context_comp_bwdref_p(const VP10_COMMON *cm,
+                                        const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int above_in_image = xd->up_available;
+  const int left_in_image = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  const int bwd_ref_sign_idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]];
+  const int fwd_ref_sign_idx = !bwd_ref_sign_idx;
+
+  if (above_in_image && left_in_image) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra (2)
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+      if (!has_second_ref(edge_mbmi))  // single pred (1/3)
+        pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != cm->comp_bwd_ref[1]);
+      else  // comp pred (1/3)
+        pred_context = 1 + 2 * (edge_mbmi->ref_frame[bwd_ref_sign_idx]
+                                != cm->comp_bwd_ref[1]);
+    } else {  // inter/inter
+      const int l_comp = has_second_ref(left_mbmi);
+      const int a_comp = has_second_ref(above_mbmi);
+
+      const MV_REFERENCE_FRAME l_brf = l_comp ?
+          left_mbmi->ref_frame[bwd_ref_sign_idx] : NONE;
+      const MV_REFERENCE_FRAME a_brf = a_comp ?
+          above_mbmi->ref_frame[bwd_ref_sign_idx] : NONE;
+
+      const MV_REFERENCE_FRAME l_frf = !l_comp ?
+          left_mbmi->ref_frame[0] : left_mbmi->ref_frame[fwd_ref_sign_idx];
+      const MV_REFERENCE_FRAME a_frf = !a_comp ?
+          above_mbmi->ref_frame[0] : above_mbmi->ref_frame[fwd_ref_sign_idx];
+
+      if (l_comp && a_comp) {
+        if (l_brf == a_brf && l_brf == cm->comp_bwd_ref[1]) {
+          pred_context = 0;
+        } else if (l_brf == cm->comp_bwd_ref[1] ||
+                   a_brf == cm->comp_bwd_ref[1]) {
+          pred_context = 1;
+        } else {
+          // NOTE: Backward ref should be either BWDREF or ALTREF.
+          assert(l_brf == a_brf && l_brf != cm->comp_bwd_ref[1]);
+          pred_context = 3;
+        }
+      } else if (!l_comp && !a_comp) {
+        if (l_frf == a_frf && l_frf == cm->comp_bwd_ref[1]) {
+          pred_context = 0;
+        } else if (l_frf == cm->comp_bwd_ref[1] ||
+                   a_frf == cm->comp_bwd_ref[1]) {
+          pred_context = 1;
+        } else if (l_frf == a_frf) {
+          pred_context = 3;
+        } else {
+          assert(l_frf != a_frf &&
+                 l_frf != cm->comp_bwd_ref[1] && a_frf != cm->comp_bwd_ref[1]);
+          pred_context = 4;
+        }
+      } else {
+        assert((l_comp && !a_comp) || (!l_comp && a_comp));
+
+        if ((l_comp && l_brf == cm->comp_bwd_ref[1] &&
+             a_frf == cm->comp_bwd_ref[1]) ||
+            (a_comp && a_brf == cm->comp_bwd_ref[1] &&
+             l_frf == cm->comp_bwd_ref[1])) {
+          pred_context = 1;
+        } else if ((l_comp && l_brf == cm->comp_bwd_ref[1]) ||
+                   (a_comp && a_brf == cm->comp_bwd_ref[1]) ||
+                   (!l_comp && l_frf == cm->comp_bwd_ref[1]) ||
+                   (!a_comp && a_frf == cm->comp_bwd_ref[1])) {
+          pred_context = 2;
+        } else {
+          pred_context = 4;
+        }
+      }
+    }
+  } else if (above_in_image || left_in_image) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi)) {
+      pred_context = 2;
+    } else {
+      if (has_second_ref(edge_mbmi)) {
+        pred_context = 4 * (edge_mbmi->ref_frame[bwd_ref_sign_idx]
+                            != cm->comp_bwd_ref[1]);
+      } else {
+        pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_bwd_ref[1]);
+      }
+    }
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+
+  return pred_context;
+}
+
+#else  // CONFIG_BIDIR_PRED
+
 // Returns a context number for the given MB prediction signal
 int vp10_get_pred_context_comp_ref_p(const VP10_COMMON *cm,
                                      const MACROBLOCKD *xd) {
@@ -701,6 +962,8 @@
   return pred_context;
 }
 
+#endif  // CONFIG_BIDIR_PRED
+
 #endif  // CONFIG_EXT_REFS
 
 #if CONFIG_EXT_REFS
@@ -1284,7 +1547,7 @@
 
         if (rfs == GOLDEN_FRAME)
           pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
-        else if (rfs == ALTREF_FRAME)
+        else if (rfs != GOLDEN_FRAME && rfs != LAST_FRAME)
           pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME;
         else
           pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
@@ -1296,8 +1559,8 @@
                                                                   : above0;
           pred_context = 4 * (edge0 == GOLDEN_FRAME);
         } else {
-          pred_context = 2 * (above0 == GOLDEN_FRAME) +
-                             2 * (left0 == GOLDEN_FRAME);
+          pred_context =
+              2 * (above0 == GOLDEN_FRAME) + 2 * (left0 == GOLDEN_FRAME);
         }
       }
     }
@@ -1319,4 +1582,105 @@
   return pred_context;
 }
 
+#if CONFIG_BIDIR_PRED
+
+#define CHECK_BWDREF_OR_ALTREF(ref_frame) \
+  (((ref_frame) == BWDREF_FRAME) || ((ref_frame) == ALTREF_FRAME))
+// Context for the bit that signals whether the single reference is
+// ALTREF_FRAME or BWDREF_FRAME, given that it is already known to be one of
+// these 2 choices.
+//
+// NOTE(zoeliu): Returns the context for the probability that ref_frame[0] is
+// ALTREF_FRAME, conditioned on it being either ALTREF_FRAME or BWDREF_FRAME.
+int vp10_get_pred_context_single_ref_p3(const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  if (has_above && has_left) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+      if (!has_second_ref(edge_mbmi)) {
+        if (!CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[0]))
+          pred_context = 3;
+        else
+          pred_context = 4 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME);
+      } else {
+        pred_context = 1 +
+            2 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME ||
+                 edge_mbmi->ref_frame[1] == BWDREF_FRAME);
+      }
+    } else {  // inter/inter
+      const int above_has_second = has_second_ref(above_mbmi);
+      const int left_has_second  = has_second_ref(left_mbmi);
+      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+
+      if (above_has_second && left_has_second) {
+        if (above0 == left0 && above1 == left1)
+          pred_context =
+              3 * (above0 == BWDREF_FRAME || above1 == BWDREF_FRAME ||
+                   left0 == BWDREF_FRAME || left1 == BWDREF_FRAME);
+        else
+          pred_context = 2;
+      } else if (above_has_second || left_has_second) {
+        const MV_REFERENCE_FRAME srf = !above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf0 = above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf1 = above_has_second ? above1 : left1;
+
+        if (srf == BWDREF_FRAME)
+          pred_context = 3 + (crf0 == BWDREF_FRAME || crf1 == BWDREF_FRAME);
+        else if (srf == ALTREF_FRAME)
+          pred_context = (crf0 == BWDREF_FRAME || crf1 == BWDREF_FRAME);
+        else
+          pred_context = 1 + 2 * (crf0 == BWDREF_FRAME || crf1 == BWDREF_FRAME);
+      } else {
+        if (!CHECK_BWDREF_OR_ALTREF(above0) &&
+            !CHECK_BWDREF_OR_ALTREF(left0)) {
+          pred_context = 2 + (above0 == left0);
+        } else if (!CHECK_BWDREF_OR_ALTREF(above0) ||
+                   !CHECK_BWDREF_OR_ALTREF(left0)) {
+          const MV_REFERENCE_FRAME edge0 =
+              !CHECK_BWDREF_OR_ALTREF(above0) ? left0 : above0;
+          pred_context = 4 * (edge0 == BWDREF_FRAME);
+        } else {
+          pred_context =
+              2 * (above0 == BWDREF_FRAME) + 2 * (left0 == BWDREF_FRAME);
+        }
+      }
+    }
+  } else if (has_above || has_left) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi) ||
+        (!CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[0]) &&
+         !has_second_ref(edge_mbmi)))
+      pred_context = 2;
+    else if (!has_second_ref(edge_mbmi))
+      pred_context = 4 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME);
+    else
+      pred_context = 3 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME ||
+                          edge_mbmi->ref_frame[1] == BWDREF_FRAME);
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
+
+#endif  // CONFIG_BIDIR_PRED
+
 #endif  // CONFIG_EXT_REFS
diff --git a/vp10/common/pred_common.h b/vp10/common/pred_common.h
index f321599..1c3c721 100644
--- a/vp10/common/pred_common.h
+++ b/vp10/common/pred_common.h
@@ -87,7 +87,7 @@
                                     const MACROBLOCKD *xd);
 
 static INLINE vpx_prob vp10_get_reference_mode_prob(const VP10_COMMON *cm,
-                                                   const MACROBLOCKD *xd) {
+                                                    const MACROBLOCKD *xd) {
   return cm->fc->comp_inter_prob[vp10_get_reference_mode_context(cm, xd)];
 }
 
@@ -127,6 +127,20 @@
   const int pred_context = vp10_get_pred_context_comp_ref_p3(cm, xd);
   return cm->fc->comp_ref_prob[pred_context][3];
 }
+
+#else  // CONFIG_EXT_REFS
+
+#if CONFIG_BIDIR_PRED
+int vp10_get_pred_context_comp_bwdref_p(const VP10_COMMON *cm,
+                                        const MACROBLOCKD *xd);
+
+static INLINE vpx_prob vp10_get_pred_prob_comp_bwdref_p(const VP10_COMMON *cm,
+                                                        const MACROBLOCKD *xd) {
+  const int pred_context = vp10_get_pred_context_comp_bwdref_p(cm, xd);
+  return cm->fc->comp_bwdref_prob[pred_context][0];
+}
+#endif  // CONFIG_BIDIR_PRED
+
 #endif  // CONFIG_EXT_REFS
 
 int vp10_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
@@ -143,14 +157,16 @@
   return cm->fc->single_ref_prob[vp10_get_pred_context_single_ref_p2(xd)][1];
 }
 
-#if CONFIG_EXT_REFS
+#if CONFIG_EXT_REFS || CONFIG_BIDIR_PRED
 int vp10_get_pred_context_single_ref_p3(const MACROBLOCKD *xd);
 
 static INLINE vpx_prob vp10_get_pred_prob_single_ref_p3(const VP10_COMMON *cm,
                                                         const MACROBLOCKD *xd) {
   return cm->fc->single_ref_prob[vp10_get_pred_context_single_ref_p3(xd)][2];
 }
+#endif  // CONFIG_EXT_REFS || CONFIG_BIDIR_PRED
 
+#if CONFIG_EXT_REFS
 int vp10_get_pred_context_single_ref_p4(const MACROBLOCKD *xd);
 
 static INLINE vpx_prob vp10_get_pred_prob_single_ref_p4(const VP10_COMMON *cm,
diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c
index ecfb54c..713831b 100644
--- a/vp10/common/reconinter.c
+++ b/vp10/common/reconinter.c
@@ -11,6 +11,7 @@
 #include <assert.h>
 
 #include "./vpx_scale_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 
 #include "vpx/vpx_integer.h"
@@ -23,217 +24,287 @@
 #endif  // CONFIG_OBMC
 
 #if CONFIG_EXT_INTER
-static int get_masked_weight(int m) {
+
+// Set to one to use larger codebooks
+#define USE_LARGE_WEDGE_CODEBOOK  0
+
+#define NSMOOTHERS  1
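+// 'smoothness' selects one of NSMOOTHERS transition ramps; only one is
+// defined so far. It maps a signed distance m in [-SMOOTHER_LEN, SMOOTHER_LEN]
+// to a blending weight in [0, 1 << WEDGE_WEIGHT_BITS].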
+static int get_masked_weight(int m, int smoothness) {
 #define SMOOTHER_LEN  32
-  static const uint8_t smoothfn[2 * SMOOTHER_LEN + 1] = {
-    0,  0,  0,  0,  0,  0,  0,  0,
-    0,  0,  0,  0,  0,  0,  0,  0,
-    0,  0,  0,  0,  0,  0,  0,  0,
-    1,  1,  2,  4,  6, 10, 16, 23,
-    32,
-    41, 48, 54, 58, 60, 62, 63, 63,
-    64, 64, 64, 64, 64, 64, 64, 64,
-    64, 64, 64, 64, 64, 64, 64, 64,
-    64, 64, 64, 64, 64, 64, 64, 64,
+  static const uint8_t smoothfn[NSMOOTHERS][2 * SMOOTHER_LEN + 1] = {
+    {
+      0,  0,  0,  0,  0,  0,  0,  0,
+      0,  0,  0,  0,  0,  0,  0,  0,
+      0,  0,  0,  0,  0,  0,  0,  0,
+      0,  0,  1,  2,  4,  7, 13, 21,
+      32,
+      43, 51, 57, 60, 62, 63, 64, 64,
+      64, 64, 64, 64, 64, 64, 64, 64,
+      64, 64, 64, 64, 64, 64, 64, 64,
+      64, 64, 64, 64, 64, 64, 64, 64,
+    }
   };
   if (m < -SMOOTHER_LEN)
     return 0;
   else if (m > SMOOTHER_LEN)
     return (1 << WEDGE_WEIGHT_BITS);
   else
-    return smoothfn[m + SMOOTHER_LEN];
+    return smoothfn[smoothness][m + SMOOTHER_LEN];
 }
 
-#define WEDGE_OBLIQUE  1
-#define WEDGE_STRAIGHT 0
 
-#define WEDGE_PARMS    5
+// [smoother][negative][direction]
+DECLARE_ALIGNED(
+    16, static uint8_t,
+    wedge_mask_obl[NSMOOTHERS][2][WEDGE_DIRECTIONS]
+                  [MASK_MASTER_SIZE * MASK_MASTER_SIZE]);
 
-// [negative][transpose][reverse]
-DECLARE_ALIGNED(16, static uint8_t,
-                wedge_mask_obl[2][2][2][MASK_MASTER_SIZE * MASK_MASTER_SIZE]);
-// [negative][transpose]
-DECLARE_ALIGNED(16, static uint8_t,
-                wedge_mask_str[2][2][MASK_MASTER_SIZE * MASK_MASTER_SIZE]);
+DECLARE_ALIGNED(
+    16, static uint8_t,
+    wedge_signflip_lookup[BLOCK_SIZES][MAX_WEDGE_TYPES]);
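+// Filled in at runtime by init_wedge_signs(), so that a given wedge sign
+// selects a consistent mask orientation across block sizes.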
 
-// Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0
-void vp10_init_wedge_masks() {
-  int i, j;
-  const int w = MASK_MASTER_SIZE;
-  const int h = MASK_MASTER_SIZE;
-  const int stride = MASK_MASTER_STRIDE;
-  const int a[4] = {2, 1, 4, 4};
-  for (i = 0; i < h; ++i)
-    for (j = 0; j < w; ++j) {
-      int x = (2 * j + 1 - (a[2] * w) / 4);
-      int y = (2 * i + 1 - (a[3] * h) / 4);
-      int m = (a[0] * x + a[1] * y) / 2;
-      wedge_mask_obl[1][0][0][i * stride + j] =
-      wedge_mask_obl[1][1][0][j * stride + i] =
-          get_masked_weight(m);
-      wedge_mask_obl[1][0][1][i * stride + w - 1 - j] =
-      wedge_mask_obl[1][1][1][(w - 1 - j) * stride + i] =
-          (1 << WEDGE_WEIGHT_BITS) - get_masked_weight(m);
-      wedge_mask_obl[0][0][0][i * stride + j] =
-      wedge_mask_obl[0][1][0][j * stride + i] =
-          (1 << WEDGE_WEIGHT_BITS) - get_masked_weight(m);
-      wedge_mask_obl[0][0][1][i * stride + w - 1 - j] =
-      wedge_mask_obl[0][1][1][(w - 1 - j) * stride + i] =
-          get_masked_weight(m);
-      wedge_mask_str[1][0][i * stride + j] =
-      wedge_mask_str[1][1][j * stride + i] =
-          get_masked_weight(x);
-      wedge_mask_str[0][0][i * stride + j] =
-      wedge_mask_str[0][1][j * stride + i] =
-          (1 << WEDGE_WEIGHT_BITS) - get_masked_weight(x);
-    }
-}
-
-static const int wedge_params_sml[1 << WEDGE_BITS_SML]
-                                 [WEDGE_PARMS] = {
-    {WEDGE_OBLIQUE,  1, 1, 4, 4},
-    {WEDGE_OBLIQUE,  1, 0, 4, 4},
-    {WEDGE_OBLIQUE,  0, 1, 4, 4},
-    {WEDGE_OBLIQUE,  0, 0, 4, 4},
+// Some unused wedge codebooks are left here temporarily to facilitate
+// experiments. To be removed when settled.
+static const wedge_code_type wedge_codebook_8_hgtw[8] = {
+    {WEDGE_OBLIQUE27,  4, 4},
+    {WEDGE_OBLIQUE63,  4, 4},
+    {WEDGE_OBLIQUE117, 4, 4},
+    {WEDGE_OBLIQUE153, 4, 4},
+    {WEDGE_OBLIQUE27,  4, 2},
+    {WEDGE_OBLIQUE27,  4, 6},
+    {WEDGE_OBLIQUE153, 4, 2},
+    {WEDGE_OBLIQUE153, 4, 6},
 };
 
-static const int wedge_params_med_hgtw[1 << WEDGE_BITS_MED]
-                                      [WEDGE_PARMS] = {
-    {WEDGE_OBLIQUE,  1, 1, 4, 4},
-    {WEDGE_OBLIQUE,  1, 0, 4, 4},
-    {WEDGE_OBLIQUE,  0, 1, 4, 4},
-    {WEDGE_OBLIQUE,  0, 0, 4, 4},
-
-    {WEDGE_OBLIQUE,  1, 1, 4, 2},
-    {WEDGE_OBLIQUE,  1, 1, 4, 6},
-    {WEDGE_OBLIQUE,  1, 0, 4, 2},
-    {WEDGE_OBLIQUE,  1, 0, 4, 6},
+static const wedge_code_type wedge_codebook_8_hltw[8] = {
+    {WEDGE_OBLIQUE27,  4, 4},
+    {WEDGE_OBLIQUE63,  4, 4},
+    {WEDGE_OBLIQUE117, 4, 4},
+    {WEDGE_OBLIQUE153, 4, 4},
+    {WEDGE_OBLIQUE63,  2, 4},
+    {WEDGE_OBLIQUE63,  6, 4},
+    {WEDGE_OBLIQUE117, 2, 4},
+    {WEDGE_OBLIQUE117, 6, 4},
 };
 
-static const int wedge_params_med_hltw[1 << WEDGE_BITS_MED]
-                                      [WEDGE_PARMS] = {
-    {WEDGE_OBLIQUE,  1, 1, 4, 4},
-    {WEDGE_OBLIQUE,  1, 0, 4, 4},
-    {WEDGE_OBLIQUE,  0, 1, 4, 4},
-    {WEDGE_OBLIQUE,  0, 0, 4, 4},
-
-    {WEDGE_OBLIQUE,  0, 1, 2, 4},
-    {WEDGE_OBLIQUE,  0, 1, 6, 4},
-    {WEDGE_OBLIQUE,  0, 0, 2, 4},
-    {WEDGE_OBLIQUE,  0, 0, 6, 4},
+static const wedge_code_type wedge_codebook_8_heqw[8] = {
+    {WEDGE_OBLIQUE27,  4, 4},
+    {WEDGE_OBLIQUE63,  4, 4},
+    {WEDGE_OBLIQUE117, 4, 4},
+    {WEDGE_OBLIQUE153, 4, 4},
+    {WEDGE_HORIZONTAL, 4, 2},
+    {WEDGE_HORIZONTAL, 4, 6},
+    {WEDGE_VERTICAL,   2, 4},
+    {WEDGE_VERTICAL,   6, 4},
 };
 
-static const int wedge_params_med_heqw[1 << WEDGE_BITS_MED]
-                                      [WEDGE_PARMS] = {
-    {WEDGE_OBLIQUE,  1, 1, 4, 4},
-    {WEDGE_OBLIQUE,  1, 0, 4, 4},
-    {WEDGE_OBLIQUE,  0, 1, 4, 4},
-    {WEDGE_OBLIQUE,  0, 0, 4, 4},
-
-    {WEDGE_STRAIGHT, 1, 0, 4, 2},
-    {WEDGE_STRAIGHT, 1, 0, 4, 6},
-    {WEDGE_STRAIGHT, 0, 0, 2, 4},
-    {WEDGE_STRAIGHT, 0, 0, 6, 4},
+#if !USE_LARGE_WEDGE_CODEBOOK
+static const wedge_code_type wedge_codebook_16_hgtw[16] = {
+    {WEDGE_OBLIQUE27,  4, 4},
+    {WEDGE_OBLIQUE63,  4, 4},
+    {WEDGE_OBLIQUE117, 4, 4},
+    {WEDGE_OBLIQUE153, 4, 4},
+    {WEDGE_HORIZONTAL, 4, 2},
+    {WEDGE_HORIZONTAL, 4, 4},
+    {WEDGE_HORIZONTAL, 4, 6},
+    {WEDGE_VERTICAL,   4, 4},
+    {WEDGE_OBLIQUE27,  4, 2},
+    {WEDGE_OBLIQUE27,  4, 6},
+    {WEDGE_OBLIQUE153, 4, 2},
+    {WEDGE_OBLIQUE153, 4, 6},
+    {WEDGE_OBLIQUE63,  2, 4},
+    {WEDGE_OBLIQUE63,  6, 4},
+    {WEDGE_OBLIQUE117, 2, 4},
+    {WEDGE_OBLIQUE117, 6, 4},
 };
 
-static const int wedge_params_big_hgtw[1 << WEDGE_BITS_BIG]
-                                      [WEDGE_PARMS] = {
-    {WEDGE_OBLIQUE,  1, 1, 4, 4},
-    {WEDGE_OBLIQUE,  1, 0, 4, 4},
-    {WEDGE_OBLIQUE,  0, 1, 4, 4},
-    {WEDGE_OBLIQUE,  0, 0, 4, 4},
-
-    {WEDGE_OBLIQUE,  1, 1, 4, 2},
-    {WEDGE_OBLIQUE,  1, 1, 4, 6},
-    {WEDGE_OBLIQUE,  1, 0, 4, 2},
-    {WEDGE_OBLIQUE,  1, 0, 4, 6},
-
-    {WEDGE_OBLIQUE,  0, 1, 2, 4},
-    {WEDGE_OBLIQUE,  0, 1, 6, 4},
-    {WEDGE_OBLIQUE,  0, 0, 2, 4},
-    {WEDGE_OBLIQUE,  0, 0, 6, 4},
-
-    {WEDGE_STRAIGHT, 1, 0, 4, 2},
-    {WEDGE_STRAIGHT, 1, 0, 4, 4},
-    {WEDGE_STRAIGHT, 1, 0, 4, 6},
-    {WEDGE_STRAIGHT, 0, 0, 4, 4},
+static const wedge_code_type wedge_codebook_16_hltw[16] = {
+    {WEDGE_OBLIQUE27,  4, 4},
+    {WEDGE_OBLIQUE63,  4, 4},
+    {WEDGE_OBLIQUE117, 4, 4},
+    {WEDGE_OBLIQUE153, 4, 4},
+    {WEDGE_VERTICAL,   2, 4},
+    {WEDGE_VERTICAL,   4, 4},
+    {WEDGE_VERTICAL,   6, 4},
+    {WEDGE_HORIZONTAL, 4, 4},
+    {WEDGE_OBLIQUE27,  4, 2},
+    {WEDGE_OBLIQUE27,  4, 6},
+    {WEDGE_OBLIQUE153, 4, 2},
+    {WEDGE_OBLIQUE153, 4, 6},
+    {WEDGE_OBLIQUE63,  2, 4},
+    {WEDGE_OBLIQUE63,  6, 4},
+    {WEDGE_OBLIQUE117, 2, 4},
+    {WEDGE_OBLIQUE117, 6, 4},
 };
 
-static const int wedge_params_big_hltw[1 << WEDGE_BITS_BIG]
-                                      [WEDGE_PARMS] = {
-    {WEDGE_OBLIQUE,  1, 1, 4, 4},
-    {WEDGE_OBLIQUE,  1, 0, 4, 4},
-    {WEDGE_OBLIQUE,  0, 1, 4, 4},
-    {WEDGE_OBLIQUE,  0, 0, 4, 4},
-
-    {WEDGE_OBLIQUE,  1, 1, 4, 2},
-    {WEDGE_OBLIQUE,  1, 1, 4, 6},
-    {WEDGE_OBLIQUE,  1, 0, 4, 2},
-    {WEDGE_OBLIQUE,  1, 0, 4, 6},
-
-    {WEDGE_OBLIQUE,  0, 1, 2, 4},
-    {WEDGE_OBLIQUE,  0, 1, 6, 4},
-    {WEDGE_OBLIQUE,  0, 0, 2, 4},
-    {WEDGE_OBLIQUE,  0, 0, 6, 4},
-
-    {WEDGE_STRAIGHT, 0, 0, 2, 4},
-    {WEDGE_STRAIGHT, 0, 0, 4, 4},
-    {WEDGE_STRAIGHT, 0, 0, 6, 4},
-    {WEDGE_STRAIGHT, 1, 0, 4, 4},
+static const wedge_code_type wedge_codebook_16_heqw[16] = {
+    {WEDGE_OBLIQUE27,  4, 4},
+    {WEDGE_OBLIQUE63,  4, 4},
+    {WEDGE_OBLIQUE117, 4, 4},
+    {WEDGE_OBLIQUE153, 4, 4},
+    {WEDGE_HORIZONTAL, 4, 2},
+    {WEDGE_HORIZONTAL, 4, 6},
+    {WEDGE_VERTICAL,   2, 4},
+    {WEDGE_VERTICAL,   6, 4},
+    {WEDGE_OBLIQUE27,  4, 2},
+    {WEDGE_OBLIQUE27,  4, 6},
+    {WEDGE_OBLIQUE153, 4, 2},
+    {WEDGE_OBLIQUE153, 4, 6},
+    {WEDGE_OBLIQUE63,  2, 4},
+    {WEDGE_OBLIQUE63,  6, 4},
+    {WEDGE_OBLIQUE117, 2, 4},
+    {WEDGE_OBLIQUE117, 6, 4},
 };
 
-static const int wedge_params_big_heqw[1 << WEDGE_BITS_BIG]
-                                      [WEDGE_PARMS] = {
-    {WEDGE_OBLIQUE,  1, 1, 4, 4},
-    {WEDGE_OBLIQUE,  1, 0, 4, 4},
-    {WEDGE_OBLIQUE,  0, 1, 4, 4},
-    {WEDGE_OBLIQUE,  0, 0, 4, 4},
-
-    {WEDGE_OBLIQUE,  1, 1, 4, 2},
-    {WEDGE_OBLIQUE,  1, 1, 4, 6},
-    {WEDGE_OBLIQUE,  1, 0, 4, 2},
-    {WEDGE_OBLIQUE,  1, 0, 4, 6},
-
-    {WEDGE_OBLIQUE,  0, 1, 2, 4},
-    {WEDGE_OBLIQUE,  0, 1, 6, 4},
-    {WEDGE_OBLIQUE,  0, 0, 2, 4},
-    {WEDGE_OBLIQUE,  0, 0, 6, 4},
-
-    {WEDGE_STRAIGHT, 1, 0, 4, 2},
-    {WEDGE_STRAIGHT, 1, 0, 4, 6},
-    {WEDGE_STRAIGHT, 0, 0, 2, 4},
-    {WEDGE_STRAIGHT, 0, 0, 6, 4},
-};
-
-static const int *get_wedge_params_lookup[BLOCK_SIZES] = {
-  NULL,
-  NULL,
-  NULL,
-  &wedge_params_sml[0][0],
-  &wedge_params_med_hgtw[0][0],
-  &wedge_params_med_hltw[0][0],
-  &wedge_params_med_heqw[0][0],
-  &wedge_params_med_hgtw[0][0],
-  &wedge_params_med_hltw[0][0],
-  &wedge_params_med_heqw[0][0],
-  &wedge_params_big_hgtw[0][0],
-  &wedge_params_big_hltw[0][0],
-  &wedge_params_big_heqw[0][0],
+const wedge_params_type wedge_params_lookup[BLOCK_SIZES] = {
+  {0, NULL, NULL, 0},
+  {0, NULL, NULL, 0},
+  {0, NULL, NULL, 0},
+  {4, wedge_codebook_16_heqw, wedge_signflip_lookup[3], 0},
+  {4, wedge_codebook_16_hgtw, wedge_signflip_lookup[4], 0},
+  {4, wedge_codebook_16_hltw, wedge_signflip_lookup[5], 0},
+  {4, wedge_codebook_16_heqw, wedge_signflip_lookup[6], 0},
+  {4, wedge_codebook_16_hgtw, wedge_signflip_lookup[7], 0},
+  {4, wedge_codebook_16_hltw, wedge_signflip_lookup[8], 0},
+  {4, wedge_codebook_16_heqw, wedge_signflip_lookup[9], 0},
+  {0, wedge_codebook_8_hgtw, wedge_signflip_lookup[10], 0},
+  {0, wedge_codebook_8_hltw, wedge_signflip_lookup[11], 0},
+  {0, wedge_codebook_8_heqw, wedge_signflip_lookup[12], 0},
 #if CONFIG_EXT_PARTITION
-  &wedge_params_big_hgtw[0][0],
-  &wedge_params_big_hltw[0][0],
-  &wedge_params_big_heqw[0][0],
+  {0, NULL, NULL, 0},
+  {0, NULL, NULL, 0},
+  {0, NULL, NULL, 0},
 #endif  // CONFIG_EXT_PARTITION
 };
 
-static const int *get_wedge_params(int wedge_index,
-                                   BLOCK_SIZE sb_type) {
-  const int *a = NULL;
-  if (wedge_index != WEDGE_NONE) {
-    return get_wedge_params_lookup[sb_type] + WEDGE_PARMS * wedge_index;
-  }
-  return a;
-}
+#else
+
+static const wedge_code_type wedge_codebook_32_hgtw[32] = {
+    {WEDGE_OBLIQUE27,  4, 4},
+    {WEDGE_OBLIQUE63,  4, 4},
+    {WEDGE_OBLIQUE117, 4, 4},
+    {WEDGE_OBLIQUE153, 4, 4},
+    {WEDGE_HORIZONTAL, 4, 2},
+    {WEDGE_HORIZONTAL, 4, 4},
+    {WEDGE_HORIZONTAL, 4, 6},
+    {WEDGE_VERTICAL,   4, 4},
+    {WEDGE_OBLIQUE27,  4, 1},
+    {WEDGE_OBLIQUE27,  4, 2},
+    {WEDGE_OBLIQUE27,  4, 3},
+    {WEDGE_OBLIQUE27,  4, 5},
+    {WEDGE_OBLIQUE27,  4, 6},
+    {WEDGE_OBLIQUE27,  4, 7},
+    {WEDGE_OBLIQUE153, 4, 1},
+    {WEDGE_OBLIQUE153, 4, 2},
+    {WEDGE_OBLIQUE153, 4, 3},
+    {WEDGE_OBLIQUE153, 4, 5},
+    {WEDGE_OBLIQUE153, 4, 6},
+    {WEDGE_OBLIQUE153, 4, 7},
+    {WEDGE_OBLIQUE63,  1, 4},
+    {WEDGE_OBLIQUE63,  2, 4},
+    {WEDGE_OBLIQUE63,  3, 4},
+    {WEDGE_OBLIQUE63,  5, 4},
+    {WEDGE_OBLIQUE63,  6, 4},
+    {WEDGE_OBLIQUE63,  7, 4},
+    {WEDGE_OBLIQUE117, 1, 4},
+    {WEDGE_OBLIQUE117, 2, 4},
+    {WEDGE_OBLIQUE117, 3, 4},
+    {WEDGE_OBLIQUE117, 5, 4},
+    {WEDGE_OBLIQUE117, 6, 4},
+    {WEDGE_OBLIQUE117, 7, 4},
+};
+
+static const wedge_code_type wedge_codebook_32_hltw[32] = {
+    {WEDGE_OBLIQUE27,  4, 4},
+    {WEDGE_OBLIQUE63,  4, 4},
+    {WEDGE_OBLIQUE117, 4, 4},
+    {WEDGE_OBLIQUE153, 4, 4},
+    {WEDGE_VERTICAL,   2, 4},
+    {WEDGE_VERTICAL,   4, 4},
+    {WEDGE_VERTICAL,   6, 4},
+    {WEDGE_HORIZONTAL, 4, 4},
+    {WEDGE_OBLIQUE27,  4, 1},
+    {WEDGE_OBLIQUE27,  4, 2},
+    {WEDGE_OBLIQUE27,  4, 3},
+    {WEDGE_OBLIQUE27,  4, 5},
+    {WEDGE_OBLIQUE27,  4, 6},
+    {WEDGE_OBLIQUE27,  4, 7},
+    {WEDGE_OBLIQUE153, 4, 1},
+    {WEDGE_OBLIQUE153, 4, 2},
+    {WEDGE_OBLIQUE153, 4, 3},
+    {WEDGE_OBLIQUE153, 4, 5},
+    {WEDGE_OBLIQUE153, 4, 6},
+    {WEDGE_OBLIQUE153, 4, 7},
+    {WEDGE_OBLIQUE63,  1, 4},
+    {WEDGE_OBLIQUE63,  2, 4},
+    {WEDGE_OBLIQUE63,  3, 4},
+    {WEDGE_OBLIQUE63,  5, 4},
+    {WEDGE_OBLIQUE63,  6, 4},
+    {WEDGE_OBLIQUE63,  7, 4},
+    {WEDGE_OBLIQUE117, 1, 4},
+    {WEDGE_OBLIQUE117, 2, 4},
+    {WEDGE_OBLIQUE117, 3, 4},
+    {WEDGE_OBLIQUE117, 5, 4},
+    {WEDGE_OBLIQUE117, 6, 4},
+    {WEDGE_OBLIQUE117, 7, 4},
+};
+
+static const wedge_code_type wedge_codebook_32_heqw[32] = {
+    {WEDGE_OBLIQUE27,  4, 4},
+    {WEDGE_OBLIQUE63,  4, 4},
+    {WEDGE_OBLIQUE117, 4, 4},
+    {WEDGE_OBLIQUE153, 4, 4},
+    {WEDGE_HORIZONTAL, 4, 2},
+    {WEDGE_HORIZONTAL, 4, 6},
+    {WEDGE_VERTICAL,   2, 4},
+    {WEDGE_VERTICAL,   6, 4},
+    {WEDGE_OBLIQUE27,  4, 1},
+    {WEDGE_OBLIQUE27,  4, 2},
+    {WEDGE_OBLIQUE27,  4, 3},
+    {WEDGE_OBLIQUE27,  4, 5},
+    {WEDGE_OBLIQUE27,  4, 6},
+    {WEDGE_OBLIQUE27,  4, 7},
+    {WEDGE_OBLIQUE153, 4, 1},
+    {WEDGE_OBLIQUE153, 4, 2},
+    {WEDGE_OBLIQUE153, 4, 3},
+    {WEDGE_OBLIQUE153, 4, 5},
+    {WEDGE_OBLIQUE153, 4, 6},
+    {WEDGE_OBLIQUE153, 4, 7},
+    {WEDGE_OBLIQUE63,  1, 4},
+    {WEDGE_OBLIQUE63,  2, 4},
+    {WEDGE_OBLIQUE63,  3, 4},
+    {WEDGE_OBLIQUE63,  5, 4},
+    {WEDGE_OBLIQUE63,  6, 4},
+    {WEDGE_OBLIQUE63,  7, 4},
+    {WEDGE_OBLIQUE117, 1, 4},
+    {WEDGE_OBLIQUE117, 2, 4},
+    {WEDGE_OBLIQUE117, 3, 4},
+    {WEDGE_OBLIQUE117, 5, 4},
+    {WEDGE_OBLIQUE117, 6, 4},
+    {WEDGE_OBLIQUE117, 7, 4},
+};
+
+const wedge_params_type wedge_params_lookup[BLOCK_SIZES] = {
+  {0, NULL, NULL, 0},
+  {0, NULL, NULL, 0},
+  {0, NULL, NULL, 0},
+  {5, wedge_codebook_32_heqw, wedge_signflip_lookup[3], 0},
+  {5, wedge_codebook_32_hgtw, wedge_signflip_lookup[4], 0},
+  {5, wedge_codebook_32_hltw, wedge_signflip_lookup[5], 0},
+  {5, wedge_codebook_32_heqw, wedge_signflip_lookup[6], 0},
+  {5, wedge_codebook_32_hgtw, wedge_signflip_lookup[7], 0},
+  {5, wedge_codebook_32_hltw, wedge_signflip_lookup[8], 0},
+  {5, wedge_codebook_32_heqw, wedge_signflip_lookup[9], 0},
+  {0, wedge_codebook_8_hgtw, wedge_signflip_lookup[10], 0},
+  {0, wedge_codebook_8_hltw, wedge_signflip_lookup[11], 0},
+  {0, wedge_codebook_8_heqw, wedge_signflip_lookup[12], 0},
+#if CONFIG_EXT_PARTITION
+  {0, NULL, NULL, 0},
+  {0, NULL, NULL, 0},
+  {0, NULL, NULL, 0},
+#endif  // CONFIG_EXT_PARTITION
+};
+#endif  // USE_LARGE_WEDGE_CODEBOOK
 
 static const uint8_t *get_wedge_mask_inplace(int wedge_index,
                                              int neg,
@@ -241,14 +312,17 @@
   const uint8_t *master;
   const int bh = 4 << b_height_log2_lookup[sb_type];
   const int bw = 4 << b_width_log2_lookup[sb_type];
-  const int *a = get_wedge_params(wedge_index, sb_type);
+  const wedge_code_type *a =
+      wedge_params_lookup[sb_type].codebook + wedge_index;
+  const int smoother = wedge_params_lookup[sb_type].smoother;
   int woff, hoff;
-  if (!a) return NULL;
-  woff = (a[3] * bw) >> 3;
-  hoff = (a[4] * bh) >> 3;
-  master = (a[0] ?
-            wedge_mask_obl[neg][a[1]][a[2]] :
-            wedge_mask_str[neg][a[1]]) +
+  const uint8_t wsignflip = wedge_params_lookup[sb_type].signflip[wedge_index];
+
+  assert(wedge_index >= 0 &&
+         wedge_index < (1 << get_wedge_bits_lookup(sb_type)));
+  woff = (a->x_offset * bw) >> 3;
+  hoff = (a->y_offset * bh) >> 3;
+  master = wedge_mask_obl[smoother][neg ^ wsignflip][a->direction] +
       MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) +
       MASK_MASTER_SIZE / 2 - woff;
   return master;
@@ -266,124 +340,76 @@
   return mask;
 }
 
-static void build_masked_compound(uint8_t *dst, int dst_stride,
-                                  uint8_t *dst1, int dst1_stride,
-                                  uint8_t *dst2, int dst2_stride,
-                                  const uint8_t *mask,
-                                  int h, int w, int subh, int subw) {
-  int i, j;
-  if (subw == 0 && subh == 0) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = mask[i * MASK_MASTER_STRIDE + j];
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-
-      }
-  } else if (subw == 1 && subh == 1) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = (mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j)] +
-                 mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j)] +
-                 mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j + 1)] +
-                 mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j + 1)] + 2) >> 2;
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
-  } else if (subw == 1 && subh == 0) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = (mask[i * MASK_MASTER_STRIDE + (2 * j)] +
-                 mask[i * MASK_MASTER_STRIDE + (2 * j + 1)] + 1) >> 1;
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
-  } else {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = (mask[(2 * i) * MASK_MASTER_STRIDE + j] +
-                 mask[(2 * i + 1) * MASK_MASTER_STRIDE + j] + 1) >> 1;
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
+// If the sign of a wedge mask is inconsistent across block sizes, flip the
+// sign flag so that a given wedge sign selects the same orientation for
+// every size. Done only once per wedge codebook.
+static void init_wedge_signs(void) {
+  BLOCK_SIZE sb_type;
+  memset(wedge_signflip_lookup, 0, sizeof(wedge_signflip_lookup));
+  for (sb_type = BLOCK_4X4; sb_type < BLOCK_SIZES; ++sb_type) {
+    const int bw = 4 * num_4x4_blocks_wide_lookup[sb_type];
+    const int bh = 4 * num_4x4_blocks_high_lookup[sb_type];
+    const wedge_params_type wedge_params = wedge_params_lookup[sb_type];
+    const int wbits = wedge_params.bits;
+    const int wtypes = 1 << wbits;
+    int i, w;
+    if (wbits == 0) continue;
+    for (w = 0; w < wtypes; ++w) {
+      const uint8_t *mask = get_wedge_mask_inplace(w, 0, sb_type);
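+      // Sample the top row and left column of the block-sized window into
+      // the master mask; if their mean weight falls below half scale
+      // (32 out of 64), the sign is recorded as flipped.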
+      int sum = 0;
+      for (i = 0; i < bw; ++i)
+        sum += mask[i];
+      for (i = 0; i < bh; ++i)
+        sum += mask[i * MASK_MASTER_STRIDE];
+      sum = (sum + (bw + bh) / 2) / (bw + bh);
+      wedge_params.signflip[w] = (sum < 32);
+    }
   }
 }
 
-#if CONFIG_VP9_HIGHBITDEPTH
-static void build_masked_compound_highbd(uint8_t *dst_8, int dst_stride,
-                                         uint8_t *dst1_8, int dst1_stride,
-                                         uint8_t *dst2_8, int dst2_stride,
-                                         const uint8_t *mask,
-                                         int h, int w, int subh, int subw) {
-  int i, j;
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
-  uint16_t *dst1 = CONVERT_TO_SHORTPTR(dst1_8);
-  uint16_t *dst2 = CONVERT_TO_SHORTPTR(dst2_8);
-  if (subw == 0 && subh == 0) {
+// Equation of line: f(x, y) = a[0]*x + a[1]*y = 0, with the origin at the
+// center of the master mask.
+void vp10_init_wedge_masks() {
+  int i, j, s;
+  const int w = MASK_MASTER_SIZE;
+  const int h = MASK_MASTER_SIZE;
+  const int stride = MASK_MASTER_STRIDE;
+  const int a[2] = {2, 1};
+  const double asqrt = sqrt(a[0] * a[0] + a[1] * a[1]);
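+  // Dividing by |a| = sqrt(a[0]^2 + a[1]^2) makes m the (doubled) Euclidean
+  // distance of the sample from the line, so the smoother's transition band
+  // has a comparable width for every wedge direction.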
+  for (s = 0; s < NSMOOTHERS; s++) {
     for (i = 0; i < h; ++i)
       for (j = 0; j < w; ++j) {
-        int m = mask[i * MASK_MASTER_STRIDE + j];
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
-  } else if (subw == 1 && subh == 1) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = (mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j)] +
-                 mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j)] +
-                 mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j + 1)] +
-                 mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j + 1)] + 2) >> 2;
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
-  } else if (subw == 1 && subh == 0) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = (mask[i * MASK_MASTER_STRIDE + (2 * j)] +
-                 mask[i * MASK_MASTER_STRIDE + (2 * j + 1)] + 1) >> 1;
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
-  } else {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = (mask[(2 * i) * MASK_MASTER_STRIDE + j] +
-                 mask[(2 * i + 1) * MASK_MASTER_STRIDE + j] + 1) >> 1;
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
+        int x = (2 * j + 1 - w);
+        int y = (2 * i + 1 - h);
+        int m = (int)rint((a[0] * x + a[1] * y) / asqrt);
+        wedge_mask_obl[s][1][WEDGE_OBLIQUE63][i * stride + j] =
+        wedge_mask_obl[s][1][WEDGE_OBLIQUE27][j * stride + i] =
+            get_masked_weight(m, s);
+        wedge_mask_obl[s][1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
+        wedge_mask_obl[s][1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
+            (1 << WEDGE_WEIGHT_BITS) - get_masked_weight(m, s);
+        wedge_mask_obl[s][0][WEDGE_OBLIQUE63][i * stride + j] =
+        wedge_mask_obl[s][0][WEDGE_OBLIQUE27][j * stride + i] =
+            (1 << WEDGE_WEIGHT_BITS) - get_masked_weight(m, s);
+        wedge_mask_obl[s][0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
+        wedge_mask_obl[s][0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
+            get_masked_weight(m, s);
+        wedge_mask_obl[s][1][WEDGE_VERTICAL][i * stride + j] =
+        wedge_mask_obl[s][1][WEDGE_HORIZONTAL][j * stride + i] =
+            get_masked_weight(x, s);
+        wedge_mask_obl[s][0][WEDGE_VERTICAL][i * stride + j] =
+        wedge_mask_obl[s][0][WEDGE_HORIZONTAL][j * stride + i] =
+            (1 << WEDGE_WEIGHT_BITS) - get_masked_weight(x, s);
       }
   }
+  init_wedge_signs();
 }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 
 #if CONFIG_SUPERTX
 static void build_masked_compound_wedge_extend(
     uint8_t *dst, int dst_stride,
-    uint8_t *dst2, int dst2_stride,
+    uint8_t *src0, int src0_stride,
+    uint8_t *src1, int src1_stride,
     int wedge_index,
     int wedge_sign,
     BLOCK_SIZE sb_type,
@@ -393,33 +419,39 @@
   const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
   const uint8_t *mask = vp10_get_soft_mask(
      wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
-  build_masked_compound(dst, dst_stride,
-                        dst, dst_stride, dst2, dst2_stride, mask,
-                        h, w, subh, subw);
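+  // vpx_blend_mask6() replaces the removed build_masked_compound() loops:
+  // dst = ROUND_POWER_OF_TWO(src0 * m + src1 * ((1 << WEDGE_WEIGHT_BITS) - m),
+  //                          WEDGE_WEIGHT_BITS), with the mask optionally
+  // subsampled according to subh/subw.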
+  vpx_blend_mask6(dst, dst_stride,
+                  src0, src0_stride,
+                  src1, src1_stride,
+                  mask, MASK_MASTER_STRIDE,
+                  h, w, subh, subw);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static void build_masked_compound_wedge_extend_highbd(
     uint8_t *dst_8, int dst_stride,
-    uint8_t *dst2_8, int dst2_stride,
+    uint8_t *src0_8, int src0_stride,
+    uint8_t *src1_8, int src1_stride,
     int wedge_index, int wedge_sign,
     BLOCK_SIZE sb_type,
     int wedge_offset_x, int wedge_offset_y,
-    int h, int w) {
+    int h, int w, int bd) {
   const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
   const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
   const uint8_t *mask = vp10_get_soft_mask(
       wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
-  build_masked_compound_highbd(dst_8, dst_stride,
-                               dst_8, dst_stride, dst2_8, dst2_stride, mask,
-                               h, w, subh, subw);
+  vpx_highbd_blend_mask6(dst_8, dst_stride,
+                         src0_8, src0_stride,
+                         src1_8, src1_stride,
+                         mask, MASK_MASTER_STRIDE,
+                         h, w, subh, subw, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #else   // CONFIG_SUPERTX
 
 static void build_masked_compound_wedge(uint8_t *dst, int dst_stride,
-                                        uint8_t *dst2, int dst2_stride,
+                                        uint8_t *src0, int src0_stride,
+                                        uint8_t *src1, int src1_stride,
                                         int wedge_index, int wedge_sign,
                                         BLOCK_SIZE sb_type,
                                         int h, int w) {
@@ -429,26 +461,31 @@
   const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
   const uint8_t *mask = vp10_get_soft_mask(wedge_index, wedge_sign,
                                            sb_type, 0, 0);
-  build_masked_compound(dst, dst_stride,
-                        dst, dst_stride, dst2, dst2_stride, mask,
-                        h, w, subh, subw);
+  vpx_blend_mask6(dst, dst_stride,
+                  src0, src0_stride,
+                  src1, src1_stride,
+                  mask, MASK_MASTER_STRIDE,
+                  h, w, subh, subw);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static void build_masked_compound_wedge_highbd(uint8_t *dst_8, int dst_stride,
-                                               uint8_t *dst2_8, int dst2_stride,
+                                               uint8_t *src0_8, int src0_stride,
+                                               uint8_t *src1_8, int src1_stride,
                                                int wedge_index, int wedge_sign,
                                                BLOCK_SIZE sb_type,
-                                               int h, int w) {
+                                               int h, int w, int bd) {
   // Derive subsampling from h and w passed in. May be refactored to
   // pass in subsampling factors directly.
   const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
   const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
   const uint8_t *mask = vp10_get_soft_mask(wedge_index, wedge_sign,
                                            sb_type, 0, 0);
-  build_masked_compound_highbd(dst_8, dst_stride,
-                               dst_8, dst_stride, dst2_8, dst2_stride, mask,
-                               h, w, subh, subw);
+  vpx_highbd_blend_mask6(dst_8, dst_stride,
+                         src0_8, src0_stride,
+                         src1_8, src1_stride,
+                         mask, MASK_MASTER_STRIDE,
+                         h, w, subh, subw, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_SUPERTX
@@ -493,14 +530,18 @@
 #if CONFIG_SUPERTX
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
     build_masked_compound_wedge_extend_highbd(
-        dst, dst_stride, tmp_dst, MAX_SB_SIZE,
+        dst, dst_stride,
+        dst, dst_stride,
+        tmp_dst, MAX_SB_SIZE,
         mi->mbmi.interinter_wedge_index,
         mi->mbmi.interinter_wedge_sign,
         mi->mbmi.sb_type,
-        wedge_offset_x, wedge_offset_y, h, w);
+        wedge_offset_x, wedge_offset_y, h, w, xd->bd);
   else
     build_masked_compound_wedge_extend(
-        dst, dst_stride, tmp_dst, MAX_SB_SIZE,
+        dst, dst_stride,
+        dst, dst_stride,
+        tmp_dst, MAX_SB_SIZE,
         mi->mbmi.interinter_wedge_index,
         mi->mbmi.interinter_wedge_sign,
         mi->mbmi.sb_type,
@@ -508,13 +549,17 @@
 #else
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
     build_masked_compound_wedge_highbd(
-        dst, dst_stride, tmp_dst, MAX_SB_SIZE,
+        dst, dst_stride,
+        dst, dst_stride,
+        tmp_dst, MAX_SB_SIZE,
         mi->mbmi.interinter_wedge_index,
         mi->mbmi.interinter_wedge_sign,
-        mi->mbmi.sb_type, h, w);
+        mi->mbmi.sb_type, h, w, xd->bd);
   else
     build_masked_compound_wedge(
-        dst, dst_stride, tmp_dst, MAX_SB_SIZE,
+        dst, dst_stride,
+        dst, dst_stride,
+        tmp_dst, MAX_SB_SIZE,
         mi->mbmi.interinter_wedge_index,
         mi->mbmi.interinter_wedge_sign,
         mi->mbmi.sb_type, h, w);
@@ -526,14 +571,18 @@
                             tmp_ipf, xs, ys, xd);
 #if CONFIG_SUPERTX
   build_masked_compound_wedge_extend(
-      dst, dst_stride, tmp_dst, MAX_SB_SIZE,
+      dst, dst_stride,
+      dst, dst_stride,
+      tmp_dst, MAX_SB_SIZE,
       mi->mbmi.interinter_wedge_index,
       mi->mbmi.interinter_wedge_sign,
       mi->mbmi.sb_type,
       wedge_offset_x, wedge_offset_y, h, w);
 #else
   build_masked_compound_wedge(
-      dst, dst_stride, tmp_dst, MAX_SB_SIZE,
+      dst, dst_stride,
+      dst, dst_stride,
+      tmp_dst, MAX_SB_SIZE,
       mi->mbmi.interinter_wedge_index,
       mi->mbmi.interinter_wedge_sign,
       mi->mbmi.sb_type, h, w);
@@ -615,6 +664,74 @@
   const int is_compound = has_second_ref(&mi->mbmi);
   int ref;
 
+#if CONFIG_DUAL_FILTER
+  if (mi->mbmi.sb_type < BLOCK_8X8 && plane > 0) {
+    int blk_num = 1 << (pd->subsampling_x + pd->subsampling_y);
+    int chr_idx;
+    int x_base = x;
+    int y_base = y;
+    int x_step = w >> pd->subsampling_x;
+    int y_step = h >> pd->subsampling_y;
+
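+    // With CONFIG_DUAL_FILTER, sub-8x8 chroma is built from up to four
+    // chroma sub-blocks (blk_num of them), each reusing the motion vector
+    // of the corresponding 4x4 luma block in mi->bmi[].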
+    for (chr_idx = 0; chr_idx < blk_num; ++chr_idx) {
+      for (ref = 0; ref < 1 + is_compound; ++ref) {
+        const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+        struct buf_2d *const pre_buf = &pd->pre[ref];
+        struct buf_2d *const dst_buf = &pd->dst;
+        uint8_t *dst = dst_buf->buf;
+        const MV mv = mi->bmi[chr_idx].as_mv[ref].as_mv;
+        const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
+                                                   pd->subsampling_x,
+                                                   pd->subsampling_y);
+        uint8_t *pre;
+        MV32 scaled_mv;
+        int xs, ys, subpel_x, subpel_y;
+        const int is_scaled = vp10_is_scaled(sf);
+
+        x = x_base + (chr_idx & 0x01) * x_step;
+        y = y_base + (chr_idx >> 1) * y_step;
+
+        dst += dst_buf->stride * y + x;
+
+        if (is_scaled) {
+          pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
+          scaled_mv = vp10_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+          xs = sf->x_step_q4;
+          ys = sf->y_step_q4;
+        } else {
+          pre = pre_buf->buf + y * pre_buf->stride + x;
+          scaled_mv.row = mv_q4.row;
+          scaled_mv.col = mv_q4.col;
+          xs = ys = 16;
+        }
+
+        subpel_x = scaled_mv.col & SUBPEL_MASK;
+        subpel_y = scaled_mv.row & SUBPEL_MASK;
+        pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride
+               + (scaled_mv.col >> SUBPEL_BITS);
+
+#if CONFIG_EXT_INTER
+        if (ref && is_interinter_wedge_used(mi->mbmi.sb_type) &&
+            mi->mbmi.use_wedge_interinter)
+          vp10_make_masked_inter_predictor(
+              pre, pre_buf->stride, dst, dst_buf->stride,
+              subpel_x, subpel_y, sf, w, h,
+              mi->mbmi.interp_filter, xs, ys,
+#if CONFIG_SUPERTX
+              wedge_offset_x, wedge_offset_y,
+#endif  // CONFIG_SUPERTX
+              xd);
+        else
+#endif  // CONFIG_EXT_INTER
+          vp10_make_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+                                    subpel_x, subpel_y, sf, x_step, y_step, ref,
+                                    mi->mbmi.interp_filter, xs, ys, xd);
+      }
+    }
+    return;
+  }
+#endif
+
   for (ref = 0; ref < 1 + is_compound; ++ref) {
     const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
     struct buf_2d *const pre_buf = &pd->pre[ref];
@@ -1728,10 +1845,11 @@
                                                bsize, 0, 0);
       const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
       const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
-      build_masked_compound(comppred, compstride,
-                            intrapred, intrastride,
-                            interpred, interstride, mask,
-                            bh, bw, subh, subw);
+      vpx_blend_mask6(comppred, compstride,
+                      intrapred, intrastride,
+                      interpred, interstride,
+                      mask, MASK_MASTER_STRIDE,
+                      bh, bw, subh, subw);
     }
     return;
   }
@@ -1851,7 +1969,6 @@
   uint16_t *comppred = CONVERT_TO_SHORTPTR(comppred8);
   uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8);
   uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8);
-  (void) bd;
 
   if (use_wedge_interintra) {
     if (is_interintra_wedge_used(bsize)) {
@@ -1859,10 +1976,11 @@
                                                bsize, 0, 0);
       const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
       const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
-      build_masked_compound_highbd(comppred8, compstride,
-                                   intrapred8, intrastride,
-                                   interpred8, interstride, mask,
-                                   bh, bw, subh, subw);
+      vpx_highbd_blend_mask6(comppred8, compstride,
+                             intrapred8, intrastride,
+                             interpred8, interstride,
+                             mask, MASK_MASTER_STRIDE,
+                             bh, bw, subh, subw, bd);
     }
     return;
   }
@@ -2263,113 +2381,74 @@
                                                  int ext_dst_stride0,
                                                  uint8_t *ext_dst1,
                                                  int ext_dst_stride1) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  const MODE_INFO *mi = xd->mi[0];
-  const int is_compound = has_second_ref(&mi->mbmi);
-  int ref;
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const int is_compound = has_second_ref(mbmi);
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  struct buf_2d *const dst_buf = &pd->dst;
+  uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+
   (void) block;
   (void) bw;
   (void) bh;
   (void) mi_x;
   (void) mi_y;
 
-  for (ref = 0; ref < 1 + is_compound; ++ref) {
-    struct buf_2d *const dst_buf = &pd->dst;
-    uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
-
-    if (ref && is_interinter_wedge_used(mi->mbmi.sb_type)
-        && mi->mbmi.use_wedge_interinter) {
-#if CONFIG_VP9_HIGHBITDEPTH
-      DECLARE_ALIGNED(16, uint8_t, tmp_dst_[2 * MAX_SB_SQUARE]);
-      uint8_t *tmp_dst =
-          (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
-          CONVERT_TO_BYTEPTR(tmp_dst_) : tmp_dst_;
-#else
-      DECLARE_ALIGNED(16, uint8_t, tmp_dst[MAX_SB_SQUARE]);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-#if CONFIG_VP9_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-          int k;
-          for (k = 0; k < h; ++k)
-            memcpy(tmp_dst_ + 2 * MAX_SB_SIZE * k, ext_dst1 +
-                   ext_dst_stride1 * 2 * k, w * 2);
-        } else {
-          int k;
-          for (k = 0; k < h; ++k)
-            memcpy(tmp_dst_ + MAX_SB_SIZE * k, ext_dst1 +
-                   ext_dst_stride1 * k, w);
-        }
-#else
-        {
-          int k;
-          for (k = 0; k < h; ++k)
-            memcpy(tmp_dst + MAX_SB_SIZE * k, ext_dst1 +
-                   ext_dst_stride1 * k, w);
-        }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
+  if (is_compound &&
+      is_interinter_wedge_used(mbmi->sb_type) &&
+      mbmi->use_wedge_interinter) {
 #if CONFIG_SUPERTX
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        build_masked_compound_wedge_extend_highbd(
-            dst, dst_buf->stride, tmp_dst, MAX_SB_SIZE,
-            mi->mbmi.interinter_wedge_index,
-            mi->mbmi.interinter_wedge_sign,
-            mi->mbmi.sb_type,
-            wedge_offset_x, wedge_offset_y, h, w);
-      } else {
-        build_masked_compound_wedge_extend(
-            dst, dst_buf->stride, tmp_dst, MAX_SB_SIZE,
-            mi->mbmi.interinter_wedge_index,
-            mi->mbmi.interinter_wedge_sign,
-            mi->mbmi.sb_type,
-            wedge_offset_x, wedge_offset_y, h, w);
-      }
-#else
-      build_masked_compound_wedge_extend(dst, dst_buf->stride,
-                                         tmp_dst, MAX_SB_SIZE,
-                                         mi->mbmi.interinter_wedge_index,
-                                         mi->mbmi.interinter_wedge_sign,
-                                         mi->mbmi.sb_type,
-                                         wedge_offset_x, wedge_offset_y, h, w);
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      build_masked_compound_wedge_extend_highbd(
+          dst, dst_buf->stride,
+          CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+          CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1,
+          mbmi->interinter_wedge_index,
+          mbmi->interinter_wedge_sign,
+          mbmi->sb_type,
+          wedge_offset_x, wedge_offset_y, h, w,
+          xd->bd);
+    else
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+      build_masked_compound_wedge_extend(
+          dst, dst_buf->stride,
+          ext_dst0, ext_dst_stride0,
+          ext_dst1, ext_dst_stride1,
+          mbmi->interinter_wedge_index,
+          mbmi->interinter_wedge_sign,
+          mbmi->sb_type,
+          wedge_offset_x, wedge_offset_y, h, w);
 #else   // CONFIG_SUPERTX
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-        build_masked_compound_wedge_highbd(dst, dst_buf->stride, tmp_dst,
-                                           MAX_SB_SIZE,
-                                           mi->mbmi.interinter_wedge_index,
-                                           mi->mbmi.interinter_wedge_sign,
-                                           mi->mbmi.sb_type, h, w);
-      else
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      build_masked_compound_wedge_highbd(
+          dst, dst_buf->stride,
+          CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+          CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1,
+          mbmi->interinter_wedge_index,
+          mbmi->interinter_wedge_sign,
+          mbmi->sb_type, h, w,
+          xd->bd);
+    else
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-        build_masked_compound_wedge(dst, dst_buf->stride, tmp_dst, MAX_SB_SIZE,
-                                    mi->mbmi.interinter_wedge_index,
-                                    mi->mbmi.interinter_wedge_sign,
-                                    mi->mbmi.sb_type, h, w);
+      build_masked_compound_wedge(
+          dst, dst_buf->stride,
+          ext_dst0, ext_dst_stride0,
+          ext_dst1, ext_dst_stride1,
+          mbmi->interinter_wedge_index,
+          mbmi->interinter_wedge_sign,
+          mbmi->sb_type, h, w);
 #endif  // CONFIG_SUPERTX
-    } else {
+  } else {
 #if CONFIG_VP9_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-          int k;
-          for (k = 0; k < h; ++k)
-            memcpy(CONVERT_TO_SHORTPTR(dst + dst_buf->stride * k),
-                   ext_dst0 + ext_dst_stride0 * 2 * k, w * 2);
-        } else {
-          int k;
-          for (k = 0; k < h; ++k)
-            memcpy(dst + dst_buf->stride * k,
-                   ext_dst0 + ext_dst_stride0 * k, w);
-        }
-#else
-        {
-          int k;
-          for (k = 0; k < h; ++k)
-            memcpy(dst + dst_buf->stride * k,
-                   ext_dst0 + ext_dst_stride0 * k, w);
-        }
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      vpx_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+                               dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
+                               xd->bd);
+    else
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    }
+      vpx_convolve_copy(ext_dst0, ext_dst_stride0,
+                        dst, dst_buf->stride, NULL, 0, NULL, 0, w, h);
   }
 }
 
diff --git a/vp10/common/reconinter.h b/vp10/common/reconinter.h
index 1e8679b..5d9a6f9 100644
--- a/vp10/common/reconinter.h
+++ b/vp10/common/reconinter.h
@@ -44,7 +44,8 @@
 
 #if CONFIG_DUAL_FILTER
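+  // The w > 2 && h > 2 guard added below keeps 2-pixel-wide/tall blocks
+  // (which the new sub-8x8 chroma prediction path can produce) out of the
+  // SUBPEL_TAPS fast path; they take the generic convolve instead.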
   if (interp_filter_params_x.taps == SUBPEL_TAPS &&
-      interp_filter_params_y.taps == SUBPEL_TAPS) {
+      interp_filter_params_y.taps == SUBPEL_TAPS &&
+      w > 2 && h > 2) {
     const int16_t *kernel_x =
         vp10_get_interp_filter_subpel_kernel(interp_filter_params_x, subpel_x);
     const int16_t *kernel_y =
@@ -106,7 +107,8 @@
 
 #if CONFIG_DUAL_FILTER
   if (interp_filter_params_x.taps == SUBPEL_TAPS &&
-      interp_filter_params_y.taps == SUBPEL_TAPS) {
+      interp_filter_params_y.taps == SUBPEL_TAPS &&
+      w > 2 && h > 2) {
     const int16_t *kernel_x =
         vp10_get_interp_filter_subpel_kernel(interp_filter_params_x, subpel_x);
     const int16_t *kernel_y =
@@ -146,6 +148,64 @@
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+#if CONFIG_EXT_INTER
+#define MAX_WEDGE_TYPES   (1 << 5)
+
+#define WEDGE_WEIGHT_BITS 6
+
+#define WEDGE_NONE       -1
+
+// Angles are measured anti-clockwise from the horizontal axis.
+typedef enum {
+  WEDGE_HORIZONTAL = 0,
+  WEDGE_VERTICAL = 1,
+  WEDGE_OBLIQUE27 = 2,
+  WEDGE_OBLIQUE63 = 3,
+  WEDGE_OBLIQUE117 = 4,
+  WEDGE_OBLIQUE153 = 5,
+  WEDGE_DIRECTIONS
+} WedgeDirectionType;
+
+// 3-tuple: {direction, x_offset, y_offset}
+typedef struct {
+  WedgeDirectionType direction;
+  int x_offset;
+  int y_offset;
+} wedge_code_type;
+
+typedef struct {
+  int bits;
+  const wedge_code_type *codebook;
+  uint8_t *signflip;
+  int smoother;
+} wedge_params_type;
+
+extern const wedge_params_type wedge_params_lookup[BLOCK_SIZES];
+
+static INLINE int get_wedge_bits_lookup(BLOCK_SIZE sb_type) {
+  return wedge_params_lookup[sb_type].bits;
+}
+
+static INLINE int is_interinter_wedge_used(BLOCK_SIZE sb_type) {
+  return wedge_params_lookup[sb_type].bits > 0;
+}
+
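+// The extra bit relative to get_wedge_bits_lookup() presumably carries the
+// wedge sign signalled alongside the inter-inter wedge index.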
+static INLINE int get_interinter_wedge_bits(BLOCK_SIZE sb_type) {
+  const int wbits = wedge_params_lookup[sb_type].bits;
+  return (wbits > 0) ? wbits + 1 : 0;
+}
+
+static INLINE int is_interintra_wedge_used(BLOCK_SIZE sb_type) {
+  return wedge_params_lookup[sb_type].bits > 0;
+}
+
+static INLINE int get_interintra_wedge_bits(BLOCK_SIZE sb_type) {
+  return wedge_params_lookup[sb_type].bits;
+}
+#endif  // CONFIG_EXT_INTER
+
 void build_inter_predictors(MACROBLOCKD *xd, int plane,
 #if CONFIG_OBMC
                             int mi_col_offset, int mi_row_offset,
@@ -385,10 +445,10 @@
 #if CONFIG_DUAL_FILTER
 // Detect whether the block has sub-pixel motion vectors
 // per component.
-static INLINE int has_subpel_mv_component(const MACROBLOCKD *const xd,
+static INLINE int has_subpel_mv_component(const MODE_INFO *const mi,
+                                          const MACROBLOCKD *const xd,
                                           int dir) {
-  MODE_INFO *const mi = xd->mi[0];
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
   int plane;
   int ref = (dir >> 1);
diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c
index 6b4a460..fa20f2c 100644
--- a/vp10/common/reconintra.c
+++ b/vp10/common/reconintra.c
@@ -391,7 +391,6 @@
 
 #if CONFIG_EXT_INTRA
 #define FILTER_INTRA_PREC_BITS 10
-#define FILTER_INTRA_ROUND_VAL 511
 
 static const uint8_t ext_intra_extend_modes[FILTER_INTRA_MODES] = {
   NEED_LEFT | NEED_ABOVE,      // FILTER_DC
@@ -774,9 +773,7 @@
     for (c = 1; c < 2 * bs + 1 - r; ++c) {
       ipred = c0 * pred[r - 1][c] + c1 * pred[r][c - 1] +
           c2 * pred[r - 1][c - 1] + c3 * pred[r - 1][c + 1];
-      pred[r][c] = ipred < 0 ?
-          -((-ipred + FILTER_INTRA_ROUND_VAL) >> FILTER_INTRA_PREC_BITS) :
-          ((ipred + FILTER_INTRA_ROUND_VAL) >> FILTER_INTRA_PREC_BITS);
+      pred[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS);
     }
 
   for (r = 0; r < bs; ++r) {
@@ -1050,9 +1047,7 @@
     for (c = 1; c < 2 * bs + 1 - r; ++c) {
       ipred = c0 * pred[r - 1][c] + c1 * pred[r][c - 1] +
           c2 * pred[r - 1][c - 1] + c3 * pred[r - 1][c + 1];
-      pred[r][c] = ipred < 0 ?
-          -((-ipred + FILTER_INTRA_ROUND_VAL) >> FILTER_INTRA_PREC_BITS) :
-          ((ipred + FILTER_INTRA_ROUND_VAL) >> FILTER_INTRA_PREC_BITS);
+      pred[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS);
     }
 
   for (r = 0; r < bs; ++r) {
diff --git a/vp10/common/vp10_convolve.c b/vp10/common/vp10_convolve.c
index 6514a60..d7e2eaf 100644
--- a/vp10/common/vp10_convolve.c
+++ b/vp10/common/vp10_convolve.c
@@ -142,13 +142,21 @@
                  MAX_BLOCK_WIDTH];
     int temp_stride = MAX_BLOCK_WIDTH;
 #if CONFIG_DUAL_FILTER
-    InterpFilterParams filter_params =
+    InterpFilterParams filter_params_x =
         vp10_get_interp_filter_params(interp_filter[1 + 2 * ref_idx]);
+    InterpFilterParams filter_params_y =
+        vp10_get_interp_filter_params(interp_filter[0 + 2 * ref_idx]);
+    InterpFilterParams filter_params = filter_params_x;
+
+    // The filter size determines how many reference pixels the second-stage
+    // (vertical) filtering needs; the two directions may require different
+    // filter sizes.
+    int filter_size = filter_params_y.taps;
 #else
     InterpFilterParams filter_params =
         vp10_get_interp_filter_params(interp_filter);
-#endif
     int filter_size = filter_params.taps;
+#endif
     int intermediate_height =
         (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
 
@@ -159,7 +167,7 @@
                    subpel_x_q4, x_step_q4, 0);
 
 #if CONFIG_DUAL_FILTER
-    filter_params = vp10_get_interp_filter_params(interp_filter[2 * ref_idx]);
+    filter_params = filter_params_y;
 #else
     filter_params = vp10_get_interp_filter_params(interp_filter);
 #endif
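The intermediate-height bound above (repeated in the high-bitdepth hunk below) reserves `filter_size` extra rows so the second-stage vertical filter, whose taps now come from `filter_params_y`, never reads past the horizontally filtered buffer. A worked sketch, assuming SUBPEL_BITS == 4 (1/16-pel steps):

```c
/* E.g. h = 8, y_step_q4 = 16, subpel_y_q4 = 0, filter_size = 8:
 * ((8 - 1) * 16 + 0) >> 4 gives 7 source rows spanned vertically,
 * plus 8 taps of context, so 15 intermediate rows are produced. */
static int intermediate_height_sketch(int h, int y_step_q4, int subpel_y_q4,
                                      int filter_size) {
  return (((h - 1) * y_step_q4 + subpel_y_q4) >> 4) + filter_size;
}
```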
@@ -312,13 +320,17 @@
     int temp_stride = MAX_BLOCK_WIDTH;
 
 #if CONFIG_DUAL_FILTER
-    InterpFilterParams filter_params =
+    InterpFilterParams filter_params_x =
         vp10_get_interp_filter_params(interp_filter[1 + 2 * ref_idx]);
+    InterpFilterParams filter_params_y =
+        vp10_get_interp_filter_params(interp_filter[0 + 2 * ref_idx]);
+    InterpFilterParams filter_params = filter_params_x;
+    int filter_size = filter_params_y.taps;
 #else
     InterpFilterParams filter_params =
         vp10_get_interp_filter_params(interp_filter);
-#endif
     int filter_size = filter_params.taps;
+#endif
 
     int intermediate_height =
         (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
@@ -328,9 +340,7 @@
                           filter_params, subpel_x_q4, x_step_q4, 0, bd);
 
 #if CONFIG_DUAL_FILTER
-    filter_params = vp10_get_interp_filter_params(interp_filter[2 * ref_idx]);
-#else
-    filter_params = vp10_get_interp_filter_params(interp_filter);
+    filter_params = filter_params_y;
 #endif
     filter_size = filter_params.taps;
     assert(filter_params.taps <= MAX_FILTER_TAP);
diff --git a/vp10/common/vp10_fwd_txfm2d.c b/vp10/common/vp10_fwd_txfm2d.c
index cd5ce71..91a5357 100644
--- a/vp10/common/vp10_fwd_txfm2d.c
+++ b/vp10/common/vp10_fwd_txfm2d.c
@@ -19,31 +19,22 @@
   switch (txfm_type) {
     case TXFM_TYPE_DCT4:
       return vp10_fdct4_new;
-      break;
     case TXFM_TYPE_DCT8:
       return vp10_fdct8_new;
-      break;
     case TXFM_TYPE_DCT16:
       return vp10_fdct16_new;
-      break;
     case TXFM_TYPE_DCT32:
       return vp10_fdct32_new;
-      break;
     case TXFM_TYPE_DCT64:
       return vp10_fdct64_new;
-      break;
     case TXFM_TYPE_ADST4:
       return vp10_fadst4_new;
-      break;
     case TXFM_TYPE_ADST8:
       return vp10_fadst8_new;
-      break;
     case TXFM_TYPE_ADST16:
       return vp10_fadst16_new;
-      break;
     case TXFM_TYPE_ADST32:
       return vp10_fadst32_new;
-      break;
     default:
       assert(0);
       return NULL;
@@ -51,180 +42,153 @@
 }
 
 static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
-                                const int stride, const TXFM_2D_CFG *cfg,
+                                const int stride, const TXFM_2D_FLIP_CFG *cfg,
                                 int32_t *buf) {
-  int i, j;
-  const int txfm_size = cfg->txfm_size;
-  const int8_t *shift = cfg->shift;
-  const int8_t *stage_range_col = cfg->stage_range_col;
-  const int8_t *stage_range_row = cfg->stage_range_row;
-  const int8_t *cos_bit_col = cfg->cos_bit_col;
-  const int8_t *cos_bit_row = cfg->cos_bit_row;
-  const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
-  const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+  int c, r;
+  const int txfm_size = cfg->cfg->txfm_size;
+  const int8_t *shift = cfg->cfg->shift;
+  const int8_t *stage_range_col = cfg->cfg->stage_range_col;
+  const int8_t *stage_range_row = cfg->cfg->stage_range_row;
+  const int8_t *cos_bit_col = cfg->cfg->cos_bit_col;
+  const int8_t *cos_bit_row = cfg->cfg->cos_bit_row;
+  const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->cfg->txfm_type_col);
+  const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->cfg->txfm_type_row);
 
   // use output buffer as temp buffer
   int32_t* temp_in = output;
   int32_t* temp_out = output + txfm_size;
 
   // Columns
-  for (i = 0; i < txfm_size; ++i) {
-    for (j = 0; j < txfm_size; ++j)
-      temp_in[j] = input[j * stride + i];
+  for (c = 0; c < txfm_size; ++c) {
+    if (cfg->ud_flip == 0) {
+      for (r = 0; r < txfm_size; ++r)
+        temp_in[r] = input[r * stride + c];
+    } else {
+      for (r = 0; r < txfm_size; ++r)
+        // flip upside down
+        temp_in[r] = input[(txfm_size - r - 1) * stride + c];
+    }
     round_shift_array(temp_in, txfm_size, -shift[0]);
     txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
     round_shift_array(temp_out, txfm_size, -shift[1]);
-    for (j = 0; j < txfm_size; ++j)
-      buf[j * txfm_size + i] = temp_out[j];
+    if (cfg->lr_flip == 0) {
+      for (r = 0; r < txfm_size; ++r)
+        buf[r * txfm_size + c] = temp_out[r];
+    } else {
+      for (r = 0; r < txfm_size; ++r)
+        // flip from left to right
+        buf[r * txfm_size + (txfm_size - c - 1)] = temp_out[r];
+    }
   }
 
   // Rows
-  for (i = 0; i < txfm_size; ++i) {
-    txfm_func_row(buf + i * txfm_size, output + i * txfm_size, cos_bit_row,
+  for (r = 0; r < txfm_size; ++r) {
+    txfm_func_row(buf + r * txfm_size, output + r * txfm_size, cos_bit_row,
                   stage_range_row);
-    round_shift_array(output + i * txfm_size, txfm_size, -shift[2]);
+    round_shift_array(output + r * txfm_size, txfm_size, -shift[2]);
   }
 }
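The flips above are pure index remapping on the column reads and intermediate writes; the coefficient tables are untouched. A scalar sketch of the mapping, illustrative only:

```c
/* ud_flip reverses the row index when reading a column of input;
 * lr_flip reverses the column index when writing the intermediate buffer. */
static int flipped_row(int r, int txfm_size, int ud_flip) {
  return ud_flip ? (txfm_size - 1 - r) : r;
}
static int flipped_col(int c, int txfm_size, int lr_flip) {
  return lr_flip ? (txfm_size - 1 - c) : c;
}
```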
 
 void vp10_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output,
-                         const int stride, int tx_type,
-                         const int bd) {
+                           const int stride, int tx_type,
+                           const int bd) {
   int32_t txfm_buf[4 * 4];
-  const TXFM_2D_CFG* cfg = vp10_get_txfm_4x4_cfg(tx_type);
+  TXFM_2D_FLIP_CFG cfg = vp10_get_fwd_txfm_cfg(tx_type, TX_4X4);
   (void)bd;
-  fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
 }
 
 void vp10_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output,
-                         const int stride, int tx_type,
-                         const int bd) {
+                           const int stride, int tx_type,
+                           const int bd) {
   int32_t txfm_buf[8 * 8];
-  const TXFM_2D_CFG* cfg = vp10_get_txfm_8x8_cfg(tx_type);
+  TXFM_2D_FLIP_CFG cfg = vp10_get_fwd_txfm_cfg(tx_type, TX_8X8);
   (void)bd;
-  fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
 }
 
 void vp10_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output,
-                           const int stride, int tx_type,
-                           const int bd) {
+                             const int stride, int tx_type,
+                             const int bd) {
   int32_t txfm_buf[16 * 16];
-  const TXFM_2D_CFG* cfg = vp10_get_txfm_16x16_cfg(tx_type);
+  TXFM_2D_FLIP_CFG cfg = vp10_get_fwd_txfm_cfg(tx_type, TX_16X16);
   (void)bd;
-  fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
 }
 
 void vp10_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output,
-                           const int stride, int tx_type,
-                           const int bd) {
+                             const int stride, int tx_type,
+                             const int bd) {
   int32_t txfm_buf[32 * 32];
-  const TXFM_2D_CFG* cfg = vp10_get_txfm_32x32_cfg(tx_type);
+  TXFM_2D_FLIP_CFG cfg = vp10_get_fwd_txfm_cfg(tx_type, TX_32X32);
   (void)bd;
-  fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
 }
 
 void vp10_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output,
-                           const int stride, int tx_type,
-                           const int bd) {
+                             const int stride, int tx_type,
+                             const int bd) {
   int32_t txfm_buf[64 * 64];
-  const TXFM_2D_CFG* cfg = vp10_get_txfm_64x64_cfg(tx_type);
+  TXFM_2D_FLIP_CFG cfg = vp10_get_fwd_txfm_64x64_cfg(tx_type);
   (void)bd;
-  fwd_txfm2d_c(input, output, stride, cfg, txfm_buf);
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
 }
 
-const TXFM_2D_CFG* vp10_get_txfm_4x4_cfg(int tx_type) {
-  const TXFM_2D_CFG* cfg = NULL;
-  switch (tx_type) {
-    case DCT_DCT:
-      cfg = &fwd_txfm_2d_cfg_dct_dct_4;
-      break;
-    case ADST_DCT:
-      cfg = &fwd_txfm_2d_cfg_adst_dct_4;
-      break;
-    case DCT_ADST:
-      cfg = &fwd_txfm_2d_cfg_dct_adst_4;
-      break;
-    case ADST_ADST:
-      cfg = &fwd_txfm_2d_cfg_adst_adst_4;
-      break;
-    default:
-      assert(0);
-  }
+#if CONFIG_EXT_TX
+static const TXFM_2D_CFG* fwd_txfm_cfg_ls[FLIPADST_ADST + 1][TX_SIZES] = {
+    {&fwd_txfm_2d_cfg_dct_dct_4  , &fwd_txfm_2d_cfg_dct_dct_8,
+     &fwd_txfm_2d_cfg_dct_dct_16  , &fwd_txfm_2d_cfg_dct_dct_32},
+    {&fwd_txfm_2d_cfg_adst_dct_4 , &fwd_txfm_2d_cfg_adst_dct_8,
+     &fwd_txfm_2d_cfg_adst_dct_16 , &fwd_txfm_2d_cfg_adst_dct_32},
+    {&fwd_txfm_2d_cfg_dct_adst_4 , &fwd_txfm_2d_cfg_dct_adst_8,
+     &fwd_txfm_2d_cfg_dct_adst_16 , &fwd_txfm_2d_cfg_dct_adst_32},
+    {&fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+     &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32},
+    {&fwd_txfm_2d_cfg_adst_dct_4 , &fwd_txfm_2d_cfg_adst_dct_8,
+     &fwd_txfm_2d_cfg_adst_dct_16 , &fwd_txfm_2d_cfg_adst_dct_32},
+    {&fwd_txfm_2d_cfg_dct_adst_4 , &fwd_txfm_2d_cfg_dct_adst_8,
+     &fwd_txfm_2d_cfg_dct_adst_16 , &fwd_txfm_2d_cfg_dct_adst_32},
+    {&fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+     &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32},
+    {&fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+     &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32},
+    {&fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+     &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32},
+};
+#else  // CONFIG_EXT_TX
+static const TXFM_2D_CFG* fwd_txfm_cfg_ls[TX_TYPES][TX_SIZES] = {
+    {&fwd_txfm_2d_cfg_dct_dct_4  , &fwd_txfm_2d_cfg_dct_dct_8,
+     &fwd_txfm_2d_cfg_dct_dct_16  , &fwd_txfm_2d_cfg_dct_dct_32},
+    {&fwd_txfm_2d_cfg_adst_dct_4 , &fwd_txfm_2d_cfg_adst_dct_8,
+     &fwd_txfm_2d_cfg_adst_dct_16 , &fwd_txfm_2d_cfg_adst_dct_32},
+    {&fwd_txfm_2d_cfg_dct_adst_4 , &fwd_txfm_2d_cfg_dct_adst_8,
+     &fwd_txfm_2d_cfg_dct_adst_16 , &fwd_txfm_2d_cfg_dct_adst_32},
+    {&fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+     &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32},
+};
+#endif  // CONFIG_EXT_TX
+
+TXFM_2D_FLIP_CFG vp10_get_fwd_txfm_cfg(int tx_type, int tx_size) {
+  TXFM_2D_FLIP_CFG cfg;
+  set_flip_cfg(tx_type, &cfg);
+  cfg.cfg = fwd_txfm_cfg_ls[tx_type][tx_size];
   return cfg;
 }
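A usage sketch (illustrative, assumes CONFIG_EXT_TX): a flipped type reuses the plain ADST coefficient tables and differs only in the flags returned by value:

```c
void fwd_8x8_flipadst_sketch(const int16_t *input, int32_t *output,
                             int stride) {
  int32_t buf[8 * 8];
  TXFM_2D_FLIP_CFG cfg = vp10_get_fwd_txfm_cfg(FLIPADST_DCT, TX_8X8);
  /* cfg.ud_flip == 1, cfg.lr_flip == 0,
   * cfg.cfg == &fwd_txfm_2d_cfg_adst_dct_8 per the table above */
  fwd_txfm2d_c(input, output, stride, &cfg, buf);  /* static helper above */
}
```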
 
-const TXFM_2D_CFG* vp10_get_txfm_8x8_cfg(int tx_type) {
-  const TXFM_2D_CFG* cfg = NULL;
+TXFM_2D_FLIP_CFG vp10_get_fwd_txfm_64x64_cfg(int tx_type) {
+  TXFM_2D_FLIP_CFG cfg;
   switch (tx_type) {
     case DCT_DCT:
-      cfg = &fwd_txfm_2d_cfg_dct_dct_8;
-      break;
-    case ADST_DCT:
-      cfg = &fwd_txfm_2d_cfg_adst_dct_8;
-      break;
-    case DCT_ADST:
-      cfg = &fwd_txfm_2d_cfg_dct_adst_8;
-      break;
-    case ADST_ADST:
-      cfg = &fwd_txfm_2d_cfg_adst_adst_8;
-      break;
-    default:
-      assert(0);
-  }
-  return cfg;
-}
-
-const TXFM_2D_CFG* vp10_get_txfm_16x16_cfg(int tx_type) {
-  const TXFM_2D_CFG* cfg = NULL;
-  switch (tx_type) {
-    case DCT_DCT:
-      cfg = &fwd_txfm_2d_cfg_dct_dct_16;
-      break;
-    case ADST_DCT:
-      cfg = &fwd_txfm_2d_cfg_adst_dct_16;
-      break;
-    case DCT_ADST:
-      cfg = &fwd_txfm_2d_cfg_dct_adst_16;
-      break;
-    case ADST_ADST:
-      cfg = &fwd_txfm_2d_cfg_adst_adst_16;
-      break;
-    default:
-      assert(0);
-  }
-  return cfg;
-}
-
-const TXFM_2D_CFG* vp10_get_txfm_32x32_cfg(int tx_type) {
-  const TXFM_2D_CFG* cfg = NULL;
-  switch (tx_type) {
-    case DCT_DCT:
-      cfg = &fwd_txfm_2d_cfg_dct_dct_32;
-      break;
-    case ADST_DCT:
-      cfg = &fwd_txfm_2d_cfg_adst_dct_32;
-      break;
-    case DCT_ADST:
-      cfg = &fwd_txfm_2d_cfg_dct_adst_32;
-      break;
-    case ADST_ADST:
-      cfg = &fwd_txfm_2d_cfg_adst_adst_32;
-      break;
-    default:
-      assert(0);
-  }
-  return cfg;
-}
-
-const TXFM_2D_CFG* vp10_get_txfm_64x64_cfg(int tx_type) {
-  const TXFM_2D_CFG* cfg = NULL;
-  switch (tx_type) {
-    case DCT_DCT:
-      cfg = &fwd_txfm_2d_cfg_dct_dct_64;
+      cfg.cfg = &fwd_txfm_2d_cfg_dct_dct_64;
+      cfg.ud_flip = 0;
+      cfg.lr_flip = 0;
       break;
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
     default:
+      cfg.ud_flip = 0;
+      cfg.lr_flip = 0;
       assert(0);
   }
   return cfg;
diff --git a/vp10/common/vp10_fwd_txfm2d_cfg.h b/vp10/common/vp10_fwd_txfm2d_cfg.h
index ed976df..e15e4ba 100644
--- a/vp10/common/vp10_fwd_txfm2d_cfg.h
+++ b/vp10/common/vp10_fwd_txfm2d_cfg.h
@@ -399,11 +399,4 @@
     fwd_cos_bit_row_adst_dct_32,      // .cos_bit_row
     TXFM_TYPE_ADST32,                 // .txfm_type_col
     TXFM_TYPE_DCT32};                 // .txfm_type_row
-
-const TXFM_2D_CFG* vp10_get_txfm_4x4_cfg(int tx_type);
-const TXFM_2D_CFG* vp10_get_txfm_8x8_cfg(int tx_type);
-const TXFM_2D_CFG* vp10_get_txfm_16x16_cfg(int tx_type);
-const TXFM_2D_CFG* vp10_get_txfm_32x32_cfg(int tx_type);
-const TXFM_2D_CFG* vp10_get_txfm_64x64_cfg(int tx_type);
-
 #endif  // VP10_FWD_TXFM2D_CFG_H_
diff --git a/vp10/common/vp10_inv_txfm2d.c b/vp10/common/vp10_inv_txfm2d.c
index 3ae54c9..ccf4614 100644
--- a/vp10/common/vp10_inv_txfm2d.c
+++ b/vp10/common/vp10_inv_txfm2d.c
@@ -17,51 +17,75 @@
   switch (txfm_type) {
     case TXFM_TYPE_DCT4:
       return vp10_idct4_new;
-      break;
     case TXFM_TYPE_DCT8:
       return vp10_idct8_new;
-      break;
     case TXFM_TYPE_DCT16:
       return vp10_idct16_new;
-      break;
     case TXFM_TYPE_DCT32:
       return vp10_idct32_new;
-      break;
     case TXFM_TYPE_DCT64:
       return vp10_idct64_new;
-      break;
     case TXFM_TYPE_ADST4:
       return vp10_iadst4_new;
-      break;
     case TXFM_TYPE_ADST8:
       return vp10_iadst8_new;
-      break;
     case TXFM_TYPE_ADST16:
       return vp10_iadst16_new;
-      break;
     case TXFM_TYPE_ADST32:
       return vp10_iadst32_new;
-      break;
     default:
       assert(0);
       return NULL;
   }
 }
 
-static const TXFM_2D_CFG* vp10_get_inv_txfm_4x4_cfg(int tx_type) {
-  const TXFM_2D_CFG* cfg = NULL;
+#if CONFIG_EXT_TX
+static const TXFM_2D_CFG* inv_txfm_cfg_ls[FLIPADST_ADST + 1][TX_SIZES] = {
+    {&inv_txfm_2d_cfg_dct_dct_4  , &inv_txfm_2d_cfg_dct_dct_8,
+     &inv_txfm_2d_cfg_dct_dct_16  , &inv_txfm_2d_cfg_dct_dct_32},
+    {&inv_txfm_2d_cfg_adst_dct_4 , &inv_txfm_2d_cfg_adst_dct_8,
+     &inv_txfm_2d_cfg_adst_dct_16 , &inv_txfm_2d_cfg_adst_dct_32},
+    {&inv_txfm_2d_cfg_dct_adst_4 , &inv_txfm_2d_cfg_dct_adst_8,
+     &inv_txfm_2d_cfg_dct_adst_16 , &inv_txfm_2d_cfg_dct_adst_32},
+    {&inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+     &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32},
+    {&inv_txfm_2d_cfg_adst_dct_4 , &inv_txfm_2d_cfg_adst_dct_8,
+     &inv_txfm_2d_cfg_adst_dct_16 , &inv_txfm_2d_cfg_adst_dct_32},
+    {&inv_txfm_2d_cfg_dct_adst_4 , &inv_txfm_2d_cfg_dct_adst_8,
+     &inv_txfm_2d_cfg_dct_adst_16 , &inv_txfm_2d_cfg_dct_adst_32},
+    {&inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+     &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32},
+    {&inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+     &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32},
+    {&inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+     &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32},
+};
+#else  // CONFIG_EXT_TX
+static const TXFM_2D_CFG* inv_txfm_cfg_ls[TX_TYPES][TX_SIZES] = {
+    {&inv_txfm_2d_cfg_dct_dct_4  , &inv_txfm_2d_cfg_dct_dct_8,
+     &inv_txfm_2d_cfg_dct_dct_16  , &inv_txfm_2d_cfg_dct_dct_32},
+    {&inv_txfm_2d_cfg_adst_dct_4 , &inv_txfm_2d_cfg_adst_dct_8,
+     &inv_txfm_2d_cfg_adst_dct_16 , &inv_txfm_2d_cfg_adst_dct_32},
+    {&inv_txfm_2d_cfg_dct_adst_4 , &inv_txfm_2d_cfg_dct_adst_8,
+     &inv_txfm_2d_cfg_dct_adst_16 , &inv_txfm_2d_cfg_dct_adst_32},
+    {&inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+     &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32},
+};
+#endif  // CONFIG_EXT_TX
+
+TXFM_2D_FLIP_CFG vp10_get_inv_txfm_cfg(int tx_type, int tx_size) {
+  TXFM_2D_FLIP_CFG cfg;
+  set_flip_cfg(tx_type, &cfg);
+  cfg.cfg = inv_txfm_cfg_ls[tx_type][tx_size];
+  return cfg;
+}
+
+TXFM_2D_FLIP_CFG vp10_get_inv_txfm_64x64_cfg(int tx_type) {
+  TXFM_2D_FLIP_CFG cfg;
   switch (tx_type) {
     case DCT_DCT:
-      cfg = &inv_txfm_2d_cfg_dct_dct_4;
-      break;
-    case ADST_DCT:
-      cfg = &inv_txfm_2d_cfg_adst_dct_4;
-      break;
-    case DCT_ADST:
-      cfg = &inv_txfm_2d_cfg_dct_adst_4;
-      break;
-    case ADST_ADST:
-      cfg = &inv_txfm_2d_cfg_adst_adst_4;
+      cfg.cfg = &inv_txfm_2d_cfg_dct_dct_64;
+      set_flip_cfg(tx_type, &cfg);
       break;
     default:
       assert(0);
@@ -69,95 +93,17 @@
   return cfg;
 }
 
-static const TXFM_2D_CFG* vp10_get_inv_txfm_8x8_cfg(int tx_type) {
-  const TXFM_2D_CFG* cfg = NULL;
-  switch (tx_type) {
-    case DCT_DCT:
-      cfg = &inv_txfm_2d_cfg_dct_dct_8;
-      break;
-    case ADST_DCT:
-      cfg = &inv_txfm_2d_cfg_adst_dct_8;
-      break;
-    case DCT_ADST:
-      cfg = &inv_txfm_2d_cfg_dct_adst_8;
-      break;
-    case ADST_ADST:
-      cfg = &inv_txfm_2d_cfg_adst_adst_8;
-      break;
-    default:
-      assert(0);
-  }
-  return cfg;
-}
-
-static const TXFM_2D_CFG* vp10_get_inv_txfm_16x16_cfg(int tx_type) {
-  const TXFM_2D_CFG* cfg = NULL;
-  switch (tx_type) {
-    case DCT_DCT:
-      cfg = &inv_txfm_2d_cfg_dct_dct_16;
-      break;
-    case ADST_DCT:
-      cfg = &inv_txfm_2d_cfg_adst_dct_16;
-      break;
-    case DCT_ADST:
-      cfg = &inv_txfm_2d_cfg_dct_adst_16;
-      break;
-    case ADST_ADST:
-      cfg = &inv_txfm_2d_cfg_adst_adst_16;
-      break;
-    default:
-      assert(0);
-  }
-  return cfg;
-}
-
-static const TXFM_2D_CFG* vp10_get_inv_txfm_32x32_cfg(int tx_type) {
-  const TXFM_2D_CFG* cfg = NULL;
-  switch (tx_type) {
-    case DCT_DCT:
-      cfg = &inv_txfm_2d_cfg_dct_dct_32;
-      break;
-    case ADST_DCT:
-      cfg = &inv_txfm_2d_cfg_adst_dct_32;
-      break;
-    case DCT_ADST:
-      cfg = &inv_txfm_2d_cfg_dct_adst_32;
-      break;
-    case ADST_ADST:
-      cfg = &inv_txfm_2d_cfg_adst_adst_32;
-      break;
-    default:
-      assert(0);
-  }
-  return cfg;
-}
-
-static const TXFM_2D_CFG* vp10_get_inv_txfm_64x64_cfg(int tx_type) {
-  const TXFM_2D_CFG* cfg = NULL;
-  switch (tx_type) {
-    case DCT_DCT:
-      cfg = &inv_txfm_2d_cfg_dct_dct_64;
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-    default:
-      assert(0);
-  }
-  return cfg;
-}
-
-
 static INLINE void inv_txfm2d_add_c(const int32_t *input, int16_t *output,
-                                    int stride, const TXFM_2D_CFG *cfg,
+                                    int stride, TXFM_2D_FLIP_CFG *cfg,
                                     int32_t *txfm_buf) {
-  const int txfm_size = cfg->txfm_size;
-  const int8_t *shift = cfg->shift;
-  const int8_t *stage_range_col = cfg->stage_range_col;
-  const int8_t *stage_range_row = cfg->stage_range_row;
-  const int8_t *cos_bit_col = cfg->cos_bit_col;
-  const int8_t *cos_bit_row = cfg->cos_bit_row;
-  const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->txfm_type_col);
-  const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->txfm_type_row);
+  const int txfm_size = cfg->cfg->txfm_size;
+  const int8_t *shift = cfg->cfg->shift;
+  const int8_t *stage_range_col = cfg->cfg->stage_range_col;
+  const int8_t *stage_range_row = cfg->cfg->stage_range_row;
+  const int8_t *cos_bit_col = cfg->cfg->cos_bit_col;
+  const int8_t *cos_bit_row = cfg->cfg->cos_bit_row;
+  const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->cfg->txfm_type_col);
+  const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->cfg->txfm_type_row);
 
   // txfm_buf's length is txfm_size * txfm_size + 2 * txfm_size
   // it is used for intermediate data buffering
@@ -165,10 +111,10 @@
   int32_t *temp_out = temp_in + txfm_size;
   int32_t *buf = temp_out + txfm_size;
   int32_t *buf_ptr = buf;
-  int i, j;
+  int c, r;
 
   // Rows
-  for (i = 0; i < txfm_size; ++i) {
+  for (r = 0; r < txfm_size; ++r) {
     txfm_func_row(input, buf_ptr, cos_bit_row, stage_range_row);
     round_shift_array(buf_ptr, txfm_size, -shift[0]);
     input += txfm_size;
@@ -176,13 +122,25 @@
   }
 
   // Columns
-  for (i = 0; i < txfm_size; ++i) {
-    for (j = 0; j < txfm_size; ++j)
-      temp_in[j] = buf[j * txfm_size + i];
+  for (c = 0; c < txfm_size; ++c) {
+    if (cfg->lr_flip == 0) {
+      for (r = 0; r < txfm_size; ++r)
+        temp_in[r] = buf[r * txfm_size + c];
+    } else {
+      // flip left to right
+      for (r = 0; r < txfm_size; ++r)
+        temp_in[r] = buf[r * txfm_size + (txfm_size - c - 1)];
+    }
     txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
     round_shift_array(temp_out, txfm_size, -shift[1]);
-    for (j = 0; j < txfm_size; ++j)
-      output[j * stride + i] += temp_out[j];
+    if (cfg->ud_flip == 0) {
+      for (r = 0; r < txfm_size; ++r)
+        output[r * stride + c] += temp_out[r];
+    } else {
+      // flip upside down
+      for (r = 0; r < txfm_size; ++r)
+        output[r * stride + c] += temp_out[txfm_size - r - 1];
+    }
   }
 }
 
@@ -194,8 +152,8 @@
   // than (1 << bd) - 1
   // since bd < 16 - 1, we can treat the uint16_t* output buffer as an
   // int16_t*
-  const TXFM_2D_CFG* cfg = vp10_get_inv_txfm_4x4_cfg(tx_type);
-  inv_txfm2d_add_c(input, (int16_t *)output, stride, cfg, txfm_buf);
+  TXFM_2D_FLIP_CFG cfg = vp10_get_inv_txfm_cfg(tx_type, TX_4X4);
+  inv_txfm2d_add_c(input, (int16_t *)output, stride, &cfg, txfm_buf);
   clamp_block((int16_t *)output, 4, stride, 0, (1 << bd) - 1);
 }
 
@@ -207,8 +165,8 @@
   // than (1 << bd) - 1
   // since bd < 16 - 1, we can treat the uint16_t* output buffer as an
   // int16_t*
-  const TXFM_2D_CFG* cfg = vp10_get_inv_txfm_8x8_cfg(tx_type);
-  inv_txfm2d_add_c(input, (int16_t *)output, stride, cfg, txfm_buf);
+  TXFM_2D_FLIP_CFG cfg = vp10_get_inv_txfm_cfg(tx_type, TX_8X8);
+  inv_txfm2d_add_c(input, (int16_t *)output, stride, &cfg, txfm_buf);
   clamp_block((int16_t *)output, 8, stride, 0, (1 << bd) - 1);
 }
 
@@ -220,8 +178,8 @@
   // than (1 << bd) - 1
   // since bd < 16 - 1, we can treat the uint16_t* output buffer as an
   // int16_t*
-  const TXFM_2D_CFG* cfg = vp10_get_inv_txfm_16x16_cfg(tx_type);
-  inv_txfm2d_add_c(input, (int16_t *)output, stride, cfg, txfm_buf);
+  TXFM_2D_FLIP_CFG cfg = vp10_get_inv_txfm_cfg(tx_type, TX_16X16);
+  inv_txfm2d_add_c(input, (int16_t *)output, stride, &cfg, txfm_buf);
   clamp_block((int16_t *)output, 16, stride, 0, (1 << bd) - 1);
 }
 
@@ -233,8 +191,8 @@
   // than (1 << bd) - 1
   // since bd < 16 - 1, we can treat the uint16_t* output buffer as an
   // int16_t*
-  const TXFM_2D_CFG* cfg = vp10_get_inv_txfm_32x32_cfg(tx_type);
-  inv_txfm2d_add_c(input, (int16_t *)output, stride, cfg, txfm_buf);
+  TXFM_2D_FLIP_CFG cfg = vp10_get_inv_txfm_cfg(tx_type, TX_32X32);
+  inv_txfm2d_add_c(input, (int16_t *)output, stride, &cfg, txfm_buf);
   clamp_block((int16_t *)output, 32, stride, 0, (1 << bd) - 1);
 }
 
@@ -246,7 +204,7 @@
   // than (1 << bd) - 1
   // since bd < 16 - 1, we can treat the uint16_t* output buffer as an
   // int16_t*
-  const TXFM_2D_CFG* cfg = vp10_get_inv_txfm_64x64_cfg(tx_type);
-  inv_txfm2d_add_c(input, (int16_t *)output, stride, cfg, txfm_buf);
+  TXFM_2D_FLIP_CFG cfg = vp10_get_inv_txfm_64x64_cfg(tx_type);
+  inv_txfm2d_add_c(input, (int16_t *)output, stride, &cfg, txfm_buf);
   clamp_block((int16_t *)output, 64, stride, 0, (1 << bd) - 1);
 }
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index d843dfe..1e93f7d 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -627,11 +627,11 @@
 
   #inv txfm
   add_proto qw/void vp10_inv_txfm2d_add_4x4/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
-  specialize qw/vp10_inv_txfm2d_add_4x4/;
+  specialize qw/vp10_inv_txfm2d_add_4x4 sse4_1/;
   add_proto qw/void vp10_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
-  specialize qw/vp10_inv_txfm2d_add_8x8/;
+  specialize qw/vp10_inv_txfm2d_add_8x8 sse4_1/;
   add_proto qw/void vp10_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
-  specialize qw/vp10_inv_txfm2d_add_16x16/;
+  specialize qw/vp10_inv_txfm2d_add_16x16 sse4_1/;
   add_proto qw/void vp10_inv_txfm2d_add_32x32/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
   specialize qw/vp10_inv_txfm2d_add_32x32/;
   add_proto qw/void vp10_inv_txfm2d_add_64x64/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
diff --git a/vp10/common/vp10_txfm.h b/vp10/common/vp10_txfm.h
index 9944bdd..2ac8f81 100644
--- a/vp10/common/vp10_txfm.h
+++ b/vp10/common/vp10_txfm.h
@@ -7,7 +7,6 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-
 #ifndef VP10_TXFM_H_
 #define VP10_TXFM_H_
 
@@ -15,6 +14,7 @@
 #include <math.h>
 #include <assert.h>
 
+#include "vp10/common/enums.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
@@ -166,4 +166,57 @@
   const TXFM_TYPE txfm_type_row;
 } TXFM_2D_CFG;
 
+typedef struct TXFM_2D_FLIP_CFG {
+  int ud_flip;  // flip upside down
+  int lr_flip;  // flip left to right
+  const TXFM_2D_CFG* cfg;
+} TXFM_2D_FLIP_CFG;
+
+static INLINE void set_flip_cfg(int tx_type, TXFM_2D_FLIP_CFG* cfg) {
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      cfg->ud_flip = 0;
+      cfg->lr_flip = 0;
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg->ud_flip = 1;
+      cfg->lr_flip = 0;
+      break;
+    case DCT_FLIPADST:
+      cfg->ud_flip = 0;
+      cfg->lr_flip = 1;
+      break;
+    case FLIPADST_FLIPADST:
+      cfg->ud_flip = 1;
+      cfg->lr_flip = 1;
+      break;
+    case ADST_FLIPADST:
+      cfg->ud_flip = 0;
+      cfg->lr_flip = 1;
+      break;
+    case FLIPADST_ADST:
+      cfg->ud_flip = 1;
+      cfg->lr_flip = 0;
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      cfg->ud_flip = 0;
+      cfg->lr_flip = 0;
+      assert(0);
+  }
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+TXFM_2D_FLIP_CFG vp10_get_fwd_txfm_cfg(int tx_type, int tx_size);
+TXFM_2D_FLIP_CFG vp10_get_fwd_txfm_64x64_cfg(int tx_type);
+#ifdef __cplusplus
+}
+#endif  // __cplusplus
+
 #endif  // VP10_TXFM_H_
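A quick sanity sketch of the flag mapping (illustrative, assumes CONFIG_EXT_TX; <assert.h> is already included by this header): the first component of a type name is the column (vertical) transform, so a leading FLIPADST sets ud_flip and a trailing one sets lr_flip:

```c
static INLINE void set_flip_cfg_sanity_sketch(void) {
  TXFM_2D_FLIP_CFG cfg;
  set_flip_cfg(FLIPADST_DCT, &cfg);
  assert(cfg.ud_flip == 1 && cfg.lr_flip == 0);
  set_flip_cfg(DCT_FLIPADST, &cfg);
  assert(cfg.ud_flip == 0 && cfg.lr_flip == 1);
  set_flip_cfg(FLIPADST_FLIPADST, &cfg);
  assert(cfg.ud_flip == 1 && cfg.lr_flip == 1);
}
```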
diff --git a/vp10/common/x86/highbd_inv_txfm_sse4.c b/vp10/common/x86/highbd_inv_txfm_sse4.c
new file mode 100644
index 0000000..9ece108
--- /dev/null
+++ b/vp10/common/x86/highbd_inv_txfm_sse4.c
@@ -0,0 +1,1245 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>  /* SSE4.1 */
+
+#include "./vp10_rtcd.h"
+#include "./vpx_config.h"
+#include "vp10/common/vp10_inv_txfm2d_cfg.h"
+#include "vp10/common/x86/highbd_txfm_utility_sse4.h"
+
+static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
+  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
+  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
+  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
+  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
+}
+
+static void idct4x4_sse4_1(__m128i *in, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3, x, y;
+
+  v0 = _mm_unpacklo_epi32(in[0], in[1]);
+  v1 = _mm_unpackhi_epi32(in[0], in[1]);
+  v2 = _mm_unpacklo_epi32(in[2], in[3]);
+  v3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+  u0 = _mm_unpacklo_epi64(v0, v2);
+  u1 = _mm_unpackhi_epi64(v0, v2);
+  u2 = _mm_unpacklo_epi64(v1, v3);
+  u3 = _mm_unpackhi_epi64(v1, v3);
+
+  x = _mm_mullo_epi32(u0, cospi32);
+  y = _mm_mullo_epi32(u2, cospi32);
+  v0 = _mm_add_epi32(x, y);
+  v0 = _mm_add_epi32(v0, rnding);
+  v0 = _mm_srai_epi32(v0, bit);
+
+  v1 = _mm_sub_epi32(x, y);
+  v1 = _mm_add_epi32(v1, rnding);
+  v1 = _mm_srai_epi32(v1, bit);
+
+  x = _mm_mullo_epi32(u1, cospi48);
+  y = _mm_mullo_epi32(u3, cospim16);
+  v2 = _mm_add_epi32(x, y);
+  v2 = _mm_add_epi32(v2, rnding);
+  v2 = _mm_srai_epi32(v2, bit);
+
+  x = _mm_mullo_epi32(u1, cospi16);
+  y = _mm_mullo_epi32(u3, cospi48);
+  v3 = _mm_add_epi32(x, y);
+  v3 = _mm_add_epi32(v3, rnding);
+  v3 = _mm_srai_epi32(v3, bit);
+
+  in[0] = _mm_add_epi32(v0, v3);
+  in[1] = _mm_add_epi32(v1, v2);
+  in[2] = _mm_sub_epi32(v1, v2);
+  in[3] = _mm_sub_epi32(v0, v3);
+}
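Each mullo/add/srai triple above is one rounded butterfly term; the utility header included here exposes the same pattern as half_btf_sse4_1, used in the 16x16 path below. A scalar reference, illustrative only (64-bit arithmetic merely sidesteps overflow in the sketch):

```c
/* out = (w0 * in0 + w1 * in1 + 2^(bit - 1)) >> bit */
static int32_t butterfly_term_sketch(int32_t w0, int32_t in0,
                                     int32_t w1, int32_t in1, int bit) {
  const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
  return (int32_t)((sum + ((int64_t)1 << (bit - 1))) >> bit);
}
```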
+
+static void iadst4x4_sse4_1(__m128i *in, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i zero = _mm_setzero_si128();
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3, x, y;
+
+  v0 = _mm_unpacklo_epi32(in[0], in[1]);
+  v1 = _mm_unpackhi_epi32(in[0], in[1]);
+  v2 = _mm_unpacklo_epi32(in[2], in[3]);
+  v3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+  u0 = _mm_unpacklo_epi64(v0, v2);
+  u1 = _mm_unpackhi_epi64(v0, v2);
+  u2 = _mm_unpacklo_epi64(v1, v3);
+  u3 = _mm_unpackhi_epi64(v1, v3);
+
+  // stage 0
+  // stage 1
+  u1 = _mm_sub_epi32(zero, u1);
+  u3 = _mm_sub_epi32(zero, u3);
+
+  // stage 2
+  v0 = u0;
+  v1 = u3;
+  x = _mm_mullo_epi32(u1, cospi32);
+  y = _mm_mullo_epi32(u2, cospi32);
+  v2 = _mm_add_epi32(x, y);
+  v2 = _mm_add_epi32(v2, rnding);
+  v2 = _mm_srai_epi32(v2, bit);
+
+  v3 = _mm_sub_epi32(x, y);
+  v3 = _mm_add_epi32(v3, rnding);
+  v3 = _mm_srai_epi32(v3, bit);
+
+  // stage 3
+  u0 = _mm_add_epi32(v0, v2);
+  u1 = _mm_add_epi32(v1, v3);
+  u2 = _mm_sub_epi32(v0, v2);
+  u3 = _mm_sub_epi32(v1, v3);
+
+  // stage 4
+  x = _mm_mullo_epi32(u0, cospi8);
+  y = _mm_mullo_epi32(u1, cospi56);
+  in[3] = _mm_add_epi32(x, y);
+  in[3] = _mm_add_epi32(in[3], rnding);
+  in[3] = _mm_srai_epi32(in[3], bit);
+
+  x = _mm_mullo_epi32(u0, cospi56);
+  y = _mm_mullo_epi32(u1, cospim8);
+  in[0] = _mm_add_epi32(x, y);
+  in[0] = _mm_add_epi32(in[0], rnding);
+  in[0] = _mm_srai_epi32(in[0], bit);
+
+  x = _mm_mullo_epi32(u2, cospi40);
+  y = _mm_mullo_epi32(u3, cospi24);
+  in[1] = _mm_add_epi32(x, y);
+  in[1] = _mm_add_epi32(in[1], rnding);
+  in[1] = _mm_srai_epi32(in[1], bit);
+
+  x = _mm_mullo_epi32(u2, cospi24);
+  y = _mm_mullo_epi32(u3, cospim40);
+  in[2] = _mm_add_epi32(x, y);
+  in[2] = _mm_add_epi32(in[2], rnding);
+  in[2] = _mm_srai_epi32(in[2], bit);
+}
+
+static INLINE void round_shift_4x4(__m128i *in, int shift) {
+  __m128i rnding = _mm_set1_epi32(1 << (shift - 1));
+
+  in[0] = _mm_add_epi32(in[0], rnding);
+  in[1] = _mm_add_epi32(in[1], rnding);
+  in[2] = _mm_add_epi32(in[2], rnding);
+  in[3] = _mm_add_epi32(in[3], rnding);
+
+  in[0] = _mm_srai_epi32(in[0], shift);
+  in[1] = _mm_srai_epi32(in[1], shift);
+  in[2] = _mm_srai_epi32(in[2], shift);
+  in[3] = _mm_srai_epi32(in[3], shift);
+}
+
+static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+  __m128i clamped, mask;
+
+  mask = _mm_cmpgt_epi16(u, max);
+  clamped = _mm_andnot_si128(mask, u);
+  mask = _mm_and_si128(mask, max);
+  clamped = _mm_or_si128(mask, clamped);
+  mask = _mm_cmpgt_epi16(clamped, zero);
+  clamped = _mm_and_si128(clamped, mask);
+
+  return clamped;
+}
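A scalar equivalent of the clamp above, illustrative only: reconstructed pixels are pinned to the valid high-bitdepth range [0, (1 << bd) - 1]:

```c
static int16_t clamp_pixel_sketch(int32_t v, int bd) {
  const int32_t max = (1 << bd) - 1;  /* e.g. 1023 for bd == 10 */
  return (int16_t)(v < 0 ? 0 : (v > max ? max : v));
}
```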
+
+static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
+                             int shift, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3;
+
+  round_shift_4x4(in, shift);
+
+  v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride));
+  v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride));
+  v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
+  v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));
+
+  v0 = _mm_unpacklo_epi16(v0, zero);
+  v1 = _mm_unpacklo_epi16(v1, zero);
+  v2 = _mm_unpacklo_epi16(v2, zero);
+  v3 = _mm_unpacklo_epi16(v3, zero);
+
+  u0 = _mm_add_epi32(in[0], v0);
+  u1 = _mm_add_epi32(in[1], v1);
+  u2 = _mm_add_epi32(in[2], v2);
+  u3 = _mm_add_epi32(in[3], v3);
+
+  v0 = _mm_packus_epi32(u0, u1);
+  v2 = _mm_packus_epi32(u2, u3);
+
+  u0 = highbd_clamp_epi16(v0, bd);
+  u2 = highbd_clamp_epi16(v2, bd);
+
+  v0 = _mm_unpacklo_epi64(u0, u0);
+  v1 = _mm_unpackhi_epi64(u0, u0);
+  v2 = _mm_unpacklo_epi64(u2, u2);
+  v3 = _mm_unpackhi_epi64(u2, u2);
+
+  _mm_storel_epi64((__m128i *)(output + 0 * stride), v0);
+  _mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
+  _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
+  _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
+}
+
+void vp10_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
+                                    int stride, int tx_type, int bd) {
+  __m128i in[4];
+  const TXFM_2D_CFG *cfg = NULL;
+
+  switch (tx_type) {
+    case DCT_DCT:
+      cfg = &inv_txfm_2d_cfg_dct_dct_4;
+      load_buffer_4x4(coeff, in);
+      idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      write_buffer_4x4(in, output, stride, -cfg->shift[1], bd);
+      break;
+    case ADST_DCT:
+      cfg = &inv_txfm_2d_cfg_adst_dct_4;
+      load_buffer_4x4(coeff, in);
+      idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      write_buffer_4x4(in, output, stride, -cfg->shift[1], bd);
+      break;
+    case DCT_ADST:
+      cfg = &inv_txfm_2d_cfg_dct_adst_4;
+      load_buffer_4x4(coeff, in);
+      iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      write_buffer_4x4(in, output, stride, -cfg->shift[1], bd);
+      break;
+    case ADST_ADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_4;
+      load_buffer_4x4(coeff, in);
+      iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      write_buffer_4x4(in, output, stride, -cfg->shift[1], bd);
+      break;
+    default:
+      assert(0);
+  }
+}
+
+// 8x8
+static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
+  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
+  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
+  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
+  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
+  in[4] = _mm_load_si128((const __m128i *)(coeff + 16));
+  in[5] = _mm_load_si128((const __m128i *)(coeff + 20));
+  in[6] = _mm_load_si128((const __m128i *)(coeff + 24));
+  in[7] = _mm_load_si128((const __m128i *)(coeff + 28));
+  in[8] = _mm_load_si128((const __m128i *)(coeff + 32));
+  in[9] = _mm_load_si128((const __m128i *)(coeff + 36));
+  in[10] = _mm_load_si128((const __m128i *)(coeff + 40));
+  in[11] = _mm_load_si128((const __m128i *)(coeff + 44));
+  in[12] = _mm_load_si128((const __m128i *)(coeff + 48));
+  in[13] = _mm_load_si128((const __m128i *)(coeff + 52));
+  in[14] = _mm_load_si128((const __m128i *)(coeff + 56));
+  in[15] = _mm_load_si128((const __m128i *)(coeff + 60));
+}
+
+static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i x, y;
+  int col;
+
+  // Note:
+  //  Even-indexed registers: 0, 2, ..., 14
+  //  Odd-indexed registers: 1, 3, ..., 15
+  //  One even and one odd register together hold one row (8 coeffs);
+  //  in total there are 8 rows (8x8).
+  for (col = 0; col < 2; ++col) {
+    // stage 0
+    // stage 1
+    // stage 2
+    u0 = in[0 * 2 + col];
+    u1 = in[4 * 2 + col];
+    u2 = in[2 * 2 + col];
+    u3 = in[6 * 2 + col];
+
+    x = _mm_mullo_epi32(in[1 * 2 + col], cospi56);
+    y = _mm_mullo_epi32(in[7 * 2 + col], cospim8);
+    u4 = _mm_add_epi32(x, y);
+    u4 = _mm_add_epi32(u4, rnding);
+    u4 = _mm_srai_epi32(u4, bit);
+
+    x = _mm_mullo_epi32(in[1 * 2 + col], cospi8);
+    y = _mm_mullo_epi32(in[7 * 2 + col], cospi56);
+    u7 = _mm_add_epi32(x, y);
+    u7 = _mm_add_epi32(u7, rnding);
+    u7 = _mm_srai_epi32(u7, bit);
+
+    x = _mm_mullo_epi32(in[5 * 2 + col], cospi24);
+    y = _mm_mullo_epi32(in[3 * 2 + col], cospim40);
+    u5 = _mm_add_epi32(x, y);
+    u5 = _mm_add_epi32(u5, rnding);
+    u5 = _mm_srai_epi32(u5, bit);
+
+    x = _mm_mullo_epi32(in[5 * 2 + col], cospi40);
+    y = _mm_mullo_epi32(in[3 * 2 + col], cospi24);
+    u6 = _mm_add_epi32(x, y);
+    u6 = _mm_add_epi32(u6, rnding);
+    u6 = _mm_srai_epi32(u6, bit);
+
+    // stage 3
+    x = _mm_mullo_epi32(u0, cospi32);
+    y = _mm_mullo_epi32(u1, cospi32);
+    v0 = _mm_add_epi32(x, y);
+    v0 = _mm_add_epi32(v0, rnding);
+    v0 = _mm_srai_epi32(v0, bit);
+
+    v1 = _mm_sub_epi32(x, y);
+    v1 = _mm_add_epi32(v1, rnding);
+    v1 = _mm_srai_epi32(v1, bit);
+
+    x = _mm_mullo_epi32(u2, cospi48);
+    y = _mm_mullo_epi32(u3, cospim16);
+    v2 = _mm_add_epi32(x, y);
+    v2 = _mm_add_epi32(v2, rnding);
+    v2 = _mm_srai_epi32(v2, bit);
+
+    x = _mm_mullo_epi32(u2, cospi16);
+    y = _mm_mullo_epi32(u3, cospi48);
+    v3 = _mm_add_epi32(x, y);
+    v3 = _mm_add_epi32(v3, rnding);
+    v3 = _mm_srai_epi32(v3, bit);
+
+    v4 = _mm_add_epi32(u4, u5);
+    v5 = _mm_sub_epi32(u4, u5);
+    v6 = _mm_sub_epi32(u7, u6);
+    v7 = _mm_add_epi32(u6, u7);
+
+    // stage 4
+    u0 = _mm_add_epi32(v0, v3);
+    u1 = _mm_add_epi32(v1, v2);
+    u2 = _mm_sub_epi32(v1, v2);
+    u3 = _mm_sub_epi32(v0, v3);
+    u4 = v4;
+    u7 = v7;
+
+    x = _mm_mullo_epi32(v5, cospi32);
+    y = _mm_mullo_epi32(v6, cospi32);
+    u6 = _mm_add_epi32(y, x);
+    u6 = _mm_add_epi32(u6, rnding);
+    u6 = _mm_srai_epi32(u6, bit);
+
+    u5 = _mm_sub_epi32(y, x);
+    u5 = _mm_add_epi32(u5, rnding);
+    u5 = _mm_srai_epi32(u5, bit);
+
+    // stage 5
+    out[0 * 2 + col] = _mm_add_epi32(u0, u7);
+    out[1 * 2 + col] = _mm_add_epi32(u1, u6);
+    out[2 * 2 + col] = _mm_add_epi32(u2, u5);
+    out[3 * 2 + col] = _mm_add_epi32(u3, u4);
+    out[4 * 2 + col] = _mm_sub_epi32(u3, u4);
+    out[5 * 2 + col] = _mm_sub_epi32(u2, u5);
+    out[6 * 2 + col] = _mm_sub_epi32(u1, u6);
+    out[7 * 2 + col] = _mm_sub_epi32(u0, u7);
+  }
+}
+
+static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i zero = _mm_setzero_si128();
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i x, y;
+  int col;
+
+  // Note:
+  //  Even-indexed registers: 0, 2, ..., 14
+  //  Odd-indexed registers: 1, 3, ..., 15
+  //  One even and one odd register together hold one row (8 coeffs);
+  //  in total there are 8 rows (8x8).
+  for (col = 0; col < 2; ++col) {
+    // stage 0
+    // stage 1
+    u0 = in[2 * 0 + col];
+    u1 = _mm_sub_epi32(zero, in[2 * 7 + col]);
+    u2 = _mm_sub_epi32(zero, in[2 * 3 + col]);
+    u3 = in[2 * 4 + col];
+    u4 = _mm_sub_epi32(zero, in[2 * 1 + col]);
+    u5 = in[2 * 6 + col];
+    u6 = in[2 * 2 + col];
+    u7 = _mm_sub_epi32(zero, in[2 * 5 + col]);
+
+    // stage 2
+    v0 = u0;
+    v1 = u1;
+
+    x = _mm_mullo_epi32(u2, cospi32);
+    y = _mm_mullo_epi32(u3, cospi32);
+    v2 = _mm_add_epi32(x, y);
+    v2 = _mm_add_epi32(v2, rnding);
+    v2 = _mm_srai_epi32(v2, bit);
+
+    v3 = _mm_sub_epi32(x, y);
+    v3 = _mm_add_epi32(v3, rnding);
+    v3 = _mm_srai_epi32(v3, bit);
+
+    v4 = u4;
+    v5 = u5;
+
+    x = _mm_mullo_epi32(u6, cospi32);
+    y = _mm_mullo_epi32(u7, cospi32);
+    v6 = _mm_add_epi32(x, y);
+    v6 = _mm_add_epi32(v6, rnding);
+    v6 = _mm_srai_epi32(v6, bit);
+
+    v7 = _mm_sub_epi32(x, y);
+    v7 = _mm_add_epi32(v7, rnding);
+    v7 = _mm_srai_epi32(v7, bit);
+
+    // stage 3
+    u0 = _mm_add_epi32(v0, v2);
+    u1 = _mm_add_epi32(v1, v3);
+    u2 = _mm_sub_epi32(v0, v2);
+    u3 = _mm_sub_epi32(v1, v3);
+    u4 = _mm_add_epi32(v4, v6);
+    u5 = _mm_add_epi32(v5, v7);
+    u6 = _mm_sub_epi32(v4, v6);
+    u7 = _mm_sub_epi32(v5, v7);
+
+    // stage 4
+    v0 = u0;
+    v1 = u1;
+    v2 = u2;
+    v3 = u3;
+
+    x = _mm_mullo_epi32(u4, cospi16);
+    y = _mm_mullo_epi32(u5, cospi48);
+    v4 = _mm_add_epi32(x, y);
+    v4 = _mm_add_epi32(v4, rnding);
+    v4 = _mm_srai_epi32(v4, bit);
+
+    x = _mm_mullo_epi32(u4, cospi48);
+    y = _mm_mullo_epi32(u5, cospim16);
+    v5 = _mm_add_epi32(x, y);
+    v5 = _mm_add_epi32(v5, rnding);
+    v5 = _mm_srai_epi32(v5, bit);
+
+    x = _mm_mullo_epi32(u6, cospim48);
+    y = _mm_mullo_epi32(u7, cospi16);
+    v6 = _mm_add_epi32(x, y);
+    v6 = _mm_add_epi32(v6, rnding);
+    v6 = _mm_srai_epi32(v6, bit);
+
+    x = _mm_mullo_epi32(u6, cospi16);
+    y = _mm_mullo_epi32(u7, cospi48);
+    v7 = _mm_add_epi32(x, y);
+    v7 = _mm_add_epi32(v7, rnding);
+    v7 = _mm_srai_epi32(v7, bit);
+
+    // stage 5
+    u0 = _mm_add_epi32(v0, v4);
+    u1 = _mm_add_epi32(v1, v5);
+    u2 = _mm_add_epi32(v2, v6);
+    u3 = _mm_add_epi32(v3, v7);
+    u4 = _mm_sub_epi32(v0, v4);
+    u5 = _mm_sub_epi32(v1, v5);
+    u6 = _mm_sub_epi32(v2, v6);
+    u7 = _mm_sub_epi32(v3, v7);
+
+    // stage 6
+    x = _mm_mullo_epi32(u0, cospi4);
+    y = _mm_mullo_epi32(u1, cospi60);
+    v0 = _mm_add_epi32(x, y);
+    v0 = _mm_add_epi32(v0, rnding);
+    v0 = _mm_srai_epi32(v0, bit);
+
+    x = _mm_mullo_epi32(u0, cospi60);
+    y = _mm_mullo_epi32(u1, cospim4);
+    v1 = _mm_add_epi32(x, y);
+    v1 = _mm_add_epi32(v1, rnding);
+    v1 = _mm_srai_epi32(v1, bit);
+
+    x = _mm_mullo_epi32(u2, cospi20);
+    y = _mm_mullo_epi32(u3, cospi44);
+    v2 = _mm_add_epi32(x, y);
+    v2 = _mm_add_epi32(v2, rnding);
+    v2 = _mm_srai_epi32(v2, bit);
+
+    x = _mm_mullo_epi32(u2, cospi44);
+    y = _mm_mullo_epi32(u3, cospim20);
+    v3 = _mm_add_epi32(x, y);
+    v3 = _mm_add_epi32(v3, rnding);
+    v3 = _mm_srai_epi32(v3, bit);
+
+    x = _mm_mullo_epi32(u4, cospi36);
+    y = _mm_mullo_epi32(u5, cospi28);
+    v4 = _mm_add_epi32(x, y);
+    v4 = _mm_add_epi32(v4, rnding);
+    v4 = _mm_srai_epi32(v4, bit);
+
+    x = _mm_mullo_epi32(u4, cospi28);
+    y = _mm_mullo_epi32(u5, cospim36);
+    v5 = _mm_add_epi32(x, y);
+    v5 = _mm_add_epi32(v5, rnding);
+    v5 = _mm_srai_epi32(v5, bit);
+
+    x = _mm_mullo_epi32(u6, cospi52);
+    y = _mm_mullo_epi32(u7, cospi12);
+    v6 = _mm_add_epi32(x, y);
+    v6 = _mm_add_epi32(v6, rnding);
+    v6 = _mm_srai_epi32(v6, bit);
+
+    x = _mm_mullo_epi32(u6, cospi12);
+    y = _mm_mullo_epi32(u7, cospim52);
+    v7 = _mm_add_epi32(x, y);
+    v7 = _mm_add_epi32(v7, rnding);
+    v7 = _mm_srai_epi32(v7, bit);
+
+    // stage 7
+    out[2 * 0 + col] = v1;
+    out[2 * 1 + col] = v6;
+    out[2 * 2 + col] = v3;
+    out[2 * 3 + col] = v4;
+    out[2 * 4 + col] = v5;
+    out[2 * 5 + col] = v2;
+    out[2 * 6 + col] = v7;
+    out[2 * 7 + col] = v0;
+  }
+}
+
+static void round_shift_8x8(__m128i *in, int shift) {
+  round_shift_4x4(&in[0], shift);
+  round_shift_4x4(&in[4], shift);
+  round_shift_4x4(&in[8], shift);
+  round_shift_4x4(&in[12], shift);
+}
+
+static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
+                             int shift, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i x0, x1;
+
+  round_shift_8x8(in, shift);
+
+  v0 = _mm_load_si128((__m128i const *)(output + 0 * stride));
+  v1 = _mm_load_si128((__m128i const *)(output + 1 * stride));
+  v2 = _mm_load_si128((__m128i const *)(output + 2 * stride));
+  v3 = _mm_load_si128((__m128i const *)(output + 3 * stride));
+  v4 = _mm_load_si128((__m128i const *)(output + 4 * stride));
+  v5 = _mm_load_si128((__m128i const *)(output + 5 * stride));
+  v6 = _mm_load_si128((__m128i const *)(output + 6 * stride));
+  v7 = _mm_load_si128((__m128i const *)(output + 7 * stride));
+
+  x0 = _mm_unpacklo_epi16(v0, zero);
+  x1 = _mm_unpackhi_epi16(v0, zero);
+  x0 = _mm_add_epi32(in[0], x0);
+  x1 = _mm_add_epi32(in[1], x1);
+  x0 = _mm_packus_epi32(x0, x1);
+  u0 = highbd_clamp_epi16(x0, bd);
+
+  x0 = _mm_unpacklo_epi16(v1, zero);
+  x1 = _mm_unpackhi_epi16(v1, zero);
+  x0 = _mm_add_epi32(in[2], x0);
+  x1 = _mm_add_epi32(in[3], x1);
+  x0 = _mm_packus_epi32(x0, x1);
+  u1 = highbd_clamp_epi16(x0, bd);
+
+  x0 = _mm_unpacklo_epi16(v2, zero);
+  x1 = _mm_unpackhi_epi16(v2, zero);
+  x0 = _mm_add_epi32(in[4], x0);
+  x1 = _mm_add_epi32(in[5], x1);
+  x0 = _mm_packus_epi32(x0, x1);
+  u2 = highbd_clamp_epi16(x0, bd);
+
+  x0 = _mm_unpacklo_epi16(v3, zero);
+  x1 = _mm_unpackhi_epi16(v3, zero);
+  x0 = _mm_add_epi32(in[6], x0);
+  x1 = _mm_add_epi32(in[7], x1);
+  x0 = _mm_packus_epi32(x0, x1);
+  u3 = highbd_clamp_epi16(x0, bd);
+
+  x0 = _mm_unpacklo_epi16(v4, zero);
+  x1 = _mm_unpackhi_epi16(v4, zero);
+  x0 = _mm_add_epi32(in[8], x0);
+  x1 = _mm_add_epi32(in[9], x1);
+  x0 = _mm_packus_epi32(x0, x1);
+  u4 = highbd_clamp_epi16(x0, bd);
+
+  x0 = _mm_unpacklo_epi16(v5, zero);
+  x1 = _mm_unpackhi_epi16(v5, zero);
+  x0 = _mm_add_epi32(in[10], x0);
+  x1 = _mm_add_epi32(in[11], x1);
+  x0 = _mm_packus_epi32(x0, x1);
+  u5 = highbd_clamp_epi16(x0, bd);
+
+  x0 = _mm_unpacklo_epi16(v6, zero);
+  x1 = _mm_unpackhi_epi16(v6, zero);
+  x0 = _mm_add_epi32(in[12], x0);
+  x1 = _mm_add_epi32(in[13], x1);
+  x0 = _mm_packus_epi32(x0, x1);
+  u6 = highbd_clamp_epi16(x0, bd);
+
+  x0 = _mm_unpacklo_epi16(v7, zero);
+  x1 = _mm_unpackhi_epi16(v7, zero);
+  x0 = _mm_add_epi32(in[14], x0);
+  x1 = _mm_add_epi32(in[15], x1);
+  x0 = _mm_packus_epi32(x0, x1);
+  u7 = highbd_clamp_epi16(x0, bd);
+
+  _mm_store_si128((__m128i *)(output + 0 * stride), u0);
+  _mm_store_si128((__m128i *)(output + 1 * stride), u1);
+  _mm_store_si128((__m128i *)(output + 2 * stride), u2);
+  _mm_store_si128((__m128i *)(output + 3 * stride), u3);
+  _mm_store_si128((__m128i *)(output + 4 * stride), u4);
+  _mm_store_si128((__m128i *)(output + 5 * stride), u5);
+  _mm_store_si128((__m128i *)(output + 6 * stride), u6);
+  _mm_store_si128((__m128i *)(output + 7 * stride), u7);
+}
+
+void vp10_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
+                                    int stride, int tx_type, int bd) {
+  __m128i in[16], out[16];
+  const TXFM_2D_CFG *cfg = NULL;
+
+  switch (tx_type) {
+    case DCT_DCT:
+      cfg = &inv_txfm_2d_cfg_dct_dct_8;
+      load_buffer_8x8(coeff, in);
+      transpose_8x8(in, out);
+      idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+      transpose_8x8(in, out);
+      idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_8x8(in, output, stride, -cfg->shift[1], bd);
+      break;
+    case DCT_ADST:
+      cfg = &inv_txfm_2d_cfg_dct_adst_8;
+      load_buffer_8x8(coeff, in);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+      transpose_8x8(in, out);
+      idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_8x8(in, output, stride, -cfg->shift[1], bd);
+      break;
+    case ADST_DCT:
+      cfg = &inv_txfm_2d_cfg_adst_dct_8;
+      load_buffer_8x8(coeff, in);
+      transpose_8x8(in, out);
+      idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_8x8(in, output, stride, -cfg->shift[1], bd);
+      break;
+    case ADST_ADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_8;
+      load_buffer_8x8(coeff, in);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_8x8(in, output, stride, -cfg->shift[1], bd);
+      break;
+    default:
+      assert(0);
+  }
+}
+
+// 16x16
+static void load_buffer_16x16(const int32_t *coeff, __m128i *in) {
+  int i;
+  for (i = 0; i < 64; ++i) {
+    in[i] = _mm_load_si128((const __m128i *)(coeff + (i << 2)));
+  }
+}
+
+static void assign_8x8_input_from_16x16(const __m128i *in, __m128i *in8x8,
+                                        int col) {
+  int i;
+  for (i = 0; i < 16; i += 2) {
+    in8x8[i] = in[col];
+    in8x8[i + 1] = in[col + 1];
+    col += 4;
+  }
+}
+
+static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride,
+                               int shift, int bd) {
+  __m128i in8x8[16];
+
+  // Upper-left quarter
+  assign_8x8_input_from_16x16(in, in8x8, 0);
+  write_buffer_8x8(in8x8, &output[0], stride, shift, bd);
+
+  // Upper-right quarter
+  assign_8x8_input_from_16x16(in, in8x8, 2);
+  write_buffer_8x8(in8x8, &output[8], stride, shift, bd);
+
+  // Lower-left quarter
+  assign_8x8_input_from_16x16(in, in8x8, 32);
+  write_buffer_8x8(in8x8, &output[8 * stride], stride, shift, bd);
+
+  // Lower-right quarter
+  assign_8x8_input_from_16x16(in, in8x8, 34);
+  write_buffer_8x8(in8x8, &output[8 * stride + 8], stride, shift, bd);
+}
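The quarter offsets follow from the register layout: each 16-pixel row occupies four __m128i, so row r starts at register 4 * r and an 8-wide half-row spans two registers. An index sketch, illustrative only:

```c
/* Register covering columns [8c, 8c + 7] of row r is in[4 * r + 2 * c];
 * the four 8x8 quarters start at 0, 2, 32 and 34, matching the calls above. */
static int quarter_start_reg(int quarter_row, int quarter_col) {
  return 4 * (8 * quarter_row) + 2 * quarter_col;
}
```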
+
+static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  __m128i u[16], v[16], x, y;
+  int col;
+
+  for (col = 0; col < 4; ++col) {
+    // stage 0
+    // stage 1
+    u[0] = in[0 * 4 + col];
+    u[1] = in[8 * 4 + col];
+    u[2] = in[4 * 4 + col];
+    u[3] = in[12 * 4 + col];
+    u[4] = in[2 * 4 + col];
+    u[5] = in[10 * 4 + col];
+    u[6] = in[6 * 4 + col];
+    u[7] = in[14 * 4 + col];
+    u[8] = in[1 * 4 + col];
+    u[9] = in[9 * 4 + col];
+    u[10] = in[5 * 4 + col];
+    u[11] = in[13 * 4 + col];
+    u[12] = in[3 * 4 + col];
+    u[13] = in[11 * 4 + col];
+    u[14] = in[7 * 4 + col];
+    u[15] = in[15 * 4 + col];
+
+    // stage 2
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+
+    v[8] = half_btf_sse4_1(cospi60, u[8], cospim4, u[15], rnding, bit);
+    v[9] = half_btf_sse4_1(cospi28, u[9], cospim36, u[14], rnding, bit);
+    v[10] = half_btf_sse4_1(cospi44, u[10], cospim20, u[13], rnding, bit);
+    v[11] = half_btf_sse4_1(cospi12, u[11], cospim52, u[12], rnding, bit);
+    v[12] = half_btf_sse4_1(cospi52, u[11], cospi12, u[12], rnding, bit);
+    v[13] = half_btf_sse4_1(cospi20, u[10], cospi44, u[13], rnding, bit);
+    v[14] = half_btf_sse4_1(cospi36, u[9], cospi28, u[14], rnding, bit);
+    v[15] = half_btf_sse4_1(cospi4, u[8], cospi60, u[15], rnding, bit);
+
+    // stage 3
+    u[0] = v[0];
+    u[1] = v[1];
+    u[2] = v[2];
+    u[3] = v[3];
+    u[4] = half_btf_sse4_1(cospi56, v[4], cospim8, v[7], rnding, bit);
+    u[5] = half_btf_sse4_1(cospi24, v[5], cospim40, v[6], rnding, bit);
+    u[6] = half_btf_sse4_1(cospi40, v[5], cospi24, v[6], rnding, bit);
+    u[7] = half_btf_sse4_1(cospi8, v[4], cospi56, v[7], rnding, bit);
+    u[8] = _mm_add_epi32(v[8], v[9]);
+    u[9] = _mm_sub_epi32(v[8], v[9]);
+    u[10] = _mm_sub_epi32(v[11], v[10]);
+    u[11] = _mm_add_epi32(v[10], v[11]);
+    u[12] = _mm_add_epi32(v[12], v[13]);
+    u[13] = _mm_sub_epi32(v[12], v[13]);
+    u[14] = _mm_sub_epi32(v[15], v[14]);
+    u[15] = _mm_add_epi32(v[14], v[15]);
+
+    // stage 4
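+    // u[0] and u[1] share the same weight cospi32 (cos(pi/4) in fixed
+    // point), so their butterfly reduces to (x + y) and (x - y), each
+    // followed by rounding and the descaling shift.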
+    x = _mm_mullo_epi32(u[0], cospi32);
+    y = _mm_mullo_epi32(u[1], cospi32);
+    v[0] = _mm_add_epi32(x, y);
+    v[0] = _mm_add_epi32(v[0], rnding);
+    v[0] = _mm_srai_epi32(v[0], bit);
+
+    v[1] = _mm_sub_epi32(x, y);
+    v[1] = _mm_add_epi32(v[1], rnding);
+    v[1] = _mm_srai_epi32(v[1], bit);
+
+    v[2] = half_btf_sse4_1(cospi48, u[2], cospim16, u[3], rnding, bit);
+    v[3] = half_btf_sse4_1(cospi16, u[2], cospi48, u[3], rnding, bit);
+    v[4] = _mm_add_epi32(u[4], u[5]);
+    v[5] = _mm_sub_epi32(u[4], u[5]);
+    v[6] = _mm_sub_epi32(u[7], u[6]);
+    v[7] = _mm_add_epi32(u[6], u[7]);
+    v[8] = u[8];
+    v[9] = half_btf_sse4_1(cospim16, u[9], cospi48, u[14], rnding, bit);
+    v[10] = half_btf_sse4_1(cospim48, u[10], cospim16, u[13], rnding, bit);
+    v[11] = u[11];
+    v[12] = u[12];
+    v[13] = half_btf_sse4_1(cospim16, u[10], cospi48, u[13], rnding, bit);
+    v[14] = half_btf_sse4_1(cospi48, u[9], cospi16, u[14], rnding, bit);
+    v[15] = u[15];
+
+    // stage 5
+    u[0] = _mm_add_epi32(v[0], v[3]);
+    u[1] = _mm_add_epi32(v[1], v[2]);
+    u[2] = _mm_sub_epi32(v[1], v[2]);
+    u[3] = _mm_sub_epi32(v[0], v[3]);
+    u[4] = v[4];
+
+    x = _mm_mullo_epi32(v[5], cospi32);
+    y = _mm_mullo_epi32(v[6], cospi32);
+    u[5] = _mm_sub_epi32(y, x);
+    u[5] = _mm_add_epi32(u[5], rnding);
+    u[5] = _mm_srai_epi32(u[5], bit);
+
+    u[6] = _mm_add_epi32(y, x);
+    u[6] = _mm_add_epi32(u[6], rnding);
+    u[6] = _mm_srai_epi32(u[6], bit);
+
+    u[7] = v[7];
+    u[8] = _mm_add_epi32(v[8], v[11]);
+    u[9] = _mm_add_epi32(v[9], v[10]);
+    u[10] = _mm_sub_epi32(v[9], v[10]);
+    u[11] = _mm_sub_epi32(v[8], v[11]);
+    u[12] = _mm_sub_epi32(v[15], v[12]);
+    u[13] = _mm_sub_epi32(v[14], v[13]);
+    u[14] = _mm_add_epi32(v[13], v[14]);
+    u[15] = _mm_add_epi32(v[12], v[15]);
+
+    // stage 6
+    v[0] = _mm_add_epi32(u[0], u[7]);
+    v[1] = _mm_add_epi32(u[1], u[6]);
+    v[2] = _mm_add_epi32(u[2], u[5]);
+    v[3] = _mm_add_epi32(u[3], u[4]);
+    v[4] = _mm_sub_epi32(u[3], u[4]);
+    v[5] = _mm_sub_epi32(u[2], u[5]);
+    v[6] = _mm_sub_epi32(u[1], u[6]);
+    v[7] = _mm_sub_epi32(u[0], u[7]);
+    v[8] = u[8];
+    v[9] = u[9];
+
+    x = _mm_mullo_epi32(u[10], cospi32);
+    y = _mm_mullo_epi32(u[13], cospi32);
+    v[10] = _mm_sub_epi32(y, x);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[13] = _mm_add_epi32(x, y);
+    v[13] = _mm_add_epi32(v[13], rnding);
+    v[13] = _mm_srai_epi32(v[13], bit);
+
+    x = _mm_mullo_epi32(u[11], cospi32);
+    y = _mm_mullo_epi32(u[12], cospi32);
+    v[11] = _mm_sub_epi32(y, x);
+    v[11] = _mm_add_epi32(v[11], rnding);
+    v[11] = _mm_srai_epi32(v[11], bit);
+
+    v[12] = _mm_add_epi32(x, y);
+    v[12] = _mm_add_epi32(v[12], rnding);
+    v[12] = _mm_srai_epi32(v[12], bit);
+
+    v[14] = u[14];
+    v[15] = u[15];
+
+    // stage 7
+    out[0 * 4 + col] = _mm_add_epi32(v[0], v[15]);
+    out[1 * 4 + col] = _mm_add_epi32(v[1], v[14]);
+    out[2 * 4 + col] = _mm_add_epi32(v[2], v[13]);
+    out[3 * 4 + col] = _mm_add_epi32(v[3], v[12]);
+    out[4 * 4 + col] = _mm_add_epi32(v[4], v[11]);
+    out[5 * 4 + col] = _mm_add_epi32(v[5], v[10]);
+    out[6 * 4 + col] = _mm_add_epi32(v[6], v[9]);
+    out[7 * 4 + col] = _mm_add_epi32(v[7], v[8]);
+    out[8 * 4 + col] = _mm_sub_epi32(v[7], v[8]);
+    out[9 * 4 + col] = _mm_sub_epi32(v[6], v[9]);
+    out[10 * 4 + col] = _mm_sub_epi32(v[5], v[10]);
+    out[11 * 4 + col] = _mm_sub_epi32(v[4], v[11]);
+    out[12 * 4 + col] = _mm_sub_epi32(v[3], v[12]);
+    out[13 * 4 + col] = _mm_sub_epi32(v[2], v[13]);
+    out[14 * 4 + col] = _mm_sub_epi32(v[1], v[14]);
+    out[15 * 4 + col] = _mm_sub_epi32(v[0], v[15]);
+  }
+}
+
+static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
+  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+  const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
+  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+  const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
+  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i zero = _mm_setzero_si128();
+
+  __m128i u[16], v[16], x, y;
+  int col;
+
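+  // Stage 1 negates selected inputs by subtracting them from zero and
+  // permutes them so the remaining stages can reuse the DCT-style
+  // butterfly structure.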
+  for (col = 0; col < 4; ++col) {
+    // stage 0
+    // stage 1
+    u[0] = in[0 * 4 + col];
+    u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]);
+    u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]);
+    u[3] = in[8 * 4 + col];
+    u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]);
+    u[5] = in[12 * 4 + col];
+    u[6] = in[4 * 4 + col];
+    u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]);
+    u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]);
+    u[9] = in[14 * 4 + col];
+    u[10] = in[6 * 4 + col];
+    u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]);
+    u[12] = in[2 * 4 + col];
+    u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]);
+    u[14] = _mm_sub_epi32(zero, in[5 * 4 + col]);
+    u[15] = in[10 * 4 + col];
+
+    // stage 2
+    v[0] = u[0];
+    v[1] = u[1];
+
+    x = _mm_mullo_epi32(u[2], cospi32);
+    y = _mm_mullo_epi32(u[3], cospi32);
+    v[2] = _mm_add_epi32(x, y);
+    v[2] = _mm_add_epi32(v[2], rnding);
+    v[2] = _mm_srai_epi32(v[2], bit);
+
+    v[3] = _mm_sub_epi32(x, y);
+    v[3] = _mm_add_epi32(v[3], rnding);
+    v[3] = _mm_srai_epi32(v[3], bit);
+
+    v[4] = u[4];
+    v[5] = u[5];
+
+    x = _mm_mullo_epi32(u[6], cospi32);
+    y = _mm_mullo_epi32(u[7], cospi32);
+    v[6] = _mm_add_epi32(x, y);
+    v[6] = _mm_add_epi32(v[6], rnding);
+    v[6] = _mm_srai_epi32(v[6], bit);
+
+    v[7] = _mm_sub_epi32(x, y);
+    v[7] = _mm_add_epi32(v[7], rnding);
+    v[7] = _mm_srai_epi32(v[7], bit);
+
+    v[8] = u[8];
+    v[9] = u[9];
+
+    x = _mm_mullo_epi32(u[10], cospi32);
+    y = _mm_mullo_epi32(u[11], cospi32);
+    v[10] = _mm_add_epi32(x, y);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[11] = _mm_sub_epi32(x, y);
+    v[11] = _mm_add_epi32(v[11], rnding);
+    v[11] = _mm_srai_epi32(v[11], bit);
+
+    v[12] = u[12];
+    v[13] = u[13];
+
+    x = _mm_mullo_epi32(u[14], cospi32);
+    y = _mm_mullo_epi32(u[15], cospi32);
+    v[14] = _mm_add_epi32(x, y);
+    v[14] = _mm_add_epi32(v[14], rnding);
+    v[14] = _mm_srai_epi32(v[14], bit);
+
+    v[15] = _mm_sub_epi32(x, y);
+    v[15] = _mm_add_epi32(v[15], rnding);
+    v[15] = _mm_srai_epi32(v[15], bit);
+
+    // stage 3
+    u[0] = _mm_add_epi32(v[0], v[2]);
+    u[1] = _mm_add_epi32(v[1], v[3]);
+    u[2] = _mm_sub_epi32(v[0], v[2]);
+    u[3] = _mm_sub_epi32(v[1], v[3]);
+    u[4] = _mm_add_epi32(v[4], v[6]);
+    u[5] = _mm_add_epi32(v[5], v[7]);
+    u[6] = _mm_sub_epi32(v[4], v[6]);
+    u[7] = _mm_sub_epi32(v[5], v[7]);
+    u[8] = _mm_add_epi32(v[8], v[10]);
+    u[9] = _mm_add_epi32(v[9], v[11]);
+    u[10] = _mm_sub_epi32(v[8], v[10]);
+    u[11] = _mm_sub_epi32(v[9], v[11]);
+    u[12] = _mm_add_epi32(v[12], v[14]);
+    u[13] = _mm_add_epi32(v[13], v[15]);
+    u[14] = _mm_sub_epi32(v[12], v[14]);
+    u[15] = _mm_sub_epi32(v[13], v[15]);
+
+    // stage 4
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = half_btf_sse4_1(cospi16, u[4], cospi48, u[5], rnding, bit);
+    v[5] = half_btf_sse4_1(cospi48, u[4], cospim16, u[5], rnding, bit);
+    v[6] = half_btf_sse4_1(cospim48, u[6], cospi16, u[7], rnding, bit);
+    v[7] = half_btf_sse4_1(cospi16, u[6], cospi48, u[7], rnding, bit);
+    v[8] = u[8];
+    v[9] = u[9];
+    v[10] = u[10];
+    v[11] = u[11];
+    v[12] = half_btf_sse4_1(cospi16, u[12], cospi48, u[13], rnding, bit);
+    v[13] = half_btf_sse4_1(cospi48, u[12], cospim16, u[13], rnding, bit);
+    v[14] = half_btf_sse4_1(cospim48, u[14], cospi16, u[15], rnding, bit);
+    v[15] = half_btf_sse4_1(cospi16, u[14], cospi48, u[15], rnding, bit);
+
+    // stage 5
+    u[0] = _mm_add_epi32(v[0], v[4]);
+    u[1] = _mm_add_epi32(v[1], v[5]);
+    u[2] = _mm_add_epi32(v[2], v[6]);
+    u[3] = _mm_add_epi32(v[3], v[7]);
+    u[4] = _mm_sub_epi32(v[0], v[4]);
+    u[5] = _mm_sub_epi32(v[1], v[5]);
+    u[6] = _mm_sub_epi32(v[2], v[6]);
+    u[7] = _mm_sub_epi32(v[3], v[7]);
+    u[8] = _mm_add_epi32(v[8], v[12]);
+    u[9] = _mm_add_epi32(v[9], v[13]);
+    u[10] = _mm_add_epi32(v[10], v[14]);
+    u[11] = _mm_add_epi32(v[11], v[15]);
+    u[12] = _mm_sub_epi32(v[8], v[12]);
+    u[13] = _mm_sub_epi32(v[9], v[13]);
+    u[14] = _mm_sub_epi32(v[10], v[14]);
+    u[15] = _mm_sub_epi32(v[11], v[15]);
+
+    // stage 6
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+    v[8] = half_btf_sse4_1(cospi8, u[8], cospi56, u[9], rnding, bit);
+    v[9] = half_btf_sse4_1(cospi56, u[8], cospim8, u[9], rnding, bit);
+    v[10] = half_btf_sse4_1(cospi40, u[10], cospi24, u[11], rnding, bit);
+    v[11] = half_btf_sse4_1(cospi24, u[10], cospim40, u[11], rnding, bit);
+    v[12] = half_btf_sse4_1(cospim56, u[12], cospi8, u[13], rnding, bit);
+    v[13] = half_btf_sse4_1(cospi8, u[12], cospi56, u[13], rnding, bit);
+    v[14] = half_btf_sse4_1(cospim24, u[14], cospi40, u[15], rnding, bit);
+    v[15] = half_btf_sse4_1(cospi40, u[14], cospi24, u[15], rnding, bit);
+
+    // stage 7
+    u[0] = _mm_add_epi32(v[0], v[8]);
+    u[1] = _mm_add_epi32(v[1], v[9]);
+    u[2] = _mm_add_epi32(v[2], v[10]);
+    u[3] = _mm_add_epi32(v[3], v[11]);
+    u[4] = _mm_add_epi32(v[4], v[12]);
+    u[5] = _mm_add_epi32(v[5], v[13]);
+    u[6] = _mm_add_epi32(v[6], v[14]);
+    u[7] = _mm_add_epi32(v[7], v[15]);
+    u[8] = _mm_sub_epi32(v[0], v[8]);
+    u[9] = _mm_sub_epi32(v[1], v[9]);
+    u[10] = _mm_sub_epi32(v[2], v[10]);
+    u[11] = _mm_sub_epi32(v[3], v[11]);
+    u[12] = _mm_sub_epi32(v[4], v[12]);
+    u[13] = _mm_sub_epi32(v[5], v[13]);
+    u[14] = _mm_sub_epi32(v[6], v[14]);
+    u[15] = _mm_sub_epi32(v[7], v[15]);
+
+    // stage 8
+    v[0] = half_btf_sse4_1(cospi2, u[0], cospi62, u[1], rnding, bit);
+    v[1] = half_btf_sse4_1(cospi62, u[0], cospim2, u[1], rnding, bit);
+    v[2] = half_btf_sse4_1(cospi10, u[2], cospi54, u[3], rnding, bit);
+    v[3] = half_btf_sse4_1(cospi54, u[2], cospim10, u[3], rnding, bit);
+    v[4] = half_btf_sse4_1(cospi18, u[4], cospi46, u[5], rnding, bit);
+    v[5] = half_btf_sse4_1(cospi46, u[4], cospim18, u[5], rnding, bit);
+    v[6] = half_btf_sse4_1(cospi26, u[6], cospi38, u[7], rnding, bit);
+    v[7] = half_btf_sse4_1(cospi38, u[6], cospim26, u[7], rnding, bit);
+    v[8] = half_btf_sse4_1(cospi34, u[8], cospi30, u[9], rnding, bit);
+    v[9] = half_btf_sse4_1(cospi30, u[8], cospim34, u[9], rnding, bit);
+    v[10] = half_btf_sse4_1(cospi42, u[10], cospi22, u[11], rnding, bit);
+    v[11] = half_btf_sse4_1(cospi22, u[10], cospim42, u[11], rnding, bit);
+    v[12] = half_btf_sse4_1(cospi50, u[12], cospi14, u[13], rnding, bit);
+    v[13] = half_btf_sse4_1(cospi14, u[12], cospim50, u[13], rnding, bit);
+    v[14] = half_btf_sse4_1(cospi58, u[14], cospi6, u[15], rnding, bit);
+    v[15] = half_btf_sse4_1(cospi6, u[14], cospim58, u[15], rnding, bit);
+
+    // stage 9
+    out[0 * 4 + col] = v[1];
+    out[1 * 4 + col] = v[14];
+    out[2 * 4 + col] = v[3];
+    out[3 * 4 + col] = v[12];
+    out[4 * 4 + col] = v[5];
+    out[5 * 4 + col] = v[10];
+    out[6 * 4 + col] = v[7];
+    out[7 * 4 + col] = v[8];
+    out[8 * 4 + col] = v[9];
+    out[9 * 4 + col] = v[6];
+    out[10 * 4 + col] = v[11];
+    out[11 * 4 + col] = v[4];
+    out[12 * 4 + col] = v[13];
+    out[13 * 4 + col] = v[2];
+    out[14 * 4 + col] = v[15];
+    out[15 * 4 + col] = v[0];
+  }
+}
+
+static void round_shift_16x16(__m128i *in, int shift) {
+  round_shift_8x8(&in[0], shift);
+  round_shift_8x8(&in[16], shift);
+  round_shift_8x8(&in[32], shift);
+  round_shift_8x8(&in[48], shift);
+}
+
+void vp10_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
+                                      int stride, int tx_type, int bd) {
+  __m128i in[64], out[64];
+  const TXFM_2D_CFG *cfg = NULL;
+
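+  // Pipeline: load the coefficients, transpose so the 1-D kernel always
+  // operates down columns, run the row transform, apply the intermediate
+  // rounding shift, transpose back, run the column transform, then shift,
+  // add to the prediction in `output` and clamp to the bit depth `bd`.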
+  switch (tx_type) {
+    case DCT_DCT:
+      cfg = &inv_txfm_2d_cfg_dct_dct_16;
+      load_buffer_16x16(coeff, in);
+      transpose_16x16(in, out);
+      idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+      round_shift_16x16(in, -cfg->shift[0]);
+      transpose_16x16(in, out);
+      idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_16x16(in, output, stride, -cfg->shift[1], bd);
+      break;
+    case DCT_ADST:
+      cfg = &inv_txfm_2d_cfg_dct_adst_16;
+      load_buffer_16x16(coeff, in);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+      round_shift_16x16(in, -cfg->shift[0]);
+      transpose_16x16(in, out);
+      idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_16x16(in, output, stride, -cfg->shift[1], bd);
+      break;
+    case ADST_DCT:
+      cfg = &inv_txfm_2d_cfg_adst_dct_16;
+      load_buffer_16x16(coeff, in);
+      transpose_16x16(in, out);
+      idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+      round_shift_16x16(in, -cfg->shift[0]);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_16x16(in, output, stride, -cfg->shift[1], bd);
+      break;
+    case ADST_ADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_16;
+      load_buffer_16x16(coeff, in);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+      round_shift_16x16(in, -cfg->shift[0]);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_16x16(in, output, stride, -cfg->shift[1], bd);
+      break;
+    default:
+      assert(0);
+  }
+}
diff --git a/vp10/common/x86/highbd_txfm_utility_sse4.h b/vp10/common/x86/highbd_txfm_utility_sse4.h
new file mode 100644
index 0000000..319b50a
--- /dev/null
+++ b/vp10/common/x86/highbd_txfm_utility_sse4.h
@@ -0,0 +1,96 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
+#define VP10_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
+
+#include <smmintrin.h>  /* SSE4.1 */
+
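+// Transposes a 4x4 tile of 32-bit elements held in four __m128i registers:
+// the epi32 unpacks interleave adjacent rows, and the epi64 unpacks then
+// assemble the transposed rows.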
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
+  do {                                \
+    __m128i u0, u1, u2, u3;           \
+    u0 = _mm_unpacklo_epi32(x0, x1);  \
+    u1 = _mm_unpackhi_epi32(x0, x1);  \
+    u2 = _mm_unpacklo_epi32(x2, x3);  \
+    u3 = _mm_unpackhi_epi32(x2, x3);  \
+    y0 = _mm_unpacklo_epi64(u0, u2);  \
+    y1 = _mm_unpackhi_epi64(u0, u2);  \
+    y2 = _mm_unpacklo_epi64(u1, u3);  \
+    y3 = _mm_unpackhi_epi64(u1, u3);  \
+  } while (0)
+
+static INLINE void transpose_8x8(const __m128i *in, __m128i *out) {
+  TRANSPOSE_4X4(in[0], in[2], in[4], in[6],
+                out[0], out[2], out[4], out[6]);
+  TRANSPOSE_4X4(in[1], in[3], in[5], in[7],
+                out[8], out[10], out[12], out[14]);
+  TRANSPOSE_4X4(in[8], in[10], in[12], in[14],
+                out[1], out[3], out[5], out[7]);
+  TRANSPOSE_4X4(in[9], in[11], in[13], in[15],
+                out[9], out[11], out[13], out[15]);
+}
+
+static INLINE void transpose_16x16(const __m128i *in, __m128i *out) {
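+  // A row of 16 int32s spans four __m128i, so the matrix is transposed as
+  // an 8x8 grid of 4x4 tiles; the off-diagonal 8x8 quadrants swap places
+  // (e.g. the upper-right input tiles land in the lower-left outputs).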
+  // Upper left 8x8
+  TRANSPOSE_4X4(in[0], in[4], in[8], in[12],
+                out[0], out[4], out[8], out[12]);
+  TRANSPOSE_4X4(in[1], in[5], in[9], in[13],
+                out[16], out[20], out[24], out[28]);
+  TRANSPOSE_4X4(in[16], in[20], in[24], in[28],
+                out[1], out[5], out[9], out[13]);
+  TRANSPOSE_4X4(in[17], in[21], in[25], in[29],
+                out[17], out[21], out[25], out[29]);
+
+  // Upper right 8x8
+  TRANSPOSE_4X4(in[2], in[6], in[10], in[14],
+                out[32], out[36], out[40], out[44]);
+  TRANSPOSE_4X4(in[3], in[7], in[11], in[15],
+                out[48], out[52], out[56], out[60]);
+  TRANSPOSE_4X4(in[18], in[22], in[26], in[30],
+                out[33], out[37], out[41], out[45]);
+  TRANSPOSE_4X4(in[19], in[23], in[27], in[31],
+                out[49], out[53], out[57], out[61]);
+
+  // Lower left 8x8
+  TRANSPOSE_4X4(in[32], in[36], in[40], in[44],
+                out[2], out[6], out[10], out[14]);
+  TRANSPOSE_4X4(in[33], in[37], in[41], in[45],
+                out[18], out[22], out[26], out[30]);
+  TRANSPOSE_4X4(in[48], in[52], in[56], in[60],
+                out[3], out[7], out[11], out[15]);
+  TRANSPOSE_4X4(in[49], in[53], in[57], in[61],
+                out[19], out[23], out[27], out[31]);
+
+  // Lower right 8x8
+  TRANSPOSE_4X4(in[34], in[38], in[42], in[46],
+                out[34], out[38], out[42], out[46]);
+  TRANSPOSE_4X4(in[35], in[39], in[43], in[47],
+                out[50], out[54], out[58], out[62]);
+  TRANSPOSE_4X4(in[50], in[54], in[58], in[62],
+                out[35], out[39], out[43], out[47]);
+  TRANSPOSE_4X4(in[51], in[55], in[59], in[63],
+                out[51], out[55], out[59], out[63]);
+}
+
+// Note:
+//  rounding = 1 << (bit - 1)
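+//  Computes (w0 * n0 + w1 * n1 + rounding) >> bit on each 32-bit lane,
+//  i.e. one output of a cosine-weighted butterfly in Q(bit) fixed point.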
+static INLINE __m128i half_btf_sse4_1(__m128i w0, __m128i n0,
+                                      __m128i w1, __m128i n1,
+                                      __m128i rounding, int bit) {
+  __m128i x, y;
+
+  x = _mm_mullo_epi32(w0, n0);
+  y = _mm_mullo_epi32(w1, n1);
+  x = _mm_add_epi32(x, y);
+  x = _mm_add_epi32(x, rounding);
+  x = _mm_srai_epi32(x, bit);
+  return x;
+}
+
+#endif  // VP10_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
diff --git a/vp10/common/x86/vp10_fwd_txfm2d_sse4.c b/vp10/common/x86/vp10_fwd_txfm2d_sse4.c
index 499e58d..1d70f14 100644
--- a/vp10/common/x86/vp10_fwd_txfm2d_sse4.c
+++ b/vp10/common/x86/vp10_fwd_txfm2d_sse4.c
@@ -8,7 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp10/common/vp10_fwd_txfm2d_cfg.h"
+#include "vp10/common/enums.h"
+#include "vp10/common/vp10_txfm.h"
 #include "vp10/common/x86/vp10_txfm1d_sse4.h"
 
 static INLINE void int16_array_with_stride_to_int32_array_without_stride(
@@ -91,16 +92,16 @@
                                   const int stride, int tx_type,
                                   const int bd) {
   int32_t txfm_buf[1024];
-  const TXFM_2D_CFG* cfg = vp10_get_txfm_32x32_cfg(tx_type);
+  TXFM_2D_FLIP_CFG cfg = vp10_get_fwd_txfm_cfg(tx_type, TX_32X32);
   (void)bd;
-  fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf);
+  fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf);
 }
 
 void vp10_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
                                   const int stride, int tx_type,
                                   const int bd) {
   int32_t txfm_buf[4096];
-  const TXFM_2D_CFG* cfg = vp10_get_txfm_64x64_cfg(tx_type);
+  TXFM_2D_FLIP_CFG cfg = vp10_get_fwd_txfm_64x64_cfg(tx_type);
   (void)bd;
-  fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf);
+  fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf);
 }
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index d3d9780..e3dadaf 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -57,6 +57,14 @@
 }
 
 static void setup_compound_reference_mode(VP10_COMMON *cm) {
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
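+  // With bidirectional prediction the forward ({LAST, GOLDEN}) and
+  // backward ({BWDREF, ALTREF}) reference sets are fixed, so there is no
+  // sign-bias-driven selection to perform.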
+  cm->comp_fwd_ref[0] = LAST_FRAME;
+  cm->comp_fwd_ref[1] = GOLDEN_FRAME;
+  cm->comp_bwd_ref[0] = BWDREF_FRAME;
+  cm->comp_bwd_ref[1] = ALTREF_FRAME;
+
+#else  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
   if (cm->ref_frame_sign_bias[LAST_FRAME] ==
           cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
     cm->comp_fixed_ref = ALTREF_FRAME;
@@ -66,7 +74,7 @@
     cm->comp_var_ref[2] = LAST3_FRAME;
     cm->comp_var_ref[3] = LAST4_FRAME;
     cm->comp_var_ref[4] = GOLDEN_FRAME;
-#else
+#else  // CONFIG_EXT_REFS
     cm->comp_var_ref[1] = GOLDEN_FRAME;
 #endif  // CONFIG_EXT_REFS
   } else if (cm->ref_frame_sign_bias[LAST_FRAME] ==
@@ -85,6 +93,7 @@
     cm->comp_var_ref[0] = GOLDEN_FRAME;
     cm->comp_var_ref[1] = ALTREF_FRAME;
   }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
 }
 
 static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
@@ -171,9 +180,15 @@
 
   if (cm->reference_mode != SINGLE_REFERENCE) {
     for (i = 0; i < REF_CONTEXTS; ++i) {
-      for (j = 0; j < (COMP_REFS - 1); ++j) {
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
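+      // Under bidirectional prediction, the forward and backward compound
+      // reference probabilities form two separately updated sets.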
+      for (j = 0; j < (FWD_REFS - 1); ++j)
         vp10_diff_update_prob(r, &fc->comp_ref_prob[i][j]);
-      }
+      for (j = 0; j < (BWD_REFS - 1); ++j)
+        vp10_diff_update_prob(r, &fc->comp_bwdref_prob[i][j]);
+#else
+      for (j = 0; j < (COMP_REFS - 1); ++j)
+        vp10_diff_update_prob(r, &fc->comp_ref_prob[i][j]);
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
     }
   }
 }
@@ -380,572 +395,6 @@
 }
 #endif  // !CONFIG_VAR_TX || CONFIG_SUPER_TX
 
-#if CONFIG_SUPERTX
-static void build_mc_border(const uint8_t *src, int src_stride,
-                            uint8_t *dst, int dst_stride,
-                            int x, int y, int b_w, int b_h, int w, int h) {
-  // Get a pointer to the start of the real data for this row.
-  const uint8_t *ref_row = src - x - y * src_stride;
-
-  if (y >= h)
-    ref_row += (h - 1) * src_stride;
-  else if (y > 0)
-    ref_row += y * src_stride;
-
-  do {
-    int right = 0, copy;
-    int left = x < 0 ? -x : 0;
-
-    if (left > b_w)
-      left = b_w;
-
-    if (x + b_w > w)
-      right = x + b_w - w;
-
-    if (right > b_w)
-      right = b_w;
-
-    copy = b_w - left - right;
-
-    if (left)
-      memset(dst, ref_row[0], left);
-
-    if (copy)
-      memcpy(dst + left, ref_row + x + left, copy);
-
-    if (right)
-      memset(dst + left + copy, ref_row[w - 1], right);
-
-    dst += dst_stride;
-    ++y;
-
-    if (y > 0 && y < h)
-      ref_row += src_stride;
-  } while (--b_h);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void build_mc_border_highbd(const uint8_t *src8, int src_stride,
-                                   uint16_t *dst, int dst_stride,
-                                   int x, int y, int b_w, int b_h,
-                                   int w, int h) {
-  // Get a pointer to the start of the real data for this row.
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  const uint16_t *ref_row = src - x - y * src_stride;
-
-  if (y >= h)
-    ref_row += (h - 1) * src_stride;
-  else if (y > 0)
-    ref_row += y * src_stride;
-
-  do {
-    int right = 0, copy;
-    int left = x < 0 ? -x : 0;
-
-    if (left > b_w)
-      left = b_w;
-
-    if (x + b_w > w)
-      right = x + b_w - w;
-
-    if (right > b_w)
-      right = b_w;
-
-    copy = b_w - left - right;
-
-    if (left)
-      vpx_memset16(dst, ref_row[0], left);
-
-    if (copy)
-      memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t));
-
-    if (right)
-      vpx_memset16(dst + left + copy, ref_row[w - 1], right);
-
-    dst += dst_stride;
-    ++y;
-
-    if (y > 0 && y < h)
-      ref_row += src_stride;
-  } while (--b_h);
-}
-
-static void extend_and_predict_highbd(const uint8_t *buf_ptr1,
-                                      int pre_buf_stride,
-                                      int x0, int y0, int b_w, int b_h,
-                                      int frame_width, int frame_height,
-                                      int border_offset,
-                                      uint8_t *const dst, int dst_buf_stride,
-                                      int subpel_x, int subpel_y,
-#if CONFIG_DUAL_FILTER
-                                      const INTERP_FILTER *interp_filter,
-#else
-                                      const INTERP_FILTER interp_filter,
-#endif
-                                      const struct scale_factors *sf,
-#if CONFIG_EXT_INTER
-                                      int wedge_offset_x, int wedge_offset_y,
-#endif  // CONFIG_EXT_INTER
-                                      MACROBLOCKD *xd,
-                                      int w, int h, int ref, int xs, int ys) {
-  DECLARE_ALIGNED(16, uint16_t,
-    mc_buf_high[(MAX_SB_SIZE + 16) * 2 * (MAX_SB_SIZE + 16) * 2]);
-  const uint8_t *buf_ptr;
-
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    build_mc_border_highbd(buf_ptr1, pre_buf_stride, mc_buf_high, b_w,
-                           x0, y0, b_w, b_h, frame_width, frame_height);
-    buf_ptr = CONVERT_TO_BYTEPTR(mc_buf_high) + border_offset;
-  } else {
-    build_mc_border(buf_ptr1, pre_buf_stride, (uint8_t *)mc_buf_high, b_w,
-                    x0, y0, b_w, b_h, frame_width, frame_height);
-    buf_ptr = ((uint8_t *)mc_buf_high) + border_offset;
-  }
-#if CONFIG_EXT_INTER
-  if (ref && is_interinter_wedge_used(xd->mi[0]->mbmi.sb_type) &&
-      xd->mi[0]->mbmi.use_wedge_interinter)
-    vp10_make_masked_inter_predictor(
-        buf_ptr, b_w, dst, dst_buf_stride,
-        subpel_x, subpel_y, sf, w, h,
-        interp_filter, xs, ys,
-        wedge_offset_x, wedge_offset_y,
-        xd);
-  else
-#endif  // CONFIG_EXT_INTER
-    vp10_make_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride,
-                              subpel_x, subpel_y, sf, w, h, ref,
-                              interp_filter, xs, ys, xd);
-}
-
-#else
-
-static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
-                               int x0, int y0, int b_w, int b_h,
-                               int frame_width, int frame_height,
-                               int border_offset,
-                               uint8_t *const dst, int dst_buf_stride,
-                               int subpel_x, int subpel_y,
-#if CONFIG_DUAL_FILTER
-                               const INTERP_FILTER *interp_filter,
-#else
-                               const INTERP_FILTER interp_filter,
-#endif
-                               const struct scale_factors *sf,
-#if CONFIG_EXT_INTER
-                               int wedge_offset_x, int wedge_offset_y,
-#endif  // CONFIG_EXT_INTER
-                               MACROBLOCKD *xd,
-                               int w, int h, int ref, int xs, int ys) {
-  DECLARE_ALIGNED(16, uint8_t,
-    mc_buf[(MAX_SB_SIZE + 16) * 2 * (MAX_SB_SIZE + 16) * 2]);
-  const uint8_t *buf_ptr;
-
-  build_mc_border(buf_ptr1, pre_buf_stride, mc_buf, b_w,
-                  x0, y0, b_w, b_h, frame_width, frame_height);
-  buf_ptr = mc_buf + border_offset;
-#if CONFIG_EXT_INTER
-  if (ref && is_interinter_wedge_used(xd->mi[0]->mbmi.sb_type) &&
-      xd->mi[0]->mbmi.use_wedge_interinter)
-    vp10_make_masked_inter_predictor(
-        buf_ptr, b_w, dst, dst_buf_stride,
-        subpel_x, subpel_y, sf, w, h,
-        interp_filter, xs, ys,
-        wedge_offset_x, wedge_offset_y,
-        xd);
-  else
-#endif  // CONFIG_EXT_INTER
-    vp10_make_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride,
-                              subpel_x, subpel_y, sf, w, h, ref,
-                              interp_filter, xs, ys, xd);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-static void dec_build_inter_predictors(VP10Decoder *const pbi,
-                                       MACROBLOCKD *xd, int plane,
-#if CONFIG_OBMC
-                                       int mi_col_offset, int mi_row_offset,
-#endif  // CONFIG_OBMC
-                                       int bw, int bh,
-                                       int x, int y, int w, int h,
-#if CONFIG_EXT_INTER
-                                       int wedge_offset_x, int wedge_offset_y,
-#endif  // CONFIG_EXT_INTER
-                                       int mi_x, int mi_y,
-#if CONFIG_DUAL_FILTER
-                                       const INTERP_FILTER *interp_filter,
-#else
-                                       const INTERP_FILTER interp_filter,
-#endif
-                                       const struct scale_factors *sf,
-                                       struct buf_2d *pre_buf,
-                                       struct buf_2d *dst_buf, const MV* mv,
-                                       RefCntBuffer *ref_frame_buf,
-                                       int is_scaled, int ref) {
-  VP10_COMMON *const cm = &pbi->common;
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
-  MV32 scaled_mv;
-  MV mv_q4;
-  int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height,
-      buf_stride, subpel_x, subpel_y;
-  uint8_t *ref_frame, *buf_ptr;
-#if CONFIG_EXT_INTER
-#if CONFIG_OBMC
-  const MODE_INFO *mi = xd->mi[mi_col_offset + xd->mi_stride * mi_row_offset];
-#else
-  const MODE_INFO *mi = xd->mi[0];
-#endif  // CONFIG_OBMC
-#endif  // CONFIG_EXT_INTER
-#if CONFIG_EXT_INTERP
-  const int i_filter = IsInterpolatingFilter(interp_filter);
-#endif  // CONFIG_EXT_INTERP
-#if CONFIG_OBMC
-  (void) mi_col_offset;
-  (void) mi_row_offset;
-#endif  // CONFIG_OBMC
-
-  // Get reference frame pointer, width and height.
-  if (plane == 0) {
-    frame_width = ref_frame_buf->buf.y_crop_width;
-    frame_height = ref_frame_buf->buf.y_crop_height;
-    ref_frame = ref_frame_buf->buf.y_buffer;
-  } else {
-    frame_width = ref_frame_buf->buf.uv_crop_width;
-    frame_height = ref_frame_buf->buf.uv_crop_height;
-    ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer
-                         : ref_frame_buf->buf.v_buffer;
-  }
-
-  mv_q4 = clamp_mv_to_umv_border_sb(xd, mv, bw, bh,
-                                    pd->subsampling_x,
-                                    pd->subsampling_y);
-  if (is_scaled) {
-    // Co-ordinate of containing block to pixel precision.
-    int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
-    int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
-
-    // Co-ordinate of the block to 1/16th pixel precision.
-    x0_16 = (x_start + x) << SUBPEL_BITS;
-    y0_16 = (y_start + y) << SUBPEL_BITS;
-
-    // Co-ordinate of current block in reference frame
-    // to 1/16th pixel precision.
-    x0_16 = sf->scale_value_x(x0_16, sf);
-    y0_16 = sf->scale_value_y(y0_16, sf);
-
-    // Map the top left corner of the block into the reference frame.
-    x0 = sf->scale_value_x(x_start + x, sf);
-    y0 = sf->scale_value_y(y_start + y, sf);
-
-    // Scale the MV and incorporate the sub-pixel offset of the block
-    // in the reference frame.
-    scaled_mv = vp10_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
-    xs = sf->x_step_q4;
-    ys = sf->y_step_q4;
-  } else {
-    // Co-ordinate of containing block to pixel precision.
-    x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
-    y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
-
-    // Co-ordinate of the block to 1/16th pixel precision.
-    x0_16 = x0 << SUBPEL_BITS;
-    y0_16 = y0 << SUBPEL_BITS;
-
-    scaled_mv.row = mv_q4.row;
-    scaled_mv.col = mv_q4.col;
-    xs = ys = 16;
-  }
-  subpel_x = scaled_mv.col & SUBPEL_MASK;
-  subpel_y = scaled_mv.row & SUBPEL_MASK;
-
-  // Calculate the top left corner of the best matching block in the
-  // reference frame.
-  x0 += scaled_mv.col >> SUBPEL_BITS;
-  y0 += scaled_mv.row >> SUBPEL_BITS;
-  x0_16 += scaled_mv.col;
-  y0_16 += scaled_mv.row;
-
-  // Get reference block pointer.
-  buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
-  buf_stride = pre_buf->stride;
-
-  // Do border extension if there is motion or the
-  // width/height is not a multiple of 8 pixels.
-  if (is_scaled || scaled_mv.col || scaled_mv.row ||
-#if CONFIG_EXT_INTERP
-      !i_filter ||
-#endif
-      (frame_width & 0x7) || (frame_height & 0x7)) {
-    int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
-
-    // Get reference block bottom right horizontal coordinate.
-    int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
-    int x_pad = 0, y_pad = 0;
-
-#if CONFIG_DUAL_FILTER
-    InterpFilterParams filter_params_y =
-        vp10_get_interp_filter_params(interp_filter[0]);
-    InterpFilterParams filter_params_x =
-        vp10_get_interp_filter_params(interp_filter[1]);
-    int filter_size = VPXMAX(filter_params_y.taps, filter_params_x.taps);
-#else
-    InterpFilterParams filter_params =
-        vp10_get_interp_filter_params(interp_filter);
-    int filter_size = filter_params.taps;
-#endif
-
-    if (subpel_x ||
-#if CONFIG_EXT_INTERP
-        !i_filter ||
-#endif
-        (sf->x_step_q4 != SUBPEL_SHIFTS)) {
-      x0 -= filter_size / 2 - 1;
-      x1 += filter_size / 2;
-      x_pad = 1;
-    }
-
-    if (subpel_y ||
-#if CONFIG_EXT_INTERP
-        !i_filter ||
-#endif
-        (sf->y_step_q4 != SUBPEL_SHIFTS)) {
-      y0 -= filter_size / 2 - 1;
-      y1 += filter_size / 2;
-      y_pad = 1;
-    }
-
-    // Wait until reference block is ready. Pad 7 more pixels as last 7
-    // pixels of each superblock row can be changed by next superblock row.
-    if (cm->frame_parallel_decode)
-      vp10_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
-                            VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
-
-    // Skip border extension if block is inside the frame.
-    if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
-        y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
-      // Extend the border.
-      const uint8_t *const buf_ptr1 = ref_frame + y0 * buf_stride + x0;
-      const int b_w = x1 - x0 + 1;
-      const int b_h = y1 - y0 + 1;
-      const int border_offset = y_pad * (filter_size / 2 - 1) * b_w +
-                                x_pad * (filter_size / 2 - 1);
-
-#if CONFIG_VP9_HIGHBITDEPTH
-      extend_and_predict_highbd(buf_ptr1, buf_stride, x0, y0, b_w, b_h,
-                                frame_width, frame_height, border_offset,
-                                dst, dst_buf->stride,
-                                subpel_x, subpel_y,
-                                interp_filter, sf,
-#if CONFIG_EXT_INTER
-                                wedge_offset_x, wedge_offset_y,
-#endif  // CONFIG_EXT_INTER
-                                xd, w, h, ref, xs, ys);
-#else
-      extend_and_predict(buf_ptr1, buf_stride, x0, y0, b_w, b_h,
-                         frame_width, frame_height, border_offset,
-                         dst, dst_buf->stride,
-                         subpel_x, subpel_y,
-                         interp_filter, sf,
-#if CONFIG_EXT_INTER
-                         wedge_offset_x, wedge_offset_y,
-#endif  // CONFIG_EXT_INTER
-                         xd, w, h, ref, xs, ys);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-      return;
-    }
-  } else {
-    // Wait until reference block is ready. Pad 7 more pixels as last 7
-    // pixels of each superblock row can be changed by next superblock row.
-     if (cm->frame_parallel_decode) {
-       const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
-       vp10_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
-                             VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
-     }
-  }
-#if CONFIG_EXT_INTER
-  if (ref && is_interinter_wedge_used(mi->mbmi.sb_type) &&
-      mi->mbmi.use_wedge_interinter)
-    vp10_make_masked_inter_predictor(
-        buf_ptr, buf_stride, dst, dst_buf->stride,
-        subpel_x, subpel_y, sf, w, h,
-        interp_filter, xs, ys,
-        wedge_offset_x, wedge_offset_y,
-        xd);
-  else
-#endif  // CONFIG_EXT_INTER
-    vp10_make_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride,
-                              subpel_x, subpel_y, sf, w, h, ref,
-                              interp_filter, xs, ys, xd);
-}
-
-static void dec_build_inter_predictors_sb_extend(
-    VP10Decoder *const pbi, MACROBLOCKD *xd,
-#if CONFIG_EXT_INTER
-    int mi_row_ori, int mi_col_ori,
-#endif  // CONFIG_EXT_INTER
-    int mi_row, int mi_col) {
-  int plane;
-  const int mi_x = mi_col * MI_SIZE;
-  const int mi_y = mi_row * MI_SIZE;
-#if CONFIG_EXT_INTER
-  const int wedge_offset_x = (mi_col_ori - mi_col) * MI_SIZE;
-  const int wedge_offset_y = (mi_row_ori - mi_row) * MI_SIZE;
-#endif  // CONFIG_EXT_INTER
-  const MODE_INFO *mi = xd->mi[0];
-  const BLOCK_SIZE sb_type = mi->mbmi.sb_type;
-  const int is_compound = has_second_ref(&mi->mbmi);
-
-  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-
-    struct buf_2d *const dst_buf = &pd->dst;
-    const int num_4x4_w = pd->n4_w;
-    const int num_4x4_h = pd->n4_h;
-
-    const int n4w_x4 = 4 * num_4x4_w;
-    const int n4h_x4 = 4 * num_4x4_h;
-    int ref;
-
-    for (ref = 0; ref < 1 + is_compound; ++ref) {
-      const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
-      struct buf_2d *const pre_buf = &pd->pre[ref];
-      const int idx = xd->block_refs[ref]->idx;
-      BufferPool *const pool = pbi->common.buffer_pool;
-      RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
-      const int is_scaled = vp10_is_scaled(sf);
-
-      if (sb_type < BLOCK_8X8) {
-        const PARTITION_TYPE bp = BLOCK_8X8 - sb_type;
-        const int have_vsplit = bp != PARTITION_HORZ;
-        const int have_hsplit = bp != PARTITION_VERT;
-        const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
-        const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
-        const int pw = 8 >> (have_vsplit | pd->subsampling_x);
-        const int ph = 8 >> (have_hsplit | pd->subsampling_y);
-        int x, y;
-        for (y = 0; y < num_4x4_h; ++y) {
-          for (x = 0; x < num_4x4_w; ++x) {
-            const MV mv = average_split_mvs(pd, mi, ref, y * 2 + x);
-            dec_build_inter_predictors(
-                pbi, xd, plane,
-#if CONFIG_OBMC
-                0, 0,
-#endif  // CONFIG_OBMC
-                n4w_x4, n4h_x4,
-                4 * x, 4 * y, pw, ph,
-#if CONFIG_EXT_INTER
-                wedge_offset_x,
-                wedge_offset_y,
-#endif  // CONFIG_EXT_INTER
-                mi_x, mi_y,
-                mi->mbmi.interp_filter, sf, pre_buf, dst_buf,
-                &mv, ref_frame_buf, is_scaled, ref);
-          }
-        }
-      } else {
-        const MV mv = mi->mbmi.mv[ref].as_mv;
-        dec_build_inter_predictors(
-            pbi, xd, plane,
-#if CONFIG_OBMC
-            0, 0,
-#endif  // CONFIG_OBMC
-            n4w_x4, n4h_x4,
-            0, 0, n4w_x4, n4h_x4,
-#if CONFIG_EXT_INTER
-            wedge_offset_x,
-            wedge_offset_y,
-#endif  // CONFIG_EXT_INTER
-            mi_x, mi_y,
-            mi->mbmi.interp_filter, sf, pre_buf, dst_buf,
-            &mv, ref_frame_buf,
-            is_scaled, ref);
-      }
-    }
-  }
-#if CONFIG_EXT_INTER
-  if (is_interintra_pred(&mi->mbmi))
-    vp10_build_interintra_predictors(xd,
-                                     xd->plane[0].dst.buf,
-                                     xd->plane[1].dst.buf,
-                                     xd->plane[2].dst.buf,
-                                     xd->plane[0].dst.stride,
-                                     xd->plane[1].dst.stride,
-                                     xd->plane[2].dst.stride,
-                                     sb_type);
-#endif  // CONFIG_EXT_INTER
-}
-
-static void dec_build_inter_predictors_sb_sub8x8_extend(
-    VP10Decoder *const pbi,
-    MACROBLOCKD *xd,
-#if CONFIG_EXT_INTER
-    int mi_row_ori, int mi_col_ori,
-#endif  // CONFIG_EXT_INTER
-    int mi_row, int mi_col,
-    int block) {
-  // Prediction function used in supertx:
-  // Use the mv at current block (which is less than 8x8)
-  int plane;
-  const int mi_x = mi_col * MI_SIZE;
-  const int mi_y = mi_row * MI_SIZE;
-#if CONFIG_EXT_INTER
-  const int wedge_offset_x = (mi_col_ori - mi_col) * MI_SIZE;
-  const int wedge_offset_y = (mi_row_ori - mi_row) * MI_SIZE;
-#endif  // CONFIG_EXT_INTER
-  const MODE_INFO *mi = xd->mi[0];
-  const int is_compound = has_second_ref(&mi->mbmi);
-
-  // For sub8x8 uv:
-  // Skip uv prediction in supertx except the first block (block = 0)
-  int max_plane = block ? 1 : MAX_MB_PLANE;
-
-  for (plane = 0; plane < max_plane; ++plane) {
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-    struct buf_2d *const dst_buf = &pd->dst;
-    const int num_4x4_w = pd->n4_w;
-    const int num_4x4_h = pd->n4_h;
-
-    const int n4w_x4 = 4 * num_4x4_w;
-    const int n4h_x4 = 4 * num_4x4_h;
-    int ref;
-
-    for (ref = 0; ref < 1 + is_compound; ++ref) {
-      const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
-      struct buf_2d *const pre_buf = &pd->pre[ref];
-      const int idx = xd->block_refs[ref]->idx;
-      BufferPool *const pool = pbi->common.buffer_pool;
-      RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
-      const int is_scaled = vp10_is_scaled(sf);
-      const MV mv = average_split_mvs(pd, mi, ref, block);
-      dec_build_inter_predictors(pbi, xd, plane,
-#if CONFIG_OBMC
-                                 0, 0,
-#endif  // CONFIG_OBMC
-                                 n4w_x4, n4h_x4,
-                                 0, 0, n4w_x4, n4h_x4,
-#if CONFIG_EXT_INTER
-                                 wedge_offset_x,
-                                 wedge_offset_y,
-#endif  // CONFIG_EXT_INTER
-                                 mi_x, mi_y,
-                                 mi->mbmi.interp_filter, sf, pre_buf, dst_buf,
-                                 &mv, ref_frame_buf, is_scaled, ref);
-    }
-  }
-#if CONFIG_EXT_INTER
-  if (is_interintra_pred(&mi->mbmi))
-    vp10_build_interintra_predictors(xd,
-                                     xd->plane[0].dst.buf,
-                                     xd->plane[1].dst.buf,
-                                     xd->plane[2].dst.buf,
-                                     xd->plane[0].dst.stride,
-                                     xd->plane[1].dst.stride,
-                                     xd->plane[2].dst.stride,
-                                     mi->mbmi.sb_type);
-#endif  // CONFIG_EXT_INTER
-}
-#endif  // CONFIG_SUPERTX
-
 static INLINE TX_SIZE dec_get_uv_tx_size(const MB_MODE_INFO *mbmi,
                                          int n4_wl, int n4_hl) {
   // get minimum log2 num4x4s dimension
@@ -1171,19 +620,19 @@
                          (c >> xd->plane[2].subsampling_x);
 
   if (!b_sub8x8)
-    dec_build_inter_predictors_sb_extend(
-        pbi, xd,
+    vp10_build_inter_predictors_sb_extend(
+        xd,
 #if CONFIG_EXT_INTER
         mi_row_ori, mi_col_ori,
 #endif  // CONFIG_EXT_INTER
-        mi_row_pred, mi_col_pred);
+        mi_row_pred, mi_col_pred, bsize_pred);
   else
-    dec_build_inter_predictors_sb_sub8x8_extend(
-        pbi, xd,
+    vp10_build_inter_predictors_sb_sub8x8_extend(
+        xd,
 #if CONFIG_EXT_INTER
         mi_row_ori, mi_col_ori,
 #endif  // CONFIG_EXT_INTER
-        mi_row_pred, mi_col_pred, block);
+        mi_row_pred, mi_col_pred, bsize_pred, block);
 }
 
 static void dec_extend_dir(VP10Decoder *const pbi, MACROBLOCKD *const xd,
@@ -3615,6 +3064,11 @@
   cm->last_frame_type = cm->frame_type;
   cm->last_intra_only = cm->intra_only;
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  // NOTE: By default, every coded frame is allowed to be used as a
+  //       reference.
+  cm->is_reference_frame = 1;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
   if (vpx_rb_read_literal(rb, 2) != VP9_FRAME_MARKER)
       vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                          "Invalid frame marker");
@@ -3631,9 +3085,11 @@
 #endif
 
   cm->show_existing_frame = vpx_rb_read_bit(rb);
+
   if (cm->show_existing_frame) {
     // Show an existing frame directly.
     const int frame_to_show = cm->ref_frame_map[vpx_rb_read_literal(rb, 3)];
+
     lock_buffer_pool(pool);
     if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
       unlock_buffer_pool(pool);
@@ -3641,17 +3097,72 @@
                          "Buffer %d does not contain a decoded frame",
                          frame_to_show);
     }
-
     ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
     unlock_buffer_pool(pool);
-    pbi->refresh_frame_flags = 0;
+
     cm->lf.filter_level = 0;
     cm->show_frame = 1;
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    // NOTE(zoeliu): The existing frame to show is adopted as a reference frame.
+    pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES);
+
+    for (i = 0; i < REFS_PER_FRAME; ++i) {
+      const int ref = vpx_rb_read_literal(rb, REF_FRAMES_LOG2);
+      const int idx = cm->ref_frame_map[ref];
+      RefBuffer *const ref_frame = &cm->frame_refs[i];
+      ref_frame->idx = idx;
+      ref_frame->buf = &frame_bufs[idx].buf;
+      cm->ref_frame_sign_bias[LAST_FRAME + i] = vpx_rb_read_bit(rb);
+    }
+
+    for (i = 0; i < REFS_PER_FRAME; ++i) {
+      RefBuffer *const ref_buf = &cm->frame_refs[i];
+#if CONFIG_VP9_HIGHBITDEPTH
+      vp10_setup_scale_factors_for_frame(&ref_buf->sf,
+                                         ref_buf->buf->y_crop_width,
+                                         ref_buf->buf->y_crop_height,
+                                         cm->width, cm->height,
+                                         cm->use_highbitdepth);
+#else  // CONFIG_VP9_HIGHBITDEPTH
+      vp10_setup_scale_factors_for_frame(&ref_buf->sf,
+                                         ref_buf->buf->y_crop_width,
+                                         ref_buf->buf->y_crop_height,
+                                         cm->width, cm->height);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+
+    // Generate next_ref_frame_map.
+    lock_buffer_pool(pool);
+    for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+      if (mask & 1) {
+        cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
+        ++frame_bufs[cm->new_fb_idx].ref_count;
+      } else {
+        cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+      }
+      // Current thread holds the reference frame.
+      if (cm->ref_frame_map[ref_index] >= 0)
+        ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+      ++ref_index;
+    }
+
+    for (; ref_index < REF_FRAMES; ++ref_index) {
+      cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+      // Current thread holds the reference frame.
+      if (cm->ref_frame_map[ref_index] >= 0)
+        ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+    }
+    unlock_buffer_pool(pool);
+    pbi->hold_ref_buf = 1;
+#else
+    pbi->refresh_frame_flags = 0;
     if (cm->frame_parallel_decode) {
       for (i = 0; i < REF_FRAMES; ++i)
         cm->next_ref_frame_map[i] = cm->ref_frame_map[i];
     }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
     return 0;
   }
 
@@ -3715,6 +3226,15 @@
       }
     } else if (pbi->need_resync != 1) {  /* Skip if need resync */
       pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES);
+
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      if (!pbi->refresh_frame_flags) {
+        // NOTE: "pbi->refresh_frame_flags == 0" indicates that the coded
+        //       frame will not be used as a reference.
+        cm->is_reference_frame = 0;
+      }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
       for (i = 0; i < REFS_PER_FRAME; ++i) {
         const int ref = vpx_rb_read_literal(rb, REF_FRAMES_LOG2);
         const int idx = cm->ref_frame_map[ref];
@@ -4036,6 +3556,7 @@
 
     if (cm->reference_mode != SINGLE_REFERENCE)
       setup_compound_reference_mode(cm);
+
     read_frame_reference_mode_probs(cm, &r);
 
     for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
@@ -4107,6 +3628,10 @@
                  sizeof(cm->counts.comp_ref)));
   assert(!memcmp(&cm->counts.tx_size, &zero_counts.tx_size,
                  sizeof(cm->counts.tx_size)));
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  assert(!memcmp(cm->counts.comp_bwdref, zero_counts.comp_bwdref,
+                 sizeof(cm->counts.comp_bwdref)));
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   assert(!memcmp(cm->counts.skip, zero_counts.skip, sizeof(cm->counts.skip)));
 #if CONFIG_REF_MV
   assert(!memcmp(&cm->counts.mv[0], &zero_counts.mv[0],
@@ -4181,7 +3706,13 @@
 
   if (!first_partition_size) {
     // showing a frame directly
-    *p_data_end = data + (cm->profile <= PROFILE_2 ? 1 : 2);
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    if (cm->show_existing_frame)
+      *p_data_end = data + vpx_rb_bytes_read(&rb);
+    else
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      *p_data_end = data + (cm->profile <= PROFILE_2 ? 1 : 2);
+
     return;
   }
 
diff --git a/vp10/decoder/decodemv.c b/vp10/decoder/decodemv.c
index fd14ef5..a25fe7a 100644
--- a/vp10/decoder/decodemv.c
+++ b/vp10/decoder/decodemv.c
@@ -759,11 +759,28 @@
     const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
     // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
     if (mode == COMPOUND_REFERENCE) {
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      const int idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]];
+#else
       const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
       const int ctx = vp10_get_pred_context_comp_ref_p(cm, xd);
       const int bit = vp10_read(r, fc->comp_ref_prob[ctx][0]);
       if (counts)
         ++counts->comp_ref[ctx][0][bit];
+
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
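+      // `idx` selects the slot that holds the backward reference; the
+      // forward reference takes the other slot.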
+      ref_frame[!idx] = cm->comp_fwd_ref[bit];
+      {
+        const int ctx1 = vp10_get_pred_context_comp_bwdref_p(cm, xd);
+        const int bit1 = vp10_read(r, fc->comp_bwdref_prob[ctx1][0]);
+        if (counts)
+          ++counts->comp_bwdref[ctx1][0][bit1];
+        ref_frame[idx] = cm->comp_bwd_ref[bit1];
+      }
+
+#else  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
       ref_frame[idx] = cm->comp_fixed_ref;
 
 #if CONFIG_EXT_REFS
@@ -788,9 +805,10 @@
           ref_frame[!idx] = cm->comp_var_ref[4];
         }
       }
-#else
+#else  // CONFIG_EXT_REFS
       ref_frame[!idx] = cm->comp_var_ref[bit];
 #endif  // CONFIG_EXT_REFS
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
     } else if (mode == SINGLE_REFERENCE) {
 #if CONFIG_EXT_REFS
       const int ctx0 = vp10_get_pred_context_single_ref_p1(xd);
@@ -822,7 +840,7 @@
           ref_frame[0] = bit3 ? LAST2_FRAME : LAST_FRAME;
         }
       }
-#else
+#else  // CONFIG_EXT_REFS
       const int ctx0 = vp10_get_pred_context_single_ref_p1(xd);
       const int bit0 = vp10_read(r, fc->single_ref_prob[ctx0][0]);
       if (counts)
@@ -832,7 +850,19 @@
         const int bit1 = vp10_read(r, fc->single_ref_prob[ctx1][1]);
         if (counts)
           ++counts->single_ref[ctx1][1][bit1];
+#if CONFIG_BIDIR_PRED
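+        // When bit1 is set, a third bit selects between the two backward
+        // references (BWDREF vs. ALTREF); otherwise GOLDEN is chosen.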
+        if (bit1) {
+          const int ctx2 = vp10_get_pred_context_single_ref_p3(xd);
+          const int bit2 = vp10_read(r, fc->single_ref_prob[ctx2][2]);
+          if (counts)
+            ++counts->single_ref[ctx2][2][bit2];
+          ref_frame[0] = bit2 ? ALTREF_FRAME : BWDREF_FRAME;
+        } else {
+          ref_frame[0] = GOLDEN_FRAME;
+        }
+#else  // CONFIG_BIDIR_PRED
         ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
+#endif  // CONFIG_BIDIR_PRED
       } else {
         ref_frame[0] = LAST_FRAME;
       }
@@ -1577,7 +1607,7 @@
           xd->counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
         if (mbmi->use_wedge_interintra) {
           mbmi->interintra_wedge_index =
-              vp10_read_literal(r, get_wedge_bits_lookup[bsize]);
+              vp10_read_literal(r, get_wedge_bits_lookup(bsize));
           mbmi->interintra_wedge_sign = 0;
         }
       }
@@ -1610,22 +1640,27 @@
       xd->counts->wedge_interinter[bsize][mbmi->use_wedge_interinter]++;
     if (mbmi->use_wedge_interinter) {
       mbmi->interinter_wedge_index =
-          vp10_read_literal(r, get_wedge_bits_lookup[bsize]);
+          vp10_read_literal(r, get_wedge_bits_lookup(bsize));
       mbmi->interinter_wedge_sign = vp10_read_bit(r);
     }
   }
 #endif  // CONFIG_EXT_INTER
 
 #if CONFIG_DUAL_FILTER
-  for (ref = 0; ref < 4; ++ref) {
-    const int frame_idx = (ref >> 1);
+  for (ref = 0; ref < 2; ++ref) {
     mbmi->interp_filter[ref] = (cm->interp_filter == SWITCHABLE) ?
         EIGHTTAP_REGULAR : cm->interp_filter;
 
-    if (mbmi->ref_frame[frame_idx] > INTRA_FRAME &&
-        has_subpel_mv_component(xd, ref))
+    if (has_subpel_mv_component(xd->mi[0], xd, ref) ||
+        (mbmi->ref_frame[1] > INTRA_FRAME &&
+         has_subpel_mv_component(xd->mi[0], xd, ref + 2)))
       mbmi->interp_filter[ref] = read_interp_filter(cm, xd, ref, r);
   }
+  // The index system works as follows:
+  // (0, 1) -> (vertical, horizontal) filter types for the first ref frame.
+  // (2, 3) -> (vertical, horizontal) filter types for the second ref frame.
+  mbmi->interp_filter[2] = mbmi->interp_filter[0];
+  mbmi->interp_filter[3] = mbmi->interp_filter[1];
 #else
 #if CONFIG_EXT_INTERP
   mbmi->interp_filter = read_interp_filter(cm, xd, r);
diff --git a/vp10/decoder/decoder.c b/vp10/decoder/decoder.c
index 6d567d6..573266e 100644
--- a/vp10/decoder/decoder.c
+++ b/vp10/decoder/decoder.c
@@ -214,6 +214,9 @@
   // #else  // CONFIG_EXT_REFS
   //   cpi->gld_fb_idx = 1;
   //   cpi->alt_fb_idx = 2;
+
+  // TODO(zoeliu): Revisit the following code and reconsider what assumptions
+  // we make about the reference frame buffer virtual indexes.
   if (ref_frame_flag == VP9_LAST_FLAG) {
     idx = cm->ref_frame_map[0];
 #if CONFIG_EXT_REFS
@@ -227,11 +230,18 @@
     idx = cm->ref_frame_map[4];
   } else if (ref_frame_flag == VP9_ALT_FLAG) {
     idx = cm->ref_frame_map[5];
-#else
+#else  // CONFIG_EXT_REFS
   } else if (ref_frame_flag == VP9_GOLD_FLAG) {
     idx = cm->ref_frame_map[1];
+#if CONFIG_BIDIR_PRED
+  } else if (ref_frame_flag == VP9_BWD_FLAG) {
+    idx = cm->ref_frame_map[2];
+  } else if (ref_frame_flag == VP9_ALT_FLAG) {
+    idx = cm->ref_frame_map[3];
+#else  // CONFIG_BIDIR_PRED
   } else if (ref_frame_flag == VP9_ALT_FLAG) {
     idx = cm->ref_frame_map[2];
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   } else {
     vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
@@ -281,15 +291,25 @@
   }
 
   // Current thread releases the holding of reference frame.
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  for (; ref_index < REF_FRAMES; ++ref_index) {
+    const int old_idx = cm->ref_frame_map[ref_index];
+    decrease_ref_count(old_idx, frame_bufs, pool);
+    cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+  }
+#else
   for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
     const int old_idx = cm->ref_frame_map[ref_index];
     decrease_ref_count(old_idx, frame_bufs, pool);
     cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
   }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   unlock_buffer_pool(pool);
   pbi->hold_ref_buf = 0;
   cm->frame_to_show = get_frame_new_buffer(cm);
 
+  // TODO(zoeliu): To fix the ref frame buffer update for the scenario of
+  //               cm->frame_parallel_decode == 1
   if (!cm->frame_parallel_decode || !cm->show_frame) {
     lock_buffer_pool(pool);
     --frame_bufs[cm->new_fb_idx].ref_count;
@@ -297,8 +317,10 @@
   }
 
   // Invalidate these references until the next frame starts.
-  for (ref_index = 0; ref_index < REFS_PER_FRAME; ref_index++)
-    cm->frame_refs[ref_index].idx = -1;
+  for (ref_index = 0; ref_index < REFS_PER_FRAME; ref_index++) {
+    cm->frame_refs[ref_index].idx = INVALID_IDX;
+    cm->frame_refs[ref_index].buf = NULL;
+  }
 }
 
 int vp10_receive_compressed_data(VP10Decoder *pbi,
@@ -327,12 +349,16 @@
 
   pbi->ready_for_new_data = 0;
 
+  // Find a free buffer for the new frame, releasing the reference previously
+  // held.
+
   // Check if the previous frame was a frame without any references to it.
   // Release frame buffer if not decoding in frame parallel mode.
   if (!cm->frame_parallel_decode && cm->new_fb_idx >= 0
       && frame_bufs[cm->new_fb_idx].ref_count == 0)
     pool->release_fb_cb(pool->cb_priv,
                         &frame_bufs[cm->new_fb_idx].raw_frame_buffer);
+
   // Find a free frame buffer. Return error if can not find any.
   cm->new_fb_idx = get_free_fb(cm);
   if (cm->new_fb_idx == INVALID_IDX)
@@ -386,10 +412,17 @@
       }
 
       // Current thread releases the holding of reference frame.
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      for (; ref_index < REF_FRAMES; ++ref_index) {
+        const int old_idx = cm->ref_frame_map[ref_index];
+        decrease_ref_count(old_idx, frame_bufs, pool);
+      }
+#else
       for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
         const int old_idx = cm->ref_frame_map[ref_index];
         decrease_ref_count(old_idx, frame_bufs, pool);
       }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
       pbi->hold_ref_buf = 0;
     }
     // Release current frame.
@@ -417,7 +450,13 @@
 
   if (!cm->show_existing_frame) {
     cm->last_show_frame = cm->show_frame;
-    cm->prev_frame = cm->cur_frame;
+
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    // NOTE: It is not supposed to refer to any frame not used as a reference
+    if (cm->is_reference_frame)
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      cm->prev_frame = cm->cur_frame;
+
     if (cm->seg.enabled && !cm->frame_parallel_decode)
       vp10_swap_current_and_last_seg_map(cm);
   }
@@ -483,6 +522,17 @@
   return ret;
 }
 
+int vp10_get_frame_to_show(VP10Decoder *pbi,
+                           YV12_BUFFER_CONFIG *frame) {
+  VP10_COMMON *const cm = &pbi->common;
+
+  if (!cm->show_frame || !cm->frame_to_show)
+    return -1;
+
+  *frame = *cm->frame_to_show;
+  return 0;
+}
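A possible caller-side sketch of the new accessor (hypothetical fragment; pbi is assumed to be a VP10Decoder instance in scope). The returned struct is a shallow copy, valid only while the decoder holds the underlying buffer:

/* Sketch: retrieving the decoder's show frame via the new accessor. */
YV12_BUFFER_CONFIG shown;
if (vp10_get_frame_to_show(pbi, &shown) == 0) {
  /* shown.y_buffer etc. alias cm->frame_to_show's planes; do not use them
   * after the underlying frame buffer is released. */
}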
+
 vpx_codec_err_t vp10_parse_superframe_index(const uint8_t *data,
                                            size_t data_sz,
                                            uint32_t sizes[8], int *count,
diff --git a/vp10/decoder/decoder.h b/vp10/decoder/decoder.h
index 0839e46..b34b009 100644
--- a/vp10/decoder/decoder.h
+++ b/vp10/decoder/decoder.h
@@ -104,6 +104,8 @@
 int vp10_get_raw_frame(struct VP10Decoder *pbi, YV12_BUFFER_CONFIG *sd,
                       vp10_ppflags_t *flags);
 
+int vp10_get_frame_to_show(struct VP10Decoder *pbi, YV12_BUFFER_CONFIG *frame);
+
 vpx_codec_err_t vp10_copy_reference_dec(struct VP10Decoder *pbi,
                                        VP9_REFFRAME ref_frame_flag,
                                        YV12_BUFFER_CONFIG *sd);
diff --git a/vp10/decoder/dthread.c b/vp10/decoder/dthread.c
index 4206adc..a4555c8 100644
--- a/vp10/decoder/dthread.c
+++ b/vp10/decoder/dthread.c
@@ -159,6 +159,10 @@
 #if CONFIG_VP9_HIGHBITDEPTH
   dst_cm->use_highbitdepth = src_cm->use_highbitdepth;
 #endif
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  // TODO(zoeliu): To handle parallel decoding
+  assert(0);
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   dst_cm->prev_frame = src_cm->show_existing_frame ?
                        src_cm->prev_frame : src_cm->cur_frame;
   dst_cm->last_width = !src_cm->show_existing_frame ?
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 4f8e89c..bca36df 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -835,8 +835,11 @@
       const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME ||
                        mbmi->ref_frame[0] == LAST3_FRAME ||
                        mbmi->ref_frame[0] == LAST4_FRAME);
-#else
+#else  // CONFIG_EXT_REFS
       const int bit = mbmi->ref_frame[0] == GOLDEN_FRAME;
+#if CONFIG_BIDIR_PRED
+      const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME;
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
       vp10_write(w, bit, vp10_get_pred_prob_comp_ref_p(cm, xd));
 
@@ -852,6 +855,10 @@
           vp10_write(w, bit3, vp10_get_pred_prob_comp_ref_p3(cm, xd));
         }
       }
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+      vp10_write(w, bit_bwd, vp10_get_pred_prob_comp_bwdref_p(cm, xd));
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
     } else {
 #if CONFIG_EXT_REFS
@@ -875,12 +882,18 @@
           vp10_write(w, bit4, vp10_get_pred_prob_single_ref_p5(cm, xd));
         }
       }
-#else
+#else  // CONFIG_EXT_REFS
       const int bit0 = mbmi->ref_frame[0] != LAST_FRAME;
       vp10_write(w, bit0, vp10_get_pred_prob_single_ref_p1(cm, xd));
       if (bit0) {
         const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME;
         vp10_write(w, bit1, vp10_get_pred_prob_single_ref_p2(cm, xd));
+#if CONFIG_BIDIR_PRED
+        if (bit1) {
+          const int bit2 = mbmi->ref_frame[0] != BWDREF_FRAME;
+          vp10_write(w, bit2, vp10_get_pred_prob_single_ref_p3(cm, xd));
+        }
+#endif  // CONFIG_BIDIR_PRED
       }
 #endif  // CONFIG_EXT_REFS
     }
@@ -938,10 +951,10 @@
 #endif  // CONFIG_DUAL_FILTER
 #endif  // CONFIG_EXT_INTERP
 #if CONFIG_DUAL_FILTER
-    for (dir = 0; dir < 4; ++dir) {
-      const int frame_idx = (dir >> 1);
-      if (mbmi->ref_frame[frame_idx] > INTRA_FRAME &&
-          has_subpel_mv_component(xd, dir)) {
+    for (dir = 0; dir < 2; ++dir) {
+      if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
+          (mbmi->ref_frame[1] > INTRA_FRAME &&
+           has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
         const int ctx = vp10_get_pred_context_switchable_interp(xd, dir);
         vp10_write_token(w, vp10_switchable_interp_tree,
               cm->fc->switchable_interp_prob[ctx],
@@ -1338,7 +1351,7 @@
                      cm->fc->wedge_interintra_prob[bsize]);
           if (mbmi->use_wedge_interintra) {
             vp10_write_literal(w, mbmi->interintra_wedge_index,
-                              get_wedge_bits_lookup[bsize]);
+                              get_wedge_bits_lookup(bsize));
             assert(mbmi->interintra_wedge_sign == 0);
           }
         }
@@ -1368,7 +1381,7 @@
                  cm->fc->wedge_interinter_prob[bsize]);
       if (mbmi->use_wedge_interinter) {
         vp10_write_literal(w, mbmi->interinter_wedge_index,
-                           get_wedge_bits_lookup[bsize]);
+                           get_wedge_bits_lookup(bsize));
         vp10_write_bit(w, mbmi->interinter_wedge_sign);
       }
     }
@@ -1578,7 +1591,31 @@
     // up if they are scaled. vp10_is_interp_needed is in turn needed by
     // write_switchable_interp_filter, which is called by pack_inter_mode_mvs.
     set_ref_ptrs(cm, xd, m->mbmi.ref_frame[0], m->mbmi.ref_frame[1]);
-#endif  // CONFIG_EXT_INTER
+#endif  // CONFIG_EXT_INTERP
+#if 0
+    // NOTE(zoeliu): For debug
+    if (cm->current_video_frame == FRAME_TO_CHECK && cm->show_frame == 1) {
+      const PREDICTION_MODE mode = m->mbmi.mode;
+      const int segment_id = m->mbmi.segment_id;
+      const BLOCK_SIZE bsize = m->mbmi.sb_type;
+
+      // For sub8x8, simply dump out the first sub8x8 block info
+      const PREDICTION_MODE b_mode =
+          (bsize < BLOCK_8X8) ? m->bmi[0].as_mode : -1;
+      const int mv_x = (bsize < BLOCK_8X8) ?
+          m->bmi[0].as_mv[0].as_mv.row : m->mbmi.mv[0].as_mv.row;
+      const int mv_y = (bsize < BLOCK_8X8) ?
+          m->bmi[0].as_mv[0].as_mv.col : m->mbmi.mv[0].as_mv.col;
+
+      printf("Before pack_inter_mode_mvs(): "
+             "Frame=%d, (mi_row,mi_col)=(%d,%d), "
+             "mode=%d, segment_id=%d, bsize=%d, b_mode=%d, "
+             "mv[0]=(%d, %d), ref[0]=%d, ref[1]=%d\n",
+             cm->current_video_frame, mi_row, mi_col,
+             mode, segment_id, bsize, b_mode, mv_x, mv_y,
+             m->mbmi.ref_frame[0], m->mbmi.ref_frame[1]);
+    }
+#endif  // 0
     pack_inter_mode_mvs(cpi, m,
 #if CONFIG_SUPERTX
                         supertx_enabled,
@@ -2643,8 +2680,12 @@
   //     LAST4_FRAME.
   refresh_mask |= (cpi->refresh_last_frame <<
       cpi->lst_fb_idxes[LAST4_FRAME - LAST_FRAME]);
-#else
+#else  // CONFIG_EXT_REFS
   refresh_mask |= (cpi->refresh_last_frame << cpi->lst_fb_idx);
+
+#if CONFIG_BIDIR_PRED
+  refresh_mask |= (cpi->refresh_bwd_ref_frame << cpi->bwd_fb_idx);
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
 
   if (vp10_preserve_existing_gf(cpi)) {
@@ -2997,7 +3038,46 @@
 
   write_profile(cm->profile, wb);
 
-  vpx_wb_write_bit(wb, 0);  // show_existing_frame
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  // NOTE: By default, all coded frames are to be used as reference frames.
+  cm->is_reference_frame = 1;
+
+  if (cm->show_existing_frame) {
+    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+    const int frame_to_show =
+        cm->ref_frame_map[cpi->existing_fb_idx_to_show];
+
+    if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Buffer %d does not contain a reconstructed frame",
+                         frame_to_show);
+    }
+    ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
+
+    vpx_wb_write_bit(wb, 1);  // show_existing_frame
+    vpx_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
+
+    cpi->refresh_frame_mask = get_refresh_mask(cpi);
+    vpx_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
+    {
+      MV_REFERENCE_FRAME ref_frame;
+      for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+        assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
+        vpx_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
+                             REF_FRAMES_LOG2);
+        // TODO(zoeliu): To further explore whether sign bias bits are needed.
+        vpx_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
+      }
+    }
+
+    return;
+  } else {
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    vpx_wb_write_bit(wb, 0);  // show_existing_frame
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
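A decoder-side sketch of the header written above; vpx_rb_read_bit()/vpx_rb_read_literal() are the existing bit-buffer readers, and rb is assumed in scope:

/* Sketch: decoder-side view of the show_existing_frame header. */
if (vpx_rb_read_bit(rb)) {  /* show_existing_frame == 1 */
  const int frame_to_show_idx = vpx_rb_read_literal(rb, 3);  /* virtual slot */
  /* The decoder displays ref_frame_map[frame_to_show_idx]; per the writer
   * above, only the refresh mask and per-reference slot indexes (plus sign
   * bias bits) follow, with no coefficient data for this frame. */
}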
+
   vpx_wb_write_bit(wb, cm->frame_type);
   vpx_wb_write_bit(wb, cm->show_frame);
   vpx_wb_write_bit(wb, cm->error_resilient_mode);
@@ -3025,15 +3105,37 @@
       }
     }
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    cpi->refresh_frame_mask = get_refresh_mask(cpi);
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
     if (cm->intra_only) {
       write_sync_code(wb);
       write_bitdepth_colorspace_sampling(cm, wb);
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      vpx_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
+#else
       vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
       write_frame_size(cm, wb);
     } else {
       MV_REFERENCE_FRAME ref_frame;
+
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      vpx_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
+#else
       vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      if (!cpi->refresh_frame_mask) {
+        // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame
+        //       will not be used as a reference
+        cm->is_reference_frame = 0;
+      }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
       for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
         assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
         vpx_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
@@ -3076,6 +3178,7 @@
     cm->tx_mode = TX_4X4;
   else
     write_txfm_mode(cm->tx_mode, wb);
+
   if (cpi->allow_comp_inter_inter) {
     const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
     const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
@@ -3221,10 +3324,21 @@
 
     if (cm->reference_mode != SINGLE_REFERENCE) {
       for (i = 0; i < REF_CONTEXTS; i++) {
-        for (j = 0; j < (COMP_REFS - 1); j ++) {
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+        for (j = 0; j < (FWD_REFS - 1); j++) {
           vp10_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j],
                                      counts->comp_ref[i][j]);
         }
+        for (j = 0; j < (BWD_REFS - 1); j++) {
+          vp10_cond_prob_diff_update(header_bc, &fc->comp_bwdref_prob[i][j],
+                                     counts->comp_bwdref[i][j]);
+        }
+#else  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+        for (j = 0; j < (COMP_REFS - 1); j++) {
+          vp10_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j],
+                                     counts->comp_ref[i][j]);
+        }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
       }
     }
 
@@ -3418,6 +3532,13 @@
   // Write the uncompressed header
   write_uncompressed_header(cpi, &wb);
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  if (cm->show_existing_frame) {
+    *size = vpx_wb_bytes_written(&wb);
+    return;
+  }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
   // We do not know these in advance. Output placeholder bit.
   saved_wb = wb;
   // Write tile size magnitudes
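The single-reference bits written earlier in this file (bit0/bit1/bit2 under !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED) imply the following reader-side walk; read_ref_bit() is a hypothetical stand-in for the arithmetic read with the matching single_ref_p1/p2/p3 context:

/* Sketch: the single-reference decoding tree implied by the writer above. */
static int read_ref_bit(int which_prob);  /* stand-in, hypothetical */

static MV_REFERENCE_FRAME read_single_ref(void) {
  if (!read_ref_bit(1)) return LAST_FRAME;    /* bit0: LAST vs the rest   */
  if (!read_ref_bit(2)) return GOLDEN_FRAME;  /* bit1: GOLDEN vs the rest */
  if (!read_ref_bit(3)) return BWDREF_FRAME;  /* bit2: BWDREF vs ALTREF   */
  return ALTREF_FRAME;
}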
diff --git a/vp10/encoder/denoiser.c b/vp10/encoder/denoiser.c
index 43c94b1..5a6ae4a 100644
--- a/vp10/encoder/denoiser.c
+++ b/vp10/encoder/denoiser.c
@@ -388,6 +388,9 @@
                                     YV12_BUFFER_CONFIG src,
                                     FRAME_TYPE frame_type,
                                     int refresh_last_frame,
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+                                    int refresh_bwd_ref_frame,
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
                                     int refresh_alt_ref_frame,
                                     int refresh_golden_frame) {
   if (frame_type == KEY_FRAME) {
@@ -411,6 +414,12 @@
     swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME],
                       &denoiser->running_avg_y[INTRA_FRAME]);
   }
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  if (refresh_bwd_ref_frame) {
+    swap_frame_buffer(&denoiser->running_avg_y[BWDREF_FRAME],
+                      &denoiser->running_avg_y[INTRA_FRAME]);
+  }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
 }
 
 void vp10_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) {
diff --git a/vp10/encoder/denoiser.h b/vp10/encoder/denoiser.h
index 8182762..ceef451 100644
--- a/vp10/encoder/denoiser.h
+++ b/vp10/encoder/denoiser.h
@@ -36,6 +36,9 @@
                                     YV12_BUFFER_CONFIG src,
                                     FRAME_TYPE frame_type,
                                     int refresh_last_frame,
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+                                    int refresh_bwd_ref_frame,
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
                                     int refresh_alt_ref_frame,
                                     int refresh_golden_frame);
 
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index a6ff9b6..e68de82 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -1023,12 +1023,13 @@
 static void reset_intmv_filter_type(VP10_COMMON *cm,
                                     MACROBLOCKD *xd, MB_MODE_INFO *mbmi) {
   int dir;
-  for (dir = 0; dir < 4; ++dir) {
-    const int frame_idx = (dir >> 1);
-    if (mbmi->ref_frame[frame_idx] > INTRA_FRAME &&
-        !has_subpel_mv_component(xd, dir))
+  for (dir = 0; dir < 2; ++dir) {
+    if (!has_subpel_mv_component(xd->mi[0], xd, dir) &&
+        (mbmi->ref_frame[1] == NONE ||
+         !has_subpel_mv_component(xd->mi[0], xd, dir + 2)))
       mbmi->interp_filter[dir] = (cm->interp_filter == SWITCHABLE) ?
           EIGHTTAP_REGULAR : cm->interp_filter;
+    mbmi->interp_filter[dir + 2] = mbmi->interp_filter[dir];
   }
 }
 
@@ -1036,10 +1037,10 @@
                                      const MACROBLOCKD *xd,
                                      const MB_MODE_INFO *mbmi) {
   int dir;
-  for (dir = 0; dir < 4; ++dir) {
-    const int frame_idx = (dir >> 1);
-    if (mbmi->ref_frame[frame_idx] > INTRA_FRAME &&
-        has_subpel_mv_component(xd, dir)) {
+  for (dir = 0; dir < 2; ++dir) {
+    if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
+        (mbmi->ref_frame[1] > INTRA_FRAME &&
+         has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
       const int ctx = vp10_get_pred_context_switchable_interp(xd, dir);
       ++counts->switchable_interp[ctx][mbmi->interp_filter[dir]];
     }
@@ -1882,6 +1883,9 @@
       // the reference frame counts used to work out probabilities.
       if (inter_block) {
         const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+        const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
         if (cm->reference_mode == REFERENCE_MODE_SELECT)
           counts->comp_inter[vp10_get_reference_mode_context(cm, xd)]
                             [has_second_ref(mbmi)]++;
@@ -1902,9 +1906,13 @@
                               [ref0 == LAST3_FRAME]++;
             }
           }
-#else
+#else  // CONFIG_EXT_REFS
           counts->comp_ref[vp10_get_pred_context_comp_ref_p(cm, xd)][0]
                           [ref0 == GOLDEN_FRAME]++;
+#if CONFIG_BIDIR_PRED
+          counts->comp_bwdref[vp10_get_pred_context_comp_bwdref_p(cm, xd)][0]
+                             [ref1 == ALTREF_FRAME]++;
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
         } else {
 #if CONFIG_EXT_REFS
@@ -1925,12 +1933,19 @@
                                 [ref0 != LAST3_FRAME]++;
             }
           }
-#else
+#else  // CONFIG_EXT_REFS
           counts->single_ref[vp10_get_pred_context_single_ref_p1(xd)][0]
                             [ref0 != LAST_FRAME]++;
-          if (ref0 != LAST_FRAME)
+          if (ref0 != LAST_FRAME) {
             counts->single_ref[vp10_get_pred_context_single_ref_p2(xd)][1]
                               [ref0 != GOLDEN_FRAME]++;
+#if CONFIG_BIDIR_PRED
+            if (ref0 != GOLDEN_FRAME) {
+              counts->single_ref[vp10_get_pred_context_single_ref_p3(xd)][2]
+                                [ref0 != BWDREF_FRAME]++;
+            }
+#endif  // CONFIG_BIDIR_PRED
+          }
 #endif  // CONFIG_EXT_REFS
         }
 
@@ -4331,6 +4346,10 @@
             !!(ref_flags & VP9_LAST2_FLAG) +
             !!(ref_flags & VP9_LAST3_FLAG) +
             !!(ref_flags & VP9_LAST4_FLAG) +
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+            !!(ref_flags & VP9_BWD_FLAG) +
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
             !!(ref_flags & VP9_ALT_FLAG)) >= 2;
   }
@@ -4359,7 +4378,7 @@
   else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
     return GOLDEN_FRAME;
   else
-    // TODO(zoeliu): TO investigate whether a frame_type other than
+    // TODO(zoeliu): To investigate whether a frame_type other than
     // INTRA/ALTREF/GOLDEN/LAST needs to be specified separately.
     return LAST_FRAME;
 }
@@ -4513,6 +4532,7 @@
                            cm->height == cm->last_height &&
                            !cm->intra_only &&
                            cm->last_show_frame;
+
   // Special case: set prev_mi to NULL when the previous mode info
   // context cannot be used.
   cm->prev_mi = cm->use_prev_frame_mvs ?
@@ -4560,13 +4580,6 @@
 #endif
 }
 
-#if !CONFIG_DUAL_FILTER
-static INTERP_FILTER get_cm_interp_filter(VP10_COMP *cpi) {
-  (void)cpi;
-  return SWITCHABLE;
-}
-#endif
-
 void vp10_encode_frame(VP10_COMP *cpi) {
   VP10_COMMON *const cm = &cpi->common;
 
@@ -4584,6 +4597,13 @@
       cpi->allow_comp_inter_inter = 0;
     } else {
       cpi->allow_comp_inter_inter = 1;
+
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      cm->comp_fwd_ref[0] = LAST_FRAME;
+      cm->comp_fwd_ref[1] = GOLDEN_FRAME;
+      cm->comp_bwd_ref[0] = BWDREF_FRAME;
+      cm->comp_bwd_ref[1] = ALTREF_FRAME;
+#else  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
       cm->comp_fixed_ref = ALTREF_FRAME;
       cm->comp_var_ref[0] = LAST_FRAME;
 #if CONFIG_EXT_REFS
@@ -4591,9 +4611,10 @@
       cm->comp_var_ref[2] = LAST3_FRAME;
       cm->comp_var_ref[3] = LAST4_FRAME;
       cm->comp_var_ref[4] = GOLDEN_FRAME;
-#else
+#else  // CONFIG_EXT_REFS
       cm->comp_var_ref[1] = GOLDEN_FRAME;
 #endif  // CONFIG_EXT_REFS
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
     }
   } else {
     cpi->allow_comp_inter_inter = 0;
@@ -4613,7 +4634,7 @@
     // that for subsequent frames.
     // It does the same analysis for transform size selection also.
     //
-    // TODO(zoeliu): TO investigate whether a frame_type other than
+    // TODO(zoeliu): To investigate whether a frame_type other than
     // INTRA/ALTREF/GOLDEN/LAST needs to be specified separately.
     const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
     int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type];
@@ -4633,10 +4654,8 @@
     else
       cm->reference_mode = REFERENCE_MODE_SELECT;
 
-#if !CONFIG_DUAL_FILTER
-    if (cm->interp_filter == SWITCHABLE) {
-      cm->interp_filter = get_cm_interp_filter(cpi);
-    }
+#if CONFIG_DUAL_FILTER
+    cm->interp_filter = SWITCHABLE;
 #endif
 
     encode_frame_internal(cpi);
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index 43b5401..dda82de 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -716,9 +716,9 @@
     cpi->lookahead = vp10_lookahead_init(oxcf->width, oxcf->height,
                                         cm->subsampling_x, cm->subsampling_y,
 #if CONFIG_VP9_HIGHBITDEPTH
-                                      cm->use_highbitdepth,
+                                        cm->use_highbitdepth,
 #endif
-                                      oxcf->lag_in_frames);
+                                        oxcf->lag_in_frames);
   if (!cpi->lookahead)
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate lag buffers");
@@ -902,10 +902,15 @@
     cpi->lst_fb_idxes[fb_idx] = fb_idx;
   cpi->gld_fb_idx = LAST_REF_FRAMES;
   cpi->alt_fb_idx = cpi->gld_fb_idx + 1;
-#else
+#else  // CONFIG_EXT_REFS
   cpi->lst_fb_idx = 0;
   cpi->gld_fb_idx = 1;
+#if CONFIG_BIDIR_PRED
+  cpi->bwd_fb_idx = 2;
+  cpi->alt_fb_idx = 3;
+#else  // CONFIG_BIDIR_PRED
   cpi->alt_fb_idx = 2;
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
 }
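Under this slot assignment, the refresh mask written in bitstream.c composes one bit per virtual index; a standalone sketch with a hypothetical helper name, ignoring the ARF/GF index juggling done in vp10_preserve_existing_gf():

/* Sketch: refresh-mask composition over the virtual slots above
 * (lst=0, gld=1, bwd=2, alt=3). Hypothetical standalone helper. */
static int make_refresh_mask(int refresh_last, int refresh_golden,
                             int refresh_bwd, int refresh_alt) {
  return (refresh_last   << 0) |  /* cpi->lst_fb_idx */
         (refresh_golden << 1) |  /* cpi->gld_fb_idx */
         (refresh_bwd    << 2) |  /* cpi->bwd_fb_idx */
         (refresh_alt    << 3);   /* cpi->alt_fb_idx */
}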
 
@@ -2232,6 +2237,9 @@
 
   cpi->refresh_golden_frame = 0;
   cpi->refresh_last_frame = 1;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  cpi->refresh_bwd_ref_frame = 0;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
 
   cm->refresh_frame_context =
       (oxcf->error_resilient_mode || oxcf->frame_parallel_decoding_mode) ?
@@ -2301,6 +2309,12 @@
   cpi->alt_ref_source = NULL;
   rc->is_src_frame_alt_ref = 0;
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  rc->is_bwd_ref_frame = 0;
+  rc->is_last_nonref_frame = 0;
+  rc->is_nonref_frame = 0;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
 #if 0
   // Experimental RD Code
   cpi->frame_distortion = 0;
@@ -2409,6 +2423,7 @@
   cm->current_video_frame = 0;
   cpi->partition_search_skippable_frame = 0;
   cpi->tile_data = NULL;
+  cpi->last_show_frame_buf_idx = INVALID_IDX;
 
   realloc_segmentation_maps(cpi);
 
@@ -2766,6 +2781,7 @@
 
   return cpi;
 }
+
 #define SNPRINT(H, T) \
   snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))
 
@@ -2967,8 +2983,8 @@
   cpi->ext_refresh_frame_flags_pending = 1;
 }
 
-static YV12_BUFFER_CONFIG *get_vp10_ref_frame_buffer(VP10_COMP *cpi,
-                                VP9_REFFRAME ref_frame_flag) {
+static YV12_BUFFER_CONFIG *get_vp10_ref_frame_buffer(
+    VP10_COMP *cpi, VP9_REFFRAME ref_frame_flag) {
   MV_REFERENCE_FRAME ref_frame = NONE;
   if (ref_frame_flag == VP9_LAST_FLAG)
     ref_frame = LAST_FRAME;
@@ -2982,6 +2998,10 @@
 #endif  // CONFIG_EXT_REFS
   else if (ref_frame_flag == VP9_GOLD_FLAG)
     ref_frame = GOLDEN_FRAME;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  else if (ref_frame_flag == VP9_BWD_FLAG)
+    ref_frame = BWDREF_FRAME;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   else if (ref_frame_flag == VP9_ALT_FLAG)
     ref_frame = ALTREF_FRAME;
 
@@ -3322,12 +3342,44 @@
   int ref_frame;
 #endif  // CONFIG_EXT_REFS
 
-  if (use_upsampled_ref) {
-    // Up-sample the current encoded frame.
-    RefCntBuffer *bufs = pool->frame_bufs;
-    const YV12_BUFFER_CONFIG *const ref = &bufs[cm->new_fb_idx].buf;
+  // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
+  //       to verify that there is no mismatch between encoder and decoder.
+  if (cm->show_frame)
+    cpi->last_show_frame_buf_idx = cm->new_fb_idx;
 
-    new_uidx = upsample_ref_frame(cpi, ref);
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  // TODO(zoeliu): To remove the reference buffer update for the
+  //               show_existing_frame==1 case.
+#if 0
+  if (cpi->rc.is_last_nonref_frame) {
+    // NOTE: After the encoding of the LAST_NONREF_FRAME, the flag
+    //       show_existing_frame will be set, to notify the decoder to show the
+    //       coded BWDREF_FRAME. While the show_existing_frame is handled, no
+    //       update is conducted on the reference frame buffer.
+    //       The following makes the BWDREF_FRAME to be shown be taken as the
+    //       LAST_FRAME, preparing for the encoding of the next BWDREF_FRAME.
+    cpi->lst_fb_idx = cpi->bwd_fb_idx;
+    return;
+  }
+#endif  // 0
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
+  if (use_upsampled_ref) {
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    if (cm->show_existing_frame) {
+      new_uidx = cpi->upsampled_ref_idx[cpi->existing_fb_idx_to_show];
+      // TODO(zoeliu): Once the following is confirmed, remove it.
+      assert(cpi->upsampled_ref_bufs[new_uidx].ref_count > 0);
+    } else {
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      // Up-sample the current encoded frame.
+      RefCntBuffer *bufs = pool->frame_bufs;
+      const YV12_BUFFER_CONFIG *const ref = &bufs[cm->new_fb_idx].buf;
+
+      new_uidx = upsample_ref_frame(cpi, ref);
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   }
 
   // At this point the new frame has been encoded.
@@ -3335,12 +3387,20 @@
   if (cm->frame_type == KEY_FRAME) {
     ref_cnt_fb(pool->frame_bufs,
                &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    ref_cnt_fb(pool->frame_bufs,
+               &cm->ref_frame_map[cpi->bwd_fb_idx], cm->new_fb_idx);
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
     ref_cnt_fb(pool->frame_bufs,
                &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
 
     if (use_upsampled_ref) {
       uref_cnt_fb(cpi->upsampled_ref_bufs,
                   &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      uref_cnt_fb(cpi->upsampled_ref_bufs,
+                  &cpi->upsampled_ref_idx[cpi->bwd_fb_idx], new_uidx);
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
       uref_cnt_fb(cpi->upsampled_ref_bufs,
                   &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
     }
@@ -3364,6 +3424,9 @@
     tmp = cpi->alt_fb_idx;
     cpi->alt_fb_idx = cpi->gld_fb_idx;
     cpi->gld_fb_idx = tmp;
+
+    // TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to
+    // cpi->interp_filter_selected[GOLDEN_FRAME]?
   } else { /* For non key/golden frames */
     if (cpi->refresh_alt_ref_frame) {
       int arf_idx = cpi->alt_fb_idx;
@@ -3399,6 +3462,20 @@
                cpi->interp_filter_selected[ALTREF_FRAME],
                sizeof(cpi->interp_filter_selected[ALTREF_FRAME]));
     }
+
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    if (cpi->refresh_bwd_ref_frame) {
+      ref_cnt_fb(pool->frame_bufs,
+                 &cm->ref_frame_map[cpi->bwd_fb_idx], cm->new_fb_idx);
+      if (use_upsampled_ref)
+        uref_cnt_fb(cpi->upsampled_ref_bufs,
+                    &cpi->upsampled_ref_idx[cpi->bwd_fb_idx], new_uidx);
+
+      memcpy(cpi->interp_filter_selected[BWDREF_FRAME],
+             cpi->interp_filter_selected[0],
+             sizeof(cpi->interp_filter_selected[0]));
+    }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   }
 
   if (cpi->refresh_last_frame) {
@@ -3474,17 +3551,43 @@
              sizeof(cpi->interp_filter_selected[0]));
     }
 #else  // CONFIG_EXT_REFS
-    ref_cnt_fb(pool->frame_bufs,
-               &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
-    if (use_upsampled_ref)
-      uref_cnt_fb(cpi->upsampled_ref_bufs,
-                  &cpi->upsampled_ref_idx[cpi->lst_fb_idx], new_uidx);
+#if CONFIG_BIDIR_PRED
+    // TODO(zoeliu): To remove the reference buffer update for the
+    // show_existing_frame==1 case; instead, move the reference buffer update
+    // to the previously coded frame, i.e. the last-nonref-frame. In that case,
+    // no bit should be set in the refresh-mask, but the virtual ref-idx should
+    // be updated and written to the bitstream accordingly, as the virtual
+    // ref-idx for LAST_FRAME and BWDREF_FRAME should be switched, i.e.
+    // cpi->lst_fb_idx and cpi->bwd_fb_idx should be switched.
+    if (cm->show_existing_frame) {
+      ref_cnt_fb(pool->frame_bufs,
+                 &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
 
-    if (!cpi->rc.is_src_frame_alt_ref) {
+      if (use_upsampled_ref)
+        uref_cnt_fb(cpi->upsampled_ref_bufs,
+                    &cpi->upsampled_ref_idx[cpi->lst_fb_idx], new_uidx);
+
+      // NOTE(zoeliu): OVERLAY should not be the last non-reference frame.
+      assert(!cpi->rc.is_src_frame_alt_ref);
+
       memcpy(cpi->interp_filter_selected[LAST_FRAME],
-             cpi->interp_filter_selected[0],
-             sizeof(cpi->interp_filter_selected[0]));
+             cpi->interp_filter_selected[BWDREF_FRAME],
+             sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
+    } else {
+#endif  // CONFIG_BIDIR_PRED
+      ref_cnt_fb(pool->frame_bufs,
+                 &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
+      if (use_upsampled_ref)
+        uref_cnt_fb(cpi->upsampled_ref_bufs,
+                    &cpi->upsampled_ref_idx[cpi->lst_fb_idx], new_uidx);
+      if (!cpi->rc.is_src_frame_alt_ref) {
+        memcpy(cpi->interp_filter_selected[LAST_FRAME],
+               cpi->interp_filter_selected[0],
+               sizeof(cpi->interp_filter_selected[0]));
+      }
+#if CONFIG_BIDIR_PRED
     }
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   }
 
@@ -3494,6 +3597,9 @@
                                    *cpi->Source,
                                    cpi->common.frame_type,
                                    cpi->refresh_last_frame,
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+                                   cpi->refresh_bwd_ref_frame,
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
                                    cpi->refresh_alt_ref_frame,
                                    cpi->refresh_golden_frame);
   }
@@ -3572,6 +3678,9 @@
     VP9_LAST4_FLAG,
 #endif  // CONFIG_EXT_REFS
     VP9_GOLD_FLAG,
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    VP9_BWD_FLAG,
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
     VP9_ALT_FLAG
   };
 
@@ -3693,9 +3802,14 @@
     refresh[1] = refresh[2] = refresh[3] = 0;
     refresh[4] = (cpi->refresh_golden_frame) ? 1 : 0;
     refresh[5] = (cpi->refresh_alt_ref_frame) ? 1 : 0;
-#else
+#else  // CONFIG_EXT_REFS
     refresh[1] = (cpi->refresh_golden_frame) ? 1 : 0;
+#if CONFIG_BIDIR_PRED
+    refresh[2] = (cpi->refresh_bwd_ref_frame) ? 1 : 0;
+    refresh[3] = (cpi->refresh_alt_ref_frame) ? 1 : 0;
+#else  // CONFIG_BIDIR_PRED
     refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0;
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
     for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
       const int idx = cpi->scaled_ref_idx[i - 1];
@@ -3830,7 +3944,12 @@
       // after a key/intra-only frame.
       cpi->max_mv_magnitude = max_mv_def;
     } else {
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      // TODO(zoeliu): Maybe we should leave it the same as base.
+      if (cm->show_frame || cpi->rc.is_bwd_ref_frame) {
+#else
       if (cm->show_frame) {
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
         // Allow mv_steps to correspond to twice the max mv magnitude found
         // in the previous frame, capped by the default max_mv_magnitude based
         // on resolution.
@@ -3922,7 +4041,7 @@
 
     // There has been a change in frame size.
     vp10_set_size_literal(cpi, oxcf->scaled_frame_width,
-                         oxcf->scaled_frame_height);
+                          oxcf->scaled_frame_height);
   }
 
   if (oxcf->pass == 0 &&
@@ -4159,7 +4278,7 @@
     }
 
     cpi->Source = vp10_scale_if_required(cm, cpi->un_scaled_source,
-                                      &cpi->scaled_source);
+                                         &cpi->scaled_source);
 
     if (cpi->unscaled_last_source != NULL)
       cpi->Last_Source = vp10_scale_if_required(cm, cpi->unscaled_last_source,
@@ -4434,16 +4553,25 @@
       map[cpi->lst_fb_idxes[3]] == map[cpi->lst_fb_idxes[1]];
   const int last4_is_last3 =
       map[cpi->lst_fb_idxes[3]] == map[cpi->lst_fb_idxes[2]];
-  const int gld_is_last4 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[3]];
+
   const int last4_is_alt = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[3]];
-#else
+  const int gld_is_last4 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[3]];
+#else  // CONFIG_EXT_REFS
   const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx];
+#if CONFIG_BIDIR_PRED
+  const int bwd_is_last = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idx];
+#endif  // CONFIG_BIDIR_PRED
   const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx];
 #endif  // CONFIG_EXT_REFS
   const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
 
   int flags = VP9_REFFRAME_ALL;
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  if (!cpi->rc.is_bwd_ref_frame)
+    flags &= ~VP9_BWD_FLAG;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
   if (gld_is_alt || gld_is_last)
     flags &= ~VP9_GOLD_FLAG;
 
@@ -4465,6 +4593,11 @@
 
   if (gld_is_last4 || gld_is_last3 || gld_is_last2)
     flags &= ~VP9_GOLD_FLAG;
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  if (bwd_is_last && (flags & VP9_BWD_FLAG))
+    flags &= ~VP9_BWD_FLAG;
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
 
   return flags;
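Collapsed across the #ifdef branches, the pruning this function applies under !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED amounts to the following fragment (using the locals defined above):

/* Sketch: search-flag pruning for !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED. */
int flags = VP9_REFFRAME_ALL;
if (!cpi->rc.is_bwd_ref_frame)
  flags &= ~VP9_BWD_FLAG;    /* BWDREF searched only on bwd-ref frames */
if (gld_is_alt || gld_is_last)
  flags &= ~VP9_GOLD_FLAG;   /* GOLDEN aliases ALTREF or LAST */
if (bwd_is_last && (flags & VP9_BWD_FLAG))
  flags &= ~VP9_BWD_FLAG;    /* BWDREF aliases LAST */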
@@ -4532,6 +4665,9 @@
       (cpi->rc.source_alt_ref_active && !cpi->refresh_alt_ref_frame);
   }
   cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  cm->ref_frame_sign_bias[BWDREF_FRAME] = cm->ref_frame_sign_bias[ALTREF_FRAME];
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
 }
 
 static int setup_interp_filter_search_mask(VP10_COMP *cpi) {
@@ -4563,6 +4699,11 @@
         (ref_total[GOLDEN_FRAME] == 0 ||
          cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50
            < ref_total[GOLDEN_FRAME]) &&
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+        (ref_total[BWDREF_FRAME] == 0 ||
+         cpi->interp_filter_selected[BWDREF_FRAME][ifilter] * 50
+           < ref_total[BWDREF_FRAME]) &&
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
         (ref_total[ALTREF_FRAME] == 0 ||
          cpi->interp_filter_selected[ALTREF_FRAME][ifilter] * 50
            < ref_total[ALTREF_FRAME]))
@@ -4571,6 +4712,61 @@
   return mask;
 }
 
+#define DUMP_RECON_FRAMES 0
+
+#if DUMP_RECON_FRAMES == 1
+// NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+static void dump_filtered_recon_frames(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  const YV12_BUFFER_CONFIG *recon_buf = cm->frame_to_show;
+  int h;
+  char file_name[256] = "/tmp/enc_filtered_recon.yuv";
+  FILE *f_recon = NULL;
+
+  if (recon_buf == NULL || !cm->show_frame) {
+    printf("Frame %d is not ready or no show to dump.\n",
+           cm->current_video_frame);
+    return;
+  }
+
+  if (cm->current_video_frame == 0) {
+    if ((f_recon = fopen(file_name, "wb")) == NULL) {
+      printf("Unable to open file %s to write.\n", file_name);
+      return;
+    }
+  } else {
+    if ((f_recon = fopen(file_name, "ab")) == NULL) {
+      printf("Unable to open file %s to append.\n", file_name);
+      return;
+    }
+  }
+  printf("\nFrame=%5d, encode_update_type[%5d]=%1d, show_existing_frame=%d, "
+         "y_stride=%4d, uv_stride=%4d, width=%4d, height=%4d\n",
+         cm->current_video_frame, cpi->twopass.gf_group.index,
+         cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index],
+         cm->show_existing_frame,
+         recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height);
+
+  // --- Y ---
+  for (h = 0; h < cm->height; ++h) {
+    fwrite(&recon_buf->y_buffer[h*recon_buf->y_stride],
+           1, cm->width, f_recon);
+  }
+  // --- U ---
+  for (h = 0; h < (cm->height >> 1); ++h) {
+    fwrite(&recon_buf->u_buffer[h*recon_buf->uv_stride],
+           1, (cm->width >> 1), f_recon);
+  }
+  // --- V ---
+  for (h = 0; h < (cm->height >> 1); ++h) {
+    fwrite(&recon_buf->v_buffer[h*recon_buf->uv_stride],
+           1, (cm->width >> 1), f_recon);
+  }
+
+  fclose(f_recon);
+}
+#endif  // DUMP_RECON_FRAMES
+
 static void encode_frame_to_data_rate(VP10_COMP *cpi,
                                       size_t *size,
                                       uint8_t *dest,
@@ -4586,6 +4782,56 @@
   // Set the arf sign bias for this frame.
   set_arf_sign_bias(cpi);
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  if (cm->show_existing_frame) {
+    // NOTE(zoeliu): In BIDIR_PRED, the existing frame to show is the current
+    //               BWDREF_FRAME in the reference frame buffer.
+
+    cm->frame_type = INTER_FRAME;
+    cm->show_frame = 1;
+    cpi->frame_flags = *frame_flags;
+
+    cpi->refresh_last_frame = 1;
+    cpi->refresh_golden_frame = 0;
+    cpi->refresh_bwd_ref_frame = 0;
+    cpi->refresh_alt_ref_frame = 0;
+
+    cpi->rc.is_bwd_ref_frame = 0;
+    cpi->rc.is_last_nonref_frame = 0;
+    cpi->rc.is_nonref_frame = 0;
+
+    // Build the bitstream
+    vp10_pack_bitstream(cpi, dest, size);
+
+    // Set up frame to show to get ready for stats collection.
+    cm->frame_to_show = get_frame_new_buffer(cm);
+
+    // Update the LAST_FRAME in the reference frame buffer.
+    vp10_update_reference_frames(cpi);
+
+    cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN;
+    cpi->frame_flags &= ~FRAMEFLAGS_BWDREF;
+    cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
+
+    *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY;
+
+#if DUMP_RECON_FRAMES == 1
+    // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+    dump_filtered_recon_frames(cpi);
+#endif  // DUMP_RECON_FRAMES
+
+    // Update the frame type
+    cm->last_frame_type = cm->frame_type;
+
+    cm->last_width = cm->width;
+    cm->last_height = cm->height;
+
+    ++cm->current_video_frame;
+
+    return;
+  }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
   // Set default state for segment based loop filter update flags.
   cm->lf.mode_ref_delta_update = 0;
 
@@ -4651,13 +4897,14 @@
     vp10_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME],
                             yuv_denoised_file);
   }
-#endif
-#endif
+#endif  // OUTPUT_YUV_DENOISED
+#endif  // CONFIG_VP9_TEMPORAL_DENOISING
+
 #ifdef OUTPUT_YUV_SKINMAP
   if (cpi->common.current_video_frame > 1) {
     vp10_compute_skin_map(cpi, yuv_skinmap_file);
   }
-#endif
+#endif  // OUTPUT_YUV_SKINMAP
 
   // Special case code to reduce pulsing when key frames are forced at a
   // fixed interval. Note the reconstruction error if it is the frame before
@@ -4686,18 +4933,38 @@
   cm->frame_to_show->render_width  = cm->render_width;
   cm->frame_to_show->render_height = cm->render_height;
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  // TODO(zoeliu): For non-ref frames, loop filtering may need to be turned
+  // off.
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
   // Pick the loop filter level for the frame.
   loopfilter_frame(cpi, cm);
 
-  // build the bitstream
+  // Build the bitstream
   vp10_pack_bitstream(cpi, dest, size);
 
+#if DUMP_RECON_FRAMES == 1
+  // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+  if (cm->show_frame)
+    dump_filtered_recon_frames(cpi);
+#endif  // DUMP_RECON_FRAMES
+
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  if (cpi->rc.is_last_nonref_frame) {
+    // NOTE: If the current frame is a LAST_NONREF_FRAME, the next frame to
+    //       show is the BWDREF_FRAME.
+    cpi->existing_fb_idx_to_show = cpi->bwd_fb_idx;
+  }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
   if (cm->seg.update_map)
     update_reference_segmentation_map(cpi);
 
   if (frame_is_intra_only(cm) == 0) {
     release_scaled_references(cpi);
   }
+
   vp10_update_reference_frames(cpi);
 
   for (t = TX_4X4; t <= TX_32X32; t++)
@@ -4729,6 +4996,13 @@
   else
     cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  if (cpi->refresh_bwd_ref_frame == 1)
+    cpi->frame_flags |= FRAMEFLAGS_BWDREF;
+  else
+    cpi->frame_flags &= ~FRAMEFLAGS_BWDREF;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
   cpi->ref_frame_flags = get_ref_frame_flags(cpi);
 
 #if CONFIG_EXT_REFS
@@ -4764,13 +5038,37 @@
   if (!cm->show_existing_frame)
     cm->last_show_frame = cm->show_frame;
 
+#if 0
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  if ((cm->show_frame &&
+       !(cpi->rc.is_nonref_frame || cpi->rc.is_last_nonref_frame)) ||
+      cpi->rc.is_bwd_ref_frame) {
+    vp10_swap_mi_and_prev_mi(cm);
+  }
+  if (cm->show_frame || cpi->rc.is_bwd_ref_frame) {
+    // Don't increment frame counters if this was an altref buffer
+    // update not a real frame
+    ++cm->current_video_frame;
+  }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+#endif  // 0
+
   if (cm->show_frame) {
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    // TODO(zoeliu): We may only swap mi and prev_mi for those frames that are
+    // being used as references.
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
     vp10_swap_mi_and_prev_mi(cm);
     // Don't increment frame counters if this was an altref buffer
     // update not a real frame
     ++cm->current_video_frame;
   }
-  cm->prev_frame = cm->cur_frame;
+
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  // NOTE: It is not supposed to refer to any frame not used as a reference
+  if (cm->is_reference_frame)
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    cm->prev_frame = cm->cur_frame;
 }
 
 static void Pass0Encode(VP10_COMP *cpi, size_t *size, uint8_t *dest,
@@ -4788,7 +5086,10 @@
   cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
   encode_frame_to_data_rate(cpi, size, dest, frame_flags);
 
-  vp10_twopass_postencode_update(cpi);
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  if (!cpi->common.show_existing_frame)
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    vp10_twopass_postencode_update(cpi);
 }
 
 static void init_ref_frame_bufs(VP10_COMMON *cm) {
@@ -4904,6 +5205,9 @@
   return cm->frame_type == KEY_FRAME ||
          cpi->refresh_last_frame ||
          cpi->refresh_golden_frame ||
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+         cpi->refresh_bwd_ref_frame ||
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
          cpi->refresh_alt_ref_frame ||
          !cm->error_resilient_mode ||
          cm->lf.mode_ref_delta_update ||
@@ -4968,6 +5272,27 @@
   return arf_src_index;
 }
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+static int get_brf_src_index(VP10_COMP *cpi) {
+  int brf_src_index = 0;
+  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+
+  // TODO(zoeliu): We need to add a check on the -bwd_ref command line setup
+  //               flag.
+  if (gf_group->bidir_pred_enabled[gf_group->index]) {
+    if (cpi->oxcf.pass == 2) {
+      if (gf_group->update_type[gf_group->index] == BRF_UPDATE)
+        brf_src_index = gf_group->brf_src_offset[gf_group->index];
+    } else {
+      // TODO(zoeliu): To revisit the setup for this scenario
+      brf_src_index = BIDIR_PRED_PERIOD - 1;
+    }
+  }
+
+  return brf_src_index;
+}
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
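get_brf_src_index() consumes the update types that firstpass.c assigns further below. A standalone simulation (hypothetical, not patch code) of the per-frame pattern those assignments produce when BIDIR_PRED_PERIOD == 2, with the ARF and group-tail handling omitted:

#include <stdio.h>

#define BIDIR_PRED_PERIOD 2  /* as defined in encoder.h */

int main(void) {
  /* Starts at 1: the index is bumped once for the GF/overlay slot and is
   * left unchanged by ARF_UPDATE frames. */
  int bidir_pred_frame_index = 1;
  int i;
  for (i = 0; i < 6; ++i) {
    if (bidir_pred_frame_index == 1) {
      printf("frame %d: BRF_UPDATE, brf_src_offset=%d\n", i,
             BIDIR_PRED_PERIOD - 1);
    } else if (bidir_pred_frame_index == BIDIR_PRED_PERIOD) {
      printf("frame %d: LASTNRF_UPDATE\n", i);
      bidir_pred_frame_index = 0;  /* reset at the period boundary */
    } else {
      printf("frame %d: NRF_UPDATE\n", i);  /* only when period > 2 */
    }
    bidir_pred_frame_index++;
  }
  return 0;  /* prints BRF, LASTNRF, BRF, LASTNRF, ... */
}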
+
 static void check_src_altref(VP10_COMP *cpi,
                              const struct lookahead_entry *source) {
   RATE_CONTROL *const rc = &cpi->rc;
@@ -5117,6 +5442,9 @@
   struct lookahead_entry *last_source = NULL;
   struct lookahead_entry *source = NULL;
   int arf_src_index;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  int brf_src_index;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   int i;
 
   vpx_usec_timer_start(&cmptimer);
@@ -5138,11 +5466,63 @@
 
   cpi->refresh_last_frame = 1;
   cpi->refresh_golden_frame = 0;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  cpi->refresh_bwd_ref_frame = 0;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   cpi->refresh_alt_ref_frame = 0;
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  if (oxcf->pass == 2 && cm->show_existing_frame) {
+    // Manage the source buffer and flush out the source frame that has been
+    // coded already; Also get prepared for PSNR calculation if needed.
+    if ((source = vp10_lookahead_pop(cpi->lookahead, flush)) == NULL) {
+      *size = 0;
+      return -1;
+    }
+    cpi->Source = &source->img;
+
+    // TODO(zoeliu): To track down whether the frame rate needs to be
+    // adjusted.
+    *time_stamp = source->ts_start;
+    *time_end = source->ts_end;
+
+    // Find a free buffer for the new frame, releasing the reference previously
+    // held.
+    if (cm->new_fb_idx != INVALID_IDX) {
+      --pool->frame_bufs[cm->new_fb_idx].ref_count;
+    }
+    cm->new_fb_idx = get_free_fb(cm);
+
+    if (cm->new_fb_idx == INVALID_IDX)
+      return -1;
+
+    // Clear down mmx registers
+    vpx_clear_system_state();
+
+    // Start with a 0 size frame.
+    *size = 0;
+
+    Pass2Encode(cpi, size, dest, frame_flags);
+
+    if (cpi->b_calculate_psnr)
+      generate_psnr_packet(cpi);
+
+#if CONFIG_INTERNAL_STATS
+    compute_internal_stats(cpi);
+    cpi->bytes += (int)(*size);
+#endif  // CONFIG_INTERNAL_STATS
+
+    // Clear down mmx registers
+    vpx_clear_system_state();
+
+    cm->show_existing_frame = 0;
+
+    return 0;
+  }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
   // Should we encode an arf frame.
   arf_src_index = get_arf_src_index(cpi);
-
   if (arf_src_index) {
     for (i = 0; i <= arf_src_index; ++i) {
       struct lookahead_entry *e = vp10_lookahead_peek(cpi->lookahead, i);
@@ -5180,6 +5560,27 @@
     rc->source_alt_ref_pending = 0;
   }
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  rc->is_bwd_ref_frame = 0;
+  brf_src_index = get_brf_src_index(cpi);
+  // TODO(zoeliu): Need to handle the case when alt-ref is disabled; currently
+  //               bwd-ref works only when alt-ref is on.
+  if (brf_src_index) {
+    assert(brf_src_index <= rc->frames_to_key);
+    if ((source = vp10_lookahead_peek(cpi->lookahead, brf_src_index)) != NULL) {
+      cm->show_frame = 0;
+      cm->intra_only = 0;
+
+      cpi->refresh_bwd_ref_frame = 1;
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+
+      rc->is_bwd_ref_frame = 1;
+    }
+  }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
   if (!source) {
     // Get last frame source.
     if (cm->current_video_frame > 0) {
@@ -5227,9 +5628,8 @@
   vpx_clear_system_state();
 
   // adjust frame rates based on timestamps given
-  if (cm->show_frame) {
+  if (cm->show_frame)
     adjust_frame_rate(cpi, source);
-  }
 
   // Find a free buffer for the new frame, releasing the reference previously
   // held.
@@ -5301,8 +5701,21 @@
     compute_internal_stats(cpi);
     cpi->bytes += (int)(*size);
   }
-#endif
+#endif  // CONFIG_INTERNAL_STATS
+
   vpx_clear_system_state();
+
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  if (cpi->rc.is_last_nonref_frame) {
+    // NOTE(zoeliu): If the current frame is a last non-reference frame, the
+    //               next frame to show is the BWDREF_FRAME.
+    cpi->rc.is_last_nonref_frame = 0;
+    cm->show_existing_frame = 1;
+  } else {
+    cm->show_existing_frame = 0;
+  }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
   return 0;
 }
 
@@ -5336,6 +5749,15 @@
   }
 }
 
+int vp10_get_last_show_frame(VP10_COMP *cpi, YV12_BUFFER_CONFIG *frame) {
+  if (cpi->last_show_frame_buf_idx == INVALID_IDX)
+    return -1;
+
+  *frame =
+      cpi->common.buffer_pool->frame_bufs[cpi->last_show_frame_buf_idx].buf;
+  return 0;
+}
+
 int vp10_set_internal_size(VP10_COMP *cpi,
                           VPX_SCALING horiz_mode, VPX_SCALING vert_mode) {
   VP10_COMMON *cm = &cpi->common;
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index 67ebe6d..f1508af 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -75,6 +75,9 @@
   FRAME_CONTEXT fc;
 } CODING_CONTEXT;
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+#define BIDIR_PRED_PERIOD  2
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
 
 typedef enum {
   // encode_breakout is disabled.
@@ -111,7 +114,12 @@
 typedef enum {
   FRAMEFLAGS_KEY    = 1 << 0,
   FRAMEFLAGS_GOLDEN = 1 << 1,
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  FRAMEFLAGS_BWDREF = 1 << 2,
+  FRAMEFLAGS_ALTREF = 1 << 3,
+#else
   FRAMEFLAGS_ALTREF = 1 << 2,
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
 } FRAMETYPE_FLAGS;
 
 typedef enum {
@@ -197,6 +205,9 @@
   // ----------------------------------------------------------------
 
   int enable_auto_arf;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  int enable_auto_brf;  // (b)ackward (r)ef (f)rame
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
 
   int encode_breakout;  // early breakout : for video conf recommend 800
 
@@ -347,14 +358,22 @@
   int scaled_ref_idx[MAX_REF_FRAMES];
 #if CONFIG_EXT_REFS
   int lst_fb_idxes[LAST_REF_FRAMES];
-#else
+#else  // CONFIG_EXT_REFS
   int lst_fb_idx;
 #endif  // CONFIG_EXT_REFS
   int gld_fb_idx;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  int bwd_fb_idx;  // BWD_REF_FRAME
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   int alt_fb_idx;
 
+  int last_show_frame_buf_idx;  // last show frame buffer index
+
   int refresh_last_frame;
   int refresh_golden_frame;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  int refresh_bwd_ref_frame;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   int refresh_alt_ref_frame;
 
   int ext_refresh_frame_flags_pending;
@@ -587,6 +606,10 @@
 #if CONFIG_ANS
   struct BufAnsCoder buf_ans;
 #endif
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  int refresh_frame_mask;
+  int existing_fb_idx_to_show;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
 } VP10_COMP;
 
 void vp10_initialize_enc(void);
@@ -610,6 +633,8 @@
 int vp10_get_preview_raw_frame(VP10_COMP *cpi, YV12_BUFFER_CONFIG *dest,
                               vp10_ppflags_t *flags);
 
+int vp10_get_last_show_frame(VP10_COMP *cpi, YV12_BUFFER_CONFIG *frame);
+
 int vp10_use_as_reference(VP10_COMP *cpi, int ref_frame_flags);
 
 void vp10_update_reference(VP10_COMP *cpi, int ref_frame_flags);
@@ -654,12 +679,16 @@
 #endif  // CONFIG_EXT_REFS
   else if (ref_frame == GOLDEN_FRAME)
     return cpi->gld_fb_idx;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  else if (ref_frame == BWDREF_FRAME)
+    return cpi->bwd_fb_idx;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   else
     return cpi->alt_fb_idx;
 }
 
 static INLINE int get_ref_frame_buf_idx(const VP10_COMP *const cpi,
-                                        int ref_frame) {
+                                        MV_REFERENCE_FRAME ref_frame) {
   const VP10_COMMON *const cm = &cpi->common;
   const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
   return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX;
@@ -673,6 +702,14 @@
       buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf : NULL;
 }
 
+static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref(
+    VP10_COMP *cpi, const MV_REFERENCE_FRAME ref_frame) {
+  // Use up-sampled reference frames.
+  const int buf_idx =
+      cpi->upsampled_ref_idx[get_ref_frame_map_idx(cpi, ref_frame)];
+  return &cpi->upsampled_ref_bufs[buf_idx].buf;
+}
+
 static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols) {
   // TODO(JBB): double check we can't exceed this token count if we have a
   // 32x32 transform crossing a boundary at a multiple of 16.
@@ -714,6 +751,16 @@
          cpi->oxcf.enable_auto_arf;
 }
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+static INLINE int is_bwdref_enabled(const VP10_COMP *const cpi) {
+  // NOTE(zoeliu): The enabling of backward prediction depends on the alt_ref
+  // period, and will be off when the alt_ref period is not sufficiently large.
+  // (zoeliu): Further conditions to consider:
+  //           && cpi->oxcf.enable_auto_brf && cpi->rc.bidir_pred_enabled
+  return cpi->oxcf.mode != REALTIME && cpi->oxcf.lag_in_frames > 0;
+}
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
 static INLINE void set_ref_ptrs(VP10_COMMON *cm, MACROBLOCKD *xd,
                                 MV_REFERENCE_FRAME ref0,
                                 MV_REFERENCE_FRAME ref1) {
diff --git a/vp10/encoder/firstpass.c b/vp10/encoder/firstpass.c
index 68e8107..f0d3ab9 100644
--- a/vp10/encoder/firstpass.c
+++ b/vp10/encoder/firstpass.c
@@ -1620,7 +1620,7 @@
   GF_GROUP *const gf_group = &twopass->gf_group;
   FIRSTPASS_STATS frame_stats;
   int i;
-  int frame_index = 1;
+  int frame_index = 0;
   int target_frame_size;
   int key_frame;
   const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
@@ -1630,6 +1630,9 @@
   int mid_boost_bits = 0;
   int mid_frame_idx;
   unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  int bidir_pred_frame_index = 0;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
 
   key_frame = cpi->common.frame_type == KEY_FRAME;
 
@@ -1639,27 +1642,38 @@
   // is also the golden frame.
   if (!key_frame) {
     if (rc->source_alt_ref_active) {
-      gf_group->update_type[0] = OVERLAY_UPDATE;
-      gf_group->rf_level[0] = INTER_NORMAL;
-      gf_group->bit_allocation[0] = 0;
+      gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+      gf_group->rf_level[frame_index] = INTER_NORMAL;
+      gf_group->bit_allocation[frame_index] = 0;
     } else {
-      gf_group->update_type[0] = GF_UPDATE;
-      gf_group->rf_level[0] = GF_ARF_STD;
-      gf_group->bit_allocation[0] = gf_arf_bits;
+      gf_group->update_type[frame_index] = GF_UPDATE;
+      gf_group->rf_level[frame_index] = GF_ARF_STD;
+      gf_group->bit_allocation[frame_index] = gf_arf_bits;
     }
-    gf_group->arf_update_idx[0] = arf_buffer_indices[0];
-    gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
+    gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
+    gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
 
     // Step over the golden frame / overlay frame
     if (EOF == input_stats(twopass, &frame_stats))
       return;
   }
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  gf_group->bidir_pred_enabled[frame_index] = 0;
+  gf_group->brf_src_offset[frame_index] = 0;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
   // Deduct the boost bits for arf (or gf if it is not a key frame)
   // from the group total.
   if (rc->source_alt_ref_pending || !key_frame)
     total_group_bits -= gf_arf_bits;
 
+  frame_index++;
+
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  bidir_pred_frame_index++;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
   // Store the bits to spend on the ARF if there is one.
   if (rc->source_alt_ref_pending) {
     gf_group->update_type[frame_index] = ARF_UPDATE;
@@ -1673,6 +1687,13 @@
     gf_group->arf_ref_idx[frame_index] =
       arf_buffer_indices[cpi->multi_arf_last_grp_enabled &&
                          rc->source_alt_ref_active];
+
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    gf_group->bidir_pred_enabled[frame_index] = 0;
+    gf_group->brf_src_offset[frame_index] = 0;
+    // NOTE: "bidir_pred_frame_index" stays unchanged for ARF_UPDATE frames.
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
     ++frame_index;
 
     if (cpi->multi_arf_enabled) {
@@ -1718,10 +1739,67 @@
     target_frame_size = clamp(target_frame_size, 0,
                               VPXMIN(max_bits, (int)total_group_bits));
 
-    gf_group->update_type[frame_index] = LF_UPDATE;
-    gf_group->rf_level[frame_index] = INTER_NORMAL;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    // TODO(zoeliu): Currently only BIDIR_PRED_PERIOD == 2 is supported.
+    assert(BIDIR_PRED_PERIOD == 2);
+    // NOTE: BIDIR_PRED is only enabled when its interval is strictly
+    //       less than the GOLDEN_FRAME group interval.
+    // TODO(zoeliu): Currently BIDIR_PRED is only enabled when alt-ref is on.
+    if (rc->source_alt_ref_pending &&
+        BIDIR_PRED_PERIOD <
+            (rc->baseline_gf_interval - rc->source_alt_ref_pending)) {
+      if (bidir_pred_frame_index == 1) {
+        const int curr_brf_src_offset = BIDIR_PRED_PERIOD - 1;
+        if ((i + curr_brf_src_offset) >=
+            (rc->baseline_gf_interval - rc->source_alt_ref_pending)) {
+          gf_group->update_type[frame_index] = LF_UPDATE;
+          gf_group->bidir_pred_enabled[frame_index] = 0;
+          gf_group->brf_src_offset[frame_index] = 0;
+        } else {
+          gf_group->update_type[frame_index] = BRF_UPDATE;
+          gf_group->bidir_pred_enabled[frame_index] = 1;
+          gf_group->brf_src_offset[frame_index] = curr_brf_src_offset;
+        }
+      } else if (bidir_pred_frame_index == BIDIR_PRED_PERIOD) {
+        gf_group->update_type[frame_index] = LASTNRF_UPDATE;
+        gf_group->bidir_pred_enabled[frame_index] = 1;
+        gf_group->brf_src_offset[frame_index] = 0;
+        // Reset the bidir_pred index.
+        bidir_pred_frame_index = 0;
+      } else {
+        gf_group->update_type[frame_index] = NRF_UPDATE;
+        gf_group->bidir_pred_enabled[frame_index] = 1;
+        gf_group->brf_src_offset[frame_index] = 0;
+      }
 
-    gf_group->bit_allocation[frame_index] = target_frame_size;
+      bidir_pred_frame_index++;
+    } else {
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      gf_group->update_type[frame_index] = LF_UPDATE;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      gf_group->bidir_pred_enabled[frame_index] = 0;
+      gf_group->brf_src_offset[frame_index] = 0;
+    }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    if (gf_group->update_type[frame_index] == BRF_UPDATE) {
+      // Boost the bits allocated to BWDREF_FRAME.
+      // (zoeliu): gf_group->rf_level[frame_index] = GF_ARF_LOW;
+      gf_group->rf_level[frame_index] = INTER_HIGH;
+      gf_group->bit_allocation[frame_index] =
+          target_frame_size + (target_frame_size >> 2);
+    } else if (gf_group->update_type[frame_index] == LASTNRF_UPDATE) {
+      gf_group->rf_level[frame_index] = INTER_NORMAL;
+      gf_group->bit_allocation[frame_index] =
+          VPXMAX(0, target_frame_size - (target_frame_size >> 1));
+    } else {
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      gf_group->rf_level[frame_index] = INTER_NORMAL;
+      gf_group->bit_allocation[frame_index] = target_frame_size;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
     ++frame_index;
   }
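
The loop above thus alternates BRF_UPDATE and LASTNRF_UPDATE frames through
the GF group, boosting the former by 25% of the per-frame target and trimming
the latter by 50%. A minimal standalone sketch of the resulting schedule,
assuming BIDIR_PRED_PERIOD == 2 and an illustrative group length and target
(all names are local to the sketch, not encoder API):

#include <stdio.h>

#define BIDIR_PRED_PERIOD 2

/* Sketch only: emulate the update-type decisions of the loop above for the
 * in-between frames of one GF group, assuming an alt-ref is pending. */
int main(void) {
  const int gf_interval = 8;   /* illustrative GF group length */
  const int arf_pending = 1;   /* alt-ref in use */
  const int target = 1000;     /* per-frame bit target (illustrative) */
  int bidir_idx = 1;           /* starts at 1, as in the code above */
  int i;

  for (i = 0; i < gf_interval - arf_pending; ++i) {
    const char *type;
    int bits = target;
    if (bidir_idx == 1) {
      const int brf_offset = BIDIR_PRED_PERIOD - 1;
      if (i + brf_offset >= gf_interval - arf_pending) {
        type = "LF_UPDATE";            /* BRF would point past the group */
      } else {
        type = "BRF_UPDATE";           /* boosted: +25% bits */
        bits = target + (target >> 2);
      }
    } else if (bidir_idx == BIDIR_PRED_PERIOD) {
      type = "LASTNRF_UPDATE";         /* reduced: -50% bits */
      bits = target - (target >> 1);
      bidir_idx = 0;
    } else {
      type = "NRF_UPDATE";             /* only reachable for PERIOD > 2 */
    }
    bidir_idx++;
    printf("frame %d: %-14s %d bits\n", i, type, bits);
  }
  return 0;
}
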
 
@@ -1747,6 +1825,10 @@
     gf_group->update_type[frame_index] = GF_UPDATE;
     gf_group->rf_level[frame_index] = GF_ARF_STD;
   }
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  gf_group->bidir_pred_enabled[frame_index] = 0;
+  gf_group->brf_src_offset[frame_index] = 0;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
 
   // Note whether multi-arf was enabled this group for next time.
   cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled;
@@ -1837,6 +1919,7 @@
     int int_lbq =
       (int)(vp10_convert_qindex_to_q(rc->last_boosted_qindex,
                                      cpi->common.bit_depth));
+
     active_min_gf_interval = rc->min_gf_interval + VPXMIN(2, int_max_q / 200);
     if (active_min_gf_interval > rc->max_gf_interval)
       active_min_gf_interval = rc->max_gf_interval;
@@ -2399,33 +2482,88 @@
   TWO_PASS *const twopass = &cpi->twopass;
 
   cpi->rc.is_src_frame_alt_ref = 0;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  cpi->rc.is_bwd_ref_frame = 0;
+  cpi->rc.is_last_nonref_frame = 0;
+  cpi->rc.is_nonref_frame = 0;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
   switch (twopass->gf_group.update_type[twopass->gf_group.index]) {
     case KF_UPDATE:
       cpi->refresh_last_frame = 1;
       cpi->refresh_golden_frame = 1;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      cpi->refresh_bwd_ref_frame = 1;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
       cpi->refresh_alt_ref_frame = 1;
       break;
+
     case LF_UPDATE:
       cpi->refresh_last_frame = 1;
       cpi->refresh_golden_frame = 0;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      cpi->refresh_bwd_ref_frame = 0;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
       cpi->refresh_alt_ref_frame = 0;
       break;
+
     case GF_UPDATE:
       cpi->refresh_last_frame = 1;
       cpi->refresh_golden_frame = 1;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      cpi->refresh_bwd_ref_frame = 0;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
       cpi->refresh_alt_ref_frame = 0;
       break;
+
     case OVERLAY_UPDATE:
       cpi->refresh_last_frame = 0;
       cpi->refresh_golden_frame = 1;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      cpi->refresh_bwd_ref_frame = 0;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
       cpi->refresh_alt_ref_frame = 0;
       cpi->rc.is_src_frame_alt_ref = 1;
       break;
+
     case ARF_UPDATE:
       cpi->refresh_last_frame = 0;
       cpi->refresh_golden_frame = 0;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      cpi->refresh_bwd_ref_frame = 0;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
       cpi->refresh_alt_ref_frame = 1;
       break;
+
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    case BRF_UPDATE:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_bwd_ref_frame = 1;
+      cpi->refresh_alt_ref_frame = 0;
+      cpi->rc.is_bwd_ref_frame = 1;
+      break;
+
+    // TODO(zoeliu): When BIDIR_PRED and EXT_REFS start to work together, we
+    // may treat both LASTNRF and NRF as one of the LAST reference frames.
+
+    case LASTNRF_UPDATE:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+      cpi->rc.is_last_nonref_frame = 1;
+      break;
+
+    case NRF_UPDATE:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+      cpi->rc.is_nonref_frame = 1;
+      break;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
     default:
       assert(0);
       break;
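
For quick reference, the refresh pattern established by the switch above can
be condensed into a table; a sketch only (not encoder code), with the
BIDIR_PRED rows applying when the experiment is enabled:

/* Rows: KF, LF, GF, OVERLAY, ARF, BRF, LASTNRF, NRF. Columns: refresh_last,
 * refresh_golden, refresh_bwd_ref, refresh_alt_ref. */
static const int refresh_table[8][4] = {
  { 1, 1, 1, 1 },  /* KF_UPDATE */
  { 1, 0, 0, 0 },  /* LF_UPDATE */
  { 1, 1, 0, 0 },  /* GF_UPDATE */
  { 0, 1, 0, 0 },  /* OVERLAY_UPDATE (also sets rc.is_src_frame_alt_ref) */
  { 0, 0, 0, 1 },  /* ARF_UPDATE */
  { 0, 0, 1, 0 },  /* BRF_UPDATE (also sets rc.is_bwd_ref_frame) */
  { 0, 0, 0, 0 },  /* LASTNRF_UPDATE (sets rc.is_last_nonref_frame) */
  { 0, 0, 0, 0 },  /* NRF_UPDATE (sets rc.is_nonref_frame) */
};
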
@@ -2515,6 +2653,7 @@
     rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
     rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
   }
+
   vp10_zero(this_frame);
   if (EOF == input_stats(twopass, &this_frame))
     return;
diff --git a/vp10/encoder/firstpass.h b/vp10/encoder/firstpass.h
index 68a8887..a09e523 100644
--- a/vp10/encoder/firstpass.h
+++ b/vp10/encoder/firstpass.h
@@ -72,7 +72,16 @@
   GF_UPDATE = 2,
   ARF_UPDATE = 3,
   OVERLAY_UPDATE = 4,
+#if CONFIG_BIDIR_PRED
+  BRF_UPDATE = 5,  // Backward Reference Frame
+  // Among the NRFs within a BIDIR_PRED period, only the last one needs to
+  // update LAST_FRAME; none of the others requires any reference update.
+  LASTNRF_UPDATE = 6,  // Last Non-Reference Frame
+  NRF_UPDATE = 7,  // Non-Reference Frame, but not the last one
+  FRAME_UPDATE_TYPES = 8
+#else
   FRAME_UPDATE_TYPES = 5
+#endif  // CONFIG_BIDIR_PRED
 } FRAME_UPDATE_TYPE;
 
 #define FC_ANIMATION_THRESH 0.15
@@ -89,6 +98,10 @@
   unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
   unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1];
   unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
+#if CONFIG_BIDIR_PRED
+  unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
+  unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1];
+#endif  // CONFIG_BIDIR_PRED
   int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1];
 } GF_GROUP;
 
diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c
index ff0a7c6..33bcab4 100644
--- a/vp10/encoder/hybrid_fwd_txfm.c
+++ b/vp10/encoder/hybrid_fwd_txfm.c
@@ -13,7 +13,6 @@
 #include "./vpx_dsp_rtcd.h"
 
 #include "vp10/common/idct.h"
-#include "vp10/common/vp10_fwd_txfm2d_cfg.h"
 #include "vp10/encoder/hybrid_fwd_txfm.h"
 
 static INLINE void fdct32x32(int rd_transform, const int16_t *src,
@@ -196,7 +195,7 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
+      vp10_fwd_txfm2d_4x4(src_diff, coeff, diff_stride, tx_type, bd);
       break;
     case V_DCT:
     case H_DCT:
@@ -212,7 +211,6 @@
 #endif  // CONFIG_EXT_TX
     default:
       assert(0);
-      break;
   }
 }
 
@@ -233,7 +231,7 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
+      vp10_fwd_txfm2d_8x8(src_diff, coeff, diff_stride, tx_type, bd);
       break;
     case V_DCT:
     case H_DCT:
@@ -250,7 +248,6 @@
 #endif  // CONFIG_EXT_TX
     default:
       assert(0);
-      break;
   }
 }
 
@@ -271,7 +268,7 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
+      vp10_fwd_txfm2d_16x16(src_diff, coeff, diff_stride, tx_type, bd);
       break;
     case V_DCT:
     case H_DCT:
@@ -288,7 +285,6 @@
 #endif  // CONFIG_EXT_TX
     default:
       assert(0);
-      break;
   }
 }
 
diff --git a/vp10/encoder/lookahead.c b/vp10/encoder/lookahead.c
index dce0139..bc7b404 100644
--- a/vp10/encoder/lookahead.c
+++ b/vp10/encoder/lookahead.c
@@ -47,13 +47,13 @@
 
 
 struct lookahead_ctx *vp10_lookahead_init(unsigned int width,
-                                         unsigned int height,
-                                         unsigned int subsampling_x,
-                                         unsigned int subsampling_y,
+                                          unsigned int height,
+                                          unsigned int subsampling_x,
+                                          unsigned int subsampling_y,
 #if CONFIG_VP9_HIGHBITDEPTH
-                                         int use_highbitdepth,
+                                          int use_highbitdepth,
 #endif
-                                         unsigned int depth) {
+                                          unsigned int depth) {
   struct lookahead_ctx *ctx = NULL;
 
   // Clamp the lookahead queue depth
diff --git a/vp10/encoder/ratectrl.c b/vp10/encoder/ratectrl.c
index 6068775..c1dc71e 100644
--- a/vp10/encoder/ratectrl.c
+++ b/vp10/encoder/ratectrl.c
@@ -240,11 +240,14 @@
   RATE_CONTROL *const rc = &cpi->rc;
 
   // Non-viewable frames are a special case and are treated as pure overhead.
-  if (!cm->show_frame) {
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  if (!cm->show_frame && !rc->is_bwd_ref_frame)
+#else
+  if (!cm->show_frame)
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
     rc->bits_off_target -= encoded_frame_size;
-  } else {
+  else
     rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
-  }
 
   // Clip the buffer level to the maximum specified buffer size.
   rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size);
@@ -946,8 +949,13 @@
 
 int vp10_frame_type_qdelta(const VP10_COMP *cpi, int rf_level, int q) {
   static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    0.80,  // INTER_NORMAL
+    1.25,  // INTER_HIGH
+#else
     1.00,  // INTER_NORMAL
     1.00,  // INTER_HIGH
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
     1.50,  // GF_ARF_LOW
     1.75,  // GF_ARF_STD
     2.00,  // KF_STD
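
Under BIDIR_PRED the correction factors now diverge: regular inter frames are
steered toward roughly 80% of the baseline rate while INTER_HIGH frames (used
for BRF_UPDATE) are steered toward 125%, shifting bits onto the backward
reference. A hedged arithmetic sketch of the effect (the helper below is
illustrative, not the ratectrl API):

#include <stdio.h>

/* Illustrative only: scale a baseline per-frame rate target by the rf_level
 * factor; the deltas above bias Q selection per frame class this way. */
static double scaled_rate(double base_rate, double rf_factor) {
  return base_rate * rf_factor;
}

int main(void) {
  const double base = 1000.0;  /* arbitrary baseline, in bits */
  printf("INTER_NORMAL: %.0f bits\n", scaled_rate(base, 0.80));  /* 800 */
  printf("INTER_HIGH:   %.0f bits\n", scaled_rate(base, 1.25));  /* 1250 */
  printf("GF_ARF_STD:   %.0f bits\n", scaled_rate(base, 1.75));  /* 1750 */
  return 0;
}
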
@@ -1282,7 +1290,7 @@
     }
   }
 
-  // Keep record of last boosted (KF/KF/ARF) Q value.
+  // Keep record of last boosted (KF/GF/ARF) Q value.
   // If the current frame is coded at a lower Q then we also update it.
   // If all mbs in this group are skipped only update if the Q value is
   // better than that already stored.
@@ -1314,7 +1322,12 @@
 
   // Actual bits spent
   rc->total_actual_bits += rc->projected_frame_size;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  rc->total_target_bits += (cm->show_frame || rc->is_bwd_ref_frame) ?
+                            rc->avg_frame_bandwidth : 0;
+#else
   rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
 
   rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
 
@@ -1328,7 +1341,12 @@
 
   if (cm->frame_type == KEY_FRAME)
     rc->frames_since_key = 0;
+
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  if (cm->show_frame || rc->is_bwd_ref_frame) {
+#else
   if (cm->show_frame) {
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
     rc->frames_since_key++;
     rc->frames_to_key--;
   }
diff --git a/vp10/encoder/ratectrl.h b/vp10/encoder/ratectrl.h
index 0b9fd45..ed51f12 100644
--- a/vp10/encoder/ratectrl.h
+++ b/vp10/encoder/ratectrl.h
@@ -90,6 +90,14 @@
   int source_alt_ref_active;
   int is_src_frame_alt_ref;
 
+#if CONFIG_BIDIR_PRED
+  // NOTE: Different frame types may be allocated different numbers of bits,
+  //       aiming for the best overall RD performance.
+  int is_bwd_ref_frame;
+  int is_last_nonref_frame;
+  int is_nonref_frame;
+#endif  // CONFIG_BIDIR_PRED
+
   int avg_frame_bandwidth;  // Average frame size target for clip
   int min_frame_bandwidth;  // Minimum allocation used for any frame
   int max_frame_bandwidth;  // Maximum burst rate allowed for a frame.
diff --git a/vp10/encoder/rd.c b/vp10/encoder/rd.c
index f935e35..37ee4fc 100644
--- a/vp10/encoder/rd.c
+++ b/vp10/encoder/rd.c
@@ -223,7 +223,11 @@
   8, 8, 4, 4, 2, 2, 1, 0
 };
 static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = {
-  128, 144, 128, 128, 144
+  128, 144, 128, 128, 144,
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  // TODO(zoeliu): Further tune the following factor values.
+  128, 128, 128
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
 };
 
 int vp10_compute_rd_mult(const VP10_COMP *cpi, int qindex) {
@@ -729,10 +733,10 @@
   int inter_filter_cost = 0;
   int dir;
 
-  for (dir = 0; dir < 4; ++dir) {
-    const int frame_idx = (dir >> 1);
-    if (mbmi->ref_frame[frame_idx] > INTRA_FRAME &&
-        has_subpel_mv_component(xd, dir)) {
+  for (dir = 0; dir < 2; ++dir) {
+    if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
+        (mbmi->ref_frame[1] > INTRA_FRAME &&
+         has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
       const int ctx = vp10_get_pred_context_switchable_interp(xd, dir);
       inter_filter_cost +=
           cpi->switchable_interp_costs[ctx][mbmi->interp_filter[dir]];
@@ -768,6 +772,10 @@
     rd->thresh_mult[THR_NEARESTL2] = 300;
     rd->thresh_mult[THR_NEARESTL3] = 300;
     rd->thresh_mult[THR_NEARESTL4] = 300;
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+    rd->thresh_mult[THR_NEARESTB] = 300;
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
     rd->thresh_mult[THR_NEARESTG] = 300;
     rd->thresh_mult[THR_NEARESTA] = 300;
@@ -777,6 +785,10 @@
     rd->thresh_mult[THR_NEARESTL2] = 0;
     rd->thresh_mult[THR_NEARESTL3] = 0;
     rd->thresh_mult[THR_NEARESTL4] = 0;
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+    rd->thresh_mult[THR_NEARESTB] = 0;
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
     rd->thresh_mult[THR_NEARESTG] = 0;
     rd->thresh_mult[THR_NEARESTA] = 0;
@@ -789,6 +801,10 @@
   rd->thresh_mult[THR_NEWL2] += 1000;
   rd->thresh_mult[THR_NEWL3] += 1000;
   rd->thresh_mult[THR_NEWL4] += 1000;
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  rd->thresh_mult[THR_NEWB] += 1000;
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   rd->thresh_mult[THR_NEWA] += 1000;
   rd->thresh_mult[THR_NEWG] += 1000;
@@ -798,6 +814,10 @@
   rd->thresh_mult[THR_NEARL2] += 1000;
   rd->thresh_mult[THR_NEARL3] += 1000;
   rd->thresh_mult[THR_NEARL4] += 1000;
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  rd->thresh_mult[THR_NEARB] += 1000;
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   rd->thresh_mult[THR_NEARA] += 1000;
   rd->thresh_mult[THR_NEARG] += 1000;
@@ -808,6 +828,10 @@
   rd->thresh_mult[THR_NEWFROMNEARL2] += 1000;
   rd->thresh_mult[THR_NEWFROMNEARL3] += 1000;
   rd->thresh_mult[THR_NEWFROMNEARL4] += 1000;
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  rd->thresh_mult[THR_NEWFROMNEARB] += 1000;
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   rd->thresh_mult[THR_NEWFROMNEARG] += 1000;
   rd->thresh_mult[THR_NEWFROMNEARA] += 1000;
@@ -818,6 +842,10 @@
   rd->thresh_mult[THR_ZEROL2] += 2000;
   rd->thresh_mult[THR_ZEROL3] += 2000;
   rd->thresh_mult[THR_ZEROL4] += 2000;
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  rd->thresh_mult[THR_ZEROB] += 2000;
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   rd->thresh_mult[THR_ZEROG] += 2000;
   rd->thresh_mult[THR_ZEROA] += 2000;
@@ -879,20 +907,53 @@
   rd->thresh_mult[THR_COMP_NEW_NEARL4A] += 1700;
   rd->thresh_mult[THR_COMP_NEW_NEWL4A] += 2000;
   rd->thresh_mult[THR_COMP_ZERO_ZEROL4A] += 2500;
+
+#else  // CONFIG_EXT_REFS
+
+#if CONFIG_BIDIR_PRED
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] += 1000;
+
+  rd->thresh_mult[THR_COMP_NEAREST_NEARLB] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARGB] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARESTLB] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARESTGB] += 1200;
+
+  rd->thresh_mult[THR_COMP_NEAREST_NEWLB] += 1500;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWGB] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTLB] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTGB] += 1500;
+
+  rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1700;
+  rd->thresh_mult[THR_COMP_NEAR_NEWGB] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARLB] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARGB] += 1700;
+
+  rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2000;
+  rd->thresh_mult[THR_COMP_NEW_NEWGB] += 2000;
+
+  rd->thresh_mult[THR_COMP_ZERO_ZEROLB] += 2500;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROGB] += 2500;
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
-#else
+
+#else  // CONFIG_EXT_INTER
+
   rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
 #if CONFIG_EXT_REFS
   rd->thresh_mult[THR_COMP_NEARESTL2A] += 1000;
   rd->thresh_mult[THR_COMP_NEARESTL3A] += 1000;
   rd->thresh_mult[THR_COMP_NEARESTL4A] += 1000;
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  rd->thresh_mult[THR_COMP_NEARESTLB] += 1000;
+  rd->thresh_mult[THR_COMP_NEARESTGB] += 1000;
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   rd->thresh_mult[THR_COMP_NEARESTGA] += 1000;
 
   rd->thresh_mult[THR_COMP_NEARLA] += 1500;
   rd->thresh_mult[THR_COMP_NEWLA] += 2000;
-  rd->thresh_mult[THR_COMP_NEARGA] += 1500;
-  rd->thresh_mult[THR_COMP_NEWGA] += 2000;
 #if CONFIG_EXT_REFS
   rd->thresh_mult[THR_COMP_NEARL2A] += 1500;
   rd->thresh_mult[THR_COMP_NEWL2A] += 2000;
@@ -900,15 +961,30 @@
   rd->thresh_mult[THR_COMP_NEWL3A] += 2000;
   rd->thresh_mult[THR_COMP_NEARL4A] += 1500;
   rd->thresh_mult[THR_COMP_NEWL4A] += 2000;
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  rd->thresh_mult[THR_COMP_NEARLB] += 1500;
+  rd->thresh_mult[THR_COMP_NEARGB] += 1500;
+  rd->thresh_mult[THR_COMP_NEWLB] += 2000;
+  rd->thresh_mult[THR_COMP_NEWGB] += 2000;
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_NEARGA] += 1500;
+  rd->thresh_mult[THR_COMP_NEWGA] += 2000;
 
   rd->thresh_mult[THR_COMP_ZEROLA] += 2500;
 #if CONFIG_EXT_REFS
   rd->thresh_mult[THR_COMP_ZEROL2A] += 2500;
   rd->thresh_mult[THR_COMP_ZEROL3A] += 2500;
   rd->thresh_mult[THR_COMP_ZEROL4A] += 2500;
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  rd->thresh_mult[THR_COMP_ZEROLB] += 2500;
+  rd->thresh_mult[THR_COMP_ZEROGB] += 2500;
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   rd->thresh_mult[THR_COMP_ZEROGA] += 2500;
+
 #endif  // CONFIG_EXT_INTER
 
   rd->thresh_mult[THR_H_PRED] += 2000;
@@ -964,9 +1040,14 @@
 #if CONFIG_EXT_REFS
     {2500, 2500, 2500, 2500, 2500, 2500, 4500, 4500, 4500, 4500, 4500, 2500},
     {2000, 2000, 2000, 2000, 2000, 2000, 4000, 4000, 4000, 4000, 4000, 2000}
-#else
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+    {2500, 2500, 2500, 2500, 4500, 4500, 4500, 4500, 2500},
+    {2000, 2000, 2000, 2000, 4000, 4000, 4000, 4000, 2000}
+#else  // CONFIG_BIDIR_PRED
     {2500, 2500, 2500, 4500, 4500, 2500},
     {2000, 2000, 2000, 4000, 4000, 2000}
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   };
   RD_OPT *const rd = &cpi->rd;
diff --git a/vp10/encoder/rd.h b/vp10/encoder/rd.h
index 2e67663..624f2d2 100644
--- a/vp10/encoder/rd.h
+++ b/vp10/encoder/rd.h
@@ -44,23 +44,43 @@
 #define INVALID_MV 0x80008000
 
 #if CONFIG_EXT_REFS
+
 #if CONFIG_EXT_INTER
 #define MAX_MODES 114
-#else
+#else  // CONFIG_EXT_INTER
 #define MAX_MODES 54
 #endif  // CONFIG_EXT_INTER
-#else
+
+#else  // CONFIG_EXT_REFS
+
+#if CONFIG_BIDIR_PRED
+
+#if CONFIG_EXT_INTER
+#define MAX_MODES 80
+#else  // CONFIG_EXT_INTER
+#define MAX_MODES 42
+#endif  // CONFIG_EXT_INTER
+
+#else  // CONFIG_BIDIR_PRED
+
 #if CONFIG_EXT_INTER
 #define MAX_MODES 57
-#else
+#else  // CONFIG_EXT_INTER
 #define MAX_MODES 30
 #endif  // CONFIG_EXT_INTER
+
+#endif  // CONFIG_BIDIR_PRED
+
 #endif  // CONFIG_EXT_REFS
 
 #if CONFIG_EXT_REFS
 #define MAX_REFS  12
-#else
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+#define MAX_REFS  9
+#else  // CONFIG_BIDIR_PRED
 #define MAX_REFS  6
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
 
 #define RD_THRESH_MAX_FACT 64
@@ -74,6 +94,10 @@
   THR_NEARESTL2,
   THR_NEARESTL3,
   THR_NEARESTL4,
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  THR_NEARESTB,
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   THR_NEARESTA,
   THR_NEARESTG,
@@ -85,6 +109,10 @@
   THR_NEWL2,
   THR_NEWL3,
   THR_NEWL4,
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  THR_NEWB,
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   THR_NEWA,
   THR_NEWG,
@@ -94,6 +122,10 @@
   THR_NEARL2,
   THR_NEARL3,
   THR_NEARL4,
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  THR_NEARB,
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   THR_NEARA,
   THR_NEARG,
@@ -104,6 +136,10 @@
   THR_NEWFROMNEARL2,
   THR_NEWFROMNEARL3,
   THR_NEWFROMNEARL4,
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  THR_NEWFROMNEARB,
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   THR_NEWFROMNEARA,
   THR_NEWFROMNEARG,
@@ -114,6 +150,10 @@
   THR_ZEROL2,
   THR_ZEROL3,
   THR_ZEROL4,
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  THR_ZEROB,
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   THR_ZEROG,
   THR_ZEROA,
@@ -124,6 +164,11 @@
   THR_COMP_NEAREST_NEARESTL2A,
   THR_COMP_NEAREST_NEARESTL3A,
   THR_COMP_NEAREST_NEARESTL4A,
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  THR_COMP_NEAREST_NEARESTLB,
+  THR_COMP_NEAREST_NEARESTGB,
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   THR_COMP_NEAREST_NEARESTGA,
 #else  // CONFIG_EXT_INTER
@@ -132,6 +177,11 @@
   THR_COMP_NEARESTL2A,
   THR_COMP_NEARESTL3A,
   THR_COMP_NEARESTL4A,
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  THR_COMP_NEARESTLB,
+  THR_COMP_NEARESTGB,
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   THR_COMP_NEARESTGA,
 #endif  // CONFIG_EXT_INTER
@@ -188,8 +238,31 @@
   THR_COMP_NEAR_NEWL4A,
   THR_COMP_NEW_NEWL4A,
   THR_COMP_ZERO_ZEROL4A,
+
+#else  // CONFIG_EXT_REFS
+
+#if CONFIG_BIDIR_PRED
+  THR_COMP_NEAR_NEARESTLB,
+  THR_COMP_NEAR_NEARESTGB,
+  THR_COMP_NEAREST_NEARLB,
+  THR_COMP_NEAREST_NEARGB,
+  THR_COMP_NEW_NEARESTLB,
+  THR_COMP_NEW_NEARESTGB,
+  THR_COMP_NEAREST_NEWLB,
+  THR_COMP_NEAREST_NEWGB,
+  THR_COMP_NEW_NEARLB,
+  THR_COMP_NEW_NEARGB,
+  THR_COMP_NEAR_NEWLB,
+  THR_COMP_NEAR_NEWGB,
+  THR_COMP_NEW_NEWLB,
+  THR_COMP_NEW_NEWGB,
+  THR_COMP_ZERO_ZEROLB,
+  THR_COMP_ZERO_ZEROGB,
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
-#else
+
+#else  // CONFIG_EXT_INTER
+
   THR_COMP_NEARLA,
   THR_COMP_NEWLA,
 #if CONFIG_EXT_REFS
@@ -199,6 +272,13 @@
   THR_COMP_NEWL3A,
   THR_COMP_NEARL4A,
   THR_COMP_NEWL4A,
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  THR_COMP_NEARLB,
+  THR_COMP_NEWLB,
+  THR_COMP_NEARGB,
+  THR_COMP_NEWGB,
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   THR_COMP_NEARGA,
   THR_COMP_NEWGA,
@@ -208,6 +288,11 @@
   THR_COMP_ZEROL2A,
   THR_COMP_ZEROL3A,
   THR_COMP_ZEROL4A,
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  THR_COMP_ZEROLB,
+  THR_COMP_ZEROGB,
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   THR_COMP_ZEROGA,
 #endif  // CONFIG_EXT_INTER
@@ -262,14 +347,24 @@
   THR_LAST2,
   THR_LAST3,
   THR_LAST4,
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  THR_BWDR,
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   THR_GOLD,
   THR_ALTR,
+
   THR_COMP_LA,
 #if CONFIG_EXT_REFS
   THR_COMP_L2A,
   THR_COMP_L3A,
   THR_COMP_L4A,
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  THR_COMP_LB,
+  THR_COMP_GB,
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   THR_COMP_GA,
   THR_INTRA,
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index c111b56..bfd34c9 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -65,7 +65,22 @@
                                  (1 << LAST2_FRAME) | (1 << INTRA_FRAME) | \
                                  (1 << LAST3_FRAME) | (1 << LAST4_FRAME))
 
-#else
+#else  // CONFIG_EXT_REFS
+
+#if CONFIG_BIDIR_PRED
+
+#define LAST_FRAME_MODE_MASK    ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
+                                 (1 << BWDREF_FRAME) | (1 << INTRA_FRAME))
+#define GOLDEN_FRAME_MODE_MASK  ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | \
+                                 (1 << BWDREF_FRAME) | (1 << INTRA_FRAME))
+#define BWD_REF_MODE_MASK       ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | \
+                                 (1 << ALTREF_FRAME) | (1 << INTRA_FRAME))
+// TODO(zoeliu): Rename the following to ALTREF_MODE_MASK.
+#define ALT_REF_MODE_MASK       ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | \
+                                 (1 << BWDREF_FRAME) | (1 << INTRA_FRAME))
+
+#else  // CONFIG_BIDIR_PRED
 
 #define LAST_FRAME_MODE_MASK    ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
                                  (1 << INTRA_FRAME))
@@ -74,9 +89,16 @@
 #define ALT_REF_MODE_MASK       ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | \
                                  (1 << INTRA_FRAME))
 
+#endif  // CONFIG_BIDIR_PRED
+
 #endif  // CONFIG_EXT_REFS
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+#define SECOND_REF_FRAME_MASK   ((1 << ALTREF_FRAME) | (1 << BWDREF_FRAME) | \
+                                 0x01)
+#else
 #define SECOND_REF_FRAME_MASK   ((1 << ALTREF_FRAME) | 0x01)
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
 
 #define MIN_EARLY_TERM_INDEX    3
 #define NEW_MV_DISCOUNT_FACTOR  8
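
These masks are consumed as plain bitsets over the reference-frame enum. A
minimal sketch of the idiom, assuming the BIDIR_PRED frame ordering (LAST = 1,
GOLDEN = 2, BWDREF = 3, ALTREF = 4); the helper is illustrative, not the
exact rdopt check:

#include <stdio.h>

/* Sketch: bitset membership test over reference frames, in the style of the
 * *_MODE_MASK / SECOND_REF_FRAME_MASK definitions above. */
enum { INTRA_FRAME = 0, LAST_FRAME, GOLDEN_FRAME, BWDREF_FRAME, ALTREF_FRAME };

#define LAST_FRAME_MODE_MASK ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
                              (1 << BWDREF_FRAME) | (1 << INTRA_FRAME))

static int in_mask(int mask, int ref_frame) {
  return (mask & (1 << ref_frame)) != 0;
}

int main(void) {
  /* LAST_FRAME itself is not in its own mask; the other refs are. */
  printf("LAST in mask:   %d\n", in_mask(LAST_FRAME_MODE_MASK, LAST_FRAME));
  printf("BWDREF in mask: %d\n", in_mask(LAST_FRAME_MODE_MASK, BWDREF_FRAME));
  return 0;
}
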
@@ -122,6 +144,10 @@
   {NEARESTMV, {LAST2_FRAME,  NONE}},
   {NEARESTMV, {LAST3_FRAME,  NONE}},
   {NEARESTMV, {LAST4_FRAME,  NONE}},
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  {NEARESTMV, {BWDREF_FRAME, NONE}},
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   {NEARESTMV, {ALTREF_FRAME, NONE}},
   {NEARESTMV, {GOLDEN_FRAME, NONE}},
@@ -133,6 +159,10 @@
   {NEWMV,     {LAST2_FRAME,  NONE}},
   {NEWMV,     {LAST3_FRAME,  NONE}},
   {NEWMV,     {LAST4_FRAME,  NONE}},
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  {NEWMV,     {BWDREF_FRAME, NONE}},
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   {NEWMV,     {ALTREF_FRAME, NONE}},
   {NEWMV,     {GOLDEN_FRAME, NONE}},
@@ -142,6 +172,10 @@
   {NEARMV,    {LAST2_FRAME,  NONE}},
   {NEARMV,    {LAST3_FRAME,  NONE}},
   {NEARMV,    {LAST4_FRAME,  NONE}},
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  {NEARMV,    {BWDREF_FRAME, NONE}},
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   {NEARMV,    {ALTREF_FRAME, NONE}},
   {NEARMV,    {GOLDEN_FRAME, NONE}},
@@ -152,6 +186,10 @@
   {NEWFROMNEARMV,    {LAST2_FRAME,  NONE}},
   {NEWFROMNEARMV,    {LAST3_FRAME,  NONE}},
   {NEWFROMNEARMV,    {LAST4_FRAME,  NONE}},
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  {NEWFROMNEARMV,    {BWDREF_FRAME, NONE}},
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   {NEWFROMNEARMV,    {ALTREF_FRAME, NONE}},
   {NEWFROMNEARMV,    {GOLDEN_FRAME, NONE}},
@@ -162,16 +200,27 @@
   {ZEROMV,    {LAST2_FRAME,  NONE}},
   {ZEROMV,    {LAST3_FRAME,  NONE}},
   {ZEROMV,    {LAST4_FRAME,  NONE}},
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  {ZEROMV,    {BWDREF_FRAME, NONE}},
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   {ZEROMV,    {GOLDEN_FRAME, NONE}},
   {ZEROMV,    {ALTREF_FRAME, NONE}},
 
+  // TODO(zoeliu): May need to reconsider the order in which modes are checked.
+
 #if CONFIG_EXT_INTER
   {NEAREST_NEARESTMV, {LAST_FRAME,   ALTREF_FRAME}},
 #if CONFIG_EXT_REFS
   {NEAREST_NEARESTMV, {LAST2_FRAME,  ALTREF_FRAME}},
   {NEAREST_NEARESTMV, {LAST3_FRAME,  ALTREF_FRAME}},
   {NEAREST_NEARESTMV, {LAST4_FRAME,  ALTREF_FRAME}},
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  {NEAREST_NEARESTMV, {LAST_FRAME,   BWDREF_FRAME}},
+  {NEAREST_NEARESTMV, {GOLDEN_FRAME, BWDREF_FRAME}},
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   {NEAREST_NEARESTMV, {GOLDEN_FRAME, ALTREF_FRAME}},
 #else  // CONFIG_EXT_INTER
@@ -180,6 +229,11 @@
   {NEARESTMV, {LAST2_FRAME,  ALTREF_FRAME}},
   {NEARESTMV, {LAST3_FRAME,  ALTREF_FRAME}},
   {NEARESTMV, {LAST4_FRAME,  ALTREF_FRAME}},
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  {NEARESTMV, {LAST_FRAME,   BWDREF_FRAME}},
+  {NEARESTMV, {GOLDEN_FRAME, BWDREF_FRAME}},
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   {NEARESTMV, {GOLDEN_FRAME, ALTREF_FRAME}},
 #endif  // CONFIG_EXT_INTER
@@ -193,14 +247,14 @@
   {NEAREST_NEARMV, {GOLDEN_FRAME, ALTREF_FRAME}},
   {NEAR_NEARMV,    {LAST_FRAME,   ALTREF_FRAME}},
   {NEAR_NEARMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
-  {NEW_NEARESTMV, {LAST_FRAME,   ALTREF_FRAME}},
-  {NEW_NEARESTMV, {GOLDEN_FRAME, ALTREF_FRAME}},
-  {NEAREST_NEWMV, {LAST_FRAME,   ALTREF_FRAME}},
-  {NEAREST_NEWMV, {GOLDEN_FRAME, ALTREF_FRAME}},
-  {NEW_NEARMV, {LAST_FRAME,   ALTREF_FRAME}},
-  {NEW_NEARMV, {GOLDEN_FRAME, ALTREF_FRAME}},
-  {NEAR_NEWMV, {LAST_FRAME,   ALTREF_FRAME}},
-  {NEAR_NEWMV, {GOLDEN_FRAME, ALTREF_FRAME}},
+  {NEW_NEARESTMV,  {LAST_FRAME,   ALTREF_FRAME}},
+  {NEW_NEARESTMV,  {GOLDEN_FRAME, ALTREF_FRAME}},
+  {NEAREST_NEWMV,  {LAST_FRAME,   ALTREF_FRAME}},
+  {NEAREST_NEWMV,  {GOLDEN_FRAME, ALTREF_FRAME}},
+  {NEW_NEARMV,     {LAST_FRAME,   ALTREF_FRAME}},
+  {NEW_NEARMV,     {GOLDEN_FRAME, ALTREF_FRAME}},
+  {NEAR_NEWMV,     {LAST_FRAME,   ALTREF_FRAME}},
+  {NEAR_NEWMV,     {GOLDEN_FRAME, ALTREF_FRAME}},
   {NEW_NEWMV,      {LAST_FRAME,   ALTREF_FRAME}},
   {NEW_NEWMV,      {GOLDEN_FRAME, ALTREF_FRAME}},
   {ZERO_ZEROMV,    {LAST_FRAME,   ALTREF_FRAME}},
@@ -235,8 +289,29 @@
   {NEAR_NEWMV,     {LAST4_FRAME,  ALTREF_FRAME}},
   {NEW_NEWMV,      {LAST4_FRAME,  ALTREF_FRAME}},
   {ZERO_ZEROMV,    {LAST4_FRAME,  ALTREF_FRAME}},
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  {NEAR_NEARESTMV, {LAST_FRAME,   BWDREF_FRAME}},
+  {NEAR_NEARESTMV, {GOLDEN_FRAME, BWDREF_FRAME}},
+  {NEAREST_NEARMV, {LAST_FRAME,   BWDREF_FRAME}},
+  {NEAREST_NEARMV, {GOLDEN_FRAME, BWDREF_FRAME}},
+  {NEW_NEARESTMV,  {LAST_FRAME,   BWDREF_FRAME}},
+  {NEW_NEARESTMV,  {GOLDEN_FRAME, BWDREF_FRAME}},
+  {NEAREST_NEWMV,  {LAST_FRAME,   BWDREF_FRAME}},
+  {NEAREST_NEWMV,  {GOLDEN_FRAME, BWDREF_FRAME}},
+  {NEW_NEARMV,     {LAST_FRAME,   BWDREF_FRAME}},
+  {NEW_NEARMV,     {GOLDEN_FRAME, BWDREF_FRAME}},
+  {NEAR_NEWMV,     {LAST_FRAME,   BWDREF_FRAME}},
+  {NEAR_NEWMV,     {GOLDEN_FRAME, BWDREF_FRAME}},
+  {NEW_NEWMV,      {LAST_FRAME,   BWDREF_FRAME}},
+  {NEW_NEWMV,      {GOLDEN_FRAME, BWDREF_FRAME}},
+  {ZERO_ZEROMV,    {LAST_FRAME,   BWDREF_FRAME}},
+  {ZERO_ZEROMV,    {GOLDEN_FRAME, BWDREF_FRAME}},
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
-#else
+
+#else  // CONFIG_EXT_INTER
+
   {NEARMV,    {LAST_FRAME,   ALTREF_FRAME}},
   {NEWMV,     {LAST_FRAME,   ALTREF_FRAME}},
 #if CONFIG_EXT_REFS
@@ -246,6 +321,13 @@
   {NEWMV,     {LAST3_FRAME,  ALTREF_FRAME}},
   {NEARMV,    {LAST4_FRAME,  ALTREF_FRAME}},
   {NEWMV,     {LAST4_FRAME,  ALTREF_FRAME}},
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  {NEARMV,    {LAST_FRAME,   BWDREF_FRAME}},
+  {NEWMV,     {LAST_FRAME,   BWDREF_FRAME}},
+  {NEARMV,    {GOLDEN_FRAME, BWDREF_FRAME}},
+  {NEWMV,     {GOLDEN_FRAME, BWDREF_FRAME}},
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   {NEARMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
   {NEWMV,     {GOLDEN_FRAME, ALTREF_FRAME}},
@@ -255,6 +337,11 @@
   {ZEROMV,    {LAST3_FRAME,  ALTREF_FRAME}},
   {ZEROMV,    {LAST2_FRAME,  ALTREF_FRAME}},
   {ZEROMV,    {LAST4_FRAME,  ALTREF_FRAME}},
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  {ZEROMV,    {LAST_FRAME,   BWDREF_FRAME}},
+  {ZEROMV,    {GOLDEN_FRAME, BWDREF_FRAME}},
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   {ZEROMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
 #endif  // CONFIG_EXT_INTER
@@ -309,19 +396,41 @@
   {{LAST2_FRAME,  NONE}},
   {{LAST3_FRAME,  NONE}},
   {{LAST4_FRAME,  NONE}},
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  {{BWDREF_FRAME, NONE}},
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   {{GOLDEN_FRAME, NONE}},
   {{ALTREF_FRAME, NONE}},
+
   {{LAST_FRAME,   ALTREF_FRAME}},
 #if CONFIG_EXT_REFS
   {{LAST2_FRAME,  ALTREF_FRAME}},
   {{LAST3_FRAME,  ALTREF_FRAME}},
   {{LAST4_FRAME,  ALTREF_FRAME}},
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+  {{LAST_FRAME,   BWDREF_FRAME}},
+  {{GOLDEN_FRAME, BWDREF_FRAME}},
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
   {{GOLDEN_FRAME, ALTREF_FRAME}},
   {{INTRA_FRAME,  NONE}},
 };
 
+#if CONFIG_DUAL_FILTER
+// TODO(jingning): The magic number 9 is the number of combinations of
+// prediction filter types for the vertical and horizontal directions
+// (3 x 3). It will be replaced once the dual filter experiment is
+// integrated with the ext-interp experiment.
+static int filter_sets[9][2] = {
+    {0, 0}, {0, 1}, {0, 2},
+    {1, 0}, {1, 1}, {1, 2},
+    {2, 0}, {2, 1}, {2, 2},
+};
+#endif
+
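
A runnable sketch of how the table is meant to be walked: each index selects
one (vertical, horizontal) filter pair, applied identically to both reference
frames, matching the interpolation-filter search later in this patch:

#include <stdio.h>

/* Sketch: enumerate the dual-filter combinations as the search loop does
 * (filter ids 0..2 stand in for the three filter types). */
int main(void) {
  static const int filter_sets[9][2] = {
      {0, 0}, {0, 1}, {0, 2},
      {1, 0}, {1, 1}, {1, 2},
      {2, 0}, {2, 1}, {2, 2},
  };
  int i;
  for (i = 0; i < 9; ++i) {
    /* interp_filter[0..1] take (vert, horz) for the first ref frame;
     * interp_filter[2..3] mirror them for the second ref frame. */
    printf("combo %d: vert=%d horz=%d\n",
           i, filter_sets[i][0], filter_sets[i][1]);
  }
  return 0;
}
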
 static INLINE int write_uniform_cost(int n, int v) {
   int l = get_unsigned_bits(n), m = (1 << l) - n;
   if (l == 0)
@@ -540,8 +649,6 @@
   get_energy_distribution_fine(cpi, bsize, src, src_stride,
                                dst, dst_stride, hdist, vdist);
 
-
-
   svm_proj_v = vdist[0] * ADST_FLIP_SVM[0] +
                vdist[1] * ADST_FLIP_SVM[1] +
                vdist[2] * ADST_FLIP_SVM[2] + ADST_FLIP_SVM[3];
@@ -717,7 +824,7 @@
 
 static void model_rd_for_sb(VP10_COMP *cpi, BLOCK_SIZE bsize,
                             MACROBLOCK *x, MACROBLOCKD *xd,
-                            int num_planes,
+                            int plane_from, int plane_to,
                             int *out_rate_sum, int64_t *out_dist_sum,
                             int *skip_txfm_sb, int64_t *skip_sse_sb) {
   // Note our transform coeffs are 8 times an orthogonal transform.
@@ -744,7 +851,7 @@
 
   x->pred_sse[ref] = 0;
 
-  for (i = 0; i < num_planes; ++i) {
+  for (i = plane_from; i <= plane_to; ++i) {
     struct macroblock_plane *const p = &x->plane[i];
     struct macroblockd_plane *const pd = &xd->plane[i];
     const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
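
With the inclusive (plane_from, plane_to) range, callers now select exactly
the planes to model; the two call forms used later in this patch are:

/* All planes (luma and both chroma): */
model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1,
                &rate_sum, &dist_sum, &tmp_skip_sb, &tmp_skip_sse);
/* Luma only, e.g. inside the wedge search: */
model_rd_for_sb(cpi, bsize, x, xd, 0, 0,
                &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
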
@@ -1997,8 +2104,8 @@
   uint16_t best_dst16[8 * 8];
 #endif
 
-  memcpy(ta, a, sizeof(ta));
-  memcpy(tl, l, sizeof(tl));
+  memcpy(ta, a, num_4x4_blocks_wide * sizeof(a[0]));
+  memcpy(tl, l, num_4x4_blocks_high * sizeof(l[0]));
   xd->mi[0]->mbmi.tx_size = TX_4X4;
   xd->mi[0]->mbmi.palette_mode_info.palette_size[0] = 0;
 
@@ -2020,8 +2127,8 @@
           continue;
       }
 
-      memcpy(tempa, ta, sizeof(ta));
-      memcpy(templ, tl, sizeof(tl));
+      memcpy(tempa, ta, num_4x4_blocks_wide * sizeof(ta[0]));
+      memcpy(templ, tl, num_4x4_blocks_high * sizeof(tl[0]));
 
       for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
         for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
@@ -2105,8 +2212,8 @@
         *bestdistortion = distortion;
         best_rd = this_rd;
         *best_mode = mode;
-        memcpy(a, tempa, sizeof(tempa));
-        memcpy(l, templ, sizeof(templ));
+        memcpy(a, tempa, num_4x4_blocks_wide * sizeof(tempa[0]));
+        memcpy(l, templ, num_4x4_blocks_high * sizeof(templ[0]));
         for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
           memcpy(best_dst16 + idy * 8,
                  CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
@@ -2146,8 +2253,8 @@
         continue;
     }
 
-    memcpy(tempa, ta, sizeof(ta));
-    memcpy(templ, tl, sizeof(tl));
+    memcpy(tempa, ta, num_4x4_blocks_wide * sizeof(ta[0]));
+    memcpy(templ, tl, num_4x4_blocks_high * sizeof(tl[0]));
 
     for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
       for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
@@ -2230,8 +2337,8 @@
       *bestdistortion = distortion;
       best_rd = this_rd;
       *best_mode = mode;
-      memcpy(a, tempa, sizeof(tempa));
-      memcpy(l, templ, sizeof(templ));
+      memcpy(a, tempa, num_4x4_blocks_wide * sizeof(tempa[0]));
+      memcpy(l, templ, num_4x4_blocks_high * sizeof(templ[0]));
       for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
         memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
                num_4x4_blocks_wide * 4);
@@ -2267,12 +2374,8 @@
   int64_t total_distortion = 0;
   int tot_rate_y = 0;
   int64_t total_rd = 0;
-  ENTROPY_CONTEXT t_above[4], t_left[4];
   const int *bmode_costs = cpi->mbmode_cost[0];
 
-  memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
-  memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
-
 #if CONFIG_EXT_INTRA
   mic->mbmi.ext_intra_mode_info.use_ext_intra_mode[0] = 0;
   mic->mbmi.intra_filter = INTRA_FILTER_LINEAR;
@@ -2298,7 +2401,9 @@
       }
 
       this_rd = rd_pick_intra4x4block(cpi, mb, idy, idx, &best_mode,
-                                      bmode_costs, t_above + idx, t_left + idy,
+                                      bmode_costs,
+                                      xd->plane[0].above_context + idx,
+                                      xd->plane[0].left_context + idy,
                                       &r, &ry, &d, bsize, best_rd - total_rd);
       if (this_rd >= best_rd - total_rd)
         return INT64_MAX;
@@ -5691,8 +5796,10 @@
     if (cm->reference_mode != COMPOUND_REFERENCE) {
       vpx_prob ref_single_p1 = vp10_get_pred_prob_single_ref_p1(cm, xd);
       vpx_prob ref_single_p2 = vp10_get_pred_prob_single_ref_p2(cm, xd);
-#if CONFIG_EXT_REFS
+#if CONFIG_EXT_REFS || CONFIG_BIDIR_PRED
       vpx_prob ref_single_p3 = vp10_get_pred_prob_single_ref_p3(cm, xd);
+#endif  // CONFIG_EXT_REFS || CONFIG_BIDIR_PRED
+#if CONFIG_EXT_REFS
       vpx_prob ref_single_p4 = vp10_get_pred_prob_single_ref_p4(cm, xd);
       vpx_prob ref_single_p5 = vp10_get_pred_prob_single_ref_p5(cm, xd);
 #endif  // CONFIG_EXT_REFS
@@ -5703,6 +5810,10 @@
           ref_costs_single[LAST2_FRAME] =
           ref_costs_single[LAST3_FRAME] =
           ref_costs_single[LAST4_FRAME] =
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+          ref_costs_single[BWDREF_FRAME] =
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
           ref_costs_single[GOLDEN_FRAME] =
           ref_costs_single[ALTREF_FRAME] = base_cost;
@@ -5726,12 +5837,18 @@
       ref_costs_single[LAST2_FRAME]  += vp10_cost_bit(ref_single_p4, 1);
       ref_costs_single[LAST3_FRAME]  += vp10_cost_bit(ref_single_p5, 0);
       ref_costs_single[LAST4_FRAME]  += vp10_cost_bit(ref_single_p5, 1);
-#else
+#else  // CONFIG_EXT_REFS
       ref_costs_single[LAST_FRAME]   += vp10_cost_bit(ref_single_p1, 0);
       ref_costs_single[GOLDEN_FRAME] += vp10_cost_bit(ref_single_p1, 1);
-      ref_costs_single[ALTREF_FRAME] += vp10_cost_bit(ref_single_p1, 1);
       ref_costs_single[GOLDEN_FRAME] += vp10_cost_bit(ref_single_p2, 0);
+      ref_costs_single[ALTREF_FRAME] += vp10_cost_bit(ref_single_p1, 1);
       ref_costs_single[ALTREF_FRAME] += vp10_cost_bit(ref_single_p2, 1);
+#if CONFIG_BIDIR_PRED
+      ref_costs_single[ALTREF_FRAME] += vp10_cost_bit(ref_single_p3, 1);
+      ref_costs_single[BWDREF_FRAME] += vp10_cost_bit(ref_single_p1, 1);
+      ref_costs_single[BWDREF_FRAME] += vp10_cost_bit(ref_single_p2, 1);
+      ref_costs_single[BWDREF_FRAME] += vp10_cost_bit(ref_single_p3, 0);
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
     } else {
       ref_costs_single[LAST_FRAME]   = 512;
@@ -5739,6 +5856,10 @@
       ref_costs_single[LAST2_FRAME]  = 512;
       ref_costs_single[LAST3_FRAME]  = 512;
       ref_costs_single[LAST4_FRAME]  = 512;
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+      ref_costs_single[BWDREF_FRAME] = 512;
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
       ref_costs_single[GOLDEN_FRAME] = 512;
       ref_costs_single[ALTREF_FRAME] = 512;
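
The single-reference coding tree under BIDIR_PRED therefore reads: the first
bit separates LAST from the rest, the second separates GOLDEN, and the new
third bit separates BWDREF from ALTREF. A runnable sketch of the cost
accumulation, with cost() as a toy stand-in for vp10_cost_bit(prob, bit):

#include <stdio.h>

/* Sketch: per-reference bit-cost accumulation along the tree above; the toy
 * cost values are illustrative, only the tree shape matters. */
static int cost(int bit) { return bit ? 300 : 200; }

int main(void) {
  const int base = 100;  /* toy intra/inter signaling cost */
  const int last   = base + cost(0);
  const int golden = base + cost(1) + cost(0);
  const int bwdref = base + cost(1) + cost(1) + cost(0);
  const int altref = base + cost(1) + cost(1) + cost(1);
  printf("LAST=%d GOLDEN=%d BWDREF=%d ALTREF=%d\n",
         last, golden, bwdref, altref);
  return 0;
}
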
@@ -5750,7 +5871,12 @@
       vpx_prob ref_comp_p1 = vp10_get_pred_prob_comp_ref_p1(cm, xd);
       vpx_prob ref_comp_p2 = vp10_get_pred_prob_comp_ref_p2(cm, xd);
       vpx_prob ref_comp_p3 = vp10_get_pred_prob_comp_ref_p3(cm, xd);
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+      vpx_prob bwdref_comp_p = vp10_get_pred_prob_comp_bwdref_p(cm, xd);
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
+
       unsigned int base_cost = vp10_cost_bit(intra_inter_p, 1);
 
       ref_costs_comp[LAST_FRAME] =
@@ -5761,6 +5887,12 @@
 #endif  // CONFIG_EXT_REFS
           ref_costs_comp[GOLDEN_FRAME] = base_cost;
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      // NOTE(zoeliu): BWDREF and ALTREF each code one extra bit, which is
+      //               added to their costs below.
+      ref_costs_comp[BWDREF_FRAME] = ref_costs_comp[ALTREF_FRAME] = 0;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
 #if CONFIG_EXT_REFS
       ref_costs_comp[LAST_FRAME]   += vp10_cost_bit(ref_comp_p, 0);
       ref_costs_comp[LAST2_FRAME]  += vp10_cost_bit(ref_comp_p, 0);
@@ -5776,9 +5908,13 @@
 
       ref_costs_comp[LAST3_FRAME]  += vp10_cost_bit(ref_comp_p3, 1);
       ref_costs_comp[LAST4_FRAME]  += vp10_cost_bit(ref_comp_p3, 0);
-#else
+#else  // CONFIG_EXT_REFS
       ref_costs_comp[LAST_FRAME]   += vp10_cost_bit(ref_comp_p, 0);
       ref_costs_comp[GOLDEN_FRAME] += vp10_cost_bit(ref_comp_p, 1);
+#if CONFIG_BIDIR_PRED
+      ref_costs_comp[BWDREF_FRAME] += vp10_cost_bit(bwdref_comp_p, 0);
+      ref_costs_comp[ALTREF_FRAME] += vp10_cost_bit(bwdref_comp_p, 1);
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
     } else {
       ref_costs_comp[LAST_FRAME]   = 512;
@@ -5786,6 +5922,11 @@
       ref_costs_comp[LAST2_FRAME]  = 512;
       ref_costs_comp[LAST3_FRAME]  = 512;
       ref_costs_comp[LAST4_FRAME]  = 512;
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+      ref_costs_comp[BWDREF_FRAME] = 512;
+      ref_costs_comp[ALTREF_FRAME] = 512;
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
       ref_costs_comp[GOLDEN_FRAME] = 512;
     }
@@ -6384,6 +6525,56 @@
                xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
 }
 
+#if CONFIG_EXT_INTER
+static int estimate_wedge_sign(const VP10_COMP *cpi,
+                               const MACROBLOCK *x,
+                               const BLOCK_SIZE bsize,
+                               uint8_t *pred0, int stride0,
+                               uint8_t *pred1, int stride1) {
+  const struct macroblock_plane *const p = &x->plane[0];
+  const uint8_t *src = p->src.buf;
+  int src_stride = p->src.stride;
+  const int f_index = bsize - BLOCK_8X8;
+  const int bw = 4 << (b_width_log2_lookup[bsize]);
+  const int bh = 4 << (b_height_log2_lookup[bsize]);
+  uint32_t esq[2][4], var;
+  int64_t tl, br;
+
+  var = cpi->fn_ptr[f_index].vf(
+      src, src_stride,
+      pred0, stride0, &esq[0][0]);
+  var = cpi->fn_ptr[f_index].vf(
+      src + bw / 2, src_stride,
+      pred0 + bw / 2, stride0, &esq[0][1]);
+  var = cpi->fn_ptr[f_index].vf(
+      src + bh / 2 * src_stride, src_stride,
+      pred0 + bh / 2 * stride0, stride0, &esq[0][2]);
+  var = cpi->fn_ptr[f_index].vf(
+      src + bh / 2 * src_stride + bw / 2, src_stride,
+      pred0 + bh / 2 * stride0 + bw / 2, stride0, &esq[0][3]);
+  var = cpi->fn_ptr[f_index].vf(
+      src, src_stride,
+      pred1, stride1, &esq[1][0]);
+  var = cpi->fn_ptr[f_index].vf(
+      src + bw / 2, src_stride,
+      pred1 + bw / 2, stride1, &esq[1][1]);
+  var = cpi->fn_ptr[f_index].vf(
+      src + bh / 2 * src_stride, src_stride,
+      pred1 + bh / 2 * stride1, stride1, &esq[1][2]);
+  var = cpi->fn_ptr[f_index].vf(
+      src + bh / 2 * src_stride + bw / 2, src_stride,
+      pred1 + bh / 2 * stride1 + bw / 2, stride1, &esq[1][3]);
+  (void) var;
+
+  tl = (int64_t)(esq[0][0] + esq[0][1] + esq[0][2]) -
+       (int64_t)(esq[1][0] + esq[1][1] + esq[1][2]);
+  br = (int64_t)(esq[1][3] + esq[1][1] + esq[1][2]) -
+       (int64_t)(esq[0][3] + esq[0][1] + esq[0][2]);
+  return (tl + br > 0);
+}
+#endif  // CONFIG_EXT_INTER
+
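
The sign decision above compares aggregate quadrant errors along the two
diagonals. A small runnable sketch with made-up SSE values (esq[p][q]:
prediction p, quadrant q in raster order):

#include <stdint.h>
#include <stdio.h>

/* Sketch of the estimate_wedge_sign() decision: sign 1 is chosen when pred1
 * fits the top-left region better and pred0 the bottom-right, i.e. when
 * tl + br > 0. Values are illustrative only. */
int main(void) {
  /* esq[p][q]: SSE of prediction p on quadrant q (TL, TR, BL, BR). */
  const uint32_t esq[2][4] = {
    { 900, 500, 450, 100 },  /* pred0: poor top-left, good bottom-right */
    { 200, 300, 250, 800 },  /* pred1: good top-left, poor bottom-right */
  };
  const int64_t tl = (int64_t)(esq[0][0] + esq[0][1] + esq[0][2]) -
                     (int64_t)(esq[1][0] + esq[1][1] + esq[1][2]);
  const int64_t br = (int64_t)(esq[1][3] + esq[1][1] + esq[1][2]) -
                     (int64_t)(esq[0][3] + esq[0][1] + esq[0][2]);
  printf("tl=%lld br=%lld sign=%d\n",
         (long long)tl, (long long)br, (int)(tl + br > 0));
  return 0;
}
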
+#if !CONFIG_DUAL_FILTER
 static INTERP_FILTER predict_interp_filter(const VP10_COMP *cpi,
                                            const MACROBLOCK *x,
                                            const BLOCK_SIZE bsize,
@@ -6503,6 +6694,7 @@
   }
   return best_filter;
 }
+#endif
 
 static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
                                  BLOCK_SIZE bsize,
@@ -6590,7 +6782,16 @@
   uint8_t *orig_dst[MAX_MB_PLANE];
   int orig_dst_stride[MAX_MB_PLANE];
   int rs = 0;
+#if CONFIG_DUAL_FILTER
+  // Index use case:
+  // {0, 1} -> (vertical, horizontal) filter types for the first ref frame
+  // {2, 3} -> (vertical, horizontal) filter types for the second ref frame
+  INTERP_FILTER best_filter[4] = {
+      SWITCHABLE, SWITCHABLE, SWITCHABLE, SWITCHABLE,
+  };
+#else
   INTERP_FILTER best_filter = SWITCHABLE;
+#endif
   uint8_t skip_txfm[MAX_MB_PLANE][MAX_TX_BLOCKS_IN_MAX_SB] = {{0}};
   int64_t bsse[MAX_MB_PLANE][MAX_TX_BLOCKS_IN_MAX_SB] = {{0}};
 
@@ -6887,22 +7088,31 @@
   if (is_comp_pred)
     intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);
 
+#if !CONFIG_DUAL_FILTER
   best_filter = predict_interp_filter(cpi, x, bsize, mi_row, mi_col,
                                       single_filter);
-  if (cm->interp_filter != BILINEAR && best_filter == SWITCHABLE) {
+#endif
+
+  if (cm->interp_filter != BILINEAR) {
     int newbest;
     int tmp_rate_sum = 0;
     int64_t tmp_dist_sum = 0;
 
+#if CONFIG_DUAL_FILTER
+    for (i = 0; i < 9; ++i) {
+#else
     for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+#endif
       int j;
       int64_t rs_rd;
       int tmp_skip_sb = 0;
       int64_t tmp_skip_sse = INT64_MAX;
 
 #if CONFIG_DUAL_FILTER
-      for (j = 0; j < 4; ++j)
-        mbmi->interp_filter[j] = i;
+      mbmi->interp_filter[0] = filter_sets[i][0];
+      mbmi->interp_filter[1] = filter_sets[i][1];
+      mbmi->interp_filter[2] = filter_sets[i][0];
+      mbmi->interp_filter[3] = filter_sets[i][1];
 #else
       mbmi->interp_filter = i;
 #endif
@@ -6946,8 +7156,8 @@
           }
         }
         vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-        model_rd_for_sb(cpi, bsize, x, xd, MAX_MB_PLANE, &rate_sum, &dist_sum,
-                        &tmp_skip_sb, &tmp_skip_sse);
+        model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1,
+                        &rate_sum, &dist_sum, &tmp_skip_sb, &tmp_skip_sse);
 
         rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
         if (cm->interp_filter == SWITCHABLE)
@@ -6970,7 +7180,10 @@
       if (newbest) {
         best_rd = rd;
 #if CONFIG_DUAL_FILTER
-        best_filter = mbmi->interp_filter[0];
+        best_filter[0] = mbmi->interp_filter[0];
+        best_filter[1] = mbmi->interp_filter[1];
+        best_filter[2] = mbmi->interp_filter[2];
+        best_filter[3] = mbmi->interp_filter[3];
 #else
         best_filter = mbmi->interp_filter;
 #endif
@@ -7002,11 +7215,15 @@
 
   // Set the appropriate filter
 #if CONFIG_DUAL_FILTER
-  for (i = 0; i < 4; ++i) {
-    const int frame_idx = (i >> 1);
-    if (mbmi->ref_frame[frame_idx] > INTRA_FRAME)
-      mbmi->interp_filter[i] = cm->interp_filter != SWITCHABLE ?
-          cm->interp_filter : best_filter;
+  mbmi->interp_filter[0] = cm->interp_filter != SWITCHABLE ?
+      cm->interp_filter : best_filter[0];
+  mbmi->interp_filter[1] = cm->interp_filter != SWITCHABLE ?
+      cm->interp_filter : best_filter[1];
+  if (mbmi->ref_frame[1] > INTRA_FRAME) {
+    mbmi->interp_filter[2] = cm->interp_filter != SWITCHABLE ?
+        cm->interp_filter : best_filter[2];
+    mbmi->interp_filter[3] = cm->interp_filter != SWITCHABLE ?
+        cm->interp_filter : best_filter[3];
   }
 #else
   mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
@@ -7024,8 +7241,9 @@
 #endif  // CONFIG_OBMC
 
   if (is_comp_pred && is_interinter_wedge_used(bsize)) {
-    int wedge_index, best_wedge_index = WEDGE_NONE, rs;
-    int rate_sum;
+    int wedge_index, best_wedge_index = WEDGE_NONE;
+    int wedge_sign, best_wedge_sign = 0;
+    int rate_sum, rs;
     int64_t dist_sum;
     int64_t best_rd_nowedge = INT64_MAX;
     int64_t best_rd_wedge = INT64_MAX;
@@ -7034,6 +7252,7 @@
     int64_t tmp_skip_sse_sb;
 
     rs = vp10_cost_bit(cm->fc->wedge_interinter_prob[bsize], 0);
+    mbmi->use_wedge_interinter = 0;
     vp10_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
     vp10_subtract_plane(x, bsize, 0);
     rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
@@ -7042,41 +7261,43 @@
     if (rd != INT64_MAX)
       rd = RDCOST(x->rdmult, x->rddiv, rs + rate_mv + rate_sum, dist_sum);
     best_rd_nowedge = rd;
-    mbmi->use_wedge_interinter = 0;
 
     // Disable wedge search if the source variance is small.
     if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
-        best_rd_nowedge < 3 * ref_best_rd) {
+        best_rd_nowedge / 3 < ref_best_rd) {
+      uint8_t pred0[2 * MAX_SB_SQUARE * 3];
+      uint8_t pred1[2 * MAX_SB_SQUARE * 3];
+      uint8_t *preds0[3] = {pred0,
+        pred0 + 2 * MAX_SB_SQUARE,
+        pred0 + 4 * MAX_SB_SQUARE};
+      uint8_t *preds1[3] = {pred1,
+        pred1 + 2 * MAX_SB_SQUARE,
+        pred1 + 4 * MAX_SB_SQUARE};
+      int strides[3] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE};
+      int est_wedge_sign;
 
       mbmi->use_wedge_interinter = 1;
-      rs = vp10_cost_literal(1 + get_wedge_bits_lookup[bsize]) +
+      rs = vp10_cost_literal(get_interinter_wedge_bits(bsize)) +
           vp10_cost_bit(cm->fc->wedge_interinter_prob[bsize], 1);
-      wedge_types = (1 << get_wedge_bits_lookup[bsize]);
-      if (have_newmv_in_inter_mode(this_mode)) {
-        int_mv tmp_mv[2];
-        int rate_mvs[2], tmp_rate_mv = 0;
-        uint8_t pred0[2 * MAX_SB_SQUARE * 3];
-        uint8_t pred1[2 * MAX_SB_SQUARE * 3];
-        uint8_t *preds0[3] = {pred0,
-          pred0 + 2 * MAX_SB_SQUARE,
-          pred0 + 4 * MAX_SB_SQUARE};
-        uint8_t *preds1[3] = {pred1,
-          pred1 + 2 * MAX_SB_SQUARE,
-          pred1 + 4 * MAX_SB_SQUARE};
-        int strides[3] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE};
-        vp10_build_inter_predictors_for_planes_single_buf(
-            xd, bsize, 0, 0,  mi_row, mi_col, 0, preds0, strides);
-        vp10_build_inter_predictors_for_planes_single_buf(
-            xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides);
+      wedge_types = (1 << get_wedge_bits_lookup(bsize));
 
-        for (wedge_index = 0; wedge_index < 2 * wedge_types; ++wedge_index) {
-          mbmi->interinter_wedge_index = wedge_index >> 1;
-          mbmi->interinter_wedge_sign = wedge_index & 1;
-          vp10_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0,
-                                                    mi_row, mi_col,
+      vp10_build_inter_predictors_for_planes_single_buf(
+          xd, bsize, 0, 0,  mi_row, mi_col, 0, preds0, strides);
+      vp10_build_inter_predictors_for_planes_single_buf(
+          xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides);
+
+      // Choose the best wedge
+      if (cpi->sf.fast_wedge_sign_estimate) {
+        est_wedge_sign = estimate_wedge_sign(
+            cpi, x, bsize, pred0, MAX_SB_SIZE, pred1, MAX_SB_SIZE);
+        best_wedge_sign = mbmi->interinter_wedge_sign = est_wedge_sign;
+        for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+          mbmi->interinter_wedge_index = wedge_index;
+          vp10_build_wedge_inter_predictor_from_buf(xd, bsize,
+                                                    0, 0, mi_row, mi_col,
                                                     preds0, strides,
                                                     preds1, strides);
-          model_rd_for_sb(cpi, bsize, x, xd, 1,
+          model_rd_for_sb(cpi, bsize, x, xd, 0, 0,
                           &rate_sum, &dist_sum,
                           &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
           rd = RDCOST(x->rdmult, x->rddiv, rs + rate_mv + rate_sum, dist_sum);
@@ -7085,8 +7306,34 @@
             best_rd_wedge = rd;
           }
         }
-        mbmi->interinter_wedge_index = best_wedge_index >> 1;
-        mbmi->interinter_wedge_sign = best_wedge_index & 1;
+      } else {
+        for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+          for (wedge_sign = 0; wedge_sign < 2; ++wedge_sign) {
+            mbmi->interinter_wedge_index = wedge_index;
+            mbmi->interinter_wedge_sign = wedge_sign;
+            vp10_build_wedge_inter_predictor_from_buf(xd, bsize,
+                                                      0, 0, mi_row, mi_col,
+                                                      preds0, strides,
+                                                      preds1, strides);
+            model_rd_for_sb(cpi, bsize, x, xd, 0, 0,
+                            &rate_sum, &dist_sum,
+                            &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
+            rd = RDCOST(x->rdmult, x->rddiv,
+                        rs + rate_mv + rate_sum, dist_sum);
+            if (rd < best_rd_wedge) {
+              best_wedge_index = wedge_index;
+              best_wedge_sign = wedge_sign;
+              best_rd_wedge = rd;
+            }
+          }
+        }
+      }
+      mbmi->interinter_wedge_index = best_wedge_index;
+      mbmi->interinter_wedge_sign = best_wedge_sign;
+
+      if (have_newmv_in_inter_mode(this_mode)) {
+        int_mv tmp_mv[2];
+        int rate_mvs[2], tmp_rate_mv = 0;
         if (this_mode == NEW_NEWMV) {
           int mv_idxs[2] = {0, 0};
           do_masked_motion_search_indexed(cpi, x,
@@ -7117,7 +7364,7 @@
           mbmi->mv[1].as_int = tmp_mv[1].as_int;
         }
         vp10_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
-        model_rd_for_sb(cpi, bsize, x, xd, 1, &rate_sum, &dist_sum,
+        model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
                         &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
         rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_mv + rate_sum, dist_sum);
         if (rd < best_rd_wedge) {
@@ -7131,7 +7378,6 @@
                                                     preds0, strides,
                                                     preds1, strides);
         }
-
         vp10_subtract_plane(x, bsize, 0);
         rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
                                  &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
@@ -7143,8 +7389,8 @@
 
         if (best_rd_wedge < best_rd_nowedge) {
           mbmi->use_wedge_interinter = 1;
-          mbmi->interinter_wedge_index = best_wedge_index >> 1;
-          mbmi->interinter_wedge_sign = best_wedge_index & 1;
+          mbmi->interinter_wedge_index = best_wedge_index;
+          mbmi->interinter_wedge_sign = best_wedge_sign;
           xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
           xd->mi[0]->bmi[0].as_mv[1].as_int = mbmi->mv[1].as_int;
           *rate2 += tmp_rate_mv - rate_mv;
@@ -7157,37 +7403,6 @@
           xd->mi[0]->bmi[0].as_mv[1].as_int = mbmi->mv[1].as_int;
         }
       } else {
-        uint8_t pred0[2 * MAX_SB_SQUARE * 3];
-        uint8_t pred1[2 * MAX_SB_SQUARE * 3];
-        uint8_t *preds0[3] = {pred0,
-          pred0 + 2 * MAX_SB_SQUARE,
-          pred0 + 4 * MAX_SB_SQUARE};
-        uint8_t *preds1[3] = {pred1,
-          pred1 + 2 * MAX_SB_SQUARE,
-          pred1 + 4 * MAX_SB_SQUARE};
-        int strides[3] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE};
-        vp10_build_inter_predictors_for_planes_single_buf(
-            xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides);
-        vp10_build_inter_predictors_for_planes_single_buf(
-            xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides);
-        for (wedge_index = 0; wedge_index < 2 * wedge_types; ++wedge_index) {
-          mbmi->interinter_wedge_index = wedge_index >> 1;
-          mbmi->interinter_wedge_sign = wedge_index & 1;
-          vp10_build_wedge_inter_predictor_from_buf(xd, bsize,
-                                                    0, 0, mi_row, mi_col,
-                                                    preds0, strides,
-                                                    preds1, strides);
-          model_rd_for_sb(cpi, bsize, x, xd, 1,
-                          &rate_sum, &dist_sum,
-                          &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
-          rd = RDCOST(x->rdmult, x->rddiv, rs + rate_mv + rate_sum, dist_sum);
-          if (rd < best_rd_wedge) {
-            best_wedge_index = wedge_index;
-            best_rd_wedge = rd;
-          }
-        }
-        mbmi->interinter_wedge_sign = best_wedge_index & 1;
-        mbmi->interinter_wedge_index = best_wedge_index >> 1;
         vp10_build_wedge_inter_predictor_from_buf(xd, bsize,
                                                   0, 0, mi_row, mi_col,
                                                   preds0, strides,
@@ -7197,12 +7412,12 @@
                                  &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
                                  INT64_MAX);
         if (rd != INT64_MAX)
           rd = RDCOST(x->rdmult, x->rddiv, rs + rate_mv + rate_sum, dist_sum);
         best_rd_wedge = rd;
         if (best_rd_wedge < best_rd_nowedge) {
           mbmi->use_wedge_interinter = 1;
-          mbmi->interinter_wedge_index = best_wedge_index >> 1;
-          mbmi->interinter_wedge_sign = best_wedge_index & 1;
+          mbmi->interinter_wedge_index = best_wedge_index;
+          mbmi->interinter_wedge_sign = best_wedge_sign;
         } else {
           mbmi->use_wedge_interinter = 0;
         }
@@ -7217,7 +7432,7 @@
 
     if (mbmi->use_wedge_interinter)
       *compmode_wedge_cost =
-          vp10_cost_literal(1 + get_wedge_bits_lookup[bsize]) +
+          vp10_cost_literal(get_interinter_wedge_bits(bsize)) +
           vp10_cost_bit(cm->fc->wedge_interinter_prob[bsize], 1);
     else
       *compmode_wedge_cost =
@@ -7241,6 +7456,7 @@
     DECLARE_ALIGNED(16, uint8_t,
                     intrapred_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
     uint8_t *intrapred;
+
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
       intrapred = CONVERT_TO_BYTEPTR(intrapred_);
@@ -7265,34 +7481,34 @@
           xd, bsize, 0, intrapred, MAX_SB_SIZE);
       vp10_combine_interintra(xd, bsize, 0, tmp_buf, MAX_SB_SIZE,
                               intrapred, MAX_SB_SIZE);
-      vp10_subtract_plane(x, bsize, 0);
-      rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
-                               &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
-                               INT64_MAX);
-      if (rd != INT64_MAX)
-        rd = RDCOST(x->rdmult, x->rddiv, rate_mv + rmode + rate_sum, dist_sum);
+      model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
+                      &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
+      rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_mv + rate_sum, dist_sum);
       if (rd < best_interintra_rd) {
         best_interintra_rd = rd;
         best_interintra_mode = mbmi->interintra_mode;
       }
     }
     mbmi->interintra_mode = best_interintra_mode;
+    rmode = interintra_mode_cost[mbmi->interintra_mode];
+    vp10_build_intra_predictors_for_interintra(
+        xd, bsize, 0, intrapred, MAX_SB_SIZE);
+    vp10_combine_interintra(xd, bsize, 0, tmp_buf, MAX_SB_SIZE,
+                            intrapred, MAX_SB_SIZE);
+    vp10_subtract_plane(x, bsize, 0);
+    rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+                             &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
+                             INT64_MAX);
+    if (rd != INT64_MAX)
+      rd = RDCOST(x->rdmult, x->rddiv, rate_mv + rmode + rate_sum, dist_sum);
+    best_interintra_rd = rd;
+
     if (ref_best_rd < INT64_MAX &&
         best_interintra_rd > 2 * ref_best_rd) {
       return INT64_MAX;
     }
-    vp10_build_intra_predictors_for_interintra(
-        xd, bsize, 0, intrapred, MAX_SB_SIZE);
-
-    rmode = interintra_mode_cost[mbmi->interintra_mode];
     if (is_interintra_wedge_used(bsize)) {
       rwedge = vp10_cost_bit(cm->fc->wedge_interintra_prob[bsize], 0);
-      vp10_combine_interintra(xd, bsize, 0, tmp_buf, MAX_SB_SIZE,
-                              intrapred, MAX_SB_SIZE);
-      vp10_subtract_plane(x, bsize, 0);
-      rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
-                               &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
-                               INT64_MAX);
       if (rd != INT64_MAX)
         rd = RDCOST(x->rdmult, x->rddiv,
                     rmode + rate_mv + rwedge + rate_sum, dist_sum);
@@ -7300,10 +7516,9 @@
 
       // Disable wedge search if source variance is small
       if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) {
-
         mbmi->use_wedge_interintra = 1;
-        wedge_types = (1 << get_wedge_bits_lookup[bsize]);
-        rwedge = vp10_cost_literal(get_wedge_bits_lookup[bsize]) +
+        wedge_types = (1 << get_wedge_bits_lookup(bsize));
+        rwedge = vp10_cost_literal(get_interintra_wedge_bits(bsize)) +
             vp10_cost_bit(cm->fc->wedge_interintra_prob[bsize], 1);
         for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
           mbmi->interintra_wedge_index = wedge_index;
@@ -7311,7 +7526,7 @@
           vp10_combine_interintra(xd, bsize, 0,
                                   tmp_buf, MAX_SB_SIZE,
                                   intrapred, MAX_SB_SIZE);
-          model_rd_for_sb(cpi, bsize, x, xd, 1,
+          model_rd_for_sb(cpi, bsize, x, xd, 0, 0,
                           &rate_sum, &dist_sum,
                           &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
           rd = RDCOST(x->rdmult, x->rddiv,
@@ -7333,7 +7548,7 @@
                                   0, mv_idx);
           mbmi->mv[0].as_int = tmp_mv.as_int;
           vp10_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
-          model_rd_for_sb(cpi, bsize, x, xd, 1, &rate_sum, &dist_sum,
+          model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
                           &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
           rd = RDCOST(x->rdmult, x->rddiv,
                       rmode + tmp_rate_mv + rwedge + rate_sum, dist_sum);
@@ -7390,7 +7605,7 @@
           cm->fc->wedge_interintra_prob[bsize], mbmi->use_wedge_interintra);
       if (mbmi->use_wedge_interintra) {
         *compmode_interintra_cost +=
-            vp10_cost_literal(get_wedge_bits_lookup[bsize]);
+            vp10_cost_literal(get_interintra_wedge_bits(bsize));
       }
     }
   } else if (is_interintra_allowed(mbmi)) {
@@ -7428,8 +7643,8 @@
     // switchable list (ex. bilinear) is indicated at the frame level, or
     // skip condition holds.
     vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-    model_rd_for_sb(cpi, bsize, x, xd, MAX_MB_PLANE, &tmp_rate, &tmp_dist,
-                    &skip_txfm_sb, &skip_sse_sb);
+    model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1,
+                    &tmp_rate, &tmp_dist, &skip_txfm_sb, &skip_sse_sb);
     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
     memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
     memcpy(bsse, x->bsse, sizeof(bsse));
@@ -7537,6 +7752,19 @@
 #else
     int tmp_rate2 = rate2_nocoeff;
 #endif  // CONFIG_EXT_INTER
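+    // Remember the interpolation filter(s) per OBMC mode so the filter used
+    // by the best mode can be restored once the OBMC search completes.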
+#if CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+    INTERP_FILTER obmc_interp_filter[2][2] = {
+        {mbmi->interp_filter[0], mbmi->interp_filter[1]},   // obmc == 0
+        {mbmi->interp_filter[0], mbmi->interp_filter[1]}    // obmc == 1
+    };
+#else
+    INTERP_FILTER obmc_interp_filter[2] = {
+        mbmi->interp_filter,  // obmc == 0
+        mbmi->interp_filter   // obmc == 1
+    };
+#endif  // CONFIG_DUAL_FILTER
+#endif  // CONFIG_EXT_INTERP
 
     if (mbmi->obmc) {
 #if CONFIG_EXT_INTER
@@ -7565,6 +7793,21 @@
 #else
         tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
 #endif  // CONFIG_EXT_INTER
+#if CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+        if (!has_subpel_mv_component(xd->mi[0], xd, 0))
+          obmc_interp_filter[1][0] = mbmi->interp_filter[0] = EIGHTTAP_REGULAR;
+        if (!has_subpel_mv_component(xd->mi[0], xd, 1))
+          obmc_interp_filter[1][1] = mbmi->interp_filter[1] = EIGHTTAP_REGULAR;
+#else
+        if (!vp10_is_interp_needed(xd))
+          obmc_interp_filter[1] = mbmi->interp_filter = EIGHTTAP_REGULAR;
+#endif  // CONFIG_DUAL_FILTER
+        // This is not quite correct with CONFIG_DUAL_FILTER when a filter
+        // is needed in only one direction
+        if (!vp10_is_interp_needed(xd))
+          tmp_rate2 -= rs;
+#endif  // CONFIG_EXT_INTERP
         vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
 #if CONFIG_EXT_INTER
       } else {
@@ -7575,8 +7818,8 @@
                                        NULL, NULL,
                                        dst_buf1, dst_stride1,
                                        dst_buf2, dst_stride2);
-      model_rd_for_sb(cpi, bsize, x, xd, MAX_MB_PLANE, &tmp_rate, &tmp_dist,
-                      &skip_txfm_sb, &skip_sse_sb);
+      model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1,
+                      &tmp_rate, &tmp_dist, &skip_txfm_sb, &skip_sse_sb);
     }
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -7705,6 +7948,14 @@
 #if CONFIG_OBMC
     tmp_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
     if (mbmi->obmc == 0 || (tmp_rd < best_rd)) {
+#if CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+      mbmi->interp_filter[0] = obmc_interp_filter[mbmi->obmc][0];
+      mbmi->interp_filter[1] = obmc_interp_filter[mbmi->obmc][1];
+#else
+      mbmi->interp_filter = obmc_interp_filter[mbmi->obmc];
+#endif  // CONFIG_DUAL_FILTER
+#endif  // CONFIG_EXT_INTERP
       best_mbmi = *mbmi;
       best_rd = tmp_rd;
       best_rate2 = *rate2;
@@ -8042,6 +8293,9 @@
     VP9_LAST4_FLAG,
 #endif  // CONFIG_EXT_REFS
     VP9_GOLD_FLAG,
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    VP9_BWD_FLAG,
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
     VP9_ALT_FLAG
   };
   int64_t best_rd = best_rd_so_far;
@@ -8220,8 +8474,17 @@
       // Skip checking missing references in both single and compound reference
       // modes. Note that a mode will be skipped iff both reference frames
       // are masked out.
-      ref_frame_skip_mask[0] |= (1 << ref_frame);
-      ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      if (ref_frame == BWDREF_FRAME || ref_frame == ALTREF_FRAME) {
+        ref_frame_skip_mask[0] |= (1 << ref_frame);
+        ref_frame_skip_mask[1] |= ((1 << ref_frame) | 0x01);
+      } else {
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+        ref_frame_skip_mask[0] |= (1 << ref_frame);
+        ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
     } else {
       for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
         // Skip fixed mv modes for poor references
@@ -8255,9 +8518,15 @@
           (1 << LAST2_FRAME) |
           (1 << LAST3_FRAME) |
           (1 << LAST4_FRAME) |
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+          (1 << BWDREF_FRAME) |
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
           (1 << GOLDEN_FRAME);
       ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+      // TODO(zoeliu): Further explore whether the following needs to be done
+      //               for BWDREF_FRAME as well.
       mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
       if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
         mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
@@ -8433,8 +8702,17 @@
           ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
           ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
           break;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+        case BWDREF_FRAME:
+          ref_frame_skip_mask[0] |= BWD_REF_MODE_MASK;
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+          break;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
         case ALTREF_FRAME:
           ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
           break;
         case NONE:
         case MAX_REF_FRAMES:
@@ -8459,6 +8737,14 @@
 
     comp_pred = second_ref_frame > INTRA_FRAME;
     if (comp_pred) {
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      // TODO(zoeliu): Further justify whether the following check is needed.
+      if (cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]
+          != LASTNRF_UPDATE && second_ref_frame == BWDREF_FRAME) {
+        continue;
+      }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
       if (!cpi->allow_comp_inter_inter)
         continue;
 
@@ -9688,6 +9974,9 @@
     VP9_LAST4_FLAG,
 #endif  // CONFIG_EXT_REFS
     VP9_GOLD_FLAG,
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+    VP9_BWD_FLAG,
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
     VP9_ALT_FLAG
   };
   int64_t best_rd = best_rd_so_far;
@@ -9698,7 +9987,11 @@
   int ref_index, best_ref_index = 0;
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
   vpx_prob comp_mode_p;
+#if CONFIG_DUAL_FILTER
+  INTERP_FILTER tmp_best_filter[4] = { 0 };
+#else
   INTERP_FILTER tmp_best_filter = SWITCHABLE;
+#endif
   int rate_uv_intra, rate_uv_tokenonly;
   int64_t dist_uv;
   int skip_uv;
@@ -9814,6 +10107,10 @@
                                       (1 << LAST2_FRAME) |
                                       (1 << LAST3_FRAME) |
                                       (1 << LAST4_FRAME) |
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+                                      (1 << BWDREF_FRAME) |
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
                                       (1 << ALTREF_FRAME);
             ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
@@ -9850,18 +10147,37 @@
                                       (1 << LAST2_FRAME) |
                                       (1 << LAST3_FRAME) |
                                       (1 << LAST4_FRAME) |
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+                                      (1 << BWDREF_FRAME) |
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
                                       (1 << ALTREF_FRAME);
             ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
             break;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+          case BWDREF_FRAME:
+            ref_frame_skip_mask[0] |= (1 << LAST_FRAME) |
+                                      (1 << GOLDEN_FRAME) |
+                                      (1 << ALTREF_FRAME);
+            ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+            break;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
           case ALTREF_FRAME:
             ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) |
 #if CONFIG_EXT_REFS
                                       (1 << LAST2_FRAME) |
                                       (1 << LAST3_FRAME) |
                                       (1 << LAST4_FRAME) |
+#else  // CONFIG_EXT_REFS
+#if CONFIG_BIDIR_PRED
+                                      (1 << BWDREF_FRAME) |
+#endif  // CONFIG_BIDIR_PRED
 #endif  // CONFIG_EXT_REFS
                                       (1 << LAST_FRAME);
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+            ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
             break;
           case NONE:
           case MAX_REF_FRAMES:
@@ -9884,6 +10200,13 @@
 
     comp_pred = second_ref_frame > INTRA_FRAME;
     if (comp_pred) {
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      // TODO(zoeliu): Further justify whether the following check is needed.
+      if (cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]
+          != LASTNRF_UPDATE && second_ref_frame == BWDREF_FRAME) {
+        continue;
+      }
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
       if (!cpi->allow_comp_inter_inter)
         continue;
       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
@@ -9992,7 +10315,11 @@
                              &x->mbmi_ext->ref_mvs[second_ref_frame][0] : NULL;
       b_mode_info tmp_best_bmodes[16];  // Should this be 4 ?
       MB_MODE_INFO tmp_best_mbmode;
+#if CONFIG_DUAL_FILTER
+      BEST_SEG_INFO bsi[9];
+#else
       BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
+#endif
       int pred_exists = 0;
       int uv_skippable;
 #if CONFIG_EXT_INTER
@@ -10022,26 +10349,53 @@
       mbmi->tx_type = DCT_DCT;
 
       if (cm->interp_filter != BILINEAR) {
+#if CONFIG_DUAL_FILTER
+        tmp_best_filter[0] = EIGHTTAP_REGULAR;
+        tmp_best_filter[1] = EIGHTTAP_REGULAR;
+        tmp_best_filter[2] = EIGHTTAP_REGULAR;
+        tmp_best_filter[3] = EIGHTTAP_REGULAR;
+#else
         tmp_best_filter = EIGHTTAP_REGULAR;
+#endif
         if (x->source_variance < sf->disable_filter_search_var_thresh) {
+#if CONFIG_DUAL_FILTER
+          tmp_best_filter[0] = EIGHTTAP_REGULAR;
+#else
           tmp_best_filter = EIGHTTAP_REGULAR;
+#endif
         } else if (sf->adaptive_pred_interp_filter == 1 &&
                    ctx->pred_interp_filter < SWITCHABLE) {
+#if CONFIG_DUAL_FILTER
+          tmp_best_filter[0] = ctx->pred_interp_filter;
+#else
           tmp_best_filter = ctx->pred_interp_filter;
+#endif
         } else if (sf->adaptive_pred_interp_filter == 2) {
-          tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE ?
+#if CONFIG_DUAL_FILTER
+          tmp_best_filter[0] = ctx->pred_interp_filter < SWITCHABLE ?
                               ctx->pred_interp_filter : 0;
+#else
+          tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE ?
+              ctx->pred_interp_filter : 0;
+#endif
         } else {
+#if CONFIG_DUAL_FILTER
+          for (switchable_filter_index = 0;
+               switchable_filter_index < 9;
+               ++switchable_filter_index) {
+#else
           for (switchable_filter_index = 0;
                switchable_filter_index < SWITCHABLE_FILTERS;
                ++switchable_filter_index) {
+#endif
             int newbest, rs;
             int64_t rs_rd;
             MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
 #if CONFIG_DUAL_FILTER
-            int dir;
-            for (dir = 0; dir < 4; ++dir)
-              mbmi->interp_filter[dir] = switchable_filter_index;
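+            // filter_sets[] enumerates the row/column filter pairs searched
+            // under CONFIG_DUAL_FILTER (3 x 3 combinations, hence the loop
+            // bound of 9 above).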
+            mbmi->interp_filter[0] = filter_sets[switchable_filter_index][0];
+            mbmi->interp_filter[1] = filter_sets[switchable_filter_index][1];
+            mbmi->interp_filter[2] = filter_sets[switchable_filter_index][0];
+            mbmi->interp_filter[3] = filter_sets[switchable_filter_index][1];
 #else
             mbmi->interp_filter = switchable_filter_index;
 #endif
@@ -10077,7 +10431,10 @@
             newbest = (tmp_rd < tmp_best_rd);
             if (newbest) {
 #if CONFIG_DUAL_FILTER
-              tmp_best_filter = mbmi->interp_filter[0];
+              tmp_best_filter[0] = mbmi->interp_filter[0];
+              tmp_best_filter[1] = mbmi->interp_filter[1];
+              tmp_best_filter[2] = mbmi->interp_filter[2];
+              tmp_best_filter[3] = mbmi->interp_filter[3];
 #else
               tmp_best_filter = mbmi->interp_filter;
 #endif
@@ -10113,9 +10470,14 @@
         continue;
 
 #if CONFIG_DUAL_FILTER
-      for (i = 0; i < 4; ++i)
-        mbmi->interp_filter[i] = (cm->interp_filter == SWITCHABLE ?
-                               tmp_best_filter : cm->interp_filter);
+      mbmi->interp_filter[0] = (cm->interp_filter == SWITCHABLE ?
+                             tmp_best_filter[0] : cm->interp_filter);
+      mbmi->interp_filter[1] = (cm->interp_filter == SWITCHABLE ?
+                             tmp_best_filter[1] : cm->interp_filter);
+      mbmi->interp_filter[2] = (cm->interp_filter == SWITCHABLE ?
+                             tmp_best_filter[2] : cm->interp_filter);
+      mbmi->interp_filter[3] = (cm->interp_filter == SWITCHABLE ?
+                             tmp_best_filter[3] : cm->interp_filter);
 #else
       mbmi->interp_filter = (cm->interp_filter == SWITCHABLE ?
                              tmp_best_filter : cm->interp_filter);
diff --git a/vp10/encoder/rdopt.h b/vp10/encoder/rdopt.h
index 2ca39a5..be6227b 100644
--- a/vp10/encoder/rdopt.h
+++ b/vp10/encoder/rdopt.h
@@ -90,24 +90,6 @@
                                    int use_fast_coef_casting);
 #endif  // CONFIG_SUPERTX
 
-static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref(VP10_COMP *cpi,
-                                                          const int ref) {
-  // Use up-sampled reference frames.
-  int ref_idx = 0;
-  if (ref == LAST_FRAME)
-#if CONFIG_EXT_REFS
-    ref_idx = cpi->lst_fb_idxes[ref - LAST_FRAME];
-#else
-    ref_idx = cpi->lst_fb_idx;
-#endif  // CONFIG_EXT_REFS
-  else if (ref == GOLDEN_FRAME)
-    ref_idx = cpi->gld_fb_idx;
-  else if (ref == ALTREF_FRAME)
-    ref_idx = cpi->alt_fb_idx;
-
-  return &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[ref_idx]].buf;
-}
-
 #if CONFIG_OBMC
 void calc_target_weighted_pred(VP10_COMMON *cm,
                                MACROBLOCK *x,
diff --git a/vp10/encoder/speed_features.c b/vp10/encoder/speed_features.c
index 155f28e..b766cae 100644
--- a/vp10/encoder/speed_features.c
+++ b/vp10/encoder/speed_features.c
@@ -165,6 +165,7 @@
     sf->use_transform_domain_distortion = 1;
 #if CONFIG_EXT_INTER
     sf->disable_wedge_search_var_thresh = 100;
+    sf->fast_wedge_sign_estimate = 1;
 #endif  // CONFIG_EXT_INTER
   }
 
@@ -283,6 +284,7 @@
   sf->use_upsampled_references = 0;
 #if CONFIG_EXT_INTER
   sf->disable_wedge_search_var_thresh = 100;
+  sf->fast_wedge_sign_estimate = 1;
 #endif  // CONFIG_EXT_INTER
 
   // Use transform domain distortion computation
@@ -517,6 +519,7 @@
 #endif  // CONFIG_EXT_TILE
 #if CONFIG_EXT_INTER
   sf->disable_wedge_search_var_thresh = 0;
+  sf->fast_wedge_sign_estimate = 0;
 #endif  // CONFIG_EXT_INTER
 
   for (i = 0; i < TX_SIZES; i++) {
diff --git a/vp10/encoder/speed_features.h b/vp10/encoder/speed_features.h
index 6cee748..ca6adbe 100644
--- a/vp10/encoder/speed_features.h
+++ b/vp10/encoder/speed_features.h
@@ -402,6 +402,9 @@
 #if CONFIG_EXT_INTER
   // A source variance threshold below which wedge search is disabled
   unsigned int disable_wedge_search_var_thresh;
+
+  // Whether fast wedge sign estimate is used
+  int fast_wedge_sign_estimate;
 #endif  // CONFIG_EXT_INTER
 
   // These bit masks allow you to enable or disable intra modes for each
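
A minimal sketch of the trade-off this flag controls, based on the rdopt.c
hunks above (the helper below is hypothetical, for illustration only):

    /* Number of wedge candidates sent through the RD model. */
    static int num_wedge_rd_evals(int wedge_types, int fast_sign_estimate) {
      /* Sign fixed once via estimate_wedge_sign() vs. searched per index. */
      return fast_sign_estimate ? wedge_types : 2 * wedge_types;
    }
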
diff --git a/vp10/encoder/x86/highbd_fwd_txfm_sse4.c b/vp10/encoder/x86/highbd_fwd_txfm_sse4.c
index ce9089e..dffdf20 100644
--- a/vp10/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/vp10/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -15,6 +15,7 @@
 #include "./vpx_config.h"
 #include "vp10/common/vp10_fwd_txfm2d_cfg.h"
 #include "vp10/common/vp10_txfm.h"
+#include "vp10/common/x86/highbd_txfm_utility_sse4.h"
 #include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 #include "vpx_ports/mem.h"
@@ -239,6 +240,43 @@
       fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
       write_buffer_4x4(in, coeff);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg = &fwd_txfm_2d_cfg_adst_dct_4;
+      load_buffer_4x4(input, in, input_stride, 1, 0, cfg->shift[0]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fdct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
+    case DCT_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_dct_adst_4;
+      load_buffer_4x4(input, in, input_stride, 0, 1, cfg->shift[0]);
+      fdct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
+    case FLIPADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+      load_buffer_4x4(input, in, input_stride, 1, 1, cfg->shift[0]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
+    case ADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+      load_buffer_4x4(input, in, input_stride, 0, 1, cfg->shift[0]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
+    case FLIPADST_ADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+      load_buffer_4x4(input, in, input_stride, 1, 0, cfg->shift[0]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
+#endif
     default:
       assert(0);
   }
@@ -369,30 +407,6 @@
   in[15] = _mm_srai_epi32(in[15], shift);
 }
 
-#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
-  do {                                \
-    __m128i u0, u1, u2, u3;           \
-    u0 = _mm_unpacklo_epi32(x0, x1);  \
-    u1 = _mm_unpackhi_epi32(x0, x1);  \
-    u2 = _mm_unpacklo_epi32(x2, x3);  \
-    u3 = _mm_unpackhi_epi32(x2, x3);  \
-    y0 = _mm_unpacklo_epi64(u0, u2);  \
-    y1 = _mm_unpackhi_epi64(u0, u2);  \
-    y2 = _mm_unpacklo_epi64(u1, u3);  \
-    y3 = _mm_unpackhi_epi64(u1, u3);  \
-  } while (0)
-
-static INLINE void transpose_8x8(const __m128i *in, __m128i *out) {
-  TRANSPOSE_4X4(in[0], in[2], in[4], in[6],
-                out[0], out[2], out[4], out[6]);
-  TRANSPOSE_4X4(in[1], in[3], in[5], in[7],
-                out[8], out[10], out[12], out[14]);
-  TRANSPOSE_4X4(in[8], in[10], in[12], in[14],
-                out[1], out[3], out[5], out[7]);
-  TRANSPOSE_4X4(in[9], in[11], in[13], in[15],
-                out[9], out[11], out[13], out[15]);
-}
-
 static INLINE void write_buffer_8x8(const __m128i *res, tran_low_t *output) {
   _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
   _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
@@ -960,6 +974,58 @@
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg = &fwd_txfm_2d_cfg_adst_dct_8;
+      load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+      col_txfm_8x8_rounding(out, -cfg->shift[1]);
+      transpose_8x8(out, in);
+      fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case DCT_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_dct_adst_8;
+      load_buffer_8x8(input, in, stride, 0, 1, cfg->shift[0]);
+      fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+      col_txfm_8x8_rounding(out, -cfg->shift[1]);
+      transpose_8x8(out, in);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case FLIPADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+      load_buffer_8x8(input, in, stride, 1, 1, cfg->shift[0]);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+      col_txfm_8x8_rounding(out, -cfg->shift[1]);
+      transpose_8x8(out, in);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case ADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+      load_buffer_8x8(input, in, stride, 0, 1, cfg->shift[0]);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+      col_txfm_8x8_rounding(out, -cfg->shift[1]);
+      transpose_8x8(out, in);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case FLIPADST_ADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+      load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+      col_txfm_8x8_rounding(out, -cfg->shift[1]);
+      transpose_8x8(out, in);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
   }
@@ -1708,47 +1774,6 @@
   col_txfm_8x8_rounding(&in[48], shift);
 }
 
-static void transpose_16x16(const __m128i *in, __m128i *out) {
-  // Upper left 8x8
-  TRANSPOSE_4X4(in[0], in[4], in[8], in[12],
-                out[0], out[4], out[8], out[12]);
-  TRANSPOSE_4X4(in[1], in[5], in[9], in[13],
-                out[16], out[20], out[24], out[28]);
-  TRANSPOSE_4X4(in[16], in[20], in[24], in[28],
-                out[1], out[5], out[9], out[13]);
-  TRANSPOSE_4X4(in[17], in[21], in[25], in[29],
-                out[17], out[21], out[25], out[29]);
-
-  // Upper right 8x8
-  TRANSPOSE_4X4(in[2], in[6], in[10], in[14],
-                out[32], out[36], out[40], out[44]);
-  TRANSPOSE_4X4(in[3], in[7], in[11], in[15],
-                out[48], out[52], out[56], out[60]);
-  TRANSPOSE_4X4(in[18], in[22], in[26], in[30],
-                out[33], out[37], out[41], out[45]);
-  TRANSPOSE_4X4(in[19], in[23], in[27], in[31],
-                out[49], out[53], out[57], out[61]);
-
-  // Lower left 8x8
-  TRANSPOSE_4X4(in[32], in[36], in[40], in[44],
-                out[2], out[6], out[10], out[14]);
-  TRANSPOSE_4X4(in[33], in[37], in[41], in[45],
-                out[18], out[22], out[26], out[30]);
-  TRANSPOSE_4X4(in[48], in[52], in[56], in[60],
-                out[3], out[7], out[11], out[15]);
-  TRANSPOSE_4X4(in[49], in[53], in[57], in[61],
-                out[19], out[23], out[27], out[31]);
-  // Lower right 8x8
-  TRANSPOSE_4X4(in[34], in[38], in[42], in[46],
-                out[34], out[38], out[42], out[46]);
-  TRANSPOSE_4X4(in[35], in[39], in[43], in[47],
-                out[50], out[54], out[58], out[62]);
-  TRANSPOSE_4X4(in[50], in[54], in[58], in[62],
-                out[35], out[39], out[43], out[47]);
-  TRANSPOSE_4X4(in[51], in[55], in[59], in[63],
-                out[51], out[55], out[59], out[63]);
-}
-
 static void write_buffer_16x16(const __m128i *in, tran_low_t *output) {
   const int size_8x8 = 16 * 4;
   write_buffer_8x8(&in[0], output);
@@ -1806,6 +1831,58 @@
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg = &fwd_txfm_2d_cfg_adst_dct_16;
+      load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+      col_txfm_16x16_rounding(out, -cfg->shift[1]);
+      transpose_16x16(out, in);
+      fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case DCT_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_dct_adst_16;
+      load_buffer_16x16(input, in, stride, 0, 1, cfg->shift[0]);
+      fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+      col_txfm_16x16_rounding(out, -cfg->shift[1]);
+      transpose_16x16(out, in);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case FLIPADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+      load_buffer_16x16(input, in, stride, 1, 1, cfg->shift[0]);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+      col_txfm_16x16_rounding(out, -cfg->shift[1]);
+      transpose_16x16(out, in);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case ADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+      load_buffer_16x16(input, in, stride, 0, 1, cfg->shift[0]);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+      col_txfm_16x16_rounding(out, -cfg->shift[1]);
+      transpose_16x16(out, in);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case FLIPADST_ADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+      load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+      col_txfm_16x16_rounding(out, -cfg->shift[1]);
+      transpose_16x16(out, in);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
   }
diff --git a/vp10/vp10_common.mk b/vp10/vp10_common.mk
index 89b0edb..650b6f3 100644
--- a/vp10/vp10_common.mk
+++ b/vp10/vp10_common.mk
@@ -112,6 +112,10 @@
 VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_fwd_txfm1d_sse4.c
 VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_fwd_txfm2d_sse4.c
 
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/highbd_txfm_utility_sse4.h
+endif
+
 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP10_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht4x4_add_neon.c
 VP10_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht8x8_add_neon.c
diff --git a/vp10/vp10_cx_iface.c b/vp10/vp10_cx_iface.c
index 0cad961..bed6648 100644
--- a/vp10/vp10_cx_iface.c
+++ b/vp10/vp10_cx_iface.c
@@ -25,6 +25,9 @@
 struct vp10_extracfg {
   int                         cpu_used;  // available cpu percentage in 1/16
   unsigned int                enable_auto_alt_ref;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  unsigned int                enable_auto_bwd_ref;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   unsigned int                noise_sensitivity;
   unsigned int                sharpness;
   unsigned int                static_thresh;
@@ -55,6 +58,9 @@
 static struct vp10_extracfg default_extra_cfg = {
   0,                            // cpu_used
   1,                            // enable_auto_alt_ref
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  0,                            // enable_auto_bwd_ref
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   0,                            // noise_sensitivity
   0,                            // sharpness
   0,                            // static_thresh
@@ -199,6 +205,9 @@
           "or kf_max_dist instead.");
 
   RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  RANGE_CHECK(extra_cfg, enable_auto_bwd_ref, 0, 2);
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   RANGE_CHECK(extra_cfg, cpu_used, -8, 8);
   RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
   RANGE_CHECK(extra_cfg, superblock_size,
@@ -411,6 +420,9 @@
   oxcf->speed                  =  abs(extra_cfg->cpu_used);
   oxcf->encode_breakout        =  extra_cfg->static_thresh;
   oxcf->enable_auto_arf        =  extra_cfg->enable_auto_alt_ref;
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  oxcf->enable_auto_brf        =  extra_cfg->enable_auto_bwd_ref;
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   oxcf->noise_sensitivity      =  extra_cfg->noise_sensitivity;
   oxcf->sharpness              =  extra_cfg->sharpness;
 
@@ -574,6 +586,15 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+static vpx_codec_err_t ctrl_set_enable_auto_bwd_ref(vpx_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct vp10_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_auto_bwd_ref = CAST(VP8E_SET_ENABLEAUTOBWDREF, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
 static vpx_codec_err_t ctrl_set_noise_sensitivity(vpx_codec_alg_priv_t *ctx,
                                                   va_list args) {
   struct vp10_extracfg extra_cfg = ctx->extra_cfg;
@@ -924,8 +945,12 @@
     if (res == VPX_CODEC_OK) {
       // There's no codec control for multiple alt-refs so check the encoder
       // instance for its status to determine the compressed data size.
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+      data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img);
+#else
       data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img) / 8 *
                 (cpi->multi_arf_allowed ? 8 : 2);
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
       if (data_sz < 4096)
         data_sz = 4096;
       if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) {
@@ -1141,6 +1166,24 @@
   }
 }
 
+static vpx_codec_err_t ctrl_get_new_frame_image(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  vpx_image_t *const new_img = va_arg(args, vpx_image_t *);
+
+  if (new_img != NULL) {
+    YV12_BUFFER_CONFIG new_frame;
+
+    if (vp10_get_last_show_frame(ctx->cpi, &new_frame) == 0) {
+      yuvconfig2image(new_img, &new_frame, NULL);
+      return VPX_CODEC_OK;
+    } else {
+      return VPX_CODEC_ERROR;
+    }
+  } else {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+}
+
 static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx,
                                           va_list args) {
 #if CONFIG_VP9_POSTPROC
@@ -1299,6 +1342,9 @@
   {VP8E_SET_SCALEMODE,                ctrl_set_scale_mode},
   {VP8E_SET_CPUUSED,                  ctrl_set_cpuused},
   {VP8E_SET_ENABLEAUTOALTREF,         ctrl_set_enable_auto_alt_ref},
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  {VP8E_SET_ENABLEAUTOBWDREF,         ctrl_set_enable_auto_bwd_ref},
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
   {VP8E_SET_SHARPNESS,                ctrl_set_sharpness},
   {VP8E_SET_STATIC_THRESHOLD,         ctrl_set_static_thresh},
   {VP9E_SET_TILE_COLUMNS,             ctrl_set_tile_columns},
@@ -1330,6 +1376,7 @@
   {VP8E_GET_LAST_QUANTIZER_64,        ctrl_get_quantizer64},
   {VP9_GET_REFERENCE,                 ctrl_get_reference},
   {VP9E_GET_ACTIVEMAP,                ctrl_get_active_map},
+  {VP10_GET_NEW_FRAME_IMAGE,          ctrl_get_new_frame_image},
 
   { -1, NULL},
 };
diff --git a/vp10/vp10_dx_iface.c b/vp10/vp10_dx_iface.c
index cf6ab56..d5c4c1c 100644
--- a/vp10/vp10_dx_iface.c
+++ b/vp10/vp10_dx_iface.c
@@ -58,6 +58,8 @@
   int                     last_show_frame;  // Index of last output frame.
   int                     byte_alignment;
   int                     skip_loop_filter;
+  int                     decode_tile_row;
+  int                     decode_tile_col;
 
   // Frame parallel related.
   int                     frame_parallel_decode;  // frame-based threading.
@@ -501,8 +503,8 @@
     frame_worker_data->pbi->decrypt_state = ctx->decrypt_state;
 
 #if CONFIG_EXT_TILE
-    frame_worker_data->pbi->dec_tile_row = ctx->cfg.tile_row;
-    frame_worker_data->pbi->dec_tile_col = ctx->cfg.tile_col;
+    frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
+    frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
 #endif  // CONFIG_EXT_TILE
 
     worker->had_error = 0;
@@ -919,6 +921,32 @@
   }
 }
 
+static vpx_codec_err_t ctrl_get_new_frame_image(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  vpx_image_t *new_img = va_arg(args, vpx_image_t *);
+
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
+  if (new_img) {
+    YV12_BUFFER_CONFIG new_frame;
+    VPxWorker *const worker = ctx->frame_workers;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+
+    if (vp10_get_frame_to_show(frame_worker_data->pbi, &new_frame) == 0) {
+      yuvconfig2image(new_img, &new_frame, NULL);
+      return VPX_CODEC_OK;
+    } else {
+      return VPX_CODEC_ERROR;
+    }
+  } else {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+}
+
 static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx,
                                          va_list args) {
 #if CONFIG_VP9_POSTPROC
@@ -1118,6 +1146,18 @@
   return VPX_CODEC_OK;
 }
 
+static vpx_codec_err_t ctrl_set_decode_tile_row(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  ctx->decode_tile_row = va_arg(args, int);
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_decode_tile_col(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  ctx->decode_tile_col = va_arg(args, int);
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   {VP8_COPY_REFERENCE,            ctrl_copy_reference},
 
@@ -1132,6 +1172,8 @@
   {VPXD_SET_DECRYPTOR,            ctrl_set_decryptor},
   {VP9_SET_BYTE_ALIGNMENT,        ctrl_set_byte_alignment},
   {VP9_SET_SKIP_LOOP_FILTER,      ctrl_set_skip_loop_filter},
+  {VP10_SET_DECODE_TILE_ROW,      ctrl_set_decode_tile_row},
+  {VP10_SET_DECODE_TILE_COL,      ctrl_set_decode_tile_col},
 
   // Getters
   {VP8D_GET_LAST_REF_UPDATES,     ctrl_get_last_ref_updates},
@@ -1140,6 +1182,7 @@
   {VP9D_GET_DISPLAY_SIZE,         ctrl_get_render_size},
   {VP9D_GET_BIT_DEPTH,            ctrl_get_bit_depth},
   {VP9D_GET_FRAME_SIZE,           ctrl_get_frame_size},
+  {VP10_GET_NEW_FRAME_IMAGE,      ctrl_get_new_frame_image},
 
   { -1, NULL},
 };
diff --git a/vp10/vp10cx.mk b/vp10/vp10cx.mk
index da90fe6..1aaac15 100644
--- a/vp10/vp10cx.mk
+++ b/vp10/vp10cx.mk
@@ -118,6 +118,7 @@
 VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP10_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_fwd_txfm_sse4.c
+VP10_CX_SRCS-$(HAVE_SSE4_1) += common/x86/highbd_inv_txfm_sse4.c
 endif
 
 ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
diff --git a/vpx/vp8.h b/vpx/vp8.h
index 8a035f9..ba67c38 100644
--- a/vpx/vp8.h
+++ b/vpx/vp8.h
@@ -56,6 +56,9 @@
    */
   VP9_GET_REFERENCE           = 128,  /**< get a pointer to a reference frame */
   VP8_COMMON_CTRL_ID_MAX,
+
+  VP10_GET_NEW_FRAME_IMAGE    = 192,  /**< get a pointer to the new frame */
+
   VP8_DECODER_CTRL_ID_START   = 256
 };
 
@@ -137,6 +140,8 @@
 #define VPX_CTRL_VP8_SET_DBG_DISPLAY_MV
 VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE,           vp9_ref_frame_t *)
 #define VPX_CTRL_VP9_GET_REFERENCE
+VPX_CTRL_USE_TYPE(VP10_GET_NEW_FRAME_IMAGE,    vpx_image_t *)
+#define VPX_CTRL_VP10_GET_NEW_FRAME_IMAGE
 
 /*!\endcond */
 /*! @} - end defgroup vp8 */
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 1306481..4d9a2a7 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -184,6 +184,15 @@
    */
   VP8E_SET_ENABLEAUTOALTREF,
 
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+  /*!\brief Codec control function to enable the automatic setting and use
+   * of bwd-pred frames.
+   *
+   * Supported in codecs: VP10
+   */
+  VP8E_SET_ENABLEAUTOBWDREF,
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
   /*!\brief control function to set noise sensitivity
    *
    * 0: off, 1: OnYOnly, 2: OnYUV,
@@ -744,6 +753,12 @@
 #define VPX_CTRL_VP8E_SET_CPUUSED
 VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOALTREF,   unsigned int)
 #define VPX_CTRL_VP8E_SET_ENABLEAUTOALTREF
+
+#if !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOBWDREF,   unsigned int)
+#define VPX_CTRL_VP8E_SET_ENABLEAUTOBWDREF
+#endif  // !CONFIG_EXT_REFS && CONFIG_BIDIR_PRED
+
 VPX_CTRL_USE_TYPE(VP8E_SET_NOISE_SENSITIVITY,  unsigned int)
 #define VPX_CTRL_VP8E_SET_NOISE_SENSITIVITY
 VPX_CTRL_USE_TYPE(VP8E_SET_SHARPNESS,          unsigned int)
diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index 1f02fd5..347521e 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -121,7 +121,16 @@
    */
   VP9_SET_SKIP_LOOP_FILTER,
 
-  VP8_DECODER_CTRL_ID_MAX
+  VP8_DECODER_CTRL_ID_MAX,
+
+  /*!\brief Codec control function to set the range of tile decoding.
+   *
+   * A value greater than or equal to zero decodes only the specified
+   * row/column; a value of -1 decodes the whole row/column. As a special
+   * case, when both values are -1 the whole frame is decoded.
+   */
+  VP10_SET_DECODE_TILE_ROW,
+  VP10_SET_DECODE_TILE_COL
 };
 
 /** Decrypt n bytes of data from input -> output, using the decrypt_state
@@ -174,7 +183,10 @@
 #define VPX_CTRL_VP9D_GET_FRAME_SIZE
 VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
 #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER
-
+VPX_CTRL_USE_TYPE(VP10_SET_DECODE_TILE_ROW,     int)
+#define VPX_CTRL_VP10_SET_DECODE_TILE_ROW
+VPX_CTRL_USE_TYPE(VP10_SET_DECODE_TILE_COL,     int)
+#define VPX_CTRL_VP10_SET_DECODE_TILE_COL
 /*!\endcond */
 /*! @} - end defgroup vp8_decoder */
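
A usage sketch combining the two new controls with VP10_GET_NEW_FRAME_IMAGE
from vpx/vp8.h (assumes a decoder context "codec" already initialized for a
CONFIG_EXT_TILE stream; error handling elided):

    vpx_image_t img;
    /* Decode only the tile at row 2, column 3; -1 would select all. */
    vpx_codec_control(&codec, VP10_SET_DECODE_TILE_ROW, 2);
    vpx_codec_control(&codec, VP10_SET_DECODE_TILE_COL, 3);
    /* ... vpx_codec_decode(&codec, frame_data, frame_size, NULL, 0); ... */
    if (vpx_codec_control(&codec, VP10_GET_NEW_FRAME_IMAGE, &img) ==
        VPX_CODEC_OK) {
      /* img now describes the newly decoded frame. */
    }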
 
diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h
index d4ba986..62fd919 100644
--- a/vpx/vpx_decoder.h
+++ b/vpx/vpx_decoder.h
@@ -108,10 +108,6 @@
     unsigned int threads; /**< Maximum number of threads to use, default 1 */
     unsigned int w;      /**< Width */
     unsigned int h;      /**< Height */
-    int          tile_row;  /**< The index of row tile to be decoded.
-                                 Value -1 means to decode all row tiles. */
-    int          tile_col;  /**< The index of column tile to be decoded.
-                                 Value -1 means to decode all column tiles */
   } vpx_codec_dec_cfg_t; /**< alias for struct vpx_codec_dec_cfg */
 
 
diff --git a/vpx_dsp/blend_mask6.c b/vpx_dsp/blend_mask6.c
new file mode 100644
index 0000000..584ee6a
--- /dev/null
+++ b/vpx_dsp/blend_mask6.c
@@ -0,0 +1,152 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+#define MASK_BITS 6
+
+void vpx_blend_mask6_c(uint8_t *dst, uint32_t dst_stride,
+                       uint8_t *src0, uint32_t src0_stride,
+                       uint8_t *src1, uint32_t src1_stride,
+                       const uint8_t *mask, uint32_t mask_stride,
+                       int h, int w, int subh, int subw) {
+  int i, j;
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 4);
+  assert(w >= 4);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  if (subw == 0 && subh == 0) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 = mask[i * mask_stride + j];
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  } else if (subw == 1 && subh == 1) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 =
+            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j)] +
+                               mask[(2 * i) * mask_stride + (2 * j + 1)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+                               2);
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  } else if (subw == 1 && subh == 0) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 =
+            ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] +
+                               mask[i * mask_stride + (2 * j + 1)], 1);
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  } else {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 =
+            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] +
+                               mask[(2 * i + 1) * mask_stride + j], 1);
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_mask6_c(uint8_t *dst_8, uint32_t dst_stride,
+                              uint8_t *src0_8, uint32_t src0_stride,
+                              uint8_t *src1_8, uint32_t src1_stride,
+                              const uint8_t *mask, uint32_t mask_stride,
+                              int h, int w, int subh, int subw, int bd) {
+  int i, j;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+  uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 4);
+  assert(w >= 4);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  if (subw == 0 && subh == 0) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 = mask[i * mask_stride + j];
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  } else if (subw == 1 && subh == 1) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 =
+            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j)] +
+                               mask[(2 * i) * mask_stride + (2 * j + 1)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+                               2);
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  } else if (subw == 1 && subh == 0) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 =
+            ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] +
+                               mask[i * mask_stride + (2 * j + 1)], 1);
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  } else {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 =
+            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] +
+                               mask[(2 * i + 1) * mask_stride + j], 1);
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
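
A worked example of the per-pixel arithmetic above (values chosen purely for
illustration):

    /* With MASK_BITS == 6, m0 is in [0, 64] and m1 == 64 - m0, so each
     * output pixel is ROUND_POWER_OF_TWO(src0 * m0 + src1 * m1, 6).
     * For src0 == 100, src1 == 20, m0 == 48 (m1 == 16):
     *   dst = (100 * 48 + 20 * 16 + 32) >> 6 = 5152 >> 6 = 80,
     * the exact 3/4 : 1/4 mix of the two source predictors. */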
diff --git a/vpx_dsp/fwd_txfm.c b/vpx_dsp/fwd_txfm.c
index a5802e1..4c0d5db 100644
--- a/vpx_dsp/fwd_txfm.c
+++ b/vpx_dsp/fwd_txfm.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/fwd_txfm.h"
 
 void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c
index 402fd9a..533f762 100644
--- a/vpx_dsp/inv_txfm.c
+++ b/vpx_dsp/inv_txfm.c
@@ -11,6 +11,7 @@
 #include <math.h>
 #include <string.h>
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/inv_txfm.h"
 
 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index 46ef646..645a1ab 100644
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c
@@ -11,6 +11,7 @@
 #include <stdlib.h>
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
 
diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c
index e4e741a..6426ccc 100644
--- a/vpx_dsp/quantize.c
+++ b/vpx_dsp/quantize.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/quantize.h"
 #include "vpx_mem/vpx_mem.h"
 
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 46ef5fc..430cae1 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -65,6 +65,15 @@
 DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.h
 DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.c
 
+# inter predictions
+
+ifeq ($(CONFIG_VP10),yes)
+ifeq ($(CONFIG_EXT_INTER),yes)
+DSP_SRCS-yes            += blend_mask6.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_mask6_sse4.c
+endif  #CONFIG_EXT_INTER
+endif  #CONFIG_VP10
+
 # interpolation filters
 DSP_SRCS-yes += vpx_convolve.c
 DSP_SRCS-yes += vpx_convolve.h
diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h
index 3571eea..7aaa89f 100644
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -32,6 +32,8 @@
 
 #define IMPLIES(a, b)  (!(a) || (b))  //  Logical 'a implies b' (or 'a -> b')
 
+#define IS_POWER_OF_TWO(x)  (((x) & ((x) - 1)) == 0)
+
 // These can be used to give a hint about branch outcomes.
 // This can have an effect, even if your target processor has a
 // good branch predictor, as these hints can affect basic block
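
One edge case of the new macro worth noting: IS_POWER_OF_TWO(0) evaluates to
true, since 0 & (0 - 1) == 0. Callers such as vpx_blend_mask6_c are unaffected
because they assert h >= 4 and w >= 4 first; a guard like the one below (a
hypothetical name, shown only as a sketch) is needed wherever zero can occur:

    #define IS_NONZERO_POWER_OF_TWO(x)  ((x) > 0 && IS_POWER_OF_TWO(x))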
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index ad524a2..7bae037 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1358,10 +1358,10 @@
   }
 }  # CONFIG_VP9_HIGHBITDEPTH
 
+if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
 #
 # Masked Variance / Masked Subpixel Variance
 #
-if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
   foreach (@block_sizes) {
     ($w, $h) = @$_;
     add_proto qw/unsigned int/, "vpx_masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
@@ -1381,6 +1381,14 @@
       }
     }
   }
+
+  add_proto qw/void vpx_blend_mask6/, "uint8_t *dst, uint32_t dst_stride,  uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
+  specialize "vpx_blend_mask6", qw/sse4_1/;
+
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vpx_highbd_blend_mask6/, "uint8_t *dst, uint32_t dst_stride,  uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd";
+    specialize "vpx_highbd_blend_mask6", qw/sse4_1/;
+  }
 }
 
 #
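
For context, a sketch of what the generated vpx_dsp_rtcd.h provides for the
new entry (this follows the usual rtcd pattern and is illustrative, not the
verbatim generated code):

    RTCD_EXTERN void (*vpx_blend_mask6)(uint8_t *dst, uint32_t dst_stride,
                                        uint8_t *src0, uint32_t src0_stride,
                                        uint8_t *src1, uint32_t src1_stride,
                                        const uint8_t *mask,
                                        uint32_t mask_stride,
                                        int h, int w, int suby, int subx);
    /* In setup_rtcd_internal():
     *   vpx_blend_mask6 = vpx_blend_mask6_c;
     *   if (flags & HAS_SSE4_1) vpx_blend_mask6 = vpx_blend_mask6_sse4_1;
     */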
diff --git a/vpx_dsp/x86/blend_mask6_sse4.c b/vpx_dsp/x86/blend_mask6_sse4.c
new file mode 100644
index 0000000..5de3e23
--- /dev/null
+++ b/vpx_dsp/x86/blend_mask6_sse4.c
@@ -0,0 +1,1146 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h>  // SSE4.1
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#include "./vpx_dsp_rtcd.h"
+
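+// Each mask value is a 6-bit weight m in [0, 1 << MASK_BITS] == [0, 64];
+// every output pixel is rounded from (src0 * m + src1 * (64 - m)) >> MASK_BITS.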
+#define MASK_BITS 6
+
+static INLINE __m128i mm_loadl_32(const void *a) {
+  return _mm_cvtsi32_si128(*(const uint32_t*)a);
+}
+
+static INLINE __m128i mm_loadl_64(const void *a) {
+  return _mm_loadl_epi64((const __m128i*)a);
+}
+
+static INLINE __m128i mm_loadu_128(const void *a) {
+  return _mm_loadu_si128((const __m128i*)a);
+}
+
+static INLINE void mm_storel_32(void *const a, const __m128i v) {
+  *(uint32_t*)a = _mm_cvtsi128_si32(v);
+}
+
+static INLINE void mm_storel_64(void *const a, const __m128i v) {
+  _mm_storel_epi64((__m128i*)a, v);
+}
+
+static INLINE void mm_storeu_128(void *const a, const __m128i v) {
+  _mm_storeu_si128((__m128i*)a, v);
+}
+
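+// mm_round_epu16(v) computes (v + 1) >> 1 via _mm_avg_epu16, and
+// mm_roundn_epu16(v, n) is equivalent to ROUND_POWER_OF_TWO(v, n) for
+// unsigned 16-bit lanes: shift by n - 1, then round the final bit.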
+static INLINE __m128i mm_round_epu16(__m128i v_val_w) {
+  return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
+}
+
+static INLINE __m128i mm_roundn_epu16(__m128i v_val_w, int bits) {
+  const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
+  return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Common kernels
+//////////////////////////////////////////////////////////////////////////////
+
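+// Weighted blend of 4 (resp. 8) pixels: widen both sources to 16 bits,
+// multiply by the two masks and sum. The products fit in 16 bits, since
+// 255 * 64 == 16320 and the two products sum to at most 32640.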
+static INLINE __m128i blend_4(uint8_t *src0, uint8_t *src1,
+                              const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_b = mm_loadl_32(src0);
+  const __m128i v_s1_b = mm_loadl_32(src1);
+  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+  const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+
+  return v_res_w;
+}
+
+static INLINE __m128i blend_8(uint8_t *src0, uint8_t *src1,
+                              const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_b = mm_loadl_64(src0);
+  const __m128i v_s1_b = mm_loadl_64(src1);
+  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+  const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+
+  return v_res_w;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_mask6_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  (void)w;
+
+  do {
+    const __m128i v_m0_b = mm_loadl_32(mask);
+    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    mm_storel_32(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  (void)w;
+
+  do {
+    const __m128i v_m0_b = mm_loadl_64(mask);
+    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    mm_storel_64(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_w16n_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 16) {
+      const __m128i v_m0l_b = mm_loadl_64(mask + c);
+      const __m128i v_m0h_b = mm_loadl_64(mask + c + 8);
+      const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b);
+      const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b);
+      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+      const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+                                       v_m0l_w, v_m1l_w);
+      const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+                                       v_m0h_w, v_m1h_w);
+
+      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+      mm_storeu_128(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
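+// With horizontal sub-sampling the mask has twice the width of the block:
+// adjacent mask bytes are averaged with _mm_avg_epu8, then the odd bytes
+// are cleared by v_zmask_b, leaving one 16-bit weight per output pixel.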
+static void blend_mask6_sx_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  (void)w;
+
+  do {
+    const __m128i v_r_b = mm_loadl_64(mask);
+    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    mm_storel_32(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_sx_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  (void)w;
+
+  do {
+    const __m128i v_r_b = mm_loadu_128(mask);
+    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    mm_storel_64(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_sx_w16n_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 16) {
+      const __m128i v_rl_b = mm_loadu_128(mask + 2 * c);
+      const __m128i v_rh_b = mm_loadu_128(mask + 2 * c + 16);
+      const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1));
+      const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1));
+
+      const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b);
+      const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b);
+      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+      const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+                                       v_m0l_w, v_m1l_w);
+      const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+                                       v_m0h_w, v_m1h_w);
+
+      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+      mm_storeu_128(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
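+// With vertical sub-sampling the mask has twice the height of the block:
+// two consecutive mask rows are averaged and the mask pointer advances by
+// 2 * mask_stride per output row.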
+static void blend_mask6_sy_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  (void)w;
+
+  do {
+    const __m128i v_ra_b = mm_loadl_32(mask);
+    const __m128i v_rb_b = mm_loadl_32(mask + mask_stride);
+    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    mm_storel_32(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_sy_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  (void)w;
+
+  do {
+    const __m128i v_ra_b = mm_loadl_64(mask);
+    const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
+    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    mm_storel_64(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_sy_w16n_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zero = _mm_setzero_si128();
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 16) {
+      const __m128i v_ra_b = mm_loadu_128(mask + c);
+      const __m128i v_rb_b = mm_loadu_128(mask + c + mask_stride);
+      const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+      const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b);
+      const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero);
+      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+      const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+                                       v_m0l_w, v_m1l_w);
+      const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+                                       v_m0h_w, v_m1h_w);
+
+      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+      mm_storeu_128(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
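+// With sub-sampling in both directions each weight combines four mask
+// samples: two rows are summed bytewise (values are at most 1 << MASK_BITS,
+// so the 8-bit add cannot wrap), the even/odd bytes are split into 16-bit
+// lanes and summed, and the total is rounded down by 2 bits (divide by 4).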
+static void blend_mask6_sx_sy_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  (void)w;
+
+  do {
+    const __m128i v_ra_b = mm_loadl_64(mask);
+    const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
+    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+    const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+                                           v_zmask_b);
+    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+    const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    mm_storel_32(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_sx_sy_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  (void)w;
+
+  do {
+    const __m128i v_ra_b = mm_loadu_128(mask);
+    const __m128i v_rb_b = mm_loadu_128(mask + mask_stride);
+    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+    const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+                                           v_zmask_b);
+    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+    const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    mm_storel_64(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_sx_sy_w16n_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 16) {
+      const __m128i v_ral_b = mm_loadu_128(mask + 2 * c);
+      const __m128i v_rah_b = mm_loadu_128(mask + 2 * c + 16);
+      const __m128i v_rbl_b = mm_loadu_128(mask + mask_stride + 2 * c);
+      const __m128i v_rbh_b = mm_loadu_128(mask + mask_stride + 2 * c + 16);
+      const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
+      const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
+      const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
+      const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
+      const __m128i v_rvsbl_w = _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1),
+                                              v_zmask_b);
+      const __m128i v_rvsbh_w = _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1),
+                                              v_zmask_b);
+      const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
+      const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
+
+      const __m128i v_m0l_w = mm_roundn_epu16(v_rsl_w, 2);
+      const __m128i v_m0h_w = mm_roundn_epu16(v_rsh_w, 2);
+      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+      const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+                                       v_m0l_w, v_m1l_w);
+      const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+                                       v_m0h_w, v_m1h_w);
+
+      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+      mm_storeu_128(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
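+// Select a kernel by width and sub-sampling. Since w is a power of two
+// >= 4, (w >> 2) & 3 maps w == 4 to 1, w == 8 to 2, and all multiples of
+// 16 to 0, matching the rows of the table below.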
+void vpx_blend_mask6_sse4_1(uint8_t *dst, uint32_t dst_stride,
+                            uint8_t *src0, uint32_t src0_stride,
+                            uint8_t *src1, uint32_t src1_stride,
+                            const uint8_t *mask, uint32_t mask_stride,
+                            int h, int w, int suby, int subx) {
+  typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride,
+                           uint8_t *src0, uint32_t src0_stride,
+                           uint8_t *src1, uint32_t src1_stride,
+                           const uint8_t *mask, uint32_t mask_stride,
+                           int h, int w);
+
+  static blend_fn blend[3][2][2] = {  // width_index X subx X suby
+    {     // w % 16 == 0
+      {blend_mask6_w16n_sse4_1, blend_mask6_sy_w16n_sse4_1},
+      {blend_mask6_sx_w16n_sse4_1, blend_mask6_sx_sy_w16n_sse4_1}
+    }, {  // w == 4
+      {blend_mask6_w4_sse4_1, blend_mask6_sy_w4_sse4_1},
+      {blend_mask6_sx_w4_sse4_1, blend_mask6_sx_sy_w4_sse4_1}
+    }, {  // w == 8
+      {blend_mask6_w8_sse4_1, blend_mask6_sy_w8_sse4_1},
+      {blend_mask6_sx_w8_sse4_1, blend_mask6_sx_sy_w8_sse4_1}
+    }
+  };
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 4);
+  assert(w >= 4);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride,
+                                            src0, src0_stride,
+                                            src1, src1_stride,
+                                            mask, mask_stride,
+                                            h, w);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// Common kernels
+//////////////////////////////////////////////////////////////////////////////
+
+typedef __m128i (*blend_unit_fn)(uint16_t *src0, uint16_t *src1,
+                                 const __m128i v_m0_w, const __m128i v_m1_w);
+
+static INLINE __m128i blend_4_b10(uint16_t *src0, uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = mm_loadl_64(src0);
+  const __m128i v_s1_w = mm_loadl_64(src1);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+  const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+
+  return v_res_w;
+}
+
+static INLINE __m128i blend_8_b10(uint16_t *src0, uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = mm_loadu_128(src0);
+  const __m128i v_s1_w = mm_loadu_128(src1);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+  const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+
+  return v_res_w;
+}
+
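+// At 12 bits a product can reach 4095 * 64 == 262080, which overflows
+// _mm_mullo_epi16. The b12 kernels therefore interleave source/mask pairs
+// and use _mm_madd_epi16 to form src0 * m0 + src1 * m1 in 32 bits, shift
+// by MASK_BITS - 1, pack, and round the final bit.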
+static INLINE __m128i blend_4_b12(uint16_t *src0, uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = mm_loadl_64(src0);
+  const __m128i v_s1_w = mm_loadl_64(src1);
+
+  // Interleave
+  const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+  const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+
+  // Multiply-Add
+  const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
+
+  // Scale
+  const __m128i v_ssum_d = _mm_srli_epi32(v_sum_d, MASK_BITS - 1);
+
+  // Pack
+  const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
+
+  // Round
+  const __m128i v_res_w = mm_round_epu16(v_pssum_d);
+
+  return v_res_w;
+}
+
+static INLINE __m128i blend_8_b12(uint16_t *src0, uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = mm_loadu_128(src0);
+  const __m128i v_s1_w = mm_loadu_128(src1);
+
+  // Interleave
+  const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+  const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
+  const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+  const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
+
+  // Multiply-Add
+  const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
+  const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
+
+  // Scale
+  const __m128i v_ssuml_d = _mm_srli_epi32(v_suml_d, MASK_BITS - 1);
+  const __m128i v_ssumh_d = _mm_srli_epi32(v_sumh_d, MASK_BITS - 1);
+
+  // Pack
+  const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
+
+  // Round
+  const __m128i v_res_w = mm_round_epu16(v_pssum_d);
+
+  return v_res_w;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_mask6_bn_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, blend_unit_fn blend) {
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    const __m128i v_m0_b = mm_loadl_32(mask);
+    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+    mm_storel_64(dst, v_res_w);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_b10_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_mask6_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                           src1_stride, mask, mask_stride, h,
+                           blend_4_b10);
+}
+
+static void blend_mask6_b12_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_mask6_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                           src1_stride, mask, mask_stride, h,
+                           blend_4_b12);
+}
+
+static INLINE void blend_mask6_bn_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w, blend_unit_fn blend) {
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 8) {
+      const __m128i v_m0_b = mm_loadl_64(mask + c);
+      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+      mm_storeu_128(dst + c, v_res_w);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_b10_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_mask6_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                            src1_stride, mask, mask_stride, h, w,
+                            blend_8_b10);
+}
+
+static void blend_mask6_b12_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_mask6_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                            src1_stride, mask, mask_stride, h, w,
+                            blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_mask6_bn_sx_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, blend_unit_fn blend) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    const __m128i v_r_b = mm_loadl_64(mask);
+    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+    mm_storel_64(dst, v_res_w);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_b10_sx_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_mask6_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                              src1_stride, mask, mask_stride, h,
+                              blend_4_b10);
+}
+
+static void blend_mask6_b12_sx_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_mask6_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                              src1_stride, mask, mask_stride, h,
+                              blend_4_b12);
+}
+
+static INLINE void blend_mask6_bn_sx_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w, blend_unit_fn blend) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 8) {
+      const __m128i v_r_b = mm_loadu_128(mask + 2 * c);
+      const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+      const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+      mm_storeu_128(dst + c, v_res_w);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_b10_sx_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_mask6_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, mask_stride, h, w,
+                               blend_8_b10);
+}
+
+static void blend_mask6_b12_sx_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_mask6_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, mask_stride, h, w,
+                               blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_mask6_bn_sy_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, blend_unit_fn blend) {
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    const __m128i v_ra_b = mm_loadl_32(mask);
+    const __m128i v_rb_b = mm_loadl_32(mask + mask_stride);
+    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+    mm_storel_64(dst, v_res_w);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_b10_sy_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_mask6_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                              src1_stride, mask, mask_stride, h,
+                              blend_4_b10);
+}
+
+static void blend_mask6_b12_sy_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_mask6_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                              src1_stride, mask, mask_stride, h,
+                              blend_4_b12);
+}
+
+static INLINE void blend_mask6_bn_sy_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w, blend_unit_fn blend) {
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 8) {
+      const __m128i v_ra_b = mm_loadl_64(mask + c);
+      const __m128i v_rb_b = mm_loadl_64(mask + c + mask_stride);
+      const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+      mm_storeu_128(dst + c, v_res_w);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_b10_sy_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_mask6_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, mask_stride, h, w,
+                               blend_8_b10);
+}
+
+static void blend_mask6_b12_sy_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_mask6_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, mask_stride, h, w,
+                               blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_mask6_bn_sx_sy_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, blend_unit_fn blend) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    const __m128i v_ra_b = mm_loadl_64(mask);
+    const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
+    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+    const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+                                           v_zmask_b);
+    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+    const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+    mm_storel_64(dst, v_res_w);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_b10_sx_sy_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_mask6_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h,
+                                 blend_4_b10);
+}
+
+static void blend_mask6_b12_sx_sy_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_mask6_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h,
+                                 blend_4_b12);
+}
+
+static INLINE void blend_mask6_bn_sx_sy_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w, blend_unit_fn blend) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 8) {
+      const __m128i v_ra_b = mm_loadu_128(mask + 2 * c);
+      const __m128i v_rb_b = mm_loadu_128(mask + 2 * c + mask_stride);
+      const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+      const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+      const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+                                             v_zmask_b);
+      const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+      const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+      mm_storeu_128(dst + c, v_res_w);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_b10_sx_sy_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_mask6_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, h, w,
+                                  blend_8_b10);
+}
+
+static void blend_mask6_b12_sx_sy_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_mask6_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, h, w,
+                                  blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
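+// 8- and 10-bit inputs share the b10 kernels (1023 * 64 == 65472 still
+// fits in 16 bits); only bd == 12 needs the 32-bit madd kernels.
+// (w >> 2) & 1 maps w == 4 to 1 and all multiples of 8 to 0.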
+void vpx_highbd_blend_mask6_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
+                                   uint8_t *src0_8, uint32_t src0_stride,
+                                   uint8_t *src1_8, uint32_t src1_stride,
+                                   const uint8_t *mask, uint32_t mask_stride,
+                                   int h, int w, int suby, int subx, int bd) {
+  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+  uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+  uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+  typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride,
+                           uint16_t *src0, uint32_t src0_stride,
+                           uint16_t *src1, uint32_t src1_stride,
+                           const uint8_t *mask, uint32_t mask_stride,
+                           int h, int w);
+
+  static blend_fn blend[2][2][2][2] = {  // bd_index X width_index X subx X suby
+    {   // bd == 8 or 10
+      {     // w % 8 == 0
+        {blend_mask6_b10_w8n_sse4_1, blend_mask6_b10_sy_w8n_sse4_1},
+        {blend_mask6_b10_sx_w8n_sse4_1, blend_mask6_b10_sx_sy_w8n_sse4_1}
+      }, {  // w == 4
+        {blend_mask6_b10_w4_sse4_1, blend_mask6_b10_sy_w4_sse4_1},
+        {blend_mask6_b10_sx_w4_sse4_1, blend_mask6_b10_sx_sy_w4_sse4_1}
+      }
+    },
+    {   // bd == 12
+      {     // w % 8 == 0
+        {blend_mask6_b12_w8n_sse4_1, blend_mask6_b12_sy_w8n_sse4_1},
+        {blend_mask6_b12_sx_w8n_sse4_1, blend_mask6_b12_sx_sy_w8n_sse4_1}
+      }, {  // w == 4
+        {blend_mask6_b12_w4_sse4_1, blend_mask6_b12_sy_w4_sse4_1},
+        {blend_mask6_b12_sx_w4_sse4_1, blend_mask6_b12_sx_sy_w4_sse4_1}
+      }
+    }
+  };
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 4);
+  assert(w >= 4);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](dst, dst_stride,
+                                                      src0, src0_stride,
+                                                      src1, src1_stride,
+                                                      mask, mask_stride,
+                                                      h, w);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
index 4df39df..951af3a 100644
--- a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
+++ b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
@@ -10,6 +10,7 @@
 
 #include <immintrin.h>  // AVX2
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/txfm_common.h"
 
 #define pair256_set_epi16(a, b) \
diff --git a/vpx_dsp/x86/fwd_txfm_sse2.c b/vpx_dsp/x86/fwd_txfm_sse2.c
index e4deeec..3e4f49b 100644
--- a/vpx_dsp/x86/fwd_txfm_sse2.c
+++ b/vpx_dsp/x86/fwd_txfm_sse2.c
@@ -11,6 +11,7 @@
 #include <emmintrin.h>  // SSE2
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/x86/fwd_txfm_sse2.h"
 
diff --git a/vpx_dsp/x86/masked_variance_intrin_ssse3.c b/vpx_dsp/x86/masked_variance_intrin_ssse3.c
index 47e2c32..a0c2b6e 100644
--- a/vpx_dsp/x86/masked_variance_intrin_ssse3.c
+++ b/vpx_dsp/x86/masked_variance_intrin_ssse3.c
@@ -1380,8 +1380,9 @@
 #endif  // CONFIG_EXT_PARTITION
 
 #if CONFIG_VP9_HIGHBITDEPTH
-typedef int (*highbd_calc_masked_var_t)(__m128i v_sum_d, __m128i v_sse_q,
-             unsigned int* sse, const int w, const int h);
+typedef uint32_t (*highbd_calc_masked_var_t)(__m128i v_sum_d, __m128i v_sse_q,
+                                             uint32_t *sse,
+                                             const int w, const int h);
 typedef unsigned int (*highbd_variance_fn_t)(
                       const uint8_t *a8, int a_stride,
                       const uint8_t *b8, int b_stride,
diff --git a/vpx_ports/mem.h b/vpx_ports/mem.h
index 4dce9c2..993124a 100644
--- a/vpx_ports/mem.h
+++ b/vpx_ports/mem.h
@@ -46,6 +46,11 @@
 #define ROUNDZ_POWER_OF_TWO(value, n) \
     ((n) ? (((value) + (1 << ((n) - 1))) >> (n)) : (value))
 
+/* Shift down with rounding for signed integers, for use when n > 0 */
+#define ROUND_POWER_OF_TWO_SIGNED(value, n) \
+    (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
+                   : ROUND_POWER_OF_TWO((value), (n)))
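+/* e.g. ROUND_POWER_OF_TWO_SIGNED(-6, 2) == -2; ties round away from zero */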
+
 #define ALIGN_POWER_OF_TWO(value, n) \
     (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
 
diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c
index 5212075..d6a88b8 100644
--- a/vpx_scale/generic/yv12extend.c
+++ b/vpx_scale/generic/yv12extend.c
@@ -235,7 +235,7 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
+static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   memcpy(dst, src, num * sizeof(uint16_t));
diff --git a/vpxdec.c b/vpxdec.c
index 13b020b..235d17a 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -818,11 +818,6 @@
   if (!interface)
     interface = get_vpx_decoder_by_index(0);
 
-#if CONFIG_EXT_TILE
-  cfg.tile_row = tile_row;
-  cfg.tile_col = tile_col;
-#endif  // CONFIG_EXT_TILE
-
   dec_flags = (postproc ? VPX_CODEC_USE_POSTPROC : 0) |
               (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0) |
               (frame_parallel ? VPX_CODEC_USE_FRAME_THREADING : 0);
@@ -877,6 +872,21 @@
   }
 #endif
 
+#if CONFIG_VP10_DECODER && CONFIG_EXT_TILE
+  if (strncmp(decoder.name, "WebM Project VP10", 17) == 0) {
+    if (vpx_codec_control(&decoder, VP10_SET_DECODE_TILE_ROW, tile_row)) {
+      fprintf(stderr, "Failed to set decode_tile_row: %s\n",
+              vpx_codec_error(&decoder));
+      return EXIT_FAILURE;
+    }
+
+    if (vpx_codec_control(&decoder, VP10_SET_DECODE_TILE_COL, tile_col)) {
+      fprintf(stderr, "Failed to set decode_tile_col: %s\n",
+              vpx_codec_error(&decoder));
+      return EXIT_FAILURE;
+    }
+  }
+#endif  // CONFIG_VP10_DECODER && CONFIG_EXT_TILE
 
   if (arg_skip)
     fprintf(stderr, "Skipping first %d frames.\n", arg_skip);
diff --git a/vpxenc.c b/vpxenc.c
index 7fb28cd..d988b30 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -1580,8 +1580,18 @@
 #if CONFIG_DECODERS
   if (global->test_decode != TEST_DECODE_OFF) {
     const VpxInterface *decoder = get_vpx_decoder_by_name(global->codec->name);
-    vpx_codec_dec_cfg_t cfg = { 0, 0, 0, -1, -1 };
+    vpx_codec_dec_cfg_t cfg = { 0, 0, 0 };
     vpx_codec_dec_init(&stream->decoder, decoder->codec_interface(), &cfg, 0);
+
+#if CONFIG_VP10_DECODER && CONFIG_EXT_TILE
+    if (strcmp(global->codec->name, "vp10") == 0) {
+      vpx_codec_control(&stream->decoder, VP10_SET_DECODE_TILE_ROW, -1);
+      ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_row");
+
+      vpx_codec_control(&stream->decoder, VP10_SET_DECODE_TILE_COL, -1);
+      ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_col");
+    }
+#endif  // CONFIG_VP10_DECODER && CONFIG_EXT_TILE
   }
 #endif
 }
@@ -1846,26 +1856,25 @@
     vpx_codec_control(&stream->encoder, VP8_COPY_REFERENCE, &ref_enc);
     vpx_codec_control(&stream->decoder, VP8_COPY_REFERENCE, &ref_dec);
   } else {
-    struct vp9_ref_frame ref_enc, ref_dec;
+    vpx_codec_control(&stream->encoder, VP10_GET_NEW_FRAME_IMAGE, &enc_img);
+    vpx_codec_control(&stream->decoder, VP10_GET_NEW_FRAME_IMAGE, &dec_img);
 
-    ref_enc.idx = 0;
-    ref_dec.idx = 0;
-    vpx_codec_control(&stream->encoder, VP9_GET_REFERENCE, &ref_enc);
-    enc_img = ref_enc.img;
-    vpx_codec_control(&stream->decoder, VP9_GET_REFERENCE, &ref_dec);
-    dec_img = ref_dec.img;
 #if CONFIG_VP9_HIGHBITDEPTH
     if ((enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) !=
         (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH)) {
       if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
-        vpx_img_alloc(&enc_img, enc_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
+        vpx_image_t enc_hbd_img;
+        vpx_img_alloc(&enc_hbd_img, enc_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
                       enc_img.d_w, enc_img.d_h, 16);
-        vpx_img_truncate_16_to_8(&enc_img, &ref_enc.img);
+        vpx_img_truncate_16_to_8(&enc_hbd_img, &enc_img);
+        enc_img = enc_hbd_img;
       }
       if (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
-        vpx_img_alloc(&dec_img, dec_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
+        vpx_image_t dec_hbd_img;
+        vpx_img_alloc(&dec_hbd_img, dec_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
                       dec_img.d_w, dec_img.d_h, 16);
-        vpx_img_truncate_16_to_8(&dec_img, &ref_dec.img);
+        vpx_img_truncate_16_to_8(&dec_hbd_img, &dec_img);
+        dec_img = dec_hbd_img;
       }
     }
 #endif