Merge "Prevent decoder from using uninitialized entropy context."
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 7be583d..f980180 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1272,9 +1272,6 @@
     check_cc <<EOF && INLINE="inline"
 static inline function() {}
 EOF
-    check_cc <<EOF && INLINE="__inline__ __attribute__((always_inline))"
-static __attribute__((always_inline)) function() {}
-EOF
 
   # Almost every platform uses pthreads.
   if enabled multithread; then
diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh
index 9030108..bdd6690 100755
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh
@@ -22,7 +22,8 @@
 FRAMEWORK_DIR="VPX.framework"
 HEADER_DIR="${FRAMEWORK_DIR}/Headers/vpx"
 MAKE_JOBS=1
-LIBVPX_SOURCE_DIR=$(dirname "$0" | sed -e s,/build/make,,)
+SCRIPT_DIR=$(dirname "$0")
+LIBVPX_SOURCE_DIR=$(cd "${SCRIPT_DIR}/../.."; pwd)
 LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
 ORIG_PWD="$(pwd)"
 TARGETS="arm64-darwin-gcc
@@ -41,7 +42,7 @@
 
   mkdir "${target}"
   cd "${target}"
-  eval "../../${LIBVPX_SOURCE_DIR}/configure" --target="${target}" \
+  eval "${LIBVPX_SOURCE_DIR}/configure" --target="${target}" \
       --disable-docs ${EXTRA_CONFIGURE_ARGS} ${devnull}
   export DIST_DIR
   eval make -j ${MAKE_JOBS} dist ${devnull}
@@ -57,9 +58,6 @@
     arm64-*)
       echo "__aarch64__"
       ;;
-    armv6-*)
-      echo "__ARM_ARCH_6__"
-      ;;
     armv7-*)
       echo "__ARM_ARCH_7A__"
       ;;
@@ -175,8 +173,13 @@
 # Trap function. Cleans up the subtree used to build all targets contained in
 # $TARGETS.
 cleanup() {
+  local res=$?
   cd "${ORIG_PWD}"
 
+  if [ $res -ne 0 ]; then
+    elog "build exited with error ($res)"
+  fi
+
   if [ "${PRESERVE_BUILD_OUTPUT}" != "yes" ]; then
     rm -rf "${BUILD_ROOT}"
   fi
@@ -186,14 +189,21 @@
 cat << EOF
   Usage: ${0##*/} [arguments]
     --help: Display this message and exit.
+    --extra-configure-args <args>: Extra args to pass when configuring libvpx.
     --jobs: Number of make jobs.
     --preserve-build-output: Do not delete the build directory.
     --show-build-output: Show output from each library build.
+    --targets <targets>: Override default target list. Defaults:
+         ${TARGETS}
     --verbose: Output information about the environment and each stage of the
                build.
 EOF
 }
 
+elog() {
+  echo "${0##*/} failed because: $@" 1>&2
+}
+
 vlog() {
   if [ "${VERBOSE}" = "yes" ]; then
     echo "$@"
@@ -223,6 +233,10 @@
     --show-build-output)
       devnull=
       ;;
+    --targets)
+      TARGETS="$2"
+      shift
+      ;;
     --verbose)
       VERBOSE=yes
       ;;
@@ -251,3 +265,5 @@
 fi
 
 build_framework "${TARGETS}"
+echo "Successfully built '${FRAMEWORK_DIR}' for:"
+echo "         ${TARGETS}"
diff --git a/test/codec_factory.h b/test/codec_factory.h
index 7f9398c..3a0e3db 100644
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -35,6 +35,11 @@
   virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
                                  unsigned long deadline) const = 0;
 
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 const vpx_codec_flags_t flags,
+                                 unsigned long deadline)  // NOLINT(runtime/int)
+                                 const = 0;
+
   virtual Encoder* CreateEncoder(vpx_codec_enc_cfg_t cfg,
                                  unsigned long deadline,
                                  const unsigned long init_flags,
@@ -72,6 +77,10 @@
   VP8Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
       : Decoder(cfg, deadline) {}
 
+  VP8Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
+             unsigned long deadline)  // NOLINT
+      : Decoder(cfg, flag, deadline) {}
+
  protected:
   virtual vpx_codec_iface_t* CodecInterface() const {
 #if CONFIG_VP8_DECODER
@@ -104,8 +113,14 @@
 
   virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
                                  unsigned long deadline) const {
+    return CreateDecoder(cfg, 0, deadline);
+  }
+
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 const vpx_codec_flags_t flags,
+                                 unsigned long deadline) const {  // NOLINT
 #if CONFIG_VP8_DECODER
-    return new VP8Decoder(cfg, deadline);
+    return new VP8Decoder(cfg, flags, deadline);
 #else
     return NULL;
 #endif
@@ -154,6 +169,10 @@
   VP9Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
       : Decoder(cfg, deadline) {}
 
+  VP9Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
+             unsigned long deadline)  // NOLINT
+      : Decoder(cfg, flag, deadline) {}
+
  protected:
   virtual vpx_codec_iface_t* CodecInterface() const {
 #if CONFIG_VP9_DECODER
@@ -186,8 +205,14 @@
 
   virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
                                  unsigned long deadline) const {
+    return CreateDecoder(cfg, 0, deadline);
+  }
+
+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                                 const vpx_codec_flags_t flags,
+                                 unsigned long deadline) const {  // NOLINT
 #if CONFIG_VP9_DECODER
-    return new VP9Decoder(cfg, deadline);
+    return new VP9Decoder(cfg, flags, deadline);
 #else
     return NULL;
 #endif
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 5dd73db..e30ccf9 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -10,6 +10,7 @@
 
 #include <string.h>
 #include "test/acm_random.h"
+#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
@@ -315,6 +316,8 @@
 #endif
   }
 
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
   static void TearDownTestCase() {
     vpx_free(input_ - 1);
     input_ = NULL;
diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index 0ef4f7b..852d90e 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@@ -110,4 +110,12 @@
   RunLoop(video, dec_cfg);
 }
 
+void DecoderTest::set_cfg(const vpx_codec_dec_cfg_t &dec_cfg) {
+  memcpy(&cfg_, &dec_cfg, sizeof(cfg_));
+}
+
+void DecoderTest::set_flags(const vpx_codec_flags_t flags) {
+  flags_ = flags;
+}
+
 }  // namespace libvpx_test
diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index a757b59..232428e 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -41,7 +41,13 @@
 class Decoder {
  public:
   Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
-      : cfg_(cfg), deadline_(deadline), init_done_(false) {
+      : cfg_(cfg), flags_(0), deadline_(deadline), init_done_(false) {
+    memset(&decoder_, 0, sizeof(decoder_));
+  }
+
+  Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
+          unsigned long deadline)  // NOLINT
+      : cfg_(cfg), flags_(flag), deadline_(deadline), init_done_(false) {
     memset(&decoder_, 0, sizeof(decoder_));
   }
 
@@ -97,6 +103,10 @@
 
   bool IsVP8() const;
 
+  vpx_codec_ctx_t * GetDecoder() {
+    return &decoder_;
+  }
+
  protected:
   virtual vpx_codec_iface_t* CodecInterface() const = 0;
 
@@ -104,7 +114,7 @@
     if (!init_done_) {
       const vpx_codec_err_t res = vpx_codec_dec_init(&decoder_,
                                                      CodecInterface(),
-                                                     &cfg_, 0);
+                                                     &cfg_, flags_);
       ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
       init_done_ = true;
     }
@@ -112,6 +122,7 @@
 
   vpx_codec_ctx_t     decoder_;
   vpx_codec_dec_cfg_t cfg_;
+  vpx_codec_flags_t   flags_;
   unsigned int        deadline_;
   bool                init_done_;
 };
@@ -124,6 +135,9 @@
   virtual void RunLoop(CompressedVideoSource *video,
                        const vpx_codec_dec_cfg_t &dec_cfg);
 
+  virtual void set_cfg(const vpx_codec_dec_cfg_t &dec_cfg);
+  virtual void set_flags(const vpx_codec_flags_t flags);
+
   // Hook to be called before decompressing every frame.
   virtual void PreDecodeFrameHook(const CompressedVideoSource& /*video*/,
                                   Decoder* /*decoder*/) {}
@@ -146,11 +160,16 @@
                                 const vpx_codec_err_t res_peek);
 
  protected:
-  explicit DecoderTest(const CodecFactory *codec) : codec_(codec) {}
+  explicit DecoderTest(const CodecFactory *codec)
+      : codec_(codec),
+        cfg_(),
+        flags_(0) {}
 
   virtual ~DecoderTest() {}
 
   const CodecFactory *codec_;
+  vpx_codec_dec_cfg_t cfg_;
+  vpx_codec_flags_t   flags_;
 };
 
 }  // namespace libvpx_test
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index 5b6757c..b49e8cb 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -140,6 +140,8 @@
 }
 
 void EncoderTest::RunLoop(VideoSource *video) {
+  vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
+
   stats_.Reset();
 
   ASSERT_TRUE(passes_ == 1 || passes_ == 2);
@@ -157,7 +159,13 @@
     Encoder* const encoder = codec_->CreateEncoder(cfg_, deadline_, init_flags_,
                                                    &stats_);
     ASSERT_TRUE(encoder != NULL);
-    Decoder* const decoder = codec_->CreateDecoder(dec_cfg_, 0);
+
+    unsigned long dec_init_flags = 0;  // NOLINT
+    // Use fragment decoder if encoder outputs partitions.
+    // NOTE: fragment decoder and partition encoder are only supported by VP8.
+    if (init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION)
+      dec_init_flags |= VPX_CODEC_USE_INPUT_FRAGMENTS;
+    Decoder* const decoder = codec_->CreateDecoder(dec_cfg, dec_init_flags, 0);
     bool again;
     for (again = true, video->Begin(); again; video->Next()) {
       again = (video->img() != NULL);
@@ -199,6 +207,13 @@
         }
       }
 
+      // Flush the decoder when there are no more fragments.
+      if ((init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION) && has_dxdata) {
+        const vpx_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
+        if (!HandleDecodeResult(res_dec, *video, decoder))
+          break;
+      }
+
       if (has_dxdata && has_cxdata) {
         const vpx_image_t *img_enc = encoder->GetPreviewFrame();
         DxDataIterator dec_iter = decoder->GetDxData();
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 3bc7847..770ac7e 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -185,6 +185,11 @@
   // Map the TestMode enum to the deadline_ and passes_ variables.
   void SetMode(TestMode mode);
 
+  // Set encoder flag.
+  void set_init_flags(unsigned long flag) {  // NOLINT(runtime/int)
+    init_flags_ = flag;
+  }
+
   // Main loop
   virtual void RunLoop(VideoSource *video);
 
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index d6a3473..5357a8d 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -474,14 +474,17 @@
     ::testing::Values(
         make_tuple(&vp9_fdct4x4_c,
                    &vp9_idct4x4_16_add_neon, 0, VPX_BITS_8)));
+#endif  // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
-    DISABLED_NEON, Trans4x4HT,
+    NEON, Trans4x4HT,
     ::testing::Values(
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8),
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8),
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8),
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
-#endif  // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH && \
     !CONFIG_EMULATE_HARDWARE
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 110c9c3..6f2c89b 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -62,6 +62,10 @@
 using libvpx_test::ACMRandom;
 
 namespace {
+
+const int kSignBiasMaxDiff255 = 1500;
+const int kSignBiasMaxDiff15 = 10000;
+
 typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
 typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
 typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
@@ -160,7 +164,7 @@
 
     for (int j = 0; j < 64; ++j) {
       const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
-      const int max_diff = 1125;
+      const int max_diff = kSignBiasMaxDiff255;
       EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
           << "Error: 8x8 FDCT/FHT has a sign bias > "
           << 1. * max_diff / count_test_block * 100 << "%"
@@ -190,9 +194,9 @@
 
     for (int j = 0; j < 64; ++j) {
       const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
-      const int max_diff = 10000;
+      const int max_diff = kSignBiasMaxDiff15;
       EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
-          << "Error: 4x4 FDCT/FHT has a sign bias > "
+          << "Error: 8x8 FDCT/FHT has a sign bias > "
           << 1. * max_diff / count_test_block * 100 << "%"
           << " for input range [-15, 15] at index " << j
           << " count0: " << count_sign_block[j][0]
@@ -646,36 +650,24 @@
 using std::tr1::make_tuple;
 
 #if CONFIG_VP9_HIGHBITDEPTH
-// TODO(jingning): re-enable after this handles the expanded range [0, 65535]
-// returned from Rand16().
-INSTANTIATE_TEST_CASE_P(
-    DISABLED_C, FwdTrans8x8DCT,
-    ::testing::Values(
-        make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
 INSTANTIATE_TEST_CASE_P(
     C, FwdTrans8x8DCT,
     ::testing::Values(
+        make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8),
         make_tuple(&vp9_highbd_fdct8x8_c, &idct8x8_10, 0, VPX_BITS_10),
         make_tuple(&vp9_highbd_fdct8x8_c, &idct8x8_12, 0, VPX_BITS_12)));
 #else
-// TODO(jingning): re-enable after this handles the expanded range [0, 65535]
-// returned from Rand16().
 INSTANTIATE_TEST_CASE_P(
-    DISABLED_C, FwdTrans8x8DCT,
+    C, FwdTrans8x8DCT,
     ::testing::Values(
         make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_VP9_HIGHBITDEPTH
-// TODO(jingning): re-enable after this handles the expanded range [0, 65535]
-// returned from Rand16().
-INSTANTIATE_TEST_CASE_P(
-    DISABLED_C, FwdTrans8x8HT,
-    ::testing::Values(
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8)));
 INSTANTIATE_TEST_CASE_P(
     C, FwdTrans8x8HT,
     ::testing::Values(
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
         make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 0, VPX_BITS_10),
         make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 1, VPX_BITS_10),
         make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 2, VPX_BITS_10),
@@ -691,12 +683,9 @@
 // TODO(jingning): re-enable after this handles the expanded range [0, 65535]
 // returned from Rand16().
 INSTANTIATE_TEST_CASE_P(
-    DISABLED_C, FwdTrans8x8HT,
-    ::testing::Values(
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
     C, FwdTrans8x8HT,
     ::testing::Values(
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
@@ -706,49 +695,44 @@
 // TODO(jingning): re-enable after this handles the expanded range [0, 65535]
 // returned from Rand16().
 INSTANTIATE_TEST_CASE_P(
-    DISABLED_NEON, FwdTrans8x8DCT,
+    NEON, FwdTrans8x8DCT,
     ::testing::Values(
         make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0,
                    VPX_BITS_8)));
+#endif  // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
-    DISABLED_NEON, FwdTrans8x8HT,
+    NEON, FwdTrans8x8HT,
     ::testing::Values(
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8)));
-#endif  // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 // TODO(jingning): re-enable after these handle the expanded range [0, 65535]
 // returned from Rand16().
 INSTANTIATE_TEST_CASE_P(
-    DISABLED_SSE2, FwdTrans8x8DCT,
+    SSE2, FwdTrans8x8DCT,
     ::testing::Values(
         make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_sse2, 0,
                    VPX_BITS_8)));
 INSTANTIATE_TEST_CASE_P(
-    DISABLED_SSE2, FwdTrans8x8HT,
-    ::testing::Values(
-        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
     SSE2, FwdTrans8x8HT,
     ::testing::Values(
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3, VPX_BITS_8)));
 #endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-// TODO(jingning): re-enable after these handle the expanded range [0, 65535]
-// returned from Rand16().
-INSTANTIATE_TEST_CASE_P(
-    DISABLED_SSE2, FwdTrans8x8DCT,
-    ::testing::Values(
-        make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
 INSTANTIATE_TEST_CASE_P(
     SSE2, FwdTrans8x8DCT,
     ::testing::Values(
+        make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8),
         make_tuple(&vp9_highbd_fdct8x8_c,
                    &idct8x8_64_add_10_sse2, 12, VPX_BITS_10),
         make_tuple(&vp9_highbd_fdct8x8_sse2,
@@ -761,12 +745,9 @@
 // TODO(jingning): re-enable after these handle the expanded range [0, 65535]
 // returned from Rand16().
 INSTANTIATE_TEST_CASE_P(
-    DISABLED_SSE2, FwdTrans8x8HT,
-    ::testing::Values(
-        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
     SSE2, FwdTrans8x8HT,
     ::testing::Values(
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
@@ -791,7 +772,7 @@
 // TODO(jingning): re-enable after this handles the expanded range [0, 65535]
 // returned from Rand16().
 INSTANTIATE_TEST_CASE_P(
-    DISABLED_SSSE3, FwdTrans8x8DCT,
+    SSSE3, FwdTrans8x8DCT,
     ::testing::Values(
         make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0,
                    VPX_BITS_8)));
diff --git a/test/test-data.mk b/test/test-data.mk
index 9c46372..d07ca32 100644
--- a/test/test-data.mk
+++ b/test/test-data.mk
@@ -17,6 +17,7 @@
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_444.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_440.yuv
 
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m
 
diff --git a/test/test.mk b/test/test.mk
index f199e87..6e935de 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -94,6 +94,7 @@
 # These tests require both the encoder and decoder to be built.
 ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes)
 LIBVPX_TEST_SRCS-yes                   += vp8_boolcoder_test.cc
+LIBVPX_TEST_SRCS-yes                   += vp8_fragments_test.cc
 endif
 
 LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
@@ -127,6 +128,12 @@
 LIBVPX_TEST_SRCS-yes                   += tile_independence_test.cc
 LIBVPX_TEST_SRCS-yes                   += vp9_boolcoder_test.cc
 
+# TODO(jbb): Remove this when the TODO(minghai): in vpx_encoder.h gets resolved.
+# This test fails because of a mismatch in the size of the vpx_codec_alg_priv
+# between encoder and decoder when this is enabled.
+ifneq ($(CONFIG_SPATIAL_SVC),yes)
+LIBVPX_TEST_SRCS-yes                   += vp9_encoder_parms_get_to_decoder.cc
+endif
 endif
 
 LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += convolve_test.cc
diff --git a/test/vp8_fragments_test.cc b/test/vp8_fragments_test.cc
new file mode 100644
index 0000000..cb0d1a1
--- /dev/null
+++ b/test/vp8_fragments_test.cc
@@ -0,0 +1,37 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/video_source.h"
+
+namespace {
+
+class VP8FragmentsTest
+    : public ::libvpx_test::EncoderTest,
+      public ::testing::Test {
+ protected:
+  VP8FragmentsTest() : EncoderTest(&::libvpx_test::kVP8) {}
+  virtual ~VP8FragmentsTest() {}
+
+  virtual void SetUp() {
+    const unsigned long init_flags =  // NOLINT(runtime/int)
+        VPX_CODEC_USE_OUTPUT_PARTITION;
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    set_init_flags(init_flags);
+  }
+};
+
+TEST_F(VP8FragmentsTest, TestFragmentsEncodeDecode) {
+  ::libvpx_test::RandomVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+}  // namespace
diff --git a/test/vp9_encoder_parms_get_to_decoder.cc b/test/vp9_encoder_parms_get_to_decoder.cc
new file mode 100644
index 0000000..6c354fd
--- /dev/null
+++ b/test/vp9_encoder_parms_get_to_decoder.cc
@@ -0,0 +1,189 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+#include "test/util.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vp9/decoder/vp9_decoder.h"
+
+typedef vpx_codec_stream_info_t vp9_stream_info_t;
+struct vpx_codec_alg_priv {
+  vpx_codec_priv_t        base;
+  vpx_codec_dec_cfg_t     cfg;
+  vp9_stream_info_t       si;
+  struct VP9Decoder      *pbi;
+  int                     postproc_cfg_set;
+  vp8_postproc_cfg_t      postproc_cfg;
+  vpx_decrypt_cb          decrypt_cb;
+  void                   *decrypt_state;
+  vpx_image_t             img;
+  int                     img_avail;
+  int                     flushed;
+  int                     invert_tile_order;
+  int                     frame_parallel_decode;
+
+  // External frame buffer info to save for VP9 common.
+  void *ext_priv;  // Private data associated with the external frame buffers.
+  vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb;
+  vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb;
+};
+
+static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) {
+  return (vpx_codec_alg_priv_t *)ctx->priv;
+}
+
+namespace {
+
+const unsigned int kFramerate = 50;
+const int kCpuUsed = 2;
+
+struct EncodePerfTestVideo {
+  const char *name;
+  uint32_t width;
+  uint32_t height;
+  uint32_t bitrate;
+  int frames;
+};
+
+const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = {
+    {"niklas_1280_720_30.yuv", 1280, 720, 600, 10},
+};
+
+struct EncodeParameters {
+  int32_t tile_rows;
+  int32_t tile_cols;
+  int32_t lossless;
+  int32_t error_resilient;
+  int32_t frame_parallel;
+  // TODO(JBB): quantizers / bitrate
+};
+
+const EncodeParameters kVP9EncodeParameterSet[] = {
+    {0, 0, 0, 1, 0},
+    {0, 0, 0, 0, 0},
+    {0, 0, 1, 0, 0},
+    {0, 2, 0, 0, 1},
+    // TODO(JBB): Test profiles (requires more work).
+};
+
+int is_extension_y4m(const char *filename) {
+  const char *dot = strrchr(filename, '.');
+  if (!dot || dot == filename)
+    return 0;
+  else
+    return !strcmp(dot, ".y4m");
+}
+
+class Vp9EncoderParmsGetToDecoder
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<EncodeParameters, \
+                                                 EncodePerfTestVideo> {
+ protected:
+  Vp9EncoderParmsGetToDecoder()
+      : EncoderTest(GET_PARAM(0)),
+        encode_parms(GET_PARAM(1)) {
+  }
+
+  virtual ~Vp9EncoderParmsGetToDecoder() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kTwoPassGood);
+    cfg_.g_lag_in_frames = 25;
+    cfg_.g_error_resilient = encode_parms.error_resilient;
+    dec_cfg_.threads = 4;
+    test_video_ = GET_PARAM(2);
+    cfg_.rc_target_bitrate = test_video_.bitrate;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP9E_SET_LOSSLESS, encode_parms.lossless);
+      encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING,
+                       encode_parms.frame_parallel);
+      encoder->Control(VP9E_SET_TILE_ROWS, encode_parms.tile_rows);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, encode_parms.tile_cols);
+      encoder->Control(VP8E_SET_CPUUSED, kCpuUsed);
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+      encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+    }
+  }
+
+  virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                                  const libvpx_test::VideoSource& video,
+                                  libvpx_test::Decoder *decoder) {
+    vpx_codec_ctx_t* vp9_decoder = decoder->GetDecoder();
+    vpx_codec_alg_priv_t* priv =
+        get_alg_priv(vp9_decoder);
+
+    VP9Decoder* pbi = priv->pbi;
+    VP9_COMMON* common = &pbi->common;
+
+    if (encode_parms.lossless) {
+      EXPECT_EQ(common->base_qindex, 0);
+      EXPECT_EQ(common->y_dc_delta_q, 0);
+      EXPECT_EQ(common->uv_dc_delta_q, 0);
+      EXPECT_EQ(common->uv_ac_delta_q, 0);
+      EXPECT_EQ(common->tx_mode, ONLY_4X4);
+    }
+    EXPECT_EQ(common->error_resilient_mode, encode_parms.error_resilient);
+    if (encode_parms.error_resilient) {
+      EXPECT_EQ(common->frame_parallel_decoding_mode, 1);
+      EXPECT_EQ(common->use_prev_frame_mvs, 0);
+    } else {
+      EXPECT_EQ(common->frame_parallel_decoding_mode,
+                encode_parms.frame_parallel);
+    }
+
+    EXPECT_EQ(common->log2_tile_cols, encode_parms.tile_cols);
+    EXPECT_EQ(common->log2_tile_rows, encode_parms.tile_rows);
+
+    EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
+    return VPX_CODEC_OK == res_dec;
+  }
+
+  EncodePerfTestVideo test_video_;
+
+ private:
+  EncodeParameters encode_parms;
+};
+
+TEST_P(Vp9EncoderParmsGetToDecoder, BitstreamParms) {
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  libvpx_test::VideoSource *video;
+  if (is_extension_y4m(test_video_.name)) {
+    video = new libvpx_test::Y4mVideoSource(test_video_.name,
+                                            0, test_video_.frames);
+  } else {
+    video = new libvpx_test::YUVVideoSource(test_video_.name,
+                                            VPX_IMG_FMT_I420,
+                                            test_video_.width,
+                                            test_video_.height,
+                                            kFramerate, 1, 0,
+                                            test_video_.frames);
+  }
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+  delete(video);
+}
+
+VP9_INSTANTIATE_TEST_CASE(
+    Vp9EncoderParmsGetToDecoder,
+    ::testing::ValuesIn(kVP9EncodeParameterSet),
+    ::testing::ValuesIn(kVP9EncodePerfTestVectors));
+
+}  // namespace
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 5aa274d..352ed7b 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -60,7 +60,6 @@
     vpx_decrypt_cb          decrypt_cb;
     void                    *decrypt_state;
     vpx_image_t             img;
-    int                     flushed;
     int                     img_setup;
     struct frame_buffers    yv12_frame_buffers;
     void                    *user_priv;
@@ -89,7 +88,6 @@
     priv->si.sz = sizeof(priv->si);
     priv->decrypt_cb = NULL;
     priv->decrypt_state = NULL;
-    priv->flushed = 0;
 
     if (ctx->config.dec)
     {
@@ -307,6 +305,11 @@
         return 0;
     }
 
+    if (!ctx->fragments.enabled && (data == NULL && data_sz == 0))
+    {
+        return 0;
+    }
+
     if (!ctx->fragments.enabled)
     {
         ctx->fragments.ptrs[0] = data;
@@ -327,14 +330,11 @@
     unsigned int resolution_change = 0;
     unsigned int w, h;
 
-    if (data == NULL && data_sz == 0) {
-      ctx->flushed = 1;
-      return VPX_CODEC_OK;
+    if (!ctx->fragments.enabled && (data == NULL && data_sz == 0))
+    {
+        return 0;
     }
 
-    /* Reset flushed when receiving a valid frame */
-    ctx->flushed = 0;
-
     /* Update the input fragment data */
     if(update_fragments(ctx, data, data_sz, &res) <= 0)
         return res;
diff --git a/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm b/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm
deleted file mode 100644
index 2f326e2..0000000
--- a/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm
+++ /dev/null
@@ -1,237 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vp9_iht4x4_16_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Parallel 1D IDCT on all the columns of a 4x4 16bits data matrix which are
-    ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain
-    ; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back
-    ; into d16-d19 registers. This macro will touch q10- q15 registers and use
-    ; them as buffer during calculation.
-    MACRO
-    IDCT4x4_1D
-    ; stage 1
-    vadd.s16    d23, d16, d18   ; (input[0] + input[2])
-    vsub.s16    d24, d16, d18   ; (input[0] - input[2])
-
-    vmull.s16   q15, d17, d2    ; input[1] * cospi_24_64
-    vmull.s16   q10, d17, d0    ; input[1] * cospi_8_64
-    vmull.s16   q13, d23, d1    ; (input[0] + input[2]) * cospi_16_64
-    vmull.s16   q14, d24, d1    ; (input[0] - input[2]) * cospi_16_64
-    vmlsl.s16   q15, d19, d0    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    vmlal.s16   q10, d19, d2    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
-
-    ; dct_const_round_shift
-    vqrshrn.s32 d26, q13, #14
-    vqrshrn.s32 d27, q14, #14
-    vqrshrn.s32 d29, q15, #14
-    vqrshrn.s32 d28, q10, #14
-
-    ; stage 2
-    ; output[0] = step[0] + step[3];
-    ; output[1] = step[1] + step[2];
-    ; output[3] = step[0] - step[3];
-    ; output[2] = step[1] - step[2];
-    vadd.s16    q8,  q13, q14
-    vsub.s16    q9,  q13, q14
-    vswp        d18, d19
-    MEND
-
-    ; Parallel 1D IADST on all the columns of a 4x4 16bits data matrix which
-    ; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9.
-    ; d5 must contain sinpi_4_9. d6 must contain sinpi_3_9. The output will be
-    ; stored back into d16-d19 registers. This macro will touch q11,q12,q13,
-    ; q14,q15 registers and use them as buffer during calculation.
-    MACRO
-    IADST4x4_1D
-    vmull.s16   q10, d3, d16    ; s0 = sinpi_1_9 * x0
-    vmull.s16   q11, d4, d16    ; s1 = sinpi_2_9 * x0
-    vmull.s16   q12, d6, d17    ; s2 = sinpi_3_9 * x1
-    vmull.s16   q13, d5, d18    ; s3 = sinpi_4_9 * x2
-    vmull.s16   q14, d3, d18    ; s4 = sinpi_1_9 * x2
-    vmovl.s16   q15, d16        ; expand x0 from 16 bit to 32 bit
-    vaddw.s16   q15, q15, d19   ; x0 + x3
-    vmull.s16   q8, d4, d19     ; s5 = sinpi_2_9 * x3
-    vsubw.s16   q15, q15, d18   ; s7 = x0 + x3 - x2
-    vmull.s16   q9, d5, d19     ; s6 = sinpi_4_9 * x3
-
-    vadd.s32    q10, q10, q13   ; x0 = s0 + s3 + s5
-    vadd.s32    q10, q10, q8
-    vsub.s32    q11, q11, q14   ; x1 = s1 - s4 - s6
-    vdup.32     q8, r0          ; duplicate sinpi_3_9
-    vsub.s32    q11, q11, q9
-    vmul.s32    q15, q15, q8    ; x2 = sinpi_3_9 * s7
-
-    vadd.s32    q13, q10, q12   ; s0 = x0 + x3
-    vadd.s32    q10, q10, q11   ; x0 + x1
-    vadd.s32    q14, q11, q12   ; s1 = x1 + x3
-    vsub.s32    q10, q10, q12   ; s3 = x0 + x1 - x3
-
-    ; dct_const_round_shift
-    vqrshrn.s32 d16, q13, #14
-    vqrshrn.s32 d17, q14, #14
-    vqrshrn.s32 d18, q15, #14
-    vqrshrn.s32 d19, q10, #14
-    MEND
-
-    ; Generate cosine constants in d6 - d8 for the IDCT
-    MACRO
-    GENERATE_COSINE_CONSTANTS
-    ; cospi_8_64 = 15137 = 0x3b21
-    mov         r0, #0x3b00
-    add         r0, #0x21
-    ; cospi_16_64 = 11585 = 0x2d41
-    mov         r3, #0x2d00
-    add         r3, #0x41
-    ; cospi_24_64 = 6270 = 0x187e
-    mov         r12, #0x1800
-    add         r12, #0x7e
-
-    ; generate constant vectors
-    vdup.16     d0, r0          ; duplicate cospi_8_64
-    vdup.16     d1, r3          ; duplicate cospi_16_64
-    vdup.16     d2, r12         ; duplicate cospi_24_64
-    MEND
-
-    ; Generate sine constants in d1 - d4 for the IADST.
-    MACRO
-    GENERATE_SINE_CONSTANTS
-    ; sinpi_1_9 = 5283 = 0x14A3
-    mov         r0, #0x1400
-    add         r0, #0xa3
-    ; sinpi_2_9 = 9929 = 0x26C9
-    mov         r3, #0x2600
-    add         r3, #0xc9
-    ; sinpi_4_9 = 15212 = 0x3B6C
-    mov         r12, #0x3b00
-    add         r12, #0x6c
-
-    ; generate constant vectors
-    vdup.16     d3, r0          ; duplicate sinpi_1_9
-
-    ; sinpi_3_9 = 13377 = 0x3441
-    mov         r0, #0x3400
-    add         r0, #0x41
-
-    vdup.16     d4, r3          ; duplicate sinpi_2_9
-    vdup.16     d5, r12         ; duplicate sinpi_4_9
-    vdup.16     q3, r0          ; duplicate sinpi_3_9
-    MEND
-
-    ; Transpose a 4x4 16bits data matrix. Datas are loaded in d16-d19.
-    MACRO
-    TRANSPOSE4X4
-    vtrn.16     d16, d17
-    vtrn.16     d18, d19
-    vtrn.32     q8, q9
-    MEND
-
-    AREA     Block, CODE, READONLY ; name this block of code
-;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest,
-;                               int dest_stride, int tx_type)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride
-; r3  int tx_type)
-; This function will only handle tx_type of 1,2,3.
-|vp9_iht4x4_16_add_neon| PROC
-
-    ; load the inputs into d16-d19
-    vld1.s16    {q8,q9}, [r0]!
-
-    ; transpose the input data
-    TRANSPOSE4X4
-
-    ; decide the type of transform
-    cmp         r3, #2
-    beq         idct_iadst
-    cmp         r3, #3
-    beq         iadst_iadst
-
-iadst_idct
-    ; generate constants
-    GENERATE_COSINE_CONSTANTS
-    GENERATE_SINE_CONSTANTS
-
-    ; first transform rows
-    IDCT4x4_1D
-
-    ; transpose the matrix
-    TRANSPOSE4X4
-
-    ; then transform columns
-    IADST4x4_1D
-
-    b end_vp9_iht4x4_16_add_neon
-
-idct_iadst
-    ; generate constants
-    GENERATE_COSINE_CONSTANTS
-    GENERATE_SINE_CONSTANTS
-
-    ; first transform rows
-    IADST4x4_1D
-
-    ; transpose the matrix
-    TRANSPOSE4X4
-
-    ; then transform columns
-    IDCT4x4_1D
-
-    b end_vp9_iht4x4_16_add_neon
-
-iadst_iadst
-    ; generate constants
-    GENERATE_SINE_CONSTANTS
-
-    ; first transform rows
-    IADST4x4_1D
-
-    ; transpose the matrix
-    TRANSPOSE4X4
-
-    ; then transform columns
-    IADST4x4_1D
-
-end_vp9_iht4x4_16_add_neon
-    ; ROUND_POWER_OF_TWO(temp_out[j], 4)
-    vrshr.s16   q8, q8, #4
-    vrshr.s16   q9, q9, #4
-
-    vld1.32     {d26[0]}, [r1], r2
-    vld1.32     {d26[1]}, [r1], r2
-    vld1.32     {d27[0]}, [r1], r2
-    vld1.32     {d27[1]}, [r1]
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
-    vaddw.u8    q8, q8, d26
-    vaddw.u8    q9, q9, d27
-
-    ; clip_pixel
-    vqmovun.s16 d26, q8
-    vqmovun.s16 d27, q9
-
-    ; do the stores in reverse order with negative post-increment, by changing
-    ; the sign of the stride
-    rsb         r2, r2, #0
-    vst1.32     {d27[1]}, [r1], r2
-    vst1.32     {d27[0]}, [r1], r2
-    vst1.32     {d26[1]}, [r1], r2
-    vst1.32     {d26[0]}, [r1]  ; no post-increment
-    bx          lr
-    ENDP  ; |vp9_iht4x4_16_add_neon|
-
-    END
diff --git a/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
new file mode 100644
index 0000000..cd8c358
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
@@ -0,0 +1,247 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+
+static int16_t sinpi_1_9 = 0x14a3;
+static int16_t sinpi_2_9 = 0x26c9;
+static int16_t sinpi_3_9 = 0x3441;
+static int16_t sinpi_4_9 = 0x3b6c;
+static int16_t cospi_8_64 = 0x3b21;
+static int16_t cospi_16_64 = 0x2d41;
+static int16_t cospi_24_64 = 0x187e;
+
+static inline void TRANSPOSE4X4(
+        int16x8_t *q8s16,
+        int16x8_t *q9s16) {
+    int32x4_t q8s32, q9s32;
+    int16x4x2_t d0x2s16, d1x2s16;
+    int32x4x2_t q0x2s32;
+
+    d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
+    d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));
+
+    q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
+    q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
+    q0x2s32 = vtrnq_s32(q8s32, q9s32);
+
+    *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
+    *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
+    return;
+}
+
+static inline void GENERATE_COSINE_CONSTANTS(
+        int16x4_t *d0s16,
+        int16x4_t *d1s16,
+        int16x4_t *d2s16) {
+    *d0s16 = vdup_n_s16(cospi_8_64);
+    *d1s16 = vdup_n_s16(cospi_16_64);
+    *d2s16 = vdup_n_s16(cospi_24_64);
+    return;
+}
+
+static inline void GENERATE_SINE_CONSTANTS(
+        int16x4_t *d3s16,
+        int16x4_t *d4s16,
+        int16x4_t *d5s16,
+        int16x8_t *q3s16) {
+    *d3s16 = vdup_n_s16(sinpi_1_9);
+    *d4s16 = vdup_n_s16(sinpi_2_9);
+    *q3s16 = vdupq_n_s16(sinpi_3_9);
+    *d5s16 = vdup_n_s16(sinpi_4_9);
+    return;
+}
+
+static inline void IDCT4x4_1D(
+        int16x4_t *d0s16,
+        int16x4_t *d1s16,
+        int16x4_t *d2s16,
+        int16x8_t *q8s16,
+        int16x8_t *q9s16) {
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
+    int16x4_t d26s16, d27s16, d28s16, d29s16;
+    int32x4_t q10s32, q13s32, q14s32, q15s32;
+    int16x8_t q13s16, q14s16;
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+
+    d23s16 = vadd_s16(d16s16, d18s16);
+    d24s16 = vsub_s16(d16s16, d18s16);
+
+    q15s32 = vmull_s16(d17s16, *d2s16);
+    q10s32 = vmull_s16(d17s16, *d0s16);
+    q13s32 = vmull_s16(d23s16, *d1s16);
+    q14s32 = vmull_s16(d24s16, *d1s16);
+    q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
+    q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);
+
+    d26s16 = vqrshrn_n_s32(q13s32, 14);
+    d27s16 = vqrshrn_n_s32(q14s32, 14);
+    d29s16 = vqrshrn_n_s32(q15s32, 14);
+    d28s16 = vqrshrn_n_s32(q10s32, 14);
+
+    q13s16 = vcombine_s16(d26s16, d27s16);
+    q14s16 = vcombine_s16(d28s16, d29s16);
+    *q8s16 = vaddq_s16(q13s16, q14s16);
+    *q9s16 = vsubq_s16(q13s16, q14s16);
+    *q9s16 = vcombine_s16(vget_high_s16(*q9s16),
+                          vget_low_s16(*q9s16));  // vswp
+    return;
+}
+
+static inline void IADST4x4_1D(
+        int16x4_t *d3s16,
+        int16x4_t *d4s16,
+        int16x4_t *d5s16,
+        int16x8_t *q3s16,
+        int16x8_t *q8s16,
+        int16x8_t *q9s16) {
+    int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
+    int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+    d6s16 = vget_low_s16(*q3s16);
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+
+    q10s32 = vmull_s16(*d3s16, d16s16);
+    q11s32 = vmull_s16(*d4s16, d16s16);
+    q12s32 = vmull_s16(d6s16, d17s16);
+    q13s32 = vmull_s16(*d5s16, d18s16);
+    q14s32 = vmull_s16(*d3s16, d18s16);
+    q15s32 = vmovl_s16(d16s16);
+    q15s32 = vaddw_s16(q15s32, d19s16);
+    q8s32  = vmull_s16(*d4s16, d19s16);
+    q15s32 = vsubw_s16(q15s32, d18s16);
+    q9s32  = vmull_s16(*d5s16, d19s16);
+
+    q10s32 = vaddq_s32(q10s32, q13s32);
+    q10s32 = vaddq_s32(q10s32, q8s32);
+    q11s32 = vsubq_s32(q11s32, q14s32);
+    q8s32  = vdupq_n_s32(sinpi_3_9);
+    q11s32 = vsubq_s32(q11s32, q9s32);
+    q15s32 = vmulq_s32(q15s32, q8s32);
+
+    q13s32 = vaddq_s32(q10s32, q12s32);
+    q10s32 = vaddq_s32(q10s32, q11s32);
+    q14s32 = vaddq_s32(q11s32, q12s32);
+    q10s32 = vsubq_s32(q10s32, q12s32);
+
+    d16s16 = vqrshrn_n_s32(q13s32, 14);
+    d17s16 = vqrshrn_n_s32(q14s32, 14);
+    d18s16 = vqrshrn_n_s32(q15s32, 14);
+    d19s16 = vqrshrn_n_s32(q10s32, 14);
+
+    *q8s16 = vcombine_s16(d16s16, d17s16);
+    *q9s16 = vcombine_s16(d18s16, d19s16);
+    return;
+}
+
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
+                            int dest_stride, int tx_type) {
+    uint8x8_t d26u8, d27u8;
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
+    uint32x2_t d26u32, d27u32;
+    int16x8_t q3s16, q8s16, q9s16;
+    uint16x8_t q8u16, q9u16;
+
+    d26u32 = d27u32 = vdup_n_u32(0);
+
+    q8s16 = vld1q_s16(input);
+    q9s16 = vld1q_s16(input + 8);
+
+    TRANSPOSE4X4(&q8s16, &q9s16);
+
+    switch (tx_type) {
+      case 0:  // idct_idct is not supported. Fall back to C
+        vp9_iht4x4_16_add_c(input, dest, dest_stride, tx_type);
+        return;
+        break;
+      case 1:  // iadst_idct
+        // generate constants
+        GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+        GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+        // first transform rows
+        IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+
+        // transpose the matrix
+        TRANSPOSE4X4(&q8s16, &q9s16);
+
+        // then transform columns
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+        break;
+      case 2:  // idct_iadst
+        // generate constants
+        GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+        GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+        // first transform rows
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+        // transpose the matrix
+        TRANSPOSE4X4(&q8s16, &q9s16);
+
+        // then transform columns
+        IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+        break;
+      case 3:  // iadst_iadst
+        // generate constants
+        GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+        // first transform rows
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+        // transpose the matrix
+        TRANSPOSE4X4(&q8s16, &q9s16);
+
+        // then transform columns
+        IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+        break;
+      default:  // invalid tx_type: unreachable
+        assert(0);
+        break;
+    }
+
+    q8s16 = vrshrq_n_s16(q8s16, 4);
+    q9s16 = vrshrq_n_s16(q9s16, 4);
+
+    d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
+    dest += dest_stride;
+    d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
+    dest += dest_stride;
+    d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
+    dest += dest_stride;
+    d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
+
+    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
+    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
+
+    d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+    d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
+    dest -= dest_stride;
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
+    dest -= dest_stride;
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
+    dest -= dest_stride;
+    vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
+    return;
+}
diff --git a/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm b/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
deleted file mode 100644
index b41f566..0000000
--- a/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
+++ /dev/null
@@ -1,698 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vp9_iht8x8_64_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Generate IADST constants in r0 - r12 for the IADST.
-    MACRO
-    GENERATE_IADST_CONSTANTS
-    ; generate  cospi_2_64  = 16305
-    mov             r0, #0x3f00
-    add             r0, #0xb1
-
-    ; generate cospi_30_64 = 1606
-    mov             r1, #0x600
-    add             r1, #0x46
-
-    ; generate cospi_10_64 = 14449
-    mov             r2, #0x3800
-    add             r2, #0x71
-
-    ; generate cospi_22_64 = 7723
-    mov             r3, #0x1e00
-    add             r3, #0x2b
-
-    ; generate cospi_18_64 = 10394
-    mov             r4, #0x2800
-    add             r4, #0x9a
-
-    ; generate cospi_14_64 = 12665
-    mov             r5, #0x3100
-    add             r5, #0x79
-
-    ; generate cospi_26_64 = 4756
-    mov             r6, #0x1200
-    add             r6, #0x94
-
-    ; generate cospi_6_64  = 15679
-    mov             r7, #0x3d00
-    add             r7, #0x3f
-
-    ; generate cospi_8_64  = 15137
-    mov             r8, #0x3b00
-    add             r8, #0x21
-
-    ; generate cospi_24_64 = 6270
-    mov             r9, #0x1800
-    add             r9, #0x7e
-
-    ; generate 0
-    mov             r10, #0
-
-    ; generate  cospi_16_64 = 11585
-    mov             r12, #0x2d00
-    add             r12, #0x41
-    MEND
-
-    ; Generate IDCT constants in r3 - r9 for the IDCT.
-    MACRO
-    GENERATE_IDCT_CONSTANTS
-    ; generate  cospi_28_64 = 3196
-    mov             r3, #0x0c00
-    add             r3, #0x7c
-
-    ; generate cospi_4_64  = 16069
-    mov             r4, #0x3e00
-    add             r4, #0xc5
-
-    ; generate cospi_12_64 = 13623
-    mov             r5, #0x3500
-    add             r5, #0x37
-
-    ; generate cospi_20_64 = 9102
-    mov             r6, #0x2300
-    add             r6, #0x8e
-
-    ; generate cospi_16_64 = 11585
-    mov             r7, #0x2d00
-    add             r7, #0x41
-
-    ; generate cospi_24_64 = 6270
-    mov             r8, #0x1800
-    add             r8, #0x7e
-
-    ; generate cospi_8_64 = 15137
-    mov             r9, #0x3b00
-    add             r9, #0x21
-    MEND
-
-    ; Transpose a 8x8 16bits data matrix. Datas are loaded in q8-q15.
-    MACRO
-    TRANSPOSE8X8
-    vswp            d17, d24
-    vswp            d23, d30
-    vswp            d21, d28
-    vswp            d19, d26
-    vtrn.32         q8, q10
-    vtrn.32         q9, q11
-    vtrn.32         q12, q14
-    vtrn.32         q13, q15
-    vtrn.16         q8, q9
-    vtrn.16         q10, q11
-    vtrn.16         q12, q13
-    vtrn.16         q14, q15
-    MEND
-
-    ; Parallel 1D IDCT on all the columns of a 8x8 16bits data matrix which are
-    ; loaded in q8-q15. The IDCT constants are loaded in r3 - r9. The output
-    ; will be stored back into q8-q15 registers. This macro will touch q0-q7
-    ; registers and use them as buffer during calculation.
-    MACRO
-    IDCT8x8_1D
-    ; stage 1
-    vdup.16         d0, r3                    ; duplicate cospi_28_64
-    vdup.16         d1, r4                    ; duplicate cospi_4_64
-    vdup.16         d2, r5                    ; duplicate cospi_12_64
-    vdup.16         d3, r6                    ; duplicate cospi_20_64
-
-    ; input[1] * cospi_28_64
-    vmull.s16       q2, d18, d0
-    vmull.s16       q3, d19, d0
-
-    ; input[5] * cospi_12_64
-    vmull.s16       q5, d26, d2
-    vmull.s16       q6, d27, d2
-
-    ; input[1]*cospi_28_64-input[7]*cospi_4_64
-    vmlsl.s16       q2, d30, d1
-    vmlsl.s16       q3, d31, d1
-
-    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
-    vmlsl.s16       q5, d22, d3
-    vmlsl.s16       q6, d23, d3
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d8, q2, #14               ; >> 14
-    vqrshrn.s32     d9, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d10, q5, #14              ; >> 14
-    vqrshrn.s32     d11, q6, #14              ; >> 14
-
-    ; input[1] * cospi_4_64
-    vmull.s16       q2, d18, d1
-    vmull.s16       q3, d19, d1
-
-    ; input[5] * cospi_20_64
-    vmull.s16       q9, d26, d3
-    vmull.s16       q13, d27, d3
-
-    ; input[1]*cospi_4_64+input[7]*cospi_28_64
-    vmlal.s16       q2, d30, d0
-    vmlal.s16       q3, d31, d0
-
-    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
-    vmlal.s16       q9, d22, d2
-    vmlal.s16       q13, d23, d2
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d14, q2, #14              ; >> 14
-    vqrshrn.s32     d15, q3, #14              ; >> 14
-
-    ; stage 2 & stage 3 - even half
-    vdup.16         d0, r7                    ; duplicate cospi_16_64
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q9, #14              ; >> 14
-    vqrshrn.s32     d13, q13, #14             ; >> 14
-
-    ; input[0] * cospi_16_64
-    vmull.s16       q2, d16, d0
-    vmull.s16       q3, d17, d0
-
-    ; input[0] * cospi_16_64
-    vmull.s16       q13, d16, d0
-    vmull.s16       q15, d17, d0
-
-    ; (input[0] + input[2]) * cospi_16_64
-    vmlal.s16       q2,  d24, d0
-    vmlal.s16       q3, d25, d0
-
-    ; (input[0] - input[2]) * cospi_16_64
-    vmlsl.s16       q13, d24, d0
-    vmlsl.s16       q15, d25, d0
-
-    vdup.16         d0, r8                    ; duplicate cospi_24_64
-    vdup.16         d1, r9                    ; duplicate cospi_8_64
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d18, q2, #14              ; >> 14
-    vqrshrn.s32     d19, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d22, q13, #14             ; >> 14
-    vqrshrn.s32     d23, q15, #14             ; >> 14
-
-    ; input[1] * cospi_24_64
-    vmull.s16       q2, d20, d0
-    vmull.s16       q3, d21, d0
-
-    ; input[1] * cospi_8_64
-    vmull.s16       q8, d20, d1
-    vmull.s16       q12, d21, d1
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    vmlsl.s16       q2, d28, d1
-    vmlsl.s16       q3, d29, d1
-
-    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
-    vmlal.s16       q8, d28, d0
-    vmlal.s16       q12, d29, d0
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d26, q2, #14              ; >> 14
-    vqrshrn.s32     d27, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d30, q8, #14              ; >> 14
-    vqrshrn.s32     d31, q12, #14             ; >> 14
-
-    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
-    vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
-    vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
-    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
-
-    ; stage 3 -odd half
-    vdup.16         d16, r7                   ; duplicate cospi_16_64
-
-    ; stage 2 - odd half
-    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
-    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
-    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
-    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q11, d28, d16
-    vmull.s16       q12, d29, d16
-
-    ; (step2[6] - step2[5]) * cospi_16_64
-    vmlsl.s16       q9, d26, d16
-    vmlsl.s16       q10, d27, d16
-
-    ; (step2[5] + step2[6]) * cospi_16_64
-    vmlal.s16       q11, d26, d16
-    vmlal.s16       q12, d27, d16
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d10, q9, #14              ; >> 14
-    vqrshrn.s32     d11, q10, #14             ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrshrn.s32     d12, q11, #14             ; >> 14
-    vqrshrn.s32     d13, q12, #14             ; >> 14
-
-    ; stage 4
-    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
-    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
-    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
-    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
-    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
-    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
-    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
-    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
-    MEND
-
-    ; Parallel 1D IADST on all the columns of a 8x8 16bits data matrix which
-    ; loaded in q8-q15. IADST constants are loaded in r0 - r12 registers. The
-    ; output will be stored back into q8-q15 registers. This macro will touch
-    ; q0 - q7 registers and use them as buffer during calculation.
-    MACRO
-    IADST8X8_1D
-    vdup.16         d14, r0                   ; duplicate cospi_2_64
-    vdup.16         d15, r1                   ; duplicate cospi_30_64
-
-    ; cospi_2_64  * x0
-    vmull.s16       q1, d30, d14
-    vmull.s16       q2, d31, d14
-
-    ; cospi_30_64 * x0
-    vmull.s16       q3, d30, d15
-    vmull.s16       q4, d31, d15
-
-    vdup.16         d30, r4                   ; duplicate cospi_18_64
-    vdup.16         d31, r5                   ; duplicate cospi_14_64
-
-    ; s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
-    vmlal.s16       q1, d16, d15
-    vmlal.s16       q2, d17, d15
-
-    ; s1 = cospi_30_64 * x0 - cospi_2_64  * x1
-    vmlsl.s16       q3, d16, d14
-    vmlsl.s16       q4, d17, d14
-
-    ; cospi_18_64 * x4
-    vmull.s16       q5, d22, d30
-    vmull.s16       q6, d23, d30
-
-    ; cospi_14_64 * x4
-    vmull.s16       q7, d22, d31
-    vmull.s16       q8, d23, d31
-
-    ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
-    vmlal.s16       q5, d24, d31
-    vmlal.s16       q6, d25, d31
-
-    ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5
-    vmlsl.s16       q7, d24, d30
-    vmlsl.s16       q8, d25, d30
-
-    ; (s0 + s4)
-    vadd.s32        q11, q1, q5
-    vadd.s32        q12, q2, q6
-
-    vdup.16         d0, r2                   ; duplicate cospi_10_64
-    vdup.16         d1, r3                   ; duplicate cospi_22_64
-
-    ; (s0 - s4)
-    vsub.s32        q1, q1, q5
-    vsub.s32        q2, q2, q6
-
-    ; x0 = dct_const_round_shift(s0 + s4);
-    vqrshrn.s32     d22, q11, #14             ; >> 14
-    vqrshrn.s32     d23, q12, #14             ; >> 14
-
-    ; (s1 + s5)
-    vadd.s32        q12, q3, q7
-    vadd.s32        q15, q4, q8
-
-    ; (s1 - s5)
-    vsub.s32        q3, q3, q7
-    vsub.s32        q4, q4, q8
-
-    ; x4 = dct_const_round_shift(s0 - s4);
-    vqrshrn.s32     d2, q1, #14               ; >> 14
-    vqrshrn.s32     d3, q2, #14               ; >> 14
-
-    ; x1 = dct_const_round_shift(s1 + s5);
-    vqrshrn.s32     d24, q12, #14             ; >> 14
-    vqrshrn.s32     d25, q15, #14             ; >> 14
-
-    ; x5 = dct_const_round_shift(s1 - s5);
-    vqrshrn.s32     d6, q3, #14               ; >> 14
-    vqrshrn.s32     d7, q4, #14               ; >> 14
-
-    ; cospi_10_64 * x2
-    vmull.s16       q4, d26, d0
-    vmull.s16       q5, d27, d0
-
-    ; cospi_22_64 * x2
-    vmull.s16       q2, d26, d1
-    vmull.s16       q6, d27, d1
-
-    vdup.16         d30, r6                   ; duplicate cospi_26_64
-    vdup.16         d31, r7                   ; duplicate cospi_6_64
-
-    ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
-    vmlal.s16       q4, d20, d1
-    vmlal.s16       q5, d21, d1
-
-    ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
-    vmlsl.s16       q2, d20, d0
-    vmlsl.s16       q6, d21, d0
-
-    ; cospi_26_64 * x6
-    vmull.s16       q0, d18, d30
-    vmull.s16       q13, d19, d30
-
-    ; s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
-    vmlal.s16       q0, d28, d31
-    vmlal.s16       q13, d29, d31
-
-    ; cospi_6_64  * x6
-    vmull.s16       q10, d18, d31
-    vmull.s16       q9, d19, d31
-
-    ; s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
-    vmlsl.s16       q10, d28, d30
-    vmlsl.s16       q9, d29, d30
-
-    ; (s3 + s7)
-    vadd.s32        q14, q2, q10
-    vadd.s32        q15, q6, q9
-
-    ; (s3 - s7)
-    vsub.s32        q2, q2, q10
-    vsub.s32        q6, q6, q9
-
-    ; x3 = dct_const_round_shift(s3 + s7);
-    vqrshrn.s32     d28, q14, #14             ; >> 14
-    vqrshrn.s32     d29, q15, #14             ; >> 14
-
-    ; x7 = dct_const_round_shift(s3 - s7);
-    vqrshrn.s32     d4, q2, #14               ; >> 14
-    vqrshrn.s32     d5, q6, #14               ; >> 14
-
-    ; (s2 + s6)
-    vadd.s32        q9, q4, q0
-    vadd.s32        q10, q5, q13
-
-    ; (s2 - s6)
-    vsub.s32        q4, q4, q0
-    vsub.s32        q5, q5, q13
-
-    vdup.16         d30, r8                   ; duplicate cospi_8_64
-    vdup.16         d31, r9                   ; duplicate cospi_24_64
-
-    ; x2 = dct_const_round_shift(s2 + s6);
-    vqrshrn.s32     d18, q9, #14              ; >> 14
-    vqrshrn.s32     d19, q10, #14             ; >> 14
-
-    ; x6 = dct_const_round_shift(s2 - s6);
-    vqrshrn.s32     d8, q4, #14               ; >> 14
-    vqrshrn.s32     d9, q5, #14               ; >> 14
-
-    ; cospi_8_64  * x4
-    vmull.s16       q5, d2, d30
-    vmull.s16       q6, d3, d30
-
-    ; cospi_24_64 * x4
-    vmull.s16       q7, d2, d31
-    vmull.s16       q0, d3, d31
-
-    ; s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
-    vmlal.s16       q5, d6, d31
-    vmlal.s16       q6, d7, d31
-
-    ; s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
-    vmlsl.s16       q7, d6, d30
-    vmlsl.s16       q0, d7, d30
-
-    ; cospi_8_64  * x7
-    vmull.s16       q1, d4, d30
-    vmull.s16       q3, d5, d30
-
-    ; cospi_24_64 * x7
-    vmull.s16       q10, d4, d31
-    vmull.s16       q2, d5, d31
-
-    ; s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
-    vmlsl.s16       q1, d8, d31
-    vmlsl.s16       q3, d9, d31
-
-    ; s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
-    vmlal.s16       q10, d8, d30
-    vmlal.s16       q2, d9, d30
-
-    vadd.s16        q8, q11, q9               ; x0 = s0 + s2;
-
-    vsub.s16        q11, q11, q9              ; x2 = s0 - s2;
-
-    vadd.s16        q4, q12, q14              ; x1 = s1 + s3;
-
-    vsub.s16        q12, q12, q14             ; x3 = s1 - s3;
-
-    ; (s4 + s6)
-    vadd.s32        q14, q5, q1
-    vadd.s32        q15, q6, q3
-
-    ; (s4 - s6)
-    vsub.s32        q5, q5, q1
-    vsub.s32        q6, q6, q3
-
-    ; x4 = dct_const_round_shift(s4 + s6);
-    vqrshrn.s32     d18, q14, #14             ; >> 14
-    vqrshrn.s32     d19, q15, #14             ; >> 14
-
-    ; x6 = dct_const_round_shift(s4 - s6);
-    vqrshrn.s32     d10, q5, #14              ; >> 14
-    vqrshrn.s32     d11, q6, #14              ; >> 14
-
-    ; (s5 + s7)
-    vadd.s32        q1, q7, q10
-    vadd.s32        q3, q0, q2
-
-    ; (s5 - s7))
-    vsub.s32        q7, q7, q10
-    vsub.s32        q0, q0, q2
-
-    ; x5 = dct_const_round_shift(s5 + s7);
-    vqrshrn.s32     d28, q1, #14               ; >> 14
-    vqrshrn.s32     d29, q3, #14               ; >> 14
-
-    ; x7 = dct_const_round_shift(s5 - s7);
-    vqrshrn.s32     d14, q7, #14              ; >> 14
-    vqrshrn.s32     d15, q0, #14              ; >> 14
-
-    vdup.16         d30, r12                  ; duplicate cospi_16_64
-
-    ; cospi_16_64 * x2
-    vmull.s16       q2, d22, d30
-    vmull.s16       q3, d23, d30
-
-    ; cospi_6_64  * x6
-    vmull.s16       q13, d22, d30
-    vmull.s16       q1, d23, d30
-
-    ; cospi_16_64 * x2 + cospi_16_64  * x3;
-    vmlal.s16       q2, d24, d30
-    vmlal.s16       q3, d25, d30
-
-    ; cospi_16_64 * x2 - cospi_16_64  * x3;
-    vmlsl.s16       q13, d24, d30
-    vmlsl.s16       q1, d25, d30
-
-    ; x2 = dct_const_round_shift(s2);
-    vqrshrn.s32     d4, q2, #14               ; >> 14
-    vqrshrn.s32     d5, q3, #14               ; >> 14
-
-    ;x3 = dct_const_round_shift(s3);
-    vqrshrn.s32     d24, q13, #14             ; >> 14
-    vqrshrn.s32     d25, q1, #14              ; >> 14
-
-    ; cospi_16_64 * x6
-    vmull.s16       q13, d10, d30
-    vmull.s16       q1, d11, d30
-
-    ; cospi_6_64  * x6
-    vmull.s16       q11, d10, d30
-    vmull.s16       q0, d11, d30
-
-    ; cospi_16_64 * x6 + cospi_16_64  * x7;
-    vmlal.s16       q13, d14, d30
-    vmlal.s16       q1, d15, d30
-
-    ; cospi_16_64 * x6 - cospi_16_64  * x7;
-    vmlsl.s16       q11, d14, d30
-    vmlsl.s16       q0, d15, d30
-
-    ; x6 = dct_const_round_shift(s6);
-    vqrshrn.s32     d20, q13, #14             ; >> 14
-    vqrshrn.s32     d21, q1, #14              ; >> 14
-
-    ;x7 = dct_const_round_shift(s7);
-    vqrshrn.s32     d12, q11, #14             ; >> 14
-    vqrshrn.s32     d13, q0, #14              ; >> 14
-
-    vdup.16         q5, r10                   ; duplicate 0
-
-    vsub.s16        q9, q5, q9                ; output[1] = -x4;
-    vsub.s16        q11, q5, q2               ; output[3] = -x2;
-    vsub.s16        q13, q5, q6               ; output[5] = -x7;
-    vsub.s16        q15, q5, q4               ; output[7] = -x1;
-    MEND
-
-
-    AREA     Block, CODE, READONLY ; name this block of code
-;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest,
-;                               int dest_stride, int tx_type)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int dest_stride
-; r3  int tx_type)
-; This function will only handle tx_type of 1,2,3.
-|vp9_iht8x8_64_add_neon| PROC
-
-    ; load the inputs into d16-d19
-    vld1.s16        {q8,q9}, [r0]!
-    vld1.s16        {q10,q11}, [r0]!
-    vld1.s16        {q12,q13}, [r0]!
-    vld1.s16        {q14,q15}, [r0]!
-
-    push            {r0-r10}
-    vpush           {d8-d15}
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; decide the type of transform
-    cmp         r3, #2
-    beq         idct_iadst
-    cmp         r3, #3
-    beq         iadst_iadst
-
-iadst_idct
-    ; generate IDCT constants
-    GENERATE_IDCT_CONSTANTS
-
-    ; first transform rows
-    IDCT8x8_1D
-
-    ; transpose the matrix
-    TRANSPOSE8X8
-
-    ; generate IADST constants
-    GENERATE_IADST_CONSTANTS
-
-    ; then transform columns
-    IADST8X8_1D
-
-    b end_vp9_iht8x8_64_add_neon
-
-idct_iadst
-    ; generate IADST constants
-    GENERATE_IADST_CONSTANTS
-
-    ; first transform rows
-    IADST8X8_1D
-
-    ; transpose the matrix
-    TRANSPOSE8X8
-
-    ; generate IDCT constants
-    GENERATE_IDCT_CONSTANTS
-
-    ; then transform columns
-    IDCT8x8_1D
-
-    b end_vp9_iht8x8_64_add_neon
-
-iadst_iadst
-    ; generate IADST constants
-    GENERATE_IADST_CONSTANTS
-
-    ; first transform rows
-    IADST8X8_1D
-
-    ; transpose the matrix
-    TRANSPOSE8X8
-
-    ; then transform columns
-    IADST8X8_1D
-
-end_vp9_iht8x8_64_add_neon
-    vpop           {d8-d15}
-    pop            {r0-r10}
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
-    vrshr.s16       q8, q8, #5
-    vrshr.s16       q9, q9, #5
-    vrshr.s16       q10, q10, #5
-    vrshr.s16       q11, q11, #5
-    vrshr.s16       q12, q12, #5
-    vrshr.s16       q13, q13, #5
-    vrshr.s16       q14, q14, #5
-    vrshr.s16       q15, q15, #5
-
-    ; save dest pointer
-    mov             r0, r1
-
-    ; load destination data
-    vld1.64         {d0}, [r1], r2
-    vld1.64         {d1}, [r1], r2
-    vld1.64         {d2}, [r1], r2
-    vld1.64         {d3}, [r1], r2
-    vld1.64         {d4}, [r1], r2
-    vld1.64         {d5}, [r1], r2
-    vld1.64         {d6}, [r1], r2
-    vld1.64         {d7}, [r1]
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
-    vaddw.u8        q8, q8, d0
-    vaddw.u8        q9, q9, d1
-    vaddw.u8        q10, q10, d2
-    vaddw.u8        q11, q11, d3
-    vaddw.u8        q12, q12, d4
-    vaddw.u8        q13, q13, d5
-    vaddw.u8        q14, q14, d6
-    vaddw.u8        q15, q15, d7
-
-    ; clip_pixel
-    vqmovun.s16     d0, q8
-    vqmovun.s16     d1, q9
-    vqmovun.s16     d2, q10
-    vqmovun.s16     d3, q11
-    vqmovun.s16     d4, q12
-    vqmovun.s16     d5, q13
-    vqmovun.s16     d6, q14
-    vqmovun.s16     d7, q15
-
-    ; store the data
-    vst1.64         {d0}, [r0], r2
-    vst1.64         {d1}, [r0], r2
-    vst1.64         {d2}, [r0], r2
-    vst1.64         {d3}, [r0], r2
-    vst1.64         {d4}, [r0], r2
-    vst1.64         {d5}, [r0], r2
-    vst1.64         {d6}, [r0], r2
-    vst1.64         {d7}, [r0], r2
-    bx          lr
-    ENDP  ; |vp9_iht8x8_64_add_neon|
-
-    END
diff --git a/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
new file mode 100644
index 0000000..03c836d
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
@@ -0,0 +1,623 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+
+static int16_t cospi_2_64 = 16305;
+static int16_t cospi_4_64 = 16069;
+static int16_t cospi_6_64 = 15679;
+static int16_t cospi_8_64 = 15137;
+static int16_t cospi_10_64 = 14449;
+static int16_t cospi_12_64 = 13623;
+static int16_t cospi_14_64 = 12665;
+static int16_t cospi_16_64 = 11585;
+static int16_t cospi_18_64 = 10394;
+static int16_t cospi_20_64 = 9102;
+static int16_t cospi_22_64 = 7723;
+static int16_t cospi_24_64 = 6270;
+static int16_t cospi_26_64 = 4756;
+static int16_t cospi_28_64 = 3196;
+static int16_t cospi_30_64 = 1606;
+
+static inline void TRANSPOSE8X8(
+        int16x8_t *q8s16,
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    *q8s16  = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
+    *q9s16  = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
+    *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
+    *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
+    *q12s16 = vcombine_s16(d17s16, d25s16);
+    *q13s16 = vcombine_s16(d19s16, d27s16);
+    *q14s16 = vcombine_s16(d21s16, d29s16);
+    *q15s16 = vcombine_s16(d23s16, d31s16);
+
+    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+                        vreinterpretq_s32_s16(*q10s16));
+    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+                        vreinterpretq_s32_s16(*q11s16));
+    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+                        vreinterpretq_s32_s16(*q14s16));
+    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+                        vreinterpretq_s32_s16(*q15s16));
+
+    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
+                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
+    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
+                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
+    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
+                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
+    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
+                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15
+
+    *q8s16  = q0x2s16.val[0];
+    *q9s16  = q0x2s16.val[1];
+    *q10s16 = q1x2s16.val[0];
+    *q11s16 = q1x2s16.val[1];
+    *q12s16 = q2x2s16.val[0];
+    *q13s16 = q2x2s16.val[1];
+    *q14s16 = q3x2s16.val[0];
+    *q15s16 = q3x2s16.val[1];
+    return;
+}
+
+static inline void IDCT8x8_1D(
+        int16x8_t *q8s16,
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+    int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+    int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+    d0s16 = vdup_n_s16(cospi_28_64);
+    d1s16 = vdup_n_s16(cospi_4_64);
+    d2s16 = vdup_n_s16(cospi_12_64);
+    d3s16 = vdup_n_s16(cospi_20_64);
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    q2s32 = vmull_s16(d18s16, d0s16);
+    q3s32 = vmull_s16(d19s16, d0s16);
+    q5s32 = vmull_s16(d26s16, d2s16);
+    q6s32 = vmull_s16(d27s16, d2s16);
+
+    q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+    q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+    q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+    q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+    d8s16  = vqrshrn_n_s32(q2s32, 14);
+    d9s16  = vqrshrn_n_s32(q3s32, 14);
+    d10s16 = vqrshrn_n_s32(q5s32, 14);
+    d11s16 = vqrshrn_n_s32(q6s32, 14);
+    q4s16 = vcombine_s16(d8s16, d9s16);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+
+    q2s32 = vmull_s16(d18s16, d1s16);
+    q3s32 = vmull_s16(d19s16, d1s16);
+    q9s32 = vmull_s16(d26s16, d3s16);
+    q13s32 = vmull_s16(d27s16, d3s16);
+
+    q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+    q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+    q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+    q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+    d14s16 = vqrshrn_n_s32(q2s32, 14);
+    d15s16 = vqrshrn_n_s32(q3s32, 14);
+    d12s16 = vqrshrn_n_s32(q9s32, 14);
+    d13s16 = vqrshrn_n_s32(q13s32, 14);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+    q7s16 = vcombine_s16(d14s16, d15s16);
+
+    d0s16 = vdup_n_s16(cospi_16_64);
+
+    q2s32 = vmull_s16(d16s16, d0s16);
+    q3s32 = vmull_s16(d17s16, d0s16);
+    q13s32 = vmull_s16(d16s16, d0s16);
+    q15s32 = vmull_s16(d17s16, d0s16);
+
+    q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+    q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+    q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+    q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+    d0s16 = vdup_n_s16(cospi_24_64);
+    d1s16 = vdup_n_s16(cospi_8_64);
+
+    d18s16 = vqrshrn_n_s32(q2s32, 14);
+    d19s16 = vqrshrn_n_s32(q3s32, 14);
+    d22s16 = vqrshrn_n_s32(q13s32, 14);
+    d23s16 = vqrshrn_n_s32(q15s32, 14);
+    *q9s16  = vcombine_s16(d18s16, d19s16);
+    *q11s16 = vcombine_s16(d22s16, d23s16);
+
+    q2s32 = vmull_s16(d20s16, d0s16);
+    q3s32 = vmull_s16(d21s16, d0s16);
+    q8s32 = vmull_s16(d20s16, d1s16);
+    q12s32 = vmull_s16(d21s16, d1s16);
+
+    q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+    q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+    q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+    q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+    d26s16 = vqrshrn_n_s32(q2s32, 14);
+    d27s16 = vqrshrn_n_s32(q3s32, 14);
+    d30s16 = vqrshrn_n_s32(q8s32, 14);
+    d31s16 = vqrshrn_n_s32(q12s32, 14);
+    *q13s16 = vcombine_s16(d26s16, d27s16);
+    *q15s16 = vcombine_s16(d30s16, d31s16);
+
+    q0s16 = vaddq_s16(*q9s16, *q15s16);
+    q1s16 = vaddq_s16(*q11s16, *q13s16);
+    q2s16 = vsubq_s16(*q11s16, *q13s16);
+    q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+    *q13s16 = vsubq_s16(q4s16, q5s16);
+    q4s16   = vaddq_s16(q4s16, q5s16);
+    *q14s16 = vsubq_s16(q7s16, q6s16);
+    q7s16   = vaddq_s16(q7s16, q6s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+
+    d16s16 = vdup_n_s16(cospi_16_64);
+
+    q9s32  = vmull_s16(d28s16, d16s16);
+    q10s32 = vmull_s16(d29s16, d16s16);
+    q11s32 = vmull_s16(d28s16, d16s16);
+    q12s32 = vmull_s16(d29s16, d16s16);
+
+    q9s32  = vmlsl_s16(q9s32,  d26s16, d16s16);
+    q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+    q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+    d10s16 = vqrshrn_n_s32(q9s32, 14);
+    d11s16 = vqrshrn_n_s32(q10s32, 14);
+    d12s16 = vqrshrn_n_s32(q11s32, 14);
+    d13s16 = vqrshrn_n_s32(q12s32, 14);
+    q5s16 = vcombine_s16(d10s16, d11s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    *q8s16  = vaddq_s16(q0s16, q7s16);
+    *q9s16  = vaddq_s16(q1s16, q6s16);
+    *q10s16 = vaddq_s16(q2s16, q5s16);
+    *q11s16 = vaddq_s16(q3s16, q4s16);
+    *q12s16 = vsubq_s16(q3s16, q4s16);
+    *q13s16 = vsubq_s16(q2s16, q5s16);
+    *q14s16 = vsubq_s16(q1s16, q6s16);
+    *q15s16 = vsubq_s16(q0s16, q7s16);
+    return;
+}
+
+static inline void IADST8X8_1D(
+        int16x8_t *q8s16,
+        int16x8_t *q9s16,
+        int16x8_t *q10s16,
+        int16x8_t *q11s16,
+        int16x8_t *q12s16,
+        int16x8_t *q13s16,
+        int16x8_t *q14s16,
+        int16x8_t *q15s16) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+    int16x8_t q2s16, q4s16, q5s16, q6s16;
+    int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
+    int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+    d16s16 = vget_low_s16(*q8s16);
+    d17s16 = vget_high_s16(*q8s16);
+    d18s16 = vget_low_s16(*q9s16);
+    d19s16 = vget_high_s16(*q9s16);
+    d20s16 = vget_low_s16(*q10s16);
+    d21s16 = vget_high_s16(*q10s16);
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    d26s16 = vget_low_s16(*q13s16);
+    d27s16 = vget_high_s16(*q13s16);
+    d28s16 = vget_low_s16(*q14s16);
+    d29s16 = vget_high_s16(*q14s16);
+    d30s16 = vget_low_s16(*q15s16);
+    d31s16 = vget_high_s16(*q15s16);
+
+    d14s16 = vdup_n_s16(cospi_2_64);
+    d15s16 = vdup_n_s16(cospi_30_64);
+
+    q1s32 = vmull_s16(d30s16, d14s16);
+    q2s32 = vmull_s16(d31s16, d14s16);
+    q3s32 = vmull_s16(d30s16, d15s16);
+    q4s32 = vmull_s16(d31s16, d15s16);
+
+    d30s16 = vdup_n_s16(cospi_18_64);
+    d31s16 = vdup_n_s16(cospi_14_64);
+
+    q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
+    q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
+    q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
+    q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);
+
+    q5s32 = vmull_s16(d22s16, d30s16);
+    q6s32 = vmull_s16(d23s16, d30s16);
+    q7s32 = vmull_s16(d22s16, d31s16);
+    q8s32 = vmull_s16(d23s16, d31s16);
+
+    q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
+    q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
+    q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
+    q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);
+
+    q11s32 = vaddq_s32(q1s32, q5s32);
+    q12s32 = vaddq_s32(q2s32, q6s32);
+    q1s32 = vsubq_s32(q1s32, q5s32);
+    q2s32 = vsubq_s32(q2s32, q6s32);
+
+    d22s16 = vqrshrn_n_s32(q11s32, 14);
+    d23s16 = vqrshrn_n_s32(q12s32, 14);
+    *q11s16 = vcombine_s16(d22s16, d23s16);
+
+    q12s32 = vaddq_s32(q3s32, q7s32);
+    q15s32 = vaddq_s32(q4s32, q8s32);
+    q3s32 = vsubq_s32(q3s32, q7s32);
+    q4s32 = vsubq_s32(q4s32, q8s32);
+
+    d2s16  = vqrshrn_n_s32(q1s32, 14);
+    d3s16  = vqrshrn_n_s32(q2s32, 14);
+    d24s16 = vqrshrn_n_s32(q12s32, 14);
+    d25s16 = vqrshrn_n_s32(q15s32, 14);
+    d6s16  = vqrshrn_n_s32(q3s32, 14);
+    d7s16  = vqrshrn_n_s32(q4s32, 14);
+    *q12s16 = vcombine_s16(d24s16, d25s16);
+
+    d0s16 = vdup_n_s16(cospi_10_64);
+    d1s16 = vdup_n_s16(cospi_22_64);
+    q4s32 = vmull_s16(d26s16, d0s16);
+    q5s32 = vmull_s16(d27s16, d0s16);
+    q2s32 = vmull_s16(d26s16, d1s16);
+    q6s32 = vmull_s16(d27s16, d1s16);
+
+    d30s16 = vdup_n_s16(cospi_26_64);
+    d31s16 = vdup_n_s16(cospi_6_64);
+
+    q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
+    q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
+    q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
+    q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);
+
+    q0s32 = vmull_s16(d18s16, d30s16);
+    q13s32 = vmull_s16(d19s16, d30s16);
+
+    q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
+    q13s32 = vmlal_s16(q13s32, d29s16, d31s16);
+
+    q10s32 = vmull_s16(d18s16, d31s16);
+    q9s32 = vmull_s16(d19s16, d31s16);
+
+    q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
+    q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);
+
+    q14s32 = vaddq_s32(q2s32, q10s32);
+    q15s32 = vaddq_s32(q6s32, q9s32);
+    q2s32 = vsubq_s32(q2s32, q10s32);
+    q6s32 = vsubq_s32(q6s32, q9s32);
+
+    d28s16 = vqrshrn_n_s32(q14s32, 14);
+    d29s16 = vqrshrn_n_s32(q15s32, 14);
+    d4s16 = vqrshrn_n_s32(q2s32, 14);
+    d5s16 = vqrshrn_n_s32(q6s32, 14);
+    *q14s16 = vcombine_s16(d28s16, d29s16);
+
+    q9s32 = vaddq_s32(q4s32, q0s32);
+    q10s32 = vaddq_s32(q5s32, q13s32);
+    q4s32 = vsubq_s32(q4s32, q0s32);
+    q5s32 = vsubq_s32(q5s32, q13s32);
+
+    d30s16 = vdup_n_s16(cospi_8_64);
+    d31s16 = vdup_n_s16(cospi_24_64);
+
+    d18s16 = vqrshrn_n_s32(q9s32, 14);
+    d19s16 = vqrshrn_n_s32(q10s32, 14);
+    d8s16 = vqrshrn_n_s32(q4s32, 14);
+    d9s16 = vqrshrn_n_s32(q5s32, 14);
+    *q9s16 = vcombine_s16(d18s16, d19s16);
+
+    q5s32 = vmull_s16(d2s16, d30s16);
+    q6s32 = vmull_s16(d3s16, d30s16);
+    q7s32 = vmull_s16(d2s16, d31s16);
+    q0s32 = vmull_s16(d3s16, d31s16);
+
+    q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
+    q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
+    q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
+    q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);
+
+    q1s32 = vmull_s16(d4s16, d30s16);
+    q3s32 = vmull_s16(d5s16, d30s16);
+    q10s32 = vmull_s16(d4s16, d31s16);
+    q2s32 = vmull_s16(d5s16, d31s16);
+
+    q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
+    q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
+    q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
+    q2s32 = vmlal_s16(q2s32, d9s16, d30s16);
+
+    *q8s16 = vaddq_s16(*q11s16, *q9s16);
+    *q11s16 = vsubq_s16(*q11s16, *q9s16);
+    q4s16 = vaddq_s16(*q12s16, *q14s16);
+    *q12s16 = vsubq_s16(*q12s16, *q14s16);
+
+    q14s32 = vaddq_s32(q5s32, q1s32);
+    q15s32 = vaddq_s32(q6s32, q3s32);
+    q5s32 = vsubq_s32(q5s32, q1s32);
+    q6s32 = vsubq_s32(q6s32, q3s32);
+
+    d18s16 = vqrshrn_n_s32(q14s32, 14);
+    d19s16 = vqrshrn_n_s32(q15s32, 14);
+    d10s16 = vqrshrn_n_s32(q5s32, 14);
+    d11s16 = vqrshrn_n_s32(q6s32, 14);
+    *q9s16 = vcombine_s16(d18s16, d19s16);
+
+    q1s32 = vaddq_s32(q7s32, q10s32);
+    q3s32 = vaddq_s32(q0s32, q2s32);
+    q7s32 = vsubq_s32(q7s32, q10s32);
+    q0s32 = vsubq_s32(q0s32, q2s32);
+
+    d28s16 = vqrshrn_n_s32(q1s32, 14);
+    d29s16 = vqrshrn_n_s32(q3s32, 14);
+    d14s16 = vqrshrn_n_s32(q7s32, 14);
+    d15s16 = vqrshrn_n_s32(q0s32, 14);
+    *q14s16 = vcombine_s16(d28s16, d29s16);
+
+    d30s16 = vdup_n_s16(cospi_16_64);
+
+    d22s16 = vget_low_s16(*q11s16);
+    d23s16 = vget_high_s16(*q11s16);
+    q2s32 = vmull_s16(d22s16, d30s16);
+    q3s32 = vmull_s16(d23s16, d30s16);
+    q13s32 = vmull_s16(d22s16, d30s16);
+    q1s32 = vmull_s16(d23s16, d30s16);
+
+    d24s16 = vget_low_s16(*q12s16);
+    d25s16 = vget_high_s16(*q12s16);
+    q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
+    q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
+    q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
+    q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);
+
+    d4s16 = vqrshrn_n_s32(q2s32, 14);
+    d5s16 = vqrshrn_n_s32(q3s32, 14);
+    d24s16 = vqrshrn_n_s32(q13s32, 14);
+    d25s16 = vqrshrn_n_s32(q1s32, 14);
+    q2s16 = vcombine_s16(d4s16, d5s16);
+    *q12s16 = vcombine_s16(d24s16, d25s16);
+
+    q13s32 = vmull_s16(d10s16, d30s16);
+    q1s32 = vmull_s16(d11s16, d30s16);
+    q11s32 = vmull_s16(d10s16, d30s16);
+    q0s32 = vmull_s16(d11s16, d30s16);
+
+    q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
+    q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
+    q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
+    q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);
+
+    d20s16 = vqrshrn_n_s32(q13s32, 14);
+    d21s16 = vqrshrn_n_s32(q1s32, 14);
+    d12s16 = vqrshrn_n_s32(q11s32, 14);
+    d13s16 = vqrshrn_n_s32(q0s32, 14);
+    *q10s16 = vcombine_s16(d20s16, d21s16);
+    q6s16 = vcombine_s16(d12s16, d13s16);
+
+    q5s16 = vdupq_n_s16(0);
+
+    *q9s16  = vsubq_s16(q5s16, *q9s16);
+    *q11s16 = vsubq_s16(q5s16, q2s16);
+    *q13s16 = vsubq_s16(q5s16, q6s16);
+    *q15s16 = vsubq_s16(q5s16, q4s16);
+    return;
+}
+
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
+                            int dest_stride, int tx_type) {
+    int i;
+    uint8_t *d1, *d2;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8;
+    uint64x1_t d0u64, d1u64, d2u64, d3u64;
+    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+    q8s16  = vld1q_s16(input);
+    q9s16  = vld1q_s16(input + 8);
+    q10s16 = vld1q_s16(input + 8 * 2);
+    q11s16 = vld1q_s16(input + 8 * 3);
+    q12s16 = vld1q_s16(input + 8 * 4);
+    q13s16 = vld1q_s16(input + 8 * 5);
+    q14s16 = vld1q_s16(input + 8 * 6);
+    q15s16 = vld1q_s16(input + 8 * 7);
+
+    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                 &q12s16, &q13s16, &q14s16, &q15s16);
+
+    switch (tx_type) {
+      case 0:  // idct_idct is not supported. Fall back to C
+        vp9_iht8x8_64_add_c(input, dest, dest_stride, tx_type);
+        return;
+        break;
+      case 1:  // iadst_idct
+        // generate IDCT constants
+        // GENERATE_IDCT_CONSTANTS
+
+        // first transform rows
+        IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                   &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // transpose the matrix
+        TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                     &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // generate IADST constants
+        // GENERATE_IADST_CONSTANTS
+
+        // then transform columns
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+        break;
+      case 2:  // idct_iadst
+        // generate IADST constants
+        // GENERATE_IADST_CONSTANTS
+
+        // first transform rows
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // transpose the matrix
+        TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                     &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // generate IDCT constants
+        // GENERATE_IDCT_CONSTANTS
+
+        // then transform columns
+        IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                   &q12s16, &q13s16, &q14s16, &q15s16);
+        break;
+      case 3:  // iadst_iadst
+        // generate IADST constants
+        // GENERATE_IADST_CONSTANTS
+
+        // first transform rows
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // transpose the matrix
+        TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+                     &q12s16, &q13s16, &q14s16, &q15s16);
+
+        // then transform columns
+        IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+                    &q12s16, &q13s16, &q14s16, &q15s16);
+        break;
+      default:  // unreachable: tx_type 0-3 are all handled above
+        assert(0);
+        break;
+    }
+
+    q8s16 = vrshrq_n_s16(q8s16, 5);
+    q9s16 = vrshrq_n_s16(q9s16, 5);
+    q10s16 = vrshrq_n_s16(q10s16, 5);
+    q11s16 = vrshrq_n_s16(q11s16, 5);
+    q12s16 = vrshrq_n_s16(q12s16, 5);
+    q13s16 = vrshrq_n_s16(q13s16, 5);
+    q14s16 = vrshrq_n_s16(q14s16, 5);
+    q15s16 = vrshrq_n_s16(q15s16, 5);
+
+    for (d1 = d2 = dest, i = 0; i < 2; i++) {
+        if (i != 0) {
+            q8s16 = q12s16;
+            q9s16 = q13s16;
+            q10s16 = q14s16;
+            q11s16 = q15s16;
+        }
+
+        d0u64 = vld1_u64((uint64_t *)d1);
+        d1 += dest_stride;
+        d1u64 = vld1_u64((uint64_t *)d1);
+        d1 += dest_stride;
+        d2u64 = vld1_u64((uint64_t *)d1);
+        d1 += dest_stride;
+        d3u64 = vld1_u64((uint64_t *)d1);
+        d1 += dest_stride;
+
+        q8u16  = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+                          vreinterpret_u8_u64(d0u64));
+        q9u16  = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+                          vreinterpret_u8_u64(d1u64));
+        q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+                          vreinterpret_u8_u64(d2u64));
+        q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+                          vreinterpret_u8_u64(d3u64));
+
+        d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+        d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+        d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+        d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+        d2 += dest_stride;
+        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+        d2 += dest_stride;
+    }
+    return;
+}
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 284d3a2..cad5750 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -44,7 +44,10 @@
     vp9_free_frame_buffer(&cm->frame_bufs[i].buf);
   }
 
+#if CONFIG_VP9_POSTPROC
   vp9_free_frame_buffer(&cm->post_proc_buffer);
+  vp9_free_frame_buffer(&cm->post_proc_buffer_int);
+#endif
 }
 
 void vp9_free_context_buffers(VP9_COMMON *cm) {
@@ -125,7 +128,7 @@
 
   init_frame_bufs(cm);
 
-#if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC
+#if CONFIG_VP9_POSTPROC
   if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
 #if CONFIG_VP9_HIGHBITDEPTH
                              cm->use_highbitdepth,
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 7d7209c..ebb1d1d 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -212,6 +212,12 @@
   /* pointer to current frame */
   const YV12_BUFFER_CONFIG *cur_buf;
 
+  ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
+
+  PARTITION_CONTEXT *above_seg_context;
+  PARTITION_CONTEXT left_seg_context[8];
+
   /* mc buffer */
   DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
 
@@ -221,17 +227,10 @@
   DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
 #endif
 
-  int lossless;
-
-  int corrupted;
-
   DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
 
-  ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
-  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
-
-  PARTITION_CONTEXT *above_seg_context;
-  PARTITION_CONTEXT left_seg_context[8];
+  int lossless;
+  int corrupted;
 } MACROBLOCKD;
 
 static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 43a4fe5..58b2da7 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -968,7 +968,7 @@
       break;
   }
   // The largest loopfilter we have is 16x16 so we use the 16x16 mask
-  // for 32x32 transforms also also.
+  // for 32x32 transforms also.
   lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
   lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
   lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c
new file mode 100644
index 0000000..f1bdc1b
--- /dev/null
+++ b/vp9/common/vp9_mfqe.c
@@ -0,0 +1,314 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
+#include "./vp9_rtcd.h"
+
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_postproc.h"
+
+// TODO(jackychen): Replace this function with SSE2 code. There is
+// one SSE2 implementation in vp8, so will consider how to share it
+// between vp8 and vp9.
+static void filter_by_weight(const uint8_t *src, int src_stride,
+                             uint8_t *dst, int dst_stride,
+                             int block_size, int src_weight) {
+  const int dst_weight = (1 << MFQE_PRECISION) - src_weight;
+  const int rounding_bit = 1 << (MFQE_PRECISION - 1);
+  int r, c;
+
+  for (r = 0; r < block_size; r++) {
+    for (c = 0; c < block_size; c++) {
+      dst[c] = (src[c] * src_weight + dst[c] * dst_weight + rounding_bit)
+               >> MFQE_PRECISION;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void filter_by_weight32x32(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int weight) {
+  filter_by_weight(src, src_stride, dst, dst_stride, 16, weight);
+  filter_by_weight(src + 16, src_stride, dst + 16, dst_stride, 16, weight);
+  filter_by_weight(src + src_stride * 16, src_stride, dst + dst_stride * 16,
+                   dst_stride, 16, weight);
+  filter_by_weight(src + src_stride * 16 + 16, src_stride,
+                   dst + dst_stride * 16 + 16, dst_stride, 16, weight);
+}
+
+static void filter_by_weight64x64(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int weight) {
+  filter_by_weight32x32(src, src_stride, dst, dst_stride, weight);
+  filter_by_weight32x32(src + 32, src_stride, dst + 32,
+                        dst_stride, weight);
+  filter_by_weight32x32(src + src_stride * 32, src_stride,
+                        dst + dst_stride * 32, dst_stride, weight);
+  filter_by_weight32x32(src + src_stride * 32 + 32, src_stride,
+                        dst + dst_stride * 32 + 32, dst_stride, weight);
+}
+
+static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd,
+                          int yd_stride, const uint8_t *u, const uint8_t *v,
+                          int uv_stride, uint8_t *ud, uint8_t *vd,
+                          int uvd_stride, BLOCK_SIZE block_size,
+                          int weight) {
+  if (block_size == BLOCK_16X16) {
+    filter_by_weight(y, y_stride, yd, yd_stride, 16, weight);
+    filter_by_weight(u, uv_stride, ud, uvd_stride, 8, weight);
+    filter_by_weight(v, uv_stride, vd, uvd_stride, 8, weight);
+  } else if (block_size == BLOCK_32X32) {
+    filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);
+    filter_by_weight(u, uv_stride, ud, uvd_stride, 16, weight);
+    filter_by_weight(v, uv_stride, vd, uvd_stride, 16, weight);
+  } else if (block_size == BLOCK_64X64) {
+    filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);
+    filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);
+    filter_by_weight32x32(v, uv_stride, vd, uvd_stride, weight);
+  }
+}
+
+// TODO(jackychen): Determine whether to replace this with assembly code.
+static void copy_mem8x8(const uint8_t *src, int src_stride,
+                        uint8_t *dst, int dst_stride) {
+  int r;
+  for (r = 0; r < 8; r++) {
+    memcpy(dst, src, 8);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void copy_mem16x16(const uint8_t *src, int src_stride,
+                          uint8_t *dst, int dst_stride) {
+  int r;
+  for (r = 0; r < 16; r++) {
+    memcpy(dst, src, 16);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void copy_mem32x32(const uint8_t *src, int src_stride,
+                          uint8_t *dst, int dst_stride) {
+  copy_mem16x16(src, src_stride, dst, dst_stride);
+  copy_mem16x16(src + 16, src_stride, dst + 16, dst_stride);
+  copy_mem16x16(src + src_stride * 16, src_stride,
+                dst + dst_stride * 16, dst_stride);
+  copy_mem16x16(src + src_stride * 16 + 16, src_stride,
+                dst + dst_stride * 16 + 16, dst_stride);
+}
+
+void copy_mem64x64(const uint8_t *src, int src_stride,
+                   uint8_t *dst, int dst_stride) {
+  copy_mem32x32(src, src_stride, dst, dst_stride);
+  copy_mem32x32(src + 32, src_stride, dst + 32, dst_stride);
+  copy_mem32x32(src + src_stride * 32, src_stride,
+                dst + dst_stride * 32, dst_stride);
+  copy_mem32x32(src + src_stride * 32 + 32, src_stride,
+                dst + dst_stride * 32 + 32, dst_stride);
+}
+
+static void copy_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+                       int y_stride, int uv_stride, uint8_t *yd, uint8_t *ud,
+                       uint8_t *vd, int yd_stride, int uvd_stride,
+                       BLOCK_SIZE bs) {
+  if (bs == BLOCK_16X16) {
+    copy_mem16x16(y, y_stride, yd, yd_stride);
+    copy_mem8x8(u, uv_stride, ud, uvd_stride);
+    copy_mem8x8(v, uv_stride, vd, uvd_stride);
+  } else if (bs == BLOCK_32X32) {
+    copy_mem32x32(y, y_stride, yd, yd_stride);
+    copy_mem16x16(u, uv_stride, ud, uvd_stride);
+    copy_mem16x16(v, uv_stride, vd, uvd_stride);
+  } else {
+    copy_mem64x64(y, y_stride, yd, yd_stride);
+    copy_mem32x32(u, uv_stride, ud, uvd_stride);
+    copy_mem32x32(v, uv_stride, vd, uvd_stride);
+  }
+}
+
+static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
+                       const uint8_t *v, int y_stride, int uv_stride,
+                       uint8_t *yd, uint8_t *ud, uint8_t *vd,
+                       int yd_stride, int uvd_stride) {
+  int sad, sad_thr, vdiff;
+  uint32_t sse;
+
+  if (bs == BLOCK_16X16) {
+    vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
+    sad = (vp9_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
+  } else if (bs == BLOCK_32X32) {
+    vdiff = (vp9_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
+    sad = (vp9_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
+  } else /* if (bs == BLOCK_64X64) */ {
+    vdiff = (vp9_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
+    sad = (vp9_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
+  }
+
+  if (bs == BLOCK_16X16) {
+    sad_thr = 8;
+  } else if (bs == BLOCK_32X32) {
+    sad_thr = 7;
+  } else {  // BLOCK_64X64
+    sad_thr = 6;
+  }
+
+  // TODO(jackychen): More experiments and remove magic numbers.
+  // vdiff > sad * 3 means vdiff should not be too small, otherwise,
+  // it might be a lighting change in smooth area. When there is a
+  // lighting change in smooth area, it is dangerous to do MFQE.
+  if (sad > 1 && sad < sad_thr && vdiff > sad * 3 && vdiff < 150) {
+    // TODO(jackychen): Add weighted average in the calculation.
+    // Currently, the data is copied from last frame without averaging.
+    apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride,
+                  ud, vd, uvd_stride, bs, 0);
+  } else {
+    // Copy the block from current frame (i.e., no mfqe is done).
+    copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+               yd_stride, uvd_stride, bs);
+  }
+}
+
+static int mfqe_decision(MODE_INFO *mi, BLOCK_SIZE cur_bs) {
+  // Check the motion in current block (for inter frame),
+  // or check the motion in the correlated block in last frame (for keyframe).
+  const int mv_len_square = mi->mbmi.mv[0].as_mv.row *
+                            mi->mbmi.mv[0].as_mv.row +
+                            mi->mbmi.mv[0].as_mv.col *
+                            mi->mbmi.mv[0].as_mv.col;
+  const int mv_threshold = 100;
+  return mi->mbmi.mode >= NEARESTMV &&  // Not an intra block
+         cur_bs >= BLOCK_16X16 &&
+         mv_len_square <= mv_threshold;
+}
+
+// Process each partition in a super block, recursively.
+static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
+                           const uint8_t *y, const uint8_t *u,
+                           const uint8_t *v, int y_stride, int uv_stride,
+                           uint8_t *yd, uint8_t *ud, uint8_t *vd,
+                           int yd_stride, int uvd_stride) {
+  int mi_offset, y_offset, uv_offset;
+  const BLOCK_SIZE cur_bs = mi->mbmi.sb_type;
+  // TODO(jackychen): Consider how and whether to use qdiff in MFQE.
+  // int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
+  const int bsl = b_width_log2_lookup[bs];
+  PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
+  const BLOCK_SIZE subsize = get_subsize(bs, partition);
+
+  if (cur_bs < BLOCK_8X8) {
+    // If there are blocks smaller than 8x8, it must be on the boundary.
+    return;
+  }
+  // No MFQE on blocks smaller than 16x16
+  if (partition == PARTITION_SPLIT && bs == BLOCK_16X16) {
+    partition = PARTITION_NONE;
+  }
+  switch (partition) {
+    case PARTITION_HORZ:
+    case PARTITION_VERT:
+      // If current block size is not square.
+      // Copy the block from current frame (i.e., no mfqe is done).
+      // TODO(jackychen): Rectangle blocks should also be taken into account.
+      copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+                 yd_stride, uvd_stride, bs);
+      break;
+    case PARTITION_NONE:
+      if (mfqe_decision(mi, cur_bs)) {
+        // Do mfqe on this partition.
+        mfqe_block(cur_bs, y, u, v, y_stride, uv_stride,
+                   yd, ud, vd, yd_stride, uvd_stride);
+      } else {
+        // Copy the block from current frame (i.e., no mfqe is done).
+        copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+                   yd_stride, uvd_stride, bs);
+      }
+      break;
+    case PARTITION_SPLIT:
+      if (bs == BLOCK_64X64) {
+        mi_offset = 4;
+        y_offset = 32;
+        uv_offset = 16;
+      } else {
+        mi_offset = 2;
+        y_offset = 16;
+        uv_offset = 8;
+      }
+      // Recursion on four square partitions, e.g. if bs is 64X64,
+      // then look into four 32X32 blocks in it.
+      mfqe_partition(cm, mi, subsize, y, u, v, y_stride, uv_stride, yd, ud, vd,
+                     yd_stride, uvd_stride);
+      mfqe_partition(cm, mi + mi_offset, subsize, y + y_offset, u + uv_offset,
+                     v + uv_offset, y_stride, uv_stride, yd + y_offset,
+                     ud + uv_offset, vd + uv_offset, yd_stride, uvd_stride);
+      mfqe_partition(cm, mi + mi_offset * cm->mi_stride, subsize,
+                     y + y_offset * y_stride, u + uv_offset * uv_stride,
+                     v + uv_offset * uv_stride, y_stride, uv_stride,
+                     yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+                     vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
+      mfqe_partition(cm, mi + mi_offset * cm->mi_stride + mi_offset,
+                     subsize, y + y_offset * y_stride + y_offset,
+                     u + uv_offset * uv_stride + uv_offset,
+                     v + uv_offset * uv_stride + uv_offset, y_stride,
+                     uv_stride, yd + y_offset * yd_stride + y_offset,
+                     ud + uv_offset * uvd_stride + uv_offset,
+                     vd + uv_offset * uvd_stride + uv_offset,
+                     yd_stride, uvd_stride);
+      break;
+    default:
+      assert(0);
+  }
+}
+
+void vp9_mfqe(VP9_COMMON *cm) {
+  int mi_row, mi_col;
+  // Current decoded frame.
+  const YV12_BUFFER_CONFIG *show = cm->frame_to_show;
+  // Last decoded frame and will store the MFQE result.
+  YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+  // Loop through each super block.
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+      MODE_INFO *mi;
+      MODE_INFO *mi_local = cm->mi + (mi_row * cm->mi_stride + mi_col);
+      // Motion Info in last frame.
+      MODE_INFO *mi_prev = cm->postproc_state.prev_mi +
+                           (mi_row * cm->mi_stride + mi_col);
+      const uint32_t y_stride = show->y_stride;
+      const uint32_t uv_stride = show->uv_stride;
+      const uint32_t yd_stride = dest->y_stride;
+      const uint32_t uvd_stride = dest->uv_stride;
+      const uint32_t row_offset_y = mi_row << 3;
+      const uint32_t row_offset_uv = mi_row << 2;
+      const uint32_t col_offset_y = mi_col << 3;
+      const uint32_t col_offset_uv = mi_col << 2;
+      const uint8_t *y = show->y_buffer + row_offset_y * y_stride +
+                         col_offset_y;
+      const uint8_t *u = show->u_buffer + row_offset_uv * uv_stride +
+                         col_offset_uv;
+      const uint8_t *v = show->v_buffer + row_offset_uv * uv_stride +
+                         col_offset_uv;
+      uint8_t *yd = dest->y_buffer + row_offset_y * yd_stride + col_offset_y;
+      uint8_t *ud = dest->u_buffer + row_offset_uv * uvd_stride +
+                    col_offset_uv;
+      uint8_t *vd = dest->v_buffer + row_offset_uv * uvd_stride +
+                    col_offset_uv;
+      if (frame_is_intra_only(cm)) {
+        mi = mi_prev;
+      } else {
+        mi = mi_local;
+      }
+      mfqe_partition(cm, mi, BLOCK_64X64, y, u, v, y_stride, uv_stride, yd, ud,
+                     vd, yd_stride, uvd_stride);
+    }
+  }
+}
diff --git a/vp9/common/vp9_mfqe.h b/vp9/common/vp9_mfqe.h
new file mode 100644
index 0000000..dfff8c2
--- /dev/null
+++ b/vp9/common/vp9_mfqe.h
@@ -0,0 +1,31 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_MFQE_H_
+#define VP9_COMMON_VP9_MFQE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Multiframe Quality Enhancement.
+// The aim for MFQE is to replace pixel blocks in the current frame with
+// the correlated pixel blocks (with higher quality) in the last frame.
+// The replacement can only be taken in stationary blocks by checking
+// the motion of the blocks and other conditions such as the SAD of
+// the current block and correlated block, the variance of the block
+// difference, etc.
+void vp9_mfqe(struct VP9Common *cm);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_COMMON_VP9_MFQE_H_
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index c012055..c166590 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -112,7 +112,10 @@
 
   int new_fb_idx;
 
+#if CONFIG_VP9_POSTPROC
   YV12_BUFFER_CONFIG post_proc_buffer;
+  YV12_BUFFER_CONFIG post_proc_buffer_int;
+#endif
 
   FRAME_TYPE last_frame_type;  /* last frame's frame type for motion search.*/
   FRAME_TYPE frame_type;
@@ -181,7 +184,6 @@
   struct segmentation seg;
 
   // Context probabilities for reference frame prediction
-  int allow_comp_inter_inter;
   MV_REFERENCE_FRAME comp_fixed_ref;
   MV_REFERENCE_FRAME comp_var_ref[2];
   REFERENCE_MODE reference_mode;
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index 575ffbc..06cb65a 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -79,6 +79,9 @@
   0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
 };
 
+static const uint8_t q_diff_thresh = 20;
+static const uint8_t last_q_thresh = 170;
+
 void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
                                      uint8_t *dst_ptr,
                                      int src_pixels_per_line,
@@ -616,6 +619,17 @@
   }
 }
 
+static void swap_mi_and_prev_mi(VP9_COMMON *cm) {
+  // Current mip will be the prev_mip for the next frame.
+  MODE_INFO *temp = cm->postproc_state.prev_mip;
+  cm->postproc_state.prev_mip = cm->mip;
+  cm->mip = temp;
+
+  // Update the upper left visible macroblock ptrs.
+  cm->mi = cm->mip + cm->mi_stride + 1;
+  cm->postproc_state.prev_mi = cm->postproc_state.prev_mip + cm->mi_stride + 1;
+}
+
 int vp9_post_proc_frame(struct VP9Common *cm,
                         YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) {
   const int q = MIN(63, cm->lf.filter_level * 10 / 6);
@@ -633,7 +647,42 @@
 
   vp9_clear_system_state();
 
-#if CONFIG_VP9_POSTPROC || CONFIG_INTERNAL_STATS
+  // Alloc memory for prev_mip in the first frame.
+  if (cm->current_video_frame == 1) {
+    cm->postproc_state.last_base_qindex = cm->base_qindex;
+    cm->postproc_state.last_frame_valid = 1;
+    ppstate->prev_mip = vpx_calloc(cm->mi_alloc_size, sizeof(*cm->mip));
+    if (!ppstate->prev_mip) {
+      return 1;
+    }
+    ppstate->prev_mi = ppstate->prev_mip + cm->mi_stride + 1;
+    vpx_memset(ppstate->prev_mip, 0,
+               cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+  }
+
+  // Allocate post_proc_buffer_int if needed.
+  if ((flags & VP9D_MFQE) && !cm->post_proc_buffer_int.buffer_alloc) {
+    if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
+      const int width = ALIGN_POWER_OF_TWO(cm->width, 4);
+      const int height = ALIGN_POWER_OF_TWO(cm->height, 4);
+
+      if (vp9_alloc_frame_buffer(&cm->post_proc_buffer_int, width, height,
+                                 cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 cm->use_highbitdepth,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                                 VP9_ENC_BORDER_IN_PIXELS) < 0) {
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate MFQE framebuffer");
+      }
+
+      // Initialize the postproc buffer to a neutral value (128) so that
+      // post proc doesn't pull random data in from the edge.
+      vpx_memset(cm->post_proc_buffer_int.buffer_alloc, 128,
+                 cm->post_proc_buffer.frame_size);
+    }
+  }
+
   if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -642,9 +691,28 @@
                                VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0)
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate post-processing buffer");
-#endif
 
-  if (flags & VP9D_DEMACROBLOCK) {
+  if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 &&
+      cm->postproc_state.last_frame_valid &&
+      cm->postproc_state.last_base_qindex <= last_q_thresh &&
+      cm->base_qindex - cm->postproc_state.last_base_qindex >= q_diff_thresh) {
+    vp9_mfqe(cm);
+    // TODO(jackychen): Consider whether enable deblocking by default
+    // if mfqe is enabled. Need to take both the quality and the speed
+    // into consideration.
+    if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
+      vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int);
+    }
+    if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) {
+      deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf,
+                                 q + (ppflags->deblocking_level - 5) * 10,
+                                 1, 0);
+    } else if (flags & VP9D_DEBLOCK) {
+      vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q);
+    } else {
+      vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
+    }
+  } else if (flags & VP9D_DEMACROBLOCK) {
     deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
                                q + (ppflags->deblocking_level - 5) * 10, 1, 0);
   } else if (flags & VP9D_DEBLOCK) {
@@ -653,6 +721,9 @@
     vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
   }
 
+  cm->postproc_state.last_base_qindex = cm->base_qindex;
+  cm->postproc_state.last_frame_valid = 1;
+
   if (flags & VP9D_ADDNOISE) {
     const int noise_level = ppflags->noise_level;
     if (ppstate->last_q != q ||
@@ -673,6 +744,7 @@
   dest->uv_width = dest->y_width >> cm->subsampling_x;
   dest->uv_height = dest->y_height >> cm->subsampling_y;
 
+  swap_mi_and_prev_mi(cm);
   return 0;
 }
-#endif
+#endif  // CONFIG_VP9_POSTPROC
diff --git a/vp9/common/vp9_postproc.h b/vp9/common/vp9_postproc.h
index ebebc1a..035c9cd 100644
--- a/vp9/common/vp9_postproc.h
+++ b/vp9/common/vp9_postproc.h
@@ -14,6 +14,8 @@
 
 #include "vpx_ports/mem.h"
 #include "vpx_scale/yv12config.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_mfqe.h"
 #include "vp9/common/vp9_ppflags.h"
 
 #ifdef __cplusplus
@@ -24,6 +26,10 @@
   int last_q;
   int last_noise;
   char noise[3072];
+  int last_base_qindex;
+  int last_frame_valid;
+  MODE_INFO *prev_mip;
+  MODE_INFO *prev_mi;
   DECLARE_ALIGNED(16, char, blackclamp[16]);
   DECLARE_ALIGNED(16, char, whiteclamp[16]);
   DECLARE_ALIGNED(16, char, bothclamp[16]);
@@ -31,6 +37,8 @@
 
 struct VP9Common;
 
+#define MFQE_PRECISION 4
+
 int vp9_post_proc_frame(struct VP9Common *cm,
                         YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);
 
diff --git a/vp9/common/vp9_ppflags.h b/vp9/common/vp9_ppflags.h
index 1644a1b..12b989f 100644
--- a/vp9/common/vp9_ppflags.h
+++ b/vp9/common/vp9_ppflags.h
@@ -26,7 +26,8 @@
   VP9D_DEBUG_TXT_RATE_INFO    = 1 << 6,
   VP9D_DEBUG_DRAW_MV          = 1 << 7,
   VP9D_DEBUG_CLR_BLK_MODES    = 1 << 8,
-  VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9
+  VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9,
+  VP9D_MFQE                   = 1 << 10
 };
 
 typedef struct {
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 575990b..11c0d81 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -457,12 +457,10 @@
     specialize qw/vp9_idct32x32_1_add sse2 neon dspr2/;
 
     add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-    specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
-    $vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
+    specialize qw/vp9_iht4x4_16_add sse2 neon dspr2/;
 
     add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-    specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;
-    $vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;
+    specialize qw/vp9_iht8x8_64_add sse2 neon dspr2/;
 
     add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
     specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 3610c71..42e0baa 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -4260,7 +4260,7 @@
     // N.B. Only first 4 cols contain non-zero coeffs
     max_input = _mm_max_epi16(inptr[0], inptr[1]);
     min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    for (i = 2; i < 4; i++) {
+    for (i = 2; i < 8; i++) {
       max_input = _mm_max_epi16(max_input, inptr[i]);
       min_input = _mm_min_epi16(min_input, inptr[i]);
     }
diff --git a/vp9/common/x86/vp9_loopfilter_mmx.asm b/vp9/common/x86/vp9_loopfilter_mmx.asm
index 91055b9f..f5f7d5a 100644
--- a/vp9/common/x86/vp9_loopfilter_mmx.asm
+++ b/vp9/common/x86/vp9_loopfilter_mmx.asm
@@ -601,9 +601,6 @@
 t80:
     times 8 db 0x80
 align 16
-t1s:
-    times 8 db 0x01
-align 16
 t3:
     times 8 db 0x03
 align 16
@@ -612,15 +609,3 @@
 align 16
 ones:
     times 4 dw 0x0001
-align 16
-s27:
-    times 4 dw 0x1b00
-align 16
-s18:
-    times 4 dw 0x1200
-align 16
-s9:
-    times 4 dw 0x0900
-align 16
-s63:
-    times 4 dw 0x003f
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
index c4efa65..71dbb40 100644
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
@@ -312,9 +312,11 @@
                                          unsigned int out_pitch,
                                          unsigned int output_height,
                                          int16_t *filter) {
-  __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
+  __m128i addFilterReg64, filtersReg, minReg;
   __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
+  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+  __m128i srcReg8;
   unsigned int i;
 
   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
@@ -333,27 +335,26 @@
   // duplicate only the forth 16 bits in the filter
   forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
 
+  // load the first 7 rows of 8 bytes
+  srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
+  srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]);
+  srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]);
+  srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]);
+  srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]);
+  srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]);
+  srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]);
+
   for (i = 0; i < output_height; i++) {
-    // load the first 8 bytes
-    srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
-    // load the next 8 bytes in stride of src_pitch
-    srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
-    srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
-    srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);
+    // load the last 8 bytes
+    srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]);
 
     // merge the result together
-    srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
-    srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-
-    // load the next 8 bytes in stride of src_pitch
-    srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
-    srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
-    srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
-    srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);
+    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
 
     // merge the result together
-    srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
-    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
+    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
 
     // multiply 2 adjacent elements with the filter and add the result
     srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
@@ -377,6 +378,15 @@
 
     src_ptr+=src_pitch;
 
+    // shift down a row
+    srcReg1 = srcReg2;
+    srcReg2 = srcReg3;
+    srcReg3 = srcReg4;
+    srcReg4 = srcReg5;
+    srcReg5 = srcReg6;
+    srcReg6 = srcReg7;
+    srcReg7 = srcReg8;
+
     // save only 8 bytes convolve result
     _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
 
@@ -390,9 +400,11 @@
                                           unsigned int out_pitch,
                                           unsigned int output_height,
                                           int16_t *filter) {
-  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
+  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
   __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+  __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+  __m128i srcReg8;
   unsigned int i;
 
   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
@@ -411,19 +423,24 @@
   // duplicate only the forth 16 bits in the filter
   forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
 
+  // load the first 7 rows of 16 bytes
+  srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr));
+  srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch));
+  srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2));
+  srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3));
+  srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4));
+  srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5));
+  srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6));
+
   for (i = 0; i < output_height; i++) {
-    // load the first 16 bytes
-    srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
-    // load the next 16 bytes in stride of src_pitch
-    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
-    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
-    srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
+    // load the last 16 bytes
+    srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7));
 
     // merge the result together
-    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
-    srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-    srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
-    srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
+    srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+    srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8);
+    srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2);
+    srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8);
 
     // multiply 2 adjacent elements with the filter and add the result
     srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
@@ -435,25 +452,17 @@
     srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
 
-    // load the next 16 bytes in stride of two/three src_pitch
-    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
-    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
-
     // merge the result together
-    srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
-    srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+    srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4);
 
     // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
     srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
 
-    // load the next 16 bytes in stride of four/five src_pitch
-    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
-    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
-
     // merge the result together
-    srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
-    srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+    srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+    srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6);
 
     // multiply 2 adjacent elements with the filter and add the result
     srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
@@ -461,13 +470,13 @@
 
     // add and saturate the results together
     srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
-                                 _mm_min_epi16(srcRegFilt4, srcRegFilt7));
+                                 _mm_min_epi16(srcRegFilt3, srcRegFilt7));
     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                  _mm_min_epi16(srcRegFilt6, srcRegFilt8));
 
     // add and saturate the results together
     srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
-                                 _mm_max_epi16(srcRegFilt4, srcRegFilt7));
+                                 _mm_max_epi16(srcRegFilt3, srcRegFilt7));
     srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                  _mm_max_epi16(srcRegFilt6, srcRegFilt8));
     srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
@@ -484,6 +493,15 @@
 
     src_ptr+=src_pitch;
 
+    // shift down a row
+    srcReg1 = srcReg2;
+    srcReg2 = srcReg3;
+    srcReg3 = srcReg4;
+    srcReg4 = srcReg5;
+    srcReg5 = srcReg6;
+    srcReg6 = srcReg7;
+    srcReg7 = srcReg8;
+
     // save 16 bytes convolve result
     _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
 
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 470b2ba..5bf44d7 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -952,7 +952,6 @@
                           &tile_data->bit_reader, pbi->decrypt_cb,
                           pbi->decrypt_state);
       init_macroblockd(cm, &tile_data->xd);
-      vp9_zero(tile_data->xd.dqcoeff);
     }
   }
 
@@ -1149,7 +1148,6 @@
                           &tile_data->bit_reader, pbi->decrypt_cb,
                           pbi->decrypt_state);
       init_macroblockd(cm, &tile_data->xd);
-      vp9_zero(tile_data->xd.dqcoeff);
 
       worker->had_error = 0;
       if (i == num_workers - 1 || n == tile_cols - 1) {
@@ -1563,7 +1561,6 @@
                        "Uninitialized entropy context.");
 
   vp9_zero(cm->counts);
-  vp9_zero(xd->dqcoeff);
 
   xd->corrupted = 0;
   new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
diff --git a/vp9/decoder/vp9_read_bit_buffer.c b/vp9/decoder/vp9_read_bit_buffer.c
index 3eef728..c3b38a9 100644
--- a/vp9/decoder/vp9_read_bit_buffer.c
+++ b/vp9/decoder/vp9_read_bit_buffer.c
@@ -10,20 +10,20 @@
 #include "vp9/decoder/vp9_read_bit_buffer.h"
 
 size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) {
-  return (rb->bit_offset + CHAR_BIT - 1) / CHAR_BIT;
+  return (rb->bit_offset + 7) >> 3;
 }
 
 int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) {
   const size_t off = rb->bit_offset;
-  const size_t p = off / CHAR_BIT;
-  const int q = CHAR_BIT - 1 - (int)off % CHAR_BIT;
-  if (rb->bit_buffer + p >= rb->bit_buffer_end) {
-    rb->error_handler(rb->error_handler_data);
-    return 0;
-  } else {
-    const int bit = (rb->bit_buffer[p] & (1 << q)) >> q;
+  const size_t p = off >> 3;
+  const int q = 7 - (int)(off & 0x7);
+  if (rb->bit_buffer + p < rb->bit_buffer_end) {
+    const int bit = (rb->bit_buffer[p] >> q) & 1;
     rb->bit_offset = off + 1;
     return bit;
+  } else {
+    rb->error_handler(rb->error_handler_data);
+    return 0;
   }
 }
 
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 20368f0..4154a6f 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -1183,7 +1183,7 @@
       vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
                                 counts->intra_inter[i]);
 
-    if (cm->allow_comp_inter_inter) {
+    if (cpi->allow_comp_inter_inter) {
       const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
       const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
 
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 69dedac..86023a5 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -278,7 +278,7 @@
 typedef struct {
   int64_t sum_square_error;
   int64_t sum_error;
-  int count;
+  int log2_count;
   int variance;
 } var;
 
@@ -327,7 +327,6 @@
 static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
   int i;
   node->part_variances = NULL;
-  vpx_memset(node->split, 0, sizeof(node->split));
   switch (bsize) {
     case BLOCK_64X64: {
       v64x64 *vt = (v64x64 *) data;
@@ -375,18 +374,18 @@
 static void fill_variance(int64_t s2, int64_t s, int c, var *v) {
   v->sum_square_error = s2;
   v->sum_error = s;
-  v->count = c;
-  if (c > 0)
-    v->variance = (int)(256 *
-                        (v->sum_square_error - v->sum_error * v->sum_error /
-                         v->count) / v->count);
-  else
-    v->variance = 0;
+  v->log2_count = c;
+}
+
+static void get_variance(var *v) {
+  v->variance = (int)(256 * (v->sum_square_error -
+      ((v->sum_error * v->sum_error) >> v->log2_count)) >> v->log2_count);
 }
 
 void sum_2_variances(const var *a, const var *b, var *r) {
+  assert(a->log2_count == b->log2_count);
   fill_variance(a->sum_square_error + b->sum_square_error,
-                a->sum_error + b->sum_error, a->count + b->count, r);
+                a->sum_error + b->sum_error, a->log2_count + 1, r);
 }
 
 static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
@@ -424,8 +423,9 @@
 
   if (cm->frame_type == KEY_FRAME) {
     bsize_ref = BLOCK_8X8;
-    // Choose lower thresholds for key frame variance to favor split.
-    threshold_bsize_ref = threshold >> 1;
+    // Choose lower thresholds for key frame variance to favor split, but keep
+    // threshold for splitting to 4x4 block still fairly high for now.
+    threshold_bsize_ref = threshold << 2;
     threshold_low = threshold >> 2;
   }
 
@@ -433,6 +433,7 @@
   // variance is below threshold, otherwise split will be selected.
   // No check for vert/horiz split as too few samples for variance.
   if (bsize == bsize_ref) {
+    get_variance(&vt.part_variances->none);
     if (mi_col + block_width / 2 < cm->mi_cols &&
         mi_row + block_height / 2 < cm->mi_rows &&
         vt.part_variances->none.variance < threshold_bsize_ref) {
@@ -441,6 +442,7 @@
     }
     return 0;
   } else if (bsize > bsize_ref) {
+    get_variance(&vt.part_variances->none);
     // For key frame, for bsize above 32X32, or very high variance, take split.
     if (cm->frame_type == KEY_FRAME &&
         (bsize > BLOCK_32X32 ||
@@ -454,24 +456,32 @@
       set_block_size(cpi, xd, mi_row, mi_col, bsize);
       return 1;
     }
+
     // Check vertical split.
-    if (mi_row + block_height / 2 < cm->mi_rows &&
-        vt.part_variances->vert[0].variance < threshold_low &&
-        vt.part_variances->vert[1].variance < threshold_low) {
-      BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
-      set_block_size(cpi, xd, mi_row, mi_col, subsize);
-      set_block_size(cpi, xd, mi_row, mi_col + block_width / 2, subsize);
-      return 1;
+    if (mi_row + block_height / 2 < cm->mi_rows) {
+      get_variance(&vt.part_variances->vert[0]);
+      get_variance(&vt.part_variances->vert[1]);
+      if (vt.part_variances->vert[0].variance < threshold_low &&
+          vt.part_variances->vert[1].variance < threshold_low) {
+        BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
+        set_block_size(cpi, xd, mi_row, mi_col, subsize);
+        set_block_size(cpi, xd, mi_row, mi_col + block_width / 2, subsize);
+        return 1;
+      }
     }
     // Check horizontal split.
-    if (mi_col + block_width / 2 < cm->mi_cols &&
-        vt.part_variances->horz[0].variance < threshold_low &&
-        vt.part_variances->horz[1].variance < threshold_low) {
-      BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
-      set_block_size(cpi, xd, mi_row, mi_col, subsize);
-      set_block_size(cpi, xd, mi_row + block_height / 2, mi_col, subsize);
-      return 1;
+    if (mi_col + block_width / 2 < cm->mi_cols) {
+      get_variance(&vt.part_variances->horz[0]);
+      get_variance(&vt.part_variances->horz[1]);
+      if (vt.part_variances->horz[0].variance < threshold_low &&
+          vt.part_variances->horz[1].variance < threshold_low) {
+        BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
+        set_block_size(cpi, xd, mi_row, mi_col, subsize);
+        set_block_size(cpi, xd, mi_row + block_height / 2, mi_col, subsize);
+        return 1;
+      }
     }
+
     return 0;
   }
   return 0;
@@ -573,7 +583,7 @@
           // If variance is based on 8x8 downsampling, we stop here and have
           // one sample for 8x8 block (so use 1 for count in fill_variance),
           // which of course means variance = 0 for 8x8 block.
-          fill_variance(sse, sum, 1, &vst->split[k].part_variances.none);
+          fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
         } else {
           // For key frame, go down to 4x4.
           v8x8 *vst2 = &vst->split[k];
@@ -583,7 +593,16 @@
             unsigned int sse = 0;
             int sum = 0;
             if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+#if CONFIG_VP9_HIGHBITDEPTH
+              int s_avg;
+              if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+                s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+              } else {
+                s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+              }
+#else
               int s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+#endif
               // For key frame, reference is set to 128.
               sum = s_avg - 128;
               sse = sum * sum;
@@ -591,7 +610,7 @@
             // If variance is based on 4x4 downsampling, we stop here and have
             // one sample for 4x4 block (so use 1 for count in fill_variance),
             // which of course means variance = 0 for 4x4 block.
-           fill_variance(sse, sum, 1, &vst2->split[m].part_variances.none);
+           fill_variance(sse, sum, 0, &vst2->split[m].part_variances.none);
           }
         }
       }
@@ -637,8 +656,6 @@
             for (k = 0; k < 4; ++k) {
               const int x8_idx = (k & 1);
               const int y8_idx = (k >> 1);
-              // TODO(marpan): Allow for setting 4x4 partition on key frame.
-              /*
               if (cm->frame_type == KEY_FRAME) {
                 if (!set_vt_partitioning(cpi, xd,
                                          &vt.split[i].split[j].split[k],
@@ -651,12 +668,11 @@
                                    BLOCK_4X4);
                 }
               } else {
-              */
                 set_block_size(cpi, xd,
                                (mi_row + y32_idx + y16_idx + y8_idx),
                                (mi_col + x32_idx + x16_idx + x8_idx),
                                BLOCK_8X8);
-              // }
+               }
             }
           }
         }
@@ -3103,7 +3119,7 @@
         if (mi_row + hbs < cm->mi_rows) {
           pc_tree->horizontal[1].pred_pixel_ready = 1;
           nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col,
-                              &this_rdc, subsize, &pc_tree->horizontal[0]);
+                              &this_rdc, subsize, &pc_tree->horizontal[1]);
           pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
           pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
           pc_tree->horizontal[1].skip = x->skip;
@@ -3227,7 +3243,7 @@
       if (mi_row + hbs < cm->mi_rows) {
         pc_tree->horizontal[1].pred_pixel_ready = 1;
         nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col,
-                            dummy_cost, subsize, &pc_tree->horizontal[0]);
+                            dummy_cost, subsize, &pc_tree->horizontal[1]);
         pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
         pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
         pc_tree->horizontal[1].skip = x->skip;
@@ -3707,9 +3723,9 @@
              cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
         (cm->ref_frame_sign_bias[ALTREF_FRAME] ==
              cm->ref_frame_sign_bias[LAST_FRAME])) {
-      cm->allow_comp_inter_inter = 0;
+      cpi->allow_comp_inter_inter = 0;
     } else {
-      cm->allow_comp_inter_inter = 1;
+      cpi->allow_comp_inter_inter = 1;
       cm->comp_fixed_ref = ALTREF_FRAME;
       cm->comp_var_ref[0] = LAST_FRAME;
       cm->comp_var_ref[1] = GOLDEN_FRAME;
@@ -3733,7 +3749,7 @@
     const int is_alt_ref = frame_type == ALTREF_FRAME;
 
     /* prediction (compound, single or hybrid) mode selection */
-    if (is_alt_ref || !cm->allow_comp_inter_inter)
+    if (is_alt_ref || !cpi->allow_comp_inter_inter)
       cm->reference_mode = SINGLE_REFERENCE;
     else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] &&
              mode_thrs[COMPOUND_REFERENCE] >
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 14f7c7f..a58a90e 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -339,6 +339,8 @@
   unsigned int max_mv_magnitude;
   int mv_step_param;
 
+  int allow_comp_inter_inter;
+
   // Default value is 1. From first pass stats, encode_breakout may be disabled.
   ENCODE_BREAKOUT_TYPE allow_encode_breakout;
 
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index b450324..379d067 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -249,14 +249,14 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+    vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
                                  dc_quant >> (xd->bd - 5), &rate, &dist);
   } else {
-    vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+    vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
                                  dc_quant >> 3, &rate, &dist);
   }
 #else
-  vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+  vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
                                dc_quant >> 3, &rate, &dist);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -265,14 +265,14 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
+    vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
                                  ac_quant >> (xd->bd - 5), &rate, &dist);
   } else {
-    vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
+    vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
                                  ac_quant >> 3, &rate, &dist);
   }
 #else
-  vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
+  vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
                                ac_quant >> 3, &rate, &dist);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -537,9 +537,9 @@
   // Reduce the intra cost penalty for small blocks (<=16x16).
   const int reduction_fac =
       (cpi->sf.partition_search_type == VAR_BASED_PARTITION &&
-       bsize <= BLOCK_16X16) ? 4 : 1;
+       bsize <= BLOCK_16X16) ? 2 : 0;
   const int intra_cost_penalty = vp9_get_intra_cost_penalty(
-      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) / reduction_fac;
+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac;
   const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
                                            intra_cost_penalty, 0);
   const int8_t segment_id = mbmi->segment_id;
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 5b49bfc..34d49f0 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -379,7 +379,7 @@
   *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
 }
 
-void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
+void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
                                   unsigned int qstep, int *rate,
                                   int64_t *dist) {
   // This function models the rate and distortion for a Laplacian
@@ -395,10 +395,10 @@
     int d_q10, r_q10;
     static const uint32_t MAX_XSQ_Q10 = 245727;
     const uint64_t xsq_q10_64 =
-        ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var;
+        (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
     const int xsq_q10 = (int)MIN(xsq_q10_64, MAX_XSQ_Q10);
     model_rd_norm(xsq_q10, &r_q10, &d_q10);
-    *rate = (n * r_q10 + 2) >> 2;
+    *rate = ((r_q10 << n_log2) + 2) >> 2;
     *dist = (var * (int64_t)d_q10 + 512) >> 10;
   }
 }
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 600a3eb..bc5edc8 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -265,15 +265,15 @@
     } else {
 #if CONFIG_VP9_HIGHBITDEPTH
       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+        vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
                                      pd->dequant[1] >> (xd->bd - 5),
                                      &rate, &dist);
       } else {
-        vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+        vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
                                      pd->dequant[1] >> 3, &rate, &dist);
       }
 #else
-      vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+      vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
                                    pd->dequant[1] >> 3, &rate, &dist);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       rate_sum += rate;
@@ -3065,7 +3065,7 @@
 
     comp_pred = second_ref_frame > INTRA_FRAME;
     if (comp_pred) {
-      if (!cm->allow_comp_inter_inter)
+      if (!cpi->allow_comp_inter_inter)
         continue;
 
       // Skip compound inter modes if ARF is not available.
@@ -3715,7 +3715,7 @@
 
     comp_pred = second_ref_frame > INTRA_FRAME;
     if (comp_pred) {
-      if (!cm->allow_comp_inter_inter)
+      if (!cpi->allow_comp_inter_inter)
         continue;
       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
         continue;
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index ccd5518..2560459 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -72,6 +72,8 @@
 VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm
@@ -133,12 +135,13 @@
 
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht4x4_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht8x8_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_reconintra_neon$(ASM)
 
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
+
 # neon with assembly and intrinsics implementations. If both are available
 # prefer assembly.
 ifeq ($(HAVE_NEON_ASM), yes)
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 85e32d3..8095140 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -11,6 +11,7 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "./vpx_config.h"
 #include "./vpx_version.h"
 
 #include "vpx/internal/vpx_codec_internal.h"
diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm
index 3814ef4..c94b76a 100644
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@@ -395,7 +395,7 @@
 
 ; On Android platforms use lrand48 when building postproc routines. Prior to L
 ; rand() was not available.
-%if CONFIG_POSTPROC=1
+%if CONFIG_POSTPROC=1 || CONFIG_VP9_POSTPROC=1
 %ifdef __ANDROID__
 extern sym(lrand48)
 %define LIBVPX_RAND lrand48
@@ -403,4 +403,4 @@
 extern sym(rand)
 %define LIBVPX_RAND rand
 %endif
-%endif ; CONFIG_POSTPROC
+%endif ; CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/vpxdec.c b/vpxdec.c
index c4d2a9e..30e2742 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -131,7 +131,7 @@
 #endif
 
 #if CONFIG_LIBYUV
-static INLINE int vpx_image_scale(vpx_image_t *src, vpx_image_t *dst,
+static INLINE int libyuv_scale(vpx_image_t *src, vpx_image_t *dst,
                                   FilterModeEnum mode) {
 #if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
   if (src->fmt == VPX_IMG_FMT_I42016) {
@@ -948,7 +948,7 @@
 
         if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) {
 #if CONFIG_LIBYUV
-          vpx_image_scale(img, scaled_img, kFilterBox);
+          libyuv_scale(img, scaled_img, kFilterBox);
           img = scaled_img;
 #else
           fprintf(stderr, "Failed  to scale output frame: %s.\n"