Merge "vp8: Add temporal denoising for UV-channel."

diff --git a/README b/README
index f209105..f9c24ff 100644
--- a/README
+++ b/README

@@ -55,6 +55,7 @@
     armv6-linux-rvct
     armv6-linux-gcc
     armv6-none-rvct
+    arm64-darwin-gcc
     armv7-android-gcc
     armv7-darwin-gcc
     armv7-linux-rvct

diff --git a/build/make/configure.sh b/build/make/configure.sh
index d4124c7..0fe8ead 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh

@@ -799,7 +799,7 @@
     arm*)
         # on arm, isa versions are supersets
         case ${tgt_isa} in
-        armv8)
+        arm64|armv8)
             soft_enable neon
             ;;
         armv7|armv7s)

diff --git a/build/make/gen_msvs_proj.sh b/build/make/gen_msvs_proj.sh
index f1cc04e..3653309 100755
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh

@@ -137,7 +137,9 @@
         ;;
         --lib) proj_kind="lib"
         ;;
-        --src-path-bare=*) src_path_bare=$(fix_path "$optval")
+        --src-path-bare=*)
+            src_path_bare=$(fix_path "$optval")
+            src_path_bare=${src_path_bare%/}
         ;;
         --static-crt) use_static_runtime=true
         ;;
@@ -151,9 +153,9 @@
             esac
         ;;
         -I*)
-            opt="${opt%/}"
             opt=${opt##-I}
             opt=$(fix_path "$opt")
+            opt="${opt%/}"
             incs="${incs}${incs:+;}&quot;${opt}&quot;"
             yasmincs="${yasmincs} -I&quot;${opt}&quot;"
         ;;
@@ -414,7 +416,7 @@
                     vpx)
                         tag Tool \
                             Name="VCPreBuildEventTool" \
-                            CommandLine="call obj_int_extract.bat $src_path_bare $plat_no_ws\\\$(ConfigurationName)" \
+                            CommandLine="call obj_int_extract.bat &quot;$src_path_bare&quot; $plat_no_ws\\\$(ConfigurationName)" \
 
                         tag Tool \
                             Name="VCCLCompilerTool" \

diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index eee354d..23ef6a3 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh

@@ -157,7 +157,9 @@
         ;;
         --lib) proj_kind="lib"
         ;;
-        --src-path-bare=*) src_path_bare=$(fix_path "$optval")
+        --src-path-bare=*)
+            src_path_bare=$(fix_path "$optval")
+            src_path_bare=${src_path_bare%/}
         ;;
         --static-crt) use_static_runtime=true
         ;;
@@ -173,9 +175,9 @@
             esac
         ;;
         -I*)
-            opt="${opt%/}"
             opt=${opt##-I}
             opt=$(fix_path "$opt")
+            opt="${opt%/}"
             incs="${incs}${incs:+;}&quot;${opt}&quot;"
             yasmincs="${yasmincs} -I&quot;${opt}&quot;"
         ;;

diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh
index 230c0ce..9d9c374 100755
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh

@@ -50,9 +50,79 @@
   vlog "***Done building target: ${target}***"
 }
 
+# Returns the preprocessor symbol for the target specified by $1.
+target_to_preproc_symbol() {
+  target="$1"
+  case "${target}" in
+    armv6-*)
+      echo "__ARM_ARCH_6__"
+      ;;
+    armv7-*)
+      echo "__ARM_ARCH_7__"
+      ;;
+    armv7s-*)
+      echo "__ARM_ARCH_7S__"
+      ;;
+    x86-*)
+      echo "__i386__"
+      ;;
+    x86_64-*)
+      echo "__x86_64__"
+      ;;
+    *)
+      echo "#error ${target} unknown/unsupported"
+      return 1
+      ;;
+  esac
+}
+
+# Create a vpx_config.h shim that, based on preprocessor settings for the
+# current target CPU, includes the real vpx_config.h for the current target.
+# $1 is the list of targets.
+create_vpx_framework_config_shim() {
+  local targets="$1"
+  local config_file="${HEADER_DIR}/vpx_config.h"
+  local preproc_symbol=""
+  local target=""
+  local include_guard="VPX_FRAMEWORK_HEADERS_VPX_VPX_CONFIG_H_"
+
+  local file_header="/*
+ *  Copyright (c) $(date +%Y) The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* GENERATED FILE: DO NOT EDIT! */
+
+#ifndef ${include_guard}
+#define ${include_guard}
+
+#if defined"
+
+  printf "%s" "${file_header}" > "${config_file}"
+  for target in ${targets}; do
+    preproc_symbol=$(target_to_preproc_symbol "${target}")
+    printf " ${preproc_symbol}\n" >> "${config_file}"
+    printf "#include \"VPX/vpx/${target}/vpx_config.h\"\n" >> "${config_file}"
+    printf "#elif defined" >> "${config_file}"
+    mkdir "${HEADER_DIR}/${target}"
+    cp -p "${BUILD_ROOT}/${target}/vpx_config.h" "${HEADER_DIR}/${target}"
+  done
+
+  # Consume the last line of output from the loop: We don't want it.
+  sed -i '' -e '$d' "${config_file}"
+
+  printf "#endif\n\n" >> "${config_file}"
+  printf "#endif  // ${include_guard}" >> "${config_file}"
+}
+
 # Configures and builds each target specified by $1, and then builds
 # VPX.framework.
-build_targets() {
+build_framework() {
   local lib_list=""
   local targets="$1"
   local target=""
@@ -75,15 +145,20 @@
 
   cd "${ORIG_PWD}"
 
-  # Includes are identical for all platforms, and according to dist target
-  # behavior vpx_config.h and vpx_version.h aren't actually necessary for user
-  # apps built with libvpx. So, just copy the includes from the last target
-  # built.
-  # TODO(tomfinegan): The above is a lame excuse. Build common config/version
-  # includes that use the preprocessor to include the correct file.
+  # The basic libvpx API includes are all the same; just grab the most recent
+  # set.
   cp -p "${target_dist_dir}"/include/vpx/* "${HEADER_DIR}"
+
+  # Build the fat library.
   ${LIPO} -create ${lib_list} -output ${FRAMEWORK_DIR}/VPX
 
+  # Create the vpx_config.h shim that allows usage of vpx_config.h from
+  # within VPX.framework.
+  create_vpx_framework_config_shim "${targets}"
+
+  # Copy in vpx_version.h.
+  cp -p "${BUILD_ROOT}/${target}/vpx_version.h" "${HEADER_DIR}"
+
   vlog "Created fat library ${FRAMEWORK_DIR}/VPX containing:"
   for lib in ${lib_list}; do
     vlog "  $(echo ${lib} | awk -F / '{print $2, $NF}')"
@@ -166,4 +241,4 @@
 EOF
 fi
 
-build_targets "${TARGETS}"
+build_framework "${TARGETS}"

diff --git a/configure b/configure
index 9a7de73..0dca4c3 100755
--- a/configure
+++ b/configure

@@ -96,6 +96,7 @@
 all_platforms="${all_platforms} armv6-linux-rvct"
 all_platforms="${all_platforms} armv6-linux-gcc"
 all_platforms="${all_platforms} armv6-none-rvct"
+all_platforms="${all_platforms} arm64-darwin-gcc"
 all_platforms="${all_platforms} armv7-android-gcc"   #neon Cortex-A8
 all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8

diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c
index 5bc6575..8c87b2a 100644
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c

@@ -296,6 +296,7 @@
   int frame_duration = 1; /* 1 timebase tick per frame */
   FILE *infile = NULL;
   int end_of_stream = 0;
+  int frame_size;
 
   memset(&svc_ctx, 0, sizeof(svc_ctx));
   svc_ctx.log_print = 1;
@@ -351,11 +352,10 @@
       die_codec(&codec, "Failed to encode frame");
     }
     if (!(app_input.passes == 2 && app_input.pass == 1)) {
-      if (vpx_svc_get_frame_size(&svc_ctx) > 0) {
+      while ((frame_size = vpx_svc_get_frame_size(&svc_ctx)) > 0) {
         vpx_video_writer_write_frame(writer,
                                      vpx_svc_get_buffer(&svc_ctx),
-                                     vpx_svc_get_frame_size(&svc_ctx),
-                                     pts);
+                                     frame_size, pts);
       }
     }
     if (vpx_svc_get_rc_stats_buffer_size(&svc_ctx) > 0) {

diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index e667d1d..8bea4cc 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc

@@ -15,13 +15,27 @@
 
 namespace libvpx_test {
 
+const char kVP8Name[] = "WebM Project VP8";
+
+vpx_codec_err_t Decoder::PeekStream(const uint8_t *cxdata, size_t size,
+                                    vpx_codec_stream_info_t *stream_info) {
+  return vpx_codec_peek_stream_info(CodecInterface(),
+                                    cxdata, static_cast<unsigned int>(size),
+                                    stream_info);
+}
+
 vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size) {
+  return DecodeFrame(cxdata, size, NULL);
+}
+
+vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size,
+                                     void *user_priv) {
   vpx_codec_err_t res_dec;
   InitOnce();
   REGISTER_STATE_CHECK(
       res_dec = vpx_codec_decode(&decoder_,
                                  cxdata, static_cast<unsigned int>(size),
-                                 NULL, 0));
+                                 user_priv, 0));
   return res_dec;
 }
 
@@ -29,13 +43,37 @@
   vpx_codec_dec_cfg_t dec_cfg = {0};
   Decoder* const decoder = codec_->CreateDecoder(dec_cfg, 0);
   ASSERT_TRUE(decoder != NULL);
+  const char *codec_name = decoder->GetDecoderName();
+  const bool is_vp8 = strncmp(kVP8Name, codec_name, sizeof(kVP8Name) - 1) == 0;
 
   // Decode frames.
-  for (video->Begin(); video->cxdata(); video->Next()) {
+  for (video->Begin(); !::testing::Test::HasFailure() && video->cxdata();
+       video->Next()) {
     PreDecodeFrameHook(*video, decoder);
+
+    vpx_codec_stream_info_t stream_info;
+    stream_info.sz = sizeof(stream_info);
+    const vpx_codec_err_t res_peek = decoder->PeekStream(video->cxdata(),
+                                                         video->frame_size(),
+                                                         &stream_info);
+    if (is_vp8) {
+      /* Vp8's implementation of PeekStream returns an error if the frame you
+       * pass it is not a keyframe, so we only expect VPX_CODEC_OK on the first
+       * frame, which must be a keyframe. */
+      if (video->frame_number() == 0)
+        ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: "
+            << vpx_codec_err_to_string(res_peek);
+    } else {
+      /* The Vp9 implementation of PeekStream returns an error only if the
+       * data passed to it isn't a valid Vp9 chunk. */
+      ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: "
+          << vpx_codec_err_to_string(res_peek);
+    }
+
     vpx_codec_err_t res_dec = decoder->DecodeFrame(video->cxdata(),
                                                    video->frame_size());
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
+    if (!HandleDecodeResult(res_dec, *video, decoder))
+      break;
 
     DxDataIterator dec_iter = decoder->GetDxData();
     const vpx_image_t *img = NULL;

diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index 2734a45..dd3593e 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h

@@ -49,8 +49,14 @@
     vpx_codec_destroy(&decoder_);
   }
 
+  vpx_codec_err_t PeekStream(const uint8_t *cxdata, size_t size,
+                             vpx_codec_stream_info_t *stream_info);
+
   vpx_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size);
 
+  vpx_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size,
+                              void *user_priv);
+
   DxDataIterator GetDxData() {
     return DxDataIterator(&decoder_);
   }
@@ -85,6 +91,10 @@
         &decoder_, cb_get, cb_release, user_priv);
   }
 
+  const char* GetDecoderName() {
+    return vpx_codec_iface_name(CodecInterface());
+  }
+
  protected:
   virtual vpx_codec_iface_t* CodecInterface() const = 0;
 
@@ -114,6 +124,14 @@
   virtual void PreDecodeFrameHook(const CompressedVideoSource& video,
                                   Decoder *decoder) {}
 
+  // Hook to be called to handle decode result. Return true to continue.
+  virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                                  const CompressedVideoSource& /* video */,
+                                  Decoder *decoder) {
+    EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
+    return VPX_CODEC_OK == res_dec;
+  }
+
   // Hook to be called on every decompressed frame.
   virtual void DecompressedFrameHook(const vpx_image_t& img,
                                      const unsigned int frame_number) {}

diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
new file mode 100644
index 0000000..4933658
--- /dev/null
+++ b/test/invalid_file_test.cc

@@ -0,0 +1,109 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+class InvalidFileTest
+    : public ::libvpx_test::DecoderTest,
+      public ::libvpx_test::CodecTestWithParam<const char*> {
+ protected:
+  InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(NULL) {}
+
+  virtual ~InvalidFileTest() {
+    if (res_file_ != NULL)
+      fclose(res_file_);
+  }
+
+  void OpenResFile(const std::string &res_file_name_) {
+    res_file_ = libvpx_test::OpenTestDataFile(res_file_name_);
+    ASSERT_TRUE(res_file_ != NULL) << "Result file open failed. Filename: "
+        << res_file_name_;
+  }
+
+  virtual bool HandleDecodeResult(
+      const vpx_codec_err_t res_dec,
+      const libvpx_test::CompressedVideoSource &video,
+      libvpx_test::Decoder *decoder) {
+    EXPECT_TRUE(res_file_ != NULL);
+    int expected_res_dec;
+
+    // Read integer result.
+    const int res = fscanf(res_file_, "%d", &expected_res_dec);
+    EXPECT_NE(res, EOF) << "Read result data failed";
+
+    // Check results match.
+    EXPECT_EQ(expected_res_dec, res_dec)
+        << "Results don't match: frame number = " << video.frame_number();
+
+    return !HasFailure();
+  }
+
+ private:
+  FILE *res_file_;
+};
+
+TEST_P(InvalidFileTest, ReturnCode) {
+  const std::string filename = GET_PARAM(1);
+  libvpx_test::CompressedVideoSource *video = NULL;
+
+  // Open compressed video file.
+  if (filename.substr(filename.length() - 3, 3) == "ivf") {
+    video = new libvpx_test::IVFVideoSource(filename);
+  } else if (filename.substr(filename.length() - 4, 4) == "webm") {
+#if CONFIG_WEBM_IO
+    video = new libvpx_test::WebMVideoSource(filename);
+#else
+    fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n",
+            filename.c_str());
+    return;
+#endif
+  }
+  video->Init();
+
+  // Construct result file name. The file holds a list of expected integer
+  // results, one for each decoded frame.  Any result that doesn't match
+  // the files list will cause a test failure.
+  const std::string res_filename = filename + ".res";
+  OpenResFile(res_filename);
+
+  // Decode frame, and check the md5 matching.
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+  delete video;
+}
+
+const char *const kVP9InvalidFileTests[] = {
+  "invalid-vp90-01.webm",
+  "invalid-vp90-02.webm",
+  "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf",
+};
+
+#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
+
+VP9_INSTANTIATE_TEST_CASE(InvalidFileTest,
+                          ::testing::ValuesIn(kVP9InvalidFileTests,
+                                              kVP9InvalidFileTests +
+                                              NELEMENTS(kVP9InvalidFileTests)));
+
+}  // namespace

diff --git a/test/svc_test.cc b/test/svc_test.cc
index db26a8e..f831e75 100644
--- a/test/svc_test.cc
+++ b/test/svc_test.cc

@@ -265,9 +265,17 @@
                        video.duration(), VPX_DL_GOOD_QUALITY);
   EXPECT_EQ(VPX_CODEC_OK, res);
 
+  if (vpx_svc_get_frame_size(&svc_) == 0) {
+    // Flush encoder
+    res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
+                         video.duration(), VPX_DL_GOOD_QUALITY);
+    EXPECT_EQ(VPX_CODEC_OK, res);
+  }
+
+  int frame_size = vpx_svc_get_frame_size(&svc_);
+  EXPECT_GT(frame_size, 0);
   const vpx_codec_err_t res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
+      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
 
   // this test fails with a decoder error
   ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
@@ -277,6 +285,9 @@
   svc_.spatial_layers = 2;
   vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
   vpx_svc_set_quantizers(&svc_, "40,30", 0);
+  int decoded_frames = 0;
+  vpx_codec_err_t res_dec;
+  int frame_size;
 
   vpx_codec_err_t res =
       vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
@@ -291,13 +302,14 @@
   // This frame is a keyframe.
   res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                        video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(1, vpx_svc_is_keyframe(&svc_));
 
-  vpx_codec_err_t res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }
 
   // FRAME 1
   video.Next();
@@ -305,12 +317,14 @@
   res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                        video.duration(), VPX_DL_GOOD_QUALITY);
   ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));
 
-  res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }
 
   // FRAME 2
   video.Next();
@@ -318,12 +332,29 @@
   res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                        video.duration(), VPX_DL_GOOD_QUALITY);
   ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));
 
-  res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }
+
+  // Flush encoder
+  res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
+                       video.duration(), VPX_DL_GOOD_QUALITY);
+  EXPECT_EQ(VPX_CODEC_OK, res);
+
+  while ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }
+
+  EXPECT_EQ(decoded_frames, 3);
 }
 
 TEST_F(SvcTest, GetLayerResolution) {
@@ -413,6 +444,9 @@
   vpx_codec_destroy(&codec_);
 
   // Second pass encode
+  int decoded_frames = 0;
+  vpx_codec_err_t res_dec;
+  int frame_size;
   codec_enc_.g_pass = VPX_RC_LAST_PASS;
   codec_enc_.rc_twopass_stats_in.buf = &stats_buf[0];
   codec_enc_.rc_twopass_stats_in.sz = stats_buf.size();
@@ -427,12 +461,14 @@
   res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                        video.duration(), VPX_DL_GOOD_QUALITY);
   ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(1, vpx_svc_is_keyframe(&svc_));
 
-  vpx_codec_err_t res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }
 
   // FRAME 1
   video.Next();
@@ -440,12 +476,14 @@
   res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                        video.duration(), VPX_DL_GOOD_QUALITY);
   ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));
 
-  res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }
 
   // FRAME 2
   video.Next();
@@ -453,12 +491,29 @@
   res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                        video.duration(), VPX_DL_GOOD_QUALITY);
   ASSERT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));
 
-  res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
-      vpx_svc_get_frame_size(&svc_));
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }
+
+  // Flush encoder
+  res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
+                       video.duration(), VPX_DL_GOOD_QUALITY);
+  EXPECT_EQ(VPX_CODEC_OK, res);
+
+  while ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
+    res_dec = decoder_->DecodeFrame(
+        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+    ++decoded_frames;
+  }
+
+  EXPECT_EQ(decoded_frames, 3);
 }
 
 }  // namespace

diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 0def69d..bc6f77e 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1

@@ -1,5 +1,9 @@
 d5dfb0151c9051f8c85999255645d7a23916d3c0  hantro_collage_w352h288.yuv
 b87815bf86020c592ccc7a846ba2e28ec8043902  hantro_odd.yuv
+fe346136b9b8c1e6f6084cc106485706915795e4  invalid-vp90-01.webm
+25751f5d3b05ff03f0719ad42cd625348eb8961e  invalid-vp90-01.webm.res
+d78e2fceba5ac942246503ec8366f879c4775ca5  invalid-vp90-02.webm
+2dadee5306245fa5eeb0f99652d0e17afbcba96d  invalid-vp90-02.webm.res
 b1f1c3ec79114b9a0651af24ce634afb44a9a419  rush_hour_444.y4m
 5184c46ddca8b1fadd16742e8500115bc8f749da  vp80-00-comprehensive-001.ivf
 65bf1bbbced81b97bd030f376d1b7f61a224793f  vp80-00-comprehensive-002.ivf
@@ -576,6 +580,8 @@
 54638c38009198c38c8f3b25c182b709b6c1fd2e  vp90-2-09-lf_deltas.webm.md5
 510d95f3beb3b51c572611fdaeeece12277dac30  vp90-2-10-show-existing-frame.webm
 14d631096f4bfa2d71f7f739aec1448fb3c33bad  vp90-2-10-show-existing-frame.webm.md5
+d2feea7728e8d2c615981d0f47427a4a5a45d881  vp90-2-10-show-existing-frame2.webm
+5f7c7811baa3e4f03be1dd78c33971b727846821  vp90-2-10-show-existing-frame2.webm.md5
 b4318e75f73a6a08992c7326de2fb589c2a794c7  vp90-2-11-size-351x287.webm
 b3c48382cf7d0454e83a02497c229d27720f9e20  vp90-2-11-size-351x287.webm.md5
 8e0096475ea2535bac71d3e2fc09e0c451c444df  vp90-2-11-size-351x288.webm
@@ -638,5 +644,5 @@
 e3ab35d4316c5e81325c50f5236ceca4bc0d35df  vp90-2-15-segkey.webm.md5
 9b7ca2cac09d34c4a5d296c1900f93b1e2f69d0d  vp90-2-15-segkey_adpq.webm
 8f46ba5f785d0c2170591a153e0d0d146a7c8090  vp90-2-15-segkey_adpq.webm.md5
-d78e2fceba5ac942246503ec8366f879c4775ca5  vp90-2-15-fuzz-flicker.webm
-bbd7dd15f43a703ff0a332fee4959e7b23bf77dc  vp90-2-15-fuzz-flicker.webm.md5
+76024eb753cdac6a5e5703aaea189d35c3c30ac7  invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038  invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf.res

diff --git a/test/test.mk b/test/test.mk
index c59ae11..e7c4036 100644
--- a/test/test.mk
+++ b/test/test.mk

@@ -30,6 +30,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
 
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
@@ -54,6 +55,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += webm_video_source.h
 endif
 
+LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += invalid_file_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += test_vector_test.cc
 
 # Currently we only support decoder perf tests for vp9. Also they read from WebM
@@ -690,6 +692,8 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame2.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x288.webm
@@ -754,8 +758,14 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-fuzz-flicker.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-fuzz-flicker.webm.md5
+
+# Invalid files for testing libvpx error checking.
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf.res
 
 ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
 # BBB VP9 streams

diff --git a/test/test_vectors.cc b/test/test_vectors.cc
index 3873712..a6d546e 100644
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc

@@ -161,6 +161,7 @@
   "vp90-2-08-tile-4x1.webm", "vp90-2-09-subpixel-00.ivf",
   "vp90-2-02-size-lf-1920x1080.webm", "vp90-2-09-aq2.webm",
   "vp90-2-09-lf_deltas.webm", "vp90-2-10-show-existing-frame.webm",
+  "vp90-2-10-show-existing-frame2.webm",
   "vp90-2-11-size-351x287.webm", "vp90-2-11-size-351x288.webm",
   "vp90-2-11-size-352x287.webm", "vp90-2-12-droppable_1.ivf",
   "vp90-2-12-droppable_2.ivf", "vp90-2-12-droppable_3.ivf",
@@ -179,7 +180,6 @@
   "vp90-2-14-resize-fp-tiles-8-16.webm", "vp90-2-14-resize-fp-tiles-8-1.webm",
   "vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm",
   "vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm",
-  "vp90-2-15-fuzz-flicker.webm"
 };
 const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);
 #endif  // CONFIG_VP9_DECODER

diff --git a/test/user_priv_test.cc b/test/user_priv_test.cc
new file mode 100644
index 0000000..f9aef33
--- /dev/null
+++ b/test/user_priv_test.cc

@@ -0,0 +1,100 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_config.h"
+#include "test/acm_random.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+#include "vpx/vp8.h"
+
+namespace {
+
+using std::string;
+using libvpx_test::ACMRandom;
+
+#if CONFIG_WEBM_IO
+
+void CheckUserPrivateData(void *user_priv, int *target) {
+  // actual pointer value should be the same as expected.
+  EXPECT_EQ(reinterpret_cast<void *>(target), user_priv) <<
+      "user_priv pointer value does not match.";
+}
+
+// Decodes |filename|. Passes in user_priv data when calling DecodeFrame and
+// compares the user_priv from return img with the original user_priv to see if
+// they match. Both the pointer values and the values inside the addresses
+// should match.
+string DecodeFile(const string &filename) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  libvpx_test::WebMVideoSource video(filename);
+  video.Init();
+
+  vpx_codec_dec_cfg_t cfg = {0};
+  libvpx_test::VP9Decoder decoder(cfg, 0);
+
+  libvpx_test::MD5 md5;
+  int frame_num = 0;
+  for (video.Begin(); !::testing::Test::HasFailure() && video.cxdata();
+       video.Next()) {
+    void *user_priv = reinterpret_cast<void *>(&frame_num);
+    const vpx_codec_err_t res =
+        decoder.DecodeFrame(video.cxdata(), video.frame_size(),
+                            (frame_num == 0) ? NULL : user_priv);
+    if (res != VPX_CODEC_OK) {
+      EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+      break;
+    }
+    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
+    const vpx_image_t *img = NULL;
+
+    // Get decompressed data.
+    while ((img = dec_iter.Next())) {
+      if (frame_num == 0) {
+        CheckUserPrivateData(img->user_priv, NULL);
+      } else {
+        CheckUserPrivateData(img->user_priv, &frame_num);
+
+        // Also test ctrl_get_reference api.
+        struct vp9_ref_frame ref;
+        // Randomly fetch a reference frame.
+        ref.idx = rnd.Rand8() % 3;
+        decoder.Control(VP9_GET_REFERENCE, &ref);
+
+        CheckUserPrivateData(ref.img.user_priv, &frame_num);
+      }
+      md5.Add(img);
+    }
+
+    frame_num++;
+  }
+  return string(md5.Get());
+}
+
+TEST(UserPrivTest, VideoDecode) {
+  // no tiles or frame parallel; this exercises the decoding to test the
+  // user_priv.
+  EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc",
+               DecodeFile("vp90-2-03-size-226x226.webm").c_str());
+}
+
+#endif  // CONFIG_WEBM_IO
+
+}  // namespace

diff --git a/third_party/libmkv/EbmlIDs.h b/third_party/libmkv/EbmlIDs.h
new file mode 100644
index 0000000..44d4385
--- /dev/null
+++ b/third_party/libmkv/EbmlIDs.h

@@ -0,0 +1,231 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef MKV_DEFS_HPP
+#define MKV_DEFS_HPP 1
+
+/* Commenting out values not available in webm, but available in matroska */
+
+enum mkv {
+  EBML = 0x1A45DFA3,
+  EBMLVersion = 0x4286,
+  EBMLReadVersion = 0x42F7,
+  EBMLMaxIDLength = 0x42F2,
+  EBMLMaxSizeLength = 0x42F3,
+  DocType = 0x4282,
+  DocTypeVersion = 0x4287,
+  DocTypeReadVersion = 0x4285,
+/* CRC_32 = 0xBF, */
+  Void = 0xEC,
+  SignatureSlot = 0x1B538667,
+  SignatureAlgo = 0x7E8A,
+  SignatureHash = 0x7E9A,
+  SignaturePublicKey = 0x7EA5,
+  Signature = 0x7EB5,
+  SignatureElements = 0x7E5B,
+  SignatureElementList = 0x7E7B,
+  SignedElement = 0x6532,
+  /* segment */
+  Segment = 0x18538067,
+  /* Meta Seek Information */
+  SeekHead = 0x114D9B74,
+  Seek = 0x4DBB,
+  SeekID = 0x53AB,
+  SeekPosition = 0x53AC,
+  /* Segment Information */
+  Info = 0x1549A966,
+/* SegmentUID = 0x73A4, */
+/* SegmentFilename = 0x7384, */
+/* PrevUID = 0x3CB923, */
+/* PrevFilename = 0x3C83AB, */
+/* NextUID = 0x3EB923, */
+/* NextFilename = 0x3E83BB, */
+/* SegmentFamily = 0x4444, */
+/* ChapterTranslate = 0x6924, */
+/* ChapterTranslateEditionUID = 0x69FC, */
+/* ChapterTranslateCodec = 0x69BF, */
+/* ChapterTranslateID = 0x69A5, */
+  TimecodeScale = 0x2AD7B1,
+  Segment_Duration = 0x4489,
+  DateUTC = 0x4461,
+/* Title = 0x7BA9, */
+  MuxingApp = 0x4D80,
+  WritingApp = 0x5741,
+  /* Cluster */
+  Cluster = 0x1F43B675,
+  Timecode = 0xE7,
+/* SilentTracks = 0x5854, */
+/* SilentTrackNumber = 0x58D7, */
+/* Position = 0xA7, */
+  PrevSize = 0xAB,
+  BlockGroup = 0xA0,
+  Block = 0xA1,
+/* BlockVirtual = 0xA2, */
+  BlockAdditions = 0x75A1,
+  BlockMore = 0xA6,
+  BlockAddID = 0xEE,
+  BlockAdditional = 0xA5,
+  BlockDuration = 0x9B,
+/* ReferencePriority = 0xFA, */
+  ReferenceBlock = 0xFB,
+/* ReferenceVirtual = 0xFD, */
+/* CodecState = 0xA4, */
+/* Slices = 0x8E, */
+/* TimeSlice = 0xE8, */
+  LaceNumber = 0xCC,
+/* FrameNumber = 0xCD, */
+/* BlockAdditionID = 0xCB, */
+/* MkvDelay = 0xCE, */
+/* Cluster_Duration = 0xCF, */
+  SimpleBlock = 0xA3,
+/* EncryptedBlock = 0xAF, */
+  /* Track */
+  Tracks = 0x1654AE6B,
+  TrackEntry = 0xAE,
+  TrackNumber = 0xD7,
+  TrackUID = 0x73C5,
+  TrackType = 0x83,
+  FlagEnabled = 0xB9,
+  FlagDefault = 0x88,
+  FlagForced = 0x55AA,
+  FlagLacing = 0x9C,
+/* MinCache = 0x6DE7, */
+/* MaxCache = 0x6DF8, */
+  DefaultDuration = 0x23E383,
+/* TrackTimecodeScale = 0x23314F, */
+/* TrackOffset = 0x537F, */
+  MaxBlockAdditionID = 0x55EE,
+  Name = 0x536E,
+  Language = 0x22B59C,
+  CodecID = 0x86,
+  CodecPrivate = 0x63A2,
+  CodecName = 0x258688,
+/* AttachmentLink = 0x7446, */
+/* CodecSettings = 0x3A9697, */
+/* CodecInfoURL = 0x3B4040, */
+/* CodecDownloadURL = 0x26B240, */
+/* CodecDecodeAll = 0xAA, */
+/* TrackOverlay = 0x6FAB, */
+/* TrackTranslate = 0x6624, */
+/* TrackTranslateEditionUID = 0x66FC, */
+/* TrackTranslateCodec = 0x66BF, */
+/* TrackTranslateTrackID = 0x66A5, */
+  /* video */
+  Video = 0xE0,
+  FlagInterlaced = 0x9A,
+  StereoMode = 0x53B8,
+  AlphaMode = 0x53C0,
+  PixelWidth = 0xB0,
+  PixelHeight = 0xBA,
+  PixelCropBottom = 0x54AA,
+  PixelCropTop = 0x54BB,
+  PixelCropLeft = 0x54CC,
+  PixelCropRight = 0x54DD,
+  DisplayWidth = 0x54B0,
+  DisplayHeight = 0x54BA,
+  DisplayUnit = 0x54B2,
+  AspectRatioType = 0x54B3,
+/* ColourSpace = 0x2EB524, */
+/* GammaValue = 0x2FB523, */
+  FrameRate = 0x2383E3,
+  /* end video */
+  /* audio */
+  Audio = 0xE1,
+  SamplingFrequency = 0xB5,
+  OutputSamplingFrequency = 0x78B5,
+  Channels = 0x9F,
+/* ChannelPositions = 0x7D7B, */
+  BitDepth = 0x6264,
+  /* end audio */
+  /* content encoding */
+/* ContentEncodings = 0x6d80, */
+/* ContentEncoding = 0x6240, */
+/* ContentEncodingOrder = 0x5031, */
+/* ContentEncodingScope = 0x5032, */
+/* ContentEncodingType = 0x5033, */
+/* ContentCompression = 0x5034, */
+/* ContentCompAlgo = 0x4254, */
+/* ContentCompSettings = 0x4255, */
+/* ContentEncryption = 0x5035, */
+/* ContentEncAlgo = 0x47e1, */
+/* ContentEncKeyID = 0x47e2, */
+/* ContentSignature = 0x47e3, */
+/* ContentSigKeyID = 0x47e4, */
+/* ContentSigAlgo = 0x47e5, */
+/* ContentSigHashAlgo = 0x47e6, */
+  /* end content encoding */
+  /* Cueing Data */
+  Cues = 0x1C53BB6B,
+  CuePoint = 0xBB,
+  CueTime = 0xB3,
+  CueTrackPositions = 0xB7,
+  CueTrack = 0xF7,
+  CueClusterPosition = 0xF1,
+  CueBlockNumber = 0x5378
+/* CueCodecState = 0xEA, */
+/* CueReference = 0xDB, */
+/* CueRefTime = 0x96, */
+/* CueRefCluster = 0x97, */
+/* CueRefNumber = 0x535F, */
+/* CueRefCodecState = 0xEB, */
+  /* Attachment */
+/* Attachments = 0x1941A469, */
+/* AttachedFile = 0x61A7, */
+/* FileDescription = 0x467E, */
+/* FileName = 0x466E, */
+/* FileMimeType = 0x4660, */
+/* FileData = 0x465C, */
+/* FileUID = 0x46AE, */
+/* FileReferral = 0x4675, */
+  /* Chapters */
+/* Chapters = 0x1043A770, */
+/* EditionEntry = 0x45B9, */
+/* EditionUID = 0x45BC, */
+/* EditionFlagHidden = 0x45BD, */
+/* EditionFlagDefault = 0x45DB, */
+/* EditionFlagOrdered = 0x45DD, */
+/* ChapterAtom = 0xB6, */
+/* ChapterUID = 0x73C4, */
+/* ChapterTimeStart = 0x91, */
+/* ChapterTimeEnd = 0x92, */
+/* ChapterFlagHidden = 0x98, */
+/* ChapterFlagEnabled = 0x4598, */
+/* ChapterSegmentUID = 0x6E67, */
+/* ChapterSegmentEditionUID = 0x6EBC, */
+/* ChapterPhysicalEquiv = 0x63C3, */
+/* ChapterTrack = 0x8F, */
+/* ChapterTrackNumber = 0x89, */
+/* ChapterDisplay = 0x80, */
+/* ChapString = 0x85, */
+/* ChapLanguage = 0x437C, */
+/* ChapCountry = 0x437E, */
+/* ChapProcess = 0x6944, */
+/* ChapProcessCodecID = 0x6955, */
+/* ChapProcessPrivate = 0x450D, */
+/* ChapProcessCommand = 0x6911, */
+/* ChapProcessTime = 0x6922, */
+/* ChapProcessData = 0x6933, */
+  /* Tagging */
+/* Tags = 0x1254C367, */
+/* Tag = 0x7373, */
+/* Targets = 0x63C0, */
+/* TargetTypeValue = 0x68CA, */
+/* TargetType = 0x63CA, */
+/* Tagging_TrackUID = 0x63C5, */
+/* Tagging_EditionUID = 0x63C9, */
+/* Tagging_ChapterUID = 0x63C4, */
+/* AttachmentUID = 0x63C6, */
+/* SimpleTag = 0x67C8, */
+/* TagName = 0x45A3, */
+/* TagLanguage = 0x447A, */
+/* TagDefault = 0x4484, */
+/* TagString = 0x4487, */
+/* TagBinary = 0x4485, */
+};
+#endif

diff --git a/third_party/libmkv/EbmlWriter.c b/third_party/libmkv/EbmlWriter.c
new file mode 100644
index 0000000..27cfe86
--- /dev/null
+++ b/third_party/libmkv/EbmlWriter.c

@@ -0,0 +1,157 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "EbmlWriter.h"
+#include <stdlib.h>
+#include <wchar.h>
+#include <string.h>
+#include <limits.h>
+#if defined(_MSC_VER)
+#define LITERALU64(n) n
+#else
+#define LITERALU64(n) n##LLU
+#endif
+
+void Ebml_WriteLen(EbmlGlobal *glob, int64_t val) {
+  /* TODO check and make sure we are not > than 0x0100000000000000LLU */
+  unsigned char size = 8; /* size in bytes to output */
+
+  /* mask to compare for byte size */
+  int64_t minVal = 0xff;
+
+  for (size = 1; size < 8; size ++) {
+    if (val < minVal)
+      break;
+
+    minVal = (minVal << 7);
+  }
+
+  val |= (((uint64_t)0x80) << ((size - 1) * 7));
+
+  Ebml_Serialize(glob, (void *) &val, sizeof(val), size);
+}
+
+void Ebml_WriteString(EbmlGlobal *glob, const char *str) {
+  const size_t size_ = strlen(str);
+  const uint64_t  size = size_;
+  Ebml_WriteLen(glob, size);
+  /* TODO: it's not clear from the spec whether the nul terminator
+   * should be serialized too.  For now we omit the null terminator.
+   */
+  Ebml_Write(glob, str, (unsigned long)size);
+}
+
+void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr) {
+  const size_t strlen = wcslen(wstr);
+
+  /* TODO: it's not clear from the spec whether the nul terminator
+   * should be serialized too.  For now we include it.
+   */
+  const uint64_t  size = strlen;
+
+  Ebml_WriteLen(glob, size);
+  Ebml_Write(glob, wstr, (unsigned long)size);
+}
+
+void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id) {
+  int len;
+
+  if (class_id >= 0x01000000)
+    len = 4;
+  else if (class_id >= 0x00010000)
+    len = 3;
+  else if (class_id >= 0x00000100)
+    len = 2;
+  else
+    len = 1;
+
+  Ebml_Serialize(glob, (void *)&class_id, sizeof(class_id), len);
+}
+
+void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui) {
+  unsigned char sizeSerialized = 8 | 0x80;
+  Ebml_WriteID(glob, class_id);
+  Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
+  Ebml_Serialize(glob, &ui, sizeof(ui), 8);
+}
+
+void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui) {
+  unsigned char size = 8; /* size in bytes to output */
+  unsigned char sizeSerialized = 0;
+  unsigned long minVal;
+
+  Ebml_WriteID(glob, class_id);
+  minVal = 0x7fLU; /* mask to compare for byte size */
+
+  for (size = 1; size < 4; size ++) {
+    if (ui < minVal) {
+      break;
+    }
+
+    minVal <<= 7;
+  }
+
+  sizeSerialized = 0x80 | size;
+  Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
+  Ebml_Serialize(glob, &ui, sizeof(ui), size);
+}
+/* TODO: perhaps this is a poor name for this id serializer helper function */
+void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long bin) {
+  int size;
+  for (size = 4; size > 1; size--) {
+    if (bin & (unsigned int)0x000000ff << ((size - 1) * 8))
+      break;
+  }
+  Ebml_WriteID(glob, class_id);
+  Ebml_WriteLen(glob, size);
+  Ebml_WriteID(glob, bin);
+}
+
+void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d) {
+  unsigned char len = 0x88;
+
+  Ebml_WriteID(glob, class_id);
+  Ebml_Serialize(glob, &len, sizeof(len), 1);
+  Ebml_Serialize(glob,  &d, sizeof(d), 8);
+}
+
+void Ebml_WriteSigned16(EbmlGlobal *glob, short val) {
+  signed long out = ((val & 0x003FFFFF) | 0x00200000) << 8;
+  Ebml_Serialize(glob, &out, sizeof(out), 3);
+}
+
+void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s) {
+  Ebml_WriteID(glob, class_id);
+  Ebml_WriteString(glob, s);
+}
+
+void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s) {
+  Ebml_WriteID(glob,  class_id);
+  Ebml_WriteUTF8(glob,  s);
+}
+
+void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length) {
+  Ebml_WriteID(glob, class_id);
+  Ebml_WriteLen(glob, data_length);
+  Ebml_Write(glob,  data, data_length);
+}
+
+void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize) {
+  unsigned char tmp = 0;
+  unsigned long i = 0;
+
+  Ebml_WriteID(glob, 0xEC);
+  Ebml_WriteLen(glob, vSize);
+
+  for (i = 0; i < vSize; i++) {
+    Ebml_Write(glob, &tmp, 1);
+  }
+}
+
+/* TODO Serialize Date */

diff --git a/third_party/libmkv/EbmlWriter.h b/third_party/libmkv/EbmlWriter.h
new file mode 100644
index 0000000..b94f757
--- /dev/null
+++ b/third_party/libmkv/EbmlWriter.h

@@ -0,0 +1,42 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef EBMLWRITER_HPP
+#define EBMLWRITER_HPP
+#include <stddef.h>
+#include "vpx/vpx_integer.h"
+
+/* note: you must define write and serialize functions as well as your own
+ * EBML_GLOBAL
+ *
+ * These functions MUST be implemented
+ */
+
+typedef struct EbmlGlobal EbmlGlobal;
+void  Ebml_Serialize(EbmlGlobal *glob, const void *, int, unsigned long);
+void  Ebml_Write(EbmlGlobal *glob, const void *, unsigned long);
+
+/*****/
+
+void Ebml_WriteLen(EbmlGlobal *glob, int64_t val);
+void Ebml_WriteString(EbmlGlobal *glob, const char *str);
+void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr);
+void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id);
+void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui);
+void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui);
+void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long ui);
+void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d);
+/* TODO make this more generic to signed */
+void Ebml_WriteSigned16(EbmlGlobal *glob, short val);
+void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s);
+void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s);
+void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length);
+void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize);
+/* TODO need date function */
+#endif

diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index ba26939..b013fe5 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl

@@ -463,9 +463,7 @@
 # Quantizer
 #
 add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_regular_quantize_b sse2/;
-# TODO(johann) Update sse4 implementation and re-enable
-#$vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4;
+specialize qw/vp8_regular_quantize_b sse2 sse4_1/;
 
 add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
 specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon_asm/;

diff --git a/vp8/encoder/x86/quantize_sse2.c b/vp8/encoder/x86/quantize_sse2.c
index f495bf2..291d219 100644
--- a/vp8/encoder/x86/quantize_sse2.c
+++ b/vp8/encoder/x86/quantize_sse2.c

@@ -26,11 +26,10 @@
         int cmp = (x[z] < boost) | (y[z] == 0); \
         zbin_boost_ptr++; \
         if (cmp) \
-            goto select_eob_end_##i; \
+            break; \
         qcoeff_ptr[z] = y[z]; \
         eob = i; \
         zbin_boost_ptr = b->zrun_zbin_boost; \
-        select_eob_end_##i:; \
     } while (0)
 
 void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)

diff --git a/vp8/encoder/x86/quantize_sse4.asm b/vp8/encoder/x86/quantize_sse4.asm
deleted file mode 100644
index dbd171b..0000000
--- a/vp8/encoder/x86/quantize_sse4.asm
+++ /dev/null

@@ -1,256 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "vp8_asm_enc_offsets.asm"
-
-
-; void vp8_regular_quantize_b_sse4 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp8_regular_quantize_b_sse4) PRIVATE
-sym(vp8_regular_quantize_b_sse4):
-
-%if ABI_IS_32BIT
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-    push        rdi
-    push        rsi
-
-    ALIGN_STACK 16, rax
-    %define qcoeff      0 ; 32
-    %define stack_size 32
-    sub         rsp, stack_size
-%else
-  %if LIBVPX_YASM_WIN64
-    SAVE_XMM 8, u
-    push        rdi
-    push        rsi
-  %endif
-%endif
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp8_block_coeff]
-    mov         rcx, [rdi + vp8_block_zbin]
-    mov         rdx, [rdi + vp8_block_round]
-    movd        xmm7, [rdi + vp8_block_zbin_extra]
-
-    ; z
-    movdqa      xmm0, [rax]
-    movdqa      xmm1, [rax + 16]
-
-    ; duplicate zbin_oq_value
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7
-
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm1
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm1, 15
-
-    ; (z ^ sz)
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm1
-
-    ; x = abs(z)
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm1
-
-    ; zbin
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm4, xmm7
-    paddw       xmm5, xmm7
-
-    movdqa      xmm6, xmm2
-    movdqa      xmm7, xmm3
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm6, xmm4
-    psubw       xmm7, xmm5
-
-    ; round
-    movdqa      xmm4, [rdx]
-    movdqa      xmm5, [rdx + 16]
-
-    mov         rax, [rdi + vp8_block_quant_shift]
-    mov         rcx, [rdi + vp8_block_quant]
-    mov         rdx, [rdi + vp8_block_zrun_zbin_boost]
-
-    ; x + round
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    ; quant
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm4, xmm2
-    pmulhw      xmm5, xmm3
-
-    ; y += x
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    pxor        xmm4, xmm4
-%if ABI_IS_32BIT
-    movdqa      [rsp + qcoeff], xmm4
-    movdqa      [rsp + qcoeff + 16], xmm4
-%else
-    pxor        xmm8, xmm8
-%endif
-
-    ; quant_shift
-    movdqa      xmm5, [rax]
-
-    ; zrun_zbin_boost
-    mov         rax, rdx
-
-%macro ZIGZAG_LOOP 5
-    ; x
-    pextrw      ecx, %4, %2
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1          ; x < zbin
-
-    pextrw      edi, %3, %2                 ; y
-
-    ; downshift by quant_shift[rc]
-    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1          ; !y
-%if ABI_IS_32BIT
-    mov         WORD PTR[rsp + qcoeff + %1 *2], di
-%else
-    pinsrw      %5, edi, %2                 ; qcoeff[rc]
-%endif
-    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
-ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
-ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
-
-    mov         rcx, [rsi + vp8_blockd_dequant]
-    mov         rdi, [rsi + vp8_blockd_dqcoeff]
-
-%if ABI_IS_32BIT
-    movdqa      xmm4, [rsp + qcoeff]
-    movdqa      xmm5, [rsp + qcoeff + 16]
-%else
-    %define     xmm5 xmm8
-%endif
-
-    ; y ^ sz
-    pxor        xmm4, xmm0
-    pxor        xmm5, xmm1
-    ; x = (y ^ sz) - sz
-    psubw       xmm4, xmm0
-    psubw       xmm5, xmm1
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp8_blockd_qcoeff]
-
-    pmullw      xmm0, xmm4
-    pmullw      xmm1, xmm5
-
-    ; store qcoeff
-    movdqa      [rcx], xmm4
-    movdqa      [rcx + 16], xmm5
-
-    ; store dqcoeff
-    movdqa      [rdi], xmm0
-    movdqa      [rdi + 16], xmm1
-
-    mov         rcx, [rsi + vp8_blockd_eob]
-
-    ; select the last value (in zig_zag order) for EOB
-    pxor        xmm6, xmm6
-    pcmpeqw     xmm4, xmm6
-    pcmpeqw     xmm5, xmm6
-
-    packsswb    xmm4, xmm5
-    pshufb      xmm4, [GLOBAL(zig_zag1d)]
-    pmovmskb    edx, xmm4
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax
-    bsr         eax, edx
-    sub         edi, edx
-    sar         edi, 31
-    add         eax, 1
-    and         eax, edi
-
-    mov         BYTE PTR [rcx], al          ; store eob
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    add         rsp, stack_size
-    pop         rsp
-
-    pop         rsi
-    pop         rdi
-    RESTORE_GOT
-    pop         rbp
-%else
-  %undef xmm5
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-    RESTORE_XMM
-  %endif
-%endif
-
-    ret
-
-SECTION_RODATA
-align 16
-; vp8/common/entropy.c: vp8_default_zig_zag1d
-zig_zag1d:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15

diff --git a/vp8/encoder/x86/quantize_sse4.c b/vp8/encoder/x86/quantize_sse4.c
new file mode 100644
index 0000000..601dd23
--- /dev/null
+++ b/vp8/encoder/x86/quantize_sse4.c

@@ -0,0 +1,128 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./vp8_rtcd.h"
+#include "vp8/encoder/block.h"
+#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
+
+#define SELECT_EOB(i, z, x, y, q) \
+    do { \
+        short boost = *zbin_boost_ptr; \
+        short x_z = _mm_extract_epi16(x, z); \
+        short y_z = _mm_extract_epi16(y, z); \
+        int cmp = (x_z < boost) | (y_z == 0); \
+        zbin_boost_ptr++; \
+        if (cmp) \
+            break; \
+        q = _mm_insert_epi16(q, y_z, z); \
+        eob = i; \
+        zbin_boost_ptr = b->zrun_zbin_boost; \
+    } while (0)
+
+void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
+    char eob = 0;
+    short *zbin_boost_ptr  = b->zrun_zbin_boost;
+
+    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1,
+            dqcoeff0, dqcoeff1;
+    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
+    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
+    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8));
+    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
+    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
+    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
+    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
+    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
+    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+    __m128i qcoeff0 = _mm_setzero_si128();
+    __m128i qcoeff1 = _mm_setzero_si128();
+
+    /* Duplicate to all lanes. */
+    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
+    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
+
+    /* Sign of z: z >> 15 */
+    sz0 = _mm_srai_epi16(z0, 15);
+    sz1 = _mm_srai_epi16(z1, 15);
+
+    /* x = abs(z): (z ^ sz) - sz */
+    x0 = _mm_xor_si128(z0, sz0);
+    x1 = _mm_xor_si128(z1, sz1);
+    x0 = _mm_sub_epi16(x0, sz0);
+    x1 = _mm_sub_epi16(x1, sz1);
+
+    /* zbin[] + zbin_extra */
+    zbin0 = _mm_add_epi16(zbin0, zbin_extra);
+    zbin1 = _mm_add_epi16(zbin1, zbin_extra);
+
+    /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
+     * the equation because boost is the only value which can change:
+     * x - (zbin[] + extra) >= boost */
+    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
+    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);
+
+    /* All the remaining calculations are valid whether they are done now with
+     * simd or later inside the loop one at a time. */
+    x0 = _mm_add_epi16(x0, round0);
+    x1 = _mm_add_epi16(x1, round1);
+
+    y0 = _mm_mulhi_epi16(x0, quant0);
+    y1 = _mm_mulhi_epi16(x1, quant1);
+
+    y0 = _mm_add_epi16(y0, x0);
+    y1 = _mm_add_epi16(y1, x1);
+
+    /* Instead of shifting each value independently we convert the scaling
+     * factor with 1 << (16 - shift) so we can use multiply/return high half. */
+    y0 = _mm_mulhi_epi16(y0, quant_shift0);
+    y1 = _mm_mulhi_epi16(y1, quant_shift1);
+
+    /* Return the sign: (y ^ sz) - sz */
+    y0 = _mm_xor_si128(y0, sz0);
+    y1 = _mm_xor_si128(y1, sz1);
+    y0 = _mm_sub_epi16(y0, sz0);
+    y1 = _mm_sub_epi16(y1, sz1);
+
+    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
+    SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(2, 1, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(3, 4, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(4, 0, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(5, 5, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(6, 2, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(7, 3, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(8, 6, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(9, 1, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(10, 4, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(11, 5, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(12, 2, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(13, 7, x_minus_zbin0, y0, qcoeff0);
+    SELECT_EOB(14, 3, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(15, 6, x_minus_zbin1, y1, qcoeff1);
+    SELECT_EOB(16, 7, x_minus_zbin1, y1, qcoeff1);
+
+    _mm_store_si128((__m128i *)(d->qcoeff), qcoeff0);
+    _mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1);
+
+    dqcoeff0 = _mm_mullo_epi16(qcoeff0, dequant0);
+    dqcoeff1 = _mm_mullo_epi16(qcoeff1, dequant1);
+
+    _mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0);
+    _mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1);
+
+    *d->eob = eob;
+}

diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index b7b948a..a0dbdcf 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk

@@ -89,6 +89,7 @@
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c
+VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.c
 
 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
@@ -97,7 +98,6 @@
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
-VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
 VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt_x86_64.asm

diff --git a/vp9/common/arm/neon/vp9_convolve_neon.c b/vp9/common/arm/neon/vp9_convolve_neon.c
index d8b24bf..f0881b5 100644
--- a/vp9/common/arm/neon/vp9_convolve_neon.c
+++ b/vp9/common/arm/neon/vp9_convolve_neon.c

@@ -25,12 +25,14 @@
   // Account for the vertical phase needing 3 lines prior and 4 lines post
   int intermediate_height = h + 7;
 
-  if (x_step_q4 != 16 || y_step_q4 != 16)
-    return vp9_convolve8_c(src, src_stride,
-                           dst, dst_stride,
-                           filter_x, x_step_q4,
-                           filter_y, y_step_q4,
-                           w, h);
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vp9_convolve8_c(src, src_stride,
+                    dst, dst_stride,
+                    filter_x, x_step_q4,
+                    filter_y, y_step_q4,
+                    w, h);
+    return;
+  }
 
   /* Filter starting 3 lines back. The neon implementation will ignore the
    * given height and filter a multiple of 4 lines. Since this goes in to
@@ -57,12 +59,14 @@
   DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
   int intermediate_height = h + 7;
 
-  if (x_step_q4 != 16 || y_step_q4 != 16)
-    return vp9_convolve8_avg_c(src, src_stride,
-                               dst, dst_stride,
-                               filter_x, x_step_q4,
-                               filter_y, y_step_q4,
-                               w, h);
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vp9_convolve8_avg_c(src, src_stride,
+                        dst, dst_stride,
+                        filter_x, x_step_q4,
+                        filter_y, y_step_q4,
+                        w, h);
+    return;
+  }
 
   /* This implementation has the same issues as above. In addition, we only want
    * to average the values after both passes.

diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
index 0820db2..bc6a17c 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c

@@ -9,6 +9,7 @@
  */
 
 #include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
 
 void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,

diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index e56a0b7..2386b13 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c

@@ -109,7 +109,9 @@
   }
 
   vp9_free_frame_buffer(&cm->post_proc_buffer);
+}
 
+void vp9_free_context_buffers(VP9_COMMON *cm) {
   free_mi(cm);
 
   vpx_free(cm->last_frame_seg_map);
@@ -165,37 +167,55 @@
 
  fail:
   vp9_free_frame_buffers(cm);
+  vp9_free_context_buffers(cm);
   return 1;
 }
 
+static void init_frame_bufs(VP9_COMMON *cm) {
+  int i;
+
+  cm->new_fb_idx = FRAME_BUFFERS - 1;
+  cm->frame_bufs[cm->new_fb_idx].ref_count = 1;
+
+  for (i = 0; i < REF_FRAMES; ++i) {
+    cm->ref_frame_map[i] = i;
+    cm->frame_bufs[i].ref_count = 1;
+  }
+}
+
 int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
-  const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
-  const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
+  int i;
   const int ss_x = cm->subsampling_x;
   const int ss_y = cm->subsampling_y;
-  int i;
 
   vp9_free_frame_buffers(cm);
 
-  for (i = 0; i < FRAME_BUFFERS; i++) {
+  for (i = 0; i < FRAME_BUFFERS; ++i) {
     cm->frame_bufs[i].ref_count = 0;
     if (vp9_alloc_frame_buffer(&cm->frame_bufs[i].buf, width, height,
                                ss_x, ss_y, VP9_ENC_BORDER_IN_PIXELS) < 0)
       goto fail;
   }
 
-  cm->new_fb_idx = FRAME_BUFFERS - 1;
-  cm->frame_bufs[cm->new_fb_idx].ref_count = 1;
-
-  for (i = 0; i < REF_FRAMES; i++) {
-    cm->ref_frame_map[i] = i;
-    cm->frame_bufs[i].ref_count = 1;
-  }
+  init_frame_bufs(cm);
 
   if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
                              VP9_ENC_BORDER_IN_PIXELS) < 0)
     goto fail;
 
+  return 0;
+
+ fail:
+  vp9_free_frame_buffers(cm);
+  return 1;
+}
+
+int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
+  const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
+  const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
+
+  vp9_free_context_buffers(cm);
+
   set_mb_mi(cm, aligned_width, aligned_height);
 
   if (alloc_mi(cm, cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE)))
@@ -224,12 +244,13 @@
   return 0;
 
  fail:
-  vp9_free_frame_buffers(cm);
+  vp9_free_context_buffers(cm);
   return 1;
 }
 
 void vp9_remove_common(VP9_COMMON *cm) {
   vp9_free_frame_buffers(cm);
+  vp9_free_context_buffers(cm);
   vp9_free_internal_frame_buffers(&cm->int_frame_buffers);
 }
 

diff --git a/vp9/common/vp9_alloccommon.h b/vp9/common/vp9_alloccommon.h
index 06636a9..c4b1b8d 100644
--- a/vp9/common/vp9_alloccommon.h
+++ b/vp9/common/vp9_alloccommon.h

@@ -23,8 +23,12 @@
 int vp9_resize_frame_buffers(struct VP9Common *cm, int width, int height);
 
 int vp9_alloc_frame_buffers(struct VP9Common *cm, int width, int height);
+int vp9_alloc_state_buffers(struct VP9Common *cm, int width, int height);
+int vp9_alloc_context_buffers(struct VP9Common *cm, int width, int height);
 
 void vp9_free_frame_buffers(struct VP9Common *cm);
+void vp9_free_state_buffers(struct VP9Common *cm);
+void vp9_free_context_buffers(struct VP9Common *cm);
 
 void vp9_update_frame_size(struct VP9Common *cm);
 

diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c
index 1a8c49d..d8aaf32 100644
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c

@@ -117,17 +117,25 @@
                      const InterpKernel *const y_filters,
                      int y0_q4, int y_step_q4,
                      int w, int h) {
-  // Fixed size intermediate buffer places limits on parameters.
-  // Maximum intermediate_height is 324, for y_step_q4 == 80,
-  // h == 64, taps == 8.
-  // y_step_q4 of 80 allows for 1/10 scale for 5 layer svc
-  uint8_t temp[64 * 324];
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  uint8_t temp[135 * 64];
   int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + SUBPEL_TAPS;
 
   assert(w <= 64);
   assert(h <= 64);
-  assert(y_step_q4 <= 80);
-  assert(x_step_q4 <= 80);
+  assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);
 
   if (intermediate_height < h)
     intermediate_height = h;

diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index 61682c4..0fe58c5 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c

@@ -11,181 +11,6 @@
 
 #include "vp9/common/vp9_mvref_common.h"
 
-#define MVREF_NEIGHBOURS 8
-
-typedef struct position {
-  int row;
-  int col;
-} POSITION;
-
-typedef enum {
-  BOTH_ZERO = 0,
-  ZERO_PLUS_PREDICTED = 1,
-  BOTH_PREDICTED = 2,
-  NEW_PLUS_NON_INTRA = 3,
-  BOTH_NEW = 4,
-  INTRA_PLUS_NON_INTRA = 5,
-  BOTH_INTRA = 6,
-  INVALID_CASE = 9
-} motion_vector_context;
-
-// This is used to figure out a context for the ref blocks. The code flattens
-// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by
-// adding 9 for each intra block, 3 for each zero mv and 1 for each new
-// motion vector. This single number is then converted into a context
-// with a single lookup ( counter_to_context ).
-static const int mode_2_counter[MB_MODE_COUNT] = {
-  9,  // DC_PRED
-  9,  // V_PRED
-  9,  // H_PRED
-  9,  // D45_PRED
-  9,  // D135_PRED
-  9,  // D117_PRED
-  9,  // D153_PRED
-  9,  // D207_PRED
-  9,  // D63_PRED
-  9,  // TM_PRED
-  0,  // NEARESTMV
-  0,  // NEARMV
-  3,  // ZEROMV
-  1,  // NEWMV
-};
-
-// There are 3^3 different combinations of 3 counts that can be either 0,1 or
-// 2. However the actual count can never be greater than 2 so the highest
-// counter we need is 18. 9 is an invalid counter that's never used.
-static const int counter_to_context[19] = {
-  BOTH_PREDICTED,  // 0
-  NEW_PLUS_NON_INTRA,  // 1
-  BOTH_NEW,  // 2
-  ZERO_PLUS_PREDICTED,  // 3
-  NEW_PLUS_NON_INTRA,  // 4
-  INVALID_CASE,  // 5
-  BOTH_ZERO,  // 6
-  INVALID_CASE,  // 7
-  INVALID_CASE,  // 8
-  INTRA_PLUS_NON_INTRA,  // 9
-  INTRA_PLUS_NON_INTRA,  // 10
-  INVALID_CASE,  // 11
-  INTRA_PLUS_NON_INTRA,  // 12
-  INVALID_CASE,  // 13
-  INVALID_CASE,  // 14
-  INVALID_CASE,  // 15
-  INVALID_CASE,  // 16
-  INVALID_CASE,  // 17
-  BOTH_INTRA  // 18
-};
-
-static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = {
-  // 4X4
-  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
-  // 4X8
-  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
-  // 8X4
-  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
-  // 8X8
-  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
-  // 8X16
-  {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}},
-  // 16X8
-  {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}},
-  // 16X16
-  {{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
-  // 16X32
-  {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}},
-  // 32X16
-  {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
-  // 32X32
-  {{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
-  // 32X64
-  {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}},
-  // 64X32
-  {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
-  // 64X64
-  {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}}
-};
-
-static const int idx_n_column_to_subblock[4][2] = {
-  {1, 2},
-  {1, 3},
-  {3, 2},
-  {3, 3}
-};
-
-// clamp_mv_ref
-#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
-
-static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
-  clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
-               xd->mb_to_right_edge + MV_BORDER,
-               xd->mb_to_top_edge - MV_BORDER,
-               xd->mb_to_bottom_edge + MV_BORDER);
-}
-
-// This function returns either the appropriate sub block or block's mv
-// on whether the block_size < 8x8 and we have check_sub_blocks set.
-static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
-                                      int search_col, int block_idx) {
-  return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
-          ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
-              .as_mv[which_mv]
-          : candidate->mbmi.mv[which_mv];
-}
-
-
-// Performs mv sign inversion if indicated by the reference frame combination.
-static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
-                              const MV_REFERENCE_FRAME this_ref_frame,
-                              const int *ref_sign_bias) {
-  int_mv mv = mbmi->mv[ref];
-  if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
-    mv.as_mv.row *= -1;
-    mv.as_mv.col *= -1;
-  }
-  return mv;
-}
-
-// This macro is used to add a motion vector mv_ref list if it isn't
-// already in the list.  If it's the second motion vector it will also
-// skip all additional processing and jump to done!
-#define ADD_MV_REF_LIST(mv) \
-  do { \
-    if (refmv_count) { \
-      if ((mv).as_int != mv_ref_list[0].as_int) { \
-        mv_ref_list[refmv_count] = (mv); \
-        goto Done; \
-      } \
-    } else { \
-      mv_ref_list[refmv_count++] = (mv); \
-    } \
-  } while (0)
-
-// If either reference frame is different, not INTRA, and they
-// are different from each other scale and add the mv to our list.
-#define IF_DIFF_REF_FRAME_ADD_MV(mbmi) \
-  do { \
-    if (is_inter_block(mbmi)) { \
-      if ((mbmi)->ref_frame[0] != ref_frame) \
-        ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias)); \
-      if (has_second_ref(mbmi) && \
-          (mbmi)->ref_frame[1] != ref_frame && \
-          (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \
-        ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias)); \
-    } \
-  } while (0)
-
-
-// Checks that the given mi_row, mi_col and search point
-// are inside the borders of the tile.
-static INLINE int is_inside(const TileInfo *const tile,
-                            int mi_col, int mi_row, int mi_rows,
-                            const POSITION *mi_pos) {
-  return !(mi_row + mi_pos->row < 0 ||
-           mi_col + mi_pos->col < tile->mi_col_start ||
-           mi_row + mi_pos->row >= mi_rows ||
-           mi_col + mi_pos->col >= tile->mi_col_end);
-}
-
 // This function searches the neighbourhood of a given MB/SB
 // to try and find candidate reference vectors.
 static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,

diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h
index 903ac02..7bce3fa 100644
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h

@@ -21,6 +21,181 @@
 #define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS -\
                                 VP9_INTERP_EXTEND) << 3)
 
+#define MVREF_NEIGHBOURS 8
+
+typedef struct position {
+  int row;
+  int col;
+} POSITION;
+
+typedef enum {
+  BOTH_ZERO = 0,
+  ZERO_PLUS_PREDICTED = 1,
+  BOTH_PREDICTED = 2,
+  NEW_PLUS_NON_INTRA = 3,
+  BOTH_NEW = 4,
+  INTRA_PLUS_NON_INTRA = 5,
+  BOTH_INTRA = 6,
+  INVALID_CASE = 9
+} motion_vector_context;
+
+// This is used to figure out a context for the ref blocks. The code flattens
+// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by
+// adding 9 for each intra block, 3 for each zero mv and 1 for each new
+// motion vector. This single number is then converted into a context
+// with a single lookup ( counter_to_context ).
+static const int mode_2_counter[MB_MODE_COUNT] = {
+  9,  // DC_PRED
+  9,  // V_PRED
+  9,  // H_PRED
+  9,  // D45_PRED
+  9,  // D135_PRED
+  9,  // D117_PRED
+  9,  // D153_PRED
+  9,  // D207_PRED
+  9,  // D63_PRED
+  9,  // TM_PRED
+  0,  // NEARESTMV
+  0,  // NEARMV
+  3,  // ZEROMV
+  1,  // NEWMV
+};
+
+// There are 3^3 different combinations of 3 counts that can be either 0,1 or
+// 2. However the actual count can never be greater than 2 so the highest
+// counter we need is 18. 9 is an invalid counter that's never used.
+static const int counter_to_context[19] = {
+  BOTH_PREDICTED,  // 0
+  NEW_PLUS_NON_INTRA,  // 1
+  BOTH_NEW,  // 2
+  ZERO_PLUS_PREDICTED,  // 3
+  NEW_PLUS_NON_INTRA,  // 4
+  INVALID_CASE,  // 5
+  BOTH_ZERO,  // 6
+  INVALID_CASE,  // 7
+  INVALID_CASE,  // 8
+  INTRA_PLUS_NON_INTRA,  // 9
+  INTRA_PLUS_NON_INTRA,  // 10
+  INVALID_CASE,  // 11
+  INTRA_PLUS_NON_INTRA,  // 12
+  INVALID_CASE,  // 13
+  INVALID_CASE,  // 14
+  INVALID_CASE,  // 15
+  INVALID_CASE,  // 16
+  INVALID_CASE,  // 17
+  BOTH_INTRA  // 18
+};
+
+static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = {
+  // 4X4
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 4X8
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 8X4
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 8X8
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 8X16
+  {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}},
+  // 16X8
+  {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}},
+  // 16X16
+  {{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // 16X32
+  {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}},
+  // 32X16
+  {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // 32X32
+  {{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // 32X64
+  {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}},
+  // 64X32
+  {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
+  // 64X64
+  {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}}
+};
+
+static const int idx_n_column_to_subblock[4][2] = {
+  {1, 2},
+  {1, 3},
+  {3, 2},
+  {3, 3}
+};
+
+// clamp_mv_ref
+#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
+
+static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
+  clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
+               xd->mb_to_right_edge + MV_BORDER,
+               xd->mb_to_top_edge - MV_BORDER,
+               xd->mb_to_bottom_edge + MV_BORDER);
+}
+
+// This function returns either the appropriate sub block or block's mv
+// on whether the block_size < 8x8 and we have check_sub_blocks set.
+static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
+                                      int search_col, int block_idx) {
+  return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
+          ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
+              .as_mv[which_mv]
+          : candidate->mbmi.mv[which_mv];
+}
+
+
+// Performs mv sign inversion if indicated by the reference frame combination.
+static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
+                              const MV_REFERENCE_FRAME this_ref_frame,
+                              const int *ref_sign_bias) {
+  int_mv mv = mbmi->mv[ref];
+  if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
+    mv.as_mv.row *= -1;
+    mv.as_mv.col *= -1;
+  }
+  return mv;
+}
+
+// This macro is used to add a motion vector mv_ref list if it isn't
+// already in the list.  If it's the second motion vector it will also
+// skip all additional processing and jump to done!
+#define ADD_MV_REF_LIST(mv) \
+  do { \
+    if (refmv_count) { \
+      if ((mv).as_int != mv_ref_list[0].as_int) { \
+        mv_ref_list[refmv_count] = (mv); \
+        goto Done; \
+      } \
+    } else { \
+      mv_ref_list[refmv_count++] = (mv); \
+    } \
+  } while (0)
+
+// If either reference frame is different, not INTRA, and they
+// are different from each other scale and add the mv to our list.
+#define IF_DIFF_REF_FRAME_ADD_MV(mbmi) \
+  do { \
+    if (is_inter_block(mbmi)) { \
+      if ((mbmi)->ref_frame[0] != ref_frame) \
+        ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias)); \
+      if (has_second_ref(mbmi) && \
+          (mbmi)->ref_frame[1] != ref_frame && \
+          (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \
+        ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias)); \
+    } \
+  } while (0)
+
+
+// Checks that the given mi_row, mi_col and search point
+// are inside the borders of the tile.
+static INLINE int is_inside(const TileInfo *const tile,
+                            int mi_col, int mi_row, int mi_rows,
+                            const POSITION *mi_pos) {
+  return !(mi_row + mi_pos->row < 0 ||
+           mi_col + mi_pos->col < tile->mi_col_start ||
+           mi_row + mi_pos->row >= mi_rows ||
+           mi_col + mi_pos->col >= tile->mi_col_end);
+}
+
 // TODO(jingning): this mv clamping function should be block size dependent.
 static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
   clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,

diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c
index def1255..3332e58 100644
--- a/vp9/common/vp9_quant_common.c
+++ b/vp9/common/vp9_quant_common.c

@@ -12,7 +12,6 @@
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_seg_common.h"
 
-#if 1
 static const int16_t dc_qlookup[QINDEX_RANGE] = {
   4,       8,    8,    9,   10,   11,   12,   12,
   13,     14,   15,   16,   17,   18,   19,   19,
@@ -83,44 +82,6 @@
   1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
 };
 
-void vp9_init_quant_tables(void) { }
-#else
-static int16_t dc_qlookup[QINDEX_RANGE];
-static int16_t ac_qlookup[QINDEX_RANGE];
-
-#define ACDC_MIN 8
-
-// TODO(dkovalev) move to common and reuse
-static double poly3(double a, double b, double c, double d, double x) {
-  return a*x*x*x + b*x*x + c*x + d;
-}
-
-void vp9_init_quant_tables() {
-  int i, val = 4;
-
-  // A "real" q of 1.0 forces lossless mode.
-  // In practice non lossless Q's between 1.0 and 2.0 (represented here by
-  // integer values from 5-7 give poor rd results (lower psnr and often
-  // larger size than the lossless encode. To block out those "not very useful"
-  // values we increment the ac and dc q lookup values by 4 after position 0.
-  ac_qlookup[0] = val;
-  dc_qlookup[0] = val;
-  val += 4;
-
-  for (i = 1; i < QINDEX_RANGE; i++) {
-    const int ac_val = val;
-
-    val = (int)(val * 1.01975);
-    if (val == ac_val)
-      ++val;
-
-    ac_qlookup[i] = (int16_t)ac_val;
-    dc_qlookup[i] = (int16_t)MAX(ACDC_MIN, poly3(0.000000305, -0.00065, 0.9,
-                                                 0.5, ac_val));
-  }
-}
-#endif
-
 int16_t vp9_dc_quant(int qindex, int delta) {
   return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
 }

diff --git a/vp9/common/vp9_quant_common.h b/vp9/common/vp9_quant_common.h
index 5811040..d1545d9 100644
--- a/vp9/common/vp9_quant_common.h
+++ b/vp9/common/vp9_quant_common.h

@@ -22,8 +22,6 @@
 #define QINDEX_RANGE (MAXQ - MINQ + 1)
 #define QINDEX_BITS 8
 
-void vp9_init_quant_tables();
-
 int16_t vp9_dc_quant(int qindex, int delta);
 int16_t vp9_ac_quant(int qindex, int delta);
 

diff --git a/vp9/common/vp9_scale.h b/vp9/common/vp9_scale.h
index a9dda18..04aae65 100644
--- a/vp9/common/vp9_scale.h
+++ b/vp9/common/vp9_scale.h

@@ -46,8 +46,8 @@
 }
 
 static INLINE int vp9_is_scaled(const struct scale_factors *sf) {
-  return sf->x_scale_fp != REF_NO_SCALE ||
-         sf->y_scale_fp != REF_NO_SCALE;
+  return vp9_is_valid_scale(sf) &&
+         (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
 }
 
 #ifdef __cplusplus

diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index fc70035..9220a9e 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c

@@ -685,6 +685,10 @@
   while (max_ones-- && vp9_rb_read_bit(rb))
     cm->log2_tile_cols++;
 
+  if (cm->log2_tile_cols > 6)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Invalid number of tile columns");
+
   // rows
   cm->log2_tile_rows = vp9_rb_read_bit(rb);
   if (cm->log2_tile_rows)
@@ -1077,7 +1081,7 @@
     // Show an existing frame directly.
     const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)];
 
-    if (cm->frame_bufs[frame_to_show].ref_count < 1)
+    if (frame_to_show < 0 || cm->frame_bufs[frame_to_show].ref_count < 1)
       vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                          "Buffer %d does not contain a decoded frame",
                          frame_to_show);

diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 5859859..245c5f1 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c

@@ -37,7 +37,6 @@
 
   if (!init_done) {
     vp9_init_neighbors();
-    vp9_init_quant_tables();
     init_done = 1;
   }
 }
@@ -244,8 +243,8 @@
 
   // Check if the previous frame was a frame without any references to it.
   // Release frame buffer if not decoding in frame parallel mode.
-  if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0 &&
-         cm->frame_bufs[cm->new_fb_idx].ref_count == 0)
+  if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0
+      && cm->frame_bufs[cm->new_fb_idx].ref_count == 0)
     cm->release_fb_cb(cm->cb_priv,
                       &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer);
   cm->new_fb_idx = get_free_fb(cm);
@@ -260,10 +259,10 @@
     // TODO(jkoleszar): Error concealment is undefined and non-normative
     // at this point, but if it becomes so, [0] may not always be the correct
     // thing to do here.
-    if (cm->frame_refs[0].idx != INT_MAX)
+    if (cm->frame_refs[0].idx != INT_MAX && cm->frame_refs[0].buf != NULL)
       cm->frame_refs[0].buf->corrupted = 1;
 
-    if (cm->frame_bufs[cm->new_fb_idx].ref_count > 0)
+    if (cm->new_fb_idx > 0 && cm->frame_bufs[cm->new_fb_idx].ref_count > 0)
       cm->frame_bufs[cm->new_fb_idx].ref_count--;
 
     return -1;

diff --git a/vp9/decoder/vp9_dthread.h b/vp9/decoder/vp9_dthread.h
index a727e2a..01c07f1 100644
--- a/vp9/decoder/vp9_dthread.h
+++ b/vp9/decoder/vp9_dthread.h

@@ -40,6 +40,23 @@
   int sync_range;
 } VP9LfSync;
 
+// WorkerData for the FrameWorker thread. It contains all the information of
+// the worker and decode structures for decoding a frame.
+typedef struct FrameWorkerData {
+  struct VP9Decoder *pbi;
+  const uint8_t *data;
+  const uint8_t *data_end;
+  size_t data_size;
+  void *user_priv;
+  int result;
+  int worker_id;
+
+  // scratch_buffer is used in frame parallel mode only.
+  // It is used to make a copy of the compressed data.
+  uint8_t *scratch_buffer;
+  size_t scratch_buffer_size;
+} FrameWorkerData;
+
 // Allocate memory for loopfilter row synchronization.
 void vp9_loop_filter_alloc(struct VP9Common *cm, VP9LfSync *lf_sync,
                            int rows, int width);

diff --git a/vp9/decoder/vp9_read_bit_buffer.c b/vp9/decoder/vp9_read_bit_buffer.c
index 778a635..3eef728 100644
--- a/vp9/decoder/vp9_read_bit_buffer.c
+++ b/vp9/decoder/vp9_read_bit_buffer.c

@@ -10,7 +10,7 @@
 #include "vp9/decoder/vp9_read_bit_buffer.h"
 
 size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) {
-  return rb->bit_offset / CHAR_BIT + (rb->bit_offset % CHAR_BIT > 0);
+  return (rb->bit_offset + CHAR_BIT - 1) / CHAR_BIT;
 }
 
 int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) {

diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 76f5e7b..1bf826a 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c

@@ -890,14 +890,8 @@
 }
 
 static int get_refresh_mask(VP9_COMP *cpi) {
-    // Should the GF or ARF be updated using the transmitted frame or buffer
-#if CONFIG_MULTIPLE_ARF
-    if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame &&
-        !cpi->refresh_alt_ref_frame) {
-#else
-    if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame &&
-        !cpi->use_svc) {
-#endif
+    if (!cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
+        cpi->rc.is_src_frame_alt_ref && !cpi->use_svc) {
       // Preserve the previously existing golden frame and update the frame in
       // the alt ref slot instead. This is highly specific to the use of
       // alt-ref as a forward reference, and this needs to be generalized as
@@ -910,15 +904,10 @@
              (cpi->refresh_golden_frame << cpi->alt_fb_idx);
     } else {
       int arf_idx = cpi->alt_fb_idx;
-#if CONFIG_MULTIPLE_ARF
-      // Determine which ARF buffer to use to encode this ARF frame.
-      if (cpi->multi_arf_enabled) {
-        int sn = cpi->sequence_number;
-        arf_idx = (cpi->frame_coding_order[sn] < 0) ?
-            cpi->arf_buffer_idx[sn + 1] :
-            cpi->arf_buffer_idx[sn];
+      if ((cpi->pass == 2) && cpi->multi_arf_allowed) {
+        const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+        arf_idx = gf_group->arf_update_idx[gf_group->index];
       }
-#endif
       return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
              (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
              (cpi->refresh_alt_ref_frame << arf_idx);

diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 2463ed0..454d0da 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h

@@ -93,8 +93,6 @@
 
   int encode_breakout;
 
-  int in_active_map;
-
   // note that token_costs is the cost when eob node is skipped
   vp9_coeff_cost token_costs[TX_SIZES];
 

diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 687b4c2..ff54033 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c

@@ -8,10 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <stdio.h>
-#include <stdint.h>
-#include "vp9/encoder/vp9_denoiser.h"
+#include <assert.h>
 #include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_denoiser.h"
 
 static const int widths[]  = {4, 4, 8, 8,  8, 16, 16, 16, 32, 32, 32, 64, 64};
 static const int heights[] = {4, 8, 4, 8, 16,  8, 16, 32, 16, 32, 64, 32, 64};
@@ -20,30 +20,180 @@
   return 0;
 }
 
-void vp9_denoiser_denoise(VP9_DENOISER *denoiser,
-                          MACROBLOCK *mb, MODE_INFO **grid,
+static int update_running_avg(const uint8_t *mc_avg, int mc_avg_stride,
+                              uint8_t *avg, int avg_stride,
+                              const uint8_t *sig, int sig_stride,
+                              int increase_denoising, BLOCK_SIZE bs) {
+  int r, c;
+  int diff, adj, absdiff;
+  int shift_inc1 = 0, shift_inc2 = 1;
+  int adj_val[] = {3, 4, 6};
+  int total_adj = 0;
+
+  if (increase_denoising) {
+    shift_inc1 = 1;
+    shift_inc2 = 2;
+  }
+
+  for (r = 0; r < heights[bs]; ++r) {
+    for (c = 0; c < widths[bs]; ++c) {
+      diff = mc_avg[c] - sig[c];
+      absdiff = abs(diff);
+
+      if (absdiff <= 3 + shift_inc1) {
+        avg[c] = mc_avg[c];
+        total_adj += diff;
+      } else {
+        switch (absdiff) {
+          case 4: case 5: case 6: case 7:
+            adj = adj_val[0];
+            break;
+          case 8: case 9: case 10: case 11:
+          case 12: case 13: case 14: case 15:
+            adj = adj_val[1];
+            break;
+          default:
+            adj = adj_val[2];
+        }
+        if (diff > 0) {
+          avg[c] = MIN(UINT8_MAX, sig[c] + adj);
+          total_adj += adj;
+        } else {
+          avg[c] = MAX(0, sig[c] - adj);
+          total_adj -= adj;
+        }
+      }
+    }
+    sig += sig_stride;
+    avg += avg_stride;
+    mc_avg += mc_avg_stride;
+  }
+  return total_adj;
+}
+
+static uint8_t *block_start(uint8_t *framebuf, int stride,
+                            int mi_row, int mi_col) {
+  return framebuf + (stride * mi_row * 8) + (mi_col * 8);
+}
+
+void copy_block(uint8_t *dest, int dest_stride,
+                uint8_t *src, int src_stride, BLOCK_SIZE bs) {
+  int r, c;
+  for (r = 0; r < heights[bs]; ++r) {
+    for (c = 0; c < widths[bs]; ++c) {
+      dest[c] = src[c];
+    }
+    dest += dest_stride;
+    src += src_stride;
+  }
+}
+
+void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
                           int mi_row, int mi_col, BLOCK_SIZE bs) {
-  return;
+  int decision = COPY_BLOCK;
+
+  YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
+  YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
+  uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col);
+  uint8_t *mc_avg_start = block_start(mc_avg.y_buffer, mc_avg.y_stride,
+                                          mi_row, mi_col);
+  struct buf_2d src = mb->plane[0].src;
+
+
+  update_running_avg(mc_avg_start, mc_avg.y_stride, avg_start, avg.y_stride,
+                     mb->plane[0].src.buf, mb->plane[0].src.stride, 0, bs);
+
+  if (decision == FILTER_BLOCK) {
+    // TODO(tkopp)
+  }
+  if (decision == COPY_BLOCK) {
+    copy_block(avg_start, avg.y_stride, src.buf, src.stride, bs);
+  }
+}
+
+static void copy_frame(YV12_BUFFER_CONFIG dest, const YV12_BUFFER_CONFIG src) {
+  int r, c;
+  const uint8_t *srcbuf = src.y_buffer;
+  uint8_t *destbuf = dest.y_buffer;
+  assert(dest.y_width == src.y_width);
+  assert(dest.y_height == src.y_height);
+
+  for (r = 0; r < dest.y_height; ++r) {
+    for (c = 0; c < dest.y_width; ++c) {
+      destbuf[c] = srcbuf[c];
+    }
+    destbuf += dest.y_stride;
+    srcbuf += src.y_stride;
+  }
 }
 
 void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
+                                    YV12_BUFFER_CONFIG src,
                                     FRAME_TYPE frame_type,
                                     int refresh_alt_ref_frame,
                                     int refresh_golden_frame,
                                     int refresh_last_frame) {
-  return;
+  if (frame_type == KEY_FRAME) {
+    int i;
+    copy_frame(denoiser->running_avg_y[LAST_FRAME], src);
+    for (i = 2; i < MAX_REF_FRAMES - 1; i++) {
+      copy_frame(denoiser->running_avg_y[i],
+                 denoiser->running_avg_y[LAST_FRAME]);
+    }
+  } else {  /* For non key frames */
+    if (refresh_alt_ref_frame) {
+      copy_frame(denoiser->running_avg_y[ALTREF_FRAME],
+                 denoiser->running_avg_y[INTRA_FRAME]);
+    }
+    if (refresh_golden_frame) {
+      copy_frame(denoiser->running_avg_y[GOLDEN_FRAME],
+                 denoiser->running_avg_y[INTRA_FRAME]);
+    }
+    if (refresh_last_frame) {
+      copy_frame(denoiser->running_avg_y[LAST_FRAME],
+                 denoiser->running_avg_y[INTRA_FRAME]);
+    }
+  }
 }
 
 void vp9_denoiser_update_frame_stats() {
-  return;
 }
 
 int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
-                       int border) {
+                       int ssx, int ssy, int border) {
+  int i, fail;
+  assert(denoiser != NULL);
+
+  for (i = 0; i < MAX_REF_FRAMES; ++i) {
+    fail = vp9_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height,
+                                  ssx, ssy, border);
+    if (fail) {
+      vp9_denoiser_free(denoiser);
+      return 1;
+    }
+  }
+
+  fail = vp9_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height,
+                                ssx, ssy, border);
+  if (fail) {
+    vp9_denoiser_free(denoiser);
+    return 1;
+  }
+
   return 0;
 }
 
 void vp9_denoiser_free(VP9_DENOISER *denoiser) {
-  return;
+  int i;
+  if (denoiser == NULL) {
+    return;
+  }
+  for (i = 0; i < MAX_REF_FRAMES; ++i) {
+    if (&denoiser->running_avg_y[i] != NULL) {
+      vp9_free_frame_buffer(&denoiser->running_avg_y[i]);
+    }
+  }
+  if (&denoiser->mc_running_avg_y != NULL) {
+    vp9_free_frame_buffer(&denoiser->mc_running_avg_y);
+  }
 }
-

diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index a7a8d93..18b9766 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h

@@ -12,6 +12,7 @@
 #define VP9_ENCODER_DENOISER_H_
 
 #include "vp9/encoder/vp9_block.h"
+#include "vpx_scale/yv12config.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -23,24 +24,24 @@
 };
 
 typedef struct vp9_denoiser {
-  struct buf_2d running_avg_y;
-  struct buf_2d mc_running_avg_y;
+  YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
+  YV12_BUFFER_CONFIG mc_running_avg_y;
 } VP9_DENOISER;
 
 void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
+                                    YV12_BUFFER_CONFIG src,
                                     FRAME_TYPE frame_type,
                                     int refresh_alt_ref_frame,
                                     int refresh_golden_frame,
                                     int refresh_last_frame);
 
-void vp9_denoiser_denoise(VP9_DENOISER *denoiser,
-                          MACROBLOCK *mb, MODE_INFO **grid,
+void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
                           int mi_row, int mi_col, BLOCK_SIZE bs);
 
 void vp9_denoiser_update_frame_stats();
 
 int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
-                       int border);
+                       int ssx, int ssy, int border);
 
 void vp9_denoiser_free(VP9_DENOISER *denoiser);
 

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 001ac69..b9349a4 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -139,42 +139,6 @@
   xd->mi[0] = cm->mi + idx_str;
 }
 
-static int is_block_in_mb_map(const VP9_COMP *cpi, int mi_row, int mi_col,
-                              BLOCK_SIZE bsize) {
-  const VP9_COMMON *const cm = &cpi->common;
-  const int mb_rows = cm->mb_rows;
-  const int mb_cols = cm->mb_cols;
-  const int mb_row = mi_row >> 1;
-  const int mb_col = mi_col >> 1;
-  const int mb_width = num_8x8_blocks_wide_lookup[bsize] >> 1;
-  const int mb_height = num_8x8_blocks_high_lookup[bsize] >> 1;
-  int r, c;
-  if (bsize <= BLOCK_16X16) {
-    return cpi->active_map[mb_row * mb_cols + mb_col];
-  }
-  for (r = 0; r < mb_height; ++r) {
-    for (c = 0; c < mb_width; ++c) {
-      int row = mb_row + r;
-      int col = mb_col + c;
-      if (row >= mb_rows || col >= mb_cols)
-        continue;
-      if (cpi->active_map[row * mb_cols + col])
-        return 1;
-    }
-  }
-  return 0;
-}
-
-static int check_active_map(const VP9_COMP *cpi, const MACROBLOCK *x,
-                            int mi_row, int mi_col,
-                            BLOCK_SIZE bsize) {
-  if (cpi->active_map_enabled && !x->e_mbd.lossless) {
-    return is_block_in_mb_map(cpi, mi_row, mi_col, bsize);
-  } else {
-    return 1;
-  }
-}
-
 static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
                         int mi_row, int mi_col, BLOCK_SIZE bsize) {
   MACROBLOCK *const x = &cpi->mb;
@@ -187,9 +151,6 @@
 
   set_skip_context(xd, mi_row, mi_col);
 
-  // Activity map pointer
-  x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize);
-
   set_modeinfo_offsets(cm, xd, mi_row, mi_col);
 
   mbmi = &xd->mi[0]->mbmi;
@@ -723,7 +684,6 @@
 
   xd->mi[0]->bmi[0].as_mv[0].as_int = 0;
   x->skip = 1;
-  x->skip_encode = 1;
 
   *rate = 0;
   *dist = 0;
@@ -822,12 +782,18 @@
     vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist, bsize, ctx,
                               best_rd);
   } else {
-    if (bsize >= BLOCK_8X8)
-      vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col,
-                                totalrate, totaldist, bsize, ctx, best_rd);
-    else
+    if (bsize >= BLOCK_8X8) {
+      if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+        vp9_rd_pick_inter_mode_sb_seg_skip(cpi, x, tile, mi_row, mi_col,
+                                           totalrate, totaldist, bsize, ctx,
+                                           best_rd);
+      else
+        vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col,
+                                  totalrate, totaldist, bsize, ctx, best_rd);
+    } else {
       vp9_rd_pick_inter_mode_sub8x8(cpi, x, tile, mi_row, mi_col, totalrate,
                                     totaldist, bsize, ctx, best_rd);
+    }
   }
 
   x->rdmult = orig_rdmult;
@@ -1508,20 +1474,8 @@
   if (bsize == BLOCK_16X16) {
     set_offsets(cpi, tile, mi_row, mi_col, bsize);
     x->mb_energy = vp9_block_energy(cpi, x, bsize);
-  } else {
-    x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize);
   }
 
-  if (!x->in_active_map) {
-    do_partition_search = 0;
-    if (mi_row + (mi_step >> 1) < cm->mi_rows &&
-        mi_col + (mi_step >> 1) < cm->mi_cols) {
-      pc_tree->partitioning = PARTITION_NONE;
-      bs_type = mi_8x8[0]->mbmi.sb_type = bsize;
-      subsize = bsize;
-      partition = PARTITION_NONE;
-    }
-  }
   if (do_partition_search &&
       cpi->sf.partition_search_type == SEARCH_PARTITION &&
       cpi->sf.adjust_partitioning_from_last_frame) {
@@ -1984,8 +1938,6 @@
   if (bsize == BLOCK_16X16) {
     set_offsets(cpi, tile, mi_row, mi_col, bsize);
     x->mb_energy = vp9_block_energy(cpi, x, bsize);
-  } else {
-    x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize);
   }
   // Determine partition types in search according to the speed features.
   // The threshold set here has to be of square block size.
@@ -2018,8 +1970,6 @@
     }
   }
 
-  if (!x->in_active_map && (partition_horz_allowed || partition_vert_allowed))
-    do_split = 0;
   // PARTITION_NONE
   if (partition_none_allowed) {
     rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize,
@@ -2053,10 +2003,6 @@
         }
       }
     }
-    if (!x->in_active_map) {
-      do_split = 0;
-      do_rect = 0;
-    }
     restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
   }
 
@@ -2322,8 +2268,15 @@
         rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
                          &dummy_rate, &dummy_dist, 1, cpi->pc_root);
       } else {
+        GF_GROUP * gf_grp = &cpi->twopass.gf_group;
+        int last_was_mid_sequence_overlay = 0;
+        if ((cpi->pass == 2) && (gf_grp->index)) {
+          if (gf_grp->update_type[gf_grp->index - 1] == OVERLAY_UPDATE)
+            last_was_mid_sequence_overlay = 1;
+        }
         if ((cm->current_video_frame
             % sf->last_partitioning_redo_frequency) == 0
+            || last_was_mid_sequence_overlay
             || cm->prev_mi == 0
             || cm->show_frame == 0
             || cm->frame_type == KEY_FRAME
@@ -2586,8 +2539,6 @@
   assert(num_8x8_blocks_wide_lookup[bsize] ==
              num_8x8_blocks_high_lookup[bsize]);
 
-  x->in_active_map = check_active_map(cpi, x, mi_row, mi_col, bsize);
-
   // Determine partition types in search according to the speed features.
   // The threshold set here has to be of square block size.
   if (cpi->sf.auto_min_max_partition_size) {
@@ -2606,15 +2557,13 @@
     partition_vert_allowed &= force_vert_split;
   }
 
-  if (!x->in_active_map && (partition_horz_allowed || partition_vert_allowed))
-    do_split = 0;
-
   // PARTITION_NONE
   if (partition_none_allowed) {
     nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
                         &this_rate, &this_dist, bsize);
     ctx->mic.mbmi = xd->mi[0]->mbmi;
     ctx->skip_txfm = x->skip_txfm;
+    ctx->skip = x->skip;
 
     if (this_rate != INT_MAX) {
       int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
@@ -2643,10 +2592,6 @@
         }
       }
     }
-    if (!x->in_active_map) {
-      do_split = 0;
-      do_rect = 0;
-    }
   }
 
   // store estimated motion vector
@@ -2702,6 +2647,7 @@
 
     pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
     pc_tree->horizontal[0].skip_txfm = x->skip_txfm;
+    pc_tree->horizontal[0].skip = x->skip;
 
     sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
 
@@ -2712,6 +2658,7 @@
 
       pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
       pc_tree->horizontal[1].skip_txfm = x->skip_txfm;
+      pc_tree->horizontal[1].skip = x->skip;
 
       if (this_rate == INT_MAX) {
         sum_rd = INT64_MAX;
@@ -2742,6 +2689,7 @@
                         &this_rate, &this_dist, subsize);
     pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
     pc_tree->vertical[0].skip_txfm = x->skip_txfm;
+    pc_tree->vertical[0].skip = x->skip;
     sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
     if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
       load_pred_mv(x, ctx);
@@ -2749,6 +2697,7 @@
                           &this_rate, &this_dist, subsize);
       pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
       pc_tree->vertical[1].skip_txfm = x->skip_txfm;
+      pc_tree->vertical[1].skip = x->skip;
       if (this_rate == INT_MAX) {
         sum_rd = INT64_MAX;
       } else {
@@ -2838,16 +2787,19 @@
       nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize);
       pc_tree->none.mic.mbmi = xd->mi[0]->mbmi;
       pc_tree->none.skip_txfm = x->skip_txfm;
+      pc_tree->none.skip = x->skip;
       break;
     case PARTITION_VERT:
       nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize);
       pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
       pc_tree->vertical[0].skip_txfm = x->skip_txfm;
+      pc_tree->vertical[0].skip = x->skip;
       if (mi_col + hbs < cm->mi_cols) {
         nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + hbs,
                             &rate, &dist, subsize);
         pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
         pc_tree->vertical[1].skip_txfm = x->skip_txfm;
+        pc_tree->vertical[1].skip = x->skip;
         if (rate != INT_MAX && dist != INT64_MAX &&
             *totrate != INT_MAX && *totdist != INT64_MAX) {
           *totrate += rate;
@@ -2859,11 +2811,13 @@
       nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize);
       pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
       pc_tree->horizontal[0].skip_txfm = x->skip_txfm;
+      pc_tree->horizontal[0].skip = x->skip;
       if (mi_row + hbs < cm->mi_rows) {
         nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col,
                             &rate, &dist, subsize);
         pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
         pc_tree->horizontal[1].skip_txfm = x->skip_txfm;
+        pc_tree->horizontal[1].skip = x->skip;
         if (rate != INT_MAX && dist != INT64_MAX &&
             *totrate != INT_MAX && *totdist != INT64_MAX) {
           *totrate += rate;
@@ -3364,7 +3318,10 @@
       vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
                            &xd->block_refs[ref]->sf);
     }
-    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
+    if (!cpi->sf.reuse_inter_pred_sby)
+      vp9_build_inter_predictors_sby(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
+
+    vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
 
     if (!x->skip) {
       mbmi->skip = 1;

diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 525eccd..6e07f08 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c

@@ -64,6 +64,9 @@
 
 // #define OUTPUT_YUV_REC
 
+#ifdef OUTPUT_YUV_DENOISED
+FILE *yuv_denoised_file;
+#endif
 #ifdef OUTPUT_YUV_SRC
 FILE *yuv_file;
 #endif
@@ -103,7 +106,7 @@
   }
 }
 
-static void set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv) {
+void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv) {
   MACROBLOCK *const mb = &cpi->mb;
   cpi->common.allow_high_precision_mv = allow_high_precision_mv;
   if (cpi->common.allow_high_precision_mv) {
@@ -142,8 +145,6 @@
 
   if (!init_done) {
     vp9_init_neighbors();
-    vp9_init_quant_tables();
-
     vp9_coef_tree_initialize();
     vp9_tokenize_initialize();
     vp9_init_me_luts();
@@ -173,10 +174,8 @@
   vp9_cyclic_refresh_free(cpi->cyclic_refresh);
   cpi->cyclic_refresh = NULL;
 
-  vpx_free(cpi->active_map);
-  cpi->active_map = NULL;
-
   vp9_free_frame_buffers(cm);
+  vp9_free_context_buffers(cm);
 
   vp9_free_frame_buffer(&cpi->last_frame_uf);
   vp9_free_frame_buffer(&cpi->scaled_source);
@@ -414,39 +413,46 @@
                        "Failed to allocate altref buffer");
 }
 
-void vp9_alloc_compressor_data(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
+static void alloc_ref_frame_buffers(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
   if (vp9_alloc_frame_buffers(cm, cm->width, cm->height))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffers");
+}
 
-  if (vp9_alloc_frame_buffer(&cpi->last_frame_uf,
-                             cm->width, cm->height,
-                             cm->subsampling_x, cm->subsampling_y,
-                             VP9_ENC_BORDER_IN_PIXELS))
+static void alloc_util_frame_buffers(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  if (vp9_realloc_frame_buffer(&cpi->last_frame_uf,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate last frame buffer");
 
-  if (vp9_alloc_frame_buffer(&cpi->scaled_source,
-                             cm->width, cm->height,
-                             cm->subsampling_x, cm->subsampling_y,
-                             VP9_ENC_BORDER_IN_PIXELS))
+  if (vp9_realloc_frame_buffer(&cpi->scaled_source,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate scaled source buffer");
 
-  if (vp9_alloc_frame_buffer(&cpi->scaled_last_source,
-                             cm->width, cm->height,
-                             cm->subsampling_x, cm->subsampling_y,
-                             VP9_ENC_BORDER_IN_PIXELS))
+  if (vp9_realloc_frame_buffer(&cpi->scaled_last_source,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate scaled last source buffer");
+}
+
+void vp9_alloc_compressor_data(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  vp9_alloc_context_buffers(cm, cm->width, cm->height);
 
   vpx_free(cpi->tok);
 
   {
     unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
-
     CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
   }
 
@@ -456,41 +462,7 @@
 static void update_frame_size(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-
   vp9_update_frame_size(cm);
-
-  // Update size of buffers local to this frame
-  if (vp9_realloc_frame_buffer(&cpi->last_frame_uf,
-                               cm->width, cm->height,
-                               cm->subsampling_x, cm->subsampling_y,
-                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
-    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                       "Failed to reallocate last frame buffer");
-
-  if (vp9_realloc_frame_buffer(&cpi->scaled_source,
-                               cm->width, cm->height,
-                               cm->subsampling_x, cm->subsampling_y,
-                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
-    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                       "Failed to reallocate scaled source buffer");
-
-  if (vp9_realloc_frame_buffer(&cpi->scaled_last_source,
-                               cm->width, cm->height,
-                               cm->subsampling_x, cm->subsampling_y,
-                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
-    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                       "Failed to reallocate scaled last source buffer");
-
-  {
-    int y_stride = cpi->scaled_source.y_stride;
-
-    if (cpi->sf.mv.search_method == NSTEP) {
-      vp9_init3smotion_compensation(&cpi->ss_cfg, y_stride);
-    } else if (cpi->sf.mv.search_method == DIAMOND) {
-      vp9_init_dsmotion_compensation(&cpi->ss_cfg, y_stride);
-    }
-  }
-
   init_macroblockd(cm, xd);
 }
 
@@ -518,6 +490,12 @@
   cm->log2_tile_rows = cpi->oxcf.tile_rows;
 }
 
+static void init_buffer_indices(VP9_COMP *cpi) {
+  cpi->lst_fb_idx = 0;
+  cpi->gld_fb_idx = 1;
+  cpi->alt_fb_idx = 2;
+}
+
 static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
   VP9_COMMON *const cm = &cpi->common;
 
@@ -528,8 +506,6 @@
 
   cm->width = oxcf->width;
   cm->height = oxcf->height;
-  cm->subsampling_x = 0;
-  cm->subsampling_y = 0;
   vp9_alloc_compressor_data(cpi);
 
   // Spatial scalability.
@@ -548,10 +524,9 @@
   vp9_change_config(cpi, oxcf);
 
   cpi->static_mb_pct = 0;
+  cpi->ref_frame_flags = 0;
 
-  cpi->lst_fb_idx = 0;
-  cpi->gld_fb_idx = 1;
-  cpi->alt_fb_idx = 2;
+  init_buffer_indices(cpi);
 
   set_tile_limits(cpi);
 }
@@ -590,7 +565,6 @@
   cpi->pass = get_pass(cpi->oxcf.mode);
 
   rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
-  cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
 
   cpi->refresh_golden_frame = 0;
   cpi->refresh_last_frame = 1;
@@ -598,7 +572,7 @@
   cm->reset_frame_context = 0;
 
   vp9_reset_segment_features(&cm->seg);
-  set_high_precision_mv(cpi, 0);
+  vp9_set_high_precision_mv(cpi, 0);
 
   {
     int i;
@@ -663,11 +637,7 @@
                                            (int)cpi->oxcf.target_bandwidth);
   }
 
-#if CONFIG_MULTIPLE_ARF
-  vp9_zero(cpi->alt_ref_source);
-#else
   cpi->alt_ref_source = NULL;
-#endif
   rc->is_src_frame_alt_ref = 0;
 
 #if 0
@@ -683,6 +653,12 @@
 
 #if CONFIG_DENOISING
   vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
+                     // TODO(tkopp) An unrelated bug causes
+                     // cm->subsampling_{x,y} to be uninitialized at this point
+                     // in execution. For now we assume YUV-420, which is x/y
+                     // subsampling of 1.
+                     1, 1,
+                     // cm->subsampling_x, cm->subsampling_y,
                      VP9_ENC_BORDER_IN_PIXELS);
 #endif
 }
@@ -781,10 +757,6 @@
   CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
                   vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
 
-  CHECK_MEM_ERROR(cm, cpi->active_map, vpx_calloc(cm->MBs, 1));
-  vpx_memset(cpi->active_map, 1, cm->MBs);
-  cpi->active_map_enabled = 0;
-
   for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
                    sizeof(cpi->mbgraph_stats[0])); i++) {
     CHECK_MEM_ERROR(cm, cpi->mbgraph_stats[i].mb_stats,
@@ -794,18 +766,23 @@
 
   cpi->refresh_alt_ref_frame = 0;
 
-#if CONFIG_MULTIPLE_ARF
-  // Turn multiple ARF usage on/off. This is a quick hack for the initial test
-  // version. It should eventually be set via the codec API.
-  cpi->multi_arf_enabled = 1;
-
-  if (cpi->multi_arf_enabled) {
-    cpi->sequence_number = 0;
-    cpi->frame_coding_order_period = 0;
-    vp9_zero(cpi->frame_coding_order);
-    vp9_zero(cpi->arf_buffer_idx);
+  // Note that at the moment multi_arf will not work with svc.
+  // For the current check in all the execution paths are defaulted to 0
+  // pending further tuning and testing. The code is left in place here
+  // as a place holder in regard to the required paths.
+  if (cpi->pass == 2) {
+    if (cpi->use_svc) {
+      cpi->multi_arf_allowed = 0;
+      cpi->multi_arf_enabled = 0;
+    } else {
+      // Disable by default for now.
+      cpi->multi_arf_allowed = 0;
+      cpi->multi_arf_enabled = 0;
+    }
+  } else {
+    cpi->multi_arf_allowed = 0;
+    cpi->multi_arf_enabled = 0;
   }
-#endif
 
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 #if CONFIG_INTERNAL_STATS
@@ -860,6 +837,9 @@
   cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX];
   cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
 
+#ifdef OUTPUT_YUV_DENOISED
+  yuv_denoised_file = fopen("denoised.yuv", "ab");
+#endif
 #ifdef OUTPUT_YUV_SRC
   yuv_file = fopen("bd.yuv", "ab");
 #endif
@@ -1105,6 +1085,9 @@
   vp9_remove_common(&cpi->common);
   vpx_free(cpi);
 
+#ifdef OUTPUT_YUV_DENOISED
+  fclose(yuv_denoised_file);
+#endif
 #ifdef OUTPUT_YUV_SRC
   fclose(yuv_file);
 #endif
@@ -1286,13 +1269,13 @@
 }
 
 
-#ifdef OUTPUT_YUV_SRC
-void vp9_write_yuv_frame(YV12_BUFFER_CONFIG *s) {
+#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED)
+void vp9_write_yuv_frame(YV12_BUFFER_CONFIG *s, FILE *f) {
   uint8_t *src = s->y_buffer;
   int h = s->y_height;
 
   do {
-    fwrite(src, s->y_width, 1,  yuv_file);
+    fwrite(src, s->y_width, 1, f);
     src += s->y_stride;
   } while (--h);
 
@@ -1300,7 +1283,7 @@
   h = s->uv_height;
 
   do {
-    fwrite(src, s->uv_width, 1,  yuv_file);
+    fwrite(src, s->uv_width, 1, f);
     src += s->uv_stride;
   } while (--h);
 
@@ -1308,7 +1291,7 @@
   h = s->uv_height;
 
   do {
-    fwrite(src, s->uv_width, 1, yuv_file);
+    fwrite(src, s->uv_width, 1, f);
     src += s->uv_stride;
   } while (--h);
 }
@@ -1509,14 +1492,8 @@
                &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
     ref_cnt_fb(cm->frame_bufs,
                &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
-  }
-#if CONFIG_MULTIPLE_ARF
-  else if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame &&
-      !cpi->refresh_alt_ref_frame) {
-#else
-  else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame &&
-           !cpi->use_svc) {
-#endif
+  } else if (!cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
+             cpi->rc.is_src_frame_alt_ref && !cpi->use_svc) {
     /* Preserve the previously existing golden frame and update the frame in
      * the alt ref slot instead. This is highly specific to the current use of
      * alt-ref as a forward reference, and this needs to be generalized as
@@ -1534,14 +1511,14 @@
     tmp = cpi->alt_fb_idx;
     cpi->alt_fb_idx = cpi->gld_fb_idx;
     cpi->gld_fb_idx = tmp;
-  }  else { /* For non key/golden frames */
+  } else { /* For non key/golden frames */
     if (cpi->refresh_alt_ref_frame) {
       int arf_idx = cpi->alt_fb_idx;
-#if CONFIG_MULTIPLE_ARF
-      if (cpi->multi_arf_enabled) {
-        arf_idx = cpi->arf_buffer_idx[cpi->sequence_number + 1];
+      if ((cpi->pass == 2) && cpi->multi_arf_allowed) {
+        const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+        arf_idx = gf_group->arf_update_idx[gf_group->index];
       }
-#endif
+
       ref_cnt_fb(cm->frame_bufs,
                  &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
     }
@@ -1558,6 +1535,7 @@
   }
 #if CONFIG_DENOISING
   vp9_denoiser_update_frame_info(&cpi->denoiser,
+                                *cpi->Source,
                                 cpi->common.frame_type,
                                 cpi->refresh_alt_ref_frame,
                                 cpi->refresh_golden_frame,
@@ -1593,13 +1571,15 @@
 void vp9_scale_references(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   MV_REFERENCE_FRAME ref_frame;
+  const VP9_REFFRAME ref_mask[3] = {VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG};
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
     const YV12_BUFFER_CONFIG *const ref = &cm->frame_bufs[idx].buf;
 
-    if (ref->y_crop_width != cm->width ||
-        ref->y_crop_height != cm->height) {
+    // Need to convert from VP9_REFFRAME to index into ref_mask (subtract 1).
+    if ((cpi->ref_frame_flags & ref_mask[ref_frame - 1]) &&
+        (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)) {
       const int new_fb = get_free_fb(cm);
       vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf,
                                cm->width, cm->height,
@@ -2121,8 +2101,12 @@
   }
 #endif
 
+#ifdef OUTPUT_YUV_DENOISED
+  vp9_write_yuv_frame(&cpi->denoiser.running_avg_y[INTRA_FRAME],
+                      yuv_denoised_file);
+#endif
 #ifdef OUTPUT_YUV_SRC
-  vp9_write_yuv_frame(cpi->Source);
+  vp9_write_yuv_frame(cpi->Source, yuv_file);
 #endif
 
   set_speed_features(cpi);
@@ -2133,7 +2117,7 @@
   if (!frame_is_intra_only(cm)) {
     cm->interp_filter = DEFAULT_INTERP_FILTER;
     /* TODO: Decide this more intelligently */
-    set_high_precision_mv(cpi, q < HIGH_PRECISION_MV_QTHRESH);
+    vp9_set_high_precision_mv(cpi, q < HIGH_PRECISION_MV_QTHRESH);
   }
 
   if (cpi->sf.recode_loop == DISALLOW_RECODE) {
@@ -2221,31 +2205,8 @@
   if (cm->frame_type == KEY_FRAME) {
     // Tell the caller that the frame was coded as a key frame
     *frame_flags = cpi->frame_flags | FRAMEFLAGS_KEY;
-
-#if CONFIG_MULTIPLE_ARF
-    // Reset the sequence number.
-    if (cpi->multi_arf_enabled) {
-      cpi->sequence_number = 0;
-      cpi->frame_coding_order_period = cpi->new_frame_coding_order_period;
-      cpi->new_frame_coding_order_period = -1;
-    }
-#endif
   } else {
     *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY;
-
-#if CONFIG_MULTIPLE_ARF
-    /* Increment position in the coded frame sequence. */
-    if (cpi->multi_arf_enabled) {
-      ++cpi->sequence_number;
-      if (cpi->sequence_number >= cpi->frame_coding_order_period) {
-        cpi->sequence_number = 0;
-        cpi->frame_coding_order_period = cpi->new_frame_coding_order_period;
-        cpi->new_frame_coding_order_period = -1;
-      }
-      cpi->this_frame_weight = cpi->arf_weight[cpi->sequence_number];
-      assert(cpi->this_frame_weight >= 0);
-    }
-#endif
   }
 
   // Clear the one shot update flags for segmentation map and mode/ref loop
@@ -2299,6 +2260,16 @@
   vp9_twopass_postencode_update(cpi);
 }
 
+static void init_motion_estimation(VP9_COMP *cpi) {
+  int y_stride = cpi->scaled_source.y_stride;
+
+  if (cpi->sf.mv.search_method == NSTEP) {
+    vp9_init3smotion_compensation(&cpi->ss_cfg, y_stride);
+  } else if (cpi->sf.mv.search_method == DIAMOND) {
+    vp9_init_dsmotion_compensation(&cpi->ss_cfg, y_stride);
+  }
+}
+
 static void check_initial_width(VP9_COMP *cpi, int subsampling_x,
                                 int subsampling_y) {
   VP9_COMMON *const cm = &cpi->common;
@@ -2306,7 +2277,13 @@
   if (!cpi->initial_width) {
     cm->subsampling_x = subsampling_x;
     cm->subsampling_y = subsampling_y;
+
     alloc_raw_frame_buffers(cpi);
+    alloc_ref_frame_buffers(cpi);
+    alloc_util_frame_buffers(cpi);
+
+    init_motion_estimation(cpi);
+
     cpi->initial_width = cm->width;
     cpi->initial_height = cm->height;
   }
@@ -2321,11 +2298,22 @@
   int res = 0;
   const int subsampling_x = sd->uv_width  < sd->y_width;
   const int subsampling_y = sd->uv_height < sd->y_height;
+  const int is_spatial_svc = cpi->use_svc &&
+                             (cpi->svc.number_temporal_layers == 1);
 
   check_initial_width(cpi, subsampling_x, subsampling_y);
+
   vpx_usec_timer_start(&timer);
-  if (vp9_lookahead_push(cpi->lookahead,
-                         sd, time_stamp, end_time, frame_flags))
+
+#ifdef CONFIG_SPATIAL_SVC
+  if (is_spatial_svc)
+    res = vp9_svc_lookahead_push(cpi, cpi->lookahead, sd, time_stamp, end_time,
+                                 frame_flags);
+  else
+#endif
+    res = vp9_lookahead_push(cpi->lookahead,
+                             sd, time_stamp, end_time, frame_flags);
+  if (res)
     res = -1;
   vpx_usec_timer_mark(&timer);
   cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
@@ -2353,13 +2341,6 @@
          cm->seg.update_data;
 }
 
-#if CONFIG_MULTIPLE_ARF
-int is_next_frame_arf(VP9_COMP *cpi) {
-  // Negative entry in frame_coding_order indicates an ARF at this position.
-  return cpi->frame_coding_order[cpi->sequence_number + 1] < 0 ? 1 : 0;
-}
-#endif
-
 void adjust_frame_rate(VP9_COMP *cpi) {
   int64_t this_duration;
   int step = 0;
@@ -2398,6 +2379,46 @@
   cpi->last_end_time_stamp_seen = cpi->source->ts_end;
 }
 
+// Returns 0 if this is not an alt ref else the offset of the source frame
+// used as the arf midpoint.
+static int get_arf_src_index(VP9_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int arf_src_index = 0;
+  if (is_altref_enabled(&cpi->oxcf)) {
+    if (cpi->pass == 2) {
+      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+      if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+        arf_src_index = gf_group->arf_src_offset[gf_group->index];
+      }
+    } else if (rc->source_alt_ref_pending) {
+      arf_src_index = rc->frames_till_gf_update_due;
+    }
+  }
+  return arf_src_index;
+}
+
+static void check_src_altref(VP9_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  if (cpi->pass == 2) {
+    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    rc->is_src_frame_alt_ref =
+      (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE);
+  } else {
+    rc->is_src_frame_alt_ref = cpi->alt_ref_source &&
+                               (cpi->source == cpi->alt_ref_source);
+  }
+
+  if (rc->is_src_frame_alt_ref) {
+    // Current frame is an ARF overlay frame.
+    cpi->alt_ref_source = NULL;
+
+    // Don't refresh the last buffer for an ARF overlay frame. It will
+    // become the GF so preserve last as an alternative prediction option.
+    cpi->refresh_last_frame = 0;
+  }
+}
+
 int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest,
                             int64_t *time_stamp, int64_t *time_end, int flush) {
@@ -2407,11 +2428,15 @@
   struct vpx_usec_timer  cmptimer;
   YV12_BUFFER_CONFIG *force_src_buffer = NULL;
   MV_REFERENCE_FRAME ref_frame;
+  int arf_src_index;
+  const int is_spatial_svc = cpi->use_svc &&
+                             (cpi->svc.number_temporal_layers == 1);
 
   if (!cpi)
     return -1;
 
-  if (cpi->svc.number_spatial_layers > 1 && cpi->pass == 2) {
+  if (is_spatial_svc && cpi->pass == 2) {
+    vp9_svc_lookahead_peek(cpi, cpi->lookahead, 0, 1);
     vp9_restore_layer_context(cpi);
   }
 
@@ -2420,7 +2445,7 @@
   cpi->source = NULL;
   cpi->last_source = NULL;
 
-  set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
+  vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
 
   // Normal defaults
   cm->reset_frame_context = 0;
@@ -2429,35 +2454,26 @@
   cpi->refresh_golden_frame = 0;
   cpi->refresh_alt_ref_frame = 0;
 
-  // Should we code an alternate reference frame.
-  if (is_altref_enabled(&cpi->oxcf) && rc->source_alt_ref_pending) {
-    int frames_to_arf;
+  // Should we encode an arf frame.
+  arf_src_index = get_arf_src_index(cpi);
+  if (arf_src_index) {
+    assert(arf_src_index <= rc->frames_to_key);
 
-#if CONFIG_MULTIPLE_ARF
-    assert(!cpi->multi_arf_enabled ||
-           cpi->frame_coding_order[cpi->sequence_number] < 0);
-
-    if (cpi->multi_arf_enabled && (cpi->pass == 2))
-      frames_to_arf = (-cpi->frame_coding_order[cpi->sequence_number])
-          - cpi->next_frame_in_order;
+#ifdef CONFIG_SPATIAL_SVC
+    if (is_spatial_svc)
+      cpi->source = vp9_svc_lookahead_peek(cpi, cpi->lookahead,
+                                           arf_src_index, 1);
     else
 #endif
-      frames_to_arf = rc->frames_till_gf_update_due;
-
-    assert(frames_to_arf <= rc->frames_to_key);
-
-    if ((cpi->source = vp9_lookahead_peek(cpi->lookahead, frames_to_arf))) {
-#if CONFIG_MULTIPLE_ARF
-      cpi->alt_ref_source[cpi->arf_buffered] = cpi->source;
-#else
+      cpi->source = vp9_lookahead_peek(cpi->lookahead, arf_src_index);
+    if (cpi->source != NULL) {
       cpi->alt_ref_source = cpi->source;
-#endif
 
       if (cpi->oxcf.arnr_max_frames > 0) {
         // Produce the filtered ARF frame.
         // TODO(agrange) merge these two functions.
-        vp9_configure_arnr_filter(cpi, frames_to_arf, rc->gfu_boost);
-        vp9_temporal_filter_prepare(cpi, frames_to_arf);
+        vp9_configure_arnr_filter(cpi, arf_src_index, rc->gfu_boost);
+        vp9_temporal_filter_prepare(cpi, arf_src_index);
         vp9_extend_frame_borders(&cpi->alt_ref_buffer);
         force_src_buffer = &cpi->alt_ref_buffer;
       }
@@ -2467,59 +2483,38 @@
       cpi->refresh_golden_frame = 0;
       cpi->refresh_last_frame = 0;
       rc->is_src_frame_alt_ref = 0;
-
-#if CONFIG_MULTIPLE_ARF
-      if (!cpi->multi_arf_enabled)
-#endif
-        rc->source_alt_ref_pending = 0;
+      rc->source_alt_ref_pending = 0;
     } else {
       rc->source_alt_ref_pending = 0;
     }
   }
 
   if (!cpi->source) {
-#if CONFIG_MULTIPLE_ARF
-    int i;
-#endif
-
     // Get last frame source.
     if (cm->current_video_frame > 0) {
-      if ((cpi->last_source = vp9_lookahead_peek(cpi->lookahead, -1)) == NULL)
+#ifdef CONFIG_SPATIAL_SVC
+      if (is_spatial_svc)
+        cpi->last_source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, -1, 0);
+      else
+#endif
+        cpi->last_source = vp9_lookahead_peek(cpi->lookahead, -1);
+      if (cpi->last_source == NULL)
         return -1;
     }
 
-    if ((cpi->source = vp9_lookahead_pop(cpi->lookahead, flush))) {
+    // Read in the source frame.
+#ifdef CONFIG_SPATIAL_SVC
+    if (is_spatial_svc)
+      cpi->source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush);
+    else
+#endif
+      cpi->source = vp9_lookahead_pop(cpi->lookahead, flush);
+    if (cpi->source != NULL) {
       cm->show_frame = 1;
       cm->intra_only = 0;
 
-#if CONFIG_MULTIPLE_ARF
-      // Is this frame the ARF overlay.
-      rc->is_src_frame_alt_ref = 0;
-      for (i = 0; i < cpi->arf_buffered; ++i) {
-        if (cpi->source == cpi->alt_ref_source[i]) {
-          rc->is_src_frame_alt_ref = 1;
-          cpi->refresh_golden_frame = 1;
-          break;
-        }
-      }
-#else
-      rc->is_src_frame_alt_ref = cpi->alt_ref_source &&
-                                 (cpi->source == cpi->alt_ref_source);
-#endif
-      if (rc->is_src_frame_alt_ref) {
-        // Current frame is an ARF overlay frame.
-#if CONFIG_MULTIPLE_ARF
-        cpi->alt_ref_source[i] = NULL;
-#else
-        cpi->alt_ref_source = NULL;
-#endif
-        // Don't refresh the last buffer for an ARF overlay frame. It will
-        // become the GF so preserve last as an alternative prediction option.
-        cpi->refresh_last_frame = 0;
-      }
-#if CONFIG_MULTIPLE_ARF
-      ++cpi->next_frame_in_order;
-#endif
+      // Check to see if the frame should be encoded as an arf overlay.
+      check_src_altref(cpi);
     }
   }
 
@@ -2527,20 +2522,17 @@
     cpi->un_scaled_source = cpi->Source = force_src_buffer ? force_src_buffer
                                                            : &cpi->source->img;
 
-  if (cpi->last_source != NULL) {
-    cpi->unscaled_last_source = &cpi->last_source->img;
-  } else {
-    cpi->unscaled_last_source = NULL;
-  }
+    if (cpi->last_source != NULL) {
+      cpi->unscaled_last_source = &cpi->last_source->img;
+    } else {
+      cpi->unscaled_last_source = NULL;
+    }
 
     *time_stamp = cpi->source->ts_start;
     *time_end = cpi->source->ts_end;
-    *frame_flags = cpi->source->flags;
+    *frame_flags =
+        (cpi->source->flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
 
-#if CONFIG_MULTIPLE_ARF
-    if (cm->frame_type != KEY_FRAME && cpi->pass == 2)
-      rc->source_alt_ref_pending = is_next_frame_arf(cpi);
-#endif
   } else {
     *size = 0;
     if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done) {
@@ -2578,16 +2570,14 @@
   cm->frame_bufs[cm->new_fb_idx].ref_count--;
   cm->new_fb_idx = get_free_fb(cm);
 
-#if CONFIG_MULTIPLE_ARF
-  /* Set up the correct ARF frame. */
-  if (cpi->refresh_alt_ref_frame) {
-    ++cpi->arf_buffered;
+  if (!cpi->use_svc && cpi->multi_arf_allowed) {
+    if (cm->frame_type == KEY_FRAME) {
+      init_buffer_indices(cpi);
+    } else if (cpi->pass == 2) {
+      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+      cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index];
+    }
   }
-  if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) &&
-      (cpi->pass == 2)) {
-    cpi->alt_fb_idx = cpi->arf_buffer_idx[cpi->sequence_number];
-  }
-#endif
 
   cpi->frame_flags = *frame_flags;
 
@@ -2606,6 +2596,9 @@
                            cm->subsampling_x, cm->subsampling_y,
                            VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
 
+  alloc_util_frame_buffers(cpi);
+  init_motion_estimation(cpi);
+
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
     YV12_BUFFER_CONFIG *const buf = &cm->frame_bufs[idx].buf;
@@ -2775,16 +2768,23 @@
 
 int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols) {
   if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
+    const int mi_rows = cpi->common.mi_rows;
+    const int mi_cols = cpi->common.mi_cols;
     if (map) {
-      vpx_memcpy(cpi->active_map, map, rows * cols);
-      cpi->active_map_enabled = 1;
+      int r, c;
+      for (r = 0; r < mi_rows; r++) {
+        for (c = 0; c < mi_cols; c++) {
+          cpi->segmentation_map[r * mi_cols + c] =
+              !map[(r >> 1) * cols + (c >> 1)];
+        }
+      }
+      vp9_enable_segfeature(&cpi->common.seg, 1, SEG_LVL_SKIP);
+      vp9_enable_segmentation(&cpi->common.seg);
     } else {
-      cpi->active_map_enabled = 0;
+      vp9_disable_segmentation(&cpi->common.seg);
     }
-
     return 0;
   } else {
-    // cpi->active_map_enabled = 0;
     return -1;
   }
 }
@@ -2863,3 +2863,42 @@
 int vp9_get_quantizer(VP9_COMP *cpi) {
   return cpi->common.base_qindex;
 }
+
+void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags) {
+  if (flags & (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF |
+               VP8_EFLAG_NO_REF_ARF)) {
+    int ref = 7;
+
+    if (flags & VP8_EFLAG_NO_REF_LAST)
+      ref ^= VP9_LAST_FLAG;
+
+    if (flags & VP8_EFLAG_NO_REF_GF)
+      ref ^= VP9_GOLD_FLAG;
+
+    if (flags & VP8_EFLAG_NO_REF_ARF)
+      ref ^= VP9_ALT_FLAG;
+
+    vp9_use_as_reference(cpi, ref);
+  }
+
+  if (flags & (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF |
+               VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_FORCE_GF |
+               VP8_EFLAG_FORCE_ARF)) {
+    int upd = 7;
+
+    if (flags & VP8_EFLAG_NO_UPD_LAST)
+      upd ^= VP9_LAST_FLAG;
+
+    if (flags & VP8_EFLAG_NO_UPD_GF)
+      upd ^= VP9_GOLD_FLAG;
+
+    if (flags & VP8_EFLAG_NO_UPD_ARF)
+      upd ^= VP9_ALT_FLAG;
+
+    vp9_update_reference(cpi, upd);
+  }
+
+  if (flags & VP8_EFLAG_NO_UPD_ENTROPY) {
+    vp9_update_entropy(cpi, 0);
+  }
+}

diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index a27868a..47649a8 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h

@@ -32,6 +32,7 @@
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_speed_features.h"
 #include "vp9/encoder/vp9_svc_layercontext.h"
 #include "vp9/encoder/vp9_tokenize.h"
@@ -46,9 +47,6 @@
 
 #define DEFAULT_GF_INTERVAL         10
 
-#define MAX_MODES 30
-#define MAX_REFS  6
-
 typedef struct {
   int nmvjointcost[MV_JOINTS];
   int nmvcosts[2][MV_VALS];
@@ -66,56 +64,6 @@
   FRAME_CONTEXT fc;
 } CODING_CONTEXT;
 
-// This enumerator type needs to be kept aligned with the mode order in
-// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code.
-typedef enum {
-  THR_NEARESTMV,
-  THR_NEARESTA,
-  THR_NEARESTG,
-
-  THR_DC,
-
-  THR_NEWMV,
-  THR_NEWA,
-  THR_NEWG,
-
-  THR_NEARMV,
-  THR_NEARA,
-  THR_COMP_NEARESTLA,
-  THR_COMP_NEARESTGA,
-
-  THR_TM,
-
-  THR_COMP_NEARLA,
-  THR_COMP_NEWLA,
-  THR_NEARG,
-  THR_COMP_NEARGA,
-  THR_COMP_NEWGA,
-
-  THR_ZEROMV,
-  THR_ZEROG,
-  THR_ZEROA,
-  THR_COMP_ZEROLA,
-  THR_COMP_ZEROGA,
-
-  THR_H_PRED,
-  THR_V_PRED,
-  THR_D135_PRED,
-  THR_D207_PRED,
-  THR_D153_PRED,
-  THR_D63_PRED,
-  THR_D117_PRED,
-  THR_D45_PRED,
-} THR_MODES;
-
-typedef enum {
-  THR_LAST,
-  THR_GOLD,
-  THR_ALTR,
-  THR_COMP_LA,
-  THR_COMP_GA,
-  THR_INTRA,
-} THR_MODES_SUB8X8;
 
 typedef enum {
   // encode_breakout is disabled.
@@ -293,32 +241,6 @@
   return mode == ONE_PASS_BEST || mode == TWO_PASS_SECOND_BEST;
 }
 
-typedef struct RD_OPT {
-  // Thresh_mult is used to set a threshold for the rd score. A higher value
-  // means that we will accept the best mode so far more often. This number
-  // is used in combination with the current block size, and thresh_freq_fact
-  // to pick a threshold.
-  int thresh_mult[MAX_MODES];
-  int thresh_mult_sub8x8[MAX_REFS];
-
-  int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
-  int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
-
-  int64_t comp_pred_diff[REFERENCE_MODES];
-  int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
-  int64_t tx_select_diff[TX_MODES];
-  // FIXME(rbultje) can this overflow?
-  int tx_select_threshes[MAX_REF_FRAMES][TX_MODES];
-
-  int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
-  int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
-  int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
-  int64_t mask_filter;
-
-  int RDMULT;
-  int RDDIV;
-} RD_OPT;
-
 typedef struct VP9_COMP {
   QUANTS quants;
   MACROBLOCK mb;
@@ -326,11 +248,7 @@
   VP9EncoderConfig oxcf;
   struct lookahead_ctx    *lookahead;
   struct lookahead_entry  *source;
-#if CONFIG_MULTIPLE_ARF
-  struct lookahead_entry  *alt_ref_source[REF_FRAMES];
-#else
   struct lookahead_entry  *alt_ref_source;
-#endif
   struct lookahead_entry  *last_source;
 
   YV12_BUFFER_CONFIG *Source;
@@ -349,9 +267,6 @@
   int gld_fb_idx;
   int alt_fb_idx;
 
-#if CONFIG_MULTIPLE_ARF
-  int alt_ref_fb_idx[REF_FRAMES - 3];
-#endif
   int refresh_last_frame;
   int refresh_golden_frame;
   int refresh_alt_ref_frame;
@@ -369,13 +284,6 @@
   TOKENEXTRA *tok;
   unsigned int tok_count[4][1 << 6];
 
-#if CONFIG_MULTIPLE_ARF
-  // Position within a frame coding order (including any additional ARF frames).
-  unsigned int sequence_number;
-  // Next frame in naturally occurring order that has not yet been coded.
-  int next_frame_in_order;
-#endif
-
   // Ambient reconstruction err target for force key frames
   int ambient_err;
 
@@ -425,9 +333,6 @@
 
   unsigned char *complexity_map;
 
-  unsigned char *active_map;
-  unsigned int active_map_enabled;
-
   CYCLIC_REFRESH *cyclic_refresh;
 
   fractional_mv_step_fp *find_fractional_mv_step;
@@ -511,18 +416,8 @@
   PC_TREE *pc_root;
   int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
 
-#if CONFIG_MULTIPLE_ARF
-  // ARF tracking variables.
+  int multi_arf_allowed;
   int multi_arf_enabled;
-  unsigned int frame_coding_order_period;
-  unsigned int new_frame_coding_order_period;
-  int frame_coding_order[MAX_LAG_BUFFERS * 2];
-  int arf_buffer_idx[MAX_LAG_BUFFERS * 3 / 2];
-  int arf_weight[MAX_LAG_BUFFERS];
-  int arf_buffered;
-  int this_frame_weight;
-  int max_arf_level;
-#endif
 
 #if CONFIG_DENOISING
   VP9_DENOISER denoiser;
@@ -622,10 +517,14 @@
 
 int64_t vp9_rescale(int64_t val, int64_t num, int denom);
 
+void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
+
 YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
                                           YV12_BUFFER_CONFIG *unscaled,
                                           YV12_BUFFER_CONFIG *scaled);
 
+void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
+
 static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
                                 MV_REFERENCE_FRAME ref0,
                                 MV_REFERENCE_FRAME ref1) {

diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 7cadb36..d0dd182 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c

@@ -33,7 +33,6 @@
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_quantize.h"
-#include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_variance.h"
 
@@ -56,14 +55,7 @@
 #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
 
 #define MIN_KF_BOOST        300
-
-#if CONFIG_MULTIPLE_ARF
-// Set MIN_GF_INTERVAL to 1 for the full decomposition.
-#define MIN_GF_INTERVAL             2
-#else
-#define MIN_GF_INTERVAL             4
-#endif
-
+#define MIN_GF_INTERVAL     4
 #define LONG_TERM_VBR_CORRECTION
 
 static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
@@ -497,6 +489,8 @@
                                         &cpi->scaled_source);
   }
 
+  vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+
   vp9_setup_src_planes(x, cpi->Source, 0, 0);
   vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
   vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0);
@@ -504,8 +498,6 @@
   xd->mi = cm->mi_grid_visible;
   xd->mi[0] = cm->mi;
 
-  vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
-
   vp9_frame_init_quantizer(cpi);
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -594,8 +586,9 @@
 
       // Other than for the first frame do a motion search.
       if (cm->current_video_frame > 0) {
-        int tmp_err, motion_error;
+        int tmp_err, motion_error, raw_motion_error;
         int_mv mv, tmp_mv;
+        struct buf_2d unscaled_last_source_buf_2d;
 
         xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
         motion_error = get_prediction_error(bsize, &x->plane[0].src,
@@ -603,67 +596,83 @@
         // Assume 0,0 motion with no mv overhead.
         mv.as_int = tmp_mv.as_int = 0;
 
-        // Test last reference frame using the previous best mv as the
-        // starting point (best reference) for the search.
-        first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv,
-                                 &motion_error);
-        if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-          vp9_clear_system_state();
-          motion_error = (int)(motion_error * error_weight);
-        }
+        // Compute the motion error of the 0,0 motion using the last source
+        // frame as the reference. Skip the further motion search on
+        // reconstructed frame if this error is small.
+        unscaled_last_source_buf_2d.buf =
+            cpi->unscaled_last_source->y_buffer + recon_yoffset;
+        unscaled_last_source_buf_2d.stride =
+            cpi->unscaled_last_source->y_stride;
+        raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+                                                &unscaled_last_source_buf_2d);
 
-        // If the current best reference mv is not centered on 0,0 then do a 0,0
-        // based search as well.
-        if (best_ref_mv.as_int) {
-          tmp_err = INT_MAX;
-          first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
-                                   &tmp_err);
+        // TODO(pengchong): Replace the hard-coded threshold
+        if (raw_motion_error > 25 ||
+            (cpi->use_svc && cpi->svc.number_temporal_layers == 1)) {
+          // Test last reference frame using the previous best mv as the
+          // starting point (best reference) for the search.
+          first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv,
+                                   &motion_error);
           if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
             vp9_clear_system_state();
-            tmp_err = (int)(tmp_err * error_weight);
+            motion_error = (int)(motion_error * error_weight);
           }
 
-          if (tmp_err < motion_error) {
-            motion_error = tmp_err;
-            mv.as_int = tmp_mv.as_int;
-          }
-        }
+          // If the current best reference mv is not centered on 0,0 then do a
+          // 0,0 based search as well.
+          if (best_ref_mv.as_int) {
+            tmp_err = INT_MAX;
+            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, &tmp_err);
+            if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+              vp9_clear_system_state();
+              tmp_err = (int)(tmp_err * error_weight);
+            }
 
-        // Search in an older reference frame.
-        if (cm->current_video_frame > 1 && gld_yv12 != NULL) {
-          // Assume 0,0 motion with no mv overhead.
-          int gf_motion_error;
-
-          xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
-          gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                 &xd->plane[0].pre[0]);
-
-          first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
-                                   &gf_motion_error);
-          if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-            vp9_clear_system_state();
-            gf_motion_error = (int)(gf_motion_error * error_weight);
+            if (tmp_err < motion_error) {
+              motion_error = tmp_err;
+              mv.as_int = tmp_mv.as_int;
+            }
           }
 
-          if (gf_motion_error < motion_error && gf_motion_error < this_error)
-            ++second_ref_count;
+          // Search in an older reference frame.
+          if (cm->current_video_frame > 1 && gld_yv12 != NULL) {
+            // Assume 0,0 motion with no mv overhead.
+            int gf_motion_error;
 
-          // Reset to last frame as reference buffer.
-          xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
-          xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
-          xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
+            xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
+            gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+                                                   &xd->plane[0].pre[0]);
 
-          // In accumulating a score for the older reference frame take the
-          // best of the motion predicted score and the intra coded error
-          // (just as will be done for) accumulation of "coded_error" for
-          // the last frame.
-          if (gf_motion_error < this_error)
-            sr_coded_error += gf_motion_error;
-          else
-            sr_coded_error += this_error;
+            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
+                                     &gf_motion_error);
+            if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+              vp9_clear_system_state();
+              gf_motion_error = (int)(gf_motion_error * error_weight);
+            }
+
+            if (gf_motion_error < motion_error && gf_motion_error < this_error)
+              ++second_ref_count;
+
+            // Reset to last frame as reference buffer.
+            xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+            xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
+            xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
+
+            // In accumulating a score for the older reference frame take the
+            // best of the motion predicted score and the intra coded error
+            // (just as will be done for) accumulation of "coded_error" for
+            // the last frame.
+            if (gf_motion_error < this_error)
+              sr_coded_error += gf_motion_error;
+            else
+              sr_coded_error += this_error;
+          } else {
+            sr_coded_error += motion_error;
+          }
         } else {
           sr_coded_error += motion_error;
         }
+
         // Start by assuming that intra mode is best.
         best_ref_mv.as_int = 0;
 
@@ -1205,144 +1214,6 @@
   return arf_boost;
 }
 
-#if CONFIG_MULTIPLE_ARF
-// Work out the frame coding order for a GF or an ARF group.
-// The current implementation codes frames in their natural order for a
-// GF group, and inserts additional ARFs into an ARF group using a
-// binary split approach.
-// NOTE: this function is currently implemented recursively.
-static void schedule_frames(VP9_COMP *cpi, const int start, const int end,
-                            const int arf_idx, const int gf_or_arf_group,
-                            const int level) {
-  int i, abs_end, half_range;
-  int *cfo = cpi->frame_coding_order;
-  int idx = cpi->new_frame_coding_order_period;
-
-  // If (end < 0) an ARF should be coded at position (-end).
-  assert(start >= 0);
-
-  // printf("start:%d end:%d\n", start, end);
-
-  // GF Group: code frames in logical order.
-  if (gf_or_arf_group == 0) {
-    assert(end >= start);
-    for (i = start; i <= end; ++i) {
-      cfo[idx] = i;
-      cpi->arf_buffer_idx[idx] = arf_idx;
-      cpi->arf_weight[idx] = -1;
-      ++idx;
-    }
-    cpi->new_frame_coding_order_period = idx;
-    return;
-  }
-
-  // ARF Group: Work out the ARF schedule and mark ARF frames as negative.
-  if (end < 0) {
-    // printf("start:%d end:%d\n", -end, -end);
-    // ARF frame is at the end of the range.
-    cfo[idx] = end;
-    // What ARF buffer does this ARF use as predictor.
-    cpi->arf_buffer_idx[idx] = (arf_idx > 2) ? (arf_idx - 1) : 2;
-    cpi->arf_weight[idx] = level;
-    ++idx;
-    abs_end = -end;
-  } else {
-    abs_end = end;
-  }
-
-  half_range = (abs_end - start) >> 1;
-
-  // ARFs may not be adjacent, they must be separated by at least
-  // MIN_GF_INTERVAL non-ARF frames.
-  if ((start + MIN_GF_INTERVAL) >= (abs_end - MIN_GF_INTERVAL)) {
-    // printf("start:%d end:%d\n", start, abs_end);
-    // Update the coding order and active ARF.
-    for (i = start; i <= abs_end; ++i) {
-      cfo[idx] = i;
-      cpi->arf_buffer_idx[idx] = arf_idx;
-      cpi->arf_weight[idx] = -1;
-      ++idx;
-    }
-    cpi->new_frame_coding_order_period = idx;
-  } else {
-    // Place a new ARF at the mid-point of the range.
-    cpi->new_frame_coding_order_period = idx;
-    schedule_frames(cpi, start, -(start + half_range), arf_idx + 1,
-                    gf_or_arf_group, level + 1);
-    schedule_frames(cpi, start + half_range + 1, abs_end, arf_idx,
-                    gf_or_arf_group, level + 1);
-  }
-}
-
-#define FIXED_ARF_GROUP_SIZE 16
-
-void define_fixed_arf_period(VP9_COMP *cpi) {
-  int i;
-  int max_level = INT_MIN;
-
-  assert(cpi->multi_arf_enabled);
-  assert(cpi->oxcf.lag_in_frames >= FIXED_ARF_GROUP_SIZE);
-
-  // Save the weight of the last frame in the sequence before next
-  // sequence pattern overwrites it.
-  cpi->this_frame_weight = cpi->arf_weight[cpi->sequence_number];
-  assert(cpi->this_frame_weight >= 0);
-
-  cpi->twopass.gf_zeromotion_pct = 0;
-
-  // Initialize frame coding order variables.
-  cpi->new_frame_coding_order_period = 0;
-  cpi->next_frame_in_order = 0;
-  cpi->arf_buffered = 0;
-  vp9_zero(cpi->frame_coding_order);
-  vp9_zero(cpi->arf_buffer_idx);
-  vpx_memset(cpi->arf_weight, -1, sizeof(cpi->arf_weight));
-
-  if (cpi->rc.frames_to_key <= (FIXED_ARF_GROUP_SIZE + 8)) {
-    // Setup a GF group close to the keyframe.
-    cpi->rc.source_alt_ref_pending = 0;
-    cpi->rc.baseline_gf_interval = cpi->rc.frames_to_key;
-    schedule_frames(cpi, 0, (cpi->rc.baseline_gf_interval - 1), 2, 0, 0);
-  } else {
-    // Setup a fixed period ARF group.
-    cpi->rc.source_alt_ref_pending = 1;
-    cpi->rc.baseline_gf_interval = FIXED_ARF_GROUP_SIZE;
-    schedule_frames(cpi, 0, -(cpi->rc.baseline_gf_interval - 1), 2, 1, 0);
-  }
-
-  // Replace level indicator of -1 with correct level.
-  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
-    if (cpi->arf_weight[i] > max_level) {
-      max_level = cpi->arf_weight[i];
-    }
-  }
-  ++max_level;
-  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
-    if (cpi->arf_weight[i] == -1) {
-      cpi->arf_weight[i] = max_level;
-    }
-  }
-  cpi->max_arf_level = max_level;
-#if 0
-  printf("\nSchedule: ");
-  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
-    printf("%4d ", cpi->frame_coding_order[i]);
-  }
-  printf("\n");
-  printf("ARFref:   ");
-  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
-    printf("%4d ", cpi->arf_buffer_idx[i]);
-  }
-  printf("\n");
-  printf("Weight:   ");
-  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
-    printf("%4d ", cpi->arf_weight[i]);
-  }
-  printf("\n");
-#endif
-}
-#endif
-
 // Calculate a section intra ratio used in setting max loop filter.
 static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
                                          const FIRSTPASS_STATS *end,
@@ -1412,6 +1283,18 @@
   return MAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), 0);
 }
 
+// Current limit on maximum number of active arfs in a GF/ARF group.
+#define MAX_ACTIVE_ARFS 2
+#define ARF_SLOT1 2
+#define ARF_SLOT2 3
+// This function indirects the choice of buffers for arfs.
+// At the moment the values are fixed but this may change as part of
+// the integration process with other codec features that swap buffers around.
+static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) {
+  arf_buffer_indices[0] = ARF_SLOT1;
+  arf_buffer_indices[1] = ARF_SLOT2;
+}
+
 static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
                                    double group_error, int gf_arf_bits) {
   RATE_CONTROL *const rc = &cpi->rc;
@@ -1419,42 +1302,85 @@
   TWO_PASS *twopass = &cpi->twopass;
   FIRSTPASS_STATS frame_stats;
   int i;
-  int group_frame_index = 1;
+  int frame_index = 1;
   int target_frame_size;
   int key_frame;
   const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
   int64_t total_group_bits = gf_group_bits;
   double modified_err = 0.0;
   double err_fraction;
+  int mid_boost_bits = 0;
+  int mid_frame_idx;
+  unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
 
   key_frame = cpi->common.frame_type == KEY_FRAME ||
               vp9_is_upper_layer_key_frame(cpi);
 
+  get_arf_buffer_indices(arf_buffer_indices);
+
   // For key frames the frame target rate is already set and it
   // is also the golden frame.
-  // NOTE: We dont bother to check for the special case of ARF overlay
-  // frames here, as there is clamping code for this in the function
-  // vp9_rc_clamp_pframe_target_size(), which applies to one and two pass
-  // encodes.
   if (!key_frame) {
-    twopass->gf_group_bit_allocation[0] = gf_arf_bits;
+    if (rc->source_alt_ref_active) {
+      twopass->gf_group.update_type[0] = OVERLAY_UPDATE;
+      twopass->gf_group.rf_level[0] = INTER_NORMAL;
+      twopass->gf_group.bit_allocation[0] = 0;
+      twopass->gf_group.arf_update_idx[0] = arf_buffer_indices[0];
+      twopass->gf_group.arf_ref_idx[0] = arf_buffer_indices[0];
+    } else {
+      twopass->gf_group.update_type[0] = GF_UPDATE;
+      twopass->gf_group.rf_level[0] = GF_ARF_STD;
+      twopass->gf_group.bit_allocation[0] = gf_arf_bits;
+      twopass->gf_group.arf_update_idx[0] = arf_buffer_indices[0];
+      twopass->gf_group.arf_ref_idx[0] = arf_buffer_indices[0];
+    }
 
     // Step over the golden frame / overlay frame
     if (EOF == input_stats(twopass, &frame_stats))
       return;
   }
 
-  // Store the bits to spend on the ARF if there is one.
-  if (rc->source_alt_ref_pending) {
-    twopass->gf_group_bit_allocation[group_frame_index++] = gf_arf_bits;
-  }
-
-  // Deduct the boost bits for arf or gf if it is not a key frame.
+  // Deduct the boost bits for arf (or gf if it is not a key frame)
+  // from the group total.
   if (rc->source_alt_ref_pending || !key_frame)
     total_group_bits -= gf_arf_bits;
 
+  // Store the bits to spend on the ARF if there is one.
+  if (rc->source_alt_ref_pending) {
+    if (cpi->multi_arf_enabled) {
+      // A portion of the gf / arf extra bits are set asside for lower level
+      // boosted frames in the middle of the group.
+      mid_boost_bits += gf_arf_bits >> 5;
+      gf_arf_bits -= (gf_arf_bits >> 5);
+    }
+
+    twopass->gf_group.update_type[frame_index] = ARF_UPDATE;
+    twopass->gf_group.rf_level[frame_index] = GF_ARF_STD;
+    twopass->gf_group.bit_allocation[frame_index] = gf_arf_bits;
+    twopass->gf_group.arf_src_offset[frame_index] =
+      (unsigned char)(rc->baseline_gf_interval - 1);
+    twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[0];
+    twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0];
+    ++frame_index;
+
+    if (cpi->multi_arf_enabled) {
+      // Set aside a slot for a level 1 arf.
+      twopass->gf_group.update_type[frame_index] = ARF_UPDATE;
+      twopass->gf_group.rf_level[frame_index] = GF_ARF_LOW;
+      twopass->gf_group.arf_src_offset[frame_index] =
+        (unsigned char)((rc->baseline_gf_interval >> 1) - 1);
+      twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[1];
+      twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0];
+      ++frame_index;
+    }
+  }
+
+  // Define middle frame
+  mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
+
   // Allocate bits to the other frames in the group.
   for (i = 0; i < rc->baseline_gf_interval - 1; ++i) {
+    int arf_idx = 0;
     if (EOF == input_stats(twopass, &frame_stats))
       break;
 
@@ -1466,10 +1392,48 @@
       err_fraction = 0.0;
 
     target_frame_size = (int)((double)total_group_bits * err_fraction);
+
+    if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) {
+      mid_boost_bits += (target_frame_size >> 4);
+      target_frame_size -= (target_frame_size >> 4);
+
+      if (frame_index <= mid_frame_idx)
+        arf_idx = 1;
+    }
+    twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[arf_idx];
+    twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
+
     target_frame_size = clamp(target_frame_size, 0,
                               MIN(max_bits, (int)total_group_bits));
 
-    twopass->gf_group_bit_allocation[group_frame_index++] = target_frame_size;
+    twopass->gf_group.update_type[frame_index] = LF_UPDATE;
+    twopass->gf_group.rf_level[frame_index] = INTER_NORMAL;
+
+    twopass->gf_group.bit_allocation[frame_index] = target_frame_size;
+    ++frame_index;
+  }
+
+  // Note:
+  // We need to configure the frame at the end of the sequence + 1 that will be
+  // the start frame for the next group. Otherwise prior to the call to
+  // vp9_rc_get_second_pass_params() the data will be undefined.
+  twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[0];
+  twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0];
+
+  if (rc->source_alt_ref_pending) {
+    twopass->gf_group.update_type[frame_index] = OVERLAY_UPDATE;
+    twopass->gf_group.rf_level[frame_index] = INTER_NORMAL;
+
+    // Final setup for second arf and its overlay.
+    if (cpi->multi_arf_enabled) {
+      twopass->gf_group.bit_allocation[2] =
+        twopass->gf_group.bit_allocation[mid_frame_idx] + mid_boost_bits;
+      twopass->gf_group.update_type[mid_frame_idx] = OVERLAY_UPDATE;
+      twopass->gf_group.bit_allocation[mid_frame_idx] = 0;
+    }
+  } else {
+    twopass->gf_group.update_type[frame_index] = GF_UPDATE;
+    twopass->gf_group.rf_level[frame_index] = GF_ARF_STD;
   }
 }
 
@@ -1512,8 +1476,7 @@
   // Reset the GF group data structures unless this is a key
   // frame in which case it will already have been done.
   if (cpi->common.frame_type != KEY_FRAME) {
-    twopass->gf_group_index = 0;
-    vp9_zero(twopass->gf_group_bit_allocation);
+    vp9_zero(twopass->gf_group);
   }
 
   vp9_clear_system_state();
@@ -1635,24 +1598,14 @@
     }
   }
 
-#if CONFIG_MULTIPLE_ARF
-  if (cpi->multi_arf_enabled) {
-    // Initialize frame coding order variables.
-    cpi->new_frame_coding_order_period = 0;
-    cpi->next_frame_in_order = 0;
-    cpi->arf_buffered = 0;
-    vp9_zero(cpi->frame_coding_order);
-    vp9_zero(cpi->arf_buffer_idx);
-    vpx_memset(cpi->arf_weight, -1, sizeof(cpi->arf_weight));
-  }
-#endif
-
   // Set the interval until the next gf.
   if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
     rc->baseline_gf_interval = i - 1;
   else
     rc->baseline_gf_interval = i;
 
+  rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
   // Should we use the alternate reference frame.
   if (allow_alt_ref &&
       (i < cpi->oxcf.lag_in_frames) &&
@@ -1665,62 +1618,11 @@
                                    &b_boost);
     rc->source_alt_ref_pending = 1;
 
-#if CONFIG_MULTIPLE_ARF
-    // Set the ARF schedule.
-    if (cpi->multi_arf_enabled) {
-      schedule_frames(cpi, 0, -(rc->baseline_gf_interval - 1), 2, 1, 0);
-    }
-#endif
   } else {
     rc->gfu_boost = (int)boost_score;
     rc->source_alt_ref_pending = 0;
-#if CONFIG_MULTIPLE_ARF
-    // Set the GF schedule.
-    if (cpi->multi_arf_enabled) {
-      schedule_frames(cpi, 0, rc->baseline_gf_interval - 1, 2, 0, 0);
-      assert(cpi->new_frame_coding_order_period ==
-             rc->baseline_gf_interval);
-    }
-#endif
   }
 
-#if CONFIG_MULTIPLE_ARF
-  if (cpi->multi_arf_enabled && (cpi->common.frame_type != KEY_FRAME)) {
-    int max_level = INT_MIN;
-    // Replace level indicator of -1 with correct level.
-    for (i = 0; i < cpi->frame_coding_order_period; ++i) {
-      if (cpi->arf_weight[i] > max_level) {
-        max_level = cpi->arf_weight[i];
-      }
-    }
-    ++max_level;
-    for (i = 0; i < cpi->frame_coding_order_period; ++i) {
-      if (cpi->arf_weight[i] == -1) {
-        cpi->arf_weight[i] = max_level;
-      }
-    }
-    cpi->max_arf_level = max_level;
-  }
-#if 0
-  if (cpi->multi_arf_enabled) {
-    printf("\nSchedule: ");
-    for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
-      printf("%4d ", cpi->frame_coding_order[i]);
-    }
-    printf("\n");
-    printf("ARFref:   ");
-    for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
-      printf("%4d ", cpi->arf_buffer_idx[i]);
-    }
-    printf("\n");
-    printf("Weight:   ");
-    for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
-      printf("%4d ", cpi->arf_weight[i]);
-    }
-    printf("\n");
-  }
-#endif
-#endif
   // Reset the file position.
   reset_fpf_position(twopass, start_pos);
 
@@ -1870,8 +1772,7 @@
   cpi->common.frame_type = KEY_FRAME;
 
   // Reset the GF group data structures.
-  twopass->gf_group_index = 0;
-  vp9_zero(twopass->gf_group_bit_allocation);
+  vp9_zero(twopass->gf_group);
 
   // Is this a forced key frame by interval.
   rc->this_key_frame_forced = rc->next_key_frame_forced;
@@ -2062,7 +1963,9 @@
   twopass->kf_group_bits -= kf_bits;
 
   // Save the bits to spend on the key frame.
-  twopass->gf_group_bit_allocation[0] = kf_bits;
+  twopass->gf_group.bit_allocation[0] = kf_bits;
+  twopass->gf_group.update_type[0] = KF_UPDATE;
+  twopass->gf_group.rf_level[0] = KF_STD;
 
   // Note the total error score of the kf group minus the key frame itself.
   twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
@@ -2090,6 +1993,44 @@
   }
 }
 
+// Define the reference buffers that will be updated post encode.
+void configure_buffer_updates(VP9_COMP *cpi) {
+  TWO_PASS *const twopass = &cpi->twopass;
+
+  cpi->rc.is_src_frame_alt_ref = 0;
+  switch (twopass->gf_group.update_type[twopass->gf_group.index]) {
+    case KF_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 1;
+      cpi->refresh_alt_ref_frame = 1;
+      break;
+    case LF_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+      break;
+    case GF_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 1;
+      cpi->refresh_alt_ref_frame = 0;
+      break;
+    case OVERLAY_UPDATE:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 1;
+      cpi->refresh_alt_ref_frame = 0;
+      cpi->rc.is_src_frame_alt_ref = 1;
+      break;
+    case ARF_UPDATE:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_alt_ref_frame = 1;
+      break;
+    default:
+      assert(0);
+  }
+}
+
+
 void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
@@ -2114,14 +2055,12 @@
   if (!twopass->stats_in)
     return;
 
-  // Increment the gf group index.
-  ++twopass->gf_group_index;
-
   // If this is an arf frame then we dont want to read the stats file or
   // advance the input pointer as we already have what we need.
-  if (cpi->refresh_alt_ref_frame) {
+  if (twopass->gf_group.update_type[twopass->gf_group.index] == ARF_UPDATE) {
     int target_rate;
-    target_rate = twopass->gf_group_bit_allocation[twopass->gf_group_index];
+    configure_buffer_updates(cpi);
+    target_rate = twopass->gf_group.bit_allocation[twopass->gf_group.index];
     target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
     rc->base_frame_target = target_rate;
 #ifdef LONG_TERM_VBR_CORRECTION
@@ -2185,15 +2124,7 @@
 
   // Define a new GF/ARF group. (Should always enter here for key frames).
   if (rc->frames_till_gf_update_due == 0) {
-#if CONFIG_MULTIPLE_ARF
-    if (cpi->multi_arf_enabled) {
-      define_fixed_arf_period(cpi);
-    } else {
-#endif
-      define_gf_group(cpi, &this_frame_copy);
-#if CONFIG_MULTIPLE_ARF
-    }
-#endif
+    define_gf_group(cpi, &this_frame_copy);
 
     if (twopass->gf_zeromotion_pct > 995) {
       // As long as max_thresh for encode breakout is small enough, it is ok
@@ -2217,7 +2148,9 @@
     }
   }
 
-  target_rate = twopass->gf_group_bit_allocation[twopass->gf_group_index];
+  configure_buffer_updates(cpi);
+
+  target_rate = twopass->gf_group.bit_allocation[twopass->gf_group.index];
   if (cpi->common.frame_type == KEY_FRAME)
     target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
   else
@@ -2280,4 +2213,7 @@
     twopass->kf_group_bits -= bits_used;
   }
   twopass->kf_group_bits = MAX(twopass->kf_group_bits, 0);
+
+  // Increment the gf group index ready for the next frame.
+  ++twopass->gf_group.index;
 }

diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 8206521..1ee56a3 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h

@@ -12,6 +12,7 @@
 #define VP9_ENCODER_VP9_FIRSTPASS_H_
 
 #include "vp9/encoder/vp9_lookahead.h"
+#include "vp9/encoder/vp9_ratectrl.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -39,6 +40,25 @@
   int64_t spatial_layer_id;
 } FIRSTPASS_STATS;
 
+typedef enum {
+  KF_UPDATE = 0,
+  LF_UPDATE = 1,
+  GF_UPDATE = 2,
+  ARF_UPDATE = 3,
+  OVERLAY_UPDATE = 4,
+  FRAME_UPDATE_TYPES = 5
+} FRAME_UPDATE_TYPE;
+
+typedef struct {
+  unsigned char index;
+  RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1];
+  FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1];
+  unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
+  unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1];
+  unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
+  int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1];
+} GF_GROUP;
+
 typedef struct {
   unsigned int section_intra_rating;
   unsigned int next_iiratio;
@@ -68,8 +88,7 @@
 
   int active_worst_quality;
 
-  int gf_group_index;
-  int gf_group_bit_allocation[MAX_LAG_BUFFERS * 2];
+  GF_GROUP gf_group;
 } TWO_PASS;
 
 struct VP9_COMP;

diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index abe71e6..e743517 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c

@@ -18,18 +18,6 @@
 #include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_lookahead.h"
 
-// The max of past frames we want to keep in the queue.
-#define MAX_PRE_FRAMES 1
-
-struct lookahead_ctx {
-  unsigned int max_sz;         /* Absolute size of the queue */
-  unsigned int sz;             /* Number of buffers currently in the queue */
-  unsigned int read_idx;       /* Read index */
-  unsigned int write_idx;      /* Write index */
-  struct lookahead_entry *buf; /* Buffer list */
-};
-
-
 /* Return the buffer at the given absolute index and increment the index */
 static struct lookahead_entry *pop(struct lookahead_ctx *ctx,
                                    unsigned int *idx) {

diff --git a/vp9/encoder/vp9_lookahead.h b/vp9/encoder/vp9_lookahead.h
index ff63c0d..f9cc3c8 100644
--- a/vp9/encoder/vp9_lookahead.h
+++ b/vp9/encoder/vp9_lookahead.h

@@ -14,6 +14,11 @@
 #include "vpx_scale/yv12config.h"
 #include "vpx/vpx_integer.h"
 
+#ifdef CONFIG_SPATIAL_SVC
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -25,10 +30,22 @@
   int64_t             ts_start;
   int64_t             ts_end;
   unsigned int        flags;
+
+#ifdef CONFIG_SPATIAL_SVC
+  vpx_svc_parameters_t svc_params[VPX_SS_MAX_LAYERS];
+#endif
 };
 
+// The max of past frames we want to keep in the queue.
+#define MAX_PRE_FRAMES 1
 
-struct lookahead_ctx;
+struct lookahead_ctx {
+  unsigned int max_sz;         /* Absolute size of the queue */
+  unsigned int sz;             /* Number of buffers currently in the queue */
+  unsigned int read_idx;       /* Read index */
+  unsigned int write_idx;      /* Write index */
+  struct lookahead_entry *buf; /* Buffer list */
+};
 
 /**\brief Initializes the lookahead stage
  *

diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 3f4fcd1..4c340ea 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c

@@ -23,9 +23,89 @@
 #include "vp9/common/vp9_reconintra.h"
 
 #include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_pickmode.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rdopt.h"
 
+static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                       const TileInfo *const tile,
+                       MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                       int_mv *mv_ref_list,
+                       int mi_row, int mi_col) {
+  const int *ref_sign_bias = cm->ref_frame_sign_bias;
+  int i, refmv_count = 0;
+
+  const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
+
+  int different_ref_found = 0;
+  int context_counter = 0;
+  int const_motion = 0;
+
+  // Blank the reference vector list
+  vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+
+  // The nearest 2 blocks are treated differently
+  // if the size < 8x8 we get the mv from the bmi substructure,
+  // and we also need to keep a mode count.
+  for (i = 0; i < 2; ++i) {
+    const POSITION *const mv_ref = &mv_ref_search[i];
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+      const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row *
+                                                   xd->mi_stride];
+      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+      // Keep counts for entropy encoding.
+      context_counter += mode_2_counter[candidate->mode];
+      different_ref_found = 1;
+
+      if (candidate->ref_frame[0] == ref_frame)
+        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, -1));
+    }
+  }
+
+  const_motion = 1;
+
+  // Check the rest of the neighbors in much the same way
+  // as before except we don't need to keep track of sub blocks or
+  // mode counts.
+  for (; i < MVREF_NEIGHBOURS && !refmv_count; ++i) {
+    const POSITION *const mv_ref = &mv_ref_search[i];
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+      const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row *
+                                                    xd->mi_stride]->mbmi;
+      different_ref_found = 1;
+
+      if (candidate->ref_frame[0] == ref_frame)
+        ADD_MV_REF_LIST(candidate->mv[0]);
+    }
+  }
+
+  // Since we couldn't find 2 mvs from the same reference frame
+  // go back through the neighbors and find motion vectors from
+  // different reference frames.
+  if (different_ref_found && !refmv_count) {
+    for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
+      const POSITION *mv_ref = &mv_ref_search[i];
+      if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+        const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row
+                                              * xd->mi_stride]->mbmi;
+
+        // If the candidate is INTRA we don't want to consider its mv.
+        IF_DIFF_REF_FRAME_ADD_MV(candidate);
+      }
+    }
+  }
+
+ Done:
+
+  mi->mbmi.mode_context[ref_frame] = counter_to_context[context_counter];
+
+  // Clamp vectors
+  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i)
+    clamp_mv_ref(&mv_ref_list[i].as_mv, xd);
+
+  return const_motion;
+}
+
 static void full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                     BLOCK_SIZE bsize, int mi_row, int mi_col,
                                     int_mv *tmp_mv, int *rate_mv) {
@@ -172,15 +252,31 @@
   else
     x->skip_txfm = 0;
 
-  // TODO(jingning) This is a temporary solution to account for frames with
-  // light changes. Need to customize the rate-distortion modeling for non-RD
-  // mode decision.
-  if ((sse >> 3) > var)
-    sse = var;
-  vp9_model_rd_from_var_lapndz(var + sse, 1 << num_pels_log2_lookup[bsize],
-                               ac_quant >> 3, &rate, &dist);
-  *out_rate_sum = rate;
+  vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+                               dc_quant >> 3, &rate, &dist);
+  *out_rate_sum = rate >> 1;
   *out_dist_sum = dist << 3;
+
+  vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
+                               ac_quant >> 3, &rate, &dist);
+  *out_rate_sum += rate;
+  *out_dist_sum += dist << 4;
+}
+
+static int get_pred_buffer(PRED_BUFFER *p, int len) {
+  int i;
+
+  for (i = 0; i < len; i++) {
+    if (!p[i].in_use) {
+      p[i].in_use = 1;
+      return i;
+    }
+  }
+  return -1;
+}
+
+static void free_pred_buffer(PRED_BUFFER *p) {
+  p->in_use = 0;
 }
 
 // TODO(jingning) placeholder for inter-frame non-RD mode decision.
@@ -228,6 +324,32 @@
   int bsl = mi_width_log2_lookup[bsize];
   const int pred_filter_search = (((mi_row + mi_col) >> bsl) +
                                       get_chessboard_index(cm)) % 2;
+  int const_motion[MAX_REF_FRAMES] = { 0 };
+
+  // For speed 6, the result of interp filter is reused later in actual encoding
+  // process.
+  int bh = num_4x4_blocks_high_lookup[bsize] << 2;
+  int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
+  int pixels_in_block = bh * bw;
+  // tmp[3] points to dst buffer, and the other 3 point to allocated buffers.
+  PRED_BUFFER tmp[4];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, pred_buf, 3 * 64 * 64);
+  struct buf_2d orig_dst = pd->dst;
+  PRED_BUFFER *best_pred = NULL;
+  PRED_BUFFER *this_mode_pred = NULL;
+  int i;
+
+  if (cpi->sf.reuse_inter_pred_sby) {
+    for (i = 0; i < 3; i++) {
+      tmp[i].data = &pred_buf[pixels_in_block * i];
+      tmp[i].stride = bw;
+      tmp[i].in_use = 0;
+    }
+
+    tmp[3].data = pd->dst.buf;
+    tmp[3].stride = pd->dst.stride;
+    tmp[3].in_use = 0;
+  }
 
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
 
@@ -241,18 +363,36 @@
   mbmi->ref_frame[0] = NONE;
   mbmi->ref_frame[1] = NONE;
   mbmi->tx_size = MIN(max_txsize_lookup[bsize],
-                      tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
-  mbmi->interp_filter = cpi->common.interp_filter == SWITCHABLE ?
-                        EIGHTTAP : cpi->common.interp_filter;
+                      tx_mode_to_biggest_tx_size[cm->tx_mode]);
+  mbmi->interp_filter = cm->interp_filter == SWITCHABLE ?
+                        EIGHTTAP : cm->interp_filter;
   mbmi->skip = 0;
   mbmi->segment_id = segment_id;
 
   for (ref_frame = LAST_FRAME; ref_frame <= LAST_FRAME ; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      vp9_setup_buffer_inter(cpi, x, tile,
-                             ref_frame, bsize, mi_row, mi_col,
-                             frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+      const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+      int_mv *const candidates = mbmi->ref_mvs[ref_frame];
+      const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
+      vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
+                           sf, sf);
+
+      if (cm->coding_use_prev_mi)
+        vp9_find_mv_refs(cm, xd, tile, xd->mi[0], ref_frame,
+                         candidates, mi_row, mi_col);
+      else
+        const_motion[ref_frame] = mv_refs_rt(cm, xd, tile, xd->mi[0],
+                                             ref_frame, candidates,
+                                             mi_row, mi_col);
+
+      vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
+                            &frame_mv[NEARESTMV][ref_frame],
+                            &frame_mv[NEARMV][ref_frame]);
+
+      if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8)
+        vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
+                    ref_frame, bsize);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
@@ -286,6 +426,10 @@
     for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
       int rate_mv = 0;
 
+      if (const_motion[ref_frame] &&
+          (this_mode == NEARMV || this_mode == ZEROMV))
+        continue;
+
       if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
         continue;
 
@@ -324,6 +468,16 @@
       // Search for the best prediction filter type, when the resulting
       // motion vector is at sub-pixel accuracy level for luma component, i.e.,
       // the last three bits are all zeros.
+      if (cpi->sf.reuse_inter_pred_sby) {
+        if (this_mode == NEARESTMV) {
+          this_mode_pred = &tmp[3];
+        } else {
+          this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+          pd->dst.buf = this_mode_pred->data;
+          pd->dst.stride = bw;
+        }
+      }
+
       if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
           pred_filter_search &&
           ((mbmi->mv[0].as_mv.row & 0x07) != 0 ||
@@ -334,6 +488,7 @@
         unsigned int pf_sse[3];
         int64_t best_cost = INT64_MAX;
         INTERP_FILTER best_filter = SWITCHABLE, filter;
+        PRED_BUFFER *current_pred = this_mode_pred;
 
         for (filter = EIGHTTAP; filter <= EIGHTTAP_SHARP; ++filter) {
           int64_t cost;
@@ -345,12 +500,28 @@
                         vp9_get_switchable_rate(cpi) + pf_rate[filter],
                         pf_dist[filter]);
           if (cost < best_cost) {
-              best_filter = filter;
-              best_cost = cost;
-              skip_txfm = x->skip_txfm;
+            best_filter = filter;
+            best_cost = cost;
+            skip_txfm = x->skip_txfm;
+
+            if (cpi->sf.reuse_inter_pred_sby) {
+              if (this_mode_pred != current_pred) {
+                free_pred_buffer(this_mode_pred);
+                this_mode_pred = current_pred;
+              }
+
+              if (filter < EIGHTTAP_SHARP) {
+                current_pred = &tmp[get_pred_buffer(tmp, 3)];
+                pd->dst.buf = current_pred->data;
+                pd->dst.stride = bw;
+              }
+            }
           }
         }
 
+        if (cpi->sf.reuse_inter_pred_sby && this_mode_pred != current_pred)
+          free_pred_buffer(current_pred);
+
         mbmi->interp_filter = best_filter;
         rate = pf_rate[mbmi->interp_filter];
         dist = pf_dist[mbmi->interp_filter];
@@ -370,31 +541,35 @@
 
       // Skipping checking: test to see if this block can be reconstructed by
       // prediction only.
-      if (!x->in_active_map) {
-        x->skip = 1;
-      } else if (cpi->allow_encode_breakout && x->encode_breakout) {
+      if (cpi->allow_encode_breakout) {
         const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
         unsigned int var = var_y, sse = sse_y;
         // Skipping threshold for ac.
         unsigned int thresh_ac;
         // Skipping threshold for dc.
         unsigned int thresh_dc;
-        // Set a maximum for threshold to avoid big PSNR loss in low bit rate
-        // case. Use extreme low threshold for static frames to limit skipping.
-        const unsigned int max_thresh = 36000;
-        // The encode_breakout input
-        const unsigned int min_thresh =
-            MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
+        if (x->encode_breakout > 0) {
+          // Set a maximum for threshold to avoid big PSNR loss in low bit rate
+          // case. Use extreme low threshold for static frames to limit
+          // skipping.
+          const unsigned int max_thresh = 36000;
+          // The encode_breakout input
+          const unsigned int min_thresh =
+              MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
 
-        // Calculate threshold according to dequant value.
-        thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
-        thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
+          // Calculate threshold according to dequant value.
+          thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
+          thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
 
-        // Adjust ac threshold according to partition size.
-        thresh_ac >>= 8 - (b_width_log2_lookup[bsize] +
-            b_height_log2_lookup[bsize]);
+          // Adjust ac threshold according to partition size.
+          thresh_ac >>=
+              8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
 
-        thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
+          thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
+        } else {
+          thresh_ac = 0;
+          thresh_dc = 0;
+        }
 
         // Y skipping condition checking for ac and dc.
         if (var <= thresh_ac && (sse - var) <= thresh_dc) {
@@ -451,6 +626,16 @@
         best_pred_filter = mbmi->interp_filter;
         best_ref_frame = ref_frame;
         skip_txfm = x->skip_txfm;
+
+        if (cpi->sf.reuse_inter_pred_sby) {
+          if (best_pred != NULL)
+            free_pred_buffer(best_pred);
+
+          best_pred = this_mode_pred;
+        }
+      } else {
+        if (cpi->sf.reuse_inter_pred_sby)
+          free_pred_buffer(this_mode_pred);
       }
 
       if (x->skip)
@@ -458,6 +643,19 @@
     }
   }
 
+  // If best prediction is not in dst buf, then copy the prediction block from
+  // temp buf to dst buf.
+  if (cpi->sf.reuse_inter_pred_sby && best_pred->data != orig_dst.buf) {
+    uint8_t *copy_from, *copy_to;
+
+    pd->dst = orig_dst;
+    copy_to = pd->dst.buf;
+
+    copy_from = best_pred->data;
+
+    vp9_convolve_copy(copy_from, bw, copy_to, pd->dst.stride, NULL, 0, NULL, 0,
+                      bw, bh);
+  }
 
   mbmi->mode = best_mode;
   mbmi->interp_filter = best_pred_filter;
@@ -469,18 +667,49 @@
   // Perform intra prediction search, if the best SAD is above a certain
   // threshold.
   if (!x->skip && best_rd > inter_mode_thresh &&
-      bsize < cpi->sf.max_intra_bsize) {
-    for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
-      vp9_predict_intra_block(xd, 0, b_width_log2(bsize),
-                              mbmi->tx_size, this_mode,
-                              &p->src.buf[0], p->src.stride,
-                              &pd->dst.buf[0], pd->dst.stride, 0, 0, 0);
+      bsize <= cpi->sf.max_intra_bsize) {
+    int i, j;
+    const int step   = 1 << mbmi->tx_size;
+    const int width  = num_4x4_blocks_wide_lookup[bsize];
+    const int height = num_4x4_blocks_high_lookup[bsize];
 
-      model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
+    int rate2 = 0;
+    int64_t dist2 = 0;
+    const int dst_stride = pd->dst.stride;
+    const int src_stride = p->src.stride;
+    int block_idx = 0;
+
+    for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
+      if (cpi->sf.reuse_inter_pred_sby) {
+        pd->dst.buf = tmp[0].data;
+        pd->dst.stride = bw;
+      }
+
+      for (j = 0; j < height; j += step) {
+        for (i = 0; i < width; i += step) {
+          vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize),
+                                  mbmi->tx_size, this_mode,
+                                  &p->src.buf[4 * (j * dst_stride + i)],
+                                  src_stride,
+                                  &pd->dst.buf[4 * (j * dst_stride + i)],
+                                  dst_stride, i, j, 0);
+          model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
+          rate2 += rate;
+          dist2 += dist;
+          ++block_idx;
+        }
+      }
+
+      rate = rate2;
+      dist = dist2;
+
       rate += cpi->mbmode_cost[this_mode];
       rate += intra_cost_penalty;
       this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
 
+      if (cpi->sf.reuse_inter_pred_sby)
+        pd->dst = orig_dst;
+
       if (this_rd + intra_mode_cost < best_rd) {
         best_rd = this_rd;
         *returnrate = rate;
@@ -494,9 +723,9 @@
       }
     }
   }
+
 #if CONFIG_DENOISING
-  vp9_denoiser_denoise(&cpi->denoiser, x, cpi->common.mi_grid_visible, mi_row,
-                       mi_col, bsize);
+  vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col, bsize);
 #endif
 
   return INT64_MAX;

diff --git a/vp9/encoder/vp9_pickmode.h b/vp9/encoder/vp9_pickmode.h
index a9c948d..3d89974 100644
--- a/vp9/encoder/vp9_pickmode.h
+++ b/vp9/encoder/vp9_pickmode.h

@@ -17,6 +17,12 @@
 extern "C" {
 #endif
 
+typedef struct {
+  uint8_t *data;
+  int stride;
+  int in_use;
+} PRED_BUFFER;
+
 int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                             const struct TileInfo *const tile,
                             int mi_row, int mi_col,

diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index f775003..e110983 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c

@@ -186,6 +186,8 @@
 }
 
 void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
+  int i;
+
   if (pass == 0 && oxcf->rc_mode == VPX_CBR) {
     rc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q;
     rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
@@ -227,9 +229,9 @@
   rc->tot_q = 0.0;
   rc->avg_q = vp9_convert_qindex_to_q(oxcf->worst_allowed_q);
 
-  rc->rate_correction_factor = 1.0;
-  rc->key_frame_rate_correction_factor = 1.0;
-  rc->gf_rate_correction_factor = 1.0;
+  for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+    rc->rate_correction_factors[i] = 1.0;
+  }
 }
 
 int vp9_rc_drop_frame(VP9_COMP *cpi) {
@@ -271,28 +273,40 @@
 }
 
 static double get_rate_correction_factor(const VP9_COMP *cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+
   if (cpi->common.frame_type == KEY_FRAME) {
-    return cpi->rc.key_frame_rate_correction_factor;
+    return rc->rate_correction_factors[KF_STD];
+  } else if (cpi->pass == 2) {
+    RATE_FACTOR_LEVEL rf_lvl =
+      cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+    return rc->rate_correction_factors[rf_lvl];
   } else {
     if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
-        !cpi->rc.is_src_frame_alt_ref &&
+        !rc->is_src_frame_alt_ref &&
         !(cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR))
-      return cpi->rc.gf_rate_correction_factor;
+      return rc->rate_correction_factors[GF_ARF_STD];
     else
-      return cpi->rc.rate_correction_factor;
+      return rc->rate_correction_factors[INTER_NORMAL];
   }
 }
 
 static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
+  RATE_CONTROL *const rc = &cpi->rc;
+
   if (cpi->common.frame_type == KEY_FRAME) {
-    cpi->rc.key_frame_rate_correction_factor = factor;
+    rc->rate_correction_factors[KF_STD] = factor;
+  } else if (cpi->pass == 2) {
+    RATE_FACTOR_LEVEL rf_lvl =
+      cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+    rc->rate_correction_factors[rf_lvl] = factor;
   } else {
     if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
-        !cpi->rc.is_src_frame_alt_ref &&
+        !rc->is_src_frame_alt_ref &&
         !(cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR))
-      cpi->rc.gf_rate_correction_factor = factor;
+      rc->rate_correction_factors[GF_ARF_STD] = factor;
     else
-      cpi->rc.rate_correction_factor = factor;
+      rc->rate_correction_factors[INTER_NORMAL] = factor;
   }
 }
 
@@ -628,8 +642,8 @@
 
   if (frame_is_intra_only(cm)) {
     active_best_quality = rc->best_quality;
-#if !CONFIG_MULTIPLE_ARF
-    // Handle the special case for key frames forced when we have75 reached
+
+    // Handle the special case for key frames forced when we have reached
     // the maximum key frame interval. Here force the Q to a range
     // based on the ambient Q to reduce the risk of popping.
     if (rc->this_key_frame_forced) {
@@ -660,13 +674,6 @@
       active_best_quality += vp9_compute_qdelta(rc, q_val,
                                                 q_val * q_adj_factor);
     }
-#else
-    double current_q;
-    // Force the KF quantizer to be 30% of the active_worst_quality.
-    current_q = vp9_convert_qindex_to_q(active_worst_quality);
-    active_best_quality = active_worst_quality
-        + vp9_compute_qdelta(rc, current_q, current_q * 0.3);
-#endif
   } else if (!rc->is_src_frame_alt_ref &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
     // Use the lower of active_worst_quality and recent
@@ -768,23 +775,7 @@
         q = *top_index;
     }
   }
-#if CONFIG_MULTIPLE_ARF
-  // Force the quantizer determined by the coding order pattern.
-  if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) &&
-      cpi->oxcf.rc_mode != VPX_Q) {
-    double new_q;
-    double current_q = vp9_convert_qindex_to_q(active_worst_quality);
-    int level = cpi->this_frame_weight;
-    assert(level >= 0);
-    new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level)));
-    q = active_worst_quality +
-        vp9_compute_qdelta(rc, current_q, new_q);
 
-    *bottom_index = q;
-    *top_index    = q;
-    printf("frame:%d q:%d\n", cm->current_video_frame, q);
-  }
-#endif
   assert(*top_index <= rc->worst_quality &&
          *top_index >= rc->best_quality);
   assert(*bottom_index <= rc->worst_quality &&
@@ -805,7 +796,6 @@
   int q;
 
   if (frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) {
-#if !CONFIG_MULTIPLE_ARF
     // Handle the special case for key frames forced when we have75 reached
     // the maximum key frame interval. Here force the Q to a range
     // based on the ambient Q to reduce the risk of popping.
@@ -840,13 +830,6 @@
       active_best_quality += vp9_compute_qdelta(rc, q_val,
                                                 q_val * q_adj_factor);
     }
-#else
-    double current_q;
-    // Force the KF quantizer to be 30% of the active_worst_quality.
-    current_q = vp9_convert_qindex_to_q(active_worst_quality);
-    active_best_quality = active_worst_quality
-        + vp9_compute_qdelta(rc, current_q, current_q * 0.3);
-#endif
   } else if (!rc->is_src_frame_alt_ref &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
     // Use the lower of active_worst_quality and recent
@@ -909,21 +892,20 @@
   *bottom_index = active_best_quality;
 
 #if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
+  vp9_clear_system_state();
   {
-    int qdelta = 0;
-    vp9_clear_system_state();
-
-    // Limit Q range for the adaptive loop.
-    if ((cm->frame_type == KEY_FRAME || vp9_is_upper_layer_key_frame(cpi)) &&
-        !rc->this_key_frame_forced) {
-      qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
-                                          active_worst_quality, 2.0);
-    } else if (!rc->is_src_frame_alt_ref &&
-               (oxcf->rc_mode != VPX_CBR) &&
-               (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
-      qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
-                                          active_worst_quality, 1.75);
-    }
+    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
+      1.00,  // INTER_NORMAL
+      1.00,  // INTER_HIGH
+      1.50,  // GF_ARF_LOW
+      1.75,  // GF_ARF_STD
+      2.00,  // KF_STD
+    };
+    const double rate_factor =
+      rate_factor_deltas[gf_group->rf_level[gf_group->index]];
+    int qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+                                            active_worst_quality, rate_factor);
     *top_index = active_worst_quality + qdelta;
     *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
   }
@@ -945,23 +927,7 @@
         q = *top_index;
     }
   }
-#if CONFIG_MULTIPLE_ARF
-  // Force the quantizer determined by the coding order pattern.
-  if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) &&
-      cpi->oxcf.rc_mode != VPX_Q) {
-    double new_q;
-    double current_q = vp9_convert_qindex_to_q(active_worst_quality);
-    int level = cpi->this_frame_weight;
-    assert(level >= 0);
-    new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level)));
-    q = active_worst_quality +
-        vp9_compute_qdelta(rc, current_q, new_q);
 
-    *bottom_index = q;
-    *top_index    = q;
-    printf("frame:%d q:%d\n", cm->current_video_frame, q);
-  }
-#endif
   assert(*top_index <= rc->worst_quality &&
          *top_index >= rc->best_quality);
   assert(*bottom_index <= rc->worst_quality &&
@@ -1026,11 +992,8 @@
   RATE_CONTROL *const rc = &cpi->rc;
   rc->frames_since_golden = 0;
 
-#if CONFIG_MULTIPLE_ARF
-  if (!cpi->multi_arf_enabled)
-#endif
-    // Clear the alternate reference update pending flag.
-    rc->source_alt_ref_pending = 0;
+  // Mark the alt ref as done (setting to 0 means no further alt refs pending).
+  rc->source_alt_ref_pending = 0;
 
   // Set the alternate reference frame active flag
   rc->source_alt_ref_active = 1;
@@ -1044,8 +1007,13 @@
     // this frame refreshes means next frames don't unless specified by user
     rc->frames_since_golden = 0;
 
-    if (!rc->source_alt_ref_pending)
+    if (cpi->pass == 2) {
+      if (!rc->source_alt_ref_pending &&
+          cpi->twopass.gf_group.rf_level[0] == GF_ARF_STD)
       rc->source_alt_ref_active = 0;
+    } else if (!rc->source_alt_ref_pending) {
+      rc->source_alt_ref_active = 0;
+    }
 
     // Decrement count down till next gf
     if (rc->frames_till_gf_update_due > 0)
@@ -1388,6 +1356,8 @@
 
   // Extended interval for genuinely static scenes
   rc->static_scene_max_gf_interval = oxcf->key_freq >> 1;
+  if (rc->static_scene_max_gf_interval > (MAX_LAG_BUFFERS * 2))
+    rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
 
   if (is_altref_enabled(oxcf)) {
     if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)

diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index f1a4a3f..a15235c 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h

@@ -23,6 +23,15 @@
 // Bits Per MB at different Q (Multiplied by 512)
 #define BPER_MB_NORMBITS    9
 
+typedef enum {
+  INTER_NORMAL = 0,
+  INTER_HIGH = 1,
+  GF_ARF_LOW = 2,
+  GF_ARF_STD = 3,
+  KF_STD = 4,
+  RATE_FACTOR_LEVELS = 5
+} RATE_FACTOR_LEVEL;
+
 typedef struct {
   // Rate targetting variables
   int base_frame_target;           // A baseline frame target before adjustment
@@ -37,9 +46,7 @@
   int last_boost;
   int kf_boost;
 
-  double rate_correction_factor;
-  double key_frame_rate_correction_factor;
-  double gf_rate_correction_factor;
+  double rate_correction_factors[RATE_FACTOR_LEVELS];
 
   int frames_since_golden;
   int frames_till_gf_update_due;

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 429dcb1..9402d4a 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -1277,9 +1277,6 @@
     MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
     MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;
 
-    if (!(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]] & (1 << mode)))
-      continue;
-
     if (cpi->common.frame_type == KEY_FRAME) {
       const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
       const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
@@ -1450,16 +1447,8 @@
 
 static int cost_mv_ref(const VP9_COMP *cpi, PREDICTION_MODE mode,
                        int mode_context) {
-  const MACROBLOCK *const x = &cpi->mb;
-  const int segment_id = x->e_mbd.mi[0]->mbmi.segment_id;
-
-  // Don't account for mode here if segment skip is enabled.
-  if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
-    assert(is_inter_mode(mode));
-    return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
-  } else {
-    return 0;
-  }
+  assert(is_inter_mode(mode));
+  return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
 }
 
 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
@@ -2077,9 +2066,9 @@
   return bsi->segment_rd;
 }
 
-static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
-                    uint8_t *ref_y_buffer, int ref_y_stride,
-                    int ref_frame, BLOCK_SIZE block_size ) {
+void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
+                 uint8_t *ref_y_buffer, int ref_y_stride,
+                 int ref_frame, BLOCK_SIZE block_size) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   int_mv this_mv;
@@ -2218,12 +2207,12 @@
              sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
 }
 
-static void setup_pred_block(const MACROBLOCKD *xd,
-                             struct buf_2d dst[MAX_MB_PLANE],
-                             const YV12_BUFFER_CONFIG *src,
-                             int mi_row, int mi_col,
-                             const struct scale_factors *scale,
-                             const struct scale_factors *scale_uv) {
+void vp9_setup_pred_block(const MACROBLOCKD *xd,
+                          struct buf_2d dst[MAX_MB_PLANE],
+                          const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col,
+                          const struct scale_factors *scale,
+                          const struct scale_factors *scale_uv) {
   int i;
 
   dst[0].buf = src->y_buffer;
@@ -2261,7 +2250,7 @@
 
   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
   // use the UV scaling factors.
-  setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
+  vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
 
   // Gets an initial list of candidate vectors from neighbours and orders them
   vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col);
@@ -2275,8 +2264,8 @@
   // in full and choose the best as the centre point for subsequent searches.
   // The current implementation doesn't support scaling.
   if (!vp9_is_scaled(sf) && block_size >= BLOCK_8X8)
-    mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
-            ref_frame, block_size);
+    vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
+                ref_frame, block_size);
 }
 
 const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
@@ -2802,13 +2791,7 @@
     *rate2 += vp9_get_switchable_rate(cpi);
 
   if (!is_comp_pred) {
-    if (!x->in_active_map ||
-        vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-      if (psse)
-        *psse = 0;
-      *distortion = 0;
-      x->skip = 1;
-    } else if (cpi->allow_encode_breakout && x->encode_breakout) {
+    if (cpi->allow_encode_breakout && x->encode_breakout) {
       const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]);
       const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
       unsigned int var, sse;
@@ -3117,13 +3100,6 @@
     }
   }
 
-  // If the segment skip feature is enabled....
-  // then do nothing if the current mode is not allowed..
-  if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
-    mode_skip_mask = ~(1 << THR_ZEROMV);
-    inter_mode_mask = (1 << ZEROMV);
-  }
-
   // Disable this drop out case if the ref frame
   // segment level feature is enabled for this segment. This is to
   // prevent the possibility that we end up unable to pick any mode.
@@ -3162,21 +3138,6 @@
     mode_skip_mask |= all_intra_modes;
   }
 
-  if (!x->in_active_map) {
-    int mode_index;
-    assert(cpi->ref_frame_flags & VP9_LAST_FLAG);
-    if (frame_mv[NEARESTMV][LAST_FRAME].as_int == 0)
-      mode_index = THR_NEARESTMV;
-    else if (frame_mv[NEARMV][LAST_FRAME].as_int == 0)
-      mode_index = THR_NEARMV;
-    else
-      mode_index = THR_ZEROMV;
-    mode_skip_mask = ~(1 << mode_index);
-    mode_skip_start = MAX_MODES;
-    inter_mode_mask = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) |
-                      (1 << NEWMV);
-  }
-
   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
@@ -3266,17 +3227,14 @@
         }
       }
     } else {
-      if (x->in_active_map &&
-          !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-        const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
-        if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
-                                inter_mode_mask, this_mode, ref_frames))
-          continue;
-      }
+      const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
+      if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
+                              inter_mode_mask, this_mode, ref_frames))
+        continue;
     }
 
     mbmi->mode = this_mode;
-    mbmi->uv_mode = x->in_active_map ? DC_PRED : this_mode;
+    mbmi->uv_mode = DC_PRED;
     mbmi->ref_frame[0] = ref_frame;
     mbmi->ref_frame[1] = second_ref_frame;
     // Evaluate all sub-pel filters irrespective of whether we can use
@@ -3348,31 +3306,20 @@
     }
 
     if (!disable_skip) {
-      // Test for the condition where skip block will be activated
-      // because there are no non zero coefficients and make any
-      // necessary adjustment for rate. Ignore if skip is coded at
-      // segment level as the cost wont have been added in.
-      // Is Mb level skip allowed (i.e. not coded at segment level).
-      const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
-                                                         SEG_LVL_SKIP);
-
       if (skippable) {
+        vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
+
         // Back out the coefficient coding costs
         rate2 -= (rate_y + rate_uv);
         // for best yrd calculation
         rate_uv = 0;
 
-        if (mb_skip_allowed) {
-          int prob_skip_cost;
-
-          // Cost the skip mb case
-          vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
-          if (skip_prob) {
-            prob_skip_cost = vp9_cost_bit(skip_prob, 1);
-            rate2 += prob_skip_cost;
-          }
+        // Cost the skip mb case
+        if (skip_prob) {
+          int prob_skip_cost = vp9_cost_bit(skip_prob, 1);
+          rate2 += prob_skip_cost;
         }
-      } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
+      } else if (ref_frame != INTRA_FRAME && !xd->lossless) {
         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
           // Add in the cost of the no skip flag.
@@ -3387,7 +3334,7 @@
           rate_uv = 0;
           this_skip2 = 1;
         }
-      } else if (mb_skip_allowed) {
+      } else {
         // Add in the cost of the no skip flag.
         rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
       }
@@ -3596,16 +3543,6 @@
     vp9_zero(best_tx_diff);
   }
 
-  if (!x->in_active_map) {
-    assert(mbmi->ref_frame[0] == LAST_FRAME);
-    assert(mbmi->ref_frame[1] == NONE);
-    assert(mbmi->mode == NEARESTMV ||
-           mbmi->mode == NEARMV ||
-           mbmi->mode == ZEROMV);
-    assert(frame_mv[mbmi->mode][LAST_FRAME].as_int == 0);
-    assert(mbmi->mode == mbmi->uv_mode);
-  }
-
   set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
   store_coding_context(x, ctx, best_mode_index,
                        best_pred_diff, best_tx_diff, best_filter_diff);
@@ -3613,6 +3550,113 @@
   return best_rd;
 }
 
+int64_t vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
+                                           const TileInfo *const tile,
+                                           int mi_row, int mi_col,
+                                           int *returnrate,
+                                           int64_t *returndistortion,
+                                           BLOCK_SIZE bsize,
+                                           PICK_MODE_CONTEXT *ctx,
+                                           int64_t best_rd_so_far) {
+  VP9_COMMON *const cm = &cpi->common;
+  RD_OPT *const rd_opt = &cpi->rd;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const struct segmentation *const seg = &cm->seg;
+  unsigned char segment_id = mbmi->segment_id;
+  const int comp_pred = 0;
+  int i;
+  int64_t best_tx_diff[TX_MODES];
+  int64_t best_pred_diff[REFERENCE_MODES];
+  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
+  vp9_prob comp_mode_p;
+  INTERP_FILTER best_filter = SWITCHABLE;
+  int64_t this_rd = INT64_MAX;
+  int rate2 = 0;
+  const int64_t distortion2 = 0;
+
+  x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+
+  estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
+                           &comp_mode_p);
+
+  for (i = 0; i < MAX_REF_FRAMES; ++i)
+    x->pred_sse[i] = INT_MAX;
+  for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i)
+    x->pred_mv_sad[i] = INT_MAX;
+
+  *returnrate = INT_MAX;
+
+  assert(vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP));
+
+  mbmi->mode = ZEROMV;
+  mbmi->uv_mode = DC_PRED;
+  mbmi->ref_frame[0] = LAST_FRAME;
+  mbmi->ref_frame[1] = NONE;
+  mbmi->mv[0].as_int = 0;
+  x->skip = 1;
+
+  // Search for best switchable filter by checking the variance of
+  // pred error irrespective of whether the filter will be used
+  rd_opt->mask_filter = 0;
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+    rd_opt->filter_cache[i] = INT64_MAX;
+
+  if (cm->interp_filter != BILINEAR) {
+    best_filter = EIGHTTAP;
+    if (cm->interp_filter == SWITCHABLE &&
+        x->source_variance >= cpi->sf.disable_filter_search_var_thresh) {
+      int rs;
+      int best_rs = INT_MAX;
+      for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+        mbmi->interp_filter = i;
+        rs = vp9_get_switchable_rate(cpi);
+        if (rs < best_rs) {
+          best_rs = rs;
+          best_filter = mbmi->interp_filter;
+        }
+      }
+    }
+  }
+  // Set the appropriate filter
+  if (cm->interp_filter == SWITCHABLE) {
+    mbmi->interp_filter = best_filter;
+    rate2 += vp9_get_switchable_rate(cpi);
+  } else {
+    mbmi->interp_filter = cm->interp_filter;
+  }
+
+  if (cm->reference_mode == REFERENCE_MODE_SELECT)
+    rate2 += vp9_cost_bit(comp_mode_p, comp_pred);
+
+  // Estimate the reference frame signaling cost and add it
+  // to the rolling cost variable.
+  rate2 += ref_costs_single[LAST_FRAME];
+  this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+
+  *returnrate = rate2;
+  *returndistortion = distortion2;
+
+  if (this_rd >= best_rd_so_far)
+    return INT64_MAX;
+
+  assert((cm->interp_filter == SWITCHABLE) ||
+         (cm->interp_filter == mbmi->interp_filter));
+
+  update_rd_thresh_fact(cpi, bsize, THR_ZEROMV);
+
+  vp9_zero(best_pred_diff);
+  vp9_zero(best_filter_diff);
+  vp9_zero(best_tx_diff);
+
+  if (!x->select_tx_size)
+    swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE);
+  store_coding_context(x, ctx, THR_ZEROMV,
+                       best_pred_diff, best_tx_diff, best_filter_diff);
+
+  return this_rd;
+}
 
 int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
                                       const TileInfo *const tile,

diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 6e56317..3dfe2d0 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h

@@ -13,7 +13,10 @@
 
 #include <limits.h>
 
-#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/common/vp9_blockd.h"
+
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_context_tree.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -30,21 +33,104 @@
 
 #define INVALID_MV 0x80008000
 
+#define MAX_MODES 30
+#define MAX_REFS  6
+
+// This enumerator type needs to be kept aligned with the mode order in
+// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code.
+typedef enum {
+  THR_NEARESTMV,
+  THR_NEARESTA,
+  THR_NEARESTG,
+
+  THR_DC,
+
+  THR_NEWMV,
+  THR_NEWA,
+  THR_NEWG,
+
+  THR_NEARMV,
+  THR_NEARA,
+  THR_COMP_NEARESTLA,
+  THR_COMP_NEARESTGA,
+
+  THR_TM,
+
+  THR_COMP_NEARLA,
+  THR_COMP_NEWLA,
+  THR_NEARG,
+  THR_COMP_NEARGA,
+  THR_COMP_NEWGA,
+
+  THR_ZEROMV,
+  THR_ZEROG,
+  THR_ZEROA,
+  THR_COMP_ZEROLA,
+  THR_COMP_ZEROGA,
+
+  THR_H_PRED,
+  THR_V_PRED,
+  THR_D135_PRED,
+  THR_D207_PRED,
+  THR_D153_PRED,
+  THR_D63_PRED,
+  THR_D117_PRED,
+  THR_D45_PRED,
+} THR_MODES;
+
+typedef enum {
+  THR_LAST,
+  THR_GOLD,
+  THR_ALTR,
+  THR_COMP_LA,
+  THR_COMP_GA,
+  THR_INTRA,
+} THR_MODES_SUB8X8;
+
+typedef struct RD_OPT {
+  // Thresh_mult is used to set a threshold for the rd score. A higher value
+  // means that we will accept the best mode so far more often. This number
+  // is used in combination with the current block size, and thresh_freq_fact
+  // to pick a threshold.
+  int thresh_mult[MAX_MODES];
+  int thresh_mult_sub8x8[MAX_REFS];
+
+  int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
+  int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
+
+  int64_t comp_pred_diff[REFERENCE_MODES];
+  int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
+  int64_t tx_select_diff[TX_MODES];
+  // FIXME(rbultje) can this overflow?
+  int tx_select_threshes[MAX_REF_FRAMES][TX_MODES];
+
+  int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
+  int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+  int64_t mask_filter;
+
+  int RDMULT;
+  int RDDIV;
+} RD_OPT;
+
+
 struct TileInfo;
+struct VP9_COMP;
+struct macroblock;
 
-int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex);
+int vp9_compute_rd_mult(const struct VP9_COMP *cpi, int qindex);
 
-void vp9_initialize_rd_consts(VP9_COMP *cpi);
+void vp9_initialize_rd_consts(struct VP9_COMP *cpi);
 
-void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
+void vp9_initialize_me_consts(struct VP9_COMP *cpi, int qindex);
 
 void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
                                   unsigned int qstep, int *rate,
                                   int64_t *dist);
 
-int vp9_get_switchable_rate(const VP9_COMP *cpi);
+int vp9_get_switchable_rate(const struct VP9_COMP *cpi);
 
-void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+void vp9_setup_buffer_inter(struct VP9_COMP *cpi, struct macroblock *x,
                             const TileInfo *const tile,
                             MV_REFERENCE_FRAME ref_frame,
                             BLOCK_SIZE block_size,
@@ -53,14 +139,14 @@
                             int_mv frame_near_mv[MAX_REF_FRAMES],
                             struct buf_2d yv12_mb[4][MAX_MB_PLANE]);
 
-const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
+const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi,
                                                    int ref_frame);
 
-void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+void vp9_rd_pick_intra_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
                                int *r, int64_t *d, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd);
 
-int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+int64_t vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
                                   const struct TileInfo *const tile,
                                   int mi_row, int mi_col,
                                   int *returnrate,
@@ -69,7 +155,18 @@
                                   PICK_MODE_CONTEXT *ctx,
                                   int64_t best_rd_so_far);
 
-int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+int64_t vp9_rd_pick_inter_mode_sb_seg_skip(struct VP9_COMP *cpi,
+                                           struct macroblock *x,
+                                           const TileInfo *const tile,
+                                           int mi_row, int mi_col,
+                                           int *returnrate,
+                                           int64_t *returndistortion,
+                                           BLOCK_SIZE bsize,
+                                           PICK_MODE_CONTEXT *ctx,
+                                           int64_t best_rd_so_far);
+
+int64_t vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
+                                      struct macroblock *x,
                                       const struct TileInfo *const tile,
                                       int mi_row, int mi_col,
                                       int *returnrate,
@@ -85,15 +182,25 @@
                               ENTROPY_CONTEXT t_above[16],
                               ENTROPY_CONTEXT t_left[16]);
 
-void vp9_set_rd_speed_thresholds(VP9_COMP *cpi);
+void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi);
 
-void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi);
+void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi);
 
 static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
                                       int thresh_fact) {
     return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
 }
 
+void vp9_mv_pred(struct VP9_COMP *cpi, MACROBLOCK *x,
+                 uint8_t *ref_y_buffer, int ref_y_stride,
+                 int ref_frame, BLOCK_SIZE block_size);
+
+void vp9_setup_pred_block(const MACROBLOCKD *xd,
+                          struct buf_2d dst[MAX_MB_PLANE],
+                          const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col,
+                          const struct scale_factors *scale,
+                          const struct scale_factors *scale_uv);
 #ifdef __cplusplus
 }  // extern "C"
 #endif

diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 6beb872..1a14da3 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c

@@ -84,16 +84,17 @@
 
   if (speed >= 2) {
     if (MIN(cm->width, cm->height) >= 720) {
-      sf->lf_motion_threshold = LOW_MOITION_THRESHOLD;
+      sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
       sf->last_partitioning_redo_frequency = 3;
       sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
                                               : DISABLE_ALL_INTER_SPLIT;
     } else {
       sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
       sf->last_partitioning_redo_frequency = 2;
-      sf->lf_motion_threshold = NO_MOITION_THRESHOLD;
+      sf->lf_motion_threshold = NO_MOTION_THRESHOLD;
     }
-    sf->adaptive_pred_interp_filter = 2;
+
+    sf->adaptive_pred_interp_filter = 0;
     sf->reference_masking = 1;
     sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
                                  FLAG_SKIP_INTRA_BESTINTER |
@@ -114,7 +115,7 @@
     else
       sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
 
-    sf->lf_motion_threshold = LOW_MOITION_THRESHOLD;
+    sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
     sf->last_partitioning_redo_frequency = 3;
     sf->recode_loop = ALLOW_RECODE_KFMAXBW;
     sf->adaptive_rd_thresh = 3;
@@ -198,7 +199,7 @@
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
     sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
     sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
-    sf->lf_motion_threshold = LOW_MOITION_THRESHOLD;
+    sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
     sf->adjust_partitioning_from_last_frame = 1;
     sf->last_partitioning_redo_frequency = 3;
     sf->use_lp32x32fdct = 1;
@@ -249,6 +250,8 @@
   }
 
   if (speed >= 5) {
+    sf->auto_min_max_partition_size = (cm->frame_type == KEY_FRAME) ?
+        RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
     sf->max_partition_size = BLOCK_32X32;
     sf->min_partition_size = BLOCK_8X8;
     sf->partition_check =
@@ -270,6 +273,10 @@
     sf->source_var_thresh = 360;
 
     sf->tx_size_search_method = USE_TX_8X8;
+    sf->max_intra_bsize = BLOCK_8X8;
+
+    // This feature is only enabled when partition search is disabled.
+    sf->reuse_inter_pred_sby = 1;
   }
 
   if (speed >= 7) {
@@ -335,6 +342,7 @@
   for (i = 0; i < BLOCK_SIZES; ++i)
     sf->inter_mode_mask[i] = INTER_ALL;
   sf->max_intra_bsize = BLOCK_64X64;
+  sf->reuse_inter_pred_sby = 0;
   // This setting only takes effect when partition_search_type is set
   // to FIXED_PARTITION.
   sf->always_this_block_size = BLOCK_16X16;

diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index c796421..75070a7 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h

@@ -44,8 +44,8 @@
 } SUBPEL_SEARCH_METHODS;
 
 typedef enum {
-  NO_MOITION_THRESHOLD = 0,
-  LOW_MOITION_THRESHOLD = 7
+  NO_MOTION_THRESHOLD = 0,
+  LOW_MOTION_THRESHOLD = 7
 } MOTION_THRESHOLD;
 
 typedef enum {
@@ -353,6 +353,11 @@
 
   // The threshold used in SOURCE_VAR_BASED_PARTITION search type.
   unsigned int source_var_thresh;
+
+  // When partition is pre-set, the inter prediction result from pick_inter_mode
+  // can be reused in final block encoding process. It is enabled only for real-
+  // time mode speed 6.
+  int reuse_inter_pred_sby;
 } SPEED_FEATURES;
 
 struct VP9_COMP;

diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 1b99575..07c17b2 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c

@@ -12,6 +12,7 @@
 
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_svc_layercontext.h"
+#include "vp9/encoder/vp9_extend.h"
 
 void vp9_init_layer_context(VP9_COMP *const cpi) {
   SVC *const svc = &cpi->svc;
@@ -31,6 +32,7 @@
   for (layer = 0; layer < layer_end; ++layer) {
     LAYER_CONTEXT *const lc = &svc->layer_context[layer];
     RATE_CONTROL *const lrc = &lc->rc;
+    int i;
     lc->current_video_frame_in_layer = 0;
     lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
     lrc->ni_av_qi = oxcf->worst_allowed_q;
@@ -42,8 +44,10 @@
     lrc->ni_frames = 0;
     lrc->decimation_count = 0;
     lrc->decimation_factor = 0;
-    lrc->rate_correction_factor = 1.0;
-    lrc->key_frame_rate_correction_factor = 1.0;
+
+    for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+      lrc->rate_correction_factors[i] = 1.0;
+    }
 
     if (svc->number_temporal_layers > 1) {
       lc->target_bandwidth = oxcf->ts_target_bitrate[layer];
@@ -206,3 +210,101 @@
          cpi->svc.spatial_layer_id > 0 &&
          cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame;
 }
+
+int vp9_svc_lookahead_push(const VP9_COMP *const cpi, struct lookahead_ctx *ctx,
+                           YV12_BUFFER_CONFIG *src, int64_t ts_start,
+                           int64_t ts_end, unsigned int flags) {
+  struct lookahead_entry *buf;
+  int i, index;
+
+  if (vp9_lookahead_push(ctx, src, ts_start, ts_end, flags))
+    return 1;
+
+  index = ctx->write_idx - 1;
+  if (index < 0)
+    index += ctx->max_sz;
+
+  buf = ctx->buf + index;
+
+  if (buf == NULL)
+    return 1;
+
+  // Store svc parameters for each layer
+  for (i = 0; i < cpi->svc.number_spatial_layers; ++i)
+    buf->svc_params[i] = cpi->svc.layer_context[i].svc_params_received;
+
+  return 0;
+}
+
+static int copy_svc_params(VP9_COMP *const cpi, struct lookahead_entry *buf) {
+  int layer_id;
+  vpx_svc_parameters_t *layer_param;
+  vpx_enc_frame_flags_t flags;
+
+  // Find the next layer to be encoded
+  for (layer_id = 0; layer_id < cpi->svc.number_spatial_layers; ++layer_id) {
+    if (buf->svc_params[layer_id].spatial_layer >=0)
+      break;
+  }
+
+  if (layer_id == cpi->svc.number_spatial_layers)
+    return 1;
+
+  layer_param = &buf->svc_params[layer_id];
+  buf->flags = flags = layer_param->flags;
+  cpi->svc.spatial_layer_id = layer_param->spatial_layer;
+  cpi->svc.temporal_layer_id = layer_param->temporal_layer;
+  cpi->lst_fb_idx = layer_param->lst_fb_idx;
+  cpi->gld_fb_idx = layer_param->gld_fb_idx;
+  cpi->alt_fb_idx = layer_param->alt_fb_idx;
+
+  if (vp9_set_size_literal(cpi, layer_param->width, layer_param->height) != 0)
+    return VPX_CODEC_INVALID_PARAM;
+
+  cpi->oxcf.worst_allowed_q =
+      vp9_quantizer_to_qindex(layer_param->max_quantizer);
+  cpi->oxcf.best_allowed_q =
+      vp9_quantizer_to_qindex(layer_param->min_quantizer);
+
+  vp9_change_config(cpi, &cpi->oxcf);
+
+  vp9_set_high_precision_mv(cpi, 1);
+
+  // Retrieve the encoding flags for each layer and apply it to encoder.
+  // It includes reference frame flags and update frame flags.
+  vp9_apply_encoding_flags(cpi, flags);
+
+  return 0;
+}
+
+struct lookahead_entry *vp9_svc_lookahead_peek(VP9_COMP *const cpi,
+                                               struct lookahead_ctx *ctx,
+                                               int index, int copy_params) {
+  struct lookahead_entry *buf = vp9_lookahead_peek(ctx, index);
+
+  if (buf != NULL && copy_params != 0) {
+    if (copy_svc_params(cpi, buf) != 0)
+      return NULL;
+  }
+  return buf;
+}
+
+struct lookahead_entry *vp9_svc_lookahead_pop(VP9_COMP *const cpi,
+                                              struct lookahead_ctx *ctx,
+                                              int drain) {
+  struct lookahead_entry *buf = NULL;
+
+  if (ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) {
+    buf = vp9_svc_lookahead_peek(cpi, ctx, 0, 1);
+    if (buf != NULL) {
+      // Only remove the buffer when pop the highest layer. Simply set the
+      // spatial_layer to -1 for lower layers.
+      buf->svc_params[cpi->svc.spatial_layer_id].spatial_layer = -1;
+      if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+        vp9_lookahead_pop(ctx, drain);
+      }
+    }
+  }
+
+  return buf;
+}

diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index 36e2027..3ebb831 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h

@@ -28,6 +28,7 @@
   struct vpx_fixed_buf rc_twopass_stats_in;
   unsigned int current_video_frame_in_layer;
   int is_key_frame;
+  vpx_svc_parameters_t svc_params_received;
 } LAYER_CONTEXT;
 
 typedef struct {
@@ -74,6 +75,23 @@
 // Check if current layer is key frame in spatial upper layer
 int vp9_is_upper_layer_key_frame(const struct VP9_COMP *const cpi);
 
+// Copy the source image, flags and svc parameters into a new framebuffer
+// with the expected stride/border
+int vp9_svc_lookahead_push(const struct VP9_COMP *const cpi,
+                           struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+                           int64_t ts_start, int64_t ts_end,
+                           unsigned int flags);
+
+// Get the next source buffer to encode
+struct lookahead_entry *vp9_svc_lookahead_pop(struct VP9_COMP *const cpi,
+                                              struct lookahead_ctx *ctx,
+                                              int drain);
+
+// Get a future source buffer to encode
+struct lookahead_entry *vp9_svc_lookahead_peek(struct VP9_COMP *const cpi,
+                                               struct lookahead_ctx *ctx,
+                                               int index, int copy_params);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif

diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index edd59ab..b150161 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c

@@ -88,8 +88,8 @@
   size_t                  pending_frame_magnitude;
   vpx_image_t             preview_img;
   vp8_postproc_cfg_t      preview_ppcfg;
-  vpx_codec_pkt_list_decl(64) pkt_list;
-  unsigned int                fixed_kf_cntr;
+  vpx_codec_pkt_list_decl(128) pkt_list;
+  unsigned int                 fixed_kf_cntr;
 };
 
 static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
@@ -795,42 +795,7 @@
     return VPX_CODEC_INVALID_PARAM;
   }
 
-  if (flags & (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF |
-               VP8_EFLAG_NO_REF_ARF)) {
-    int ref = 7;
-
-    if (flags & VP8_EFLAG_NO_REF_LAST)
-      ref ^= VP9_LAST_FLAG;
-
-    if (flags & VP8_EFLAG_NO_REF_GF)
-      ref ^= VP9_GOLD_FLAG;
-
-    if (flags & VP8_EFLAG_NO_REF_ARF)
-      ref ^= VP9_ALT_FLAG;
-
-    vp9_use_as_reference(ctx->cpi, ref);
-  }
-
-  if (flags & (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF |
-               VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_FORCE_GF |
-               VP8_EFLAG_FORCE_ARF)) {
-    int upd = 7;
-
-    if (flags & VP8_EFLAG_NO_UPD_LAST)
-      upd ^= VP9_LAST_FLAG;
-
-    if (flags & VP8_EFLAG_NO_UPD_GF)
-      upd ^= VP9_GOLD_FLAG;
-
-    if (flags & VP8_EFLAG_NO_UPD_ARF)
-      upd ^= VP9_ALT_FLAG;
-
-    vp9_update_reference(ctx->cpi, upd);
-  }
-
-  if (flags & VP8_EFLAG_NO_UPD_ENTROPY) {
-    vp9_update_entropy(ctx->cpi, 0);
-  }
+  vp9_apply_encoding_flags(ctx->cpi, flags);
 
   // Handle fixed keyframe intervals
   if (ctx->cfg.kf_mode == VPX_KF_AUTO &&
@@ -843,7 +808,7 @@
 
   // Initialize the encoder instance on the first frame.
   if (res == VPX_CODEC_OK && ctx->cpi != NULL) {
-    unsigned int lib_flags;
+    unsigned int lib_flags = 0;
     YV12_BUFFER_CONFIG sd;
     int64_t dst_time_stamp, dst_end_time_stamp;
     size_t size, cx_data_sz;
@@ -853,9 +818,6 @@
     if (ctx->base.init_flags & VPX_CODEC_USE_PSNR)
       ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1;
 
-    // Convert API flags to internal codec lib flags
-    lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
-
     /* vp9 use 10,000,000 ticks/second as time stamp */
     dst_time_stamp = (pts * 10000000 * ctx->cfg.g_timebase.num)
                      / ctx->cfg.g_timebase.den;
@@ -865,7 +827,9 @@
     if (img != NULL) {
       res = image2yuvconfig(img, &sd);
 
-      if (vp9_receive_raw_frame(ctx->cpi, lib_flags,
+      // Store the original flags in to the frame buffer. Will extract the
+      // key frame flag when we actually encode this frame.
+      if (vp9_receive_raw_frame(ctx->cpi, flags,
                                 &sd, dst_time_stamp, dst_end_time_stamp)) {
         VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
         res = update_error_state(ctx, &cpi->common.error);
@@ -874,7 +838,6 @@
 
     cx_data = ctx->cx_data;
     cx_data_sz = ctx->cx_data_sz;
-    lib_flags = 0;
 
     /* Any pending invisible frames? */
     if (ctx->pending_cx_data) {
@@ -902,7 +865,12 @@
         VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi;
 
         // Pack invisible frames with the next visible frame
-        if (cpi->common.show_frame == 0) {
+        if (cpi->common.show_frame == 0
+#ifdef CONFIG_SPATIAL_SVC
+            || (cpi->use_svc && cpi->svc.number_temporal_layers == 1 &&
+                cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
+#endif
+            ) {
           if (ctx->pending_cx_data == 0)
             ctx->pending_cx_data = cx_data;
           ctx->pending_cx_data_sz += size;
@@ -925,7 +893,12 @@
           / ctx->cfg.g_timebase.num / 10000000);
         pkt.data.frame.flags = lib_flags << 16;
 
-        if (lib_flags & FRAMEFLAGS_KEY)
+        if (lib_flags & FRAMEFLAGS_KEY
+#ifdef CONFIG_SPATIAL_SVC
+            || (cpi->use_svc && cpi->svc.number_temporal_layers == 1 &&
+                cpi->svc.layer_context[0].is_key_frame)
+#endif
+            )
           pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
 
         if (cpi->common.show_frame == 0) {
@@ -1165,24 +1138,19 @@
   VP9_COMP *const cpi = ctx->cpi;
   vpx_svc_parameters_t *const params = va_arg(args, vpx_svc_parameters_t *);
 
-  if (params == NULL)
+  if (params == NULL || params->spatial_layer < 0 ||
+      params->spatial_layer >= cpi->svc.number_spatial_layers)
     return VPX_CODEC_INVALID_PARAM;
 
-  cpi->svc.spatial_layer_id = params->spatial_layer;
-  cpi->svc.temporal_layer_id = params->temporal_layer;
+  if (params->spatial_layer == 0) {
+    int i;
+    for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
+      cpi->svc.layer_context[i].svc_params_received.spatial_layer = -1;
+    }
+  }
 
-  cpi->lst_fb_idx = params->lst_fb_idx;
-  cpi->gld_fb_idx = params->gld_fb_idx;
-  cpi->alt_fb_idx = params->alt_fb_idx;
-
-  if (vp9_set_size_literal(ctx->cpi, params->width, params->height) != 0)
-    return VPX_CODEC_INVALID_PARAM;
-
-  ctx->cfg.rc_max_quantizer = params->max_quantizer;
-  ctx->cfg.rc_min_quantizer = params->min_quantizer;
-
-  set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
-  vp9_change_config(ctx->cpi, &ctx->oxcf);
+  cpi->svc.layer_context[params->spatial_layer].svc_params_received =
+      *params;
 
   return VPX_CODEC_OK;
 }

diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 3b5d4bf..fd868ae 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c

@@ -32,7 +32,6 @@
   vpx_codec_priv_t        base;
   vpx_codec_dec_cfg_t     cfg;
   vp9_stream_info_t       si;
-  struct VP9Decoder *pbi;
   int                     postproc_cfg_set;
   vp8_postproc_cfg_t      postproc_cfg;
   vpx_decrypt_cb          decrypt_cb;
@@ -42,6 +41,11 @@
   int                     frame_parallel_decode;  // frame-based threading.
   int                     last_show_frame;  // Index of last output frame.
 
+  VP9Worker               *frame_workers;
+  int                     num_frame_workers;
+  int                     next_submit_thread_id;
+  int                     next_output_thread_id;
+
   // External frame buffer info to save for VP9 common.
   void *ext_priv;  // Private data associated with the external frame buffers.
   vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb;
@@ -85,11 +89,17 @@
 }
 
 static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
-  if (ctx->pbi) {
-    vp9_decoder_remove(ctx->pbi);
-    ctx->pbi = NULL;
+  if (ctx->frame_workers != NULL) {
+    int i;
+    for (i = 0; i < ctx->num_frame_workers; ++i) {
+      VP9Worker *const worker = &ctx->frame_workers[i];
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      vp9_decoder_remove(worker_data->pbi);
+      vpx_free(worker_data);
+    }
   }
 
+  vpx_free(ctx->frame_workers);
   vpx_free(ctx);
 
   return VPX_CODEC_OK;
@@ -102,9 +112,6 @@
                                                 void *decrypt_state) {
   uint8_t clear_buffer[9];
 
-  if (data_sz <= 8)
-    return VPX_CODEC_UNSUP_BITSTREAM;
-
   if (data + data_sz <= data)
     return VPX_CODEC_INVALID_PARAM;
 
@@ -125,12 +132,16 @@
 
     if (frame_marker != VP9_FRAME_MARKER)
       return VPX_CODEC_UNSUP_BITSTREAM;
+
     if (version > 1) return VPX_CODEC_UNSUP_BITSTREAM;
 
     if (vp9_rb_read_bit(&rb)) {  // show an existing frame
       return VPX_CODEC_OK;
     }
 
+    if (data_sz <= 8)
+      return VPX_CODEC_UNSUP_BITSTREAM;
+
     si->is_kf = !vp9_rb_read_bit(&rb);
     if (si->is_kf) {
       const int sRGB = 7;
@@ -187,32 +198,42 @@
   return VPX_CODEC_OK;
 }
 
+static void set_error_detail(vpx_codec_alg_priv_t *ctx,
+                             const char *const error) {
+  ctx->base.err_detail = error;
+}
+
 static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx,
                            const struct vpx_internal_error_info *error) {
   if (error->error_code)
-    ctx->base.err_detail = error->has_detail ? error->detail : NULL;
+    set_error_detail(ctx, error->has_detail ? error->detail : NULL);
 
   return error->error_code;
 }
 
 static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
-  VP9_COMMON *const cm = &ctx->pbi->common;
+  int i;
 
-  cm->new_fb_idx = -1;
+  for (i = 0; i < ctx->num_frame_workers; ++i) {
+    VP9Worker *const worker = &ctx->frame_workers[i];
+    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+    VP9_COMMON *const cm = &worker_data->pbi->common;
 
-  if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
-    cm->get_fb_cb = ctx->get_ext_fb_cb;
-    cm->release_fb_cb = ctx->release_ext_fb_cb;
-    cm->cb_priv = ctx->ext_priv;
-  } else {
-    cm->get_fb_cb = vp9_get_frame_buffer;
-    cm->release_fb_cb = vp9_release_frame_buffer;
+    cm->new_fb_idx = -1;
+    if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
+      cm->get_fb_cb = ctx->get_ext_fb_cb;
+      cm->release_fb_cb = ctx->release_ext_fb_cb;
+      cm->cb_priv = ctx->ext_priv;
+    } else {
+      cm->get_fb_cb = vp9_get_frame_buffer;
+      cm->release_fb_cb = vp9_release_frame_buffer;
 
-    if (vp9_alloc_internal_frame_buffers(&cm->int_frame_buffers))
-      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                         "Failed to initialize internal frame buffers");
+      if (vp9_alloc_internal_frame_buffers(&cm->int_frame_buffers))
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to initialize internal frame buffers");
 
-    cm->cb_priv = &cm->int_frame_buffers;
+      cm->cb_priv = &cm->int_frame_buffers;
+    }
   }
 }
 
@@ -231,15 +252,58 @@
   flags->noise_level = ctx->postproc_cfg.noise_level;
 }
 
-static void init_decoder(vpx_codec_alg_priv_t *ctx) {
-  ctx->pbi = vp9_decoder_create();
-  if (ctx->pbi == NULL)
-    return;
+static int frame_worker_hook(void *arg1, void *arg2) {
+  FrameWorkerData *const worker_data = (FrameWorkerData *)arg1;
+  const uint8_t *data = worker_data->data;
+  (void)arg2;
+  worker_data->result = vp9_receive_compressed_data(worker_data->pbi,
+                                                    worker_data->data_size,
+                                                    &data);
+  worker_data->data_end = data;
+  return !worker_data->result;
+}
 
-  ctx->pbi->max_threads = ctx->cfg.threads;
-  ctx->pbi->inv_tile_order = ctx->invert_tile_order;
-  ctx->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
+static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
+  int i;
+
   ctx->last_show_frame = -1;
+  ctx->next_submit_thread_id = 0;
+  ctx->next_output_thread_id = 0;
+  ctx->num_frame_workers =
+      (ctx->frame_parallel_decode == 1) ? ctx->cfg.threads: 1;
+
+  ctx->frame_workers = (VP9Worker *)
+      vpx_malloc(ctx->num_frame_workers * sizeof(*ctx->frame_workers));
+  if (ctx->frame_workers == NULL) {
+    set_error_detail(ctx, "Failed to allocate frame_workers");
+    return VPX_CODEC_MEM_ERROR;
+  }
+
+  for (i = 0; i < ctx->num_frame_workers; ++i) {
+    VP9Worker *const worker = &ctx->frame_workers[i];
+    FrameWorkerData *worker_data = NULL;
+    vp9_worker_init(worker);
+    worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData));
+    if (worker->data1 == NULL) {
+      set_error_detail(ctx, "Failed to allocate worker_data");
+      return VPX_CODEC_MEM_ERROR;
+    }
+    worker_data = (FrameWorkerData *)worker->data1;
+    worker_data->pbi = vp9_decoder_create();
+    if (worker_data->pbi == NULL) {
+      set_error_detail(ctx, "Failed to allocate worker_data");
+      return VPX_CODEC_MEM_ERROR;
+    }
+
+    // If decoding in serial mode, FrameWorker thread could create tile worker
+    // thread or loopfilter thread.
+    worker_data->pbi->max_threads =
+        (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0;
+
+    worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
+    worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
+    worker->hook = (VP9WorkerHook)frame_worker_hook;
+  }
 
   // If postprocessing was enabled by the application and a
   // configuration has not been provided, default it.
@@ -248,14 +312,14 @@
     set_default_ppflags(&ctx->postproc_cfg);
 
   init_buffer_callbacks(ctx);
+
+  return VPX_CODEC_OK;
 }
 
 static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
                                   const uint8_t **data, unsigned int data_sz,
                                   void *user_priv, int64_t deadline) {
   vp9_ppflags_t flags = {0};
-  VP9_COMMON *cm = NULL;
-
   (void)deadline;
 
   // Determine the stream parameters. Note that we rely on peek_si to
@@ -272,22 +336,35 @@
       return VPX_CODEC_ERROR;
   }
 
-  // Initialize the decoder instance on the first frame
-  if (ctx->pbi == NULL) {
-    init_decoder(ctx);
-    if (ctx->pbi == NULL)
-      return VPX_CODEC_ERROR;
+  // Initialize the decoder workers on the first frame
+  if (ctx->frame_workers == NULL) {
+    const vpx_codec_err_t res = init_decoder(ctx);
+    if (res != VPX_CODEC_OK)
+      return res;
   }
 
-  // Set these even if already initialized.  The caller may have changed the
-  // decrypt config between frames.
-  ctx->pbi->decrypt_cb = ctx->decrypt_cb;
-  ctx->pbi->decrypt_state = ctx->decrypt_state;
+  if (!ctx->frame_parallel_decode) {
+    VP9Worker *const worker = ctx->frame_workers;
+    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+    worker_data->data = *data;
+    worker_data->data_size = data_sz;
+    worker_data->user_priv = user_priv;
 
-  cm = &ctx->pbi->common;
+    // Set these even if already initialized.  The caller may have changed the
+    // decrypt config between frames.
+    worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
+    worker_data->pbi->decrypt_state = ctx->decrypt_state;
 
-  if (vp9_receive_compressed_data(ctx->pbi, data_sz, data))
-    return update_error_state(ctx, &cm->error);
+    vp9_worker_execute(worker);
+    if (worker->had_error)
+      return update_error_state(ctx, &worker_data->pbi->common.error);
+
+    // Update data pointer after decode.
+    *data = worker_data->data_end;
+  } else {
+    // TODO(hkuang): Implement frame parallel decode.
+    return VPX_CODEC_INCAPABLE;
+  }
 
   if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
     set_ppflags(ctx, &flags);
@@ -306,10 +383,17 @@
   return *data;
 }
 
-static void parse_superframe_index(const uint8_t *data, size_t data_sz,
-                                   uint32_t sizes[8], int *count,
-                                   vpx_decrypt_cb decrypt_cb,
-                                   void *decrypt_state) {
+static vpx_codec_err_t parse_superframe_index(const uint8_t *data,
+                                              size_t data_sz,
+                                              uint32_t sizes[8], int *count,
+                                              vpx_decrypt_cb decrypt_cb,
+                                              void *decrypt_state) {
+  // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
+  // it is a super frame index. If the last byte of real video compression
+  // data is 0xc0 the encoder must add a 0 byte. If we have the marker but
+  // not the associated matching marker byte at the front of the index we have
+  // an invalid bitstream and need to return an error.
+
   uint8_t marker;
 
   assert(data_sz);
@@ -321,56 +405,45 @@
     const uint32_t mag = ((marker >> 3) & 0x3) + 1;
     const size_t index_sz = 2 + mag * frames;
 
-    if (data_sz >= index_sz) {
-      uint8_t marker2 = read_marker(decrypt_cb, decrypt_state,
-                                    data + data_sz - index_sz);
+    // This chunk is marked as having a superframe index but doesn't have
+    // enough data for it, thus it's an invalid superframe index.
+    if (data_sz < index_sz)
+      return VPX_CODEC_CORRUPT_FRAME;
 
-      if (marker == marker2) {
-        // Found a valid superframe index.
-        uint32_t i, j;
-        const uint8_t *x = &data[data_sz - index_sz + 1];
+    {
+      const uint8_t marker2 = read_marker(decrypt_cb, decrypt_state,
+                                          data + data_sz - index_sz);
 
-        // Frames has a maximum of 8 and mag has a maximum of 4.
-        uint8_t clear_buffer[32];
-        assert(sizeof(clear_buffer) >= frames * mag);
-        if (decrypt_cb) {
-          decrypt_cb(decrypt_state, x, clear_buffer, frames * mag);
-          x = clear_buffer;
-        }
+      // This chunk is marked as having a superframe index but doesn't have
+      // the matching marker byte at the front of the index therefore it's an
+      // invalid chunk.
+      if (marker != marker2)
+        return VPX_CODEC_CORRUPT_FRAME;
+    }
 
-        for (i = 0; i < frames; ++i) {
-          uint32_t this_sz = 0;
+    {
+      // Found a valid superframe index.
+      uint32_t i, j;
+      const uint8_t *x = &data[data_sz - index_sz + 1];
 
-          for (j = 0; j < mag; ++j)
-            this_sz |= (*x++) << (j * 8);
-          sizes[i] = this_sz;
-        }
-
-        *count = frames;
+      // Frames has a maximum of 8 and mag has a maximum of 4.
+      uint8_t clear_buffer[32];
+      assert(sizeof(clear_buffer) >= frames * mag);
+      if (decrypt_cb) {
+        decrypt_cb(decrypt_state, x, clear_buffer, frames * mag);
+        x = clear_buffer;
       }
+
+      for (i = 0; i < frames; ++i) {
+        uint32_t this_sz = 0;
+
+        for (j = 0; j < mag; ++j)
+          this_sz |= (*x++) << (j * 8);
+        sizes[i] = this_sz;
+      }
+      *count = frames;
     }
   }
-}
-
-static vpx_codec_err_t decode_one_iter(vpx_codec_alg_priv_t *ctx,
-                                       const uint8_t **data_start_ptr,
-                                       const uint8_t *data_end,
-                                       uint32_t frame_size, void *user_priv,
-                                       long deadline) {
-  const vpx_codec_err_t res = decode_one(ctx, data_start_ptr, frame_size,
-                                         user_priv, deadline);
-  if (res != VPX_CODEC_OK)
-    return res;
-
-  // Account for suboptimal termination by the encoder.
-  while (*data_start_ptr < data_end) {
-    const uint8_t marker = read_marker(ctx->decrypt_cb, ctx->decrypt_state,
-                                       *data_start_ptr);
-    if (marker)
-      break;
-    (*data_start_ptr)++;
-  }
-
   return VPX_CODEC_OK;
 }
 
@@ -378,7 +451,7 @@
                                       const uint8_t *data, unsigned int data_sz,
                                       void *user_priv, long deadline) {
   const uint8_t *data_start = data;
-  const uint8_t *const data_end = data + data_sz;
+  const uint8_t * const data_end = data + data_sz;
   vpx_codec_err_t res;
   uint32_t frame_sizes[8];
   int frame_count;
@@ -386,32 +459,86 @@
   if (data == NULL || data_sz == 0)
     return VPX_CODEC_INVALID_PARAM;
 
-  parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
-                         ctx->decrypt_cb, ctx->decrypt_state);
+  res = parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
+                               ctx->decrypt_cb, ctx->decrypt_state);
+  if (res != VPX_CODEC_OK)
+    return res;
 
-  if (frame_count > 0) {
-    int i;
+  if (ctx->frame_parallel_decode) {
+    // Decode in frame parallel mode. When decoding in this mode, the frame
+    // passed to the decoder must be either a normal frame or a superframe with
+    // superframe index so the decoder could get each frame's start position
+    // in the superframe.
+    if (frame_count > 0) {
+      int i;
 
-    for (i = 0; i < frame_count; ++i) {
-      const uint32_t frame_size = frame_sizes[i];
-      if (data_start < data ||
-          frame_size > (uint32_t)(data_end - data_start)) {
-        ctx->base.err_detail = "Invalid frame size in index";
-        return VPX_CODEC_CORRUPT_FRAME;
+      for (i = 0; i < frame_count; ++i) {
+        const uint8_t *data_start_copy = data_start;
+        const uint32_t frame_size = frame_sizes[i];
+        vpx_codec_err_t res;
+        if (data_start < data
+            || frame_size > (uint32_t) (data_end - data_start)) {
+          set_error_detail(ctx, "Invalid frame size in index");
+          return VPX_CODEC_CORRUPT_FRAME;
+        }
+
+        res = decode_one(ctx, &data_start_copy, frame_size, user_priv,
+                         deadline);
+        if (res != VPX_CODEC_OK)
+          return res;
+
+        data_start += frame_size;
       }
-
-      res = decode_one_iter(ctx, &data_start, data_end, frame_size,
-                            user_priv, deadline);
+    } else {
+      res = decode_one(ctx, &data_start, data_sz, user_priv, deadline);
       if (res != VPX_CODEC_OK)
         return res;
+
+      // Extra data detected after the frame.
+      if (data_start < data_end - 1) {
+        set_error_detail(ctx, "Fail to decode frame in parallel mode");
+        return VPX_CODEC_INCAPABLE;
+      }
     }
   } else {
-    while (data_start < data_end) {
-      res = decode_one_iter(ctx, &data_start, data_end,
-                            (uint32_t)(data_end - data_start),
-                            user_priv, deadline);
-      if (res != VPX_CODEC_OK)
-        return res;
+    // Decode in serial mode.
+    if (frame_count > 0) {
+      int i;
+
+      for (i = 0; i < frame_count; ++i) {
+        const uint8_t *data_start_copy = data_start;
+        const uint32_t frame_size = frame_sizes[i];
+        vpx_codec_err_t res;
+        if (data_start < data
+            || frame_size > (uint32_t) (data_end - data_start)) {
+          set_error_detail(ctx, "Invalid frame size in index");
+          return VPX_CODEC_CORRUPT_FRAME;
+        }
+
+        res = decode_one(ctx, &data_start_copy, frame_size, user_priv,
+                         deadline);
+        if (res != VPX_CODEC_OK)
+          return res;
+
+        data_start += frame_size;
+      }
+    } else {
+      while (data_start < data_end) {
+        const uint32_t frame_size = (uint32_t) (data_end - data_start);
+        const vpx_codec_err_t res = decode_one(ctx, &data_start, frame_size,
+                                               user_priv, deadline);
+        if (res != VPX_CODEC_OK)
+          return res;
+
+        // Account for suboptimal termination by the encoder.
+        while (data_start < data_end) {
+          const uint8_t marker = read_marker(ctx->decrypt_cb,
+                                             ctx->decrypt_state, data_start);
+          if (marker)
+            break;
+          ++data_start;
+        }
+      }
     }
   }
 
@@ -424,13 +551,15 @@
 
   // iter acts as a flip flop, so an image is only returned on the first
   // call to get_frame.
-  if (*iter == NULL && ctx->pbi != NULL) {
+  if (*iter == NULL && ctx->frame_workers != NULL) {
     YV12_BUFFER_CONFIG sd;
     vp9_ppflags_t flags = {0, 0, 0};
 
-    if (vp9_get_raw_frame(ctx->pbi, &sd, &flags) == 0) {
-      VP9_COMMON *cm = &ctx->pbi->common;
-      yuvconfig2image(&ctx->img, &sd, NULL);
+    VP9Worker *const worker = &ctx->frame_workers[ctx->next_output_thread_id];
+    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+    if (vp9_get_raw_frame(worker_data->pbi, &sd, &flags) == 0) {
+      VP9_COMMON *const cm = &worker_data->pbi->common;
+      yuvconfig2image(&ctx->img, &sd, worker_data->user_priv);
       ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
       img = &ctx->img;
       *iter = img;
@@ -442,7 +571,7 @@
               &cm->frame_bufs[ctx->last_show_frame].raw_frame_buffer);
         }
       }
-      ctx->last_show_frame = ctx->pbi->common.new_fb_idx;
+      ctx->last_show_frame = worker_data->pbi->common.new_fb_idx;
     }
   }
 
@@ -455,7 +584,7 @@
     vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
   if (cb_get == NULL || cb_release == NULL) {
     return VPX_CODEC_INVALID_PARAM;
-  } else if (ctx->pbi == NULL) {
+  } else if (ctx->frame_workers == NULL) {
     // If the decoder has already been initialized, do not accept changes to
     // the frame buffer functions.
     ctx->get_ext_fb_cb = cb_get;
@@ -471,12 +600,19 @@
                                           va_list args) {
   vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *);
 
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
   if (data) {
     vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data;
     YV12_BUFFER_CONFIG sd;
-
+    VP9Worker *const worker = ctx->frame_workers;
+    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
     image2yuvconfig(&frame->img, &sd);
-    return vp9_set_reference_dec(&ctx->pbi->common,
+    return vp9_set_reference_dec(&worker_data->pbi->common,
                                  (VP9_REFFRAME)frame->frame_type, &sd);
   } else {
     return VPX_CODEC_INVALID_PARAM;
@@ -487,13 +623,19 @@
                                            va_list args) {
   vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
 
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
   if (data) {
-    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+    vpx_ref_frame_t *frame = (vpx_ref_frame_t *) data;
     YV12_BUFFER_CONFIG sd;
-
+    VP9Worker *const worker = ctx->frame_workers;
+    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
     image2yuvconfig(&frame->img, &sd);
-
-    return vp9_copy_reference_dec(ctx->pbi,
+    return vp9_copy_reference_dec(worker_data->pbi,
                                   (VP9_REFFRAME)frame->frame_type, &sd);
   } else {
     return VPX_CODEC_INVALID_PARAM;
@@ -504,11 +646,18 @@
                                           va_list args) {
   vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *);
 
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
   if (data) {
     YV12_BUFFER_CONFIG* fb;
-
-    vp9_get_reference_dec(ctx->pbi, data->idx, &fb);
-    yuvconfig2image(&data->img, fb, NULL);
+    VP9Worker *const worker = ctx->frame_workers;
+    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+    vp9_get_reference_dec(worker_data->pbi, data->idx, &fb);
+    yuvconfig2image(&data->img, fb, worker_data->user_priv);
     return VPX_CODEC_OK;
   } else {
     return VPX_CODEC_INVALID_PARAM;
@@ -545,11 +694,20 @@
                                                  va_list args) {
   int *const update_info = va_arg(args, int *);
 
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
   if (update_info) {
-    if (ctx->pbi)
-      *update_info = ctx->pbi->refresh_frame_flags;
-    else
+    if (ctx->frame_workers) {
+      VP9Worker *const worker = ctx->frame_workers;
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      *update_info = worker_data->pbi->refresh_frame_flags;
+    } else {
       return VPX_CODEC_ERROR;
+    }
     return VPX_CODEC_OK;
   } else {
     return VPX_CODEC_INVALID_PARAM;
@@ -561,11 +719,20 @@
                                                 va_list args) {
   int *corrupted = va_arg(args, int *);
 
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
   if (corrupted) {
-    if (ctx->pbi)
-      *corrupted = ctx->pbi->common.frame_to_show->corrupted;
-    else
+    if (ctx->frame_workers) {
+      VP9Worker *const worker = ctx->frame_workers;
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      *corrupted = worker_data->pbi->common.frame_to_show->corrupted;
+    } else {
       return VPX_CODEC_ERROR;
+    }
     return VPX_CODEC_OK;
   } else {
     return VPX_CODEC_INVALID_PARAM;
@@ -576,9 +743,17 @@
                                              va_list args) {
   int *const display_size = va_arg(args, int *);
 
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
   if (display_size) {
-    if (ctx->pbi) {
-      const VP9_COMMON *const cm = &ctx->pbi->common;
+    if (ctx->frame_workers) {
+      VP9Worker *const worker = ctx->frame_workers;
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      const VP9_COMMON *const cm = &worker_data->pbi->common;
       display_size[0] = cm->display_width;
       display_size[1] = cm->display_height;
     } else {

diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 9dbb678..6a34f7e 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk

@@ -105,11 +105,9 @@
 ifeq ($(CONFIG_USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
 endif
 
@@ -124,7 +122,9 @@
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c
 
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
 
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))

diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c
index 17e165b..4efba9c 100644
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c

@@ -24,6 +24,7 @@
 #include "vpx/svc_context.h"
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
+#include "vpx_mem/vpx_mem.h"
 
 #ifdef __MINGW32__
 #define strtok_r strtok_s
@@ -47,6 +48,14 @@
 static const char *DEFAULT_QUANTIZER_VALUES = "60,53,39,33,27";
 static const char *DEFAULT_SCALE_FACTORS = "4/16,5/16,7/16,11/16,16/16";
 
+// One encoded frame
+typedef struct FrameData {
+  void                     *buf;    // compressed data buffer
+  size_t                    size;  // length of compressed data
+  vpx_codec_frame_flags_t   flags;    /**< flags for this frame */
+  struct FrameData         *next;
+} FrameData;
+
 typedef struct SvcInternal {
   char options[OPTION_BUFFER_SIZE];        // set by vpx_svc_set_options
   char quantizers[OPTION_BUFFER_SIZE];     // set by vpx_svc_set_quantizers
@@ -72,15 +81,15 @@
 
   // state variables
   int encode_frame_count;
+  int frame_received;
   int frame_within_gop;
   vpx_enc_frame_flags_t enc_frame_flags;
   int layers;
   int layer;
   int is_keyframe;
 
-  size_t frame_size;
-  size_t buffer_size;
-  void *buffer;
+  FrameData *frame_list;
+  FrameData *frame_temp;
 
   char *rc_stats_buf;
   size_t rc_stats_buf_size;
@@ -90,128 +99,54 @@
   vpx_codec_ctx_t *codec_ctx;
 } SvcInternal;
 
-// Superframe is used to generate an index of individual frames (i.e., layers)
-struct Superframe {
-  int count;
-  uint32_t sizes[SUPERFRAME_SLOTS];
-  uint32_t magnitude;
-  uint8_t buffer[SUPERFRAME_BUFFER_SIZE];
-  size_t index_size;
-};
-
-// One encoded frame layer
-struct LayerData {
-  void *buf;    // compressed data buffer
-  size_t size;  // length of compressed data
-  struct LayerData *next;
-};
-
-// create LayerData from encoder output
-static struct LayerData *ld_create(void *buf, size_t size) {
-  struct LayerData *const layer_data =
-      (struct LayerData *)malloc(sizeof(*layer_data));
-  if (layer_data == NULL) {
+// create FrameData from encoder output
+static struct FrameData *fd_create(void *buf, size_t size,
+                                   vpx_codec_frame_flags_t flags) {
+  struct FrameData *const frame_data =
+      (struct FrameData *)vpx_malloc(sizeof(*frame_data));
+  if (frame_data == NULL) {
     return NULL;
   }
-  layer_data->buf = malloc(size);
-  if (layer_data->buf == NULL) {
-    free(layer_data);
+  frame_data->buf = vpx_malloc(size);
+  if (frame_data->buf == NULL) {
+    vpx_free(frame_data);
     return NULL;
   }
-  memcpy(layer_data->buf, buf, size);
-  layer_data->size = size;
-  return layer_data;
+  vpx_memcpy(frame_data->buf, buf, size);
+  frame_data->size = size;
+  frame_data->flags = flags;
+  return frame_data;
 }
 
-// free LayerData
-static void ld_free(struct LayerData *layer_data) {
-  if (layer_data) {
-    if (layer_data->buf) {
-      free(layer_data->buf);
-      layer_data->buf = NULL;
-    }
-    free(layer_data);
+// free FrameData
+static void fd_free(struct FrameData *p) {
+  if (p) {
+    if (p->buf)
+      vpx_free(p->buf);
+    vpx_free(p);
   }
 }
 
-// add layer data to list
-static void ld_list_add(struct LayerData **list, struct LayerData *layer_data) {
-  struct LayerData **p = list;
+// add FrameData to list
+static void fd_list_add(struct FrameData **list, struct FrameData *layer_data) {
+  struct FrameData **p = list;
 
   while (*p != NULL) p = &(*p)->next;
   *p = layer_data;
   layer_data->next = NULL;
 }
 
-// get accumulated size of layer data
-static size_t ld_list_get_buffer_size(struct LayerData *list) {
-  struct LayerData *p;
-  size_t size = 0;
-
-  for (p = list; p != NULL; p = p->next) {
-    size += p->size;
-  }
-  return size;
-}
-
-// copy layer data to buffer
-static void ld_list_copy_to_buffer(struct LayerData *list, uint8_t *buffer) {
-  struct LayerData *p;
-
-  for (p = list; p != NULL; p = p->next) {
-    buffer[0] = 1;
-    memcpy(buffer, p->buf, p->size);
-    buffer += p->size;
-  }
-}
-
-// free layer data list
-static void ld_list_free(struct LayerData *list) {
-  struct LayerData *p = list;
+// free FrameData list
+static void fd_free_list(struct FrameData *list) {
+  struct FrameData *p = list;
 
   while (p) {
     list = list->next;
-    ld_free(p);
+    fd_free(p);
     p = list;
   }
 }
 
-static void sf_create_index(struct Superframe *sf) {
-  uint8_t marker = 0xc0;
-  int i;
-  uint32_t mag, mask;
-  uint8_t *bufp;
-
-  if (sf->count == 0 || sf->count >= 8) return;
-
-  // Add the number of frames to the marker byte
-  marker |= sf->count - 1;
-
-  // Choose the magnitude
-  for (mag = 0, mask = 0xff; mag < 4; ++mag) {
-    if (sf->magnitude < mask) break;
-    mask <<= 8;
-    mask |= 0xff;
-  }
-  marker |= mag << 3;
-
-  // Write the index
-  sf->index_size = 2 + (mag + 1) * sf->count;
-  bufp = sf->buffer;
-
-  *bufp++ = marker;
-  for (i = 0; i < sf->count; ++i) {
-    int this_sz = sf->sizes[i];
-    uint32_t j;
-
-    for (j = 0; j <= mag; ++j) {
-      *bufp++ = this_sz & 0xff;
-      this_sz >>= 8;
-    }
-  }
-  *bufp++ = marker;
-}
-
 static SvcInternal *get_svc_internal(SvcContext *svc_ctx) {
   if (svc_ctx == NULL) return NULL;
   if (svc_ctx->internal == NULL) {
@@ -574,8 +509,6 @@
   // modify encoder configuration
   enc_cfg->ss_number_layers = si->layers;
   enc_cfg->ts_number_layers = 1;  // Temporal layers not used in this encoder.
-  // Lag in frames not currently supported
-  enc_cfg->g_lag_in_frames = 0;
 
   // TODO(ivanmaltz): determine if these values need to be set explicitly for
   // svc, or if the normal default/override mechanism can be used
@@ -608,6 +541,34 @@
   return VPX_CODEC_OK;
 }
 
+static void accumulate_frame_size_for_each_layer(SvcInternal *const si,
+                                                 const uint8_t *const buf,
+                                                 const size_t size) {
+  uint8_t marker = buf[size - 1];
+  if ((marker & 0xe0) == 0xc0) {
+    const uint32_t frames = (marker & 0x7) + 1;
+    const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+    const size_t index_sz = 2 + mag * frames;
+
+    uint8_t marker2 = buf[size - index_sz];
+
+    if (size >= index_sz && marker2 == marker) {
+      // found a valid superframe index
+      uint32_t i, j;
+      const uint8_t *x = &buf[size - index_sz + 1];
+
+      // frames has a maximum of 8 and mag has a maximum of 4.
+      for (i = 0; i < frames; i++) {
+        uint32_t this_sz = 0;
+
+        for (j = 0; j < mag; j++)
+          this_sz |= (*x++) << (j * 8);
+        si->bytes_sum[i] += this_sz;
+      }
+    }
+  }
+}
+
 // SVC Algorithm flags - these get mapped to VP8_EFLAG_* defined in vp8cx.h
 
 // encoder should reference the last frame
@@ -846,15 +807,12 @@
   vpx_codec_err_t res;
   vpx_codec_iter_t iter;
   const vpx_codec_cx_pkt_t *cx_pkt;
-  struct LayerData *cx_layer_list = NULL;
-  struct LayerData *layer_data;
-  struct Superframe superframe;
+  int layer_for_psnr = 0;
   SvcInternal *const si = get_svc_internal(svc_ctx);
   if (svc_ctx == NULL || codec_ctx == NULL || si == NULL) {
     return VPX_CODEC_INVALID_PARAM;
   }
 
-  memset(&superframe, 0, sizeof(superframe));
   svc_log_reset(svc_ctx);
   si->rc_stats_buf_used = 0;
 
@@ -863,7 +821,6 @@
     si->frame_within_gop = 0;
   }
   si->is_keyframe = (si->frame_within_gop == 0);
-  si->frame_size = 0;
 
   if (rawimg != NULL) {
     svc_log(svc_ctx, SVC_LOG_DEBUG,
@@ -872,124 +829,90 @@
             si->frame_within_gop);
   }
 
-  // encode each layer
-  for (si->layer = 0; si->layer < si->layers; ++si->layer) {
-    if (svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP &&
-        si->is_keyframe && (si->layer == 1 || si->layer == 3)) {
-      svc_log(svc_ctx, SVC_LOG_DEBUG, "Skip encoding layer %d\n", si->layer);
-      continue;
-    }
-
-    if (rawimg != NULL) {
+  if (rawimg != NULL) {
+    // encode each layer
+    for (si->layer = 0; si->layer < si->layers; ++si->layer) {
+      if (svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP &&
+          si->is_keyframe && (si->layer == 1 || si->layer == 3)) {
+        svc_log(svc_ctx, SVC_LOG_DEBUG, "Skip encoding layer %d\n", si->layer);
+        continue;
+      }
       calculate_enc_frame_flags(svc_ctx);
       set_svc_parameters(svc_ctx, codec_ctx);
     }
+  }
 
-    res = vpx_codec_encode(codec_ctx, rawimg, pts, (uint32_t)duration,
-                           si->enc_frame_flags, deadline);
-    if (res != VPX_CODEC_OK) {
-      return res;
-    }
-    // save compressed data
-    iter = NULL;
-    while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) {
-      switch (cx_pkt->kind) {
-        case VPX_CODEC_CX_FRAME_PKT: {
-          const uint32_t frame_pkt_size = (uint32_t)(cx_pkt->data.frame.sz);
-          si->bytes_sum[si->layer] += frame_pkt_size;
-          svc_log(svc_ctx, SVC_LOG_DEBUG,
-                  "SVC frame: %d, layer: %d, size: %u\n",
-                  si->encode_frame_count, si->layer, frame_pkt_size);
-          layer_data =
-              ld_create(cx_pkt->data.frame.buf, (size_t)frame_pkt_size);
-          if (layer_data == NULL) {
-            svc_log(svc_ctx, SVC_LOG_ERROR, "Error allocating LayerData\n");
-            return VPX_CODEC_OK;
+  res = vpx_codec_encode(codec_ctx, rawimg, pts, (uint32_t)duration, 0,
+                         deadline);
+  if (res != VPX_CODEC_OK) {
+    return res;
+  }
+  // save compressed data
+  iter = NULL;
+  while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) {
+    switch (cx_pkt->kind) {
+      case VPX_CODEC_CX_FRAME_PKT: {
+        fd_list_add(&si->frame_list, fd_create(cx_pkt->data.frame.buf,
+                                               cx_pkt->data.frame.sz,
+                                               cx_pkt->data.frame.flags));
+        accumulate_frame_size_for_each_layer(si, cx_pkt->data.frame.buf,
+                                             cx_pkt->data.frame.sz);
+
+        svc_log(svc_ctx, SVC_LOG_DEBUG, "SVC frame: %d, kf: %d, size: %d, "
+                "pts: %d\n", si->frame_received,
+                (cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? 1 : 0,
+                (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
+
+        ++si->frame_received;
+        layer_for_psnr = 0;
+        break;
+      }
+      case VPX_CODEC_PSNR_PKT: {
+        int i;
+        svc_log(svc_ctx, SVC_LOG_DEBUG,
+                "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): "
+                "%2.3f  %2.3f  %2.3f  %2.3f \n",
+                si->frame_received, layer_for_psnr,
+                cx_pkt->data.psnr.psnr[0], cx_pkt->data.psnr.psnr[1],
+                cx_pkt->data.psnr.psnr[2], cx_pkt->data.psnr.psnr[3]);
+        svc_log(svc_ctx, SVC_LOG_DEBUG,
+                "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): "
+                "%2.3f  %2.3f  %2.3f  %2.3f \n",
+                si->frame_received, layer_for_psnr,
+                cx_pkt->data.psnr.sse[0], cx_pkt->data.psnr.sse[1],
+                cx_pkt->data.psnr.sse[2], cx_pkt->data.psnr.sse[3]);
+        for (i = 0; i < COMPONENTS; i++) {
+          si->psnr_sum[layer_for_psnr][i] += cx_pkt->data.psnr.psnr[i];
+          si->sse_sum[layer_for_psnr][i] += cx_pkt->data.psnr.sse[i];
+        }
+        ++layer_for_psnr;
+        break;
+      }
+      case VPX_CODEC_STATS_PKT: {
+        size_t new_size = si->rc_stats_buf_used +
+            cx_pkt->data.twopass_stats.sz;
+
+        if (new_size > si->rc_stats_buf_size) {
+          char *p = (char*)realloc(si->rc_stats_buf, new_size);
+          if (p == NULL) {
+            svc_log(svc_ctx, SVC_LOG_ERROR, "Error allocating stats buf\n");
+            return VPX_CODEC_MEM_ERROR;
           }
-          ld_list_add(&cx_layer_list, layer_data);
+          si->rc_stats_buf = p;
+          si->rc_stats_buf_size = new_size;
+        }
 
-          // save layer size in superframe index
-          superframe.sizes[superframe.count++] = frame_pkt_size;
-          superframe.magnitude |= frame_pkt_size;
-          break;
-        }
-        case VPX_CODEC_PSNR_PKT: {
-          int i;
-          svc_log(svc_ctx, SVC_LOG_DEBUG,
-                  "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): "
-                  "%2.3f  %2.3f  %2.3f  %2.3f \n",
-                  si->encode_frame_count, si->layer,
-                  cx_pkt->data.psnr.psnr[0], cx_pkt->data.psnr.psnr[1],
-                  cx_pkt->data.psnr.psnr[2], cx_pkt->data.psnr.psnr[3]);
-          svc_log(svc_ctx, SVC_LOG_DEBUG,
-                  "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): "
-                  "%2.3f  %2.3f  %2.3f  %2.3f \n",
-                  si->encode_frame_count, si->layer,
-                  cx_pkt->data.psnr.sse[0], cx_pkt->data.psnr.sse[1],
-                  cx_pkt->data.psnr.sse[2], cx_pkt->data.psnr.sse[3]);
-          for (i = 0; i < COMPONENTS; i++) {
-            si->psnr_sum[si->layer][i] += cx_pkt->data.psnr.psnr[i];
-            si->sse_sum[si->layer][i] += cx_pkt->data.psnr.sse[i];
-          }
-          break;
-        }
-        case VPX_CODEC_STATS_PKT: {
-          size_t new_size = si->rc_stats_buf_used +
-              cx_pkt->data.twopass_stats.sz;
-
-          if (new_size > si->rc_stats_buf_size) {
-            char *p = (char*)realloc(si->rc_stats_buf, new_size);
-            if (p == NULL) {
-              svc_log(svc_ctx, SVC_LOG_ERROR, "Error allocating stats buf\n");
-              break;
-            }
-            si->rc_stats_buf = p;
-            si->rc_stats_buf_size = new_size;
-          }
-
-          memcpy(si->rc_stats_buf + si->rc_stats_buf_used,
-                 cx_pkt->data.twopass_stats.buf, cx_pkt->data.twopass_stats.sz);
-          si->rc_stats_buf_used += cx_pkt->data.twopass_stats.sz;
-          break;
-        }
-        default: {
-          break;
-        }
+        memcpy(si->rc_stats_buf + si->rc_stats_buf_used,
+               cx_pkt->data.twopass_stats.buf, cx_pkt->data.twopass_stats.sz);
+        si->rc_stats_buf_used += cx_pkt->data.twopass_stats.sz;
+        break;
+      }
+      default: {
+        break;
       }
     }
-    if (rawimg == NULL) {
-      break;
-    }
   }
-  if (codec_ctx->config.enc->g_pass != VPX_RC_FIRST_PASS) {
-    // add superframe index to layer data list
-    sf_create_index(&superframe);
-    layer_data = ld_create(superframe.buffer, superframe.index_size);
-    ld_list_add(&cx_layer_list, layer_data);
 
-    // get accumulated size of layer data
-    si->frame_size = ld_list_get_buffer_size(cx_layer_list);
-    if (si->frame_size > 0) {
-      // all layers encoded, create single buffer with concatenated layers
-      if (si->frame_size > si->buffer_size) {
-        free(si->buffer);
-        si->buffer = malloc(si->frame_size);
-        if (si->buffer == NULL) {
-          ld_list_free(cx_layer_list);
-          return VPX_CODEC_MEM_ERROR;
-        }
-        si->buffer_size = si->frame_size;
-      }
-      // copy layer data into packet
-      ld_list_copy_to_buffer(cx_layer_list, (uint8_t *)si->buffer);
-
-      ld_list_free(cx_layer_list);
-
-      svc_log(svc_ctx, SVC_LOG_DEBUG, "SVC frame: %d, kf: %d, size: %d, "
-              "pts: %d\n", si->encode_frame_count, si->is_keyframe,
-              (int)si->frame_size, (int)pts);
-    }
-  }
   if (rawimg != NULL) {
     ++si->frame_within_gop;
     ++si->encode_frame_count;
@@ -1004,16 +927,27 @@
   return si->message_buffer;
 }
 
-void *vpx_svc_get_buffer(const SvcContext *svc_ctx) {
-  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL) return NULL;
-  return si->buffer;
+// We will maintain a list of output frame buffers since with lag_in_frame
+// we need to output all frame buffers at the end. vpx_svc_get_buffer() will
+// remove a frame buffer from the list the put it to a temporal pointer, which
+// will be removed at the next vpx_svc_get_buffer() or when closing encoder.
+void *vpx_svc_get_buffer(SvcContext *svc_ctx) {
+  SvcInternal *const si = get_svc_internal(svc_ctx);
+  if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return NULL;
+
+  if (si->frame_temp)
+    fd_free(si->frame_temp);
+
+  si->frame_temp = si->frame_list;
+  si->frame_list = si->frame_list->next;
+
+  return si->frame_temp->buf;
 }
 
 size_t vpx_svc_get_frame_size(const SvcContext *svc_ctx) {
   const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL) return 0;
-  return si->frame_size;
+  if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return 0;
+  return si->frame_list->size;
 }
 
 int vpx_svc_get_encode_frame_count(const SvcContext *svc_ctx) {
@@ -1024,8 +958,8 @@
 
 int vpx_svc_is_keyframe(const SvcContext *svc_ctx) {
   const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL) return 0;
-  return si->is_keyframe;
+  if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return 0;
+  return (si->frame_list->flags & VPX_FRAME_IS_KEY) != 0;
 }
 
 void vpx_svc_set_keyframe(SvcContext *svc_ctx) {
@@ -1112,7 +1046,8 @@
   // SvcInternal if it was not already allocated
   si = (SvcInternal *)svc_ctx->internal;
   if (si != NULL) {
-    free(si->buffer);
+    fd_free(si->frame_temp);
+    fd_free_list(si->frame_list);
     if (si->rc_stats_buf) {
       free(si->rc_stats_buf);
     }

diff --git a/vpx/svc_context.h b/vpx/svc_context.h
index 5d0fbbd..058ee20 100644
--- a/vpx/svc_context.h
+++ b/vpx/svc_context.h

@@ -104,14 +104,16 @@
 const char *vpx_svc_get_message(const SvcContext *svc_ctx);
 
 /**
- * return size of encoded data to be returned by vpx_svc_get_buffer
+ * return size of encoded data to be returned by vpx_svc_get_buffer.
+ * it needs to be called before vpx_svc_get_buffer.
  */
 size_t vpx_svc_get_frame_size(const SvcContext *svc_ctx);
 
 /**
- * return buffer with encoded data
+ * return buffer with encoded data. encoder will maintain a list of frame
+ * buffers. each call of vpx_svc_get_buffer() will return one frame.
  */
-void *vpx_svc_get_buffer(const SvcContext *svc_ctx);
+void *vpx_svc_get_buffer(SvcContext *svc_ctx);
 
 /**
  * return size of two pass rate control stats data to be returned by