Merge "[spatial svc] Remove encoding modes since we only need one mode at this time."
diff --git a/README b/README
index f209105..f9c24ff 100644
--- a/README
+++ b/README
@@ -55,6 +55,7 @@
     armv6-linux-rvct
     armv6-linux-gcc
     armv6-none-rvct
+    arm64-darwin-gcc
     armv7-android-gcc
     armv7-darwin-gcc
     armv7-linux-rvct
diff --git a/build/make/configure.sh b/build/make/configure.sh
index d4124c7..0fe8ead 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -799,7 +799,7 @@
     arm*)
         # on arm, isa versions are supersets
         case ${tgt_isa} in
-        armv8)
+        arm64|armv8)
             soft_enable neon
             ;;
         armv7|armv7s)
diff --git a/configure b/configure
index 9a7de73..0dca4c3 100755
--- a/configure
+++ b/configure
@@ -96,6 +96,7 @@
 all_platforms="${all_platforms} armv6-linux-rvct"
 all_platforms="${all_platforms} armv6-linux-gcc"
 all_platforms="${all_platforms} armv6-none-rvct"
+all_platforms="${all_platforms} arm64-darwin-gcc"
 all_platforms="${all_platforms} armv7-android-gcc"   #neon Cortex-A8
 all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8
diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c
index a4cbd45..223f37e 100644
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@@ -48,9 +48,6 @@
     ARG_DEF("q", "quantizers", 1, "quantizers for non key frames, also will "
             "be applied to key frames if -qn is not specified (lowest to "
             "highest layer)");
-static const arg_def_t quantizers_keyframe_arg =
-    ARG_DEF("qn", "quantizers-keyframe", 1, "quantizers for key frames (lowest "
-        "to highest layer)");
 static const arg_def_t passes_arg =
     ARG_DEF("p", "passes", 1, "Number of passes (1/2)");
 static const arg_def_t pass_arg =
@@ -69,10 +66,9 @@
 static const arg_def_t *svc_args[] = {
   &frames_arg,        &width_arg,         &height_arg,
   &timebase_arg,      &bitrate_arg,       &skip_frames_arg, &layers_arg,
-  &kf_dist_arg,       &scale_factors_arg, &quantizers_arg,
-  &quantizers_keyframe_arg,               &passes_arg,      &pass_arg,
-  &fpf_name_arg,      &min_q_arg,         &max_q_arg,       &min_bitrate_arg,
-  &max_bitrate_arg,   NULL
+  &kf_dist_arg,       &scale_factors_arg, &quantizers_arg,  &passes_arg,
+  &pass_arg,          &fpf_name_arg,      &min_q_arg,       &max_q_arg,
+  &min_bitrate_arg,   &max_bitrate_arg,   NULL
 };
 
 static const uint32_t default_frames_to_skip = 0;
@@ -168,9 +164,7 @@
     } else if (arg_match(&arg, &scale_factors_arg, argi)) {
       vpx_svc_set_scale_factors(svc_ctx, arg.val);
     } else if (arg_match(&arg, &quantizers_arg, argi)) {
-      vpx_svc_set_quantizers(svc_ctx, arg.val, 0);
-    } else if (arg_match(&arg, &quantizers_keyframe_arg, argi)) {
-      vpx_svc_set_quantizers(svc_ctx, arg.val, 1);
+      vpx_svc_set_quantizers(svc_ctx, arg.val);
     } else if (arg_match(&arg, &passes_arg, argi)) {
       passes = arg_parse_uint(&arg);
       if (passes < 1 || passes > 2) {
diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc
index be651b4..5d2bfff 100644
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -17,27 +17,45 @@
 
 namespace {
 
+const int kMaxPSNR = 100;
+
 class CpuSpeedTest : public ::libvpx_test::EncoderTest,
     public ::libvpx_test::CodecTestWith2Params<
         libvpx_test::TestMode, int> {
  protected:
-  CpuSpeedTest() : EncoderTest(GET_PARAM(0)) {}
+  CpuSpeedTest()
+      : EncoderTest(GET_PARAM(0)),
+        encoding_mode_(GET_PARAM(1)),
+        set_cpu_used_(GET_PARAM(2)),
+        min_psnr_(kMaxPSNR) {}
   virtual ~CpuSpeedTest() {}
 
   virtual void SetUp() {
     InitializeConfig();
-    SetMode(GET_PARAM(1));
-    set_cpu_used_ = GET_PARAM(2);
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libvpx_test::kRealTime) {
+      cfg_.g_lag_in_frames = 25;
+      cfg_.rc_end_usage = VPX_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = VPX_CBR;
+    }
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    min_psnr_ = kMaxPSNR;
   }
 
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
     if (video->frame() == 1) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
-      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
-      encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
-      encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
-      encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      if (encoding_mode_ != ::libvpx_test::kRealTime) {
+        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+        encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      }
     }
   }
 
@@ -45,7 +63,15 @@
     if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
     }
   }
+
+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->data.psnr.psnr[0] < min_psnr_)
+      min_psnr_ = pkt->data.psnr.psnr[0];
+  }
+
+  ::libvpx_test::TestMode encoding_mode_;
   int set_cpu_used_;
+  double min_psnr_;
 };
 
 TEST_P(CpuSpeedTest, TestQ0) {
@@ -53,7 +79,6 @@
   // without a mismatch when passing in a very low max q.  This pushes
   // the encoder to producing lots of big partitions which will likely
   // extend into the border and test the border condition.
-  cfg_.g_lag_in_frames = 25;
   cfg_.rc_2pass_vbr_minsection_pct = 5;
   cfg_.rc_2pass_vbr_minsection_pct = 2000;
   cfg_.rc_target_bitrate = 400;
@@ -63,7 +88,10 @@
   ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
                                        20);
 
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_GE(min_psnr_, kMaxPSNR);
 }
 
 
@@ -72,7 +100,6 @@
   // without a mismatch when passing in a very low max q.  This pushes
   // the encoder to producing lots of big partitions which will likely
   // extend into the border and test the border condition.
-  cfg_.g_lag_in_frames = 25;
   cfg_.rc_2pass_vbr_minsection_pct = 5;
   cfg_.rc_2pass_vbr_minsection_pct = 2000;
   cfg_.rc_target_bitrate = 12000;
@@ -89,7 +116,6 @@
   // when passing in a very high min q.  This pushes the encoder to producing
   // lots of small partitions which might will test the other condition.
 
-  cfg_.g_lag_in_frames = 25;
   cfg_.rc_2pass_vbr_minsection_pct = 5;
   cfg_.rc_2pass_vbr_minsection_pct = 2000;
   cfg_.rc_target_bitrate = 200;
@@ -108,6 +134,7 @@
 
 VP9_INSTANTIATE_TEST_CASE(
     CpuSpeedTest,
-    ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood),
+    ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
+                      ::libvpx_test::kRealTime),
     ::testing::Range(0, 8));
 }  // namespace
diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index 655b090..8bea4cc 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@@ -19,7 +19,8 @@
 
 vpx_codec_err_t Decoder::PeekStream(const uint8_t *cxdata, size_t size,
                                     vpx_codec_stream_info_t *stream_info) {
-  return vpx_codec_peek_stream_info(CodecInterface(), cxdata, size,
+  return vpx_codec_peek_stream_info(CodecInterface(),
+                                    cxdata, static_cast<unsigned int>(size),
                                     stream_info);
 }
 
@@ -46,7 +47,8 @@
   const bool is_vp8 = strncmp(kVP8Name, codec_name, sizeof(kVP8Name) - 1) == 0;
 
   // Decode frames.
-  for (video->Begin(); video->cxdata(); video->Next()) {
+  for (video->Begin(); !::testing::Test::HasFailure() && video->cxdata();
+       video->Next()) {
     PreDecodeFrameHook(*video, decoder);
 
     vpx_codec_stream_info_t stream_info;
diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index 4933658..d3f370e 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -97,6 +97,7 @@
   "invalid-vp90-01.webm",
   "invalid-vp90-02.webm",
   "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf",
+  "invalid-vp90-03.webm",
 };
 
 #define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
diff --git a/test/svc_test.cc b/test/svc_test.cc
index 963ed67..417790b 100644
--- a/test/svc_test.cc
+++ b/test/svc_test.cc
@@ -167,48 +167,20 @@
   codec_initialized_ = true;
 }
 
-TEST_F(SvcTest, SetKeyFrameQuantizersOption) {
-  svc_.spatial_layers = 2;
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_,
-                                       "quantizers-keyframe=not-quantizers");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  vpx_svc_set_options(&svc_, "quantizers-keyframe=40,45");
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-}
-
 TEST_F(SvcTest, SetQuantizers) {
-  vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,30", 0);
+  vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,30");
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
-  res = vpx_svc_set_quantizers(&svc_, NULL, 0);
+  res = vpx_svc_set_quantizers(&svc_, NULL);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
   svc_.spatial_layers = 2;
-  res = vpx_svc_set_quantizers(&svc_, "40", 0);
+  res = vpx_svc_set_quantizers(&svc_, "40");
   EXPECT_EQ(VPX_CODEC_OK, res);
   res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
-  res = vpx_svc_set_quantizers(&svc_, "40,30", 0);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-}
-
-TEST_F(SvcTest, SetKeyFrameQuantizers) {
-  vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,31", 1);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_quantizers(&svc_, NULL, 1);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_quantizers(&svc_, "40,30", 1);
+  res = vpx_svc_set_quantizers(&svc_, "40,30");
   EXPECT_EQ(VPX_CODEC_OK, res);
   res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_OK, res);
@@ -239,7 +211,7 @@
 TEST_F(SvcTest, FirstFrameHasLayers) {
   svc_.spatial_layers = 2;
   vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30", 0);
+  vpx_svc_set_quantizers(&svc_, "40,30");
 
   vpx_codec_err_t res =
       vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
@@ -274,7 +246,7 @@
 TEST_F(SvcTest, EncodeThreeFrames) {
   svc_.spatial_layers = 2;
   vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30", 0);
+  vpx_svc_set_quantizers(&svc_, "40,30");
   int decoded_frames = 0;
   vpx_codec_err_t res_dec;
   int frame_size;
@@ -350,7 +322,7 @@
 TEST_F(SvcTest, GetLayerResolution) {
   svc_.spatial_layers = 2;
   vpx_svc_set_scale_factors(&svc_, "4/16,8/16");
-  vpx_svc_set_quantizers(&svc_, "40,30", 0);
+  vpx_svc_set_quantizers(&svc_, "40,30");
 
   vpx_codec_err_t res =
       vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
@@ -389,7 +361,7 @@
   svc_.spatial_layers = 2;
   codec_enc_.g_pass = VPX_RC_FIRST_PASS;
   vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30", 0);
+  vpx_svc_set_quantizers(&svc_, "40,30");
 
   vpx_codec_err_t res =
       vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index bc6f77e..1c960c0 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -4,6 +4,8 @@
 25751f5d3b05ff03f0719ad42cd625348eb8961e  invalid-vp90-01.webm.res
 d78e2fceba5ac942246503ec8366f879c4775ca5  invalid-vp90-02.webm
 2dadee5306245fa5eeb0f99652d0e17afbcba96d  invalid-vp90-02.webm.res
+df1a1453feb3c00d7d89746c7003b4163523bff3  invalid-vp90-03.webm
+8fe6fd82bf537340f586f97a7ae31fb37ccda302  invalid-vp90-03.webm.res
 b1f1c3ec79114b9a0651af24ce634afb44a9a419  rush_hour_444.y4m
 5184c46ddca8b1fadd16742e8500115bc8f749da  vp80-00-comprehensive-001.ivf
 65bf1bbbced81b97bd030f376d1b7f61a224793f  vp80-00-comprehensive-002.ivf
diff --git a/test/test.mk b/test/test.mk
index af344e5..a76863f 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -758,14 +758,14 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-fuzz-flicker.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-fuzz-flicker.webm.md5
 
 # Invalid files for testing libvpx error checking.
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01.webm.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03.webm.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.ivf.res
 
diff --git a/test/user_priv_test.cc b/test/user_priv_test.cc
index 38eef1c..f9aef33 100644
--- a/test/user_priv_test.cc
+++ b/test/user_priv_test.cc
@@ -13,6 +13,7 @@
 #include <string>
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "./vpx_config.h"
+#include "test/acm_random.h"
 #include "test/codec_factory.h"
 #include "test/decode_test_driver.h"
 #include "test/ivf_video_source.h"
@@ -22,17 +23,27 @@
 #include "test/webm_video_source.h"
 #endif
 #include "vpx_mem/vpx_mem.h"
+#include "vpx/vp8.h"
 
 namespace {
 
 using std::string;
+using libvpx_test::ACMRandom;
 
 #if CONFIG_WEBM_IO
+
+void CheckUserPrivateData(void *user_priv, int *target) {
+  // actual pointer value should be the same as expected.
+  EXPECT_EQ(reinterpret_cast<void *>(target), user_priv) <<
+      "user_priv pointer value does not match.";
+}
+
 // Decodes |filename|. Passes in user_priv data when calling DecodeFrame and
 // compares the user_priv from return img with the original user_priv to see if
 // they match. Both the pointer values and the values inside the addresses
 // should match.
 string DecodeFile(const string &filename) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
   libvpx_test::WebMVideoSource video(filename);
   video.Init();
 
@@ -41,7 +52,8 @@
 
   libvpx_test::MD5 md5;
   int frame_num = 0;
-  for (video.Begin(); video.cxdata(); video.Next()) {
+  for (video.Begin(); !::testing::Test::HasFailure() && video.cxdata();
+       video.Next()) {
     void *user_priv = reinterpret_cast<void *>(&frame_num);
     const vpx_codec_err_t res =
         decoder.DecodeFrame(video.cxdata(), video.frame_size(),
@@ -56,16 +68,17 @@
     // Get decompressed data.
     while ((img = dec_iter.Next())) {
       if (frame_num == 0) {
-        // user_priv pointer value should be the same.
-        EXPECT_EQ(img->user_priv, reinterpret_cast<void *>(NULL)) <<
-            "user_priv pointer value does not match.";
+        CheckUserPrivateData(img->user_priv, NULL);
       } else {
-        // user_priv pointer value should be the same.
-        EXPECT_EQ(img->user_priv, reinterpret_cast<void *>(&frame_num)) <<
-            "user_priv pointer value does not match.";
-        // value in user_priv pointer should also be the same.
-        EXPECT_EQ(*reinterpret_cast<int *>(img->user_priv), frame_num) <<
-            "Value in user_priv does not match.";
+        CheckUserPrivateData(img->user_priv, &frame_num);
+
+        // Also test ctrl_get_reference api.
+        struct vp9_ref_frame ref;
+        // Randomly fetch a reference frame.
+        ref.idx = rnd.Rand8() % 3;
+        decoder.Control(VP9_GET_REFERENCE, &ref);
+
+        CheckUserPrivateData(ref.img.user_priv, &frame_num);
       }
       md5.Add(img);
     }
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 8e546d5..e50d393 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -393,12 +393,12 @@
                   int                         low_var_thresh,
                   int                         flag)
 {
+    int mbr;
     double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
     int ppl = (int)(level + .5);
-    int mb_rows = source->y_width >> 4;
-    int mb_cols = source->y_height >> 4;
+    int mb_rows = cm->mb_rows;
+    int mb_cols = cm->mb_cols;
     unsigned char *limits = cm->pp_limits_buffer;;
-    int mbr, mbc;
     (void) post;
     (void) low_var_thresh;
     (void) flag;
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 3e40774..b013fe5 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -552,6 +552,9 @@
 if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") {
     add_proto qw/int vp8_denoiser_filter/, "unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
     specialize qw/vp8_denoiser_filter sse2 neon/;
+    add_proto qw/int vp8_denoiser_filter_uv/, "unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
+    specialize qw/vp8_denoiser_filter_uv sse2/;
+
 }
 
 # End of encoder only functions
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index 0f2e5f1..9ad4113 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -191,6 +191,148 @@
     return FILTER_BLOCK;
 }
 
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
+                             int mc_avg_uv_stride,
+                             unsigned char *running_avg_uv,
+                             int avg_uv_stride,
+                             unsigned char *sig,
+                             int sig_stride,
+                             unsigned int motion_magnitude,
+                             int increase_denoising) {
+    unsigned char *running_avg_uv_start = running_avg_uv;
+    unsigned char *sig_start = sig;
+    int sum_diff_thresh;
+    int r, c;
+    int sum_diff = 0;
+    int sum_block = 0;
+    int adj_val[3] = {3, 4, 6};
+    int shift_inc1 = 0;
+    int shift_inc2 = 1;
+    /* If motion_magnitude is small, making the denoiser more aggressive by
+     * increasing the adjustment for each level. Add another increment for
+     * blocks that are labeled for increase denoising. */
+    if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) {
+      if (increase_denoising) {
+        shift_inc1 = 1;
+        shift_inc2 = 2;
+      }
+      adj_val[0] += shift_inc2;
+      adj_val[1] += shift_inc2;
+      adj_val[2] += shift_inc2;
+    }
+
+    // Avoid denoising the color signal if it is close to the average level.
+    for (r = 0; r < 8; ++r) {
+      for (c = 0; c < 8; ++c) {
+        sum_block += sig[c];
+      }
+      sig += sig_stride;
+    }
+    if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
+      return COPY_BLOCK;
+    }
+
+    sig -= sig_stride * 8;
+    for (r = 0; r < 8; ++r) {
+      for (c = 0; c < 8; ++c) {
+        int diff = 0;
+        int adjustment = 0;
+        int absdiff = 0;
+
+        diff = mc_running_avg_uv[c] - sig[c];
+        absdiff = abs(diff);
+
+        // When |diff| <= |3 + shift_inc1|, use pixel value from
+        // last denoised raw.
+        if (absdiff <= 3 + shift_inc1) {
+          running_avg_uv[c] = mc_running_avg_uv[c];
+          sum_diff += diff;
+        } else {
+          if (absdiff >= 4 && absdiff <= 7)
+            adjustment = adj_val[0];
+          else if (absdiff >= 8 && absdiff <= 15)
+            adjustment = adj_val[1];
+          else
+            adjustment = adj_val[2];
+          if (diff > 0) {
+            if ((sig[c] + adjustment) > 255)
+              running_avg_uv[c] = 255;
+            else
+              running_avg_uv[c] = sig[c] + adjustment;
+            sum_diff += adjustment;
+          } else {
+            if ((sig[c] - adjustment) < 0)
+              running_avg_uv[c] = 0;
+            else
+              running_avg_uv[c] = sig[c] - adjustment;
+            sum_diff -= adjustment;
+          }
+        }
+      }
+      /* Update pointers for next iteration. */
+      sig += sig_stride;
+      mc_running_avg_uv += mc_avg_uv_stride;
+      running_avg_uv += avg_uv_stride;
+    }
+
+    sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
+    if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
+    if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising), check
+      // if we can still apply some (weaker) temporal filtering to this block,
+      // that would otherwise not be denoised at all. Simplest is to apply
+      // an additional adjustment to running_avg_uv to bring it closer to sig.
+      // The adjustment is capped by a maximum delta, and chosen such that
+      // in most cases the resulting sum_diff will be within the
+      // acceptable range given by sum_diff_thresh.
+
+      // The delta is set by the excess of absolute pixel diff over threshold.
+      int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        sig -= sig_stride * 8;
+        mc_running_avg_uv -= mc_avg_uv_stride * 8;
+        running_avg_uv -= avg_uv_stride * 8;
+        for (r = 0; r < 8; ++r) {
+          for (c = 0; c < 8; ++c) {
+            int diff = mc_running_avg_uv[c] - sig[c];
+            int adjustment = abs(diff);
+            if (adjustment > delta)
+              adjustment = delta;
+            if (diff > 0) {
+              // Bring denoised signal down.
+              if (running_avg_uv[c] - adjustment < 0)
+                running_avg_uv[c] = 0;
+              else
+                running_avg_uv[c] = running_avg_uv[c] - adjustment;
+              sum_diff -= adjustment;
+            } else if (diff < 0) {
+              // Bring denoised signal up.
+              if (running_avg_uv[c] + adjustment > 255)
+                running_avg_uv[c] = 255;
+              else
+                running_avg_uv[c] = running_avg_uv[c] + adjustment;
+              sum_diff += adjustment;
+            }
+          }
+          // TODO(marpan): Check here if abs(sum_diff) has gone below the
+          // threshold sum_diff_thresh, and if so, we can exit the row loop.
+          sig += sig_stride;
+          mc_running_avg_uv += mc_avg_uv_stride;
+          running_avg_uv += avg_uv_stride;
+        }
+        if (abs(sum_diff) > sum_diff_thresh)
+          return COPY_BLOCK;
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+
+    vp8_copy_mem8x8(running_avg_uv_start, avg_uv_stride, sig_start,
+                    sig_stride);
+    return FILTER_BLOCK;
+}
+
 int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
                           int num_mb_rows, int num_mb_cols)
 {
@@ -260,6 +402,8 @@
     unsigned int motion_magnitude2;
     unsigned int sse_thresh;
     int sse_diff_thresh = 0;
+    // Denoise the UV channel.
+    int apply_color_denoise = 0;
     // Spatial loop filter: only applied selectively based on
     // temporal filter state of block relative to top/left neighbors.
     int apply_spatial_loop_filter = 1;
@@ -267,6 +411,8 @@
     MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame;
 
     enum vp8_denoiser_decision decision = FILTER_BLOCK;
+    enum vp8_denoiser_decision decision_u = FILTER_BLOCK;
+    enum vp8_denoiser_decision decision_v = FILTER_BLOCK;
 
     if (zero_frame)
     {
@@ -376,11 +522,37 @@
 
         /* Filter. */
         decision = vp8_denoiser_filter(mc_running_avg_y, mc_avg_y_stride,
-                                         running_avg_y, avg_y_stride,
-                                         x->thismb, 16, motion_magnitude2,
-                                         x->increase_denoising);
+                                       running_avg_y, avg_y_stride,
+                                       x->thismb, 16, motion_magnitude2,
+                                       x->increase_denoising);
         denoiser->denoise_state[block_index] = motion_magnitude2 > 0 ?
             kFilterNonZeroMV : kFilterZeroMV;
+        // Only denoise UV for zero motion, and if y channel was denoised.
+        if (apply_color_denoise &&
+            motion_magnitude2 == 0 &&
+            decision == FILTER_BLOCK) {
+          unsigned char *mc_running_avg_u =
+              denoiser->yv12_mc_running_avg.u_buffer + recon_uvoffset;
+          unsigned char *running_avg_u =
+              denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset;
+          unsigned char *mc_running_avg_v =
+              denoiser->yv12_mc_running_avg.v_buffer + recon_uvoffset;
+          unsigned char *running_avg_v =
+              denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset;
+          int mc_avg_uv_stride = denoiser->yv12_mc_running_avg.uv_stride;
+          int avg_uv_stride = denoiser->yv12_running_avg[INTRA_FRAME].uv_stride;
+          int signal_stride = x->block[16].src_stride;
+          decision_u =
+              vp8_denoiser_filter_uv(mc_running_avg_u, mc_avg_uv_stride,
+                                      running_avg_u, avg_uv_stride,
+                                      x->block[16].src + *x->block[16].base_src,
+                                      signal_stride, motion_magnitude2, 0);
+          decision_v =
+              vp8_denoiser_filter_uv(mc_running_avg_v, mc_avg_uv_stride,
+                                      running_avg_v, avg_uv_stride,
+                                      x->block[20].src + *x->block[20].base_src,
+                                      signal_stride, motion_magnitude2, 0);
+        }
     }
     if (decision == COPY_BLOCK)
     {
@@ -393,7 +565,21 @@
                 denoiser->yv12_running_avg[INTRA_FRAME].y_stride);
         denoiser->denoise_state[block_index] = kNoFilter;
     }
-    // Option to selectively deblock the denoised signal.
+    if (apply_color_denoise) {
+      if (decision_u == COPY_BLOCK) {
+        vp8_copy_mem8x8(
+            x->block[16].src + *x->block[16].base_src, x->block[16].src_stride,
+            denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset,
+            denoiser->yv12_running_avg[INTRA_FRAME].uv_stride);
+      }
+      if (decision_v == COPY_BLOCK) {
+        vp8_copy_mem8x8(
+            x->block[20].src + *x->block[20].base_src, x->block[16].src_stride,
+            denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset,
+            denoiser->yv12_running_avg[INTRA_FRAME].uv_stride);
+      }
+    }
+    // Option to selectively deblock the denoised signal, for y channel only.
     if (apply_spatial_loop_filter) {
       loop_filter_info lfi;
       int apply_filter_col = 0;
diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h
index 6db0785..8f1bfa5 100644
--- a/vp8/encoder/denoising.h
+++ b/vp8/encoder/denoising.h
@@ -22,6 +22,11 @@
 #define SUM_DIFF_THRESHOLD_HIGH (16 * 16 * 3)
 #define MOTION_MAGNITUDE_THRESHOLD (8*3)
 
+#define SUM_DIFF_THRESHOLD_UV (96)   // (8 * 8 * 1.5)
+#define SUM_DIFF_THRESHOLD_HIGH_UV (8 * 8 * 2)
+#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 4)
+#define MOTION_MAGNITUDE_THRESHOLD_UV (8*3)
+
 enum vp8_denoiser_decision
 {
   COPY_BLOCK,
diff --git a/vp8/encoder/x86/denoising_sse2.c b/vp8/encoder/x86/denoising_sse2.c
index ff439dd..b84795c 100644
--- a/vp8/encoder/x86/denoising_sse2.c
+++ b/vp8/encoder/x86/denoising_sse2.c
@@ -17,10 +17,23 @@
 #include <emmintrin.h>
 #include "vpx_ports/emmintrin_compat.h"
 
-union sum_union {
-    __m128i v;
-    signed char e[16];
-};
+/* Compute the sum of all pixel differences of this MB. */
+static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) {
+  const __m128i k_1 = _mm_set1_epi16(1);
+  const __m128i acc_diff_lo = _mm_srai_epi16(
+      _mm_unpacklo_epi8(acc_diff, acc_diff), 8);
+  const __m128i acc_diff_hi = _mm_srai_epi16(
+      _mm_unpackhi_epi8(acc_diff, acc_diff), 8);
+  const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
+  const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
+  const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba,
+                                          _mm_srli_si128(hg_fe_dc_ba, 8));
+  const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,
+                                         _mm_srli_si128(hgfe_dcba, 4));
+  unsigned int sum_diff = _mm_cvtsi128_si32(hgfedcba);
+
+  return abs(sum_diff);
+}
 
 int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
                              int mc_avg_y_stride,
@@ -31,7 +44,7 @@
 {
     unsigned char *running_avg_y_start = running_avg_y;
     unsigned char *sig_start = sig;
-    int sum_diff_thresh;
+    unsigned int sum_diff_thresh;
     int r;
     int shift_inc  = (increase_denoising &&
         motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
@@ -103,16 +116,10 @@
 
     {
         /* Compute the sum of all pixel differences of this MB. */
-        union sum_union s;
-        int sum_diff = 0;
-        s.v = acc_diff;
-        sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5]
-                 + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]
-                 + s.e[12] + s.e[13] + s.e[14] + s.e[15];
-
+        unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
         sum_diff_thresh = SUM_DIFF_THRESHOLD;
         if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
-        if (abs(sum_diff) > sum_diff_thresh) {
+        if (abs_sum_diff > sum_diff_thresh) {
           // Before returning to copy the block (i.e., apply no denoising),
           // checK if we can still apply some (weaker) temporal filtering to
           // this block, that would otherwise not be denoised at all. Simplest
@@ -123,7 +130,7 @@
 
           // The delta is set by the excess of absolute pixel diff over the
           // threshold.
-          int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
+          int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
           // Only apply the adjustment for max delta up to 3.
           if (delta < 4) {
             const __m128i k_delta = _mm_set1_epi8(delta);
@@ -162,16 +169,9 @@
              mc_running_avg_y += mc_avg_y_stride;
              running_avg_y += avg_y_stride;
             }
-            {
-              // Update the sum of all pixel differences of this MB.
-              union sum_union s;
-              s.v = acc_diff;
-              sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5]
-                       + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]
-                       + s.e[12] + s.e[13] + s.e[14] + s.e[15];
-              if (abs(sum_diff) > sum_diff_thresh) {
-                return COPY_BLOCK;
-              }
+            abs_sum_diff = abs_sum_diff_16x1(acc_diff);
+            if (abs_sum_diff > sum_diff_thresh) {
+              return COPY_BLOCK;
             }
           } else {
             return COPY_BLOCK;
@@ -182,3 +182,198 @@
     vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
     return FILTER_BLOCK;
 }
+
+int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg,
+                             int mc_avg_stride,
+                             unsigned char *running_avg, int avg_stride,
+                             unsigned char *sig, int sig_stride,
+                             unsigned int motion_magnitude,
+                             int increase_denoising) {
+    unsigned char *running_avg_start = running_avg;
+    unsigned char *sig_start = sig;
+    unsigned int sum_diff_thresh;
+    int r;
+    int shift_inc  = (increase_denoising &&
+        motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 1 : 0;
+    __m128i acc_diff = _mm_setzero_si128();
+    const __m128i k_0 = _mm_setzero_si128();
+    const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+    const __m128i k_8 = _mm_set1_epi8(8);
+    const __m128i k_16 = _mm_set1_epi8(16);
+    /* Modify each level's adjustment according to motion_magnitude. */
+    const __m128i l3 = _mm_set1_epi8(
+                       (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ?
+                        7 + shift_inc : 6);
+    /* Difference between level 3 and level 2 is 2. */
+    const __m128i l32 = _mm_set1_epi8(2);
+    /* Difference between level 2 and level 1 is 1. */
+    const __m128i l21 = _mm_set1_epi8(1);
+
+    {
+      const __m128i k_1 = _mm_set1_epi16(1);
+      __m128i vec_sum_block = _mm_setzero_si128();
+
+      // Avoid denoising color signal if it's close to average level.
+      for (r = 0; r < 8; ++r) {
+        const __m128i v_sig = _mm_loadl_epi64((__m128i *)(&sig[0]));
+        const __m128i v_sig_unpack = _mm_unpacklo_epi8(v_sig, k_0);
+        vec_sum_block = _mm_add_epi16(vec_sum_block, v_sig_unpack);
+        sig += sig_stride;
+      }
+      sig -= sig_stride * 8;
+      {
+        const __m128i hg_fe_dc_ba = _mm_madd_epi16(vec_sum_block, k_1);
+        const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba,
+                                                _mm_srli_si128(hg_fe_dc_ba, 8));
+        const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,
+                                               _mm_srli_si128(hgfe_dcba, 4));
+        const int sum_block = _mm_cvtsi128_si32(hgfedcba);
+        if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
+          return COPY_BLOCK;
+        }
+      }
+    }
+
+    for (r = 0; r < 4; ++r) {
+        /* Calculate differences */
+        const __m128i v_sig_low = _mm_castpd_si128(
+            _mm_load_sd((double *)(&sig[0])));
+        const __m128i v_sig = _mm_castpd_si128(
+            _mm_loadh_pd(_mm_castsi128_pd(v_sig_low),
+                         (double *)(&sig[sig_stride])));
+        const __m128i v_mc_running_avg_low = _mm_castpd_si128(
+            _mm_load_sd((double *)(&mc_running_avg[0])));
+        const __m128i v_mc_running_avg = _mm_castpd_si128(
+            _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
+                         (double *)(&mc_running_avg[mc_avg_stride])));
+        const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
+        const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
+        /* Obtain the sign. FF if diff is negative. */
+        const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+        /* Clamp absolute difference to 16 to be used to get mask. Doing this
+         * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */
+        const __m128i clamped_absdiff = _mm_min_epu8(
+                                        _mm_or_si128(pdiff, ndiff), k_16);
+        /* Get masks for l2 l1 and l0 adjustments */
+        const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
+        const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
+        const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
+        /* Get adjustments for l2, l1, and l0 */
+        __m128i adj2 = _mm_and_si128(mask2, l32);
+        const __m128i adj1 = _mm_and_si128(mask1, l21);
+        const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
+        __m128i adj,  padj, nadj;
+        __m128i v_running_avg;
+
+        /* Combine the adjustments and get absolute adjustments. */
+        adj2 = _mm_add_epi8(adj2, adj1);
+        adj = _mm_sub_epi8(l3, adj2);
+        adj = _mm_andnot_si128(mask0, adj);
+        adj = _mm_or_si128(adj, adj0);
+
+        /* Restore the sign and get positive and negative adjustments. */
+        padj = _mm_andnot_si128(diff_sign, adj);
+        nadj = _mm_and_si128(diff_sign, adj);
+
+        /* Calculate filtered value. */
+        v_running_avg = _mm_adds_epu8(v_sig, padj);
+        v_running_avg = _mm_subs_epu8(v_running_avg, nadj);
+
+        _mm_storel_pd((double *)&running_avg[0],
+                      _mm_castsi128_pd(v_running_avg));
+        _mm_storeh_pd((double *)&running_avg[avg_stride],
+                      _mm_castsi128_pd(v_running_avg));
+
+        /* Adjustments <=7, and each element in acc_diff can fit in signed
+         * char.
+         */
+        acc_diff = _mm_adds_epi8(acc_diff, padj);
+        acc_diff = _mm_subs_epi8(acc_diff, nadj);
+
+        /* Update pointers for next iteration. */
+        sig += sig_stride * 2;
+        mc_running_avg += mc_avg_stride * 2;
+        running_avg += avg_stride * 2;
+    }
+
+    {
+        unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
+        sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
+        if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
+        if (abs_sum_diff > sum_diff_thresh) {
+          // Before returning to copy the block (i.e., apply no denoising),
+          // check if we can still apply some (weaker) temporal filtering to
+          // this block, that would otherwise not be denoised at all. Simplest
+          // is to apply an additional adjustment to running_avg_y to bring it
+          // closer to sig. The adjustment is capped by a maximum delta, and
+          // chosen such that in most cases the resulting sum_diff will be
+          // within the acceptable range given by sum_diff_thresh.
+
+          // The delta is set by the excess of absolute pixel diff over the
+          // threshold.
+          int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
+          // Only apply the adjustment for max delta up to 3.
+          if (delta < 4) {
+            const __m128i k_delta = _mm_set1_epi8(delta);
+            sig -= sig_stride * 8;
+            mc_running_avg -= mc_avg_stride * 8;
+            running_avg -= avg_stride * 8;
+            for (r = 0; r < 4; ++r) {
+              // Calculate differences.
+              const __m128i v_sig_low = _mm_castpd_si128(
+                  _mm_load_sd((double *)(&sig[0])));
+              const __m128i v_sig = _mm_castpd_si128(
+                  _mm_loadh_pd(_mm_castsi128_pd(v_sig_low),
+                               (double *)(&sig[sig_stride])));
+              const __m128i v_mc_running_avg_low = _mm_castpd_si128(
+                  _mm_load_sd((double *)(&mc_running_avg[0])));
+              const __m128i v_mc_running_avg = _mm_castpd_si128(
+                  _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
+                               (double *)(&mc_running_avg[mc_avg_stride])));
+              const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
+              const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
+              // Obtain the sign. FF if diff is negative.
+              const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+              // Clamp absolute difference to delta to get the adjustment.
+              const __m128i adj =
+                  _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+              // Restore the sign and get positive and negative adjustments.
+              __m128i padj, nadj;
+              const __m128i v_running_avg_low = _mm_castpd_si128(
+                  _mm_load_sd((double *)(&running_avg[0])));
+              __m128i v_running_avg = _mm_castpd_si128(
+                  _mm_loadh_pd(_mm_castsi128_pd(v_running_avg_low),
+                               (double *)(&running_avg[avg_stride])));
+              padj = _mm_andnot_si128(diff_sign, adj);
+              nadj = _mm_and_si128(diff_sign, adj);
+              // Calculate filtered value.
+              v_running_avg = _mm_subs_epu8(v_running_avg, padj);
+              v_running_avg = _mm_adds_epu8(v_running_avg, nadj);
+
+              _mm_storel_pd((double *)&running_avg[0],
+                            _mm_castsi128_pd(v_running_avg));
+              _mm_storeh_pd((double *)&running_avg[avg_stride],
+                            _mm_castsi128_pd(v_running_avg));
+
+             // Accumulate the adjustments.
+             acc_diff = _mm_subs_epi8(acc_diff, padj);
+             acc_diff = _mm_adds_epi8(acc_diff, nadj);
+
+             // Update pointers for next iteration.
+             sig += sig_stride * 2;
+             mc_running_avg += mc_avg_stride * 2;
+             running_avg += avg_stride * 2;
+            }
+            abs_sum_diff = abs_sum_diff_16x1(acc_diff);
+            if (abs_sum_diff > sum_diff_thresh) {
+              return COPY_BLOCK;
+            }
+          } else {
+            return COPY_BLOCK;
+          }
+        }
+    }
+
+    vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride);
+    return FILTER_BLOCK;
+}
diff --git a/vp9/common/vp9_frame_buffers.c b/vp9/common/vp9_frame_buffers.c
index a0b1e03..733b3a9 100644
--- a/vp9/common/vp9_frame_buffers.c
+++ b/vp9/common/vp9_frame_buffers.c
@@ -76,6 +76,7 @@
 int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb) {
   InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv;
   (void)cb_priv;
-  int_fb->in_use = 0;
+  if (int_fb)
+    int_fb->in_use = 0;
   return 0;
 }
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index 61682c4..0fe58c5 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -11,181 +11,6 @@
 
 #include "vp9/common/vp9_mvref_common.h"
 
-#define MVREF_NEIGHBOURS 8
-
-typedef struct position {
-  int row;
-  int col;
-} POSITION;
-
-typedef enum {
-  BOTH_ZERO = 0,
-  ZERO_PLUS_PREDICTED = 1,
-  BOTH_PREDICTED = 2,
-  NEW_PLUS_NON_INTRA = 3,
-  BOTH_NEW = 4,
-  INTRA_PLUS_NON_INTRA = 5,
-  BOTH_INTRA = 6,
-  INVALID_CASE = 9
-} motion_vector_context;
-
-// This is used to figure out a context for the ref blocks. The code flattens
-// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by
-// adding 9 for each intra block, 3 for each zero mv and 1 for each new
-// motion vector. This single number is then converted into a context
-// with a single lookup ( counter_to_context ).
-static const int mode_2_counter[MB_MODE_COUNT] = {
-  9,  // DC_PRED
-  9,  // V_PRED
-  9,  // H_PRED
-  9,  // D45_PRED
-  9,  // D135_PRED
-  9,  // D117_PRED
-  9,  // D153_PRED
-  9,  // D207_PRED
-  9,  // D63_PRED
-  9,  // TM_PRED
-  0,  // NEARESTMV
-  0,  // NEARMV
-  3,  // ZEROMV
-  1,  // NEWMV
-};
-
-// There are 3^3 different combinations of 3 counts that can be either 0,1 or
-// 2. However the actual count can never be greater than 2 so the highest
-// counter we need is 18. 9 is an invalid counter that's never used.
-static const int counter_to_context[19] = {
-  BOTH_PREDICTED,  // 0
-  NEW_PLUS_NON_INTRA,  // 1
-  BOTH_NEW,  // 2
-  ZERO_PLUS_PREDICTED,  // 3
-  NEW_PLUS_NON_INTRA,  // 4
-  INVALID_CASE,  // 5
-  BOTH_ZERO,  // 6
-  INVALID_CASE,  // 7
-  INVALID_CASE,  // 8
-  INTRA_PLUS_NON_INTRA,  // 9
-  INTRA_PLUS_NON_INTRA,  // 10
-  INVALID_CASE,  // 11
-  INTRA_PLUS_NON_INTRA,  // 12
-  INVALID_CASE,  // 13
-  INVALID_CASE,  // 14
-  INVALID_CASE,  // 15
-  INVALID_CASE,  // 16
-  INVALID_CASE,  // 17
-  BOTH_INTRA  // 18
-};
-
-static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = {
-  // 4X4
-  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
-  // 4X8
-  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
-  // 8X4
-  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
-  // 8X8
-  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
-  // 8X16
-  {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}},
-  // 16X8
-  {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}},
-  // 16X16
-  {{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
-  // 16X32
-  {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}},
-  // 32X16
-  {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
-  // 32X32
-  {{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
-  // 32X64
-  {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}},
-  // 64X32
-  {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
-  // 64X64
-  {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}}
-};
-
-static const int idx_n_column_to_subblock[4][2] = {
-  {1, 2},
-  {1, 3},
-  {3, 2},
-  {3, 3}
-};
-
-// clamp_mv_ref
-#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
-
-static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
-  clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
-               xd->mb_to_right_edge + MV_BORDER,
-               xd->mb_to_top_edge - MV_BORDER,
-               xd->mb_to_bottom_edge + MV_BORDER);
-}
-
-// This function returns either the appropriate sub block or block's mv
-// on whether the block_size < 8x8 and we have check_sub_blocks set.
-static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
-                                      int search_col, int block_idx) {
-  return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
-          ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
-              .as_mv[which_mv]
-          : candidate->mbmi.mv[which_mv];
-}
-
-
-// Performs mv sign inversion if indicated by the reference frame combination.
-static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
-                              const MV_REFERENCE_FRAME this_ref_frame,
-                              const int *ref_sign_bias) {
-  int_mv mv = mbmi->mv[ref];
-  if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
-    mv.as_mv.row *= -1;
-    mv.as_mv.col *= -1;
-  }
-  return mv;
-}
-
-// This macro is used to add a motion vector mv_ref list if it isn't
-// already in the list.  If it's the second motion vector it will also
-// skip all additional processing and jump to done!
-#define ADD_MV_REF_LIST(mv) \
-  do { \
-    if (refmv_count) { \
-      if ((mv).as_int != mv_ref_list[0].as_int) { \
-        mv_ref_list[refmv_count] = (mv); \
-        goto Done; \
-      } \
-    } else { \
-      mv_ref_list[refmv_count++] = (mv); \
-    } \
-  } while (0)
-
-// If either reference frame is different, not INTRA, and they
-// are different from each other scale and add the mv to our list.
-#define IF_DIFF_REF_FRAME_ADD_MV(mbmi) \
-  do { \
-    if (is_inter_block(mbmi)) { \
-      if ((mbmi)->ref_frame[0] != ref_frame) \
-        ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias)); \
-      if (has_second_ref(mbmi) && \
-          (mbmi)->ref_frame[1] != ref_frame && \
-          (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \
-        ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias)); \
-    } \
-  } while (0)
-
-
-// Checks that the given mi_row, mi_col and search point
-// are inside the borders of the tile.
-static INLINE int is_inside(const TileInfo *const tile,
-                            int mi_col, int mi_row, int mi_rows,
-                            const POSITION *mi_pos) {
-  return !(mi_row + mi_pos->row < 0 ||
-           mi_col + mi_pos->col < tile->mi_col_start ||
-           mi_row + mi_pos->row >= mi_rows ||
-           mi_col + mi_pos->col >= tile->mi_col_end);
-}
-
 // This function searches the neighbourhood of a given MB/SB
 // to try and find candidate reference vectors.
 static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h
index 903ac02..7bce3fa 100644
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h
@@ -21,6 +21,181 @@
 #define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS -\
                                 VP9_INTERP_EXTEND) << 3)
 
+#define MVREF_NEIGHBOURS 8
+
+typedef struct position {
+  int row;
+  int col;
+} POSITION;
+
+typedef enum {
+  BOTH_ZERO = 0,
+  ZERO_PLUS_PREDICTED = 1,
+  BOTH_PREDICTED = 2,
+  NEW_PLUS_NON_INTRA = 3,
+  BOTH_NEW = 4,
+  INTRA_PLUS_NON_INTRA = 5,
+  BOTH_INTRA = 6,
+  INVALID_CASE = 9
+} motion_vector_context;
+
+// This is used to figure out a context for the ref blocks. The code flattens
+// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by
+// adding 9 for each intra block, 3 for each zero mv and 1 for each new
+// motion vector. This single number is then converted into a context
+// with a single lookup ( counter_to_context ).
+static const int mode_2_counter[MB_MODE_COUNT] = {
+  9,  // DC_PRED
+  9,  // V_PRED
+  9,  // H_PRED
+  9,  // D45_PRED
+  9,  // D135_PRED
+  9,  // D117_PRED
+  9,  // D153_PRED
+  9,  // D207_PRED
+  9,  // D63_PRED
+  9,  // TM_PRED
+  0,  // NEARESTMV
+  0,  // NEARMV
+  3,  // ZEROMV
+  1,  // NEWMV
+};
+
+// There are 3^3 different combinations of 3 counts that can be either 0,1 or
+// 2. However the actual count can never be greater than 2 so the highest
+// counter we need is 18. 9 is an invalid counter that's never used.
+static const int counter_to_context[19] = {
+  BOTH_PREDICTED,  // 0
+  NEW_PLUS_NON_INTRA,  // 1
+  BOTH_NEW,  // 2
+  ZERO_PLUS_PREDICTED,  // 3
+  NEW_PLUS_NON_INTRA,  // 4
+  INVALID_CASE,  // 5
+  BOTH_ZERO,  // 6
+  INVALID_CASE,  // 7
+  INVALID_CASE,  // 8
+  INTRA_PLUS_NON_INTRA,  // 9
+  INTRA_PLUS_NON_INTRA,  // 10
+  INVALID_CASE,  // 11
+  INTRA_PLUS_NON_INTRA,  // 12
+  INVALID_CASE,  // 13
+  INVALID_CASE,  // 14
+  INVALID_CASE,  // 15
+  INVALID_CASE,  // 16
+  INVALID_CASE,  // 17
+  BOTH_INTRA  // 18
+};
+
+static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = {
+  // 4X4
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 4X8
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 8X4
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 8X8
+  {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}},
+  // 8X16
+  {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}},
+  // 16X8
+  {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}},
+  // 16X16
+  {{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // 16X32
+  {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}},
+  // 32X16
+  {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // 32X32
+  {{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // 32X64
+  {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}},
+  // 64X32
+  {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
+  // 64X64
+  {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}}
+};
+
+static const int idx_n_column_to_subblock[4][2] = {
+  {1, 2},
+  {1, 3},
+  {3, 2},
+  {3, 3}
+};
+
+// clamp_mv_ref
+#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
+
+static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
+  clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
+               xd->mb_to_right_edge + MV_BORDER,
+               xd->mb_to_top_edge - MV_BORDER,
+               xd->mb_to_bottom_edge + MV_BORDER);
+}
+
+// This function returns either the appropriate sub block or block's mv
+// on whether the block_size < 8x8 and we have check_sub_blocks set.
+static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
+                                      int search_col, int block_idx) {
+  return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
+          ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
+              .as_mv[which_mv]
+          : candidate->mbmi.mv[which_mv];
+}
+
+
+// Performs mv sign inversion if indicated by the reference frame combination.
+static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
+                              const MV_REFERENCE_FRAME this_ref_frame,
+                              const int *ref_sign_bias) {
+  int_mv mv = mbmi->mv[ref];
+  if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
+    mv.as_mv.row *= -1;
+    mv.as_mv.col *= -1;
+  }
+  return mv;
+}
+
+// This macro is used to add a motion vector mv_ref list if it isn't
+// already in the list.  If it's the second motion vector it will also
+// skip all additional processing and jump to done!
+#define ADD_MV_REF_LIST(mv) \
+  do { \
+    if (refmv_count) { \
+      if ((mv).as_int != mv_ref_list[0].as_int) { \
+        mv_ref_list[refmv_count] = (mv); \
+        goto Done; \
+      } \
+    } else { \
+      mv_ref_list[refmv_count++] = (mv); \
+    } \
+  } while (0)
+
+// If either reference frame is different, not INTRA, and they
+// are different from each other scale and add the mv to our list.
+#define IF_DIFF_REF_FRAME_ADD_MV(mbmi) \
+  do { \
+    if (is_inter_block(mbmi)) { \
+      if ((mbmi)->ref_frame[0] != ref_frame) \
+        ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias)); \
+      if (has_second_ref(mbmi) && \
+          (mbmi)->ref_frame[1] != ref_frame && \
+          (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \
+        ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias)); \
+    } \
+  } while (0)
+
+
+// Checks that the given mi_row, mi_col and search point
+// are inside the borders of the tile.
+static INLINE int is_inside(const TileInfo *const tile,
+                            int mi_col, int mi_row, int mi_rows,
+                            const POSITION *mi_pos) {
+  return !(mi_row + mi_pos->row < 0 ||
+           mi_col + mi_pos->col < tile->mi_col_start ||
+           mi_row + mi_pos->row >= mi_rows ||
+           mi_col + mi_pos->col >= tile->mi_col_end);
+}
+
 // TODO(jingning): this mv clamping function should be block size dependent.
 static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
   clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,
diff --git a/vp9/common/vp9_scale.c b/vp9/common/vp9_scale.c
index d3405fc..2f58323 100644
--- a/vp9/common/vp9_scale.c
+++ b/vp9/common/vp9_scale.c
@@ -33,14 +33,6 @@
   return (other_size << REF_SCALE_SHIFT) / this_size;
 }
 
-static int check_scale_factors(int other_w, int other_h,
-                               int this_w, int this_h) {
-  return 2 * this_w >= other_w &&
-         2 * this_h >= other_h &&
-         this_w <= 16 * other_w &&
-         this_h <= 16 * other_h;
-}
-
 MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf) {
   const int x_off_q4 = scaled_x(x << SUBPEL_BITS, sf) & SUBPEL_MASK;
   const int y_off_q4 = scaled_y(y << SUBPEL_BITS, sf) & SUBPEL_MASK;
@@ -54,7 +46,7 @@
 void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
                                        int other_w, int other_h,
                                        int this_w, int this_h) {
-  if (!check_scale_factors(other_w, other_h, this_w, this_h)) {
+  if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
     sf->x_scale_fp = REF_INVALID_SCALE;
     sf->y_scale_fp = REF_INVALID_SCALE;
     return;
diff --git a/vp9/common/vp9_scale.h b/vp9/common/vp9_scale.h
index 04aae65..ad6f5d7 100644
--- a/vp9/common/vp9_scale.h
+++ b/vp9/common/vp9_scale.h
@@ -50,6 +50,14 @@
          (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
 }
 
+static INLINE int valid_ref_frame_size(int ref_width, int ref_height,
+                                      int this_width, int this_height) {
+  return 2 * this_width >= ref_width &&
+         2 * this_height >= ref_height &&
+         this_width <= 16 * ref_width &&
+         this_height <= 16 * ref_height;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 9220a9e..1effef3 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -667,9 +667,17 @@
   if (!found)
     read_frame_size(rb, &width, &height);
 
-  if (width <= 0 || height <= 0)
-    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
-                       "Referenced frame with invalid size");
+  // Check that each of the frames that this frame references has valid
+  // dimensions.
+  for (i = 0; i < REFS_PER_FRAME; ++i) {
+    RefBuffer *const ref_frame = &cm->frame_refs[i];
+    const int ref_width = ref_frame->buf->y_width;
+    const int ref_height = ref_frame->buf->y_height;
+
+    if (!valid_ref_frame_size(ref_width, ref_height, width, height))
+      vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Referenced frame has invalid size");
+  }
 
   apply_frame_size(cm, width, height);
   setup_display_size(cm, rb);
@@ -1142,12 +1150,12 @@
       setup_frame_size(cm, rb);
     } else {
       pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
-
       for (i = 0; i < REFS_PER_FRAME; ++i) {
         const int ref = vp9_rb_read_literal(rb, REF_FRAMES_LOG2);
         const int idx = cm->ref_frame_map[ref];
-        cm->frame_refs[i].idx = idx;
-        cm->frame_refs[i].buf = &cm->frame_bufs[idx].buf;
+        RefBuffer *const ref_frame = &cm->frame_refs[i];
+        ref_frame->idx = idx;
+        ref_frame->buf = &cm->frame_bufs[idx].buf;
         cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb);
       }
 
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 84cb84a..245c5f1 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -210,7 +210,10 @@
   }
 
   cm->frame_to_show = get_frame_new_buffer(cm);
-  cm->frame_bufs[cm->new_fb_idx].ref_count--;
+
+  if (!pbi->frame_parallel_decode || !cm->show_frame) {
+    --cm->frame_bufs[cm->new_fb_idx].ref_count;
+  }
 
   // Invalidate these references until the next frame starts.
   for (ref_index = 0; ref_index < 3; ref_index++)
@@ -239,7 +242,9 @@
   }
 
   // Check if the previous frame was a frame without any references to it.
-  if (cm->new_fb_idx >= 0 && cm->frame_bufs[cm->new_fb_idx].ref_count == 0)
+  // Release frame buffer if not decoding in frame parallel mode.
+  if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0
+      && cm->frame_bufs[cm->new_fb_idx].ref_count == 0)
     cm->release_fb_cb(cm->cb_priv,
                       &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer);
   cm->new_fb_idx = get_free_fb(cm);
diff --git a/vp9/decoder/vp9_dthread.h b/vp9/decoder/vp9_dthread.h
index a727e2a..01c07f1 100644
--- a/vp9/decoder/vp9_dthread.h
+++ b/vp9/decoder/vp9_dthread.h
@@ -40,6 +40,23 @@
   int sync_range;
 } VP9LfSync;
 
+// WorkerData for the FrameWorker thread. It contains all the information of
+// the worker and decode structures for decoding a frame.
+typedef struct FrameWorkerData {
+  struct VP9Decoder *pbi;
+  const uint8_t *data;
+  const uint8_t *data_end;
+  size_t data_size;
+  void *user_priv;
+  int result;
+  int worker_id;
+
+  // scratch_buffer is used in frame parallel mode only.
+  // It is used to make a copy of the compressed data.
+  uint8_t *scratch_buffer;
+  size_t scratch_buffer_size;
+} FrameWorkerData;
+
 // Allocate memory for loopfilter row synchronization.
 void vp9_loop_filter_alloc(struct VP9Common *cm, VP9LfSync *lf_sync,
                            int rows, int width);
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index c69ed16..1bf826a 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -904,7 +904,7 @@
              (cpi->refresh_golden_frame << cpi->alt_fb_idx);
     } else {
       int arf_idx = cpi->alt_fb_idx;
-      if (cpi->pass == 2) {
+      if ((cpi->pass == 2) && cpi->multi_arf_allowed) {
         const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
         arf_idx = gf_group->arf_update_idx[gf_group->index];
       }
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index b9349a4..5387e68 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -2256,7 +2256,8 @@
                                sf->always_this_block_size);
         rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
                          &dummy_rate, &dummy_dist, 1, cpi->pc_root);
-      } else if (sf->partition_search_type == VAR_BASED_FIXED_PARTITION) {
+      } else if (cpi->skippable_frame ||
+                 sf->partition_search_type == VAR_BASED_FIXED_PARTITION) {
         BLOCK_SIZE bsize;
         set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
         bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col);
@@ -2389,7 +2390,7 @@
                  rd_opt->tx_select_threshes[frame_type][TX_MODE_SELECT] ?
                      ALLOW_32X32 : TX_MODE_SELECT;
     } else if (cpi->sf.tx_size_search_method == USE_TX_8X8) {
-      return ALLOW_8X8;
+      return TX_MODE_SELECT;
     } else {
       unsigned int total = 0;
       int i;
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 11fd584..1afbcf6 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -740,6 +740,8 @@
   cpi->alt_is_last = 0;
   cpi->gold_is_alt = 0;
 
+  cpi->skippable_frame = 0;
+
   // Create the encoder segmentation map and set all entries to 0
   CHECK_MEM_ERROR(cm, cpi->segmentation_map,
                   vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
@@ -1514,7 +1516,7 @@
   } else { /* For non key/golden frames */
     if (cpi->refresh_alt_ref_frame) {
       int arf_idx = cpi->alt_fb_idx;
-      if (cpi->pass == 2) {
+      if ((cpi->pass == 2) && cpi->multi_arf_allowed) {
         const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
         arf_idx = gf_group->arf_update_idx[gf_group->index];
       }
@@ -1967,6 +1969,29 @@
   }
 }
 
+static void configure_skippable_frame(VP9_COMP *cpi) {
+  // If no non-zero motion vectors were detected for the current frame in the
+  // first pass, and likewise for its previous and following frames, then the
+  // partition search can be skipped for this frame, and the partition size is
+  // assigned according to the variance instead.
+
+  SVC *const svc = &cpi->svc;
+  const int is_spatial_svc = (svc->number_spatial_layers > 1) &&
+                             (svc->number_temporal_layers == 1);
+  TWO_PASS *const twopass = is_spatial_svc ?
+                            &svc->layer_context[svc->spatial_layer_id].twopass
+                            : &cpi->twopass;
+
+  cpi->skippable_frame = (!frame_is_intra_only(&cpi->common) &&
+    twopass->stats_in - 2 > twopass->stats_in_start &&
+    twopass->stats_in < twopass->stats_in_end &&
+    (twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion
+    == 1 &&
+    (twopass->stats_in - 2)->pcnt_inter - (twopass->stats_in - 2)->pcnt_motion
+    == 1 &&
+    twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
+}
+
 static void encode_frame_to_data_rate(VP9_COMP *cpi,
                                       size_t *size,
                                       uint8_t *dest,
@@ -2062,6 +2087,13 @@
   if (cpi->pass == 2 && cpi->sf.static_segmentation)
     configure_static_seg_features(cpi);
 
+  // Check if the current frame is skippable for the partition search in the
+  // second pass according to the first pass stats
+  if (cpi->pass == 2 &&
+      (!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) {
+    configure_skippable_frame(cpi);
+  }
+
   // For 1 pass CBR, check if we are dropping this frame.
   // Never drop on key frame.
   if (cpi->pass == 0 &&
@@ -2397,7 +2429,7 @@
   return arf_src_index;
 }
 
-static void is_src_altref(VP9_COMP *cpi) {
+static void check_src_altref(VP9_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
 
   if (cpi->pass == 2) {
@@ -2513,9 +2545,8 @@
       cm->show_frame = 1;
       cm->intra_only = 0;
 
-      // Check to see if the frame to be encoded is an overlay for a previous
-      // arf frame and if so configure it as such.
-      is_src_altref(cpi);
+      // Check to see if the frame should be encoded as an arf overlay.
+      check_src_altref(cpi);
     }
   }
 
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 47649a8..ee98baa 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -262,6 +262,8 @@
   int alt_is_last;  // Alt same as last ( short circuit altref search)
   int gold_is_alt;  // don't do both alt and gold search ( just do gold).
 
+  int skippable_frame;
+
   int scaled_ref_idx[3];
   int lst_fb_idx;
   int gld_fb_idx;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 0261b98..971b159 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -1310,7 +1310,7 @@
   double modified_err = 0.0;
   double err_fraction;
   int mid_boost_bits = 0;
-  int middle_frame_idx;
+  int mid_frame_idx;
   unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
 
   key_frame = cpi->common.frame_type == KEY_FRAME ||
@@ -1360,7 +1360,8 @@
     twopass->gf_group.arf_src_offset[frame_index] =
       (unsigned char)(rc->baseline_gf_interval - 1);
     twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[0];
-    twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0];
+    twopass->gf_group.arf_ref_idx[frame_index] =
+      arf_buffer_indices[cpi->multi_arf_enabled && rc->source_alt_ref_active];
     ++frame_index;
 
     if (cpi->multi_arf_enabled) {
@@ -1376,7 +1377,7 @@
   }
 
   // Define middle frame
-  middle_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
+  mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
 
   // Allocate bits to the other frames in the group.
   for (i = 0; i < rc->baseline_gf_interval - 1; ++i) {
@@ -1397,7 +1398,7 @@
       mid_boost_bits += (target_frame_size >> 4);
       target_frame_size -= (target_frame_size >> 4);
 
-      if (frame_index <= middle_frame_idx)
+      if (frame_index <= mid_frame_idx)
         arf_idx = 1;
     }
     twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[arf_idx];
@@ -1427,9 +1428,9 @@
     // Final setup for second arf and its overlay.
     if (cpi->multi_arf_enabled) {
       twopass->gf_group.bit_allocation[2] =
-        twopass->gf_group.bit_allocation[middle_frame_idx] + mid_boost_bits;
-      twopass->gf_group.update_type[middle_frame_idx] = OVERLAY_UPDATE;
-      twopass->gf_group.bit_allocation[middle_frame_idx] = 0;
+        twopass->gf_group.bit_allocation[mid_frame_idx] + mid_boost_bits;
+      twopass->gf_group.update_type[mid_frame_idx] = OVERLAY_UPDATE;
+      twopass->gf_group.bit_allocation[mid_frame_idx] = 0;
     }
   } else {
     twopass->gf_group.update_type[frame_index] = GF_UPDATE;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 149fafd..87a3510 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -27,6 +27,85 @@
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rdopt.h"
 
+static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                       const TileInfo *const tile,
+                       MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                       int_mv *mv_ref_list,
+                       int mi_row, int mi_col) {
+  const int *ref_sign_bias = cm->ref_frame_sign_bias;
+  int i, refmv_count = 0;
+
+  const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
+
+  int different_ref_found = 0;
+  int context_counter = 0;
+  int const_motion = 0;
+
+  // Blank the reference vector list
+  vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+
+  // The nearest 2 blocks are treated differently
+  // if the size < 8x8 we get the mv from the bmi substructure,
+  // and we also need to keep a mode count.
+  for (i = 0; i < 2; ++i) {
+    const POSITION *const mv_ref = &mv_ref_search[i];
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+      const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row *
+                                                   xd->mi_stride];
+      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+      // Keep counts for entropy encoding.
+      context_counter += mode_2_counter[candidate->mode];
+      different_ref_found = 1;
+
+      if (candidate->ref_frame[0] == ref_frame)
+        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, -1));
+    }
+  }
+
+  const_motion = 1;
+
+  // Check the rest of the neighbors in much the same way
+  // as before except we don't need to keep track of sub blocks or
+  // mode counts.
+  for (; i < MVREF_NEIGHBOURS && !refmv_count; ++i) {
+    const POSITION *const mv_ref = &mv_ref_search[i];
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+      const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row *
+                                                    xd->mi_stride]->mbmi;
+      different_ref_found = 1;
+
+      if (candidate->ref_frame[0] == ref_frame)
+        ADD_MV_REF_LIST(candidate->mv[0]);
+    }
+  }
+
+  // Since we couldn't find an mv from the same reference frame,
+  // go back through the neighbors and find motion vectors from
+  // different reference frames.
+  if (different_ref_found && !refmv_count) {
+    for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
+      const POSITION *mv_ref = &mv_ref_search[i];
+      if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+        const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row
+                                              * xd->mi_stride]->mbmi;
+
+        // If the candidate is INTRA we don't want to consider its mv.
+        IF_DIFF_REF_FRAME_ADD_MV(candidate);
+      }
+    }
+  }
+
+ Done:
+
+  mi->mbmi.mode_context[ref_frame] = counter_to_context[context_counter];
+
+  // Clamp vectors
+  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i)
+    clamp_mv_ref(&mv_ref_list[i].as_mv, xd);
+
+  return const_motion;
+}
+
 static void full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                     BLOCK_SIZE bsize, int mi_row, int mi_col,
                                     int_mv *tmp_mv, int *rate_mv) {
@@ -173,6 +252,17 @@
   else
     x->skip_txfm = 0;
 
+  if (cpi->common.tx_mode == TX_MODE_SELECT) {
+    if (sse > (var << 2))
+      xd->mi[0]->mbmi.tx_size = MIN(max_txsize_lookup[bsize],
+                          tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+    else
+      xd->mi[0]->mbmi.tx_size = TX_8X8;
+  } else {
+    xd->mi[0]->mbmi.tx_size = MIN(max_txsize_lookup[bsize],
+                         tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+  }
+
   vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
                                dc_quant >> 3, &rate, &dist);
   *out_rate_sum = rate >> 1;
@@ -214,6 +304,8 @@
   struct macroblockd_plane *const pd = &xd->plane[0];
   PREDICTION_MODE this_mode, best_mode = ZEROMV;
   MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
+  TX_SIZE best_tx_size = MIN(max_txsize_lookup[bsize],
+                             tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
   INTERP_FILTER best_pred_filter = EIGHTTAP;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
@@ -245,6 +337,7 @@
   int bsl = mi_width_log2_lookup[bsize];
   const int pred_filter_search = (((mi_row + mi_col) >> bsl) +
                                       get_chessboard_index(cm)) % 2;
+  int const_motion[MAX_REF_FRAMES] = { 0 };
 
   // For speed 6, the result of interp filter is reused later in actual encoding
   // process.
@@ -292,9 +385,27 @@
   for (ref_frame = LAST_FRAME; ref_frame <= LAST_FRAME ; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      vp9_setup_buffer_inter(cpi, x, tile,
-                             ref_frame, bsize, mi_row, mi_col,
-                             frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+      const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+      int_mv *const candidates = mbmi->ref_mvs[ref_frame];
+      const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
+      vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
+                           sf, sf);
+
+      if (cm->coding_use_prev_mi)
+        vp9_find_mv_refs(cm, xd, tile, xd->mi[0], ref_frame,
+                         candidates, mi_row, mi_col);
+      else
+        const_motion[ref_frame] = mv_refs_rt(cm, xd, tile, xd->mi[0],
+                                             ref_frame, candidates,
+                                             mi_row, mi_col);
+
+      vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
+                            &frame_mv[NEARESTMV][ref_frame],
+                            &frame_mv[NEARMV][ref_frame]);
+
+      if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8)
+        vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
+                    ref_frame, bsize);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
@@ -328,6 +439,10 @@
     for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
       int rate_mv = 0;
 
+      if (const_motion[ref_frame] &&
+          (this_mode == NEARMV || this_mode == ZEROMV))
+        continue;
+
       if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
         continue;
 
@@ -384,6 +499,7 @@
         int64_t pf_dist[3];
         unsigned int pf_var[3];
         unsigned int pf_sse[3];
+        TX_SIZE pf_tx_size[3];
         int64_t best_cost = INT64_MAX;
         INTERP_FILTER best_filter = SWITCHABLE, filter;
         PRED_BUFFER *current_pred = this_mode_pred;
@@ -397,6 +513,7 @@
           cost = RDCOST(x->rdmult, x->rddiv,
                         vp9_get_switchable_rate(cpi) + pf_rate[filter],
                         pf_dist[filter]);
+          pf_tx_size[filter] = mbmi->tx_size;
           if (cost < best_cost) {
             best_filter = filter;
             best_cost = cost;
@@ -421,6 +538,7 @@
           free_pred_buffer(current_pred);
 
         mbmi->interp_filter = best_filter;
+        mbmi->tx_size = pf_tx_size[mbmi->interp_filter];
         rate = pf_rate[mbmi->interp_filter];
         dist = pf_dist[mbmi->interp_filter];
         var_y = pf_var[mbmi->interp_filter];
@@ -439,37 +557,49 @@
 
       // Skipping checking: test to see if this block can be reconstructed by
       // prediction only.
-      if (cpi->allow_encode_breakout && x->encode_breakout) {
+      if (cpi->allow_encode_breakout) {
         const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
         unsigned int var = var_y, sse = sse_y;
         // Skipping threshold for ac.
         unsigned int thresh_ac;
         // Skipping threshold for dc.
         unsigned int thresh_dc;
-        // Set a maximum for threshold to avoid big PSNR loss in low bit rate
-        // case. Use extreme low threshold for static frames to limit skipping.
-        const unsigned int max_thresh = 36000;
-        // The encode_breakout input
-        const unsigned int min_thresh =
-            MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
+        if (x->encode_breakout > 0) {
+          // Set a maximum for threshold to avoid big PSNR loss in low bit rate
+          // case. Use extreme low threshold for static frames to limit
+          // skipping.
+          const unsigned int max_thresh = 36000;
+          // The encode_breakout input
+          const unsigned int min_thresh =
+              MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
 
-        // Calculate threshold according to dequant value.
-        thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
-        thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
+          // Calculate threshold according to dequant value.
+          thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
+          thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
 
-        // Adjust ac threshold according to partition size.
-        thresh_ac >>= 8 - (b_width_log2_lookup[bsize] +
-            b_height_log2_lookup[bsize]);
+          // Adjust ac threshold according to partition size.
+          thresh_ac >>=
+              8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
 
-        thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
+          thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
+        } else {
+          thresh_ac = 0;
+          thresh_dc = 0;
+        }
 
         // Y skipping condition checking for ac and dc.
         if (var <= thresh_ac && (sse - var) <= thresh_dc) {
           unsigned int sse_u, sse_v;
           unsigned int var_u, var_v;
 
-          // Skip u v prediction for less calculation, that won't affect
-          // result much.
+          // Skip UV prediction unless breakout is zero (lossless) to save
+          // computation with low impact on the result
+          if (x->encode_breakout == 0) {
+            xd->plane[1].pre[0] = yv12_mb[ref_frame][1];
+            xd->plane[2].pre[0] = yv12_mb[ref_frame][2];
+            vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize);
+          }
+
           var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
                                           x->plane[1].src.stride,
                                           xd->plane[1].dst.buf,
@@ -516,6 +646,7 @@
         *returndistortion = dist;
         best_mode = this_mode;
         best_pred_filter = mbmi->interp_filter;
+        best_tx_size = mbmi->tx_size;
         best_ref_frame = ref_frame;
         skip_txfm = x->skip_txfm;
 
@@ -549,10 +680,11 @@
                       bw, bh);
   }
 
-  mbmi->mode = best_mode;
+  mbmi->mode          = best_mode;
   mbmi->interp_filter = best_pred_filter;
-  mbmi->ref_frame[0] = best_ref_frame;
-  mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
+  mbmi->tx_size       = best_tx_size;
+  mbmi->ref_frame[0]  = best_ref_frame;
+  mbmi->mv[0].as_int  = frame_mv[best_mode][best_ref_frame].as_int;
   xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
   x->skip_txfm = skip_txfm;
 
@@ -560,31 +692,57 @@
   // threshold.
   if (!x->skip && best_rd > inter_mode_thresh &&
       bsize <= cpi->sf.max_intra_bsize) {
+    int i, j;
+    const int width  = num_4x4_blocks_wide_lookup[bsize];
+    const int height = num_4x4_blocks_high_lookup[bsize];
+
+    int rate2 = 0;
+    int64_t dist2 = 0;
+    const int dst_stride = pd->dst.stride;
+    const int src_stride = p->src.stride;
+    int block_idx = 0;
+
+    TX_SIZE tmp_tx_size = MIN(max_txsize_lookup[bsize],
+                              tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+    const int step = 1 << tmp_tx_size;
+
     for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
       if (cpi->sf.reuse_inter_pred_sby) {
         pd->dst.buf = tmp[0].data;
         pd->dst.stride = bw;
       }
 
-      vp9_predict_intra_block(xd, 0, b_width_log2(bsize),
-                              mbmi->tx_size, this_mode,
-                              &p->src.buf[0], p->src.stride,
-                              &pd->dst.buf[0], pd->dst.stride, 0, 0, 0);
+      for (j = 0; j < height; j += step) {
+        for (i = 0; i < width; i += step) {
+          vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize),
+                                  tmp_tx_size, this_mode,
+                                  &p->src.buf[4 * (j * dst_stride + i)],
+                                  src_stride,
+                                  &pd->dst.buf[4 * (j * dst_stride + i)],
+                                  dst_stride, i, j, 0);
+          model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
+          rate2 += rate;
+          dist2 += dist;
+          ++block_idx;
+        }
+      }
 
-      model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
-
-      if (cpi->sf.reuse_inter_pred_sby)
-        pd->dst = orig_dst;
+      rate = rate2;
+      dist = dist2;
 
       rate += cpi->mbmode_cost[this_mode];
       rate += intra_cost_penalty;
       this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
 
+      if (cpi->sf.reuse_inter_pred_sby)
+        pd->dst = orig_dst;
+
       if (this_rd + intra_mode_cost < best_rd) {
         best_rd = this_rd;
         *returnrate = rate;
         *returndistortion = dist;
         mbmi->mode = this_mode;
+        mbmi->tx_size = tmp_tx_size;
         mbmi->ref_frame[0] = INTRA_FRAME;
         mbmi->uv_mode = this_mode;
         mbmi->mv[0].as_int = INVALID_MV;
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index d7017f2..1730389 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -272,10 +272,8 @@
     sf->search_type_check_frequency = 50;
     sf->source_var_thresh = 360;
 
-    sf->tx_size_search_method = USE_TX_8X8;
-    // TODO(yunqingwang): max_intra_bsize is used to decide if DC_PRED mode
-    // is checked for a partition block. Later, we can try to allow large
-    // partitions to do intra mode checking.
+    sf->tx_size_search_method = (cm->frame_type == KEY_FRAME) ?
+        USE_LARGESTALL : USE_TX_8X8;
     sf->max_intra_bsize = BLOCK_8X8;
 
     // This feature is only enabled when partition search is disabled.
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index c3ca7ee..fd868ae 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -32,15 +32,19 @@
   vpx_codec_priv_t        base;
   vpx_codec_dec_cfg_t     cfg;
   vp9_stream_info_t       si;
-  struct VP9Decoder *pbi;
   int                     postproc_cfg_set;
   vp8_postproc_cfg_t      postproc_cfg;
   vpx_decrypt_cb          decrypt_cb;
   void                   *decrypt_state;
   vpx_image_t             img;
-  int                     img_avail;
   int                     invert_tile_order;
   int                     frame_parallel_decode;  // frame-based threading.
+  int                     last_show_frame;  // Index of last output frame.
+
+  VP9Worker               *frame_workers;
+  int                     num_frame_workers;
+  int                     next_submit_thread_id;
+  int                     next_output_thread_id;
 
   // External frame buffer info to save for VP9 common.
   void *ext_priv;  // Private data associated with the external frame buffers.
@@ -85,11 +89,17 @@
 }
 
 static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
-  if (ctx->pbi) {
-    vp9_decoder_remove(ctx->pbi);
-    ctx->pbi = NULL;
+  if (ctx->frame_workers != NULL) {
+    int i;
+    for (i = 0; i < ctx->num_frame_workers; ++i) {
+      VP9Worker *const worker = &ctx->frame_workers[i];
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      vp9_decoder_remove(worker_data->pbi);
+      vpx_free(worker_data);
+    }
   }
 
+  vpx_free(ctx->frame_workers);
   vpx_free(ctx);
 
   return VPX_CODEC_OK;
@@ -188,32 +198,42 @@
   return VPX_CODEC_OK;
 }
 
+static void set_error_detail(vpx_codec_alg_priv_t *ctx,
+                             const char *const error) {
+  ctx->base.err_detail = error;
+}
+
 static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx,
                            const struct vpx_internal_error_info *error) {
   if (error->error_code)
-    ctx->base.err_detail = error->has_detail ? error->detail : NULL;
+    set_error_detail(ctx, error->has_detail ? error->detail : NULL);
 
   return error->error_code;
 }
 
 static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
-  VP9_COMMON *const cm = &ctx->pbi->common;
+  int i;
 
-  cm->new_fb_idx = -1;
+  for (i = 0; i < ctx->num_frame_workers; ++i) {
+    VP9Worker *const worker = &ctx->frame_workers[i];
+    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+    VP9_COMMON *const cm = &worker_data->pbi->common;
 
-  if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
-    cm->get_fb_cb = ctx->get_ext_fb_cb;
-    cm->release_fb_cb = ctx->release_ext_fb_cb;
-    cm->cb_priv = ctx->ext_priv;
-  } else {
-    cm->get_fb_cb = vp9_get_frame_buffer;
-    cm->release_fb_cb = vp9_release_frame_buffer;
+    cm->new_fb_idx = -1;
+    if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
+      cm->get_fb_cb = ctx->get_ext_fb_cb;
+      cm->release_fb_cb = ctx->release_ext_fb_cb;
+      cm->cb_priv = ctx->ext_priv;
+    } else {
+      cm->get_fb_cb = vp9_get_frame_buffer;
+      cm->release_fb_cb = vp9_release_frame_buffer;
 
-    if (vp9_alloc_internal_frame_buffers(&cm->int_frame_buffers))
-      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                         "Failed to initialize internal frame buffers");
+      if (vp9_alloc_internal_frame_buffers(&cm->int_frame_buffers))
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to initialize internal frame buffers");
 
-    cm->cb_priv = &cm->int_frame_buffers;
+      cm->cb_priv = &cm->int_frame_buffers;
+    }
   }
 }
 
@@ -232,14 +252,58 @@
   flags->noise_level = ctx->postproc_cfg.noise_level;
 }
 
-static void init_decoder(vpx_codec_alg_priv_t *ctx) {
-  ctx->pbi = vp9_decoder_create();
-  if (ctx->pbi == NULL)
-    return;
+static int frame_worker_hook(void *arg1, void *arg2) {
+  FrameWorkerData *const worker_data = (FrameWorkerData *)arg1;
+  const uint8_t *data = worker_data->data;
+  (void)arg2;
+  worker_data->result = vp9_receive_compressed_data(worker_data->pbi,
+                                                    worker_data->data_size,
+                                                    &data);
+  worker_data->data_end = data;
+  return !worker_data->result;
+}
 
-  ctx->pbi->max_threads = ctx->cfg.threads;
-  ctx->pbi->inv_tile_order = ctx->invert_tile_order;
-  ctx->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
+static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
+  int i;
+
+  ctx->last_show_frame = -1;
+  ctx->next_submit_thread_id = 0;
+  ctx->next_output_thread_id = 0;
+  ctx->num_frame_workers =
+      (ctx->frame_parallel_decode == 1) ? ctx->cfg.threads: 1;
+
+  ctx->frame_workers = (VP9Worker *)
+      vpx_malloc(ctx->num_frame_workers * sizeof(*ctx->frame_workers));
+  if (ctx->frame_workers == NULL) {
+    set_error_detail(ctx, "Failed to allocate frame_workers");
+    return VPX_CODEC_MEM_ERROR;
+  }
+
+  for (i = 0; i < ctx->num_frame_workers; ++i) {
+    VP9Worker *const worker = &ctx->frame_workers[i];
+    FrameWorkerData *worker_data = NULL;
+    vp9_worker_init(worker);
+    worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData));
+    if (worker->data1 == NULL) {
+      set_error_detail(ctx, "Failed to allocate worker_data");
+      return VPX_CODEC_MEM_ERROR;
+    }
+    worker_data = (FrameWorkerData *)worker->data1;
+    worker_data->pbi = vp9_decoder_create();
+    if (worker_data->pbi == NULL) {
+      set_error_detail(ctx, "Failed to allocate worker_data");
+      return VPX_CODEC_MEM_ERROR;
+    }
+
+    // If decoding in serial mode, FrameWorker thread could create tile worker
+    // thread or loopfilter thread.
+    worker_data->pbi->max_threads =
+        (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0;
+
+    worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
+    worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
+    worker->hook = (VP9WorkerHook)frame_worker_hook;
+  }
 
   // If postprocessing was enabled by the application and a
   // configuration has not been provided, default it.
@@ -248,20 +312,16 @@
     set_default_ppflags(&ctx->postproc_cfg);
 
   init_buffer_callbacks(ctx);
+
+  return VPX_CODEC_OK;
 }
 
 static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
                                   const uint8_t **data, unsigned int data_sz,
                                   void *user_priv, int64_t deadline) {
-  YV12_BUFFER_CONFIG sd;
-  vp9_ppflags_t flags = {0, 0, 0};
-  VP9_COMMON *cm = NULL;
-
+  vp9_ppflags_t flags = {0};
   (void)deadline;
 
-  vp9_zero(sd);
-  ctx->img_avail = 0;
-
   // Determine the stream parameters. Note that we rely on peek_si to
   // validate that we have a buffer that does not wrap around the top
   // of the heap.
@@ -276,33 +336,39 @@
       return VPX_CODEC_ERROR;
   }
 
-  // Initialize the decoder instance on the first frame
-  if (ctx->pbi == NULL) {
-    init_decoder(ctx);
-    if (ctx->pbi == NULL)
-      return VPX_CODEC_ERROR;
+  // Initialize the decoder workers on the first frame
+  if (ctx->frame_workers == NULL) {
+    const vpx_codec_err_t res = init_decoder(ctx);
+    if (res != VPX_CODEC_OK)
+      return res;
   }
 
-  // Set these even if already initialized.  The caller may have changed the
-  // decrypt config between frames.
-  ctx->pbi->decrypt_cb = ctx->decrypt_cb;
-  ctx->pbi->decrypt_state = ctx->decrypt_state;
+  if (!ctx->frame_parallel_decode) {
+    VP9Worker *const worker = ctx->frame_workers;
+    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+    worker_data->data = *data;
+    worker_data->data_size = data_sz;
+    worker_data->user_priv = user_priv;
 
-  cm = &ctx->pbi->common;
+    // Set these even if already initialized.  The caller may have changed the
+    // decrypt config between frames.
+    worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
+    worker_data->pbi->decrypt_state = ctx->decrypt_state;
 
-  if (vp9_receive_compressed_data(ctx->pbi, data_sz, data))
-    return update_error_state(ctx, &cm->error);
+    vp9_worker_execute(worker);
+    if (worker->had_error)
+      return update_error_state(ctx, &worker_data->pbi->common.error);
+
+    // Update data pointer after decode.
+    *data = worker_data->data_end;
+  } else {
+    // TODO(hkuang): Implement frame parallel decode.
+    return VPX_CODEC_INCAPABLE;
+  }
 
   if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
     set_ppflags(ctx, &flags);
 
-  if (vp9_get_raw_frame(ctx->pbi, &sd, &flags))
-    return update_error_state(ctx, &cm->error);
-
-  yuvconfig2image(&ctx->img, &sd, user_priv);
-  ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
-  ctx->img_avail = 1;
-
   return VPX_CODEC_OK;
 }
 
@@ -412,7 +478,7 @@
         vpx_codec_err_t res;
         if (data_start < data
             || frame_size > (uint32_t) (data_end - data_start)) {
-          ctx->base.err_detail = "Invalid frame size in index";
+          set_error_detail(ctx, "Invalid frame size in index");
           return VPX_CODEC_CORRUPT_FRAME;
         }
 
@@ -430,7 +496,7 @@
 
       // Extra data detected after the frame.
       if (data_start < data_end - 1) {
-        ctx->base.err_detail = "Fail to decode frame in parallel mode";
+        set_error_detail(ctx, "Fail to decode frame in parallel mode");
         return VPX_CODEC_INCAPABLE;
       }
     }
@@ -445,7 +511,7 @@
         vpx_codec_err_t res;
         if (data_start < data
             || frame_size > (uint32_t) (data_end - data_start)) {
-          ctx->base.err_detail = "Invalid frame size in index";
+          set_error_detail(ctx, "Invalid frame size in index");
           return VPX_CODEC_CORRUPT_FRAME;
         }
 
@@ -483,15 +549,31 @@
                                       vpx_codec_iter_t *iter) {
   vpx_image_t *img = NULL;
 
-  if (ctx->img_avail) {
-    // iter acts as a flip flop, so an image is only returned on the first
-    // call to get_frame.
-    if (!(*iter)) {
+  // iter acts as a flip flop, so an image is only returned on the first
+  // call to get_frame.
+  if (*iter == NULL && ctx->frame_workers != NULL) {
+    YV12_BUFFER_CONFIG sd;
+    vp9_ppflags_t flags = {0, 0, 0};
+
+    VP9Worker *const worker = &ctx->frame_workers[ctx->next_output_thread_id];
+    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+    if (vp9_get_raw_frame(worker_data->pbi, &sd, &flags) == 0) {
+      VP9_COMMON *const cm = &worker_data->pbi->common;
+      yuvconfig2image(&ctx->img, &sd, worker_data->user_priv);
+      ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
       img = &ctx->img;
       *iter = img;
+      // Decrease reference count of last output frame in frame parallel mode.
+      if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) {
+        --cm->frame_bufs[ctx->last_show_frame].ref_count;
+        if (cm->frame_bufs[ctx->last_show_frame].ref_count == 0) {
+          cm->release_fb_cb(cm->cb_priv,
+              &cm->frame_bufs[ctx->last_show_frame].raw_frame_buffer);
+        }
+      }
+      ctx->last_show_frame = worker_data->pbi->common.new_fb_idx;
     }
   }
-  ctx->img_avail = 0;
 
   return img;
 }
@@ -502,7 +584,7 @@
     vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
   if (cb_get == NULL || cb_release == NULL) {
     return VPX_CODEC_INVALID_PARAM;
-  } else if (ctx->pbi == NULL) {
+  } else if (ctx->frame_workers == NULL) {
     // If the decoder has already been initialized, do not accept changes to
     // the frame buffer functions.
     ctx->get_ext_fb_cb = cb_get;
@@ -518,12 +600,19 @@
                                           va_list args) {
   vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *);
 
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
   if (data) {
     vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data;
     YV12_BUFFER_CONFIG sd;
-
+    VP9Worker *const worker = ctx->frame_workers;
+    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
     image2yuvconfig(&frame->img, &sd);
-    return vp9_set_reference_dec(&ctx->pbi->common,
+    return vp9_set_reference_dec(&worker_data->pbi->common,
                                  (VP9_REFFRAME)frame->frame_type, &sd);
   } else {
     return VPX_CODEC_INVALID_PARAM;
@@ -534,13 +623,19 @@
                                            va_list args) {
   vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
 
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
   if (data) {
-    vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+    vpx_ref_frame_t *frame = (vpx_ref_frame_t *) data;
     YV12_BUFFER_CONFIG sd;
-
+    VP9Worker *const worker = ctx->frame_workers;
+    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
     image2yuvconfig(&frame->img, &sd);
-
-    return vp9_copy_reference_dec(ctx->pbi,
+    return vp9_copy_reference_dec(worker_data->pbi,
                                   (VP9_REFFRAME)frame->frame_type, &sd);
   } else {
     return VPX_CODEC_INVALID_PARAM;
@@ -551,11 +646,18 @@
                                           va_list args) {
   vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *);
 
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
   if (data) {
     YV12_BUFFER_CONFIG* fb;
-
-    vp9_get_reference_dec(ctx->pbi, data->idx, &fb);
-    yuvconfig2image(&data->img, fb, NULL);
+    VP9Worker *const worker = ctx->frame_workers;
+    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+    vp9_get_reference_dec(worker_data->pbi, data->idx, &fb);
+    yuvconfig2image(&data->img, fb, worker_data->user_priv);
     return VPX_CODEC_OK;
   } else {
     return VPX_CODEC_INVALID_PARAM;
@@ -592,11 +694,20 @@
                                                  va_list args) {
   int *const update_info = va_arg(args, int *);
 
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
   if (update_info) {
-    if (ctx->pbi)
-      *update_info = ctx->pbi->refresh_frame_flags;
-    else
+    if (ctx->frame_workers) {
+      VP9Worker *const worker = ctx->frame_workers;
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      *update_info = worker_data->pbi->refresh_frame_flags;
+    } else {
       return VPX_CODEC_ERROR;
+    }
     return VPX_CODEC_OK;
   } else {
     return VPX_CODEC_INVALID_PARAM;
@@ -608,11 +719,20 @@
                                                 va_list args) {
   int *corrupted = va_arg(args, int *);
 
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
   if (corrupted) {
-    if (ctx->pbi)
-      *corrupted = ctx->pbi->common.frame_to_show->corrupted;
-    else
+    if (ctx->frame_workers) {
+      VP9Worker *const worker = ctx->frame_workers;
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      *corrupted = worker_data->pbi->common.frame_to_show->corrupted;
+    } else {
       return VPX_CODEC_ERROR;
+    }
     return VPX_CODEC_OK;
   } else {
     return VPX_CODEC_INVALID_PARAM;
@@ -623,9 +743,17 @@
                                              va_list args) {
   int *const display_size = va_arg(args, int *);
 
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
   if (display_size) {
-    if (ctx->pbi) {
-      const VP9_COMMON *const cm = &ctx->pbi->common;
+    if (ctx->frame_workers) {
+      VP9Worker *const worker = ctx->frame_workers;
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      const VP9_COMMON *const cm = &worker_data->pbi->common;
       display_size[0] = cm->display_width;
       display_size[1] = cm->display_height;
     } else {
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 9dbb678..6a34f7e 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -105,11 +105,9 @@
 ifeq ($(CONFIG_USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
 endif
 
@@ -124,7 +122,9 @@
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c
 
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
 
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c
index 8a9dfed..6c15f6e 100644
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c
@@ -59,14 +59,11 @@
 typedef struct SvcInternal {
   char options[OPTION_BUFFER_SIZE];        // set by vpx_svc_set_options
   char quantizers[OPTION_BUFFER_SIZE];     // set by vpx_svc_set_quantizers
-  char quantizers_keyframe[OPTION_BUFFER_SIZE];  // set by
-                                                 // vpx_svc_set_quantizers
   char scale_factors[OPTION_BUFFER_SIZE];  // set by vpx_svc_set_scale_factors
 
   // values extracted from option, quantizers
   int scaling_factor_num[VPX_SS_MAX_LAYERS];
   int scaling_factor_den[VPX_SS_MAX_LAYERS];
-  int quantizer_keyframe[VPX_SS_MAX_LAYERS];
   int quantizer[VPX_SS_MAX_LAYERS];
 
   // accumulated statistics
@@ -198,8 +195,7 @@
 }
 
 static vpx_codec_err_t parse_quantizer_values(SvcContext *svc_ctx,
-                                              const char *quantizer_values,
-                                              const int is_keyframe) {
+                                              const char *quantizer_values) {
   char *input_string;
   char *token;
   const char *delim = ",";
@@ -210,11 +206,6 @@
   SvcInternal *const si = get_svc_internal(svc_ctx);
 
   if (quantizer_values == NULL || strlen(quantizer_values) == 0) {
-    if (is_keyframe) {
-      // If there non settings for key frame, we will apply settings from
-      // non key frame. So just simply return here.
-      return VPX_CODEC_INVALID_PARAM;
-    }
     input_string = strdup(DEFAULT_QUANTIZER_VALUES);
   } else {
     input_string = strdup(quantizer_values);
@@ -235,12 +226,7 @@
     } else {
       q = 0;
     }
-    if (is_keyframe) {
-      si->quantizer_keyframe[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers]
-      = q;
-    } else {
-      si->quantizer[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] = q;
-    }
+    si->quantizer[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] = q;
   }
   if (res == VPX_CODEC_OK && found != svc_ctx->spatial_layers) {
     svc_log(svc_ctx, SVC_LOG_ERROR,
@@ -325,7 +311,6 @@
   char *option_name;
   char *option_value;
   char *input_ptr;
-  int is_keyframe_qaunt_set = 0;
   vpx_codec_err_t res = VPX_CODEC_OK;
 
   if (options == NULL) return VPX_CODEC_OK;
@@ -348,17 +333,8 @@
       res = parse_scale_factors(svc_ctx, option_value);
       if (res != VPX_CODEC_OK) break;
     } else if (strcmp("quantizers", option_name) == 0) {
-      res = parse_quantizer_values(svc_ctx, option_value, 0);
+      res = parse_quantizer_values(svc_ctx, option_value);
       if (res != VPX_CODEC_OK) break;
-      if (!is_keyframe_qaunt_set) {
-        SvcInternal *const si = get_svc_internal(svc_ctx);
-        memcpy(get_svc_internal(svc_ctx)->quantizer_keyframe, si->quantizer,
-               sizeof(si->quantizer));
-      }
-    } else if (strcmp("quantizers-keyframe", option_name) == 0) {
-      res = parse_quantizer_values(svc_ctx, option_value, 1);
-      if (res != VPX_CODEC_OK) break;
-      is_keyframe_qaunt_set = 1;
     } else {
       svc_log(svc_ctx, SVC_LOG_ERROR, "invalid option: %s\n", option_name);
       res = VPX_CODEC_INVALID_PARAM;
@@ -381,19 +357,13 @@
 }
 
 vpx_codec_err_t vpx_svc_set_quantizers(SvcContext *svc_ctx,
-                                       const char *quantizers,
-                                       const int is_for_keyframe) {
+                                       const char *quantizers) {
   SvcInternal *const si = get_svc_internal(svc_ctx);
   if (svc_ctx == NULL || quantizers == NULL || si == NULL) {
     return VPX_CODEC_INVALID_PARAM;
   }
-  if (is_for_keyframe) {
-    strncpy(si->quantizers_keyframe, quantizers, sizeof(si->quantizers));
-    si->quantizers_keyframe[sizeof(si->quantizers_keyframe) - 1] = '\0';
-  } else {
-    strncpy(si->quantizers, quantizers, sizeof(si->quantizers));
-    si->quantizers[sizeof(si->quantizers) - 1] = '\0';
-  }
+  strncpy(si->quantizers, quantizers, sizeof(si->quantizers));
+  si->quantizers[sizeof(si->quantizers) - 1] = '\0';
   return VPX_CODEC_OK;
 }
 
@@ -440,13 +410,9 @@
     return VPX_CODEC_INVALID_PARAM;
   }
 
-  res = parse_quantizer_values(svc_ctx, si->quantizers, 0);
+  res = parse_quantizer_values(svc_ctx, si->quantizers);
   if (res != VPX_CODEC_OK) return res;
 
-  res = parse_quantizer_values(svc_ctx, si->quantizers_keyframe, 1);
-  if (res != VPX_CODEC_OK)
-    memcpy(si->quantizer_keyframe, si->quantizer, sizeof(si->quantizer));
-
   res = parse_scale_factors(svc_ctx, si->scale_factors);
   if (res != VPX_CODEC_OK) return res;
 
@@ -666,13 +632,8 @@
   layer_index = layer + VPX_SS_MAX_LAYERS - si->layers;
 
   if (codec_ctx->config.enc->g_pass == VPX_RC_ONE_PASS) {
-    if (vpx_svc_is_keyframe(svc_ctx)) {
-      svc_params.min_quantizer = si->quantizer_keyframe[layer_index];
-      svc_params.max_quantizer = si->quantizer_keyframe[layer_index];
-    } else {
-      svc_params.min_quantizer = si->quantizer[layer_index];
-      svc_params.max_quantizer = si->quantizer[layer_index];
-    }
+    svc_params.min_quantizer = si->quantizer[layer_index];
+    svc_params.max_quantizer = si->quantizer[layer_index];
   } else {
     svc_params.min_quantizer = codec_ctx->config.enc->rc_min_quantizer;
     svc_params.max_quantizer = codec_ctx->config.enc->rc_max_quantizer;
diff --git a/vpx/svc_context.h b/vpx/svc_context.h
index 8bfdba5..e0de263 100644
--- a/vpx/svc_context.h
+++ b/vpx/svc_context.h
@@ -56,8 +56,7 @@
  * e.g., "60,53,39,33,27"
  */
 vpx_codec_err_t vpx_svc_set_quantizers(SvcContext *svc_ctx,
-                                       const char *quantizer_values,
-                                       const int is_for_keyframe);
+                                       const char *quantizer_values);
 
 /**
  * Set SVC scale factors