Merge "Merge branch 'masterbase' into nextgenv2" into nextgenv2
diff --git a/CHANGELOG b/CHANGELOG
index 7746cc6..7db420e 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,7 @@
+Next Release
+  - Incompatible changes:
+    The VP9 encoder's default keyframe interval changed to 128 from 9999.
+
 2015-11-09 v1.5.0 "Javan Whistling Duck"
   This release improves upon the VP9 encoder and speeds up the encoding and
   decoding processes.
diff --git a/examples/vp8_multi_resolution_encoder.c b/examples/vp8_multi_resolution_encoder.c
index 0248ede..fc775ef 100644
--- a/examples/vp8_multi_resolution_encoder.c
+++ b/examples/vp8_multi_resolution_encoder.c
@@ -347,8 +347,7 @@
     double               psnr_totals[NUM_ENCODERS][4] = {{0,0}};
     int                  psnr_count[NUM_ENCODERS] = {0};
 
-    double               cx_time = 0;
-    struct  timeval      tv1, tv2, difftv;
+    int64_t              cx_time = 0;
 
     /* Set the required target bitrates for each resolution level.
      * If target bitrate for highest-resolution level is set to 0,
@@ -582,6 +581,7 @@
 
     while(frame_avail || got_data)
     {
+        struct vpx_usec_timer timer;
         vpx_codec_iter_t iter[NUM_ENCODERS]={NULL};
         const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS];
 
@@ -636,18 +636,18 @@
             vpx_codec_control(&codec[i], VP8E_SET_TEMPORAL_LAYER_ID, layer_id);
         }
 
-        gettimeofday(&tv1, NULL);
         /* Encode each frame at multi-levels */
         /* Note the flags must be set to 0 in the encode call if they are set
            for each frame with the vpx_codec_control(), as done above. */
+        vpx_usec_timer_start(&timer);
         if(vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL,
             frame_cnt, 1, 0, arg_deadline))
         {
             die_codec(&codec[0], "Failed to encode frame");
         }
-        gettimeofday(&tv2, NULL);
-        timersub(&tv2, &tv1, &difftv);
-        cx_time += (double)(difftv.tv_sec * 1000000 + difftv.tv_usec);
+        vpx_usec_timer_mark(&timer);
+        cx_time += vpx_usec_timer_elapsed(&timer);
+
         for (i=NUM_ENCODERS-1; i>=0 ; i--)
         {
             got_data = 0;
@@ -686,8 +686,10 @@
         frame_cnt++;
     }
     printf("\n");
-    printf("FPS for encoding %d %f %f \n", frame_cnt, (float)cx_time / 1000000,
-           1000000 * (double)frame_cnt / (double)cx_time);
+    printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n",
+            frame_cnt,
+            (double)cx_time / ((double)frame_cnt * 1000),  /* ms per frame */
+            1000000 * (double)frame_cnt / (double)cx_time); /* frames/s */
 
     fclose(infile);
 
diff --git a/test/altref_test.cc b/test/altref_test.cc
index af25b72..0799f42 100644
--- a/test/altref_test.cc
+++ b/test/altref_test.cc
@@ -14,6 +14,8 @@
 #include "test/util.h"
 namespace {
 
+#if CONFIG_VP8_ENCODER
+
 // lookahead range: [kLookAheadMin, kLookAheadMax).
 const int kLookAheadMin = 5;
 const int kLookAheadMax = 26;
@@ -63,7 +65,106 @@
   EXPECT_GE(altref_count(), 1);
 }
 
-
 VP8_INSTANTIATE_TEST_CASE(AltRefTest,
                           ::testing::Range(kLookAheadMin, kLookAheadMax));
+
+#endif  // CONFIG_VP8_ENCODER
+
+class AltRefForcedKeyTestLarge
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ protected:
+  AltRefForcedKeyTestLarge()
+      : EncoderTest(GET_PARAM(0)),
+        encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)),
+        forced_kf_frame_num_(1),
+        frame_num_(0) {}
+  virtual ~AltRefForcedKeyTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    cfg_.rc_end_usage = VPX_VBR;
+    cfg_.g_threads = 0;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+      // override test default for tile columns if necessary.
+#if CONFIG_VP9_ENCODER
+      if (GET_PARAM(0) == &libvpx_test::kVP9) {
+        encoder->Control(VP9E_SET_TILE_COLUMNS, 6);
+      }
+#endif
+#if CONFIG_VP10_ENCODER
+      if (GET_PARAM(0) == &libvpx_test::kVP10) {
+        encoder->Control(VP9E_SET_TILE_COLUMNS, 6);
+      }
+#endif
+    }
+    frame_flags_ =
+        (video->frame() == forced_kf_frame_num_) ? VPX_EFLAG_FORCE_KF : 0;
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (frame_num_ == forced_kf_frame_num_) {
+      ASSERT_TRUE(!!(pkt->data.frame.flags & VPX_FRAME_IS_KEY))
+          << "Frame #" << frame_num_ << " isn't a keyframe!";
+    }
+    ++frame_num_;
+  }
+
+  ::libvpx_test::TestMode encoding_mode_;
+  int cpu_used_;
+  unsigned int forced_kf_frame_num_;
+  unsigned int frame_num_;
+};
+
+TEST_P(AltRefForcedKeyTestLarge, Frame1IsKey) {
+  const vpx_rational timebase = { 1, 30 };
+  const int lag_values[] = { 3, 15, 25, -1 };
+
+  forced_kf_frame_num_ = 1;
+  for (int i = 0; lag_values[i] != -1; ++i) {
+    frame_num_ = 0;
+    cfg_.g_lag_in_frames = lag_values[i];
+    libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       timebase.den, timebase.num, 0, 30);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+}
+
+TEST_P(AltRefForcedKeyTestLarge, ForcedFrameIsKey) {
+  const vpx_rational timebase = { 1, 30 };
+  const int lag_values[] = { 3, 15, 25, -1 };
+
+  for (int i = 0; lag_values[i] != -1; ++i) {
+    frame_num_ = 0;
+    forced_kf_frame_num_ = lag_values[i] - 1;
+    cfg_.g_lag_in_frames = lag_values[i];
+    libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       timebase.den, timebase.num, 0, 30);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+}
+
+VP8_INSTANTIATE_TEST_CASE(
+    AltRefForcedKeyTestLarge,
+    ::testing::Values(::libvpx_test::kOnePassGood),
+    ::testing::Range(0, 9));
+
+VP9_INSTANTIATE_TEST_CASE(
+    AltRefForcedKeyTestLarge,
+    ::testing::Values(::libvpx_test::kOnePassGood),
+    ::testing::Range(0, 9));
+
+VP10_INSTANTIATE_TEST_CASE(
+    AltRefForcedKeyTestLarge,
+    ::testing::Values(::libvpx_test::kOnePassGood),
+    ::testing::Range(0, 9));
+
 }  // namespace
diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc
index 6a938a0..572834c 100644
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -26,7 +26,8 @@
       : EncoderTest(GET_PARAM(0)),
         encoding_mode_(GET_PARAM(1)),
         set_cpu_used_(GET_PARAM(2)),
-        min_psnr_(kMaxPSNR) {}
+        min_psnr_(kMaxPSNR),
+        tune_content_(VP9E_CONTENT_DEFAULT) {}
   virtual ~CpuSpeedTest() {}
 
   virtual void SetUp() {
@@ -49,6 +50,7 @@
                                   ::libvpx_test::Encoder *encoder) {
     if (video->frame() == 1) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_);
       if (encoding_mode_ != ::libvpx_test::kRealTime) {
         encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
         encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
@@ -66,6 +68,7 @@
   ::libvpx_test::TestMode encoding_mode_;
   int set_cpu_used_;
   double min_psnr_;
+  int tune_content_;
 };
 
 TEST_P(CpuSpeedTest, TestQ0) {
@@ -103,6 +106,21 @@
   EXPECT_GE(min_psnr_, kMaxPSNR);
 }
 
+TEST_P(CpuSpeedTest, TestTuneScreen) {
+  ::libvpx_test::Y4mVideoSource video("screendata.y4m", 0, 25);
+  cfg_.g_timebase = video.timebase();
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_min_quantizer = 0;
+  tune_content_ = VP9E_CONTENT_SCREEN;
+  // The encoder hook forwards tune_content_ via VP9E_SET_TUNE_CONTENT.
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
 TEST_P(CpuSpeedTest, TestEncodeHighBitrate) {
   // Validate that this non multiple of 64 wide clip encodes and decodes
   // without a mismatch when passing in a very low max q.  This pushes
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 9d5074e..5467c46 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -519,6 +519,9 @@
   cfg_.rc_end_usage = VPX_CBR;
   cfg_.rc_target_bitrate = 200;
   cfg_.g_lag_in_frames = 0;
+  // TODO(marpan): Investigate datarate target failures with a smaller keyframe
+  // interval (128).
+  cfg_.kf_max_dist = 9999;
 
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 140);
@@ -774,10 +777,6 @@
         svc_params_.max_quantizers[i] = 63;
         svc_params_.min_quantizers[i] = 0;
       }
-      svc_params_.scaling_factor_num[0] = 144;
-      svc_params_.scaling_factor_den[0] = 288;
-      svc_params_.scaling_factor_num[1] = 288;
-      svc_params_.scaling_factor_den[1] = 288;
       encoder->Control(VP9E_SET_SVC, 1);
       encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
       encoder->Control(VP8E_SET_CPUUSED, speed_setting_);
@@ -814,8 +813,6 @@
     if (bits_total_) {
       const double file_size_in_kb = bits_total_ / 1000.;  // bits per kilobit
       duration_ = (last_pts_ + 1) * timebase_;
-      effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0
-          / (cfg_.rc_buf_initial_sz / 1000.0 + duration_);
       file_datarate_ = file_size_in_kb / duration_;
     }
   }
@@ -839,7 +836,6 @@
   int64_t bits_total_;
   double duration_;
   double file_datarate_;
-  double effective_datarate_;
   size_t bits_in_last_frame_;
   vpx_svc_extra_cfg_t svc_params_;
   int speed_setting_;
@@ -884,7 +880,49 @@
 
 // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
 // 3 temporal layers. Run CIF clip with 1 thread.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc) {
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 2;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 1;
+  cfg_.temporal_layering_mode = 3;
+  svc_params_.scaling_factor_num[0] = 144;
+  svc_params_.scaling_factor_den[0] = 288;
+  svc_params_.scaling_factor_num[1] = 288;
+  svc_params_.scaling_factor_den[1] = 288;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 200);
+  // TODO(wonkap/marpan): Check that effective_datarate for each layer hits the
+  // layer target_bitrate.
+  for (int i = 200; i <= 800; i += 200) {
+    cfg_.rc_target_bitrate = i;
+    ResetModel();
+    assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+        cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
+            << " The datarate for the file exceeds the target by too much!";
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
+        << " The datarate for the file is lower than the target by too much!";
+    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+  }
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3
+// temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayersSmallKf) {
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_buf_optimal_sz = 500;
   cfg_.rc_buf_sz = 1000;
@@ -907,25 +945,26 @@
   cfg_.rc_dropframe_thresh = 10;
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 200);
-  // TODO(wonkap/marpan): Check that effective_datarate for each layer hits the
-  // layer target_bitrate. Also check if test can pass at lower bitrate (~200k).
-  for (int i = 400; i <= 800; i += 200) {
-    cfg_.rc_target_bitrate = i;
+  cfg_.rc_target_bitrate = 400;
+  // For this 3 temporal layer case, pattern repeats every 4 frames, so choose
+  // 4 neighboring key frame periods (so key frame will land on 0-2-1-2).
+  for (int j = 64; j <= 67; j++) {
+    cfg_.kf_max_dist = j;
     ResetModel();
     assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
         cfg_.ts_number_layers, cfg_.temporal_layering_mode);
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.85)
+    ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
             << " The datarate for the file exceeds the target by too much!";
     ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
         << " The datarate for the file is lower than the target by too much!";
-    EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
   }
 }
 
 // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
 // 3 temporal layers. Run HD clip with 4 threads.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc4threads) {
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers4threads) {
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_buf_optimal_sz = 500;
   cfg_.rc_buf_sz = 1000;
@@ -946,6 +985,7 @@
   svc_params_.scaling_factor_num[1] = 288;
   svc_params_.scaling_factor_den[1] = 288;
   cfg_.rc_dropframe_thresh = 10;
+  cfg_.kf_max_dist = 9999;
   ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720,
                                        30, 1, 0, 300);
   cfg_.rc_target_bitrate = 800;
@@ -953,19 +993,143 @@
   assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
       cfg_.ts_number_layers, cfg_.temporal_layering_mode);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.85)
+  ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
           << " The datarate for the file exceeds the target by too much!";
   ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
       << " The datarate for the file is lower than the target by too much!";
-  EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
+// 3 temporal layers. Run HD clip with 1 thread.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 3;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 1;
+  cfg_.temporal_layering_mode = 3;
+  svc_params_.scaling_factor_num[0] = 72;
+  svc_params_.scaling_factor_den[0] = 288;
+  svc_params_.scaling_factor_num[1] = 144;
+  svc_params_.scaling_factor_den[1] = 288;
+  svc_params_.scaling_factor_num[2] = 288;
+  svc_params_.scaling_factor_den[2] = 288;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 800;
+  ResetModel();
+  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+     cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
+          << " The datarate for the file exceeds the target by too much!";
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
+      << " The datarate for the file is lower than the target by too much!";
+  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
+// temporal layers. Run HD clip with 1 thread, and few short key frame periods.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayersSmallKf) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 3;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 1;
+  cfg_.temporal_layering_mode = 3;
+  svc_params_.scaling_factor_num[0] = 72;
+  svc_params_.scaling_factor_den[0] = 288;
+  svc_params_.scaling_factor_num[1] = 144;
+  svc_params_.scaling_factor_den[1] = 288;
+  svc_params_.scaling_factor_num[2] = 288;
+  svc_params_.scaling_factor_den[2] = 288;
+  cfg_.rc_dropframe_thresh = 10;
+  ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 800;
+  // For this 3 temporal layer case, pattern repeats every 4 frames, so choose
+  // 4 neighboring key frame periods (so key frame will land on 0-2-1-2).
+  for (int j = 32; j <= 35; j++) {
+    cfg_.kf_max_dist = j;
+    ResetModel();
+    assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+       cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
+            << " The datarate for the file exceeds the target by too much!";
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.30)
+        << " The datarate for the file is lower than the target by too much!";
+    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+  }
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
+// 3 temporal layers. Run HD clip with 4 threads.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers4threads) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 3;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 4;
+  cfg_.temporal_layering_mode = 3;
+  svc_params_.scaling_factor_num[0] = 72;
+  svc_params_.scaling_factor_den[0] = 288;
+  svc_params_.scaling_factor_num[1] = 144;
+  svc_params_.scaling_factor_den[1] = 288;
+  svc_params_.scaling_factor_num[2] = 288;
+  svc_params_.scaling_factor_den[2] = 288;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 800;
+  ResetModel();
+  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+      cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
+          << " The datarate for the file exceeds the target by too much!";
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
+      << " The datarate for the file is lower than the target by too much!";
+  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
 }
 
 VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES);
 VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large,
                           ::testing::Values(::libvpx_test::kOnePassGood,
                                             ::libvpx_test::kRealTime),
-                          ::testing::Range(2, 7));
+                          ::testing::Range(2, 9));
 VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvc,
                           ::testing::Values(::libvpx_test::kRealTime),
-                          ::testing::Range(5, 8));
+                          ::testing::Range(5, 9));
 }  // namespace
diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index b16f14c..778a36c 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -430,7 +430,7 @@
 
 using std::tr1::make_tuple;
 
-#if HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_MMX && CONFIG_USE_X86INC && !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     MMX, Loop8Test6Param,
     ::testing::Values(
diff --git a/test/resize_test.cc b/test/resize_test.cc
index 0177308..eaebd75 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -463,6 +463,17 @@
     frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
   }
 
+  virtual void MismatchHook(const vpx_image_t *img1,
+                            const vpx_image_t *img2) {
+    // Add this frame pair's compute_psnr() result to the running total and
+    // bump the count reported by GetMismatchFrames().
+    mismatch_psnr_ += compute_psnr(img1, img2);
+    ++mismatch_nframes_;
+  }
+
+  unsigned int GetMismatchFrames() {
+    return static_cast<unsigned int>(mismatch_nframes_);
+  }
+
   void DefaultConfig() {
     cfg_.rc_buf_initial_sz = 500;
     cfg_.rc_buf_optimal_sz = 600;
@@ -488,6 +499,8 @@
   std::vector< FrameInfo > frame_info_list_;
   int set_cpu_used_;
   bool change_bitrate_;
+  double mismatch_psnr_;
+  int mismatch_nframes_;
 };
 
 TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
@@ -497,6 +510,8 @@
   // Disable internal resize for this test.
   cfg_.rc_resize_allowed = 0;
   change_bitrate_ = false;
+  mismatch_psnr_ = 0.0;
+  mismatch_nframes_ = 0;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
@@ -510,6 +525,7 @@
         << "Frame " << frame << " had unexpected width";
     EXPECT_EQ(expected_h, info->h)
         << "Frame " << frame << " had unexpected height";
+    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
   }
 }
 
@@ -523,6 +539,8 @@
   cfg_.g_w = 352;
   cfg_.g_h = 288;
   change_bitrate_ = false;
+  mismatch_psnr_ = 0.0;
+  mismatch_nframes_ = 0;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   unsigned int last_w = cfg_.g_w;
@@ -542,6 +560,7 @@
 
   // Verify that we get 1 resize down event in this test.
   ASSERT_EQ(1, resize_count) << "Resizing should occur.";
+  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
 }
 
 // Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
@@ -554,6 +573,8 @@
   cfg_.g_w = 352;
   cfg_.g_h = 288;
   change_bitrate_ = true;
+  mismatch_psnr_ = 0.0;
+  mismatch_nframes_ = 0;
   // Disable dropped frames.
   cfg_.rc_dropframe_thresh = 0;
   // Starting bitrate low.
@@ -583,6 +604,7 @@
 
   // Verify that we get 2 resize events in this test.
   ASSERT_EQ(resize_count, 2) << "Resizing should occur twice.";
+  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
 }
 
 vpx_img_fmt_t CspForFrameNumber(int frame) {
diff --git a/test/test.mk b/test/test.mk
index 1f120ce..db2e361 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -19,6 +19,7 @@
 LIBVPX_TEST_SRCS-yes                   += ../md5_utils.h ../md5_utils.c
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ivf_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += ../y4minput.h ../y4minput.c
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += altref_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += aq_segment_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += datarate_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += encode_api_test.cc
@@ -28,7 +29,6 @@
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += y4m_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += yuv_video_source.h
 
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += altref_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
diff --git a/third_party/x86inc/README.libvpx b/third_party/x86inc/README.libvpx
index e91e305..8d3cd96 100644
--- a/third_party/x86inc/README.libvpx
+++ b/third_party/x86inc/README.libvpx
@@ -1,5 +1,5 @@
-URL: http://git.videolan.org/?p=x264.git
-Version: a95584945dd9ce3acc66c6cd8f6796bc4404d40d
+URL: https://git.videolan.org/git/x264.git
+Version: d23d18655249944c1ca894b451e2c82c7a584c62
 License: ISC
 License File: LICENSE
 
@@ -13,12 +13,8 @@
 Manage name mangling (prefixing with '_') manually because 'PREFIX' does not
   exist in libvpx.
 Expand PIC default to macho64 and respect CONFIG_PIC from libvpx
-Catch all elf formats for 'hidden' status and SECTION notes.
-Avoid 'amdnop' when building with nasm.
 Set 'private_extern' visibility for macho targets.
 Copy PIC 'GLOBAL' macros from x86_abi_support.asm
 Use .text instead of .rodata on macho to avoid broken tables in PIC mode.
 Use .text with no alignment for aout
 Only use 'hidden' visibility with Chromium
-Move '%use smartalign' for nasm out of 'INIT_CPUFLAGS' and before
-  'ALIGNMODE'.
diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm
index e7d3fa5..b647dff 100644
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -1,7 +1,7 @@
 ;*****************************************************************************
 ;* x86inc.asm: x264asm abstraction layer
 ;*****************************************************************************
-;* Copyright (C) 2005-2015 x264 project
+;* Copyright (C) 2005-2016 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Anton Mitrofanov <BugMaster@narod.ru>
@@ -66,16 +66,35 @@
     %endif
 %endif
 
-%ifidn   __OUTPUT_FORMAT__,elf32
-    %define mangle(x) x
+%define FORMAT_ELF 0
+%ifidn __OUTPUT_FORMAT__,elf
+    %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf32
+    %define FORMAT_ELF 1
 %elifidn __OUTPUT_FORMAT__,elf64
-    %define mangle(x) x
-%elifidn __OUTPUT_FORMAT__,x64
-    %define mangle(x) x
-%elifidn __OUTPUT_FORMAT__,win64
-    %define mangle(x) x
+    %define FORMAT_ELF 1
+%endif
+
+%define FORMAT_MACHO 0
+%ifidn __OUTPUT_FORMAT__,macho32
+     %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho64
+     %define FORMAT_MACHO 1
+%endif
+
+; Set PREFIX for libvpx builds.
+%if FORMAT_ELF
+    %undef PREFIX
+%elif WIN64
+    %undef PREFIX
 %else
+    %define PREFIX
+%endif
+
+%ifdef PREFIX
     %define mangle(x) _ %+ x
+%else
+    %define mangle(x) x
 %endif
 
 ; In some instances macho32 tables get misaligned when using .rodata.
@@ -94,14 +113,6 @@
     %endif
 %endmacro
 
-%macro SECTION_TEXT 0-1 16
-    %ifidn __OUTPUT_FORMAT__,aout
-        SECTION .text
-    %else
-        SECTION .text align=%1
-    %endif
-%endmacro
-
 ; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC"
 ; from original code is added in for 64bit.
 %ifidn __OUTPUT_FORMAT__,elf32
@@ -188,8 +199,16 @@
 %ifdef PIC
     default rel
 %endif
+
+%ifndef GET_GOT_DEFINED
+    %define GET_GOT_DEFINED 0
+%endif
 ; Done with PIC macros
 
+%ifdef __NASM_VER__
+    %use smartalign
+%endif
+
 ; Macros to eliminate most code duplication between x86_32 and x86_64:
 ; Currently this works only for leaf functions which load all their arguments
 ; into registers at the start, and make no other use of the stack. Luckily that
@@ -237,6 +256,7 @@
     %define r%1w %2w
     %define r%1b %2b
     %define r%1h %2h
+    %define %2q %2
     %if %0 == 2
         %define r%1m  %2d
         %define r%1mp %2
@@ -261,9 +281,9 @@
     %define e%1h %3
     %define r%1b %2
     %define e%1b %2
-%if ARCH_X86_64 == 0
-    %define r%1  e%1
-%endif
+    %if ARCH_X86_64 == 0
+        %define r%1 e%1
+    %endif
 %endmacro
 
 DECLARE_REG_SIZE ax, al, ah
@@ -373,7 +393,7 @@
 
 %macro ASSERT 1
     %if (%1) == 0
-        %error assert failed
+        %error assertion ``%1'' failed
     %endif
 %endmacro
 
@@ -464,8 +484,10 @@
         %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
             %if %1 > 0
                 %assign regs_used (regs_used + 1)
-            %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
-                %warning "Stack pointer will overwrite register argument"
+            %endif
+            %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
+                ; Ensure that we don't clobber any registers containing arguments
+                %assign regs_used 5 + UNIX64 * 3
             %endif
         %endif
     %endif
@@ -579,9 +601,9 @@
 %macro RET 0
     WIN64_RESTORE_XMM_INTERNAL rsp
     POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
-%if mmsize == 32
-    vzeroupper
-%endif
+    %if mmsize == 32
+        vzeroupper
+    %endif
     AUTO_REP_RET
 %endmacro
 
@@ -618,17 +640,17 @@
 %define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
 
 %macro RET 0
-%if stack_size_padded > 0
-%if required_stack_alignment > STACK_ALIGNMENT
-    mov rsp, rstkm
-%else
-    add rsp, stack_size_padded
-%endif
-%endif
+    %if stack_size_padded > 0
+        %if required_stack_alignment > STACK_ALIGNMENT
+            mov rsp, rstkm
+        %else
+            add rsp, stack_size_padded
+        %endif
+    %endif
     POP_IF_USED 14, 13, 12, 11, 10, 9
-%if mmsize == 32
-    vzeroupper
-%endif
+    %if mmsize == 32
+        vzeroupper
+    %endif
     AUTO_REP_RET
 %endmacro
 
@@ -674,29 +696,29 @@
 %define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
 
 %macro RET 0
-%if stack_size_padded > 0
-%if required_stack_alignment > STACK_ALIGNMENT
-    mov rsp, rstkm
-%else
-    add rsp, stack_size_padded
-%endif
-%endif
+    %if stack_size_padded > 0
+        %if required_stack_alignment > STACK_ALIGNMENT
+            mov rsp, rstkm
+        %else
+            add rsp, stack_size_padded
+        %endif
+    %endif
     POP_IF_USED 6, 5, 4, 3
-%if mmsize == 32
-    vzeroupper
-%endif
+    %if mmsize == 32
+        vzeroupper
+    %endif
     AUTO_REP_RET
 %endmacro
 
 %endif ;======================================================================
 
 %if WIN64 == 0
-%macro WIN64_SPILL_XMM 1
-%endmacro
-%macro WIN64_RESTORE_XMM 1
-%endmacro
-%macro WIN64_PUSH_XMM 0
-%endmacro
+    %macro WIN64_SPILL_XMM 1
+    %endmacro
+    %macro WIN64_RESTORE_XMM 1
+    %endmacro
+    %macro WIN64_PUSH_XMM 0
+    %endmacro
 %endif
 
 ; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
@@ -709,24 +731,26 @@
     %else
         rep ret
     %endif
+    annotate_function_size
 %endmacro
 
 %define last_branch_adr $$
 %macro AUTO_REP_RET 0
-    %ifndef cpuflags
-        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr.
-    %elif notcpuflag(ssse3)
-        times ((last_branch_adr-$)>>31)+1 rep
+    %if notcpuflag(ssse3)
+        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
     %endif
     ret
+    annotate_function_size
 %endmacro
 
 %macro BRANCH_INSTR 0-*
     %rep %0
         %macro %1 1-2 %1
             %2 %1
-            %%branch_instr:
-            %xdefine last_branch_adr %%branch_instr
+            %if notcpuflag(ssse3)
+                %%branch_instr equ $
+                %xdefine last_branch_adr %%branch_instr
+            %endif
         %endmacro
         %rotate 1
     %endrep
@@ -741,6 +765,7 @@
     %elif %2
         jmp %1
     %endif
+    annotate_function_size
 %endmacro
 
 ;=============================================================================
@@ -762,6 +787,7 @@
     cglobal_internal 0, %1 %+ SUFFIX, %2
 %endmacro
 %macro cglobal_internal 2-3+
+    annotate_function_size
     %if %1
         %xdefine %%FUNCTION_PREFIX private_prefix
         ; libvpx explicitly sets visibility in shared object builds. Avoid
@@ -782,17 +808,10 @@
         CAT_XDEFINE cglobaled_, %2, 1
     %endif
     %xdefine current_function %2
-    %ifidn __OUTPUT_FORMAT__,elf32
+    %xdefine current_function_section __SECT__
+    %if FORMAT_ELF
         global %2:function %%VISIBILITY
-    %elifidn __OUTPUT_FORMAT__,elf64
-        global %2:function %%VISIBILITY
-    %elifidn __OUTPUT_FORMAT__,macho32
-        %ifdef __NASM_VER__
-            global %2
-        %else
-            global %2:private_extern
-        %endif
-    %elifidn __OUTPUT_FORMAT__,macho64
+    %elif FORMAT_MACHO
         %ifdef __NASM_VER__
             global %2
         %else
@@ -822,16 +841,16 @@
 
 ; like cextern, but without the prefix
 %macro cextern_naked 1
-    %xdefine %1 mangle(%1)
+    %ifdef PREFIX
+        %xdefine %1 mangle(%1)
+    %endif
     CAT_XDEFINE cglobaled_, %1, 1
     extern %1
 %endmacro
 
 %macro const 1-2+
     %xdefine %1 mangle(private_prefix %+ _ %+ %1)
-    %ifidn __OUTPUT_FORMAT__,elf32
-        global %1:data hidden
-    %elifidn __OUTPUT_FORMAT__,elf64
+    %if FORMAT_ELF
         global %1:data hidden
     %else
         global %1
@@ -839,14 +858,29 @@
     %1: %2
 %endmacro
 
-; This is needed for ELF, otherwise the GNU linker assumes the stack is
-; executable by default.
-%ifidn __OUTPUT_FORMAT__,elf32
-SECTION .note.GNU-stack noalloc noexec nowrite progbits
-%elifidn __OUTPUT_FORMAT__,elf64
-SECTION .note.GNU-stack noalloc noexec nowrite progbits
+; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
+%if FORMAT_ELF
+    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
 %endif
 
+; Tell debuggers how large the function was.
+; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
+; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
+; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
+; then its size might be unspecified.
+%macro annotate_function_size 0
+    %ifdef __YASM_VER__
+        %ifdef current_function
+            %if FORMAT_ELF
+                current_function_section
+                %%ecf equ $
+                size current_function %%ecf - current_function
+                __SECT__
+            %endif
+        %endif
+    %endif
+%endmacro
+
 ; cpuflags
 
 %assign cpuflags_mmx      (1<<0)
@@ -875,12 +909,9 @@
 %assign cpuflags_bmi1     (1<<22)|cpuflags_lzcnt
 %assign cpuflags_bmi2     (1<<23)|cpuflags_bmi1
 
-%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
-%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
-
-%ifdef __NASM_VER__
-    %use smartalign
-%endif
+; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
+%define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
+%define notcpuflag(x) (cpuflag(x) ^ 1)
 
 ; Takes an arbitrary number of cpuflags from the above list.
 ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.
@@ -917,12 +948,18 @@
         %endif
     %endif
 
-    %ifdef __NASM_VER__
-        ALIGNMODE k7
-    %elif ARCH_X86_64 || cpuflag(sse2)
-        CPU amdnop
+    %if ARCH_X86_64 || cpuflag(sse2)
+        %ifdef __NASM_VER__
+            ALIGNMODE k8
+        %else
+            CPU amdnop
+        %endif
     %else
-        CPU basicnop
+        %ifdef __NASM_VER__
+            ALIGNMODE nop
+        %else
+            CPU basicnop
+        %endif
     %endif
 %endmacro
 
@@ -951,14 +988,14 @@
     %define movnta movntq
     %assign %%i 0
     %rep 8
-    CAT_XDEFINE m, %%i, mm %+ %%i
-    CAT_XDEFINE nnmm, %%i, %%i
-    %assign %%i %%i+1
+        CAT_XDEFINE m, %%i, mm %+ %%i
+        CAT_XDEFINE nnmm, %%i, %%i
+        %assign %%i %%i+1
     %endrep
     %rep 8
-    CAT_UNDEF m, %%i
-    CAT_UNDEF nnmm, %%i
-    %assign %%i %%i+1
+        CAT_UNDEF m, %%i
+        CAT_UNDEF nnmm, %%i
+        %assign %%i %%i+1
     %endrep
     INIT_CPUFLAGS %1
 %endmacro
@@ -969,7 +1006,7 @@
     %define mmsize 16
     %define num_mmregs 8
     %if ARCH_X86_64
-    %define num_mmregs 16
+        %define num_mmregs 16
     %endif
     %define mova movdqa
     %define movu movdqu
@@ -977,9 +1014,9 @@
     %define movnta movntdq
     %assign %%i 0
     %rep num_mmregs
-    CAT_XDEFINE m, %%i, xmm %+ %%i
-    CAT_XDEFINE nnxmm, %%i, %%i
-    %assign %%i %%i+1
+        CAT_XDEFINE m, %%i, xmm %+ %%i
+        CAT_XDEFINE nnxmm, %%i, %%i
+        %assign %%i %%i+1
     %endrep
     INIT_CPUFLAGS %1
 %endmacro
@@ -990,7 +1027,7 @@
     %define mmsize 32
     %define num_mmregs 8
     %if ARCH_X86_64
-    %define num_mmregs 16
+        %define num_mmregs 16
     %endif
     %define mova movdqa
     %define movu movdqu
@@ -998,9 +1035,9 @@
     %define movnta movntdq
     %assign %%i 0
     %rep num_mmregs
-    CAT_XDEFINE m, %%i, ymm %+ %%i
-    CAT_XDEFINE nnymm, %%i, %%i
-    %assign %%i %%i+1
+        CAT_XDEFINE m, %%i, ymm %+ %%i
+        CAT_XDEFINE nnymm, %%i, %%i
+        %assign %%i %%i+1
     %endrep
     INIT_CPUFLAGS %1
 %endmacro
@@ -1024,7 +1061,7 @@
 %assign i 0
 %rep 16
     DECLARE_MMCAST i
-%assign i i+1
+    %assign i i+1
 %endrep
 
 ; I often want to use macros that permute their arguments. e.g. there's no
@@ -1042,23 +1079,23 @@
 ; doesn't cost any cycles.
 
 %macro PERMUTE 2-* ; takes a list of pairs to swap
-%rep %0/2
-    %xdefine %%tmp%2 m%2
-    %rotate 2
-%endrep
-%rep %0/2
-    %xdefine m%1 %%tmp%2
-    CAT_XDEFINE nn, m%1, %1
-    %rotate 2
-%endrep
+    %rep %0/2
+        %xdefine %%tmp%2 m%2
+        %rotate 2
+    %endrep
+    %rep %0/2
+        %xdefine m%1 %%tmp%2
+        CAT_XDEFINE nn, m%1, %1
+        %rotate 2
+    %endrep
 %endmacro
 
 %macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
-%ifnum %1 ; SWAP 0, 1, ...
-    SWAP_INTERNAL_NUM %1, %2
-%else ; SWAP m0, m1, ...
-    SWAP_INTERNAL_NAME %1, %2
-%endif
+    %ifnum %1 ; SWAP 0, 1, ...
+        SWAP_INTERNAL_NUM %1, %2
+    %else ; SWAP m0, m1, ...
+        SWAP_INTERNAL_NAME %1, %2
+    %endif
 %endmacro
 
 %macro SWAP_INTERNAL_NUM 2-*
@@ -1068,7 +1105,7 @@
         %xdefine m%2 %%tmp
         CAT_XDEFINE nn, m%1, %1
         CAT_XDEFINE nn, m%2, %2
-    %rotate 1
+        %rotate 1
     %endrep
 %endmacro
 
@@ -1076,7 +1113,7 @@
     %xdefine %%args nn %+ %1
     %rep %0-1
         %xdefine %%args %%args, nn %+ %2
-    %rotate 1
+        %rotate 1
     %endrep
     SWAP_INTERNAL_NUM %%args
 %endmacro
@@ -1093,7 +1130,7 @@
     %assign %%i 0
     %rep num_mmregs
         CAT_XDEFINE %%f, %%i, m %+ %%i
-    %assign %%i %%i+1
+        %assign %%i %%i+1
     %endrep
 %endmacro
 
@@ -1103,20 +1140,20 @@
         %rep num_mmregs
             CAT_XDEFINE m, %%i, %1_m %+ %%i
             CAT_XDEFINE nn, m %+ %%i, %%i
-        %assign %%i %%i+1
+            %assign %%i %%i+1
         %endrep
     %endif
 %endmacro
 
 ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
 %macro call 1
-    call_internal %1, %1 %+ SUFFIX
+    call_internal %1 %+ SUFFIX, %1
 %endmacro
 %macro call_internal 2
-    %xdefine %%i %1
-    %ifndef cglobaled_%1
-        %ifdef cglobaled_%2
-            %xdefine %%i %2
+    %xdefine %%i %2
+    %ifndef cglobaled_%2
+        %ifdef cglobaled_%1
+            %xdefine %%i %1
         %endif
     %endif
     call %%i
@@ -1159,7 +1196,7 @@
     %endif
     CAT_XDEFINE sizeofxmm, i, 16
     CAT_XDEFINE sizeofymm, i, 32
-%assign i i+1
+    %assign i i+1
 %endrep
 %undef i
 
@@ -1536,7 +1573,7 @@
     %else
         CAT_XDEFINE q, j, i
     %endif
-%assign i i+1
+    %assign i i+1
 %endrep
 %undef i
 %undef j
@@ -1559,55 +1596,54 @@
 FMA_INSTR pmacsdql,  pmuldq, paddq ; sse4 emulation
 FMA_INSTR pmadcswd, pmaddwd, paddd
 
-; convert FMA4 to FMA3 if possible
-%macro FMA4_INSTR 4
-    %macro %1 4-8 %1, %2, %3, %4
-        %if cpuflag(fma4)
-            v%5 %1, %2, %3, %4
-        %elifidn %1, %2
-            v%6 %1, %4, %3 ; %1 = %1 * %3 + %4
-        %elifidn %1, %3
-            v%7 %1, %2, %4 ; %1 = %2 * %1 + %4
-        %elifidn %1, %4
-            v%8 %1, %2, %3 ; %1 = %2 * %3 + %1
-        %else
-            %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported
-        %endif
-    %endmacro
+; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
+; FMA3 is only possible if dst is the same as one of the src registers.
+; Either src2 or src3 can be a memory operand.
+%macro FMA4_INSTR 2-*
+    %push fma4_instr
+    %xdefine %$prefix %1
+    %rep %0 - 1
+        %macro %$prefix%2 4-6 %$prefix, %2
+            %if notcpuflag(fma3) && notcpuflag(fma4)
+                %error use of ``%5%6'' fma instruction in cpuname function: current_function
+            %elif cpuflag(fma4)
+                v%5%6 %1, %2, %3, %4
+            %elifidn %1, %2
+                ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
+                %ifid %3
+                    v%{5}213%6 %2, %3, %4
+                %else
+                    v%{5}132%6 %2, %4, %3
+                %endif
+            %elifidn %1, %3
+                v%{5}213%6 %3, %2, %4
+            %elifidn %1, %4
+                v%{5}231%6 %4, %2, %3
+            %else
+                %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
+            %endif
+        %endmacro
+        %rotate 1
+    %endrep
+    %pop
 %endmacro
 
-FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd
-FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps
-FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd
-FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss
+FMA4_INSTR fmadd,    pd, ps, sd, ss
+FMA4_INSTR fmaddsub, pd, ps
+FMA4_INSTR fmsub,    pd, ps, sd, ss
+FMA4_INSTR fmsubadd, pd, ps
+FMA4_INSTR fnmadd,   pd, ps, sd, ss
+FMA4_INSTR fnmsub,   pd, ps, sd, ss
 
-FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd
-FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps
-FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd
-FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps
-
-FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd
-FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps
-FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd
-FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss
-
-FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd
-FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps
-FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd
-FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss
-
-FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd
-FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps
-FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd
-FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss
-
-; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug
-%if ARCH_X86_64 == 0
-%macro vpbroadcastq 2
-%if sizeof%1 == 16
-    movddup %1, %2
-%else
-    vbroadcastsd %1, %2
-%endif
-%endmacro
+; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
+%ifdef __YASM_VER__
+    %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0
+        %macro vpbroadcastq 2
+            %if sizeof%1 == 16
+                movddup %1, %2
+            %else
+                vbroadcastsd %1, %2
+            %endif
+        %endmacro
+    %endif
 %endif
diff --git a/vp10/common/mvref_common.h b/vp10/common/mvref_common.h
index 3ac93b5..104a91a 100644
--- a/vp10/common/mvref_common.h
+++ b/vp10/common/mvref_common.h
@@ -296,15 +296,15 @@
 
 static INLINE uint8_t vp10_drl_ctx(const CANDIDATE_MV *ref_mv_stack,
                                    int ref_idx) {
-  if (ref_mv_stack[ref_idx].weight > REF_CAT_LEVEL &&
-      ref_mv_stack[ref_idx + 1].weight > REF_CAT_LEVEL) {
+  if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL &&
+      ref_mv_stack[ref_idx + 1].weight >= REF_CAT_LEVEL) {
     if (ref_mv_stack[ref_idx].weight == ref_mv_stack[ref_idx + 1].weight)
       return 0;
     else
       return 1;
   }
 
-  if (ref_mv_stack[ref_idx].weight > REF_CAT_LEVEL &&
+  if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL &&
       ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL)
     return 2;
 
@@ -316,7 +316,6 @@
       return 4;
   }
 
-  assert(0);
   return 0;
 }
 #endif
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index abb9e0b..97d091a 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -4828,6 +4828,20 @@
   arf_src_index = get_arf_src_index(cpi);
 
   if (arf_src_index) {
+    for (i = 0; i <= arf_src_index; ++i) {
+      struct lookahead_entry *e = vp10_lookahead_peek(cpi->lookahead, i);
+      // Avoid creating an alt-ref if there's a forced keyframe pending.
+      if (e == NULL) {
+        break;
+      } else if (e->flags & VPX_EFLAG_FORCE_KF) {  // flags is a bitmask
+        arf_src_index = 0;
+        flush = 1;
+        break;
+      }
+    }
+  }
+
+  if (arf_src_index) {
     assert(arf_src_index <= rc->frames_to_key);
 
     if ((source = vp10_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index 113865f..5ae44e8 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -23,7 +23,7 @@
  */
 static const unsigned int SSE_DIFF_THRESHOLD = 16 * 16 * 20;
 static const unsigned int SSE_THRESHOLD = 16 * 16 * 40;
-static const unsigned int SSE_THRESHOLD_HIGH = 16 * 16 * 60;
+static const unsigned int SSE_THRESHOLD_HIGH = 16 * 16 * 80;
 
 /*
  * The filter function was modified to reduce the computational complexity.
diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h
index 9a379a6..148ccda 100644
--- a/vp8/encoder/denoising.h
+++ b/vp8/encoder/denoising.h
@@ -18,8 +18,8 @@
 extern "C" {
 #endif
 
-#define SUM_DIFF_THRESHOLD (16 * 16 * 2)
-#define SUM_DIFF_THRESHOLD_HIGH (600)  // ~(16 * 16 * 1.5)
+#define SUM_DIFF_THRESHOLD 448
+#define SUM_DIFF_THRESHOLD_HIGH 512
 #define MOTION_MAGNITUDE_THRESHOLD (8*3)
 
 #define SUM_DIFF_THRESHOLD_UV (96)   // (8 * 8 * 1.5)
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 4f689c4..2a0c298 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -518,7 +518,6 @@
 
     cpi->b_multi_threaded = 0;
     cpi->encoding_thread_count = 0;
-    cpi->b_lpf_running = 0;
 
     pthread_mutex_init(&cpi->mt_mutex, NULL);
 
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 354bdfe..0efdac4 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1531,15 +1531,6 @@
     if (!oxcf)
         return;
 
-#if CONFIG_MULTITHREAD
-    /*  wait for the last picture loopfilter thread done */
-    if (cpi->b_lpf_running)
-    {
-        sem_wait(&cpi->h_event_end_lpf);
-        cpi->b_lpf_running = 0;
-    }
-#endif
-
     if (cm->version != oxcf->Version)
     {
         cm->version = oxcf->Version;
@@ -3589,15 +3580,6 @@
     /* Clear down mmx registers to allow floating point in what follows */
     vp8_clear_system_state();
 
-#if CONFIG_MULTITHREAD
-    /*  wait for the last picture loopfilter thread done */
-    if (cpi->b_lpf_running)
-    {
-        sem_wait(&cpi->h_event_end_lpf);
-        cpi->b_lpf_running = 0;
-    }
-#endif
-
     if(cpi->force_next_frame_intra)
     {
         cm->frame_type = KEY_FRAME;  /* delayed intra frame */
@@ -4326,8 +4308,6 @@
             vp8_setup_key_frame(cpi);
         }
 
-
-
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
         {
             if(cpi->oxcf.error_resilient_mode)
@@ -4793,7 +4773,6 @@
     {
         /* start loopfilter in separate thread */
         sem_post(&cpi->h_event_start_lpf);
-        cpi->b_lpf_running = 1;
     }
     else
 #endif
@@ -4825,11 +4804,10 @@
     vp8_pack_bitstream(cpi, dest, dest_end, size);
 
 #if CONFIG_MULTITHREAD
-    /* if PSNR packets are generated we have to wait for the lpf */
-    if (cpi->b_lpf_running && cpi->b_calculate_psnr)
+    /* wait for the lpf thread to finish */
+    if (cpi->b_multi_threaded)
     {
         sem_wait(&cpi->h_event_end_lpf);
-        cpi->b_lpf_running = 0;
     }
 #endif
 
@@ -5757,14 +5735,6 @@
     {
         int ret;
 
-#if CONFIG_MULTITHREAD
-        if(cpi->b_lpf_running)
-        {
-            sem_wait(&cpi->h_event_end_lpf);
-            cpi->b_lpf_running = 0;
-        }
-#endif
-
 #if CONFIG_POSTPROC
         cpi->common.show_frame_mi = cpi->common.mi;
         ret = vp8_post_proc_frame(&cpi->common, dest, flags);
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index b436548..86f401c 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -536,7 +536,6 @@
     int mt_sync_range;
     int b_multi_threaded;
     int encoding_thread_count;
-    int b_lpf_running;
 
     pthread_t *h_encoding_thread;
     pthread_t h_filter_thread;
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 0ea0632..51fbe54 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -50,7 +50,8 @@
 static const int skin_mean[5][2] =
     {{7463, 9614}, {6400, 10240}, {7040, 10240}, {8320, 9280}, {6800, 9614}};
 static const int skin_inv_cov[4] = {4107, 1663, 1663, 2157};  // q16
-static const int skin_threshold[2] = {1570636, 800000};       // q18
+static const int skin_threshold[6] = {1570636, 1400000, 800000, 800000, 800000,
+    800000};  // q18
 
 // Evaluates the Mahalanobis distance measure for the input CbCr values.
 static int evaluate_skin_color_difference(int cb, int cr, int idx) {
@@ -73,7 +74,7 @@
 }
 
 // Checks if the input yCbCr values corresponds to skin color.
-static int is_skin_color(int y, int cb, int cr)
+static int is_skin_color(int y, int cb, int cr, int consec_zeromv)
 {
   if (y < 40 || y > 220)
   {
@@ -88,13 +89,31 @@
     else
     {
       int i = 0;
-      for (; i < 5; i++)
-      {
-        if (evaluate_skin_color_difference(cb, cr, i) < skin_threshold[1])
-        {
-          return 1;
-        }
-      }
+      // No skin if block has been zero motion for a long consecutive time.
+      if (consec_zeromv > 80)
+        return 0;
+      // Exit on grey.
+       if (cb == 128 && cr == 128)
+         return 0;
+       // Exit on very strong cb.
+       if (cb > 150 && cr < 110)
+         return 0;
+       for (; i < 5; i++) {
+         int skin_color_diff = evaluate_skin_color_difference(cb, cr, i);
+         if (skin_color_diff < skin_threshold[i + 1]) {
+            if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2))
+              return 0;
+            else if (consec_zeromv > 30 &&
+                     skin_color_diff > (skin_threshold[i + 1] >> 1))
+              return 0;
+            else
+             return 1;
+         }
+         // Exit if difference is much larger than the threshold.
+         if (skin_color_diff > (skin_threshold[i + 1] << 3)) {
+           return 0;
+         }
+       }
       return 0;
     }
   }
@@ -851,8 +870,10 @@
         x->src.v_buffer[4 * x->src.uv_stride + 3] +
         x->src.v_buffer[4 * x->src.uv_stride + 4]) >> 2;
     x->is_skin = 0;
-    if (!cpi->oxcf.screen_content_mode)
-      x->is_skin = is_skin_color(y, cb, cr);
+    if (!cpi->oxcf.screen_content_mode) {
+      int block_index = mb_row * cpi->common.mb_cols + mb_col;
+      x->is_skin = is_skin_color(y, cb, cr, cpi->consec_zero_last[block_index]);
+    }
     }
 #if CONFIG_TEMPORAL_DENOISING
     if (cpi->oxcf.noise_sensitivity) {
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 9ce137d..32c7219 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -189,54 +189,31 @@
                                           uint8_t *dst, int stride,
                                           int eob) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  if (eob > 0) {
-    tran_low_t *const dqcoeff = pd->dqcoeff;
+  tran_low_t *const dqcoeff = pd->dqcoeff;
+  assert(eob > 0);
 #if CONFIG_VP9_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      if (xd->lossless) {
-        vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
-      } else {
-        switch (tx_size) {
-          case TX_4X4:
-            vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          case TX_8X8:
-            vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          case TX_16X16:
-            vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          case TX_32X32:
-            vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          default:
-            assert(0 && "Invalid transform size");
-        }
-      }
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (xd->lossless) {
+      vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
     } else {
-      if (xd->lossless) {
-        vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
-      } else {
-        switch (tx_size) {
-          case TX_4X4:
-            vp9_idct4x4_add(dqcoeff, dst, stride, eob);
-            break;
-          case TX_8X8:
-            vp9_idct8x8_add(dqcoeff, dst, stride, eob);
-            break;
-          case TX_16X16:
-            vp9_idct16x16_add(dqcoeff, dst, stride, eob);
-            break;
-          case TX_32X32:
-            vp9_idct32x32_add(dqcoeff, dst, stride, eob);
-            break;
-          default:
-            assert(0 && "Invalid transform size");
-            return;
-        }
+      switch (tx_size) {
+        case TX_4X4:
+          vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        case TX_8X8:
+          vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        case TX_16X16:
+          vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        case TX_32X32:
+          vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        default:
+          assert(0 && "Invalid transform size");
       }
     }
-#else
+  } else {
     if (xd->lossless) {
       vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
     } else {
@@ -258,18 +235,40 @@
           return;
       }
     }
+  }
+#else
+  if (xd->lossless) {
+    vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+  } else {
+    switch (tx_size) {
+      case TX_4X4:
+        vp9_idct4x4_add(dqcoeff, dst, stride, eob);
+        break;
+      case TX_8X8:
+        vp9_idct8x8_add(dqcoeff, dst, stride, eob);
+        break;
+      case TX_16X16:
+        vp9_idct16x16_add(dqcoeff, dst, stride, eob);
+        break;
+      case TX_32X32:
+        vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+        break;
+      default:
+        assert(0 && "Invalid transform size");
+        return;
+    }
+  }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    if (eob == 1) {
-      dqcoeff[0] = 0;
-    } else {
-      if (tx_size <= TX_16X16 && eob <= 10)
-        memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
-      else if (tx_size == TX_32X32 && eob <= 34)
-        memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
-      else
-        memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
-    }
+  if (eob == 1) {
+    dqcoeff[0] = 0;
+  } else {
+    if (tx_size <= TX_16X16 && eob <= 10)
+      memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+    else if (tx_size == TX_32X32 && eob <= 34)
+      memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
+    else
+      memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
   }
 }
 
@@ -279,54 +278,31 @@
                                           uint8_t *dst, int stride,
                                           int eob) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  if (eob > 0) {
-    tran_low_t *const dqcoeff = pd->dqcoeff;
+  tran_low_t *const dqcoeff = pd->dqcoeff;
+  assert(eob > 0);
 #if CONFIG_VP9_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      if (xd->lossless) {
-        vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
-      } else {
-        switch (tx_size) {
-          case TX_4X4:
-            vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          case TX_8X8:
-            vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          case TX_16X16:
-            vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          case TX_32X32:
-            vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          default:
-            assert(0 && "Invalid transform size");
-        }
-      }
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (xd->lossless) {
+      vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
     } else {
-      if (xd->lossless) {
-        vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
-      } else {
-        switch (tx_size) {
-          case TX_4X4:
-            vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
-            break;
-          case TX_8X8:
-            vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
-            break;
-          case TX_16X16:
-            vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
-            break;
-          case TX_32X32:
-            vp9_idct32x32_add(dqcoeff, dst, stride, eob);
-            break;
-          default:
-            assert(0 && "Invalid transform size");
-            return;
-        }
+      switch (tx_size) {
+        case TX_4X4:
+          vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        case TX_8X8:
+          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        case TX_16X16:
+          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        case TX_32X32:
+          vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        default:
+          assert(0 && "Invalid transform size");
       }
     }
-#else
+  } else {
     if (xd->lossless) {
       vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
     } else {
@@ -348,18 +324,40 @@
           return;
       }
     }
+  }
+#else
+  if (xd->lossless) {
+    vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+  } else {
+    switch (tx_size) {
+      case TX_4X4:
+        vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+        break;
+      case TX_8X8:
+        vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+        break;
+      case TX_16X16:
+        vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+        break;
+      case TX_32X32:
+        vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+        break;
+      default:
+        assert(0 && "Invalid transform size");
+        return;
+    }
+  }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    if (eob == 1) {
-      dqcoeff[0] = 0;
-    } else {
-      if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
-        memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
-      else if (tx_size == TX_32X32 && eob <= 34)
-        memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
-      else
-        memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
-    }
+  if (eob == 1) {
+    dqcoeff[0] = 0;
+  } else {
+    if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
+      memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+    else if (tx_size == TX_32X32 && eob <= 34)
+      memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
+    else
+      memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
   }
 }
 
@@ -389,8 +387,10 @@
         &vp9_default_scan_orders[tx_size] : &vp9_scan_orders[tx_size][tx_type];
     const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size,
                                             r, mi->segment_id);
-    inverse_transform_block_intra(xd, plane, tx_type, tx_size,
-                                  dst, pd->dst.stride, eob);
+    if (eob > 0) {
+      inverse_transform_block_intra(xd, plane, tx_type, tx_size,
+                                    dst, pd->dst.stride, eob);
+    }
   }
 }
 
@@ -402,9 +402,11 @@
   const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size, r,
                                           mi->segment_id);
 
-  inverse_transform_block_inter(xd, plane, tx_size,
-                            &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
-                            pd->dst.stride, eob);
+  if (eob > 0) {
+    inverse_transform_block_inter(
+        xd, plane, tx_size, &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
+        pd->dst.stride, eob);
+  }
   return eob;
 }
 
@@ -859,7 +861,7 @@
                          VPX_CODEC_CORRUPT_FRAME, "Invalid block size.");
   }
 
-  vpx_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
+  vp9_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
 
   if (mi->skip) {
     dec_reset_skip_context(xd);
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 8604420..596427c 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -149,17 +149,12 @@
 }
 
 static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                                 int mi_row, int mi_col, vpx_reader *r) {
+                                 int mi_row, int mi_col, vpx_reader *r,
+                                 int x_mis, int y_mis) {
   struct segmentation *const seg = &cm->seg;
   MODE_INFO *const mi = xd->mi[0];
   int predicted_segment_id, segment_id;
   const int mi_offset = mi_row * cm->mi_cols + mi_col;
-  const int bw = xd->plane[0].n4_w >> 1;
-  const int bh = xd->plane[0].n4_h >> 1;
-
-  // TODO(slavarnway): move x_mis, y_mis into xd ?????
-  const int x_mis = VPXMIN(cm->mi_cols - mi_col, bw);
-  const int y_mis = VPXMIN(cm->mi_rows - mi_row, bh);
 
   if (!seg->enabled)
     return 0;  // Default for disabled segmentation
@@ -202,19 +197,14 @@
 
 static void read_intra_frame_mode_info(VP9_COMMON *const cm,
                                        MACROBLOCKD *const xd,
-                                       int mi_row, int mi_col, vpx_reader *r) {
+                                       int mi_row, int mi_col, vpx_reader *r,
+                                       int x_mis, int y_mis) {
   MODE_INFO *const mi = xd->mi[0];
   const MODE_INFO *above_mi = xd->above_mi;
   const MODE_INFO *left_mi  = xd->left_mi;
   const BLOCK_SIZE bsize = mi->sb_type;
   int i;
   const int mi_offset = mi_row * cm->mi_cols + mi_col;
-  const int bw = xd->plane[0].n4_w >> 1;
-  const int bh = xd->plane[0].n4_h >> 1;
-
-  // TODO(slavarnway): move x_mis, y_mis into xd ?????
-  const int x_mis = VPXMIN(cm->mi_cols - mi_col, bw);
-  const int y_mis = VPXMIN(cm->mi_rows - mi_row, bh);
 
   mi->segment_id = read_intra_segment_id(cm, mi_offset, x_mis, y_mis, r);
   mi->skip = read_skip(cm, xd, mi->segment_id, r);
@@ -473,14 +463,13 @@
   }
 }
 
-static void dec_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, int_mv *mvlist,
-                                  int_mv *best_mv, int refmv_count) {
+static void dec_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *best_mv,
+                                  int refmv_count) {
   int i;
 
   // Make sure all the candidates are properly clamped etc
   for (i = 0; i < refmv_count; ++i) {
     lower_mv_precision(&mvlist[i].as_mv, allow_hp);
-    clamp_mv2(&mvlist[i].as_mv, xd);
     *best_mv = mvlist[i];
   }
 }
@@ -788,7 +777,7 @@
                                        tmp_mvs, mi_row, mi_col, -1, 0,
                                        fpm_sync, (void *)pbi);
 
-        dec_find_best_ref_mvs(xd, allow_hp, tmp_mvs, &best_ref_mvs[ref],
+        dec_find_best_ref_mvs(allow_hp, tmp_mvs, &best_ref_mvs[ref],
                               refmv_count);
       }
     }
@@ -839,12 +828,14 @@
 
 static void read_inter_frame_mode_info(VP9Decoder *const pbi,
                                        MACROBLOCKD *const xd,
-                                       int mi_row, int mi_col, vpx_reader *r) {
+                                       int mi_row, int mi_col, vpx_reader *r,
+                                       int x_mis, int y_mis) {
   VP9_COMMON *const cm = &pbi->common;
   MODE_INFO *const mi = xd->mi[0];
   int inter_block;
 
-  mi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
+  mi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r, x_mis,
+                                         y_mis);
   mi->skip = read_skip(cm, xd, mi->segment_id, r);
   inter_block = read_is_inter_block(cm, xd, mi->segment_id, r);
   mi->tx_size = read_tx_size(cm, xd, !mi->skip || !inter_block, r);
@@ -860,7 +851,7 @@
   memcpy(dst, src, sizeof(*dst) * 2);
 }
 
-void vpx_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
+void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
                         int mi_row, int mi_col, vpx_reader *r,
                         int x_mis, int y_mis) {
   VP9_COMMON *const cm = &pbi->common;
@@ -869,9 +860,9 @@
   int w, h;
 
   if (frame_is_intra_only(cm)) {
-    read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
+    read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r, x_mis, y_mis);
   } else {
-    read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r);
+    read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
 
     for (h = 0; h < y_mis; ++h) {
       for (w = 0; w < x_mis; ++w) {
diff --git a/vp9/decoder/vp9_decodemv.h b/vp9/decoder/vp9_decodemv.h
index 75f568c..45569ec 100644
--- a/vp9/decoder/vp9_decodemv.h
+++ b/vp9/decoder/vp9_decodemv.h
@@ -19,7 +19,7 @@
 extern "C" {
 #endif
 
-void vpx_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
+void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
                         int mi_row, int mi_col, vpx_reader *r,
                         int x_mis, int y_mis);
 
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index ac834ca..b27ce6a 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -192,11 +192,16 @@
                                      p[2].src.buf,
                                      p[0].src.stride,
                                      p[1].src.stride,
-                                     bsize);
+                                     bsize,
+                                     0,
+                                     0);
     if (is_skin)
       refresh_this_block = 1;
   }
 
+  if (cpi->oxcf.rc_mode == VPX_VBR && mi->ref_frame[0] == GOLDEN_FRAME)
+    refresh_this_block = 0;
+
   // If this block is labeled for refresh, check if we should reset the
   // segment_id.
   if (cyclic_refresh_segment_id_boosted(mi->segment_id)) {
@@ -304,6 +309,8 @@
     rc->baseline_gf_interval = VPXMIN(4 * (100 / cr->percent_refresh), 40);
   else
     rc->baseline_gf_interval = 40;
+  if (cpi->oxcf.rc_mode == VPX_VBR)
+    rc->baseline_gf_interval = 20;
 }
 
 // Update some encoding stats (from the just encoded frame). If this frame's
@@ -316,42 +323,40 @@
   int mi_row, mi_col;
   double fraction_low = 0.0;
   int low_content_frame = 0;
-
   MODE_INFO **mi = cm->mi_grid_visible;
   RATE_CONTROL *const rc = &cpi->rc;
   const int rows = cm->mi_rows, cols = cm->mi_cols;
   int cnt1 = 0, cnt2 = 0;
   int force_gf_refresh = 0;
-
+  int flag_force_gf_high_motion = 0;
   for (mi_row = 0; mi_row < rows; mi_row++) {
     for (mi_col = 0; mi_col < cols; mi_col++) {
-      int16_t abs_mvr = mi[0]->mv[0].as_mv.row >= 0 ?
-          mi[0]->mv[0].as_mv.row : -1 * mi[0]->mv[0].as_mv.row;
-      int16_t abs_mvc = mi[0]->mv[0].as_mv.col >= 0 ?
-          mi[0]->mv[0].as_mv.col : -1 * mi[0]->mv[0].as_mv.col;
-
-      // Calculate the motion of the background.
-      if (abs_mvr <= 16 && abs_mvc <= 16) {
-        cnt1++;
-        if (abs_mvr == 0 && abs_mvc == 0)
-          cnt2++;
+      if (flag_force_gf_high_motion == 1) {
+        int16_t abs_mvr = mi[0]->mv[0].as_mv.row >= 0 ?
+            mi[0]->mv[0].as_mv.row : -1 * mi[0]->mv[0].as_mv.row;
+        int16_t abs_mvc = mi[0]->mv[0].as_mv.col >= 0 ?
+            mi[0]->mv[0].as_mv.col : -1 * mi[0]->mv[0].as_mv.col;
+        // Calculate the motion of the background.
+        if (abs_mvr <= 16 && abs_mvc <= 16) {
+          cnt1++;
+          if (abs_mvr == 0 && abs_mvc == 0)
+            cnt2++;
+        }
       }
       mi++;
-
       // Accumulate low_content_frame.
       if (cr->map[mi_row * cols + mi_col] < 1)
         low_content_frame++;
     }
     mi += 8;
   }
-
   // For video conference clips, if the background has high motion in current
   // frame because of the camera movement, set this frame as the golden frame.
   // Use 70% and 5% as the thresholds for golden frame refreshing.
   // Also, force this frame as a golden update frame if this frame will change
   // the resolution (resize_pending != 0).
   if (cpi->resize_pending != 0 ||
-     (cnt1 * 10 > (70 * rows * cols) && cnt2 * 20 < cnt1)) {
+     (cnt1 * 100 > (70 * rows * cols) && cnt2 * 20 < cnt1)) {
     vp9_cyclic_refresh_set_golden_update(cpi);
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
 
@@ -360,7 +365,6 @@
     cpi->refresh_golden_frame = 1;
     force_gf_refresh = 1;
   }
-
   fraction_low =
       (double)low_content_frame / (rows * cols);
   // Update average.
@@ -503,6 +507,18 @@
     cr->motion_thresh = 4;
     cr->rate_boost_fac = 12;
   }
+  if (cpi->oxcf.rc_mode == VPX_VBR) {
+    // To be adjusted for VBR mode, e.g., based on gf period and boost.
+    // For now use smaller qp-delta (than CBR), no second boosted seg, and
+    // turn-off (no refresh) on golden refresh (since it's already boosted).
+    cr->percent_refresh = 10;
+    cr->rate_ratio_qdelta = 1.5;
+    cr->rate_boost_fac = 10;
+    if (cpi->refresh_golden_frame == 1) {
+      cr->percent_refresh = 0;
+      cr->rate_ratio_qdelta = 1.0;
+    }
+  }
 }
 
 // Setup cyclic background refresh: set delta q and segmentation map.
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index e419cff..9bc9f26 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -338,7 +338,9 @@
                                      mb->plane[2].src.buf,
                                      mb->plane[0].src.stride,
                                      mb->plane[1].src.stride,
-                                     bs);
+                                     bs,
+                                     0,
+                                     0);
   }
 
   mv_col = ctx->best_sse_mv.as_mv.col;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index cf1fe81..9076b31 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -243,14 +243,16 @@
 static void duplicate_mode_info_in_sb(VP9_COMMON *cm, MACROBLOCKD *xd,
                                       int mi_row, int mi_col,
                                       BLOCK_SIZE bsize) {
-  const int block_width = num_8x8_blocks_wide_lookup[bsize];
-  const int block_height = num_8x8_blocks_high_lookup[bsize];
+  const int block_width = VPXMIN(num_8x8_blocks_wide_lookup[bsize],
+                                 cm->mi_cols - mi_col);
+  const int block_height = VPXMIN(num_8x8_blocks_high_lookup[bsize],
+                                  cm->mi_rows - mi_row);
+  const int mi_stride = xd->mi_stride;
+  MODE_INFO *const src_mi = xd->mi[0];
   int i, j;
   for (j = 0; j < block_height; ++j)
-    for (i = 0; i < block_width; ++i) {
-      if (mi_row + j < cm->mi_rows && mi_col + i < cm->mi_cols)
-        xd->mi[j * xd->mi_stride + i] = xd->mi[0];
-    }
+    for (i = 0; i < block_width; ++i)
+      xd->mi[j * mi_stride + i] = src_mi;
 }
 
 static void set_block_size(VP9_COMP * const cpi,
@@ -691,21 +693,17 @@
   const int use_4x4_partition = cm->frame_type == KEY_FRAME;
   const int low_res = (cm->width <= 352 && cm->height <= 288);
   int variance4x4downsample[16];
+  int segment_id;
 
-  int segment_id = CR_SEGMENT_ID_BASE;
+  set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
+  segment_id = xd->mi[0]->segment_id;
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
-    const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map :
-                                                    cm->last_frame_seg_map;
-    segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
-
     if (cyclic_refresh_segment_id_boosted(segment_id)) {
       int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
       set_vbp_thresholds(cpi, thresholds, q);
     }
   }
 
-  set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
-
   if (xd->mb_to_right_edge < 0)
     pixels_wide += (xd->mb_to_right_edge >> 3);
   if (xd->mb_to_bottom_edge < 0)
@@ -770,37 +768,59 @@
       x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv;
     }
 
+    set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
 
     // Check if most of the superblock is skin content, and if so, force split
-    // to 32x32. Avoid checking superblocks on/near boundary and avoid low
-    // resolutons for now.
+    // to 32x32, and set x->sb_is_skin for use in mode selection.
+    // Avoid checking superblocks on/near boundary and avoid low resolutions.
     // Note superblock may still pick 64X64 if y_sad is very small
     // (i.e., y_sad < cpi->vbp_threshold_sad) below. For now leave this as is.
     x->sb_is_skin = 0;
 #if !CONFIG_VP9_HIGHBITDEPTH
     if (cpi->use_skin_detection && !low_res && (mi_col >= 8 &&
         mi_col + 8 < cm->mi_cols && mi_row >= 8 && mi_row + 8 < cm->mi_rows)) {
+      CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+      int bl_index1, bl_index2, bl_index3;
       int num_16x16_skin = 0;
       int num_16x16_nonskin = 0;
+      int is_skin = 0;
+      int consec_zeromv = 0;
       uint8_t *ysignal = x->plane[0].src.buf;
       uint8_t *usignal = x->plane[1].src.buf;
       uint8_t *vsignal = x->plane[2].src.buf;
       int spuv = x->plane[1].src.stride;
-      for (i = 0; i < 4; i++) {
-        for (j = 0; j < 4; j++) {
-          int is_skin = vp9_compute_skin_block(ysignal,
-                                               usignal,
-                                               vsignal,
-                                               sp,
-                                               spuv,
-                                               BLOCK_16X16);
+      const int block_index = mi_row * cm->mi_cols + mi_col;
+      const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+      const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+      const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+      const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+      // Loop through the 16x16 sub-blocks.
+      int j, i;
+      for (i = 0; i < ymis; i+=2) {
+        for (j = 0; j < xmis; j+=2) {
+          int bl_index = block_index + i * cm->mi_cols + j;
+          bl_index1 = bl_index + 1;
+          bl_index2 = bl_index + cm->mi_cols;
+          bl_index3 = bl_index2 + 1;
+          consec_zeromv = VPXMIN(cr->consec_zero_mv[bl_index],
+                                 VPXMIN(cr->consec_zero_mv[bl_index1],
+                                 VPXMIN(cr->consec_zero_mv[bl_index2],
+                                 cr->consec_zero_mv[bl_index3])));
+          is_skin = vp9_compute_skin_block(ysignal,
+                                           usignal,
+                                           vsignal,
+                                           sp,
+                                           spuv,
+                                           BLOCK_16X16,
+                                           consec_zeromv,
+                                           0);
           num_16x16_skin += is_skin;
           num_16x16_nonskin += (1 - is_skin);
           if (num_16x16_nonskin > 3) {
             // Exit loop if at least 4 of the 16x16 blocks are not skin.
-            i = 4;
-            j = 4;
+            i = ymis;
+            j = xmis;
           }
           ysignal += 16;
           usignal += 8;
@@ -2439,7 +2459,8 @@
   PARTITION_CONTEXT sl[8], sa[8];
   TOKENEXTRA *tp_orig = *tp;
   PICK_MODE_CONTEXT *ctx = &pc_tree->none;
-  int i, pl;
+  int i;
+  const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
   BLOCK_SIZE subsize;
   RD_COST this_rdc, sum_rdc, best_rdc;
   int do_split = bsize >= BLOCK_8X8;
@@ -2587,7 +2608,6 @@
                      &this_rdc, bsize, ctx, best_rdc.rdcost);
     if (this_rdc.rate != INT_MAX) {
       if (bsize >= BLOCK_8X8) {
-        pl = partition_plane_context(xd, mi_row, mi_col, bsize);
         this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
         this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                                  this_rdc.rate, this_rdc.dist);
@@ -2706,7 +2726,6 @@
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost && i == 4) {
-      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
       sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                               sum_rdc.rate, sum_rdc.dist);
@@ -2772,7 +2791,6 @@
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
       sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
       if (sum_rdc.rdcost < best_rdc.rdcost) {
@@ -2824,7 +2842,6 @@
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
       sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                               sum_rdc.rate, sum_rdc.dist);
@@ -4271,13 +4288,9 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO **mi_8x8 = xd->mi;
-  MODE_INFO *mi = mi_8x8[0];
+  MODE_INFO *mi = xd->mi[0];
   const int seg_skip = segfeature_active(&cm->seg, mi->segment_id,
                                          SEG_LVL_SKIP);
-  const int mis = cm->mi_stride;
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
 
   x->skip_recode = !x->select_tx_size && mi->sb_type >= BLOCK_8X8 &&
                    cpi->oxcf.aq_mode != COMPLEXITY_AQ &&
@@ -4333,20 +4346,14 @@
       ++get_tx_counts(max_txsize_lookup[bsize], get_tx_size_context(xd),
                       &td->counts->tx)[mi->tx_size];
     } else {
-      int x, y;
-      TX_SIZE tx_size;
       // The new intra coding scheme requires no change of transform size
       if (is_inter_block(mi)) {
-        tx_size = VPXMIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
-                         max_txsize_lookup[bsize]);
+        mi->tx_size = VPXMIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
+                             max_txsize_lookup[bsize]);
       } else {
-        tx_size = (bsize >= BLOCK_8X8) ? mi->tx_size : TX_4X4;
+        mi->tx_size = (bsize >= BLOCK_8X8) ? mi->tx_size : TX_4X4;
       }
 
-      for (y = 0; y < mi_height; y++)
-        for (x = 0; x < mi_width; x++)
-          if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows)
-            mi_8x8[mis * y + x]->tx_size = tx_size;
     }
     ++td->counts->tx.tx_totals[mi->tx_size];
     ++td->counts->tx.tx_totals[get_uv_tx_size(mi, &xd->plane[1])];
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 8f4d80c..71f27cc 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -75,11 +75,12 @@
 static void build_nmv_component_cost_table(int *mvcost,
                                            const nmv_component* const mvcomp,
                                            int usehp) {
-  int i, v;
   int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
   int bits_cost[MV_OFFSET_BITS][2];
   int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE];
   int class0_hp_cost[2], hp_cost[2];
+  int i;
+  int c, o;
 
   sign_cost[0] = vp9_cost_zero(mvcomp->sign);
   sign_cost[1] = vp9_cost_one(mvcomp->sign);
@@ -94,44 +95,56 @@
     vp9_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], vp9_mv_fp_tree);
   vp9_cost_tokens(fp_cost, mvcomp->fp, vp9_mv_fp_tree);
 
-  if (usehp) {
-    class0_hp_cost[0] = vp9_cost_zero(mvcomp->class0_hp);
-    class0_hp_cost[1] = vp9_cost_one(mvcomp->class0_hp);
-    hp_cost[0] = vp9_cost_zero(mvcomp->hp);
-    hp_cost[1] = vp9_cost_one(mvcomp->hp);
-  }
+  // Always build the hp costs to avoid an uninitialized warning from gcc
+  class0_hp_cost[0] = vp9_cost_zero(mvcomp->class0_hp);
+  class0_hp_cost[1] = vp9_cost_one(mvcomp->class0_hp);
+  hp_cost[0] = vp9_cost_zero(mvcomp->hp);
+  hp_cost[1] = vp9_cost_one(mvcomp->hp);
+
   mvcost[0] = 0;
-  for (v = 1; v <= MV_MAX; ++v) {
-    int z, c, o, d, e, f, cost = 0;
-    z = v - 1;
-    c = vp9_get_mv_class(z, &o);
-    cost += class_cost[c];
+  // MV_CLASS_0
+  for (o = 0; o < (CLASS0_SIZE << 3); ++o) {
+    int d, e, f;
+    int cost = class_cost[MV_CLASS_0];
+    int v = o + 1;
     d = (o >> 3);               /* int mv data */
     f = (o >> 1) & 3;           /* fractional pel mv data */
-    e = (o & 1);                /* high precision mv data */
-    if (c == MV_CLASS_0) {
-      cost += class0_cost[d];
-    } else {
-      int i, b;
-      b = c + CLASS0_BITS - 1;  /* number of bits */
-      for (i = 0; i < b; ++i)
-        cost += bits_cost[i][((d >> i) & 1)];
-    }
-    if (c == MV_CLASS_0) {
-      cost += class0_fp_cost[d][f];
-    } else {
-      cost += fp_cost[f];
-    }
+    cost += class0_cost[d];
+    cost += class0_fp_cost[d][f];
     if (usehp) {
-      if (c == MV_CLASS_0) {
-        cost += class0_hp_cost[e];
-      } else {
-        cost += hp_cost[e];
-      }
+      e = (o & 1);                /* high precision mv data */
+      cost += class0_hp_cost[e];
     }
     mvcost[v] = cost + sign_cost[0];
     mvcost[-v] = cost + sign_cost[1];
   }
+  for (c = MV_CLASS_1; c < MV_CLASSES; ++c) {
+    int d;
+    for (d = 0; d < (1 << c); ++d) {
+      int f;
+      int whole_cost = class_cost[c];
+      int b = c + CLASS0_BITS - 1;  /* number of bits */
+      for (i = 0; i < b; ++i)
+        whole_cost += bits_cost[i][((d >> i) & 1)];
+      for (f = 0; f < 4; ++f) {
+        int cost = whole_cost + fp_cost[f];
+        int v = (CLASS0_SIZE << (c + 2)) + d * 8 + f * 2 /* + e */ + 1;
+        if (usehp) {
+          mvcost[v] = cost + hp_cost[0] + sign_cost[0];
+          mvcost[-v] = cost + hp_cost[0] + sign_cost[1];
+          if (v + 1 > MV_MAX) break;
+          mvcost[v + 1] = cost + hp_cost[1] + sign_cost[0];
+          mvcost[-v - 1] = cost + hp_cost[1] + sign_cost[1];
+        } else {
+          mvcost[v] = cost + sign_cost[0];
+          mvcost[-v] = cost + sign_cost[1];
+          if (v + 1 > MV_MAX) break;
+          mvcost[v + 1] = cost + sign_cost[0];
+          mvcost[-v - 1] = cost + sign_cost[1];
+        }
+      }
+    }
+  }
 }
 
 static int update_mv(vpx_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index e8a8b89..01855ea 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -410,6 +410,9 @@
   memset(&cpi->svc.scaled_frames[0], 0,
          MAX_LAG_BUFFERS * sizeof(cpi->svc.scaled_frames[0]));
 
+  vpx_free_frame_buffer(&cpi->svc.scaled_temp);
+  memset(&cpi->svc.scaled_temp, 0, sizeof(cpi->svc.scaled_temp));
+
   vpx_free_frame_buffer(&cpi->svc.empty_frame.img);
   memset(&cpi->svc.empty_frame, 0, sizeof(cpi->svc.empty_frame));
 
@@ -2451,6 +2454,13 @@
   return scale;
 }
 
+static int big_rate_miss(VP9_COMP *cpi, int high_limit, int low_limit) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+
+  return (rc->projected_frame_size > ((high_limit * 3) / 2)) ||
+         (rc->projected_frame_size < (low_limit / 2));
+}
+
 // Function to test for conditions that indicate we should loop
 // back and recode a frame.
 static int recode_loop_test(VP9_COMP *cpi,
@@ -2462,6 +2472,7 @@
   int force_recode = 0;
 
   if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+      big_rate_miss(cpi, high_limit, low_limit) ||
       (cpi->sf.recode_loop == ALLOW_RECODE) ||
       (frame_is_kfgfarf &&
        (cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
@@ -2808,9 +2819,38 @@
 
   vpx_clear_system_state();
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth) {
+    recon_err = vpx_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+  } else {
+    recon_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+  }
+#else
   recon_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-  if (cpi->twopass.total_left_stats.coded_error != 0.0)
+
+  if (cpi->twopass.total_left_stats.coded_error != 0.0) {
+    double dc_quant_devisor;
+#if CONFIG_VP9_HIGHBITDEPTH
+    switch (cm->bit_depth) {
+      case VPX_BITS_8:
+        dc_quant_devisor = 4.0;
+        break;
+      case VPX_BITS_10:
+        dc_quant_devisor = 16.0;
+        break;
+      case VPX_BITS_12:
+        dc_quant_devisor = 64.0;
+        break;
+      default:
+        assert(0 && "bit_depth must be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+        break;
+    }
+#else
+    dc_quant_devisor = 4.0;
+#endif
+
     fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d"
        "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
        "%10"PRId64" %10"PRId64" %10d "
@@ -2836,7 +2876,8 @@
         (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target),
         cpi->rc.total_actual_bits, cm->base_qindex,
         vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth),
-        (double)vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) / 4.0,
+        (double)vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) /
+            dc_quant_devisor,
         vp9_convert_qindex_to_q(cpi->twopass.active_worst_quality,
                                 cm->bit_depth),
         cpi->rc.avg_q,
@@ -2851,7 +2892,7 @@
         cpi->twopass.kf_zeromotion_pct,
         cpi->twopass.fr_content_type,
         cm->lf.filter_level);
-
+  }
   fclose(f);
 
   if (0) {
@@ -3089,17 +3130,30 @@
   vpx_clear_system_state();
 
   set_frame_size(cpi);
-  cpi->Source = vp9_scale_if_required(cm,
-                                      cpi->un_scaled_source,
-                                      &cpi->scaled_source,
-                                      (cpi->oxcf.pass == 0));
 
+  if (is_one_pass_cbr_svc(cpi) &&
+      cpi->un_scaled_source->y_width == cm->width << 2 &&
+      cpi->un_scaled_source->y_height == cm->height << 2 &&
+      cpi->svc.scaled_temp.y_width == cm->width << 1 &&
+      cpi->svc.scaled_temp.y_height == cm->height << 1) {
+    cpi->Source = vp9_svc_twostage_scale(cm,
+                                         cpi->un_scaled_source,
+                                         &cpi->scaled_source,
+                                         &cpi->svc.scaled_temp);
+  } else {
+    cpi->Source = vp9_scale_if_required(cm,
+                                        cpi->un_scaled_source,
+                                        &cpi->scaled_source,
+                                        (cpi->oxcf.pass == 0));
+  }
   // Avoid scaling last_source unless its needed.
-  // Last source is currently only used for screen-content mode,
-  // if partition_search_type == SOURCE_VAR_BASED_PARTITION, or if noise
+  // Last source is needed if vp9_avg_source_sad() is used, or if
+  // partition_search_type == SOURCE_VAR_BASED_PARTITION, or if noise
   // estimation is enabled.
   if (cpi->unscaled_last_source != NULL &&
       (cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
+      (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_VBR &&
+      cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5) ||
       cpi->sf.partition_search_type == SOURCE_VAR_BASED_PARTITION ||
       cpi->noise_estimate.enabled))
     cpi->Last_Source = vp9_scale_if_required(cm,
@@ -3109,18 +3163,18 @@
   vp9_update_noise_estimate(cpi);
 
   if (cpi->oxcf.pass == 0 &&
-      cpi->oxcf.rc_mode == VPX_CBR &&
+      cpi->oxcf.mode == REALTIME &&
+      cpi->oxcf.speed >= 5 &&
       cpi->resize_state == 0 &&
       cm->frame_type != KEY_FRAME &&
-      cpi->oxcf.content == VP9E_CONTENT_SCREEN)
+      (cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
+       cpi->oxcf.rc_mode == VPX_VBR))
     vp9_avg_source_sad(cpi);
 
-  // TODO(wonkap/marpan): For 1 pass SVC, since only ZERMOV is allowed for
-  // upsampled reference frame (i.e, svc->force_zero_mode_spatial_ref = 0),
-  // we should be able to avoid this frame-level upsampling.
-  // Keeping it for now as there is an asan error in the multi-threaded SVC
-  // rate control test if this upsampling is removed.
-  if (frame_is_intra_only(cm) == 0) {
+  // For 1 pass SVC, since only ZEROMV is allowed for upsampled reference
+  // frame (i.e., svc->force_zero_mode_spatial_ref = 0), we can avoid this
+  // frame-level upsampling.
+  if (frame_is_intra_only(cm) == 0 && !is_one_pass_cbr_svc(cpi)) {
     vp9_scale_references(cpi);
   }
 
@@ -3510,6 +3564,25 @@
   }
 }
 
+YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm,
+                                           YV12_BUFFER_CONFIG *unscaled,
+                                           YV12_BUFFER_CONFIG *scaled,
+                                           YV12_BUFFER_CONFIG *scaled_temp) {
+  if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
+      cm->mi_rows * MI_SIZE != unscaled->y_height) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth);
+    scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth);
+#else
+    vp9_scale_and_extend_frame(unscaled, scaled_temp);
+    vp9_scale_and_extend_frame(scaled_temp, scaled);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    return scaled;
+  } else {
+    return unscaled;
+  }
+}
+
 YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
                                           YV12_BUFFER_CONFIG *unscaled,
                                           YV12_BUFFER_CONFIG *scaled,
@@ -3680,6 +3753,12 @@
       ++cm->current_video_frame;
       cpi->ext_refresh_frame_flags_pending = 0;
       cpi->svc.rc_drop_superframe = 1;
+      // TODO(marpan): Advancing the svc counters on dropped frames can break
+      // the referencing scheme for the fixed svc patterns defined in
+      // vp9_one_pass_cbr_svc_start_layer(). Look into fixing this issue, but
+      // for now, don't advance the svc frame counters on dropped frame.
+      // if (cpi->use_svc)
+      //   vp9_inc_frame_in_layer(cpi);
       return;
     }
   }
@@ -4106,6 +4185,20 @@
     arf_src_index = 0;
 
   if (arf_src_index) {
+    for (i = 0; i <= arf_src_index; ++i) {
+      struct lookahead_entry *e = vp9_lookahead_peek(cpi->lookahead, i);
+      // Avoid creating an alt-ref if there's a forced keyframe pending.
+      if (e == NULL) {
+        break;
+      } else if (e->flags == VPX_EFLAG_FORCE_KF) {
+        arf_src_index = 0;
+        flush = 1;
+        break;
+      }
+    }
+  }
+
+  if (arf_src_index) {
     assert(arf_src_index <= rc->frames_to_key);
 
     if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 017fa61..02d223a 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -614,6 +614,11 @@
 
 void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
 
+YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm,
+                                           YV12_BUFFER_CONFIG *unscaled,
+                                           YV12_BUFFER_CONFIG *scaled,
+                                           YV12_BUFFER_CONFIG *scaled_temp);
+
 YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
                                           YV12_BUFFER_CONFIG *unscaled,
                                           YV12_BUFFER_CONFIG *scaled,
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 9d3b154..10fd6c0 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -490,7 +490,35 @@
   cpi->rc.frames_to_key = INT_MAX;
 }
 
+// This threshold is used to track blocks where to all intents and purposes
+// the intra prediction error is 0. Though the metric we test against
+// is technically a sse we are mainly interested in blocks where all the pixels
+// in the 8 bit domain have an error of <= 1 (where error = sse) so a
+// linear scaling for 10 and 12 bit gives similar results.
 #define UL_INTRA_THRESH 50
+#if CONFIG_VP9_HIGHBITDEPTH
+static int get_ul_intra_threshold(VP9_COMMON *cm) {
+  int ret_val = UL_INTRA_THRESH;
+  if (cm->use_highbitdepth) {
+    switch (cm->bit_depth) {
+      case VPX_BITS_8:
+        ret_val = UL_INTRA_THRESH;
+        break;
+      case VPX_BITS_10:
+        ret_val = UL_INTRA_THRESH >> 2;
+        break;
+      case VPX_BITS_12:
+        ret_val = UL_INTRA_THRESH >> 4;
+        break;
+      default:
+        assert(0 && "cm->bit_depth should be VPX_BITS_8, "
+                    "VPX_BITS_10 or VPX_BITS_12");
+    }
+  }
+  return ret_val;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 #define INVALID_ROW -1
 void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
   int mb_row, mb_col;
@@ -681,7 +709,11 @@
       // domain). In natural videos this is uncommon, but it is much more
       // common in animations, graphics and screen content, so may be used
       // as a signal to detect these types of content.
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (this_error < get_ul_intra_threshold(cm)) {
+#else
       if (this_error < UL_INTRA_THRESH) {
+#endif
         ++intra_skip_count;
       } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) {
         image_data_start_row = mb_row;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 8b7825e..4669145 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -383,6 +383,51 @@
                          (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
 }
 
+int vp9_skip_sub_pixel_tree(
+    const MACROBLOCK *x,
+    MV *bestmv, const MV *ref_mv,
+    int allow_hp,
+    int error_per_bit,
+    const vp9_variance_fn_ptr_t *vfp,
+    int forced_stop,
+    int iters_per_step,
+    int *cost_list,
+    int *mvjcost, int *mvcost[2],
+    int *distortion,
+    unsigned int *sse1,
+    const uint8_t *second_pred,
+    int w, int h) {
+  SETUP_SUBPEL_SEARCH;
+  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                               z, src_stride, y, y_stride, second_pred,
+                               w, h, offset, mvjcost, mvcost,
+                               sse1, distortion);
+  (void) halfiters;
+  (void) quarteriters;
+  (void) eighthiters;
+  (void) whichdir;
+  (void) allow_hp;
+  (void) forced_stop;
+  (void) hstep;
+  (void) rr;
+  (void) rc;
+  (void) minr;
+  (void) minc;
+  (void) maxr;
+  (void) maxc;
+  (void) tr;
+  (void) tc;
+  (void) sse;
+  (void) thismse;
+  (void) cost_list;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
 int vp9_find_best_sub_pixel_tree_pruned_evenmore(
     const MACROBLOCK *x,
     MV *bestmv, const MV *ref_mv,
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 1c101f2..1b0c860 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -92,6 +92,7 @@
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned;
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_more;
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_evenmore;
+extern fractional_mv_step_fp vp9_skip_sub_pixel_tree;
 
 typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x,
                                     const MV *ref_mv, int sad_per_bit,
diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c
index e56cc9b..d505629 100644
--- a/vp9/encoder/vp9_noise_estimate.c
+++ b/vp9/encoder/vp9_noise_estimate.c
@@ -173,12 +173,18 @@
           // been encoded as zero/low motion x (= thresh_consec_zeromv) frames
           // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all
           // 4 sub-blocks for 16x16 block. Also, avoid skin blocks.
+          int consec_zeromv = VPXMIN(cr->consec_zero_mv[bl_index],
+                                     VPXMIN(cr->consec_zero_mv[bl_index1],
+                                     VPXMIN(cr->consec_zero_mv[bl_index2],
+                                     cr->consec_zero_mv[bl_index3])));
           int is_skin = vp9_compute_skin_block(src_y,
                                                src_u,
                                                src_v,
                                                src_ystride,
                                                src_uvstride,
-                                               bsize);
+                                               bsize,
+                                               consec_zeromv,
+                                               0);
           if (frame_low_motion &&
               cr->consec_zero_mv[bl_index] > thresh_consec_zeromv &&
               cr->consec_zero_mv[bl_index1] > thresh_consec_zeromv &&
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index d861f80..3ea2ccd 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1279,14 +1279,21 @@
     usable_ref_frame = GOLDEN_FRAME;
   }
 
-  // If the reference is temporally aligned with current superframe
-  // (e.g., spatial reference within superframe), constrain the inter mode:
-  // for now only test zero motion.
-  if (cpi->use_svc && svc ->force_zero_mode_spatial_ref) {
-    if (svc->ref_frame_index[cpi->lst_fb_idx] == svc->current_superframe)
-      svc_force_zero_mode[LAST_FRAME - 1] = 1;
-    if (svc->ref_frame_index[cpi->gld_fb_idx] == svc->current_superframe)
-      svc_force_zero_mode[GOLDEN_FRAME - 1] = 1;
+  // For svc mode, on spatial_layer_id > 0: if the reference has a different
+  // scale, constrain the inter mode to only test zero motion.
+  if (cpi->use_svc &&
+      svc ->force_zero_mode_spatial_ref &&
+      cpi->svc.spatial_layer_id > 0) {
+    if (cpi->ref_frame_flags & flag_list[LAST_FRAME]) {
+      struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf;
+      if (vp9_is_scaled(sf))
+        svc_force_zero_mode[LAST_FRAME - 1] = 1;
+    }
+    if (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) {
+      struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf;
+      if (vp9_is_scaled(sf))
+        svc_force_zero_mode[GOLDEN_FRAME - 1] = 1;
+    }
   }
 
   for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
@@ -1356,7 +1363,9 @@
       continue;
 
     if (this_mode == NEWMV) {
-      if (ref_frame > LAST_FRAME && !cpi->use_svc) {
+      if (ref_frame > LAST_FRAME &&
+          !cpi->use_svc &&
+          cpi->oxcf.rc_mode == VPX_CBR) {
         int tmp_sad;
         int dis, cost_list[5];
 
@@ -1591,7 +1600,8 @@
     this_rdc.rate += ref_frame_cost[ref_frame];
     this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
 
-    if (cpi->oxcf.speed >= 5 &&
+    if (cpi->oxcf.rc_mode == VPX_CBR &&
+        cpi->oxcf.speed >= 5 &&
         cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
         !x->sb_is_skin) {
       // Bias against non-zero (above some threshold) motion for large blocks.
@@ -1679,12 +1689,15 @@
   xd->mi[0]->bmi[0].as_mv[0].as_int = mi->mv[0].as_int;
   x->skip_txfm[0] = best_mode_skip_txfm;
 
-  // Perform intra prediction only if base layer is chosen as the reference.
+  // For spatial enhancement layer: perform intra prediction only if base
+  // layer is chosen as the reference. Always perform intra prediction if
+  // LAST is the only reference or is_key_frame is set.
   if (cpi->svc.spatial_layer_id) {
     perform_intra_pred =
         cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame ||
+        !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME])  ||
         (!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame
-            && svc_force_zero_mode[best_ref_frame]);
+            && svc_force_zero_mode[best_ref_frame - 1]);
     inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh;
   }
   // Perform intra prediction search, if the best SAD is above a certain
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 5df2909..61bb35e 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1469,7 +1469,12 @@
     cm->frame_type = INTER_FRAME;
   }
   if (rc->frames_till_gf_update_due == 0) {
-    rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0) {
+      vp9_cyclic_refresh_set_golden_update(cpi);
+    } else {
+      rc->baseline_gf_interval =
+          (rc->min_gf_interval + rc->max_gf_interval) / 2;
+    }
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
     // NOTE: frames_till_gf_update_due must be <= frames_to_key.
     if (rc->frames_till_gf_update_due > rc->frames_to_key) {
@@ -1487,6 +1492,8 @@
   else
     target = calc_pframe_target_size_one_pass_vbr(cpi);
   vp9_rc_set_frame_target(cpi, target);
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0)
+    vp9_cyclic_refresh_update_parameters(cpi);
 }
 
 static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
@@ -1567,40 +1574,28 @@
   return vp9_rc_clamp_iframe_target_size(cpi, target);
 }
 
-// Reset information needed to set proper reference frames and buffer updates
-// for temporal layering. This is called when a key frame is encoded.
-static void reset_temporal_layer_to_zero(VP9_COMP *cpi) {
-  int sl;
-  LAYER_CONTEXT *lc = NULL;
-  cpi->svc.temporal_layer_id = 0;
-
-  for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) {
-    lc = &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers];
-    lc->current_video_frame_in_layer = 0;
-    lc->frames_from_key_frame = 0;
-  }
-}
-
 void vp9_rc_get_svc_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   int target = rc->avg_frame_bandwidth;
   int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id,
       cpi->svc.temporal_layer_id, cpi->svc.number_temporal_layers);
-
+  // Periodic key frames are based on the super-frame counter
+  // (svc.current_superframe); only the base spatial layer is a key frame.
   if ((cm->current_video_frame == 0) ||
       (cpi->frame_flags & FRAMEFLAGS_KEY) ||
-      (cpi->oxcf.auto_key && (rc->frames_since_key %
-          cpi->oxcf.key_freq == 0))) {
+      (cpi->oxcf.auto_key &&
+       (cpi->svc.current_superframe % cpi->oxcf.key_freq == 0) &&
+       cpi->svc.spatial_layer_id == 0)) {
     cm->frame_type = KEY_FRAME;
     rc->source_alt_ref_active = 0;
-
     if (is_two_pass_svc(cpi)) {
       cpi->svc.layer_context[layer].is_key_frame = 1;
       cpi->ref_frame_flags &=
           (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
     } else if (is_one_pass_cbr_svc(cpi)) {
-      reset_temporal_layer_to_zero(cpi);
+      if (cm->current_video_frame > 0)
+        vp9_svc_reset_key_frame(cpi);
       layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id,
            cpi->svc.temporal_layer_id, cpi->svc.number_temporal_layers);
       cpi->svc.layer_context[layer].is_key_frame = 1;
@@ -2010,13 +2005,17 @@
   VP9_COMMON * const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   rc->high_source_sad = 0;
-  if (cpi->Last_Source != NULL) {
+  if (cpi->Last_Source != NULL &&
+      cpi->Last_Source->y_width == cpi->Source->y_width &&
+      cpi->Last_Source->y_height == cpi->Source->y_height) {
     const uint8_t *src_y = cpi->Source->y_buffer;
     const int src_ystride = cpi->Source->y_stride;
     const uint8_t *last_src_y = cpi->Last_Source->y_buffer;
     const int last_src_ystride = cpi->Last_Source->y_stride;
     int sbi_row, sbi_col;
     const BLOCK_SIZE bsize = BLOCK_64X64;
+    uint32_t min_thresh = 4000;
+    float thresh = 8.0f;
     // Loop over sub-sample of frame, and compute average sad over 64x64 blocks.
     uint64_t avg_sad = 0;
     int num_samples = 0;
@@ -2047,12 +2046,32 @@
     // between current and the previous frame value(s). Use a minimum threshold
     // for cases where there is small change from content that is completely
     // static.
-    if (avg_sad > VPXMAX(4000, (rc->avg_source_sad << 3)) &&
+    if (cpi->oxcf.rc_mode == VPX_VBR) {
+      min_thresh = 30000;
+      thresh = 2.0f;
+    }
+    if (avg_sad >
+        VPXMAX(min_thresh, (unsigned int)(rc->avg_source_sad  * thresh)) &&
         rc->frames_since_key > 1)
       rc->high_source_sad = 1;
     else
       rc->high_source_sad = 0;
-    rc->avg_source_sad = (rc->avg_source_sad + avg_sad) >> 1;
+    if (avg_sad > 0 || cpi->oxcf.rc_mode == VPX_CBR)
+      rc->avg_source_sad = (rc->avg_source_sad + avg_sad) >> 1;
+    // For VBR, under scene change/high content change, force golden refresh.
+    if (cpi->oxcf.rc_mode == VPX_VBR &&
+        rc->high_source_sad &&
+        cpi->refresh_golden_frame == 0 &&
+        cpi->ext_refresh_frame_flags_pending == 0) {
+      int target;
+      cpi->refresh_golden_frame = 1;
+      rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+      if (rc->frames_till_gf_update_due > rc->frames_to_key)
+        rc->frames_till_gf_update_due = rc->frames_to_key;
+      rc->gfu_boost = DEFAULT_GF_BOOST;
+      target = calc_pframe_target_size_one_pass_vbr(cpi);
+      vp9_rc_set_frame_target(cpi, target);
+    }
   }
 }
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 193c9d3..508c596 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -387,47 +387,70 @@
     cost = token_costs[0][0][pt][EOB_TOKEN];
     c = 0;
   } else {
-    int band_left = *band_count++;
+    if (use_fast_coef_costing) {
+      int band_left = *band_count++;
 
-    // dc token
-    int v = qcoeff[0];
-    int16_t prev_t;
-    EXTRABIT e;
-    vp9_get_token_extra(v, &prev_t, &e);
-    cost = (*token_costs)[0][pt][prev_t] +
-        vp9_get_cost(prev_t, e, cat6_high_cost);
+      // dc token
+      int v = qcoeff[0];
+      int16_t prev_t;
+      cost = vp9_get_token_cost(v, &prev_t, cat6_high_cost);
+      cost += (*token_costs)[0][pt][prev_t];
 
-    token_cache[0] = vp9_pt_energy_class[prev_t];
-    ++token_costs;
+      token_cache[0] = vp9_pt_energy_class[prev_t];
+      ++token_costs;
 
-    // ac tokens
-    for (c = 1; c < eob; c++) {
-      const int rc = scan[c];
-      int16_t t;
+      // ac tokens
+      for (c = 1; c < eob; c++) {
+        const int rc = scan[c];
+        int16_t t;
 
-      v = qcoeff[rc];
-      vp9_get_token_extra(v, &t, &e);
-      if (use_fast_coef_costing) {
-        cost += (*token_costs)[!prev_t][!prev_t][t] +
-            vp9_get_cost(t, e, cat6_high_cost);
-      } else {
-        pt = get_coef_context(nb, token_cache, c);
-        cost += (*token_costs)[!prev_t][pt][t] +
-            vp9_get_cost(t, e, cat6_high_cost);
-        token_cache[rc] = vp9_pt_energy_class[t];
+        v = qcoeff[rc];
+        cost += vp9_get_token_cost(v, &t, cat6_high_cost);
+        cost += (*token_costs)[!prev_t][!prev_t][t];
+        prev_t = t;
+        if (!--band_left) {
+          band_left = *band_count++;
+          ++token_costs;
+        }
       }
-      prev_t = t;
-      if (!--band_left) {
-        band_left = *band_count++;
-        ++token_costs;
-      }
-    }
 
-    // eob token
-    if (band_left) {
-      if (use_fast_coef_costing) {
+      // eob token
+      if (band_left)
         cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
-      } else {
+
+    } else {  // !use_fast_coef_costing
+      int band_left = *band_count++;
+
+      // dc token
+      int v = qcoeff[0];
+      int16_t tok;
+      unsigned int (*tok_cost_ptr)[COEFF_CONTEXTS][ENTROPY_TOKENS];
+      cost = vp9_get_token_cost(v, &tok, cat6_high_cost);
+      cost += (*token_costs)[0][pt][tok];
+
+      token_cache[0] = vp9_pt_energy_class[tok];
+      ++token_costs;
+
+      tok_cost_ptr = &((*token_costs)[!tok]);
+
+      // ac tokens
+      for (c = 1; c < eob; c++) {
+        const int rc = scan[c];
+
+        v = qcoeff[rc];
+        cost += vp9_get_token_cost(v, &tok, cat6_high_cost);
+        pt = get_coef_context(nb, token_cache, c);
+        cost += (*tok_cost_ptr)[pt][tok];
+        token_cache[rc] = vp9_pt_energy_class[tok];
+        if (!--band_left) {
+          band_left = *band_count++;
+          ++token_costs;
+        }
+        tok_cost_ptr = &((*token_costs)[!tok]);
+      }
+
+      // eob token
+      if (band_left) {
         pt = get_coef_context(nb, token_cache, c);
         cost += (*token_costs)[0][pt][EOB_TOKEN];
       }
diff --git a/vp9/encoder/vp9_skin_detection.c b/vp9/encoder/vp9_skin_detection.c
index 8e117eb..ff0dfce 100644
--- a/vp9/encoder/vp9_skin_detection.c
+++ b/vp9/encoder/vp9_skin_detection.c
@@ -15,7 +15,7 @@
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_skin_detection.h"
 
-#define MODEL_MODE 0
+#define MODEL_MODE 1
 
 // Fixed-point skin color model parameters.
 static const int skin_mean[5][2] = {
@@ -48,7 +48,8 @@
   return skin_diff;
 }
 
-int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr) {
+int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr,
+                   int motion) {
   if (y < y_low || y > y_high) {
     return 0;
   } else {
@@ -62,16 +63,19 @@
       // Exit on very strong cb.
       if (cb > 150 && cr < 110)
         return 0;
-      // Exit on (another) low luminance threshold if either color is high.
-      if (y < 50 && (cb > 140 || cr > 140))
-        return 0;
       for (; i < 5; i++) {
-        if (evaluate_skin_color_difference(cb, cr, i) < skin_threshold[i + 1]) {
-          return 1;
+        int skin_color_diff = evaluate_skin_color_difference(cb, cr, i);
+        if (skin_color_diff < skin_threshold[i + 1]) {
+           if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2))
+             return 0;
+           else if (motion == 0 &&
+                    skin_color_diff > (skin_threshold[i + 1] >> 1))
+             return 0;
+           else
+            return 1;
         }
         // Exit if difference is much large than the threshold.
-        if (evaluate_skin_color_difference(cb, cr, i) >
-            (skin_threshold[i + 1] << 3)) {
+        if (skin_color_diff > (skin_threshold[i + 1] << 3)) {
           return 0;
         }
       }
@@ -81,16 +85,25 @@
 }
 
 int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
-                           int stride, int strideuv, int bsize) {
-  // Take center pixel in block to determine is_skin.
-  const int y_width_shift = (4 << b_width_log2_lookup[bsize]) >> 1;
-  const int y_height_shift = (4 << b_height_log2_lookup[bsize]) >> 1;
-  const int uv_width_shift = y_width_shift >> 1;
-  const int uv_height_shift = y_height_shift >> 1;
-  const uint8_t ysource = y[y_height_shift * stride + y_width_shift];
-  const uint8_t usource = u[uv_height_shift * strideuv + uv_width_shift];
-  const uint8_t vsource = v[uv_height_shift * strideuv + uv_width_shift];
-  return vp9_skin_pixel(ysource, usource, vsource);
+                           int stride, int strideuv, int bsize,
+                           int consec_zeromv, int curr_motion_magn) {
+  // No skin if block has had zero/small motion for a long consecutive time.
+  if (consec_zeromv > 80 && curr_motion_magn == 0) {
+    return 0;
+  } else {
+    int motion = 1;
+    // Take center pixel in block to determine is_skin.
+    const int y_width_shift = (4 << b_width_log2_lookup[bsize]) >> 1;
+    const int y_height_shift = (4 << b_height_log2_lookup[bsize]) >> 1;
+    const int uv_width_shift = y_width_shift >> 1;
+    const int uv_height_shift = y_height_shift >> 1;
+    const uint8_t ysource = y[y_height_shift * stride + y_width_shift];
+    const uint8_t usource = u[uv_height_shift * strideuv + uv_width_shift];
+    const uint8_t vsource = v[uv_height_shift * strideuv + uv_width_shift];
+    if (consec_zeromv > 30 && curr_motion_magn == 0)
+      motion = 0;
+    return vp9_skin_pixel(ysource, usource, vsource, motion);
+  }
 }
 
 
@@ -99,6 +112,7 @@
 void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
   int i, j, mi_row, mi_col, num_bl;
   VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   uint8_t *y;
   const uint8_t *src_y = cpi->Source->y_buffer;
   const uint8_t *src_u = cpi->Source->u_buffer;
@@ -113,7 +127,7 @@
   int shuv = shy - 1;
   int fac = y_bsize / 8;
   // Use center pixel or average of center 2x2 pixels.
-  int mode_filter = 1;
+  int mode_filter = 0;
   YV12_BUFFER_CONFIG skinmap;
   memset(&skinmap, 0, sizeof(YV12_BUFFER_CONFIG));
   if (vpx_alloc_frame_buffer(&skinmap, cm->width, cm->height,
@@ -130,27 +144,48 @@
   for (mi_row = 0; mi_row < cm->mi_rows - 1; mi_row += fac) {
     num_bl = 0;
     for (mi_col = 0; mi_col < cm->mi_cols - 1; mi_col += fac) {
-      // Select pixel for each block for skin detection.
-      // Use center pixel, or 2x2 average at center.
-      uint8_t ysource = src_y[ypos * src_ystride + ypos];
-      uint8_t usource = src_u[uvpos * src_uvstride + uvpos];
-      uint8_t vsource = src_v[uvpos * src_uvstride + uvpos];
-      uint8_t ysource2 = src_y[(ypos + 1) * src_ystride + ypos];
-      uint8_t usource2 = src_u[(uvpos + 1) * src_uvstride + uvpos];
-      uint8_t vsource2 = src_v[(uvpos + 1) * src_uvstride + uvpos];
-      uint8_t ysource3 = src_y[ypos * src_ystride + (ypos + 1)];
-      uint8_t usource3 = src_u[uvpos * src_uvstride + (uvpos  + 1)];
-      uint8_t vsource3 = src_v[uvpos * src_uvstride + (uvpos +  1)];
-      uint8_t ysource4 = src_y[(ypos + 1) * src_ystride + (ypos + 1)];
-      uint8_t usource4 = src_u[(uvpos + 1) * src_uvstride + (uvpos  + 1)];
-      uint8_t vsource4 = src_v[(uvpos + 1) * src_uvstride + (uvpos +  1)];
       int is_skin = 0;
       if (mode_filter == 1) {
+        // Use 2x2 average at center.
+        uint8_t ysource = src_y[ypos * src_ystride + ypos];
+        uint8_t usource = src_u[uvpos * src_uvstride + uvpos];
+        uint8_t vsource = src_v[uvpos * src_uvstride + uvpos];
+        uint8_t ysource2 = src_y[(ypos + 1) * src_ystride + ypos];
+        uint8_t usource2 = src_u[(uvpos + 1) * src_uvstride + uvpos];
+        uint8_t vsource2 = src_v[(uvpos + 1) * src_uvstride + uvpos];
+        uint8_t ysource3 = src_y[ypos * src_ystride + (ypos + 1)];
+        uint8_t usource3 = src_u[uvpos * src_uvstride + (uvpos  + 1)];
+        uint8_t vsource3 = src_v[uvpos * src_uvstride + (uvpos +  1)];
+        uint8_t ysource4 = src_y[(ypos + 1) * src_ystride + (ypos + 1)];
+        uint8_t usource4 = src_u[(uvpos + 1) * src_uvstride + (uvpos  + 1)];
+        uint8_t vsource4 = src_v[(uvpos + 1) * src_uvstride + (uvpos +  1)];
         ysource = (ysource + ysource2 + ysource3 + ysource4) >> 2;
         usource = (usource + usource2 + usource3 + usource4) >> 2;
         vsource = (vsource + vsource2 + vsource3 + vsource4) >> 2;
+        is_skin = vp9_skin_pixel(ysource, usource, vsource, 1);
+      } else {
+        int block_size = BLOCK_8X8;
+        int consec_zeromv = 0;
+        if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+          int bl_index = mi_row * cm->mi_cols + mi_col;
+          int bl_index1 = bl_index + 1;
+          int bl_index2 = bl_index + cm->mi_cols;
+          int bl_index3 = bl_index2 + 1;
+          if (y_bsize == 8)
+            consec_zeromv = cr->consec_zero_mv[bl_index];
+          else
+            consec_zeromv = VPXMIN(cr->consec_zero_mv[bl_index],
+                                     VPXMIN(cr->consec_zero_mv[bl_index1],
+                                     VPXMIN(cr->consec_zero_mv[bl_index2],
+                                     cr->consec_zero_mv[bl_index3])));
+        }
+        if (y_bsize == 16)
+          block_size = BLOCK_16X16;
+        is_skin  = vp9_compute_skin_block(src_y, src_u, src_v, src_ystride,
+                                          src_uvstride, block_size,
+                                          consec_zeromv,
+                                          0);
       }
-      is_skin = vp9_skin_pixel(ysource, usource, vsource);
       for (i = 0; i < y_bsize; i++) {
         for (j = 0; j < y_bsize; j++) {
           if (is_skin)
diff --git a/vp9/encoder/vp9_skin_detection.h b/vp9/encoder/vp9_skin_detection.h
index 73f7c39..c77382d 100644
--- a/vp9/encoder/vp9_skin_detection.h
+++ b/vp9/encoder/vp9_skin_detection.h
@@ -21,10 +21,12 @@
 
 // #define OUTPUT_YUV_SKINMAP
 
-int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr);
+int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr,
+                   int motion);
 
 int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
-                           int stride, int strideuv, int bsize);
+                           int stride, int strideuv, int bsize,
+                           int consec_zeromv, int curr_motion_magn);
 
 #ifdef OUTPUT_YUV_SKINMAP
 // For viewing skin map on input source.
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index f684507..02be3c3 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -400,6 +400,8 @@
     sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH;
     sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
     sf->simple_model_rd_from_var = 1;
+    if (cpi->oxcf.rc_mode == VPX_VBR)
+      sf->mv.search_method = NSTEP;
 
     if (!is_keyframe) {
       int i;
@@ -441,7 +443,7 @@
   }
   if (speed >= 8) {
     sf->adaptive_rd_thresh = 4;
-    sf->mv.subpel_force_stop = 2;
+    sf->mv.subpel_force_stop = (content == VP9E_CONTENT_SCREEN) ? 3 : 2;
     sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
   }
 }
@@ -607,7 +609,10 @@
     sf->optimize_coefficients = 0;
   }
 
-  if (sf->mv.subpel_search_method == SUBPEL_TREE) {
+  if (sf->mv.subpel_force_stop == 3) {
+    // Whole pel only
+    cpi->find_fractional_mv_step = vp9_skip_sub_pixel_tree;
+  } else if (sf->mv.subpel_search_method == SUBPEL_TREE) {
     cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
   } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
     cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index fa2f79d..90b3216 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -188,7 +188,11 @@
   // Maximum number of steps in logarithmic subpel search before giving up.
   int subpel_iters_per_step;
 
-  // Control when to stop subpel search
+  // Control when to stop subpel search:
+  // 0: Full subpel search.
+  // 1: Stop at quarter pixel.
+  // 2: Stop at half pixel.
+  // 3: Stop at full pixel.
   int subpel_force_stop;
 
   // This variable sets the step_param used in full pel motion search.
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 1d56154..73048f8 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -43,6 +43,26 @@
     cpi->svc.ext_alt_fb_idx[sl] = 2;
   }
 
+  // For 1 pass cbr: allocate scaled_temp that may be used as an intermediate
+  // buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a
+  // target of 1/4x1/4.
+  if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) {
+    if (vpx_realloc_frame_buffer(&cpi->svc.scaled_temp,
+                                 cpi->common.width >> 1,
+                                 cpi->common.height >> 1,
+                                 cpi->common.subsampling_x,
+                                 cpi->common.subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 cpi->common.use_highbitdepth,
+#endif
+                                 VP9_ENC_BORDER_IN_PIXELS,
+                                 cpi->common.byte_alignment,
+                                 NULL, NULL, NULL))
+      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                         "Failed to allocate scaled_frame for svc ");
+  }
+
+
   if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
     if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img,
                                  SMALL_FRAME_WIDTH, SMALL_FRAME_HEIGHT,
@@ -796,3 +816,27 @@
     }
   }
 }
+
+// Reset on key frame: reset counters, references and buffer updates.
+void vp9_svc_reset_key_frame(VP9_COMP *const cpi) {
+  int sl, tl;  // Spatial and temporal layer indices.
+  SVC *const svc = &cpi->svc;
+  LAYER_CONTEXT *lc = NULL;
+  for (sl = 0; sl < svc->number_spatial_layers; ++sl) {
+    for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+      lc = &cpi->svc.layer_context[sl * svc->number_temporal_layers + tl];
+      lc->current_video_frame_in_layer = 0;  // Restart per-layer counts.
+      lc->frames_from_key_frame = 0;
+    }
+  }  // Every (spatial, temporal) layer context has now been zeroed.
+  if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) {
+    set_flags_and_fb_idx_for_temporal_mode3(cpi);
+  } else if (svc->temporal_layering_mode ==
+             VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) {
+     set_flags_and_fb_idx_for_temporal_mode_noLayering(cpi);
+  } else if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0101) {
+     set_flags_and_fb_idx_for_temporal_mode2(cpi);
+  }  // NOTE(review): any other layering mode gets no flag setup here.
+  vp9_update_temporal_layer_framerate(cpi);
+  vp9_restore_layer_context(cpi);  // Runs after the counters were zeroed.
+}
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index 4e18640..9f386fb 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -70,6 +70,8 @@
   // Store scaled source frames to be used for temporal filter to generate
   // a alt ref frame.
   YV12_BUFFER_CONFIG scaled_frames[MAX_LAG_BUFFERS];
+  // Temp buffer used for 2-stage down-sampling, for real-time mode.
+  YV12_BUFFER_CONFIG scaled_temp;
 
   // Layer context used for rate control in one pass temporal CBR mode or
   // two pass spatial mode.
@@ -134,6 +136,8 @@
 
 void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi);
 
+void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 82f566b..ebe28b8 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -45,8 +45,7 @@
                                             int x, int y) {
   const int which_mv = 0;
   const MV mv = { mv_row, mv_col };
-  const InterpKernel *const kernel =
-    vp9_filter_kernels[xd->mi[0]->interp_filter];
+  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP_SHARP];
 
   enum mv_precision mv_precision_uv;
   int uv_stride;
@@ -86,6 +85,7 @@
     return;
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+  (void)xd;
   vp9_build_inter_predictor(y_mb_ptr, stride,
                             &pred[0], 16,
                             &mv,
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 93be6d7..ee1d08a 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -50,6 +50,35 @@
 const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens = dct_cat_lt_10_value_tokens +
     (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens))
     / 2;
+// The corresponding costs of the extrabits for the tokens in the above table
+// are stored in the table below. The values are obtained from looking up the
+// entry for the specified extrabits in the table corresponding to the token
+// (as defined in cost element vp9_extra_bits)
+// e.g. {9, 63} maps to cat5_cost[63 >> 1], {1, 1} maps to sign_cost[1 >> 1]
+static const int dct_cat_lt_10_value_cost[] = {
+  3773, 3750, 3704, 3681, 3623, 3600, 3554, 3531,
+  3432, 3409, 3363, 3340, 3282, 3259, 3213, 3190,
+  3136, 3113, 3067, 3044, 2986, 2963, 2917, 2894,
+  2795, 2772, 2726, 2703, 2645, 2622, 2576, 2553,
+  3197, 3116, 3058, 2977, 2881, 2800,
+  2742, 2661, 2615, 2534, 2476, 2395,
+  2299, 2218, 2160, 2079,
+  2566, 2427, 2334, 2195, 2023, 1884, 1791, 1652,
+  1893, 1696, 1453, 1256, 1229, 864,
+  512, 512, 512, 512, 0,
+  512, 512, 512, 512,
+  864, 1229, 1256, 1453, 1696, 1893,
+  1652, 1791, 1884, 2023, 2195, 2334, 2427, 2566,
+  2079, 2160, 2218, 2299, 2395, 2476, 2534, 2615,
+  2661, 2742, 2800, 2881, 2977, 3058, 3116, 3197,
+  2553, 2576, 2622, 2645, 2703, 2726, 2772, 2795,
+  2894, 2917, 2963, 2986, 3044, 3067, 3113, 3136,
+  3190, 3213, 3259, 3282, 3340, 3363, 3409, 3432,
+  3531, 3554, 3600, 3623, 3681, 3704, 3750, 3773,
+};
+const int *vp9_dct_cat_lt_10_value_cost = dct_cat_lt_10_value_cost +
+    (sizeof(dct_cat_lt_10_value_cost) / sizeof(*dct_cat_lt_10_value_cost))
+    / 2;
 
 // Array indices are identical to previously-existing CONTEXT_NODE indices
 const vpx_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index df979b2..fad7988 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -74,6 +74,7 @@
  */
 extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
 extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens;
+extern const int *vp9_dct_cat_lt_10_value_cost;
 extern const int16_t vp9_cat6_low_cost[256];
 extern const int vp9_cat6_high_cost[64];
 extern const int vp9_cat6_high10_high_cost[256];
@@ -117,6 +118,18 @@
   return vp9_dct_cat_lt_10_value_tokens[v].token;
 }
 
+static INLINE int vp9_get_token_cost(int v, int16_t *token,  // *token is output.
+                                     const int *cat6_high_table) {
+  if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {  // |v| >= CAT6_MIN_VAL
+    EXTRABIT extrabits;
+    *token = CATEGORY6_TOKEN;
+    extrabits = abs(v) - CAT6_MIN_VAL;  // Offset of |v| above CAT6_MIN_VAL.
+    return vp9_cat6_low_cost[extrabits & 0xff] +  // Low 8 bits of the offset.
+           cat6_high_table[extrabits >> 8];  // Remaining high bits.
+  }
+  *token = vp9_dct_cat_lt_10_value_tokens[v].token;  // v may be negative here.
+  return vp9_dct_cat_lt_10_value_cost[v];  // Pointer is mid-table: v < 0 is OK.
+}
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 7f01acb..d13e699 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -145,7 +145,7 @@
   RANGE_CHECK(cfg, g_w,                   1, 65535);  // 16 bits available
   RANGE_CHECK(cfg, g_h,                   1, 65535);  // 16 bits available
   RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);
-  RANGE_CHECK(cfg, g_timebase.num,        1, cfg->g_timebase.den);
+  RANGE_CHECK(cfg, g_timebase.num,        1, 1000000000);
   RANGE_CHECK_HI(cfg, g_profile,          3);
 
   RANGE_CHECK_HI(cfg, rc_max_quantizer,   63);
@@ -473,7 +473,16 @@
   oxcf->content = extra_cfg->content;
 
   oxcf->tile_columns = extra_cfg->tile_columns;
-  oxcf->tile_rows    = extra_cfg->tile_rows;
+
+  // TODO(yunqing): The dependencies between row tiles cause error in multi-
+  // threaded encoding. For now, tile_rows is forced to be 0 in this case.
+  // The further fix can be done by adding synchronizations after a tile row
+  // is encoded. But this will hurt multi-threaded encoder performance. So,
+  // it is recommended to use tile-rows=0 while encoding with threads > 1.
+  if (oxcf->max_threads > 1 && oxcf->tile_columns > 0)
+    oxcf->tile_rows  = 0;
+  else
+    oxcf->tile_rows  = extra_cfg->tile_rows;
 
   oxcf->error_resilient_mode         = cfg->g_error_resilient;
   oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode;
@@ -1553,7 +1562,7 @@
       // keyframing settings (kf)
       VPX_KF_AUTO,        // g_kfmode
       0,                  // kf_min_dist
-      9999,               // kf_max_dist
+      128,                // kf_max_dist
 
       VPX_SS_DEFAULT_LAYERS,  // ss_number_layers
       {0},
diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c
index 628afca..8028608 100644
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c
@@ -322,7 +322,7 @@
 
       for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
         if (si->svc_params.scaling_factor_den[sl] > 0) {
-          alloc_ratio[sl] = (float)( (sl+1) );
+          alloc_ratio[sl] = (float)( pow(2, sl) );
           total += alloc_ratio[sl];
         }
       }
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index fc7060f..e371849 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -41,6 +41,7 @@
 DSP_SRCS-yes += intrapred.c
 
 ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_MMX) += x86/loopfilter_mmx.asm
 DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
@@ -130,7 +131,6 @@
 
 DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/loopfilter_sse2.c
 DSP_SRCS-$(HAVE_AVX2)                += x86/loopfilter_avx2.c
-DSP_SRCS-$(HAVE_MMX)                 += x86/loopfilter_mmx.asm
 
 DSP_SRCS-$(HAVE_NEON)   += arm/loopfilter_neon.c
 ifeq ($(HAVE_NEON_ASM),yes)
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index ced7009..d01e81d 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -548,7 +548,7 @@
 $vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
 
 add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_vertical_4 mmx neon dspr2 msa/;
+specialize qw/vpx_lpf_vertical_4 neon dspr2 msa/, "$mmx_x86inc";
 
 add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/;
@@ -569,7 +569,7 @@
 $vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
 
 add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_horizontal_4 mmx neon dspr2 msa/;
+specialize qw/vpx_lpf_horizontal_4 neon dspr2 msa/, "$mmx_x86inc";
 
 add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h
index 95aa790..95c721a 100644
--- a/vpx_dsp/x86/convolve.h
+++ b/vpx_dsp/x86/convolve.h
@@ -34,7 +34,7 @@
                                     int w, int h) { \
   assert(filter[3] != 128); \
   assert(step_q4 == 16); \
-  if (filter[0] || filter[1] || filter[2]) { \
+  if (filter[0] | filter[1] | filter[2]) { \
     while (w >= 16) { \
       vpx_filter_block1d16_##dir##8_##avg##opt(src_start, \
                                                src_stride, \
@@ -46,27 +46,20 @@
       dst += 16; \
       w -= 16; \
     } \
-    while (w >= 8) { \
+    if (w == 8) { \
       vpx_filter_block1d8_##dir##8_##avg##opt(src_start, \
                                               src_stride, \
                                               dst, \
                                               dst_stride, \
                                               h, \
                                               filter); \
-      src += 8; \
-      dst += 8; \
-      w -= 8; \
-    } \
-    while (w >= 4) { \
+    } else if (w == 4) { \
       vpx_filter_block1d4_##dir##8_##avg##opt(src_start, \
                                               src_stride, \
                                               dst, \
                                               dst_stride, \
                                               h, \
                                               filter); \
-      src += 4; \
-      dst += 4; \
-      w -= 4; \
     } \
   } else { \
     while (w >= 16) { \
@@ -80,27 +73,20 @@
       dst += 16; \
       w -= 16; \
     } \
-    while (w >= 8) { \
+    if (w == 8) { \
       vpx_filter_block1d8_##dir##2_##avg##opt(src, \
                                               src_stride, \
                                               dst, \
                                               dst_stride, \
                                               h, \
                                               filter); \
-      src += 8; \
-      dst += 8; \
-      w -= 8; \
-    } \
-    while (w >= 4) { \
+    } else if (w == 4) { \
       vpx_filter_block1d4_##dir##2_##avg##opt(src, \
                                               src_stride, \
                                               dst, \
                                               dst_stride, \
                                               h, \
                                               filter); \
-      src += 4; \
-      dst += 4; \
-      w -= 4; \
     } \
   } \
 }
@@ -164,7 +150,7 @@
   if (step_q4 == 16 && filter[3] != 128) { \
     uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
-    if (filter[0] || filter[1] || filter[2]) { \
+    if (filter[0] | filter[1] | filter[2]) { \
       while (w >= 16) { \
         vpx_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
                                                         src_stride, \
diff --git a/vpx_dsp/x86/loopfilter_mmx.asm b/vpx_dsp/x86/loopfilter_mmx.asm
index 15105e3..45d0ecc 100644
--- a/vpx_dsp/x86/loopfilter_mmx.asm
+++ b/vpx_dsp/x86/loopfilter_mmx.asm
@@ -1,5 +1,5 @@
 ;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
@@ -8,589 +8,429 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-;void vpx_lpf_horizontal_4_mmx
-;(
-;    unsigned char *src_ptr,
-;    int src_pixel_step,
-;    const char *blimit,
-;    const char *limit,
-;    const char *thresh
-;)
-global sym(vpx_lpf_horizontal_4_mmx) PRIVATE
-sym(vpx_lpf_horizontal_4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 32                         ; reserve 32 bytes
-    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];
-    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
-
-        mov         rdx, arg(3) ;limit
-        movq        mm7, [rdx]
-        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
-        add         rdi, rax
-
-        ; calculate breakout conditions
-        movq        mm2, [rdi+2*rax]      ; q3
-        movq        mm1, [rsi+2*rax]      ; q2
-        movq        mm6, mm1              ; q2
-        psubusb     mm1, mm2              ; q2-=q3
-        psubusb     mm2, mm6              ; q3-=q2
-        por         mm1, mm2              ; abs(q3-q2)
-        psubusb     mm1, mm7              ;
-
-
-        movq        mm4, [rsi+rax]        ; q1
-        movq        mm3, mm4              ; q1
-        psubusb     mm4, mm6              ; q1-=q2
-        psubusb     mm6, mm3              ; q2-=q1
-        por         mm4, mm6              ; abs(q2-q1)
-
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-        movq        mm4, [rsi]            ; q0
-        movq        mm0, mm4              ; q0
-        psubusb     mm4, mm3              ; q0-=q1
-        psubusb     mm3, mm0              ; q1-=q0
-        por         mm4, mm3              ; abs(q0-q1)
-        movq        t0, mm4               ; save to t0
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-
-        neg         rax                   ; negate pitch to deal with above border
-
-        movq        mm2, [rsi+4*rax]      ; p3
-        movq        mm4, [rdi+4*rax]      ; p2
-        movq        mm5, mm4              ; p2
-        psubusb     mm4, mm2              ; p2-=p3
-        psubusb     mm2, mm5              ; p3-=p2
-        por         mm4, mm2              ; abs(p3 - p2)
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-
-        movq        mm4, [rsi+2*rax]      ; p1
-        movq        mm3, mm4              ; p1
-        psubusb     mm4, mm5              ; p1-=p2
-        psubusb     mm5, mm3              ; p2-=p1
-        por         mm4, mm5              ; abs(p2 - p1)
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-        movq        mm2, mm3              ; p1
-
-        movq        mm4, [rsi+rax]        ; p0
-        movq        mm5, mm4              ; p0
-        psubusb     mm4, mm3              ; p0-=p1
-        psubusb     mm3, mm5              ; p1-=p0
-        por         mm4, mm3              ; abs(p1 - p0)
-        movq        t1, mm4               ; save to t1
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-        movq        mm3, [rdi]            ; q1
-        movq        mm4, mm3              ; q1
-        psubusb     mm3, mm2              ; q1-=p1
-        psubusb     mm2, mm4              ; p1-=q1
-        por         mm2, mm3              ; abs(p1-q1)
-        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
-        psrlw       mm2, 1                ; abs(p1-q1)/2
-
-        movq        mm6, mm5              ; p0
-        movq        mm3, [rsi]            ; q0
-        psubusb     mm5, mm3              ; p0-=q0
-        psubusb     mm3, mm6              ; q0-=p0
-        por         mm5, mm3              ; abs(p0 - q0)
-        paddusb     mm5, mm5              ; abs(p0-q0)*2
-        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        mov         rdx, arg(2) ;blimit           ; get blimit
-        movq        mm7, [rdx]            ; blimit
-
-        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        por         mm1,    mm5
-        pxor        mm5,    mm5
-        pcmpeqb     mm1,    mm5           ; mask mm1
-
-        ; calculate high edge variance
-        mov         rdx, arg(4) ;thresh           ; get thresh
-        movq        mm7, [rdx]            ;
-        movq        mm4, t0               ; get abs (q1 - q0)
-        psubusb     mm4, mm7
-        movq        mm3, t1               ; get abs (p1 - p0)
-        psubusb     mm3, mm7
-        paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
-        pcmpeqb     mm4,        mm5
-
-        pcmpeqb     mm5,        mm5
-        pxor        mm4,        mm5
-
-
-        ; start work on filters
-        movq        mm2, [rsi+2*rax]      ; p1
-        movq        mm7, [rdi]            ; q1
-        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
-        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
-        psubsb      mm2, mm7              ; p1 - q1
-        pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
-        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
-        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
-        movq        mm3, mm0              ; q0
-        psubsb      mm0, mm6              ; q0 - p0
-        paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
-        paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
-        paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
-        pand        mm1, mm2                  ; mask filter values we don't care about
-        movq        mm2, mm1
-        paddsb      mm1, [GLOBAL(t4)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-        paddsb      mm2, [GLOBAL(t3)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-
-        pxor        mm0, mm0             ;
-        pxor        mm5, mm5
-        punpcklbw   mm0, mm2            ;
-        punpckhbw   mm5, mm2            ;
-        psraw       mm0, 11             ;
-        psraw       mm5, 11
-        packsswb    mm0, mm5
-        movq        mm2, mm0            ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-
-        pxor        mm0, mm0              ; 0
-        movq        mm5, mm1              ; abcdefgh
-        punpcklbw   mm0, mm1              ; e0f0g0h0
-        psraw       mm0, 11               ; sign extended shift right by 3
-        pxor        mm1, mm1              ; 0
-        punpckhbw   mm1, mm5              ; a0b0c0d0
-        psraw       mm1, 11               ; sign extended shift right by 3
-        movq        mm5, mm0              ; save results
-
-        packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
-        paddsw      mm5, [GLOBAL(ones)]
-        paddsw      mm1, [GLOBAL(ones)]
-        psraw       mm5, 1                ; partial shifted one more time for 2nd tap
-        psraw       mm1, 1                ; partial shifted one more time for 2nd tap
-        packsswb    mm5, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
-        pandn       mm4, mm5              ; high edge variance additive
-
-        paddsb      mm6, mm2              ; p0+= p0 add
-        pxor        mm6, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi+rax], mm6        ; write back
-
-        movq        mm6, [rsi+2*rax]      ; p1
-        pxor        mm6, [GLOBAL(t80)]    ; reoffset
-        paddsb      mm6, mm4              ; p1+= p1 add
-        pxor        mm6, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi+2*rax], mm6      ; write back
-
-        psubsb      mm3, mm0              ; q0-= q0 add
-        pxor        mm3, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi], mm3            ; write back
-
-        psubsb      mm7, mm4              ; q1-= q1 add
-        pxor        mm7, [GLOBAL(t80)]    ; unoffset
-        movq        [rdi], mm7            ; write back
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vpx_lpf_vertical_4_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit,
-;    const char *limit,
-;    const char *thresh
-;)
-global sym(vpx_lpf_vertical_4_mmx) PRIVATE
-sym(vpx_lpf_vertical_4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub          rsp, 64      ; reserve 64 bytes
-    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
-    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
-    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[32];
-
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; destination pitch?
-
-        lea         rsi,        [rsi + rax*4 - 4]
-
-        mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
-        add         rdi,        rax
-
-
-        ;transpose
-        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
-        movq        mm7,        mm6                         ; 77 76 75 74 73 72 71 70
-
-        punpckhbw   mm7,        [rdi+2*rax]                 ; 77 67 76 66 75 65 74 64
-        punpcklbw   mm6,        [rdi+2*rax]                 ; 73 63 72 62 71 61 70 60
-
-        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
-        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40
-
-        punpckhbw   mm5,        [rsi+rax]                   ; 57 47 56 46 55 45 54 44
-        punpcklbw   mm4,        [rsi+rax]                   ; 53 43 52 42 51 41 50 40
-
-        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
-        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46
-
-        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
-        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40
-
-        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
-        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40
-
-        neg         rax
-        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
-
-        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
-        punpckhbw   mm6,        [rsi+rax]                   ; 37 27 36 36 35 25 34 24
-
-        punpcklbw   mm1,        [rsi+rax]                   ; 33 23 32 22 31 21 30 20
-        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
-
-        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
-        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
-
-        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
-        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
-
-        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
-        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
-
-        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2
-
-        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
-        psubusb     mm5,        mm7                         ; q2-q3
-
-        psubusb     mm7,        mm6                         ; q3-q2
-        por         mm7,        mm5;                        ; mm7=abs (q3-q2)
-
-        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
-        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
-
-        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 15 04 = q0
-        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
-
-        psubusb     mm3,        mm6                         ; q1-q2
-        psubusb     mm6,        mm5                         ; q2-q1
-
-        por         mm6,        mm3                         ; mm6=abs(q2-q1)
-        lea         rdx,        srct
-
-        movq        [rdx+24],   mm5                         ; save q1
-        movq        [rdx+16],   mm0                         ; save q0
-
-        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
-        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
-
-        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
-        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00
-
-        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
-        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00
-
-        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
-        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2
-
-        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
-        psubusb     mm2,        mm0                         ; p2-p3
-
-        psubusb     mm0,        mm1                         ; p3-p2
-        por         mm0,        mm2                         ; mm0=abs(p3-p2)
-
-        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
-        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1
-
-        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
-        movq        [rdx+8],    mm3                         ; save p0
-
-        movq        [rdx],      mm2                         ; save p1
-        movq        mm5,        mm2                         ; mm5 = p1
-
-        psubusb     mm2,        mm1                         ; p1-p2
-        psubusb     mm1,        mm5                         ; p2-p1
-
-        por         mm1,        mm2                         ; mm1=abs(p2-p1)
-        mov         rdx,        arg(3) ;limit
-
-        movq        mm4,        [rdx]                       ; mm4 = limit
-        psubusb     mm7,        mm4
-
-        psubusb     mm0,        mm4
-        psubusb     mm1,        mm4
-
-        psubusb     mm6,        mm4
-        por         mm7,        mm6
-
-        por         mm0,        mm1
-        por         mm0,        mm7                         ;   abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
-
-        movq        mm1,        mm5                         ; p1
-
-        movq        mm7,        mm3                         ; mm3=mm7=p0
-        psubusb     mm7,        mm5                         ; p0 - p1
-
-        psubusb     mm5,        mm3                         ; p1 - p0
-        por         mm5,        mm7                         ; abs(p1-p0)
-
-        movq        t0,         mm5                         ; save abs(p1-p0)
-        lea         rdx,        srct
-
-        psubusb     mm5,        mm4
-        por         mm0,        mm5                         ; mm0=mask
-
-        movq        mm5,        [rdx+16]                    ; mm5=q0
-        movq        mm7,        [rdx+24]                    ; mm7=q1
-
-        movq        mm6,        mm5                         ; mm6=q0
-        movq        mm2,        mm7                         ; q1
-        psubusb     mm5,        mm7                         ; q0-q1
-
-        psubusb     mm7,        mm6                         ; q1-q0
-        por         mm7,        mm5                         ; abs(q1-q0)
-
-        movq        t1,         mm7                         ; save abs(q1-q0)
-        psubusb     mm7,        mm4
-
-        por         mm0,        mm7                         ; mask
-
-        movq        mm5,        mm2                         ; q1
-        psubusb     mm5,        mm1                         ; q1-=p1
-        psubusb     mm1,        mm2                         ; p1-=q1
-        por         mm5,        mm1                         ; abs(p1-q1)
-        pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
-        psrlw       mm5,        1                           ; abs(p1-q1)/2
-
-        mov         rdx,        arg(2) ;blimit                      ;
-
-        movq        mm4,        [rdx]                       ;blimit
-        movq        mm1,        mm3                         ; mm1=mm3=p0
-
-        movq        mm7,        mm6                         ; mm7=mm6=q0
-        psubusb     mm1,        mm7                         ; p0-q0
-
-        psubusb     mm7,        mm3                         ; q0-p0
-        por         mm1,        mm7                         ; abs(q0-p0)
-        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
-        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        por         mm1,        mm0;                        ; mask
-
-        pxor        mm0,        mm0
-        pcmpeqb     mm1,        mm0
-
-        ; calculate high edge variance
-        mov         rdx,        arg(4) ;thresh            ; get thresh
-        movq        mm7,        [rdx]
-        ;
-        movq        mm4,        t0              ; get abs (q1 - q0)
-        psubusb     mm4,        mm7
-
-        movq        mm3,        t1              ; get abs (p1 - p0)
-        psubusb     mm3,        mm7
-
-        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-        pcmpeqb     mm4,        mm0
-
-        pcmpeqb     mm0,        mm0
-        pxor        mm4,        mm0
-
-
-
-        ; start work on filters
-        lea         rdx,        srct
-
-        movq        mm2,        [rdx]           ; p1
-        movq        mm7,        [rdx+24]        ; q1
-
-        movq        mm6,        [rdx+8]         ; p0
-        movq        mm0,        [rdx+16]        ; q0
-
-        pxor        mm2,        [GLOBAL(t80)]   ; p1 offset to convert to signed values
-        pxor        mm7,        [GLOBAL(t80)]   ; q1 offset to convert to signed values
-
-        psubsb      mm2,        mm7             ; p1 - q1
-        pand        mm2,        mm4             ; high var mask (hvm)(p1 - q1)
-
-        pxor        mm6,        [GLOBAL(t80)]   ; offset to convert to signed values
-        pxor        mm0,        [GLOBAL(t80)]   ; offset to convert to signed values
-
-        movq        mm3,        mm0             ; q0
-        psubsb      mm0,        mm6             ; q0 - p0
-
-        paddsb      mm2,        mm0             ; 1 * (q0 - p0) + hvm(p1 - q1)
-        paddsb      mm2,        mm0             ; 2 * (q0 - p0) + hvm(p1 - q1)
-
-        paddsb      mm2,        mm0             ; 3 * (q0 - p0) + hvm(p1 - q1)
-        pand       mm1,        mm2              ; mask filter values we don't care about
-
-        movq        mm2,        mm1
-        paddsb      mm1,        [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-
-        paddsb      mm2,        [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-        pxor        mm0,        mm0          ;
-
-        pxor        mm5,        mm5
-        punpcklbw   mm0,        mm2         ;
-
-        punpckhbw   mm5,        mm2         ;
-        psraw       mm0,        11              ;
-
-        psraw       mm5,        11
-        packsswb    mm0,        mm5
-
-        movq        mm2,        mm0         ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-
-        pxor        mm0,        mm0           ; 0
-        movq        mm5,        mm1           ; abcdefgh
-
-        punpcklbw   mm0,        mm1           ; e0f0g0h0
-        psraw       mm0,        11                ; sign extended shift right by 3
-
-        pxor        mm1,        mm1           ; 0
-        punpckhbw   mm1,        mm5           ; a0b0c0d0
-
-        psraw       mm1,        11                ; sign extended shift right by 3
-        movq        mm5,        mm0              ; save results
-
-        packsswb    mm0,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
-        paddsw      mm5,        [GLOBAL(ones)]
-
-        paddsw      mm1,        [GLOBAL(ones)]
-        psraw       mm5,        1                 ; partial shifted one more time for 2nd tap
-
-        psraw       mm1,        1                 ; partial shifted one more time for 2nd tap
-        packsswb    mm5,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
-
-        pandn       mm4,        mm5             ; high edge variance additive
-
-        paddsb      mm6,        mm2             ; p0+= p0 add
-        pxor        mm6,        [GLOBAL(t80)]   ; unoffset
-
-        ; mm6=p0                               ;
-        movq        mm1,        [rdx]           ; p1
-        pxor        mm1,        [GLOBAL(t80)]   ; reoffset
-
-        paddsb      mm1,        mm4                 ; p1+= p1 add
-        pxor        mm1,        [GLOBAL(t80)]       ; unoffset
-        ; mm6 = p0 mm1 = p1
-
-        psubsb      mm3,        mm0                 ; q0-= q0 add
-        pxor        mm3,        [GLOBAL(t80)]       ; unoffset
-
-        ; mm3 = q0
-        psubsb      mm7,        mm4                 ; q1-= q1 add
-        pxor        mm7,        [GLOBAL(t80)]       ; unoffset
-        ; mm7 = q1
-
-        ; transpose and write back
-        ; mm1 =    72 62 52 42 32 22 12 02
-        ; mm6 =    73 63 53 43 33 23 13 03
-        ; mm3 =    74 64 54 44 34 24 14 04
-        ; mm7 =    75 65 55 45 35 25 15 05
-
-        movq        mm2,        mm1             ; 72 62 52 42 32 22 12 02
-        punpcklbw   mm2,        mm6             ; 33 32 23 22 13 12 03 02
-
-        movq        mm4,        mm3             ; 74 64 54 44 34 24 14 04
-        punpckhbw   mm1,        mm6             ; 73 72 63 62 53 52 43 42
-
-        punpcklbw   mm4,        mm7             ; 35 34 25 24 15 14 05 04
-        punpckhbw   mm3,        mm7             ; 75 74 65 64 55 54 45 44
-
-        movq        mm6,        mm2             ; 33 32 23 22 13 12 03 02
-        punpcklwd   mm2,        mm4             ; 15 14 13 12 05 04 03 02
-
-        punpckhwd   mm6,        mm4             ; 35 34 33 32 25 24 23 22
-        movq        mm5,        mm1             ; 73 72 63 62 53 52 43 42
-
-        punpcklwd   mm1,        mm3             ; 55 54 53 52 45 44 43 42
-        punpckhwd   mm5,        mm3             ; 75 74 73 72 65 64 63 62
-
-
-        ; mm2 = 15 14 13 12 05 04 03 02
-        ; mm6 = 35 34 33 32 25 24 23 22
-        ; mm5 = 55 54 53 52 45 44 43 42
-        ; mm1 = 75 74 73 72 65 64 63 62
-
-
-
-        movd        [rsi+rax*4+2], mm2
-        psrlq       mm2,        32
-
-        movd        [rdi+rax*4+2], mm2
-        movd        [rsi+rax*2+2], mm6
-
-        psrlq       mm6,        32
-        movd        [rsi+rax+2],mm6
-
-        movd        [rsi+2],    mm1
-        psrlq       mm1,        32
-
-        movd        [rdi+2],    mm1
-        neg         rax
-
-        movd        [rdi+rax+2],mm5
-        psrlq       mm5,        32
-
-        movd        [rdi+rax*2+2], mm5
-
-    add rsp, 64
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
+%include "third_party/x86inc/x86inc.asm"
 
 SECTION_RODATA
 align 16
 tfe:
     times 8 db 0xfe
-align 16
 t80:
     times 8 db 0x80
-align 16
 t3:
     times 8 db 0x03
-align 16
 t4:
     times 8 db 0x04
-align 16
 ones:
     times 4 dw 0x0001
+
+SECTION .text
+
+%define stkreg rsp                    ; base register for [stkreg + slot]
+; Byte offsets of the scratch slots from stkreg; each is 16 past the previous.
+%define t0                  0
+%define t1            t0 + 16
+%define p1            t1 + 16
+%define p0            p1 + 16
+%define q0            p0 + 16
+%define q1            q0 + 16
+%define lstacksize    q1 + 16         ; used as "0 - lstacksize" in cglobal
+; NOTE(review): no parentheses; "0 - lstacksize" may expand to +96 - confirm.
+%define goffsetq _limitq              ; register name passed to GET_GOT
+
+;void vpx_lpf_horizontal_4_mmx(unsigned char *src_ptr, int  src_pixel_step,
+;                              const char *blimit, const char *limit,
+;                              const char *thresh);
+INIT_MMX mmx
+cglobal lpf_horizontal_4, 5, 6, 8, 0 - lstacksize, \
+                                s, p, _blimit, _limit, _thresh, s1
+    movq                  m7, [_limitq]
+    GET_GOT         goffsetq
+%if GET_GOT_DEFINED=1
+    add rsp, gprsize                          ; restore stack
+%endif
+    lea                  s1q, [sq + pq]       ; s1q points to row +1
+
+    ; calculate breakout conditions
+    movq                  m2, [s1q + 2 * pq]  ; q3
+    movq                  m1, [ sq + 2 * pq]  ; q2
+    movq                  m6, m1              ; q2
+    psubusb               m1, m2              ; q2-=q3
+    psubusb               m2, m6              ; q3-=q2
+    por                   m1, m2              ; abs(q3-q2)
+    psubusb               m1, m7
+    movq                  m4, [sq + pq]       ; q1
+    movq                  m3, m4              ; q1
+    psubusb               m4, m6              ; q1-=q2
+    psubusb               m6, m3              ; q2-=q1
+    por                   m4, m6              ; abs(q2-q1)
+    psubusb               m4, m7
+    por                   m1, m4
+    movq                  m4, [sq]            ; q0
+    movq                  m0, m4              ; q0
+    psubusb               m4, m3              ; q0-=q1
+    psubusb               m3, m0              ; q1-=q0
+    por                   m4, m3              ; abs(q0-q1)
+    movq       [stkreg + t0], m4              ; save to t0
+    psubusb               m4, m7
+    por                   m1, m4
+    neg                   pq                  ; negate pitch to deal with
+                                              ; above border
+    movq                  m2, [ sq + 4 * pq]  ; p3
+    movq                  m4, [s1q + 4 * pq]  ; p2
+    movq                  m5, m4              ; p2
+    psubusb               m4, m2              ; p2-=p3
+    psubusb               m2, m5              ; p3-=p2
+    por                   m4, m2              ; abs(p3 - p2)
+    psubusb               m4, m7
+    por                   m1, m4
+    movq                  m4, [sq + 2 * pq]   ; p1
+    movq                  m3, m4              ; p1
+    psubusb               m4, m5              ; p1-=p2
+    psubusb               m5, m3              ; p2-=p1
+    por                   m4, m5              ; abs(p2 - p1)
+    psubusb               m4, m7
+    por                   m1, m4
+    movq                  m2, m3              ; p1
+    movq                  m4, [sq + pq]       ; p0
+    movq                  m5, m4              ; p0
+    psubusb               m4, m3              ; p0-=p1
+    psubusb               m3, m5              ; p1-=p0
+    por                   m4, m3              ; abs(p1 - p0)
+    movq       [stkreg + t1], m4              ; save to t1
+    psubusb               m4, m7
+    por                   m1, m4
+    movq                  m3, [s1q]           ; q1
+    movq                  m4, m3              ; q1
+    psubusb               m3, m2              ; q1-=p1
+    psubusb               m2, m4              ; p1-=q1
+    por                   m2, m3              ; abs(p1-q1)
+    pand                  m2, [GLOBAL(tfe)]   ; set lsb of each byte to zero
+    psrlw                 m2, 1               ; abs(p1-q1)/2
+    movq                  m6, m5              ; p0
+    movq                  m3, [sq]            ; q0
+    psubusb               m5, m3              ; p0-=q0
+    psubusb               m3, m6              ; q0-=p0
+    por                   m5, m3              ; abs(p0 - q0)
+    paddusb               m5, m5              ; abs(p0-q0)*2
+    paddusb               m5, m2              ; abs (p0 - q0) * 2 + abs(p1-q1)/2
+    movq                  m7, [_blimitq]            ; blimit
+    psubusb               m5, m7              ; abs (p0 - q0) * 2 +
+                                              ; abs(p1-q1)/2  > blimit
+    por                   m1, m5
+    pxor                  m5, m5
+    pcmpeqb               m1, m5              ; mask m1
+
+    ; calculate high edge variance
+    movq                  m7, [_threshq]
+    movq                  m4, [stkreg + t0]   ; get abs (q1 - q0)
+    psubusb               m4, m7
+    movq                  m3, [stkreg + t1]   ; get abs (p1 - p0)
+    psubusb               m3, m7
+    paddb                 m4, m3              ; abs(q1 - q0) > thresh ||
+                                              ; abs(p1 - p0) > thresh
+    pcmpeqb               m4, m5
+    pcmpeqb               m5, m5
+    movq                  m3, [GLOBAL(t80)]
+    pxor                  m4, m5
+
+    ; start work on filters
+    movq                  m2, [sq + 2 * pq]   ; p1
+    movq                  m7, [s1q]           ; q1
+    pxor                  m2, m3              ; p1 converted to signed values
+    pxor                  m7, m3              ; q1 converted to signed values
+    psubsb                m2, m7              ; p1 - q1
+    pand                  m2, m4              ; high var mask (hvm)(p1 - q1)
+    pxor                  m6, m3              ; p0 converted to signed values
+    pxor                  m0, m3              ; q0 converted to signed values
+    movq                  m3, m0              ; q0
+    psubsb                m0, m6              ; q0 - p0
+    paddsb                m2, m0              ; 1 * (q0 - p0) + hvm(p1 - q1)
+    paddsb                m2, m0              ; 2 * (q0 - p0) + hvm(p1 - q1)
+    paddsb                m2, m0              ; 3 * (q0 - p0) + hvm(p1 - q1)
+    pand                  m1, m2              ; mask filter values we don't
+                                              ; care about
+    movq                  m2, m1
+    paddsb                m1, [GLOBAL(t4)]    ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+    paddsb                m2, [GLOBAL(t3)]    ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+    pxor                  m0, m0
+    pxor                  m5, m5
+    punpcklbw             m0, m2
+    punpckhbw             m5, m2
+    psraw                 m0, 11
+    psraw                 m5, 11
+    packsswb              m0, m5
+    movq                  m2, m0              ; (3* (q0 - p0) + hvm(p1 - q1)
+                                              ; + 3) >> 3;
+    pxor                  m0, m0
+    movq                  m5, m1              ; abcdefgh
+    punpcklbw             m0, m1              ; e0f0g0h0
+    psraw                 m0, 11              ; sign extended shift right by 3
+    pxor                  m1, m1
+    punpckhbw             m1, m5              ; a0b0c0d0
+    psraw                 m1, 11              ; sign extended shift right by 3
+    movq                  m5, m0              ; save results
+
+    packsswb              m0, m1              ; (3* (q0 - p0) + hvm(p1 - q1)
+                                              ; + 4) >>3
+    paddsw                m5, [GLOBAL(ones)]
+    paddsw                m1, [GLOBAL(ones)]
+    psraw                 m5, 1
+    psraw                 m1, 1
+    packsswb              m5, m1              ; (((3* (q0 - p0) + hvm(p1 - q1)
+                                              ; + 4) >> 3) + 1) >> 1
+    movq                  m1, [GLOBAL(t80)]
+    pandn                 m4, m5              ; high edge variance additive
+    paddsb                m6, m2              ; p0+= p0 add
+    pxor                  m6, m1              ; unoffset
+    movq           [sq + pq], m6              ; write back
+    movq                  m6, [sq + 2 * pq]   ; p1
+    pxor                  m6, m1              ; reoffset
+    paddsb                m6, m4              ; p1+= p1 add
+    pxor                  m6, m1              ; unoffset
+    movq       [sq + 2 * pq], m6              ; write back
+    psubsb                m3, m0              ; q0-= q0 add
+    pxor                  m3, m1              ; unoffset
+    movq                [sq], m3              ; write back
+    psubsb                m7, m4              ; q1-= q1 add
+    pxor                  m7, m1              ; unoffset
+    movq               [s1q], m7              ; write back
+    RET
+
+;void vpx_lpf_vertical_4_mmx(unsigned char *src_ptr, int  src_pixel_step,
+;                            const char *blimit, const char *limit,
+;                            const char *thresh);
+INIT_MMX mmx
+cglobal lpf_vertical_4, 5, 6, 8, 0 - lstacksize, \
+                              s, p, _blimit, _limit, _thresh, s1
+    lea                   sq, [sq + pq * 4 - 4]
+    lea                  s1q, [sq + pq]       ; s1q points to row +1
+    ;transpose
+    movq                  m6, [ sq + 2 * pq]  ; 67 66 65 64 63 62 61 60
+    movq                  m7, m6              ; 67 66 65 64 63 62 61 60
+    punpckhbw             m7, [s1q + 2 * pq]  ; 77 67 76 66 75 65 74 64
+    punpcklbw             m6, [s1q + 2 * pq]  ; 73 63 72 62 71 61 70 60
+    movq                  m4, [sq]            ; 47 46 45 44 43 42 41 40
+    movq                  m5, m4              ; 47 46 45 44 43 42 41 40
+    punpckhbw             m5, [sq + pq]       ; 57 47 56 46 55 45 54 44
+    punpcklbw             m4, [sq + pq]       ; 53 43 52 42 51 41 50 40
+    movq                  m3, m5              ; 57 47 56 46 55 45 54 44
+    punpckhwd             m5, m7              ; 77 67 57 47 76 66 56 46
+    punpcklwd             m3, m7              ; 75 65 55 45 74 64 54 44
+    movq                  m2, m4              ; 53 43 52 42 51 41 50 40
+    punpckhwd             m4, m6              ; 73 63 53 43 72 62 52 42
+    punpcklwd             m2, m6              ; 71 61 51 41 70 60 50 40
+    neg                   pq
+    movq                  m6, [ sq + pq * 2]  ; 27 26 25 24 23 22 21 20
+    movq                  m1, m6              ; 27 26 25 24 23 22 21 20
+    punpckhbw             m6, [ sq + pq    ]  ; 37 27 36 26 35 25 34 24
+    punpcklbw             m1, [ sq + pq    ]  ; 33 23 32 22 31 21 30 20
+    movq                  m7, [ sq + pq * 4]; ; 07 06 05 04 03 02 01 00
+    punpckhbw             m7, [s1q + pq * 4]  ; 17 07 16 06 15 05 14 04
+    movq                  m0, m7              ; 17 07 16 06 15 05 14 04
+    punpckhwd             m7, m6              ; 37 27 17 07 36 26 16 06
+    punpcklwd             m0, m6              ; 35 25 15 05 34 24 14 04
+    movq                  m6, m7              ; 37 27 17 07 36 26 16 06
+    punpckhdq             m7, m5              ; 77 67 57 47 37 27 17 07  = q3
+    punpckldq             m6, m5              ; 76 66 56 46 36 26 16 06  = q2
+    movq                  m5, m6              ; 76 66 56 46 36 26 16 06
+    psubusb               m5, m7              ; q2-q3
+    psubusb               m7, m6              ; q3-q2
+    por                   m7, m5;             ; m7=abs (q3-q2)
+    movq                  m5, m0              ; 35 25 15 05 34 24 14 04
+    punpckhdq             m5, m3              ; 75 65 55 45 35 25 15 05 = q1
+    punpckldq             m0, m3              ; 74 64 54 44 34 24 14 04 = q0
+    movq                  m3, m5              ; 75 65 55 45 35 25 15 05 = q1
+    psubusb               m3, m6              ; q1-q2
+    psubusb               m6, m5              ; q2-q1
+    por                   m6, m3              ; m6=abs(q2-q1)
+
+    movq       [stkreg + q1], m5              ; save q1
+    movq       [stkreg + q0], m0              ; save q0
+
+    movq                  m3, [ sq + pq * 4]  ; 07 06 05 04 03 02 01 00
+    punpcklbw             m3, [s1q + pq * 4]  ; 13 03 12 02 11 01 10 00
+    movq                  m0, m3              ; 13 03 12 02 11 01 10 00
+    punpcklwd             m0, m1              ; 31 21 11 01 30 20 10 00
+    punpckhwd             m3, m1              ; 33 23 13 03 32 22 12 02
+    movq                  m1, m0              ; 31 21 11 01 30 20 10 00
+    punpckldq             m0, m2              ; 70 60 50 40 30 20 10 00  =p3
+    punpckhdq             m1, m2              ; 71 61 51 41 31 21 11 01  =p2
+    movq                  m2, m1              ; 71 61 51 41 31 21 11 01  =p2
+    psubusb               m2, m0              ; p2-p3
+    psubusb               m0, m1              ; p3-p2
+    por                   m0, m2              ; m0=abs(p3-p2)
+    movq                  m2, m3              ; 33 23 13 03 32 22 12 02
+    punpckldq             m2, m4              ; 72 62 52 42 32 22 12 02 = p1
+    punpckhdq             m3, m4              ; 73 63 53 43 33 23 13 03 = p0
+
+    movq       [stkreg + p0], m3              ; save p0
+    movq       [stkreg + p1], m2              ; save p1
+    movq                  m5, m2              ; m5 = p1
+    psubusb               m2, m1              ; p1-p2
+    psubusb               m1, m5              ; p2-p1
+    por                   m1, m2              ; m1=abs(p2-p1)
+    movq                  m4, [_limitq]
+    GET_GOT         goffsetq
+%if GET_GOT_DEFINED=1
+    add rsp, gprsize                          ; restore stack
+%endif
+    psubusb               m7, m4
+    psubusb               m0, m4
+    psubusb               m1, m4
+    psubusb               m6, m4
+    por                   m7, m6
+    por                   m0, m1
+    por                   m0, m7              ; abs(q3-q2) > limit ||
+                                              ; abs(p3-p2) > limit ||
+                                              ; abs(p2-p1) > limit ||
+                                              ; abs(q2-q1) > limit
+    movq                  m1, m5              ; p1
+    movq                  m7, m3              ; m3=m7=p0
+    psubusb               m7, m5              ; p0 - p1
+    psubusb               m5, m3              ; p1 - p0
+    por                   m5, m7              ; abs(p1-p0)
+    movq       [stkreg + t0], m5              ; save abs(p1-p0)
+    psubusb               m5, m4
+    por                   m0, m5              ; m0=mask
+    movq                  m5, [stkreg + q0]   ; m5=q0
+    movq                  m7, [stkreg + q1]   ; m7=q1
+    movq                  m6, m5              ; m6=q0
+    movq                  m2, m7              ; q1
+    psubusb               m5, m7              ; q0-q1
+    psubusb               m7, m6              ; q1-q0
+    por                   m7, m5              ; abs(q1-q0)
+    movq       [stkreg + t1], m7              ; save abs(q1-q0)
+    psubusb               m7, m4
+    por                   m0, m7              ; mask
+    movq                  m5, m2              ; q1
+    psubusb               m5, m1              ; q1-=p1
+    psubusb               m1, m2              ; p1-=q1
+    por                   m5, m1              ; abs(p1-q1)
+    pand                  m5, [GLOBAL(tfe)]   ; set lsb of each byte to zero
+    psrlw                 m5, 1               ; abs(p1-q1)/2
+    movq                  m4, [_blimitq]
+    movq                  m1, m3              ; m1=m3=p0
+    movq                  m7, m6              ; m7=m6=q0
+    psubusb               m1, m7              ; p0-q0
+    psubusb               m7, m3              ; q0-p0
+    por                   m1, m7              ; abs(q0-p0)
+    paddusb               m1, m1              ; abs(q0-p0)*2
+    paddusb               m1, m5              ; abs(p0 - q0)*2 + abs(p1-q1)/2
+    psubusb               m1, m4              ; abs(p0 - q0)*2 + abs(p1-q1)/2
+                                              ; > blimit
+    por                   m1, m0;             ; mask
+    pxor                  m0, m0
+    pcmpeqb               m1, m0
+
+    ; calculate high edge variance
+    movq                  m7, [_threshq]
+    movq                  m4, [stkreg + t0]   ; get abs (p1 - p0)
+    psubusb               m4, m7
+    movq                  m3, [stkreg + t1]   ; get abs (q1 - q0)
+    psubusb               m3, m7
+    por                   m4, m3              ; abs(q1 - q0) > thresh ||
+                                              ; abs(p1 - p0) > thresh
+    pcmpeqb               m4, m0
+    pcmpeqb               m0, m0
+    movq                  m3, [GLOBAL(t80)]
+    pxor                  m4, m0
+
+    ; start work on filters
+    movq                  m2, [stkreg + p1]
+    movq                  m7, [stkreg + q1]
+    movq                  m6, [stkreg + p0]
+    movq                  m0, [stkreg + q0]
+    pxor                  m2, m3
+    pxor                  m7, m3
+    psubsb                m2, m7              ; p1 - q1
+    pand                  m2, m4              ; high var mask (hvm)(p1 - q1)
+    pxor                  m6, m3
+    pxor                  m0, m3
+    movq                  m3, m0              ; q0
+    psubsb                m0, m6              ; q0 - p0
+    paddsb                m2, m0              ; 1 * (q0 - p0) + hvm(p1 - q1)
+    paddsb                m2, m0              ; 2 * (q0 - p0) + hvm(p1 - q1)
+    paddsb                m2, m0              ; 3 * (q0 - p0) + hvm(p1 - q1)
+    pand                  m1, m2              ; mask filter values we don't
+                                              ; care about
+    movq                  m2, m1
+    paddsb                m1, [GLOBAL(t4)]    ; 3*(q0 - p0) + hvm(p1 - q1) + 4
+    paddsb                m2, [GLOBAL(t3)]    ; 3*(q0 - p0) + hvm(p1 - q1) + 3
+    pxor                  m0, m0
+    pxor                  m5, m5
+    punpcklbw             m0, m2
+    punpckhbw             m5, m2
+    psraw                 m0, 11
+    psraw                 m5, 11
+    packsswb              m0, m5
+    movq                  m2, m0              ; (3*(q0 - p0) + hvm(p1 - q1)
+                                              ; + 3) >> 3;
+    pxor                  m0, m0
+    movq                  m5, m1              ; abcdefgh
+    punpcklbw             m0, m1              ; e0f0g0h0
+    psraw                 m0, 11              ; sign extended shift right by 3
+    pxor                  m1, m1
+    punpckhbw             m1, m5              ; a0b0c0d0
+    psraw                 m1, 11              ; sign extended shift right by 3
+    movq                  m5, m0              ; save results
+    packsswb              m0, m1              ; (3*(q0 - p0) + hvm(p1 - q1)
+                                              ; + 4) >>3
+    paddsw                m5, [GLOBAL(ones)]
+    paddsw                m1, [GLOBAL(ones)]
+    psraw                 m5, 1
+    psraw                 m1, 1
+    packsswb              m5, m1              ; (((3* (q0 - p0) + hvm(p1 - q1)
+                                              ; + 4) >> 3) + 1) >> 1
+    pandn                 m4, m5              ; high edge variance additive
+    movq                  m5, [GLOBAL(t80)]
+    paddsb                m6, m2              ; p0+= p0 add
+    pxor                  m6, m5              ; unoffset
+    ; m6=p0
+    movq                  m1, [stkreg + p1]
+    pxor                  m1, m5              ; reoffset
+    paddsb                m1, m4              ; p1+= p1 add
+    pxor                  m1, m5              ; unoffset
+    ; m6 = p0 m1 = p1
+    psubsb                m3, m0              ; q0-= q0 add
+    pxor                  m3, m5              ; unoffset
+    ; m3 = q0
+    psubsb                m7, m4              ; q1-= q1 add
+    pxor                  m7, m5              ; unoffset
+    ; m7 = q1
+    ; transpose and write back
+    ; m1 =    72 62 52 42 32 22 12 02
+    ; m6 =    73 63 53 43 33 23 13 03
+    ; m3 =    74 64 54 44 34 24 14 04
+    ; m7 =    75 65 55 45 35 25 15 05
+    movq                  m2, m1              ; 72 62 52 42 32 22 12 02
+    punpcklbw             m2, m6              ; 33 32 23 22 13 12 03 02
+    movq                  m4, m3              ; 74 64 54 44 34 24 14 04
+    punpckhbw             m1, m6              ; 73 72 63 62 53 52 43 42
+    punpcklbw             m4, m7              ; 35 34 25 24 15 14 05 04
+    punpckhbw             m3, m7              ; 75 74 65 64 55 54 45 44
+    movq                  m6, m2              ; 33 32 23 22 13 12 03 02
+    punpcklwd             m2, m4              ; 15 14 13 12 05 04 03 02
+    punpckhwd             m6, m4              ; 35 34 33 32 25 24 23 22
+    movq                  m5, m1              ; 73 72 63 62 53 52 43 42
+    punpcklwd             m1, m3              ; 55 54 53 52 45 44 43 42
+    punpckhwd             m5, m3              ; 75 74 73 72 65 64 63 62
+
+    ; m2 = 15 14 13 12 05 04 03 02
+    ; m6 = 35 34 33 32 25 24 23 22
+    ; m1 = 55 54 53 52 45 44 43 42
+    ; m5 = 75 74 73 72 65 64 63 62
+    movd   [sq + pq * 4 + 2], m2
+    psrlq                 m2, 32
+    movd  [s1q + pq * 4 + 2], m2
+    movd   [sq + pq * 2 + 2], m6
+    psrlq                 m6, 32
+    movd       [sq + pq + 2], m6
+    movd            [sq + 2], m1
+    psrlq                 m1, 32
+    movd           [s1q + 2], m1
+    neg                   pq
+    movd      [s1q + pq + 2], m5
+    psrlq                 m5, 32
+    movd  [s1q + pq * 2 + 2], m5
+    RET
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
index 3fbaa27..d2cb8ea 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
+++ b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
@@ -16,6 +16,11 @@
 ; %define USE_PMULHRSW
 ; NOTE: pmulhrsw has a latency of 5 cycles.  Tests showed a performance loss
 ; when using this instruction.
+;
+; The add order below (based on ffvp9) must be followed to avoid out-of-range sums.
+; x = k0k1 + k4k5
+; y = k2k3 + k6k7
+; z = signed SAT(x + y)
 
 SECTION .text
 %if ARCH_X86_64
@@ -77,17 +82,12 @@
 
     pmaddubsw %2, k0k1k4k5
     pmaddubsw m3, k2k3k6k7
-
-    mova      m4, %2
-    mova      m5, m3
-    psrldq    %2, 8
-    psrldq    m3, 8
-    mova      m6, m5
-
-    paddsw    m4, m3
-    pmaxsw    m5, %2
-    pminsw    %2, m6
+    mova      m4, %2        ;k0k1
+    mova      m5, m3        ;k2k3
+    psrldq    %2, 8         ;k4k5
+    psrldq    m3, 8         ;k6k7
     paddsw    %2, m4
+    paddsw    m5, m3
     paddsw    %2, m5
     paddsw    %2, krd
     psraw     %2, 7
@@ -157,27 +157,20 @@
     pmaddubsw           m7, k0k1k4k5
     palignr             m3, m2,  5
     pmaddubsw           m3, k2k3k6k7
-    mova                m0, m4
-    mova                m5, m1
-    mova                m2, m7
-    psrldq              m4, 8
-    psrldq              m1, 8
-    mova                m6, m5
-    paddsw              m0, m1
-    mova                m1, m3
-    psrldq              m7, 8
-    psrldq              m3, 8
-    paddsw              m2, m3
-    mova                m3, m1
-    pmaxsw              m5, m4
-    pminsw              m4, m6
+    mova                m0, m4                  ;k0k1
+    mova                m5, m1                  ;k2k3
+    mova                m2, m7                  ;k0k1 upper
+    psrldq              m4, 8                   ;k4k5
+    psrldq              m1, 8                   ;k6k7
     paddsw              m4, m0
-    paddsw              m4, m5
-    pmaxsw              m1, m7
-    pminsw              m7, m3
+    paddsw              m5, m1
+    mova                m1, m3                  ;k2k3 upper
+    psrldq              m7, 8                   ;k4k5 upper
+    psrldq              m3, 8                   ;k6k7 upper
     paddsw              m7, m2
+    paddsw              m4, m5
+    paddsw              m1, m3
     paddsw              m7, m1
-
     paddsw              m4, krd
     psraw               m4, 7
     packuswb            m4, m4
@@ -240,16 +233,13 @@
     pmaddubsw   %3, k2k3
     pmaddubsw   %4, k4k5
     pmaddubsw   %5, k6k7
-
+    paddsw      %2, %4
+    paddsw      %5, %3
     paddsw      %2, %5
-    mova        %1, %3
-    pminsw      %3, %4
-    pmaxsw      %1, %4
-    paddsw      %2, %3
-    paddsw      %1, %2
-    paddsw      %1, krd
-    psraw       %1, 7
-    packuswb    %1, %1
+    paddsw      %2, krd
+    psraw       %2, 7
+    packuswb    %2, %2
+    SWAP        %1, %2
 %endm
 
 ;-------------------------------------------------------------------------------
@@ -293,39 +283,33 @@
     pmaddubsw            m3, k4k5
 
     palignr              m7, m4, 13
-    paddsw               m1, m5
-    mova                 m5, m6
-    mova                 m0, m2
-    palignr              m5, m4, 5
-    pminsw               m2, m3
-    pmaddubsw            m7, k6k7
-    pmaxsw               m3, m0
-    paddsw               m1, m2
     mova                 m0, m6
-    palignr              m6, m4, 1
-    pmaddubsw            m5, k2k3
+    palignr              m0, m4, 5
+    pmaddubsw            m7, k6k7
     paddsw               m1, m3
+    paddsw               m2, m5
+    paddsw               m1, m2
+    mova                 m5, m6
+    palignr              m6, m4, 1
+    pmaddubsw            m0, k2k3
     pmaddubsw            m6, k0k1
-    palignr              m0, m4, 9
+    palignr              m5, m4, 9
     paddsw               m1, krd
-    pmaddubsw            m0, k4k5
-    mova                 m4, m5
+    pmaddubsw            m5, k4k5
     psraw                m1, 7
-    pminsw               m5, m0
-    paddsw               m6, m7
+    paddsw               m0, m7
+%ifidn %1, h8_avg
+    movh                 m7, [dstq]
+    movh                 m2, [dstq + dstrideq]
+%endif
     packuswb             m1, m1
-
     paddsw               m6, m5
-    pmaxsw               m0, m4
     paddsw               m6, m0
     paddsw               m6, krd
     psraw                m6, 7
     packuswb             m6, m6
-
 %ifidn %1, h8_avg
-    movh                 m0, [dstq]
-    movh                 m2, [dstq + dstrideq]
-    pavgb                m1, m0
+    pavgb                m1, m7
     pavgb                m6, m2
 %endif
     movh             [dstq], m1
@@ -388,7 +372,7 @@
     pmaddubsw     m1, k2k3
     palignr       m2, m7, 9
     pmaddubsw     m2, k4k5
-    paddsw        m0, m3
+    paddsw        m1, m3
     mova          m3, m4
     punpckhbw     m4, m4
     mova          m5, m4
@@ -403,17 +387,13 @@
     pmaddubsw     m6, k4k5
     palignr       m7, m3, 13
     pmaddubsw     m7, k6k7
-
-    mova          m3, m1
-    pmaxsw        m1, m2
-    pminsw        m2, m3
     paddsw        m0, m2
     paddsw        m0, m1
-    paddsw        m4, m7
-    mova          m7, m5
-    pmaxsw        m5, m6
-    pminsw        m6, m7
+%ifidn %1, h8_avg
+    mova          m1, [dstq]
+%endif
     paddsw        m4, m6
+    paddsw        m5, m7
     paddsw        m4, m5
     paddsw        m0, krd
     paddsw        m4, krd
@@ -421,7 +401,6 @@
     psraw         m4, 7
     packuswb      m0, m4
 %ifidn %1, h8_avg
-    mova          m1, [dstq]
     pavgb         m0, m1
 %endif
     lea         srcq, [srcq + sstrideq]
@@ -488,27 +467,21 @@
     movx         m7, [src1q + sstride6q   ]     ;H
     punpcklbw    m6, m7                         ;G H
     pmaddubsw    m6, k6k7
-    mova        tmp, m2
     pmaddubsw    m3, k2k3
     pmaddubsw    m1, k0k1
-    pmaxsw       m2, m4
-    paddsw       m0, m6
+    paddsw       m0, m4
+    paddsw       m2, m6
     movx         m6, [srcq + sstrideq * 8 ]     ;H next iter
     punpcklbw    m7, m6
     pmaddubsw    m7, k6k7
-    pminsw       m4, tmp
-    paddsw       m0, m4
-    mova         m4, m3
     paddsw       m0, m2
-    pminsw       m3, m5
-    pmaxsw       m5, m4
     paddsw       m0, krd
     psraw        m0, 7
-    paddsw       m1, m7
+    paddsw       m1, m5
     packuswb     m0, m0
 
+    paddsw       m3, m7
     paddsw       m1, m3
-    paddsw       m1, m5
     paddsw       m1, krd
     psraw        m1, 7
     lea        srcq, [srcq + sstrideq * 2 ]
@@ -538,11 +511,11 @@
     movx         m1, [srcq + sstrideq     ]     ;B
     movx         m6, [srcq + sstride6q    ]     ;G
     punpcklbw    m0, m1                         ;A B
-    movx         m7, [rax + sstride6q     ]     ;H
+    movx         m7, [src1q + sstride6q   ]     ;H
     pmaddubsw    m0, k0k1
     movx         m2, [srcq + sstrideq * 2 ]     ;C
     punpcklbw    m6, m7                         ;G H
-    movx         m3, [rax + sstrideq * 2  ]     ;D
+    movx         m3, [src1q + sstrideq * 2]     ;D
     pmaddubsw    m6, k6k7
     movx         m4, [srcq + sstrideq * 4 ]     ;E
     punpcklbw    m2, m3                         ;C D
@@ -550,10 +523,7 @@
     punpcklbw    m4, m5                         ;E F
     pmaddubsw    m2, k2k3
     pmaddubsw    m4, k4k5
-    paddsw       m0, m6
-    mova         m1, m2
-    pmaxsw       m2, m4
-    pminsw       m4, m1
+    paddsw       m2, m6
     paddsw       m0, m4
     paddsw       m0, m2
     paddsw       m0, krd
@@ -572,7 +542,6 @@
 %macro SUBPIX_VFILTER16 1
 cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
                              src, sstride, dst, dstride, height, filter
-
     mova          m4, [filterq]
     SETUP_LOCAL_VARS
 %if ARCH_X86_64
@@ -611,12 +580,9 @@
     punpcklbw     m3, m5                         ;A B
     movh          m7, [srcq + sstrideq * 2 + 8]  ;C
     pmaddubsw     m6, k6k7
-    mova          m1, m2
     movh          m5, [src1q + sstrideq * 2 + 8] ;D
-    pmaxsw        m2, m4
     punpcklbw     m7, m5                         ;C D
-    pminsw        m4, m1
-    paddsw        m0, m6
+    paddsw        m2, m6
     pmaddubsw     m3, k0k1
     movh          m1, [srcq + sstrideq * 4 + 8]  ;E
     paddsw        m0, m4
@@ -630,30 +596,24 @@
     movh          m5, [src1q + sstride6q + 8]    ;H
     psraw         m0, 7
     punpcklbw     m2, m5                         ;G H
-    packuswb      m0, m0
     pmaddubsw     m2, k6k7
 %ifidn %1, v8_avg
-    movh          m4, [dstq]
-    pavgb         m0, m4
+    mova          m4, [dstq]
 %endif
     movh      [dstq], m0
-    mova          m6, m7
-    pmaxsw        m7, m1
-    pminsw        m1, m6
-    paddsw        m3, m2
+    paddsw        m7, m2
     paddsw        m3, m1
     paddsw        m3, m7
     paddsw        m3, krd
     psraw         m3, 7
-    packuswb      m3, m3
+    packuswb      m0, m3
 
     add         srcq, sstrideq
     add        src1q, sstrideq
 %ifidn %1, v8_avg
-    movh          m1, [dstq + 8]
-    pavgb         m3, m1
+    pavgb         m0, m4
 %endif
-    movh  [dstq + 8], m3
+    mova      [dstq], m0
     add         dstq, dst_stride
     dec      heightd
     jnz        .loop
diff --git a/vpxenc.c b/vpxenc.c
index 35b79de..5e14934 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -380,7 +380,8 @@
 static const arg_def_t tile_cols = ARG_DEF(
     NULL, "tile-columns", 1, "Number of tile columns to use, log2");
 static const arg_def_t tile_rows = ARG_DEF(
-    NULL, "tile-rows", 1, "Number of tile rows to use, log2");
+    NULL, "tile-rows", 1,
+    "Number of tile rows to use, log2 (set to 0 while threads > 1)");
 static const arg_def_t lossless = ARG_DEF(
     NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)");
 static const arg_def_t frame_parallel_decoding = ARG_DEF(
@@ -804,7 +805,6 @@
   int                       arg_ctrls[ARG_CTRL_CNT_MAX][2];
   int                       arg_ctrl_cnt;
   int                       write_webm;
-  int                       have_kf_max_dist;
 #if CONFIG_VP9_HIGHBITDEPTH
   // whether to use 16bit internal buffers
   int                       use_16bit_internal;
@@ -1224,7 +1224,6 @@
       config->cfg.kf_min_dist = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &kf_max_dist, argi)) {
       config->cfg.kf_max_dist = arg_parse_uint(&arg);
-      config->have_kf_max_dist = 1;
     } else if (arg_match(&arg, &kf_disabled, argi)) {
       config->cfg.kf_mode = VPX_KF_DISABLED;
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -1352,19 +1351,6 @@
   }
 }
 
-
-static void set_default_kf_interval(struct stream_state *stream,
-                                    struct VpxEncoderConfig *global) {
-  /* Use a max keyframe interval of 5 seconds, if none was
-   * specified on the command line.
-   */
-  if (!stream->config.have_kf_max_dist) {
-    double framerate = (double)global->framerate.num / global->framerate.den;
-    if (framerate > 0.0)
-      stream->config.cfg.kf_max_dist = (unsigned int)(5.0 * framerate);
-  }
-}
-
 static const char* file_type_to_string(enum VideoFileType t) {
   switch (t) {
     case FILE_TYPE_RAW: return "RAW";
@@ -2087,8 +2073,6 @@
                      stream->config.cfg.g_timebase.num = global.framerate.den);
     }
 
-    FOREACH_STREAM(set_default_kf_interval(stream, &global));
-
     /* Show configuration */
     if (global.verbose && pass == 0)
       FOREACH_STREAM(show_stream_config(stream, &global, &input));