Merge "Removing gold_is_last, alt_is_last, gold_is_alt flags."

diff --git a/test/set_maps.sh b/test/set_maps.sh
new file mode 100755
index 0000000..e7c8d43
--- /dev/null
+++ b/test/set_maps.sh

@@ -0,0 +1,59 @@
+#!/bin/sh
+##
+##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file tests the libvpx set_maps example. To add new tests to this file,
+##  do the following:
+##    1. Write a shell function (this is your test).
+##    2. Add the function to set_maps_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required, and set_maps must exist in
+# $LIBVPX_BIN_PATH.
+set_maps_verify_environment() {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ -z "$(vpx_tool_path set_maps)" ]; then
+    elog "set_maps not found. It must exist in LIBVPX_BIN_PATH or its parent."
+    return 1
+  fi
+}
+
+# Runs set_maps using the codec specified by $1.
+set_maps() {
+  local encoder="$(vpx_tool_path set_maps)"
+  local codec="$1"
+  local output_file="${VPX_TEST_OUTPUT_DIR}/set_maps_${codec}.ivf"
+
+  eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
+      ${devnull}
+
+  [ -e "${output_file}" ] || return 1
+}
+
+set_maps_vp8() {
+  if [ "$(vp8_encode_available)" = "yes" ]; then
+    set_maps vp8 || return 1
+  fi
+}
+
+set_maps_vp9() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    set_maps vp9 || return 1
+  fi
+}
+
+set_maps_tests="set_maps_vp8
+                set_maps_vp9"
+
+run_tests set_maps_verify_environment "${set_maps_tests}"

diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index 6619fb1..ff42725 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc

@@ -105,7 +105,7 @@
 INSTANTIATE_TEST_CASE_P(C, SubtractBlockTest,
                         ::testing::Values(vp8_subtract_b_c));
 
-#if HAVE_NEON_ASM
+#if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(NEON, SubtractBlockTest,
                         ::testing::Values(vp8_subtract_b_neon));
 #endif

diff --git a/test/svc_test.cc b/test/svc_test.cc
index e9cf38d..1cb01a4 100644
--- a/test/svc_test.cc
+++ b/test/svc_test.cc

@@ -13,6 +13,9 @@
 #include "test/codec_factory.h"
 #include "test/decode_test_driver.h"
 #include "test/i420_video_source.h"
+
+#include "vp9/decoder/vp9_decoder.h"
+
 #include "vpx/svc_context.h"
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
@@ -21,6 +24,7 @@
 
 using libvpx_test::CodecFactory;
 using libvpx_test::Decoder;
+using libvpx_test::DxDataIterator;
 using libvpx_test::VP9CodecFactory;
 
 class SvcTest : public ::testing::Test {
@@ -62,9 +66,213 @@
   }
 
   virtual void TearDown() {
-    vpx_svc_release(&svc_);
+    ReleaseEncoder();
     delete(decoder_);
+  }
+
+  void InitializeEncoder() {
+    const vpx_codec_err_t res =
+        vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+    EXPECT_EQ(VPX_CODEC_OK, res);
+    codec_initialized_ = true;
+  }
+
+  void ReleaseEncoder() {
+    vpx_svc_release(&svc_);
     if (codec_initialized_) vpx_codec_destroy(&codec_);
+    codec_initialized_ = false;
+  }
+
+  void Pass1EncodeNFrames(const int n, const int layers,
+                          std::string *const stats_buf) {
+    vpx_codec_err_t res;
+    size_t stats_size = 0;
+    const char *stats_data = NULL;
+
+    ASSERT_GT(n, 0);
+    ASSERT_GT(layers, 0);
+    svc_.spatial_layers = layers;
+    codec_enc_.g_pass = VPX_RC_FIRST_PASS;
+    InitializeEncoder();
+
+    libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
+                                       codec_enc_.g_timebase.den,
+                                       codec_enc_.g_timebase.num, 0, 30);
+    video.Begin();
+
+    for (int i = 0; i < n; ++i) {
+      res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
+                           video.duration(), VPX_DL_GOOD_QUALITY);
+      ASSERT_EQ(VPX_CODEC_OK, res);
+      stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
+      EXPECT_GT(stats_size, 0U);
+      stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
+      ASSERT_TRUE(stats_data != NULL);
+      stats_buf->append(stats_data, stats_size);
+      video.Next();
+    }
+
+    // Flush encoder and test EOS packet
+    res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(),
+                         video.duration(), VPX_DL_GOOD_QUALITY);
+    stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
+    EXPECT_GT(stats_size, 0U);
+    stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
+    ASSERT_TRUE(stats_data != NULL);
+    stats_buf->append(stats_data, stats_size);
+
+    ReleaseEncoder();
+  }
+
+  void StoreFrames(const size_t max_frame_received,
+                   struct vpx_fixed_buf *const outputs,
+                   size_t *const frame_received) {
+    size_t frame_size;
+    while ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
+      ASSERT_LT(*frame_received, max_frame_received);
+
+      if (*frame_received == 0) {
+        EXPECT_EQ(1, vpx_svc_is_keyframe(&svc_));
+      }
+
+      outputs[*frame_received].buf = malloc(frame_size);
+      ASSERT_TRUE(outputs[*frame_received].buf != NULL);
+      memcpy(outputs[*frame_received].buf, vpx_svc_get_buffer(&svc_),
+             frame_size);
+      outputs[*frame_received].sz = frame_size;
+      ++(*frame_received);
+    }
+  }
+
+  void Pass2EncodeNFrames(std::string *const stats_buf,
+                          const int n, const int layers,
+                          struct vpx_fixed_buf *const outputs) {
+    vpx_codec_err_t res;
+    size_t frame_received = 0;
+
+    ASSERT_TRUE(outputs != NULL);
+    ASSERT_GT(n, 0);
+    ASSERT_GT(layers, 0);
+    svc_.spatial_layers = layers;
+    codec_enc_.rc_target_bitrate = 500;
+    if (codec_enc_.g_pass == VPX_RC_LAST_PASS) {
+      ASSERT_TRUE(stats_buf != NULL);
+      ASSERT_GT(stats_buf->size(), 0U);
+      codec_enc_.rc_twopass_stats_in.buf = &(*stats_buf)[0];
+      codec_enc_.rc_twopass_stats_in.sz = stats_buf->size();
+    }
+    InitializeEncoder();
+
+    libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
+                                       codec_enc_.g_timebase.den,
+                                       codec_enc_.g_timebase.num, 0, 30);
+    video.Begin();
+
+    for (int i = 0; i < n; ++i) {
+      res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
+                           video.duration(), VPX_DL_GOOD_QUALITY);
+      ASSERT_EQ(VPX_CODEC_OK, res);
+      StoreFrames(n, outputs, &frame_received);
+      video.Next();
+    }
+
+    // Flush Encoder
+    res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
+                         video.duration(), VPX_DL_GOOD_QUALITY);
+    EXPECT_EQ(VPX_CODEC_OK, res);
+    StoreFrames(n, outputs, &frame_received);
+
+    EXPECT_EQ(frame_received, (size_t)n);
+
+    ReleaseEncoder();
+  }
+
+  void DecodeNFrames(const struct vpx_fixed_buf *const inputs, const int n) {
+    int decoded_frames = 0;
+    int received_frames = 0;
+
+    ASSERT_TRUE(inputs != NULL);
+    ASSERT_GT(n, 0);
+
+    for (int i = 0; i < n; ++i) {
+      ASSERT_TRUE(inputs[i].buf != NULL);
+      ASSERT_GT(inputs[i].sz, 0U);
+      const vpx_codec_err_t res_dec =
+          decoder_->DecodeFrame(static_cast<const uint8_t *>(inputs[i].buf),
+                                inputs[i].sz);
+      ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+      ++decoded_frames;
+
+      DxDataIterator dec_iter = decoder_->GetDxData();
+      while (dec_iter.Next()) {
+        ++received_frames;
+      }
+    }
+    EXPECT_EQ(decoded_frames, n);
+    EXPECT_EQ(received_frames, n);
+  }
+
+  void DropEnhancementLayers(struct vpx_fixed_buf *const inputs,
+                             const int num_super_frames,
+                             const int remained_layers) {
+    ASSERT_TRUE(inputs != NULL);
+    ASSERT_GT(num_super_frames, 0);
+    ASSERT_GT(remained_layers, 0);
+
+    for (int i = 0; i < num_super_frames; ++i) {
+      uint32_t frame_sizes[8] = {0};
+      int frame_count = 0;
+      int frames_found = 0;
+      int frame;
+      ASSERT_TRUE(inputs[i].buf != NULL);
+      ASSERT_GT(inputs[i].sz, 0U);
+
+      vpx_codec_err_t res =
+          vp9_parse_superframe_index(static_cast<const uint8_t*>(inputs[i].buf),
+                                     inputs[i].sz, frame_sizes, &frame_count,
+                                     NULL, NULL);
+      ASSERT_EQ(VPX_CODEC_OK, res);
+
+      uint8_t *frame_data = static_cast<uint8_t *>(inputs[i].buf);
+      uint8_t *frame_start = frame_data;
+      for (frame = 0; frame < frame_count; ++frame) {
+        // Looking for a visible frame
+        if (frame_data[0] & 0x02) {
+          ++frames_found;
+          if (frames_found == remained_layers)
+            break;
+        }
+        frame_data += frame_sizes[frame];
+      }
+      ASSERT_LT(frame, frame_count);
+      if (frame == frame_count - 1)
+        continue;
+
+      frame_data += frame_sizes[frame];
+      uint8_t marker =
+          static_cast<const uint8_t *>(inputs[i].buf)[inputs[i].sz - 1];
+      const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+      const size_t index_sz = 2 + mag * frame_count;
+      const size_t new_index_sz = 2 + mag * (frame + 1);
+      marker &= 0x0f8;
+      marker |= frame;
+      frame_data[0] = marker;
+      memcpy(frame_data + 1, frame_start + inputs[i].sz - index_sz + 1,
+             new_index_sz - 2);
+      frame_data[new_index_sz - 1] = marker;
+      inputs[i].sz = frame_data - frame_start + new_index_sz;
+    }
+  }
+
+  void FreeBitstreamBuffers(struct vpx_fixed_buf *const inputs, const int n) {
+    ASSERT_TRUE(inputs != NULL);
+    ASSERT_GT(n, 0);
+
+    for (int i = 0; i < n; ++i) {
+      free(inputs[i].buf);
+      inputs[i].buf = NULL;
+      inputs[i].sz = 0;
+    }
   }
 
   SvcContext svc_;
@@ -93,9 +301,7 @@
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
   svc_.spatial_layers = 0;  // use default layers
-  res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
   EXPECT_EQ(VPX_SS_DEFAULT_LAYERS, svc_.spatial_layers);
 }
 
@@ -106,9 +312,7 @@
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
   vpx_svc_set_scale_factors(&svc_, "4/16,16/16");  // valid scale values
-  res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
 }
 
 TEST_F(SvcTest, InvalidOptions) {
@@ -124,18 +328,15 @@
 TEST_F(SvcTest, SetLayersOption) {
   vpx_codec_err_t res = vpx_svc_set_options(&svc_, "layers=3");
   EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
   EXPECT_EQ(3, svc_.spatial_layers);
 }
 
 TEST_F(SvcTest, SetMultipleOptions) {
   vpx_codec_err_t res =
       vpx_svc_set_options(&svc_, "layers=2 scale-factors=1/3,2/3");
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
   EXPECT_EQ(2, svc_.spatial_layers);
 }
 
@@ -149,9 +350,7 @@
 
   res = vpx_svc_set_options(&svc_, "scale-factors=1/3,2/3");
   EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
 }
 
 TEST_F(SvcTest, SetQuantizersOption) {
@@ -162,9 +361,7 @@
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
   vpx_svc_set_options(&svc_, "quantizers=40,45");
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
 }
 
 TEST_F(SvcTest, SetAutoAltRefOption) {
@@ -180,9 +377,7 @@
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
   vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0");
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
 }
 
 TEST_F(SvcTest, SetQuantizers) {
@@ -200,9 +395,7 @@
 
   res = vpx_svc_set_quantizers(&svc_, "40,30");
   EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
 }
 
 TEST_F(SvcTest, SetScaleFactors) {
@@ -220,121 +413,25 @@
 
   res = vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
   EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
 }
 
 // Test that decoder can handle an SVC frame as the first frame in a sequence.
-TEST_F(SvcTest, FirstFrameHasLayers) {
-  svc_.spatial_layers = 2;
-  vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30");
-
-  vpx_codec_err_t res =
-      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-
-  libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
-                                     codec_enc_.g_timebase.den,
-                                     codec_enc_.g_timebase.num, 0, 30);
-  video.Begin();
-
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-
-  if (vpx_svc_get_frame_size(&svc_) == 0) {
-    // Flush encoder
-    res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
-                         video.duration(), VPX_DL_GOOD_QUALITY);
-    EXPECT_EQ(VPX_CODEC_OK, res);
-  }
-
-  int frame_size = vpx_svc_get_frame_size(&svc_);
-  EXPECT_GT(frame_size, 0);
-  const vpx_codec_err_t res_dec = decoder_->DecodeFrame(
-      static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-
-  // this test fails with a decoder error
-  ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+TEST_F(SvcTest, OnePassEncodeOneFrame) {
+  codec_enc_.g_pass = VPX_RC_ONE_PASS;
+  vpx_fixed_buf output = {0};
+  Pass2EncodeNFrames(NULL, 1, 2, &output);
+  DecodeNFrames(&output, 1);
+  FreeBitstreamBuffers(&output, 1);
 }
 
-TEST_F(SvcTest, EncodeThreeFrames) {
-  svc_.spatial_layers = 2;
-  vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30");
-  int decoded_frames = 0;
-  vpx_codec_err_t res_dec;
-  int frame_size;
-
-  vpx_codec_err_t res =
-      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-
-  libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
-                                     codec_enc_.g_timebase.den,
-                                     codec_enc_.g_timebase.num, 0, 30);
-  // FRAME 0
-  video.Begin();
-  // This frame is a keyframe.
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-
-  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
-
-  // FRAME 1
-  video.Next();
-  // This is a P-frame.
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-
-  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
-
-  // FRAME 2
-  video.Next();
-  // This is a P-frame.
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-
-  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
-
-  // Flush encoder
-  res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-
-  while ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
-
-  EXPECT_EQ(decoded_frames, 3);
+TEST_F(SvcTest, OnePassEncodeThreeFrames) {
+  codec_enc_.g_pass = VPX_RC_ONE_PASS;
+  vpx_fixed_buf outputs[3];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(NULL, 3, 2, &outputs[0]);
+  DecodeNFrames(&outputs[0], 3);
+  FreeBitstreamBuffers(&outputs[0], 3);
 }
 
 TEST_F(SvcTest, GetLayerResolution) {
@@ -342,14 +439,11 @@
   vpx_svc_set_scale_factors(&svc_, "4/16,8/16");
   vpx_svc_set_quantizers(&svc_, "40,30");
 
-  vpx_codec_err_t res =
-      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+  InitializeEncoder();
 
   // ensure that requested layer is a valid layer
   uint32_t layer_width, layer_height;
-  res = vpx_svc_get_layer_resolution(&svc_, svc_.spatial_layers,
+  vpx_codec_err_t res = vpx_svc_get_layer_resolution(&svc_, svc_.spatial_layers,
                                      &layer_width, &layer_height);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
@@ -373,131 +467,113 @@
   EXPECT_EQ(kHeight * 8 / 16, layer_height);
 }
 
-TEST_F(SvcTest, TwoPassEncode) {
+TEST_F(SvcTest, TwoPassEncode10Frames) {
   // First pass encode
   std::string stats_buf;
-  svc_.spatial_layers = 2;
-  codec_enc_.g_pass = VPX_RC_FIRST_PASS;
-  vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30");
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
-
-  vpx_codec_err_t res =
-      vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
-
-  libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
-                                     codec_enc_.g_timebase.den,
-                                     codec_enc_.g_timebase.num, 0, 30);
-  // FRAME 0
-  video.Begin();
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-  size_t stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
-  EXPECT_GT(stats_size, 0U);
-  const char *stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
-  ASSERT_TRUE(stats_data != NULL);
-  stats_buf.append(stats_data, stats_size);
-
-  // FRAME 1
-  video.Next();
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
-  EXPECT_GT(stats_size, 0U);
-  stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
-  ASSERT_TRUE(stats_data != NULL);
-  stats_buf.append(stats_data, stats_size);
-
-  // Flush encoder and test EOS packet
-  res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
-  EXPECT_GT(stats_size, 0U);
-  stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
-  ASSERT_TRUE(stats_data != NULL);
-  stats_buf.append(stats_data, stats_size);
-
-  // Tear down encoder
-  vpx_svc_release(&svc_);
-  vpx_codec_destroy(&codec_);
+  Pass1EncodeNFrames(10, 2, &stats_buf);
 
   // Second pass encode
-  int decoded_frames = 0;
-  vpx_codec_err_t res_dec;
-  int frame_size;
   codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode20FramesWithAltRef) {
+  // First pass encode
+  std::string stats_buf;
+  Pass1EncodeNFrames(20, 2, &stats_buf);
+
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
   vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
-  codec_enc_.rc_twopass_stats_in.buf = &stats_buf[0];
-  codec_enc_.rc_twopass_stats_in.sz = stats_buf.size();
+  vpx_fixed_buf outputs[20];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]);
+  DecodeNFrames(&outputs[0], 20);
+  FreeBitstreamBuffers(&outputs[0], 20);
+}
 
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  ASSERT_EQ(VPX_CODEC_OK, res);
-  codec_initialized_ = true;
+TEST_F(SvcTest, TwoPassEncode2LayersDecodeBaseLayerOnly) {
+  // First pass encode
+  std::string stats_buf;
+  Pass1EncodeNFrames(10, 2, &stats_buf);
 
-  // FRAME 0
-  video.Begin();
-  // This frame is a keyframe.
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+  DropEnhancementLayers(&outputs[0], 10, 1);
+  DecodeNFrames(&outputs[0], 10);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
 
-  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
+TEST_F(SvcTest, TwoPassEncode5LayersDecode54321Layers) {
+  // First pass encode
+  std::string stats_buf;
+  Pass1EncodeNFrames(10, 5, &stats_buf);
 
-  // FRAME 1
-  video.Next();
-  // This is a P-frame.
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0");
+  vpx_fixed_buf outputs[10];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 10, 5, &outputs[0]);
 
-  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
+  DecodeNFrames(&outputs[0], 10);
+  DropEnhancementLayers(&outputs[0], 10, 4);
+  DecodeNFrames(&outputs[0], 10);
+  DropEnhancementLayers(&outputs[0], 10, 3);
+  DecodeNFrames(&outputs[0], 10);
+  DropEnhancementLayers(&outputs[0], 10, 2);
+  DecodeNFrames(&outputs[0], 10);
+  DropEnhancementLayers(&outputs[0], 10, 1);
+  DecodeNFrames(&outputs[0], 10);
 
-  // FRAME 2
-  video.Next();
-  // This is a P-frame.
-  res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  ASSERT_EQ(VPX_CODEC_OK, res);
+  FreeBitstreamBuffers(&outputs[0], 10);
+}
 
-  if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
+TEST_F(SvcTest, TwoPassEncode2SNRLayers) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1");
+  Pass1EncodeNFrames(20, 2, &stats_buf);
 
-  // Flush encoder
-  res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
-                       video.duration(), VPX_DL_GOOD_QUALITY);
-  EXPECT_EQ(VPX_CODEC_OK, res);
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  vpx_svc_set_options(&svc_,
+                      "auto-alt-refs=1,1 scale-factors=1/1,1/1");
+  vpx_fixed_buf outputs[20];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]);
+  DecodeNFrames(&outputs[0], 20);
+  FreeBitstreamBuffers(&outputs[0], 20);
+}
 
-  while ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-    EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
-    res_dec = decoder_->DecodeFrame(
-        static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-    ++decoded_frames;
-  }
+TEST_F(SvcTest, TwoPassEncode3SNRLayersDecode321Layers) {
+  // First pass encode
+  std::string stats_buf;
+  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1");
+  Pass1EncodeNFrames(20, 3, &stats_buf);
 
-  EXPECT_EQ(decoded_frames, 3);
+  // Second pass encode
+  codec_enc_.g_pass = VPX_RC_LAST_PASS;
+  vpx_svc_set_options(&svc_,
+                      "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1");
+  vpx_fixed_buf outputs[20];
+  memset(&outputs[0], 0, sizeof(outputs));
+  Pass2EncodeNFrames(&stats_buf, 20, 3, &outputs[0]);
+  DecodeNFrames(&outputs[0], 20);
+  DropEnhancementLayers(&outputs[0], 20, 2);
+  DecodeNFrames(&outputs[0], 20);
+  DropEnhancementLayers(&outputs[0], 20, 1);
+  DecodeNFrames(&outputs[0], 20);
+
+  FreeBitstreamBuffers(&outputs[0], 20);
 }
 
 }  // namespace

diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index fd9afd2..204cbf0 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl

@@ -456,9 +456,8 @@
 $vp8_short_fdct8x4_neon_asm=vp8_short_fdct8x4_neon;
 
 add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_walsh4x4 sse2 media neon_asm/;
+specialize qw/vp8_short_walsh4x4 sse2 media neon/;
 $vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6;
-$vp8_short_walsh4x4_neon_asm=vp8_short_walsh4x4_neon;
 
 #
 # Quantizer
@@ -503,19 +502,16 @@
 $vp8_mbuverror_sse2=vp8_mbuverror_xmm;
 
 add_proto qw/void vp8_subtract_b/, "struct block *be, struct blockd *bd, int pitch";
-specialize qw/vp8_subtract_b mmx sse2 media neon_asm/;
+specialize qw/vp8_subtract_b mmx sse2 media neon/;
 $vp8_subtract_b_media=vp8_subtract_b_armv6;
-$vp8_subtract_b_neon_asm=vp8_subtract_b_neon;
 
 add_proto qw/void vp8_subtract_mby/, "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride";
-specialize qw/vp8_subtract_mby mmx sse2 media neon_asm/;
+specialize qw/vp8_subtract_mby mmx sse2 media neon/;
 $vp8_subtract_mby_media=vp8_subtract_mby_armv6;
-$vp8_subtract_mby_neon_asm=vp8_subtract_mby_neon;
 
 add_proto qw/void vp8_subtract_mbuv/, "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride";
-specialize qw/vp8_subtract_mbuv mmx sse2 media neon_asm/;
+specialize qw/vp8_subtract_mbuv mmx sse2 media neon/;
 $vp8_subtract_mbuv_media=vp8_subtract_mbuv_armv6;
-$vp8_subtract_mbuv_neon_asm=vp8_subtract_mbuv_neon;
 
 #
 # Motion search

diff --git a/vp8/encoder/arm/neon/subtract_neon.asm b/vp8/encoder/arm/neon/subtract_neon.asm
deleted file mode 100644
index 840cb33..0000000
--- a/vp8/encoder/arm/neon/subtract_neon.asm
+++ /dev/null

@@ -1,205 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT |vp8_subtract_b_neon|
-    EXPORT |vp8_subtract_mby_neon|
-    EXPORT |vp8_subtract_mbuv_neon|
-
-    INCLUDE vp8_asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
-|vp8_subtract_b_neon| PROC
-
-    stmfd   sp!, {r4-r7}
-
-    ldr     r3, [r0, #vp8_block_base_src]
-    ldr     r4, [r0, #vp8_block_src]
-    ldr     r5, [r0, #vp8_block_src_diff]
-    ldr     r3, [r3]
-    ldr     r6, [r0, #vp8_block_src_stride]
-    add     r3, r3, r4                      ; src = *base_src + src
-    ldr     r7, [r1, #vp8_blockd_predictor]
-
-    vld1.8          {d0}, [r3], r6          ;load src
-    vld1.8          {d1}, [r7], r2          ;load pred
-    vld1.8          {d2}, [r3], r6
-    vld1.8          {d3}, [r7], r2
-    vld1.8          {d4}, [r3], r6
-    vld1.8          {d5}, [r7], r2
-    vld1.8          {d6}, [r3], r6
-    vld1.8          {d7}, [r7], r2
-
-    vsubl.u8        q10, d0, d1
-    vsubl.u8        q11, d2, d3
-    vsubl.u8        q12, d4, d5
-    vsubl.u8        q13, d6, d7
-
-    mov             r2, r2, lsl #1
-
-    vst1.16         {d20}, [r5], r2         ;store diff
-    vst1.16         {d22}, [r5], r2
-    vst1.16         {d24}, [r5], r2
-    vst1.16         {d26}, [r5], r2
-
-    ldmfd   sp!, {r4-r7}
-    bx              lr
-
-    ENDP
-
-
-;==========================================
-;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride
-;                           unsigned char *pred, int pred_stride)
-|vp8_subtract_mby_neon| PROC
-    push            {r4-r7}
-    vpush           {d8-d15}
-
-    mov             r12, #4
-    ldr             r4, [sp, #80]           ; pred_stride
-    mov             r6, #32                 ; "diff" stride x2
-    add             r5, r0, #16             ; second diff pointer
-
-subtract_mby_loop
-    vld1.8          {q0}, [r1], r2          ;load src
-    vld1.8          {q1}, [r3], r4          ;load pred
-    vld1.8          {q2}, [r1], r2
-    vld1.8          {q3}, [r3], r4
-    vld1.8          {q4}, [r1], r2
-    vld1.8          {q5}, [r3], r4
-    vld1.8          {q6}, [r1], r2
-    vld1.8          {q7}, [r3], r4
-
-    vsubl.u8        q8, d0, d2
-    vsubl.u8        q9, d1, d3
-    vsubl.u8        q10, d4, d6
-    vsubl.u8        q11, d5, d7
-    vsubl.u8        q12, d8, d10
-    vsubl.u8        q13, d9, d11
-    vsubl.u8        q14, d12, d14
-    vsubl.u8        q15, d13, d15
-
-    vst1.16         {q8}, [r0], r6          ;store diff
-    vst1.16         {q9}, [r5], r6
-    vst1.16         {q10}, [r0], r6
-    vst1.16         {q11}, [r5], r6
-    vst1.16         {q12}, [r0], r6
-    vst1.16         {q13}, [r5], r6
-    vst1.16         {q14}, [r0], r6
-    vst1.16         {q15}, [r5], r6
-
-    subs            r12, r12, #1
-    bne             subtract_mby_loop
-
-    vpop            {d8-d15}
-    pop             {r4-r7}
-    bx              lr
-    ENDP
-
-;=================================
-;void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
-;                         int src_stride, unsigned char *upred,
-;                         unsigned char *vpred, int pred_stride)
-
-|vp8_subtract_mbuv_neon| PROC
-    push            {r4-r7}
-    vpush           {d8-d15}
-
-    ldr             r4, [sp, #80]       ; upred
-    ldr             r5, [sp, #84]       ; vpred
-    ldr             r6, [sp, #88]       ; pred_stride
-    add             r0, r0, #512        ; short *udiff = diff + 256;
-    mov             r12, #32            ; "diff" stride x2
-    add             r7, r0, #16         ; second diff pointer
-
-;u
-    vld1.8          {d0}, [r1], r3      ;load usrc
-    vld1.8          {d1}, [r4], r6      ;load upred
-    vld1.8          {d2}, [r1], r3
-    vld1.8          {d3}, [r4], r6
-    vld1.8          {d4}, [r1], r3
-    vld1.8          {d5}, [r4], r6
-    vld1.8          {d6}, [r1], r3
-    vld1.8          {d7}, [r4], r6
-    vld1.8          {d8}, [r1], r3
-    vld1.8          {d9}, [r4], r6
-    vld1.8          {d10}, [r1], r3
-    vld1.8          {d11}, [r4], r6
-    vld1.8          {d12}, [r1], r3
-    vld1.8          {d13}, [r4], r6
-    vld1.8          {d14}, [r1], r3
-    vld1.8          {d15}, [r4], r6
-
-    vsubl.u8        q8, d0, d1
-    vsubl.u8        q9, d2, d3
-    vsubl.u8        q10, d4, d5
-    vsubl.u8        q11, d6, d7
-    vsubl.u8        q12, d8, d9
-    vsubl.u8        q13, d10, d11
-    vsubl.u8        q14, d12, d13
-    vsubl.u8        q15, d14, d15
-
-    vst1.16         {q8}, [r0], r12     ;store diff
-    vst1.16         {q9}, [r7], r12
-    vst1.16         {q10}, [r0], r12
-    vst1.16         {q11}, [r7], r12
-    vst1.16         {q12}, [r0], r12
-    vst1.16         {q13}, [r7], r12
-    vst1.16         {q14}, [r0], r12
-    vst1.16         {q15}, [r7], r12
-
-;v
-    vld1.8          {d0}, [r2], r3      ;load vsrc
-    vld1.8          {d1}, [r5], r6      ;load vpred
-    vld1.8          {d2}, [r2], r3
-    vld1.8          {d3}, [r5], r6
-    vld1.8          {d4}, [r2], r3
-    vld1.8          {d5}, [r5], r6
-    vld1.8          {d6}, [r2], r3
-    vld1.8          {d7}, [r5], r6
-    vld1.8          {d8}, [r2], r3
-    vld1.8          {d9}, [r5], r6
-    vld1.8          {d10}, [r2], r3
-    vld1.8          {d11}, [r5], r6
-    vld1.8          {d12}, [r2], r3
-    vld1.8          {d13}, [r5], r6
-    vld1.8          {d14}, [r2], r3
-    vld1.8          {d15}, [r5], r6
-
-    vsubl.u8        q8, d0, d1
-    vsubl.u8        q9, d2, d3
-    vsubl.u8        q10, d4, d5
-    vsubl.u8        q11, d6, d7
-    vsubl.u8        q12, d8, d9
-    vsubl.u8        q13, d10, d11
-    vsubl.u8        q14, d12, d13
-    vsubl.u8        q15, d14, d15
-
-    vst1.16         {q8}, [r0], r12     ;store diff
-    vst1.16         {q9}, [r7], r12
-    vst1.16         {q10}, [r0], r12
-    vst1.16         {q11}, [r7], r12
-    vst1.16         {q12}, [r0], r12
-    vst1.16         {q13}, [r7], r12
-    vst1.16         {q14}, [r0], r12
-    vst1.16         {q15}, [r7], r12
-
-    vpop            {d8-d15}
-    pop             {r4-r7}
-    bx              lr
-
-    ENDP
-
-    END

diff --git a/vp8/encoder/arm/neon/subtract_neon.c b/vp8/encoder/arm/neon/subtract_neon.c
new file mode 100644
index 0000000..d3ab7b1
--- /dev/null
+++ b/vp8/encoder/arm/neon/subtract_neon.c

@@ -0,0 +1,154 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp8/encoder/block.h"
+
+void vp8_subtract_b_neon(
+        BLOCK *be,
+        BLOCKD *bd,
+        int pitch) {
+    unsigned char *src_ptr, *predictor;
+    int src_stride;
+    int16_t *src_diff;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    uint16x8_t q10u16, q11u16, q12u16, q13u16;
+
+    src_ptr = *be->base_src + be->src;
+    src_stride = be->src_stride;
+    predictor = bd->predictor;
+
+    d0u8 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    d2u8 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    d4u8 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    d6u8 = vld1_u8(src_ptr);
+
+    d1u8 = vld1_u8(predictor);
+    predictor += pitch;
+    d3u8 = vld1_u8(predictor);
+    predictor += pitch;
+    d5u8 = vld1_u8(predictor);
+    predictor += pitch;
+    d7u8 = vld1_u8(predictor);
+
+    q10u16 = vsubl_u8(d0u8, d1u8);
+    q11u16 = vsubl_u8(d2u8, d3u8);
+    q12u16 = vsubl_u8(d4u8, d5u8);
+    q13u16 = vsubl_u8(d6u8, d7u8);
+
+    src_diff = be->src_diff;
+    vst1_u16((uint16_t *)src_diff, vget_low_u16(q10u16));
+    src_diff += pitch;
+    vst1_u16((uint16_t *)src_diff, vget_low_u16(q11u16));
+    src_diff += pitch;
+    vst1_u16((uint16_t *)src_diff, vget_low_u16(q12u16));
+    src_diff += pitch;
+    vst1_u16((uint16_t *)src_diff, vget_low_u16(q13u16));
+    return;
+}
+
+void vp8_subtract_mby_neon(
+        int16_t *diff,
+        unsigned char *src,
+        int src_stride,
+        unsigned char *pred,
+        int pred_stride) {
+    int i;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+    for (i = 0; i < 8; i++) {  // subtract_mby_loop
+        q0u8 = vld1q_u8(src);
+        src += src_stride;
+        q2u8 = vld1q_u8(src);
+        src += src_stride;
+        q1u8 = vld1q_u8(pred);
+        pred += pred_stride;
+        q3u8 = vld1q_u8(pred);
+        pred += pred_stride;
+
+        q8u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q1u8));
+        q9u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q1u8));
+        q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q3u8));
+        q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q3u8));
+
+        vst1q_u16((uint16_t *)diff, q8u16);
+        diff += 8;
+        vst1q_u16((uint16_t *)diff, q9u16);
+        diff += 8;
+        vst1q_u16((uint16_t *)diff, q10u16);
+        diff += 8;
+        vst1q_u16((uint16_t *)diff, q11u16);
+        diff += 8;
+    }
+    return;
+}
+
+void vp8_subtract_mbuv_neon(
+        int16_t *diff,
+        unsigned char *usrc,
+        unsigned char *vsrc,
+        int src_stride,
+        unsigned char *upred,
+        unsigned char *vpred,
+        int pred_stride) {
+    int i, j;
+    unsigned char *src_ptr, *pred_ptr;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+    diff += 256;
+    for (i = 0; i < 2; i++) {
+        if (i == 0) {
+            src_ptr = usrc;
+            pred_ptr = upred;
+        } else if (i == 1) {
+            src_ptr = vsrc;
+            pred_ptr = vpred;
+        }
+
+        for (j = 0; j < 2; j++) {
+            d0u8 = vld1_u8(src_ptr);
+            src_ptr += src_stride;
+            d1u8 = vld1_u8(pred_ptr);
+            pred_ptr += pred_stride;
+            d2u8 = vld1_u8(src_ptr);
+            src_ptr += src_stride;
+            d3u8 = vld1_u8(pred_ptr);
+            pred_ptr += pred_stride;
+            d4u8 = vld1_u8(src_ptr);
+            src_ptr += src_stride;
+            d5u8 = vld1_u8(pred_ptr);
+            pred_ptr += pred_stride;
+            d6u8 = vld1_u8(src_ptr);
+            src_ptr += src_stride;
+            d7u8 = vld1_u8(pred_ptr);
+            pred_ptr += pred_stride;
+
+            q8u16  = vsubl_u8(d0u8, d1u8);
+            q9u16  = vsubl_u8(d2u8, d3u8);
+            q10u16 = vsubl_u8(d4u8, d5u8);
+            q11u16 = vsubl_u8(d6u8, d7u8);
+
+            vst1q_u16((uint16_t *)diff, q8u16);
+            diff += 8;
+            vst1q_u16((uint16_t *)diff, q9u16);
+            diff += 8;
+            vst1q_u16((uint16_t *)diff, q10u16);
+            diff += 8;
+            vst1q_u16((uint16_t *)diff, q11u16);
+            diff += 8;
+        }
+    }
+    return;
+}

diff --git a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
deleted file mode 100644
index 22266297..0000000
--- a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
+++ /dev/null

@@ -1,103 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_short_walsh4x4_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
-; r0   short *input,
-; r1   short *output,
-; r2   int pitch
-|vp8_short_walsh4x4_neon| PROC
-
-    vld1.16         {d0}, [r0@64], r2   ; load input
-    vld1.16         {d1}, [r0@64], r2
-    vld1.16         {d2}, [r0@64], r2
-    vld1.16         {d3}, [r0@64]
-
-    ;First for-loop
-    ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
-    vtrn.32         d0, d2
-    vtrn.32         d1, d3
-
-    vmov.s32        q15, #3             ; add 3 to all values
-
-    vtrn.16         d0, d1
-    vtrn.16         d2, d3
-
-    vadd.s16        d4, d0, d2          ; ip[0] + ip[2]
-    vadd.s16        d5, d1, d3          ; ip[1] + ip[3]
-    vsub.s16        d6, d1, d3          ; ip[1] - ip[3]
-    vsub.s16        d7, d0, d2          ; ip[0] - ip[2]
-
-    vshl.s16        d4, d4, #2          ; a1 = (ip[0] + ip[2]) << 2
-    vshl.s16        d5, d5, #2          ; d1 = (ip[1] + ip[3]) << 2
-    vshl.s16        d6, d6, #2          ; c1 = (ip[1] - ip[3]) << 2
-    vceq.s16        d16, d4, #0         ; a1 == 0
-    vshl.s16        d7, d7, #2          ; b1 = (ip[0] - ip[2]) << 2
-
-    vadd.s16        d0, d4, d5          ; a1 + d1
-    vmvn            d16, d16            ; a1 != 0
-    vsub.s16        d3, d4, d5          ; op[3] = a1 - d1
-    vadd.s16        d1, d7, d6          ; op[1] = b1 + c1
-    vsub.s16        d2, d7, d6          ; op[2] = b1 - c1
-    vsub.s16        d0, d0, d16         ; op[0] = a1 + d1 + (a1 != 0)
-
-    ;Second for-loop
-    ;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
-    vtrn.32         d1, d3
-    vtrn.32         d0, d2
-    vtrn.16         d2, d3
-    vtrn.16         d0, d1
-
-    vaddl.s16       q8, d0, d2          ; a1 = ip[0]+ip[8]
-    vaddl.s16       q9, d1, d3          ; d1 = ip[4]+ip[12]
-    vsubl.s16       q10, d1, d3         ; c1 = ip[4]-ip[12]
-    vsubl.s16       q11, d0, d2         ; b1 = ip[0]-ip[8]
-
-    vadd.s32        q0, q8, q9          ; a2 = a1 + d1
-    vadd.s32        q1, q11, q10        ; b2 = b1 + c1
-    vsub.s32        q2, q11, q10        ; c2 = b1 - c1
-    vsub.s32        q3, q8, q9          ; d2 = a1 - d1
-
-    vclt.s32        q8, q0, #0
-    vclt.s32        q9, q1, #0
-    vclt.s32        q10, q2, #0
-    vclt.s32        q11, q3, #0
-
-    ; subtract -1 (or 0)
-    vsub.s32        q0, q0, q8          ; a2 += a2 < 0
-    vsub.s32        q1, q1, q9          ; b2 += b2 < 0
-    vsub.s32        q2, q2, q10         ; c2 += c2 < 0
-    vsub.s32        q3, q3, q11         ; d2 += d2 < 0
-
-    vadd.s32        q8, q0, q15         ; a2 + 3
-    vadd.s32        q9, q1, q15         ; b2 + 3
-    vadd.s32        q10, q2, q15        ; c2 + 3
-    vadd.s32        q11, q3, q15        ; d2 + 3
-
-    ; vrshrn? would add 1 << 3-1 = 2
-    vshrn.s32       d0, q8, #3
-    vshrn.s32       d1, q9, #3
-    vshrn.s32       d2, q10, #3
-    vshrn.s32       d3, q11, #3
-
-    vst1.16         {q0, q1}, [r1@128]
-
-    bx              lr
-
-    ENDP
-
-    END

diff --git a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
new file mode 100644
index 0000000..d6b67f8
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c

@@ -0,0 +1,118 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_short_walsh4x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+    uint16x4_t d16u16;
+    int16x8_t q0s16, q1s16;
+    int16x4_t dEmptys16, d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int32x4_t qEmptys32, q0s32, q1s32, q2s32, q3s32, q8s32;
+    int32x4_t q9s32, q10s32, q11s32, q15s32;
+    uint32x4_t q8u32, q9u32, q10u32, q11u32;
+    int16x4x2_t v2tmp0, v2tmp1;
+    int32x2x2_t v2tmp2, v2tmp3;
+
+    dEmptys16 = vdup_n_s16(0);
+    qEmptys32 = vdupq_n_s32(0);
+    q15s32 = vdupq_n_s32(3);
+
+    d0s16 = vld1_s16(input);
+    input += pitch/2;
+    d1s16 = vld1_s16(input);
+    input += pitch/2;
+    d2s16 = vld1_s16(input);
+    input += pitch/2;
+    d3s16 = vld1_s16(input);
+
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[0]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[1]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[1]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[0]);
+
+    d4s16 = vshl_n_s16(d4s16, 2);
+    d5s16 = vshl_n_s16(d5s16, 2);
+    d6s16 = vshl_n_s16(d6s16, 2);
+    d7s16 = vshl_n_s16(d7s16, 2);
+
+    d16u16 = vceq_s16(d4s16, dEmptys16);
+    d16u16 = vmvn_u16(d16u16);
+
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d3s16 = vsub_s16(d4s16, d5s16);
+    d1s16 = vadd_s16(d7s16, d6s16);
+    d2s16 = vsub_s16(d7s16, d6s16);
+
+    d0s16 = vsub_s16(d0s16, vreinterpret_s16_u16(d16u16));
+
+    // Second for-loop
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp2.val[1]));  // d3
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp2.val[0]));  // d1
+
+    q8s32  = vaddl_s16(v2tmp1.val[0], v2tmp0.val[0]);
+    q9s32  = vaddl_s16(v2tmp1.val[1], v2tmp0.val[1]);
+    q10s32 = vsubl_s16(v2tmp1.val[1], v2tmp0.val[1]);
+    q11s32 = vsubl_s16(v2tmp1.val[0], v2tmp0.val[0]);
+
+    q0s32 = vaddq_s32(q8s32, q9s32);
+    q1s32 = vaddq_s32(q11s32, q10s32);
+    q2s32 = vsubq_s32(q11s32, q10s32);
+    q3s32 = vsubq_s32(q8s32, q9s32);
+
+    q8u32  = vcltq_s32(q0s32, qEmptys32);
+    q9u32  = vcltq_s32(q1s32, qEmptys32);
+    q10u32 = vcltq_s32(q2s32, qEmptys32);
+    q11u32 = vcltq_s32(q3s32, qEmptys32);
+
+    q8s32  = vreinterpretq_s32_u32(q8u32);
+    q9s32  = vreinterpretq_s32_u32(q9u32);
+    q10s32 = vreinterpretq_s32_u32(q10u32);
+    q11s32 = vreinterpretq_s32_u32(q11u32);
+
+    q0s32 = vsubq_s32(q0s32, q8s32);
+    q1s32 = vsubq_s32(q1s32, q9s32);
+    q2s32 = vsubq_s32(q2s32, q10s32);
+    q3s32 = vsubq_s32(q3s32, q11s32);
+
+    q8s32  = vaddq_s32(q0s32, q15s32);
+    q9s32  = vaddq_s32(q1s32, q15s32);
+    q10s32 = vaddq_s32(q2s32, q15s32);
+    q11s32 = vaddq_s32(q3s32, q15s32);
+
+    d0s16 = vshrn_n_s32(q8s32, 3);
+    d1s16 = vshrn_n_s32(q9s32, 3);
+    d2s16 = vshrn_n_s32(q10s32, 3);
+    d3s16 = vshrn_n_s32(q11s32, 3);
+
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+
+    vst1q_s16(output, q0s16);
+    vst1q_s16(output + 8, q1s16);
+    return;
+}

diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index d29dd39..38b8999 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c

@@ -3303,15 +3303,17 @@
   int skip = 2;
   // Only select blocks for computing nmse that have been encoded
   // as ZERO LAST min_consec_zero_last frames in a row.
-  int min_consec_zero_last = 10;
+  // Scale with number of temporal layers.
+  int min_consec_zero_last = 8 / cpi->oxcf.number_of_layers;
   // Decision is tested for changing the denoising mode every
   // num_mode_change times this function is called. Note that this
   // function called every 8 frames, so (8 * num_mode_change) is number
   // of frames where denoising mode change is tested for switch.
   int num_mode_change = 15;
   // Framerate factor, to compensate for larger mse at lower framerates.
-  // TODO(marpan): Adjust this factor,
-  int fac_framerate = cpi->output_framerate < 25.0f ? 80 : 100;
+  // Use ref_framerate, which is full source framerate for temporal layers.
+  // TODO(marpan): Adjust this factor.
+  int fac_framerate = cpi->ref_framerate < 25.0f ? 80 : 100;
   int tot_num_blocks = cm->mb_rows * cm->mb_cols;
   int ystride = cpi->Source->y_stride;
   unsigned char *src = cpi->Source->y_buffer;
@@ -3380,13 +3382,13 @@
   // num_mode_change.
   if (cpi->denoiser.nmse_source_diff_count == num_mode_change) {
     // Check for going up: from normal to aggressive mode.
-    if ((cpi->denoiser.denoiser_mode = kDenoiserOnYUV) &&
+    if ((cpi->denoiser.denoiser_mode == kDenoiserOnYUV) &&
         (cpi->denoiser.nmse_source_diff >
         cpi->denoiser.threshold_aggressive_mode)) {
       vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUVAggressive);
     } else {
       // Check for going down: from aggressive to normal mode.
-      if ((cpi->denoiser.denoiser_mode = kDenoiserOnYUVAggressive) &&
+      if ((cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) &&
           (cpi->denoiser.nmse_source_diff <
           cpi->denoiser.threshold_aggressive_mode)) {
         vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUV);

diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 5733048..0b3eed0 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk

@@ -38,9 +38,9 @@
 VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/fastquantizeb_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/picklpf_arm.c
 VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/shortfdct_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/subtract_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/vp8_memcpy_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
 
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/denoising_neon.c
+VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
+VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/subtract_neon.c

diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 1a41558..e79dcf3 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c

@@ -314,3 +314,67 @@
   vp9_clear_system_state();
   return ret;
 }
+
+vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data,
+                                           size_t data_sz,
+                                           uint32_t sizes[8], int *count,
+                                           vpx_decrypt_cb decrypt_cb,
+                                           void *decrypt_state) {
+  // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
+  // it is a super frame index. If the last byte of real video compression
+  // data is 0xc0 the encoder must add a 0 byte. If we have the marker but
+  // not the associated matching marker byte at the front of the index we have
+  // an invalid bitstream and need to return an error.
+
+  uint8_t marker;
+
+  assert(data_sz);
+  marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1);
+  *count = 0;
+
+  if ((marker & 0xe0) == 0xc0) {
+    const uint32_t frames = (marker & 0x7) + 1;
+    const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+    const size_t index_sz = 2 + mag * frames;
+
+    // This chunk is marked as having a superframe index but doesn't have
+    // enough data for it, thus it's an invalid superframe index.
+    if (data_sz < index_sz)
+      return VPX_CODEC_CORRUPT_FRAME;
+
+    {
+      const uint8_t marker2 = read_marker(decrypt_cb, decrypt_state,
+                                          data + data_sz - index_sz);
+
+      // This chunk is marked as having a superframe index but doesn't have
+      // the matching marker byte at the front of the index therefore it's an
+      // invalid chunk.
+      if (marker != marker2)
+        return VPX_CODEC_CORRUPT_FRAME;
+    }
+
+    {
+      // Found a valid superframe index.
+      uint32_t i, j;
+      const uint8_t *x = &data[data_sz - index_sz + 1];
+
+      // Frames has a maximum of 8 and mag has a maximum of 4.
+      uint8_t clear_buffer[32];
+      assert(sizeof(clear_buffer) >= frames * mag);
+      if (decrypt_cb) {
+        decrypt_cb(decrypt_state, x, clear_buffer, frames * mag);
+        x = clear_buffer;
+      }
+
+      for (i = 0; i < frames; ++i) {
+        uint32_t this_sz = 0;
+
+        for (j = 0; j < mag; ++j)
+          this_sz |= (*x++) << (j * 8);
+        sizes[i] = this_sz;
+      }
+      *count = frames;
+    }
+  }
+  return VPX_CODEC_OK;
+}

diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 223b66f..848d212 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h

@@ -78,6 +78,25 @@
 
 void vp9_decoder_remove(struct VP9Decoder *pbi);
 
+static INLINE uint8_t read_marker(vpx_decrypt_cb decrypt_cb,
+                                  void *decrypt_state,
+                                  const uint8_t *data) {
+  if (decrypt_cb) {
+    uint8_t marker;
+    decrypt_cb(decrypt_state, data, &marker, 1);
+    return marker;
+  }
+  return *data;
+}
+
+// This function is exposed for use in tests, as well as the inlined function
+// "read_marker".
+vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data,
+                                           size_t data_sz,
+                                           uint32_t sizes[8], int *count,
+                                           vpx_decrypt_cb decrypt_cb,
+                                           void *decrypt_state);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif

diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 2d5ec79..8c13d0d 100644
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c

@@ -28,7 +28,6 @@
                           int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                           int zbin_oq_value, uint16_t *eob_ptr,
                           const int16_t *scan, const int16_t *iscan) {
-  int i;
   // TODO(jingning) Decide the need of these arguments after the
   // quantization process is completed.
   (void)zbin_ptr;
@@ -39,7 +38,7 @@
   if (!skip_block) {
     // Quantization pass: All coefficients with index >= zero_flag are
     // skippable. Note: zero_flag can be zero.
-
+    int i;
     const int16x8_t v_zero = vdupq_n_s16(0);
     const int16x8_t v_one = vdupq_n_s16(1);
     int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
@@ -50,13 +49,12 @@
     v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
     v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
     v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
-
-    for (i = 0; i < count; i += 8) {
-      const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
-      const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
+    // process dc and the first seven ac coeffs
+    {
+      const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+      const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]);
       const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
-      const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
-      const int16x8_t v_tmp = vqaddq_s16(v_abs_coeff, v_round);
+      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
       const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp),
                                            vget_low_s16(v_quant));
       const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp),
@@ -65,20 +63,39 @@
                                             vshrn_n_s32(v_tmp_hi, 16));
       const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
       const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
-      const int16x8_t v_nz_iscan =
-          vandq_s16(vmvnq_s16(vreinterpretq_s16_u16(v_nz_mask)), v_iscan_plus1);
+      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
       const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
       const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
       const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
-
       v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
-
-      vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
-      vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
+      vst1q_s16(&qcoeff_ptr[0], v_qcoeff);
+      vst1q_s16(&dqcoeff_ptr[0], v_dqcoeff);
       v_round = vmovq_n_s16(round_ptr[1]);
       v_quant = vmovq_n_s16(quant_ptr[1]);
       v_dequant = vmovq_n_s16(dequant_ptr[1]);
     }
+    // now process the rest of the ac coeffs
+    for (i = 8; i < count; i += 8) {
+      const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+      const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
+      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+      const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp),
+                                           vget_low_s16(v_quant));
+      const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp),
+                                           vget_high_s16(v_quant));
+      const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16),
+                                            vshrn_n_s32(v_tmp_hi, 16));
+      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+      vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
+      vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
+    }
     {
       const int16x4_t v_eobmax_3210 =
           vmax_s16(vget_low_s16(v_eobmax_76543210),

diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index acbb88b..d27620c 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c

@@ -541,6 +541,20 @@
   set_tile_limits(cpi);
 }
 
+static void set_rc_buffer_sizes(RATE_CONTROL *rc,
+                                const VP9EncoderConfig *oxcf) {
+  const int64_t bandwidth = oxcf->target_bandwidth;
+  const int64_t starting = oxcf->starting_buffer_level_ms;
+  const int64_t optimal = oxcf->optimal_buffer_level_ms;
+  const int64_t maximum = oxcf->maximum_buffer_size_ms;
+
+  rc->starting_buffer_level = starting * bandwidth / 1000;
+  rc->optimal_buffer_level = (optimal == 0) ? bandwidth / 8
+                                            : optimal * bandwidth / 1000;
+  rc->maximum_buffer_size = (maximum == 0) ? bandwidth / 8
+                                           : maximum * bandwidth / 1000;
+}
+
 void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
@@ -574,28 +588,8 @@
   }
   cpi->encode_breakout = cpi->oxcf.encode_breakout;
 
-  // local file playback mode == really big buffer
-  if (cpi->oxcf.rc_mode == VPX_VBR) {
-    cpi->oxcf.starting_buffer_level_ms = 60000;
-    cpi->oxcf.optimal_buffer_level_ms = 60000;
-    cpi->oxcf.maximum_buffer_size_ms = 240000;
-  }
+  set_rc_buffer_sizes(rc, &cpi->oxcf);
 
-  rc->starting_buffer_level = cpi->oxcf.starting_buffer_level_ms *
-                                  cpi->oxcf.target_bandwidth / 1000;
-
-  // Set or reset optimal and maximum buffer levels.
-  if (cpi->oxcf.optimal_buffer_level_ms == 0)
-    rc->optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
-  else
-    rc->optimal_buffer_level = cpi->oxcf.optimal_buffer_level_ms *
-                                   cpi->oxcf.target_bandwidth / 1000;
-
-  if (cpi->oxcf.maximum_buffer_size_ms == 0)
-    rc->maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
-  else
-    rc->maximum_buffer_size = cpi->oxcf.maximum_buffer_size_ms *
-                                  cpi->oxcf.target_bandwidth / 1000;
   // Under a configuration change, where maximum_buffer_size may change,
   // keep buffer level clipped to the maximum allowed buffer size.
   rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size);
@@ -1990,19 +1984,41 @@
   cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias;
 }
 
+static void set_mv_search_params(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const unsigned int max_mv_def = MIN(cm->width, cm->height);
+
+  // Default based on max resolution.
+  cpi->mv_step_param = vp9_init_search_range(max_mv_def);
+
+  if (cpi->sf.mv.auto_mv_step_size) {
+    if (frame_is_intra_only(cm)) {
+      // Initialize max_mv_magnitude for use in the first INTER frame
+      // after a key/intra-only frame.
+      cpi->max_mv_magnitude = max_mv_def;
+    } else {
+      if (cm->show_frame)
+        // Allow mv_steps to correspond to twice the max mv magnitude found
+        // in the previous frame, capped by the default max_mv_magnitude based
+        // on resolution.
+        cpi->mv_step_param =
+            vp9_init_search_range(MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+      cpi->max_mv_magnitude = 0;
+    }
+  }
+}
+
 static void encode_frame_to_data_rate(VP9_COMP *cpi,
                                       size_t *size,
                                       uint8_t *dest,
                                       unsigned int *frame_flags) {
   VP9_COMMON *const cm = &cpi->common;
+  struct segmentation *const seg = &cm->seg;
   TX_SIZE t;
   int q;
   int top_index;
   int bottom_index;
 
-  const SPEED_FEATURES *const sf = &cpi->sf;
-  const unsigned int max_mv_def = MIN(cm->width, cm->height);
-  struct segmentation *const seg = &cm->seg;
   set_ext_overrides(cpi);
 
   cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
@@ -2028,24 +2044,7 @@
   // Set default state for segment based loop filter update flags.
   cm->lf.mode_ref_delta_update = 0;
 
-  // Initialize cpi->mv_step_param to default based on max resolution.
-  cpi->mv_step_param = vp9_init_search_range(max_mv_def);
-  // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate.
-  if (sf->mv.auto_mv_step_size) {
-    if (frame_is_intra_only(cm)) {
-      // Initialize max_mv_magnitude for use in the first INTER frame
-      // after a key/intra-only frame.
-      cpi->max_mv_magnitude = max_mv_def;
-    } else {
-      if (cm->show_frame)
-        // Allow mv_steps to correspond to twice the max mv magnitude found
-        // in the previous frame, capped by the default max_mv_magnitude based
-        // on resolution.
-        cpi->mv_step_param = vp9_init_search_range(MIN(max_mv_def, 2 *
-                                 cpi->max_mv_magnitude));
-      cpi->max_mv_magnitude = 0;
-    }
-  }
+  set_mv_search_params(cpi);
 
   // Set various flags etc to special state if it is a key frame.
   if (frame_is_intra_only(cm)) {

diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index a309a06..82be0f4 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h

@@ -82,36 +82,19 @@
 } VPX_SCALING;
 
 typedef enum {
-  // Good Quality Fast Encoding. The encoder balances quality with the
-  // amount of time it takes to encode the output. (speed setting
-  // controls how fast)
-  ONE_PASS_GOOD = 1,
+  // Good Quality Fast Encoding. The encoder balances quality with the amount of
+  // time it takes to encode the output. Speed setting controls how fast.
+  GOOD,
 
-  // One Pass - Best Quality. The encoder places priority on the
-  // quality of the output over encoding speed. The output is compressed
-  // at the highest possible quality. This option takes the longest
-  // amount of time to encode. (speed setting ignored)
-  ONE_PASS_BEST = 2,
+  // The encoder places priority on the quality of the output over encoding
+  // speed. The output is compressed at the highest possible quality. This
+  // option takes the longest amount of time to encode. Speed setting ignored.
+  BEST,
 
-  // Two Pass - First Pass. The encoder generates a file of statistics
-  // for use in the second encoding pass. (speed setting controls how fast)
-  TWO_PASS_FIRST = 3,
-
-  // Two Pass - Second Pass. The encoder uses the statistics that were
-  // generated in the first encoding pass to create the compressed
-  // output. (speed setting controls how fast)
-  TWO_PASS_SECOND_GOOD = 4,
-
-  // Two Pass - Second Pass Best.  The encoder uses the statistics that
-  // were generated in the first encoding pass to create the compressed
-  // output using the highest possible quality, and taking a
-  // longer amount of time to encode. (speed setting ignored)
-  TWO_PASS_SECOND_BEST = 5,
-
-  // Realtime/Live Encoding. This mode is optimized for realtime
-  // encoding (for example, capturing a television signal or feed from
-  // a live camera). (speed setting controls how fast)
-  REALTIME = 6,
+  // Realtime/Live Encoding. This mode is optimized for realtime encoding (for
+  // example, capturing a television signal or feed from a live camera). Speed
+  // setting controls how fast.
+  REALTIME
 } MODE;
 
 typedef enum {
@@ -241,7 +224,7 @@
 }
 
 static INLINE int is_best_mode(MODE mode) {
-  return mode == ONE_PASS_BEST || mode == TWO_PASS_SECOND_BEST;
+  return mode == BEST;
 }
 
 typedef struct VP9_COMP {
@@ -491,14 +474,6 @@
       .buf;
 }
 
-// Intra only frames, golden frames (except alt ref overlays) and
-// alt ref frames tend to be coded at a higher than ambient quality
-static INLINE int frame_is_boosted(const VP9_COMP *cpi) {
-  return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame ||
-         (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref) ||
-         vp9_is_upper_layer_key_frame(cpi);
-}
-
 static INLINE int get_token_alloc(int mb_rows, int mb_cols) {
   // TODO(JBB): double check we can't exceed this token count if we have a
   // 32x32 transform crossing a boundary at a multiple of 16.

diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 295e437..94bbe9c 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c

@@ -432,6 +432,8 @@
   TWO_PASS *twopass = &cpi->twopass;
   const MV zero_mv = {0, 0};
   const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+  LAYER_CONTEXT *const lc = is_spatial_svc(cpi) ?
+        &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : 0;
 
 #if CONFIG_FP_MB_STATS
   if (cpi->use_fp_mb_stats) {
@@ -444,15 +446,14 @@
   set_first_pass_params(cpi);
   vp9_set_quantizer(cm, find_fp_qindex());
 
-  if (is_spatial_svc(cpi)) {
+  if (lc != NULL) {
     MV_REFERENCE_FRAME ref_frame = LAST_FRAME;
     const YV12_BUFFER_CONFIG *scaled_ref_buf = NULL;
-    twopass = &cpi->svc.layer_context[cpi->svc.spatial_layer_id].twopass;
+    twopass = &lc->twopass;
 
     if (cpi->common.current_video_frame == 0) {
       cpi->ref_frame_flags = 0;
     } else {
-      LAYER_CONTEXT *lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
       if (lc->current_video_frame_in_layer == 0)
         cpi->ref_frame_flags = VP9_GOLD_FLAG;
       else
@@ -613,7 +614,7 @@
                                                 &unscaled_last_source_buf_2d);
 
         // TODO(pengchong): Replace the hard-coded threshold
-        if (raw_motion_error > 25 || is_spatial_svc(cpi)) {
+        if (raw_motion_error > 25 || lc != NULL) {
           // Test last reference frame using the previous best mv as the
           // starting point (best reference) for the search.
           first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv,
@@ -895,7 +896,7 @@
 
   vp9_extend_frame_borders(new_yv12);
 
-  if (is_spatial_svc(cpi)) {
+  if (lc != NULL) {
     vp9_update_reference_frames(cpi);
   } else {
     // Swap frame pointers so last frame refers to the frame we just compressed.
@@ -1081,8 +1082,7 @@
 
 // This function gives an estimate of how badly we believe the prediction
 // quality is decaying from frame to frame.
-static double get_zero_motion_factor(const VP9_COMMON *cm,
-                                     const FIRSTPASS_STATS *frame) {
+static double get_zero_motion_factor(const FIRSTPASS_STATS *frame) {
   const double sr_ratio = frame->coded_error /
                           DOUBLE_DIVIDE_CHECK(frame->sr_coded_error);
   const double zero_motion_pct = frame->pcnt_inter -
@@ -1095,12 +1095,10 @@
 // Function to test for a condition where a complex transition is followed
 // by a static section. For example in slide shows where there is a fade
 // between slides. This is to help with more optimal kf and gf positioning.
-static int detect_transition_to_still(TWO_PASS *twopass,
+static int detect_transition_to_still(const TWO_PASS *twopass,
                                       int frame_interval, int still_interval,
                                       double loop_decay_rate,
                                       double last_decay_rate) {
-  int trans_to_still = 0;
-
   // Break clause to detect very still sections after motion
   // For example a static image after a fade or other transition
   // instead of a clean scene cut.
@@ -1108,26 +1106,22 @@
       loop_decay_rate >= 0.999 &&
       last_decay_rate < 0.9) {
     int j;
-    const FIRSTPASS_STATS *position = twopass->stats_in;
-    FIRSTPASS_STATS tmp_next_frame;
 
     // Look ahead a few frames to see if static condition persists...
     for (j = 0; j < still_interval; ++j) {
-      if (EOF == input_stats(twopass, &tmp_next_frame))
+      const FIRSTPASS_STATS *stats = &twopass->stats_in[j];
+      if (stats >= twopass->stats_in_end)
         break;
 
-      if (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion < 0.999)
+      if (stats->pcnt_inter - stats->pcnt_motion < 0.999)
         break;
     }
 
-    reset_fpf_position(twopass, position);
-
     // Only if it does do we signal a transition to still.
-    if (j == still_interval)
-      trans_to_still = 1;
+    return j == still_interval;
   }
 
-  return trans_to_still;
+  return 0;
 }
 
 // This function detects a flash through the high relative pcnt_second_ref
@@ -1373,7 +1367,8 @@
                                    double group_error, int gf_arf_bits) {
   RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  TWO_PASS *twopass = &cpi->twopass;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
   FIRSTPASS_STATS frame_stats;
   int i;
   int frame_index = 1;
@@ -1396,17 +1391,17 @@
   // is also the golden frame.
   if (!key_frame) {
     if (rc->source_alt_ref_active) {
-      twopass->gf_group.update_type[0] = OVERLAY_UPDATE;
-      twopass->gf_group.rf_level[0] = INTER_NORMAL;
-      twopass->gf_group.bit_allocation[0] = 0;
-      twopass->gf_group.arf_update_idx[0] = arf_buffer_indices[0];
-      twopass->gf_group.arf_ref_idx[0] = arf_buffer_indices[0];
+      gf_group->update_type[0] = OVERLAY_UPDATE;
+      gf_group->rf_level[0] = INTER_NORMAL;
+      gf_group->bit_allocation[0] = 0;
+      gf_group->arf_update_idx[0] = arf_buffer_indices[0];
+      gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
     } else {
-      twopass->gf_group.update_type[0] = GF_UPDATE;
-      twopass->gf_group.rf_level[0] = GF_ARF_STD;
-      twopass->gf_group.bit_allocation[0] = gf_arf_bits;
-      twopass->gf_group.arf_update_idx[0] = arf_buffer_indices[0];
-      twopass->gf_group.arf_ref_idx[0] = arf_buffer_indices[0];
+      gf_group->update_type[0] = GF_UPDATE;
+      gf_group->rf_level[0] = GF_ARF_STD;
+      gf_group->bit_allocation[0] = gf_arf_bits;
+      gf_group->arf_update_idx[0] = arf_buffer_indices[0];
+      gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
     }
 
     // Step over the golden frame / overlay frame
@@ -1421,25 +1416,25 @@
 
   // Store the bits to spend on the ARF if there is one.
   if (rc->source_alt_ref_pending) {
-    twopass->gf_group.update_type[frame_index] = ARF_UPDATE;
-    twopass->gf_group.rf_level[frame_index] = GF_ARF_STD;
-    twopass->gf_group.bit_allocation[frame_index] = gf_arf_bits;
-    twopass->gf_group.arf_src_offset[frame_index] =
+    gf_group->update_type[frame_index] = ARF_UPDATE;
+    gf_group->rf_level[frame_index] = GF_ARF_STD;
+    gf_group->bit_allocation[frame_index] = gf_arf_bits;
+    gf_group->arf_src_offset[frame_index] =
       (unsigned char)(rc->baseline_gf_interval - 1);
-    twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[0];
-    twopass->gf_group.arf_ref_idx[frame_index] =
+    gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
+    gf_group->arf_ref_idx[frame_index] =
       arf_buffer_indices[cpi->multi_arf_last_grp_enabled &&
                          rc->source_alt_ref_active];
     ++frame_index;
 
     if (cpi->multi_arf_enabled) {
       // Set aside a slot for a level 1 arf.
-      twopass->gf_group.update_type[frame_index] = ARF_UPDATE;
-      twopass->gf_group.rf_level[frame_index] = GF_ARF_LOW;
-      twopass->gf_group.arf_src_offset[frame_index] =
+      gf_group->update_type[frame_index] = ARF_UPDATE;
+      gf_group->rf_level[frame_index] = GF_ARF_LOW;
+      gf_group->arf_src_offset[frame_index] =
         (unsigned char)((rc->baseline_gf_interval >> 1) - 1);
-      twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[1];
-      twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0];
+      gf_group->arf_update_idx[frame_index] = arf_buffer_indices[1];
+      gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
       ++frame_index;
     }
   }
@@ -1469,16 +1464,16 @@
       if (frame_index <= mid_frame_idx)
         arf_idx = 1;
     }
-    twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[arf_idx];
-    twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
+    gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx];
+    gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
 
     target_frame_size = clamp(target_frame_size, 0,
                               MIN(max_bits, (int)total_group_bits));
 
-    twopass->gf_group.update_type[frame_index] = LF_UPDATE;
-    twopass->gf_group.rf_level[frame_index] = INTER_NORMAL;
+    gf_group->update_type[frame_index] = LF_UPDATE;
+    gf_group->rf_level[frame_index] = INTER_NORMAL;
 
-    twopass->gf_group.bit_allocation[frame_index] = target_frame_size;
+    gf_group->bit_allocation[frame_index] = target_frame_size;
     ++frame_index;
   }
 
@@ -1486,23 +1481,23 @@
   // We need to configure the frame at the end of the sequence + 1 that will be
   // the start frame for the next group. Otherwise prior to the call to
   // vp9_rc_get_second_pass_params() the data will be undefined.
-  twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[0];
-  twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0];
+  gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
+  gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
 
   if (rc->source_alt_ref_pending) {
-    twopass->gf_group.update_type[frame_index] = OVERLAY_UPDATE;
-    twopass->gf_group.rf_level[frame_index] = INTER_NORMAL;
+    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+    gf_group->rf_level[frame_index] = INTER_NORMAL;
 
     // Final setup for second arf and its overlay.
     if (cpi->multi_arf_enabled) {
-      twopass->gf_group.bit_allocation[2] =
-        twopass->gf_group.bit_allocation[mid_frame_idx] + mid_boost_bits;
-      twopass->gf_group.update_type[mid_frame_idx] = OVERLAY_UPDATE;
-      twopass->gf_group.bit_allocation[mid_frame_idx] = 0;
+      gf_group->bit_allocation[2] =
+          gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits;
+      gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE;
+      gf_group->bit_allocation[mid_frame_idx] = 0;
     }
   } else {
-    twopass->gf_group.update_type[frame_index] = GF_UPDATE;
-    twopass->gf_group.rf_level[frame_index] = GF_ARF_STD;
+    gf_group->update_type[frame_index] = GF_UPDATE;
+    gf_group->rf_level[frame_index] = GF_ARF_STD;
   }
 
   // Note whether multi-arf was enabled this group for next time.
@@ -1554,8 +1549,6 @@
   vp9_clear_system_state();
   vp9_zero(next_frame);
 
-  gf_group_bits = 0;
-
   // Load stats for the current frame.
   mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame);
 
@@ -1615,9 +1608,8 @@
       decay_accumulator = decay_accumulator * loop_decay_rate;
 
       // Monitor for static sections.
-      zero_motion_accumulator =
-        MIN(zero_motion_accumulator,
-            get_zero_motion_factor(&cpi->common, &next_frame));
+      zero_motion_accumulator = MIN(zero_motion_accumulator,
+                                    get_zero_motion_factor(&next_frame));
 
       // Break clause to detect very still sections after motion. For example,
       // a static image after a fade or other transition.
@@ -1831,6 +1823,7 @@
   int i, j;
   RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   const FIRSTPASS_STATS first_frame = *this_frame;
   const FIRSTPASS_STATS *const start_position = twopass->stats_in;
@@ -1849,7 +1842,7 @@
   cpi->common.frame_type = KEY_FRAME;
 
   // Reset the GF group data structures.
-  vp9_zero(twopass->gf_group);
+  vp9_zero(*gf_group);
 
   // Is this a forced key frame by interval.
   rc->this_key_frame_forced = rc->next_key_frame_forced;
@@ -1987,9 +1980,8 @@
       break;
 
     // Monitor for static sections.
-    zero_motion_accumulator =
-      MIN(zero_motion_accumulator,
-          get_zero_motion_factor(&cpi->common, &next_frame));
+    zero_motion_accumulator =MIN(zero_motion_accumulator,
+                                 get_zero_motion_factor(&next_frame));
 
     // For the first few frames collect data to decide kf boost.
     if (i <= (rc->max_gf_interval * 2)) {
@@ -2040,9 +2032,9 @@
   twopass->kf_group_bits -= kf_bits;
 
   // Save the bits to spend on the key frame.
-  twopass->gf_group.bit_allocation[0] = kf_bits;
-  twopass->gf_group.update_type[0] = KF_UPDATE;
-  twopass->gf_group.rf_level[0] = KF_STD;
+  gf_group->bit_allocation[0] = kf_bits;
+  gf_group->update_type[0] = KF_UPDATE;
+  gf_group->rf_level[0] = KF_STD;
 
   // Note the total error score of the kf group minus the key frame itself.
   twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
@@ -2119,15 +2111,16 @@
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
   int frames_left;
   FIRSTPASS_STATS this_frame;
   FIRSTPASS_STATS this_frame_copy;
 
   int target_rate;
-  LAYER_CONTEXT *lc = NULL;
+  LAYER_CONTEXT *const lc = is_spatial_svc(cpi) ?
+        &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : 0;
 
-  if (is_spatial_svc(cpi)) {
-    lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
+  if (lc != NULL) {
     frames_left = (int)(twopass->total_stats.count -
                   lc->current_video_frame_in_layer);
   } else {
@@ -2140,10 +2133,10 @@
 
   // If this is an arf frame then we dont want to read the stats file or
   // advance the input pointer as we already have what we need.
-  if (twopass->gf_group.update_type[twopass->gf_group.index] == ARF_UPDATE) {
+  if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
     int target_rate;
     configure_buffer_updates(cpi);
-    target_rate = twopass->gf_group.bit_allocation[twopass->gf_group.index];
+    target_rate = gf_group->bit_allocation[gf_group->index];
     target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
     rc->base_frame_target = target_rate;
 
@@ -2154,7 +2147,7 @@
     vp9_rc_set_frame_target(cpi, target_rate);
     cm->frame_type = INTER_FRAME;
 
-    if (is_spatial_svc(cpi)) {
+    if (lc != NULL) {
       if (cpi->svc.spatial_layer_id == 0) {
         lc->is_key_frame = 0;
       } else {
@@ -2170,7 +2163,7 @@
 
   vp9_clear_system_state();
 
-  if (is_spatial_svc(cpi) && twopass->kf_intra_err_min == 0) {
+  if (lc != NULL && twopass->kf_intra_err_min == 0) {
     twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
     twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
   }
@@ -2178,8 +2171,7 @@
   if (cpi->oxcf.rc_mode == VPX_Q) {
     twopass->active_worst_quality = cpi->oxcf.cq_level;
   } else if (cm->current_video_frame == 0 ||
-             (is_spatial_svc(cpi) &&
-              lc->current_video_frame_in_layer == 0)) {
+             (lc != NULL && lc->current_video_frame_in_layer == 0)) {
     // Special case code for first frame.
     const int section_target_bandwidth = (int)(twopass->bits_left /
                                                frames_left);
@@ -2205,7 +2197,7 @@
     cm->frame_type = INTER_FRAME;
   }
 
-  if (is_spatial_svc(cpi)) {
+  if (lc != NULL) {
     if (cpi->svc.spatial_layer_id == 0) {
       lc->is_key_frame = (cm->frame_type == KEY_FRAME);
       if (lc->is_key_frame)
@@ -2236,13 +2228,13 @@
     }
 
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-    if (!is_spatial_svc(cpi))
+    if (lc != NULL)
       cpi->refresh_golden_frame = 1;
   }
 
   configure_buffer_updates(cpi);
 
-  target_rate = twopass->gf_group.bit_allocation[twopass->gf_group.index];
+  target_rate = gf_group->bit_allocation[gf_group->index];
   if (cpi->common.frame_type == KEY_FRAME)
     target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
   else

diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 6115f5a..e5469c8 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c

@@ -17,6 +17,7 @@
 
 #include "vpx_mem/vpx_mem.h"
 
+#include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_reconinter.h"
@@ -343,6 +344,52 @@
   }
 }
 
+struct estimate_block_intra_args {
+  VP9_COMP *cpi;
+  MACROBLOCK *x;
+  PREDICTION_MODE mode;
+  int rate;
+  int64_t dist;
+};
+
+static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+                                 TX_SIZE tx_size, void *arg) {
+  struct estimate_block_intra_args* const args = arg;
+  VP9_COMP *const cpi = args->cpi;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size];
+  uint8_t *const src_buf_base = p->src.buf;
+  uint8_t *const dst_buf_base = pd->dst.buf;
+  const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
+  int i, j;
+  int rate;
+  int64_t dist;
+  unsigned int var_y, sse_y;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  assert(plane == 0);
+  (void) plane;
+
+  p->src.buf = &src_buf_base[4 * (j * src_stride + i)];
+  pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)];
+  // Use source buffer as an approximation for the fully reconstructed buffer.
+  vp9_predict_intra_block(xd, block >> (2 * tx_size),
+                          b_width_log2(plane_bsize),
+                          tx_size, args->mode,
+                          p->src.buf, src_stride,
+                          pd->dst.buf, dst_stride,
+                          i, j, 0);
+  // This procedure assumes zero offset from p->src.buf and pd->dst.buf.
+  model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y);
+  p->src.buf = src_buf_base;
+  pd->dst.buf = dst_buf_base;
+  args->rate += rate;
+  args->dist += dist;
+}
+
 static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = {
   {THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV},
   {THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG},
@@ -360,7 +407,6 @@
                             PICK_MODE_CONTEXT *ctx) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  struct macroblock_plane *const p = &x->plane[0];
   struct macroblockd_plane *const pd = &xd->plane[0];
   PREDICTION_MODE this_mode, best_mode = ZEROMV;
   MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
@@ -397,9 +443,9 @@
       (((mi_row + mi_col) >> bsl) +
        get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
   int const_motion[MAX_REF_FRAMES] = { 0 };
-  int bh = num_4x4_blocks_high_lookup[bsize] << 2;
-  int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
-  int pixels_in_block = bh * bw;
+  const int bh = num_4x4_blocks_high_lookup[bsize] << 2;
+  const int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
+  const int pixels_in_block = bh * bw;
   // For speed 6, the result of interp filter is reused later in actual encoding
   // process.
   // tmp[3] points to dst buffer, and the other 3 point to allocated buffers.
@@ -670,20 +716,11 @@
   // threshold.
   if (!x->skip && best_rd > inter_mode_thresh &&
       bsize <= cpi->sf.max_intra_bsize) {
-    int i, j;
-    const int width  = num_4x4_blocks_wide_lookup[bsize];
-    const int height = num_4x4_blocks_high_lookup[bsize];
+    struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
 
-    int rate2 = 0;
-    int64_t dist2 = 0;
-    const int dst_stride = cpi->sf.reuse_inter_pred_sby ? bw : pd->dst.stride;
-    const int src_stride = p->src.stride;
-    int block_idx = 0;
-
-    TX_SIZE tmp_tx_size = MIN(max_txsize_lookup[bsize],
-                              tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
-    const BLOCK_SIZE bsize_tx = txsize_to_bsize[tmp_tx_size];
-    const int step = 1 << tmp_tx_size;
+    const TX_SIZE intra_tx_size =
+        MIN(max_txsize_lookup[bsize],
+            tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
 
     if (cpi->sf.reuse_inter_pred_sby) {
       pd->dst.buf = tmp[0].data;
@@ -691,44 +728,26 @@
     }
 
     for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
-      uint8_t *const src_buf_base = p->src.buf;
-      uint8_t *const dst_buf_base = pd->dst.buf;
-      for (j = 0; j < height; j += step) {
-        for (i = 0; i < width; i += step) {
-          p->src.buf = &src_buf_base[4 * (j * src_stride + i)];
-          pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)];
-          // Use source buffer as an approximation for the fully reconstructed
-          // buffer
-          vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize),
-                                  tmp_tx_size, this_mode,
-                                  p->src.buf, src_stride,
-                                  pd->dst.buf, dst_stride,
-                                  i, j, 0);
-          model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y);
-          rate2 += rate;
-          dist2 += dist;
-          ++block_idx;
-        }
-      }
-      p->src.buf = src_buf_base;
-      pd->dst.buf = dst_buf_base;
-
-      rate = rate2;
-      dist = dist2;
-
+      const TX_SIZE saved_tx_size = mbmi->tx_size;
+      args.mode = this_mode;
+      args.rate = 0;
+      args.dist = 0;
+      mbmi->tx_size = intra_tx_size;
+      vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
+                                             estimate_block_intra, &args);
+      mbmi->tx_size = saved_tx_size;
+      rate = args.rate;
+      dist = args.dist;
       rate += cpi->mbmode_cost[this_mode];
       rate += intra_cost_penalty;
       this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
 
-      if (cpi->sf.reuse_inter_pred_sby)
-        pd->dst = orig_dst;
-
       if (this_rd + intra_mode_cost < best_rd) {
         best_rd = this_rd;
         *returnrate = rate;
         *returndistortion = dist;
         mbmi->mode = this_mode;
-        mbmi->tx_size = tmp_tx_size;
+        mbmi->tx_size = intra_tx_size;
         mbmi->ref_frame[0] = INTRA_FRAME;
         mbmi->uv_mode = this_mode;
         mbmi->mv[0].as_int = INVALID_MV;
@@ -736,6 +755,8 @@
         x->skip_txfm[0] = skip_txfm;
       }
     }
+    if (cpi->sf.reuse_inter_pred_sby)
+      pd->dst = orig_dst;
   }
 
   return INT64_MAX;

diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 9da2ade..b926a58 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c

@@ -646,7 +646,6 @@
   int q;
 
   if (frame_is_intra_only(cm)) {
-    active_best_quality = rc->best_quality;
 
     // Handle the special case for key frames forced when we have reached
     // the maximum key frame interval. Here force the Q to a range

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 517674e..cfda964 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -490,24 +490,24 @@
                              {INT64_MAX, INT64_MAX},
                              {INT64_MAX, INT64_MAX},
                              {INT64_MAX, INT64_MAX}};
-  TX_SIZE n, m;
+  int n, m;
   int s0, s1;
   const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
   int64_t best_rd = INT64_MAX;
-  TX_SIZE best_tx = TX_4X4;
+  TX_SIZE best_tx = max_tx_size;
 
   const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
   assert(skip_prob > 0);
   s0 = vp9_cost_bit(skip_prob, 0);
   s1 = vp9_cost_bit(skip_prob, 1);
 
-  for (n = TX_4X4; n <= max_tx_size; n++) {
+  for (n = max_tx_size; n >= 0;  n--) {
     txfm_rd_in_plane(x, &r[n][0], &d[n], &s[n],
                      &sse[n], ref_best_rd, 0, bs, n,
                      cpi->sf.use_fast_coef_costing);
     r[n][1] = r[n][0];
     if (r[n][0] < INT_MAX) {
-      for (m = 0; m <= n - (n == max_tx_size); m++) {
+      for (m = 0; m <= n - (n == (int) max_tx_size); m++) {
         if (m == n)
           r[n][1] += vp9_cost_zero(tx_probs[m]);
         else
@@ -523,6 +523,13 @@
       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
     }
 
+    // Early termination in transform size search.
+    if (cpi->sf.tx_size_search_breakout &&
+        (rd[n][1] == INT64_MAX ||
+        (n < (int) max_tx_size && rd[n][1] > rd[n + 1][1]) ||
+        s[n] == 1))
+      break;
+
     if (rd[n][1] < best_rd) {
       best_tx = n;
       best_rd = rd[n][1];
@@ -2632,6 +2639,12 @@
     int64_t total_sse = INT64_MAX;
     int early_term = 0;
 
+    this_mode = vp9_mode_order[mode_index].mode;
+    ref_frame = vp9_mode_order[mode_index].ref_frame[0];
+    if (ref_frame != INTRA_FRAME && !(inter_mode_mask & (1 << this_mode)))
+      continue;
+    second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
+
     // Look at the reference frame of the best mode so far and set the
     // skip mask to look at a subset of the remaining modes.
     if (mode_index == mode_skip_start && best_mode_index >= 0) {
@@ -2653,6 +2666,13 @@
           break;
       }
     }
+
+    if (cpi->sf.alt_ref_search_fp && cpi->rc.is_src_frame_alt_ref) {
+      mode_skip_mask = 0;
+      if (!(ref_frame == ALTREF_FRAME && second_ref_frame == NONE))
+        continue;
+    }
+
     if (mode_skip_mask & (1 << mode_index))
       continue;
 
@@ -2661,12 +2681,6 @@
                             rd_thresh_freq_fact[mode_index]))
       continue;
 
-    this_mode = vp9_mode_order[mode_index].mode;
-    ref_frame = vp9_mode_order[mode_index].ref_frame[0];
-    if (ref_frame != INTRA_FRAME && !(inter_mode_mask & (1 << this_mode)))
-      continue;
-    second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
-
     if (cpi->sf.motion_field_mode_search) {
       const int mi_width  = MIN(num_8x8_blocks_wide_lookup[bsize],
                                 tile->mi_col_end - mi_col);

diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 67b6e26..57835ec 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c

@@ -50,8 +50,20 @@
                               (1 << THR_GOLD)
 };
 
+// Intra only frames, golden frames (except alt ref overlays) and
+// alt ref frames tend to be coded at a higher than ambient quality
+static int frame_is_boosted(const VP9_COMP *cpi) {
+  return frame_is_intra_only(&cpi->common) ||
+         cpi->refresh_alt_ref_frame ||
+         (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref) ||
+         vp9_is_upper_layer_key_frame(cpi);
+}
+
+
 static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
                                    SPEED_FEATURES *sf, int speed) {
+  const int boosted = frame_is_boosted(cpi);
+
   sf->adaptive_rd_thresh = 1;
   sf->recode_loop = (speed < 1) ? ALLOW_RECODE : ALLOW_RECODE_KFMAXBW;
   sf->allow_skip_recode = 1;
@@ -59,8 +71,6 @@
   if (speed >= 1) {
     sf->use_square_partition_only = !frame_is_intra_only(cm);
     sf->less_rectangular_check  = 1;
-    sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD
-                                                      : USE_LARGESTALL;
 
     if (MIN(cm->width, cm->height) >= 720)
       sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
@@ -80,9 +90,14 @@
     sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
     sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
     sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+
+    sf->tx_size_search_breakout = 1;
   }
 
   if (speed >= 2) {
+    sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD
+                                                      : USE_LARGESTALL;
+
     if (MIN(cm->width, cm->height) >= 720) {
       sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
       sf->last_partitioning_redo_frequency = 3;
@@ -117,9 +132,10 @@
       sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
     }
     sf->adaptive_pred_interp_filter = 0;
-    sf->cb_partition_search = frame_is_boosted(cpi) ? 0 : 1;
+    sf->cb_partition_search = !boosted;
     sf->cb_pred_filter_search = 1;
-    sf->motion_field_mode_search = frame_is_boosted(cpi) ? 0 : 1;
+    sf->alt_ref_search_fp = 1;
+    sf->motion_field_mode_search = !boosted;
     sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
     sf->last_partitioning_redo_frequency = 3;
     sf->recode_loop = ALLOW_RECODE_KFMAXBW;
@@ -347,6 +363,7 @@
   sf->cb_pred_filter_search = 0;
   sf->cb_partition_search = 0;
   sf->motion_field_mode_search = 0;
+  sf->alt_ref_search_fp = 0;
   sf->use_quant_fp = 0;
   sf->reference_masking = 0;
   sf->partition_search_type = SEARCH_PARTITION;
@@ -389,6 +406,7 @@
   // Recode loop tolerence %.
   sf->recode_tolerance = 25;
   sf->default_interp_filter = SWITCHABLE;
+  sf->tx_size_search_breakout = 0;
 
   if (oxcf->mode == REALTIME) {
     set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content);

diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 8edcb1d..bad956d 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h

@@ -291,6 +291,8 @@
 
   int motion_field_mode_search;
 
+  int alt_ref_search_fp;
+
   // Fast quantization process path
   int use_quant_fp;
 
@@ -374,6 +376,10 @@
 
   // default interp filter choice
   INTERP_FILTER default_interp_filter;
+
+  // Early termination in transform size search, which only applies while
+  // tx_size_search_method is USE_FULL_RD.
+  int tx_size_search_breakout;
 } SPEED_FEATURES;
 
 struct VP9_COMP;

diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index fb52d1a..1d9bdd8 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c

@@ -106,12 +106,9 @@
     }
     bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
     // Update buffer-related quantities.
-    lrc->starting_buffer_level =
-        (int64_t)(rc->starting_buffer_level * bitrate_alloc);
-    lrc->optimal_buffer_level =
-        (int64_t)(rc->optimal_buffer_level * bitrate_alloc);
-    lrc->maximum_buffer_size =
-        (int64_t)(rc->maximum_buffer_size * bitrate_alloc);
+    lrc->starting_buffer_level = rc->starting_buffer_level * bitrate_alloc;
+    lrc->optimal_buffer_level = rc->optimal_buffer_level * bitrate_alloc;
+    lrc->maximum_buffer_size = rc->maximum_buffer_size * bitrate_alloc;
     lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size);
     lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size);
     // Update framerate-related quantities.

diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 487deef..b6bcdd9 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c

@@ -2891,11 +2891,11 @@
 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2
 #define FDCT32x32_HIGH_PRECISION 0
 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
-#undef  FDCT32x32_2D
 #undef  FDCT32x32_HIGH_PRECISION
+#undef  FDCT32x32_2D
 
 #define FDCT32x32_2D vp9_fdct32x32_sse2
 #define FDCT32x32_HIGH_PRECISION 1
 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
-#undef  FDCT32x32_2D
 #undef  FDCT32x32_HIGH_PRECISION
+#undef  FDCT32x32_2D

diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index cdbb69b..08f6849 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c

@@ -325,6 +325,7 @@
     VP9EncoderConfig *oxcf,
     const vpx_codec_enc_cfg_t *cfg,
     const struct vp9_extracfg *extra_cfg) {
+  const int is_vbr = cfg->rc_end_usage == VPX_VBR;
   oxcf->profile = cfg->g_profile;
   oxcf->width   = cfg->g_w;
   oxcf->height  = cfg->g_h;
@@ -334,17 +335,16 @@
   if (oxcf->init_framerate > 180)
     oxcf->init_framerate = 30;
 
+  oxcf->mode = BEST;
+
   switch (cfg->g_pass) {
     case VPX_RC_ONE_PASS:
-      oxcf->mode = ONE_PASS_GOOD;
       oxcf->pass = 0;
       break;
     case VPX_RC_FIRST_PASS:
-      oxcf->mode = TWO_PASS_FIRST;
       oxcf->pass = 1;
       break;
     case VPX_RC_LAST_PASS:
-      oxcf->mode = TWO_PASS_SECOND_BEST;
       oxcf->pass = 2;
       break;
   }
@@ -371,9 +371,9 @@
   oxcf->scaled_frame_width       = cfg->rc_scaled_width;
   oxcf->scaled_frame_height      = cfg->rc_scaled_height;
 
-  oxcf->maximum_buffer_size_ms   = cfg->rc_buf_sz;
-  oxcf->starting_buffer_level_ms = cfg->rc_buf_initial_sz;
-  oxcf->optimal_buffer_level_ms  = cfg->rc_buf_optimal_sz;
+  oxcf->maximum_buffer_size_ms   = is_vbr ? 240000 : cfg->rc_buf_sz;
+  oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
+  oxcf->optimal_buffer_level_ms  = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
 
   oxcf->drop_frames_water_mark   = cfg->rc_dropframe_thresh;
 
@@ -718,31 +718,36 @@
   return VPX_CODEC_OK;
 }
 
-static void pick_quickcompress_mode(vpx_codec_alg_priv_t  *ctx,
+static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
                                     unsigned long duration,
                                     unsigned long deadline) {
-  // Use best quality mode if no deadline is given.
-  MODE new_qc = ONE_PASS_BEST;
+  MODE new_mode = BEST;
 
-  if (deadline) {
-    // Convert duration parameter from stream timebase to microseconds
-    const uint64_t duration_us = (uint64_t)duration * 1000000 *
-                               (uint64_t)ctx->cfg.g_timebase.num /
-                               (uint64_t)ctx->cfg.g_timebase.den;
+  switch (ctx->cfg.g_pass) {
+    case VPX_RC_ONE_PASS:
+      if (deadline > 0) {
+        const vpx_codec_enc_cfg_t *const cfg = &ctx->cfg;
 
-    // If the deadline is more that the duration this frame is to be shown,
-    // use good quality mode. Otherwise use realtime mode.
-    new_qc = (deadline > duration_us) ? ONE_PASS_GOOD : REALTIME;
+        // Convert duration parameter from stream timebase to microseconds.
+        const uint64_t duration_us = (uint64_t)duration * 1000000 *
+           (uint64_t)cfg->g_timebase.num /(uint64_t)cfg->g_timebase.den;
+
+        // If the deadline is more that the duration this frame is to be shown,
+        // use good quality mode. Otherwise use realtime mode.
+        new_mode = (deadline > duration_us) ? GOOD : REALTIME;
+      } else {
+        new_mode = BEST;
+      }
+      break;
+    case VPX_RC_FIRST_PASS:
+      break;
+    case VPX_RC_LAST_PASS:
+      new_mode = deadline > 0 ? GOOD : BEST;
+      break;
   }
 
-  if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
-    new_qc = TWO_PASS_FIRST;
-  else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS)
-    new_qc = (new_qc == ONE_PASS_BEST) ? TWO_PASS_SECOND_BEST
-                                          : TWO_PASS_SECOND_GOOD;
-
-  if (ctx->oxcf.mode != new_qc) {
-    ctx->oxcf.mode = new_qc;
+  if (ctx->oxcf.mode != new_mode) {
+    ctx->oxcf.mode = new_mode;
     vp9_change_config(ctx->cpi, &ctx->oxcf);
   }
 }

diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 4372ac9..ee82905 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c

@@ -332,81 +332,6 @@
   return VPX_CODEC_OK;
 }
 
-static INLINE uint8_t read_marker(vpx_decrypt_cb decrypt_cb,
-                                  void *decrypt_state,
-                                  const uint8_t *data) {
-  if (decrypt_cb) {
-    uint8_t marker;
-    decrypt_cb(decrypt_state, data, &marker, 1);
-    return marker;
-  }
-  return *data;
-}
-
-static vpx_codec_err_t parse_superframe_index(const uint8_t *data,
-                                              size_t data_sz,
-                                              uint32_t sizes[8], int *count,
-                                              vpx_decrypt_cb decrypt_cb,
-                                              void *decrypt_state) {
-  // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
-  // it is a super frame index. If the last byte of real video compression
-  // data is 0xc0 the encoder must add a 0 byte. If we have the marker but
-  // not the associated matching marker byte at the front of the index we have
-  // an invalid bitstream and need to return an error.
-
-  uint8_t marker;
-
-  assert(data_sz);
-  marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1);
-  *count = 0;
-
-  if ((marker & 0xe0) == 0xc0) {
-    const uint32_t frames = (marker & 0x7) + 1;
-    const uint32_t mag = ((marker >> 3) & 0x3) + 1;
-    const size_t index_sz = 2 + mag * frames;
-
-    // This chunk is marked as having a superframe index but doesn't have
-    // enough data for it, thus it's an invalid superframe index.
-    if (data_sz < index_sz)
-      return VPX_CODEC_CORRUPT_FRAME;
-
-    {
-      const uint8_t marker2 = read_marker(decrypt_cb, decrypt_state,
-                                          data + data_sz - index_sz);
-
-      // This chunk is marked as having a superframe index but doesn't have
-      // the matching marker byte at the front of the index therefore it's an
-      // invalid chunk.
-      if (marker != marker2)
-        return VPX_CODEC_CORRUPT_FRAME;
-    }
-
-    {
-      // Found a valid superframe index.
-      uint32_t i, j;
-      const uint8_t *x = &data[data_sz - index_sz + 1];
-
-      // Frames has a maximum of 8 and mag has a maximum of 4.
-      uint8_t clear_buffer[32];
-      assert(sizeof(clear_buffer) >= frames * mag);
-      if (decrypt_cb) {
-        decrypt_cb(decrypt_state, x, clear_buffer, frames * mag);
-        x = clear_buffer;
-      }
-
-      for (i = 0; i < frames; ++i) {
-        uint32_t this_sz = 0;
-
-        for (j = 0; j < mag; ++j)
-          this_sz |= (*x++) << (j * 8);
-        sizes[i] = this_sz;
-      }
-      *count = frames;
-    }
-  }
-  return VPX_CODEC_OK;
-}
-
 static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
                                       const uint8_t *data, unsigned int data_sz,
                                       void *user_priv, long deadline) {
@@ -424,8 +349,8 @@
   // Reset flushed when receiving a valid frame.
   ctx->flushed = 0;
 
-  res = parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
-                               ctx->decrypt_cb, ctx->decrypt_state);
+  res = vp9_parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
+                                   ctx->decrypt_cb, ctx->decrypt_state);
   if (res != VPX_CODEC_OK)
     return res;
 

diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h
index a7716d1..0f5ce3d 100644
--- a/vpx/internal/vpx_codec_internal.h
+++ b/vpx/internal/vpx_codec_internal.h

@@ -347,7 +347,6 @@
     vpx_codec_priv_cb_pair_t    put_slice_cb;
   } dec;
   struct {
-    int                         tbd;
     struct vpx_fixed_buf        cx_data_dst_buf;
     unsigned int                cx_data_pad_before;
     unsigned int                cx_data_pad_after;

diff --git a/vpx/src/vpx_image.c b/vpx/src/vpx_image.c
index 8c7e3cf..e20703a 100644
--- a/vpx/src/vpx_image.c
+++ b/vpx/src/vpx_image.c

@@ -8,38 +8,12 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #include <stdlib.h>
 #include <string.h>
+
 #include "vpx/vpx_image.h"
 #include "vpx/vpx_integer.h"
-
-#define ADDRESS_STORAGE_SIZE      sizeof(size_t)
-/*returns an addr aligned to the byte boundary specified by align*/
-#define align_addr(addr,align) (void*)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align))
-
-/* Memalign code is copied from vpx_mem.c */
-static void *img_buf_memalign(size_t align, size_t size) {
-  void *addr,
-       * x = NULL;
-
-  addr = malloc(size + align - 1 + ADDRESS_STORAGE_SIZE);
-
-  if (addr) {
-    x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, (int)align);
-    /* save the actual malloc address */
-    ((size_t *)x)[-1] = (size_t)addr;
-  }
-
-  return x;
-}
-
-static void img_buf_free(void *memblk) {
-  if (memblk) {
-    void *addr = (void *)(((size_t *)memblk)[-1]);
-    free(addr);
-  }
-}
+#include "vpx_mem/vpx_mem.h"
 
 static vpx_image_t *img_alloc_helper(vpx_image_t   *img,
                                      vpx_img_fmt_t  fmt,
@@ -172,7 +146,7 @@
     if (alloc_size != (size_t)alloc_size)
       goto fail;
 
-    img->img_data = img_buf_memalign(buf_align, (size_t)alloc_size);
+    img->img_data = (uint8_t *)vpx_memalign(buf_align, (size_t)alloc_size);
     img->img_data_owner = 1;
   }
 
@@ -296,7 +270,7 @@
 void vpx_img_free(vpx_image_t *img) {
   if (img) {
     if (img->img_data && img->img_data_owner)
-      img_buf_free(img->img_data);
+      vpx_free(img->img_data);
 
     if (img->self_allocd)
       free(img);

diff --git a/vpxdec.c b/vpxdec.c
index faee42a..6c822ab 100644
--- a/vpxdec.c
+++ b/vpxdec.c

@@ -15,13 +15,16 @@
 #include <string.h>
 #include <limits.h>
 
+#include "./vpx_config.h"
+
+#if CONFIG_LIBYUV
 #include "third_party/libyuv/include/libyuv/scale.h"
+#endif
 
 #include "./args.h"
 #include "./ivfdec.h"
 
 #define VPX_CODEC_DISABLE_COMPAT 1
-#include "./vpx_config.h"
 #include "vpx/vpx_decoder.h"
 #include "vpx_ports/mem_ops.h"
 #include "vpx_ports/vpx_timer.h"
@@ -123,6 +126,7 @@
 };
 #endif
 
+#if CONFIG_LIBYUV
 static INLINE int vpx_image_scale(vpx_image_t *src, vpx_image_t *dst,
                                   FilterModeEnum mode) {
   assert(src->fmt == VPX_IMG_FMT_I420);
@@ -137,6 +141,7 @@
                    dst->d_w, dst->d_h,
                    mode);
 }
+#endif
 
 void usage_exit() {
   int i;
@@ -538,7 +543,8 @@
   struct VpxDecInputContext input = {NULL, NULL};
   struct VpxInputContext vpx_input_ctx;
 #if CONFIG_WEBM_IO
-  struct WebmInputContext webm_ctx = {0};
+  struct WebmInputContext webm_ctx;
+  memset(&(webm_ctx), 0, sizeof(webm_ctx));
   input.webm_ctx = &webm_ctx;
 #endif
   input.vpx_input_ctx = &vpx_input_ctx;

diff --git a/vpxenc.c b/vpxenc.c
index 7e037a6..b99e61a 100644
--- a/vpxenc.c
+++ b/vpxenc.c

@@ -19,12 +19,15 @@
 #include <stdlib.h>
 #include <string.h>
 
+#if CONFIG_LIBYUV
+#include "third_party/libyuv/include/libyuv/scale.h"
+#endif
+
 #include "vpx/vpx_encoder.h"
 #if CONFIG_DECODERS
 #include "vpx/vpx_decoder.h"
 #endif
 
-#include "third_party/libyuv/include/libyuv/scale.h"
 #include "./args.h"
 #include "./ivfenc.h"
 #include "./tools_common.h"