Merge "[spatial svc]Add layer bitrates options and clean up parsing options from string"
diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index cf86cf5..50e7c23 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -113,9 +113,12 @@
 const DecodeParam kVP9InvalidFileTests[] = {
   {1, "invalid-vp90-02-v2.webm"},
   {1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf"},
-  {1, "invalid-vp90-03-v2.webm"},
+  {1, "invalid-vp90-03-v3.webm"},
   {1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf"},
   {1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf"},
+  {1, "invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf"},
+  {1, "invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf"},
+  {1, "invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf"},
 };
 
 VP9_INSTANTIATE_TEST_CASE(InvalidFileTest,
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 6166749..84b13f9 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -10,8 +10,8 @@
 25751f5d3b05ff03f0719ad42cd625348eb8961e  invalid-vp90-01-v2.webm.res
 d78e2fceba5ac942246503ec8366f879c4775ca5  invalid-vp90-02-v2.webm
 8e2eff4af87d2b561cce2365713269e301457ef3  invalid-vp90-02-v2.webm.res
-df1a1453feb3c00d7d89746c7003b4163523bff3  invalid-vp90-03-v2.webm
-25dd58c22d23f75304d7ce7f69f4e5b02ef9119a  invalid-vp90-03-v2.webm.res
+df1a1453feb3c00d7d89746c7003b4163523bff3  invalid-vp90-03-v3.webm
+4935c62becc68c13642a03db1e6d3e2331c1c612  invalid-vp90-03-v3.webm.res
 d637297561dd904eb2c97a9015deeb31c4a1e8d2  invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm
 3a204bdbeaa3c6458b77bcebb8366d107267f55d  invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res
 a432f96ff0a787268e2f94a8092ab161a18d1b06  park_joy_90p_10_420.y4m
@@ -691,3 +691,9 @@
 368dccdde5288c13c25695d2eacdc7402cadf613  vp90-2-19-skip.webm.md5
 ffe460282df2b0e7d4603c2158653ad96f574b02  vp90-2-19-skip-01.webm
 bd21bc9eda4a4a36b221d71ede3a139fc3c7bd85  vp90-2-19-skip-01.webm.md5
+b03c408cf23158638da18dbc3323b99a1635c68a  invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf
+0a3884edb3fd8f9d9b500223e650f7de257b67d8  invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res
+5e67e24e7f53fd189e565513cef8519b1bd6c712  invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf
+741158f67c0d9d23726624d06bdc482ad368afc9  invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf.res
+8b1f7bf7e86c0976d277f60e8fcd9539e75a079a  invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf
+fb79dcbbbb8c82d5a750e339acce66e39a32f15f  invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf.res
diff --git a/test/test.mk b/test/test.mk
index 7c34169..c839c92 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -797,14 +797,16 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v2.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm
@@ -813,6 +815,10 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res
 
 ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
 # BBB VP9 streams
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 925ff03..f76402e 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -35,6 +35,14 @@
 using ::std::tr1::tuple;
 using libvpx_test::ACMRandom;
 
+static unsigned int mb_ss_ref(const int16_t *src) {
+  unsigned int res = 0;
+  for (int i = 0; i < 256; ++i) {
+    res += src[i] * src[i];
+  }
+  return res;
+}
+
 static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src,
                                  int l2w, int l2h, unsigned int *sse_ptr) {
   int se = 0;
@@ -76,6 +84,50 @@
   return sse - (((int64_t) se * se) >> (l2w + l2h));
 }
 
+typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
+
+class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
+ public:
+  SumOfSquaresTest() : func_(GetParam()) {}
+
+  virtual ~SumOfSquaresTest() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void ConstTest();
+  void RefTest();
+
+  SumOfSquaresFunction func_;
+  ACMRandom rnd_;
+};
+
+void SumOfSquaresTest::ConstTest() {
+  int16_t mem[256];
+  unsigned int res;
+  for (int v = 0; v < 256; ++v) {
+    for (int i = 0; i < 256; ++i) {
+      mem[i] = v;
+    }
+    ASM_REGISTER_STATE_CHECK(res = func_(mem));
+    EXPECT_EQ(256u * (v * v), res);
+  }
+}
+
+void SumOfSquaresTest::RefTest() {
+  int16_t mem[256];
+  for (int i = 0; i < 100; ++i) {
+    for (int j = 0; j < 256; ++j) {
+      mem[j] = rnd_.Rand8() - rnd_.Rand8();
+    }
+
+    const unsigned int expected = mb_ss_ref(mem);
+    unsigned int res;
+    ASM_REGISTER_STATE_CHECK(res = func_(mem));
+    EXPECT_EQ(expected, res);
+  }
+}
+
 template<typename VarianceFunctionType>
 class VarianceTest
     : public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
@@ -88,7 +140,7 @@
     height_ = 1 << log2height_;
     variance_ = get<2>(params);
 
-    rnd(ACMRandom::DeterministicSeed());
+    rnd_.Reset(ACMRandom::DeterministicSeed());
     block_size_ = width_ * height_;
     src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
     ref_ = new uint8_t[block_size_];
@@ -107,7 +159,7 @@
   void RefTest();
   void OneQuarterTest();
 
-  ACMRandom rnd;
+  ACMRandom rnd_;
   uint8_t* src_;
   uint8_t* ref_;
   int width_, log2width_;
@@ -135,8 +187,8 @@
 void VarianceTest<VarianceFunctionType>::RefTest() {
   for (int i = 0; i < 10; ++i) {
     for (int j = 0; j < block_size_; j++) {
-      src_[j] = rnd.Rand8();
-      ref_[j] = rnd.Rand8();
+      src_[j] = rnd_.Rand8();
+      ref_[j] = rnd_.Rand8();
     }
     unsigned int sse1, sse2;
     unsigned int var1;
@@ -206,7 +258,7 @@
     height_ = 1 << log2height_;
     subpel_variance_ = get<2>(params);
 
-    rnd(ACMRandom::DeterministicSeed());
+    rnd_.Reset(ACMRandom::DeterministicSeed());
     block_size_ = width_ * height_;
     src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
     sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
@@ -226,7 +278,7 @@
  protected:
   void RefTest();
 
-  ACMRandom rnd;
+  ACMRandom rnd_;
   uint8_t *src_;
   uint8_t *ref_;
   uint8_t *sec_;
@@ -241,10 +293,10 @@
   for (int x = 0; x < 16; ++x) {
     for (int y = 0; y < 16; ++y) {
       for (int j = 0; j < block_size_; j++) {
-        src_[j] = rnd.Rand8();
+        src_[j] = rnd_.Rand8();
       }
       for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
-        ref_[j] = rnd.Rand8();
+        ref_[j] = rnd_.Rand8();
       }
       unsigned int sse1, sse2;
       unsigned int var1;
@@ -263,11 +315,11 @@
   for (int x = 0; x < 16; ++x) {
     for (int y = 0; y < 16; ++y) {
       for (int j = 0; j < block_size_; j++) {
-        src_[j] = rnd.Rand8();
-        sec_[j] = rnd.Rand8();
+        src_[j] = rnd_.Rand8();
+        sec_[j] = rnd_.Rand8();
       }
       for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
-        ref_[j] = rnd.Rand8();
+        ref_[j] = rnd_.Rand8();
       }
       unsigned int sse1, sse2;
       unsigned int var1;
@@ -362,6 +414,13 @@
 namespace vp9 {
 
 #if CONFIG_VP9_ENCODER
+
+TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
+TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
+
+INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
+                        ::testing::Values(vp9_get_mb_ss_c));
+
 typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
 typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
 typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
@@ -487,6 +546,10 @@
 
 #if HAVE_SSE2
 #if CONFIG_USE_X86INC
+
+INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
+                        ::testing::Values(vp9_get_mb_ss_sse2));
+
 const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;
 const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2;
 const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2;
diff --git a/third_party/libyuv/README.libvpx b/third_party/libyuv/README.libvpx
index fb2b9d1..3869d25 100644
--- a/third_party/libyuv/README.libvpx
+++ b/third_party/libyuv/README.libvpx
@@ -13,4 +13,4 @@
 in order to encode multiple resolution bit streams.
 
 Local Modifications:
-None.
+cherry-pick 'Issue 24479004: Fix building with MSVC for arm'
diff --git a/third_party/libyuv/source/cpu_id.cc b/third_party/libyuv/source/cpu_id.cc
index deb4c44..8f8a403 100644
--- a/third_party/libyuv/source/cpu_id.cc
+++ b/third_party/libyuv/source/cpu_id.cc
@@ -15,7 +15,8 @@
 #endif
 #if !defined(__pnacl__) && !defined(__CLR_VER) && \
     !defined(__native_client__)  && \
-    defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+    defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) && \
+    (defined(_M_IX86) || defined(_M_X64))
 #include <immintrin.h>  // For _xgetbv()
 #endif
 
diff --git a/third_party/libyuv/source/row_win.cc b/third_party/libyuv/source/row_win.cc
index f58fc51..d79c353 100644
--- a/third_party/libyuv/source/row_win.cc
+++ b/third_party/libyuv/source/row_win.cc
@@ -21,7 +21,8 @@
 #endif
 
 // This module is for Visual C.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+    (defined(_M_IX86) || defined(_M_X64))
 
 #define YG 74  /* (int8)(1.164 * 64 + 0.5) */
 
diff --git a/vp8/common/arm/neon/bilinearpredict_neon.c b/vp8/common/arm/neon/bilinearpredict_neon.c
index d77f2ba..9824a31 100644
--- a/vp8/common/arm/neon/bilinearpredict_neon.c
+++ b/vp8/common/arm/neon/bilinearpredict_neon.c
@@ -10,7 +10,7 @@
 
 #include <arm_neon.h>
 
-static const uint16_t bifilter4_coeff[8][2] = {
+static const uint8_t bifilter4_coeff[8][2] = {
     {128,   0},
     {112,  16},
     { 96,  32},
@@ -64,8 +64,8 @@
         q1u8 = vcombine_u8(d2u8, d3u8);
         q2u8 = vcombine_u8(d4u8, d5u8);
 
-        d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
-        d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
+        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
 
         q4u64  = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8);
         q5u64  = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8);
@@ -155,8 +155,8 @@
         q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
         q5u8 = vld1q_u8(src_ptr);
 
-        d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
-        d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
+        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
 
         q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
         q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
@@ -245,8 +245,8 @@
         q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
         q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
 
-        d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
-        d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
+        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
 
         q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
         q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
diff --git a/vp8/common/arm/variance_arm.c b/vp8/common/arm/variance_arm.c
index e3f7083..467a509 100644
--- a/vp8/common/arm/variance_arm.c
+++ b/vp8/common/arm/variance_arm.c
@@ -95,7 +95,7 @@
 #endif /* HAVE_MEDIA */
 
 
-#if HAVE_NEON_ASM
+#if HAVE_NEON
 
 extern unsigned int vp8_sub_pixel_variance16x16_neon_func
 (
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 651809d..a90c876 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -530,13 +530,6 @@
 }
 
 #
-# Pick Loopfilter
-#
-add_proto qw/void vp8_yv12_copy_partial_frame/, "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
-specialize qw/vp8_yv12_copy_partial_frame neon_asm/;
-$vp8_yv12_copy_partial_frame_neon_asm=vp8_yv12_copy_partial_frame_neon;
-
-#
 # Denoiser filter
 #
 if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") {
@@ -544,7 +537,6 @@
     specialize qw/vp8_denoiser_filter sse2 neon/;
     add_proto qw/int vp8_denoiser_filter_uv/, "unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
     specialize qw/vp8_denoiser_filter_uv sse2 neon/;
-
 }
 
 # End of encoder only functions
diff --git a/vp8/encoder/arm/neon/picklpf_arm.c b/vp8/encoder/arm/neon/picklpf_arm.c
deleted file mode 100644
index ec8071e..0000000
--- a/vp8/encoder/arm/neon/picklpf_arm.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp8/common/loopfilter.h"
-#include "vpx_scale/yv12config.h"
-
-extern void vp8_memcpy_partial_neon(unsigned char *dst_ptr,
-                                    unsigned char *src_ptr,
-                                    int sz);
-
-
-void vp8_yv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc,
-                                      YV12_BUFFER_CONFIG *dst_ybc)
-{
-    unsigned char *src_y, *dst_y;
-    int yheight;
-    int ystride;
-    int yoffset;
-    int linestocopy;
-
-    yheight  = src_ybc->y_height;
-    ystride  = src_ybc->y_stride;
-
-    /* number of MB rows to use in partial filtering */
-    linestocopy = (yheight >> 4) / PARTIAL_FRAME_FRACTION;
-    linestocopy = linestocopy ? linestocopy << 4 : 16;     /* 16 lines per MB */
-
-    /* Copy extra 4 so that full filter context is available if filtering done
-     * on the copied partial frame and not original. Partial filter does mb
-     * filtering for top row also, which can modify3 pixels above.
-     */
-    linestocopy += 4;
-    /* partial image starts at ~middle of frame (macroblock border) */
-    yoffset  = ystride * (((yheight >> 5) * 16) - 4);
-    src_y = src_ybc->y_buffer + yoffset;
-    dst_y = dst_ybc->y_buffer + yoffset;
-
-    vp8_memcpy_partial_neon(dst_y, src_y, ystride * linestocopy);
-}
diff --git a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
deleted file mode 100644
index d219e2d..0000000
--- a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
+++ /dev/null
@@ -1,72 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT |vp8_memcpy_partial_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;=========================================
-;this is not a full memcpy function!!!
-;void vp8_memcpy_partial_neon(unsigned char *dst_ptr, unsigned char *src_ptr,
-;                             int sz);
-|vp8_memcpy_partial_neon| PROC
-    vpush               {d8-d15}
-    ;pld                [r1]                        ;preload pred data
-    ;pld                [r1, #128]
-    ;pld                [r1, #256]
-    ;pld                [r1, #384]
-
-    mov             r12, r2, lsr #8                 ;copy 256 bytes data at one time
-
-memcpy_neon_loop
-    vld1.8          {q0, q1}, [r1]!                 ;load src data
-    subs            r12, r12, #1
-    vld1.8          {q2, q3}, [r1]!
-    vst1.8          {q0, q1}, [r0]!                 ;copy to dst_ptr
-    vld1.8          {q4, q5}, [r1]!
-    vst1.8          {q2, q3}, [r0]!
-    vld1.8          {q6, q7}, [r1]!
-    vst1.8          {q4, q5}, [r0]!
-    vld1.8          {q8, q9}, [r1]!
-    vst1.8          {q6, q7}, [r0]!
-    vld1.8          {q10, q11}, [r1]!
-    vst1.8          {q8, q9}, [r0]!
-    vld1.8          {q12, q13}, [r1]!
-    vst1.8          {q10, q11}, [r0]!
-    vld1.8          {q14, q15}, [r1]!
-    vst1.8          {q12, q13}, [r0]!
-    vst1.8          {q14, q15}, [r0]!
-
-    ;pld                [r1]                        ;preload pred data -- need to adjust for real device
-    ;pld                [r1, #128]
-    ;pld                [r1, #256]
-    ;pld                [r1, #384]
-
-    bne             memcpy_neon_loop
-
-    ands            r3, r2, #0xff                   ;extra copy
-    beq             done_copy_neon_loop
-
-extra_copy_neon_loop
-    vld1.8          {q0}, [r1]!                 ;load src data
-    subs            r3, r3, #16
-    vst1.8          {q0}, [r0]!
-    bne             extra_copy_neon_loop
-
-done_copy_neon_loop
-    vpop            {d8-d15}
-    bx              lr
-    ENDP
-
-    END
diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c
index 250d04c..f0c8f28 100644
--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -23,8 +23,8 @@
 
 extern int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
 
-void vp8_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc,
-                                   YV12_BUFFER_CONFIG *dst_ybc)
+static void yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
+                                    YV12_BUFFER_CONFIG *dst_ybc)
 {
     unsigned char *src_y, *dst_y;
     int yheight;
@@ -173,7 +173,7 @@
     /* Get the err using the previous frame's filter value. */
 
     /* Copy the unfiltered / processed recon buffer to the new buffer */
-    vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+    yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
     vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
 
     best_err = calc_partial_ssl_err(sd, cm->frame_to_show);
@@ -184,7 +184,7 @@
     while (filt_val >= min_filter_level)
     {
         /* Apply the loop filter */
-        vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+        yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
         vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
 
         /* Get the err for filtered frame */
@@ -214,7 +214,7 @@
         while (filt_val < max_filter_level)
         {
             /* Apply the loop filter */
-            vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+            yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
 
             vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
 
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index fc8c407..b1b079c 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -635,7 +635,6 @@
         }
 
         ctx->priv = (vpx_codec_priv_t *)priv;
-        ctx->priv->sz = sizeof(*priv);
         ctx->priv->init_flags = ctx->init_flags;
 
         if (ctx->config.enc)
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 6d9ecc0..3ab8ed0 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -84,7 +84,6 @@
         (vpx_codec_alg_priv_t *)vpx_calloc(1, sizeof(*priv));
 
     ctx->priv = (vpx_codec_priv_t *)priv;
-    ctx->priv->sz = sizeof(*priv);
     ctx->priv->init_flags = ctx->init_flags;
 
     priv->si.sz = sizeof(priv->si);
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index eb94202..551271e 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -36,9 +36,7 @@
 #File list for neon
 # encoder
 VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/fastquantizeb_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/picklpf_arm.c
 VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM)  += encoder/arm/neon/vp8_memcpy_neon$(ASM)
 
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/denoising_neon.c
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 403e105..471929a 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -9,11 +9,9 @@
  */
 
 #include "./vpx_config.h"
+#include "./vp9_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/vpx_once.h"
-
-#include "./vp9_rtcd.h"
 
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_onyxc_int.h"
@@ -292,32 +290,32 @@
 typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left);
 
-static intra_pred_fn pred[INTRA_MODES][4];
-static intra_pred_fn dc_pred[2][2][4];
+static intra_pred_fn pred[INTRA_MODES][TX_SIZES];
+static intra_pred_fn dc_pred[2][2][TX_SIZES];
 
-static void init_intra_pred_fn_ptrs(void) {
-#define intra_pred_allsizes(l, type) \
-  l[0] = vp9_##type##_predictor_4x4; \
-  l[1] = vp9_##type##_predictor_8x8; \
-  l[2] = vp9_##type##_predictor_16x16; \
-  l[3] = vp9_##type##_predictor_32x32
+void vp9_init_intra_predictors() {
+#define INIT_ALL_SIZES(p, type) \
+  p[TX_4X4] = vp9_##type##_predictor_4x4; \
+  p[TX_8X8] = vp9_##type##_predictor_8x8; \
+  p[TX_16X16] = vp9_##type##_predictor_16x16; \
+  p[TX_32X32] = vp9_##type##_predictor_32x32
 
-  intra_pred_allsizes(pred[V_PRED], v);
-  intra_pred_allsizes(pred[H_PRED], h);
-  intra_pred_allsizes(pred[D207_PRED], d207);
-  intra_pred_allsizes(pred[D45_PRED], d45);
-  intra_pred_allsizes(pred[D63_PRED], d63);
-  intra_pred_allsizes(pred[D117_PRED], d117);
-  intra_pred_allsizes(pred[D135_PRED], d135);
-  intra_pred_allsizes(pred[D153_PRED], d153);
-  intra_pred_allsizes(pred[TM_PRED], tm);
+  INIT_ALL_SIZES(pred[V_PRED], v);
+  INIT_ALL_SIZES(pred[H_PRED], h);
+  INIT_ALL_SIZES(pred[D207_PRED], d207);
+  INIT_ALL_SIZES(pred[D45_PRED], d45);
+  INIT_ALL_SIZES(pred[D63_PRED], d63);
+  INIT_ALL_SIZES(pred[D117_PRED], d117);
+  INIT_ALL_SIZES(pred[D135_PRED], d135);
+  INIT_ALL_SIZES(pred[D153_PRED], d153);
+  INIT_ALL_SIZES(pred[TM_PRED], tm);
 
-  intra_pred_allsizes(dc_pred[0][0], dc_128);
-  intra_pred_allsizes(dc_pred[0][1], dc_top);
-  intra_pred_allsizes(dc_pred[1][0], dc_left);
-  intra_pred_allsizes(dc_pred[1][1], dc);
+  INIT_ALL_SIZES(dc_pred[0][0], dc_128);
+  INIT_ALL_SIZES(dc_pred[0][1], dc_top);
+  INIT_ALL_SIZES(dc_pred[1][0], dc_left);
+  INIT_ALL_SIZES(dc_pred[1][1], dc);
 
-#undef intra_pred_allsizes
+#undef INIT_ALL_SIZES
 }
 
 static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
@@ -343,8 +341,6 @@
   // 129  G   H  ..  S   T   T   T   T   T
   // ..
 
-  once(init_intra_pred_fn_ptrs);
-
   // Get current frame pointer, width and height.
   if (plane == 0) {
     frame_width = xd->cur_buf->y_width;
diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h
index d09d2a1..845f3bc 100644
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -18,6 +18,8 @@
 extern "C" {
 #endif
 
+void vp9_init_intra_predictors();
+
 void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
                              TX_SIZE tx_size, PREDICTION_MODE mode,
                              const uint8_t *ref, int ref_stride,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index f2a3eef..667e057 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -268,7 +268,7 @@
 #
 if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
 add_proto qw/void vp9_mbpost_proc_down/, "uint8_t *dst, int pitch, int rows, int cols, int flimit";
-specialize qw/vp9_mbpost_proc_down mmx sse2/;
+specialize qw/vp9_mbpost_proc_down sse2/;
 $vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm;
 
 add_proto qw/void vp9_mbpost_proc_across_ip/, "uint8_t *src, int pitch, int rows, int cols, int flimit";
@@ -276,23 +276,14 @@
 $vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm;
 
 add_proto qw/void vp9_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
-specialize qw/vp9_post_proc_down_and_across mmx sse2/;
+specialize qw/vp9_post_proc_down_and_across sse2/;
 $vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;
 
 add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-specialize qw/vp9_plane_add_noise mmx sse2/;
+specialize qw/vp9_plane_add_noise sse2/;
 $vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
 }
 
-add_proto qw/void vp9_blend_mb_inner/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride";
-specialize qw/vp9_blend_mb_inner/;
-
-add_proto qw/void vp9_blend_mb_outer/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride";
-specialize qw/vp9_blend_mb_outer/;
-
-add_proto qw/void vp9_blend_b/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride";
-specialize qw/vp9_blend_b/;
-
 #
 # Sub Pixel Filters
 #
@@ -693,19 +684,19 @@
 specialize qw/vp9_sad4x4x4d sse/;
 
 add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x16 sse2 avx2/;
+specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x16 sse2/;
+specialize qw/vp9_mse8x16/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x8 sse2/;
+specialize qw/vp9_mse16x8/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x8 sse2/;
+specialize qw/vp9_mse8x8/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
-specialize qw/vp9_get_mb_ss sse2/;
+specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
 # ENCODEMB INVOKE
 
 add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz";
diff --git a/vp9/common/x86/vp9_postproc_mmx.asm b/vp9/common/x86/vp9_postproc_mmx.asm
deleted file mode 100644
index 5b8deef..0000000
--- a/vp9/common/x86/vp9_postproc_mmx.asm
+++ /dev/null
@@ -1,533 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT  7
-
-;void vp9_post_proc_down_and_across_mmx
-;(
-;    unsigned char *src_ptr,
-;    unsigned char *dst_ptr,
-;    int src_pixels_per_line,
-;    int dst_pixels_per_line,
-;    int rows,
-;    int cols,
-;    int flimit
-;)
-global sym(vp9_post_proc_down_and_across_mmx) PRIVATE
-sym(vp9_post_proc_down_and_across_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-    ; move the global rd onto the stack, since we don't have enough registers
-    ; to do PIC addressing
-    movq        mm0, [GLOBAL(rd)]
-    sub         rsp, 8
-    movq        [rsp], mm0
-%define RD [rsp]
-%else
-%define RD [GLOBAL(rd)]
-%endif
-
-        push        rbx
-        lea         rbx, [GLOBAL(Blur)]
-        movd        mm2, dword ptr arg(6) ;flimit
-        punpcklwd   mm2, mm2
-        punpckldq   mm2, mm2
-
-        mov         rsi,        arg(0) ;src_ptr
-        mov         rdi,        arg(1) ;dst_ptr
-
-        movsxd      rcx, DWORD PTR arg(4) ;rows
-        movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
-        pxor        mm0, mm0              ; mm0 = 00000000
-
-.nextrow:
-
-        xor         rdx,        rdx       ; clear out rdx for use as loop counter
-.nextcol:
-
-        pxor        mm7, mm7              ; mm7 = 00000000
-        movq        mm6, [rbx + 32 ]      ; mm6 = kernel 2 taps
-        movq        mm3, [rsi]            ; mm4 = r0 p0..p7
-        punpcklbw   mm3, mm0              ; mm3 = p0..p3
-        movq        mm1, mm3              ; mm1 = p0..p3
-        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
-
-        movq        mm6, [rbx + 48]       ; mm6 = kernel 3 taps
-        movq        mm5, [rsi + rax]      ; mm4 = r1 p0..p7
-        punpcklbw   mm5, mm0              ; mm5 = r1 p0..p3
-        pmullw      mm6, mm5              ; mm6 *= p0..p3 * kernel 3 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm7, mm1              ; mm7 = r0 p0..p3
-        psubusw     mm7, mm5              ; mm7 = r0 p0..p3 - r1 p0..p3
-        psubusw     mm5, mm1              ; mm5 = r1 p0..p3 - r0 p0..p3
-        paddusw     mm7, mm5              ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
-        pcmpgtw     mm7, mm2
-
-        movq        mm6, [rbx + 64 ]      ; mm6 = kernel 4 modifiers
-        movq        mm5, [rsi + 2*rax]    ; mm4 = r2 p0..p7
-        punpcklbw   mm5, mm0              ; mm5 = r2 p0..p3
-        pmullw      mm6, mm5              ; mm5 *= kernel 4 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm5
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = r0 p0..p3
-        psubusw     mm6, mm5              ; mm6 = r0 p0..p3 - r2 p0..p3
-        psubusw     mm5, mm1              ; mm5 = r2 p0..p3 - r2 p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-
-        neg         rax
-        movq        mm6, [rbx ]           ; kernel 0 taps
-        movq        mm5, [rsi+2*rax]      ; mm4 = r-2 p0..p7
-        punpcklbw   mm5, mm0              ; mm5 = r-2 p0..p3
-        pmullw      mm6, mm5              ; mm5 *= kernel 0 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm5
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = r0 p0..p3
-        psubusw     mm6, mm5              ; mm6 = p0..p3 - r-2 p0..p3
-        psubusw     mm5, mm1              ; mm5 = r-2 p0..p3 - p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-        movq        mm6, [rbx + 16]       ; kernel 1 taps
-        movq        mm4, [rsi+rax]        ; mm4 = r-1 p0..p7
-        punpcklbw   mm4, mm0              ; mm4 = r-1 p0..p3
-        pmullw      mm6, mm4              ; mm4 *= kernel 1 modifiers.
-        paddusw     mm3, mm6              ; mm3 += mm5
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = r0 p0..p3
-        psubusw     mm6, mm4              ; mm6 = p0..p3 - r-2 p0..p3
-        psubusw     mm4, mm1              ; mm5 = r-1 p0..p3 - p0..p3
-        paddusw     mm6, mm4              ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-
-        paddusw     mm3, RD               ; mm3 += round value
-        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
-
-        pand        mm1, mm7              ; mm1 select vals > thresh from source
-        pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
-        paddusw     mm1, mm7              ; combination
-
-        packuswb    mm1, mm0              ; pack to bytes
-
-        movd        [rdi], mm1            ;
-        neg         rax                   ; pitch is positive
-
-
-        add         rsi, 4
-        add         rdi, 4
-        add         rdx, 4
-
-        cmp         edx, dword ptr arg(5) ;cols
-        jl          .nextcol
-        ; done with the all cols, start the across filtering in place
-        sub         rsi, rdx
-        sub         rdi, rdx
-
-
-        push        rax
-        xor         rdx,    rdx
-        mov         rax,    [rdi-4];
-
-.acrossnextcol:
-        pxor        mm7, mm7              ; mm7 = 00000000
-        movq        mm6, [rbx + 32 ]      ;
-        movq        mm4, [rdi+rdx]        ; mm4 = p0..p7
-        movq        mm3, mm4              ; mm3 = p0..p7
-        punpcklbw   mm3, mm0              ; mm3 = p0..p3
-        movq        mm1, mm3              ; mm1 = p0..p3
-        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
-
-        movq        mm6, [rbx + 48]
-        psrlq       mm4, 8                ; mm4 = p1..p7
-        movq        mm5, mm4              ; mm5 = p1..p7
-        punpcklbw   mm5, mm0              ; mm5 = p1..p4
-        pmullw      mm6, mm5              ; mm6 *= p1..p4 * kernel 3 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm6
-
-        ; thresholding
-        movq        mm7, mm1              ; mm7 = p0..p3
-        psubusw     mm7, mm5              ; mm7 = p0..p3 - p1..p4
-        psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     mm7, mm5              ; mm7 = abs(p0..p3 - p1..p4)
-        pcmpgtw     mm7, mm2
-
-        movq        mm6, [rbx + 64 ]
-        psrlq       mm4, 8                ; mm4 = p2..p7
-        movq        mm5, mm4              ; mm5 = p2..p7
-        punpcklbw   mm5, mm0              ; mm5 = p2..p5
-        pmullw      mm6, mm5              ; mm5 *= kernel 4 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm5
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = p0..p3
-        psubusw     mm6, mm5              ; mm6 = p0..p3 - p1..p4
-        psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p1..p4)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-
-        movq        mm6, [rbx ]
-        movq        mm4, [rdi+rdx-2]      ; mm4 = p-2..p5
-        movq        mm5, mm4              ; mm5 = p-2..p5
-        punpcklbw   mm5, mm0              ; mm5 = p-2..p1
-        pmullw      mm6, mm5              ; mm5 *= kernel 0 modifiers
-        paddusw     mm3, mm6              ; mm3 += mm5
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = p0..p3
-        psubusw     mm6, mm5              ; mm6 = p0..p3 - p1..p4
-        psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p1..p4)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-        movq        mm6, [rbx + 16]
-        psrlq       mm4, 8                ; mm4 = p-1..p5
-        punpcklbw   mm4, mm0              ; mm4 = p-1..p2
-        pmullw      mm6, mm4              ; mm4 *= kernel 1 modifiers.
-        paddusw     mm3, mm6              ; mm3 += mm5
-
-        ; thresholding
-        movq        mm6, mm1              ; mm6 = p0..p3
-        psubusw     mm6, mm4              ; mm6 = p0..p3 - p1..p4
-        psubusw     mm4, mm1              ; mm5 = p1..p4 - p0..p3
-        paddusw     mm6, mm4              ; mm6 = abs(p0..p3 - p1..p4)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ; accumulate thresholds
-
-        paddusw     mm3, RD               ; mm3 += round value
-        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
-
-        pand        mm1, mm7              ; mm1 select vals > thresh from source
-        pandn       mm7, mm3              ; mm7 select vals < thresh from blurred result
-        paddusw     mm1, mm7              ; combination
-
-        packuswb    mm1, mm0              ; pack to bytes
-        mov         DWORD PTR [rdi+rdx-4],  eax   ; store previous four bytes
-        movd        eax,    mm1
-
-        add         rdx, 4
-        cmp         edx, dword ptr arg(5) ;cols
-        jl          .acrossnextcol;
-
-        mov         DWORD PTR [rdi+rdx-4],  eax
-        pop         rax
-
-        ; done with this rwo
-        add         rsi,rax               ; next line
-        movsxd      rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch?
-        add         rdi,rax               ; next destination
-        movsxd      rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch?
-
-        dec         rcx                   ; decrement count
-        jnz         .nextrow               ; next row
-        pop         rbx
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef RD
-
-
-;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
-;                             int pitch, int rows, int cols,int flimit)
-extern sym(vp9_rv)
-global sym(vp9_mbpost_proc_down_mmx) PRIVATE
-sym(vp9_mbpost_proc_down_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 136
-
-    ; unsigned char d[16][8] at [rsp]
-    ; create flimit2 at [rsp+128]
-    mov         eax, dword ptr arg(4) ;flimit
-    mov         [rsp+128], eax
-    mov         [rsp+128+4], eax
-%define flimit2 [rsp+128]
-
-%if ABI_IS_32BIT=0
-    lea         r8,       [GLOBAL(sym(vp9_rv))]
-%endif
-
-    ;rows +=8;
-    add         dword ptr arg(2), 8
-
-    ;for(c=0; c<cols; c+=4)
-.loop_col:
-            mov         rsi,        arg(0)  ;s
-            pxor        mm0,        mm0     ;
-
-            movsxd      rax,        dword ptr arg(1) ;pitch       ;
-            neg         rax                                     ; rax = -pitch
-
-            lea         rsi,        [rsi + rax*8];              ; rdi = s[-pitch*8]
-            neg         rax
-
-
-            pxor        mm5,        mm5
-            pxor        mm6,        mm6     ;
-
-            pxor        mm7,        mm7     ;
-            mov         rdi,        rsi
-
-            mov         rcx,        15          ;
-
-.loop_initvar:
-            movd        mm1,        DWORD PTR [rdi];
-            punpcklbw   mm1,        mm0     ;
-
-            paddw       mm5,        mm1     ;
-            pmullw      mm1,        mm1     ;
-
-            movq        mm2,        mm1     ;
-            punpcklwd   mm1,        mm0     ;
-
-            punpckhwd   mm2,        mm0     ;
-            paddd       mm6,        mm1     ;
-
-            paddd       mm7,        mm2     ;
-            lea         rdi,        [rdi+rax]   ;
-
-            dec         rcx
-            jne         .loop_initvar
-            ;save the var and sum
-            xor         rdx,        rdx
-.loop_row:
-            movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]
-            movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]
-
-            punpcklbw   mm1,        mm0
-            punpcklbw   mm2,        mm0
-
-            paddw       mm5,        mm2
-            psubw       mm5,        mm1
-
-            pmullw      mm2,        mm2
-            movq        mm4,        mm2
-
-            punpcklwd   mm2,        mm0
-            punpckhwd   mm4,        mm0
-
-            paddd       mm6,        mm2
-            paddd       mm7,        mm4
-
-            pmullw      mm1,        mm1
-            movq        mm2,        mm1
-
-            punpcklwd   mm1,        mm0
-            psubd       mm6,        mm1
-
-            punpckhwd   mm2,        mm0
-            psubd       mm7,        mm2
-
-
-            movq        mm3,        mm6
-            pslld       mm3,        4
-
-            psubd       mm3,        mm6
-            movq        mm1,        mm5
-
-            movq        mm4,        mm5
-            pmullw      mm1,        mm1
-
-            pmulhw      mm4,        mm4
-            movq        mm2,        mm1
-
-            punpcklwd   mm1,        mm4
-            punpckhwd   mm2,        mm4
-
-            movq        mm4,        mm7
-            pslld       mm4,        4
-
-            psubd       mm4,        mm7
-
-            psubd       mm3,        mm1
-            psubd       mm4,        mm2
-
-            psubd       mm3,        flimit2
-            psubd       mm4,        flimit2
-
-            psrad       mm3,        31
-            psrad       mm4,        31
-
-            packssdw    mm3,        mm4
-            packsswb    mm3,        mm0
-
-            movd        mm1,        DWORD PTR [rsi+rax*8]
-
-            movq        mm2,        mm1
-            punpcklbw   mm1,        mm0
-
-            paddw       mm1,        mm5
-            mov         rcx,        rdx
-
-            and         rcx,        127
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-            push        rax
-            lea         rax,        [GLOBAL(sym(vp9_rv))]
-            movq        mm4,        [rax + rcx*2] ;vp9_rv[rcx*2]
-            pop         rax
-%elif ABI_IS_32BIT=0
-            movq        mm4,        [r8 + rcx*2] ;vp9_rv[rcx*2]
-%else
-            movq        mm4,        [sym(vp9_rv) + rcx*2]
-%endif
-            paddw       mm1,        mm4
-            ;paddw     xmm1,       eight8s
-            psraw       mm1,        4
-
-            packuswb    mm1,        mm0
-            pand        mm1,        mm3
-
-            pandn       mm3,        mm2
-            por         mm1,        mm3
-
-            and         rcx,        15
-            movd        DWORD PTR   [rsp+rcx*4], mm1 ;d[rcx*4]
-
-            mov         rcx,        rdx
-            sub         rcx,        8
-
-            and         rcx,        15
-            movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[rcx*4]
-
-            movd        [rsi],      mm1
-            lea         rsi,        [rsi+rax]
-
-            lea         rdi,        [rdi+rax]
-            add         rdx,        1
-
-            cmp         edx,        dword arg(2) ;rows
-            jl          .loop_row
-
-
-        add         dword arg(0), 4 ; s += 4
-        sub         dword arg(3), 4 ; cols -= 4
-        cmp         dword arg(3), 0
-        jg          .loop_col
-
-    add         rsp, 136
-    pop         rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef flimit2
-
-
-;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int width, unsigned int height, int pitch)
-global sym(vp9_plane_add_noise_mmx) PRIVATE
-sym(vp9_plane_add_noise_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movq        mm1,[rsi+rax]         ; get the source
-
-            psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     mm1, [rdx+32] ;bothclamp
-            psubusb     mm1, [rdx+16] ;whiteclamp
-
-            movq        mm2,[rdi+rax]         ; get the noise for this line
-            paddb       mm1,mm2              ; add it in
-            movq        [rsi+rax],mm1         ; store the result
-
-            add         rax,8                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-SECTION_RODATA
-align 16
-Blur:
-    times 16 dw 16
-    times  8 dw 64
-    times 16 dw 16
-    times  8 dw  0
-
-rd:
-    times 4 dw 0x40
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 372dc83..a9c03f0 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -330,6 +330,9 @@
   if (!vp9_is_valid_scale(&ref_buffer->sf))
     vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                        "Invalid scale factors");
+  if (ref_buffer->buf->corrupted)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Block reference is corrupt");
   vp9_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col,
                        &ref_buffer->sf);
   xd->corrupted |= ref_buffer->buf->corrupted;
@@ -628,11 +631,13 @@
 #endif
   if (cm->width != width || cm->height != height) {
     const int new_mi_rows =
-        calc_mi_size(ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2);
+        ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
     const int new_mi_cols =
-        calc_mi_size(ALIGN_POWER_OF_TWO(width,  MI_SIZE_LOG2) >> MI_SIZE_LOG2);
-    if (new_mi_cols > cm->mi_stride ||
-        (new_mi_rows * new_mi_cols > cm->mi_alloc_size)) {
+        ALIGN_POWER_OF_TWO(width,  MI_SIZE_LOG2) >> MI_SIZE_LOG2;
+
+    // Allocations in vp9_alloc_context_buffers() depend on individual
+    // dimensions as well as the overall size.
+    if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) {
       if (vp9_alloc_context_buffers(cm, width, height))
         vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate context buffers");
@@ -675,6 +680,10 @@
       YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
       width = buf->y_crop_width;
       height = buf->y_crop_height;
+      if (buf->corrupted) {
+        vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                           "Frame reference is corrupt");
+      }
       found = 1;
       break;
     }
@@ -892,7 +901,7 @@
         pbi->mb.corrupted |= tile_data->xd.corrupted;
       }
       // Loopfilter one row.
-      if (cm->lf.filter_level) {
+      if (cm->lf.filter_level && !pbi->mb.corrupted) {
         const int lf_start = mi_row - MI_BLOCK_SIZE;
         LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
 
@@ -915,7 +924,7 @@
   }
 
   // Loopfilter remaining rows in the frame.
-  if (cm->lf.filter_level) {
+  if (cm->lf.filter_level && !pbi->mb.corrupted) {
     LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
     winterface->sync(&pbi->lf_worker);
     lf_data->start = lf_data->stop;
@@ -1442,9 +1451,11 @@
   if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
       cm->frame_parallel_decoding_mode) {
     *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
-    // If multiple threads are used to decode tiles, then we use those threads
-    // to do parallel loopfiltering.
-    vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0);
+    if (!xd->corrupted) {
+      // If multiple threads are used to decode tiles, then we use those threads
+      // to do parallel loopfiltering.
+      vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0);
+    }
   } else {
     *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
   }
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index f461af5..9106b0d 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -25,6 +25,7 @@
 #include "vp9/common/vp9_postproc.h"
 #endif
 #include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_systemdependent.h"
 
 #include "vp9/decoder/vp9_decodeframe.h"
@@ -36,7 +37,9 @@
   static int init_done = 0;
 
   if (!init_done) {
+    vp9_rtcd();
     vp9_init_neighbors();
+    vp9_init_intra_predictors();
     init_done = 1;
   }
 }
@@ -59,8 +62,6 @@
   cm->error.setjmp = 1;
   initialize_dec();
 
-  vp9_rtcd();
-
   // Initialize the references to not point to any frame buffers.
   vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
 
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index aef20f2..b726383 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -112,7 +112,7 @@
   int quant_fp;
 
   // skip forward transform and quantization
-  int skip_txfm[MAX_MB_PLANE << 2];
+  uint8_t skip_txfm[MAX_MB_PLANE << 2];
 
   int64_t bsse[MAX_MB_PLANE << 2];
 
diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h
index fccdaf5..236389b 100644
--- a/vp9/encoder/vp9_context_tree.h
+++ b/vp9/encoder/vp9_context_tree.h
@@ -36,7 +36,7 @@
   // For current partition, only if all Y, U, and V transform blocks'
   // coefficients are quantized to 0, skippable is set to 0.
   int skippable;
-  int skip_txfm[MAX_MB_PLANE << 2];
+  uint8_t skip_txfm[MAX_MB_PLANE << 2];
   int best_mode_index;
   int hybrid_pred_diff;
   int comp_pred_diff;
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index e047f7e..c4cf5ee 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -78,7 +78,8 @@
                                              int mc_avg_stride,
                                              uint8_t *avg, int avg_stride,
                                              int increase_denoising,
-                                             BLOCK_SIZE bs) {
+                                             BLOCK_SIZE bs,
+                                             int motion_magnitude) {
   int r, c;
   const uint8_t *sig_start = sig;
   const uint8_t *mc_avg_start = mc_avg;
@@ -86,6 +87,19 @@
   int diff, adj, absdiff, delta;
   int adj_val[] = {3, 4, 6};
   int total_adj = 0;
+  int shift_inc = 1;
+
+  /* If motion_magnitude is small, making the denoiser more aggressive by
+   * increasing the adjustment for each level. Add another increment for
+   * blocks that are labeled for increase denoising. */
+  if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
+    if (increase_denoising) {
+      shift_inc = 2;
+    }
+    adj_val[0] += shift_inc;
+    adj_val[1] += shift_inc;
+    adj_val[2] += shift_inc;
+  }
 
   // First attempt to apply a strong temporal denoising filter.
   for (r = 0; r < heights[bs]; ++r) {
@@ -130,7 +144,8 @@
   // Otherwise, we try to dampen the filter if the delta is not too high.
   delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising))
            >> 8) + 1;
-  if (delta > delta_thresh(bs, increase_denoising)) {
+
+  if (delta >= delta_thresh(bs, increase_denoising)) {
     return COPY_BLOCK;
   }
 
@@ -191,7 +206,8 @@
                                                          int increase_denoising,
                                                          int mi_row,
                                                          int mi_col,
-                                                         PICK_MODE_CONTEXT *ctx
+                                                         PICK_MODE_CONTEXT *ctx,
+                                                         int *motion_magnitude
                                                          ) {
   int mv_col, mv_row;
   int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
@@ -216,6 +232,8 @@
   mv_col = ctx->best_sse_mv.as_mv.col;
   mv_row = ctx->best_sse_mv.as_mv.row;
 
+  *motion_magnitude = mv_row * mv_row + mv_col * mv_col;
+
   frame = ctx->best_reference_frame;
 
   // If the best reference frame uses inter-prediction and there is enough of a
@@ -303,6 +321,7 @@
 void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
                           int mi_row, int mi_col, BLOCK_SIZE bs,
                           PICK_MODE_CONTEXT *ctx) {
+  int motion_magnitude = 0;
   VP9_DENOISER_DECISION decision = FILTER_BLOCK;
   YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
   YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
@@ -313,13 +332,14 @@
 
   decision = perform_motion_compensation(denoiser, mb, bs,
                                          denoiser->increase_denoising,
-                                         mi_row, mi_col, ctx);
+                                         mi_row, mi_col, ctx,
+                                         &motion_magnitude);
 
   if (decision == FILTER_BLOCK) {
     decision = denoiser_filter(src.buf, src.stride,
                                mc_avg_start, mc_avg.y_stride,
                                avg_start, avg.y_stride,
-                               0, bs);
+                               0, bs, motion_magnitude);
   }
 
   if (decision == FILTER_BLOCK) {
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index 1c827b6..a913add 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -18,6 +18,8 @@
 extern "C" {
 #endif
 
+#define MOTION_MAGNITUDE_THRESHOLD (8*3)
+
 typedef enum vp9_denoiser_decision {
   COPY_BLOCK,
   FILTER_BLOCK
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 0026ce8..2ca91b9 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -24,6 +24,7 @@
 #include "vp9/common/vp9_postproc.h"
 #endif
 #include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_tile_common.h"
 
@@ -142,7 +143,9 @@
   static int init_done = 0;
 
   if (!init_done) {
+    vp9_rtcd();
     vp9_init_neighbors();
+    vp9_init_intra_predictors();
     vp9_coef_tree_initialize();
     vp9_tokenize_initialize();
     vp9_init_me_luts();
@@ -422,21 +425,6 @@
   }
 }
 
-
-static void set_speed_features(VP9_COMP *cpi) {
-#if CONFIG_INTERNAL_STATS
-  int i;
-  for (i = 0; i < MAX_MODES; ++i)
-    cpi->mode_chosen_counts[i] = 0;
-#endif
-
-  vp9_set_speed_features(cpi);
-
-  // Set rd thresholds based on mode and speed setting
-  vp9_set_rd_speed_thresholds(cpi);
-  vp9_set_rd_speed_thresholds_sub8x8(cpi);
-}
-
 static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   const VP9EncoderConfig *oxcf = &cpi->oxcf;
@@ -768,8 +756,6 @@
 
   cm->error.setjmp = 1;
 
-  vp9_rtcd();
-
   cpi->use_svc = 0;
 
   init_config(cpi, oxcf);
@@ -985,7 +971,7 @@
     }
   }
 
-  set_speed_features(cpi);
+  vp9_set_speed_features(cpi);
 
   // Allocate memory to store variances for a frame.
   CHECK_MEM_ERROR(cm, cpi->source_diff_var,
@@ -2246,7 +2232,16 @@
   }
 #endif
 
-  set_speed_features(cpi);
+#if CONFIG_INTERNAL_STATS
+  int i;
+  for (i = 0; i < MAX_MODES; ++i)
+    cpi->mode_chosen_counts[i] = 0;
+#endif
+
+  vp9_set_speed_features(cpi);
+
+  vp9_set_rd_speed_thresholds(cpi);
+  vp9_set_rd_speed_thresholds_sub8x8(cpi);
 
   // Decide q and q bounds.
   q = vp9_rc_pick_q_and_bounds(cpi, &bottom_index, &top_index);
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 9651cee..eee6ffe 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -423,7 +423,7 @@
                                     VP9_ALT_FLAG };
   int64_t best_rd = INT64_MAX;
   int64_t this_rd = INT64_MAX;
-  int skip_txfm = 0;
+  uint8_t skip_txfm = 0;
   int rate = INT_MAX;
   int64_t dist = INT64_MAX;
   // var_y and sse_y are saved to be used in skipping checking
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index b9e4408..b826ff4 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -374,7 +374,6 @@
   uint8_t *ref_y_ptr;
   const int num_mv_refs = MAX_MV_REF_CANDIDATES +
                     (cpi->sf.adaptive_motion_search &&
-                     cpi->common.show_frame &&
                      block_size < cpi->sf.max_partition_size);
 
   MV pred_mv[3];
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 9af6a0c..506c9bc 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1398,7 +1398,7 @@
           mvp_full.row = bsi->mvp.as_mv.row >> 3;
           mvp_full.col = bsi->mvp.as_mv.col >> 3;
 
-          if (cpi->sf.adaptive_motion_search && cm->show_frame) {
+          if (cpi->sf.adaptive_motion_search) {
             mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3;
             mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3;
             step_param = MAX(step_param, 8);
@@ -1815,8 +1815,7 @@
     step_param = cpi->mv_step_param;
   }
 
-  if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 &&
-      cm->show_frame) {
+  if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) {
     int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize),
                                                        b_width_log2(bsize)));
     step_param = MAX(step_param, boffset);
@@ -1876,7 +1875,7 @@
   *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
 
-  if (cpi->sf.adaptive_motion_search && cm->show_frame)
+  if (cpi->sf.adaptive_motion_search)
     x->pred_mv[ref] = tmp_mv->as_mv;
 
   if (scaled_ref_frame) {
@@ -2130,6 +2129,8 @@
                                  int_mv (*mode_mv)[MAX_REF_FRAMES],
                                  int mi_row, int mi_col,
                                  int_mv single_newmv[MAX_REF_FRAMES],
+                                 INTERP_FILTER (*single_filter)[MAX_REF_FRAMES],
+                                 int (*single_skippable)[MAX_REF_FRAMES],
                                  int64_t *psse,
                                  const int64_t ref_best_rd) {
   VP9_COMMON *cm = &cpi->common;
@@ -2153,7 +2154,7 @@
   int orig_dst_stride[MAX_MB_PLANE];
   int rs = 0;
   INTERP_FILTER best_filter = SWITCHABLE;
-  int skip_txfm[MAX_MB_PLANE << 2] = {0};
+  uint8_t skip_txfm[MAX_MB_PLANE << 2] = {0};
   int64_t bsse[MAX_MB_PLANE << 2] = {0};
 
   int bsl = mi_width_log2_lookup[bsize];
@@ -2176,6 +2177,12 @@
     if (frame_mv[refs[0]].as_int == INVALID_MV ||
         frame_mv[refs[1]].as_int == INVALID_MV)
       return INT64_MAX;
+
+    if (cpi->sf.adaptive_mode_search) {
+      if (single_filter[this_mode][refs[0]] ==
+          single_filter[this_mode][refs[1]])
+        best_filter = single_filter[this_mode][refs[0]];
+    }
   }
 
   if (this_mode == NEWMV) {
@@ -2365,8 +2372,19 @@
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
     model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist);
     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
+    vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
+    vpx_memcpy(bsse, x->bsse, sizeof(bsse));
   }
 
+  if (!is_comp_pred)
+    single_filter[this_mode][refs[0]] = mbmi->interp_filter;
+
+  if (cpi->sf.adaptive_mode_search)
+    if (is_comp_pred)
+      if (single_skippable[this_mode][refs[0]] &&
+          single_skippable[this_mode][refs[1]])
+        vpx_memset(skip_txfm, 1, sizeof(skip_txfm));
+
   if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
     // if current pred_error modeled rd is substantially more than the best
     // so far, do not bother doing full rd
@@ -2426,6 +2444,9 @@
     *skippable = skippable_y && skippable_uv;
   }
 
+  if (!is_comp_pred)
+    single_skippable[this_mode][refs[0]] = *skippable;
+
   restore_dst_buf(xd, orig_dst, orig_dst_stride);
   return this_rd;  // if 0, this will be re-calculated by caller
 }
@@ -2532,10 +2553,12 @@
   PREDICTION_MODE this_mode;
   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   unsigned char segment_id = mbmi->segment_id;
-  int comp_pred, i;
+  int comp_pred, i, k;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
   int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
+  INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES];
+  int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
   int64_t best_rd = best_rd_so_far;
@@ -2584,6 +2607,12 @@
     rate_uv_intra[i] = INT_MAX;
   for (i = 0; i < MAX_REF_FRAMES; ++i)
     x->pred_sse[i] = INT_MAX;
+  for (i = 0; i < MB_MODE_COUNT; ++i) {
+    for (k = 0; k < MAX_REF_FRAMES; ++k) {
+      single_inter_filter[i][k] = SWITCHABLE;
+      single_skippable[i][k] = 0;
+    }
+  }
 
   *returnrate = INT_MAX;
 
@@ -2820,6 +2849,8 @@
     // them for this frame.
     mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
                                                           : cm->interp_filter;
+    mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
+
     x->skip = 0;
     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
 
@@ -2866,7 +2897,8 @@
                                   &rate_uv, &distortion_uv,
                                   &disable_skip, frame_mv,
                                   mi_row, mi_col,
-                                  single_newmv, &total_sse, best_rd);
+                                  single_newmv, single_inter_filter,
+                                  single_skippable, &total_sse, best_rd);
       if (this_rd == INT64_MAX)
         continue;
 
@@ -3064,6 +3096,28 @@
       break;
   }
 
+  // The inter modes' rate costs are not calculated precisely in some cases.
+  // Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and
+  // ZEROMV. Here, checks are added for those cases, and the mode decisions
+  // are corrected.
+  if (best_mbmode.mode == NEWMV) {
+    const MV_REFERENCE_FRAME refs[2] = {best_mbmode.ref_frame[0],
+        best_mbmode.ref_frame[1]};
+    int comp_pred_mode = refs[1] > INTRA_FRAME;
+
+    if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+        ((comp_pred_mode && frame_mv[NEARESTMV][refs[1]].as_int ==
+            best_mbmode.mv[1].as_int) || !comp_pred_mode))
+      best_mbmode.mode = NEARESTMV;
+    else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+        ((comp_pred_mode && frame_mv[NEARMV][refs[1]].as_int ==
+            best_mbmode.mv[1].as_int) || !comp_pred_mode))
+      best_mbmode.mode = NEARMV;
+    else if (best_mbmode.mv[0].as_int == 0 &&
+        ((comp_pred_mode && best_mbmode.mv[1].as_int == 0) || !comp_pred_mode))
+      best_mbmode.mode = ZEROMV;
+  }
+
   if (best_mode_index < 0 || best_rd >= best_rd_so_far)
     return INT64_MAX;
 
diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c
index eb5ae2e..afbb191 100644
--- a/vp9/encoder/vp9_variance.c
+++ b/vp9/encoder/vp9_variance.c
@@ -103,8 +103,9 @@
 unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
   unsigned int i, sum = 0;
 
-  for (i = 0; i < 256; i++)
+  for (i = 0; i < 256; ++i) {
     sum += src_ptr[i] * src_ptr[i];
+  }
 
   return sum;
 }
diff --git a/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/vp9/encoder/x86/vp9_variance_impl_sse2.asm
deleted file mode 100644
index 6278f2a..0000000
--- a/vp9/encoder/x86/vp9_variance_impl_sse2.asm
+++ /dev/null
@@ -1,69 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;unsigned int vp9_get_mb_ss_sse2
-;(
-;    short *src_ptr
-;)
-global sym(vp9_get_mb_ss_sse2) PRIVATE
-sym(vp9_get_mb_ss_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 1
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-
-        mov         rax, arg(0) ;[src_ptr]
-        mov         rcx, 8
-        pxor        xmm4, xmm4
-
-.NEXTROW:
-        movdqa      xmm0, [rax]
-        movdqa      xmm1, [rax+16]
-        movdqa      xmm2, [rax+32]
-        movdqa      xmm3, [rax+48]
-        pmaddwd     xmm0, xmm0
-        pmaddwd     xmm1, xmm1
-        pmaddwd     xmm2, xmm2
-        pmaddwd     xmm3, xmm3
-
-        paddd       xmm0, xmm1
-        paddd       xmm2, xmm3
-        paddd       xmm4, xmm0
-        paddd       xmm4, xmm2
-
-        add         rax, 0x40
-        dec         rcx
-        ja          .NEXTROW
-
-        movdqa      xmm3,xmm4
-        psrldq      xmm4,8
-        paddd       xmm4,xmm3
-        movdqa      xmm3,xmm4
-        psrldq      xmm4,4
-        paddd       xmm4,xmm3
-        movq        rax,xmm4
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c
index 1cc4bbc..b4d2b0a 100644
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_variance_sse2.c
@@ -19,6 +19,21 @@
                                        const unsigned char *ref, int ref_stride,
                                        unsigned int *sse, int *sum);
 
+unsigned int vp9_get_mb_ss_sse2(const int16_t *src) {
+  __m128i vsum = _mm_setzero_si128();
+  int i;
+
+  for (i = 0; i < 32; ++i) {
+    const __m128i v = _mm_loadu_si128((const __m128i *)src);
+    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
+    src += 8;
+  }
+
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+  return  _mm_cvtsi128_si32(vsum);
+}
+
 #define READ64(p, stride, i) \
   _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
       _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 8e3e885..90f0342 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -80,7 +80,6 @@
 VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
 endif
 
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 26db30c..0f0b7a5 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -670,7 +670,6 @@
       return VPX_CODEC_MEM_ERROR;
 
     ctx->priv = (vpx_codec_priv_t *)priv;
-    ctx->priv->sz = sizeof(*priv);
     ctx->priv->init_flags = ctx->init_flags;
     ctx->priv->enc.total_encoders = 1;
 
@@ -837,18 +836,19 @@
                                       vpx_enc_frame_flags_t flags,
                                       unsigned long deadline) {
   vpx_codec_err_t res = VPX_CODEC_OK;
+  VP9_COMP *const cpi = ctx->cpi;
   const vpx_rational_t *const timebase = &ctx->cfg.g_timebase;
 
   if (img != NULL) {
     res = validate_img(ctx, img);
     // TODO(jzern) the checks related to cpi's validity should be treated as a
     // failure condition, encoder setup is done fully in init() currently.
-    if (res == VPX_CODEC_OK && ctx->cpi != NULL && ctx->cx_data == NULL) {
+    if (res == VPX_CODEC_OK && cpi != NULL && ctx->cx_data == NULL) {
       // There's no codec control for multiple alt-refs so check the encoder
       // instance for its status to determine the compressed data size.
       ctx->cx_data_sz = ctx->cfg.g_w * ctx->cfg.g_h *
                         get_image_bps(img) / 8 *
-                        (ctx->cpi->multi_arf_allowed ? 8 : 2);
+                        (cpi->multi_arf_allowed ? 8 : 2);
       if (ctx->cx_data_sz < 4096) ctx->cx_data_sz = 4096;
 
       ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz);
@@ -868,7 +868,7 @@
     return VPX_CODEC_INVALID_PARAM;
   }
 
-  vp9_apply_encoding_flags(ctx->cpi, flags);
+  vp9_apply_encoding_flags(cpi, flags);
 
   // Handle fixed keyframe intervals
   if (ctx->cfg.kf_mode == VPX_KF_AUTO &&
@@ -880,7 +880,7 @@
   }
 
   // Initialize the encoder instance on the first frame.
-  if (res == VPX_CODEC_OK && ctx->cpi != NULL) {
+  if (res == VPX_CODEC_OK && cpi != NULL) {
     unsigned int lib_flags = 0;
     YV12_BUFFER_CONFIG sd;
     int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts);
@@ -891,16 +891,15 @@
 
     // Set up internal flags
     if (ctx->base.init_flags & VPX_CODEC_USE_PSNR)
-      ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1;
+      cpi->b_calculate_psnr = 1;
 
     if (img != NULL) {
       res = image2yuvconfig(img, &sd);
 
       // Store the original flags in to the frame buffer. Will extract the
       // key frame flag when we actually encode this frame.
-      if (vp9_receive_raw_frame(ctx->cpi, flags,
+      if (vp9_receive_raw_frame(cpi, flags,
                                 &sd, dst_time_stamp, dst_end_time_stamp)) {
-        VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
         res = update_error_state(ctx, &cpi->common.error);
       }
     }
@@ -925,11 +924,10 @@
     }
 
     while (cx_data_sz >= ctx->cx_data_sz / 2 &&
-           -1 != vp9_get_compressed_data(ctx->cpi, &lib_flags, &size,
+           -1 != vp9_get_compressed_data(cpi, &lib_flags, &size,
                                          cx_data, &dst_time_stamp,
                                          &dst_end_time_stamp, !img)) {
       if (size) {
-        VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi;
         vpx_codec_cx_pkt_t pkt;
 
 #if CONFIG_SPATIAL_SVC
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index b0fb282..393c66e 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -63,7 +63,6 @@
       return VPX_CODEC_MEM_ERROR;
 
     ctx->priv = (vpx_codec_priv_t *)priv;
-    ctx->priv->sz = sizeof(*priv);
     ctx->priv->init_flags = ctx->init_flags;
 
     priv->si.sz = sizeof(priv->si);
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 312f717..e450f7b 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -93,7 +93,6 @@
 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
 
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c
diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h
index 02f2079..cbfffd0 100644
--- a/vpx/internal/vpx_codec_internal.h
+++ b/vpx/internal/vpx_codec_internal.h
@@ -335,7 +335,6 @@
  * and the pointer cast to the proper type.
  */
 struct vpx_codec_priv {
-  unsigned int                    sz;
   const char                     *err_detail;
   vpx_codec_flags_t               init_flags;
   struct {
diff --git a/vpxenc.c b/vpxenc.c
index b1156e1..5afca24 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -1202,7 +1202,7 @@
 
   // Check that the codec bit depth is greater than the input bit depth.
   if (stream->config.cfg.g_input_bit_depth >
-      (int)stream->config.cfg.g_bit_depth) {
+      (unsigned int)stream->config.cfg.g_bit_depth) {
     fatal("Stream %d: codec bit depth (%d) less than input bit depth (%d)",
           stream->index, (int)stream->config.cfg.g_bit_depth,
           stream->config.cfg.g_input_bit_depth);