Merge "x86.h,x86_simd_caps: add an explicit cast w/strtol"
diff --git a/README b/README
index 979440e..29072b9 100644
--- a/README
+++ b/README
@@ -47,7 +47,6 @@
   --help output of the configure script. As of this writing, the list of
   available targets is:
 
-    armv6-darwin-gcc
     armv6-linux-rvct
     armv6-linux-gcc
     armv6-none-rvct
diff --git a/build/make/configure.sh b/build/make/configure.sh
index b305d33..7b471ca 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1018,18 +1018,7 @@
           NM="$(${XCRUN_FIND} nm)"
           RANLIB="$(${XCRUN_FIND} ranlib)"
           AS_SFX=.s
-
-          # Special handling of ld for armv6 because libclang_rt.ios.a does
-          # not contain armv6 support in Apple's clang package:
-          #   Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn).
-          # TODO(tomfinegan): Remove this. Our minimum iOS version (6.0)
-          # renders support for armv6 unnecessary because the 3GS and up
-          # support neon.
-          if [ "${tgt_isa}" = "armv6" ]; then
-            LD="$(${XCRUN_FIND} ld)"
-          else
-            LD="${CXX:-$(${XCRUN_FIND} ld)}"
-          fi
+          LD="${CXX:-$(${XCRUN_FIND} ld)}"
 
           # ASFLAGS is written here instead of using check_add_asflags
           # because we need to overwrite all of ASFLAGS and purge the
@@ -1069,7 +1058,7 @@
           if enabled rvct; then
             # Check if we have CodeSourcery GCC in PATH. Needed for
             # libraries
-            hash arm-none-linux-gnueabi-gcc 2>&- || \
+            which arm-none-linux-gnueabi-gcc 2>&- || \
               die "Couldn't find CodeSourcery GCC from PATH"
 
             # Use armcc as a linker to enable translation of
diff --git a/build/make/gen_msvs_proj.sh b/build/make/gen_msvs_proj.sh
index 0cf335b..2b91fbf 100755
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh
@@ -193,7 +193,7 @@
 done
 
 # Make one call to fix_path for file_list to improve performance.
-fix_file_list
+fix_file_list file_list
 
 outfile=${outfile:-/dev/stdout}
 guid=${guid:-`generate_uuid`}
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index 182ea28..e98611d 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -211,7 +211,7 @@
 done
 
 # Make one call to fix_path for file_list to improve performance.
-fix_file_list
+fix_file_list file_list
 
 outfile=${outfile:-/dev/stdout}
 guid=${guid:-`generate_uuid`}
diff --git a/build/make/msvs_common.sh b/build/make/msvs_common.sh
index 90c1488..88f1cf9 100644
--- a/build/make/msvs_common.sh
+++ b/build/make/msvs_common.sh
@@ -39,11 +39,12 @@
 }
 
 # Corrects the paths in file_list in one pass for efficiency.
+# $1 is the name of the array to be modified.
 fix_file_list() {
-    # TODO(jzern): this could be more generic and take the array as a param.
-    files=$(fix_path "${file_list[@]}")
+    declare -n array_ref=$1
+    files=$(fix_path "${array_ref[@]}")
     local IFS=$'\n'
-    file_list=($files)
+    array_ref=($files)
 }
 
 generate_uuid() {
diff --git a/configure b/configure
index ff97dee..91407d3 100755
--- a/configure
+++ b/configure
@@ -98,7 +98,6 @@
 
 # all_platforms is a list of all supported target platforms. Maintain
 # alphabetically by architecture, generic-gnu last.
-all_platforms="${all_platforms} armv6-darwin-gcc"
 all_platforms="${all_platforms} armv6-linux-rvct"
 all_platforms="${all_platforms} armv6-linux-gcc"
 all_platforms="${all_platforms} armv6-none-rvct"
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 12022be..22a2e77 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -69,6 +69,21 @@
 
 typedef std::tr1::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
 
+#define ALL_SIZES(convolve_fn) \
+    make_tuple(4, 4, &convolve_fn),     \
+    make_tuple(8, 4, &convolve_fn),     \
+    make_tuple(4, 8, &convolve_fn),     \
+    make_tuple(8, 8, &convolve_fn),     \
+    make_tuple(16, 8, &convolve_fn),    \
+    make_tuple(8, 16, &convolve_fn),    \
+    make_tuple(16, 16, &convolve_fn),   \
+    make_tuple(32, 16, &convolve_fn),   \
+    make_tuple(16, 32, &convolve_fn),   \
+    make_tuple(32, 32, &convolve_fn),   \
+    make_tuple(64, 32, &convolve_fn),   \
+    make_tuple(32, 64, &convolve_fn),   \
+    make_tuple(64, 64, &convolve_fn)
+
 // Reference 8-tap subpixel filter, slightly modified to fit into this test.
 #define VP9_FILTER_WEIGHT 128
 #define VP9_FILTER_SHIFT 7
@@ -1034,20 +1049,6 @@
     wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
     wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8,
     wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
-INSTANTIATE_TEST_CASE_P(C_8, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve8_c),
-    make_tuple(8, 4, &convolve8_c),
-    make_tuple(4, 8, &convolve8_c),
-    make_tuple(8, 8, &convolve8_c),
-    make_tuple(16, 8, &convolve8_c),
-    make_tuple(8, 16, &convolve8_c),
-    make_tuple(16, 16, &convolve8_c),
-    make_tuple(32, 16, &convolve8_c),
-    make_tuple(16, 32, &convolve8_c),
-    make_tuple(32, 32, &convolve8_c),
-    make_tuple(64, 32, &convolve8_c),
-    make_tuple(32, 64, &convolve8_c),
-    make_tuple(64, 64, &convolve8_c)));
 const ConvolveFunctions convolve10_c(
     wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
     wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
@@ -1056,20 +1057,6 @@
     wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
     wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10,
     wrap_convolve8_c_10, wrap_convolve8_avg_c_10, 10);
-INSTANTIATE_TEST_CASE_P(C_10, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve10_c),
-    make_tuple(8, 4, &convolve10_c),
-    make_tuple(4, 8, &convolve10_c),
-    make_tuple(8, 8, &convolve10_c),
-    make_tuple(16, 8, &convolve10_c),
-    make_tuple(8, 16, &convolve10_c),
-    make_tuple(16, 16, &convolve10_c),
-    make_tuple(32, 16, &convolve10_c),
-    make_tuple(16, 32, &convolve10_c),
-    make_tuple(32, 32, &convolve10_c),
-    make_tuple(64, 32, &convolve10_c),
-    make_tuple(32, 64, &convolve10_c),
-    make_tuple(64, 64, &convolve10_c)));
 const ConvolveFunctions convolve12_c(
     wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
     wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
@@ -1078,23 +1065,13 @@
     wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
     wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12,
     wrap_convolve8_c_12, wrap_convolve8_avg_c_12, 12);
-INSTANTIATE_TEST_CASE_P(C_12, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve12_c),
-    make_tuple(8, 4, &convolve12_c),
-    make_tuple(4, 8, &convolve12_c),
-    make_tuple(8, 8, &convolve12_c),
-    make_tuple(16, 8, &convolve12_c),
-    make_tuple(8, 16, &convolve12_c),
-    make_tuple(16, 16, &convolve12_c),
-    make_tuple(32, 16, &convolve12_c),
-    make_tuple(16, 32, &convolve12_c),
-    make_tuple(32, 32, &convolve12_c),
-    make_tuple(64, 32, &convolve12_c),
-    make_tuple(32, 64, &convolve12_c),
-    make_tuple(64, 64, &convolve12_c)));
+const ConvolveParam kArrayConvolve_c[] = {
+    ALL_SIZES(convolve8_c),
+    ALL_SIZES(convolve10_c),
+    ALL_SIZES(convolve12_c)
+};
 
 #else
-
 const ConvolveFunctions convolve8_c(
     vpx_convolve_copy_c, vpx_convolve_avg_c,
     vpx_convolve8_horiz_c, vpx_convolve8_avg_horiz_c,
@@ -1103,22 +1080,10 @@
     vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
     vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
-
-INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve8_c),
-    make_tuple(8, 4, &convolve8_c),
-    make_tuple(4, 8, &convolve8_c),
-    make_tuple(8, 8, &convolve8_c),
-    make_tuple(16, 8, &convolve8_c),
-    make_tuple(8, 16, &convolve8_c),
-    make_tuple(16, 16, &convolve8_c),
-    make_tuple(32, 16, &convolve8_c),
-    make_tuple(16, 32, &convolve8_c),
-    make_tuple(32, 32, &convolve8_c),
-    make_tuple(64, 32, &convolve8_c),
-    make_tuple(32, 64, &convolve8_c),
-    make_tuple(64, 64, &convolve8_c)));
+const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(convolve8_c) };
 #endif
+INSTANTIATE_TEST_CASE_P(C, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve_c));
 
 #if HAVE_SSE2 && ARCH_X86_64
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -1158,46 +1123,11 @@
     wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
     wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
     wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12, 12);
-INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve8_sse2),
-    make_tuple(8, 4, &convolve8_sse2),
-    make_tuple(4, 8, &convolve8_sse2),
-    make_tuple(8, 8, &convolve8_sse2),
-    make_tuple(16, 8, &convolve8_sse2),
-    make_tuple(8, 16, &convolve8_sse2),
-    make_tuple(16, 16, &convolve8_sse2),
-    make_tuple(32, 16, &convolve8_sse2),
-    make_tuple(16, 32, &convolve8_sse2),
-    make_tuple(32, 32, &convolve8_sse2),
-    make_tuple(64, 32, &convolve8_sse2),
-    make_tuple(32, 64, &convolve8_sse2),
-    make_tuple(64, 64, &convolve8_sse2),
-    make_tuple(4, 4, &convolve10_sse2),
-    make_tuple(8, 4, &convolve10_sse2),
-    make_tuple(4, 8, &convolve10_sse2),
-    make_tuple(8, 8, &convolve10_sse2),
-    make_tuple(16, 8, &convolve10_sse2),
-    make_tuple(8, 16, &convolve10_sse2),
-    make_tuple(16, 16, &convolve10_sse2),
-    make_tuple(32, 16, &convolve10_sse2),
-    make_tuple(16, 32, &convolve10_sse2),
-    make_tuple(32, 32, &convolve10_sse2),
-    make_tuple(64, 32, &convolve10_sse2),
-    make_tuple(32, 64, &convolve10_sse2),
-    make_tuple(64, 64, &convolve10_sse2),
-    make_tuple(4, 4, &convolve12_sse2),
-    make_tuple(8, 4, &convolve12_sse2),
-    make_tuple(4, 8, &convolve12_sse2),
-    make_tuple(8, 8, &convolve12_sse2),
-    make_tuple(16, 8, &convolve12_sse2),
-    make_tuple(8, 16, &convolve12_sse2),
-    make_tuple(16, 16, &convolve12_sse2),
-    make_tuple(32, 16, &convolve12_sse2),
-    make_tuple(16, 32, &convolve12_sse2),
-    make_tuple(32, 32, &convolve12_sse2),
-    make_tuple(64, 32, &convolve12_sse2),
-    make_tuple(32, 64, &convolve12_sse2),
-    make_tuple(64, 64, &convolve12_sse2)));
+const ConvolveParam kArrayConvolve_sse2[] = {
+    ALL_SIZES(convolve8_sse2),
+    ALL_SIZES(convolve10_sse2),
+    ALL_SIZES(convolve12_sse2)
+};
 #else
 const ConvolveFunctions convolve8_sse2(
 #if CONFIG_USE_X86INC
@@ -1212,21 +1142,10 @@
     vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
-INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve8_sse2),
-    make_tuple(8, 4, &convolve8_sse2),
-    make_tuple(4, 8, &convolve8_sse2),
-    make_tuple(8, 8, &convolve8_sse2),
-    make_tuple(16, 8, &convolve8_sse2),
-    make_tuple(8, 16, &convolve8_sse2),
-    make_tuple(16, 16, &convolve8_sse2),
-    make_tuple(32, 16, &convolve8_sse2),
-    make_tuple(16, 32, &convolve8_sse2),
-    make_tuple(32, 32, &convolve8_sse2),
-    make_tuple(64, 32, &convolve8_sse2),
-    make_tuple(32, 64, &convolve8_sse2),
-    make_tuple(64, 64, &convolve8_sse2)));
+const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2) };
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve_sse2));
 #endif
 
 #if HAVE_SSSE3
@@ -1237,22 +1156,11 @@
     vpx_convolve8_ssse3, vpx_convolve8_avg_ssse3,
     vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
     vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
-    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+    vpx_scaled_2d_ssse3, vpx_scaled_avg_2d_c, 0);
 
-INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve8_ssse3),
-    make_tuple(8, 4, &convolve8_ssse3),
-    make_tuple(4, 8, &convolve8_ssse3),
-    make_tuple(8, 8, &convolve8_ssse3),
-    make_tuple(16, 8, &convolve8_ssse3),
-    make_tuple(8, 16, &convolve8_ssse3),
-    make_tuple(16, 16, &convolve8_ssse3),
-    make_tuple(32, 16, &convolve8_ssse3),
-    make_tuple(16, 32, &convolve8_ssse3),
-    make_tuple(32, 32, &convolve8_ssse3),
-    make_tuple(64, 32, &convolve8_ssse3),
-    make_tuple(32, 64, &convolve8_ssse3),
-    make_tuple(64, 64, &convolve8_ssse3)));
+const ConvolveParam kArrayConvolve8_ssse3[] = { ALL_SIZES(convolve8_ssse3) };
+INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve8_ssse3));
 #endif
 
 #if HAVE_AVX2 && HAVE_SSSE3
@@ -1265,20 +1173,9 @@
     vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
-INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve8_avx2),
-    make_tuple(8, 4, &convolve8_avx2),
-    make_tuple(4, 8, &convolve8_avx2),
-    make_tuple(8, 8, &convolve8_avx2),
-    make_tuple(8, 16, &convolve8_avx2),
-    make_tuple(16, 8, &convolve8_avx2),
-    make_tuple(16, 16, &convolve8_avx2),
-    make_tuple(32, 16, &convolve8_avx2),
-    make_tuple(16, 32, &convolve8_avx2),
-    make_tuple(32, 32, &convolve8_avx2),
-    make_tuple(64, 32, &convolve8_avx2),
-    make_tuple(32, 64, &convolve8_avx2),
-    make_tuple(64, 64, &convolve8_avx2)));
+const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
+INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve8_avx2));
 #endif  // HAVE_AVX2 && HAVE_SSSE3
 
 #if HAVE_NEON
@@ -1302,20 +1199,9 @@
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 #endif  // HAVE_NEON_ASM
 
-INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve8_neon),
-    make_tuple(8, 4, &convolve8_neon),
-    make_tuple(4, 8, &convolve8_neon),
-    make_tuple(8, 8, &convolve8_neon),
-    make_tuple(16, 8, &convolve8_neon),
-    make_tuple(8, 16, &convolve8_neon),
-    make_tuple(16, 16, &convolve8_neon),
-    make_tuple(32, 16, &convolve8_neon),
-    make_tuple(16, 32, &convolve8_neon),
-    make_tuple(32, 32, &convolve8_neon),
-    make_tuple(64, 32, &convolve8_neon),
-    make_tuple(32, 64, &convolve8_neon),
-    make_tuple(64, 64, &convolve8_neon)));
+const ConvolveParam kArrayConvolve8_neon[] = { ALL_SIZES(convolve8_neon) };
+INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve8_neon));
 #endif  // HAVE_NEON
 
 #if HAVE_DSPR2
@@ -1328,21 +1214,10 @@
     vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
-INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve8_dspr2),
-    make_tuple(8, 4, &convolve8_dspr2),
-    make_tuple(4, 8, &convolve8_dspr2),
-    make_tuple(8, 8, &convolve8_dspr2),
-    make_tuple(16, 8, &convolve8_dspr2),
-    make_tuple(8, 16, &convolve8_dspr2),
-    make_tuple(16, 16, &convolve8_dspr2),
-    make_tuple(32, 16, &convolve8_dspr2),
-    make_tuple(16, 32, &convolve8_dspr2),
-    make_tuple(32, 32, &convolve8_dspr2),
-    make_tuple(64, 32, &convolve8_dspr2),
-    make_tuple(32, 64, &convolve8_dspr2),
-    make_tuple(64, 64, &convolve8_dspr2)));
-#endif
+const ConvolveParam kArrayConvolve8_dspr2[] = { ALL_SIZES(convolve8_dspr2) };
+INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve8_dspr2));
+#endif  // HAVE_DSPR2
 
 #if HAVE_MSA
 const ConvolveFunctions convolve8_msa(
@@ -1354,19 +1229,8 @@
     vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
-INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve8_msa),
-    make_tuple(8, 4, &convolve8_msa),
-    make_tuple(4, 8, &convolve8_msa),
-    make_tuple(8, 8, &convolve8_msa),
-    make_tuple(16, 8, &convolve8_msa),
-    make_tuple(8, 16, &convolve8_msa),
-    make_tuple(16, 16, &convolve8_msa),
-    make_tuple(32, 16, &convolve8_msa),
-    make_tuple(16, 32, &convolve8_msa),
-    make_tuple(32, 32, &convolve8_msa),
-    make_tuple(64, 32, &convolve8_msa),
-    make_tuple(32, 64, &convolve8_msa),
-    make_tuple(64, 64, &convolve8_msa)));
+const ConvolveParam kArrayConvolve8_msa[] = { ALL_SIZES(convolve8_msa) };
+INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve8_msa));
 #endif  // HAVE_MSA
 }  // namespace
diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc
index 4983d7f..f0b8cef 100644
--- a/test/vp9_ethread_test.cc
+++ b/test/vp9_ethread_test.cc
@@ -29,16 +29,9 @@
         encoding_mode_(GET_PARAM(1)),
         set_cpu_used_(GET_PARAM(2)) {
     init_flags_ = VPX_CODEC_USE_PSNR;
-    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
-    cfg.w = 1280;
-    cfg.h = 720;
-    decoder_ = codec_->CreateDecoder(cfg, 0);
-
     md5_.clear();
   }
-  virtual ~VPxEncoderThreadTest() {
-    delete decoder_;
-  }
+  virtual ~VPxEncoderThreadTest() {}
 
   virtual void SetUp() {
     InitializeConfig();
@@ -81,31 +74,28 @@
     }
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
-#if CONFIG_VP9_DECODER
-    const vpx_codec_err_t res = decoder_->DecodeFrame(
-        reinterpret_cast<uint8_t*>(pkt->data.frame.buf), pkt->data.frame.sz);
-    if (res != VPX_CODEC_OK) {
-      abort_ = true;
-      ASSERT_EQ(VPX_CODEC_OK, res);
-    }
-    const vpx_image_t *img = decoder_->GetDxData().Next();
+  virtual void DecompressedFrameHook(const vpx_image_t &img,
+                                     vpx_codec_pts_t /*pts*/) {
+    ::libvpx_test::MD5 md5_res;
+    md5_res.Add(&img);
+    md5_.push_back(md5_res.Get());
+  }
 
-    if (img) {
-      ::libvpx_test::MD5 md5_res;
-      md5_res.Add(img);
-      md5_.push_back(md5_res.Get());
+  virtual bool HandleDecodeResult(const vpx_codec_err_t res,
+                                  const libvpx_test::VideoSource& /*video*/,
+                                  libvpx_test::Decoder * /*decoder*/) {
+    if (res != VPX_CODEC_OK) {
+      EXPECT_EQ(VPX_CODEC_OK, res);
+      return false;
     }
-#else
-    ASSERT_EQ(NULL, decoder_);
-#endif
+
+    return true;
   }
 
   bool encoder_initialized_;
   int tiles_;
   ::libvpx_test::TestMode encoding_mode_;
   int set_cpu_used_;
-  ::libvpx_test::Decoder *decoder_;
   std::vector<std::string> md5_;
 };
 
diff --git a/vp10/common/postproc.c b/vp10/common/postproc.c
index a6ea9c0..e8a9f81 100644
--- a/vp10/common/postproc.c
+++ b/vp10/common/postproc.c
@@ -13,6 +13,7 @@
 #include <stdio.h>
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 #include "./vp10_rtcd.h"
 
@@ -587,32 +588,6 @@
   state->last_noise = a;
 }
 
-void vp10_plane_add_noise_c(uint8_t *start, char *noise,
-                           char blackclamp[16],
-                           char whiteclamp[16],
-                           char bothclamp[16],
-                           unsigned int width, unsigned int height, int pitch) {
-  unsigned int i, j;
-
-  // TODO(jbb): why does simd code use both but c doesn't,  normalize and
-  // fix..
-  (void) bothclamp;
-  for (i = 0; i < height; i++) {
-    uint8_t *pos = start + i * pitch;
-    char  *ref = (char *)(noise + (rand() & 0xff));  // NOLINT
-
-    for (j = 0; j < width; j++) {
-      if (pos[j] < blackclamp[0])
-        pos[j] = blackclamp[0];
-
-      if (pos[j] > 255 + whiteclamp[0])
-        pos[j] = 255 + whiteclamp[0];
-
-      pos[j] += ref[j];
-    }
-  }
-}
-
 static void swap_mi_and_prev_mi(VP10_COMMON *cm) {
   // Current mip will be the prev_mip for the next frame.
   MODE_INFO *temp = cm->postproc_state.prev_mip;
@@ -727,7 +702,7 @@
       fillrd(ppstate, 63 - q, noise_level);
     }
 
-    vp10_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
+    vpx_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
                         ppstate->whiteclamp, ppstate->bothclamp,
                         ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride);
   }
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 9860bae..f2414f8 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -70,10 +70,6 @@
 specialize qw/vp10_post_proc_down_and_across sse2/;
 $vp10_post_proc_down_and_across_sse2=vp10_post_proc_down_and_across_xmm;
 
-add_proto qw/void vp10_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-specialize qw/vp10_plane_add_noise sse2/;
-$vp10_plane_add_noise_sse2=vp10_plane_add_noise_wmt;
-
 add_proto qw/void vp10_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
 specialize qw/vp10_filter_by_weight16x16 sse2 msa/;
 
@@ -326,9 +322,6 @@
 
     add_proto qw/void vp10_highbd_post_proc_down_and_across/, "const uint16_t *src_ptr, uint16_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
     specialize qw/vp10_highbd_post_proc_down_and_across/;
-
-    add_proto qw/void vp10_highbd_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-    specialize qw/vp10_highbd_plane_add_noise/;
   }
 
   #
diff --git a/vp10/common/x86/postproc_sse2.asm b/vp10/common/x86/postproc_sse2.asm
index d5f8e92..d477a65 100644
--- a/vp10/common/x86/postproc_sse2.asm
+++ b/vp10/common/x86/postproc_sse2.asm
@@ -624,68 +624,6 @@
 %undef flimit4
 
 
-;void vp10_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int width, unsigned int height, int pitch)
-global sym(vp10_plane_add_noise_wmt) PRIVATE
-sym(vp10_plane_add_noise_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movdqu      xmm1,[rsi+rax]         ; get the source
-
-            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     xmm1, [rdx+32] ;bothclamp
-            psubusb     xmm1, [rdx+16] ;whiteclamp
-
-            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
-            paddb       xmm1,xmm2              ; add it in
-            movdqu      [rsi+rax],xmm1         ; store the result
-
-            add         rax,16                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
 align 16
 rd42:
diff --git a/vp10/encoder/lookahead.c b/vp10/encoder/lookahead.c
index dce0139..3185cb6 100644
--- a/vp10/encoder/lookahead.c
+++ b/vp10/encoder/lookahead.c
@@ -20,8 +20,8 @@
 
 /* Return the buffer at the given absolute index and increment the index */
 static struct lookahead_entry *pop(struct lookahead_ctx *ctx,
-                                   unsigned int *idx) {
-  unsigned int index = *idx;
+                                   int *idx) {
+  int index = *idx;
   struct lookahead_entry *buf = ctx->buf + index;
 
   assert(index < ctx->max_sz);
@@ -35,7 +35,7 @@
 void vp10_lookahead_destroy(struct lookahead_ctx *ctx) {
   if (ctx) {
     if (ctx->buf) {
-      unsigned int i;
+      int i;
 
       for (i = 0; i < ctx->max_sz; i++)
         vpx_free_frame_buffer(&ctx->buf[i].img);
@@ -221,9 +221,9 @@
 
   if (index >= 0) {
     // Forward peek
-    if (index < (int)ctx->sz) {
+    if (index < ctx->sz) {
       index += ctx->read_idx;
-      if (index >= (int)ctx->max_sz)
+      if (index >= ctx->max_sz)
         index -= ctx->max_sz;
       buf = ctx->buf + index;
     }
diff --git a/vp10/encoder/lookahead.h b/vp10/encoder/lookahead.h
index 22429ae..f650f80 100644
--- a/vp10/encoder/lookahead.h
+++ b/vp10/encoder/lookahead.h
@@ -31,10 +31,10 @@
 #define MAX_PRE_FRAMES 1
 
 struct lookahead_ctx {
-  unsigned int max_sz;         /* Absolute size of the queue */
-  unsigned int sz;             /* Number of buffers currently in the queue */
-  unsigned int read_idx;       /* Read index */
-  unsigned int write_idx;      /* Write index */
+  int max_sz;                  /* Absolute size of the queue */
+  int sz;                      /* Number of buffers currently in the queue */
+  int read_idx;                /* Read index */
+  int write_idx;               /* Write index */
   struct lookahead_entry *buf; /* Buffer list */
 };
 
diff --git a/vp10/encoder/quantize.c b/vp10/encoder/quantize.c
index 86b324f..136efe3 100644
--- a/vp10/encoder/quantize.c
+++ b/vp10/encoder/quantize.c
@@ -219,12 +219,12 @@
 
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
   unsigned t;
-  int l;
+  int l, m;
   t = d;
   for (l = 0; t > 1; l++)
     t >>= 1;
-  t = 1 + (1 << (16 + l)) / d;
-  *quant = (int16_t)(t - (1 << 16));
+  m = 1 + (1 << (16 + l)) / d;
+  *quant = (int16_t)(m - (1 << 16));
   *shift = 1 << (16 - l);
 }
 
diff --git a/vp8/common/mips/msa/postproc_msa.c b/vp8/common/mips/msa/postproc_msa.c
index c88f302..23dcde2 100644
--- a/vp8/common/mips/msa/postproc_msa.c
+++ b/vp8/common/mips/msa/postproc_msa.c
@@ -10,6 +10,7 @@
 
 #include <stdlib.h>
 #include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp8/common/mips/msa/vp8_macros_msa.h"
 
 static const int16_t vp8_rv_msa[] =
@@ -798,54 +799,3 @@
         }
     }
 }
-
-void vp8_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
-                             char blackclamp[16], char whiteclamp[16],
-                             char bothclamp[16],
-                             uint32_t width, uint32_t height,
-                             int32_t pitch)
-{
-    uint32_t i, j;
-
-    for (i = 0; i < height / 2; ++i)
-    {
-        uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
-        int8_t *ref0_ptr = (int8_t *) (noise + (rand() & 0xff));
-        uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
-        int8_t *ref1_ptr = (int8_t *) (noise + (rand() & 0xff));
-        for (j = width / 16; j--;)
-        {
-            v16i8 temp00_s, temp01_s;
-            v16u8 temp00, temp01, black_clamp, white_clamp;
-            v16u8 pos0, ref0, pos1, ref1;
-            v16i8 const127 = __msa_ldi_b(127);
-
-            pos0 = LD_UB(pos0_ptr);
-            ref0 = LD_UB(ref0_ptr);
-            pos1 = LD_UB(pos1_ptr);
-            ref1 = LD_UB(ref1_ptr);
-            black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
-            white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);
-            temp00 = (pos0 < black_clamp);
-            pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
-            temp01 = (pos1 < black_clamp);
-            pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);
-            XORI_B2_128_UB(pos0, pos1);
-            temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);
-            temp00 = (v16u8)(temp00_s < pos0);
-            pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);
-            temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
-            temp01 = (temp01_s < pos1);
-            pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
-            XORI_B2_128_UB(pos0, pos1);
-            pos0 += ref0;
-            ST_UB(pos0, pos0_ptr);
-            pos1 += ref1;
-            ST_UB(pos1, pos1_ptr);
-            pos0_ptr += 16;
-            pos1_ptr += 16;
-            ref0_ptr += 16;
-            ref1_ptr += 16;
-        }
-    }
-}
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 322b613..6baf00f 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -10,6 +10,7 @@
 
 
 #include "vpx_config.h"
+#include "vpx_dsp_rtcd.h"
 #include "vp8_rtcd.h"
 #include "vpx_scale_rtcd.h"
 #include "vpx_scale/yv12config.h"
@@ -490,54 +491,6 @@
     state->last_noise = a;
 }
 
-/****************************************************************************
- *
- *  ROUTINE       : plane_add_noise_c
- *
- *  INPUTS        : unsigned char *Start    starting address of buffer to add gaussian
- *                                  noise to
- *                  unsigned int Width    width of plane
- *                  unsigned int Height   height of plane
- *                  int  Pitch    distance between subsequent lines of frame
- *                  int  q        quantizer used to determine amount of noise
- *                                  to add
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void.
- *
- *  FUNCTION      : adds gaussian noise to a plane of pixels
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-void vp8_plane_add_noise_c(unsigned char *Start, char *noise,
-                           char blackclamp[16],
-                           char whiteclamp[16],
-                           char bothclamp[16],
-                           unsigned int Width, unsigned int Height, int Pitch)
-{
-    unsigned int i, j;
-    (void)bothclamp;
-
-    for (i = 0; i < Height; i++)
-    {
-        unsigned char *Pos = Start + i * Pitch;
-        char  *Ref = (char *)(noise + (rand() & 0xff));
-
-        for (j = 0; j < Width; j++)
-        {
-            if (Pos[j] < blackclamp[0])
-                Pos[j] = blackclamp[0];
-
-            if (Pos[j] > 255 + whiteclamp[0])
-                Pos[j] = 255 + whiteclamp[0];
-
-            Pos[j] += Ref[j];
-        }
-    }
-}
-
 /* Blend the macro block with a solid colored square.  Leave the
  * edges unblended to give distinction to macro blocks in areas
  * filled with the same color block.
@@ -828,7 +781,7 @@
             fillrd(&oci->postproc_state, 63 - q, noise_level);
         }
 
-        vp8_plane_add_noise
+        vpx_plane_add_noise
         (oci->post_proc_buffer.y_buffer,
          oci->postproc_state.noise,
          oci->postproc_state.blackclamp,
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 6799c27..b942d5b 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -167,10 +167,6 @@
     add_proto qw/void vp8_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
     specialize qw/vp8_post_proc_down_and_across_mb_row sse2 msa/;
 
-    add_proto qw/void vp8_plane_add_noise/, "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch";
-    specialize qw/vp8_plane_add_noise mmx sse2 msa/;
-    $vp8_plane_add_noise_sse2=vp8_plane_add_noise_wmt;
-
     add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
     # no asm yet
 
diff --git a/vp8/common/x86/postproc_mmx.asm b/vp8/common/x86/postproc_mmx.asm
index a2b1632..1a89e7e 100644
--- a/vp8/common/x86/postproc_mmx.asm
+++ b/vp8/common/x86/postproc_mmx.asm
@@ -241,68 +241,6 @@
 %undef flimit2
 
 
-;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int Width, unsigned int Height, int Pitch)
-global sym(vp8_plane_add_noise_mmx) PRIVATE
-sym(vp8_plane_add_noise_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movq        mm1,[rsi+rax]         ; get the source
-
-            psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     mm1, [rdx+32] ;bothclamp
-            psubusb     mm1, [rdx+16] ;whiteclamp
-
-            movq        mm2,[rdi+rax]         ; get the noise for this line
-            paddb       mm1,mm2              ; add it in
-            movq        [rsi+rax],mm1         ; store the result
-
-            add         rax,8                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
 align 16
 Blur:
diff --git a/vp8/common/x86/postproc_sse2.asm b/vp8/common/x86/postproc_sse2.asm
index fed4ee5..de17afa 100644
--- a/vp8/common/x86/postproc_sse2.asm
+++ b/vp8/common/x86/postproc_sse2.asm
@@ -655,68 +655,6 @@
 %undef flimit4
 
 
-;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int Width, unsigned int Height, int Pitch)
-global sym(vp8_plane_add_noise_wmt) PRIVATE
-sym(vp8_plane_add_noise_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movdqu      xmm1,[rsi+rax]         ; get the source
-
-            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     xmm1, [rdx+32] ;bothclamp
-            psubusb     xmm1, [rdx+16] ;whiteclamp
-
-            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
-            paddb       xmm1,xmm2              ; add it in
-            movdqu      [rsi+rax],xmm1         ; store the result
-
-            add         rax,16                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
 align 16
 four8s:
diff --git a/vp8/encoder/vp8_quantize.c b/vp8/encoder/vp8_quantize.c
index ee922c9..0d101ba 100644
--- a/vp8/encoder/vp8_quantize.c
+++ b/vp8/encoder/vp8_quantize.c
@@ -227,12 +227,12 @@
     if(improved_quant)
     {
         unsigned t;
-        int l;
+        int l, m;
         t = d;
         for(l = 0; t > 1; l++)
             t>>=1;
-        t = 1 + (1<<(16+l))/d;
-        *quant = (short)(t - (1<<16));
+        m = 1 + (1<<(16+l))/d;
+        *quant = (short)(m - (1<<16));
         *shift = l;
         /* use multiplication and constant shift by 16 */
         *shift = 1 << (16 - *shift);
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index b685d81..c04cc8f 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -12,6 +12,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_scale_rtcd.h"
 #include "./vp9_rtcd.h"
@@ -587,32 +588,6 @@
   state->last_noise = a;
 }
 
-void vp9_plane_add_noise_c(uint8_t *start, char *noise,
-                           char blackclamp[16],
-                           char whiteclamp[16],
-                           char bothclamp[16],
-                           unsigned int width, unsigned int height, int pitch) {
-  unsigned int i, j;
-
-  // TODO(jbb): why does simd code use both but c doesn't,  normalize and
-  // fix..
-  (void) bothclamp;
-  for (i = 0; i < height; i++) {
-    uint8_t *pos = start + i * pitch;
-    char  *ref = (char *)(noise + (rand() & 0xff));  // NOLINT
-
-    for (j = 0; j < width; j++) {
-      if (pos[j] < blackclamp[0])
-        pos[j] = blackclamp[0];
-
-      if (pos[j] > 255 + whiteclamp[0])
-        pos[j] = 255 + whiteclamp[0];
-
-      pos[j] += ref[j];
-    }
-  }
-}
-
 static void swap_mi_and_prev_mi(VP9_COMMON *cm) {
   // Current mip will be the prev_mip for the next frame.
   MODE_INFO *temp = cm->postproc_state.prev_mip;
@@ -726,8 +701,7 @@
         ppstate->last_noise != noise_level) {
       fillrd(ppstate, 63 - q, noise_level);
     }
-
-    vp9_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
+    vpx_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
                         ppstate->whiteclamp, ppstate->bothclamp,
                         ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride);
   }
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 1cf636c..d7f5a21 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -70,10 +70,6 @@
 specialize qw/vp9_post_proc_down_and_across sse2/;
 $vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;
 
-add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-specialize qw/vp9_plane_add_noise sse2/;
-$vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
-
 add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
 specialize qw/vp9_filter_by_weight16x16 sse2 msa/;
 
@@ -169,9 +165,6 @@
 
     add_proto qw/void vp9_highbd_post_proc_down_and_across/, "const uint16_t *src_ptr, uint16_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
     specialize qw/vp9_highbd_post_proc_down_and_across/;
-
-    add_proto qw/void vp9_highbd_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-    specialize qw/vp9_highbd_plane_add_noise/;
   }
 
   #
diff --git a/vp9/common/x86/vp9_postproc_sse2.asm b/vp9/common/x86/vp9_postproc_sse2.asm
index ec8bfdb..4307628 100644
--- a/vp9/common/x86/vp9_postproc_sse2.asm
+++ b/vp9/common/x86/vp9_postproc_sse2.asm
@@ -624,68 +624,6 @@
 %undef flimit4
 
 
-;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int width, unsigned int height, int pitch)
-global sym(vp9_plane_add_noise_wmt) PRIVATE
-sym(vp9_plane_add_noise_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movdqu      xmm1,[rsi+rax]         ; get the source
-
-            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     xmm1, [rdx+32] ;bothclamp
-            psubusb     xmm1, [rdx+16] ;whiteclamp
-
-            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
-            paddb       xmm1,xmm2              ; add it in
-            movdqu      [rsi+rax],xmm1         ; store the result
-
-            add         rax,16                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
 align 16
 rd42:
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index ae5ca7d..e96c96c 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -662,12 +662,79 @@
   }
 }
 
+#if !CONFIG_VP9_HIGHBITDEPTH
+// Check if most of the superblock is skin content, and if so, force split to
+// 32x32, and set x->sb_is_skin for use in mode selection.
+static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res,
+                         int mi_row, int mi_col, int *force_split) {
+  VP9_COMMON * const cm = &cpi->common;
+  // Avoid checking superblocks on/near boundary and avoid low resolutions.
+  // Note superblock may still pick 64X64 if y_sad is very small
+  // (i.e., y_sad < cpi->vbp_threshold_sad) below. For now leave this as is.
+  if (!low_res && (mi_col >= 8 && mi_col + 8 < cm->mi_cols && mi_row >= 8 &&
+      mi_row + 8 < cm->mi_rows)) {
+    int num_16x16_skin = 0;
+    int num_16x16_nonskin = 0;
+    uint8_t *ysignal = x->plane[0].src.buf;
+    uint8_t *usignal = x->plane[1].src.buf;
+    uint8_t *vsignal = x->plane[2].src.buf;
+    int sp = x->plane[0].src.stride;
+    int spuv = x->plane[1].src.stride;
+    const int block_index = mi_row * cm->mi_cols + mi_col;
+    const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+    const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+    const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+    const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+    // Loop through the 16x16 sub-blocks.
+    int i, j;
+    for (i = 0; i < ymis; i+=2) {
+      for (j = 0; j < xmis; j+=2) {
+        int bl_index = block_index + i * cm->mi_cols + j;
+        int bl_index1 = bl_index + 1;
+        int bl_index2 = bl_index + cm->mi_cols;
+        int bl_index3 = bl_index2 + 1;
+        int consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index],
+                                   VPXMIN(cpi->consec_zero_mv[bl_index1],
+                                   VPXMIN(cpi->consec_zero_mv[bl_index2],
+                                   cpi->consec_zero_mv[bl_index3])));
+        int is_skin = vp9_compute_skin_block(ysignal,
+                                             usignal,
+                                             vsignal,
+                                             sp,
+                                             spuv,
+                                             BLOCK_16X16,
+                                             consec_zeromv,
+                                             0);
+        num_16x16_skin += is_skin;
+        num_16x16_nonskin += (1 - is_skin);
+        if (num_16x16_nonskin > 3) {
+          // Exit loop if at least 4 of the 16x16 blocks are not skin.
+          i = ymis;
+          break;
+        }
+        ysignal += 16;
+        usignal += 8;
+        vsignal += 8;
+      }
+      ysignal += (sp << 4) - 64;
+      usignal += (spuv << 3) - 32;
+      vsignal += (spuv << 3) - 32;
+    }
+    if (num_16x16_skin > 12) {
+      *force_split = 1;
+      return 1;
+    }
+  }
+  return 0;
+}
+#endif
+
 // This function chooses partitioning based on the variance between source and
 // reconstructed last, where variance is computed for down-sampled inputs.
 static int choose_partitioning(VP9_COMP *cpi,
-                                const TileInfo *const tile,
-                                MACROBLOCK *x,
-                                int mi_row, int mi_col) {
+                               const TileInfo *const tile,
+                               MACROBLOCK *x,
+                               int mi_row, int mi_col) {
   VP9_COMMON * const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   int i, j, k, m;
@@ -771,70 +838,13 @@
     set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
 
-    // Check if most of the superblock is skin content, and if so, force split
-    // to 32x32, and set x->sb_is_skin for use in mode selection.
-    // Avoid checking superblocks on/near boundary and avoid low resolutions.
-    // Note superblock may still pick 64X64 if y_sad is very small
-    // (i.e., y_sad < cpi->vbp_threshold_sad) below. For now leave this as is.
     x->sb_is_skin = 0;
 #if !CONFIG_VP9_HIGHBITDEPTH
-    if (cpi->use_skin_detection && !low_res && (mi_col >= 8 &&
-        mi_col + 8 < cm->mi_cols && mi_row >= 8 && mi_row + 8 < cm->mi_rows)) {
-      int bl_index1, bl_index2, bl_index3;
-      int num_16x16_skin = 0;
-      int num_16x16_nonskin = 0;
-      int is_skin = 0;
-      int consec_zeromv = 0;
-      uint8_t *ysignal = x->plane[0].src.buf;
-      uint8_t *usignal = x->plane[1].src.buf;
-      uint8_t *vsignal = x->plane[2].src.buf;
-      int spuv = x->plane[1].src.stride;
-      const int block_index = mi_row * cm->mi_cols + mi_col;
-      const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
-      const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
-      const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
-      const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
-      // Loop through the 16x16 sub-blocks.
-      int j, i;
-      for (i = 0; i < ymis; i+=2) {
-        for (j = 0; j < xmis; j+=2) {
-          int bl_index = block_index + i * cm->mi_cols + j;
-          bl_index1 = bl_index + 1;
-          bl_index2 = bl_index + cm->mi_cols;
-          bl_index3 = bl_index2 + 1;
-          consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index],
-                                 VPXMIN(cpi->consec_zero_mv[bl_index1],
-                                 VPXMIN(cpi->consec_zero_mv[bl_index2],
-                                 cpi->consec_zero_mv[bl_index3])));
-          is_skin = vp9_compute_skin_block(ysignal,
-                                           usignal,
-                                           vsignal,
-                                           sp,
-                                           spuv,
-                                           BLOCK_16X16,
-                                           consec_zeromv,
-                                           0);
-          num_16x16_skin += is_skin;
-          num_16x16_nonskin += (1 - is_skin);
-          if (num_16x16_nonskin > 3) {
-            // Exit loop if at least 4 of the 16x16 blocks are not skin.
-            i = ymis;
-            j = xmis;
-          }
-          ysignal += 16;
-          usignal += 8;
-          vsignal += 8;
-        }
-        ysignal += (sp << 4) - 64;
-        usignal += (spuv << 3) - 32;
-        vsignal += (spuv << 3) - 32;
-      }
-      if (num_16x16_skin > 12) {
-        x->sb_is_skin = 1;
-        force_split[0] = 1;
-      }
-    }
+    if (cpi->use_skin_detection)
+      x->sb_is_skin = skin_sb_split(cpi, x, low_res, mi_row, mi_col,
+                                    &force_split[0]);
 #endif
+
     for (i = 1; i <= 2; ++i) {
       struct macroblock_plane  *p = &x->plane[i];
       struct macroblockd_plane *pd = &xd->plane[i];
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 68537e9..4be043d 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1692,9 +1692,6 @@
   cpi->use_skin_detection = 0;
   cpi->common.buffer_pool = pool;
 
-  cpi->rc.high_source_sad = 0;
-  cpi->rc.count_last_scene_change = 0;
-
   init_config(cpi, oxcf);
   vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
 
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index def9b8c..441280c 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -20,8 +20,8 @@
 
 /* Return the buffer at the given absolute index and increment the index */
 static struct lookahead_entry *pop(struct lookahead_ctx *ctx,
-                                   unsigned int *idx) {
-  unsigned int index = *idx;
+                                   int *idx) {
+  int index = *idx;
   struct lookahead_entry *buf = ctx->buf + index;
 
   assert(index < ctx->max_sz);
@@ -35,7 +35,7 @@
 void vp9_lookahead_destroy(struct lookahead_ctx *ctx) {
   if (ctx) {
     if (ctx->buf) {
-      unsigned int i;
+      int i;
 
       for (i = 0; i < ctx->max_sz; i++)
         vpx_free_frame_buffer(&ctx->buf[i].img);
@@ -221,9 +221,9 @@
 
   if (index >= 0) {
     // Forward peek
-    if (index < (int)ctx->sz) {
+    if (index < ctx->sz) {
       index += ctx->read_idx;
-      if (index >= (int)ctx->max_sz)
+      if (index >= ctx->max_sz)
         index -= ctx->max_sz;
       buf = ctx->buf + index;
     }
diff --git a/vp9/encoder/vp9_lookahead.h b/vp9/encoder/vp9_lookahead.h
index 1382038..db0fd1c 100644
--- a/vp9/encoder/vp9_lookahead.h
+++ b/vp9/encoder/vp9_lookahead.h
@@ -36,10 +36,10 @@
 #define MAX_PRE_FRAMES 1
 
 struct lookahead_ctx {
-  unsigned int max_sz;         /* Absolute size of the queue */
-  unsigned int sz;             /* Number of buffers currently in the queue */
-  unsigned int read_idx;       /* Read index */
-  unsigned int write_idx;      /* Write index */
+  int max_sz;                  /* Absolute size of the queue */
+  int sz;                      /* Number of buffers currently in the queue */
+  int read_idx;                /* Read index */
+  int write_idx;               /* Write index */
   struct lookahead_entry *buf; /* Buffer list */
 };
 
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 2e27f94..fd51598 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1845,8 +1845,7 @@
       cpi->denoiser.denoising_level > kDenLowLow &&
       cpi->denoiser.reset == 0) {
     VP9_DENOISER_DECISION decision = COPY_BLOCK;
-    vp9_denoiser_denoise(cpi, x, mi_row, mi_col, VPXMAX(BLOCK_8X8, bsize),
-                         ctx, &decision);
+    vp9_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision);
     // If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on denoised
     // result. Only do this under noise conditions, and if rdcost of ZEROMV on
     // original source is not significantly higher than rdcost of best mode.
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 91f877e..9766c05 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -219,12 +219,12 @@
 
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
   unsigned t;
-  int l;
+  int l, m;
   t = d;
   for (l = 0; t > 1; l++)
     t >>= 1;
-  t = 1 + (1 << (16 + l)) / d;
-  *quant = (int16_t)(t - (1 << 16));
+  m = 1 + (1 << (16 + l)) / d;
+  *quant = (int16_t)(m - (1 << 16));
   *shift = 1 << (16 - l);
 }
 
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index d53e60a..0675d4a 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -339,6 +339,9 @@
   rc->total_target_vs_actual = 0;
   rc->avg_intersize_gfint = 0;
   rc->avg_frame_low_motion = 0;
+  rc->high_source_sad = 0;
+  rc->count_last_scene_change = 0;
+  rc->avg_source_sad = 0;
 
   rc->frames_since_key = 8;  // Sensible default for first frame.
   rc->this_key_frame_forced = 0;
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index ebe28b8..2ba2750 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -143,8 +143,8 @@
 
       for (idy = -1; idy <= 1; ++idy) {
         for (idx = -1; idx <= 1; ++idx) {
-          int row = i + idy;
-          int col = j + idx;
+          int row = (int)i + idy;
+          int col = (int)j + idx;
 
           if (row >= 0 && row < (int)block_height &&
               col >= 0 && col < (int)block_width) {
@@ -211,8 +211,8 @@
 
       for (idy = -1; idy <= 1; ++idy) {
         for (idx = -1; idx <= 1; ++idx) {
-          int row = i + idy;
-          int col = j + idx;
+          int row = (int)i + idy;
+          int col = (int)j + idx;
 
           if (row >= 0 && row < (int)block_height &&
               col >= 0 && col < (int)block_width) {
diff --git a/vp9/encoder/x86/vp9_denoiser_sse2.c b/vp9/encoder/x86/vp9_denoiser_sse2.c
index f4a149d..883507a 100644
--- a/vp9/encoder/x86/vp9_denoiser_sse2.c
+++ b/vp9/encoder/x86/vp9_denoiser_sse2.c
@@ -147,8 +147,9 @@
   const __m128i l32 = _mm_set1_epi8(2);
   // Difference between level 2 and level 1 is 1.
   const __m128i l21 = _mm_set1_epi8(1);
+  const int b_height = (4 << b_height_log2_lookup[bs]) >> 1;
 
-  for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) {
+  for (r = 0; r < b_height; ++r) {
     memcpy(sig_buffer[r], sig, width);
     memcpy(sig_buffer[r] + width, sig + sig_stride, width);
     memcpy(mc_running_buffer[r], mc_running_avg_y, width);
@@ -188,8 +189,8 @@
       // Only apply the adjustment for max delta up to 3.
       if (delta < 4) {
         const __m128i k_delta = _mm_set1_epi8(delta);
-        running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
-        for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) {
+        running_avg_y -= avg_y_stride * (b_height << 1);
+        for (r = 0; r < b_height; ++r) {
           acc_diff = vp9_denoiser_adj_16x1_sse2(
               sig_buffer[r], mc_running_buffer[r], running_buffer[r],
               k_0, k_delta, acc_diff);
@@ -235,38 +236,37 @@
   const __m128i l32 = _mm_set1_epi8(2);
   // Difference between level 2 and level 1 is 1.
   const __m128i l21 = _mm_set1_epi8(1);
+  const int b_width = (4 << b_width_log2_lookup[bs]);
+  const int b_height = (4 << b_height_log2_lookup[bs]);
+  const int b_width_shift4 = b_width >> 4;
 
-  for (c = 0; c < 4; ++c) {
-    for (r = 0; r < 4; ++r) {
+  for (r = 0; r < 4; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
       acc_diff[c][r] = _mm_setzero_si128();
     }
   }
 
-  for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
-    for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-      acc_diff[c>>4][r>>4] = vp9_denoiser_16x1_sse2(
+  for (r = 0; r < b_height; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
+      acc_diff[c][r>>4] = vp9_denoiser_16x1_sse2(
           sig, mc_running_avg_y, running_avg_y, &k_0, &k_4,
-          &k_8, &k_16, &l3, &l32, &l21, acc_diff[c>>4][r>>4]);
+          &k_8, &k_16, &l3, &l32, &l21, acc_diff[c][r>>4]);
       // Update pointers for next iteration.
       sig += 16;
       mc_running_avg_y += 16;
       running_avg_y += 16;
     }
 
-    if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {
-      for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-        sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
+    if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+      for (c = 0; c < b_width_shift4; ++c) {
+        sum_diff += sum_diff_16x1(acc_diff[c][r>>4]);
       }
     }
 
     // Update pointers for next iteration.
-    sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
-    mc_running_avg_y = mc_running_avg_y -
-                       16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                       mc_avg_y_stride;
-    running_avg_y = running_avg_y -
-                    16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                    avg_y_stride;
+    sig = sig - b_width + sig_stride;
+    mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+    running_avg_y = running_avg_y - b_width + avg_y_stride;
   }
 
   {
@@ -278,33 +278,29 @@
       // Only apply the adjustment for max delta up to 3.
       if (delta < 4) {
         const __m128i k_delta = _mm_set1_epi8(delta);
-        sig -= sig_stride * (4 << b_height_log2_lookup[bs]);
-        mc_running_avg_y -= mc_avg_y_stride * (4 << b_height_log2_lookup[bs]);
-        running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
+        sig -= sig_stride * b_height;
+        mc_running_avg_y -= mc_avg_y_stride * b_height;
+        running_avg_y -= avg_y_stride * b_height;
         sum_diff = 0;
-        for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
-          for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-            acc_diff[c>>4][r>>4] = vp9_denoiser_adj_16x1_sse2(
+        for (r = 0; r < b_height; ++r) {
+          for (c = 0; c < b_width_shift4; ++c) {
+            acc_diff[c][r>>4] = vp9_denoiser_adj_16x1_sse2(
                 sig, mc_running_avg_y, running_avg_y, k_0,
-                k_delta, acc_diff[c>>4][r>>4]);
+                k_delta, acc_diff[c][r>>4]);
             // Update pointers for next iteration.
             sig += 16;
             mc_running_avg_y += 16;
             running_avg_y += 16;
           }
 
-          if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {
-            for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-              sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
+          if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+            for (c = 0; c < b_width_shift4; ++c) {
+              sum_diff += sum_diff_16x1(acc_diff[c][r>>4]);
             }
           }
-          sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
-          mc_running_avg_y = mc_running_avg_y -
-                             16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                             mc_avg_y_stride;
-          running_avg_y = running_avg_y -
-                          16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                          avg_y_stride;
+          sig = sig - b_width + sig_stride;
+          mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+          running_avg_y = running_avg_y - b_width + avg_y_stride;
         }
         if (abs(sum_diff) > sum_diff_thresh) {
           return COPY_BLOCK;
diff --git a/vpx_dsp/bitreader.c b/vpx_dsp/bitreader.c
index 6ad806a..8140e78 100644
--- a/vpx_dsp/bitreader.c
+++ b/vpx_dsp/bitreader.c
@@ -69,7 +69,7 @@
       buffer += (bits >> 3);
       value = r->value | (nv << (shift & 0x7));
   } else {
-    const int bits_over = (int)(shift + CHAR_BIT - bits_left);
+    const int bits_over = (int)(shift + CHAR_BIT - (int)bits_left);
     int loop_end = 0;
     if (bits_over >= 0) {
       count += LOTS_OF_BITS;
diff --git a/vpx_dsp/mips/postproc_msa.c b/vpx_dsp/mips/postproc_msa.c
new file mode 100644
index 0000000..366770c
--- /dev/null
+++ b/vpx_dsp/mips/postproc_msa.c
@@ -0,0 +1,59 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "./macros_msa.h"
+
+void vpx_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
+                             char blackclamp[16], char whiteclamp[16],
+                             char bothclamp[16], uint32_t width,
+                             uint32_t height, int32_t pitch) {
+  uint32_t i, j;
+
+  for (i = 0; i < height / 2; ++i) {
+    uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
+    int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff));
+    uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
+    int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff));
+    for (j = width / 16; j--;) {
+      v16i8 temp00_s, temp01_s;
+      v16u8 temp00, temp01, black_clamp, white_clamp;
+      v16u8 pos0, ref0, pos1, ref1;
+      v16i8 const127 = __msa_ldi_b(127);
+
+      pos0 = LD_UB(pos0_ptr);
+      ref0 = LD_UB(ref0_ptr);
+      pos1 = LD_UB(pos1_ptr);
+      ref1 = LD_UB(ref1_ptr);
+      black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
+      white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);
+      temp00 = (pos0 < black_clamp);
+      pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
+      temp01 = (pos1 < black_clamp);
+      pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);
+      XORI_B2_128_UB(pos0, pos1);
+      temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);
+      temp00 = (v16u8)(temp00_s < pos0);
+      pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);
+      temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
+      temp01 = (temp01_s < pos1);
+      pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
+      XORI_B2_128_UB(pos0, pos1);
+      pos0 += ref0;
+      ST_UB(pos0, pos0_ptr);
+      pos1 += ref1;
+      ST_UB(pos1, pos1_ptr);
+      pos0_ptr += 16;
+      pos1_ptr += 16;
+      ref0_ptr += 16;
+      ref1_ptr += 16;
+    }
+  }
+}
diff --git a/vpx_dsp/postproc.c b/vpx_dsp/postproc.c
new file mode 100644
index 0000000..1fa0204
--- /dev/null
+++ b/vpx_dsp/postproc.c
@@ -0,0 +1,43 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+void vpx_plane_add_noise_c(uint8_t *start, char *noise,
+                           char blackclamp[16],
+                           char whiteclamp[16],
+                           char bothclamp[16],
+                           unsigned int width, unsigned int height, int pitch) {
+  unsigned int i, j;
+
+  // TODO(jbb): why does simd code use both but c doesn't,  normalize and
+  // fix..
+  (void) bothclamp;
+  for (i = 0; i < height; i++) {
+    uint8_t *pos = start + i * pitch;
+    char  *ref = (char *)(noise + (rand() & 0xff));  // NOLINT
+
+    for (j = 0; j < width; j++) {
+      if (pos[j] < blackclamp[0])
+        pos[j] = blackclamp[0];
+
+      if (pos[j] > 255 + whiteclamp[0])
+        pos[j] = 255 + whiteclamp[0];
+
+      pos[j] += ref[j];
+    }
+  }
+}
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 581ec3a..ef319a8 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -53,6 +53,13 @@
 endif  # CONFIG_USE_X86INC
 endif  # CONFIG_VP9_HIGHBITDEPTH
 
+ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
+DSP_SRCS-yes += postproc.c
+DSP_SRCS-$(HAVE_MSA) += mips/postproc_msa.c
+DSP_SRCS-$(HAVE_MMX) += x86/postproc_mmx.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/postproc_sse2.asm
+endif # CONFIG_POSTPROC
+
 DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
 DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c
 DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 9ea80a0..f883ce5 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1907,6 +1907,15 @@
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
 
 }  # CONFIG_VP9_HIGHBITDEPTH
+
+#
+# Post Processing
+#
+if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+    add_proto qw/void vpx_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
+    specialize qw/vpx_plane_add_noise mmx sse2 msa/;
+}
+
 }  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
 
 1;
diff --git a/vpx_dsp/x86/postproc_mmx.asm b/vpx_dsp/x86/postproc_mmx.asm
new file mode 100644
index 0000000..9703975
--- /dev/null
+++ b/vpx_dsp/x86/postproc_mmx.asm
@@ -0,0 +1,84 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vpx_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
+;                            unsigned char blackclamp[16],
+;                            unsigned char whiteclamp[16],
+;                            unsigned char bothclamp[16],
+;                            unsigned int Width, unsigned int Height, int Pitch)
+global sym(vpx_plane_add_noise_mmx) PRIVATE
+sym(vpx_plane_add_noise_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+.addnoise_loop:
+    call sym(LIBVPX_RAND) WRT_PLT
+    mov     rcx, arg(1) ;noise
+    and     rax, 0xff
+    add     rcx, rax
+
+    ; we rely on the fact that the clamping vectors are stored contiguously
+    ; in black/white/both order. Note that we have to reload this here because
+    ; rdx could be trashed by rand()
+    mov     rdx, arg(2) ; blackclamp
+
+
+            mov     rdi, rcx
+            movsxd  rcx, dword arg(5) ;[Width]
+            mov     rsi, arg(0) ;Pos
+            xor         rax,rax
+
+.addnoise_nextset:
+            movq        mm1,[rsi+rax]         ; get the source
+
+            psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
+            paddusb     mm1, [rdx+32] ;bothclamp
+            psubusb     mm1, [rdx+16] ;whiteclamp
+
+            movq        mm2,[rdi+rax]         ; get the noise for this line
+            paddb       mm1,mm2              ; add it in
+            movq        [rsi+rax],mm1         ; store the result
+
+            add         rax,8                 ; move to the next line
+
+            cmp         rax, rcx
+            jl          .addnoise_nextset
+
+    movsxd  rax, dword arg(7) ; Pitch
+    add     arg(0), rax ; Start += Pitch
+    sub     dword arg(6), 1   ; Height -= 1
+    jg      .addnoise_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+Blur:
+    times 16 dw 16
+    times  8 dw 64
+    times 16 dw 16
+    times  8 dw  0
+
+rd:
+    times 4 dw 0x40
diff --git a/vpx_dsp/x86/postproc_sse2.asm b/vpx_dsp/x86/postproc_sse2.asm
new file mode 100644
index 0000000..f4bc893
--- /dev/null
+++ b/vpx_dsp/x86/postproc_sse2.asm
@@ -0,0 +1,82 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vpx_plane_add_noise_sse2(unsigned char *start, unsigned char *noise,
+;                              unsigned char blackclamp[16],
+;                              unsigned char whiteclamp[16],
+;                              unsigned char bothclamp[16],
+;                              unsigned int width, unsigned int height,
+;                              int pitch)
+global sym(vpx_plane_add_noise_sse2) PRIVATE
+sym(vpx_plane_add_noise_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+.addnoise_loop:
+    call sym(LIBVPX_RAND) WRT_PLT
+    mov     rcx, arg(1) ;noise
+    and     rax, 0xff
+    add     rcx, rax
+
+    ; we rely on the fact that the clamping vectors are stored contiguously
+    ; in black/white/both order. Note that we have to reload this here because
+    ; rdx could be trashed by rand()
+    mov     rdx, arg(2) ; blackclamp
+
+
+            mov     rdi, rcx
+            movsxd  rcx, dword arg(5) ;[Width]
+            mov     rsi, arg(0) ;Pos
+            xor         rax,rax
+
+.addnoise_nextset:
+            movdqu      xmm1,[rsi+rax]         ; get the source
+
+            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
+            paddusb     xmm1, [rdx+32] ;bothclamp
+            psubusb     xmm1, [rdx+16] ;whiteclamp
+
+            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
+            paddb       xmm1,xmm2              ; add it in
+            movdqu      [rsi+rax],xmm1         ; store the result
+
+            add         rax,16                 ; move to the next line
+
+            cmp         rax, rcx
+            jl          .addnoise_nextset
+
+    movsxd  rax, dword arg(7) ; Pitch
+    add     arg(0), rax ; Start += Pitch
+    sub     dword arg(6), 1   ; Height -= 1
+    jg      .addnoise_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+rd42:
+    times 8 dw 0x04
+four8s:
+    times 4 dd 8
diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c
index e6c9365..43f4603 100644
--- a/vpx_dsp/x86/variance_sse2.c
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -171,7 +171,7 @@
                                   unsigned int *sse) {
   int sum;
   get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse - (((unsigned int)sum * sum) >> 4);
+  return *sse - ((sum * sum) >> 4);
 }
 
 unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
@@ -180,7 +180,7 @@
   int sum;
   variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
                 sse, &sum, get4x4var_sse2, 4);
-  return *sse - (((unsigned int)sum * sum) >> 5);
+  return *sse - ((sum * sum) >> 5);
 }
 
 unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
@@ -189,7 +189,7 @@
   int sum;
   variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
                 sse, &sum, get4x4var_sse2, 4);
-  return *sse - (((unsigned int)sum * sum) >> 5);
+  return *sse - ((sum * sum) >> 5);
 }
 
 unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride,
@@ -197,7 +197,7 @@
                                   unsigned int *sse) {
   int sum;
   vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse - (((unsigned int)sum * sum) >> 6);
+  return *sse - ((sum * sum) >> 6);
 }
 
 unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
@@ -206,7 +206,7 @@
   int sum;
   variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
                 sse, &sum, vpx_get8x8var_sse2, 8);
-  return *sse - (((unsigned int)sum * sum) >> 7);
+  return *sse - ((sum * sum) >> 7);
 }
 
 unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
@@ -215,7 +215,7 @@
   int sum;
   variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
                 sse, &sum, vpx_get8x8var_sse2, 8);
-  return *sse - (((unsigned int)sum * sum) >> 7);
+  return *sse - ((sum * sum) >> 7);
 }
 
 unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
@@ -223,7 +223,7 @@
                                     unsigned int *sse) {
   int sum;
   vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse - (((unsigned int)sum * sum) >> 8);
+  return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
 }
 
 unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
@@ -329,7 +329,7 @@
 #undef DECLS
 #undef DECL
 
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
 unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
                                                      int src_stride, \
                                                      int x_offset, \
@@ -365,23 +365,23 @@
     } \
   } \
   *sse_ptr = sse; \
-  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+  return sse - (cast_prod (cast se * se) >> (wlog2 + hlog2)); \
 }
 
 #define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
-FN(16,  8, 16, 4, 3, opt1, (uint32_t)); \
-FN(8,  16,  8, 3, 4, opt1, (uint32_t)); \
-FN(8,   8,  8, 3, 3, opt1, (uint32_t)); \
-FN(8,   4,  8, 3, 2, opt1, (uint32_t)); \
-FN(4,   8,  4, 2, 3, opt2, (uint32_t)); \
-FN(4,   4,  4, 2, 2, opt2, (uint32_t))
+FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
+FN(16,  8, 16, 4, 3, opt1, (int32_t), (int32_t)); \
+FN(8,  16,  8, 3, 4, opt1, (int32_t), (int32_t)); \
+FN(8,   8,  8, 3, 3, opt1, (int32_t), (int32_t)); \
+FN(8,   4,  8, 3, 2, opt1, (int32_t), (int32_t)); \
+FN(4,   8,  4, 2, 3, opt2, (int32_t), (int32_t)); \
+FN(4,   4,  4, 2, 2, opt2, (int32_t), (int32_t))
 
 FNS(sse2, sse);
 FNS(ssse3, ssse3);
@@ -410,7 +410,7 @@
 #undef DECL
 #undef DECLS
 
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
 unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
                                                          int src_stride, \
                                                          int x_offset, \
@@ -451,23 +451,23 @@
     } \
   } \
   *sseptr = sse; \
-  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+  return sse - (cast_prod (cast se * se) >> (wlog2 + hlog2)); \
 }
 
 #define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
-FN(16,  8, 16, 4, 3, opt1, (uint32_t)); \
-FN(8,  16,  8, 3, 4, opt1, (uint32_t)); \
-FN(8,   8,  8, 3, 3, opt1, (uint32_t)); \
-FN(8,   4,  8, 3, 2, opt1, (uint32_t)); \
-FN(4,   8,  4, 2, 3, opt2, (uint32_t)); \
-FN(4,   4,  4, 2, 2, opt2, (uint32_t))
+FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
+FN(16,  8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \
+FN(8,  16,  8, 3, 4, opt1, (uint32_t), (int32_t)); \
+FN(8,   8,  8, 3, 3, opt1, (uint32_t), (int32_t)); \
+FN(8,   4,  8, 3, 2, opt1, (uint32_t), (int32_t)); \
+FN(4,   8,  4, 2, 3, opt2, (uint32_t), (int32_t)); \
+FN(4,   4,  4, 2, 2, opt2, (uint32_t), (int32_t))
 
 FNS(sse2, sse);
 FNS(ssse3, ssse3);