Merge "Fix sub8x8 motion search on scaled reference frame"
diff --git a/test/vp9_avg_test.cc b/test/avg_test.cc
similarity index 78%
rename from test/vp9_avg_test.cc
rename to test/avg_test.cc
index cbc667e..7d5380f 100644
--- a/test/vp9_avg_test.cc
+++ b/test/avg_test.cc
@@ -15,9 +15,7 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "./vpx_config.h"
-#if CONFIG_VP9_ENCODER
-#include "./vp9_rtcd.h"
-#endif
+#include "./vpx_dsp_rtcd.h"
 
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
@@ -323,91 +321,91 @@
 INSTANTIATE_TEST_CASE_P(
     C, AverageTest,
     ::testing::Values(
-        make_tuple(16, 16, 1, 8, &vp9_avg_8x8_c),
-        make_tuple(16, 16, 1, 4, &vp9_avg_4x4_c)));
+        make_tuple(16, 16, 1, 8, &vpx_avg_8x8_c),
+        make_tuple(16, 16, 1, 4, &vpx_avg_4x4_c)));
 
 INSTANTIATE_TEST_CASE_P(
     C, SatdTest,
     ::testing::Values(
-        make_tuple(16, &vp9_satd_c),
-        make_tuple(64, &vp9_satd_c),
-        make_tuple(256, &vp9_satd_c),
-        make_tuple(1024, &vp9_satd_c)));
+        make_tuple(16, &vpx_satd_c),
+        make_tuple(64, &vpx_satd_c),
+        make_tuple(256, &vpx_satd_c),
+        make_tuple(1024, &vpx_satd_c)));
 
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, AverageTest,
     ::testing::Values(
-        make_tuple(16, 16, 0, 8, &vp9_avg_8x8_sse2),
-        make_tuple(16, 16, 5, 8, &vp9_avg_8x8_sse2),
-        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_sse2),
-        make_tuple(16, 16, 0, 4, &vp9_avg_4x4_sse2),
-        make_tuple(16, 16, 5, 4, &vp9_avg_4x4_sse2),
-        make_tuple(32, 32, 15, 4, &vp9_avg_4x4_sse2)));
+        make_tuple(16, 16, 0, 8, &vpx_avg_8x8_sse2),
+        make_tuple(16, 16, 5, 8, &vpx_avg_8x8_sse2),
+        make_tuple(32, 32, 15, 8, &vpx_avg_8x8_sse2),
+        make_tuple(16, 16, 0, 4, &vpx_avg_4x4_sse2),
+        make_tuple(16, 16, 5, 4, &vpx_avg_4x4_sse2),
+        make_tuple(32, 32, 15, 4, &vpx_avg_4x4_sse2)));
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, IntProRowTest, ::testing::Values(
-        make_tuple(16, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c),
-        make_tuple(32, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c),
-        make_tuple(64, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c)));
+        make_tuple(16, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c),
+        make_tuple(32, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c),
+        make_tuple(64, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c)));
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, IntProColTest, ::testing::Values(
-        make_tuple(16, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
-        make_tuple(32, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
-        make_tuple(64, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c)));
+        make_tuple(16, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c),
+        make_tuple(32, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c),
+        make_tuple(64, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c)));
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, SatdTest,
     ::testing::Values(
-        make_tuple(16, &vp9_satd_sse2),
-        make_tuple(64, &vp9_satd_sse2),
-        make_tuple(256, &vp9_satd_sse2),
-        make_tuple(1024, &vp9_satd_sse2)));
+        make_tuple(16, &vpx_satd_sse2),
+        make_tuple(64, &vpx_satd_sse2),
+        make_tuple(256, &vpx_satd_sse2),
+        make_tuple(1024, &vpx_satd_sse2)));
 #endif
 
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(
     NEON, AverageTest,
     ::testing::Values(
-        make_tuple(16, 16, 0, 8, &vp9_avg_8x8_neon),
-        make_tuple(16, 16, 5, 8, &vp9_avg_8x8_neon),
-        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon),
-        make_tuple(16, 16, 0, 4, &vp9_avg_4x4_neon),
-        make_tuple(16, 16, 5, 4, &vp9_avg_4x4_neon),
-        make_tuple(32, 32, 15, 4, &vp9_avg_4x4_neon)));
+        make_tuple(16, 16, 0, 8, &vpx_avg_8x8_neon),
+        make_tuple(16, 16, 5, 8, &vpx_avg_8x8_neon),
+        make_tuple(32, 32, 15, 8, &vpx_avg_8x8_neon),
+        make_tuple(16, 16, 0, 4, &vpx_avg_4x4_neon),
+        make_tuple(16, 16, 5, 4, &vpx_avg_4x4_neon),
+        make_tuple(32, 32, 15, 4, &vpx_avg_4x4_neon)));
 
 INSTANTIATE_TEST_CASE_P(
     NEON, IntProRowTest, ::testing::Values(
-        make_tuple(16, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
-        make_tuple(32, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
-        make_tuple(64, &vp9_int_pro_row_neon, &vp9_int_pro_row_c)));
+        make_tuple(16, &vpx_int_pro_row_neon, &vpx_int_pro_row_c),
+        make_tuple(32, &vpx_int_pro_row_neon, &vpx_int_pro_row_c),
+        make_tuple(64, &vpx_int_pro_row_neon, &vpx_int_pro_row_c)));
 
 INSTANTIATE_TEST_CASE_P(
     NEON, IntProColTest, ::testing::Values(
-        make_tuple(16, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
-        make_tuple(32, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
-        make_tuple(64, &vp9_int_pro_col_neon, &vp9_int_pro_col_c)));
+        make_tuple(16, &vpx_int_pro_col_neon, &vpx_int_pro_col_c),
+        make_tuple(32, &vpx_int_pro_col_neon, &vpx_int_pro_col_c),
+        make_tuple(64, &vpx_int_pro_col_neon, &vpx_int_pro_col_c)));
 
 INSTANTIATE_TEST_CASE_P(
     NEON, SatdTest,
     ::testing::Values(
-        make_tuple(16, &vp9_satd_neon),
-        make_tuple(64, &vp9_satd_neon),
-        make_tuple(256, &vp9_satd_neon),
-        make_tuple(1024, &vp9_satd_neon)));
+        make_tuple(16, &vpx_satd_neon),
+        make_tuple(64, &vpx_satd_neon),
+        make_tuple(256, &vpx_satd_neon),
+        make_tuple(1024, &vpx_satd_neon)));
 #endif
 
 #if HAVE_MSA
 INSTANTIATE_TEST_CASE_P(
     MSA, AverageTest,
     ::testing::Values(
-        make_tuple(16, 16, 0, 8, &vp9_avg_8x8_msa),
-        make_tuple(16, 16, 5, 8, &vp9_avg_8x8_msa),
-        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_msa),
-        make_tuple(16, 16, 0, 4, &vp9_avg_4x4_msa),
-        make_tuple(16, 16, 5, 4, &vp9_avg_4x4_msa),
-        make_tuple(32, 32, 15, 4, &vp9_avg_4x4_msa)));
+        make_tuple(16, 16, 0, 8, &vpx_avg_8x8_msa),
+        make_tuple(16, 16, 5, 8, &vpx_avg_8x8_msa),
+        make_tuple(32, 32, 15, 8, &vpx_avg_8x8_msa),
+        make_tuple(16, 16, 0, 4, &vpx_avg_4x4_msa),
+        make_tuple(16, 16, 5, 4, &vpx_avg_4x4_msa),
+        make_tuple(32, 32, 15, 4, &vpx_avg_4x4_msa)));
 #endif
 
 }  // namespace
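
Note on the rename above: the test file moves with the code it covers -- the
avg/satd/int_pro kernels now live in vpx_dsp, hence the vp9_ -> vpx_ symbol
swap and the switch to vpx_dsp_rtcd.h. The reference behaviour is unchanged;
a minimal sketch of the 8x8 average these tuples exercise, mirroring the
vp10_avg_8x8_c body deleted later in this patch:

    #include <stdint.h>

    /* Block average with round-to-nearest: (sum + 32) >> 6 == round(sum/64). */
    static unsigned int avg_8x8_ref(const uint8_t *s, int p) {
      int i, j, sum = 0;
      for (i = 0; i < 8; ++i, s += p)
        for (j = 0; j < 8; ++j)
          sum += s[j];
      return (sum + 32) >> 6;
    }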
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 9dec3cb..a144cfc 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -702,18 +702,6 @@
 
 #if HAVE_SSE
 #if CONFIG_USE_X86INC
-const SadMxNParam sse_tests[] = {
-  make_tuple(4, 8, &vpx_sad4x8_sse, -1),
-  make_tuple(4, 4, &vpx_sad4x4_sse, -1),
-};
-INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::ValuesIn(sse_tests));
-
-const SadMxNAvgParam avg_sse_tests[] = {
-  make_tuple(4, 8, &vpx_sad4x8_avg_sse, -1),
-  make_tuple(4, 4, &vpx_sad4x4_avg_sse, -1),
-};
-INSTANTIATE_TEST_CASE_P(SSE, SADavgTest, ::testing::ValuesIn(avg_sse_tests));
-
 const SadMxNx4Param x4d_sse_tests[] = {
   make_tuple(4, 8, &vpx_sad4x8x4d_sse, -1),
   make_tuple(4, 4, &vpx_sad4x4x4d_sse, -1),
@@ -736,6 +724,8 @@
   make_tuple(8, 16, &vpx_sad8x16_sse2, -1),
   make_tuple(8, 8, &vpx_sad8x8_sse2, -1),
   make_tuple(8, 4, &vpx_sad8x4_sse2, -1),
+  make_tuple(4, 8, &vpx_sad4x8_sse2, -1),
+  make_tuple(4, 4, &vpx_sad4x4_sse2, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
   make_tuple(64, 64, &vpx_highbd_sad64x64_sse2, 8),
   make_tuple(64, 32, &vpx_highbd_sad64x32_sse2, 8),
@@ -786,6 +776,8 @@
   make_tuple(8, 16, &vpx_sad8x16_avg_sse2, -1),
   make_tuple(8, 8, &vpx_sad8x8_avg_sse2, -1),
   make_tuple(8, 4, &vpx_sad8x4_avg_sse2, -1),
+  make_tuple(4, 8, &vpx_sad4x8_avg_sse2, -1),
+  make_tuple(4, 4, &vpx_sad4x4_avg_sse2, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
   make_tuple(64, 64, &vpx_highbd_sad64x64_avg_sse2, 8),
   make_tuple(64, 32, &vpx_highbd_sad64x32_avg_sse2, 8),
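
The plain and averaging 4x8/4x4 SAD kernels are now exposed as _sse2 variants
and join the main SSE2 tables (the x4d variants stay in the SSE table). For
orientation, the scalar baseline every row in these tables is checked against
is just the following -- a sketch, helper name ours:

    #include <stdint.h>
    #include <stdlib.h>

    /* Sum of absolute differences over an m x n block. */
    static unsigned int sad_mxn_ref(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int m, int n) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < n; ++r) {
        for (c = 0; c < m; ++c)
          sad += abs(src[c] - ref[c]);
        src += src_stride;
        ref += ref_stride;
      }
      return sad;
    }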
diff --git a/test/test.mk b/test/test.mk
index 2487bd2..80b57e5 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -143,7 +143,6 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
@@ -170,6 +169,11 @@
 endif # VP10
 
 ## Multi-codec / unconditional whitebox tests.
+
+ifeq ($(findstring yes,$(CONFIG_VP9_ENCODER)$(CONFIG_VP10_ENCODER)),yes)
+LIBVPX_TEST_SRCS-yes += avg_test.cc
+endif
+
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
 
 TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index c270072..3e65fec 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -337,28 +337,19 @@
                 vpx_d63_predictor_32x32_c, vpx_tm_predictor_32x32_c)
 
 #if HAVE_SSE2 && CONFIG_USE_X86INC
-#if ARCH_X86_64
 INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
                 vpx_dc_left_predictor_32x32_sse2,
                 vpx_dc_top_predictor_32x32_sse2,
                 vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_32x32_sse2)
-#else
-INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
-                vpx_dc_left_predictor_32x32_sse2,
-                vpx_dc_top_predictor_32x32_sse2,
-                vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
-#endif  // ARCH_X86_64
+                vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL,
+                NULL, vpx_tm_predictor_32x32_sse2)
 #endif  // HAVE_SSE2 && CONFIG_USE_X86INC
 
 #if HAVE_SSSE3 && CONFIG_USE_X86INC
 INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL,
-                vpx_h_predictor_32x32_ssse3, vpx_d45_predictor_32x32_ssse3,
-                NULL, NULL, vpx_d153_predictor_32x32_ssse3,
-                vpx_d207_predictor_32x32_ssse3, vpx_d63_predictor_32x32_ssse3,
-                NULL)
+                NULL, vpx_d45_predictor_32x32_ssse3, NULL, NULL,
+                vpx_d153_predictor_32x32_ssse3, vpx_d207_predictor_32x32_ssse3,
+                vpx_d63_predictor_32x32_ssse3, NULL)
 #endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
 
 #if HAVE_NEON
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index ad3327e..2bebdcb 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -141,13 +141,13 @@
                                        &vpx_highbd_tm_predictor_16x16_c, 16, 8),
                             make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
                                        &vpx_highbd_tm_predictor_32x32_c, 32, 8),
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 8),
                             make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
                                        &vpx_highbd_dc_predictor_16x16_c, 16, 8),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
                                        &vpx_highbd_v_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
                                        &vpx_highbd_v_predictor_8x8_c, 8, 8),
@@ -155,20 +155,20 @@
                                        &vpx_highbd_v_predictor_16x16_c, 16, 8),
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32, 8),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 8)));
 #else
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
                         ::testing::Values(
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 8),
                             make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
                                        &vpx_highbd_dc_predictor_16x16_c, 16, 8),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
                                        &vpx_highbd_v_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
                                        &vpx_highbd_v_predictor_8x8_c, 8, 8),
@@ -176,7 +176,7 @@
                                        &vpx_highbd_v_predictor_16x16_c, 16, 8),
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32, 8),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 8)));
@@ -194,14 +194,14 @@
                             make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
                                        &vpx_highbd_tm_predictor_32x32_c, 32,
                                        10),
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 10),
                             make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
                                        &vpx_highbd_dc_predictor_16x16_c, 16,
                                        10),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
                                        &vpx_highbd_v_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
                                        &vpx_highbd_v_predictor_8x8_c, 8, 10),
@@ -211,21 +211,21 @@
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32,
                                        10),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 10)));
 #else
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
                         ::testing::Values(
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 10),
                             make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
                                        &vpx_highbd_dc_predictor_16x16_c, 16,
                                        10),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
                                        &vpx_highbd_v_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
                                        &vpx_highbd_v_predictor_8x8_c, 8, 10),
@@ -233,7 +233,7 @@
                                        &vpx_highbd_v_predictor_16x16_c, 16, 10),
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32, 10),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 10)));
@@ -251,14 +251,14 @@
                             make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
                                        &vpx_highbd_tm_predictor_32x32_c, 32,
                                        12),
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 12),
                             make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
                                        &vpx_highbd_dc_predictor_16x16_c, 16,
                                        12),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
                                        &vpx_highbd_v_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
                                        &vpx_highbd_v_predictor_8x8_c, 8, 12),
@@ -268,21 +268,21 @@
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32,
                                        12),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
 #else
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
                         ::testing::Values(
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 12),
                             make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
                                        &vpx_highbd_dc_predictor_16x16_c, 16,
                                        12),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
                                        &vpx_highbd_v_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
                                        &vpx_highbd_v_predictor_8x8_c, 8, 12),
@@ -290,7 +290,7 @@
                                        &vpx_highbd_v_predictor_16x16_c, 16, 12),
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32, 12),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm
index be59de3..e7d3fa5 100644
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -119,7 +119,7 @@
 %if ABI_IS_32BIT
     %if CONFIG_PIC=1
         %ifidn __OUTPUT_FORMAT__,elf32
-            %define GET_GOT_SAVE_ARG 1
+            %define GET_GOT_DEFINED 1
             %define WRT_PLT wrt ..plt
             %macro GET_GOT 1
                 extern _GLOBAL_OFFSET_TABLE_
@@ -138,7 +138,7 @@
                 %define RESTORE_GOT pop %1
             %endmacro
         %elifidn __OUTPUT_FORMAT__,macho32
-            %define GET_GOT_SAVE_ARG 1
+            %define GET_GOT_DEFINED 1
             %macro GET_GOT 1
                 push %1
                 call %%get_got
@@ -149,6 +149,8 @@
                 %undef RESTORE_GOT
                 %define RESTORE_GOT pop %1
             %endmacro
+        %else
+            %define GET_GOT_DEFINED 0
         %endif
     %endif
 
diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c
index b852a65..e9e3949 100644
--- a/vp10/common/reconintra.c
+++ b/vp10/common/reconintra.c
@@ -262,7 +262,7 @@
 }
 
 #if CONFIG_MISC_FIXES
-static inline void memset16(uint16_t *dst, int val, int n) {
+static INLINE void memset16(uint16_t *dst, int val, int n) {
   while (n--)
     *dst++ = val;
 }
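
The lower-case inline keyword is C99-only; libvpx still builds as C89
(notably with MSVC), so portable code goes through the INLINE macro that
configure emits into vpx_config.h. Roughly what that indirection amounts to
-- illustrative only, the real definition is generated per toolchain:

    /* Sketch of the INLINE indirection: the generated vpx_config.h maps it
     * to whatever inlining keyword the compiler actually accepts. */
    #if defined(_MSC_VER)
    #define INLINE __inline
    #else
    #define INLINE inline
    #endif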
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 4404701..9860bae 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -351,42 +351,6 @@
 #
 if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
 
-add_proto qw/unsigned int vp10_avg_8x8/, "const uint8_t *, int p";
-specialize qw/vp10_avg_8x8 sse2 neon msa/;
-
-add_proto qw/unsigned int vp10_avg_4x4/, "const uint8_t *, int p";
-specialize qw/vp10_avg_4x4 sse2 msa/;
-
-add_proto qw/void vp10_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-specialize qw/vp10_minmax_8x8 sse2/;
-
-add_proto qw/void vp10_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
-specialize qw/vp10_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";
-
-add_proto qw/void vp10_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
-specialize qw/vp10_hadamard_16x16 sse2/;
-
-add_proto qw/int16_t vp10_satd/, "const int16_t *coeff, int length";
-specialize qw/vp10_satd sse2/;
-
-add_proto qw/void vp10_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
-specialize qw/vp10_int_pro_row sse2 neon/;
-
-add_proto qw/int16_t vp10_int_pro_col/, "uint8_t const *ref, const int width";
-specialize qw/vp10_int_pro_col sse2 neon/;
-
-add_proto qw/int vp10_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
-specialize qw/vp10_vector_var neon sse2/;
-
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/unsigned int vp10_highbd_avg_8x8/, "const uint8_t *, int p";
-  specialize qw/vp10_highbd_avg_8x8/;
-  add_proto qw/unsigned int vp10_highbd_avg_4x4/, "const uint8_t *, int p";
-  specialize qw/vp10_highbd_avg_4x4/;
-  add_proto qw/void vp10_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-  specialize qw/vp10_highbd_minmax_8x8/;
-}
-
 # ENCODEMB INVOKE
 
 #
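
These prototypes leave the vp10 RTCD table because the kernels moved to
vpx_dsp and are now declared once in vpx_dsp_rtcd_defs.pl. As a reminder of
what an add_proto/specialize pair expands to, a rough C sketch of the
generated dispatch -- illustrative shape only, not the actual generated
header:

    /* Approximately what rtcd.pl emits for "specialize qw/vpx_avg_8x8 sse2/":
     * a function pointer bound once from runtime CPU capability flags. */
    unsigned int vpx_avg_8x8_c(const uint8_t *s, int p);
    unsigned int vpx_avg_8x8_sse2(const uint8_t *s, int p);
    RTCD_EXTERN unsigned int (*vpx_avg_8x8)(const uint8_t *s, int p);

    static void setup_rtcd_internal(void) {
      int flags = x86_simd_caps();
      vpx_avg_8x8 = vpx_avg_8x8_c;
      if (flags & HAS_SSE2) vpx_avg_8x8 = vpx_avg_8x8_sse2;
    }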
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index 80f1778..c0fbc49 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -1143,32 +1143,13 @@
       vpx_rb_read_inv_signed_literal(rb, CONFIG_MISC_FIXES ? 6 : 4) : 0;
 }
 
-static void setup_quantization(VP10_COMMON *const cm, MACROBLOCKD *const xd,
+static void setup_quantization(VP10_COMMON *const cm,
                                struct vpx_read_bit_buffer *rb) {
-  int i;
-
   cm->base_qindex = vpx_rb_read_literal(rb, QINDEX_BITS);
   cm->y_dc_delta_q = read_delta_q(rb);
   cm->uv_dc_delta_q = read_delta_q(rb);
   cm->uv_ac_delta_q = read_delta_q(rb);
   cm->dequant_bit_depth = cm->bit_depth;
-  for (i = 0; i < (cm->seg.enabled ? MAX_SEGMENTS : 1); ++i) {
-#if CONFIG_MISC_FIXES
-    const int qindex = vp10_get_qindex(&cm->seg, i, cm->base_qindex);
-#endif
-    xd->lossless[i] = cm->y_dc_delta_q == 0 &&
-#if CONFIG_MISC_FIXES
-                      qindex == 0 &&
-#else
-                      cm->base_qindex == 0 &&
-#endif
-                      cm->uv_dc_delta_q == 0 &&
-                      cm->uv_ac_delta_q == 0;
-  }
-
-#if CONFIG_VP9_HIGHBITDEPTH
-  xd->bd = (int)cm->bit_depth;
-#endif
 }
 
 static void setup_segmentation_dequant(VP10_COMMON *const cm) {
@@ -1878,9 +1859,7 @@
 static size_t read_uncompressed_header(VP10Decoder *pbi,
                                        struct vpx_read_bit_buffer *rb) {
   VP10_COMMON *const cm = &pbi->common;
-#if CONFIG_MISC_FIXES
   MACROBLOCKD *const xd = &pbi->mb;
-#endif
   BufferPool *const pool = cm->buffer_pool;
   RefCntBuffer *const frame_bufs = pool->frame_bufs;
   int i, mask, ref_index = 0;
@@ -2108,12 +2087,30 @@
     vp10_setup_past_independence(cm);
 
   setup_loopfilter(&cm->lf, rb);
-  setup_quantization(cm, &pbi->mb, rb);
+  setup_quantization(cm, rb);
+#if CONFIG_VP9_HIGHBITDEPTH
+  xd->bd = (int)cm->bit_depth;
+#endif
+
   setup_segmentation(cm, rb);
+
+  {
+    int i;
+    for (i = 0; i < MAX_SEGMENTS; ++i) {
+      const int qindex = CONFIG_MISC_FIXES && cm->seg.enabled ?
+          vp10_get_qindex(&cm->seg, i, cm->base_qindex) :
+          cm->base_qindex;
+      xd->lossless[i] = qindex == 0 &&
+          cm->y_dc_delta_q == 0 &&
+          cm->uv_dc_delta_q == 0 &&
+          cm->uv_ac_delta_q == 0;
+    }
+  }
+
   setup_segmentation_dequant(cm);
 #if CONFIG_MISC_FIXES
-  cm->tx_mode = (!cm->seg.enabled && xd->lossless[0]) ? ONLY_4X4
-                                                      : read_tx_mode(rb);
+  cm->tx_mode = (xd->lossless[0]) ? ONLY_4X4
+                                  : read_tx_mode(rb);
   cm->reference_mode = read_frame_reference_mode(cm, rb);
 #endif
 
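
The lossless[] computation moves out of setup_quantization() to a point after
setup_segmentation(), since the per-segment qindex is only meaningful once
the segmentation syntax has been read. The rule the new block encodes,
restated as a standalone helper (name ours):

    /* A segment decodes losslessly only when its effective qindex and all
     * three delta-Qs are zero; the frame is then forced to the 4x4/WHT
     * transform path (see the tx_mode hunk above). */
    static int is_lossless_segment(const VP10_COMMON *cm, int i) {
      const int qindex = CONFIG_MISC_FIXES && cm->seg.enabled ?
          vp10_get_qindex(&cm->seg, i, cm->base_qindex) : cm->base_qindex;
      return qindex == 0 && cm->y_dc_delta_q == 0 &&
             cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
    }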
diff --git a/vp10/encoder/arm/neon/avg_neon.c b/vp10/encoder/arm/neon/avg_neon.c
deleted file mode 100644
index c4ec5c4..0000000
--- a/vp10/encoder/arm/neon/avg_neon.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "./vp10_rtcd.h"
-#include "./vpx_config.h"
-
-#include "vpx/vpx_integer.h"
-
-static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
-  const uint32x4_t a = vpaddlq_u16(v_16x8);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
-
-unsigned int vp10_avg_8x8_neon(const uint8_t *s, int p) {
-  uint8x8_t v_s0 = vld1_u8(s);
-  const uint8x8_t v_s1 = vld1_u8(s + p);
-  uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
-
-  v_s0 = vld1_u8(s + 2 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 3 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 4 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 5 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 6 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 7 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  return (horizontal_add_u16x8(v_sum) + 32) >> 6;
-}
-
-void vp10_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
-                          const int ref_stride, const int height) {
-  int i;
-  uint16x8_t vec_sum_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_hi = vdupq_n_u16(0);
-  const int shift_factor = ((height >> 5) + 3) * -1;
-  const int16x8_t vec_shift = vdupq_n_s16(shift_factor);
-
-  for (i = 0; i < height; i += 8) {
-    const uint8x16_t vec_row1 = vld1q_u8(ref);
-    const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
-    const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
-    const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
-    const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
-    const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
-    const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
-    const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));
-
-    ref += ref_stride * 8;
-  }
-
-  vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
-  vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);
-
-  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
-  hbuf += 8;
-  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
-}
-
-int16_t vp10_int_pro_col_neon(uint8_t const *ref, const int width) {
-  int i;
-  uint16x8_t vec_sum = vdupq_n_u16(0);
-
-  for (i = 0; i < width; i += 16) {
-    const uint8x16_t vec_row = vld1q_u8(ref);
-    vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
-    vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
-    ref += 16;
-  }
-
-  return horizontal_add_u16x8(vec_sum);
-}
-
-// ref, src = [0, 510] - max diff = 16-bits
-// bwl = {2, 3, 4}, width = {16, 32, 64}
-int vp10_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
-  int width = 4 << bwl;
-  int32x4_t sse = vdupq_n_s32(0);
-  int16x8_t total = vdupq_n_s16(0);
-
-  assert(width >= 8);
-  assert((width % 8) == 0);
-
-  do {
-    const int16x8_t r = vld1q_s16(ref);
-    const int16x8_t s = vld1q_s16(src);
-    const int16x8_t diff = vsubq_s16(r, s);  // [-510, 510], 10 bits.
-    const int16x4_t diff_lo = vget_low_s16(diff);
-    const int16x4_t diff_hi = vget_high_s16(diff);
-    sse = vmlal_s16(sse, diff_lo, diff_lo);  // dynamic range 26 bits.
-    sse = vmlal_s16(sse, diff_hi, diff_hi);
-    total = vaddq_s16(total, diff);  // dynamic range 16 bits.
-
-    ref += 8;
-    src += 8;
-    width -= 8;
-  } while (width != 0);
-
-  {
-    // Note: 'total''s pairwise addition could be implemented similarly to
-    // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired
-    // with the summation of 'sse' performed better on a Cortex-A15.
-    const int32x4_t t0 = vpaddlq_s16(total);  // cascading summation of 'total'
-    const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));
-    const int32x2_t t2 = vpadd_s32(t1, t1);
-    const int t = vget_lane_s32(t2, 0);
-    const int64x2_t s0 = vpaddlq_s32(sse);  // cascading summation of 'sse'.
-    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
-                                  vreinterpret_s32_s64(vget_high_s64(s0)));
-    const int s = vget_lane_s32(s1, 0);
-    const int shift_factor = bwl + 2;
-    return s - ((t * t) >> shift_factor);
-  }
-}
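
The NEON file is removed wholesale; identical code already exists under
vpx_dsp with vpx_ prefixes. For readers new to the reduction idiom it used,
horizontal_add_u16x8() is the vector form of this scalar loop (sketch):

    #include <stdint.h>

    /* Scalar equivalent of horizontal_add_u16x8(): sum the eight 16-bit
     * lanes. The NEON version gets there with cascading pairwise widening
     * adds (vpaddlq_u16 -> vpaddlq_u32 -> vadd_u32). */
    static unsigned int horizontal_add_u16x8_scalar(const uint16_t v[8]) {
      unsigned int sum = 0;
      int i;
      for (i = 0; i < 8; ++i)
        sum += v[i];
      return sum;
    }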
diff --git a/vp10/encoder/avg.c b/vp10/encoder/avg.c
deleted file mode 100644
index 738c427..0000000
--- a/vp10/encoder/avg.c
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#include "./vp10_rtcd.h"
-#include "vp10/common/common.h"
-#include "vpx_ports/mem.h"
-
-unsigned int vp10_avg_8x8_c(const uint8_t *s, int p) {
-  int i, j;
-  int sum = 0;
-  for (i = 0; i < 8; ++i, s+=p)
-    for (j = 0; j < 8; sum += s[j], ++j) {}
-
-  return (sum + 32) >> 6;
-}
-
-unsigned int vp10_avg_4x4_c(const uint8_t *s, int p) {
-  int i, j;
-  int sum = 0;
-  for (i = 0; i < 4; ++i, s+=p)
-    for (j = 0; j < 4; sum += s[j], ++j) {}
-
-  return (sum + 8) >> 4;
-}
-
-// src_diff: first pass, 9 bit, dynamic range [-255, 255]
-//           second pass, 12 bit, dynamic range [-2040, 2040]
-static void hadamard_col8(const int16_t *src_diff, int src_stride,
-                          int16_t *coeff) {
-  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
-  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
-  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
-  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
-  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
-  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
-  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
-  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
-
-  int16_t c0 = b0 + b2;
-  int16_t c1 = b1 + b3;
-  int16_t c2 = b0 - b2;
-  int16_t c3 = b1 - b3;
-  int16_t c4 = b4 + b6;
-  int16_t c5 = b5 + b7;
-  int16_t c6 = b4 - b6;
-  int16_t c7 = b5 - b7;
-
-  coeff[0] = c0 + c4;
-  coeff[7] = c1 + c5;
-  coeff[3] = c2 + c6;
-  coeff[4] = c3 + c7;
-  coeff[2] = c0 - c4;
-  coeff[6] = c1 - c5;
-  coeff[1] = c2 - c6;
-  coeff[5] = c3 - c7;
-}
-
-void vp10_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
-                        int16_t *coeff) {
-  int idx;
-  int16_t buffer[64];
-  int16_t *tmp_buf = &buffer[0];
-  for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
-                                                   // dynamic range [-255, 255]
-    tmp_buf += 8;
-    ++src_diff;
-  }
-
-  tmp_buf = &buffer[0];
-  for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(tmp_buf, 8, coeff);  // tmp_buf: 12 bit
-                                       // dynamic range [-2040, 2040]
-    coeff += 8;  // coeff: 15 bit
-                 // dynamic range [-16320, 16320]
-    ++tmp_buf;
-  }
-}
-
-// In place 16x16 2D Hadamard transform
-void vp10_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
-                          int16_t *coeff) {
-  int idx;
-  for (idx = 0; idx < 4; ++idx) {
-    // src_diff: 9 bit, dynamic range [-255, 255]
-    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
-                                + (idx & 0x01) * 8;
-    vp10_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
-  }
-
-  // coeff: 15 bit, dynamic range [-16320, 16320]
-  for (idx = 0; idx < 64; ++idx) {
-    int16_t a0 = coeff[0];
-    int16_t a1 = coeff[64];
-    int16_t a2 = coeff[128];
-    int16_t a3 = coeff[192];
-
-    int16_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
-    int16_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
-    int16_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
-    int16_t b3 = (a2 - a3) >> 1;
-
-    coeff[0]   = b0 + b2;  // 16 bit, [-32640, 32640]
-    coeff[64]  = b1 + b3;
-    coeff[128] = b0 - b2;
-    coeff[192] = b1 - b3;
-
-    ++coeff;
-  }
-}
-
-// coeff: 16 bits, dynamic range [-32640, 32640].
-// length: value range {16, 64, 256, 1024}.
-int16_t vp10_satd_c(const int16_t *coeff, int length) {
-  int i;
-  int satd = 0;
-  for (i = 0; i < length; ++i)
-    satd += abs(coeff[i]);
-
-  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
-  return (int16_t)satd;
-}
-
-// Integer projection onto row vectors.
-// height: value range {16, 32, 64}.
-void vp10_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref,
-                       const int ref_stride, const int height) {
-  int idx;
-  const int norm_factor = height >> 1;
-  for (idx = 0; idx < 16; ++idx) {
-    int i;
-    hbuf[idx] = 0;
-    // hbuf[idx]: 14 bit, dynamic range [0, 16320].
-    for (i = 0; i < height; ++i)
-      hbuf[idx] += ref[i * ref_stride];
-    // hbuf[idx]: 9 bit, dynamic range [0, 510].
-    hbuf[idx] /= norm_factor;
-    ++ref;
-  }
-}
-
-// width: value range {16, 32, 64}.
-int16_t vp10_int_pro_col_c(uint8_t const *ref, const int width) {
-  int idx;
-  int16_t sum = 0;
-  // sum: 14 bit, dynamic range [0, 16320]
-  for (idx = 0; idx < width; ++idx)
-    sum += ref[idx];
-  return sum;
-}
-
-// ref: [0 - 510]
-// src: [0 - 510]
-// bwl: {2, 3, 4}
-int vp10_vector_var_c(int16_t const *ref, int16_t const *src,
-                     const int bwl) {
-  int i;
-  int width = 4 << bwl;
-  int sse = 0, mean = 0, var;
-
-  for (i = 0; i < width; ++i) {
-    int diff = ref[i] - src[i];  // diff: dynamic range [-510, 510], 10 bits.
-    mean += diff;                // mean: dynamic range 16 bits.
-    sse += diff * diff;          // sse:  dynamic range 26 bits.
-  }
-
-  // (mean * mean): dynamic range 31 bits.
-  var = sse - ((mean * mean) >> (bwl + 2));
-  return var;
-}
-
-void vp10_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
-                      int *min, int *max) {
-  int i, j;
-  *min = 255;
-  *max = 0;
-  for (i = 0; i < 8; ++i, s += p, d += dp) {
-    for (j = 0; j < 8; ++j) {
-      int diff = abs(s[j]-d[j]);
-      *min = diff < *min ? diff : *min;
-      *max = diff > *max ? diff : *max;
-    }
-  }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-unsigned int vp10_highbd_avg_8x8_c(const uint8_t *s8, int p) {
-  int i, j;
-  int sum = 0;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
-  for (i = 0; i < 8; ++i, s+=p)
-    for (j = 0; j < 8; sum += s[j], ++j) {}
-
-  return (sum + 32) >> 6;
-}
-
-unsigned int vp10_highbd_avg_4x4_c(const uint8_t *s8, int p) {
-  int i, j;
-  int sum = 0;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
-  for (i = 0; i < 4; ++i, s+=p)
-    for (j = 0; j < 4; sum += s[j], ++j) {}
-
-  return (sum + 8) >> 4;
-}
-
-void vp10_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
-                             int dp, int *min, int *max) {
-  int i, j;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
-  const uint16_t* d = CONVERT_TO_SHORTPTR(d8);
-  *min = 255;
-  *max = 0;
-  for (i = 0; i < 8; ++i, s += p, d += dp) {
-    for (j = 0; j < 8; ++j) {
-      int diff = abs(s[j]-d[j]);
-      *min = diff < *min ? diff : *min;
-      *max = diff > *max ? diff : *max;
-    }
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-
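
As with the NEON file, this C reference moves to vpx_dsp unchanged. One
detail worth a worked check: the averages use add-half-then-shift rounding,
so (sum + 32) >> 6 is round-to-nearest of sum/64, and (sum + 8) >> 4 of
sum/16:

    #include <assert.h>

    /* Quick sanity check of the rounding in the avg kernels above. */
    static void avg_rounding_check(void) {
      assert(((8 * 8 * 128 + 32) >> 6) == 128);  /* flat 8x8 block of 128s */
      assert(((63 + 32) >> 6) == 1);             /* 63/64 rounds up to 1   */
      assert(((31 + 32) >> 6) == 0);             /* 31/64 rounds down to 0 */
    }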
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 32275d4..361ac99 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -897,18 +897,17 @@
   if (mode != TX_MODE_SELECT)
     vpx_wb_write_literal(wb, mode, 2);
 }
+#else
+static void write_txfm_mode(TX_MODE mode, struct vpx_writer *wb) {
+  vpx_write_literal(wb, VPXMIN(mode, ALLOW_32X32), 2);
+  if (mode >= ALLOW_32X32)
+    vpx_write_bit(wb, mode == TX_MODE_SELECT);
+}
 #endif
 
+
 static void update_txfm_probs(VP10_COMMON *cm, vpx_writer *w,
                               FRAME_COUNTS *counts) {
-#if !CONFIG_MISC_FIXES
-  // Mode
-  vpx_write_literal(w, VPXMIN(cm->tx_mode, ALLOW_32X32), 2);
-  if (cm->tx_mode >= ALLOW_32X32)
-    vpx_write_bit(w, cm->tx_mode == TX_MODE_SELECT);
-
-  // Probabilities
-#endif
 
   if (cm->tx_mode == TX_MODE_SELECT) {
     int i, j;
@@ -1261,7 +1260,7 @@
   encode_quantization(cm, wb);
   encode_segmentation(cm, xd, wb);
 #if CONFIG_MISC_FIXES
-  if (!cm->seg.enabled && xd->lossless[0])
+  if (xd->lossless[0])
     cm->tx_mode = TX_4X4;
   else
     write_txfm_mode(cm->tx_mode, wb);
@@ -1291,10 +1290,12 @@
   vpx_start_encode(&header_bc, data);
 
 #if !CONFIG_MISC_FIXES
-  if (cpi->td.mb.e_mbd.lossless[0])
+  if (cpi->td.mb.e_mbd.lossless[0]) {
     cm->tx_mode = TX_4X4;
-  else
+  } else {
+    write_txfm_mode(cm->tx_mode, &header_bc);
     update_txfm_probs(cm, &header_bc, counts);
+  }
 #else
   update_txfm_probs(cm, &header_bc, counts);
 #endif
@@ -1473,7 +1474,7 @@
     assert(n_log2_tiles > 0);
     vpx_wb_write_literal(&saved_wb, mag, 2);
     if (mag < 3)
-      data_sz = remux_tiles(data, data_sz, 1 << n_log2_tiles, mag);
+      data_sz = remux_tiles(data, (int)data_sz, 1 << n_log2_tiles, mag);
   } else {
     assert(n_log2_tiles == 0);
   }
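
The tx-mode write for the !CONFIG_MISC_FIXES path is factored out of
update_txfm_probs() into its own write_txfm_mode(); the caller now writes the
mode explicitly before updating the probabilities. Assuming the usual TX_MODE
ordering (ONLY_4X4 < ALLOW_8X8 < ALLOW_16X16 < ALLOW_32X32 < TX_MODE_SELECT),
the matching reader side looks roughly like this -- an illustrative sketch
mirroring VP9's read_tx_mode:

    static TX_MODE read_txfm_mode_sketch(vpx_reader *r) {
      TX_MODE mode = (TX_MODE)vpx_read_literal(r, 2);
      if (mode == ALLOW_32X32)
        mode = (TX_MODE)(mode + vpx_read_bit(r));  /* +1 -> TX_MODE_SELECT */
      return mode;
    }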
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index bcdcff3..9381b65 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -536,16 +536,16 @@
     if (x8_idx < pixels_wide && y8_idx < pixels_high) {
 #if CONFIG_VP9_HIGHBITDEPTH
       if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        vp10_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+        vpx_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
                               d + y8_idx * dp + x8_idx, dp,
                               &min, &max);
       } else {
-        vp10_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+        vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
                        d + y8_idx * dp + x8_idx, dp,
                        &min, &max);
       }
 #else
-      vp10_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+      vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
                      d + y8_idx * dp + x8_idx, dp,
                      &min, &max);
 #endif
@@ -577,18 +577,18 @@
       int d_avg = 128;
 #if CONFIG_VP9_HIGHBITDEPTH
       if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        s_avg = vp10_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+        s_avg = vpx_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
         if (!is_key_frame)
-          d_avg = vp10_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+          d_avg = vpx_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
       } else {
-        s_avg = vp10_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+        s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
         if (!is_key_frame)
-          d_avg = vp10_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+          d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
       }
 #else
-      s_avg = vp10_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+      s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
       if (!is_key_frame)
-        d_avg = vp10_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+        d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
 #endif
       sum = s_avg - d_avg;
       sse = sum * sum;
@@ -616,18 +616,18 @@
       int d_avg = 128;
 #if CONFIG_VP9_HIGHBITDEPTH
       if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        s_avg = vp10_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+        s_avg = vpx_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
         if (!is_key_frame)
-          d_avg = vp10_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+          d_avg = vpx_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
       } else {
-        s_avg = vp10_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+        s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
         if (!is_key_frame)
-          d_avg = vp10_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+          d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
       }
 #else
-      s_avg = vp10_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+      s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
       if (!is_key_frame)
-        d_avg = vp10_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+        d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
 #endif
       sum = s_avg - d_avg;
       sse = sum * sum;
@@ -1155,7 +1155,7 @@
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     x->source_variance =
         vp10_high_get_sby_perpixel_variance(cpi, &x->plane[0].src,
-                                           bsize, xd->bd);
+                                            bsize, xd->bd);
   } else {
     x->source_variance =
       vp10_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
@@ -2579,7 +2579,7 @@
 }
 
 static TX_MODE select_tx_mode(const VP10_COMP *cpi, MACROBLOCKD *const xd) {
-  if (!cpi->common.seg.enabled && xd->lossless[0])
+  if (xd->lossless[0])
     return ONLY_4X4;
   if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
     return ALLOW_32X32;
@@ -2702,16 +2702,12 @@
   rdc->m_search_count = 0;   // Count of motion search hits.
   rdc->ex_search_count = 0;  // Exhaustive mesh search hits.
 
-  for (i = 0; i < (cm->seg.enabled ? MAX_SEGMENTS : 1); ++i) {
-#if CONFIG_MISC_FIXES
-    const int qindex = vp10_get_qindex(&cm->seg, i, cm->base_qindex);
-#endif
-    xd->lossless[i] = cm->y_dc_delta_q == 0 &&
-#if CONFIG_MISC_FIXES
-                      qindex == 0 &&
-#else
-                      cm->base_qindex == 0 &&
-#endif
+  for (i = 0; i < MAX_SEGMENTS; ++i) {
+    const int qindex = CONFIG_MISC_FIXES && cm->seg.enabled ?
+                       vp10_get_qindex(&cm->seg, i, cm->base_qindex) :
+                       cm->base_qindex;
+    xd->lossless[i] = qindex == 0 &&
+                      cm->y_dc_delta_q == 0 &&
                       cm->uv_dc_delta_q == 0 &&
                       cm->uv_ac_delta_q == 0;
   }
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index 6bba848..175c6d8 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -1422,7 +1422,11 @@
   cpi->td.mb.e_mbd.bd = (int)cm->bit_depth;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-  rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
+  if ((oxcf->pass == 0) && (oxcf->rc_mode == VPX_Q)) {
+    rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+  } else {
+    rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
+  }
 
   cpi->refresh_golden_frame = 0;
   cpi->refresh_last_frame = 1;
diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c
index 04e1daf..2c1c591 100644
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c
@@ -1759,7 +1759,7 @@
   int center, offset = 0;
   int bw = 4 << bwl;  // redundant variable, to be changed in the experiments.
   for (d = 0; d <= bw; d += 16) {
-    this_sad = vp10_vector_var(&ref[d], src, bwl);
+    this_sad = vpx_vector_var(&ref[d], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       offset = d;
@@ -1772,7 +1772,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp10_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1785,7 +1785,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp10_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1798,7 +1798,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp10_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1811,7 +1811,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp10_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1880,25 +1880,25 @@
   // Set up prediction 1-D reference set
   ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
   for (idx = 0; idx < search_width; idx += 16) {
-    vp10_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+    vpx_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
     ref_buf += 16;
   }
 
   ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
   for (idx = 0; idx < search_height; ++idx) {
-    vbuf[idx] = vp10_int_pro_col(ref_buf, bw) >> norm_factor;
+    vbuf[idx] = vpx_int_pro_col(ref_buf, bw) >> norm_factor;
     ref_buf += ref_stride;
   }
 
   // Set up src 1-D reference set
   for (idx = 0; idx < bw; idx += 16) {
     src_buf = x->plane[0].src.buf + idx;
-    vp10_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+    vpx_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
   }
 
   src_buf = x->plane[0].src.buf;
   for (idx = 0; idx < bh; ++idx) {
-    src_vbuf[idx] = vp10_int_pro_col(src_buf, bw) >> norm_factor;
+    src_vbuf[idx] = vpx_int_pro_col(src_buf, bw) >> norm_factor;
     src_buf += src_stride;
   }
 
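
Context for the renames in this file: the candidate pre-search works on
integral projections. vpx_int_pro_row()/vpx_int_pro_col() collapse the 2-D
reference and source blocks into 1-D row/column sum vectors, and
vpx_vector_var() scores how well the two 1-D signals line up at a candidate
offset. The coarse pass above, reduced to its core (helper name ours):

    #include <limits.h>
    #include <stdint.h>

    /* Slide the source projection over the reference projection in steps of
     * 16 and keep the offset with the lowest vector variance. */
    static int coarse_projection_offset(const int16_t *ref_proj,
                                        const int16_t *src_proj,
                                        int bw, int bwl) {
      int d, offset = 0;
      int best_sad = INT_MAX;
      for (d = 0; d <= bw; d += 16) {
        const int this_sad = vpx_vector_var(&ref_proj[d], src_proj, bwl);
        if (this_sad < best_sad) {
          best_sad = this_sad;
          offset = d;
        }
      }
      return offset;
    }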
diff --git a/vp10/encoder/mips/msa/avg_msa.c b/vp10/encoder/mips/msa/avg_msa.c
deleted file mode 100644
index e8cfd53..0000000
--- a/vp10/encoder/mips/msa/avg_msa.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp10_rtcd.h"
-#include "vpx_dsp/mips/macros_msa.h"
-
-uint32_t vp10_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
-  uint32_t sum_out;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
-  v4u32 sum = { 0 };
-
-  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-  HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3);
-  HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7);
-  ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6);
-  ADD2(sum0, sum2, sum4, sum6, sum0, sum4);
-  sum0 += sum4;
-
-  sum = __msa_hadd_u_w(sum0, sum0);
-  sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum);
-  sum = __msa_hadd_u_w(sum0, sum0);
-  sum = (v4u32)__msa_srari_w((v4i32)sum, 6);
-  sum_out = __msa_copy_u_w((v4i32)sum, 0);
-
-  return sum_out;
-}
-
-uint32_t vp10_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
-  uint32_t sum_out;
-  uint32_t src0, src1, src2, src3;
-  v16u8 vec = { 0 };
-  v8u16 sum0;
-  v4u32 sum1;
-  v2u64 sum2;
-
-  LW4(src, src_stride, src0, src1, src2, src3);
-  INSERT_W4_UB(src0, src1, src2, src3, vec);
-
-  sum0 = __msa_hadd_u_h(vec, vec);
-  sum1 = __msa_hadd_u_w(sum0, sum0);
-  sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1);
-  sum1 = __msa_hadd_u_w(sum0, sum0);
-  sum2 = __msa_hadd_u_d(sum1, sum1);
-  sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4);
-  sum_out = __msa_copy_u_w((v4i32)sum1, 0);
-
-  return sum_out;
-}
diff --git a/vp10/encoder/ratectrl.c b/vp10/encoder/ratectrl.c
index 3ff2476..6068775 100644
--- a/vp10/encoder/ratectrl.c
+++ b/vp10/encoder/ratectrl.c
@@ -794,16 +794,18 @@
   ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
 
   if (frame_is_intra_only(cm)) {
-
-    // Handle the special case for key frames forced when we have reached
-    // the maximum key frame interval. Here force the Q to a range
-    // based on the ambient Q to reduce the risk of popping.
-    if (rc->this_key_frame_forced) {
+    if (oxcf->rc_mode == VPX_Q) {
+      int qindex = cq_level;
+      double q = vp10_convert_qindex_to_q(qindex, cm->bit_depth);
+      int delta_qindex = vp10_compute_qdelta(rc, q, q * 0.25,
+                                             cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+    } else if (rc->this_key_frame_forced) {
       int qindex = rc->last_boosted_qindex;
       double last_boosted_q = vp10_convert_qindex_to_q(qindex, cm->bit_depth);
       int delta_qindex = vp10_compute_qdelta(rc, last_boosted_q,
-                                            last_boosted_q * 0.75,
-                                            cm->bit_depth);
+                                             last_boosted_q * 0.75,
+                                             cm->bit_depth);
       active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
     } else {
       // not first frame of one pass and kf_boost is set
@@ -823,8 +825,8 @@
       // on active_best_quality.
       q_val = vp10_convert_qindex_to_q(active_best_quality, cm->bit_depth);
       active_best_quality += vp10_compute_qdelta(rc, q_val,
-                                                q_val * q_adj_factor,
-                                                cm->bit_depth);
+                                                 q_val * q_adj_factor,
+                                                 cm->bit_depth);
     }
   } else if (!rc->is_src_frame_alt_ref &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
@@ -848,17 +850,28 @@
       active_best_quality = active_best_quality * 15 / 16;
 
     } else if (oxcf->rc_mode == VPX_Q) {
-      if (!cpi->refresh_alt_ref_frame) {
-        active_best_quality = cq_level;
-      } else {
-        active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
-      }
+      int qindex = cq_level;
+      double q = vp10_convert_qindex_to_q(qindex, cm->bit_depth);
+      int delta_qindex;
+      if (cpi->refresh_alt_ref_frame)
+        delta_qindex = vp10_compute_qdelta(rc, q, q * 0.40, cm->bit_depth);
+      else
+        delta_qindex = vp10_compute_qdelta(rc, q, q * 0.50, cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
     } else {
       active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
     }
   } else {
     if (oxcf->rc_mode == VPX_Q) {
-      active_best_quality = cq_level;
+      int qindex = cq_level;
+      double q = vp10_convert_qindex_to_q(qindex, cm->bit_depth);
+      double delta_rate[FIXED_GF_INTERVAL] =
+          {0.50, 1.0, 0.85, 1.0, 0.70, 1.0, 0.85, 1.0};
+      int delta_qindex =
+          vp10_compute_qdelta(rc, q,
+                              q * delta_rate[cm->current_video_frame %
+                              FIXED_GF_INTERVAL], cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
     } else {
       // Use the lower of active_worst_quality and recent/average Q.
       if (cm->current_video_frame > 1)
@@ -1254,7 +1267,7 @@
     rc->avg_frame_qindex[KEY_FRAME] =
         ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
   } else {
-    if (rc->is_src_frame_alt_ref ||
+    if (!rc->is_src_frame_alt_ref &&
         !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
       rc->last_q[INTER_FRAME] = qindex;
       rc->avg_frame_qindex[INTER_FRAME] =
@@ -1562,29 +1575,36 @@
                                   RATE_CONTROL *const rc) {
   const VP10EncoderConfig *const oxcf = &cpi->oxcf;
 
-  // Set Maximum gf/arf interval
-  rc->max_gf_interval = oxcf->max_gf_interval;
-  rc->min_gf_interval = oxcf->min_gf_interval;
-  if (rc->min_gf_interval == 0)
-    rc->min_gf_interval = vp10_rc_get_default_min_gf_interval(
-        oxcf->width, oxcf->height, cpi->framerate);
-  if (rc->max_gf_interval == 0)
-    rc->max_gf_interval = vp10_rc_get_default_max_gf_interval(
-        cpi->framerate, rc->min_gf_interval);
+  // Special case code for 1 pass fixed Q mode tests
+  if ((oxcf->pass == 0) && (oxcf->rc_mode == VPX_Q)) {
+    rc->max_gf_interval = FIXED_GF_INTERVAL;
+    rc->min_gf_interval = FIXED_GF_INTERVAL;
+    rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
+  } else {
+    // Set Maximum gf/arf interval
+    rc->max_gf_interval = oxcf->max_gf_interval;
+    rc->min_gf_interval = oxcf->min_gf_interval;
+    if (rc->min_gf_interval == 0)
+      rc->min_gf_interval = vp10_rc_get_default_min_gf_interval(
+          oxcf->width, oxcf->height, cpi->framerate);
+    if (rc->max_gf_interval == 0)
+      rc->max_gf_interval = vp10_rc_get_default_max_gf_interval(
+          cpi->framerate, rc->min_gf_interval);
 
-  // Extended interval for genuinely static scenes
-  rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
+    // Extended interval for genuinely static scenes
+    rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
 
-  if (is_altref_enabled(cpi)) {
-    if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
-      rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
+    if (is_altref_enabled(cpi)) {
+      if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
+        rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
+    }
+
+    if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
+      rc->max_gf_interval = rc->static_scene_max_gf_interval;
+
+    // Clamp min to max
+    rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval);
   }
-
-  if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
-    rc->max_gf_interval = rc->static_scene_max_gf_interval;
-
-  // Clamp min to max
-  rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval);
 }
 
 void vp10_rc_update_framerate(VP10_COMP *cpi) {
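
The VPX_Q hunks above replace the flat "active_best_quality = cq_level" assignment with per-frame offsets computed through vp10_compute_qdelta(): key frames target q * 0.25, alt-ref refreshes q * 0.40, other golden-frame updates q * 0.50, and ordinary inter frames cycle the eight-entry delta_rate table so frames aligned to the FIXED_GF_INTERVAL pattern are coded at lower Q. The same change is mirrored for vp9 in vp9/encoder/vp9_ratectrl.c further below. A minimal sketch of the inter-frame modulation, with a hypothetical linear compute_qdelta() standing in for the encoder's qindex-table search (only the delta_rate values come from the patch; the qindex-to-q mapping assumed here is illustrative):

#include <stdio.h>

#define FIXED_GF_INTERVAL 8

/* Hypothetical stand-in for vp10_compute_qdelta(): the real function walks
 * the per-bit-depth quantizer tables for the best-matching qindex; a linear
 * approximation is enough to show the shape of the modulation. */
static int compute_qdelta(double q_start, double q_target) {
  return (int)((q_target - q_start) * 4.0);
}

int main(void) {
  const double delta_rate[FIXED_GF_INTERVAL] =
      {0.50, 1.0, 0.85, 1.0, 0.70, 1.0, 0.85, 1.0};  /* from the patch */
  const int cq_level = 60;   /* user-selected fixed-Q index (assumed) */
  const double q = 20.0;     /* assume convert_qindex_to_q(60) ~= 20 */
  unsigned frame;
  for (frame = 0; frame < 16; ++frame) {
    int delta = compute_qdelta(q, q * delta_rate[frame % FIXED_GF_INTERVAL]);
    printf("frame %2u: active_best_quality = %d\n", frame, cq_level + delta);
  }
  return 0;
}

Since every delta_rate entry is at most 1.0, the deltas are non-positive and the boosted frames land on the same cadence as the pinned golden-frame interval set in vp10_rc_set_gf_interval_range() above.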
diff --git a/vp10/encoder/ratectrl.h b/vp10/encoder/ratectrl.h
index 8008c16..0b9fd45 100644
--- a/vp10/encoder/ratectrl.h
+++ b/vp10/encoder/ratectrl.h
@@ -26,6 +26,7 @@
 
 #define MIN_GF_INTERVAL     4
 #define MAX_GF_INTERVAL     16
+#define FIXED_GF_INTERVAL   8    // Used in some testing modes only
 
 typedef enum {
   INTER_NORMAL = 0,
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index bba2171..bbddc1d 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -725,7 +725,7 @@
 
   assert(bs == xd->mi[0]->mbmi.sb_type);
 
-  if (CONFIG_MISC_FIXES && xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+  if (CONFIG_MISC_FIXES && xd->lossless[0]) {
     choose_smallest_tx_size(cpi, x, rate, distortion, skip, ret_sse,
                             ref_best_rd, bs);
   } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL ||
diff --git a/vp10/encoder/x86/avg_intrin_sse2.c b/vp10/encoder/x86/avg_intrin_sse2.c
deleted file mode 100644
index cf23013..0000000
--- a/vp10/encoder/x86/avg_intrin_sse2.c
+++ /dev/null
@@ -1,424 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h>
-
-#include "./vp10_rtcd.h"
-#include "vpx_ports/mem.h"
-
-void vp10_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
-                         int *min, int *max) {
-  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
-  u0  = _mm_setzero_si128();
-  // Row 0
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff0 = _mm_max_epi16(diff, negdiff);
-  // Row 1
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
-  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
-  // Row 2
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 3
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 4
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 5
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 6
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 7
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
-  *max = _mm_extract_epi16(maxabsdiff, 0);
-
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
-  *min = _mm_extract_epi16(minabsdiff, 0);
-}
-
-unsigned int vp10_avg_8x8_sse2(const uint8_t *s, int p) {
-  __m128i s0, s1, u0;
-  unsigned int avg = 0;
-  u0  = _mm_setzero_si128();
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-
-  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
-  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
-  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
-  avg = _mm_extract_epi16(s0, 0);
-  return (avg + 32) >> 6;
-}
-
-unsigned int vp10_avg_4x4_sse2(const uint8_t *s, int p) {
-  __m128i s0, s1, u0;
-  unsigned int avg = 0;
-  u0  = _mm_setzero_si128();
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-
-  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
-  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
-  avg = _mm_extract_epi16(s0, 0);
-  return (avg + 8) >> 4;
-}
-
-static void hadamard_col8_sse2(__m128i *in, int iter) {
-  __m128i a0 = in[0];
-  __m128i a1 = in[1];
-  __m128i a2 = in[2];
-  __m128i a3 = in[3];
-  __m128i a4 = in[4];
-  __m128i a5 = in[5];
-  __m128i a6 = in[6];
-  __m128i a7 = in[7];
-
-  __m128i b0 = _mm_add_epi16(a0, a1);
-  __m128i b1 = _mm_sub_epi16(a0, a1);
-  __m128i b2 = _mm_add_epi16(a2, a3);
-  __m128i b3 = _mm_sub_epi16(a2, a3);
-  __m128i b4 = _mm_add_epi16(a4, a5);
-  __m128i b5 = _mm_sub_epi16(a4, a5);
-  __m128i b6 = _mm_add_epi16(a6, a7);
-  __m128i b7 = _mm_sub_epi16(a6, a7);
-
-  a0 = _mm_add_epi16(b0, b2);
-  a1 = _mm_add_epi16(b1, b3);
-  a2 = _mm_sub_epi16(b0, b2);
-  a3 = _mm_sub_epi16(b1, b3);
-  a4 = _mm_add_epi16(b4, b6);
-  a5 = _mm_add_epi16(b5, b7);
-  a6 = _mm_sub_epi16(b4, b6);
-  a7 = _mm_sub_epi16(b5, b7);
-
-  if (iter == 0) {
-    b0 = _mm_add_epi16(a0, a4);
-    b7 = _mm_add_epi16(a1, a5);
-    b3 = _mm_add_epi16(a2, a6);
-    b4 = _mm_add_epi16(a3, a7);
-    b2 = _mm_sub_epi16(a0, a4);
-    b6 = _mm_sub_epi16(a1, a5);
-    b1 = _mm_sub_epi16(a2, a6);
-    b5 = _mm_sub_epi16(a3, a7);
-
-    a0 = _mm_unpacklo_epi16(b0, b1);
-    a1 = _mm_unpacklo_epi16(b2, b3);
-    a2 = _mm_unpackhi_epi16(b0, b1);
-    a3 = _mm_unpackhi_epi16(b2, b3);
-    a4 = _mm_unpacklo_epi16(b4, b5);
-    a5 = _mm_unpacklo_epi16(b6, b7);
-    a6 = _mm_unpackhi_epi16(b4, b5);
-    a7 = _mm_unpackhi_epi16(b6, b7);
-
-    b0 = _mm_unpacklo_epi32(a0, a1);
-    b1 = _mm_unpacklo_epi32(a4, a5);
-    b2 = _mm_unpackhi_epi32(a0, a1);
-    b3 = _mm_unpackhi_epi32(a4, a5);
-    b4 = _mm_unpacklo_epi32(a2, a3);
-    b5 = _mm_unpacklo_epi32(a6, a7);
-    b6 = _mm_unpackhi_epi32(a2, a3);
-    b7 = _mm_unpackhi_epi32(a6, a7);
-
-    in[0] = _mm_unpacklo_epi64(b0, b1);
-    in[1] = _mm_unpackhi_epi64(b0, b1);
-    in[2] = _mm_unpacklo_epi64(b2, b3);
-    in[3] = _mm_unpackhi_epi64(b2, b3);
-    in[4] = _mm_unpacklo_epi64(b4, b5);
-    in[5] = _mm_unpackhi_epi64(b4, b5);
-    in[6] = _mm_unpacklo_epi64(b6, b7);
-    in[7] = _mm_unpackhi_epi64(b6, b7);
-  } else {
-    in[0] = _mm_add_epi16(a0, a4);
-    in[7] = _mm_add_epi16(a1, a5);
-    in[3] = _mm_add_epi16(a2, a6);
-    in[4] = _mm_add_epi16(a3, a7);
-    in[2] = _mm_sub_epi16(a0, a4);
-    in[6] = _mm_sub_epi16(a1, a5);
-    in[1] = _mm_sub_epi16(a2, a6);
-    in[5] = _mm_sub_epi16(a3, a7);
-  }
-}
-
-void vp10_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
-                           int16_t *coeff) {
-  __m128i src[8];
-  src[0] = _mm_load_si128((const __m128i *)src_diff);
-  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-
-  hadamard_col8_sse2(src, 0);
-  hadamard_col8_sse2(src, 1);
-
-  _mm_store_si128((__m128i *)coeff, src[0]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[1]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[2]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[3]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[4]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[5]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[6]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[7]);
-}
-
-void vp10_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
-                             int16_t *coeff) {
-  int idx;
-  for (idx = 0; idx < 4; ++idx) {
-    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
-                                + (idx & 0x01) * 8;
-    vp10_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
-  }
-
-  for (idx = 0; idx < 64; idx += 8) {
-    __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
-    __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
-    __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
-    __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
-
-    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
-    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
-    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
-    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
-
-    b0 = _mm_srai_epi16(b0, 1);
-    b1 = _mm_srai_epi16(b1, 1);
-    b2 = _mm_srai_epi16(b2, 1);
-    b3 = _mm_srai_epi16(b3, 1);
-
-    coeff0 = _mm_add_epi16(b0, b2);
-    coeff1 = _mm_add_epi16(b1, b3);
-    _mm_store_si128((__m128i *)coeff, coeff0);
-    _mm_store_si128((__m128i *)(coeff + 64), coeff1);
-
-    coeff2 = _mm_sub_epi16(b0, b2);
-    coeff3 = _mm_sub_epi16(b1, b3);
-    _mm_store_si128((__m128i *)(coeff + 128), coeff2);
-    _mm_store_si128((__m128i *)(coeff + 192), coeff3);
-
-    coeff += 8;
-  }
-}
-
-int16_t vp10_satd_sse2(const int16_t *coeff, int length) {
-  int i;
-  __m128i sum = _mm_load_si128((const __m128i *)coeff);
-  __m128i sign = _mm_srai_epi16(sum, 15);
-  __m128i val = _mm_xor_si128(sum, sign);
-  sum = _mm_sub_epi16(val, sign);
-  coeff += 8;
-
-  for (i = 8; i < length; i += 8) {
-    __m128i src_line = _mm_load_si128((const __m128i *)coeff);
-    sign = _mm_srai_epi16(src_line, 15);
-    val = _mm_xor_si128(src_line, sign);
-    val = _mm_sub_epi16(val, sign);
-    sum = _mm_add_epi16(sum, val);
-    coeff += 8;
-  }
-
-  val = _mm_srli_si128(sum, 8);
-  sum = _mm_add_epi16(sum, val);
-  val = _mm_srli_epi64(sum, 32);
-  sum = _mm_add_epi16(sum, val);
-  val = _mm_srli_epi32(sum, 16);
-  sum = _mm_add_epi16(sum, val);
-
-  return _mm_extract_epi16(sum, 0);
-}
-
-void vp10_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
-                          const int ref_stride, const int height) {
-  int idx;
-  __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
-  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
-  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
-  __m128i t0, t1;
-  int height_1 = height - 1;
-  ref += ref_stride;
-
-  for (idx = 1; idx < height_1; idx += 2) {
-    src_line = _mm_loadu_si128((const __m128i *)ref);
-    t0 = _mm_unpacklo_epi8(src_line, zero);
-    t1 = _mm_unpackhi_epi8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, t0);
-    s1 = _mm_adds_epu16(s1, t1);
-    ref += ref_stride;
-
-    src_line = _mm_loadu_si128((const __m128i *)ref);
-    t0 = _mm_unpacklo_epi8(src_line, zero);
-    t1 = _mm_unpackhi_epi8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, t0);
-    s1 = _mm_adds_epu16(s1, t1);
-    ref += ref_stride;
-  }
-
-  src_line = _mm_loadu_si128((const __m128i *)ref);
-  t0 = _mm_unpacklo_epi8(src_line, zero);
-  t1 = _mm_unpackhi_epi8(src_line, zero);
-  s0 = _mm_adds_epu16(s0, t0);
-  s1 = _mm_adds_epu16(s1, t1);
-
-  if (height == 64) {
-    s0 = _mm_srai_epi16(s0, 5);
-    s1 = _mm_srai_epi16(s1, 5);
-  } else if (height == 32) {
-    s0 = _mm_srai_epi16(s0, 4);
-    s1 = _mm_srai_epi16(s1, 4);
-  } else {
-    s0 = _mm_srai_epi16(s0, 3);
-    s1 = _mm_srai_epi16(s1, 3);
-  }
-
-  _mm_storeu_si128((__m128i *)hbuf, s0);
-  hbuf += 8;
-  _mm_storeu_si128((__m128i *)hbuf, s1);
-}
-
-int16_t vp10_int_pro_col_sse2(uint8_t const *ref, const int width) {
-  __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_load_si128((const __m128i *)ref);
-  __m128i s0 = _mm_sad_epu8(src_line, zero);
-  __m128i s1;
-  int i;
-
-  for (i = 16; i < width; i += 16) {
-    ref += 16;
-    src_line = _mm_load_si128((const __m128i *)ref);
-    s1 = _mm_sad_epu8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, s1);
-  }
-
-  s1 = _mm_srli_si128(s0, 8);
-  s0 = _mm_adds_epu16(s0, s1);
-
-  return _mm_extract_epi16(s0, 0);
-}
-
-int vp10_vector_var_sse2(int16_t const *ref, int16_t const *src,
-                        const int bwl) {
-  int idx;
-  int width = 4 << bwl;
-  int16_t mean;
-  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
-  __m128i v1 = _mm_load_si128((const __m128i *)src);
-  __m128i diff = _mm_subs_epi16(v0, v1);
-  __m128i sum = diff;
-  __m128i sse = _mm_madd_epi16(diff, diff);
-
-  ref += 8;
-  src += 8;
-
-  for (idx = 8; idx < width; idx += 8) {
-    v0 = _mm_loadu_si128((const __m128i *)ref);
-    v1 = _mm_load_si128((const __m128i *)src);
-    diff = _mm_subs_epi16(v0, v1);
-
-    sum = _mm_add_epi16(sum, diff);
-    v0  = _mm_madd_epi16(diff, diff);
-    sse = _mm_add_epi32(sse, v0);
-
-    ref += 8;
-    src += 8;
-  }
-
-  v0  = _mm_srli_si128(sum, 8);
-  sum = _mm_add_epi16(sum, v0);
-  v0  = _mm_srli_epi64(sum, 32);
-  sum = _mm_add_epi16(sum, v0);
-  v0  = _mm_srli_epi32(sum, 16);
-  sum = _mm_add_epi16(sum, v0);
-
-  v1  = _mm_srli_si128(sse, 8);
-  sse = _mm_add_epi32(sse, v1);
-  v1  = _mm_srli_epi64(sse, 32);
-  sse = _mm_add_epi32(sse, v1);
-
-  mean = _mm_extract_epi16(sum, 0);
-
-  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
-}
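
The file deleted above duplicated kernels that now live in vpx_dsp (the vpx_avg_*, vpx_satd, vpx_minmax_8x8 and vpx_int_pro_* entry points the rest of this change switches to). For reference, scalar sketches of the contracts two of the SIMD routines implement, matching the rounding visible above; these are illustrative, not the vpx_dsp sources:

#include <stdint.h>

/* Mean of an 8x8 pixel block with stride p, rounded: (sum + 32) >> 6,
 * as in vp10_avg_8x8_sse2 above. */
static unsigned int avg_8x8_scalar(const uint8_t *s, int p) {
  unsigned int sum = 0;
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      sum += s[r * p + c];
  return (sum + 32) >> 6;
}

/* Sum of absolute transform coefficients (length a multiple of 8). The
 * SSE2 version accumulates in 16-bit lanes; a widened sum is used here. */
static int satd_scalar(const int16_t *coeff, int length) {
  int i, sum = 0;
  for (i = 0; i < length; ++i)
    sum += coeff[i] < 0 ? -coeff[i] : coeff[i];
  return sum;
}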
diff --git a/vp10/encoder/x86/dct_ssse3_x86_64.asm b/vp10/encoder/x86/dct_ssse3_x86_64.asm
deleted file mode 100644
index 5e8adab..0000000
--- a/vp10/encoder/x86/dct_ssse3_x86_64.asm
+++ /dev/null
@@ -1,121 +0,0 @@
-;
-;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp10
-
-%include "third_party/x86inc/x86inc.asm"
-
-; This file provides SSSE3 version of the forward transformation. Part
-; of the macro definitions are originally derived from the ffmpeg project.
-; The current version applies to x86 64-bit only.
-
-SECTION .text
-
-%if ARCH_X86_64
-; matrix transpose
-%macro INTERLEAVE_2X 4
-  punpckh%1          m%4, m%2, m%3
-  punpckl%1          m%2, m%3
-  SWAP               %3,  %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
-  INTERLEAVE_2X  wd, %1, %2, %9
-  INTERLEAVE_2X  wd, %3, %4, %9
-  INTERLEAVE_2X  wd, %5, %6, %9
-  INTERLEAVE_2X  wd, %7, %8, %9
-
-  INTERLEAVE_2X  dq, %1, %3, %9
-  INTERLEAVE_2X  dq, %2, %4, %9
-  INTERLEAVE_2X  dq, %5, %7, %9
-  INTERLEAVE_2X  dq, %6, %8, %9
-
-  INTERLEAVE_2X  qdq, %1, %5, %9
-  INTERLEAVE_2X  qdq, %3, %7, %9
-  INTERLEAVE_2X  qdq, %2, %6, %9
-  INTERLEAVE_2X  qdq, %4, %8, %9
-
-  SWAP  %2, %5
-  SWAP  %4, %7
-%endmacro
-
-%macro HMD8_1D 0
-  psubw              m8, m0, m1
-  psubw              m9, m2, m3
-  paddw              m0, m1
-  paddw              m2, m3
-  SWAP               1, 8
-  SWAP               3, 9
-  psubw              m8, m4, m5
-  psubw              m9, m6, m7
-  paddw              m4, m5
-  paddw              m6, m7
-  SWAP               5, 8
-  SWAP               7, 9
-
-  psubw              m8, m0, m2
-  psubw              m9, m1, m3
-  paddw              m0, m2
-  paddw              m1, m3
-  SWAP               2, 8
-  SWAP               3, 9
-  psubw              m8, m4, m6
-  psubw              m9, m5, m7
-  paddw              m4, m6
-  paddw              m5, m7
-  SWAP               6, 8
-  SWAP               7, 9
-
-  psubw              m8, m0, m4
-  psubw              m9, m1, m5
-  paddw              m0, m4
-  paddw              m1, m5
-  SWAP               4, 8
-  SWAP               5, 9
-  psubw              m8, m2, m6
-  psubw              m9, m3, m7
-  paddw              m2, m6
-  paddw              m3, m7
-  SWAP               6, 8
-  SWAP               7, 9
-%endmacro
-
-INIT_XMM ssse3
-cglobal hadamard_8x8, 3, 5, 10, input, stride, output
-  lea                r3, [2 * strideq]
-  lea                r4, [4 * strideq]
-
-  mova               m0, [inputq]
-  mova               m1, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m2, [inputq]
-  mova               m3, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m4, [inputq]
-  mova               m5, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m6, [inputq]
-  mova               m7, [inputq + r3]
-
-  HMD8_1D
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-  HMD8_1D
-
-  mova              [outputq +   0], m0
-  mova              [outputq +  16], m1
-  mova              [outputq +  32], m2
-  mova              [outputq +  48], m3
-  mova              [outputq +  64], m4
-  mova              [outputq +  80], m5
-  mova              [outputq +  96], m6
-  mova              [outputq + 112], m7
-
-  RET
-%endif
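
The deleted routine computes an 8x8 Hadamard transform by running the HMD8_1D butterfly over the rows held in registers, transposing, and running it again. A scalar sketch of one butterfly pass with the same add/subtract pairing (lane permutations performed by the SIMD transpose are ignored here; vpx_hadamard_8x8 is the maintained implementation):

#include <stdint.h>
#include <string.h>

/* One 8-point Hadamard butterfly: pairs (0,1)(2,3)(4,5)(6,7), then
 * (0,2)(1,3)(4,6)(5,7), then (0,4)(1,5)(2,6)(3,7), as HMD8_1D does. */
static void hadamard8_1d(int16_t a[8]) {
  int16_t t[8];
  int i;
  for (i = 0; i < 8; i += 2) {           /* stage 1 */
    t[i] = a[i] + a[i + 1];
    t[i + 1] = a[i] - a[i + 1];
  }
  for (i = 0; i < 8; i += 4) {           /* stage 2 */
    a[i] = t[i] + t[i + 2];
    a[i + 1] = t[i + 1] + t[i + 3];
    a[i + 2] = t[i] - t[i + 2];
    a[i + 3] = t[i + 1] - t[i + 3];
  }
  for (i = 0; i < 4; ++i) {              /* stage 3 */
    t[i] = a[i] + a[i + 4];
    t[i + 4] = a[i] - a[i + 4];
  }
  memcpy(a, t, sizeof(t));
}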
diff --git a/vp10/vp10cx.mk b/vp10/vp10cx.mk
index ead993a..dc3b271 100644
--- a/vp10/vp10cx.mk
+++ b/vp10/vp10cx.mk
@@ -17,7 +17,6 @@
 
 VP10_CX_SRCS-yes += vp10_cx_iface.c
 
-VP10_CX_SRCS-yes += encoder/avg.c
 VP10_CX_SRCS-yes += encoder/bitstream.c
 VP10_CX_SRCS-yes += encoder/context_tree.c
 VP10_CX_SRCS-yes += encoder/context_tree.h
@@ -87,7 +86,6 @@
 VP10_CX_SRCS-yes += encoder/mbgraph.c
 VP10_CX_SRCS-yes += encoder/mbgraph.h
 
-VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/avg_intrin_sse2.c
 VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
@@ -102,7 +100,6 @@
 ifeq ($(ARCH_X86_64),yes)
 ifeq ($(CONFIG_USE_X86INC),yes)
 VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3_x86_64.asm
-VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3_x86_64.asm
 endif
 endif
 
@@ -119,10 +116,8 @@
 VP10_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/dct_neon.c
 VP10_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/error_neon.c
 endif
-VP10_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/avg_neon.c
 VP10_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/quantize_neon.c
 
-VP10_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/avg_msa.c
 VP10_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/error_msa.c
 VP10_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct4x4_msa.c
 VP10_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct8x8_msa.c
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index d166bbf..d6c86fe 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -194,42 +194,6 @@
 #
 if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
 
-add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
-specialize qw/vp9_avg_8x8 sse2 neon msa/;
-
-add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
-specialize qw/vp9_avg_4x4 sse2 neon msa/;
-
-add_proto qw/void vp9_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-specialize qw/vp9_minmax_8x8 sse2/;
-
-add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
-specialize qw/vp9_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";
-
-add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
-specialize qw/vp9_hadamard_16x16 sse2/;
-
-add_proto qw/int vp9_satd/, "const int16_t *coeff, int length";
-specialize qw/vp9_satd sse2 neon/;
-
-add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
-specialize qw/vp9_int_pro_row sse2 neon/;
-
-add_proto qw/int16_t vp9_int_pro_col/, "uint8_t const *ref, const int width";
-specialize qw/vp9_int_pro_col sse2 neon/;
-
-add_proto qw/int vp9_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
-specialize qw/vp9_vector_var neon sse2/;
-
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p";
-  specialize qw/vp9_highbd_avg_8x8/;
-  add_proto qw/unsigned int vp9_highbd_avg_4x4/, "const uint8_t *, int p";
-  specialize qw/vp9_highbd_avg_4x4/;
-  add_proto qw/void vp9_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-  specialize qw/vp9_highbd_minmax_8x8/;
-}
-
 # ENCODEMB INVOKE
 
 #
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 0def2cf..7c71d9d 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -191,7 +191,8 @@
                                        BLOCK_SIZE bsize,
                                        int64_t rate,
                                        int64_t dist,
-                                       int skip) {
+                                       int skip,
+                                       struct macroblock_plane *const p) {
   const VP9_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   const int bw = num_8x8_blocks_wide_lookup[bsize];
@@ -199,12 +200,33 @@
   const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
   const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
   const int block_index = mi_row * cm->mi_cols + mi_col;
-  const int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist,
-                                                      bsize);
+  int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist, bsize);
   // Default is to not update the refresh map.
   int new_map_value = cr->map[block_index];
   int x = 0; int y = 0;
 
+  int is_skin = 0;
+  if (refresh_this_block == 0 &&
+      bsize <= BLOCK_16X16 &&
+      cpi->oxcf.content != VP9E_CONTENT_SCREEN) {
+    // Take center pixel in block to determine is_skin.
+    const int y_width_shift = (4 << b_width_log2_lookup[bsize]) >> 1;
+    const int y_height_shift = (4 << b_height_log2_lookup[bsize]) >> 1;
+    const int uv_width_shift = y_width_shift >> 1;
+    const int uv_height_shift = y_height_shift >> 1;
+    const int stride = p[0].src.stride;
+    const int strideuv = p[1].src.stride;
+    const uint8_t ysource =
+        p[0].src.buf[y_height_shift * stride + y_width_shift];
+    const uint8_t usource =
+        p[1].src.buf[uv_height_shift * strideuv + uv_width_shift];
+    const uint8_t vsource =
+        p[2].src.buf[uv_height_shift * strideuv + uv_width_shift];
+    is_skin = vp9_skin_pixel(ysource, usource, vsource);
+    if (is_skin)
+      refresh_this_block = 1;
+  }
+
   // If this block is labeled for refresh, check if we should reset the
   // segment_id.
   if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
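
The new block above widens cyclic refresh: when the rate/distortion test declines a sub-16x16 block, the center pixel of the block is classified with vp9_skin_pixel() and skin-toned blocks are refreshed anyway (except for screen content). A self-contained sketch of the sampling, assuming 4:2:0 chroma; skin_pixel() here is a crude placeholder for the real chroma model in vp9_skin_detection.c:

#include <stdint.h>

/* Placeholder only: the real vp9_skin_pixel() evaluates a skin-tone model
 * on the chroma components; this box test just keeps the sketch compilable. */
static int skin_pixel(uint8_t y, uint8_t u, uint8_t v) {
  return y > 40 && u > 100 && u < 130 && v > 130 && v < 180;
}

/* Center-pixel skin test for a bw x bh luma block with 4:2:0 chroma,
 * mirroring the y/uv center offsets computed in the patch. */
static int block_is_skin(const uint8_t *y_buf, int y_stride,
                         const uint8_t *u_buf, const uint8_t *v_buf,
                         int uv_stride, int bw, int bh) {
  const int cx = bw >> 1, cy = bh >> 1;            /* luma center */
  const uint8_t ys = y_buf[cy * y_stride + cx];
  const uint8_t us = u_buf[(cy >> 1) * uv_stride + (cx >> 1)];
  const uint8_t vs = v_buf[(cy >> 1) * uv_stride + (cx >> 1)];
  return skin_pixel(ys, us, vs);
}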
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h
index a5b3813..edf0a97 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -14,6 +14,8 @@
 
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_skin_detection.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -93,7 +95,8 @@
 void vp9_cyclic_refresh_update_segment(struct VP9_COMP *const cpi,
                                        MB_MODE_INFO *const mbmi,
                                        int mi_row, int mi_col, BLOCK_SIZE bsize,
-                                       int64_t rate, int64_t dist, int skip);
+                                       int64_t rate, int64_t dist, int skip,
+                                       struct macroblock_plane *const p);
 
 void vp9_cyclic_refresh_update_sb_postencode(struct VP9_COMP *const cpi,
                                              const MB_MODE_INFO *const mbmi,
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index c643b18..c07eee9 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -496,6 +496,8 @@
         threshold_base = 3 * threshold_base;
       else if (noise_level == kMedium)
         threshold_base = threshold_base << 1;
+      else if (noise_level < kLow)
+        threshold_base = (7 * threshold_base) >> 3;
     }
     if (cm->width <= 352 && cm->height <= 288) {
       thresholds[0] = threshold_base >> 3;
@@ -556,16 +558,16 @@
     if (x8_idx < pixels_wide && y8_idx < pixels_high) {
 #if CONFIG_VP9_HIGHBITDEPTH
       if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        vp9_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+        vpx_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
                               d + y8_idx * dp + x8_idx, dp,
                               &min, &max);
       } else {
-        vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+        vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
                        d + y8_idx * dp + x8_idx, dp,
                        &min, &max);
       }
 #else
-      vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+      vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
                      d + y8_idx * dp + x8_idx, dp,
                      &min, &max);
 #endif
@@ -597,18 +599,18 @@
       int d_avg = 128;
 #if CONFIG_VP9_HIGHBITDEPTH
       if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+        s_avg = vpx_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
         if (!is_key_frame)
-          d_avg = vp9_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+          d_avg = vpx_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
       } else {
-        s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+        s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
         if (!is_key_frame)
-          d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+          d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
       }
 #else
-      s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+      s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
       if (!is_key_frame)
-        d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+        d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
 #endif
       sum = s_avg - d_avg;
       sse = sum * sum;
@@ -636,18 +638,18 @@
       int d_avg = 128;
 #if CONFIG_VP9_HIGHBITDEPTH
       if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+        s_avg = vpx_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
         if (!is_key_frame)
-          d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+          d_avg = vpx_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
       } else {
-        s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+        s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
         if (!is_key_frame)
-          d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+          d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
       }
 #else
-      s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+      s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
       if (!is_key_frame)
-        d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+        d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
 #endif
       sum = s_avg - d_avg;
       sse = sum * sum;
@@ -668,6 +670,8 @@
   v64x64 vt;
   v16x16 vt2[16];
   int force_split[21];
+  int avg_32x32;
+  int avg_16x16[4];
   uint8_t *s;
   const uint8_t *d;
   int sp;
@@ -676,9 +680,13 @@
   int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
       cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]};
 
+  // For the variance computation under SVC mode, we treat the frame as key if
+  // the reference (base-layer frame) is a key frame (i.e., is_key_frame == 1).
+  const int is_key_frame = (cm->frame_type == KEY_FRAME ||
+      (is_one_pass_cbr_svc(cpi) &&
+      cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame));
   // Always use 4x4 partition for key frame.
-  const int is_key_frame = (cm->frame_type == KEY_FRAME);
-  const int use_4x4_partition = is_key_frame;
+  const int use_4x4_partition = cm->frame_type == KEY_FRAME;
   const int low_res = (cm->width <= 352 && cm->height <= 288);
   int variance4x4downsample[16];
 
@@ -704,8 +712,7 @@
   s = x->plane[0].src.buf;
   sp = x->plane[0].src.stride;
 
-  if (!is_key_frame && !(is_one_pass_cbr_svc(cpi) &&
-      cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)) {
+  if (!is_key_frame) {
     // In the case of spatial/temporal scalable coding, the assumption here is
     // that the temporal reference frame will always be of type LAST_FRAME.
     // TODO(marpan): If that assumption is broken, we need to revisit this code.
@@ -819,6 +826,7 @@
     const int y32_idx = ((i >> 1) << 5);
     const int i2 = i << 2;
     force_split[i + 1] = 0;
+    avg_16x16[i] = 0;
     for (j = 0; j < 4; j++) {
       const int x16_idx = x32_idx + ((j & 1) << 4);
       const int y16_idx = y32_idx + ((j >> 1) << 4);
@@ -836,6 +844,7 @@
                             is_key_frame);
         fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
         get_variance(&vt.split[i].split[j].part_variances.none);
+        avg_16x16[i] += vt.split[i].split[j].part_variances.none.variance;
         if (vt.split[i].split[j].part_variances.none.variance >
             thresholds[2]) {
           // 16X16 variance is above threshold for split, so force split to 8x8
@@ -862,9 +871,7 @@
           }
         }
       }
-      // TODO(marpan): There is an issue with variance based on 4x4 average in
-      // svc mode, don't allow it for now.
-      if (is_key_frame || (low_res && !cpi->use_svc &&
+      if (is_key_frame || (low_res &&
           vt.split[i].split[j].part_variances.none.variance >
           (thresholds[1] << 1))) {
         force_split[split_index] = 0;
@@ -886,8 +893,8 @@
       }
     }
   }
-
   // Fill the rest of the variance tree by summing split partition values.
+  avg_32x32 = 0;
   for (i = 0; i < 4; i++) {
     const int i2 = i << 2;
     for (j = 0; j < 4; j++) {
@@ -908,19 +915,30 @@
       }
     }
     fill_variance_tree(&vt.split[i], BLOCK_32X32);
-    // If variance of this 32x32 block is above the threshold, force the block
-    // to split. This also forces a split on the upper (64x64) level.
+    // If the variance of this 32x32 block is above the threshold, or if it is
+    // above (a multiple of) the average variance over the sub-16x16 blocks,
+    // then force this block to split. This also forces a split on the upper
+    // (64x64) level.
     if (!force_split[i + 1]) {
       get_variance(&vt.split[i].part_variances.none);
-      if (vt.split[i].part_variances.none.variance > thresholds[1]) {
+      if (vt.split[i].part_variances.none.variance > thresholds[1] ||
+          (!is_key_frame &&
+          vt.split[i].part_variances.none.variance > (thresholds[1] >> 1) &&
+          vt.split[i].part_variances.none.variance > (avg_16x16[i] >> 1))) {
         force_split[i + 1] = 1;
         force_split[0] = 1;
       }
+      avg_32x32 += vt.split[i].part_variances.none.variance;
     }
   }
   if (!force_split[0]) {
     fill_variance_tree(&vt, BLOCK_64X64);
     get_variance(&vt.part_variances.none);
+    // If the variance of this 64x64 block is above (a multiple of) the average
+    // variance over the sub-32x32 blocks, then force this block to split.
+    if (!is_key_frame &&
+        vt.part_variances.none.variance > (5 * avg_32x32) >> 4)
+      force_split[0] = 1;
   }
 
   // Now go through the entire structure, splitting every block size until
@@ -1027,7 +1045,7 @@
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
       vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row,
                                         mi_col, bsize, ctx->rate, ctx->dist,
-                                        x->skip);
+                                        x->skip, p);
     }
   }
 
@@ -1687,6 +1705,7 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  struct macroblock_plane *const p = x->plane;
   const struct segmentation *const seg = &cm->seg;
   const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
   const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
@@ -1707,7 +1726,7 @@
     } else {
     // Setting segmentation map for cyclic_refresh.
       vp9_cyclic_refresh_update_segment(cpi, mbmi, mi_row, mi_col, bsize,
-                                        ctx->rate, ctx->dist, x->skip);
+                                        ctx->rate, ctx->dist, x->skip, p);
     }
     vp9_init_plane_quantizers(cpi, x);
   }
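
Beyond the vpx_dsp renames, the variance-partition hunks above add two relative tests on top of the absolute thresholds: a 32x32 block is also forced to split when its variance is at least half the split threshold and exceeds twice the mean of its four 16x16 variances, and a 64x64 block when its variance exceeds 1.25x the mean of the accumulated (non-split) 32x32 variances. The predicates in isolation, with names simplified; avg16_sum and avg32_sum stand for the sums kept in avg_16x16[] and avg_32x32:

#include <stdint.h>

/* var32 vs. the absolute threshold, or vs. the four 16x16 variances:
 * (avg16_sum >> 1) is half their sum, i.e. twice their mean. */
static int force_split_32x32(int64_t var32, int64_t threshold1,
                             int64_t avg16_sum, int is_key_frame) {
  if (var32 > threshold1)
    return 1;
  return !is_key_frame &&
         var32 > (threshold1 >> 1) &&
         var32 > (avg16_sum >> 1);
}

/* (5 * avg32_sum) >> 4 is 5/16 of the sum of four 32x32 variances,
 * i.e. 1.25x their mean. */
static int force_split_64x64(int64_t var64, int64_t avg32_sum,
                             int is_key_frame) {
  return !is_key_frame && var64 > ((5 * avg32_sum) >> 4);
}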
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 72fa828..e4681f6 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1478,7 +1478,11 @@
   cpi->td.mb.e_mbd.bd = (int)cm->bit_depth;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-  rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
+  if ((oxcf->pass == 0) && (oxcf->rc_mode == VPX_Q)) {
+    rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+  } else {
+    rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
+  }
 
   cpi->refresh_golden_frame = 0;
   cpi->refresh_last_frame = 1;
@@ -2793,6 +2797,22 @@
                                    cpi->resize_pending);
   }
 #endif
+  if (is_one_pass_cbr_svc(cpi)) {
+    // Keep track of frame index for each reference frame.
+    SVC *const svc = &cpi->svc;
+    if (cm->frame_type == KEY_FRAME) {
+      svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe;
+      svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe;
+      svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe;
+    } else {
+      if (cpi->refresh_last_frame)
+        svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe;
+      if (cpi->refresh_golden_frame)
+        svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe;
+      if (cpi->refresh_alt_ref_frame)
+        svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe;
+    }
+  }
 }
 
 static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
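
The block added above stamps each reference-buffer slot with the superframe at which it was last refreshed; vp9_pickmode.c (further below) compares the stamps against current_superframe to recognize same-time spatial references. Condensed, the update rule is as follows; the slot indices are the encoder's lst/gld/alt_fb_idx and this helper is purely illustrative:

static void stamp_ref_slots(int *ref_frame_index, int current_superframe,
                            int lst, int gld, int alt, int is_key_frame,
                            int refresh_last, int refresh_golden,
                            int refresh_alt) {
  /* On a key frame every slot is refreshed; otherwise only flagged slots.
   * A slot then matches current_superframe exactly when it holds a frame
   * from the superframe being coded. */
  if (is_key_frame || refresh_last)
    ref_frame_index[lst] = current_superframe;
  if (is_key_frame || refresh_golden)
    ref_frame_index[gld] = current_superframe;
  if (is_key_frame || refresh_alt)
    ref_frame_index[alt] = current_superframe;
}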
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 327ac19..a84202b 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1755,7 +1755,7 @@
   int center, offset = 0;
   int bw = 4 << bwl;  // redundant variable, to be changed in the experiments.
   for (d = 0; d <= bw; d += 16) {
-    this_sad = vp9_vector_var(&ref[d], src, bwl);
+    this_sad = vpx_vector_var(&ref[d], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       offset = d;
@@ -1768,7 +1768,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1781,7 +1781,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1794,7 +1794,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1807,7 +1807,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1876,25 +1876,25 @@
   // Set up prediction 1-D reference set
   ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
   for (idx = 0; idx < search_width; idx += 16) {
-    vp9_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+    vpx_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
     ref_buf += 16;
   }
 
   ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
   for (idx = 0; idx < search_height; ++idx) {
-    vbuf[idx] = vp9_int_pro_col(ref_buf, bw) >> norm_factor;
+    vbuf[idx] = vpx_int_pro_col(ref_buf, bw) >> norm_factor;
     ref_buf += ref_stride;
   }
 
   // Set up src 1-D reference set
   for (idx = 0; idx < bw; idx += 16) {
     src_buf = x->plane[0].src.buf + idx;
-    vp9_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+    vpx_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
   }
 
   src_buf = x->plane[0].src.buf;
   for (idx = 0; idx < bh; ++idx) {
-    src_vbuf[idx] = vp9_int_pro_col(src_buf, bw) >> norm_factor;
+    src_vbuf[idx] = vpx_int_pro_col(src_buf, bw) >> norm_factor;
     src_buf += src_stride;
   }
 
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 90650db..b929758 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -619,14 +619,14 @@
                                   scan_order->scan, scan_order->iscan);
             break;
           case TX_16X16:
-            vp9_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
+            vpx_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
             vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                             pd->dequant, eob,
                             scan_order->scan, scan_order->iscan);
             break;
           case TX_8X8:
-            vp9_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
+            vpx_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
             vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                             pd->dequant, eob,
@@ -673,7 +673,7 @@
         if (*eob == 1)
           *rate += (int)abs(qcoeff[0]);
         else if (*eob > 1)
-          *rate += vp9_satd((const int16_t *)qcoeff, step << 4);
+          *rate += vpx_satd((const int16_t *)qcoeff, step << 4);
 
         *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
       }
@@ -1094,6 +1094,7 @@
                          BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
   VP9_COMMON *const cm = &cpi->common;
   SPEED_FEATURES *const sf = &cpi->sf;
+  const SVC *const svc = &cpi->svc;
   TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
@@ -1143,6 +1144,7 @@
   int best_pred_sad = INT_MAX;
   int best_early_term = 0;
   int ref_frame_cost[MAX_REF_FRAMES];
+  int svc_force_zero_mode[3] = {0};
 #if CONFIG_VP9_TEMPORAL_DENOISING
   int64_t zero_last_cost_orig = INT64_MAX;
 #endif
@@ -1196,6 +1198,17 @@
   } else {
     usable_ref_frame = GOLDEN_FRAME;
   }
+
+  // If the reference is temporally aligned with the current superframe
+  // (e.g., a spatial reference within the superframe), constrain the inter
+  // mode search: for now only test zero motion.
+  if (cpi->use_svc && svc->force_zero_mode_spatial_ref) {
+    if (svc->ref_frame_index[cpi->lst_fb_idx] == svc->current_superframe)
+      svc_force_zero_mode[LAST_FRAME - 1] = 1;
+    if (svc->ref_frame_index[cpi->gld_fb_idx] == svc->current_superframe)
+      svc_force_zero_mode[GOLDEN_FRAME - 1] = 1;
+  }
+
   for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
     const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
 
@@ -1248,8 +1261,13 @@
       continue;
 
     ref_frame = ref_mode_set[idx].ref_frame;
-    if (cpi->use_svc)
+    if (cpi->use_svc) {
       ref_frame = ref_mode_set_svc[idx].ref_frame;
+      if (svc_force_zero_mode[ref_frame - 1] &&
+          frame_mv[this_mode][ref_frame].as_int != 0)
+        continue;
+    }
+
     if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
       continue;
     if (const_motion[ref_frame] && this_mode == NEARMV)
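
With the stamps in place, the pickmode changes above constrain the SVC mode search: a reference slot refreshed within the current superframe can only be a spatial (same presentation time) reference, so any candidate with a nonzero motion vector against it is skipped. A minimal sketch of the gate, with the SVC struct reduced to the fields involved:

#define REF_FRAMES 8

typedef struct {
  int ref_frame_index[REF_FRAMES];  /* superframe stamp per slot */
  int current_superframe;
  int force_zero_mode_spatial_ref;  /* set in ..._svc_start_layer() */
} SvcSketch;

/* Nonzero iff only ZEROMV should be searched against the given slot. */
static int spatial_ref_zero_mv_only(const SvcSketch *svc, int fb_idx) {
  return svc->force_zero_mode_spatial_ref &&
         svc->ref_frame_index[fb_idx] == svc->current_superframe;
}

In the mode loop this corresponds to the new "svc_force_zero_mode[ref_frame - 1] && frame_mv[...].as_int != 0" continue test shown above.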
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 8ab51cd..2579c60 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -833,10 +833,16 @@
   ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
 
   if (frame_is_intra_only(cm)) {
-    // Handle the special case for key frames forced when we have reached
-    // the maximum key frame interval. Here force the Q to a range
-    // based on the ambient Q to reduce the risk of popping.
-    if (rc->this_key_frame_forced) {
+    if (oxcf->rc_mode == VPX_Q) {
+      int qindex = cq_level;
+      double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      int delta_qindex = vp9_compute_qdelta(rc, q, q * 0.25,
+                                            cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+    } else if (rc->this_key_frame_forced) {
+      // Handle the special case for key frames forced when we have reached
+      // the maximum key frame interval. Here force the Q to a range
+      // based on the ambient Q to reduce the risk of popping.
       int qindex = rc->last_boosted_qindex;
       double last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
       int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
@@ -886,17 +892,28 @@
       active_best_quality = active_best_quality * 15 / 16;
 
     } else if (oxcf->rc_mode == VPX_Q) {
-      if (!cpi->refresh_alt_ref_frame) {
-        active_best_quality = cq_level;
-      } else {
-        active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
-      }
+      int qindex = cq_level;
+      double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      int delta_qindex;
+      if (cpi->refresh_alt_ref_frame)
+        delta_qindex = vp9_compute_qdelta(rc, q, q * 0.40, cm->bit_depth);
+      else
+        delta_qindex = vp9_compute_qdelta(rc, q, q * 0.50, cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
     } else {
       active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
     }
   } else {
     if (oxcf->rc_mode == VPX_Q) {
-      active_best_quality = cq_level;
+      int qindex = cq_level;
+      double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      double delta_rate[FIXED_GF_INTERVAL] =
+          {0.50, 1.0, 0.85, 1.0, 0.70, 1.0, 0.85, 1.0};
+      int delta_qindex =
+          vp9_compute_qdelta(rc, q,
+                             q * delta_rate[cm->current_video_frame %
+                             FIXED_GF_INTERVAL], cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
     } else {
       // Use the lower of active_worst_quality and recent/average Q.
       if (cm->current_video_frame > 1)
@@ -1313,9 +1330,9 @@
       }
     }
   } else {
-    if (rc->is_src_frame_alt_ref ||
-        !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) ||
-        (cpi->use_svc && oxcf->rc_mode == VPX_CBR)) {
+    if ((cpi->use_svc && oxcf->rc_mode == VPX_CBR) ||
+        (!rc->is_src_frame_alt_ref &&
+         !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
       rc->last_q[INTER_FRAME] = qindex;
       rc->avg_frame_qindex[INTER_FRAME] =
         ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
@@ -1722,29 +1739,36 @@
                                   RATE_CONTROL *const rc) {
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
 
-  // Set Maximum gf/arf interval
-  rc->max_gf_interval = oxcf->max_gf_interval;
-  rc->min_gf_interval = oxcf->min_gf_interval;
-  if (rc->min_gf_interval == 0)
-    rc->min_gf_interval = vp9_rc_get_default_min_gf_interval(
-        oxcf->width, oxcf->height, cpi->framerate);
-  if (rc->max_gf_interval == 0)
-    rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
-        cpi->framerate, rc->min_gf_interval);
+  // Special case code for 1 pass fixed Q mode tests
+  if ((oxcf->pass == 0) && (oxcf->rc_mode == VPX_Q)) {
+    rc->max_gf_interval = FIXED_GF_INTERVAL;
+    rc->min_gf_interval = FIXED_GF_INTERVAL;
+    rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
+  } else {
+    // Set Maximum gf/arf interval
+    rc->max_gf_interval = oxcf->max_gf_interval;
+    rc->min_gf_interval = oxcf->min_gf_interval;
+    if (rc->min_gf_interval == 0)
+      rc->min_gf_interval = vp9_rc_get_default_min_gf_interval(
+          oxcf->width, oxcf->height, cpi->framerate);
+    if (rc->max_gf_interval == 0)
+      rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
+          cpi->framerate, rc->min_gf_interval);
 
-  // Extended interval for genuinely static scenes
-  rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
+    // Extended interval for genuinely static scenes
+    rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
 
-  if (is_altref_enabled(cpi)) {
-    if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
-      rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
+    if (is_altref_enabled(cpi)) {
+      if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
+        rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
+    }
+
+    if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
+      rc->max_gf_interval = rc->static_scene_max_gf_interval;
+
+    // Clamp min to max
+    rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval);
   }
-
-  if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
-    rc->max_gf_interval = rc->static_scene_max_gf_interval;
-
-  // Clamp min to max
-  rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval);
 }
 
 void vp9_rc_update_framerate(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 136fd3e..3df909c 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -26,6 +26,7 @@
 
 #define MIN_GF_INTERVAL     4
 #define MAX_GF_INTERVAL     16
+#define FIXED_GF_INTERVAL   8    // Used in some testing modes only
 #define ONEHALFONLY_RESIZE  0
 
 typedef enum {
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 318d810..c5f0bad 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -394,7 +394,7 @@
           sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V;
       } else {
         for (i = 0; i < BLOCK_SIZES; ++i)
-          if (i >= BLOCK_16X16)
+          if (i > BLOCK_16X16)
             sf->intra_y_mode_bsize_mask[i] = INTRA_DC;
           else
             // Use H and V intra mode for block sizes <= 16X16.
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index b0617c1..a4e7eb1 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -25,13 +25,17 @@
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   int mi_rows = cpi->common.mi_rows;
   int mi_cols = cpi->common.mi_cols;
-  int sl, tl;
+  int sl, tl, i;
   int alt_ref_idx = svc->number_spatial_layers;
 
   svc->spatial_layer_id = 0;
   svc->temporal_layer_id = 0;
   svc->first_spatial_layer_to_encode = 0;
   svc->rc_drop_superframe = 0;
+  svc->force_zero_mode_spatial_ref = 0;
+  svc->current_superframe = 0;
+  for (i = 0; i < REF_FRAMES; ++i)
+    svc->ref_frame_index[i] = -1;
 
   if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
     if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img,
@@ -353,6 +357,8 @@
                               cpi->svc.number_temporal_layers];
   ++lc->current_video_frame_in_layer;
   ++lc->frames_from_key_frame;
+  if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+    ++cpi->svc.current_superframe;
 }
 
 int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) {
@@ -542,6 +548,7 @@
 int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
   int width = 0, height = 0;
   LAYER_CONTEXT *lc = NULL;
+  cpi->svc.force_zero_mode_spatial_ref = 1;
 
   if (cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) {
     set_flags_and_fb_idx_for_temporal_mode3(cpi);
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index 5dbf9b4..1f446d7 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -83,6 +83,9 @@
   int ext_lst_fb_idx[VPX_MAX_LAYERS];
   int ext_gld_fb_idx[VPX_MAX_LAYERS];
   int ext_alt_fb_idx[VPX_MAX_LAYERS];
+  int ref_frame_index[REF_FRAMES];
+  int force_zero_mode_spatial_ref;
+  int current_superframe;
 } SVC;
 
 struct VP9_COMP;
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 7e4c61f..015dbc0 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -205,15 +205,34 @@
 
   for (i = 0, k = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++, k++) {
-      int src_byte = frame1[byte];
-      int pixel_value = *frame2++;
+      int pixel_value = *frame2;
+      int diff_sse[9] = { 0 };
+      int idx, idy, index = 0;
 
-      modifier   = src_byte - pixel_value;
-      // This is an integer approximation of:
-      // float coeff = (3.0 * modifer * modifier) / pow(2, strength);
-      // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
-      modifier *= modifier;
+      for (idy = -1; idy <= 1; ++idy) {
+        for (idx = -1; idx <= 1; ++idx) {
+          int row = i + idy;
+          int col = j + idx;
+
+          if (row >= 0 && row < (int)block_height &&
+              col >= 0 && col < (int)block_width) {
+            int diff = frame1[byte + idy * (int)stride + idx] -
+                frame2[idy * (int)block_width + idx];
+            diff_sse[index] = diff * diff;
+            ++index;
+          }
+        }
+      }
+      assert(index > 0);
+
+      modifier = 0;
+      for (idx = 0; idx < 9; ++idx)
+        modifier += diff_sse[idx];
+
       modifier *= 3;
+      modifier /= index;
+
+      ++frame2;
       modifier += rounding;
       modifier >>= strength;
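
The rewritten loop above replaces the single-pixel squared difference with three times the mean squared difference over the valid pixels of a 3x3 window, which smooths the per-pixel blending weights near block edges and noise. The modifier computation in isolation; the trailing clamp to a 0..16 weight follows the pre-existing code (per the deleted comment), and rounding is assumed to be 1 << (strength - 1) as elsewhere in the filter:

#include <stdint.h>

/* Per-pixel filter weight for position (i, j): 3 * mean SSE over the 3x3
 * neighborhood, scaled down by strength, then mapped to 16 - value. */
static int filter_weight_3x3(const uint8_t *frame1, int stride,
                             const uint8_t *frame2, int block_width,
                             int block_height, int i, int j,
                             int strength, int rounding) {
  int idx, idy, count = 0, modifier = 0;
  for (idy = -1; idy <= 1; ++idy) {
    for (idx = -1; idx <= 1; ++idx) {
      const int row = i + idy, col = j + idx;
      if (row >= 0 && row < block_height && col >= 0 && col < block_width) {
        const int diff = frame1[row * stride + col] -
                         frame2[row * block_width + col];
        modifier += diff * diff;
        ++count;
      }
    }
  }
  modifier = modifier * 3 / count;  /* count >= 4 for any in-block pixel */
  modifier += rounding;
  modifier >>= strength;
  return modifier > 16 ? 0 : 16 - modifier;
}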
 
@@ -406,56 +425,58 @@
           if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
             int adj_strength = strength + 2 * (mbd->bd - 8);
             // Apply the filter (YUV)
-            vp9_highbd_temporal_filter_apply(f->y_buffer + mb_y_offset,
-                                             f->y_stride,
-                                             predictor, 16, 16, adj_strength,
-                                             filter_weight,
-                                             accumulator, count);
-            vp9_highbd_temporal_filter_apply(f->u_buffer + mb_uv_offset,
-                                             f->uv_stride, predictor + 256,
-                                             mb_uv_width, mb_uv_height,
-                                             adj_strength,
-                                             filter_weight, accumulator + 256,
-                                             count + 256);
-            vp9_highbd_temporal_filter_apply(f->v_buffer + mb_uv_offset,
-                                             f->uv_stride, predictor + 512,
-                                             mb_uv_width, mb_uv_height,
-                                             adj_strength, filter_weight,
-                                             accumulator + 512, count + 512);
+            vp9_highbd_temporal_filter_apply_c(f->y_buffer + mb_y_offset,
+                                               f->y_stride,
+                                               predictor, 16, 16, adj_strength,
+                                               filter_weight,
+                                               accumulator, count);
+            vp9_highbd_temporal_filter_apply_c(f->u_buffer + mb_uv_offset,
+                                               f->uv_stride, predictor + 256,
+                                               mb_uv_width, mb_uv_height,
+                                               adj_strength,
+                                               filter_weight, accumulator + 256,
+                                               count + 256);
+            vp9_highbd_temporal_filter_apply_c(f->v_buffer + mb_uv_offset,
+                                               f->uv_stride, predictor + 512,
+                                               mb_uv_width, mb_uv_height,
+                                               adj_strength, filter_weight,
+                                               accumulator + 512, count + 512);
           } else {
             // Apply the filter (YUV)
-            vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
-                                      predictor, 16, 16,
-                                      strength, filter_weight,
-                                      accumulator, count);
-            vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
-                                      predictor + 256,
-                                      mb_uv_width, mb_uv_height, strength,
-                                      filter_weight, accumulator + 256,
-                                      count + 256);
-            vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
-                                      predictor + 512,
-                                      mb_uv_width, mb_uv_height, strength,
-                                      filter_weight, accumulator + 512,
-                                      count + 512);
+            vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
+                                        predictor, 16, 16,
+                                        strength, filter_weight,
+                                        accumulator, count);
+            vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset,
+                                        f->uv_stride,
+                                        predictor + 256,
+                                        mb_uv_width, mb_uv_height, strength,
+                                        filter_weight, accumulator + 256,
+                                        count + 256);
+            vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset,
+                                        f->uv_stride,
+                                        predictor + 512,
+                                        mb_uv_width, mb_uv_height, strength,
+                                        filter_weight, accumulator + 512,
+                                        count + 512);
           }
 #else
           // Apply the filter (YUV)
           // TODO(jingning): Need SIMD optimization for this.
           vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
-                                    predictor, 16, 16,
-                                    strength, filter_weight,
-                                    accumulator, count);
+                                      predictor, 16, 16,
+                                      strength, filter_weight,
+                                      accumulator, count);
           vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
-                                    predictor + 256,
-                                    mb_uv_width, mb_uv_height, strength,
-                                    filter_weight, accumulator + 256,
-                                    count + 256);
+                                      predictor + 256,
+                                      mb_uv_width, mb_uv_height, strength,
+                                      filter_weight, accumulator + 256,
+                                      count + 256);
           vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
-                                    predictor + 512,
-                                    mb_uv_width, mb_uv_height, strength,
-                                    filter_weight, accumulator + 512,
-                                    count + 512);
+                                      predictor + 512,
+                                      mb_uv_width, mb_uv_height, strength,
+                                      filter_weight, accumulator + 512,
+                                      count + 512);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
         }
       }
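For readers following the temporal-filter change above: the new code replaces the single squared pixel difference with an average of the squared differences over a 3x3 neighborhood, clipped at the block borders, before the strength shift. A self-contained C sketch of that per-pixel computation (the function name and the absolute indexing from the block origin are ours, not the patch's exact code):

#include <stdint.h>

/* Illustrative model of the neighborhood-averaged modifier. */
static int averaged_modifier_sketch(const uint8_t *frame1,
                                    const uint8_t *frame2,
                                    int stride, int block_width,
                                    int block_height, int i, int j,
                                    int strength, int rounding) {
  int modifier = 0, count = 0, idy, idx;
  for (idy = -1; idy <= 1; ++idy) {
    for (idx = -1; idx <= 1; ++idx) {
      const int row = i + idy;
      const int col = j + idx;
      if (row >= 0 && row < block_height && col >= 0 && col < block_width) {
        const int diff = frame1[row * stride + col] -
                         frame2[row * block_width + col];
        modifier += diff * diff;  // squared difference of this neighbor pair
        ++count;
      }
    }
  }
  // Same integer approximation as before, now applied to the local average.
  modifier = modifier * 3 / count;
  return (modifier + rounding) >> strength;
}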
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 5918240..de688bf 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -17,7 +17,6 @@
 
 VP9_CX_SRCS-yes += vp9_cx_iface.c
 
-VP9_CX_SRCS-yes += encoder/vp9_avg.c
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.c
 VP9_CX_SRCS-yes += encoder/vp9_context_tree.c
 VP9_CX_SRCS-yes += encoder/vp9_context_tree.h
@@ -93,7 +92,6 @@
 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
 
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
 VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
@@ -114,7 +112,6 @@
 ifeq ($(ARCH_X86_64),yes)
 ifeq ($(CONFIG_USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3_x86_64.asm
 endif
 endif
 
@@ -131,10 +128,8 @@
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c
 endif
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
 
-VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c
diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vpx_dsp/arm/avg_neon.c
similarity index 93%
rename from vp9/encoder/arm/neon/vp9_avg_neon.c
rename to vpx_dsp/arm/avg_neon.c
index 78467ce..d054c41 100644
--- a/vp9/encoder/arm/neon/vp9_avg_neon.c
+++ b/vpx_dsp/arm/avg_neon.c
@@ -11,7 +11,7 @@
 #include <arm_neon.h>
 #include <assert.h>
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 
 #include "vpx/vpx_integer.h"
@@ -24,7 +24,7 @@
   return vget_lane_u32(c, 0);
 }
 
-unsigned int vp9_avg_4x4_neon(const uint8_t *s, int p) {
+unsigned int vpx_avg_4x4_neon(const uint8_t *s, int p) {
   uint16x8_t v_sum;
   uint32x2_t v_s0 = vdup_n_u32(0);
   uint32x2_t v_s1 = vdup_n_u32(0);
@@ -36,7 +36,7 @@
   return (horizontal_add_u16x8(v_sum) + 8) >> 4;
 }
 
-unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
+unsigned int vpx_avg_8x8_neon(const uint8_t *s, int p) {
   uint8x8_t v_s0 = vld1_u8(s);
   const uint8x8_t v_s1 = vld1_u8(s + p);
   uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
@@ -64,7 +64,7 @@
 
 // coeff: 16 bits, dynamic range [-32640, 32640].
 // length: value range {16, 64, 256, 1024}.
-int vp9_satd_neon(const int16_t *coeff, int length) {
+int vpx_satd_neon(const int16_t *coeff, int length) {
   const int16x4_t zero = vdup_n_s16(0);
   int32x4_t accum = vdupq_n_s32(0);
 
@@ -89,7 +89,7 @@
   }
 }
 
-void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
+void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
                           const int ref_stride, const int height) {
   int i;
   uint16x8_t vec_sum_lo = vdupq_n_u16(0);
@@ -142,7 +142,7 @@
   vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
 }
 
-int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width) {
+int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
   int i;
   uint16x8_t vec_sum = vdupq_n_u16(0);
 
@@ -158,7 +158,7 @@
 
 // ref, src = [0, 510] - max diff = 16-bits
 // bwl = {2, 3, 4}, width = {16, 32, 64}
-int vp9_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
+int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
   int width = 4 << bwl;
   int32x4_t sse = vdupq_n_s32(0);
   int16x8_t total = vdupq_n_s16(0);
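The projection routines above vectorize a simple pair of sums whose C loop bodies are elided from this diff. A scalar model consistent with the prototypes (a sketch, not the file's exact code):

#include <stdint.h>

/* Vertical projection of a 16-wide strip, normalized by height/2. */
static void int_pro_row_model(int16_t hbuf[16], const uint8_t *ref,
                              int ref_stride, int height) {
  const int norm_factor = height >> 1;
  int r, c;
  for (c = 0; c < 16; ++c) {
    int sum = 0;
    for (r = 0; r < height; ++r) sum += ref[r * ref_stride + c];
    hbuf[c] = (int16_t)(sum / norm_factor);
  }
}

/* Horizontal projection: plain sum of one row of `width` pixels. */
static int16_t int_pro_col_model(const uint8_t *ref, int width) {
  int16_t sum = 0;
  int c;
  for (c = 0; c < width; ++c) sum += ref[c];
  return sum;
}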
diff --git a/vp9/encoder/vp9_avg.c b/vpx_dsp/avg.c
similarity index 86%
rename from vp9/encoder/vp9_avg.c
rename to vpx_dsp/avg.c
index 7baa09a..26fe785 100644
--- a/vp9/encoder/vp9_avg.c
+++ b/vpx_dsp/avg.c
@@ -7,11 +7,12 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 
-unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {
+unsigned int vpx_avg_8x8_c(const uint8_t *s, int p) {
   int i, j;
   int sum = 0;
   for (i = 0; i < 8; ++i, s+=p)
@@ -20,7 +21,7 @@
   return (sum + 32) >> 6;
 }
 
-unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) {
+unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) {
   int i, j;
   int sum = 0;
   for (i = 0; i < 4; ++i, s+=p)
@@ -61,7 +62,7 @@
   coeff[5] = c3 - c7;
 }
 
-void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
+void vpx_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
                         int16_t *coeff) {
   int idx;
   int16_t buffer[64];
@@ -84,14 +85,14 @@
 }
 
 // In place 16x16 2D Hadamard transform
-void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
+void vpx_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
                           int16_t *coeff) {
   int idx;
   for (idx = 0; idx < 4; ++idx) {
     // src_diff: 9 bit, dynamic range [-255, 255]
     int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
                                 + (idx & 0x01) * 8;
-    vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+    vpx_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
   }
 
   // coeff: 15 bit, dynamic range [-16320, 16320]
@@ -117,7 +118,7 @@
 
 // coeff: 16 bits, dynamic range [-32640, 32640].
 // length: value range {16, 64, 256, 1024}.
-int vp9_satd_c(const int16_t *coeff, int length) {
+int vpx_satd_c(const int16_t *coeff, int length) {
   int i;
   int satd = 0;
   for (i = 0; i < length; ++i)
@@ -129,7 +130,7 @@
 
 // Integer projection onto row vectors.
 // height: value range {16, 32, 64}.
-void vp9_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref,
+void vpx_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref,
                        const int ref_stride, const int height) {
   int idx;
   const int norm_factor = height >> 1;
@@ -146,7 +147,7 @@
 }
 
 // width: value range {16, 32, 64}.
-int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) {
+int16_t vpx_int_pro_col_c(uint8_t const *ref, const int width) {
   int idx;
   int16_t sum = 0;
   // sum: 14 bit, dynamic range [0, 16320]
@@ -158,7 +159,7 @@
 // ref: [0 - 510]
 // src: [0 - 510]
 // bwl: {2, 3, 4}
-int vp9_vector_var_c(int16_t const *ref, int16_t const *src,
+int vpx_vector_var_c(int16_t const *ref, int16_t const *src,
                      const int bwl) {
   int i;
   int width = 4 << bwl;
@@ -175,7 +176,7 @@
   return var;
 }
 
-void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
                       int *min, int *max) {
   int i, j;
   *min = 255;
@@ -190,7 +191,7 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p) {
   int i, j;
   int sum = 0;
   const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
@@ -200,7 +201,7 @@
   return (sum + 32) >> 6;
 }
 
-unsigned int vp9_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p) {
   int i, j;
   int sum = 0;
   const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
@@ -210,7 +211,7 @@
   return (sum + 8) >> 4;
 }
 
-void vp9_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
+void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
                              int dp, int *min, int *max) {
   int i, j;
   const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
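A minimal usage sketch of the renamed pair, assuming only the prototypes shown above: transform an 8x8 residual into the Hadamard domain, then take the sum of absolute transformed differences.

#include <stdint.h>
#include "./vpx_dsp_rtcd.h"

/* SATD of an 8x8 residual block via the Hadamard + satd pair. */
static int satd_8x8_sketch(const int16_t *src_diff, int src_stride) {
  int16_t coeff[64];  // 15-bit output range per the comments above
  vpx_hadamard_8x8_c(src_diff, src_stride, coeff);
  return vpx_satd_c(coeff, 64);  // length matches the 8x8 block
}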
diff --git a/vp9/encoder/mips/msa/vp9_avg_msa.c b/vpx_dsp/mips/avg_msa.c
similarity index 91%
rename from vp9/encoder/mips/msa/vp9_avg_msa.c
rename to vpx_dsp/mips/avg_msa.c
index 611adb1..52a24ed 100644
--- a/vp9/encoder/mips/msa/vp9_avg_msa.c
+++ b/vpx_dsp/mips/avg_msa.c
@@ -8,10 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/macros_msa.h"
 
-uint32_t vp9_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
+uint32_t vpx_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
   uint32_t sum_out;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
@@ -33,7 +33,7 @@
   return sum_out;
 }
 
-uint32_t vp9_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
+uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
   uint32_t sum_out;
   uint32_t src0, src1, src2, src3;
   v16u8 vec = { 0 };
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 9620eaa..e394688 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -252,6 +252,18 @@
 DSP_SRCS-$(HAVE_AVX)    += x86/quantize_avx_x86_64.asm
 endif
 endif
+
+# avg
+DSP_SRCS-yes           += avg.c
+DSP_SRCS-$(HAVE_SSE2)  += x86/avg_intrin_sse2.c
+DSP_SRCS-$(HAVE_NEON)  += arm/avg_neon.c
+DSP_SRCS-$(HAVE_MSA)   += mips/avg_msa.c
+ifeq ($(ARCH_X86_64),yes)
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
+endif
+endif
+
 endif  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
 
 ifeq ($(CONFIG_ENCODERS),yes)
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 5ce7134..a2a0674 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -226,7 +226,7 @@
 specialize qw/vpx_d63e_predictor_32x32/;
 
 add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_32x32 neon msa/, "$ssse3_x86inc";
+specialize qw/vpx_h_predictor_32x32 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d117_predictor_32x32/;
@@ -241,7 +241,7 @@
 specialize qw/vpx_v_predictor_32x32 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_32x32 neon msa/, "$sse2_x86_64_x86inc";
+specialize qw/vpx_tm_predictor_32x32 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_32x32 msa neon/, "$sse2_x86inc";
@@ -288,13 +288,13 @@
   specialize qw/vpx_highbd_d153_predictor_4x4/;
 
   add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_v_predictor_4x4/, "$sse_x86inc";
+  specialize qw/vpx_highbd_v_predictor_4x4/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse_x86inc";
+  specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse_x86inc";
+  specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_4x4/;
@@ -990,14 +990,43 @@
 specialize qw/vpx_sad8x4 msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x8 msa/, "$sse_x86inc";
+specialize qw/vpx_sad4x8 msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x4 mmx neon msa/, "$sse_x86inc";
+specialize qw/vpx_sad4x4 mmx neon msa/, "$sse2_x86inc";
 
 #
 # Avg
 #
+if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
+  add_proto qw/unsigned int vpx_avg_8x8/, "const uint8_t *, int p";
+  specialize qw/vpx_avg_8x8 sse2 neon msa/;
+
+  add_proto qw/unsigned int vpx_avg_4x4/, "const uint8_t *, int p";
+  specialize qw/vpx_avg_4x4 sse2 neon msa/;
+
+  add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+  specialize qw/vpx_minmax_8x8 sse2/;
+
+  add_proto qw/void vpx_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+  specialize qw/vpx_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";
+
+  add_proto qw/void vpx_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+  specialize qw/vpx_hadamard_16x16 sse2/;
+
+  add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
+  specialize qw/vpx_satd sse2 neon/;
+
+  add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
+  specialize qw/vpx_int_pro_row sse2 neon/;
+
+  add_proto qw/int16_t vpx_int_pro_col/, "uint8_t const *ref, const int width";
+  specialize qw/vpx_int_pro_col sse2 neon/;
+
+  add_proto qw/int vpx_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
+  specialize qw/vpx_vector_var neon sse2/;
+}  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+
 add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vpx_sad64x64_avg avx2 msa/, "$sse2_x86inc";
 
@@ -1032,10 +1061,10 @@
 specialize qw/vpx_sad8x4_avg msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x8_avg msa/, "$sse_x86inc";
+specialize qw/vpx_sad4x8_avg msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x4_avg msa/, "$sse_x86inc";
+specialize qw/vpx_sad4x4_avg msa/, "$sse2_x86inc";
 
 #
 # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
@@ -1195,6 +1224,13 @@
   #
   # Avg
   #
+  add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p";
+  specialize qw/vpx_highbd_avg_8x8/;
+  add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";
+  specialize qw/vpx_highbd_avg_4x4/;
+  add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+  specialize qw/vpx_highbd_minmax_8x8/;
+
   add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
   specialize qw/vpx_highbd_sad64x64_avg/, "$sse2_x86inc";
 
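These add_proto/specialize entries drive the generated vpx_dsp_rtcd.h dispatch header. A hedged C sketch of how callers are expected to use the unsuffixed names (the dispatch resolves to the best available specialization at run time and falls back to the _c version; the wrapper function here is illustrative):

#include <stdint.h>
#include "./vpx_dsp_rtcd.h"

/* Callers use the unsuffixed name declared by add_proto above. */
static unsigned int block_average_sketch(const uint8_t *src, int stride) {
  return vpx_avg_8x8(src, stride);  // may resolve to vpx_avg_8x8_sse2, etc.
}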
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c
similarity index 95%
rename from vp9/encoder/x86/vp9_avg_intrin_sse2.c
rename to vpx_dsp/x86/avg_intrin_sse2.c
index 4414871..f9af6cf 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vpx_dsp/x86/avg_intrin_sse2.c
@@ -10,10 +10,10 @@
 
 #include <emmintrin.h>
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 
-void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                          int *min, int *max) {
   __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
   u0  = _mm_setzero_si128();
@@ -91,7 +91,7 @@
   *min = _mm_extract_epi16(minabsdiff, 0);
 }
 
-unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
+unsigned int vpx_avg_8x8_sse2(const uint8_t *s, int p) {
   __m128i s0, s1, u0;
   unsigned int avg = 0;
   u0  = _mm_setzero_si128();
@@ -118,7 +118,7 @@
   return (avg + 32) >> 6;
 }
 
-unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
+unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) {
   __m128i s0, s1, u0;
   unsigned int avg = 0;
   u0  = _mm_setzero_si128();
@@ -212,7 +212,7 @@
   }
 }
 
-void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
+void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
                            int16_t *coeff) {
   __m128i src[8];
   src[0] = _mm_load_si128((const __m128i *)src_diff);
@@ -244,13 +244,13 @@
   _mm_store_si128((__m128i *)coeff, src[7]);
 }
 
-void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
+void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
                              int16_t *coeff) {
   int idx;
   for (idx = 0; idx < 4; ++idx) {
     int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
                                 + (idx & 0x01) * 8;
-    vp9_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
+    vpx_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
   }
 
   for (idx = 0; idx < 64; idx += 8) {
@@ -283,7 +283,7 @@
   }
 }
 
-int vp9_satd_sse2(const int16_t *coeff, int length) {
+int vpx_satd_sse2(const int16_t *coeff, int length) {
   int i;
   const __m128i zero = _mm_setzero_si128();
   __m128i accum = zero;
@@ -309,7 +309,7 @@
   return _mm_cvtsi128_si32(accum);
 }
 
-void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
+void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
                           const int ref_stride, const int height) {
   int idx;
   __m128i zero = _mm_setzero_si128();
@@ -358,7 +358,7 @@
   _mm_storeu_si128((__m128i *)hbuf, s1);
 }
 
-int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
+int16_t vpx_int_pro_col_sse2(uint8_t const *ref, const int width) {
   __m128i zero = _mm_setzero_si128();
   __m128i src_line = _mm_load_si128((const __m128i *)ref);
   __m128i s0 = _mm_sad_epu8(src_line, zero);
@@ -378,7 +378,7 @@
   return _mm_extract_epi16(s0, 0);
 }
 
-int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src,
+int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src,
                         const int bwl) {
   int idx;
   int width = 4 << bwl;
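A condensed sketch of the 8x8 averaging idea the intrinsics above implement: sum each row of 8 pixels against zero with _mm_sad_epu8, accumulate, and round as the C reference does with (sum + 32) >> 6. Illustrative, not the file's exact instruction sequence:

#include <emmintrin.h>
#include <stdint.h>

static unsigned int avg_8x8_sse2_sketch(const uint8_t *s, int p) {
  const __m128i zero = _mm_setzero_si128();
  __m128i acc = zero;
  int i;
  for (i = 0; i < 8; ++i, s += p) {
    const __m128i row = _mm_loadl_epi64((const __m128i *)s);  // 8 pixels
    acc = _mm_add_epi32(acc, _mm_sad_epu8(row, zero));        // row sum
  }
  return ((unsigned int)_mm_cvtsi128_si32(acc) + 32) >> 6;
}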
diff --git a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm b/vpx_dsp/x86/avg_ssse3_x86_64.asm
similarity index 96%
rename from vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
rename to vpx_dsp/x86/avg_ssse3_x86_64.asm
index 74c52df..26412e8 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/avg_ssse3_x86_64.asm
@@ -8,11 +8,11 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-%define private_prefix vp9
+%define private_prefix vpx
 
 %include "third_party/x86inc/x86inc.asm"
 
-; This file provides SSSE3 version of the forward transformation. Part
+; This file provides the SSSE3 version of the Hadamard transform. Part
 ; of the macro definitions are originally derived from the ffmpeg project.
 ; The current version applies to x86 64-bit only.
 
diff --git a/vpx_dsp/x86/highbd_intrapred_sse2.asm b/vpx_dsp/x86/highbd_intrapred_sse2.asm
index b12d29c..233958a 100644
--- a/vpx_dsp/x86/highbd_intrapred_sse2.asm
+++ b/vpx_dsp/x86/highbd_intrapred_sse2.asm
@@ -17,24 +17,20 @@
 pw_32: times 4 dd 32
 
 SECTION .text
-INIT_MMX sse
+INIT_XMM sse2
 cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   movq                  m0, [aboveq]
   movq                  m2, [leftq]
-  DEFINE_ARGS dst, stride, one
-  mov                 oned, 0x0001
-  pxor                  m1, m1
-  movd                  m3, oned
-  pshufw                m3, m3, 0x0
   paddw                 m0, m2
-  pmaddwd               m0, m3
-  packssdw              m0, m1
-  pmaddwd               m0, m3
+  pshuflw               m1, m0, 0xe
+  paddw                 m0, m1
+  pshuflw               m1, m0, 0x1
+  paddw                 m0, m1
   paddw                 m0, [GLOBAL(pw_4)]
   psraw                 m0, 3
-  pshufw                m0, m0, 0x0
+  pshuflw               m0, m0, 0x0
   movq    [dstq          ], m0
   movq    [dstq+strideq*2], m0
   lea                 dstq, [dstq+strideq*4]
@@ -183,7 +179,7 @@
   REP_RET
 %endif
 
-INIT_MMX sse
+INIT_XMM sse2
 cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
   movq                  m0, [aboveq]
   movq    [dstq          ], m0
@@ -261,43 +257,44 @@
   jnz .loop
   REP_RET
 
-INIT_MMX sse
-cglobal highbd_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one
+INIT_XMM sse2
+cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
   movd                  m1, [aboveq-2]
   movq                  m0, [aboveq]
-  pshufw                m1, m1, 0x0
+  pshuflw               m1, m1, 0x0
+  movlhps               m0, m0         ; t1 t2 t3 t4 t1 t2 t3 t4
+  movlhps               m1, m1         ; tl tl tl tl tl tl tl tl
   ; Get the values to compute the maximum value at this bit depth
-  mov                 oned, 1
-  movd                  m3, oned
+  pcmpeqw               m3, m3
   movd                  m4, bpsd
-  pshufw                m3, m3, 0x0
-  DEFINE_ARGS dst, stride, line, left
-  mov                lineq, -2
-  mova                  m2, m3
+  psubw                 m0, m1         ; t1-tl t2-tl t3-tl t4-tl
   psllw                 m3, m4
-  add                leftq, 8
-  psubw                 m3, m2 ; max possible value
-  pxor                  m4, m4 ; min possible value
-  psubw                 m0, m1
-.loop:
-  movq                  m1, [leftq+lineq*4]
-  movq                  m2, [leftq+lineq*4+2]
-  pshufw                m1, m1, 0x0
-  pshufw                m2, m2, 0x0
-  paddw                 m1, m0
+  pcmpeqw               m2, m2
+  pxor                  m4, m4         ; min possible value
+  pxor                  m3, m2         ; max possible value
+  mova                  m1, [leftq]
+  pshuflw               m2, m1, 0x0
+  pshuflw               m5, m1, 0x55
+  movlhps               m2, m5         ; l1 l1 l1 l1 l2 l2 l2 l2
   paddw                 m2, m0
   ;Clamp to the bit-depth
-  pminsw                m1, m3
   pminsw                m2, m3
-  pmaxsw                m1, m4
   pmaxsw                m2, m4
   ;Store the values
-  movq    [dstq          ], m1
-  movq    [dstq+strideq*2], m2
+  movq    [dstq          ], m2
+  movhpd  [dstq+strideq*2], m2
   lea                 dstq, [dstq+strideq*4]
-  inc                lineq
-  jnz .loop
-  REP_RET
+  pshuflw               m2, m1, 0xaa
+  pshuflw               m5, m1, 0xff
+  movlhps               m2, m5
+  paddw                 m2, m0
+  ;Clamp to the bit-depth
+  pminsw                m2, m3
+  pmaxsw                m2, m4
+  ;Store the values
+  movq    [dstq          ], m2
+  movhpd  [dstq+strideq*2], m2
+  RET
 
 INIT_XMM sse2
 cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
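The rewritten highbd 4x4 TM (TrueMotion) predictor above computes pred(r, c) = clamp(left[r] + above[c] - topleft, 0, (1 << bd) - 1), with the bit-depth maximum built by the pcmpeqw/psllw/pxor triple. A scalar model (a sketch; names are ours):

#include <stddef.h>
#include <stdint.h>

static void highbd_tm_4x4_model(uint16_t *dst, ptrdiff_t stride,
                                const uint16_t *above, const uint16_t *left,
                                int bd) {
  const int max = (1 << bd) - 1;  // built via pcmpeqw/psllw/pxor in the asm
  const int tl = above[-1];       // [aboveq-2] in the asm
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      const int v = left[r] + above[c] - tl;
      dst[c] = (uint16_t)(v < 0 ? 0 : v > max ? max : v);  // clamp to depth
    }
    dst += stride;
  }
}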
diff --git a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
index 5f9c963..30ee81b 100644
--- a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -79,20 +79,13 @@
 
 %macro INC_SRC_BY_SRC_STRIDE  0
 %if ARCH_X86=1 && CONFIG_PIC=1
-  lea                srcq, [srcq + src_stridemp*2]
+  add                srcq, src_stridemp
+  add                srcq, src_stridemp
 %else
   lea                srcq, [srcq + src_strideq*2]
 %endif
 %endmacro
 
-%macro INC_SRC_BY_SRC_2STRIDE  0
-%if ARCH_X86=1 && CONFIG_PIC=1
-  lea                srcq, [srcq + src_stridemp*4]
-%else
-  lea                srcq, [srcq + src_strideq*4]
-%endif
-%endmacro
-
 %macro SUBPEL_VARIANCE 1-2 0 ; W
 %define bilin_filter_m bilin_filter_m_sse2
 %define filter_idx_shift 5
@@ -123,7 +116,10 @@
       %define sec_str sec_stridemp
 
       ; Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -131,7 +127,6 @@
       lea ecx, [GLOBAL(pw_8)]
       mov g_pw_8m, ecx
 
-      RESTORE_GOT               ; restore esp
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %else
       cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
@@ -140,7 +135,10 @@
       %define block_height heightd
 
       ; Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -148,7 +146,6 @@
       lea ecx, [GLOBAL(pw_8)]
       mov g_pw_8m, ecx
 
-      RESTORE_GOT               ; restore esp
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %endif
   %else
@@ -980,8 +977,9 @@
 .x_other_y_other_loop:
   movu                 m2, [srcq]
   movu                 m4, [srcq+2]
-  movu                 m3, [srcq+src_strideq*2]
-  movu                 m5, [srcq+src_strideq*2+2]
+  INC_SRC_BY_SRC_STRIDE
+  movu                 m3, [srcq]
+  movu                 m5, [srcq+2]
   pmullw               m2, filter_x_a
   pmullw               m4, filter_x_b
   paddw                m2, filter_rnd
@@ -1014,7 +1012,7 @@
   SUM_SSE              m0, m2, m4, m3, m6, m7
   mova                 m0, m5
 
-  INC_SRC_BY_SRC_2STRIDE
+  INC_SRC_BY_SRC_STRIDE
   lea                dstq, [dstq + dst_strideq * 4]
 %if %2 == 1 ; avg
   add                secq, sec_str
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index b45331c..81ec5db 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -243,13 +243,18 @@
 }
 
 #if CONFIG_USE_X86INC
+// The two unused parameters are placeholders for the PIC-enabled build.
+// These definitions are for functions defined in
+// highbd_subpel_variance_impl_sse2.asm
 #define DECL(w, opt) \
   int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
                                                  ptrdiff_t src_stride, \
                                                  int x_offset, int y_offset, \
                                                  const uint16_t *dst, \
                                                  ptrdiff_t dst_stride, \
-                                                 int height, unsigned int *sse);
+                                                 int height, \
+                                                 unsigned int *sse, \
+                                                 void *unused0, void *unused);
 #define DECLS(opt1, opt2) \
   DECL(8, opt1); \
   DECL(16, opt1)
@@ -274,7 +279,7 @@
   int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
                                                        x_offset, y_offset, \
                                                        dst, dst_stride, h, \
-                                                       &sse); \
+                                                       &sse, NULL, NULL); \
   if (w > wf) { \
     unsigned int sse2; \
     int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
@@ -282,19 +287,20 @@
                                                           x_offset, y_offset, \
                                                           dst + 16, \
                                                           dst_stride, \
-                                                          h, &sse2); \
+                                                          h, &sse2, \
+                                                          NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                         x_offset, y_offset, \
                                                         dst + 32, dst_stride, \
-                                                        h, &sse2); \
+                                                        h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
           src + 48, src_stride, x_offset, y_offset, \
-          dst + 48, dst_stride, h, &sse2); \
+          dst + 48, dst_stride, h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -312,7 +318,7 @@
   int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
                                                        x_offset, y_offset, \
                                                        dst, dst_stride, \
-                                                       h, &sse); \
+                                                       h, &sse, NULL, NULL); \
   if (w > wf) { \
     uint32_t sse2; \
     int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
@@ -320,20 +326,21 @@
                                                           x_offset, y_offset, \
                                                           dst + 16, \
                                                           dst_stride, \
-                                                          h, &sse2); \
+                                                          h, &sse2, \
+                                                          NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                         x_offset, y_offset, \
                                                         dst + 32, dst_stride, \
-                                                        h, &sse2); \
+                                                        h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
                                                         x_offset, y_offset, \
                                                         dst + 48, dst_stride, \
-                                                        h, &sse2); \
+                                                        h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -359,27 +366,27 @@
     int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
         src + (start_row * src_stride), src_stride, \
         x_offset, y_offset, dst + (start_row * dst_stride), \
-        dst_stride, height, &sse2); \
+        dst_stride, height, &sse2, NULL, NULL); \
     se += se2; \
     long_sse += sse2; \
     if (w > wf) { \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
           src + 16 + (start_row * src_stride), src_stride, \
           x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
-          dst_stride, height, &sse2); \
+          dst_stride, height, &sse2, NULL, NULL); \
       se += se2; \
       long_sse += sse2; \
       if (w > wf * 2) { \
         se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
             src + 32 + (start_row * src_stride), src_stride, \
             x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
-            dst_stride, height, &sse2); \
+            dst_stride, height, &sse2, NULL, NULL); \
         se += se2; \
         long_sse += sse2; \
         se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
             src + 48 + (start_row * src_stride), src_stride, \
             x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
-            dst_stride, height, &sse2); \
+            dst_stride, height, &sse2, NULL, NULL); \
         se += se2; \
         long_sse += sse2; \
       }\
@@ -410,6 +417,7 @@
 #undef FNS
 #undef FN
 
+// The two unused parameters are placeholders for the PIC-enabled build.
 #define DECL(w, opt) \
 int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
                                                    ptrdiff_t src_stride, \
@@ -419,7 +427,8 @@
                                                    const uint16_t *sec, \
                                                    ptrdiff_t sec_stride, \
                                                    int height, \
-                                                   unsigned int *sse);
+                                                   unsigned int *sse, \
+                                                   void *unused0, void *unused);
 #define DECLS(opt1) \
 DECL(16, opt1) \
 DECL(8, opt1)
@@ -439,23 +448,23 @@
   uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
   int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                src, src_stride, x_offset, \
-               y_offset, dst, dst_stride, sec, w, h, &sse); \
+               y_offset, dst, dst_stride, sec, w, h, &sse, NULL, NULL); \
   if (w > wf) { \
     uint32_t sse2; \
     int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                   src + 16, src_stride, x_offset, y_offset, \
-                  dst + 16, dst_stride, sec + 16, w, h, &sse2); \
+                  dst + 16, dst_stride, sec + 16, w, h, &sse2, NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                 src + 32, src_stride, x_offset, y_offset, \
-                dst + 32, dst_stride, sec + 32, w, h, &sse2); \
+                dst + 32, dst_stride, sec + 32, w, h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                 src + 48, src_stride, x_offset, y_offset, \
-                dst + 48, dst_stride, sec + 48, w, h, &sse2); \
+                dst + 48, dst_stride, sec + 48, w, h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -475,14 +484,15 @@
   int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                                             src, src_stride, x_offset, \
                                             y_offset, dst, dst_stride, \
-                                            sec, w, h, &sse); \
+                                            sec, w, h, &sse, NULL, NULL); \
   if (w > wf) { \
     uint32_t sse2; \
     int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                                             src + 16, src_stride, \
                                             x_offset, y_offset, \
                                             dst + 16, dst_stride, \
-                                            sec + 16, w, h, &sse2); \
+                                            sec + 16, w, h, &sse2, \
+                                            NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
@@ -490,14 +500,16 @@
                                             src + 32, src_stride, \
                                             x_offset, y_offset, \
                                             dst + 32, dst_stride, \
-                                            sec + 32, w, h, &sse2); \
+                                            sec + 32, w, h, &sse2, \
+                                            NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                                             src + 48, src_stride, \
                                             x_offset, y_offset, \
                                             dst + 48, dst_stride, \
-                                            sec + 48, w, h, &sse2); \
+                                            sec + 48, w, h, &sse2, \
+                                            NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -525,7 +537,7 @@
     int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                 src + (start_row * src_stride), src_stride, x_offset, \
                 y_offset, dst + (start_row * dst_stride), dst_stride, \
-                sec + (start_row * w), w, height, &sse2); \
+                sec + (start_row * w), w, height, &sse2, NULL, NULL); \
     se += se2; \
     long_sse += sse2; \
     if (w > wf) { \
@@ -533,7 +545,7 @@
                 src + 16 + (start_row * src_stride), src_stride, \
                 x_offset, y_offset, \
                 dst + 16 + (start_row * dst_stride), dst_stride, \
-                sec + 16 + (start_row * w), w, height, &sse2); \
+                sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \
       se += se2; \
       long_sse += sse2; \
       if (w > wf * 2) { \
@@ -541,14 +553,14 @@
                 src + 32 + (start_row * src_stride), src_stride, \
                 x_offset, y_offset, \
                 dst + 32 + (start_row * dst_stride), dst_stride, \
-                sec + 32 + (start_row * w), w, height, &sse2); \
+                sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \
         se += se2; \
         long_sse += sse2; \
         se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                 src + 48 + (start_row * src_stride), src_stride, \
                 x_offset, y_offset, \
                 dst + 48 + (start_row * dst_stride), dst_stride, \
-                sec + 48 + (start_row * w), w, height, &sse2); \
+                sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \
         se += se2; \
         long_sse += sse2; \
       } \
diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm
index edbf05e..c24d536 100644
--- a/vpx_dsp/x86/intrapred_sse2.asm
+++ b/vpx_dsp/x86/intrapred_sse2.asm
@@ -47,9 +47,9 @@
 
 INIT_XMM sse2
 cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
+  movifnidn          leftq, leftmp
   GET_GOT     goffsetq
 
-  movifnidn          leftq, leftmp
   pxor                  m1, m1
   movd                  m0, [leftq]
   psadbw                m0, m1
@@ -143,9 +143,9 @@
 
 INIT_XMM sse2
 cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
+  movifnidn          leftq, leftmp
   GET_GOT     goffsetq
 
-  movifnidn          leftq, leftmp
   pxor                  m1, m1
   movq                  m0, [leftq]
   DEFINE_ARGS dst, stride, stride3
@@ -239,14 +239,11 @@
   GET_GOT     goffsetq
 
   pxor                  m1, m1
-  pxor                  m2, m2
   mova                  m0, [aboveq]
   DEFINE_ARGS dst, stride, stride3, lines4
   lea             stride3q, [strideq*3]
   mov              lines4d, 4
   psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
   movhlps               m2, m0
   paddw                 m0, m2
   paddw                 m0, [GLOBAL(pw2_16)]
@@ -271,14 +268,11 @@
   GET_GOT     goffsetq
 
   pxor                  m1, m1
-  pxor                  m2, m2
   mova                  m0, [leftq]
   DEFINE_ARGS dst, stride, stride3, lines4
   lea             stride3q, [strideq*3]
   mov              lines4d, 4
   psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
   movhlps               m2, m0
   paddw                 m0, m2
   paddw                 m0, [GLOBAL(pw2_16)]
@@ -582,6 +576,34 @@
   REP_RET
 
 INIT_XMM sse2
+cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
+  movifnidn              leftq, leftmp
+  mov                    lineq, -8
+  DEFINE_ARGS dst, stride, line, left, stride3
+  lea                 stride3q, [strideq*3]
+.loop:
+  movd                      m0, [leftq]
+  punpcklbw                 m0, m0
+  punpcklbw                 m0, m0              ; l1 to l4 each repeated 4 times
+  pshufd                m1, m0, 0x0             ; l1 repeated 16 times
+  pshufd                m2, m0, 0x55            ; l2 repeated 16 times
+  mova     [dstq             ], m1
+  mova     [dstq+16          ], m1
+  mova     [dstq+strideq     ], m2
+  mova     [dstq+strideq+16  ], m2
+  pshufd                m1, m0, 0xaa
+  pshufd                m2, m0, 0xff
+  mova     [dstq+strideq*2   ], m1
+  mova     [dstq+strideq*2+16], m1
+  mova     [dstq+stride3q    ], m2
+  mova     [dstq+stride3q+16 ], m2
+  inc                    lineq
+  lea                    leftq, [leftq+4       ]
+  lea                     dstq, [dstq+strideq*4]
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
 cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left
   pxor                  m1, m1
   movq                  m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x
@@ -641,45 +663,45 @@
   REP_RET
 
 INIT_XMM sse2
-cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left
+cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left
   pxor                  m1, m1
-  movd                  m2, [aboveq-1]
-  mova                  m0, [aboveq]
-  punpcklbw             m2, m1
+  mova                  m2, [aboveq-16];
+  mova                  m0, [aboveq]   ; t1 t2 ... t16 [byte]
+  punpckhbw             m2, m1         ; [127:112] tl [word]
   punpckhbw             m4, m0, m1
-  punpcklbw             m0, m1
-  pshuflw               m2, m2, 0x0
-  DEFINE_ARGS dst, stride, line, left
+  punpcklbw             m0, m1         ; m0:m4 t1 t2 ... t16 [word]
+  DEFINE_ARGS dst, stride, line, left, stride8
   mov                lineq, -8
-  punpcklqdq            m2, m2
-  add                leftq, 16
+  pshufhw               m2, m2, 0xff
+  mova                  m3, [leftq]    ; l1 l2 ... l16 [byte]
+  punpckhqdq            m2, m2         ; tl repeated 8 times [word]
   psubw                 m0, m2
-  psubw                 m4, m2
+  psubw                 m4, m2         ; m0:m4 t1-tl t2-tl ... t16-tl [word]
+  punpckhbw             m5, m3, m1
+  punpcklbw             m3, m1         ; m3:m5 l1 l2 ... l16 [word]
+  lea             stride8q, [strideq*8]
 .loop:
-  movd                  m2, [leftq+lineq*2]
-  movd                  m3, [leftq+lineq*2+1]
-  punpcklbw             m2, m1
-  punpcklbw             m3, m1
-  pshuflw               m2, m2, 0x0
-  pshuflw               m3, m3, 0x0
-  punpcklqdq            m2, m2
-  punpcklqdq            m3, m3
-  paddw                 m5, m2, m0
-  paddw                 m6, m3, m0
-  paddw                 m2, m4
-  paddw                 m3, m4
-  packuswb              m5, m2
-  packuswb              m6, m3
-  mova      [dstq        ], m5
-  mova      [dstq+strideq], m6
-  lea                 dstq, [dstq+strideq*2]
+  pshuflw               m6, m3, 0x0
+  pshuflw               m7, m5, 0x0
+  punpcklqdq            m6, m6         ; l1 repeated 8 times [word]
+  punpcklqdq            m7, m7         ; l8 repeated 8 times [word]
+  paddw                 m1, m6, m0
+  paddw                 m6, m4         ; m1:m6 ti-tl+l1 [i=1,15] [word]
+  psrldq                m5, 2
+  packuswb              m1, m6
+  mova     [dstq         ], m1
+  paddw                 m1, m7, m0
+  paddw                 m7, m4         ; m1:m7 ti-tl+l8 [i=1,15] [word]
+  psrldq                m3, 2
+  packuswb              m1, m7
+  mova     [dstq+stride8q], m1
   inc                lineq
+  lea                 dstq, [dstq+strideq]
   jnz .loop
   REP_RET
 
-%if ARCH_X86_64
 INIT_XMM sse2
-cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left
+cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left
   pxor                  m1, m1
   movd                  m2, [aboveq-1]
   mova                  m0, [aboveq]
@@ -700,31 +722,29 @@
   psubw                 m5, m2
 .loop:
   movd                  m2, [leftq+lineq*2]
-  movd                  m6, [leftq+lineq*2+1]
+  pxor                  m1, m1
   punpcklbw             m2, m1
-  punpcklbw             m6, m1
+  pshuflw               m7, m2, 0x55
   pshuflw               m2, m2, 0x0
-  pshuflw               m6, m6, 0x0
   punpcklqdq            m2, m2
-  punpcklqdq            m6, m6
-  paddw                 m7, m2, m0
-  paddw                 m8, m2, m3
-  paddw                 m9, m2, m4
-  paddw                 m2, m5
-  packuswb              m7, m8
-  packuswb              m9, m2
-  paddw                 m2, m6, m0
-  paddw                 m8, m6, m3
-  mova   [dstq           ], m7
-  paddw                 m7, m6, m4
-  paddw                 m6, m5
-  mova   [dstq        +16], m9
-  packuswb              m2, m8
-  packuswb              m7, m6
-  mova   [dstq+strideq   ], m2
-  mova   [dstq+strideq+16], m7
+  punpcklqdq            m7, m7
+  paddw                 m6, m2, m3
+  paddw                 m1, m2, m0
+  packuswb              m1, m6
+  mova   [dstq           ], m1
+  paddw                 m6, m2, m5
+  paddw                 m1, m2, m4
+  packuswb              m1, m6
+  mova   [dstq+16        ], m1
+  paddw                 m6, m7, m3
+  paddw                 m1, m7, m0
+  packuswb              m1, m6
+  mova   [dstq+strideq   ], m1
+  paddw                 m6, m7, m5
+  paddw                 m1, m7, m4
+  packuswb              m1, m6
+  mova   [dstq+strideq+16], m1
   lea                 dstq, [dstq+strideq*2]
   inc                lineq
   jnz .loop
   REP_RET
-%endif
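The new SSE2 h_predictor_32x32 above replaces the SSSE3 version removed in the next file. Functionally it fills each row of the 32x32 block with the corresponding left-column pixel, four rows per iteration via the punpcklbw/pshufd broadcasts. A scalar model of the same output (a sketch):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void h_predictor_32x32_model(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left) {
  int r;
  for (r = 0; r < 32; ++r) {
    memset(dst, left[r], 32);  // broadcast left[r] across the row
    dst += stride;
  }
}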
diff --git a/vpx_dsp/x86/intrapred_ssse3.asm b/vpx_dsp/x86/intrapred_ssse3.asm
index f1a193b..d061278 100644
--- a/vpx_dsp/x86/intrapred_ssse3.asm
+++ b/vpx_dsp/x86/intrapred_ssse3.asm
@@ -33,26 +33,6 @@
 
 SECTION .text
 
-INIT_XMM ssse3
-cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left
-  movifnidn          leftq, leftmp
-  add                leftq, 32
-  mov                lineq, -16
-  pxor                  m0, m0
-.loop:
-  movd                  m1, [leftq+lineq*2  ]
-  movd                  m2, [leftq+lineq*2+1]
-  pshufb                m1, m0
-  pshufb                m2, m0
-  mova   [dstq           ], m1
-  mova   [dstq        +16], m1
-  mova   [dstq+strideq   ], m2
-  mova   [dstq+strideq+16], m2
-  lea                 dstq, [dstq+strideq*2]
-  inc                lineq
-  jnz .loop
-  REP_RET
-
 INIT_MMX ssse3
 cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
   GET_GOT     goffsetq
diff --git a/vpx_dsp/x86/sad_sse2.asm b/vpx_dsp/x86/sad_sse2.asm
index 0defe1b..1ec906c 100644
--- a/vpx_dsp/x86/sad_sse2.asm
+++ b/vpx_dsp/x86/sad_sse2.asm
@@ -17,7 +17,7 @@
 %if %3 == 5
 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
 %else ; %3 == 7
-cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
+cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                             src_stride3, ref_stride3, n_rows
 %endif ; %3 == 5/7
 %else ; avg
@@ -25,7 +25,7 @@
 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                     second_pred, n_rows
 %else ; %3 == 7
-cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
+cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
                                               ref, ref_stride, \
                                               second_pred, \
                                               src_stride3, ref_stride3
@@ -222,8 +222,8 @@
 SAD8XN  8, 1 ; sad8x8_avg_sse2
 SAD8XN  4, 1 ; sad8x4_avg_sse2
 
-; unsigned int vpx_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
-;                                  uint8_t *ref, int ref_stride);
+; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
+;                                   uint8_t *ref, int ref_stride);
 %macro SAD4XN 1-2 0
   SAD_FN 4, %1, 7, %2
   mov              n_rowsd, %1/4
@@ -236,31 +236,32 @@
   movd                  m4, [refq+ref_stride3q]
   punpckldq             m1, m2
   punpckldq             m3, m4
+  movlhps               m1, m3
 %if %2 == 1
   pavgb                 m1, [second_predq+mmsize*0]
-  pavgb                 m3, [second_predq+mmsize*1]
-  lea         second_predq, [second_predq+mmsize*2]
+  lea         second_predq, [second_predq+mmsize*1]
 %endif
   movd                  m2, [srcq]
   movd                  m5, [srcq+src_strideq]
   movd                  m4, [srcq+src_strideq*2]
-  movd                  m6, [srcq+src_stride3q]
+  movd                  m3, [srcq+src_stride3q]
   punpckldq             m2, m5
-  punpckldq             m4, m6
+  punpckldq             m4, m3
+  movlhps               m2, m4
   psadbw                m1, m2
-  psadbw                m3, m4
   lea                 refq, [refq+ref_strideq*4]
   paddd                 m0, m1
   lea                 srcq, [srcq+src_strideq*4]
-  paddd                 m0, m3
   dec              n_rowsd
   jg .loop
 
+  movhlps               m1, m0
+  paddd                 m0, m1
   movd                 eax, m0
   RET
 %endmacro
 
-INIT_MMX sse
+INIT_XMM sse2
 SAD4XN  8 ; sad4x8_sse
 SAD4XN  4 ; sad4x4_sse
 SAD4XN  8, 1 ; sad4x8_avg_sse
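The 4xN SAD rework above packs four 4-pixel rows into a single xmm register (punpckldq + movlhps), so each loop iteration needs one psadbw instead of two while producing the same result as this scalar reference (a sketch):

#include <stdint.h>
#include <stdlib.h>

static unsigned int sad4xn_model(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride, int n) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < n; ++r) {
    for (c = 0; c < 4; ++c) sad += abs(src[c] - ref[c]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}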
diff --git a/vpx_dsp/x86/subpel_variance_sse2.asm b/vpx_dsp/x86/subpel_variance_sse2.asm
index 1176a2f..c655e4b 100644
--- a/vpx_dsp/x86/subpel_variance_sse2.asm
+++ b/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -139,7 +139,10 @@
       %define sec_str sec_stridemp
 
       ;Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -147,7 +150,6 @@
       lea ecx, [GLOBAL(pw_8)]
       mov g_pw_8m, ecx
 
-      RESTORE_GOT               ; restore esp
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %else
       cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
@@ -156,7 +158,10 @@
       %define block_height heightd
 
       ;Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -164,7 +169,6 @@
       lea ecx, [GLOBAL(pw_8)]
       mov g_pw_8m, ecx
 
-      RESTORE_GOT               ; restore esp
       LOAD_IF_USED 0, 1         ; load eax, ecx back
     %endif
   %else
diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm
index c94b76a..708fa10 100644
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@@ -189,7 +189,6 @@
 %if ABI_IS_32BIT
   %if CONFIG_PIC=1
   %ifidn __OUTPUT_FORMAT__,elf32
-    %define GET_GOT_SAVE_ARG 1
     %define WRT_PLT wrt ..plt
     %macro GET_GOT 1
       extern _GLOBAL_OFFSET_TABLE_
@@ -208,7 +207,6 @@
       %define RESTORE_GOT pop %1
     %endmacro
   %elifidn __OUTPUT_FORMAT__,macho32
-    %define GET_GOT_SAVE_ARG 1
     %macro GET_GOT 1
       push %1
       call %%get_got
diff --git a/vpxenc.c b/vpxenc.c
index fd2acf6..afbaeac 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -2060,9 +2060,11 @@
 
 #if !CONFIG_WEBM_IO
     FOREACH_STREAM({
-      stream->config.write_webm = 0;
-      warn("vpxenc was compiled without WebM container support."
-           "Producing IVF output");
+      if (stream->config.write_webm) {
+        stream->config.write_webm = 0;
+        warn("vpxenc was compiled without WebM container support."
+             "Producing IVF output");
+      }
     });
 #endif