Merge "mips msa vp9 avg subpel variance optimization rebased"

diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 66ca4bb..5a4b1ed 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc

@@ -901,14 +901,6 @@
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans16x16HT,
     ::testing::Values(
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 1, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 2, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 3, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 0, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 1, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 2, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 3, VPX_BITS_12),
         make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),

diff --git a/test/decode_api_test.cc b/test/decode_api_test.cc
index 42ac13f..32be1f4 100644
--- a/test/decode_api_test.cc
+++ b/test/decode_api_test.cc

@@ -129,8 +129,13 @@
   vpx_codec_ctx_t dec;
   EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, NULL, 0));
   const uint32_t frame_size = static_cast<uint32_t>(video.frame_size());
+#if CONFIG_VP9_HIGHBITDEPTH
   EXPECT_EQ(VPX_CODEC_MEM_ERROR,
             vpx_codec_decode(&dec, video.cxdata(), frame_size, NULL, 0));
+#else
+  EXPECT_EQ(VPX_CODEC_UNSUP_BITSTREAM,
+            vpx_codec_decode(&dec, video.cxdata(), frame_size, NULL, 0));
+#endif
   vpx_codec_iter_t iter = NULL;
   EXPECT_EQ(NULL, vpx_codec_get_frame(&dec, &iter));
 

diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 6294af1..08a5ff6 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc

@@ -494,6 +494,14 @@
         make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));
 #endif
 
+#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && \
+    !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans4x4WHT,
+    ::testing::Values(
+        make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_sse2, 0, VPX_BITS_8)));
+#endif
+
 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4DCT,
@@ -523,14 +531,6 @@
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 1, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 2, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 3, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 0, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 1, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 2, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 3, VPX_BITS_12),
         make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
         make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),

diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index 2cbbc6b..1b5ef5c 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc

@@ -112,7 +112,9 @@
 
 const DecodeParam kVP9InvalidFileTests[] = {
   {1, "invalid-vp90-02-v2.webm"},
+#if CONFIG_VP9_HIGHBITDEPTH
   {1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf"},
+#endif
   {1, "invalid-vp90-03-v3.webm"},
   {1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf"},
   {1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf"},

diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index ba51309..96aaa23 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc

@@ -19,7 +19,7 @@
 #include "test/util.h"
 
 #include "./vpx_config.h"
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vpx/vpx_integer.h"
@@ -60,49 +60,49 @@
 void wrapper_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh,
                               int count, int bd) {
-  vp9_highbd_lpf_vertical_16_sse2(s, p, blimit, limit, thresh, bd);
+  vpx_highbd_lpf_vertical_16_sse2(s, p, blimit, limit, thresh, bd);
 }
 
 void wrapper_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh,
                            int count, int bd) {
-  vp9_highbd_lpf_vertical_16_c(s, p, blimit, limit, thresh, bd);
+  vpx_highbd_lpf_vertical_16_c(s, p, blimit, limit, thresh, bd);
 }
 
 void wrapper_vertical_16_dual_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int count, int bd) {
-  vp9_highbd_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh, bd);
+  vpx_highbd_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh, bd);
 }
 
 void wrapper_vertical_16_dual_c(uint16_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int count, int bd) {
-  vp9_highbd_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh, bd);
+  vpx_highbd_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh, bd);
 }
 #else
 void wrapper_vertical_16_sse2(uint8_t *s, int p, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh,
                               int count) {
-  vp9_lpf_vertical_16_sse2(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_sse2(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh,
                            int count) {
-  vp9_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_dual_sse2(uint8_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int count) {
-  vp9_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int count) {
-  vp9_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2
@@ -114,25 +114,25 @@
 void wrapper_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh,
                               int count) {
-  vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh,
                            int count) {
-  vp9_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int count) {
-  vp9_lpf_vertical_16_dual_neon(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dual_neon(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int count) {
-  vp9_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON_ASM
@@ -141,13 +141,13 @@
 void wrapper_vertical_16_msa(uint8_t *s, int p, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh,
                              int count) {
-  vp9_lpf_vertical_16_msa(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_msa(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh,
                            int count) {
-  vp9_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
 }
 #endif  // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
 
@@ -534,46 +534,46 @@
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
-                   &vp9_highbd_lpf_horizontal_4_c, 8, 1),
-        make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
-                   &vp9_highbd_lpf_vertical_4_c, 8, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
-                   &vp9_highbd_lpf_horizontal_8_c, 8, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 8, 2),
-        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
-                   &vp9_highbd_lpf_vertical_8_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
+                   &vpx_highbd_lpf_horizontal_4_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
+                   &vpx_highbd_lpf_vertical_4_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
+                   &vpx_highbd_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
+                   &vpx_highbd_lpf_vertical_8_c, 8, 1),
         make_tuple(&wrapper_vertical_16_sse2,
                    &wrapper_vertical_16_c, 8, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
-                   &vp9_highbd_lpf_horizontal_4_c, 10, 1),
-        make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
-                   &vp9_highbd_lpf_vertical_4_c, 10, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
-                   &vp9_highbd_lpf_horizontal_8_c, 10, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 10, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 10, 2),
-        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
-                   &vp9_highbd_lpf_vertical_8_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
+                   &vpx_highbd_lpf_horizontal_4_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
+                   &vpx_highbd_lpf_vertical_4_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
+                   &vpx_highbd_lpf_horizontal_8_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 10, 2),
+        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
+                   &vpx_highbd_lpf_vertical_8_c, 10, 1),
         make_tuple(&wrapper_vertical_16_sse2,
                    &wrapper_vertical_16_c, 10, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
-                   &vp9_highbd_lpf_horizontal_4_c, 12, 1),
-        make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
-                   &vp9_highbd_lpf_vertical_4_c, 12, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
-                   &vp9_highbd_lpf_horizontal_8_c, 12, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 12, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 12, 2),
-        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
-                   &vp9_highbd_lpf_vertical_8_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
+                   &vpx_highbd_lpf_horizontal_4_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
+                   &vpx_highbd_lpf_vertical_4_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
+                   &vpx_highbd_lpf_horizontal_8_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 12, 2),
+        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
+                   &vpx_highbd_lpf_vertical_8_c, 12, 1),
         make_tuple(&wrapper_vertical_16_sse2,
                    &wrapper_vertical_16_c, 12, 1),
         make_tuple(&wrapper_vertical_16_dual_sse2,
@@ -586,10 +586,10 @@
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vp9_lpf_horizontal_8_sse2, &vp9_lpf_horizontal_8_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8, 2),
-        make_tuple(&vp9_lpf_vertical_8_sse2, &vp9_lpf_vertical_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_8_sse2, &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_lpf_vertical_8_sse2, &vpx_lpf_vertical_8_c, 8, 1),
         make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8, 1)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif
@@ -598,8 +598,8 @@
 INSTANTIATE_TEST_CASE_P(
     AVX2, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vp9_lpf_horizontal_16_avx2, &vp9_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_16_avx2, &vp9_lpf_horizontal_16_c, 8,
+        make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8,
                    2)));
 #endif
 
@@ -608,42 +608,42 @@
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test9Param,
     ::testing::Values(
-        make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_4_dual_c, 8),
-        make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_8_dual_c, 8),
-        make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
-                   &vp9_highbd_lpf_vertical_4_dual_c, 8),
-        make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
-                   &vp9_highbd_lpf_vertical_8_dual_c, 8),
-        make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_4_dual_c, 10),
-        make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_8_dual_c, 10),
-        make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
-                   &vp9_highbd_lpf_vertical_4_dual_c, 10),
-        make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
-                   &vp9_highbd_lpf_vertical_8_dual_c, 10),
-        make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_4_dual_c, 12),
-        make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_8_dual_c, 12),
-        make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
-                   &vp9_highbd_lpf_vertical_4_dual_c, 12),
-        make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
-                   &vp9_highbd_lpf_vertical_8_dual_c, 12)));
+        make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_vertical_4_dual_sse2,
+                   &vpx_highbd_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2,
+                   &vpx_highbd_lpf_vertical_8_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_4_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_8_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_vertical_4_dual_sse2,
+                   &vpx_highbd_lpf_vertical_4_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2,
+                   &vpx_highbd_lpf_vertical_8_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_4_dual_c, 12),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_8_dual_c, 12),
+        make_tuple(&vpx_highbd_lpf_vertical_4_dual_sse2,
+                   &vpx_highbd_lpf_vertical_4_dual_c, 12),
+        make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2,
+                   &vpx_highbd_lpf_vertical_8_dual_c, 12)));
 #else
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test9Param,
     ::testing::Values(
-        make_tuple(&vp9_lpf_horizontal_4_dual_sse2,
-                   &vp9_lpf_horizontal_4_dual_c, 8),
-        make_tuple(&vp9_lpf_horizontal_8_dual_sse2,
-                   &vp9_lpf_horizontal_8_dual_c, 8),
-        make_tuple(&vp9_lpf_vertical_4_dual_sse2,
-                   &vp9_lpf_vertical_4_dual_c, 8),
-        make_tuple(&vp9_lpf_vertical_8_dual_sse2,
-                   &vp9_lpf_vertical_8_dual_c, 8)));
+        make_tuple(&vpx_lpf_horizontal_4_dual_sse2,
+                   &vpx_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_dual_sse2,
+                   &vpx_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_dual_sse2,
+                   &vpx_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_dual_sse2,
+                   &vpx_lpf_vertical_8_dual_c, 8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif
 
@@ -657,36 +657,36 @@
 #if HAVE_NEON_ASM
 // Using #if inside the macro is unsupported on MSVS but the tests are not
 // currently built for MSVS with ARM and NEON.
-        make_tuple(&vp9_lpf_horizontal_16_neon,
-                   &vp9_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_16_neon,
-                   &vp9_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_lpf_horizontal_16_neon,
+                   &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_neon,
+                   &vpx_lpf_horizontal_16_c, 8, 2),
         make_tuple(&wrapper_vertical_16_neon,
                    &wrapper_vertical_16_c, 8, 1),
         make_tuple(&wrapper_vertical_16_dual_neon,
                    &wrapper_vertical_16_dual_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_8_neon,
-                   &vp9_lpf_horizontal_8_c, 8, 1),
-        make_tuple(&vp9_lpf_vertical_8_neon,
-                   &vp9_lpf_vertical_8_c, 8, 1),
 #endif  // HAVE_NEON_ASM
-        make_tuple(&vp9_lpf_horizontal_4_neon,
-                   &vp9_lpf_horizontal_4_c, 8, 1),
-        make_tuple(&vp9_lpf_vertical_4_neon,
-                   &vp9_lpf_vertical_4_c, 8, 1)));
+        make_tuple(&vpx_lpf_horizontal_8_neon,
+                   &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_lpf_vertical_8_neon,
+                   &vpx_lpf_vertical_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_4_neon,
+                   &vpx_lpf_horizontal_4_c, 8, 1),
+        make_tuple(&vpx_lpf_vertical_4_neon,
+                   &vpx_lpf_vertical_4_c, 8, 1)));
 INSTANTIATE_TEST_CASE_P(
     NEON, Loop8Test9Param,
     ::testing::Values(
 #if HAVE_NEON_ASM
-        make_tuple(&vp9_lpf_horizontal_8_dual_neon,
-                   &vp9_lpf_horizontal_8_dual_c, 8),
-        make_tuple(&vp9_lpf_vertical_8_dual_neon,
-                   &vp9_lpf_vertical_8_dual_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_dual_neon,
+                   &vpx_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_dual_neon,
+                   &vpx_lpf_vertical_8_dual_c, 8),
 #endif  // HAVE_NEON_ASM
-        make_tuple(&vp9_lpf_horizontal_4_dual_neon,
-                   &vp9_lpf_horizontal_4_dual_c, 8),
-        make_tuple(&vp9_lpf_vertical_4_dual_neon,
-                   &vp9_lpf_vertical_4_dual_c, 8)));
+        make_tuple(&vpx_lpf_horizontal_4_dual_neon,
+                   &vpx_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_dual_neon,
+                   &vpx_lpf_vertical_4_dual_c, 8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON
 
@@ -694,23 +694,23 @@
 INSTANTIATE_TEST_CASE_P(
     MSA, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vp9_lpf_horizontal_8_msa, &vp9_lpf_horizontal_8_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_16_msa, &vp9_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_16_msa, &vp9_lpf_horizontal_16_c, 8, 2),
-        make_tuple(&vp9_lpf_vertical_8_msa, &vp9_lpf_vertical_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_8_msa, &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_lpf_vertical_8_msa, &vpx_lpf_vertical_8_c, 8, 1),
         make_tuple(&wrapper_vertical_16_msa, &wrapper_vertical_16_c, 8, 1)));
 
 INSTANTIATE_TEST_CASE_P(
     MSA, Loop8Test9Param,
     ::testing::Values(
-        make_tuple(&vp9_lpf_horizontal_4_dual_msa,
-                   &vp9_lpf_horizontal_4_dual_c, 8),
-        make_tuple(&vp9_lpf_horizontal_8_dual_msa,
-                   &vp9_lpf_horizontal_8_dual_c, 8),
-        make_tuple(&vp9_lpf_vertical_4_dual_msa,
-                   &vp9_lpf_vertical_4_dual_c, 8),
-        make_tuple(&vp9_lpf_vertical_8_dual_msa,
-                   &vp9_lpf_vertical_8_dual_c, 8)));
+        make_tuple(&vpx_lpf_horizontal_4_dual_msa,
+                   &vpx_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_dual_msa,
+                   &vpx_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_dual_msa,
+                   &vpx_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_dual_msa,
+                   &vpx_lpf_vertical_8_dual_c, 8)));
 #endif  // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
 
 }  // namespace

diff --git a/test/test.mk b/test/test.mk
index a8a365e..8ecc856 100644
--- a/test/test.mk
+++ b/test/test.mk

@@ -91,6 +91,7 @@
 ## shared library builds don't make these functions accessible.
 ##
 ifeq ($(CONFIG_SHARED),)
+LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += lpf_8_test.cc
 
 ## VP8
 ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
@@ -142,7 +143,6 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += lpf_8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc

diff --git a/test/vp9_arf_freq_test.cc b/test/vp9_arf_freq_test.cc
index 92c236f..07968bc 100644
--- a/test/vp9_arf_freq_test.cc
+++ b/test/vp9_arf_freq_test.cc

@@ -21,8 +21,8 @@
 const unsigned int kFrames = 100;
 const int kBitrate = 500;
 
-#define ARF_NOT_SEEN   1000001
-#define ARF_SEEN_ONCE  1000000
+#define ARF_NOT_SEEN               1000001
+#define ARF_SEEN_ONCE              1000000
 
 typedef struct {
   const char *filename;
@@ -108,7 +108,7 @@
   }
 
   virtual void BeginPassHook(unsigned int) {
-    min_arf_ = ARF_NOT_SEEN;
+    min_run_ = ARF_NOT_SEEN;
     run_of_visible_frames_ = 0;
   }
 
@@ -137,15 +137,15 @@
     if (frames == 1) {
       run_of_visible_frames_++;
     } else if (frames == 2) {
-      if (min_arf_ == ARF_NOT_SEEN) {
-        min_arf_ = ARF_SEEN_ONCE;
-      } else if (min_arf_ == ARF_SEEN_ONCE ||
-                 run_of_visible_frames_ < min_arf_) {
-        min_arf_ = run_of_visible_frames_;
+      if (min_run_ == ARF_NOT_SEEN) {
+        min_run_ = ARF_SEEN_ONCE;
+      } else if (min_run_ == ARF_SEEN_ONCE ||
+                 run_of_visible_frames_ < min_run_) {
+        min_run_ = run_of_visible_frames_;
       }
       run_of_visible_frames_ = 1;
     } else {
-      min_arf_ = 0;
+      min_run_ = 0;
       run_of_visible_frames_ = 1;
     }
   }
@@ -166,8 +166,8 @@
     }
   }
 
-  int GetMinArfDistance() const {
-    return min_arf_;
+  int GetMinVisibleRun() const {
+    return min_run_;
   }
 
   int GetMinArfDistanceRequested() const {
@@ -185,7 +185,7 @@
 
  private:
   int min_arf_requested_;
-  int min_arf_;
+  int min_run_;
   int run_of_visible_frames_;
 };
 
@@ -214,9 +214,10 @@
   }
 
   ASSERT_NO_FATAL_FAILURE(RunLoop(video));
-  const int min_arf_dist = GetMinArfDistance();
+  const int min_run = GetMinVisibleRun();
   const int min_arf_dist_requested = GetMinArfDistanceRequested();
-  if (min_arf_dist != ARF_NOT_SEEN && min_arf_dist != ARF_SEEN_ONCE) {
+  if (min_run != ARF_NOT_SEEN && min_run != ARF_SEEN_ONCE) {
+    const int min_arf_dist = min_run + 1;
     EXPECT_GE(min_arf_dist, min_arf_dist_requested);
   }
   delete(video);

diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc
index 856b4c1..09c2069 100644
--- a/test/vp9_avg_test.cc
+++ b/test/vp9_avg_test.cc

@@ -291,6 +291,12 @@
         make_tuple(16, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
         make_tuple(32, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
         make_tuple(64, &vp9_int_pro_row_neon, &vp9_int_pro_row_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, IntProColTest, ::testing::Values(
+        make_tuple(16, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
+        make_tuple(32, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
+        make_tuple(64, &vp9_int_pro_col_neon, &vp9_int_pro_col_c)));
 #endif
 
 #if HAVE_MSA

diff --git a/test/vp9_boolcoder_test.cc b/test/vp9_boolcoder_test.cc
index c7f0cd8..b917429 100644
--- a/test/vp9_boolcoder_test.cc
+++ b/test/vp9_boolcoder_test.cc

@@ -14,11 +14,11 @@
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
-#include "vp9/decoder/vp9_reader.h"
-#include "vp9/encoder/vp9_writer.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/bitreader.h"
+#include "vpx_dsp/bitwriter.h"
 
 #include "test/acm_random.h"
-#include "vpx/vpx_integer.h"
 
 using libvpx_test::ACMRandom;
 
@@ -50,9 +50,9 @@
         const int random_seed = 6432;
         const int kBufferSize = 10000;
         ACMRandom bit_rnd(random_seed);
-        vp9_writer bw;
+        vpx_writer bw;
         uint8_t bw_buffer[kBufferSize];
-        vp9_start_encode(&bw, bw_buffer);
+        vpx_start_encode(&bw, bw_buffer);
 
         int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
         for (int i = 0; i < kBitsToTest; ++i) {
@@ -61,16 +61,16 @@
           } else if (bit_method == 3) {
             bit = bit_rnd(2);
           }
-          vp9_write(&bw, bit, static_cast<int>(probas[i]));
+          vpx_write(&bw, bit, static_cast<int>(probas[i]));
         }
 
-        vp9_stop_encode(&bw);
+        vpx_stop_encode(&bw);
 
         // First bit should be zero
         GTEST_ASSERT_EQ(bw_buffer[0] & 0x80, 0);
 
-        vp9_reader br;
-        vp9_reader_init(&br, bw_buffer, kBufferSize, NULL, NULL);
+        vpx_reader br;
+        vpx_reader_init(&br, bw_buffer, kBufferSize, NULL, NULL);
         bit_rnd.Reset(random_seed);
         for (int i = 0; i < kBitsToTest; ++i) {
           if (bit_method == 2) {
@@ -78,7 +78,7 @@
           } else if (bit_method == 3) {
             bit = bit_rnd(2);
           }
-          GTEST_ASSERT_EQ(vp9_read(&br, probas[i]), bit)
+          GTEST_ASSERT_EQ(vpx_read(&br, probas[i]), bit)
               << "pos: " << i << " / " << kBitsToTest
               << " bit_method: " << bit_method
               << " method: " << method;

diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index 943c00b..0e09652 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc

@@ -19,7 +19,7 @@
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "./vpx_config.h"
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"

diff --git a/tools_common.c b/tools_common.c
index 901734e..8d356af 100644
--- a/tools_common.c
+++ b/tools_common.c

@@ -392,7 +392,7 @@
           (uint16_t *)(src->planes[plane] + y * src->stride[plane]);
       uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
       for (x = 0; x < w; x++) {
-        *p_dst++ = *p_src++;
+        *p_dst++ = (uint8_t)(*p_src++);
       }
     }
   }

diff --git a/vp8/common/mips/msa/loopfilter_filters_msa.c b/vp8/common/mips/msa/loopfilter_filters_msa.c
new file mode 100644
index 0000000..a40f378
--- /dev/null
+++ b/vp8/common/mips/msa/loopfilter_filters_msa.c

@@ -0,0 +1,826 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+#define VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask)         \
+{                                                              \
+    v16u8 p1_a_sub_q1, p0_a_sub_q0;                            \
+                                                               \
+    p0_a_sub_q0 = __msa_asub_u_b(p0, q0);                      \
+    p1_a_sub_q1 = __msa_asub_u_b(p1, q1);                      \
+    p1_a_sub_q1 = (v16u8)__msa_srli_b((v16i8)p1_a_sub_q1, 1);  \
+    p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0);    \
+    mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1);           \
+    mask = ((v16u8)mask <= b_limit);                           \
+}
+
+#define VP8_LPF_FILTER4_4W(p1_in_out, p0_in_out, q0_in_out, q1_in_out,  \
+                           mask_in, hev_in)                             \
+{                                                                       \
+    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                 \
+    v16i8 filt, filt1, filt2, cnst4b, cnst3b;                           \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;             \
+                                                                        \
+    p1_m = (v16i8)__msa_xori_b(p1_in_out, 0x80);                        \
+    p0_m = (v16i8)__msa_xori_b(p0_in_out, 0x80);                        \
+    q0_m = (v16i8)__msa_xori_b(q0_in_out, 0x80);                        \
+    q1_m = (v16i8)__msa_xori_b(q1_in_out, 0x80);                        \
+                                                                        \
+    filt = __msa_subs_s_b(p1_m, q1_m);                                  \
+                                                                        \
+    filt = filt & (v16i8)hev_in;                                        \
+                                                                        \
+    q0_sub_p0 = q0_m - p0_m;                                            \
+    filt_sign = __msa_clti_s_b(filt, 0);                                \
+                                                                        \
+    cnst3h = __msa_ldi_h(3);                                            \
+    q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0);            \
+    q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h);    \
+    filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                      \
+    filt_r += q0_sub_p0_r;                                              \
+    filt_r = __msa_sat_s_h(filt_r, 7);                                  \
+                                                                        \
+    q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0);            \
+    q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h);    \
+    filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt);                      \
+    filt_l += q0_sub_p0_l;                                              \
+    filt_l = __msa_sat_s_h(filt_l, 7);                                  \
+                                                                        \
+    filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r);                 \
+    filt = filt & (v16i8)mask_in;                                       \
+                                                                        \
+    cnst4b = __msa_ldi_b(4);                                            \
+    filt1 = __msa_adds_s_b(filt, cnst4b);                               \
+    filt1 >>= 3;                                                        \
+                                                                        \
+    cnst3b = __msa_ldi_b(3);                                            \
+    filt2 = __msa_adds_s_b(filt, cnst3b);                               \
+    filt2 >>= 3;                                                        \
+                                                                        \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                                 \
+    q0_in_out = __msa_xori_b((v16u8)q0_m, 0x80);                        \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                                 \
+    p0_in_out = __msa_xori_b((v16u8)p0_m, 0x80);                        \
+                                                                        \
+    filt = __msa_srari_b(filt1, 1);                                     \
+    hev_in = __msa_xori_b((v16u8)hev_in, 0xff);                         \
+    filt = filt & (v16i8)hev_in;                                        \
+                                                                        \
+    q1_m = __msa_subs_s_b(q1_m, filt);                                  \
+    q1_in_out = __msa_xori_b((v16u8)q1_m, 0x80);                        \
+    p1_m = __msa_adds_s_b(p1_m, filt);                                  \
+    p1_in_out = __msa_xori_b((v16u8)p1_m, 0x80);                        \
+}
+
+#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask)          \
+{                                                                  \
+    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, q0_sub_p0_sign;       \
+    v16i8 filt, filt1, filt2, cnst4b, cnst3b, filt_sign;           \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;        \
+                                                                   \
+    p1_m = (v16i8)__msa_xori_b(p1_in, 0x80);                       \
+    p0_m = (v16i8)__msa_xori_b(p0_in, 0x80);                       \
+    q0_m = (v16i8)__msa_xori_b(q0_in, 0x80);                       \
+    q1_m = (v16i8)__msa_xori_b(q1_in, 0x80);                       \
+                                                                   \
+    filt = __msa_subs_s_b(p1_m, q1_m);                             \
+                                                                   \
+    q0_sub_p0 = q0_m - p0_m;                                       \
+    filt_sign = __msa_clti_s_b(filt, 0);                           \
+                                                                   \
+    cnst3h = __msa_ldi_h(3);                                       \
+    q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0);                 \
+    q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_r *= cnst3h;                                         \
+    filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                 \
+    filt_r += q0_sub_p0_r;                                         \
+    filt_r = __msa_sat_s_h(filt_r, 7);                             \
+                                                                   \
+    q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_l *= cnst3h;                                         \
+    filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt);                 \
+    filt_l += q0_sub_p0_l;                                         \
+    filt_l = __msa_sat_s_h(filt_l, 7);                             \
+                                                                   \
+    filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r);            \
+    filt = filt & (v16i8)(mask);                                   \
+                                                                   \
+    cnst4b = __msa_ldi_b(4);                                       \
+    filt1 = __msa_adds_s_b(filt, cnst4b);                          \
+    filt1 >>= 3;                                                   \
+                                                                   \
+    cnst3b = __msa_ldi_b(3);                                       \
+    filt2 = __msa_adds_s_b(filt, cnst3b);                          \
+    filt2 >>= 3;                                                   \
+                                                                   \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                            \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                            \
+    q0_in = __msa_xori_b((v16u8)q0_m, 0x80);                       \
+    p0_in = __msa_xori_b((v16u8)p0_m, 0x80);                       \
+}
+
+#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev)            \
+{                                                                  \
+    v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;                      \
+    v16i8 filt, q0_sub_p0, cnst4b, cnst3b;                         \
+    v16i8 u, filt1, filt2, filt_sign, q0_sub_p0_sign;              \
+    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_r, u_r, u_l, filt_l;      \
+    v8i16 cnst3h, cnst27h, cnst18h, cnst63h;                       \
+                                                                   \
+    cnst3h = __msa_ldi_h(3);                                       \
+                                                                   \
+    p2_m = (v16i8)__msa_xori_b(p2, 0x80);                          \
+    p1_m = (v16i8)__msa_xori_b(p1, 0x80);                          \
+    p0_m = (v16i8)__msa_xori_b(p0, 0x80);                          \
+    q0_m = (v16i8)__msa_xori_b(q0, 0x80);                          \
+    q1_m = (v16i8)__msa_xori_b(q1, 0x80);                          \
+    q2_m = (v16i8)__msa_xori_b(q2, 0x80);                          \
+                                                                   \
+    filt = __msa_subs_s_b(p1_m, q1_m);                             \
+    q0_sub_p0 = q0_m - p0_m;                                       \
+    q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0);                 \
+    filt_sign = __msa_clti_s_b(filt, 0);                           \
+                                                                   \
+    q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_r *= cnst3h;                                         \
+    filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                 \
+    filt_r = filt_r + q0_sub_p0_r;                                 \
+    filt_r = __msa_sat_s_h(filt_r, 7);                             \
+                                                                   \
+    q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0);  \
+    q0_sub_p0_l *= cnst3h;                                         \
+    filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt);                 \
+    filt_l = filt_l + q0_sub_p0_l;                                 \
+    filt_l = __msa_sat_s_h(filt_l, 7);                             \
+                                                                   \
+    filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r);            \
+    filt = filt & (v16i8)mask;                                     \
+    filt2 = filt & (v16i8)hev;                                     \
+                                                                   \
+    hev = __msa_xori_b(hev, 0xff);                                 \
+    filt = filt & (v16i8)hev;                                      \
+    cnst4b = __msa_ldi_b(4);                                       \
+    filt1 = __msa_adds_s_b(filt2, cnst4b);                         \
+    filt1 >>= 3;                                                   \
+    cnst3b = __msa_ldi_b(3);                                       \
+    filt2 = __msa_adds_s_b(filt2, cnst3b);                         \
+    filt2 >>= 3;                                                   \
+    q0_m = __msa_subs_s_b(q0_m, filt1);                            \
+    p0_m = __msa_adds_s_b(p0_m, filt2);                            \
+                                                                   \
+    filt_sign = __msa_clti_s_b(filt, 0);                           \
+    ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l);                  \
+                                                                   \
+    cnst27h = __msa_ldi_h(27);                                     \
+    cnst63h = __msa_ldi_h(63);                                     \
+                                                                   \
+    u_r = filt_r * cnst27h;                                        \
+    u_r += cnst63h;                                                \
+    u_r >>= 7;                                                     \
+    u_r = __msa_sat_s_h(u_r, 7);                                   \
+    u_l = filt_l * cnst27h;                                        \
+    u_l += cnst63h;                                                \
+    u_l >>= 7;                                                     \
+    u_l = __msa_sat_s_h(u_l, 7);                                   \
+    u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r);                     \
+    q0_m = __msa_subs_s_b(q0_m, u);                                \
+    q0 = __msa_xori_b((v16u8)q0_m, 0x80);                          \
+    p0_m = __msa_adds_s_b(p0_m, u);                                \
+    p0 = __msa_xori_b((v16u8)p0_m, 0x80);                          \
+    cnst18h = __msa_ldi_h(18);                                     \
+    u_r = filt_r * cnst18h;                                        \
+    u_r += cnst63h;                                                \
+    u_r >>= 7;                                                     \
+    u_r = __msa_sat_s_h(u_r, 7);                                   \
+                                                                   \
+    u_l = filt_l * cnst18h;                                        \
+    u_l += cnst63h;                                                \
+    u_l >>= 7;                                                     \
+    u_l = __msa_sat_s_h(u_l, 7);                                   \
+    u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r);                     \
+    q1_m = __msa_subs_s_b(q1_m, u);                                \
+    q1 = __msa_xori_b((v16u8)q1_m, 0x80);                          \
+    p1_m = __msa_adds_s_b(p1_m, u);                                \
+    p1 = __msa_xori_b((v16u8)p1_m, 0x80);                          \
+    u_r = filt_r << 3;                                             \
+    u_r += filt_r + cnst63h;                                       \
+    u_r >>= 7;                                                     \
+    u_r = __msa_sat_s_h(u_r, 7);                                   \
+                                                                   \
+    u_l = filt_l << 3;                                             \
+    u_l += filt_l + cnst63h;                                       \
+    u_l >>= 7;                                                     \
+    u_l = __msa_sat_s_h(u_l, 7);                                   \
+    u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r);                     \
+    q2_m = __msa_subs_s_b(q2_m, u);                                \
+    q2 = __msa_xori_b((v16u8)q2_m, 0x80);                          \
+    p2_m = __msa_adds_s_b(p2_m, u);                                \
+    p2 = __msa_xori_b((v16u8)p2_m, 0x80);                          \
+}
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                   \
+                     q0_in, q1_in, q2_in, q3_in,                   \
+                     limit_in, b_limit_in, thresh_in,              \
+                     hev_out, mask_out, flat_out)                  \
+{                                                                  \
+    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
+    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
+                                                                   \
+    p3_asub_p2_m = __msa_asub_u_b((p3_in), (p2_in));               \
+    p2_asub_p1_m = __msa_asub_u_b((p2_in), (p1_in));               \
+    p1_asub_p0_m = __msa_asub_u_b((p1_in), (p0_in));               \
+    q1_asub_q0_m = __msa_asub_u_b((q1_in), (q0_in));               \
+    q2_asub_q1_m = __msa_asub_u_b((q2_in), (q1_in));               \
+    q3_asub_q2_m = __msa_asub_u_b((q3_in), (q2_in));               \
+    p0_asub_q0_m = __msa_asub_u_b((p0_in), (q0_in));               \
+    p1_asub_q1_m = __msa_asub_u_b((p1_in), (q1_in));               \
+    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
+    hev_out = (thresh_in) < (v16u8)flat_out;                       \
+    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
+    p1_asub_q1_m >>= 1;                                            \
+    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
+    mask_out = (b_limit_in) < p0_asub_q0_m;                        \
+    mask_out = __msa_max_u_b(flat_out, mask_out);                  \
+    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
+    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
+    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
+    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
+    mask_out = (limit_in) < (v16u8)mask_out;                       \
+    mask_out = __msa_xori_b(mask_out, 0xff);                       \
+}
+
+#define VP8_ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride)  \
+{                                                               \
+    uint16_t tmp0_h;                                            \
+    uint32_t tmp0_w;                                            \
+                                                                \
+    tmp0_w = __msa_copy_u_w((v4i32)in0, in0_idx);               \
+    tmp0_h = __msa_copy_u_h((v8i16)in1, in1_idx);               \
+    SW(tmp0_w, pdst);                                           \
+    SH(tmp0_h, pdst + stride);                                  \
+}
+
+
+static void loop_filter_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
+                                              const uint8_t *b_limit0_ptr,
+                                              const uint8_t *limit0_ptr,
+                                              const uint8_t *thresh0_ptr,
+                                              const uint8_t *b_limit1_ptr,
+                                              const uint8_t *limit1_ptr,
+                                              const uint8_t *thresh1_ptr)
+{
+    v16u8 mask, hev, flat;
+    v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+
+    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
+    thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
+    thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
+
+    b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
+    b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
+    b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
+
+    limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
+    limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
+    limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
+}
+
+static void loop_filter_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
+                                            const uint8_t *b_limit0_ptr,
+                                            const uint8_t *limit0_ptr,
+                                            const uint8_t *thresh0_ptr,
+                                            const uint8_t *b_limit1_ptr,
+                                            const uint8_t *limit1_ptr,
+                                            const uint8_t *thresh1_ptr)
+{
+    v16u8 mask, hev, flat;
+    v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src - 4 + (8 * pitch), pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
+    thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
+    thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
+
+    b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
+    b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
+    b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
+
+    limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
+    limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
+    limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
+    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
+
+    src -= 2;
+    ST4x8_UB(tmp2, tmp3, src, pitch);
+    src += (8 * pitch);
+    ST4x8_UB(tmp4, tmp5, src, pitch);
+}
+
+static void mbloop_filter_horizontal_edge_y_msa(uint8_t *src, int32_t pitch,
+                                                const uint8_t b_limit_in,
+                                                const uint8_t limit_in,
+                                                const uint8_t thresh_in)
+{
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+
+    b_limit = (v16u8)__msa_fill_b(b_limit_in);
+    limit = (v16u8)__msa_fill_b(limit_in);
+    thresh = (v16u8)__msa_fill_b(thresh_in);
+    temp_src = src - (pitch << 2);
+    LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+    temp_src = src - 3 * pitch;
+    ST_UB4(p2, p1, p0, q0, temp_src, pitch);
+    temp_src += (4 * pitch);
+    ST_UB2(q1, q2, temp_src, pitch);
+}
+
+static void mbloop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
+                                                 int32_t pitch,
+                                                 const uint8_t b_limit_in,
+                                                 const uint8_t limit_in,
+                                                 const uint8_t thresh_in)
+{
+    uint8_t *temp_src;
+    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+    b_limit = (v16u8)__msa_fill_b(b_limit_in);
+    limit = (v16u8)__msa_fill_b(limit_in);
+    thresh = (v16u8)__msa_fill_b(thresh_in);
+
+    temp_src = src_u - (pitch << 2);
+    LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+    temp_src = src_v - (pitch << 2);
+    LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+
+    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+    p2_d = __msa_copy_u_d((v2i64)p2, 0);
+    p1_d = __msa_copy_u_d((v2i64)p1, 0);
+    p0_d = __msa_copy_u_d((v2i64)p0, 0);
+    q0_d = __msa_copy_u_d((v2i64)q0, 0);
+    q1_d = __msa_copy_u_d((v2i64)q1, 0);
+    q2_d = __msa_copy_u_d((v2i64)q2, 0);
+    src_u -= (pitch * 3);
+    SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
+    src_u += 4 * pitch;
+    SD(q1_d, src_u);
+    src_u += pitch;
+    SD(q2_d, src_u);
+
+    p2_d = __msa_copy_u_d((v2i64)p2, 1);
+    p1_d = __msa_copy_u_d((v2i64)p1, 1);
+    p0_d = __msa_copy_u_d((v2i64)p0, 1);
+    q0_d = __msa_copy_u_d((v2i64)q0, 1);
+    q1_d = __msa_copy_u_d((v2i64)q1, 1);
+    q2_d = __msa_copy_u_d((v2i64)q2, 1);
+    src_v -= (pitch * 3);
+    SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
+    src_v += 4 * pitch;
+    SD(q1_d, src_v);
+    src_v += pitch;
+    SD(q2_d, src_v);
+}
+
+static void mbloop_filter_vertical_edge_y_msa(uint8_t *src, int32_t pitch,
+                                              const uint8_t b_limit_in,
+                                              const uint8_t limit_in,
+                                              const uint8_t thresh_in)
+{
+    uint8_t *temp_src;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+    b_limit = (v16u8)__msa_fill_b(b_limit_in);
+    limit = (v16u8)__msa_fill_b(limit_in);
+    thresh = (v16u8)__msa_fill_b(thresh_in);
+    temp_src = src - 4;
+    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+    ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+
+    temp_src = src - 3;
+    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, temp_src, 4);
+    temp_src += pitch;
+    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, temp_src, 4);
+}
+
+static void mbloop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
+                                               int32_t pitch,
+                                               const uint8_t b_limit_in,
+                                               const uint8_t limit_in,
+                                               const uint8_t thresh_in)
+{
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+    b_limit = (v16u8)__msa_fill_b(b_limit_in);
+    limit = (v16u8)__msa_fill_b(limit_in);
+    thresh = (v16u8)__msa_fill_b(thresh_in);
+
+    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src_v - 4, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+    ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+    ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+    ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+    ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+
+    src_u -= 3;
+    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, src_u, 4);
+    src_u += pitch;
+    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, src_u, 4);
+
+    src_v -= 3;
+    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, src_v, 4);
+    src_v += pitch;
+    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, src_v, 4);
+}
+
+void vp8_loop_filter_simple_horizontal_edge_msa(uint8_t *src, int32_t pitch,
+                                                const uint8_t *b_limit_ptr)
+{
+    v16u8 p1, p0, q1, q0;
+    v16u8 mask, b_limit;
+
+    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+    LD_UB4(src - (pitch << 1), pitch, p1, p0, q0, q1);
+    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
+    ST_UB2(p0, q0, (src - pitch), pitch);
+}
+
+void vp8_loop_filter_simple_vertical_edge_msa(uint8_t *src, int32_t pitch,
+                                              const uint8_t *b_limit_ptr)
+{
+    uint8_t *temp_src;
+    v16u8 p1, p0, q1, q0;
+    v16u8 mask, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v8i16 tmp0, tmp1;
+
+    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+    temp_src = src - 2;
+    LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    temp_src += (8 * pitch);
+    LD_UB8(temp_src, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p1, p0, q0, q1);
+    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
+    ILVRL_B2_SH(q0, p0, tmp1, tmp0);
+
+    src -= 1;
+    ST2x4_UB(tmp1, 0, src, pitch);
+    src += 4 * pitch;
+    ST2x4_UB(tmp1, 4, src, pitch);
+    src += 4 * pitch;
+    ST2x4_UB(tmp0, 0, src, pitch);
+    src += 4 * pitch;
+    ST2x4_UB(tmp0, 4, src, pitch);
+    src += 4 * pitch;
+}
+
+static void loop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
+                                               int32_t pitch,
+                                               const uint8_t b_limit_in,
+                                               const uint8_t limit_in,
+                                               const uint8_t thresh_in)
+{
+    uint64_t p1_d, p0_d, q0_d, q1_d;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+    thresh = (v16u8)__msa_fill_b(thresh_in);
+    limit = (v16u8)__msa_fill_b(limit_in);
+    b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+    src_u = src_u - (pitch << 2);
+    LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+    src_u += (5 * pitch);
+    src_v = src_v - (pitch << 2);
+    LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+    src_v += (5 * pitch);
+
+    /* right 8 element of p3 are u pixel and
+       left 8 element of p3 are v pixel */
+    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+    p1_d = __msa_copy_u_d((v2i64)p1, 0);
+    p0_d = __msa_copy_u_d((v2i64)p0, 0);
+    q0_d = __msa_copy_u_d((v2i64)q0, 0);
+    q1_d = __msa_copy_u_d((v2i64)q1, 0);
+    SD4(q1_d, q0_d, p0_d, p1_d, src_u, (- pitch));
+
+    p1_d = __msa_copy_u_d((v2i64)p1, 1);
+    p0_d = __msa_copy_u_d((v2i64)p0, 1);
+    q0_d = __msa_copy_u_d((v2i64)q0, 1);
+    q1_d = __msa_copy_u_d((v2i64)q1, 1);
+    SD4(q1_d, q0_d, p0_d, p1_d, src_v, (- pitch));
+}
+
+static void loop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
+                                             int32_t pitch,
+                                             const uint8_t b_limit_in,
+                                             const uint8_t limit_in,
+                                             const uint8_t thresh_in)
+{
+    uint8_t *temp_src_u, *temp_src_v;
+    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+    v16u8 mask, hev, flat, thresh, limit, b_limit;
+    v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+    v16u8 row9, row10, row11, row12, row13, row14, row15;
+    v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+    thresh = (v16u8)__msa_fill_b(thresh_in);
+    limit = (v16u8)__msa_fill_b(limit_in);
+    b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+    LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+    LD_UB8(src_v - 4, pitch,
+           row8, row9, row10, row11, row12, row13, row14, row15);
+    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+                        row8, row9, row10, row11, row12, row13, row14, row15,
+                        p3, p2, p1, p0, q0, q1, q2, q3);
+
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+                 hev, mask, flat);
+    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+    ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
+    ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
+    tmp0 = (v4i32)__msa_ilvl_b((v16i8)p0, (v16i8)p1);
+    tmp1 = (v4i32)__msa_ilvl_b((v16i8)q1, (v16i8)q0);
+    ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);
+
+    temp_src_u = src_u - 2;
+    ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch);
+    temp_src_u += 4 * pitch;
+    ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch);
+
+    temp_src_v = src_v - 2;
+    ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch);
+    temp_src_v += 4 * pitch;
+    ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch);
+}
+
+void vp8_loop_filter_mbh_msa(uint8_t *src_y, uint8_t *src_u,
+                             uint8_t *src_v, int32_t pitch_y,
+                             int32_t pitch_u_v,
+                             loop_filter_info *lpf_info_ptr)
+{
+    mbloop_filter_horizontal_edge_y_msa(src_y, pitch_y,
+                                        *lpf_info_ptr->mblim,
+                                        *lpf_info_ptr->lim,
+                                        *lpf_info_ptr->hev_thr);
+    if (src_u)
+    {
+        mbloop_filter_horizontal_edge_uv_msa(src_u, src_v, pitch_u_v,
+                                             *lpf_info_ptr->mblim,
+                                             *lpf_info_ptr->lim,
+                                             *lpf_info_ptr->hev_thr);
+    }
+}
+
+void vp8_loop_filter_mbv_msa(uint8_t *src_y, uint8_t *src_u,
+                             uint8_t *src_v, int32_t pitch_y,
+                             int32_t pitch_u_v,
+                             loop_filter_info *lpf_info_ptr)
+{
+    mbloop_filter_vertical_edge_y_msa(src_y, pitch_y,
+                                      *lpf_info_ptr->mblim,
+                                      *lpf_info_ptr->lim,
+                                      *lpf_info_ptr->hev_thr);
+    if (src_u)
+    {
+        mbloop_filter_vertical_edge_uv_msa(src_u, src_v, pitch_u_v,
+                                           *lpf_info_ptr->mblim,
+                                           *lpf_info_ptr->lim,
+                                           *lpf_info_ptr->hev_thr);
+    }
+}
+
+void vp8_loop_filter_bh_msa(uint8_t *src_y, uint8_t *src_u,
+                            uint8_t *src_v, int32_t pitch_y,
+                            int32_t pitch_u_v,
+                            loop_filter_info *lpf_info_ptr)
+{
+    loop_filter_horizontal_4_dual_msa(src_y + 4 * pitch_y, pitch_y,
+                                      lpf_info_ptr->blim,
+                                      lpf_info_ptr->lim,
+                                      lpf_info_ptr->hev_thr,
+                                      lpf_info_ptr->blim,
+                                      lpf_info_ptr->lim,
+                                      lpf_info_ptr->hev_thr);
+    loop_filter_horizontal_4_dual_msa(src_y + 8 * pitch_y, pitch_y,
+                                      lpf_info_ptr->blim,
+                                      lpf_info_ptr->lim,
+                                      lpf_info_ptr->hev_thr,
+                                      lpf_info_ptr->blim,
+                                      lpf_info_ptr->lim,
+                                      lpf_info_ptr->hev_thr);
+    loop_filter_horizontal_4_dual_msa(src_y + 12 * pitch_y, pitch_y,
+                                      lpf_info_ptr->blim,
+                                      lpf_info_ptr->lim,
+                                      lpf_info_ptr->hev_thr,
+                                      lpf_info_ptr->blim,
+                                      lpf_info_ptr->lim,
+                                      lpf_info_ptr->hev_thr);
+    if (src_u)
+    {
+        loop_filter_horizontal_edge_uv_msa(src_u + (4 * pitch_u_v),
+                                           src_v + (4 * pitch_u_v),
+                                           pitch_u_v,
+                                           *lpf_info_ptr->blim,
+                                           *lpf_info_ptr->lim,
+                                           *lpf_info_ptr->hev_thr);
+    }
+}
+
+void vp8_loop_filter_bv_msa(uint8_t *src_y, uint8_t *src_u,
+                            uint8_t *src_v, int32_t pitch_y,
+                            int32_t pitch_u_v,
+                            loop_filter_info *lpf_info_ptr)
+{
+    loop_filter_vertical_4_dual_msa(src_y + 4, pitch_y, lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr,
+                                    lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr);
+    loop_filter_vertical_4_dual_msa(src_y + 8, pitch_y,
+                                    lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr,
+                                    lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr);
+    loop_filter_vertical_4_dual_msa(src_y + 12, pitch_y,
+                                    lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr,
+                                    lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr);
+    if (src_u)
+    {
+        loop_filter_vertical_edge_uv_msa(src_u + 4, src_v + 4, pitch_u_v,
+                                         *lpf_info_ptr->blim,
+                                         *lpf_info_ptr->lim,
+                                         *lpf_info_ptr->hev_thr);
+    }
+}
+
+void vp8_loop_filter_bhs_msa(uint8_t *src_y, int32_t pitch_y,
+                             const uint8_t *b_limit_ptr)
+{
+    vp8_loop_filter_simple_horizontal_edge_msa(src_y + (4 * pitch_y),
+                                               pitch_y, b_limit_ptr);
+    vp8_loop_filter_simple_horizontal_edge_msa(src_y + (8 * pitch_y),
+                                               pitch_y, b_limit_ptr);
+    vp8_loop_filter_simple_horizontal_edge_msa(src_y + (12 * pitch_y),
+                                               pitch_y, b_limit_ptr);
+}
+
+void vp8_loop_filter_bvs_msa(uint8_t *src_y, int32_t pitch_y,
+                             const uint8_t *b_limit_ptr)
+{
+    vp8_loop_filter_simple_vertical_edge_msa(src_y + 4, pitch_y, b_limit_ptr);
+    vp8_loop_filter_simple_vertical_edge_msa(src_y + 8, pitch_y, b_limit_ptr);
+    vp8_loop_filter_simple_vertical_edge_msa(src_y + 12, pitch_y, b_limit_ptr);
+}

diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h
index 345a62f..e0fec12 100644
--- a/vp8/common/mips/msa/vp8_macros_msa.h
+++ b/vp8/common/mips/msa/vp8_macros_msa.h

@@ -32,6 +32,210 @@
 #define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
 #define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
 
+#if (__mips_isa_rev >= 6)
+#define LW(psrc)                                      \
+({                                                    \
+    const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+    uint32_t val_m;                                   \
+                                                      \
+    asm volatile (                                    \
+        "lw  %[val_m],  %[psrc_m]  \n\t"              \
+                                                      \
+        : [val_m] "=r" (val_m)                        \
+        : [psrc_m] "m" (*psrc_m)                      \
+    );                                                \
+                                                      \
+    val_m;                                            \
+})
+
+#if (__mips == 64)
+#define LD(psrc)                                      \
+({                                                    \
+    const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+    uint64_t val_m = 0;                               \
+                                                      \
+    asm volatile (                                    \
+        "ld  %[val_m],  %[psrc_m]  \n\t"              \
+                                                      \
+        : [val_m] "=r" (val_m)                        \
+        : [psrc_m] "m" (*psrc_m)                      \
+    );                                                \
+                                                      \
+    val_m;                                            \
+})
+#else  // !(__mips == 64)
+#define LD(psrc)                                             \
+({                                                           \
+    const uint8_t *psrc_m = (const uint8_t *)(psrc);         \
+    uint32_t val0_m, val1_m;                                 \
+    uint64_t val_m = 0;                                      \
+                                                             \
+    val0_m = LW(psrc_m);                                     \
+    val1_m = LW(psrc_m + 4);                                 \
+                                                             \
+    val_m = (uint64_t)(val1_m);                              \
+    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);  \
+    val_m = (uint64_t)(val_m | (uint64_t)val0_m);            \
+                                                             \
+    val_m;                                                   \
+})
+#endif  // (__mips == 64)
+
+#define SH(val, pdst)                     \
+{                                         \
+    uint8_t *pdst_m = (uint8_t *)(pdst);  \
+    const uint16_t val_m = (val);         \
+                                          \
+    asm volatile (                        \
+        "sh  %[val_m],  %[pdst_m]  \n\t"  \
+                                          \
+        : [pdst_m] "=m" (*pdst_m)         \
+        : [val_m] "r" (val_m)             \
+    );                                    \
+}
+
+#define SW(val, pdst)                     \
+{                                         \
+    uint8_t *pdst_m = (uint8_t *)(pdst);  \
+    const uint32_t val_m = (val);         \
+                                          \
+    asm volatile (                        \
+        "sw  %[val_m],  %[pdst_m]  \n\t"  \
+                                          \
+        : [pdst_m] "=m" (*pdst_m)         \
+        : [val_m] "r" (val_m)             \
+    );                                    \
+}
+
+#define SD(val, pdst)                     \
+{                                         \
+    uint8_t *pdst_m = (uint8_t *)(pdst);  \
+    const uint64_t val_m = (val);         \
+                                          \
+    asm volatile (                        \
+        "sd  %[val_m],  %[pdst_m]  \n\t"  \
+                                          \
+        : [pdst_m] "=m" (*pdst_m)         \
+        : [val_m] "r" (val_m)             \
+    );                                    \
+}
+#else  // !(__mips_isa_rev >= 6)
+#define LW(psrc)                                      \
+({                                                    \
+    const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+    uint32_t val_m;                                   \
+                                                      \
+    asm volatile (                                    \
+        "ulw  %[val_m],  %[psrc_m]  \n\t"             \
+                                                      \
+        : [val_m] "=r" (val_m)                        \
+        : [psrc_m] "m" (*psrc_m)                      \
+    );                                                \
+                                                      \
+    val_m;                                            \
+})
+
+#if (__mips == 64)
+#define LD(psrc)                                      \
+({                                                    \
+    const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+    uint64_t val_m = 0;                               \
+                                                      \
+    asm volatile (                                    \
+        "uld  %[val_m],  %[psrc_m]  \n\t"             \
+                                                      \
+        : [val_m] "=r" (val_m)                        \
+        : [psrc_m] "m" (*psrc_m)                      \
+    );                                                \
+                                                      \
+    val_m;                                            \
+})
+#else  // !(__mips == 64)
+#define LD(psrc)                                             \
+({                                                           \
+    const uint8_t *psrc_m1 = (const uint8_t *)(psrc);        \
+    uint32_t val0_m, val1_m;                                 \
+    uint64_t val_m = 0;                                      \
+                                                             \
+    val0_m = LW(psrc_m1);                                    \
+    val1_m = LW(psrc_m1 + 4);                                \
+                                                             \
+    val_m = (uint64_t)(val1_m);                              \
+    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);  \
+    val_m = (uint64_t)(val_m | (uint64_t)val0_m);            \
+                                                             \
+    val_m;                                                   \
+})
+#endif  // (__mips == 64)
+#define SH(val, pdst)                      \
+{                                          \
+    uint8_t *pdst_m = (uint8_t *)(pdst);   \
+    const uint16_t val_m = (val);          \
+                                           \
+    asm volatile (                         \
+        "ush  %[val_m],  %[pdst_m]  \n\t"  \
+                                           \
+        : [pdst_m] "=m" (*pdst_m)          \
+        : [val_m] "r" (val_m)              \
+    );                                     \
+}
+
+#define SW(val, pdst)                      \
+{                                          \
+    uint8_t *pdst_m = (uint8_t *)(pdst);   \
+    const uint32_t val_m = (val);          \
+                                           \
+    asm volatile (                         \
+        "usw  %[val_m],  %[pdst_m]  \n\t"  \
+                                           \
+        : [pdst_m] "=m" (*pdst_m)          \
+        : [val_m] "r" (val_m)              \
+    );                                     \
+}
+
+#define SD(val, pdst)                                         \
+{                                                             \
+    uint8_t *pdst_m1 = (uint8_t *)(pdst);                     \
+    uint32_t val0_m, val1_m;                                  \
+                                                              \
+    val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF);          \
+    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF);  \
+                                                              \
+    SW(val0_m, pdst_m1);                                      \
+    SW(val1_m, pdst_m1 + 4);                                  \
+}
+#endif  // (__mips_isa_rev >= 6)
+
+/* Description : Store 4 words with stride
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Store word from 'in0' to (pdst)
+                 Store word from 'in1' to (pdst + stride)
+                 Store word from 'in2' to (pdst + 2 * stride)
+                 Store word from 'in3' to (pdst + 3 * stride)
+*/
+#define SW4(in0, in1, in2, in3, pdst, stride)  \
+{                                              \
+    SW(in0, (pdst));                           \
+    SW(in1, (pdst) + stride);                  \
+    SW(in2, (pdst) + 2 * stride);              \
+    SW(in3, (pdst) + 3 * stride);              \
+}
+
+/* Description : Store 4 double words with stride
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Store double word from 'in0' to (pdst)
+                 Store double word from 'in1' to (pdst + stride)
+                 Store double word from 'in2' to (pdst + 2 * stride)
+                 Store double word from 'in3' to (pdst + 3 * stride)
+*/
+#define SD4(in0, in1, in2, in3, pdst, stride)  \
+{                                              \
+    SD(in0, (pdst));                           \
+    SD(in1, (pdst) + stride);                  \
+    SD(in2, (pdst) + 2 * stride);              \
+    SD(in3, (pdst) + 3 * stride);              \
+}
+
 /* Description : Load vectors with 16 byte elements with stride
    Arguments   : Inputs  - psrc, stride
                  Outputs - out0, out1
@@ -55,6 +259,14 @@
 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
 #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
 
+#define LD_B8(RTYPE, psrc, stride,                                      \
+              out0, out1, out2, out3, out4, out5, out6, out7)           \
+{                                                                       \
+    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
+    LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
+}
+#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
+
 /* Description : Load vectors with 8 halfword elements with stride
    Arguments   : Inputs  - psrc, stride
                  Outputs - out0, out1
@@ -85,6 +297,7 @@
     ST_B(RTYPE, in0, (pdst));                 \
     ST_B(RTYPE, in1, (pdst) + stride);        \
 }
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
 
 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
 {                                                         \
@@ -106,6 +319,64 @@
 }
 #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
 
+/* Description : Store 2x4 byte block to destination memory from input vector
+   Arguments   : Inputs - in, stidx, pdst, stride
+   Details     : Index 'stidx' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst)
+                 Index 'stidx+1' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + stride)
+                 Index 'stidx+2' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + 2 * stride)
+                 Index 'stidx+3' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + 3 * stride)
+*/
+#define ST2x4_UB(in, stidx, pdst, stride)             \
+{                                                     \
+    uint16_t out0_m, out1_m, out2_m, out3_m;          \
+    uint8_t *pblk_2x4_m = (uint8_t *)(pdst);          \
+                                                      \
+    out0_m = __msa_copy_u_h((v8i16)in, (stidx));      \
+    out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1));  \
+    out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2));  \
+    out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3));  \
+                                                      \
+    SH(out0_m, pblk_2x4_m);                           \
+    SH(out1_m, pblk_2x4_m + stride);                  \
+    SH(out2_m, pblk_2x4_m + 2 * stride);              \
+    SH(out3_m, pblk_2x4_m + 3 * stride);              \
+}
+
+/* Description : Store 4x4 byte block to destination memory from input vector
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : 'Idx0' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst)
+                 'Idx1' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst + stride)
+                 'Idx2' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst + 2 * stride)
+                 'Idx3' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst + 3 * stride)
+*/
+#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
+{                                                                 \
+    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
+    uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                      \
+                                                                  \
+    out0_m = __msa_copy_u_w((v4i32)in0, idx0);                    \
+    out1_m = __msa_copy_u_w((v4i32)in0, idx1);                    \
+    out2_m = __msa_copy_u_w((v4i32)in1, idx2);                    \
+    out3_m = __msa_copy_u_w((v4i32)in1, idx3);                    \
+                                                                  \
+    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);      \
+}
+#define ST4x8_UB(in0, in1, pdst, stride)                            \
+{                                                                   \
+    uint8_t *pblk_4x8 = (uint8_t *)(pdst);                          \
+                                                                    \
+    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
+    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
+}
+
 /* Description : Shuffle byte vector elements as per mask vector
    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                  Outputs - out0, out1
@@ -162,6 +433,76 @@
     out_m;                                              \
 })
 
+/* Description : Interleave even byte elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even byte elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
+*/
+#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0);  \
+    out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2);  \
+}
+#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
+
+/* Description : Interleave even halfword elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
+*/
+#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0);  \
+    out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2);  \
+}
+#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
+
+/* Description : Interleave even word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even word elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
+*/
+#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0);  \
+    out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2);  \
+}
+#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
+
+/* Description : Interleave even double word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even double word elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
+*/
+#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0);  \
+    out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2);  \
+}
+#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
+
+/* Description : Interleave left half of byte elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'.
+*/
+#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);  \
+    out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3);  \
+}
+#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
+
 /* Description : Interleave left half of halfword elements from vectors
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1
@@ -203,6 +544,8 @@
     out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);  \
     out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3);  \
 }
+#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
+#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
 
 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
@@ -260,6 +603,14 @@
 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
 
+#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3)                         \
+{                                                               \
+    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
+    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
+}
+#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
+
 /* Description : Interleave both left and right half of input vectors
    Arguments   : Inputs  - in0, in1
                  Outputs - out0, out1
@@ -267,6 +618,13 @@
    Details     : Right half of byte elements from 'in0' and 'in1' are
                  interleaved and written to 'out0'
 */
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1)            \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);  \
+    out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);  \
+}
+#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
+
 #define ILVRL_H2(RTYPE, in0, in1, out0, out1)            \
 {                                                        \
     out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);  \
@@ -480,6 +838,88 @@
     out3 = in0 - in3;                                            \
 }
 
+/* Description : Transpose 16x4 block into 4x16 with byte elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
+                           in8, in9, in10, in11, in12, in13, in14, in15
+                 Outputs - out0, out1, out2, out3
+                 Return Type - unsigned byte
+*/
+#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
+                            in8, in9, in10, in11, in12, in13, in14, in15,  \
+                            out0, out1, out2, out3)                        \
+{                                                                          \
+    v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
+                                                                           \
+    ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m);                      \
+    out1 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m);                           \
+                                                                           \
+    ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                      \
+    out3 = (v16u8)__msa_ilvev_d(tmp1_m, tmp0_m);                           \
+                                                                           \
+    ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m);                     \
+                                                                           \
+    tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
+    ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                     \
+                                                                           \
+    tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
+    ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);               \
+    out0 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m);             \
+    out2 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);             \
+                                                                           \
+    tmp0_m = (v2i64)__msa_ilvod_b((v16i8)out3, (v16i8)out1);               \
+    tmp1_m = (v2i64)__msa_ilvod_b((v16i8)tmp3_m, (v16i8)tmp2_m);           \
+    out1 = (v16u8)__msa_ilvev_h((v8i16)tmp1_m, (v8i16)tmp0_m);             \
+    out3 = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);             \
+}
+
+/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
+                           in8, in9, in10, in11, in12, in13, in14, in15
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - unsigned byte
+*/
+#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
+                            in8, in9, in10, in11, in12, in13, in14, in15,    \
+                            out0, out1, out2, out3, out4, out5, out6, out7)  \
+{                                                                            \
+    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
+    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
+                                                                             \
+    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
+    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
+    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
+    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
+                                                                             \
+    tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                 \
+    tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                 \
+    tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                 \
+    tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                 \
+    out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                   \
+    tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                 \
+    out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                   \
+    tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                 \
+                                                                             \
+    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
+    out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+    out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+                                                                             \
+    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);             \
+    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                 \
+    out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+    out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+                                                                             \
+    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
+    out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+    out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+                                                                             \
+    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);             \
+    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);             \
+    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);             \
+    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);             \
+    out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+    out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \
+}
+
 /* Description : Transpose 8x4 block with half word elements in vectors
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7

diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index db38222..1dbe80e 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl

@@ -51,57 +51,61 @@
 # Loopfilter
 #
 add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbv mmx sse2 media neon dspr2/;
+specialize qw/vp8_loop_filter_mbv mmx sse2 media neon dspr2 msa/;
 $vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6;
 $vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2;
 
 add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bv mmx sse2 media neon dspr2/;
+specialize qw/vp8_loop_filter_bv mmx sse2 media neon dspr2 msa/;
 $vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6;
 $vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2;
 
 add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbh mmx sse2 media neon dspr2/;
+specialize qw/vp8_loop_filter_mbh mmx sse2 media neon dspr2 msa/;
 $vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6;
 $vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2;
 
 add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bh mmx sse2 media neon dspr2/;
+specialize qw/vp8_loop_filter_bh mmx sse2 media neon dspr2 msa/;
 $vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6;
 $vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2;
 
 
 add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon/;
+specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon msa/;
 $vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c;
 $vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx;
 $vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2;
 $vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6;
 $vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon;
+$vp8_loop_filter_simple_mbv_msa=vp8_loop_filter_simple_vertical_edge_msa;
 
 add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon/;
+specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon msa/;
 $vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c;
 $vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx;
 $vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2;
 $vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6;
 $vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon;
+$vp8_loop_filter_simple_mbh_msa=vp8_loop_filter_simple_horizontal_edge_msa;
 
 add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon/;
+specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon msa/;
 $vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c;
 $vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx;
 $vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2;
 $vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6;
 $vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon;
+$vp8_loop_filter_simple_bv_msa=vp8_loop_filter_bvs_msa;
 
 add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon/;
+specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon msa/;
 $vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c;
 $vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx;
 $vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2;
 $vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6;
 $vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon;
+$vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa;
 
 #
 # IDCT

diff --git a/vp8/encoder/x86/quantize_ssse3.c b/vp8/encoder/x86/quantize_ssse3.c
index 448217f..14282db 100644
--- a/vp8/encoder/x86/quantize_ssse3.c
+++ b/vp8/encoder/x86/quantize_ssse3.c

@@ -17,7 +17,7 @@
 #include <intrin.h>
 #pragma intrinsic(_BitScanReverse)
 static int bsr(int mask) {
-  int eob;
+  unsigned long eob;
   _BitScanReverse(&eob, mask);
   eob++;
   if (mask == 0)

diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 47ad97c..9f6e936 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk

@@ -115,6 +115,7 @@
 
 # common (c)
 VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct_msa.c
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/loopfilter_filters_msa.c
 VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h
 
 # common (c)

diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h
index 6ebea9f..cd07a56 100644
--- a/vp9/common/mips/dspr2/vp9_common_dspr2.h
+++ b/vp9/common/mips/dspr2/vp9_common_dspr2.h

@@ -15,14 +15,13 @@
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_common.h"
+#include "vpx_dsp/mips/common_dspr2.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 #if HAVE_DSPR2
-#define CROP_WIDTH 512
 extern uint8_t *vp9_ff_cropTbl;
 
 #define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input)                    ({   \
@@ -51,40 +50,6 @@
    );                                                                          \
   out;                                                                    })
 
-static INLINE void vp9_prefetch_load(const unsigned char *src) {
-  __asm__ __volatile__ (
-      "pref   0,  0(%[src])   \n\t"
-      :
-      : [src] "r" (src)
-  );
-}
-
-/* prefetch data for store */
-static INLINE void vp9_prefetch_store(unsigned char *dst) {
-  __asm__ __volatile__ (
-      "pref   1,  0(%[dst])   \n\t"
-      :
-      : [dst] "r" (dst)
-  );
-}
-
-static INLINE void vp9_prefetch_load_streamed(const unsigned char *src) {
-  __asm__ __volatile__ (
-      "pref   4,  0(%[src])   \n\t"
-      :
-      : [src] "r" (src)
-  );
-}
-
-/* prefetch data for store */
-static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) {
-  __asm__ __volatile__ (
-      "pref   5,  0(%[dst])   \n\t"
-      :
-      : [dst] "r" (dst)
-  );
-}
-
 void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
                                    int dest_stride);
 

diff --git a/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
index 91d62bc..aad7c45 100644
--- a/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c

@@ -44,7 +44,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < w; x += 4) {
       src_ptr = src + x;
@@ -148,8 +148,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
-    vp9_prefetch_store(dst + dst_stride + 32);
+    prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride + 32);
 
     for (x = 0; x < 64; x += 4) {
       src_ptr = src + x;
@@ -245,7 +245,7 @@
       : [pos] "r" (pos)
     );
 
-    vp9_prefetch_store(dst);
+    prefetch_store(dst);
 
     switch (w) {
       case 4:
@@ -257,7 +257,7 @@
                                      filter_y, w, h);
         break;
       case 64:
-        vp9_prefetch_store(dst + 32);
+        prefetch_store(dst + 32);
         convolve_bi_avg_vert_64_dspr2(src, src_stride,
                                       dst, dst_stride,
                                       filter_y, h);

diff --git a/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
index 148b20f..bc60e93 100644
--- a/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c

@@ -40,9 +40,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -135,9 +135,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -290,9 +290,9 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_store(dst_ptr + dst_stride);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
 
     for (c = 0; c < count; c++) {
       __asm__ __volatile__ (
@@ -539,11 +539,11 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
-    vp9_prefetch_store(dst_ptr + dst_stride);
-    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
 
     for (c = 0; c < 4; c++) {
       __asm__ __volatile__ (
@@ -781,9 +781,9 @@
     );
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src);
-    vp9_prefetch_load(src + 32);
-    vp9_prefetch_store(dst);
+    prefetch_load(src);
+    prefetch_load(src + 32);
+    prefetch_store(dst);
 
     switch (w) {
       case 4:
@@ -807,8 +807,8 @@
                                       filter_x, h, 2);
         break;
       case 64:
-        vp9_prefetch_load(src + 64);
-        vp9_prefetch_store(dst + 32);
+        prefetch_load(src + 64);
+        prefetch_store(dst + 32);
 
         convolve_bi_avg_horiz_64_dspr2(src, src_stride,
                                       dst, dst_stride,

diff --git a/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
index 92644f2..b714f9a 100644
--- a/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c

@@ -41,8 +41,8 @@
   for (y = h; y--;) {
     dst_ptr = dst;
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -132,8 +132,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
 
     dst_ptr = dst;
     odd_dst = (dst_ptr + dst_stride);
@@ -272,8 +272,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
 
     src = src_ptr;
     dst = dst_ptr;
@@ -504,9 +504,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
 
     src = src_ptr;
     dst = dst_ptr;
@@ -747,8 +747,8 @@
   );
 
   /* prefetch data to cache memory */
-  vp9_prefetch_load(src);
-  vp9_prefetch_load(src + 32);
+  prefetch_load(src);
+  prefetch_load(src + 32);
 
   switch (w) {
     case 4:
@@ -769,7 +769,7 @@
                                             (w/16));
       break;
     case 64:
-      vp9_prefetch_load(src + 32);
+      prefetch_load(src + 32);
       convolve_bi_horiz_64_transposed_dspr2(src, src_stride,
                                             dst, dst_stride,
                                             filter, h);

diff --git a/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
index 1debdb4..27ea100 100644
--- a/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c

@@ -39,9 +39,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],      0(%[src])                      \n\t"
@@ -122,9 +122,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],      0(%[src])                      \n\t"
@@ -252,9 +252,9 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_store(dst_ptr + dst_stride);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
 
     for (c = 0; c < count; c++) {
       __asm__ __volatile__ (
@@ -459,11 +459,11 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
-    vp9_prefetch_store(dst_ptr + dst_stride);
-    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
 
     for (c = 0; c < 4; c++) {
       __asm__ __volatile__ (
@@ -651,7 +651,7 @@
   if (16 == x_step_q4) {
     uint32_t pos = 38;
 
-    vp9_prefetch_load((const uint8_t *)filter_x);
+    prefetch_load((const uint8_t *)filter_x);
 
     /* bit positon for extract from acc */
     __asm__ __volatile__ (
@@ -661,9 +661,9 @@
     );
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src);
-    vp9_prefetch_load(src + 32);
-    vp9_prefetch_store(dst);
+    prefetch_load(src);
+    prefetch_load(src + 32);
+    prefetch_store(dst);
 
     switch (w) {
       case 4:
@@ -687,8 +687,8 @@
                                    filter_x, (int32_t)h, 2);
         break;
       case 64:
-        vp9_prefetch_load(src + 64);
-        vp9_prefetch_store(dst + 32);
+        prefetch_load(src + 64);
+        prefetch_store(dst + 32);
 
         convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride,
                                    dst, (int32_t)dst_stride,

diff --git a/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
index bf01f11..32f5fb6 100644
--- a/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c

@@ -44,7 +44,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < w; x += 4) {
       src_ptr = src + x;
@@ -141,7 +141,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < 64; x += 4) {
       src_ptr = src + x;
@@ -230,7 +230,7 @@
       : [pos] "r" (pos)
     );
 
-    vp9_prefetch_store(dst);
+    prefetch_store(dst);
 
     switch (w) {
       case 4 :
@@ -242,7 +242,7 @@
                                  filter_y, w, h);
         break;
       case 64 :
-        vp9_prefetch_store(dst + 32);
+        prefetch_store(dst + 32);
         convolve_bi_vert_64_dspr2(src, src_stride,
                                   dst, dst_stride,
                                   filter_y, h);

diff --git a/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
index 1742279..d9cbfe6 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c

@@ -49,7 +49,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < w; x += 4) {
       src_ptr = src + x;
@@ -210,8 +210,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
-    vp9_prefetch_store(dst + dst_stride + 32);
+    prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride + 32);
 
     for (x = 0; x < 64; x += 4) {
       src_ptr = src + x;
@@ -372,7 +372,7 @@
         : [pos] "r" (pos)
       );
 
-      vp9_prefetch_store(dst);
+      prefetch_store(dst);
 
       switch (w) {
         case 4:
@@ -384,7 +384,7 @@
                                     filter_y, w, h);
           break;
         case 64:
-          vp9_prefetch_store(dst + 32);
+          prefetch_store(dst + 32);
           convolve_avg_vert_64_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_y, h);
@@ -452,17 +452,17 @@
   uint32_t tp3, tp4, tn2;
 
   /* prefetch data to cache memory */
-  vp9_prefetch_load(src);
-  vp9_prefetch_load(src + 32);
-  vp9_prefetch_store(dst);
+  prefetch_load(src);
+  prefetch_load(src + 32);
+  prefetch_store(dst);
 
   switch (w) {
     case 4:
       /* 1 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -482,9 +482,9 @@
     case 8:
       /* 2 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -509,9 +509,9 @@
     case 16:
       /* 4 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -544,9 +544,9 @@
     case 32:
       /* 8 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -593,16 +593,16 @@
       }
       break;
     case 64:
-      vp9_prefetch_load(src + 64);
-      vp9_prefetch_store(dst + 32);
+      prefetch_load(src + 64);
+      prefetch_store(dst + 32);
 
       /* 16 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_load(src + src_stride + 64);
-        vp9_prefetch_store(dst + dst_stride);
-        vp9_prefetch_store(dst + dst_stride + 32);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_load(src + src_stride + 64);
+        prefetch_store(dst + dst_stride);
+        prefetch_store(dst + dst_stride + 32);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"

diff --git a/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
index 69da1cf..cdb8312 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c

@@ -43,9 +43,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -165,9 +165,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -357,9 +357,9 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_store(dst_ptr + dst_stride);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
 
     for (c = 0; c < count; c++) {
       __asm__ __volatile__ (
@@ -668,11 +668,11 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
-    vp9_prefetch_store(dst_ptr + dst_stride);
-    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
 
     for (c = 0; c < 4; c++) {
       __asm__ __volatile__ (
@@ -985,9 +985,9 @@
       );
 
       /* prefetch data to cache memory */
-      vp9_prefetch_load(src);
-      vp9_prefetch_load(src + 32);
-      vp9_prefetch_store(dst);
+      prefetch_load(src);
+      prefetch_load(src + 32);
+      prefetch_store(dst);
 
       switch (w) {
         case 4:
@@ -1011,8 +1011,8 @@
                                       filter_x, h, 2);
           break;
         case 64:
-          vp9_prefetch_load(src + 64);
-          vp9_prefetch_store(dst + 32);
+          prefetch_load(src + 64);
+          prefetch_store(dst + 32);
 
           convolve_avg_horiz_64_dspr2(src, src_stride,
                                       dst, dst_stride,

diff --git a/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
index 58b50d2..a1309d1 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c

@@ -60,8 +60,8 @@
   for (y = h; y--;) {
     dst_ptr = dst;
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -176,8 +176,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
 
     dst_ptr = dst;
     odd_dst = (dst_ptr + dst_stride);
@@ -355,8 +355,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
 
     src = src_ptr;
     dst = dst_ptr;
@@ -645,9 +645,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
 
     src = src_ptr;
     dst = dst_ptr;
@@ -993,8 +993,8 @@
     src -= (src_stride * 3 + 3);
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src);
-    vp9_prefetch_load(src + 32);
+    prefetch_load(src);
+    prefetch_load(src + 32);
 
     switch (w) {
       case 4:
@@ -1015,7 +1015,7 @@
                                            (w/16));
         break;
       case 64:
-        vp9_prefetch_load(src + 32);
+        prefetch_load(src + 32);
         convolve_horiz_64_transposed_dspr2(src, src_stride,
                                            temp, intermediate_height,
                                            filter_x, intermediate_height);
@@ -1078,9 +1078,9 @@
   int x, y;
 
   /* prefetch data to cache memory */
-  vp9_prefetch_load(src);
-  vp9_prefetch_load(src + 32);
-  vp9_prefetch_store(dst);
+  prefetch_load(src);
+  prefetch_load(src + 32);
+  prefetch_store(dst);
 
   switch (w) {
     case 4:
@@ -1089,9 +1089,9 @@
 
       /* 1 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         (%[src])      \n\t"
@@ -1112,9 +1112,9 @@
 
       /* 2 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -1137,9 +1137,9 @@
 
       /* 4 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -1169,9 +1169,9 @@
 
       /* 8 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -1209,16 +1209,16 @@
       uint32_t tp1, tp2, tp3, tp4;
       uint32_t tp5, tp6, tp7, tp8;
 
-      vp9_prefetch_load(src + 64);
-      vp9_prefetch_store(dst + 32);
+      prefetch_load(src + 64);
+      prefetch_store(dst + 32);
 
       /* 16 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_load(src + src_stride + 64);
-        vp9_prefetch_store(dst + dst_stride);
-        vp9_prefetch_store(dst + dst_stride + 32);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_load(src + src_stride + 64);
+        prefetch_store(dst + dst_stride);
+        prefetch_store(dst + dst_stride + 32);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"

diff --git a/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
index 0303896..d0e3095 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c

@@ -43,9 +43,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],      0(%[src])                      \n\t"
@@ -154,9 +154,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],      0(%[src])                      \n\t"
@@ -323,9 +323,9 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_store(dst_ptr + dst_stride);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
 
     for (c = 0; c < count; c++) {
       __asm__ __volatile__ (
@@ -593,11 +593,11 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
-    vp9_prefetch_store(dst_ptr + dst_stride);
-    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
 
     for (c = 0; c < 4; c++) {
       __asm__ __volatile__ (
@@ -859,7 +859,7 @@
     if (16 == x_step_q4) {
       uint32_t pos = 38;
 
-      vp9_prefetch_load((const uint8_t *)filter_x);
+      prefetch_load((const uint8_t *)filter_x);
       src -= 3;
 
       /* bit positon for extract from acc */
@@ -870,9 +870,9 @@
       );
 
       /* prefetch data to cache memory */
-      vp9_prefetch_load(src);
-      vp9_prefetch_load(src + 32);
-      vp9_prefetch_store(dst);
+      prefetch_load(src);
+      prefetch_load(src + 32);
+      prefetch_store(dst);
 
       switch (w) {
         case 4:
@@ -896,8 +896,8 @@
                                   filter_x, (int32_t)h, 2);
           break;
         case 64:
-          vp9_prefetch_load(src + 64);
-          vp9_prefetch_store(dst + 32);
+          prefetch_load(src + 64);
+          prefetch_store(dst + 32);
 
           convolve_horiz_64_dspr2(src, (int32_t)src_stride,
                                   dst, (int32_t)dst_stride,

diff --git a/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
index 0930bb3..98acb81 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c

@@ -49,7 +49,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < w; x += 4) {
       src_ptr = src + x;
@@ -203,8 +203,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
-    vp9_prefetch_store(dst + dst_stride + 32);
+    prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride + 32);
 
     for (x = 0; x < 64; x += 4) {
       src_ptr = src + x;
@@ -358,7 +358,7 @@
         : [pos] "r" (pos)
       );
 
-      vp9_prefetch_store(dst);
+      prefetch_store(dst);
 
       switch (w) {
         case 4 :
@@ -370,7 +370,7 @@
                                 filter_y, w, h);
           break;
         case 64 :
-          vp9_prefetch_store(dst + 32);
+          prefetch_store(dst + 32);
           convolve_vert_64_dspr2(src, src_stride,
                                  dst, dst_stride,
                                  filter_y, h);

diff --git a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
index 202d913..10a24f3 100644
--- a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c

@@ -34,7 +34,7 @@
 
   for (i = no_rows; i--; ) {
     /* prefetch row */
-    vp9_prefetch_load((const uint8_t *)(input + 16));
+    prefetch_load((const uint8_t *)(input + 16));
 
     __asm__ __volatile__ (
         "lh       %[load1],              0(%[input])                    \n\t"
@@ -421,14 +421,14 @@
   uint8_t *cm = vp9_ff_cropTbl;
 
   /* prefetch vp9_ff_cropTbl */
-  vp9_prefetch_load(vp9_ff_cropTbl);
-  vp9_prefetch_load(vp9_ff_cropTbl +  32);
-  vp9_prefetch_load(vp9_ff_cropTbl +  64);
-  vp9_prefetch_load(vp9_ff_cropTbl +  96);
-  vp9_prefetch_load(vp9_ff_cropTbl + 128);
-  vp9_prefetch_load(vp9_ff_cropTbl + 160);
-  vp9_prefetch_load(vp9_ff_cropTbl + 192);
-  vp9_prefetch_load(vp9_ff_cropTbl + 224);
+  prefetch_load(vp9_ff_cropTbl);
+  prefetch_load(vp9_ff_cropTbl +  32);
+  prefetch_load(vp9_ff_cropTbl +  64);
+  prefetch_load(vp9_ff_cropTbl +  96);
+  prefetch_load(vp9_ff_cropTbl + 128);
+  prefetch_load(vp9_ff_cropTbl + 160);
+  prefetch_load(vp9_ff_cropTbl + 192);
+  prefetch_load(vp9_ff_cropTbl + 224);
 
   for (i = 0; i < 16; ++i) {
     dest_pix = (dest + i);
@@ -1124,7 +1124,7 @@
 
       for (i = 0; i < 16; ++i) {
         /* prefetch row */
-        vp9_prefetch_load((const uint8_t *)(input + 16));
+        prefetch_load((const uint8_t *)(input + 16));
 
         iadst16(input, outptr);
         input += 16;
@@ -1144,7 +1144,7 @@
 
       for (i = 0; i < 16; ++i) {
         /* prefetch row */
-        vp9_prefetch_load((const uint8_t *)(input + 16));
+        prefetch_load((const uint8_t *)(input + 16));
 
         iadst16(input, outptr);
         input += 16;

diff --git a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
index 7ceebb6..a256145 100644
--- a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c

@@ -44,14 +44,14 @@
   uint8_t *cm = vp9_ff_cropTbl;
 
   /* prefetch vp9_ff_cropTbl */
-  vp9_prefetch_load(vp9_ff_cropTbl);
-  vp9_prefetch_load(vp9_ff_cropTbl +  32);
-  vp9_prefetch_load(vp9_ff_cropTbl +  64);
-  vp9_prefetch_load(vp9_ff_cropTbl +  96);
-  vp9_prefetch_load(vp9_ff_cropTbl + 128);
-  vp9_prefetch_load(vp9_ff_cropTbl + 160);
-  vp9_prefetch_load(vp9_ff_cropTbl + 192);
-  vp9_prefetch_load(vp9_ff_cropTbl + 224);
+  prefetch_load(vp9_ff_cropTbl);
+  prefetch_load(vp9_ff_cropTbl +  32);
+  prefetch_load(vp9_ff_cropTbl +  64);
+  prefetch_load(vp9_ff_cropTbl +  96);
+  prefetch_load(vp9_ff_cropTbl + 128);
+  prefetch_load(vp9_ff_cropTbl + 160);
+  prefetch_load(vp9_ff_cropTbl + 192);
+  prefetch_load(vp9_ff_cropTbl + 224);
 
   for (i = 0; i < 32; ++i) {
     dest_pix = dest + i;

diff --git a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
index 74a90b0..dd18831 100644
--- a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c

@@ -96,8 +96,8 @@
     }
 
     /* prefetch row */
-    vp9_prefetch_load((const uint8_t *)(input + 32));
-    vp9_prefetch_load((const uint8_t *)(input + 48));
+    prefetch_load((const uint8_t *)(input + 32));
+    prefetch_load((const uint8_t *)(input + 48));
 
     __asm__ __volatile__ (
         "lh       %[load1],             2(%[input])                     \n\t"

diff --git a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
index 280190a..4e31f9f 100644
--- a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c

@@ -115,14 +115,14 @@
   uint8_t   *cm = vp9_ff_cropTbl;
 
   /* prefetch vp9_ff_cropTbl */
-  vp9_prefetch_load(vp9_ff_cropTbl);
-  vp9_prefetch_load(vp9_ff_cropTbl +  32);
-  vp9_prefetch_load(vp9_ff_cropTbl +  64);
-  vp9_prefetch_load(vp9_ff_cropTbl +  96);
-  vp9_prefetch_load(vp9_ff_cropTbl + 128);
-  vp9_prefetch_load(vp9_ff_cropTbl + 160);
-  vp9_prefetch_load(vp9_ff_cropTbl + 192);
-  vp9_prefetch_load(vp9_ff_cropTbl + 224);
+  prefetch_load(vp9_ff_cropTbl);
+  prefetch_load(vp9_ff_cropTbl +  32);
+  prefetch_load(vp9_ff_cropTbl +  64);
+  prefetch_load(vp9_ff_cropTbl +  96);
+  prefetch_load(vp9_ff_cropTbl + 128);
+  prefetch_load(vp9_ff_cropTbl + 160);
+  prefetch_load(vp9_ff_cropTbl + 192);
+  prefetch_load(vp9_ff_cropTbl + 224);
 
   for (i = 0; i < 4; ++i) {
       dest_pix = (dest + i);

diff --git a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
index 04d2266..6898d56 100644
--- a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c

@@ -211,14 +211,14 @@
   uint8_t *cm = vp9_ff_cropTbl;
 
   /* prefetch vp9_ff_cropTbl */
-  vp9_prefetch_load(vp9_ff_cropTbl);
-  vp9_prefetch_load(vp9_ff_cropTbl +  32);
-  vp9_prefetch_load(vp9_ff_cropTbl +  64);
-  vp9_prefetch_load(vp9_ff_cropTbl +  96);
-  vp9_prefetch_load(vp9_ff_cropTbl + 128);
-  vp9_prefetch_load(vp9_ff_cropTbl + 160);
-  vp9_prefetch_load(vp9_ff_cropTbl + 192);
-  vp9_prefetch_load(vp9_ff_cropTbl + 224);
+  prefetch_load(vp9_ff_cropTbl);
+  prefetch_load(vp9_ff_cropTbl +  32);
+  prefetch_load(vp9_ff_cropTbl +  64);
+  prefetch_load(vp9_ff_cropTbl +  96);
+  prefetch_load(vp9_ff_cropTbl + 128);
+  prefetch_load(vp9_ff_cropTbl + 160);
+  prefetch_load(vp9_ff_cropTbl + 192);
+  prefetch_load(vp9_ff_cropTbl + 224);
 
   for (i = 0; i < 8; ++i) {
       dest_pix = (dest + i);

diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index fd8fe3f..d776b44 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h

@@ -128,6 +128,11 @@
   ENTROPY_CONTEXT *left_context;
   int16_t seg_dequant[MAX_SEGMENTS][2];
 
+  // number of 4x4s in current block
+  uint16_t n4_w, n4_h;
+  // log2 of n4_w, n4_h
+  uint8_t n4_wl, n4_hl;
+
   // encoder
   const int16_t *dequant;
 };
@@ -144,6 +149,8 @@
 
 typedef struct macroblockd {
   struct macroblockd_plane plane[MAX_MB_PLANE];
+  uint8_t bmode_blocks_wl;
+  uint8_t bmode_blocks_hl;
 
   FRAME_COUNTS *counts;
   TileInfo tile;
@@ -159,7 +166,7 @@
   int up_available;
   int left_available;
 
-  const vp9_prob (*partition_probs)[PARTITION_TYPES - 1];
+  const vpx_prob (*partition_probs)[PARTITION_TYPES - 1];
 
   /* Distance of MB away from frame edges */
   int mb_to_left_edge;
@@ -187,12 +194,6 @@
   int bd;
 #endif
 
-  /* dqcoeff are shared by all the planes. So planes must be decoded serially */
-#if CONFIG_VP9_ENCODER
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[64 * 64]);
-#else
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
-#endif
   int lossless;
   int corrupted;
 
@@ -261,7 +262,7 @@
   }
 }
 
-static INLINE const vp9_prob *get_y_mode_probs(const MODE_INFO *mi,
+static INLINE const vpx_prob *get_y_mode_probs(const MODE_INFO *mi,
                                                const MODE_INFO *above_mi,
                                                const MODE_INFO *left_mi,
                                                int block) {

diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 9c2d779..42c3a09 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h

@@ -16,6 +16,7 @@
 #include <assert.h>
 
 #include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_systemdependent.h"
@@ -24,9 +25,6 @@
 extern "C" {
 #endif
 
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))
-#define MAX(x, y) (((x) > (y)) ? (x) : (y))
-
 // Only need this for fixed-size arrays, for structs just assign.
 #define vp9_copy(dest, src) {            \
     assert(sizeof(dest) == sizeof(src)); \
@@ -42,18 +40,6 @@
 #define vp9_zero(dest) memset(&(dest), 0, sizeof(dest))
 #define vp9_zero_array(dest, n) memset(dest, 0, n * sizeof(*dest))
 
-static INLINE uint8_t clip_pixel(int val) {
-  return (val > 255) ? 255 : (val < 0) ? 0 : val;
-}
-
-static INLINE int clamp(int value, int low, int high) {
-  return value < low ? low : (value > high ? high : value);
-}
-
-static INLINE double fclamp(double value, double low, double high) {
-  return value < low ? low : (value > high ? high : value);
-}
-
 static INLINE int get_unsigned_bits(unsigned int num_values) {
   return num_values > 0 ? get_msb(num_values) + 1 : 0;
 }
@@ -70,20 +56,6 @@
       return (uint16_t)clamp(val, 0, 4095);
   }
 }
-
-// Note:
-// tran_low_t  is the datatype used for final transform coefficients.
-// tran_high_t is the datatype used for intermediate transform stages.
-typedef int64_t tran_high_t;
-typedef int32_t tran_low_t;
-
-#else
-
-// Note:
-// tran_low_t  is the datatype used for final transform coefficients.
-// tran_high_t is the datatype used for intermediate transform stages.
-typedef int32_t tran_high_t;
-typedef int16_t tran_low_t;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_DEBUG

diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index ad6c04b..579857b 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c

@@ -16,7 +16,7 @@
 #include "vpx/vpx_integer.h"
 
 // Unconstrained Node Tree
-const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+const vpx_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
   2, 6,                                // 0 = LOW_VAL
   -TWO_TOKEN, 4,                       // 1 = TWO
   -THREE_TOKEN, -FOUR_TOKEN,           // 2 = THREE
@@ -27,30 +27,30 @@
   -CATEGORY5_TOKEN, -CATEGORY6_TOKEN   // 7 = CAT_FIVE
 };
 
-const vp9_prob vp9_cat1_prob[] = { 159 };
-const vp9_prob vp9_cat2_prob[] = { 165, 145 };
-const vp9_prob vp9_cat3_prob[] = { 173, 148, 140 };
-const vp9_prob vp9_cat4_prob[] = { 176, 155, 140, 135 };
-const vp9_prob vp9_cat5_prob[] = { 180, 157, 141, 134, 130 };
-const vp9_prob vp9_cat6_prob[] = {
+const vpx_prob vp9_cat1_prob[] = { 159 };
+const vpx_prob vp9_cat2_prob[] = { 165, 145 };
+const vpx_prob vp9_cat3_prob[] = { 173, 148, 140 };
+const vpx_prob vp9_cat4_prob[] = { 176, 155, 140, 135 };
+const vpx_prob vp9_cat5_prob[] = { 180, 157, 141, 134, 130 };
+const vpx_prob vp9_cat6_prob[] = {
     254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
 };
 #if CONFIG_VP9_HIGHBITDEPTH
-const vp9_prob vp9_cat1_prob_high10[] = { 159 };
-const vp9_prob vp9_cat2_prob_high10[] = { 165, 145 };
-const vp9_prob vp9_cat3_prob_high10[] = { 173, 148, 140 };
-const vp9_prob vp9_cat4_prob_high10[] = { 176, 155, 140, 135 };
-const vp9_prob vp9_cat5_prob_high10[] = { 180, 157, 141, 134, 130 };
-const vp9_prob vp9_cat6_prob_high10[] = {
+const vpx_prob vp9_cat1_prob_high10[] = { 159 };
+const vpx_prob vp9_cat2_prob_high10[] = { 165, 145 };
+const vpx_prob vp9_cat3_prob_high10[] = { 173, 148, 140 };
+const vpx_prob vp9_cat4_prob_high10[] = { 176, 155, 140, 135 };
+const vpx_prob vp9_cat5_prob_high10[] = { 180, 157, 141, 134, 130 };
+const vpx_prob vp9_cat6_prob_high10[] = {
     255, 255, 254, 254, 254, 252, 249, 243,
     230, 196, 177, 153, 140, 133, 130, 129
 };
-const vp9_prob vp9_cat1_prob_high12[] = { 159 };
-const vp9_prob vp9_cat2_prob_high12[] = { 165, 145 };
-const vp9_prob vp9_cat3_prob_high12[] = { 173, 148, 140 };
-const vp9_prob vp9_cat4_prob_high12[] = { 176, 155, 140, 135 };
-const vp9_prob vp9_cat5_prob_high12[] = { 180, 157, 141, 134, 130 };
-const vp9_prob vp9_cat6_prob_high12[] = {
+const vpx_prob vp9_cat1_prob_high12[] = { 159 };
+const vpx_prob vp9_cat2_prob_high12[] = { 165, 145 };
+const vpx_prob vp9_cat3_prob_high12[] = { 173, 148, 140 };
+const vpx_prob vp9_cat4_prob_high12[] = { 176, 155, 140, 135 };
+const vpx_prob vp9_cat5_prob_high12[] = { 180, 157, 141, 134, 130 };
+const vpx_prob vp9_cat6_prob_high12[] = {
     255, 255, 255, 255, 254, 254, 254, 252, 249,
     243, 230, 196, 177, 153, 140, 133, 130, 129
 };
@@ -147,7 +147,7 @@
 // by averaging :
 // vp9_pareto8_full[l][node] = (vp9_pareto8_full[l-1][node] +
 //                              vp9_pareto8_full[l+1][node] ) >> 1;
-const vp9_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES] = {
+const vpx_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES] = {
   {  3,  86, 128,   6,  86,  23,  88,  29},
   {  6,  86, 128,  11,  87,  42,  91,  52},
   {  9,  86, 129,  17,  88,  61,  94,  76},
@@ -742,14 +742,14 @@
   }
 };
 
-static void extend_to_full_distribution(vp9_prob *probs, vp9_prob p) {
+static void extend_to_full_distribution(vpx_prob *probs, vpx_prob p) {
   memcpy(probs, vp9_pareto8_full[p = 0 ? 0 : p - 1],
-         MODEL_NODES * sizeof(vp9_prob));
+         MODEL_NODES * sizeof(vpx_prob));
 }
 
-void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) {
+void vp9_model_to_full_probs(const vpx_prob *model, vpx_prob *full) {
   if (full != model)
-    memcpy(full, model, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+    memcpy(full, model, sizeof(vpx_prob) * UNCONSTRAINED_NODES);
   extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
 }
 

diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 2fc97c3..a1746bc 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h

@@ -12,10 +12,10 @@
 #define VP9_COMMON_VP9_ENTROPY_H_
 
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/prob.h"
 
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_prob.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -76,8 +76,8 @@
 #define EOB_MODEL_TOKEN 3
 
 typedef struct {
-  const vp9_tree_index *tree;
-  const vp9_prob *prob;
+  const vpx_tree_index *tree;
+  const vpx_prob *prob;
   int len;
   int base_val;
   const int16_t *cost;
@@ -160,17 +160,17 @@
 #define PIVOT_NODE                  2   // which node is pivot
 
 #define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
-extern const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)];
-extern const vp9_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES];
+extern const vpx_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)];
+extern const vpx_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES];
 
-typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS]
+typedef vpx_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS]
                                       [COEFF_CONTEXTS][UNCONSTRAINED_NODES];
 
 typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS]
                                           [COEFF_CONTEXTS]
                                           [UNCONSTRAINED_NODES + 1];
 
-void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
+void vp9_model_to_full_probs(const vpx_prob *model, vpx_prob *full);
 
 typedef char ENTROPY_CONTEXT;
 

diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 22d431b..670348b 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c

@@ -13,7 +13,7 @@
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_seg_common.h"
 
-const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] = {
+const vpx_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] = {
   {  // above = dc
     { 137,  30,  42, 148, 151, 207,  70,  52,  91 },  // left = dc
     {  92,  45, 102, 136, 116, 180,  74,  90, 100 },  // left = v
@@ -127,7 +127,7 @@
   }
 };
 
-const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1] = {
+const vpx_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1] = {
   { 144,  11,  54, 157, 195, 130,  46,  58, 108 },  // y = dc
   { 118,  15, 123, 148, 131, 101,  44,  93, 131 },  // y = v
   { 113,  12,  23, 188, 226, 142,  26,  32, 125 },  // y = h
@@ -140,14 +140,14 @@
   { 102,  19,  66, 162, 182, 122,  35,  59, 128 }   // y = tm
 };
 
-static const vp9_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = {
+static const vpx_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = {
   {  65,  32,  18, 144, 162, 194,  41,  51,  98 },  // block_size < 8x8
   { 132,  68,  18, 165, 217, 196,  45,  40,  78 },  // block_size < 16x16
   { 173,  80,  19, 176, 240, 193,  64,  35,  46 },  // block_size < 32x32
   { 221, 135,  38, 194, 248, 121,  96,  85,  29 }   // block_size >= 32x32
 };
 
-static const vp9_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = {
+static const vpx_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = {
   { 120,   7,  76, 176, 208, 126,  28,  54, 103 },  // y = dc
   {  48,  12, 154, 155, 139,  90,  34, 117, 119 },  // y = v
   {  67,   6,  25, 204, 243, 158,  13,  21,  96 },  // y = h
@@ -160,7 +160,7 @@
   { 101,  21, 107, 181, 192, 103,  19,  67, 125 }   // y = tm
 };
 
-const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
+const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
                                      [PARTITION_TYPES - 1] = {
   // 8x8 -> 4x4
   { 158,  97,  94 },  // a/l both not split
@@ -184,7 +184,7 @@
   {  12,   3,   3 },  // a/l both split
 };
 
-static const vp9_prob default_partition_probs[PARTITION_CONTEXTS]
+static const vpx_prob default_partition_probs[PARTITION_CONTEXTS]
                                              [PARTITION_TYPES - 1] = {
   // 8x8 -> 4x4
   { 199, 122, 141 },  // a/l both not split
@@ -208,7 +208,7 @@
   {  10,   7,   6 },  // a/l both split
 };
 
-static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
+static const vpx_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
                                               [INTER_MODES - 1] = {
   {2,       173,   34},  // 0 = both zero mv
   {7,       145,   85},  // 1 = one zero mv + one a predicted mv
@@ -220,7 +220,7 @@
 };
 
 /* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
-const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
+const vpx_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
   -DC_PRED, 2,                      /* 0 = DC_NODE */
   -TM_PRED, 4,                      /* 1 = TM_NODE */
   -V_PRED, 6,                       /* 2 = V_NODE */
@@ -232,31 +232,31 @@
   -D153_PRED, -D207_PRED             /* 8 = D153_NODE */
 };
 
-const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
+const vpx_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
   -INTER_OFFSET(ZEROMV), 2,
   -INTER_OFFSET(NEARESTMV), 4,
   -INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV)
 };
 
-const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
+const vpx_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
   -PARTITION_NONE, 2,
   -PARTITION_HORZ, 4,
   -PARTITION_VERT, -PARTITION_SPLIT
 };
 
-static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
+static const vpx_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
   9, 102, 187, 225
 };
 
-static const vp9_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = {
+static const vpx_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = {
   239, 183, 119,  96,  41
 };
 
-static const vp9_prob default_comp_ref_p[REF_CONTEXTS] = {
+static const vpx_prob default_comp_ref_p[REF_CONTEXTS] = {
   50, 126, 123, 221, 226
 };
 
-static const vp9_prob default_single_ref_p[REF_CONTEXTS][2] = {
+static const vpx_prob default_single_ref_p[REF_CONTEXTS][2] = {
   {  33,  16 },
   {  77,  74 },
   { 142, 142 },
@@ -302,11 +302,11 @@
   ct_8x8p[0][1] = tx_count_8x8p[TX_8X8];
 }
 
-static const vp9_prob default_skip_probs[SKIP_CONTEXTS] = {
+static const vpx_prob default_skip_probs[SKIP_CONTEXTS] = {
   192, 128, 64
 };
 
-static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+static const vpx_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
                                                     [SWITCHABLE_FILTERS - 1] = {
   { 235, 162, },
   { 36, 255, },
@@ -328,7 +328,7 @@
   vp9_copy(fc->inter_mode_probs, default_inter_mode_probs);
 }
 
-const vp9_tree_index vp9_switchable_interp_tree
+const vpx_tree_index vp9_switchable_interp_tree
                          [TREE_SIZE(SWITCHABLE_FILTERS)] = {
   -EIGHTTAP, 2,
   -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
@@ -355,24 +355,24 @@
           pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]);
 
   for (i = 0; i < INTER_MODE_CONTEXTS; i++)
-    vp9_tree_merge_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
+    vpx_tree_merge_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
                 counts->inter_mode[i], fc->inter_mode_probs[i]);
 
   for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
-    vp9_tree_merge_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
+    vpx_tree_merge_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
                 counts->y_mode[i], fc->y_mode_prob[i]);
 
   for (i = 0; i < INTRA_MODES; ++i)
-    vp9_tree_merge_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
+    vpx_tree_merge_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
                          counts->uv_mode[i], fc->uv_mode_prob[i]);
 
   for (i = 0; i < PARTITION_CONTEXTS; i++)
-    vp9_tree_merge_probs(vp9_partition_tree, pre_fc->partition_prob[i],
+    vpx_tree_merge_probs(vp9_partition_tree, pre_fc->partition_prob[i],
                          counts->partition[i], fc->partition_prob[i]);
 
   if (cm->interp_filter == SWITCHABLE) {
     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-      vp9_tree_merge_probs(vp9_switchable_interp_tree,
+      vpx_tree_merge_probs(vp9_switchable_interp_tree,
                            pre_fc->switchable_interp_prob[i],
                            counts->switchable_interp[i],
                            fc->switchable_interp_prob[i]);

diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index 8c9e6a7..371738a 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h

@@ -28,9 +28,9 @@
 struct VP9Common;
 
 struct tx_probs {
-  vp9_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1];
-  vp9_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2];
-  vp9_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3];
+  vpx_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1];
+  vpx_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2];
+  vpx_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3];
 };
 
 struct tx_counts {
@@ -41,19 +41,19 @@
 };
 
 typedef struct frame_contexts {
-  vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
-  vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
-  vp9_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
+  vpx_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
+  vpx_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
+  vpx_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
   vp9_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES];
-  vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+  vpx_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
                                  [SWITCHABLE_FILTERS - 1];
-  vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
-  vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
-  vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS];
-  vp9_prob single_ref_prob[REF_CONTEXTS][2];
-  vp9_prob comp_ref_prob[REF_CONTEXTS];
+  vpx_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
+  vpx_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
+  vpx_prob comp_inter_prob[COMP_INTER_CONTEXTS];
+  vpx_prob single_ref_prob[REF_CONTEXTS][2];
+  vpx_prob comp_ref_prob[REF_CONTEXTS];
   struct tx_probs tx_probs;
-  vp9_prob skip_probs[SKIP_CONTEXTS];
+  vpx_prob skip_probs[SKIP_CONTEXTS];
   nmv_context nmvc;
   int initialized;
 } FRAME_CONTEXT;
@@ -77,15 +77,15 @@
   nmv_context_counts mv;
 } FRAME_COUNTS;
 
-extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
-extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
+extern const vpx_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
+extern const vpx_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
                                         [INTRA_MODES - 1];
-extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
+extern const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
                                             [PARTITION_TYPES - 1];
-extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
-extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
-extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
-extern const vp9_tree_index vp9_switchable_interp_tree
+extern const vpx_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
+extern const vpx_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
+extern const vpx_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
+extern const vpx_tree_index vp9_switchable_interp_tree
                                 [TREE_SIZE(SWITCHABLE_FILTERS)];
 
 void vp9_setup_past_independence(struct VP9Common *cm);

diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index 76cdb05..3acfe14 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c

@@ -14,13 +14,13 @@
 // Integer pel reference mv threshold for use of high-precision 1/8 mv
 #define COMPANDED_MVREF_THRESH 8
 
-const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = {
+const vpx_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = {
   -MV_JOINT_ZERO, 2,
   -MV_JOINT_HNZVZ, 4,
   -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
 };
 
-const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
+const vpx_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
   -MV_CLASS_0, 2,
   -MV_CLASS_1, 4,
   6, 8,
@@ -33,11 +33,11 @@
   -MV_CLASS_9, -MV_CLASS_10,
 };
 
-const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
+const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
   -0, -1,
 };
 
-const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = {
+const vpx_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = {
   -0, 2,
   -1, 4,
   -2, -3
@@ -183,7 +183,7 @@
   const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc;
   const nmv_context_counts *counts = &cm->counts.mv;
 
-  vp9_tree_merge_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints,
+  vpx_tree_merge_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints,
                        fc->joints);
 
   for (i = 0; i < 2; ++i) {
@@ -192,19 +192,19 @@
     const nmv_component_counts *c = &counts->comps[i];
 
     comp->sign = mode_mv_merge_probs(pre_comp->sign, c->sign);
-    vp9_tree_merge_probs(vp9_mv_class_tree, pre_comp->classes, c->classes,
+    vpx_tree_merge_probs(vp9_mv_class_tree, pre_comp->classes, c->classes,
                          comp->classes);
-    vp9_tree_merge_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0,
+    vpx_tree_merge_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0,
                          comp->class0);
 
     for (j = 0; j < MV_OFFSET_BITS; ++j)
       comp->bits[j] = mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]);
 
     for (j = 0; j < CLASS0_SIZE; ++j)
-      vp9_tree_merge_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j],
+      vpx_tree_merge_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j],
                            c->class0_fp[j], comp->class0_fp[j]);
 
-    vp9_tree_merge_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
+    vpx_tree_merge_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
 
     if (allow_hp) {
       comp->class0_hp = mode_mv_merge_probs(pre_comp->class0_hp, c->class0_hp);

diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h
index 637f451..8c817bf 100644
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h

@@ -14,8 +14,9 @@
 
 #include "./vpx_config.h"
 
+#include "vpx_dsp/prob.h"
+
 #include "vp9/common/vp9_mv.h"
-#include "vp9/common/vp9_prob.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -76,24 +77,24 @@
 #define MV_UPP   ((1 << MV_IN_USE_BITS) - 1)
 #define MV_LOW   (-(1 << MV_IN_USE_BITS))
 
-extern const vp9_tree_index vp9_mv_joint_tree[];
-extern const vp9_tree_index vp9_mv_class_tree[];
-extern const vp9_tree_index vp9_mv_class0_tree[];
-extern const vp9_tree_index vp9_mv_fp_tree[];
+extern const vpx_tree_index vp9_mv_joint_tree[];
+extern const vpx_tree_index vp9_mv_class_tree[];
+extern const vpx_tree_index vp9_mv_class0_tree[];
+extern const vpx_tree_index vp9_mv_fp_tree[];
 
 typedef struct {
-  vp9_prob sign;
-  vp9_prob classes[MV_CLASSES - 1];
-  vp9_prob class0[CLASS0_SIZE - 1];
-  vp9_prob bits[MV_OFFSET_BITS];
-  vp9_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1];
-  vp9_prob fp[MV_FP_SIZE - 1];
-  vp9_prob class0_hp;
-  vp9_prob hp;
+  vpx_prob sign;
+  vpx_prob classes[MV_CLASSES - 1];
+  vpx_prob class0[CLASS0_SIZE - 1];
+  vpx_prob bits[MV_OFFSET_BITS];
+  vpx_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1];
+  vpx_prob fp[MV_FP_SIZE - 1];
+  vpx_prob class0_hp;
+  vpx_prob hp;
 } nmv_component;
 
 typedef struct {
-  vp9_prob joints[MV_JOINTS - 1];
+  vpx_prob joints[MV_JOINTS - 1];
   nmv_component comps[2];
 } nmv_context;
 

diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index cee1682..cbce2dd 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h

@@ -37,6 +37,10 @@
   _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
                 (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
 
+#define octa_set_epi16(a, b, c, d, e, f, g, h) \
+  _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
+                 (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
+
 // Constants:
 //  for (int i = 1; i< 32; ++i)
 //    printf("static const int cospi_%d_64 = %.0f;\n", i,
@@ -158,7 +162,7 @@
 // bd of x uses trans_low with 8+x bits, need to remove 24-x bits
 #define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
 #else
-#define WRAPLOW(x, bd) (x)
+#define WRAPLOW(x, bd) ((int32_t)(x))
 #endif  // CONFIG_EMULATE_HARDWARE
 
 void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,

diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 9816728..0915918 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c

@@ -9,6 +9,7 @@
  */
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_reconinter.h"
@@ -326,55 +327,55 @@
     if (mask & 1) {
       if ((mask_16x16_0 | mask_16x16_1) & 1) {
         if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          vp9_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                    lfi0->hev_thr);
         } else if (mask_16x16_0 & 1) {
-          vp9_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
                               lfi0->hev_thr);
         } else {
-          vp9_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
+          vpx_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
                               lfi1->lim, lfi1->hev_thr);
         }
       }
 
       if ((mask_8x8_0 | mask_8x8_1) & 1) {
         if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          vp9_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                   lfi1->hev_thr);
         } else if (mask_8x8_0 & 1) {
-          vp9_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+          vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
                              1);
         } else {
-          vp9_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
+          vpx_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
                              lfi1->hev_thr, 1);
         }
       }
 
       if ((mask_4x4_0 | mask_4x4_1) & 1) {
         if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          vp9_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                   lfi1->hev_thr);
         } else if (mask_4x4_0 & 1) {
-          vp9_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+          vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
                              1);
         } else {
-          vp9_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
+          vpx_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
                              lfi1->hev_thr, 1);
         }
       }
 
       if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
         if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
-          vp9_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                   lfi1->hev_thr);
         } else if (mask_4x4_int_0 & 1) {
-          vp9_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
                              lfi0->hev_thr, 1);
         } else {
-          vp9_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,
+          vpx_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,
                              lfi1->hev_thr, 1);
         }
       }
@@ -426,55 +427,55 @@
     if (mask & 1) {
       if ((mask_16x16_0 | mask_16x16_1) & 1) {
         if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          vp9_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                           lfi0->hev_thr, bd);
         } else if (mask_16x16_0 & 1) {
-          vp9_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
                                      lfi0->hev_thr, bd);
         } else {
-          vp9_highbd_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
+          vpx_highbd_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
                                      lfi1->lim, lfi1->hev_thr, bd);
         }
       }
 
       if ((mask_8x8_0 | mask_8x8_1) & 1) {
         if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          vp9_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                          lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                          lfi1->hev_thr, bd);
         } else if (mask_8x8_0 & 1) {
-          vp9_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
                                     lfi0->hev_thr, 1, bd);
         } else {
-          vp9_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
+          vpx_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
                                     lfi1->lim, lfi1->hev_thr, 1, bd);
         }
       }
 
       if ((mask_4x4_0 | mask_4x4_1) & 1) {
         if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          vp9_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                          lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                          lfi1->hev_thr, bd);
         } else if (mask_4x4_0 & 1) {
-          vp9_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
                                     lfi0->hev_thr, 1, bd);
         } else {
-          vp9_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
+          vpx_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
                                     lfi1->lim, lfi1->hev_thr, 1, bd);
         }
       }
 
       if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
         if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
-          vp9_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
                                          lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                          lfi1->hev_thr, bd);
         } else if (mask_4x4_int_0 & 1) {
-          vp9_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
                                     lfi0->hev_thr, 1, bd);
         } else {
-          vp9_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
+          vpx_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
                                     lfi1->lim, lfi1->hev_thr, 1, bd);
         }
       }
@@ -512,11 +513,11 @@
     if (mask & 1) {
       if (mask_16x16 & 1) {
         if ((mask_16x16 & 3) == 3) {
-          vp9_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
                                 lfi->hev_thr, 2);
           count = 2;
         } else {
-          vp9_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
                                 lfi->hev_thr, 1);
         }
       } else if (mask_8x8 & 1) {
@@ -524,28 +525,28 @@
           // Next block's thresholds.
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
-          vp9_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+          vpx_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, lfin->mblim, lfin->lim,
                                     lfin->hev_thr);
 
           if ((mask_4x4_int & 3) == 3) {
-            vp9_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
                                       lfi->lim, lfi->hev_thr, lfin->mblim,
                                       lfin->lim, lfin->hev_thr);
           } else {
             if (mask_4x4_int & 1)
-              vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+              vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                    lfi->hev_thr, 1);
             else if (mask_4x4_int & 2)
-              vp9_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+              vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
                                    lfin->lim, lfin->hev_thr, 1);
           }
           count = 2;
         } else {
-          vp9_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+          vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
           if (mask_4x4_int & 1)
-            vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+            vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                  lfi->hev_thr, 1);
         }
       } else if (mask_4x4 & 1) {
@@ -553,31 +554,31 @@
           // Next block's thresholds.
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
-          vp9_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+          vpx_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, lfin->mblim, lfin->lim,
                                     lfin->hev_thr);
           if ((mask_4x4_int & 3) == 3) {
-            vp9_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
                                       lfi->lim, lfi->hev_thr, lfin->mblim,
                                       lfin->lim, lfin->hev_thr);
           } else {
             if (mask_4x4_int & 1)
-              vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+              vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                    lfi->hev_thr, 1);
             else if (mask_4x4_int & 2)
-              vp9_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+              vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
                                    lfin->lim, lfin->hev_thr, 1);
           }
           count = 2;
         } else {
-          vp9_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
           if (mask_4x4_int & 1)
-            vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+            vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                  lfi->hev_thr, 1);
         }
       } else if (mask_4x4_int & 1) {
-        vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+        vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                              lfi->hev_thr, 1);
       }
     }
@@ -609,11 +610,11 @@
     if (mask & 1) {
       if (mask_16x16 & 1) {
         if ((mask_16x16 & 3) == 3) {
-          vp9_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
                                        lfi->hev_thr, 2, bd);
           count = 2;
         } else {
-          vp9_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
                                        lfi->hev_thr, 1, bd);
         }
       } else if (mask_8x8 & 1) {
@@ -621,31 +622,31 @@
           // Next block's thresholds.
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
-          vp9_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+          vpx_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
                                            lfi->hev_thr, lfin->mblim, lfin->lim,
                                            lfin->hev_thr, bd);
 
           if ((mask_4x4_int & 3) == 3) {
-            vp9_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
                                              lfi->lim, lfi->hev_thr,
                                              lfin->mblim, lfin->lim,
                                              lfin->hev_thr, bd);
           } else {
             if (mask_4x4_int & 1) {
-              vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+              vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
                                           lfi->lim, lfi->hev_thr, 1, bd);
             } else if (mask_4x4_int & 2) {
-              vp9_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+              vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
                                           lfin->lim, lfin->hev_thr, 1, bd);
             }
           }
           count = 2;
         } else {
-          vp9_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim,
+          vpx_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim,
                                       lfi->hev_thr, 1, bd);
 
           if (mask_4x4_int & 1) {
-            vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
                                         lfi->lim, lfi->hev_thr, 1, bd);
           }
         }
@@ -654,35 +655,35 @@
           // Next block's thresholds.
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
-          vp9_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+          vpx_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                            lfi->hev_thr, lfin->mblim, lfin->lim,
                                            lfin->hev_thr, bd);
           if ((mask_4x4_int & 3) == 3) {
-            vp9_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
                                              lfi->lim, lfi->hev_thr,
                                              lfin->mblim, lfin->lim,
                                              lfin->hev_thr, bd);
           } else {
             if (mask_4x4_int & 1) {
-              vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+              vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
                                           lfi->lim, lfi->hev_thr, 1, bd);
             } else if (mask_4x4_int & 2) {
-              vp9_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+              vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
                                           lfin->lim, lfin->hev_thr, 1, bd);
             }
           }
           count = 2;
         } else {
-          vp9_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
+          vpx_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
                                       lfi->hev_thr, 1, bd);
 
           if (mask_4x4_int & 1) {
-            vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
                                         lfi->lim, lfi->hev_thr, 1, bd);
           }
         }
       } else if (mask_4x4_int & 1) {
-        vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+        vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, 1, bd);
       }
     }
@@ -1093,15 +1094,15 @@
 
     if (mask & 1) {
       if (mask_16x16 & 1) {
-        vp9_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+        vpx_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
       } else if (mask_8x8 & 1) {
-        vp9_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+        vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
       } else if (mask_4x4 & 1) {
-        vp9_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+        vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
       }
     }
     if (mask_4x4_int & 1)
-      vp9_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+      vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
     s += 8;
     lfl += 1;
     mask_16x16 >>= 1;
@@ -1127,18 +1128,18 @@
 
     if (mask & 1) {
       if (mask_16x16 & 1) {
-        vp9_highbd_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim,
+        vpx_highbd_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim,
                                    lfi->hev_thr, bd);
       } else if (mask_8x8 & 1) {
-        vp9_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim,
+        vpx_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim,
                                   lfi->hev_thr, 1, bd);
       } else if (mask_4x4 & 1) {
-        vp9_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim,
+        vpx_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim,
                                 lfi->hev_thr, 1, bd);
       }
     }
     if (mask_4x4_int & 1)
-      vp9_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim,
+      vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim,
                                 lfi->hev_thr, 1, bd);
     s += 8;
     lfl += 1;

diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 8f1240a..c373c02 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h

@@ -345,14 +345,15 @@
   xd->partition_probs =
       frame_is_intra_only(cm) ?
           &vp9_kf_partition_probs[0] :
-          (const vp9_prob (*)[PARTITION_TYPES - 1])cm->fc->partition_prob;
+          (const vpx_prob (*)[PARTITION_TYPES - 1])cm->fc->partition_prob;
 }
 
-static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) {
+static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                        tran_low_t *dqcoeff) {
   int i;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
-    xd->plane[i].dqcoeff = xd->dqcoeff;
+    xd->plane[i].dqcoeff = dqcoeff;
     xd->above_context[i] = cm->above_context +
         i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols);
 
@@ -372,7 +373,7 @@
   set_partition_probs(cm, xd);
 }
 
-static INLINE const vp9_prob* get_partition_probs(const MACROBLOCKD *xd,
+static INLINE const vpx_prob* get_partition_probs(const MACROBLOCKD *xd,
                                                   int ctx) {
   return xd->partition_probs[ctx];
 }

diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index 7616144..67b95db 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h

@@ -47,7 +47,7 @@
   return above_sip + left_sip;
 }
 
-static INLINE vp9_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg,
+static INLINE vpx_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg,
                                                 const MACROBLOCKD *xd) {
   return seg->pred_probs[vp9_get_pred_context_seg_id(xd)];
 }
@@ -60,7 +60,7 @@
   return above_skip + left_skip;
 }
 
-static INLINE vp9_prob vp9_get_skip_prob(const VP9_COMMON *cm,
+static INLINE vpx_prob vp9_get_skip_prob(const VP9_COMMON *cm,
                                          const MACROBLOCKD *xd) {
   return cm->fc->skip_probs[vp9_get_skip_context(xd)];
 }
@@ -69,14 +69,14 @@
 
 int vp9_get_intra_inter_context(const MACROBLOCKD *xd);
 
-static INLINE vp9_prob vp9_get_intra_inter_prob(const VP9_COMMON *cm,
+static INLINE vpx_prob vp9_get_intra_inter_prob(const VP9_COMMON *cm,
                                                 const MACROBLOCKD *xd) {
   return cm->fc->intra_inter_prob[vp9_get_intra_inter_context(xd)];
 }
 
 int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd);
 
-static INLINE vp9_prob vp9_get_reference_mode_prob(const VP9_COMMON *cm,
+static INLINE vpx_prob vp9_get_reference_mode_prob(const VP9_COMMON *cm,
                                                    const MACROBLOCKD *xd) {
   return cm->fc->comp_inter_prob[vp9_get_reference_mode_context(cm, xd)];
 }
@@ -84,7 +84,7 @@
 int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
                                     const MACROBLOCKD *xd);
 
-static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm,
+static INLINE vpx_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm,
                                                     const MACROBLOCKD *xd) {
   const int pred_context = vp9_get_pred_context_comp_ref_p(cm, xd);
   return cm->fc->comp_ref_prob[pred_context];
@@ -92,14 +92,14 @@
 
 int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
 
-static INLINE vp9_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm,
+static INLINE vpx_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm,
                                                        const MACROBLOCKD *xd) {
   return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0];
 }
 
 int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
 
-static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
+static INLINE vpx_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
                                                        const MACROBLOCKD *xd) {
   return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1];
 }
@@ -127,7 +127,7 @@
   return (above_ctx + left_ctx) > max_tx_size;
 }
 
-static INLINE const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
+static INLINE const vpx_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
                                            const struct tx_probs *tx_probs) {
   switch (max_tx_size) {
     case TX_8X8:
@@ -142,7 +142,7 @@
   }
 }
 
-static INLINE const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size,
+static INLINE const vpx_prob *get_tx_probs2(TX_SIZE max_tx_size,
                                             const MACROBLOCKD *xd,
                                             const struct tx_probs *tx_probs) {
   return get_tx_probs(max_tx_size, get_tx_size_context(xd), tx_probs);

diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 1e9acb8..f969ff1 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c

@@ -1059,20 +1059,19 @@
   }
 }
 
-void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
+void vp9_predict_intra_block(const MACROBLOCKD *xd, int bwl_in,
                              TX_SIZE tx_size, PREDICTION_MODE mode,
                              const uint8_t *ref, int ref_stride,
                              uint8_t *dst, int dst_stride,
                              int aoff, int loff, int plane) {
-  const int bwl = bwl_in - tx_size;
-  const int wmask = (1 << bwl) - 1;
-  const int have_top = (block_idx >> bwl) || xd->up_available;
-  const int have_left = (block_idx & wmask) || xd->left_available;
-  const int have_right = ((block_idx & wmask) != wmask);
+  const int bw = (1 << bwl_in);
+  const int txw = (1 << tx_size);
+  const int have_top = loff || xd->up_available;
+  const int have_left = aoff || xd->left_available;
+  const int have_right = (aoff + txw) < bw;
   const int x = aoff * 4;
   const int y = loff * 4;
 
-  assert(bwl >= 0);
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     build_intra_predictors_high(xd, ref, ref_stride, dst, dst_stride, mode,

diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h
index da5e435..de45380 100644
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h

@@ -20,7 +20,7 @@
 
 void vp9_init_intra_predictors(void);
 
-void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
+void vp9_predict_intra_block(const MACROBLOCKD *xd, int bwl_in,
                              TX_SIZE tx_size, PREDICTION_MODE mode,
                              const uint8_t *ref, int ref_stride,
                              uint8_t *dst, int dst_stride,

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 524e79f..102f2a0 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -220,49 +220,6 @@
 specialize qw/vp9_dc_128_predictor_32x32 msa neon/, "$sse2_x86inc";
 
 #
-# Loopfilter
-#
-add_proto qw/void vp9_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vp9_lpf_vertical_16 sse2 neon_asm dspr2 msa/;
-$vp9_lpf_vertical_16_neon_asm=vp9_lpf_vertical_16_neon;
-
-add_proto qw/void vp9_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vp9_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/;
-$vp9_lpf_vertical_16_dual_neon_asm=vp9_lpf_vertical_16_dual_neon;
-
-add_proto qw/void vp9_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_vertical_8 sse2 neon_asm dspr2 msa/;
-$vp9_lpf_vertical_8_neon_asm=vp9_lpf_vertical_8_neon;
-
-add_proto qw/void vp9_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
-$vp9_lpf_vertical_8_dual_neon_asm=vp9_lpf_vertical_8_dual_neon;
-
-add_proto qw/void vp9_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_vertical_4 mmx neon dspr2 msa/;
-
-add_proto qw/void vp9_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_vertical_4_dual sse2 neon dspr2 msa/;
-
-add_proto qw/void vp9_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon_asm dspr2 msa/;
-$vp9_lpf_horizontal_16_neon_asm=vp9_lpf_horizontal_16_neon;
-
-add_proto qw/void vp9_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_8 sse2 neon_asm dspr2 msa/;
-$vp9_lpf_horizontal_8_neon_asm=vp9_lpf_horizontal_8_neon;
-
-add_proto qw/void vp9_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
-$vp9_lpf_horizontal_8_dual_neon_asm=vp9_lpf_horizontal_8_dual_neon;
-
-add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2 msa/;
-
-add_proto qw/void vp9_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
-
-#
 # post proc
 #
 if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
@@ -475,7 +432,7 @@
     specialize qw/vp9_iwht4x4_1_add msa/;
 
     add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vp9_iwht4x4_16_add msa/;
+    specialize qw/vp9_iwht4x4_16_add msa/, "$sse2_x86inc";
   }
 }
 
@@ -668,42 +625,6 @@
   specialize qw/vp9_highbd_convolve8_avg_vert/, "$sse2_x86_64";
 
   #
-  # Loopfilter
-  #
-  add_proto qw/void vp9_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/vp9_highbd_lpf_vertical_16 sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/vp9_highbd_lpf_vertical_16_dual sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
-  specialize qw/vp9_highbd_lpf_vertical_8 sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/vp9_highbd_lpf_vertical_8_dual sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
-  specialize qw/vp9_highbd_lpf_vertical_4 sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/vp9_highbd_lpf_vertical_4_dual sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
-  specialize qw/vp9_highbd_lpf_horizontal_16 sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
-  specialize qw/vp9_highbd_lpf_horizontal_8 sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/vp9_highbd_lpf_horizontal_8_dual sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
-  specialize qw/vp9_highbd_lpf_horizontal_4 sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/vp9_highbd_lpf_horizontal_4_dual sse2/;
-
-  #
   # post proc
   #
   if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
@@ -824,7 +745,7 @@
 specialize qw/vp9_int_pro_row sse2 neon/;
 
 add_proto qw/int16_t vp9_int_pro_col/, "uint8_t const *ref, const int width";
-specialize qw/vp9_int_pro_col sse2/;
+specialize qw/vp9_int_pro_col sse2 neon/;
 
 add_proto qw/int vp9_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
 specialize qw/vp9_vector_var sse2/;
@@ -860,12 +781,6 @@
   add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_fp_32x32/;
 
-  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_quantize_b/;
-
-  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_quantize_b_32x32/;
-
   add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_fdct8x8_quant/;
 } else {
@@ -881,12 +796,6 @@
   add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64_x86inc";
 
-  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_quantize_b sse2/, "$ssse3_x86_64_x86inc";
-
-  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64_x86inc";
-
   add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/;
 }
@@ -1014,12 +923,6 @@
   add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_highbd_quantize_fp_32x32/;
 
-  add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_highbd_quantize_b sse2/;
-
-  add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_highbd_quantize_b_32x32 sse2/;
-
   #
   # Structured Similarity (SSIM)
   #
@@ -1030,13 +933,13 @@
 
   # fdct functions
   add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_highbd_fht4x4 sse2/;
+  specialize qw/vp9_highbd_fht4x4/;
 
   add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_highbd_fht8x8 sse2/;
+  specialize qw/vp9_highbd_fht8x8/;
 
   add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_highbd_fht16x16 sse2/;
+  specialize qw/vp9_highbd_fht16x16/;
 
   add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp9_highbd_fwht4x4/;

diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c
index 471e238..c8ef618 100644
--- a/vp9/common/vp9_seg_common.c
+++ b/vp9/common/vp9_seg_common.c

@@ -54,7 +54,7 @@
   seg->feature_data[segment_id][feature_id] = seg_data;
 }
 
-const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = {
+const vpx_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = {
   2,  4,  6,  8, 10, 12,
   0, -1, -2, -3, -4, -5, -6, -7
 };

diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h
index 95c9918..5b75d8d 100644
--- a/vp9/common/vp9_seg_common.h
+++ b/vp9/common/vp9_seg_common.h

@@ -11,7 +11,7 @@
 #ifndef VP9_COMMON_VP9_SEG_COMMON_H_
 #define VP9_COMMON_VP9_SEG_COMMON_H_
 
-#include "vp9/common/vp9_prob.h"
+#include "vpx_dsp/prob.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -42,8 +42,8 @@
   uint8_t abs_delta;
   uint8_t temporal_update;
 
-  vp9_prob tree_probs[SEG_TREE_PROBS];
-  vp9_prob pred_probs[PREDICTION_PROBS];
+  vpx_prob tree_probs[SEG_TREE_PROBS];
+  vpx_prob pred_probs[PREDICTION_PROBS];
 
   int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX];
   unsigned int feature_mask[MAX_SEGMENTS];
@@ -76,7 +76,7 @@
   return seg->feature_data[segment_id][feature_id];
 }
 
-extern const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
+extern const vpx_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
 
 #ifdef __cplusplus
 }  // extern "C"

diff --git a/vp9/common/x86/vp9_idct_sse2.asm b/vp9/common/x86/vp9_idct_sse2.asm
new file mode 100644
index 0000000..69b68e6
--- /dev/null
+++ b/vp9/common/x86/vp9_idct_sse2.asm

@@ -0,0 +1,102 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro REORDER_INPUTS 0
+  ; a c d b  to  a b c d
+  SWAP 1, 3, 2
+%endmacro
+
+%macro TRANSFORM_COLS 0
+  ; input:
+  ; m0 a
+  ; m1 b
+  ; m2 c
+  ; m3 d
+  paddw           m0,        m2
+  psubw           m3,        m1
+
+  ; wide subtract
+  punpcklwd       m4,        m0
+  punpcklwd       m5,        m3
+  psrad           m4,        16
+  psrad           m5,        16
+  psubd           m4,        m5
+  psrad           m4,        1
+  packssdw        m4,        m4             ; e
+
+  psubw           m5,        m4,        m1  ; b
+  psubw           m4,        m2             ; c
+  psubw           m0,        m5
+  paddw           m3,        m4
+                                ; m0 a
+  SWAP            1,         5  ; m1 b
+  SWAP            2,         4  ; m2 c
+                                ; m3 d
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+  punpcklwd       m0,        m2
+  punpcklwd       m1,        m3
+  mova            m2,        m0
+  punpcklwd       m0,        m1
+  punpckhwd       m2,        m1
+  pshufd          m1,        m0, 0x0e
+  pshufd          m3,        m2, 0x0e
+%endmacro
+
+; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
+%macro TRANSPOSE_4X4_WIDE 0
+  mova            m3, m0
+  punpcklwd       m0, m1
+  punpckhwd       m3, m1
+  mova            m2, m0
+  punpcklwd       m0, m3
+  punpckhwd       m2, m3
+  pshufd          m1, m0, 0x0e
+  pshufd          m3, m2, 0x0e
+%endmacro
+
+%macro ADD_STORE_4P_2X 5  ; src1, src2, tmp1, tmp2, zero
+  movq            m%3,       [outputq]
+  movq            m%4,       [outputq + strideq]
+  punpcklbw       m%3,       m%5
+  punpcklbw       m%4,       m%5
+  paddw           m%1,       m%3
+  paddw           m%2,       m%4
+  packuswb        m%1,       m%5
+  packuswb        m%2,       m%5
+  movd            [outputq], m%1
+  movd            [outputq + strideq], m%2
+%endmacro
+
+INIT_XMM sse2
+cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
+  mova            m0,        [inputq +  0]
+  mova            m1,        [inputq + 16]
+
+  psraw           m0,        2
+  psraw           m1,        2
+
+  TRANSPOSE_4X4_WIDE
+  REORDER_INPUTS
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  REORDER_INPUTS
+  TRANSFORM_COLS
+
+  pxor            m4, m4
+  ADD_STORE_4P_2X  0, 1, 5, 6, 4
+  lea             outputq, [outputq + 2 * strideq]
+  ADD_STORE_4P_2X  2, 3, 5, 6, 4
+
+  RET

diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 6c64540..c6d3bf1 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c

@@ -14,6 +14,8 @@
 #include "./vp9_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 
+#include "vpx_dsp/bitreader_buffer.h"
+#include "vpx_dsp/bitreader.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/mem_ops.h"
@@ -38,8 +40,6 @@
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_dsubexp.h"
-#include "vp9/decoder/vp9_read_bit_buffer.h"
-#include "vp9/decoder/vp9_reader.h"
 
 #define MAX_VP9_HEADER_SIZE 80
 
@@ -74,19 +74,19 @@
   return len != 0 && len <= (size_t)(end - start);
 }
 
-static int decode_unsigned_max(struct vp9_read_bit_buffer *rb, int max) {
-  const int data = vp9_rb_read_literal(rb, get_unsigned_bits(max));
+static int decode_unsigned_max(struct vpx_read_bit_buffer *rb, int max) {
+  const int data = vpx_rb_read_literal(rb, get_unsigned_bits(max));
   return data > max ? max : data;
 }
 
-static TX_MODE read_tx_mode(vp9_reader *r) {
-  TX_MODE tx_mode = vp9_read_literal(r, 2);
+static TX_MODE read_tx_mode(vpx_reader *r) {
+  TX_MODE tx_mode = vpx_read_literal(r, 2);
   if (tx_mode == ALLOW_32X32)
-    tx_mode += vp9_read_bit(r);
+    tx_mode += vpx_read_bit(r);
   return tx_mode;
 }
 
-static void read_tx_mode_probs(struct tx_probs *tx_probs, vp9_reader *r) {
+static void read_tx_mode_probs(struct tx_probs *tx_probs, vpx_reader *r) {
   int i, j;
 
   for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
@@ -102,14 +102,14 @@
       vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]);
 }
 
-static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
+static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
   int i, j;
   for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
     for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
       vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
 }
 
-static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
+static void read_inter_mode_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
   int i, j;
   for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
     for (j = 0; j < INTER_MODES - 1; ++j)
@@ -117,9 +117,9 @@
 }
 
 static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm,
-                                                vp9_reader *r) {
+                                                vpx_reader *r) {
   if (is_compound_reference_allowed(cm)) {
-    return vp9_read_bit(r) ? (vp9_read_bit(r) ? REFERENCE_MODE_SELECT
+    return vpx_read_bit(r) ? (vpx_read_bit(r) ? REFERENCE_MODE_SELECT
                                               : COMPOUND_REFERENCE)
                            : SINGLE_REFERENCE;
   } else {
@@ -127,7 +127,7 @@
   }
 }
 
-static void read_frame_reference_mode_probs(VP9_COMMON *cm, vp9_reader *r) {
+static void read_frame_reference_mode_probs(VP9_COMMON *cm, vpx_reader *r) {
   FRAME_CONTEXT *const fc = cm->fc;
   int i;
 
@@ -146,14 +146,14 @@
       vp9_diff_update_prob(r, &fc->comp_ref_prob[i]);
 }
 
-static void update_mv_probs(vp9_prob *p, int n, vp9_reader *r) {
+static void update_mv_probs(vpx_prob *p, int n, vpx_reader *r) {
   int i;
   for (i = 0; i < n; ++i)
-    if (vp9_read(r, MV_UPDATE_PROB))
-      p[i] = (vp9_read_literal(r, 7) << 1) | 1;
+    if (vpx_read(r, MV_UPDATE_PROB))
+      p[i] = (vpx_read_literal(r, 7) << 1) | 1;
 }
 
-static void read_mv_probs(nmv_context *ctx, int allow_hp, vp9_reader *r) {
+static void read_mv_probs(nmv_context *ctx, int allow_hp, vpx_reader *r) {
   int i, j;
 
   update_mv_probs(ctx->joints, MV_JOINTS - 1, r);
@@ -182,35 +182,29 @@
   }
 }
 
-static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
-                                    TX_SIZE tx_size, uint8_t *dst, int stride,
-                                    int eob) {
+static void inverse_transform_block_inter(MACROBLOCKD* xd, int plane,
+                                          const TX_SIZE tx_size,
+                                          uint8_t *dst, int stride,
+                                          int eob) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   if (eob > 0) {
-    TX_TYPE tx_type = DCT_DCT;
     tran_low_t *const dqcoeff = pd->dqcoeff;
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       if (xd->lossless) {
-        tx_type = DCT_DCT;
         vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
       } else {
-        const PLANE_TYPE plane_type = pd->plane_type;
         switch (tx_size) {
           case TX_4X4:
-            tx_type = get_tx_type_4x4(plane_type, xd, block);
-            vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);
             break;
           case TX_8X8:
-            tx_type = get_tx_type(plane_type, xd);
-            vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd);
             break;
           case TX_16X16:
-            tx_type = get_tx_type(plane_type, xd);
-            vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd);
             break;
           case TX_32X32:
-            tx_type = DCT_DCT;
             vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
             break;
           default:
@@ -219,25 +213,19 @@
       }
     } else {
       if (xd->lossless) {
-        tx_type = DCT_DCT;
         vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
       } else {
-        const PLANE_TYPE plane_type = pd->plane_type;
         switch (tx_size) {
           case TX_4X4:
-            tx_type = get_tx_type_4x4(plane_type, xd, block);
-            vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+            vp9_idct4x4_add(dqcoeff, dst, stride, eob);
             break;
           case TX_8X8:
-            tx_type = get_tx_type(plane_type, xd);
-            vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+            vp9_idct8x8_add(dqcoeff, dst, stride, eob);
             break;
           case TX_16X16:
-            tx_type = get_tx_type(plane_type, xd);
-            vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+            vp9_idct16x16_add(dqcoeff, dst, stride, eob);
             break;
           case TX_32X32:
-            tx_type = DCT_DCT;
             vp9_idct32x32_add(dqcoeff, dst, stride, eob);
             break;
           default:
@@ -248,25 +236,19 @@
     }
 #else
     if (xd->lossless) {
-      tx_type = DCT_DCT;
       vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
     } else {
-      const PLANE_TYPE plane_type = pd->plane_type;
       switch (tx_size) {
         case TX_4X4:
-          tx_type = get_tx_type_4x4(plane_type, xd, block);
-          vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+          vp9_idct4x4_add(dqcoeff, dst, stride, eob);
           break;
         case TX_8X8:
-          tx_type = get_tx_type(plane_type, xd);
-          vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+          vp9_idct8x8_add(dqcoeff, dst, stride, eob);
           break;
         case TX_16X16:
-          tx_type = get_tx_type(plane_type, xd);
-          vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+          vp9_idct16x16_add(dqcoeff, dst, stride, eob);
           break;
         case TX_32X32:
-          tx_type = DCT_DCT;
           vp9_idct32x32_add(dqcoeff, dst, stride, eob);
           break;
         default:
@@ -277,7 +259,97 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
     if (eob == 1) {
-      memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));
+      dqcoeff[0] = 0;
+    } else {
+      if (tx_size <= TX_16X16 && eob <= 10)
+        memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+      else if (tx_size == TX_32X32 && eob <= 34)
+        memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
+      else
+        memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
+    }
+  }
+}
+
+static void inverse_transform_block_intra(MACROBLOCKD* xd, int plane,
+                                          const TX_TYPE tx_type,
+                                          const TX_SIZE tx_size,
+                                          uint8_t *dst, int stride,
+                                          int eob) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  if (eob > 0) {
+    tran_low_t *const dqcoeff = pd->dqcoeff;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (xd->lossless) {
+        vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+      } else {
+        switch (tx_size) {
+          case TX_4X4:
+            vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_8X8:
+            vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_16X16:
+            vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          case TX_32X32:
+            vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+            break;
+          default:
+            assert(0 && "Invalid transform size");
+        }
+      }
+    } else {
+      if (xd->lossless) {
+        vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+      } else {
+        switch (tx_size) {
+          case TX_4X4:
+            vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+            break;
+          case TX_8X8:
+            vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+            break;
+          case TX_16X16:
+            vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+            break;
+          case TX_32X32:
+            vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+            break;
+          default:
+            assert(0 && "Invalid transform size");
+            return;
+        }
+      }
+    }
+#else
+    if (xd->lossless) {
+      vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+    } else {
+      switch (tx_size) {
+        case TX_4X4:
+          vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+          break;
+        case TX_8X8:
+          vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+          break;
+        case TX_16X16:
+          vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+          break;
+        case TX_32X32:
+          vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+          break;
+        default:
+          assert(0 && "Invalid transform size");
+          return;
+      }
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    if (eob == 1) {
+      dqcoeff[0] = 0;
     } else {
       if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
         memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
@@ -289,65 +361,49 @@
   }
 }
 
-struct intra_args {
-  MACROBLOCKD *xd;
-  vp9_reader *r;
-  int seg_id;
-};
-
-static void predict_and_reconstruct_intra_block(int plane, int block,
-                                                BLOCK_SIZE plane_bsize,
-                                                TX_SIZE tx_size, void *arg) {
-  struct intra_args *const args = (struct intra_args *)arg;
-  MACROBLOCKD *const xd = args->xd;
+static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd,
+                                                vpx_reader *r,
+                                                MB_MODE_INFO *const mbmi,
+                                                int plane,
+                                                int row, int col,
+                                                TX_SIZE tx_size) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  MODE_INFO *const mi = xd->mi[0];
-  const PREDICTION_MODE mode = (plane == 0) ? get_y_mode(mi, block)
-                                            : mi->mbmi.uv_mode;
-  int x, y;
+  PREDICTION_MODE mode = (plane == 0) ? mbmi->mode : mbmi->uv_mode;
   uint8_t *dst;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
-  dst = &pd->dst.buf[4 * y * pd->dst.stride + 4 * x];
+  dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
 
-  vp9_predict_intra_block(xd, block >> (tx_size << 1),
-                          b_width_log2_lookup[plane_bsize], tx_size, mode,
+  if (mbmi->sb_type < BLOCK_8X8)
+    if (plane == 0)
+      mode = xd->mi[0]->bmi[(row << 1) + col].as_mode;
+
+  vp9_predict_intra_block(xd, pd->n4_wl, tx_size, mode,
                           dst, pd->dst.stride, dst, pd->dst.stride,
-                          x, y, plane);
+                          col, row, plane);
 
-  if (!mi->mbmi.skip) {
+  if (!mbmi->skip) {
+    const TX_TYPE tx_type = (plane || xd->lossless) ?
+        DCT_DCT : intra_mode_to_tx_type_lookup[mode];
     const scan_order *sc = (plane || xd->lossless) ?
-        &vp9_default_scan_orders[tx_size] :
-        &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]];
-    const int eob = vp9_decode_block_tokens(xd, plane, sc,
-                                            plane_bsize, x, y, tx_size,
-                                            args->r, args->seg_id);
-    inverse_transform_block(xd, plane, block, tx_size, dst, pd->dst.stride,
-                            eob);
+        &vp9_default_scan_orders[tx_size] : &vp9_scan_orders[tx_size][tx_type];
+    const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size,
+                                            r, mbmi->segment_id);
+    inverse_transform_block_intra(xd, plane, tx_type, tx_size,
+                                  dst, pd->dst.stride, eob);
   }
 }
 
-struct inter_args {
-  MACROBLOCKD *xd;
-  vp9_reader *r;
-  int *eobtotal;
-  int seg_id;
-};
-
-static void reconstruct_inter_block(int plane, int block,
-                                    BLOCK_SIZE plane_bsize,
-                                    TX_SIZE tx_size, void *arg) {
-  struct inter_args *args = (struct inter_args *)arg;
-  MACROBLOCKD *const xd = args->xd;
+static int reconstruct_inter_block(MACROBLOCKD *const xd, vpx_reader *r,
+                                   MB_MODE_INFO *const mbmi, int plane,
+                                   int row, int col, TX_SIZE tx_size) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  int x, y, eob;
   const scan_order *sc = &vp9_default_scan_orders[tx_size];
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
-  eob = vp9_decode_block_tokens(xd, plane, sc, plane_bsize,
-                                x, y, tx_size, args->r, args->seg_id);
-  inverse_transform_block(xd, plane, block, tx_size,
-                          &pd->dst.buf[4 * y * pd->dst.stride + 4 * x],
-                          pd->dst.stride, eob);
-  *args->eobtotal += eob;
+  const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size, r,
+                                          mbmi->segment_id);
+
+  inverse_transform_block_inter(xd, plane, tx_size,
+                            &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
+                            pd->dst.stride, eob);
+  return eob;
 }
 
 static void build_mc_border(const uint8_t *src, int src_stride,
@@ -648,8 +704,7 @@
 
 static void dec_build_inter_predictors_sb(VP9Decoder *const pbi,
                                           MACROBLOCKD *xd,
-                                          int mi_row, int mi_col,
-                                          BLOCK_SIZE bsize) {
+                                          int mi_row, int mi_col) {
   int plane;
   const int mi_x = mi_col * MI_SIZE;
   const int mi_y = mi_row * MI_SIZE;
@@ -659,15 +714,13 @@
   const int is_compound = has_second_ref(&mi->mbmi);
 
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
-                                                        &xd->plane[plane]);
     struct macroblockd_plane *const pd = &xd->plane[plane];
     struct buf_2d *const dst_buf = &pd->dst;
-    const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
-    const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+    const int num_4x4_w = pd->n4_w;
+    const int num_4x4_h = pd->n4_h;
 
-    const int bw = 4 * num_4x4_w;
-    const int bh = 4 * num_4x4_h;
+    const int n4w_x4 = 4 * num_4x4_w;
+    const int n4h_x4 = 4 * num_4x4_h;
     int ref;
 
     for (ref = 0; ref < 1 + is_compound; ++ref) {
@@ -680,11 +733,10 @@
 
       if (sb_type < BLOCK_8X8) {
         int i = 0, x, y;
-        assert(bsize == BLOCK_8X8);
         for (y = 0; y < num_4x4_h; ++y) {
           for (x = 0; x < num_4x4_w; ++x) {
             const MV mv = average_split_mvs(pd, mi, ref, i++);
-            dec_build_inter_predictors(pbi, xd, plane, bw, bh,
+            dec_build_inter_predictors(pbi, xd, plane, n4w_x4, n4h_x4,
                                        4 * x, 4 * y, 4, 4, mi_x, mi_y, kernel,
                                        sf, pre_buf, dst_buf, &mv,
                                        ref_frame_buf, is_scaled, ref);
@@ -692,8 +744,8 @@
         }
       } else {
         const MV mv = mi->mbmi.mv[ref].as_mv;
-        dec_build_inter_predictors(pbi, xd, plane, bw, bh,
-                                   0, 0, bw, bh, mi_x, mi_y, kernel,
+        dec_build_inter_predictors(pbi, xd, plane, n4w_x4, n4h_x4,
+                                   0, 0, n4w_x4, n4h_x4, mi_x, mi_y, kernel,
                                    sf, pre_buf, dst_buf, &mv, ref_frame_buf,
                                    is_scaled, ref);
       }
@@ -701,24 +753,53 @@
   }
 }
 
+static INLINE TX_SIZE dec_get_uv_tx_size(const MB_MODE_INFO *mbmi,
+                                         int n4_wl, int n4_hl) {
+  // get minimum log2 num4x4s dimension
+  const int x = MIN(n4_wl, n4_hl);
+  return MIN(mbmi->tx_size,  x);
+}
+
+static INLINE void dec_reset_skip_context(MACROBLOCKD *xd) {
+  int i;
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    struct macroblockd_plane *const pd = &xd->plane[i];
+    memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) * pd->n4_w);
+    memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) * pd->n4_h);
+  }
+}
+
+static void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, int bwl,
+                         int bhl) {
+  int i;
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    xd->plane[i].n4_w = (bw << 1) >> xd->plane[i].subsampling_x;
+    xd->plane[i].n4_h = (bh << 1) >> xd->plane[i].subsampling_y;
+    xd->plane[i].n4_wl = bwl - xd->plane[i].subsampling_x;
+    xd->plane[i].n4_hl = bhl - xd->plane[i].subsampling_y;
+  }
+}
+
 static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                                 BLOCK_SIZE bsize, int mi_row, int mi_col) {
-  const int bw = num_8x8_blocks_wide_lookup[bsize];
-  const int bh = num_8x8_blocks_high_lookup[bsize];
-  const int x_mis = MIN(bw, cm->mi_cols - mi_col);
-  const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+                                 BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                 int bw, int bh, int x_mis, int y_mis,
+                                 int bwl, int bhl) {
   const int offset = mi_row * cm->mi_stride + mi_col;
   int x, y;
   const TileInfo *const tile = &xd->tile;
 
   xd->mi = cm->mi_grid_visible + offset;
   xd->mi[0] = &cm->mi[offset];
+  // TODO(slavarnway): Generate sb_type based on bwl and bhl, instead of
+  // passing bsize from decode_partition().
   xd->mi[0]->mbmi.sb_type = bsize;
   for (y = 0; y < y_mis; ++y)
     for (x = !y; x < x_mis; ++x) {
       xd->mi[y * cm->mi_stride + x] = xd->mi[0];
     }
 
+  set_plane_n4(xd, bw, bh, bwl, bhl);
+
   set_skip_context(xd, mi_row, mi_col);
 
   // Distance of Mb to the various image edges. These are specified to 8th pel
@@ -731,10 +812,17 @@
 
 static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
                          int mi_row, int mi_col,
-                         vp9_reader *r, BLOCK_SIZE bsize) {
+                         vpx_reader *r, BLOCK_SIZE bsize,
+                         int bwl, int bhl) {
   VP9_COMMON *const cm = &pbi->common;
   const int less8x8 = bsize < BLOCK_8X8;
-  MB_MODE_INFO *mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col);
+  const int bw = 1 << (bwl - 1);
+  const int bh = 1 << (bhl - 1);
+  const int x_mis = MIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+
+  MB_MODE_INFO *mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col,
+                                   bw, bh, x_mis, y_mis, bwl, bhl);
 
   if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) {
     const BLOCK_SIZE uv_subsize =
@@ -744,50 +832,110 @@
                          VPX_CODEC_CORRUPT_FRAME, "Invalid block size.");
   }
 
-  vp9_read_mode_info(pbi, xd, mi_row, mi_col, r);
-
-  if (less8x8)
-    bsize = BLOCK_8X8;
+  vpx_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
 
   if (mbmi->skip) {
-    reset_skip_context(xd, bsize);
+    dec_reset_skip_context(xd);
   }
 
   if (!is_inter_block(mbmi)) {
-    struct intra_args arg = {xd, r, mbmi->segment_id};
-    vp9_foreach_transformed_block(xd, bsize,
-                                  predict_and_reconstruct_intra_block, &arg);
+    int plane;
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const TX_SIZE tx_size =
+          plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
+                  : mbmi->tx_size;
+      const int num_4x4_w = pd->n4_w;
+      const int num_4x4_h = pd->n4_h;
+      const int step = (1 << tx_size);
+      int row, col;
+      const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ?
+          0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+      const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
+          0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+      for (row = 0; row < max_blocks_high; row += step)
+        for (col = 0; col < max_blocks_wide; col += step)
+          predict_and_reconstruct_intra_block(xd, r, mbmi, plane,
+                                              row, col, tx_size);
+    }
   } else {
     // Prediction
-    dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col, bsize);
+    dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col);
 
     // Reconstruction
     if (!mbmi->skip) {
       int eobtotal = 0;
-      struct inter_args arg = {xd, r, &eobtotal, mbmi->segment_id};
-      vp9_foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
+      int plane;
+
+      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        const struct macroblockd_plane *const pd = &xd->plane[plane];
+        const TX_SIZE tx_size =
+            plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
+                    : mbmi->tx_size;
+        const int num_4x4_w = pd->n4_w;
+        const int num_4x4_h = pd->n4_h;
+        const int step = (1 << tx_size);
+        int row, col;
+        const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ?
+            0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+        const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
+            0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+        for (row = 0; row < max_blocks_high; row += step)
+          for (col = 0; col < max_blocks_wide; col += step)
+            eobtotal += reconstruct_inter_block(xd, r, mbmi, plane, row, col,
+                                                tx_size);
+      }
+
       if (!less8x8 && eobtotal == 0)
         mbmi->skip = 1;  // skip loopfilter
     }
   }
 
-  xd->corrupted |= vp9_reader_has_error(r);
+  xd->corrupted |= vpx_reader_has_error(r);
+}
+
+static INLINE int dec_partition_plane_context(const MACROBLOCKD *xd,
+                                              int mi_row, int mi_col,
+                                              int bsl) {
+  const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
+  const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+  int above = (*above_ctx >> bsl) & 1 , left = (*left_ctx >> bsl) & 1;
+
+//  assert(bsl >= 0);
+
+  return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+}
+
+static INLINE void dec_update_partition_context(MACROBLOCKD *xd,
+                                                int mi_row, int mi_col,
+                                                BLOCK_SIZE subsize,
+                                                int bw) {
+  PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col;
+  PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+
+  // update the partition context at the end notes. set partition bits
+  // of block sizes larger than the current one to be one, and partition
+  // bits of smaller block sizes to be zero.
+  memset(above_ctx, partition_context_lookup[subsize].above, bw);
+  memset(left_ctx, partition_context_lookup[subsize].left, bw);
 }
 
 static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                     BLOCK_SIZE bsize, vp9_reader *r,
-                                     int has_rows, int has_cols) {
-  const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
-  const vp9_prob *const probs = get_partition_probs(xd, ctx);
+                                     vpx_reader *r,
+                                     int has_rows, int has_cols, int bsl) {
+  const int ctx = dec_partition_plane_context(xd, mi_row, mi_col, bsl);
+  const vpx_prob *const probs = get_partition_probs(xd, ctx);
   FRAME_COUNTS *counts = xd->counts;
   PARTITION_TYPE p;
 
   if (has_rows && has_cols)
-    p = (PARTITION_TYPE)vp9_read_tree(r, vp9_partition_tree, probs);
+    p = (PARTITION_TYPE)vpx_read_tree(r, vp9_partition_tree, probs);
   else if (!has_rows && has_cols)
-    p = vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
+    p = vpx_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
   else if (has_rows && !has_cols)
-    p = vp9_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
+    p = vpx_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
   else
     p = PARTITION_SPLIT;
 
@@ -797,11 +945,14 @@
   return p;
 }
 
+// TODO(slavarnway): eliminate bsize and subsize in future commits
 static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd,
                              int mi_row, int mi_col,
-                             vp9_reader* r, BLOCK_SIZE bsize) {
+                             vpx_reader* r, BLOCK_SIZE bsize, int n4x4_l2) {
   VP9_COMMON *const cm = &pbi->common;
-  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+  const int n8x8_l2 = n4x4_l2 - 1;
+  const int num_8x8_wh = 1 << n8x8_l2;
+  const int hbs = num_8x8_wh >> 1;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
   const int has_rows = (mi_row + hbs) < cm->mi_rows;
@@ -810,30 +961,37 @@
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  partition = read_partition(xd, mi_row, mi_col, bsize, r, has_rows, has_cols);
-  subsize = get_subsize(bsize, partition);
-  if (bsize == BLOCK_8X8) {
-    decode_block(pbi, xd, mi_row, mi_col, r, subsize);
+  partition = read_partition(xd, mi_row, mi_col, r, has_rows, has_cols,
+                             n8x8_l2);
+  subsize = subsize_lookup[partition][bsize];  // get_subsize(bsize, partition);
+  if (!hbs) {
+    // calculate bmode block dimensions (log 2)
+    xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
+    xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
+    decode_block(pbi, xd, mi_row, mi_col, r, subsize, 1, 1);
   } else {
     switch (partition) {
       case PARTITION_NONE:
-        decode_block(pbi, xd, mi_row, mi_col, r, subsize);
+        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n4x4_l2);
         break;
       case PARTITION_HORZ:
-        decode_block(pbi, xd, mi_row, mi_col, r, subsize);
+        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n8x8_l2);
         if (has_rows)
-          decode_block(pbi, xd, mi_row + hbs, mi_col, r, subsize);
+          decode_block(pbi, xd, mi_row + hbs, mi_col, r, subsize, n4x4_l2,
+                       n8x8_l2);
         break;
       case PARTITION_VERT:
-        decode_block(pbi, xd, mi_row, mi_col, r, subsize);
+        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2, n4x4_l2);
         if (has_cols)
-          decode_block(pbi, xd, mi_row, mi_col + hbs, r, subsize);
+          decode_block(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2,
+                       n4x4_l2);
         break;
       case PARTITION_SPLIT:
-        decode_partition(pbi, xd, mi_row, mi_col, r, subsize);
-        decode_partition(pbi, xd, mi_row, mi_col + hbs, r, subsize);
-        decode_partition(pbi, xd, mi_row + hbs, mi_col, r, subsize);
-        decode_partition(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize);
+        decode_partition(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2);
+        decode_partition(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2);
+        decode_partition(pbi, xd, mi_row + hbs, mi_col, r, subsize, n8x8_l2);
+        decode_partition(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize,
+                         n8x8_l2);
         break;
       default:
         assert(0 && "Invalid partition type");
@@ -843,14 +1001,14 @@
   // update partition context
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
-    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+    dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
 }
 
 static void setup_token_decoder(const uint8_t *data,
                                 const uint8_t *data_end,
                                 size_t read_size,
                                 struct vpx_internal_error_info *error_info,
-                                vp9_reader *r,
+                                vpx_reader *r,
                                 vpx_decrypt_cb decrypt_cb,
                                 void *decrypt_state) {
   // Validate the calculated partition length. If the buffer
@@ -860,16 +1018,16 @@
     vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
                        "Truncated packet or corrupt tile length");
 
-  if (vp9_reader_init(r, data, read_size, decrypt_cb, decrypt_state))
+  if (vpx_reader_init(r, data, read_size, decrypt_cb, decrypt_state))
     vpx_internal_error(error_info, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder %d", 1);
 }
 
 static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs,
-                                   vp9_reader *r) {
+                                   vpx_reader *r) {
   int i, j, k, l, m;
 
-  if (vp9_read_bit(r))
+  if (vpx_read_bit(r))
     for (i = 0; i < PLANE_TYPES; ++i)
       for (j = 0; j < REF_TYPES; ++j)
         for (k = 0; k < COEF_BANDS; ++k)
@@ -879,7 +1037,7 @@
 }
 
 static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode,
-                            vp9_reader *r) {
+                            vpx_reader *r) {
     const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
     TX_SIZE tx_size;
     for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
@@ -887,27 +1045,27 @@
 }
 
 static void setup_segmentation(struct segmentation *seg,
-                               struct vp9_read_bit_buffer *rb) {
+                               struct vpx_read_bit_buffer *rb) {
   int i, j;
 
   seg->update_map = 0;
   seg->update_data = 0;
 
-  seg->enabled = vp9_rb_read_bit(rb);
+  seg->enabled = vpx_rb_read_bit(rb);
   if (!seg->enabled)
     return;
 
   // Segmentation map update
-  seg->update_map = vp9_rb_read_bit(rb);
+  seg->update_map = vpx_rb_read_bit(rb);
   if (seg->update_map) {
     for (i = 0; i < SEG_TREE_PROBS; i++)
-      seg->tree_probs[i] = vp9_rb_read_bit(rb) ? vp9_rb_read_literal(rb, 8)
+      seg->tree_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8)
                                                : MAX_PROB;
 
-    seg->temporal_update = vp9_rb_read_bit(rb);
+    seg->temporal_update = vpx_rb_read_bit(rb);
     if (seg->temporal_update) {
       for (i = 0; i < PREDICTION_PROBS; i++)
-        seg->pred_probs[i] = vp9_rb_read_bit(rb) ? vp9_rb_read_literal(rb, 8)
+        seg->pred_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8)
                                                  : MAX_PROB;
     } else {
       for (i = 0; i < PREDICTION_PROBS; i++)
@@ -916,21 +1074,21 @@
   }
 
   // Segmentation data update
-  seg->update_data = vp9_rb_read_bit(rb);
+  seg->update_data = vpx_rb_read_bit(rb);
   if (seg->update_data) {
-    seg->abs_delta = vp9_rb_read_bit(rb);
+    seg->abs_delta = vpx_rb_read_bit(rb);
 
     vp9_clearall_segfeatures(seg);
 
     for (i = 0; i < MAX_SEGMENTS; i++) {
       for (j = 0; j < SEG_LVL_MAX; j++) {
         int data = 0;
-        const int feature_enabled = vp9_rb_read_bit(rb);
+        const int feature_enabled = vpx_rb_read_bit(rb);
         if (feature_enabled) {
           vp9_enable_segfeature(seg, i, j);
           data = decode_unsigned_max(rb, vp9_seg_feature_data_max(j));
           if (vp9_is_segfeature_signed(j))
-            data = vp9_rb_read_bit(rb) ? -data : data;
+            data = vpx_rb_read_bit(rb) ? -data : data;
         }
         vp9_set_segdata(seg, i, j, data);
       }
@@ -939,38 +1097,38 @@
 }
 
 static void setup_loopfilter(struct loopfilter *lf,
-                             struct vp9_read_bit_buffer *rb) {
-  lf->filter_level = vp9_rb_read_literal(rb, 6);
-  lf->sharpness_level = vp9_rb_read_literal(rb, 3);
+                             struct vpx_read_bit_buffer *rb) {
+  lf->filter_level = vpx_rb_read_literal(rb, 6);
+  lf->sharpness_level = vpx_rb_read_literal(rb, 3);
 
   // Read in loop filter deltas applied at the MB level based on mode or ref
   // frame.
   lf->mode_ref_delta_update = 0;
 
-  lf->mode_ref_delta_enabled = vp9_rb_read_bit(rb);
+  lf->mode_ref_delta_enabled = vpx_rb_read_bit(rb);
   if (lf->mode_ref_delta_enabled) {
-    lf->mode_ref_delta_update = vp9_rb_read_bit(rb);
+    lf->mode_ref_delta_update = vpx_rb_read_bit(rb);
     if (lf->mode_ref_delta_update) {
       int i;
 
       for (i = 0; i < MAX_REF_LF_DELTAS; i++)
-        if (vp9_rb_read_bit(rb))
-          lf->ref_deltas[i] = vp9_rb_read_signed_literal(rb, 6);
+        if (vpx_rb_read_bit(rb))
+          lf->ref_deltas[i] = vpx_rb_read_signed_literal(rb, 6);
 
       for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
-        if (vp9_rb_read_bit(rb))
-          lf->mode_deltas[i] = vp9_rb_read_signed_literal(rb, 6);
+        if (vpx_rb_read_bit(rb))
+          lf->mode_deltas[i] = vpx_rb_read_signed_literal(rb, 6);
     }
   }
 }
 
-static INLINE int read_delta_q(struct vp9_read_bit_buffer *rb) {
-  return vp9_rb_read_bit(rb) ? vp9_rb_read_signed_literal(rb, 4) : 0;
+static INLINE int read_delta_q(struct vpx_read_bit_buffer *rb) {
+  return vpx_rb_read_bit(rb) ? vpx_rb_read_signed_literal(rb, 4) : 0;
 }
 
 static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                               struct vp9_read_bit_buffer *rb) {
-  cm->base_qindex = vp9_rb_read_literal(rb, QINDEX_BITS);
+                               struct vpx_read_bit_buffer *rb) {
+  cm->base_qindex = vpx_rb_read_literal(rb, QINDEX_BITS);
   cm->y_dc_delta_q = read_delta_q(rb);
   cm->uv_dc_delta_q = read_delta_q(rb);
   cm->uv_ac_delta_q = read_delta_q(rb);
@@ -1012,19 +1170,19 @@
   }
 }
 
-static INTERP_FILTER read_interp_filter(struct vp9_read_bit_buffer *rb) {
+static INTERP_FILTER read_interp_filter(struct vpx_read_bit_buffer *rb) {
   const INTERP_FILTER literal_to_filter[] = { EIGHTTAP_SMOOTH,
                                               EIGHTTAP,
                                               EIGHTTAP_SHARP,
                                               BILINEAR };
-  return vp9_rb_read_bit(rb) ? SWITCHABLE
-                             : literal_to_filter[vp9_rb_read_literal(rb, 2)];
+  return vpx_rb_read_bit(rb) ? SWITCHABLE
+                             : literal_to_filter[vpx_rb_read_literal(rb, 2)];
 }
 
-static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
+static void setup_display_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
   cm->display_width = cm->width;
   cm->display_height = cm->height;
-  if (vp9_rb_read_bit(rb))
+  if (vpx_rb_read_bit(rb))
     vp9_read_frame_size(rb, &cm->display_width, &cm->display_height);
 }
 
@@ -1068,7 +1226,7 @@
   }
 }
 
-static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
+static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
   int width, height;
   BufferPool *const pool = cm->buffer_pool;
   vp9_read_frame_size(rb, &width, &height);
@@ -1107,13 +1265,13 @@
 }
 
 static void setup_frame_size_with_refs(VP9_COMMON *cm,
-                                       struct vp9_read_bit_buffer *rb) {
+                                       struct vpx_read_bit_buffer *rb) {
   int width, height;
   int found = 0, i;
   int has_valid_ref_frame = 0;
   BufferPool *const pool = cm->buffer_pool;
   for (i = 0; i < REFS_PER_FRAME; ++i) {
-    if (vp9_rb_read_bit(rb)) {
+    if (vpx_rb_read_bit(rb)) {
       YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
       width = buf->y_crop_width;
       height = buf->y_crop_height;
@@ -1179,14 +1337,14 @@
   pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
 }
 
-static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
+static void setup_tile_info(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
   int min_log2_tile_cols, max_log2_tile_cols, max_ones;
   vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
 
   // columns
   max_ones = max_log2_tile_cols - min_log2_tile_cols;
   cm->log2_tile_cols = min_log2_tile_cols;
-  while (max_ones-- && vp9_rb_read_bit(rb))
+  while (max_ones-- && vpx_rb_read_bit(rb))
     cm->log2_tile_cols++;
 
   if (cm->log2_tile_cols > 6)
@@ -1194,9 +1352,9 @@
                        "Invalid number of tile columns");
 
   // rows
-  cm->log2_tile_rows = vp9_rb_read_bit(rb);
+  cm->log2_tile_rows = vpx_rb_read_bit(rb);
   if (cm->log2_tile_rows)
-    cm->log2_tile_rows += vp9_rb_read_bit(rb);
+    cm->log2_tile_rows += vpx_rb_read_bit(rb);
 }
 
 typedef struct TileBuffer {
@@ -1324,11 +1482,12 @@
       tile_data->xd.corrupted = 0;
       tile_data->xd.counts = cm->frame_parallel_decoding_mode ?
                              NULL : &cm->counts;
+      vp9_zero(tile_data->dqcoeff);
       vp9_tile_init(&tile_data->xd.tile, tile_data->cm, tile_row, tile_col);
       setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
                           &tile_data->bit_reader, pbi->decrypt_cb,
                           pbi->decrypt_state);
-      vp9_init_macroblockd(cm, &tile_data->xd);
+      vp9_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
     }
   }
 
@@ -1346,8 +1505,8 @@
         vp9_zero(tile_data->xd.left_seg_context);
         for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
              mi_col += MI_BLOCK_SIZE) {
-          decode_partition(pbi, &tile_data->xd, mi_row, mi_col,
-                           &tile_data->bit_reader, BLOCK_64X64);
+          decode_partition(pbi, &tile_data->xd, mi_row,
+                           mi_col, &tile_data->bit_reader, BLOCK_64X64, 4);
         }
         pbi->mb.corrupted |= tile_data->xd.corrupted;
         if (pbi->mb.corrupted)
@@ -1397,7 +1556,7 @@
 
   if (pbi->frame_parallel_decode)
     vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX);
-  return vp9_reader_find_end(&tile_data->bit_reader);
+  return vpx_reader_find_end(&tile_data->bit_reader);
 }
 
 static int tile_worker_hook(TileWorkerData *const tile_data,
@@ -1421,7 +1580,7 @@
          mi_col += MI_BLOCK_SIZE) {
       decode_partition(tile_data->pbi, &tile_data->xd,
                        mi_row, mi_col, &tile_data->bit_reader,
-                       BLOCK_64X64);
+                       BLOCK_64X64, 4);
     }
   }
   return !tile_data->xd.corrupted;
@@ -1543,12 +1702,13 @@
       tile_data->xd.corrupted = 0;
       tile_data->xd.counts = cm->frame_parallel_decoding_mode ?
                              0 : &tile_data->counts;
+      vp9_zero(tile_data->dqcoeff);
       vp9_tile_init(tile, cm, 0, buf->col);
       vp9_tile_init(&tile_data->xd.tile, cm, 0, buf->col);
       setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
                           &tile_data->bit_reader, pbi->decrypt_cb,
                           pbi->decrypt_state);
-      vp9_init_macroblockd(cm, &tile_data->xd);
+      vp9_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
 
       worker->had_error = 0;
       if (i == num_workers - 1 || n == tile_cols - 1) {
@@ -1575,7 +1735,7 @@
     if (final_worker > -1) {
       TileWorkerData *const tile_data =
           (TileWorkerData*)pbi->tile_workers[final_worker].data1;
-      bit_reader_end = vp9_reader_find_end(&tile_data->bit_reader);
+      bit_reader_end = vpx_reader_find_end(&tile_data->bit_reader);
       final_worker = -1;
     }
 
@@ -1598,9 +1758,9 @@
 }
 
 static void read_bitdepth_colorspace_sampling(
-    VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
+    VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
   if (cm->profile >= PROFILE_2) {
-    cm->bit_depth = vp9_rb_read_bit(rb) ? VPX_BITS_12 : VPX_BITS_10;
+    cm->bit_depth = vpx_rb_read_bit(rb) ? VPX_BITS_12 : VPX_BITS_10;
 #if CONFIG_VP9_HIGHBITDEPTH
     cm->use_highbitdepth = 1;
 #endif
@@ -1610,16 +1770,16 @@
     cm->use_highbitdepth = 0;
 #endif
   }
-  cm->color_space = vp9_rb_read_literal(rb, 3);
+  cm->color_space = vpx_rb_read_literal(rb, 3);
   if (cm->color_space != VPX_CS_SRGB) {
-    vp9_rb_read_bit(rb);  // [16,235] (including xvycc) vs [0,255] range
+    vpx_rb_read_bit(rb);  // [16,235] (including xvycc) vs [0,255] range
     if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
-      cm->subsampling_x = vp9_rb_read_bit(rb);
-      cm->subsampling_y = vp9_rb_read_bit(rb);
+      cm->subsampling_x = vpx_rb_read_bit(rb);
+      cm->subsampling_y = vpx_rb_read_bit(rb);
       if (cm->subsampling_x == 1 && cm->subsampling_y == 1)
         vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                            "4:2:0 color not supported in profile 1 or 3");
-      if (vp9_rb_read_bit(rb))
+      if (vpx_rb_read_bit(rb))
         vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                            "Reserved bit set");
     } else {
@@ -1630,7 +1790,7 @@
       // Note if colorspace is SRGB then 4:4:4 chroma sampling is assumed.
       // 4:2:2 or 4:4:0 chroma sampling is not allowed.
       cm->subsampling_y = cm->subsampling_x = 0;
-      if (vp9_rb_read_bit(rb))
+      if (vpx_rb_read_bit(rb))
         vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                            "Reserved bit set");
     } else {
@@ -1641,7 +1801,7 @@
 }
 
 static size_t read_uncompressed_header(VP9Decoder *pbi,
-                                       struct vp9_read_bit_buffer *rb) {
+                                       struct vpx_read_bit_buffer *rb) {
   VP9_COMMON *const cm = &pbi->common;
   BufferPool *const pool = cm->buffer_pool;
   RefCntBuffer *const frame_bufs = pool->frame_bufs;
@@ -1651,20 +1811,25 @@
   cm->last_frame_type = cm->frame_type;
   cm->last_intra_only = cm->intra_only;
 
-  if (vp9_rb_read_literal(rb, 2) != VP9_FRAME_MARKER)
+  if (vpx_rb_read_literal(rb, 2) != VP9_FRAME_MARKER)
       vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                          "Invalid frame marker");
 
   cm->profile = vp9_read_profile(rb);
-
+#if CONFIG_VP9_HIGHBITDEPTH
   if (cm->profile >= MAX_PROFILES)
     vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                        "Unsupported bitstream profile");
+#else
+  if (cm->profile >= PROFILE_2)
+    vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                       "Unsupported bitstream profile");
+#endif
 
-  cm->show_existing_frame = vp9_rb_read_bit(rb);
+  cm->show_existing_frame = vpx_rb_read_bit(rb);
   if (cm->show_existing_frame) {
     // Show an existing frame directly.
-    const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)];
+    const int frame_to_show = cm->ref_frame_map[vpx_rb_read_literal(rb, 3)];
     lock_buffer_pool(pool);
     if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
       unlock_buffer_pool(pool);
@@ -1686,9 +1851,9 @@
     return 0;
   }
 
-  cm->frame_type = (FRAME_TYPE) vp9_rb_read_bit(rb);
-  cm->show_frame = vp9_rb_read_bit(rb);
-  cm->error_resilient_mode = vp9_rb_read_bit(rb);
+  cm->frame_type = (FRAME_TYPE) vpx_rb_read_bit(rb);
+  cm->show_frame = vpx_rb_read_bit(rb);
+  cm->error_resilient_mode = vpx_rb_read_bit(rb);
 
   if (cm->frame_type == KEY_FRAME) {
     if (!vp9_read_sync_code(rb))
@@ -1709,10 +1874,10 @@
       pbi->need_resync = 0;
     }
   } else {
-    cm->intra_only = cm->show_frame ? 0 : vp9_rb_read_bit(rb);
+    cm->intra_only = cm->show_frame ? 0 : vpx_rb_read_bit(rb);
 
     cm->reset_frame_context = cm->error_resilient_mode ?
-        0 : vp9_rb_read_literal(rb, 2);
+        0 : vpx_rb_read_literal(rb, 2);
 
     if (cm->intra_only) {
       if (!vp9_read_sync_code(rb))
@@ -1733,26 +1898,26 @@
 #endif
       }
 
-      pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
+      pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES);
       setup_frame_size(cm, rb);
       if (pbi->need_resync) {
         memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
         pbi->need_resync = 0;
       }
     } else if (pbi->need_resync != 1) {  /* Skip if need resync */
-      pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
+      pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES);
       for (i = 0; i < REFS_PER_FRAME; ++i) {
-        const int ref = vp9_rb_read_literal(rb, REF_FRAMES_LOG2);
+        const int ref = vpx_rb_read_literal(rb, REF_FRAMES_LOG2);
         const int idx = cm->ref_frame_map[ref];
         RefBuffer *const ref_frame = &cm->frame_refs[i];
         ref_frame->idx = idx;
         ref_frame->buf = &frame_bufs[idx].buf;
-        cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb);
+        cm->ref_frame_sign_bias[LAST_FRAME + i] = vpx_rb_read_bit(rb);
       }
 
       setup_frame_size_with_refs(cm, rb);
 
-      cm->allow_high_precision_mv = vp9_rb_read_bit(rb);
+      cm->allow_high_precision_mv = vpx_rb_read_bit(rb);
       cm->interp_filter = read_interp_filter(rb);
 
       for (i = 0; i < REFS_PER_FRAME; ++i) {
@@ -1784,8 +1949,8 @@
   }
 
   if (!cm->error_resilient_mode) {
-    cm->refresh_frame_context = vp9_rb_read_bit(rb);
-    cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb);
+    cm->refresh_frame_context = vpx_rb_read_bit(rb);
+    cm->frame_parallel_decoding_mode = vpx_rb_read_bit(rb);
   } else {
     cm->refresh_frame_context = 0;
     cm->frame_parallel_decoding_mode = 1;
@@ -1793,7 +1958,7 @@
 
   // This flag will be overridden by the call to vp9_setup_past_independence
   // below, forcing the use of context 0 for those frame types.
-  cm->frame_context_idx = vp9_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);
+  cm->frame_context_idx = vpx_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);
 
   // Generate next_ref_frame_map.
   lock_buffer_pool(pool);
@@ -1828,7 +1993,7 @@
   setup_segmentation_dequant(cm);
 
   setup_tile_info(cm, rb);
-  sz = vp9_rb_read_literal(rb, 16);
+  sz = vpx_rb_read_literal(rb, 16);
 
   if (sz == 0)
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
@@ -1842,10 +2007,10 @@
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
   FRAME_CONTEXT *const fc = cm->fc;
-  vp9_reader r;
+  vpx_reader r;
   int k;
 
-  if (vp9_reader_init(&r, data, partition_size, pbi->decrypt_cb,
+  if (vpx_reader_init(&r, data, partition_size, pbi->decrypt_cb,
                       pbi->decrypt_state))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder 0");
@@ -1886,7 +2051,7 @@
     read_mv_probs(nmvc, cm->allow_high_precision_mv, &r);
   }
 
-  return vp9_reader_has_error(&r);
+  return vpx_reader_has_error(&r);
 }
 
 #ifdef NDEBUG
@@ -1926,9 +2091,9 @@
 }
 #endif  // NDEBUG
 
-static struct vp9_read_bit_buffer *init_read_bit_buffer(
+static struct vpx_read_bit_buffer *init_read_bit_buffer(
     VP9Decoder *pbi,
-    struct vp9_read_bit_buffer *rb,
+    struct vpx_read_bit_buffer *rb,
     const uint8_t *data,
     const uint8_t *data_end,
     uint8_t clear_data[MAX_VP9_HEADER_SIZE]) {
@@ -1949,23 +2114,23 @@
 
 //------------------------------------------------------------------------------
 
-int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb) {
-  return vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_0 &&
-         vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_1 &&
-         vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_2;
+int vp9_read_sync_code(struct vpx_read_bit_buffer *const rb) {
+  return vpx_rb_read_literal(rb, 8) == VP9_SYNC_CODE_0 &&
+         vpx_rb_read_literal(rb, 8) == VP9_SYNC_CODE_1 &&
+         vpx_rb_read_literal(rb, 8) == VP9_SYNC_CODE_2;
 }
 
-void vp9_read_frame_size(struct vp9_read_bit_buffer *rb,
+void vp9_read_frame_size(struct vpx_read_bit_buffer *rb,
                          int *width, int *height) {
-  *width = vp9_rb_read_literal(rb, 16) + 1;
-  *height = vp9_rb_read_literal(rb, 16) + 1;
+  *width = vpx_rb_read_literal(rb, 16) + 1;
+  *height = vpx_rb_read_literal(rb, 16) + 1;
 }
 
-BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb) {
-  int profile = vp9_rb_read_bit(rb);
-  profile |= vp9_rb_read_bit(rb) << 1;
+BITSTREAM_PROFILE vp9_read_profile(struct vpx_read_bit_buffer *rb) {
+  int profile = vpx_rb_read_bit(rb);
+  profile |= vpx_rb_read_bit(rb) << 1;
   if (profile > 2)
-    profile += vp9_rb_read_bit(rb);
+    profile += vpx_rb_read_bit(rb);
   return (BITSTREAM_PROFILE) profile;
 }
 
@@ -1974,7 +2139,7 @@
                       const uint8_t **p_data_end) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
-  struct vp9_read_bit_buffer rb;
+  struct vpx_read_bit_buffer rb;
   int context_updated = 0;
   uint8_t clear_data[MAX_VP9_HEADER_SIZE];
   const size_t first_partition_size = read_uncompressed_header(pbi,
@@ -1990,7 +2155,7 @@
     return;
   }
 
-  data += vp9_rb_bytes_read(&rb);
+  data += vpx_rb_bytes_read(&rb);
   if (!read_is_valid(data, first_partition_size, data_end))
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                        "Truncated packet or corrupt header length");

diff --git a/vp9/decoder/vp9_decodeframe.h b/vp9/decoder/vp9_decodeframe.h
index a876e7c..05af706 100644
--- a/vp9/decoder/vp9_decodeframe.h
+++ b/vp9/decoder/vp9_decodeframe.h

@@ -17,12 +17,12 @@
 #endif
 
 struct VP9Decoder;
-struct vp9_read_bit_buffer;
+struct vpx_read_bit_buffer;
 
-int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb);
-void vp9_read_frame_size(struct vp9_read_bit_buffer *rb,
+int vp9_read_sync_code(struct vpx_read_bit_buffer *const rb);
+void vp9_read_frame_size(struct vpx_read_bit_buffer *rb,
                          int *width, int *height);
-BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb);
+BITSTREAM_PROFILE vp9_read_profile(struct vpx_read_bit_buffer *rb);
 
 void vp9_decode_frame(struct VP9Decoder *pbi,
                       const uint8_t *data, const uint8_t *data_end,

diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 0d08f8c..341e6d7 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c

@@ -21,14 +21,13 @@
 
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/decoder/vp9_decodeframe.h"
-#include "vp9/decoder/vp9_reader.h"
 
-static PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
-  return (PREDICTION_MODE)vp9_read_tree(r, vp9_intra_mode_tree, p);
+static PREDICTION_MODE read_intra_mode(vpx_reader *r, const vpx_prob *p) {
+  return (PREDICTION_MODE)vpx_read_tree(r, vp9_intra_mode_tree, p);
 }
 
 static PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                         vp9_reader *r, int size_group) {
+                                         vpx_reader *r, int size_group) {
   const PREDICTION_MODE y_mode =
       read_intra_mode(r, cm->fc->y_mode_prob[size_group]);
   FRAME_COUNTS *counts = xd->counts;
@@ -38,7 +37,7 @@
 }
 
 static PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                          vp9_reader *r,
+                                          vpx_reader *r,
                                           PREDICTION_MODE y_mode) {
   const PREDICTION_MODE uv_mode = read_intra_mode(r,
                                          cm->fc->uv_mode_prob[y_mode]);
@@ -49,8 +48,8 @@
 }
 
 static PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                       vp9_reader *r, int ctx) {
-  const int mode = vp9_read_tree(r, vp9_inter_mode_tree,
+                                       vpx_reader *r, int ctx) {
+  const int mode = vpx_read_tree(r, vp9_inter_mode_tree,
                                  cm->fc->inter_mode_probs[ctx]);
   FRAME_COUNTS *counts = xd->counts;
   if (counts)
@@ -59,20 +58,20 @@
   return NEARESTMV + mode;
 }
 
-static int read_segment_id(vp9_reader *r, const struct segmentation *seg) {
-  return vp9_read_tree(r, vp9_segment_tree, seg->tree_probs);
+static int read_segment_id(vpx_reader *r, const struct segmentation *seg) {
+  return vpx_read_tree(r, vp9_segment_tree, seg->tree_probs);
 }
 
 static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                     TX_SIZE max_tx_size, vp9_reader *r) {
+                                     TX_SIZE max_tx_size, vpx_reader *r) {
   FRAME_COUNTS *counts = xd->counts;
   const int ctx = get_tx_size_context(xd);
-  const vp9_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc->tx_probs);
-  int tx_size = vp9_read(r, tx_probs[0]);
+  const vpx_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc->tx_probs);
+  int tx_size = vpx_read(r, tx_probs[0]);
   if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
-    tx_size += vp9_read(r, tx_probs[1]);
+    tx_size += vpx_read(r, tx_probs[1]);
     if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
-      tx_size += vp9_read(r, tx_probs[2]);
+      tx_size += vpx_read(r, tx_probs[2]);
   }
 
   if (counts)
@@ -81,7 +80,7 @@
 }
 
 static TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
-                            int allow_select, vp9_reader *r) {
+                            int allow_select, vpx_reader *r) {
   TX_MODE tx_mode = cm->tx_mode;
   BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
@@ -91,42 +90,45 @@
     return MIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]);
 }
 
-static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
-                           int mi_row, int mi_col, int segment_id) {
-  const int mi_offset = mi_row * cm->mi_cols + mi_col;
-  const int bw = num_8x8_blocks_wide_lookup[bsize];
-  const int bh = num_8x8_blocks_high_lookup[bsize];
-  const int xmis = MIN(cm->mi_cols - mi_col, bw);
-  const int ymis = MIN(cm->mi_rows - mi_row, bh);
+static int dec_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids,
+                              int mi_offset, int x_mis, int y_mis) {
+  int x, y, segment_id = INT_MAX;
+
+  for (y = 0; y < y_mis; y++)
+    for (x = 0; x < x_mis; x++)
+      segment_id = MIN(segment_id,
+                       segment_ids[mi_offset + y * cm->mi_cols + x]);
+
+  assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+  return segment_id;
+}
+
+static void set_segment_id(VP9_COMMON *cm, int mi_offset,
+                           int x_mis, int y_mis, int segment_id) {
   int x, y;
 
   assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
 
-  for (y = 0; y < ymis; y++)
-    for (x = 0; x < xmis; x++)
+  for (y = 0; y < y_mis; y++)
+    for (x = 0; x < x_mis; x++)
       cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
 }
 
 static void copy_segment_id(const VP9_COMMON *cm,
                            const uint8_t *last_segment_ids,
                            uint8_t *current_segment_ids,
-                           BLOCK_SIZE bsize, int mi_row, int mi_col) {
-  const int mi_offset = mi_row * cm->mi_cols + mi_col;
-  const int bw = num_8x8_blocks_wide_lookup[bsize];
-  const int bh = num_8x8_blocks_high_lookup[bsize];
-  const int xmis = MIN(cm->mi_cols - mi_col, bw);
-  const int ymis = MIN(cm->mi_rows - mi_row, bh);
+                           int mi_offset, int x_mis, int y_mis) {
   int x, y;
 
-  for (y = 0; y < ymis; y++)
-    for (x = 0; x < xmis; x++)
+  for (y = 0; y < y_mis; y++)
+    for (x = 0; x < x_mis; x++)
       current_segment_ids[mi_offset + y * cm->mi_cols + x] =  last_segment_ids ?
           last_segment_ids[mi_offset + y * cm->mi_cols + x] : 0;
 }
 
-static int read_intra_segment_id(VP9_COMMON *const cm, BLOCK_SIZE bsize,
-                                 int mi_row, int mi_col,
-                                 vp9_reader *r) {
+static int read_intra_segment_id(VP9_COMMON *const cm, int mi_offset,
+                                 int x_mis, int y_mis,
+                                 vpx_reader *r) {
   struct segmentation *const seg = &cm->seg;
   int segment_id;
 
@@ -135,53 +137,60 @@
 
   if (!seg->update_map) {
     copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
-                    bsize, mi_row, mi_col);
+                    mi_offset, x_mis, y_mis);
     return 0;
   }
 
   segment_id = read_segment_id(r, seg);
-  set_segment_id(cm, bsize, mi_row, mi_col, segment_id);
+  set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
   return segment_id;
 }
 
 static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                                 int mi_row, int mi_col, vp9_reader *r) {
+                                 int mi_row, int mi_col, vpx_reader *r) {
   struct segmentation *const seg = &cm->seg;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
   int predicted_segment_id, segment_id;
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int bw = xd->plane[0].n4_w >> 1;
+  const int bh = xd->plane[0].n4_h >> 1;
+
+  // TODO(slavarnway): move x_mis, y_mis into xd ?????
+  const int x_mis = MIN(cm->mi_cols - mi_col, bw);
+  const int y_mis = MIN(cm->mi_rows - mi_row, bh);
 
   if (!seg->enabled)
     return 0;  // Default for disabled segmentation
 
   predicted_segment_id = cm->last_frame_seg_map ?
-      get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col) : 0;
+      dec_get_segment_id(cm, cm->last_frame_seg_map, mi_offset, x_mis, y_mis) :
+      0;
 
   if (!seg->update_map) {
     copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
-                    bsize, mi_row, mi_col);
+                    mi_offset, x_mis, y_mis);
     return predicted_segment_id;
   }
 
   if (seg->temporal_update) {
-    const vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
-    mbmi->seg_id_predicted = vp9_read(r, pred_prob);
+    const vpx_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
+    mbmi->seg_id_predicted = vpx_read(r, pred_prob);
     segment_id = mbmi->seg_id_predicted ? predicted_segment_id
                                         : read_segment_id(r, seg);
   } else {
     segment_id = read_segment_id(r, seg);
   }
-  set_segment_id(cm, bsize, mi_row, mi_col, segment_id);
+  set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
   return segment_id;
 }
 
 static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
-                     int segment_id, vp9_reader *r) {
+                     int segment_id, vpx_reader *r) {
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
     const int ctx = vp9_get_skip_context(xd);
-    const int skip = vp9_read(r, cm->fc->skip_probs[ctx]);
+    const int skip = vpx_read(r, cm->fc->skip_probs[ctx]);
     FRAME_COUNTS *counts = xd->counts;
     if (counts)
       ++counts->skip[ctx][skip];
@@ -191,15 +200,22 @@
 
 static void read_intra_frame_mode_info(VP9_COMMON *const cm,
                                        MACROBLOCKD *const xd,
-                                       int mi_row, int mi_col, vp9_reader *r) {
+                                       int mi_row, int mi_col, vpx_reader *r) {
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const MODE_INFO *above_mi = xd->above_mi;
   const MODE_INFO *left_mi  = xd->left_mi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
   int i;
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int bw = xd->plane[0].n4_w >> 1;
+  const int bh = xd->plane[0].n4_h >> 1;
 
-  mbmi->segment_id = read_intra_segment_id(cm, bsize, mi_row, mi_col, r);
+  // TODO(slavarnway): move x_mis, y_mis into xd ?????
+  const int x_mis = MIN(cm->mi_cols - mi_col, bw);
+  const int y_mis = MIN(cm->mi_rows - mi_row, bh);
+
+  mbmi->segment_id = read_intra_segment_id(cm, mi_offset, x_mis, y_mis, r);
   mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
   mbmi->tx_size = read_tx_size(cm, xd, 1, r);
   mbmi->ref_frame[0] = INTRA_FRAME;
@@ -232,16 +248,16 @@
   mbmi->uv_mode = read_intra_mode(r, vp9_kf_uv_mode_prob[mbmi->mode]);
 }
 
-static int read_mv_component(vp9_reader *r,
+static int read_mv_component(vpx_reader *r,
                              const nmv_component *mvcomp, int usehp) {
   int mag, d, fr, hp;
-  const int sign = vp9_read(r, mvcomp->sign);
-  const int mv_class = vp9_read_tree(r, vp9_mv_class_tree, mvcomp->classes);
+  const int sign = vpx_read(r, mvcomp->sign);
+  const int mv_class = vpx_read_tree(r, vp9_mv_class_tree, mvcomp->classes);
   const int class0 = mv_class == MV_CLASS_0;
 
   // Integer part
   if (class0) {
-    d = vp9_read_tree(r, vp9_mv_class0_tree, mvcomp->class0);
+    d = vpx_read_tree(r, vp9_mv_class0_tree, mvcomp->class0);
     mag = 0;
   } else {
     int i;
@@ -249,16 +265,16 @@
 
     d = 0;
     for (i = 0; i < n; ++i)
-      d |= vp9_read(r, mvcomp->bits[i]) << i;
+      d |= vpx_read(r, mvcomp->bits[i]) << i;
     mag = CLASS0_SIZE << (mv_class + 2);
   }
 
   // Fractional part
-  fr = vp9_read_tree(r, vp9_mv_fp_tree, class0 ? mvcomp->class0_fp[d]
+  fr = vpx_read_tree(r, vp9_mv_fp_tree, class0 ? mvcomp->class0_fp[d]
                                                : mvcomp->fp);
 
   // High precision part (if hp is not used, the default value of the hp is 1)
-  hp = usehp ? vp9_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp)
+  hp = usehp ? vpx_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp)
              : 1;
 
   // Result
@@ -266,11 +282,11 @@
   return sign ? -mag : mag;
 }
 
-static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref,
+static INLINE void read_mv(vpx_reader *r, MV *mv, const MV *ref,
                            const nmv_context *ctx,
                            nmv_context_counts *counts, int allow_hp) {
   const MV_JOINT_TYPE joint_type =
-      (MV_JOINT_TYPE)vp9_read_tree(r, vp9_mv_joint_tree, ctx->joints);
+      (MV_JOINT_TYPE)vpx_read_tree(r, vp9_mv_joint_tree, ctx->joints);
   const int use_hp = allow_hp && vp9_use_mv_hp(ref);
   MV diff = {0, 0};
 
@@ -288,11 +304,11 @@
 
 static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm,
                                                 const MACROBLOCKD *xd,
-                                                vp9_reader *r) {
+                                                vpx_reader *r) {
   if (cm->reference_mode == REFERENCE_MODE_SELECT) {
     const int ctx = vp9_get_reference_mode_context(cm, xd);
     const REFERENCE_MODE mode =
-        (REFERENCE_MODE)vp9_read(r, cm->fc->comp_inter_prob[ctx]);
+        (REFERENCE_MODE)vpx_read(r, cm->fc->comp_inter_prob[ctx]);
     FRAME_COUNTS *counts = xd->counts;
     if (counts)
       ++counts->comp_inter[ctx][mode];
@@ -304,7 +320,7 @@
 
 // Read the referncence frame
 static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                            vp9_reader *r,
+                            vpx_reader *r,
                             int segment_id, MV_REFERENCE_FRAME ref_frame[2]) {
   FRAME_CONTEXT *const fc = cm->fc;
   FRAME_COUNTS *counts = xd->counts;
@@ -319,19 +335,19 @@
     if (mode == COMPOUND_REFERENCE) {
       const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
       const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd);
-      const int bit = vp9_read(r, fc->comp_ref_prob[ctx]);
+      const int bit = vpx_read(r, fc->comp_ref_prob[ctx]);
       if (counts)
         ++counts->comp_ref[ctx][bit];
       ref_frame[idx] = cm->comp_fixed_ref;
       ref_frame[!idx] = cm->comp_var_ref[bit];
     } else if (mode == SINGLE_REFERENCE) {
       const int ctx0 = vp9_get_pred_context_single_ref_p1(xd);
-      const int bit0 = vp9_read(r, fc->single_ref_prob[ctx0][0]);
+      const int bit0 = vpx_read(r, fc->single_ref_prob[ctx0][0]);
       if (counts)
         ++counts->single_ref[ctx0][0][bit0];
       if (bit0) {
         const int ctx1 = vp9_get_pred_context_single_ref_p2(xd);
-        const int bit1 = vp9_read(r, fc->single_ref_prob[ctx1][1]);
+        const int bit1 = vpx_read(r, fc->single_ref_prob[ctx1][1]);
         if (counts)
           ++counts->single_ref[ctx1][1][bit1];
         ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
@@ -349,10 +365,10 @@
 
 static INLINE INTERP_FILTER read_switchable_interp_filter(
     VP9_COMMON *const cm, MACROBLOCKD *const xd,
-    vp9_reader *r) {
+    vpx_reader *r) {
   const int ctx = vp9_get_pred_context_switchable_interp(xd);
   const INTERP_FILTER type =
-      (INTERP_FILTER)vp9_read_tree(r, vp9_switchable_interp_tree,
+      (INTERP_FILTER)vpx_read_tree(r, vp9_switchable_interp_tree,
                                    cm->fc->switchable_interp_prob[ctx]);
   FRAME_COUNTS *counts = xd->counts;
   if (counts)
@@ -362,7 +378,7 @@
 
 static void read_intra_block_mode_info(VP9_COMMON *const cm,
                                        MACROBLOCKD *const xd, MODE_INFO *mi,
-                                       vp9_reader *r) {
+                                       vpx_reader *r) {
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mi->mbmi.sb_type;
   int i;
@@ -404,7 +420,7 @@
                             PREDICTION_MODE mode,
                             int_mv mv[2], int_mv ref_mv[2],
                             int_mv nearest_mv[2], int_mv near_mv[2],
-                            int is_compound, int allow_hp, vp9_reader *r) {
+                            int is_compound, int allow_hp, vpx_reader *r) {
   int i;
   int ret = 1;
 
@@ -445,12 +461,12 @@
 }
 
 static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                               int segment_id, vp9_reader *r) {
+                               int segment_id, vpx_reader *r) {
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
     return get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
   } else {
     const int ctx = vp9_get_intra_inter_context(xd);
-    const int is_inter = vp9_read(r, cm->fc->intra_inter_prob[ctx]);
+    const int is_inter = vpx_read(r, cm->fc->intra_inter_prob[ctx]);
     FRAME_COUNTS *counts = xd->counts;
     if (counts)
       ++counts->intra_inter[ctx][is_inter];
@@ -467,7 +483,7 @@
 static void read_inter_block_mode_info(VP9Decoder *const pbi,
                                        MACROBLOCKD *const xd,
                                        MODE_INFO *const mi,
-                                       int mi_row, int mi_col, vp9_reader *r) {
+                                       int mi_row, int mi_col, vpx_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
@@ -519,8 +535,8 @@
                       : cm->interp_filter;
 
   if (bsize < BLOCK_8X8) {
-    const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];  // 1 or 2
-    const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];  // 1 or 2
+    const int num_4x4_w = 1 << xd->bmode_blocks_wl;
+    const int num_4x4_h = 1 << xd->bmode_blocks_hl;
     int idx, idy;
     PREDICTION_MODE b_mode;
     int_mv nearest_sub8x8[2], near_sub8x8[2];
@@ -569,7 +585,7 @@
 
 static void read_inter_frame_mode_info(VP9Decoder *const pbi,
                                        MACROBLOCKD *const xd,
-                                       int mi_row, int mi_col, vp9_reader *r) {
+                                       int mi_row, int mi_col, vpx_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
@@ -588,14 +604,11 @@
     read_intra_block_mode_info(cm, xd, mi, r);
 }
 
-void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
-                        int mi_row, int mi_col, vp9_reader *r) {
+void vpx_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
+                        int mi_row, int mi_col, vpx_reader *r,
+                        int x_mis, int y_mis) {
   VP9_COMMON *const cm = &pbi->common;
   MODE_INFO *const mi = xd->mi[0];
-  const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
-  const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
-  const int x_mis = MIN(bw, cm->mi_cols - mi_col);
-  const int y_mis = MIN(bh, cm->mi_rows - mi_row);
   MV_REF* frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
   int w, h;
 

diff --git a/vp9/decoder/vp9_decodemv.h b/vp9/decoder/vp9_decodemv.h
index db57b40..75f568c 100644
--- a/vp9/decoder/vp9_decodemv.h
+++ b/vp9/decoder/vp9_decodemv.h

@@ -11,15 +11,17 @@
 #ifndef VP9_DECODER_VP9_DECODEMV_H_
 #define VP9_DECODER_VP9_DECODEMV_H_
 
+#include "vpx_dsp/bitreader.h"
+
 #include "vp9/decoder/vp9_decoder.h"
-#include "vp9/decoder/vp9_reader.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
-                        int mi_row, int mi_col, vp9_reader *r);
+void vpx_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
+                        int mi_row, int mi_col, vpx_reader *r,
+                        int x_mis, int y_mis);
 
 #ifdef __cplusplus
 }  // extern "C"

diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 76a8746..915f9dc 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h

@@ -14,6 +14,7 @@
 #include "./vpx_config.h"
 
 #include "vpx/vpx_codec.h"
+#include "vpx_dsp/bitreader.h"
 #include "vpx_scale/yv12config.h"
 #include "vpx_util/vpx_thread.h"
 
@@ -21,7 +22,6 @@
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_ppflags.h"
 #include "vp9/decoder/vp9_dthread.h"
-#include "vp9/decoder/vp9_reader.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -30,15 +30,19 @@
 // TODO(hkuang): combine this with TileWorkerData.
 typedef struct TileData {
   VP9_COMMON *cm;
-  vp9_reader bit_reader;
+  vpx_reader bit_reader;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+  /* dqcoeff are shared by all the planes. So planes must be decoded serially */
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
 } TileData;
 
 typedef struct TileWorkerData {
   struct VP9Decoder *pbi;
-  vp9_reader bit_reader;
+  vpx_reader bit_reader;
   FRAME_COUNTS counts;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+  /* dqcoeff are shared by all the planes. So planes must be decoded serially */
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
   struct vpx_internal_error_info error_info;
 } TileWorkerData;
 

diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 4e3ab82..e4412dc 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c

@@ -38,10 +38,10 @@
        ++coef_counts[band][ctx][token];                     \
   } while (0)
 
-static INLINE int read_coeff(const vp9_prob *probs, int n, vp9_reader *r) {
+static INLINE int read_coeff(const vpx_prob *probs, int n, vpx_reader *r) {
   int i, val = 0;
   for (i = 0; i < n; ++i)
-    val = (val << 1) | vp9_read(r, probs[i]);
+    val = (val << 1) | vpx_read(r, probs[i]);
   return val;
 }
 
@@ -49,15 +49,15 @@
                         PLANE_TYPE type,
                         tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
                         int ctx, const int16_t *scan, const int16_t *nb,
-                        vp9_reader *r) {
+                        vpx_reader *r) {
   FRAME_COUNTS *counts = xd->counts;
   const int max_eob = 16 << (tx_size << 1);
   const FRAME_CONTEXT *const fc = xd->fc;
   const int ref = is_inter_block(&xd->mi[0]->mbmi);
   int band, c = 0;
-  const vp9_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+  const vpx_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
       fc->coef_probs[tx_size][type][ref];
-  const vp9_prob *prob;
+  const vpx_prob *prob;
   unsigned int (*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1];
   unsigned int (*eob_branch_count)[COEFF_CONTEXTS];
   uint8_t token_cache[32 * 32];
@@ -117,12 +117,12 @@
     prob = coef_probs[band][ctx];
     if (counts)
       ++eob_branch_count[band][ctx];
-    if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) {
+    if (!vpx_read(r, prob[EOB_CONTEXT_NODE])) {
       INCREMENT_COUNT(EOB_MODEL_TOKEN);
       break;
     }
 
-    while (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
+    while (!vpx_read(r, prob[ZERO_CONTEXT_NODE])) {
       INCREMENT_COUNT(ZERO_TOKEN);
       dqv = dq[1];
       token_cache[scan[c]] = 0;
@@ -134,13 +134,13 @@
       prob = coef_probs[band][ctx];
     }
 
-    if (!vp9_read(r, prob[ONE_CONTEXT_NODE])) {
+    if (!vpx_read(r, prob[ONE_CONTEXT_NODE])) {
       INCREMENT_COUNT(ONE_TOKEN);
       token = ONE_TOKEN;
       val = 1;
     } else {
       INCREMENT_COUNT(TWO_TOKEN);
-      token = vp9_read_tree(r, vp9_coef_con_tree,
+      token = vpx_read_tree(r, vp9_coef_con_tree,
                             vp9_pareto8_full[prob[PIVOT_NODE] - 1]);
       switch (token) {
         case TWO_TOKEN:
@@ -188,13 +188,13 @@
     v = (val * dqv) >> dq_shift;
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
 #if CONFIG_VP9_HIGHBITDEPTH
-    dqcoeff[scan[c]] = highbd_check_range((vp9_read_bit(r) ? -v : v),
+    dqcoeff[scan[c]] = highbd_check_range((vpx_read_bit(r) ? -v : v),
                                           xd->bd);
 #else
-    dqcoeff[scan[c]] = check_range(vp9_read_bit(r) ? -v : v);
+    dqcoeff[scan[c]] = check_range(vpx_read_bit(r) ? -v : v);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #else
-    dqcoeff[scan[c]] = vp9_read_bit(r) ? -v : v;
+    dqcoeff[scan[c]] = vpx_read_bit(r) ? -v : v;
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
     token_cache[scan[c]] = vp9_pt_energy_class[token];
     ++c;
@@ -205,10 +205,55 @@
   return c;
 }
 
+// TODO(slavarnway): Decode version of vp9_set_context.  Modify vp9_set_context
+// after testing is complete, then delete this version.
+static
+void dec_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
+                      TX_SIZE tx_size, int has_eob,
+                      int aoff, int loff) {
+  ENTROPY_CONTEXT *const a = pd->above_context + aoff;
+  ENTROPY_CONTEXT *const l = pd->left_context + loff;
+  const int tx_size_in_blocks = 1 << tx_size;
+
+  // above
+  if (has_eob && xd->mb_to_right_edge < 0) {
+    int i;
+    const int blocks_wide = pd->n4_w +
+                            (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+    int above_contexts = tx_size_in_blocks;
+    if (above_contexts + aoff > blocks_wide)
+      above_contexts = blocks_wide - aoff;
+
+    for (i = 0; i < above_contexts; ++i)
+      a[i] = has_eob;
+    for (i = above_contexts; i < tx_size_in_blocks; ++i)
+      a[i] = 0;
+  } else {
+    memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+  }
+
+  // left
+  if (has_eob && xd->mb_to_bottom_edge < 0) {
+    int i;
+    const int blocks_high = pd->n4_h +
+                            (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+    int left_contexts = tx_size_in_blocks;
+    if (left_contexts + loff > blocks_high)
+      left_contexts = blocks_high - loff;
+
+    for (i = 0; i < left_contexts; ++i)
+      l[i] = has_eob;
+    for (i = left_contexts; i < tx_size_in_blocks; ++i)
+      l[i] = 0;
+  } else {
+    memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+  }
+}
+
 int vp9_decode_block_tokens(MACROBLOCKD *xd,
                             int plane, const scan_order *sc,
-                            BLOCK_SIZE plane_bsize, int x, int y,
-                            TX_SIZE tx_size, vp9_reader *r,
+                            int x, int y,
+                            TX_SIZE tx_size, vpx_reader *r,
                             int seg_id) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int16_t *const dequant = pd->seg_dequant[seg_id];
@@ -217,7 +262,7 @@
   const int eob = decode_coefs(xd, pd->plane_type,
                                pd->dqcoeff, tx_size,
                                dequant, ctx, sc->scan, sc->neighbors, r);
-  vp9_set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y);
+  dec_set_contexts(xd, pd, tx_size, eob > 0, x, y);
   return eob;
 }
 

diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index ed826a4..d242d44 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h

@@ -12,8 +12,8 @@
 #ifndef VP9_DECODER_VP9_DETOKENIZE_H_
 #define VP9_DECODER_VP9_DETOKENIZE_H_
 
+#include "vpx_dsp/bitreader.h"
 #include "vp9/decoder/vp9_decoder.h"
-#include "vp9/decoder/vp9_reader.h"
 #include "vp9/common/vp9_scan.h"
 
 #ifdef __cplusplus
@@ -22,8 +22,8 @@
 
 int vp9_decode_block_tokens(MACROBLOCKD *xd,
                             int plane, const scan_order *sc,
-                            BLOCK_SIZE plane_bsize, int x, int y,
-                            TX_SIZE tx_size, vp9_reader *r,
+                            int x, int y,
+                            TX_SIZE tx_size, vpx_reader *r,
                             int seg_id);
 
 #ifdef __cplusplus

diff --git a/vp9/decoder/vp9_dsubexp.c b/vp9/decoder/vp9_dsubexp.c
index b33c3b7..4fbc6db 100644
--- a/vp9/decoder/vp9_dsubexp.c
+++ b/vp9/decoder/vp9_dsubexp.c

@@ -21,11 +21,11 @@
   return (v & 1) ? m - ((v + 1) >> 1) : m + (v >> 1);
 }
 
-static int decode_uniform(vp9_reader *r) {
+static int decode_uniform(vpx_reader *r) {
   const int l = 8;
   const int m = (1 << l) - 191;
-  const int v = vp9_read_literal(r, l - 1);
-  return v < m ?  v : (v << 1) - m + vp9_read_bit(r);
+  const int v = vpx_read_literal(r, l - 1);
+  return v < m ?  v : (v << 1) - m + vpx_read_bit(r);
 }
 
 static int inv_remap_prob(int v, int m) {
@@ -58,19 +58,19 @@
   }
 }
 
-static int decode_term_subexp(vp9_reader *r) {
-  if (!vp9_read_bit(r))
-    return vp9_read_literal(r, 4);
-  if (!vp9_read_bit(r))
-    return vp9_read_literal(r, 4) + 16;
-  if (!vp9_read_bit(r))
-    return vp9_read_literal(r, 5) + 32;
+static int decode_term_subexp(vpx_reader *r) {
+  if (!vpx_read_bit(r))
+    return vpx_read_literal(r, 4);
+  if (!vpx_read_bit(r))
+    return vpx_read_literal(r, 4) + 16;
+  if (!vpx_read_bit(r))
+    return vpx_read_literal(r, 5) + 32;
   return decode_uniform(r) + 64;
 }
 
-void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p) {
-  if (vp9_read(r, DIFF_UPDATE_PROB)) {
+void vp9_diff_update_prob(vpx_reader *r, vpx_prob* p) {
+  if (vpx_read(r, DIFF_UPDATE_PROB)) {
     const int delp = decode_term_subexp(r);
-    *p = (vp9_prob)inv_remap_prob(delp, *p);
+    *p = (vpx_prob)inv_remap_prob(delp, *p);
   }
 }

diff --git a/vp9/decoder/vp9_dsubexp.h b/vp9/decoder/vp9_dsubexp.h
index 436f434..a8bcc70 100644
--- a/vp9/decoder/vp9_dsubexp.h
+++ b/vp9/decoder/vp9_dsubexp.h

@@ -12,13 +12,13 @@
 #ifndef VP9_DECODER_VP9_DSUBEXP_H_
 #define VP9_DECODER_VP9_DSUBEXP_H_
 
-#include "vp9/decoder/vp9_reader.h"
+#include "vpx_dsp/bitreader.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p);
+void vp9_diff_update_prob(vpx_reader *r, vpx_prob* p);
 
 #ifdef __cplusplus
 }  // extern "C"

diff --git a/vp9/decoder/vp9_read_bit_buffer.h b/vp9/decoder/vp9_read_bit_buffer.h
deleted file mode 100644
index fc88bd7..0000000
--- a/vp9/decoder/vp9_read_bit_buffer.h
+++ /dev/null

@@ -1,45 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_DECODER_VP9_READ_BIT_BUFFER_H_
-#define VP9_DECODER_VP9_READ_BIT_BUFFER_H_
-
-#include <limits.h>
-
-#include "vpx/vpx_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef void (*vp9_rb_error_handler)(void *data);
-
-struct vp9_read_bit_buffer {
-  const uint8_t *bit_buffer;
-  const uint8_t *bit_buffer_end;
-  size_t bit_offset;
-
-  void *error_handler_data;
-  vp9_rb_error_handler error_handler;
-};
-
-size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb);
-
-int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb);
-
-int vp9_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits);
-
-int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb, int bits);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP9_DECODER_VP9_READ_BIT_BUFFER_H_

diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c
index fecab57..40d7e87 100644
--- a/vp9/encoder/arm/neon/vp9_avg_neon.c
+++ b/vp9/encoder/arm/neon/vp9_avg_neon.c

@@ -100,3 +100,17 @@
   hbuf += 8;
   vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
 }
+
+int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width) {
+  int i;
+  uint16x8_t vec_sum = vdupq_n_u16(0);
+
+  for (i = 0; i < width; i += 16) {
+    const uint8x16_t vec_row = vld1q_u8(ref);
+    vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
+    vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
+    ref += 16;
+  }
+
+  return horizontal_add_u16x8(vec_sum);
+}

diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 6270bf4..b619063 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c

@@ -347,7 +347,10 @@
   // For video conference clips, if the background has high motion in current
   // frame because of the camera movement, set this frame as the golden frame.
   // Use 70% and 5% as the thresholds for golden frame refreshing.
-  if (cnt1 * 10 > (70 * rows * cols) && cnt2 * 20 < cnt1) {
+  // Also, force this frame as a golden update frame if this frame will change
+  // the resolution (resize_pending != 0).
+  if (cpi->resize_pending != 0 ||
+     (cnt1 * 10 > (70 * rows * cols) && cnt2 * 20 < cnt1)) {
     vp9_cyclic_refresh_set_golden_update(cpi);
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
 
@@ -562,4 +565,5 @@
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   memset(cr->map, 0, cm->mi_rows * cm->mi_cols);
   cr->sb_index = 0;
+  cpi->refresh_golden_frame = 1;
 }

diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 4ca4083..f06bd56 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c

@@ -13,6 +13,7 @@
 #include <limits.h>
 
 #include "vpx/vpx_encoder.h"
+#include "vpx_dsp/bitwriter_buffer.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem_ops.h"
 
@@ -32,7 +33,6 @@
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_subexp.h"
 #include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_write_bit_buffer.h"
 
 static const struct vp9_token intra_mode_encodings[INTRA_MODES] = {
   {0, 1}, {6, 3}, {28, 5}, {30, 5}, {58, 6}, {59, 6}, {126, 7}, {127, 7},
@@ -44,27 +44,27 @@
 static const struct vp9_token inter_mode_encodings[INTER_MODES] =
   {{2, 2}, {6, 3}, {0, 1}, {7, 3}};
 
-static void write_intra_mode(vp9_writer *w, PREDICTION_MODE mode,
-                             const vp9_prob *probs) {
+static void write_intra_mode(vpx_writer *w, PREDICTION_MODE mode,
+                             const vpx_prob *probs) {
   vp9_write_token(w, vp9_intra_mode_tree, probs, &intra_mode_encodings[mode]);
 }
 
-static void write_inter_mode(vp9_writer *w, PREDICTION_MODE mode,
-                             const vp9_prob *probs) {
+static void write_inter_mode(vpx_writer *w, PREDICTION_MODE mode,
+                             const vpx_prob *probs) {
   assert(is_inter_mode(mode));
   vp9_write_token(w, vp9_inter_mode_tree, probs,
                   &inter_mode_encodings[INTER_OFFSET(mode)]);
 }
 
-static void encode_unsigned_max(struct vp9_write_bit_buffer *wb,
+static void encode_unsigned_max(struct vpx_write_bit_buffer *wb,
                                 int data, int max) {
-  vp9_wb_write_literal(wb, data, get_unsigned_bits(max));
+  vpx_wb_write_literal(wb, data, get_unsigned_bits(max));
 }
 
-static void prob_diff_update(const vp9_tree_index *tree,
-                             vp9_prob probs[/*n - 1*/],
+static void prob_diff_update(const vpx_tree_index *tree,
+                             vpx_prob probs[/*n - 1*/],
                              const unsigned int counts[/*n - 1*/],
-                             int n, vp9_writer *w) {
+                             int n, vpx_writer *w) {
   int i;
   unsigned int branch_ct[32][2];
 
@@ -77,32 +77,32 @@
 }
 
 static void write_selected_tx_size(const VP9_COMMON *cm,
-                                   const MACROBLOCKD *xd, vp9_writer *w) {
+                                   const MACROBLOCKD *xd, vpx_writer *w) {
   TX_SIZE tx_size = xd->mi[0]->mbmi.tx_size;
   BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
-  const vp9_prob *const tx_probs = get_tx_probs2(max_tx_size, xd,
+  const vpx_prob *const tx_probs = get_tx_probs2(max_tx_size, xd,
                                                  &cm->fc->tx_probs);
-  vp9_write(w, tx_size != TX_4X4, tx_probs[0]);
+  vpx_write(w, tx_size != TX_4X4, tx_probs[0]);
   if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
-    vp9_write(w, tx_size != TX_8X8, tx_probs[1]);
+    vpx_write(w, tx_size != TX_8X8, tx_probs[1]);
     if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
-      vp9_write(w, tx_size != TX_16X16, tx_probs[2]);
+      vpx_write(w, tx_size != TX_16X16, tx_probs[2]);
   }
 }
 
 static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                      int segment_id, const MODE_INFO *mi, vp9_writer *w) {
+                      int segment_id, const MODE_INFO *mi, vpx_writer *w) {
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
     const int skip = mi->mbmi.skip;
-    vp9_write(w, skip, vp9_get_skip_prob(cm, xd));
+    vpx_write(w, skip, vp9_get_skip_prob(cm, xd));
     return skip;
   }
 }
 
-static void update_skip_probs(VP9_COMMON *cm, vp9_writer *w,
+static void update_skip_probs(VP9_COMMON *cm, vpx_writer *w,
                               FRAME_COUNTS *counts) {
   int k;
 
@@ -110,7 +110,7 @@
     vp9_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k]);
 }
 
-static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w,
+static void update_switchable_interp_probs(VP9_COMMON *cm, vpx_writer *w,
                                            FRAME_COUNTS *counts) {
   int j;
   for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
@@ -119,7 +119,7 @@
                      counts->switchable_interp[j], SWITCHABLE_FILTERS, w);
 }
 
-static void pack_mb_tokens(vp9_writer *w,
+static void pack_mb_tokens(vpx_writer *w,
                            TOKENEXTRA **tp, const TOKENEXTRA *const stop,
                            vpx_bit_depth_t bit_depth) {
   TOKENEXTRA *p = *tp;
@@ -179,12 +179,12 @@
 
         do {
           const int bb = (v >> --n) & 1;
-          vp9_write(w, bb, pb[i >> 1]);
+          vpx_write(w, bb, pb[i >> 1]);
           i = b->tree[i + bb];
         } while (n);
       }
 
-      vp9_write_bit(w, e & 1);
+      vpx_write_bit(w, e & 1);
     }
     ++p;
   }
@@ -192,7 +192,7 @@
   *tp = p + (p->token == EOSB_TOKEN);
 }
 
-static void write_segment_id(vp9_writer *w, const struct segmentation *seg,
+static void write_segment_id(vpx_writer *w, const struct segmentation *seg,
                              int segment_id) {
   if (seg->enabled && seg->update_map)
     vp9_write_tree(w, vp9_segment_tree, seg->tree_probs, segment_id, 3, 0);
@@ -200,7 +200,7 @@
 
 // This function encodes the reference frame
 static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                             vp9_writer *w) {
+                             vpx_writer *w) {
   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const int is_compound = has_second_ref(mbmi);
   const int segment_id = mbmi->segment_id;
@@ -215,27 +215,27 @@
     // does the feature use compound prediction or not
     // (if not specified at the frame/segment level)
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
-      vp9_write(w, is_compound, vp9_get_reference_mode_prob(cm, xd));
+      vpx_write(w, is_compound, vp9_get_reference_mode_prob(cm, xd));
     } else {
       assert(!is_compound == (cm->reference_mode == SINGLE_REFERENCE));
     }
 
     if (is_compound) {
-      vp9_write(w, mbmi->ref_frame[0] == GOLDEN_FRAME,
+      vpx_write(w, mbmi->ref_frame[0] == GOLDEN_FRAME,
                 vp9_get_pred_prob_comp_ref_p(cm, xd));
     } else {
       const int bit0 = mbmi->ref_frame[0] != LAST_FRAME;
-      vp9_write(w, bit0, vp9_get_pred_prob_single_ref_p1(cm, xd));
+      vpx_write(w, bit0, vp9_get_pred_prob_single_ref_p1(cm, xd));
       if (bit0) {
         const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME;
-        vp9_write(w, bit1, vp9_get_pred_prob_single_ref_p2(cm, xd));
+        vpx_write(w, bit1, vp9_get_pred_prob_single_ref_p2(cm, xd));
       }
     }
   }
 }
 
 static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
-                                vp9_writer *w) {
+                                vpx_writer *w) {
   VP9_COMMON *const cm = &cpi->common;
   const nmv_context *nmvc = &cm->fc->nmvc;
   const MACROBLOCK *const x = &cpi->td.mb;
@@ -254,8 +254,8 @@
   if (seg->update_map) {
     if (seg->temporal_update) {
       const int pred_flag = mbmi->seg_id_predicted;
-      vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
-      vp9_write(w, pred_flag, pred_prob);
+      vpx_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
+      vpx_write(w, pred_flag, pred_prob);
       if (!pred_flag)
         write_segment_id(w, seg, segment_id);
     } else {
@@ -266,7 +266,7 @@
   skip = write_skip(cm, xd, segment_id, mi, w);
 
   if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
-    vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd));
+    vpx_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd));
 
   if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
       !(is_inter && skip)) {
@@ -290,7 +290,7 @@
     write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mode]);
   } else {
     const int mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
-    const vp9_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx];
+    const vpx_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx];
     write_ref_frames(cm, xd, w);
 
     // If segment skip is not enabled code the mode.
@@ -339,7 +339,7 @@
 }
 
 static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                              MODE_INFO **mi_8x8, vp9_writer *w) {
+                              MODE_INFO **mi_8x8, vpx_writer *w) {
   const struct segmentation *const seg = &cm->seg;
   const MODE_INFO *const mi = mi_8x8[0];
   const MODE_INFO *const above_mi = xd->above_mi;
@@ -375,7 +375,7 @@
 }
 
 static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
-                          vp9_writer *w, TOKENEXTRA **tok,
+                          vpx_writer *w, TOKENEXTRA **tok,
                           const TOKENEXTRA *const tok_end,
                           int mi_row, int mi_col) {
   const VP9_COMMON *const cm = &cpi->common;
@@ -405,9 +405,9 @@
 static void write_partition(const VP9_COMMON *const cm,
                             const MACROBLOCKD *const xd,
                             int hbs, int mi_row, int mi_col,
-                            PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) {
+                            PARTITION_TYPE p, BLOCK_SIZE bsize, vpx_writer *w) {
   const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
-  const vp9_prob *const probs = xd->partition_probs[ctx];
+  const vpx_prob *const probs = xd->partition_probs[ctx];
   const int has_rows = (mi_row + hbs) < cm->mi_rows;
   const int has_cols = (mi_col + hbs) < cm->mi_cols;
 
@@ -415,17 +415,17 @@
     vp9_write_token(w, vp9_partition_tree, probs, &partition_encodings[p]);
   } else if (!has_rows && has_cols) {
     assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
-    vp9_write(w, p == PARTITION_SPLIT, probs[1]);
+    vpx_write(w, p == PARTITION_SPLIT, probs[1]);
   } else if (has_rows && !has_cols) {
     assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
-    vp9_write(w, p == PARTITION_SPLIT, probs[2]);
+    vpx_write(w, p == PARTITION_SPLIT, probs[2]);
   } else {
     assert(p == PARTITION_SPLIT);
   }
 }
 
 static void write_modes_sb(VP9_COMP *cpi,
-                           const TileInfo *const tile, vp9_writer *w,
+                           const TileInfo *const tile, vpx_writer *w,
                            TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
                            int mi_row, int mi_col, BLOCK_SIZE bsize) {
   const VP9_COMMON *const cm = &cpi->common;
@@ -483,7 +483,7 @@
 }
 
 static void write_modes(VP9_COMP *cpi,
-                        const TileInfo *const tile, vp9_writer *w,
+                        const TileInfo *const tile, vpx_writer *w,
                         TOKENEXTRA **tok, const TOKENEXTRA *const tok_end) {
   const VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
@@ -528,12 +528,12 @@
   }
 }
 
-static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
+static void update_coef_probs_common(vpx_writer* const bc, VP9_COMP *cpi,
                                      TX_SIZE tx_size,
                                      vp9_coeff_stats *frame_branch_ct,
                                      vp9_coeff_probs_model *new_coef_probs) {
   vp9_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size];
-  const vp9_prob upd = DIFF_UPDATE_PROB;
+  const vpx_prob upd = DIFF_UPDATE_PROB;
   const int entropy_nodes_update = UNCONSTRAINED_NODES;
   int i, j, k, l, t;
   int stepsize = cpi->sf.coeff_prob_appx_step;
@@ -548,8 +548,8 @@
           for (k = 0; k < COEF_BANDS; ++k) {
             for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
               for (t = 0; t < entropy_nodes_update; ++t) {
-                vp9_prob newp = new_coef_probs[i][j][k][l][t];
-                const vp9_prob oldp = old_coef_probs[i][j][k][l][t];
+                vpx_prob newp = new_coef_probs[i][j][k][l][t];
+                const vpx_prob oldp = old_coef_probs[i][j][k][l][t];
                 int s;
                 int u = 0;
                 if (t == PIVOT_NODE)
@@ -575,19 +575,19 @@
       // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
       /* Is coef updated at all */
       if (update[1] == 0 || savings < 0) {
-        vp9_write_bit(bc, 0);
+        vpx_write_bit(bc, 0);
         return;
       }
-      vp9_write_bit(bc, 1);
+      vpx_write_bit(bc, 1);
       for (i = 0; i < PLANE_TYPES; ++i) {
         for (j = 0; j < REF_TYPES; ++j) {
           for (k = 0; k < COEF_BANDS; ++k) {
             for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
               // calc probs and branch cts for this frame only
               for (t = 0; t < entropy_nodes_update; ++t) {
-                vp9_prob newp = new_coef_probs[i][j][k][l][t];
-                vp9_prob *oldp = old_coef_probs[i][j][k][l] + t;
-                const vp9_prob upd = DIFF_UPDATE_PROB;
+                vpx_prob newp = new_coef_probs[i][j][k][l][t];
+                vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
+                const vpx_prob upd = DIFF_UPDATE_PROB;
                 int s;
                 int u = 0;
                 if (t == PIVOT_NODE)
@@ -600,7 +600,7 @@
                       *oldp, &newp, upd);
                 if (s > 0 && newp != *oldp)
                   u = 1;
-                vp9_write(bc, u, upd);
+                vpx_write(bc, u, upd);
                 if (u) {
                   /* send/use new probability */
                   vp9_write_prob_diff_update(bc, newp, *oldp);
@@ -623,8 +623,8 @@
             for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
               // calc probs and branch cts for this frame only
               for (t = 0; t < entropy_nodes_update; ++t) {
-                vp9_prob newp = new_coef_probs[i][j][k][l][t];
-                vp9_prob *oldp = old_coef_probs[i][j][k][l] + t;
+                vpx_prob newp = new_coef_probs[i][j][k][l][t];
+                vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
                 int s;
                 int u = 0;
 
@@ -648,11 +648,11 @@
                 if (u == 1 && updates == 1) {
                   int v;
                   // first update
-                  vp9_write_bit(bc, 1);
+                  vpx_write_bit(bc, 1);
                   for (v = 0; v < noupdates_before_first; ++v)
-                    vp9_write(bc, 0, upd);
+                    vpx_write(bc, 0, upd);
                 }
-                vp9_write(bc, u, upd);
+                vpx_write(bc, u, upd);
                 if (u) {
                   /* send/use new probability */
                   vp9_write_prob_diff_update(bc, newp, *oldp);
@@ -664,7 +664,7 @@
         }
       }
       if (updates == 0) {
-        vp9_write_bit(bc, 0);  // no updates
+        vpx_write_bit(bc, 0);  // no updates
       }
       return;
     }
@@ -673,7 +673,7 @@
   }
 }
 
-static void update_coef_probs(VP9_COMP *cpi, vp9_writer* w) {
+static void update_coef_probs(VP9_COMP *cpi, vpx_writer* w) {
   const TX_MODE tx_mode = cpi->common.tx_mode;
   const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
   TX_SIZE tx_size;
@@ -682,7 +682,7 @@
     vp9_coeff_probs_model frame_coef_probs[PLANE_TYPES];
     if (cpi->td.counts->tx.tx_totals[tx_size] <= 20 ||
         (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8)) {
-      vp9_write_bit(w, 0);
+      vpx_write_bit(w, 0);
     } else {
       build_tree_distribution(cpi, tx_size, frame_branch_ct,
                               frame_coef_probs);
@@ -693,75 +693,75 @@
 }
 
 static void encode_loopfilter(struct loopfilter *lf,
-                              struct vp9_write_bit_buffer *wb) {
+                              struct vpx_write_bit_buffer *wb) {
   int i;
 
   // Encode the loop filter level and type
-  vp9_wb_write_literal(wb, lf->filter_level, 6);
-  vp9_wb_write_literal(wb, lf->sharpness_level, 3);
+  vpx_wb_write_literal(wb, lf->filter_level, 6);
+  vpx_wb_write_literal(wb, lf->sharpness_level, 3);
 
   // Write out loop filter deltas applied at the MB level based on mode or
   // ref frame (if they are enabled).
-  vp9_wb_write_bit(wb, lf->mode_ref_delta_enabled);
+  vpx_wb_write_bit(wb, lf->mode_ref_delta_enabled);
 
   if (lf->mode_ref_delta_enabled) {
-    vp9_wb_write_bit(wb, lf->mode_ref_delta_update);
+    vpx_wb_write_bit(wb, lf->mode_ref_delta_update);
     if (lf->mode_ref_delta_update) {
       for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
         const int delta = lf->ref_deltas[i];
         const int changed = delta != lf->last_ref_deltas[i];
-        vp9_wb_write_bit(wb, changed);
+        vpx_wb_write_bit(wb, changed);
         if (changed) {
           lf->last_ref_deltas[i] = delta;
-          vp9_wb_write_literal(wb, abs(delta) & 0x3F, 6);
-          vp9_wb_write_bit(wb, delta < 0);
+          vpx_wb_write_literal(wb, abs(delta) & 0x3F, 6);
+          vpx_wb_write_bit(wb, delta < 0);
         }
       }
 
       for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
         const int delta = lf->mode_deltas[i];
         const int changed = delta != lf->last_mode_deltas[i];
-        vp9_wb_write_bit(wb, changed);
+        vpx_wb_write_bit(wb, changed);
         if (changed) {
           lf->last_mode_deltas[i] = delta;
-          vp9_wb_write_literal(wb, abs(delta) & 0x3F, 6);
-          vp9_wb_write_bit(wb, delta < 0);
+          vpx_wb_write_literal(wb, abs(delta) & 0x3F, 6);
+          vpx_wb_write_bit(wb, delta < 0);
         }
       }
     }
   }
 }
 
-static void write_delta_q(struct vp9_write_bit_buffer *wb, int delta_q) {
+static void write_delta_q(struct vpx_write_bit_buffer *wb, int delta_q) {
   if (delta_q != 0) {
-    vp9_wb_write_bit(wb, 1);
-    vp9_wb_write_literal(wb, abs(delta_q), 4);
-    vp9_wb_write_bit(wb, delta_q < 0);
+    vpx_wb_write_bit(wb, 1);
+    vpx_wb_write_literal(wb, abs(delta_q), 4);
+    vpx_wb_write_bit(wb, delta_q < 0);
   } else {
-    vp9_wb_write_bit(wb, 0);
+    vpx_wb_write_bit(wb, 0);
   }
 }
 
 static void encode_quantization(const VP9_COMMON *const cm,
-                                struct vp9_write_bit_buffer *wb) {
-  vp9_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS);
+                                struct vpx_write_bit_buffer *wb) {
+  vpx_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS);
   write_delta_q(wb, cm->y_dc_delta_q);
   write_delta_q(wb, cm->uv_dc_delta_q);
   write_delta_q(wb, cm->uv_ac_delta_q);
 }
 
 static void encode_segmentation(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                struct vp9_write_bit_buffer *wb) {
+                                struct vpx_write_bit_buffer *wb) {
   int i, j;
 
   const struct segmentation *seg = &cm->seg;
 
-  vp9_wb_write_bit(wb, seg->enabled);
+  vpx_wb_write_bit(wb, seg->enabled);
   if (!seg->enabled)
     return;
 
   // Segmentation map
-  vp9_wb_write_bit(wb, seg->update_map);
+  vpx_wb_write_bit(wb, seg->update_map);
   if (seg->update_map) {
     // Select the coding strategy (temporal or spatial)
     vp9_choose_segmap_coding_method(cm, xd);
@@ -769,40 +769,40 @@
     for (i = 0; i < SEG_TREE_PROBS; i++) {
       const int prob = seg->tree_probs[i];
       const int update = prob != MAX_PROB;
-      vp9_wb_write_bit(wb, update);
+      vpx_wb_write_bit(wb, update);
       if (update)
-        vp9_wb_write_literal(wb, prob, 8);
+        vpx_wb_write_literal(wb, prob, 8);
     }
 
     // Write out the chosen coding method.
-    vp9_wb_write_bit(wb, seg->temporal_update);
+    vpx_wb_write_bit(wb, seg->temporal_update);
     if (seg->temporal_update) {
       for (i = 0; i < PREDICTION_PROBS; i++) {
         const int prob = seg->pred_probs[i];
         const int update = prob != MAX_PROB;
-        vp9_wb_write_bit(wb, update);
+        vpx_wb_write_bit(wb, update);
         if (update)
-          vp9_wb_write_literal(wb, prob, 8);
+          vpx_wb_write_literal(wb, prob, 8);
       }
     }
   }
 
   // Segmentation data
-  vp9_wb_write_bit(wb, seg->update_data);
+  vpx_wb_write_bit(wb, seg->update_data);
   if (seg->update_data) {
-    vp9_wb_write_bit(wb, seg->abs_delta);
+    vpx_wb_write_bit(wb, seg->abs_delta);
 
     for (i = 0; i < MAX_SEGMENTS; i++) {
       for (j = 0; j < SEG_LVL_MAX; j++) {
         const int active = segfeature_active(seg, i, j);
-        vp9_wb_write_bit(wb, active);
+        vpx_wb_write_bit(wb, active);
         if (active) {
           const int data = get_segdata(seg, i, j);
           const int data_max = vp9_seg_feature_data_max(j);
 
           if (vp9_is_segfeature_signed(j)) {
             encode_unsigned_max(wb, abs(data), data_max);
-            vp9_wb_write_bit(wb, data < 0);
+            vpx_wb_write_bit(wb, data < 0);
           } else {
             encode_unsigned_max(wb, data, data_max);
           }
@@ -812,12 +812,12 @@
   }
 }
 
-static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w,
+static void encode_txfm_probs(VP9_COMMON *cm, vpx_writer *w,
                               FRAME_COUNTS *counts) {
   // Mode
-  vp9_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2);
+  vpx_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2);
   if (cm->tx_mode >= ALLOW_32X32)
-    vp9_write_bit(w, cm->tx_mode == TX_MODE_SELECT);
+    vpx_write_bit(w, cm->tx_mode == TX_MODE_SELECT);
 
   // Probabilities
   if (cm->tx_mode == TX_MODE_SELECT) {
@@ -850,12 +850,12 @@
 }
 
 static void write_interp_filter(INTERP_FILTER filter,
-                                struct vp9_write_bit_buffer *wb) {
+                                struct vpx_write_bit_buffer *wb) {
   const int filter_to_literal[] = { 1, 0, 2, 3 };
 
-  vp9_wb_write_bit(wb, filter == SWITCHABLE);
+  vpx_wb_write_bit(wb, filter == SWITCHABLE);
   if (filter != SWITCHABLE)
-    vp9_wb_write_literal(wb, filter_to_literal[filter], 2);
+    vpx_wb_write_literal(wb, filter_to_literal[filter], 2);
 }
 
 static void fix_interp_filter(VP9_COMMON *cm, FRAME_COUNTS *counts) {
@@ -882,22 +882,22 @@
 }
 
 static void write_tile_info(const VP9_COMMON *const cm,
-                            struct vp9_write_bit_buffer *wb) {
+                            struct vpx_write_bit_buffer *wb) {
   int min_log2_tile_cols, max_log2_tile_cols, ones;
   vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
 
   // columns
   ones = cm->log2_tile_cols - min_log2_tile_cols;
   while (ones--)
-    vp9_wb_write_bit(wb, 1);
+    vpx_wb_write_bit(wb, 1);
 
   if (cm->log2_tile_cols < max_log2_tile_cols)
-    vp9_wb_write_bit(wb, 0);
+    vpx_wb_write_bit(wb, 0);
 
   // rows
-  vp9_wb_write_bit(wb, cm->log2_tile_rows != 0);
+  vpx_wb_write_bit(wb, cm->log2_tile_rows != 0);
   if (cm->log2_tile_rows != 0)
-    vp9_wb_write_bit(wb, cm->log2_tile_rows != 1);
+    vpx_wb_write_bit(wb, cm->log2_tile_rows != 1);
 }
 
 static int get_refresh_mask(VP9_COMP *cpi) {
@@ -928,7 +928,7 @@
 
 static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
   VP9_COMMON *const cm = &cpi->common;
-  vp9_writer residual_bc;
+  vpx_writer residual_bc;
   int tile_row, tile_col;
   TOKENEXTRA *tok_end;
   size_t total_size = 0;
@@ -947,14 +947,14 @@
           cpi->tok_count[tile_row][tile_col];
 
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
-        vp9_start_encode(&residual_bc, data_ptr + total_size + 4);
+        vpx_start_encode(&residual_bc, data_ptr + total_size + 4);
       else
-        vp9_start_encode(&residual_bc, data_ptr + total_size);
+        vpx_start_encode(&residual_bc, data_ptr + total_size);
 
       write_modes(cpi, &cpi->tile_data[tile_idx].tile_info,
                   &residual_bc, &tok, tok_end);
       assert(tok == tok_end);
-      vp9_stop_encode(&residual_bc);
+      vpx_stop_encode(&residual_bc);
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
         // size of this tile
         mem_put_be32(data_ptr + total_size, residual_bc.pos);
@@ -969,26 +969,26 @@
 }
 
 static void write_display_size(const VP9_COMMON *cm,
-                               struct vp9_write_bit_buffer *wb) {
+                               struct vpx_write_bit_buffer *wb) {
   const int scaling_active = cm->width != cm->display_width ||
                              cm->height != cm->display_height;
-  vp9_wb_write_bit(wb, scaling_active);
+  vpx_wb_write_bit(wb, scaling_active);
   if (scaling_active) {
-    vp9_wb_write_literal(wb, cm->display_width - 1, 16);
-    vp9_wb_write_literal(wb, cm->display_height - 1, 16);
+    vpx_wb_write_literal(wb, cm->display_width - 1, 16);
+    vpx_wb_write_literal(wb, cm->display_height - 1, 16);
   }
 }
 
 static void write_frame_size(const VP9_COMMON *cm,
-                             struct vp9_write_bit_buffer *wb) {
-  vp9_wb_write_literal(wb, cm->width - 1, 16);
-  vp9_wb_write_literal(wb, cm->height - 1, 16);
+                             struct vpx_write_bit_buffer *wb) {
+  vpx_wb_write_literal(wb, cm->width - 1, 16);
+  vpx_wb_write_literal(wb, cm->height - 1, 16);
 
   write_display_size(cm, wb);
 }
 
 static void write_frame_size_with_refs(VP9_COMP *cpi,
-                                       struct vp9_write_bit_buffer *wb) {
+                                       struct vpx_write_bit_buffer *wb) {
   VP9_COMMON *const cm = &cpi->common;
   int found = 0;
 
@@ -1011,40 +1011,40 @@
       found = cm->width == cfg->y_crop_width &&
               cm->height == cfg->y_crop_height;
     }
-    vp9_wb_write_bit(wb, found);
+    vpx_wb_write_bit(wb, found);
     if (found) {
       break;
     }
   }
 
   if (!found) {
-    vp9_wb_write_literal(wb, cm->width - 1, 16);
-    vp9_wb_write_literal(wb, cm->height - 1, 16);
+    vpx_wb_write_literal(wb, cm->width - 1, 16);
+    vpx_wb_write_literal(wb, cm->height - 1, 16);
   }
 
   write_display_size(cm, wb);
 }
 
-static void write_sync_code(struct vp9_write_bit_buffer *wb) {
-  vp9_wb_write_literal(wb, VP9_SYNC_CODE_0, 8);
-  vp9_wb_write_literal(wb, VP9_SYNC_CODE_1, 8);
-  vp9_wb_write_literal(wb, VP9_SYNC_CODE_2, 8);
+static void write_sync_code(struct vpx_write_bit_buffer *wb) {
+  vpx_wb_write_literal(wb, VP9_SYNC_CODE_0, 8);
+  vpx_wb_write_literal(wb, VP9_SYNC_CODE_1, 8);
+  vpx_wb_write_literal(wb, VP9_SYNC_CODE_2, 8);
 }
 
 static void write_profile(BITSTREAM_PROFILE profile,
-                          struct vp9_write_bit_buffer *wb) {
+                          struct vpx_write_bit_buffer *wb) {
   switch (profile) {
     case PROFILE_0:
-      vp9_wb_write_literal(wb, 0, 2);
+      vpx_wb_write_literal(wb, 0, 2);
       break;
     case PROFILE_1:
-      vp9_wb_write_literal(wb, 2, 2);
+      vpx_wb_write_literal(wb, 2, 2);
       break;
     case PROFILE_2:
-      vp9_wb_write_literal(wb, 1, 2);
+      vpx_wb_write_literal(wb, 1, 2);
       break;
     case PROFILE_3:
-      vp9_wb_write_literal(wb, 6, 3);
+      vpx_wb_write_literal(wb, 6, 3);
       break;
     default:
       assert(0);
@@ -1052,41 +1052,41 @@
 }
 
 static void write_bitdepth_colorspace_sampling(
-    VP9_COMMON *const cm, struct vp9_write_bit_buffer *wb) {
+    VP9_COMMON *const cm, struct vpx_write_bit_buffer *wb) {
   if (cm->profile >= PROFILE_2) {
     assert(cm->bit_depth > VPX_BITS_8);
-    vp9_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 0 : 1);
+    vpx_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 0 : 1);
   }
-  vp9_wb_write_literal(wb, cm->color_space, 3);
+  vpx_wb_write_literal(wb, cm->color_space, 3);
   if (cm->color_space != VPX_CS_SRGB) {
-    vp9_wb_write_bit(wb, 0);  // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+    vpx_wb_write_bit(wb, 0);  // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
     if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
       assert(cm->subsampling_x != 1 || cm->subsampling_y != 1);
-      vp9_wb_write_bit(wb, cm->subsampling_x);
-      vp9_wb_write_bit(wb, cm->subsampling_y);
-      vp9_wb_write_bit(wb, 0);  // unused
+      vpx_wb_write_bit(wb, cm->subsampling_x);
+      vpx_wb_write_bit(wb, cm->subsampling_y);
+      vpx_wb_write_bit(wb, 0);  // unused
     } else {
       assert(cm->subsampling_x == 1 && cm->subsampling_y == 1);
     }
   } else {
     assert(cm->profile == PROFILE_1 || cm->profile == PROFILE_3);
-    vp9_wb_write_bit(wb, 0);  // unused
+    vpx_wb_write_bit(wb, 0);  // unused
   }
 }
 
 static void write_uncompressed_header(VP9_COMP *cpi,
-                                      struct vp9_write_bit_buffer *wb) {
+                                      struct vpx_write_bit_buffer *wb) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
 
-  vp9_wb_write_literal(wb, VP9_FRAME_MARKER, 2);
+  vpx_wb_write_literal(wb, VP9_FRAME_MARKER, 2);
 
   write_profile(cm->profile, wb);
 
-  vp9_wb_write_bit(wb, 0);  // show_existing_frame
-  vp9_wb_write_bit(wb, cm->frame_type);
-  vp9_wb_write_bit(wb, cm->show_frame);
-  vp9_wb_write_bit(wb, cm->error_resilient_mode);
+  vpx_wb_write_bit(wb, 0);  // show_existing_frame
+  vpx_wb_write_bit(wb, cm->frame_type);
+  vpx_wb_write_bit(wb, cm->show_frame);
+  vpx_wb_write_bit(wb, cm->error_resilient_mode);
 
   if (cm->frame_type == KEY_FRAME) {
     write_sync_code(wb);
@@ -1102,10 +1102,10 @@
     // show_existing_frame flag which tells the decoder which frame we want to
     // show.
     if (!cm->show_frame)
-      vp9_wb_write_bit(wb, cm->intra_only);
+      vpx_wb_write_bit(wb, cm->intra_only);
 
     if (!cm->error_resilient_mode)
-      vp9_wb_write_literal(wb, cm->reset_frame_context, 2);
+      vpx_wb_write_literal(wb, cm->reset_frame_context, 2);
 
     if (cm->intra_only) {
       write_sync_code(wb);
@@ -1115,21 +1115,21 @@
         write_bitdepth_colorspace_sampling(cm, wb);
       }
 
-      vp9_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+      vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
       write_frame_size(cm, wb);
     } else {
       MV_REFERENCE_FRAME ref_frame;
-      vp9_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+      vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
       for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
         assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
-        vp9_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
+        vpx_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
                              REF_FRAMES_LOG2);
-        vp9_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
+        vpx_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
       }
 
       write_frame_size_with_refs(cpi, wb);
 
-      vp9_wb_write_bit(wb, cm->allow_high_precision_mv);
+      vpx_wb_write_bit(wb, cm->allow_high_precision_mv);
 
       fix_interp_filter(cm, cpi->td.counts);
       write_interp_filter(cm->interp_filter, wb);
@@ -1137,11 +1137,11 @@
   }
 
   if (!cm->error_resilient_mode) {
-    vp9_wb_write_bit(wb, cm->refresh_frame_context);
-    vp9_wb_write_bit(wb, cm->frame_parallel_decoding_mode);
+    vpx_wb_write_bit(wb, cm->refresh_frame_context);
+    vpx_wb_write_bit(wb, cm->frame_parallel_decoding_mode);
   }
 
-  vp9_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
+  vpx_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
 
   encode_loopfilter(&cm->lf, wb);
   encode_quantization(cm, wb);
@@ -1155,9 +1155,9 @@
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   FRAME_CONTEXT *const fc = cm->fc;
   FRAME_COUNTS *counts = cpi->td.counts;
-  vp9_writer header_bc;
+  vpx_writer header_bc;
 
-  vp9_start_encode(&header_bc, data);
+  vpx_start_encode(&header_bc, data);
 
   if (xd->lossless)
     cm->tx_mode = ONLY_4X4;
@@ -1185,9 +1185,9 @@
       const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
       const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
 
-      vp9_write_bit(&header_bc, use_compound_pred);
+      vpx_write_bit(&header_bc, use_compound_pred);
       if (use_compound_pred) {
-        vp9_write_bit(&header_bc, use_hybrid_pred);
+        vpx_write_bit(&header_bc, use_hybrid_pred);
         if (use_hybrid_pred)
           for (i = 0; i < COMP_INTER_CONTEXTS; i++)
             vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
@@ -1221,7 +1221,7 @@
                         &counts->mv);
   }
 
-  vp9_stop_encode(&header_bc);
+  vpx_stop_encode(&header_bc);
   assert(header_bc.pos <= 0xffff);
 
   return header_bc.pos;
@@ -1230,14 +1230,14 @@
 void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) {
   uint8_t *data = dest;
   size_t first_part_size, uncompressed_hdr_size;
-  struct vp9_write_bit_buffer wb = {data, 0};
-  struct vp9_write_bit_buffer saved_wb;
+  struct vpx_write_bit_buffer wb = {data, 0};
+  struct vpx_write_bit_buffer saved_wb;
 
   write_uncompressed_header(cpi, &wb);
   saved_wb = wb;
-  vp9_wb_write_literal(&wb, 0, 16);  // don't know in advance first part. size
+  vpx_wb_write_literal(&wb, 0, 16);  // don't know in advance first part. size
 
-  uncompressed_hdr_size = vp9_wb_bytes_written(&wb);
+  uncompressed_hdr_size = vpx_wb_bytes_written(&wb);
   data += uncompressed_hdr_size;
 
   vp9_clear_system_state();
@@ -1245,7 +1245,7 @@
   first_part_size = write_compressed_header(cpi, data);
   data += first_part_size;
   // TODO(jbb): Figure out what to do if first_part_size > 16 bits.
-  vp9_wb_write_literal(&saved_wb, (int)first_part_size, 16);
+  vpx_wb_write_literal(&saved_wb, (int)first_part_size, 16);
 
   data += encode_tiles(cpi, data);
 

diff --git a/vp9/encoder/vp9_cost.c b/vp9/encoder/vp9_cost.c
index 1c3c3d2..e2fbb34 100644
--- a/vp9/encoder/vp9_cost.c
+++ b/vp9/encoder/vp9_cost.c

@@ -7,6 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <assert.h>
 
 #include "vp9/encoder/vp9_cost.h"
 
@@ -34,14 +35,14 @@
   22,   21,   19,   18,   16,   15,   13,   12,   10,   9,    7,    6,
   4,    3,    1,    1};
 
-static void cost(int *costs, vp9_tree tree, const vp9_prob *probs,
+static void cost(int *costs, vpx_tree tree, const vpx_prob *probs,
                  int i, int c) {
-  const vp9_prob prob = probs[i / 2];
+  const vpx_prob prob = probs[i / 2];
   int b;
 
   for (b = 0; b <= 1; ++b) {
     const int cc = c + vp9_cost_bit(prob, b);
-    const vp9_tree_index ii = tree[i + b];
+    const vpx_tree_index ii = tree[i + b];
 
     if (ii <= 0)
       costs[-ii] = cc;
@@ -50,11 +51,11 @@
   }
 }
 
-void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree) {
+void vp9_cost_tokens(int *costs, const vpx_prob *probs, vpx_tree tree) {
   cost(costs, tree, probs, 0, 0);
 }
 
-void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree) {
+void vp9_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree) {
   assert(tree[0] <= 0 && tree[1] > 0);
 
   costs[-tree[0]] = vp9_cost_bit(probs[0], 0);

diff --git a/vp9/encoder/vp9_cost.h b/vp9/encoder/vp9_cost.h
index 6d2b940..eac74c4 100644
--- a/vp9/encoder/vp9_cost.h
+++ b/vp9/encoder/vp9_cost.h

@@ -11,7 +11,7 @@
 #ifndef VP9_ENCODER_VP9_COST_H_
 #define VP9_ENCODER_VP9_COST_H_
 
-#include "vp9/common/vp9_prob.h"
+#include "vpx_dsp/prob.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -21,20 +21,20 @@
 
 #define vp9_cost_zero(prob) (vp9_prob_cost[prob])
 
-#define vp9_cost_one(prob) vp9_cost_zero(vp9_complement(prob))
+#define vp9_cost_one(prob) vp9_cost_zero(vpx_complement(prob))
 
-#define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? vp9_complement(prob) \
+#define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? vpx_complement(prob) \
                                                     : (prob))
 
 static INLINE unsigned int cost_branch256(const unsigned int ct[2],
-                                          vp9_prob p) {
+                                          vpx_prob p) {
   return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p);
 }
 
-static INLINE int treed_cost(vp9_tree tree, const vp9_prob *probs,
+static INLINE int treed_cost(vpx_tree tree, const vpx_prob *probs,
                              int bits, int len) {
   int cost = 0;
-  vp9_tree_index i = 0;
+  vpx_tree_index i = 0;
 
   do {
     const int bit = (bits >> --len) & 1;
@@ -45,8 +45,8 @@
   return cost;
 }
 
-void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree);
-void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree);
+void vp9_cost_tokens(int *costs, const vpx_prob *probs, vpx_tree tree);
+void vp9_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree);
 
 #ifdef __cplusplus
 }  // extern "C"

diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 414d2bb..5170541 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c

@@ -28,7 +28,7 @@
   return rv;
 }
 
-void vp9_fdct4(const tran_low_t *input, tran_low_t *output) {
+static void fdct4(const tran_low_t *input, tran_low_t *output) {
   tran_high_t step[4];
   tran_high_t temp1, temp2;
 
@@ -47,6 +47,494 @@
   output[3] = (tran_low_t)fdct_round_shift(temp2);
 }
 
+static void fdct8(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+  tran_high_t t0, t1, t2, t3;                  // needs32
+  tran_high_t x0, x1, x2, x3;                  // canbe16
+
+  // stage 1
+  s0 = input[0] + input[7];
+  s1 = input[1] + input[6];
+  s2 = input[2] + input[5];
+  s3 = input[3] + input[4];
+  s4 = input[3] - input[4];
+  s5 = input[2] - input[5];
+  s6 = input[1] - input[6];
+  s7 = input[0] - input[7];
+
+  // fdct4(step, step);
+  x0 = s0 + s3;
+  x1 = s1 + s2;
+  x2 = s1 - s2;
+  x3 = s0 - s3;
+  t0 = (x0 + x1) * cospi_16_64;
+  t1 = (x0 - x1) * cospi_16_64;
+  t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
+  t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
+  output[0] = (tran_low_t)fdct_round_shift(t0);
+  output[2] = (tran_low_t)fdct_round_shift(t2);
+  output[4] = (tran_low_t)fdct_round_shift(t1);
+  output[6] = (tran_low_t)fdct_round_shift(t3);
+
+  // Stage 2
+  t0 = (s6 - s5) * cospi_16_64;
+  t1 = (s6 + s5) * cospi_16_64;
+  t2 = (tran_low_t)fdct_round_shift(t0);
+  t3 = (tran_low_t)fdct_round_shift(t1);
+
+  // Stage 3
+  x0 = s4 + t2;
+  x1 = s4 - t2;
+  x2 = s7 - t3;
+  x3 = s7 + t3;
+
+  // Stage 4
+  t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+  t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+  t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+  t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+  output[1] = (tran_low_t)fdct_round_shift(t0);
+  output[3] = (tran_low_t)fdct_round_shift(t2);
+  output[5] = (tran_low_t)fdct_round_shift(t1);
+  output[7] = (tran_low_t)fdct_round_shift(t3);
+}
+
+static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
+  tran_high_t step1[8];      // canbe16
+  tran_high_t step2[8];      // canbe16
+  tran_high_t step3[8];      // canbe16
+  tran_high_t input[8];      // canbe16
+  tran_high_t temp1, temp2;  // needs32
+
+  // step 1
+  input[0] = in[0] + in[15];
+  input[1] = in[1] + in[14];
+  input[2] = in[2] + in[13];
+  input[3] = in[3] + in[12];
+  input[4] = in[4] + in[11];
+  input[5] = in[5] + in[10];
+  input[6] = in[6] + in[ 9];
+  input[7] = in[7] + in[ 8];
+
+  step1[0] = in[7] - in[ 8];
+  step1[1] = in[6] - in[ 9];
+  step1[2] = in[5] - in[10];
+  step1[3] = in[4] - in[11];
+  step1[4] = in[3] - in[12];
+  step1[5] = in[2] - in[13];
+  step1[6] = in[1] - in[14];
+  step1[7] = in[0] - in[15];
+
+  // fdct8(step, step);
+  {
+    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+    tran_high_t t0, t1, t2, t3;                  // needs32
+    tran_high_t x0, x1, x2, x3;                  // canbe16
+
+    // stage 1
+    s0 = input[0] + input[7];
+    s1 = input[1] + input[6];
+    s2 = input[2] + input[5];
+    s3 = input[3] + input[4];
+    s4 = input[3] - input[4];
+    s5 = input[2] - input[5];
+    s6 = input[1] - input[6];
+    s7 = input[0] - input[7];
+
+    // fdct4(step, step);
+    x0 = s0 + s3;
+    x1 = s1 + s2;
+    x2 = s1 - s2;
+    x3 = s0 - s3;
+    t0 = (x0 + x1) * cospi_16_64;
+    t1 = (x0 - x1) * cospi_16_64;
+    t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
+    t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
+    out[0] = (tran_low_t)fdct_round_shift(t0);
+    out[4] = (tran_low_t)fdct_round_shift(t2);
+    out[8] = (tran_low_t)fdct_round_shift(t1);
+    out[12] = (tran_low_t)fdct_round_shift(t3);
+
+    // Stage 2
+    t0 = (s6 - s5) * cospi_16_64;
+    t1 = (s6 + s5) * cospi_16_64;
+    t2 = fdct_round_shift(t0);
+    t3 = fdct_round_shift(t1);
+
+    // Stage 3
+    x0 = s4 + t2;
+    x1 = s4 - t2;
+    x2 = s7 - t3;
+    x3 = s7 + t3;
+
+    // Stage 4
+    t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+    t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+    t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+    out[2] = (tran_low_t)fdct_round_shift(t0);
+    out[6] = (tran_low_t)fdct_round_shift(t2);
+    out[10] = (tran_low_t)fdct_round_shift(t1);
+    out[14] = (tran_low_t)fdct_round_shift(t3);
+  }
+
+  // step 2
+  temp1 = (step1[5] - step1[2]) * cospi_16_64;
+  temp2 = (step1[4] - step1[3]) * cospi_16_64;
+  step2[2] = fdct_round_shift(temp1);
+  step2[3] = fdct_round_shift(temp2);
+  temp1 = (step1[4] + step1[3]) * cospi_16_64;
+  temp2 = (step1[5] + step1[2]) * cospi_16_64;
+  step2[4] = fdct_round_shift(temp1);
+  step2[5] = fdct_round_shift(temp2);
+
+  // step 3
+  step3[0] = step1[0] + step2[3];
+  step3[1] = step1[1] + step2[2];
+  step3[2] = step1[1] - step2[2];
+  step3[3] = step1[0] - step2[3];
+  step3[4] = step1[7] - step2[4];
+  step3[5] = step1[6] - step2[5];
+  step3[6] = step1[6] + step2[5];
+  step3[7] = step1[7] + step2[4];
+
+  // step 4
+  temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
+  temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
+  step2[1] = fdct_round_shift(temp1);
+  step2[2] = fdct_round_shift(temp2);
+  temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
+  temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
+  step2[5] = fdct_round_shift(temp1);
+  step2[6] = fdct_round_shift(temp2);
+
+  // step 5
+  step1[0] = step3[0] + step2[1];
+  step1[1] = step3[0] - step2[1];
+  step1[2] = step3[3] + step2[2];
+  step1[3] = step3[3] - step2[2];
+  step1[4] = step3[4] - step2[5];
+  step1[5] = step3[4] + step2[5];
+  step1[6] = step3[7] - step2[6];
+  step1[7] = step3[7] + step2[6];
+
+  // step 6
+  temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
+  temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+  out[1] = (tran_low_t)fdct_round_shift(temp1);
+  out[9] = (tran_low_t)fdct_round_shift(temp2);
+
+  temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+  temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
+  out[5] = (tran_low_t)fdct_round_shift(temp1);
+  out[13] = (tran_low_t)fdct_round_shift(temp2);
+
+  temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
+  temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+  out[3] = (tran_low_t)fdct_round_shift(temp1);
+  out[11] = (tran_low_t)fdct_round_shift(temp2);
+
+  temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+  temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
+  out[7] = (tran_low_t)fdct_round_shift(temp1);
+  out[15] = (tran_low_t)fdct_round_shift(temp2);
+}
+
+static void fadst4(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t x0, x1, x2, x3;
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  x0 = input[0];
+  x1 = input[1];
+  x2 = input[2];
+  x3 = input[3];
+
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_4_9 * x0;
+  s2 = sinpi_2_9 * x1;
+  s3 = sinpi_1_9 * x1;
+  s4 = sinpi_3_9 * x2;
+  s5 = sinpi_4_9 * x3;
+  s6 = sinpi_2_9 * x3;
+  s7 = x0 + x1 - x3;
+
+  x0 = s0 + s2 + s5;
+  x1 = sinpi_3_9 * s7;
+  x2 = s1 - s3 + s6;
+  x3 = s4;
+
+  s0 = x0 + x3;
+  s1 = x1;
+  s2 = x2 - x3;
+  s3 = x2 - x0 + x3;
+
+  // 1-D transform scaling factor is sqrt(2).
+  output[0] = (tran_low_t)fdct_round_shift(s0);
+  output[1] = (tran_low_t)fdct_round_shift(s1);
+  output[2] = (tran_low_t)fdct_round_shift(s2);
+  output[3] = (tran_low_t)fdct_round_shift(s3);
+}
+
+static void fadst8(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  tran_high_t x0 = input[7];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[5];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[3];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[1];
+  tran_high_t x7 = input[6];
+
+  // stage 1
+  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+
+  x0 = fdct_round_shift(s0 + s4);
+  x1 = fdct_round_shift(s1 + s5);
+  x2 = fdct_round_shift(s2 + s6);
+  x3 = fdct_round_shift(s3 + s7);
+  x4 = fdct_round_shift(s0 - s4);
+  x5 = fdct_round_shift(s1 - s5);
+  x6 = fdct_round_shift(s2 - s6);
+  x7 = fdct_round_shift(s3 - s7);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
+  s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
+  s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;
+  s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = fdct_round_shift(s4 + s6);
+  x5 = fdct_round_shift(s5 + s7);
+  x6 = fdct_round_shift(s4 - s6);
+  x7 = fdct_round_shift(s5 - s7);
+
+  // stage 3
+  s2 = cospi_16_64 * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (x6 - x7);
+
+  x2 = fdct_round_shift(s2);
+  x3 = fdct_round_shift(s3);
+  x6 = fdct_round_shift(s6);
+  x7 = fdct_round_shift(s7);
+
+  output[0] = (tran_low_t)x0;
+  output[1] = (tran_low_t)-x4;
+  output[2] = (tran_low_t)x6;
+  output[3] = (tran_low_t)-x2;
+  output[4] = (tran_low_t)x3;
+  output[5] = (tran_low_t)-x7;
+  output[6] = (tran_low_t)x5;
+  output[7] = (tran_low_t)-x1;
+}
+
+static void fadst16(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+  tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+  tran_high_t x0 = input[15];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[13];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[11];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[9];
+  tran_high_t x7 = input[6];
+  tran_high_t x8 = input[7];
+  tran_high_t x9 = input[8];
+  tran_high_t x10 = input[5];
+  tran_high_t x11 = input[10];
+  tran_high_t x12 = input[3];
+  tran_high_t x13 = input[12];
+  tran_high_t x14 = input[1];
+  tran_high_t x15 = input[14];
+
+  // stage 1
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = fdct_round_shift(s0 + s8);
+  x1 = fdct_round_shift(s1 + s9);
+  x2 = fdct_round_shift(s2 + s10);
+  x3 = fdct_round_shift(s3 + s11);
+  x4 = fdct_round_shift(s4 + s12);
+  x5 = fdct_round_shift(s5 + s13);
+  x6 = fdct_round_shift(s6 + s14);
+  x7 = fdct_round_shift(s7 + s15);
+  x8  = fdct_round_shift(s0 - s8);
+  x9  = fdct_round_shift(s1 - s9);
+  x10 = fdct_round_shift(s2 - s10);
+  x11 = fdct_round_shift(s3 - s11);
+  x12 = fdct_round_shift(s4 - s12);
+  x13 = fdct_round_shift(s5 - s13);
+  x14 = fdct_round_shift(s6 - s14);
+  x15 = fdct_round_shift(s7 - s15);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
+  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
+  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
+  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = s0 + s4;
+  x1 = s1 + s5;
+  x2 = s2 + s6;
+  x3 = s3 + s7;
+  x4 = s0 - s4;
+  x5 = s1 - s5;
+  x6 = s2 - s6;
+  x7 = s3 - s7;
+  x8 = fdct_round_shift(s8 + s12);
+  x9 = fdct_round_shift(s9 + s13);
+  x10 = fdct_round_shift(s10 + s14);
+  x11 = fdct_round_shift(s11 + s15);
+  x12 = fdct_round_shift(s8 - s12);
+  x13 = fdct_round_shift(s9 - s13);
+  x14 = fdct_round_shift(s10 - s14);
+  x15 = fdct_round_shift(s11 - s15);
+
+  // stage 3
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = fdct_round_shift(s4 + s6);
+  x5 = fdct_round_shift(s5 + s7);
+  x6 = fdct_round_shift(s4 - s6);
+  x7 = fdct_round_shift(s5 - s7);
+  x8 = s8 + s10;
+  x9 = s9 + s11;
+  x10 = s8 - s10;
+  x11 = s9 - s11;
+  x12 = fdct_round_shift(s12 + s14);
+  x13 = fdct_round_shift(s13 + s15);
+  x14 = fdct_round_shift(s12 - s14);
+  x15 = fdct_round_shift(s13 - s15);
+
+  // stage 4
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (- x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (- x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = fdct_round_shift(s2);
+  x3 = fdct_round_shift(s3);
+  x6 = fdct_round_shift(s6);
+  x7 = fdct_round_shift(s7);
+  x10 = fdct_round_shift(s10);
+  x11 = fdct_round_shift(s11);
+  x14 = fdct_round_shift(s14);
+  x15 = fdct_round_shift(s15);
+
+  output[0] = (tran_low_t)x0;
+  output[1] = (tran_low_t)-x8;
+  output[2] = (tran_low_t)x12;
+  output[3] = (tran_low_t)-x4;
+  output[4] = (tran_low_t)x6;
+  output[5] = (tran_low_t)x14;
+  output[6] = (tran_low_t)x10;
+  output[7] = (tran_low_t)x2;
+  output[8] = (tran_low_t)x3;
+  output[9] = (tran_low_t)x11;
+  output[10] = (tran_low_t)x15;
+  output[11] = (tran_low_t)x7;
+  output[12] = (tran_low_t)x5;
+  output[13] = (tran_low_t)-x13;
+  output[14] = (tran_low_t)x9;
+  output[15] = (tran_low_t)-x1;
+}
+
+static const transform_2d FHT_4[] = {
+  { fdct4,  fdct4  },  // DCT_DCT  = 0
+  { fadst4, fdct4  },  // ADST_DCT = 1
+  { fdct4,  fadst4 },  // DCT_ADST = 2
+  { fadst4, fadst4 }   // ADST_ADST = 3
+};
+
+static const transform_2d FHT_8[] = {
+  { fdct8,  fdct8  },  // DCT_DCT  = 0
+  { fadst8, fdct8  },  // ADST_DCT = 1
+  { fdct8,  fadst8 },  // DCT_ADST = 2
+  { fadst8, fadst8 }   // ADST_ADST = 3
+};
+
+static const transform_2d FHT_16[] = {
+  { fdct16,  fdct16  },  // DCT_DCT  = 0
+  { fadst16, fdct16  },  // ADST_DCT = 1
+  { fdct16,  fadst16 },  // DCT_ADST = 2
+  { fadst16, fadst16 }   // ADST_ADST = 3
+};
+
 void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
   int r, c;
   tran_low_t sum = 0;
@@ -125,46 +613,6 @@
   }
 }
 
-void vp9_fadst4(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t x0, x1, x2, x3;
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-  x0 = input[0];
-  x1 = input[1];
-  x2 = input[2];
-  x3 = input[3];
-
-  if (!(x0 | x1 | x2 | x3)) {
-    output[0] = output[1] = output[2] = output[3] = 0;
-    return;
-  }
-
-  s0 = sinpi_1_9 * x0;
-  s1 = sinpi_4_9 * x0;
-  s2 = sinpi_2_9 * x1;
-  s3 = sinpi_1_9 * x1;
-  s4 = sinpi_3_9 * x2;
-  s5 = sinpi_4_9 * x3;
-  s6 = sinpi_2_9 * x3;
-  s7 = x0 + x1 - x3;
-
-  x0 = s0 + s2 + s5;
-  x1 = sinpi_3_9 * s7;
-  x2 = s1 - s3 + s6;
-  x3 = s4;
-
-  s0 = x0 + x3;
-  s1 = x1;
-  s2 = x2 - x3;
-  s3 = x2 - x0 + x3;
-
-  // 1-D transform scaling factor is sqrt(2).
-  output[0] = (tran_low_t)fdct_round_shift(s0);
-  output[1] = (tran_low_t)fdct_round_shift(s1);
-  output[2] = (tran_low_t)fdct_round_shift(s2);
-  output[3] = (tran_low_t)fdct_round_shift(s3);
-}
-
 void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
                   int stride, int tx_type) {
   if (tx_type == DCT_DCT) {
@@ -197,58 +645,6 @@
   }
 }
 
-void vp9_fdct8(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
-  tran_high_t t0, t1, t2, t3;                  // needs32
-  tran_high_t x0, x1, x2, x3;                  // canbe16
-
-  // stage 1
-  s0 = input[0] + input[7];
-  s1 = input[1] + input[6];
-  s2 = input[2] + input[5];
-  s3 = input[3] + input[4];
-  s4 = input[3] - input[4];
-  s5 = input[2] - input[5];
-  s6 = input[1] - input[6];
-  s7 = input[0] - input[7];
-
-  // fdct4(step, step);
-  x0 = s0 + s3;
-  x1 = s1 + s2;
-  x2 = s1 - s2;
-  x3 = s0 - s3;
-  t0 = (x0 + x1) * cospi_16_64;
-  t1 = (x0 - x1) * cospi_16_64;
-  t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
-  t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
-  output[0] = (tran_low_t)fdct_round_shift(t0);
-  output[2] = (tran_low_t)fdct_round_shift(t2);
-  output[4] = (tran_low_t)fdct_round_shift(t1);
-  output[6] = (tran_low_t)fdct_round_shift(t3);
-
-  // Stage 2
-  t0 = (s6 - s5) * cospi_16_64;
-  t1 = (s6 + s5) * cospi_16_64;
-  t2 = (tran_low_t)fdct_round_shift(t0);
-  t3 = (tran_low_t)fdct_round_shift(t1);
-
-  // Stage 3
-  x0 = s4 + t2;
-  x1 = s4 - t2;
-  x2 = s7 - t3;
-  x3 = s7 + t3;
-
-  // Stage 4
-  t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
-  t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
-  t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-  t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
-  output[1] = (tran_low_t)fdct_round_shift(t0);
-  output[3] = (tran_low_t)fdct_round_shift(t2);
-  output[5] = (tran_low_t)fdct_round_shift(t1);
-  output[7] = (tran_low_t)fdct_round_shift(t3);
-}
-
 void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
   int r, c;
   tran_low_t sum = 0;
@@ -325,7 +721,7 @@
 
   // Rows
   for (i = 0; i < 8; ++i) {
-    vp9_fdct8(&intermediate[i * 8], &final_output[i * 8]);
+    fdct8(&intermediate[i * 8], &final_output[i * 8]);
     for (j = 0; j < 8; ++j)
       final_output[j + i * 8] /= 2;
   }
@@ -407,7 +803,7 @@
 
   // Rows
   for (i = 0; i < 8; ++i) {
-    vp9_fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
+    fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
     for (j = 0; j < 8; ++j)
       coeff_ptr[j + i * 8] /= 2;
   }
@@ -634,77 +1030,6 @@
   }
 }
 
-void vp9_fadst8(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-  tran_high_t x0 = input[7];
-  tran_high_t x1 = input[0];
-  tran_high_t x2 = input[5];
-  tran_high_t x3 = input[2];
-  tran_high_t x4 = input[3];
-  tran_high_t x5 = input[4];
-  tran_high_t x6 = input[1];
-  tran_high_t x7 = input[6];
-
-  // stage 1
-  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
-  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
-  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
-  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
-  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
-  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
-  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
-  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
-
-  x0 = fdct_round_shift(s0 + s4);
-  x1 = fdct_round_shift(s1 + s5);
-  x2 = fdct_round_shift(s2 + s6);
-  x3 = fdct_round_shift(s3 + s7);
-  x4 = fdct_round_shift(s0 - s4);
-  x5 = fdct_round_shift(s1 - s5);
-  x6 = fdct_round_shift(s2 - s6);
-  x7 = fdct_round_shift(s3 - s7);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
-  s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
-  s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;
-  s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;
-
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
-  x4 = fdct_round_shift(s4 + s6);
-  x5 = fdct_round_shift(s5 + s7);
-  x6 = fdct_round_shift(s4 - s6);
-  x7 = fdct_round_shift(s5 - s7);
-
-  // stage 3
-  s2 = cospi_16_64 * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (x6 - x7);
-
-  x2 = fdct_round_shift(s2);
-  x3 = fdct_round_shift(s3);
-  x6 = fdct_round_shift(s6);
-  x7 = fdct_round_shift(s7);
-
-  output[0] = (tran_low_t)x0;
-  output[1] = (tran_low_t)-x4;
-  output[2] = (tran_low_t)x6;
-  output[3] = (tran_low_t)-x2;
-  output[4] = (tran_low_t)x3;
-  output[5] = (tran_low_t)-x7;
-  output[6] = (tran_low_t)x5;
-  output[7] = (tran_low_t)-x1;
-}
-
 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
                   int stride, int tx_type) {
   if (tx_type == DCT_DCT) {
@@ -791,311 +1116,6 @@
   }
 }
 
-// Rewrote to use same algorithm as others.
-void vp9_fdct16(const tran_low_t in[16], tran_low_t out[16]) {
-  tran_high_t step1[8];      // canbe16
-  tran_high_t step2[8];      // canbe16
-  tran_high_t step3[8];      // canbe16
-  tran_high_t input[8];      // canbe16
-  tran_high_t temp1, temp2;  // needs32
-
-  // step 1
-  input[0] = in[0] + in[15];
-  input[1] = in[1] + in[14];
-  input[2] = in[2] + in[13];
-  input[3] = in[3] + in[12];
-  input[4] = in[4] + in[11];
-  input[5] = in[5] + in[10];
-  input[6] = in[6] + in[ 9];
-  input[7] = in[7] + in[ 8];
-
-  step1[0] = in[7] - in[ 8];
-  step1[1] = in[6] - in[ 9];
-  step1[2] = in[5] - in[10];
-  step1[3] = in[4] - in[11];
-  step1[4] = in[3] - in[12];
-  step1[5] = in[2] - in[13];
-  step1[6] = in[1] - in[14];
-  step1[7] = in[0] - in[15];
-
-  // fdct8(step, step);
-  {
-    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
-    tran_high_t t0, t1, t2, t3;                  // needs32
-    tran_high_t x0, x1, x2, x3;                  // canbe16
-
-    // stage 1
-    s0 = input[0] + input[7];
-    s1 = input[1] + input[6];
-    s2 = input[2] + input[5];
-    s3 = input[3] + input[4];
-    s4 = input[3] - input[4];
-    s5 = input[2] - input[5];
-    s6 = input[1] - input[6];
-    s7 = input[0] - input[7];
-
-    // fdct4(step, step);
-    x0 = s0 + s3;
-    x1 = s1 + s2;
-    x2 = s1 - s2;
-    x3 = s0 - s3;
-    t0 = (x0 + x1) * cospi_16_64;
-    t1 = (x0 - x1) * cospi_16_64;
-    t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
-    t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
-    out[0] = (tran_low_t)fdct_round_shift(t0);
-    out[4] = (tran_low_t)fdct_round_shift(t2);
-    out[8] = (tran_low_t)fdct_round_shift(t1);
-    out[12] = (tran_low_t)fdct_round_shift(t3);
-
-    // Stage 2
-    t0 = (s6 - s5) * cospi_16_64;
-    t1 = (s6 + s5) * cospi_16_64;
-    t2 = fdct_round_shift(t0);
-    t3 = fdct_round_shift(t1);
-
-    // Stage 3
-    x0 = s4 + t2;
-    x1 = s4 - t2;
-    x2 = s7 - t3;
-    x3 = s7 + t3;
-
-    // Stage 4
-    t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
-    t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
-    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-    t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
-    out[2] = (tran_low_t)fdct_round_shift(t0);
-    out[6] = (tran_low_t)fdct_round_shift(t2);
-    out[10] = (tran_low_t)fdct_round_shift(t1);
-    out[14] = (tran_low_t)fdct_round_shift(t3);
-  }
-
-  // step 2
-  temp1 = (step1[5] - step1[2]) * cospi_16_64;
-  temp2 = (step1[4] - step1[3]) * cospi_16_64;
-  step2[2] = fdct_round_shift(temp1);
-  step2[3] = fdct_round_shift(temp2);
-  temp1 = (step1[4] + step1[3]) * cospi_16_64;
-  temp2 = (step1[5] + step1[2]) * cospi_16_64;
-  step2[4] = fdct_round_shift(temp1);
-  step2[5] = fdct_round_shift(temp2);
-
-  // step 3
-  step3[0] = step1[0] + step2[3];
-  step3[1] = step1[1] + step2[2];
-  step3[2] = step1[1] - step2[2];
-  step3[3] = step1[0] - step2[3];
-  step3[4] = step1[7] - step2[4];
-  step3[5] = step1[6] - step2[5];
-  step3[6] = step1[6] + step2[5];
-  step3[7] = step1[7] + step2[4];
-
-  // step 4
-  temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
-  temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
-  step2[1] = fdct_round_shift(temp1);
-  step2[2] = fdct_round_shift(temp2);
-  temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
-  temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
-  step2[5] = fdct_round_shift(temp1);
-  step2[6] = fdct_round_shift(temp2);
-
-  // step 5
-  step1[0] = step3[0] + step2[1];
-  step1[1] = step3[0] - step2[1];
-  step1[2] = step3[3] + step2[2];
-  step1[3] = step3[3] - step2[2];
-  step1[4] = step3[4] - step2[5];
-  step1[5] = step3[4] + step2[5];
-  step1[6] = step3[7] - step2[6];
-  step1[7] = step3[7] + step2[6];
-
-  // step 6
-  temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
-  temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
-  out[1] = (tran_low_t)fdct_round_shift(temp1);
-  out[9] = (tran_low_t)fdct_round_shift(temp2);
-
-  temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
-  temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
-  out[5] = (tran_low_t)fdct_round_shift(temp1);
-  out[13] = (tran_low_t)fdct_round_shift(temp2);
-
-  temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
-  temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
-  out[3] = (tran_low_t)fdct_round_shift(temp1);
-  out[11] = (tran_low_t)fdct_round_shift(temp2);
-
-  temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
-  temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
-  out[7] = (tran_low_t)fdct_round_shift(temp1);
-  out[15] = (tran_low_t)fdct_round_shift(temp2);
-}
-
-void vp9_fadst16(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-  tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
-  tran_high_t x0 = input[15];
-  tran_high_t x1 = input[0];
-  tran_high_t x2 = input[13];
-  tran_high_t x3 = input[2];
-  tran_high_t x4 = input[11];
-  tran_high_t x5 = input[4];
-  tran_high_t x6 = input[9];
-  tran_high_t x7 = input[6];
-  tran_high_t x8 = input[7];
-  tran_high_t x9 = input[8];
-  tran_high_t x10 = input[5];
-  tran_high_t x11 = input[10];
-  tran_high_t x12 = input[3];
-  tran_high_t x13 = input[12];
-  tran_high_t x14 = input[1];
-  tran_high_t x15 = input[14];
-
-  // stage 1
-  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
-  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
-  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
-  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
-  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
-  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
-  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
-  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
-  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
-  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
-  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
-  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
-  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
-  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
-  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
-  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
-
-  x0 = fdct_round_shift(s0 + s8);
-  x1 = fdct_round_shift(s1 + s9);
-  x2 = fdct_round_shift(s2 + s10);
-  x3 = fdct_round_shift(s3 + s11);
-  x4 = fdct_round_shift(s4 + s12);
-  x5 = fdct_round_shift(s5 + s13);
-  x6 = fdct_round_shift(s6 + s14);
-  x7 = fdct_round_shift(s7 + s15);
-  x8  = fdct_round_shift(s0 - s8);
-  x9  = fdct_round_shift(s1 - s9);
-  x10 = fdct_round_shift(s2 - s10);
-  x11 = fdct_round_shift(s3 - s11);
-  x12 = fdct_round_shift(s4 - s12);
-  x13 = fdct_round_shift(s5 - s13);
-  x14 = fdct_round_shift(s6 - s14);
-  x15 = fdct_round_shift(s7 - s15);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4;
-  s5 = x5;
-  s6 = x6;
-  s7 = x7;
-  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
-  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
-  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
-  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
-  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
-  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
-  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
-  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
-
-  x0 = s0 + s4;
-  x1 = s1 + s5;
-  x2 = s2 + s6;
-  x3 = s3 + s7;
-  x4 = s0 - s4;
-  x5 = s1 - s5;
-  x6 = s2 - s6;
-  x7 = s3 - s7;
-  x8 = fdct_round_shift(s8 + s12);
-  x9 = fdct_round_shift(s9 + s13);
-  x10 = fdct_round_shift(s10 + s14);
-  x11 = fdct_round_shift(s11 + s15);
-  x12 = fdct_round_shift(s8 - s12);
-  x13 = fdct_round_shift(s9 - s13);
-  x14 = fdct_round_shift(s10 - s14);
-  x15 = fdct_round_shift(s11 - s15);
-
-  // stage 3
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
-  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
-  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
-  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
-  s8 = x8;
-  s9 = x9;
-  s10 = x10;
-  s11 = x11;
-  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
-  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
-  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
-  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
-
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
-  x4 = fdct_round_shift(s4 + s6);
-  x5 = fdct_round_shift(s5 + s7);
-  x6 = fdct_round_shift(s4 - s6);
-  x7 = fdct_round_shift(s5 - s7);
-  x8 = s8 + s10;
-  x9 = s9 + s11;
-  x10 = s8 - s10;
-  x11 = s9 - s11;
-  x12 = fdct_round_shift(s12 + s14);
-  x13 = fdct_round_shift(s13 + s15);
-  x14 = fdct_round_shift(s12 - s14);
-  x15 = fdct_round_shift(s13 - s15);
-
-  // stage 4
-  s2 = (- cospi_16_64) * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (- x6 + x7);
-  s10 = cospi_16_64 * (x10 + x11);
-  s11 = cospi_16_64 * (- x10 + x11);
-  s14 = (- cospi_16_64) * (x14 + x15);
-  s15 = cospi_16_64 * (x14 - x15);
-
-  x2 = fdct_round_shift(s2);
-  x3 = fdct_round_shift(s3);
-  x6 = fdct_round_shift(s6);
-  x7 = fdct_round_shift(s7);
-  x10 = fdct_round_shift(s10);
-  x11 = fdct_round_shift(s11);
-  x14 = fdct_round_shift(s14);
-  x15 = fdct_round_shift(s15);
-
-  output[0] = (tran_low_t)x0;
-  output[1] = (tran_low_t)-x8;
-  output[2] = (tran_low_t)x12;
-  output[3] = (tran_low_t)-x4;
-  output[4] = (tran_low_t)x6;
-  output[5] = (tran_low_t)x14;
-  output[6] = (tran_low_t)x10;
-  output[7] = (tran_low_t)x2;
-  output[8] = (tran_low_t)x3;
-  output[9] = (tran_low_t)x11;
-  output[10] = (tran_low_t)x15;
-  output[11] = (tran_low_t)x7;
-  output[12] = (tran_low_t)x5;
-  output[13] = (tran_low_t)-x13;
-  output[14] = (tran_low_t)x9;
-  output[15] = (tran_low_t)-x1;
-}
-
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
                     int stride, int tx_type) {
   if (tx_type == DCT_DCT) {

diff --git a/vp9/encoder/vp9_dct.h b/vp9/encoder/vp9_dct.h
index 49afcbb..6ce7e96 100644
--- a/vp9/encoder/vp9_dct.h
+++ b/vp9/encoder/vp9_dct.h

@@ -11,49 +11,12 @@
 #ifndef VP9_ENCODER_VP9_DCT_H_
 #define VP9_ENCODER_VP9_DCT_H_
 
-#include "vp9/common/vp9_idct.h"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void vp9_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-void vp9_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
-void vp9_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
-                            int stride);
-void vp9_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride);
-void vp9_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
-                               int stride);
-
-void vp9_fdct4(const tran_low_t *input, tran_low_t *output);
-void vp9_fadst4(const tran_low_t *input, tran_low_t *output);
-void vp9_fdct8(const tran_low_t *input, tran_low_t *output);
-void vp9_fadst8(const tran_low_t *input, tran_low_t *output);
-void vp9_fdct16(const tran_low_t in[16], tran_low_t out[16]);
-void vp9_fadst16(const tran_low_t *input, tran_low_t *output);
 void vp9_fdct32(const tran_high_t *input, tran_high_t *output, int round);
 
-static const transform_2d FHT_4[] = {
-  { vp9_fdct4,  vp9_fdct4  },  // DCT_DCT  = 0
-  { vp9_fadst4, vp9_fdct4  },  // ADST_DCT = 1
-  { vp9_fdct4,  vp9_fadst4 },  // DCT_ADST = 2
-  { vp9_fadst4, vp9_fadst4 }   // ADST_ADST = 3
-};
-
-static const transform_2d FHT_8[] = {
-  { vp9_fdct8,  vp9_fdct8  },  // DCT_DCT  = 0
-  { vp9_fadst8, vp9_fdct8  },  // ADST_DCT = 1
-  { vp9_fdct8,  vp9_fadst8 },  // DCT_ADST = 2
-  { vp9_fadst8, vp9_fadst8 }   // ADST_ADST = 3
-};
-
-static const transform_2d FHT_16[] = {
-  { vp9_fdct16,  vp9_fdct16  },  // DCT_DCT  = 0
-  { vp9_fadst16, vp9_fdct16  },  // ADST_DCT = 1
-  { vp9_fdct16,  vp9_fadst16 },  // DCT_ADST = 2
-  { vp9_fadst16, vp9_fadst16 }   // ADST_ADST = 3
-};
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 02d986e..a450945 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -194,7 +194,6 @@
 
   set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
 
-
   mbmi = &xd->mi[0]->mbmi;
 
   // Set up destination pointers.
@@ -2123,38 +2122,6 @@
   BLOCK_64X64
 };
 
-// Checks to see if a macro block is at the edge of the active image.
-// In most cases this is the "real" edge unless there are formatting
-// bars embedded in the stream.
-static int active_edge_sb(VP9_COMP *cpi,
-                          int mi_row, int mi_col) {
-  int is_active_edge = 0;
-  int top_edge = 0;
-  int bottom_edge = cpi->common.mi_rows;
-  int left_edge = 0;
-  int right_edge = cpi->common.mi_cols;
-
-  // For two pass account for any formatting bars detected.
-  if (cpi->oxcf.pass == 2) {
-    TWO_PASS *twopass = &cpi->twopass;
-
-    // The inactive region is specified in MBs not mi units.
-    // The image edge is in the following MB row.
-    top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
-
-    bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
-    bottom_edge = MAX(top_edge, bottom_edge);
-  }
-
-  if (((top_edge >= mi_row) && (top_edge < (mi_row + MI_BLOCK_SIZE))) ||
-      ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + MI_BLOCK_SIZE))) ||
-      ((left_edge >= mi_col) && (left_edge < (mi_col + MI_BLOCK_SIZE))) ||
-      ((right_edge >= mi_col) && (right_edge < (mi_col + MI_BLOCK_SIZE)))) {
-    is_active_edge = 1;
-  }
-
-  return is_active_edge;
-}
 
 // Look at all the mode_info entries for blocks that are part of this
 // partition and find the min and max values for sb_type.
@@ -2253,7 +2220,7 @@
   // Test for blocks at the edge of the active image.
   // This may be the actual edge of the image or where there are formatting
   // bars.
-  if (active_edge_sb(cpi, mi_row, mi_col)) {
+  if (vp9_active_edge_sb(cpi, mi_row, mi_col)) {
     min_size = BLOCK_4X4;
   } else {
     min_size = MIN(cpi->sf.rd_auto_partition_min_limit,
@@ -2674,8 +2641,9 @@
   }
 
   // PARTITION_HORZ
-  if (partition_horz_allowed && do_rect) {
-    subsize = get_subsize(bsize, PARTITION_HORZ);
+  if (partition_horz_allowed &&
+      (do_rect || vp9_active_h_edge(cpi, mi_row, mi_step))) {
+      subsize = get_subsize(bsize, PARTITION_HORZ);
     if (cpi->sf.adaptive_motion_search)
       load_pred_mv(x, ctx);
     if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
@@ -2721,8 +2689,9 @@
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
   }
   // PARTITION_VERT
-  if (partition_vert_allowed && do_rect) {
-    subsize = get_subsize(bsize, PARTITION_VERT);
+  if (partition_vert_allowed &&
+      (do_rect || vp9_active_v_edge(cpi, mi_col, mi_step))) {
+      subsize = get_subsize(bsize, PARTITION_VERT);
 
     if (cpi->sf.adaptive_motion_search)
       load_pred_mv(x, ctx);

diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 3130941..9c3c510 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c

@@ -13,6 +13,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 
+#include "vpx_dsp/quantize.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 
@@ -23,7 +24,6 @@
 #include "vp9/common/vp9_systemdependent.h"
 
 #include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rd.h"
 #include "vp9/encoder/vp9_tokenize.h"
 
@@ -763,7 +763,7 @@
 }
 
 void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
-                               TX_SIZE tx_size, void *arg) {
+                            TX_SIZE tx_size, void *arg) {
   struct encode_b_args* const args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -795,7 +795,7 @@
       case TX_32X32:
         scan_order = &vp9_default_scan_orders[TX_32X32];
         mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-        vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode,
+        vp9_predict_intra_block(xd, bwl, TX_32X32, mode,
                                 x->skip_encode ? src : dst,
                                 x->skip_encode ? src_stride : dst_stride,
                                 dst, dst_stride, i, j, plane);
@@ -816,14 +816,17 @@
         tx_type = get_tx_type(pd->plane_type, xd);
         scan_order = &vp9_scan_orders[TX_16X16][tx_type];
         mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-        vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode,
+        vp9_predict_intra_block(xd, bwl, TX_16X16, mode,
                                 x->skip_encode ? src : dst,
                                 x->skip_encode ? src_stride : dst_stride,
                                 dst, dst_stride, i, j, plane);
         if (!x->skip_recode) {
           vpx_highbd_subtract_block(16, 16, src_diff, diff_stride,
                                     src, src_stride, dst, dst_stride, xd->bd);
-          vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
+          if (tx_type == DCT_DCT)
+            vp9_highbd_fdct16x16(src_diff, coeff, diff_stride);
+          else
+            vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
           vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                                 p->quant, p->quant_shift, qcoeff, dqcoeff,
                                 pd->dequant, eob,
@@ -838,14 +841,17 @@
         tx_type = get_tx_type(pd->plane_type, xd);
         scan_order = &vp9_scan_orders[TX_8X8][tx_type];
         mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-        vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode,
+        vp9_predict_intra_block(xd, bwl, TX_8X8, mode,
                                 x->skip_encode ? src : dst,
                                 x->skip_encode ? src_stride : dst_stride,
                                 dst, dst_stride, i, j, plane);
         if (!x->skip_recode) {
           vpx_highbd_subtract_block(8, 8, src_diff, diff_stride,
                                     src, src_stride, dst, dst_stride, xd->bd);
-          vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
+          if (tx_type == DCT_DCT)
+            vp9_highbd_fdct8x8(src_diff, coeff, diff_stride);
+          else
+            vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
           vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
                                 p->quant, p->quant_shift, qcoeff, dqcoeff,
                                 pd->dequant, eob,
@@ -860,7 +866,7 @@
         tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
         scan_order = &vp9_scan_orders[TX_4X4][tx_type];
         mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
-        vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
+        vp9_predict_intra_block(xd, bwl, TX_4X4, mode,
                                 x->skip_encode ? src : dst,
                                 x->skip_encode ? src_stride : dst_stride,
                                 dst, dst_stride, i, j, plane);
@@ -903,7 +909,7 @@
     case TX_32X32:
       scan_order = &vp9_default_scan_orders[TX_32X32];
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-      vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode,
+      vp9_predict_intra_block(xd, bwl, TX_32X32, mode,
                               x->skip_encode ? src : dst,
                               x->skip_encode ? src_stride : dst_stride,
                               dst, dst_stride, i, j, plane);
@@ -923,7 +929,7 @@
       tx_type = get_tx_type(pd->plane_type, xd);
       scan_order = &vp9_scan_orders[TX_16X16][tx_type];
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-      vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode,
+      vp9_predict_intra_block(xd, bwl, TX_16X16, mode,
                               x->skip_encode ? src : dst,
                               x->skip_encode ? src_stride : dst_stride,
                               dst, dst_stride, i, j, plane);
@@ -943,7 +949,7 @@
       tx_type = get_tx_type(pd->plane_type, xd);
       scan_order = &vp9_scan_orders[TX_8X8][tx_type];
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-      vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode,
+      vp9_predict_intra_block(xd, bwl, TX_8X8, mode,
                               x->skip_encode ? src : dst,
                               x->skip_encode ? src_stride : dst_stride,
                               dst, dst_stride, i, j, plane);
@@ -963,7 +969,7 @@
       tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
       scan_order = &vp9_scan_orders[TX_4X4][tx_type];
       mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
-      vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
+      vp9_predict_intra_block(xd, bwl, TX_4X4, mode,
                               x->skip_encode ? src : dst,
                               x->skip_encode ? src_stride : dst_stride,
                               dst, dst_stride, i, j, plane);

diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 10180f2..a1d77db 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c

@@ -29,7 +29,7 @@
   vp9_tokens_from_tree(mv_fp_encodings, vp9_mv_fp_tree);
 }
 
-static void encode_mv_component(vp9_writer* w, int comp,
+static void encode_mv_component(vpx_writer* w, int comp,
                                 const nmv_component* mvcomp, int usehp) {
   int offset;
   const int sign = comp < 0;
@@ -42,7 +42,7 @@
   assert(comp != 0);
 
   // Sign
-  vp9_write(w, sign, mvcomp->sign);
+  vpx_write(w, sign, mvcomp->sign);
 
   // Class
   vp9_write_token(w, vp9_mv_class_tree, mvcomp->classes,
@@ -56,7 +56,7 @@
     int i;
     const int n = mv_class + CLASS0_BITS - 1;  // number of bits
     for (i = 0; i < n; ++i)
-      vp9_write(w, (d >> i) & 1, mvcomp->bits[i]);
+      vpx_write(w, (d >> i) & 1, mvcomp->bits[i]);
   }
 
   // Fractional bits
@@ -66,7 +66,7 @@
 
   // High precision bit
   if (usehp)
-    vp9_write(w, hp,
+    vpx_write(w, hp,
               mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp);
 }
 
@@ -133,23 +133,23 @@
   }
 }
 
-static int update_mv(vp9_writer *w, const unsigned int ct[2], vp9_prob *cur_p,
-                     vp9_prob upd_p) {
-  const vp9_prob new_p = get_binary_prob(ct[0], ct[1]) | 1;
+static int update_mv(vpx_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
+                     vpx_prob upd_p) {
+  const vpx_prob new_p = get_binary_prob(ct[0], ct[1]) | 1;
   const int update = cost_branch256(ct, *cur_p) + vp9_cost_zero(upd_p) >
                      cost_branch256(ct, new_p) + vp9_cost_one(upd_p) + 7 * 256;
-  vp9_write(w, update, upd_p);
+  vpx_write(w, update, upd_p);
   if (update) {
     *cur_p = new_p;
-    vp9_write_literal(w, new_p >> 1, 7);
+    vpx_write_literal(w, new_p >> 1, 7);
   }
   return update;
 }
 
-static void write_mv_update(const vp9_tree_index *tree,
-                            vp9_prob probs[/*n - 1*/],
+static void write_mv_update(const vpx_tree_index *tree,
+                            vpx_prob probs[/*n - 1*/],
                             const unsigned int counts[/*n - 1*/],
-                            int n, vp9_writer *w) {
+                            int n, vpx_writer *w) {
   int i;
   unsigned int branch_ct[32][2];
 
@@ -161,7 +161,7 @@
     update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB);
 }
 
-void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w,
+void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w,
                          nmv_context_counts *const counts) {
   int i, j;
   nmv_context *const mvc = &cm->fc->nmvc;
@@ -199,7 +199,7 @@
   }
 }
 
-void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w,
+void vp9_encode_mv(VP9_COMP* cpi, vpx_writer* w,
                    const MV* mv, const MV* ref,
                    const nmv_context* mvctx, int usehp) {
   const MV diff = {mv->row - ref->row,

diff --git a/vp9/encoder/vp9_encodemv.h b/vp9/encoder/vp9_encodemv.h
index e8ee5ab..5fb114c 100644
--- a/vp9/encoder/vp9_encodemv.h
+++ b/vp9/encoder/vp9_encodemv.h

@@ -20,10 +20,10 @@
 
 void vp9_entropy_mv_init(void);
 
-void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w,
+void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w,
                          nmv_context_counts *const counts);
 
-void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref,
+void vp9_encode_mv(VP9_COMP *cpi, vpx_writer* w, const MV* mv, const MV* ref,
                    const nmv_context* mvctx, int usehp);
 
 void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],

diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index f27f57a..781204d 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c

@@ -19,6 +19,7 @@
 #include "vpx/internal/vpx_psnr.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/vpx_timer.h"
+#include "vpx_scale/vpx_scale.h"
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_filter.h"
@@ -732,7 +733,7 @@
 
   vp9_set_mb_mi(cm, cm->width, cm->height);
   vp9_init_context_buffers(cm);
-  vp9_init_macroblockd(cm, xd);
+  vp9_init_macroblockd(cm, xd, NULL);
   cpi->td.mb.mbmi_ext_base = cpi->mbmi_ext_base;
   memset(cpi->mbmi_ext_base, 0,
          cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
@@ -2627,9 +2628,10 @@
   const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi);
   int force_recode = 0;
 
-  if ((cpi->sf.recode_loop == ALLOW_RECODE) ||
+  if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+      (cpi->sf.recode_loop == ALLOW_RECODE) ||
       (frame_is_kfgfarf &&
-      (cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
+       (cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
     if (frame_is_kfgfarf &&
         (oxcf->resize_mode == RESIZE_DYNAMIC) &&
         scale_down(cpi, q)) {
@@ -3059,17 +3061,17 @@
       oxcf->rc_mode == VPX_CBR &&
       !cpi->use_svc &&
       oxcf->resize_mode == RESIZE_DYNAMIC) {
-      if (cpi->resize_state == 1) {
+      if (cpi->resize_pending == 1) {
         oxcf->scaled_frame_width =
             (cm->width * cpi->resize_scale_num) / cpi->resize_scale_den;
         oxcf->scaled_frame_height =
             (cm->height * cpi->resize_scale_num) /cpi->resize_scale_den;
-      } else if (cpi->resize_state == -1) {
+      } else if (cpi->resize_pending == -1) {
         // Go back up to original size.
         oxcf->scaled_frame_width = oxcf->width;
         oxcf->scaled_frame_height = oxcf->height;
       }
-      if (cpi->resize_state != 0) {
+      if (cpi->resize_pending != 0) {
         // There has been a change in frame size.
         vp9_set_size_literal(cpi,
                              oxcf->scaled_frame_width,
@@ -3140,12 +3142,27 @@
 
   set_frame_size(cpi);
 
-  cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
-                                      &cpi->scaled_source);
-
-  if (cpi->unscaled_last_source != NULL)
-    cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
-                                             &cpi->scaled_last_source);
+  // For 1 pass CBR under dynamic resize mode: use faster scaling for source.
+  // Only for 2x2 scaling for now.
+  if (cpi->oxcf.pass == 0 &&
+      cpi->oxcf.rc_mode == VPX_CBR &&
+      cpi->oxcf.resize_mode == RESIZE_DYNAMIC &&
+      cpi->un_scaled_source->y_width == (cm->width << 1) &&
+      cpi->un_scaled_source->y_height == (cm->height << 1)) {
+    cpi->Source = vp9_scale_if_required_fast(cm,
+                                             cpi->un_scaled_source,
+                                             &cpi->scaled_source);
+    if (cpi->unscaled_last_source != NULL)
+       cpi->Last_Source = vp9_scale_if_required_fast(cm,
+                                                     cpi->unscaled_last_source,
+                                                     &cpi->scaled_last_source);
+  } else {
+    cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
+                                        &cpi->scaled_source);
+    if (cpi->unscaled_last_source != NULL)
+      cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
+                                               &cpi->scaled_last_source);
+  }
 
   if (frame_is_intra_only(cm) == 0) {
     vp9_scale_references(cpi);
@@ -3491,6 +3508,21 @@
   }
 }
 
+YV12_BUFFER_CONFIG *vp9_scale_if_required_fast(VP9_COMMON *cm,
+                                               YV12_BUFFER_CONFIG *unscaled,
+                                               YV12_BUFFER_CONFIG *scaled) {
+  if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
+      cm->mi_rows * MI_SIZE != unscaled->y_height) {
+    // For 2x2 scaling down.
+    vpx_scale_frame(unscaled, scaled, unscaled->y_buffer, 9, 2, 1,
+                    2, 1, 0);
+    vp9_extend_frame_borders(scaled);
+    return scaled;
+  } else {
+    return unscaled;
+  }
+}
+
 YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
                                           YV12_BUFFER_CONFIG *unscaled,
                                           YV12_BUFFER_CONFIG *scaled) {
@@ -4275,7 +4307,7 @@
 #if CONFIG_INTERNAL_STATS
 
   if (oxcf->pass != 1) {
-    double samples;
+    double samples = 0.0;
     cpi->bytes += (int)(*size);
 
     if (cm->show_frame) {

diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index f095cad..b76b6b7 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h

@@ -55,7 +55,7 @@
   int nmvcosts[2][MV_VALS];
   int nmvcosts_hp[2][MV_VALS];
 
-  vp9_prob segment_pred_probs[PREDICTION_PROBS];
+  vpx_prob segment_pred_probs[PREDICTION_PROBS];
 
   unsigned char *last_frame_seg_map_copy;
 
@@ -614,6 +614,10 @@
 
 void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
 
+YV12_BUFFER_CONFIG *vp9_scale_if_required_fast(VP9_COMMON *cm,
+                                               YV12_BUFFER_CONFIG *unscaled,
+                                               YV12_BUFFER_CONFIG *scaled);
+
 YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
                                           YV12_BUFFER_CONFIG *unscaled,
                                           YV12_BUFFER_CONFIG *scaled);

diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index a85b70b..5caf2cb 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c

@@ -126,6 +126,7 @@
             stats->pcnt_neutral,
             stats->intra_skip_pct,
             stats->inactive_zone_rows,
+            stats->inactive_zone_cols,
             stats->MVr,
             stats->mvr_abs,
             stats->MVc,
@@ -164,6 +165,7 @@
   section->pcnt_neutral = 0.0;
   section->intra_skip_pct = 0.0;
   section->inactive_zone_rows = 0.0;
+  section->inactive_zone_cols = 0.0;
   section->MVr = 0.0;
   section->mvr_abs     = 0.0;
   section->MVc        = 0.0;
@@ -191,6 +193,7 @@
   section->pcnt_neutral += frame->pcnt_neutral;
   section->intra_skip_pct += frame->intra_skip_pct;
   section->inactive_zone_rows += frame->inactive_zone_rows;
+  section->inactive_zone_cols += frame->inactive_zone_cols;
   section->MVr += frame->MVr;
   section->mvr_abs     += frame->mvr_abs;
   section->MVc        += frame->MVc;
@@ -216,6 +219,7 @@
   section->pcnt_neutral -= frame->pcnt_neutral;
   section->intra_skip_pct -= frame->intra_skip_pct;
   section->inactive_zone_rows -= frame->inactive_zone_rows;
+  section->inactive_zone_cols -= frame->inactive_zone_cols;
   section->MVr -= frame->MVr;
   section->mvr_abs     -= frame->mvr_abs;
   section->MVc        -= frame->MVc;
@@ -1050,6 +1054,7 @@
     fps.pcnt_neutral = (double)neutral_count / num_mbs;
     fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
     fps.inactive_zone_rows = (double)image_data_start_row;
+    fps.inactive_zone_cols = (double)0;  // TODO(paulwilkins): fix
 
     if (mvcount > 0) {
       fps.MVr = (double)sum_mvr / mvcount;
@@ -1878,7 +1883,7 @@
   double gf_group_error_left;
   int gf_arf_bits;
   const int is_key_frame = frame_is_intra_only(cm);
-  const int kf_or_arf_active = is_key_frame || rc->source_alt_ref_active;
+  const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active;
 
   // Reset the GF group data structures unless this is a key
   // frame in which case it will already have been done.
@@ -1898,7 +1903,7 @@
 
   // If this is a key frame or the overlay from a previous arf then
   // the error score / cost of this frame has already been accounted for.
-  if (is_key_frame || rc->source_alt_ref_active) {
+  if (arf_active_or_kf) {
     gf_group_err -= gf_first_frame_err;
 #if GROUP_ADAPTIVE_MAXQ
     gf_group_raw_error -= this_frame->coded_error;
@@ -1931,7 +1936,7 @@
       // bits to spare and are better with a smaller interval and smaller boost.
       // At high Q when there are few bits to spare we are better with a longer
       // interval to spread the cost of the GF.
-      active_max_gf_interval = rc->max_gf_interval - 4 + MIN(4, (int_lbq / 6));
+      active_max_gf_interval = 12 + MIN(4, (int_lbq / 6));
       if (active_max_gf_interval < active_min_gf_interval)
         active_max_gf_interval = active_min_gf_interval;
 
@@ -1996,11 +2001,11 @@
     // Break out conditions.
     if (
       // Break at active_max_gf_interval unless almost totally static.
-      ((i >= active_max_gf_interval + kf_or_arf_active) &&
-       (zero_motion_accumulator < 0.995)) ||
+      (i >= (active_max_gf_interval + arf_active_or_kf) &&
+            zero_motion_accumulator < 0.995) ||
       (
         // Don't break out with a very short interval.
-        (i >= active_min_gf_interval + kf_or_arf_active) &&
+        (i >= active_min_gf_interval + arf_active_or_kf) &&
         (!flash_detected) &&
         ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
          (abs_mv_in_out_accumulator > 3.0) ||
@@ -2038,10 +2043,7 @@
   }
 
   // Set the interval until the next gf.
-  if (is_key_frame || rc->source_alt_ref_pending)
-    rc->baseline_gf_interval = i - 1;
-  else
-    rc->baseline_gf_interval = i;
+  rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
 
   // Only encode alt reference frame in temporal base layer. So
   // baseline_gf_interval should be multiple of a temporal layer group

diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 0047932..49f9da3 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h

@@ -53,6 +53,7 @@
   double pcnt_neutral;
   double intra_skip_pct;
   double inactive_zone_rows;  // Image mask rows top and bottom.
+  double inactive_zone_cols;  // Image mask columns at left and right edges.
   double MVr;
   double mvr_abs;
   double MVc;

diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index d5eeb9c..acbd7dd 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c

@@ -145,7 +145,7 @@
     unsigned int err;
 
     xd->mi[0]->mbmi.mode = mode;
-    vp9_predict_intra_block(xd, 0, 2, TX_16X16, mode,
+    vp9_predict_intra_block(xd, 2, TX_16X16, mode,
                             x->plane[0].src.buf, x->plane[0].src.stride,
                             xd->plane[0].dst.buf, xd->plane[0].dst.stride,
                             0, 0, 0);

diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index e99cbc7..6d09dbe 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c

@@ -902,8 +902,7 @@
   p->src.buf = &src_buf_base[4 * (j * src_stride + i)];
   pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)];
   // Use source buffer as an approximation for the fully reconstructed buffer.
-  vp9_predict_intra_block(xd, block >> (2 * tx_size),
-                          b_width_log2_lookup[plane_bsize],
+  vp9_predict_intra_block(xd, b_width_log2_lookup[plane_bsize],
                           tx_size, args->mode,
                           x->skip_encode ? p->src.buf : pd->dst.buf,
                           x->skip_encode ? src_stride : dst_stride,
@@ -1020,9 +1019,9 @@
 static void init_ref_frame_cost(VP9_COMMON *const cm,
                                 MACROBLOCKD *const xd,
                                 int ref_frame_cost[MAX_REF_FRAMES]) {
-  vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
-  vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
-  vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
+  vpx_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
+  vpx_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
+  vpx_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
 
   ref_frame_cost[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
   ref_frame_cost[LAST_FRAME] = ref_frame_cost[GOLDEN_FRAME] =

diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index e6e17c0..d53d95d 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c

@@ -9,7 +9,7 @@
  */
 
 #include <math.h>
-
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 
@@ -20,120 +20,6 @@
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rd.h"
 
-void vp9_quantize_dc(const tran_low_t *coeff_ptr,
-                     int n_coeffs, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant,
-                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  const int rc = 0;
-  const int coeff = coeff_ptr[rc];
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int tmp, eob = -1;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-    tmp = (tmp * quant) >> 16;
-    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
-    if (tmp)
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
-                            int n_coeffs, int skip_block,
-                            const int16_t *round_ptr, const int16_t quant,
-                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                            const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  int eob = -1;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    const int rc = 0;
-    const int coeff = coeff_ptr[rc];
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-    const int64_t tmp =
-        (clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) *
-         quant) >> 16;
-    qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
-    if (tmp)
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-#endif
-
-void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  const int n_coeffs = 1024;
-  const int rc = 0;
-  const int coeff = coeff_ptr[rc];
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int tmp, eob = -1;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-
-    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
-                INT16_MIN, INT16_MAX);
-    tmp = (tmp * quant) >> 15;
-    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
-    if (tmp)
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
-                                  int skip_block,
-                                  const int16_t *round_ptr,
-                                  const int16_t quant,
-                                  tran_low_t *qcoeff_ptr,
-                                  tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr,
-                                  uint16_t *eob_ptr) {
-  const int n_coeffs = 1024;
-  int eob = -1;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    const int rc = 0;
-    const int coeff = coeff_ptr[rc];
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-    const int64_t tmp =
-        (clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
-               INT32_MIN, INT32_MAX) * quant) >> 15;
-    qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
-    if (tmp)
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-#endif
-
 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        int skip_block,
                        const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -207,15 +93,11 @@
       const int coeff = coeff_ptr[rc];
       const int coeff_sign = (coeff >> 31);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-      const int64_t tmp =
-          (clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) *
-           quant_ptr[rc != 0]) >> 16;
-
-      qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
+      const int64_t tmp = abs_coeff + round_ptr[rc != 0];
+      const uint32_t abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 16);
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
       dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-
-      if (tmp)
+      if (abs_qcoeff)
         eob = i;
     }
   }
@@ -287,21 +169,21 @@
 
   if (!skip_block) {
     for (i = 0; i < n_coeffs; i++) {
+      uint32_t abs_qcoeff = 0;
       const int rc = scan[i];
       const int coeff = coeff_ptr[rc];
       const int coeff_sign = (coeff >> 31);
-      int64_t tmp = 0;
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
 
       if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
-        tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
-                    INT32_MIN, INT32_MAX);
-        tmp = (tmp * quant_ptr[rc != 0]) >> 15;
-        qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
+        const int64_t tmp = abs_coeff
+                           + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+        abs_qcoeff = (uint32_t) ((tmp * quant_ptr[rc != 0]) >> 15);
+        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
         dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
       }
 
-      if (tmp)
+      if (abs_qcoeff)
         eob = i;
     }
   }
@@ -309,227 +191,6 @@
 }
 #endif
 
-void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                      int skip_block,
-                      const int16_t *zbin_ptr, const int16_t *round_ptr,
-                      const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
-                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                      const int16_t *dequant_ptr,
-                      uint16_t *eob_ptr,
-                      const int16_t *scan, const int16_t *iscan) {
-  int i, non_zero_count = (int)n_coeffs, eob = -1;
-  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
-  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = (int)n_coeffs - 1; i >= 0; i--) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-
-      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
-        non_zero_count--;
-      else
-        break;
-    }
-
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < non_zero_count; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-      if (abs_coeff >= zbins[rc != 0]) {
-        int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-        tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
-                  quant_shift_ptr[rc != 0]) >> 16;  // quantization
-        qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-
-        if (tmp)
-          eob = i;
-      }
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             int skip_block, const int16_t *zbin_ptr,
-                             const int16_t *round_ptr, const int16_t *quant_ptr,
-                             const int16_t *quant_shift_ptr,
-                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                             const int16_t *dequant_ptr,
-                             uint16_t *eob_ptr, const int16_t *scan,
-                             const int16_t *iscan) {
-  int i, non_zero_count = (int)n_coeffs, eob = -1;
-  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
-  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = (int)n_coeffs - 1; i >= 0; i--) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-
-      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
-        non_zero_count--;
-      else
-        break;
-    }
-
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < non_zero_count; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-      if (abs_coeff >= zbins[rc != 0]) {
-        int64_t tmp = clamp(abs_coeff + round_ptr[rc != 0],
-                            INT32_MIN, INT32_MAX);
-        tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
-                  quant_shift_ptr[rc != 0]) >> 16;  // quantization
-        qcoeff_ptr[rc]  = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-
-        if (tmp)
-          eob = i;
-      }
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-#endif
-
-void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            int skip_block,
-                            const int16_t *zbin_ptr, const int16_t *round_ptr,
-                            const int16_t *quant_ptr,
-                            const int16_t *quant_shift_ptr,
-                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                            const int16_t *dequant_ptr,
-                            uint16_t *eob_ptr,
-                            const int16_t *scan, const int16_t *iscan) {
-  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
-                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
-  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
-
-  int idx = 0;
-  int idx_arr[1024];
-  int i, eob = -1;
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-
-      // If the coefficient is out of the base ZBIN range, keep it for
-      // quantization.
-      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
-        idx_arr[idx++] = i;
-    }
-
-    // Quantization pass: only process the coefficients selected in
-    // pre-scan pass. Note: idx can be zero.
-    for (i = 0; i < idx; i++) {
-      const int rc = scan[idx_arr[i]];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      int tmp;
-      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-      abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
-      tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
-               quant_shift_ptr[rc != 0]) >> 15;
-
-      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-
-      if (tmp)
-        eob = idx_arr[i];
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
-                                   intptr_t n_coeffs, int skip_block,
-                                   const int16_t *zbin_ptr,
-                                   const int16_t *round_ptr,
-                                   const int16_t *quant_ptr,
-                                   const int16_t *quant_shift_ptr,
-                                   tran_low_t *qcoeff_ptr,
-                                   tran_low_t *dqcoeff_ptr,
-                                   const int16_t *dequant_ptr,
-                                   uint16_t *eob_ptr,
-                                   const int16_t *scan, const int16_t *iscan) {
-  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
-                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
-  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
-
-  int idx = 0;
-  int idx_arr[1024];
-  int i, eob = -1;
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-
-      // If the coefficient is out of the base ZBIN range, keep it for
-      // quantization.
-      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
-        idx_arr[idx++] = i;
-    }
-
-    // Quantization pass: only process the coefficients selected in
-    // pre-scan pass. Note: idx can be zero.
-    for (i = 0; i < idx; i++) {
-      const int rc = scan[idx_arr[i]];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      int64_t tmp = clamp(abs_coeff +
-                          ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
-                          INT32_MIN, INT32_MAX);
-      tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
-               quant_shift_ptr[rc != 0]) >> 15;
-
-      qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-
-      if (tmp)
-        eob = idx_arr[i];
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-#endif
-
 void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                 const int16_t *scan, const int16_t *iscan) {
   MACROBLOCKD *const xd = &x->e_mbd;

diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 55e5469..6132036 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h

@@ -37,34 +37,9 @@
   DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
 } QUANTS;
 
-void vp9_quantize_dc(const tran_low_t *coeff_ptr,
-                     int n_coeffs, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant_ptr,
-                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr);
-void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant_ptr,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr);
 void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                 const int16_t *scan, const int16_t *iscan);
 
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
-                            int n_coeffs, int skip_block,
-                            const int16_t *round_ptr, const int16_t quant_ptr,
-                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                            const int16_t dequant_ptr, uint16_t *eob_ptr);
-void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
-                                  int skip_block,
-                                  const int16_t *round_ptr,
-                                  const int16_t quant_ptr,
-                                  tran_low_t *qcoeff_ptr,
-                                  tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr,
-                                  uint16_t *eob_ptr);
-#endif
-
 struct VP9_COMP;
 struct VP9Common;
 

diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index be09bca..2be2a64 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c

@@ -281,11 +281,14 @@
   // Assume we do not need any constraint lower than 4K 20 fps
   static const double factor_safe = 3840 * 2160 * 20.0;
   const double factor = width * height * framerate;
+  const double default_interval =
+      MIN(MAX_GF_INTERVAL, MAX(MIN_GF_INTERVAL, (int)(framerate * 0.125)));
 
   if (factor <= factor_safe)
-    return MIN_GF_INTERVAL;
+    return (int)default_interval;
   else
-    return (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5);
+    return (int)MAX(default_interval,
+                    (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5));
   // Note this logic makes:
   // 4K24: 5
   // 4K30: 6
@@ -294,6 +297,7 @@
 
 int vp9_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
   int interval = MIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
+  interval += (interval & 0x01);  // Round to even value
   return MAX(interval, min_gf_interval);
 }
 
@@ -1351,9 +1355,11 @@
   }
 
   // Trigger the resizing of the next frame if it is scaled.
-  cpi->resize_pending =
-      rc->next_frame_size_selector != rc->frame_size_selector;
-  rc->frame_size_selector = rc->next_frame_size_selector;
+  if (oxcf->pass != 0) {
+    cpi->resize_pending =
+        rc->next_frame_size_selector != rc->frame_size_selector;
+    rc->frame_size_selector = rc->next_frame_size_selector;
+  }
 }
 
 void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
@@ -1628,9 +1634,9 @@
 
   vp9_rc_set_frame_target(cpi, target);
   if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC)
-    cpi->resize_state = vp9_resize_one_pass_cbr(cpi);
+    cpi->resize_pending = vp9_resize_one_pass_cbr(cpi);
   else
-    cpi->resize_state = 0;
+    cpi->resize_pending = 0;
 }
 
 int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
@@ -1693,7 +1699,6 @@
   if (rc->max_gf_interval == 0)
     rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
         cpi->framerate, rc->min_gf_interval);
-  rc->max_gf_interval += (rc->max_gf_interval & 0x01);
 
   // Extended interval for genuinely static scenes
   rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
@@ -1805,12 +1810,12 @@
     cpi->resize_count = 0;
     return 0;
   }
-  // Resize based on average QP over some window.
+  // Resize based on average buffer underflow and QP over some window.
   // Ignore samples close to key frame, since QP is usually high after key.
   if (cpi->rc.frames_since_key > 2 * cpi->framerate) {
     const int window = (int)(5 * cpi->framerate);
     cpi->resize_avg_qp += cm->base_qindex;
-    if (cpi->rc.buffer_level < 0)
+    if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100))
       ++cpi->resize_buffer_underflow;
     ++cpi->resize_count;
     // Check for resize action every "window" frames.
@@ -1821,11 +1826,13 @@
       // Resize back up if average QP is low, and we are currently in a resized
       // down state.
       if (cpi->resize_state == 0 &&
-          cpi->resize_buffer_underflow > (cpi->resize_count >> 3)) {
+          cpi->resize_buffer_underflow > (cpi->resize_count >> 2)) {
         resize_now = 1;
+        cpi->resize_state = 1;
       } else if (cpi->resize_state == 1 &&
                  avg_qp < 40 * cpi->rc.worst_quality / 100) {
         resize_now = -1;
+        cpi->resize_state = 0;
       }
       // Reset for next window measurement.
       cpi->resize_avg_qp = 0;

diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index bc7cb34..3f6de42 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c

@@ -93,7 +93,7 @@
       for (j = 0; j < REF_TYPES; ++j)
         for (k = 0; k < COEF_BANDS; ++k)
           for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
-            vp9_prob probs[ENTROPY_NODES];
+            vpx_prob probs[ENTROPY_NODES];
             vp9_model_to_full_probs(p[t][i][j][k][l], probs);
             vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs,
                             vp9_coef_tree);

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 3a27e89..8ae6783 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -641,7 +641,7 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
+  vpx_prob skip_prob = vp9_get_skip_prob(cm, xd);
   int r[TX_SIZES][2], s[TX_SIZES];
   int64_t d[TX_SIZES], sse[TX_SIZES];
   int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
@@ -654,7 +654,7 @@
   int64_t best_rd = INT64_MAX;
   TX_SIZE best_tx = max_tx_size;
 
-  const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
+  const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
   assert(skip_prob > 0);
   s0 = vp9_cost_bit(skip_prob, 0);
   s1 = vp9_cost_bit(skip_prob, 1);
@@ -760,7 +760,8 @@
   return 0;
 }
 
-static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
+static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x,
+                                     int row, int col,
                                      PREDICTION_MODE *best_mode,
                                      const int *bmode_costs,
                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
@@ -770,18 +771,14 @@
   PREDICTION_MODE mode;
   MACROBLOCKD *const xd = &x->e_mbd;
   int64_t best_rd = rd_thresh;
-
   struct macroblock_plane *p = &x->plane[0];
   struct macroblockd_plane *pd = &xd->plane[0];
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
-  const uint8_t *src_init = &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, ib,
-                                                                src_stride)];
-  uint8_t *dst_init = &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, ib,
-                                                           dst_stride)];
+  const uint8_t *src_init = &p->src.buf[row * 4 * src_stride + col * 4];
+  uint8_t *dst_init = &pd->dst.buf[row * 4 * src_stride + col * 4];
   ENTROPY_CONTEXT ta[2], tempa[2];
   ENTROPY_CONTEXT tl[2], templ[2];
-
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   int idx, idy;
@@ -790,8 +787,6 @@
   uint16_t best_dst16[8 * 8];
 #endif
 
-  assert(ib < 4);
-
   memcpy(ta, a, sizeof(ta));
   memcpy(tl, l, sizeof(tl));
   xd->mi[0]->mbmi.tx_size = TX_4X4;
@@ -819,7 +814,7 @@
 
       for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
         for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
-          const int block = ib + idy * 2 + idx;
+          const int block = (row + idy) * 2 + (col + idx);
           const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
           uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
           int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8,
@@ -827,11 +822,11 @@
                                                                   p->src_diff);
           tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
           xd->mi[0]->bmi[block].as_mode = mode;
-          vp9_predict_intra_block(xd, block, 1,
-                                  TX_4X4, mode,
+          vp9_predict_intra_block(xd, 1, TX_4X4, mode,
                                   x->skip_encode ? src : dst,
                                   x->skip_encode ? src_stride : dst_stride,
-                                  dst, dst_stride, idx, idy, 0);
+                                  dst, dst_stride,
+                                  col + idx, row + idy, 0);
           vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride,
                                     dst, dst_stride, xd->bd);
           if (xd->lossless) {
@@ -850,7 +845,10 @@
             int64_t unused;
             const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
             const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
-            vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type);
+            if (tx_type == DCT_DCT)
+              vp9_highbd_fdct4x4(src_diff, coeff, 8);
+            else
+              vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type);
             vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
             ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
                                  so->scan, so->neighbors,
@@ -920,18 +918,17 @@
 
     for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
       for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
-        const int block = ib + idy * 2 + idx;
+        const int block = (row + idy) * 2 + (col + idx);
         const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
         uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
         int16_t *const src_diff =
             vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
         tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
         xd->mi[0]->bmi[block].as_mode = mode;
-        vp9_predict_intra_block(xd, block, 1,
-                                TX_4X4, mode,
+        vp9_predict_intra_block(xd, 1, TX_4X4, mode,
                                 x->skip_encode ? src : dst,
                                 x->skip_encode ? src_stride : dst_stride,
-                                dst, dst_stride, idx, idy, 0);
+                                dst, dst_stride, col + idx, row + idy, 0);
         vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
 
         if (xd->lossless) {
@@ -1030,9 +1027,9 @@
         bmode_costs  = cpi->y_mode_costs[A][L];
       }
 
-      this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
-                                      t_above + idx, t_left + idy, &r, &ry, &d,
-                                      bsize, best_rd - total_rd);
+      this_rd = rd_pick_intra4x4block(cpi, mb, idy, idx, &best_mode,
+                                      bmode_costs, t_above + idx, t_left + idy,
+                                      &r, &ry, &d, bsize, best_rd - total_rd);
       if (this_rd >= best_rd - total_rd)
         return INT64_MAX;
 
@@ -2121,7 +2118,7 @@
                                      int segment_id,
                                      unsigned int *ref_costs_single,
                                      unsigned int *ref_costs_comp,
-                                     vp9_prob *comp_mode_p) {
+                                     vpx_prob *comp_mode_p) {
   int seg_ref_active = segfeature_active(&cm->seg, segment_id,
                                          SEG_LVL_REF_FRAME);
   if (seg_ref_active) {
@@ -2129,8 +2126,8 @@
     memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
     *comp_mode_p = 128;
   } else {
-    vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
-    vp9_prob comp_inter_p = 128;
+    vpx_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
+    vpx_prob comp_inter_p = 128;
 
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
       comp_inter_p = vp9_get_reference_mode_prob(cm, xd);
@@ -2142,8 +2139,8 @@
     ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
 
     if (cm->reference_mode != COMPOUND_REFERENCE) {
-      vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
-      vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
+      vpx_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
+      vpx_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
 
       if (cm->reference_mode == REFERENCE_MODE_SELECT)
@@ -2162,7 +2159,7 @@
       ref_costs_single[ALTREF_FRAME] = 512;
     }
     if (cm->reference_mode != SINGLE_REFERENCE) {
-      vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
+      vpx_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
 
       if (cm->reference_mode == REFERENCE_MODE_SELECT)
@@ -2898,6 +2895,77 @@
   *this_rd += (*this_rd * var_factor) / 100;
 }
 
+
+// Do we have an internal image edge (e.g. formatting bars).
+int vp9_internal_image_edge(VP9_COMP *cpi) {
+  return (cpi->oxcf.pass == 2) &&
+    ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) ||
+    (cpi->twopass.this_frame_stats.inactive_zone_cols > 0));
+}
+
+// Checks to see if a super block is on a horizontal image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int vp9_active_h_edge(VP9_COMP *cpi, int mi_row, int mi_step) {
+  int top_edge = 0;
+  int bottom_edge = cpi->common.mi_rows;
+  int is_active_h_edge = 0;
+
+  // For two pass account for any formatting bars detected.
+  if (cpi->oxcf.pass == 2) {
+    TWO_PASS *twopass = &cpi->twopass;
+
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB row.
+    top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+
+    bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+    bottom_edge = MAX(top_edge, bottom_edge);
+  }
+
+  if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
+      ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
+    is_active_h_edge = 1;
+  }
+  return is_active_h_edge;
+}
+
+// Checks to see if a super block is on a vertical image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int vp9_active_v_edge(VP9_COMP *cpi, int mi_col, int mi_step) {
+  int left_edge = 0;
+  int right_edge = cpi->common.mi_cols;
+  int is_active_v_edge = 0;
+
+  // For two pass account for any formatting bars detected.
+  if (cpi->oxcf.pass == 2) {
+    TWO_PASS *twopass = &cpi->twopass;
+
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB row.
+    left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+
+    right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+    right_edge = MAX(left_edge, right_edge);
+  }
+
+  if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) ||
+      ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) {
+    is_active_v_edge = 1;
+  }
+  return is_active_v_edge;
+}
+
+// Checks to see if a super block is at the edge of the active image.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int vp9_active_edge_sb(VP9_COMP *cpi,
+                       int mi_row, int mi_col) {
+  return vp9_active_h_edge(cpi, mi_row, MI_BLOCK_SIZE) ||
+         vp9_active_v_edge(cpi, mi_col, MI_BLOCK_SIZE);
+}
+
 void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
                                TileDataEnc *tile_data,
                                MACROBLOCK *x,
@@ -2935,7 +3003,7 @@
   int best_mode_skippable = 0;
   int midx, best_mode_index = -1;
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
-  vp9_prob comp_mode_p;
+  vpx_prob comp_mode_p;
   int64_t best_intra_rd = INT64_MAX;
   unsigned int best_pred_sse = UINT_MAX;
   PREDICTION_MODE best_intra_mode = DC_PRED;
@@ -3628,7 +3696,7 @@
   int64_t best_pred_diff[REFERENCE_MODES];
   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
-  vp9_prob comp_mode_p;
+  vpx_prob comp_mode_p;
   INTERP_FILTER best_filter = SWITCHABLE;
   int64_t this_rd = INT64_MAX;
   int rate2 = 0;
@@ -3744,20 +3812,22 @@
   MB_MODE_INFO best_mbmode;
   int ref_index, best_ref_index = 0;
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
-  vp9_prob comp_mode_p;
+  vpx_prob comp_mode_p;
   INTERP_FILTER tmp_best_filter = SWITCHABLE;
   int rate_uv_intra, rate_uv_tokenonly;
   int64_t dist_uv;
   int skip_uv;
   PREDICTION_MODE mode_uv = DC_PRED;
   const int intra_cost_penalty = vp9_get_intra_cost_penalty(
-      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+    cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
   int_mv seg_mvs[4][MAX_REF_FRAMES];
   b_mode_info best_bmodes[4];
   int best_skip2 = 0;
   int ref_frame_skip_mask[2] = { 0 };
   int64_t mask_filter = 0;
   int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+  int internal_active_edge =
+    vp9_active_edge_sb(cpi, mi_row, mi_col) && vp9_internal_image_edge(cpi);
 
   x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
   memset(x->zcoeff_blk[TX_4X4], 0, 4);
@@ -3843,7 +3913,8 @@
       continue;
 
     // Test best rd so far against threshold for trying this mode.
-    if (rd_less_than_thresh(best_rd,
+    if (!internal_active_edge &&
+        rd_less_than_thresh(best_rd,
                             rd_opt->threshes[segment_id][bsize][ref_index],
                             tile_data->thresh_freq_fact[bsize][ref_index]))
       continue;

diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 459b032..00ee55c 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h

@@ -54,6 +54,11 @@
                                         PICK_MODE_CONTEXT *ctx,
                                         int64_t best_rd_so_far);
 
+int vp9_internal_image_edge(struct VP9_COMP *cpi);
+int vp9_active_h_edge(struct VP9_COMP *cpi, int mi_row, int mi_step);
+int vp9_active_v_edge(struct VP9_COMP *cpi, int mi_col, int mi_step);
+int vp9_active_edge_sb(struct VP9_COMP *cpi, int mi_row, int mi_col);
+
 void vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
                                    struct TileDataEnc *tile_data,
                                    struct macroblock *x,
@@ -61,6 +66,7 @@
                                    struct RD_COST *rd_cost,
                                    BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
                                    int64_t best_rd_so_far);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif

diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index 1f0d4df..c5c50a2 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c

@@ -49,7 +49,7 @@
 }
 
 // Based on set of segment counts calculate a probability tree
-static void calc_segtree_probs(int *segcounts, vp9_prob *segment_tree_probs) {
+static void calc_segtree_probs(int *segcounts, vpx_prob *segment_tree_probs) {
   // Work out probabilities of each segment
   const int c01 = segcounts[0] + segcounts[1];
   const int c23 = segcounts[2] + segcounts[3];
@@ -66,7 +66,7 @@
 }
 
 // Based on set of segment counts and probabilities calculate a cost estimate
-static int cost_segmap(int *segcounts, vp9_prob *probs) {
+static int cost_segmap(int *segcounts, vpx_prob *probs) {
   const int c01 = segcounts[0] + segcounts[1];
   const int c23 = segcounts[2] + segcounts[3];
   const int c45 = segcounts[4] + segcounts[5];
@@ -207,9 +207,9 @@
   int no_pred_segcounts[MAX_SEGMENTS] = { 0 };
   int t_unpred_seg_counts[MAX_SEGMENTS] = { 0 };
 
-  vp9_prob no_pred_tree[SEG_TREE_PROBS];
-  vp9_prob t_pred_tree[SEG_TREE_PROBS];
-  vp9_prob t_nopred_prob[PREDICTION_PROBS];
+  vpx_prob no_pred_tree[SEG_TREE_PROBS];
+  vpx_prob t_pred_tree[SEG_TREE_PROBS];
+  vpx_prob t_nopred_prob[PREDICTION_PROBS];
 
   // Set default state for the segment tree probabilities and the
   // temporal coding probabilities

diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 6c6c4ed..bf06fd0 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c

@@ -12,6 +12,8 @@
 
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_speed_features.h"
+#include "vp9/encoder/vp9_rdopt.h"
+
 
 // Intra only frames, golden frames (except alt ref overlays) and
 // alt ref frames tend to be coded at a higher than ambient quality
@@ -89,8 +91,10 @@
 
   // If this is a two pass clip that fits the criteria for animated or
   // graphics content then reset disable_split_mask for speeds 1-4.
+  // Also if the image edge is internal to the coded area.
   if ((speed >= 1) && (cpi->oxcf.pass == 2) &&
-      (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)) {
+      ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+       (vp9_internal_image_edge(cpi)))) {
     sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
   }
 
@@ -112,7 +116,13 @@
   sf->allow_skip_recode = 1;
 
   if (speed >= 1) {
-    sf->use_square_partition_only = !frame_is_intra_only(cm);
+    if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+        vp9_internal_image_edge(cpi)) {
+      sf->use_square_partition_only = !frame_is_boosted(cpi);
+    } else {
+      sf->use_square_partition_only = !frame_is_intra_only(cm);
+    }
+
     sf->less_rectangular_check  = 1;
 
     sf->use_rd_breakout = 1;
@@ -152,6 +162,7 @@
   }
 
   if (speed >= 3) {
+    sf->use_square_partition_only = !frame_is_intra_only(cm);
     sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
                                                         : USE_LARGESTALL;
     sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;

diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c
index b345b16..799f179 100644
--- a/vp9/encoder/vp9_subexp.c
+++ b/vp9/encoder/vp9_subexp.c

@@ -7,13 +7,12 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include "vpx_dsp/bitwriter.h"
 
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropy.h"
-
 #include "vp9/encoder/vp9_cost.h"
 #include "vp9/encoder/vp9_subexp.h"
-#include "vp9/encoder/vp9_writer.h"
 
 #define vp9_cost_upd256  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
 
@@ -79,50 +78,50 @@
   return i;
 }
 
-static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
+static int prob_diff_update_cost(vpx_prob newp, vpx_prob oldp) {
   int delp = remap_prob(newp, oldp);
   return update_bits[delp] * 256;
 }
 
-static void encode_uniform(vp9_writer *w, int v) {
+static void encode_uniform(vpx_writer *w, int v) {
   const int l = 8;
   const int m = (1 << l) - 191;
   if (v < m) {
-    vp9_write_literal(w, v, l - 1);
+    vpx_write_literal(w, v, l - 1);
   } else {
-    vp9_write_literal(w, m + ((v - m) >> 1), l - 1);
-    vp9_write_literal(w, (v - m) & 1, 1);
+    vpx_write_literal(w, m + ((v - m) >> 1), l - 1);
+    vpx_write_literal(w, (v - m) & 1, 1);
   }
 }
 
-static INLINE int write_bit_gte(vp9_writer *w, int word, int test) {
-  vp9_write_literal(w, word >= test, 1);
+static INLINE int write_bit_gte(vpx_writer *w, int word, int test) {
+  vpx_write_literal(w, word >= test, 1);
   return word >= test;
 }
 
-static void encode_term_subexp(vp9_writer *w, int word) {
+static void encode_term_subexp(vpx_writer *w, int word) {
   if (!write_bit_gte(w, word, 16)) {
-    vp9_write_literal(w, word, 4);
+    vpx_write_literal(w, word, 4);
   } else if (!write_bit_gte(w, word, 32)) {
-    vp9_write_literal(w, word - 16, 4);
+    vpx_write_literal(w, word - 16, 4);
   } else if (!write_bit_gte(w, word, 64)) {
-    vp9_write_literal(w, word - 32, 5);
+    vpx_write_literal(w, word - 32, 5);
   } else {
     encode_uniform(w, word - 64);
   }
 }
 
-void vp9_write_prob_diff_update(vp9_writer *w, vp9_prob newp, vp9_prob oldp) {
+void vp9_write_prob_diff_update(vpx_writer *w, vpx_prob newp, vpx_prob oldp) {
   const int delp = remap_prob(newp, oldp);
   encode_term_subexp(w, delp);
 }
 
 int vp9_prob_diff_update_savings_search(const unsigned int *ct,
-                                        vp9_prob oldp, vp9_prob *bestp,
-                                        vp9_prob upd) {
+                                        vpx_prob oldp, vpx_prob *bestp,
+                                        vpx_prob upd) {
   const int old_b = cost_branch256(ct, oldp);
   int bestsavings = 0;
-  vp9_prob newp, bestnewp = oldp;
+  vpx_prob newp, bestnewp = oldp;
   const int step = *bestp > oldp ? -1 : 1;
 
   for (newp = *bestp; newp != oldp; newp += step) {
@@ -139,15 +138,15 @@
 }
 
 int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const vp9_prob *oldp,
-                                              vp9_prob *bestp,
-                                              vp9_prob upd,
+                                              const vpx_prob *oldp,
+                                              vpx_prob *bestp,
+                                              vpx_prob upd,
                                               int stepsize) {
   int i, old_b, new_b, update_b, savings, bestsavings, step;
   int newp;
-  vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
+  vpx_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
   vp9_model_to_full_probs(oldp, oldplist);
-  memcpy(newplist, oldp, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+  memcpy(newplist, oldp, sizeof(vpx_prob) * UNCONSTRAINED_NODES);
   for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
     old_b += cost_branch256(ct + 2 * i, oldplist[i]);
   old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]);
@@ -197,18 +196,18 @@
   return bestsavings;
 }
 
-void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
+void vp9_cond_prob_diff_update(vpx_writer *w, vpx_prob *oldp,
                                const unsigned int ct[2]) {
-  const vp9_prob upd = DIFF_UPDATE_PROB;
-  vp9_prob newp = get_binary_prob(ct[0], ct[1]);
+  const vpx_prob upd = DIFF_UPDATE_PROB;
+  vpx_prob newp = get_binary_prob(ct[0], ct[1]);
   const int savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp,
                                                           upd);
   assert(newp >= 1);
   if (savings > 0) {
-    vp9_write(w, 1, upd);
+    vpx_write(w, 1, upd);
     vp9_write_prob_diff_update(w, newp, *oldp);
     *oldp = newp;
   } else {
-    vp9_write(w, 0, upd);
+    vpx_write(w, 0, upd);
   }
 }

diff --git a/vp9/encoder/vp9_subexp.h b/vp9/encoder/vp9_subexp.h
index 6fbb747..b968232 100644
--- a/vp9/encoder/vp9_subexp.h
+++ b/vp9/encoder/vp9_subexp.h

@@ -16,25 +16,25 @@
 extern "C" {
 #endif
 
-#include "vp9/common/vp9_prob.h"
+#include "vpx_dsp/prob.h"
 
-struct vp9_writer;
+struct vpx_writer;
 
-void vp9_write_prob_diff_update(struct vp9_writer *w,
-                                vp9_prob newp, vp9_prob oldp);
+void vp9_write_prob_diff_update(struct vpx_writer *w,
+                                vpx_prob newp, vpx_prob oldp);
 
-void vp9_cond_prob_diff_update(struct vp9_writer *w, vp9_prob *oldp,
+void vp9_cond_prob_diff_update(struct vpx_writer *w, vpx_prob *oldp,
                                const unsigned int ct[2]);
 
 int vp9_prob_diff_update_savings_search(const unsigned int *ct,
-                                        vp9_prob oldp, vp9_prob *bestp,
-                                        vp9_prob upd);
+                                        vpx_prob oldp, vpx_prob *bestp,
+                                        vpx_prob upd);
 
 
 int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const vp9_prob *oldp,
-                                              vp9_prob *bestp,
-                                              vp9_prob upd,
+                                              const vpx_prob *oldp,
+                                              vpx_prob *bestp,
+                                              vpx_prob upd,
                                               int stepsize);
 
 #ifdef __cplusplus

diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 181a99c..a1c076a 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c

@@ -52,7 +52,7 @@
     / 2;
 
 // Array indices are identical to previously-existing CONTEXT_NODE indices
-const vp9_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+const vpx_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
   -EOB_TOKEN, 2,                       // 0  = EOB
   -ZERO_TOKEN, 4,                      // 1  = ZERO
   -ONE_TOKEN, 6,                       // 2  = ONE
@@ -66,12 +66,12 @@
   -CATEGORY5_TOKEN, -CATEGORY6_TOKEN   // 10 = CAT_FIVE
 };
 
-static const vp9_tree_index cat1[2] = {0, 0};
-static const vp9_tree_index cat2[4] = {2, 2, 0, 0};
-static const vp9_tree_index cat3[6] = {2, 2, 4, 4, 0, 0};
-static const vp9_tree_index cat4[8] = {2, 2, 4, 4, 6, 6, 0, 0};
-static const vp9_tree_index cat5[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
-static const vp9_tree_index cat6[28] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
+static const vpx_tree_index cat1[2] = {0, 0};
+static const vpx_tree_index cat2[4] = {2, 2, 0, 0};
+static const vpx_tree_index cat3[6] = {2, 2, 4, 4, 0, 0};
+static const vpx_tree_index cat4[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vpx_tree_index cat5[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vpx_tree_index cat6[28] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
     14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 0, 0};
 
 static const int16_t zero_cost[] = {0};
@@ -367,20 +367,20 @@
 #endif
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static const vp9_tree_index cat1_high10[2] = {0, 0};
-static const vp9_tree_index cat2_high10[4] = {2, 2, 0, 0};
-static const vp9_tree_index cat3_high10[6] = {2, 2, 4, 4, 0, 0};
-static const vp9_tree_index cat4_high10[8] = {2, 2, 4, 4, 6, 6, 0, 0};
-static const vp9_tree_index cat5_high10[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
-static const vp9_tree_index cat6_high10[32] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
+static const vpx_tree_index cat1_high10[2] = {0, 0};
+static const vpx_tree_index cat2_high10[4] = {2, 2, 0, 0};
+static const vpx_tree_index cat3_high10[6] = {2, 2, 4, 4, 0, 0};
+static const vpx_tree_index cat4_high10[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vpx_tree_index cat5_high10[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vpx_tree_index cat6_high10[32] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
   12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28,
   30, 30, 0, 0};
-static const vp9_tree_index cat1_high12[2] = {0, 0};
-static const vp9_tree_index cat2_high12[4] = {2, 2, 0, 0};
-static const vp9_tree_index cat3_high12[6] = {2, 2, 4, 4, 0, 0};
-static const vp9_tree_index cat4_high12[8] = {2, 2, 4, 4, 6, 6, 0, 0};
-static const vp9_tree_index cat5_high12[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
-static const vp9_tree_index cat6_high12[36] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
+static const vpx_tree_index cat1_high12[2] = {0, 0};
+static const vpx_tree_index cat2_high12[4] = {2, 2, 0, 0};
+static const vpx_tree_index cat3_high12[6] = {2, 2, 4, 4, 0, 0};
+static const vpx_tree_index cat4_high12[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vpx_tree_index cat5_high12[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vpx_tree_index cat6_high12[36] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
   12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28,
   30, 30, 32, 32, 34, 34, 0, 0};
 #endif
@@ -457,7 +457,7 @@
                    aoff, loff);
 }
 
-static INLINE void add_token(TOKENEXTRA **t, const vp9_prob *context_tree,
+static INLINE void add_token(TOKENEXTRA **t, const vpx_prob *context_tree,
                              int32_t extra, uint8_t token,
                              uint8_t skip_eob_node,
                              unsigned int *counts) {
@@ -470,7 +470,7 @@
 }
 
 static INLINE void add_token_no_extra(TOKENEXTRA **t,
-                                      const vp9_prob *context_tree,
+                                      const vpx_prob *context_tree,
                                       uint8_t token,
                                       uint8_t skip_eob_node,
                                       unsigned int *counts) {
@@ -511,7 +511,7 @@
   const int ref = is_inter_block(mbmi);
   unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
       td->rd_counts.coef_counts[tx_size][type][ref];
-  vp9_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+  vpx_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
       cpi->common.fc->coef_probs[tx_size][type][ref];
   unsigned int (*const eob_branch)[COEFF_CONTEXTS] =
       td->counts->eob_branch[tx_size][type][ref];

diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index 81cc2e1..11b78ba 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h

@@ -35,14 +35,14 @@
 } TOKENVALUE;
 
 typedef struct {
-  const vp9_prob *context_tree;
+  const vpx_prob *context_tree;
   EXTRABIT extra;
   uint8_t token;
   uint8_t skip_eob_node;
 } TOKENEXTRA;
 
-extern const vp9_tree_index vp9_coef_tree[];
-extern const vp9_tree_index vp9_coef_con_tree[];
+extern const vpx_tree_index vp9_coef_tree[];
+extern const vpx_tree_index vp9_coef_con_tree[];
 extern const struct vp9_token vp9_coef_encodings[];
 
 int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);

diff --git a/vp9/encoder/vp9_treewriter.c b/vp9/encoder/vp9_treewriter.c
index bb04b40..0fc078e 100644
--- a/vp9/encoder/vp9_treewriter.c
+++ b/vp9/encoder/vp9_treewriter.c

@@ -10,13 +10,13 @@
 
 #include "vp9/encoder/vp9_treewriter.h"
 
-static void tree2tok(struct vp9_token *tokens, const vp9_tree_index *tree,
+static void tree2tok(struct vp9_token *tokens, const vpx_tree_index *tree,
                      int i, int v, int l) {
   v += v;
   ++l;
 
   do {
-    const vp9_tree_index j = tree[i++];
+    const vpx_tree_index j = tree[i++];
     if (j <= 0) {
       tokens[-j].value = v;
       tokens[-j].len = l;
@@ -27,11 +27,11 @@
 }
 
 void vp9_tokens_from_tree(struct vp9_token *tokens,
-                          const vp9_tree_index *tree) {
+                          const vpx_tree_index *tree) {
   tree2tok(tokens, tree, 0, 0, 0);
 }
 
-static unsigned int convert_distribution(unsigned int i, vp9_tree tree,
+static unsigned int convert_distribution(unsigned int i, vpx_tree tree,
                                          unsigned int branch_ct[][2],
                                          const unsigned int num_events[]) {
   unsigned int left, right;
@@ -51,7 +51,7 @@
   return left + right;
 }
 
-void vp9_tree_probs_from_distribution(vp9_tree tree,
+void vp9_tree_probs_from_distribution(vpx_tree tree,
                                       unsigned int branch_ct[/* n-1 */][2],
                                       const unsigned int num_events[/* n */]) {
   convert_distribution(0, tree, branch_ct, num_events);

diff --git a/vp9/encoder/vp9_treewriter.h b/vp9/encoder/vp9_treewriter.h
index 4a76d87..0f89350 100644
--- a/vp9/encoder/vp9_treewriter.h
+++ b/vp9/encoder/vp9_treewriter.h

@@ -11,13 +11,13 @@
 #ifndef VP9_ENCODER_VP9_TREEWRITER_H_
 #define VP9_ENCODER_VP9_TREEWRITER_H_
 
-#include "vp9/encoder/vp9_writer.h"
+#include "vpx_dsp/bitwriter.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void vp9_tree_probs_from_distribution(vp9_tree tree,
+void vp9_tree_probs_from_distribution(vpx_tree tree,
                                       unsigned int branch_ct[ /* n - 1 */ ][2],
                                       const unsigned int num_events[ /* n */ ]);
 
@@ -26,20 +26,20 @@
   int len;
 };
 
-void vp9_tokens_from_tree(struct vp9_token*, const vp9_tree_index *);
+void vp9_tokens_from_tree(struct vp9_token*, const vpx_tree_index *);
 
-static INLINE void vp9_write_tree(vp9_writer *w, const vp9_tree_index *tree,
-                                  const vp9_prob *probs, int bits, int len,
-                                  vp9_tree_index i) {
+static INLINE void vp9_write_tree(vpx_writer *w, const vpx_tree_index *tree,
+                                  const vpx_prob *probs, int bits, int len,
+                                  vpx_tree_index i) {
   do {
     const int bit = (bits >> --len) & 1;
-    vp9_write(w, bit, probs[i >> 1]);
+    vpx_write(w, bit, probs[i >> 1]);
     i = tree[i + bit];
   } while (len);
 }
 
-static INLINE void vp9_write_token(vp9_writer *w, const vp9_tree_index *tree,
-                                   const vp9_prob *probs,
+static INLINE void vp9_write_token(vpx_writer *w, const vpx_tree_index *tree,
+                                   const vpx_prob *probs,
                                    const struct vp9_token *token) {
   vp9_write_tree(w, tree, probs, token->value, token->len, 0);
 }

diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h b/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h
index 003ebd1..5074d31 100644
--- a/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h
+++ b/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h

@@ -28,7 +28,8 @@
         temp_in[j] = intermediate[j * 32 + i];
       vp9_fdct32(temp_in, temp_out, 0);
       for (j = 0; j < 32; ++j)
-        out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+        out[j + i * 32] =
+            (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
     }
 }
   #define HIGH_FDCT32x32_2D_C vp9_highbd_fdct32x32_c
@@ -42,7 +43,7 @@
         temp_in[j] = intermediate[j * 32 + i];
       vp9_fdct32(temp_in, temp_out, 1);
       for (j = 0; j < 32; ++j)
-        out[j + i * 32] = temp_out[j];
+        out[j + i * 32] = (tran_low_t)temp_out[j];
     }
 }
   #define HIGH_FDCT32x32_2D_C vp9_highbd_fdct32x32_rd_c

diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index cff4fcb..9e13a68 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c

@@ -2266,108 +2266,6 @@
   store_output(&in1, output);
 }
 
-#if CONFIG_VP9_HIGHBITDEPTH
-/* These SSE2 versions of the FHT functions only actually use SSE2 in the
- * DCT_DCT case in all other cases, they revert to C code which is identical
- * to that used by the C versions of them.
- */
-
-void vp9_highbd_fht4x4_sse2(const int16_t *input, tran_low_t *output,
-                            int stride, int tx_type) {
-  if (tx_type == DCT_DCT) {
-    vp9_highbd_fdct4x4_sse2(input, output, stride);
-  } else {
-    tran_low_t out[4 * 4];
-    tran_low_t *outptr = &out[0];
-    int i, j;
-    tran_low_t temp_in[4], temp_out[4];
-    const transform_2d ht = FHT_4[tx_type];
-
-    // Columns
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j)
-        temp_in[j] = input[j * stride + i] * 16;
-      if (i == 0 && temp_in[0])
-        temp_in[0] += 1;
-      ht.cols(temp_in, temp_out);
-      for (j = 0; j < 4; ++j)
-        outptr[j * 4 + i] = temp_out[j];
-    }
-
-    // Rows
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j)
-        temp_in[j] = out[j + i * 4];
-      ht.rows(temp_in, temp_out);
-      for (j = 0; j < 4; ++j)
-        output[j + i * 4] = (temp_out[j] + 1) >> 2;
-    }
-  }
-}
-
-void vp9_highbd_fht8x8_sse2(const int16_t *input, tran_low_t *output,
-                            int stride, int tx_type) {
-  if (tx_type == DCT_DCT) {
-    vp9_highbd_fdct8x8_sse2(input, output, stride);
-  } else {
-    tran_low_t out[64];
-    tran_low_t *outptr = &out[0];
-    int i, j;
-    tran_low_t temp_in[8], temp_out[8];
-    const transform_2d ht = FHT_8[tx_type];
-
-    // Columns
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j)
-        temp_in[j] = input[j * stride + i] * 4;
-      ht.cols(temp_in, temp_out);
-      for (j = 0; j < 8; ++j)
-        outptr[j * 8 + i] = temp_out[j];
-    }
-
-    // Rows
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j)
-        temp_in[j] = out[j + i * 8];
-      ht.rows(temp_in, temp_out);
-      for (j = 0; j < 8; ++j)
-        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-    }
-  }
-}
-
-void vp9_highbd_fht16x16_sse2(const int16_t *input, tran_low_t *output,
-                              int stride, int tx_type) {
-  if (tx_type == DCT_DCT) {
-    vp9_highbd_fdct16x16_sse2(input, output, stride);
-  } else {
-    tran_low_t out[256];
-    tran_low_t *outptr = &out[0];
-    int i, j;
-    tran_low_t temp_in[16], temp_out[16];
-    const transform_2d ht = FHT_16[tx_type];
-
-    // Columns
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = input[j * stride + i] * 4;
-      ht.cols(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
-    }
-
-    // Rows
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = out[j + i * 16];
-      ht.rows(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        output[j + i * 16] = temp_out[j];
-    }
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 /*
  * The DCTnxn functions are defined using the macros below. The main code for
  * them is in separate files (vp9/encoder/x86/vp9_dct_sse2_impl.h &

diff --git a/vp9/encoder/x86/vp9_dct_sse2_impl.h b/vp9/encoder/x86/vp9_dct_sse2_impl.h
index 11bf5a2..86e9ecf 100644
--- a/vp9/encoder/x86/vp9_dct_sse2_impl.h
+++ b/vp9/encoder/x86/vp9_dct_sse2_impl.h

@@ -40,35 +40,35 @@
   // These are the coefficients used for the multiplies.
   // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
   // where cospi_N_64 = cos(N pi /64)
-  const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64,
+  const __m128i k__cospi_A = octa_set_epi16(cospi_16_64, cospi_16_64,
                                             cospi_16_64, cospi_16_64,
                                             cospi_16_64, -cospi_16_64,
                                             cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
+  const __m128i k__cospi_B = octa_set_epi16(cospi_16_64, -cospi_16_64,
                                             cospi_16_64, -cospi_16_64,
                                             cospi_16_64, cospi_16_64,
                                             cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64,
+  const __m128i k__cospi_C = octa_set_epi16(cospi_8_64, cospi_24_64,
                                             cospi_8_64, cospi_24_64,
                                             cospi_24_64, -cospi_8_64,
                                             cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
+  const __m128i k__cospi_D = octa_set_epi16(cospi_24_64, -cospi_8_64,
                                             cospi_24_64, -cospi_8_64,
                                             cospi_8_64, cospi_24_64,
                                             cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64,
+  const __m128i k__cospi_E = octa_set_epi16(cospi_16_64, cospi_16_64,
                                             cospi_16_64, cospi_16_64,
                                             cospi_16_64, cospi_16_64,
                                             cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
+  const __m128i k__cospi_F = octa_set_epi16(cospi_16_64, -cospi_16_64,
                                             cospi_16_64, -cospi_16_64,
                                             cospi_16_64, -cospi_16_64,
                                             cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64,
+  const __m128i k__cospi_G = octa_set_epi16(cospi_8_64, cospi_24_64,
                                             cospi_8_64, cospi_24_64,
                                             -cospi_8_64, -cospi_24_64,
                                             -cospi_8_64, -cospi_24_64);
-  const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
+  const __m128i k__cospi_H = octa_set_epi16(cospi_24_64, -cospi_8_64,
                                             cospi_24_64, -cospi_8_64,
                                             -cospi_24_64, cospi_8_64,
                                             -cospi_24_64, cospi_8_64);
@@ -267,7 +267,7 @@
   //    When we use them, in one case, they are all the same. In all others
   //    it's a pair of them that we need to repeat four times. This is done
   //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
@@ -588,7 +588,7 @@
   //    When we use them, in one case, they are all the same. In all others
   //    it's a pair of them that we need to repeat four times. This is done
   //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
   const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);

diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c
index 71fdfd7..2071dfe 100644
--- a/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/vp9/encoder/x86/vp9_quantize_sse2.c

@@ -14,214 +14,6 @@
 #include "./vp9_rtcd.h"
 #include "vpx/vpx_integer.h"
 
-void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
-                         int skip_block, const int16_t* zbin_ptr,
-                         const int16_t* round_ptr, const int16_t* quant_ptr,
-                         const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
-                         int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
-                         uint16_t* eob_ptr,
-                         const int16_t* scan_ptr,
-                         const int16_t* iscan_ptr) {
-  __m128i zero;
-  (void)scan_ptr;
-
-  coeff_ptr += n_coeffs;
-  iscan_ptr += n_coeffs;
-  qcoeff_ptr += n_coeffs;
-  dqcoeff_ptr += n_coeffs;
-  n_coeffs = -n_coeffs;
-  zero = _mm_setzero_si128();
-  if (!skip_block) {
-    __m128i eob;
-    __m128i zbin;
-    __m128i round, quant, dequant, shift;
-    {
-      __m128i coeff0, coeff1;
-
-      // Setup global values
-      {
-        __m128i pw_1;
-        zbin = _mm_load_si128((const __m128i*)zbin_ptr);
-        round = _mm_load_si128((const __m128i*)round_ptr);
-        quant = _mm_load_si128((const __m128i*)quant_ptr);
-        pw_1 = _mm_set1_epi16(1);
-        zbin = _mm_sub_epi16(zbin, pw_1);
-        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
-        shift = _mm_load_si128((const __m128i*)quant_shift_ptr);
-      }
-
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-        __m128i cmp_mask0, cmp_mask1;
-        // Do DC and first 15 AC
-        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
-        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
-        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
-        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        round = _mm_unpackhi_epi64(round, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        quant = _mm_unpackhi_epi64(quant, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
-        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
-        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
-        shift = _mm_unpackhi_epi64(shift, shift);
-        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
-
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        // Mask out zbin threshold coeffs
-        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
-        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
-
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        dequant = _mm_unpackhi_epi64(dequant, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
-      }
-
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob = _mm_max_epi16(eob, eob1);
-      }
-      n_coeffs += 8 * 2;
-    }
-
-    // AC only loop
-    while (n_coeffs < 0) {
-      __m128i coeff0, coeff1;
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-        __m128i cmp_mask0, cmp_mask1;
-
-        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
-        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
-        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
-        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
-        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
-        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
-
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        // Mask out zbin threshold coeffs
-        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
-        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
-
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
-      }
-
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob0, eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob0 = _mm_max_epi16(eob0, eob1);
-        eob = _mm_max_epi16(eob, eob0);
-      }
-      n_coeffs += 8 * 2;
-    }
-
-    // Accumulate EOB
-    {
-      __m128i eob_shuffled;
-      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      *eob_ptr = _mm_extract_epi16(eob, 1);
-    }
-  } else {
-    do {
-      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
-      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
-      n_coeffs += 8 * 2;
-    } while (n_coeffs < 0);
-    *eob_ptr = 0;
-  }
-}
-
 void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
                           int skip_block, const int16_t* zbin_ptr,
                           const int16_t* round_ptr, const int16_t* quant_ptr,

diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
index 449d52b..ec2e87c 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

@@ -15,206 +15,6 @@
 
 SECTION .text
 
-; TODO(yunqingwang)fix quantize_b code for skip=1 case.
-%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
-                                shift, qcoeff, dqcoeff, dequant, \
-                                eob, scan, iscan
-  cmp                    dword skipm, 0
-  jne .blank
-
-  ; actual quantize loop - setup pointers, rounders, etc.
-  movifnidn                   coeffq, coeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, dequantmp
-  movifnidn                    zbinq, zbinmp
-  movifnidn                   roundq, roundmp
-  movifnidn                   quantq, quantmp
-  mova                            m0, [zbinq]              ; m0 = zbin
-  mova                            m1, [roundq]             ; m1 = round
-  mova                            m2, [quantq]             ; m2 = quant
-%ifidn %1, b_32x32
-  pcmpeqw                         m5, m5
-  psrlw                           m5, 15
-  paddw                           m0, m5
-  paddw                           m1, m5
-  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
-  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
-%endif
-  mova                            m3, [r2q]                ; m3 = dequant
-  psubw                           m0, [pw_1]
-  mov                             r2, shiftmp
-  mov                             r3, qcoeffmp
-  mova                            m4, [r2]                 ; m4 = shift
-  mov                             r4, dqcoeffmp
-  mov                             r5, iscanmp
-%ifidn %1, b_32x32
-  psllw                           m4, 1
-%endif
-  pxor                            m5, m5                   ; m5 = dedicated zero
-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
-  lea                         coeffq, [  coeffq+ncoeffq*2]
-  lea                         iscanq, [  iscanq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  neg                        ncoeffq
-
-  ; get DC and first 15 AC coeffs
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  punpckhqdq                      m0, m0
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                           m8, m6                   ; m8 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
-  punpckhqdq                      m4, m4
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                            m8, m7
-  pand                           m13, m12
-  mova        [qcoeffq+ncoeffq*2+ 0], m8
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%ifidn %1, b_32x32
-  pabsw                           m8, m8
-  pabsw                          m13, m13
-%endif
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                           m8, 1
-  psrlw                          m13, 1
-  psignw                          m8, m9
-  psignw                         m13, m10
-%endif
-  mova       [dqcoeffq+ncoeffq*2+ 0], m8
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m12                  ; m11 = scan[i] + 1
-  pandn                           m8, m6                   ; m8 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jz .accumulate_eob
-
-.ac_only_loop:
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-%ifidn %1, b_32x32
-  pmovmskb                       r6d, m7
-  pmovmskb                       r2d, m12
-  or                              r6, r2
-  jz .skip_iter
-%endif
-  paddsw                          m6, m1                   ; m6 += round
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                          m14, m6                   ; m14 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                         m14, m9                   ; m14 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                           m14, m7
-  pand                           m13, m12
-  mova        [qcoeffq+ncoeffq*2+ 0], m14
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%ifidn %1, b_32x32
-  pabsw                          m14, m14
-  pabsw                          m13, m13
-%endif
-  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                          m14, 1
-  psrlw                          m13, 1
-  psignw                         m14, m9
-  psignw                         m13, m10
-%endif
-  mova       [dqcoeffq+ncoeffq*2+ 0], m14
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m12                  ; m11 = scan[i] + 1
-  pandn                          m14, m6                   ; m14 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m14
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-
-%ifidn %1, b_32x32
-  jmp .accumulate_eob
-.skip_iter:
-  mova        [qcoeffq+ncoeffq*2+ 0], m5
-  mova        [qcoeffq+ncoeffq*2+16], m5
-  mova       [dqcoeffq+ncoeffq*2+ 0], m5
-  mova       [dqcoeffq+ncoeffq*2+16], m5
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-%endif
-
-.accumulate_eob:
-  ; horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  pextrw                          r6, m8, 0
-  mov                             [r2], r6
-  RET
-
-  ; skip-block, i.e. just write all zeroes
-.blank:
-  mov                             r0, dqcoeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, qcoeffmp
-  mov                             r3, eobmp
-  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  neg                        ncoeffq
-  pxor                            m7, m7
-.blank_loop:
-  mova       [dqcoeffq+ncoeffq*2+ 0], m7
-  mova       [dqcoeffq+ncoeffq*2+16], m7
-  mova        [qcoeffq+ncoeffq*2+ 0], m7
-  mova        [qcoeffq+ncoeffq*2+16], m7
-  add                        ncoeffq, mmsize
-  jl .blank_loop
-  mov                    word [eobq], 0
-  RET
-%endmacro
-
-INIT_XMM ssse3
-QUANTIZE_FN b, 7
-QUANTIZE_FN b_32x32, 7
-
 %macro QUANTIZE_FP 2
 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                 shift, qcoeff, dqcoeff, dequant, \

diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 2e43c27..c8cf973 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk

@@ -38,8 +38,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_onyxc_int.h
 VP9_COMMON_SRCS-yes += common/vp9_pred_common.h
 VP9_COMMON_SRCS-yes += common/vp9_pred_common.c
-VP9_COMMON_SRCS-yes += common/vp9_prob.h
-VP9_COMMON_SRCS-yes += common/vp9_prob.c
 VP9_COMMON_SRCS-yes += common/vp9_quant_common.h
 VP9_COMMON_SRCS-yes += common/vp9_reconinter.h
 VP9_COMMON_SRCS-yes += common/vp9_reconintra.h
@@ -54,7 +52,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
 VP9_COMMON_SRCS-yes += common/vp9_tile_common.c
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c
-VP9_COMMON_SRCS-yes += common/vp9_loopfilter_filters.c
 VP9_COMMON_SRCS-yes += common/vp9_thread_common.c
 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c
 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h
@@ -69,13 +66,11 @@
 
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/convolve.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
-VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
@@ -94,7 +89,6 @@
 endif
 
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_loopfilter_intrin_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_8t_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_bilinear_sse2.asm
 ifeq ($(CONFIG_USE_X86INC),yes)
@@ -122,13 +116,6 @@
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans16_dspr2.c
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans32_cols_dspr2.c
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans32_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
 
 # common (msa)
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_horiz_msa.c
@@ -146,10 +133,6 @@
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct32x32_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct_msa.h
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_intra_predict_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_4_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_8_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_16_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_msa.h
 
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c
@@ -164,16 +147,12 @@
 endif
 endif
 
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_8_neon_asm$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
 
 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
 endif
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon.c
 
 # neon with assembly and intrinsics implementations. If both are available
 # prefer assembly.
@@ -192,7 +171,6 @@
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon_asm$(ASM)
 else
 ifeq ($(HAVE_NEON), yes)
@@ -210,11 +188,6 @@
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon.c
-# TODO(johannkoenig): re-enable when chromium build is fixed
-# # https://code.google.com/p/chromium/issues/detail?id=443839
-#VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_8_neon.c
 endif  # HAVE_NEON
 endif  # HAVE_NEON_ASM
 

diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index d2d9288..f155b9a 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c

@@ -173,9 +173,12 @@
   RANGE_CHECK(cfg,        g_pass,         VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
   RANGE_CHECK(extra_cfg, min_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
   RANGE_CHECK(extra_cfg, max_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
+  if (extra_cfg->max_gf_interval > 0) {
+    RANGE_CHECK(extra_cfg, max_gf_interval, 2, (MAX_LAG_BUFFERS - 1));
+  }
   if (extra_cfg->min_gf_interval > 0 && extra_cfg->max_gf_interval > 0) {
     RANGE_CHECK(extra_cfg, max_gf_interval, extra_cfg->min_gf_interval,
-                (MAX_LAG_BUFFERS - 1));
+      (MAX_LAG_BUFFERS - 1));
   }
 
   if (cfg->rc_resize_allowed == 1) {

diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 5b62c3ec..96ede3c 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c

@@ -17,6 +17,7 @@
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
+#include "vpx_dsp/bitreader_buffer.h"
 #include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_alloccommon.h"
@@ -24,7 +25,6 @@
 
 #include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_decodeframe.h"
-#include "vp9/decoder/vp9_read_bit_buffer.h"
 
 #include "vp9/vp9_iface_common.h"
 
@@ -145,11 +145,11 @@
 }
 
 static int parse_bitdepth_colorspace_sampling(
-    BITSTREAM_PROFILE profile, struct vp9_read_bit_buffer *rb) {
+    BITSTREAM_PROFILE profile, struct vpx_read_bit_buffer *rb) {
   vpx_color_space_t color_space;
   if (profile >= PROFILE_2)
     rb->bit_offset += 1;  // Bit-depth 10 or 12.
-  color_space = (vpx_color_space_t)vp9_rb_read_literal(rb, 3);
+  color_space = (vpx_color_space_t)vpx_rb_read_literal(rb, 3);
   if (color_space != VPX_CS_SRGB) {
     rb->bit_offset += 1;  // [16,235] (including xvycc) vs [0,255] range.
     if (profile == PROFILE_1 || profile == PROFILE_3) {
@@ -191,8 +191,8 @@
   {
     int show_frame;
     int error_resilient;
-    struct vp9_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
-    const int frame_marker = vp9_rb_read_literal(&rb, 2);
+    struct vpx_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
+    const int frame_marker = vpx_rb_read_literal(&rb, 2);
     const BITSTREAM_PROFILE profile = vp9_read_profile(&rb);
 
     if (frame_marker != VP9_FRAME_MARKER)
@@ -204,17 +204,17 @@
     if ((profile >= 2 && data_sz <= 1) || data_sz < 1)
       return VPX_CODEC_UNSUP_BITSTREAM;
 
-    if (vp9_rb_read_bit(&rb)) {  // show an existing frame
-      vp9_rb_read_literal(&rb, 3);  // Frame buffer to show.
+    if (vpx_rb_read_bit(&rb)) {  // show an existing frame
+      vpx_rb_read_literal(&rb, 3);  // Frame buffer to show.
       return VPX_CODEC_OK;
     }
 
     if (data_sz <= 8)
       return VPX_CODEC_UNSUP_BITSTREAM;
 
-    si->is_kf = !vp9_rb_read_bit(&rb);
-    show_frame = vp9_rb_read_bit(&rb);
-    error_resilient = vp9_rb_read_bit(&rb);
+    si->is_kf = !vpx_rb_read_bit(&rb);
+    show_frame = vpx_rb_read_bit(&rb);
+    error_resilient = vpx_rb_read_bit(&rb);
 
     if (si->is_kf) {
       if (!vp9_read_sync_code(&rb))
@@ -224,7 +224,7 @@
         return VPX_CODEC_UNSUP_BITSTREAM;
       vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h);
     } else {
-      intra_only_flag = show_frame ? 0 : vp9_rb_read_bit(&rb);
+      intra_only_flag = show_frame ? 0 : vpx_rb_read_bit(&rb);
 
       rb.bit_offset += error_resilient ? 0 : 2;  // reset_frame_context
 

diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 94cc7ba..8836c1f 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk

@@ -37,10 +37,6 @@
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_fastssim.c
 VP9_CX_SRCS-yes += encoder/vp9_firstpass.c
 VP9_CX_SRCS-yes += encoder/vp9_block.h
-VP9_CX_SRCS-yes += encoder/vp9_writer.h
-VP9_CX_SRCS-yes += encoder/vp9_writer.c
-VP9_CX_SRCS-yes += encoder/vp9_write_bit_buffer.c
-VP9_CX_SRCS-yes += encoder/vp9_write_bit_buffer.h
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemb.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemv.h
@@ -104,7 +100,6 @@
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
 endif
 

diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index c105adb..0e9cf16 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk

@@ -21,10 +21,6 @@
 VP9_DX_SRCS-yes += decoder/vp9_decodeframe.c
 VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.c
-VP9_DX_SRCS-yes += decoder/vp9_reader.h
-VP9_DX_SRCS-yes += decoder/vp9_reader.c
-VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.c
-VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.h
 VP9_DX_SRCS-yes += decoder/vp9_decodemv.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.h
 VP9_DX_SRCS-yes += decoder/vp9_dthread.c

diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm b/vpx_dsp/arm/loopfilter_16_neon.asm
similarity index 94%
rename from vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm
rename to vpx_dsp/arm/loopfilter_16_neon.asm
index 5b8ec20..5a8fdd6 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm
+++ b/vpx_dsp/arm/loopfilter_16_neon.asm

@@ -8,12 +8,12 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_lpf_horizontal_4_dual_neon|
+    EXPORT  |vpx_lpf_horizontal_4_dual_neon|
     ARM
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-;void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
+;void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
 ;                                    const uint8_t *blimit0,
 ;                                    const uint8_t *limit0,
 ;                                    const uint8_t *thresh0,
@@ -29,7 +29,7 @@
 ; sp+8  const uint8_t *limit1,
 ; sp+12 const uint8_t *thresh1,
 
-|vp9_lpf_horizontal_4_dual_neon| PROC
+|vpx_lpf_horizontal_4_dual_neon| PROC
     push        {lr}
 
     ldr         r12, [sp, #4]              ; load thresh0
@@ -66,7 +66,7 @@
     sub         r2, r2, r1, lsl #1
     sub         r3, r3, r1, lsl #1
 
-    bl          vp9_loop_filter_neon_16
+    bl          vpx_loop_filter_neon_16
 
     vst1.u8     {q5}, [r2@64], r1          ; store op1
     vst1.u8     {q6}, [r3@64], r1          ; store op0
@@ -76,9 +76,9 @@
     vpop        {d8-d15}                   ; restore neon registers
 
     pop         {pc}
-    ENDP        ; |vp9_lpf_horizontal_4_dual_neon|
+    ENDP        ; |vpx_lpf_horizontal_4_dual_neon|
 
-; void vp9_loop_filter_neon_16();
+; void vpx_loop_filter_neon_16();
 ; This is a helper function for the loopfilters. The invidual functions do the
 ; necessary load, transpose (if necessary) and store. This function uses
 ; registers d8-d15, so the calling function must save those registers.
@@ -101,7 +101,7 @@
 ; q6    op0
 ; q7    oq0
 ; q8    oq1
-|vp9_loop_filter_neon_16| PROC
+|vpx_loop_filter_neon_16| PROC
 
     ; filter_mask
     vabd.u8     q11, q3, q4                 ; m1 = abs(p3 - p2)
@@ -194,6 +194,6 @@
     veor        q8, q12, q10                ; *oq1 = u^0x80
 
     bx          lr
-    ENDP        ; |vp9_loop_filter_neon_16|
+    ENDP        ; |vpx_loop_filter_neon_16|
 
     END

diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vpx_dsp/arm/loopfilter_16_neon.c
similarity index 93%
rename from vp9/common/arm/neon/vp9_loopfilter_16_neon.c
rename to vpx_dsp/arm/loopfilter_16_neon.c
index c69ee10..d24e6ad 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/vpx_dsp/arm/loopfilter_16_neon.c

@@ -10,11 +10,11 @@
 
 #include <arm_neon.h>
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
-static INLINE void vp9_loop_filter_neon_16(
+static INLINE void loop_filter_neon_16(
         uint8x16_t qblimit,  // blimit
         uint8x16_t qlimit,   // limit
         uint8x16_t qthresh,  // thresh
@@ -124,7 +124,7 @@
     return;
 }
 
-void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
+void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
@@ -163,9 +163,9 @@
     s += p;
     q10u8 = vld1q_u8(s);
 
-    vp9_loop_filter_neon_16(qblimit, qlimit, qthresh,
-                            q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8,
-                            &q5u8, &q6u8, &q7u8, &q8u8);
+    loop_filter_neon_16(qblimit, qlimit, qthresh,
+                        q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8,
+                        &q5u8, &q6u8, &q7u8, &q8u8);
 
     s -= (p * 5);
     vst1q_u8(s, q5u8);

diff --git a/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm b/vpx_dsp/arm/loopfilter_4_neon.asm
similarity index 91%
rename from vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm
rename to vpx_dsp/arm/loopfilter_4_neon.asm
index 7738e0d..e45e34c 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm
+++ b/vpx_dsp/arm/loopfilter_4_neon.asm

@@ -8,18 +8,18 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_lpf_horizontal_4_neon|
-    EXPORT  |vp9_lpf_vertical_4_neon|
+    EXPORT  |vpx_lpf_horizontal_4_neon|
+    EXPORT  |vpx_lpf_vertical_4_neon|
     ARM
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
 ; works on 16 iterations at a time.
 ; TODO(fgalligan): See about removing the count code as this function is only
 ; called with a count of 1.
 ;
-; void vp9_lpf_horizontal_4_neon(uint8_t *s,
+; void vpx_lpf_horizontal_4_neon(uint8_t *s,
 ;                                int p /* pitch */,
 ;                                const uint8_t *blimit,
 ;                                const uint8_t *limit,
@@ -32,7 +32,7 @@
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
 ; sp+4  int count
-|vp9_lpf_horizontal_4_neon| PROC
+|vpx_lpf_horizontal_4_neon| PROC
     push        {lr}
 
     vld1.8      {d0[]}, [r2]               ; duplicate *blimit
@@ -41,7 +41,7 @@
     add         r1, r1, r1                 ; double pitch
 
     cmp         r12, #0
-    beq         end_vp9_lf_h_edge
+    beq         end_vpx_lf_h_edge
 
     vld1.8      {d1[]}, [r3]               ; duplicate *limit
     vld1.8      {d2[]}, [r2]               ; duplicate *thresh
@@ -62,7 +62,7 @@
     sub         r2, r2, r1, lsl #1
     sub         r3, r3, r1, lsl #1
 
-    bl          vp9_loop_filter_neon
+    bl          vpx_loop_filter_neon
 
     vst1.u8     {d4}, [r2@64], r1          ; store op1
     vst1.u8     {d5}, [r3@64], r1          ; store op0
@@ -73,16 +73,16 @@
     subs        r12, r12, #1
     bne         count_lf_h_loop
 
-end_vp9_lf_h_edge
+end_vpx_lf_h_edge
     pop         {pc}
-    ENDP        ; |vp9_lpf_horizontal_4_neon|
+    ENDP        ; |vpx_lpf_horizontal_4_neon|
 
-; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
 ; works on 16 iterations at a time.
 ; TODO(fgalligan): See about removing the count code as this function is only
 ; called with a count of 1.
 ;
-; void vp9_lpf_vertical_4_neon(uint8_t *s,
+; void vpx_lpf_vertical_4_neon(uint8_t *s,
 ;                              int p /* pitch */,
 ;                              const uint8_t *blimit,
 ;                              const uint8_t *limit,
@@ -95,7 +95,7 @@
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
 ; sp+4  int count
-|vp9_lpf_vertical_4_neon| PROC
+|vpx_lpf_vertical_4_neon| PROC
     push        {lr}
 
     vld1.8      {d0[]}, [r2]              ; duplicate *blimit
@@ -105,7 +105,7 @@
     ldr         r3, [sp, #4]              ; load thresh
     sub         r2, r0, #4                ; move s pointer down by 4 columns
     cmp         r12, #0
-    beq         end_vp9_lf_v_edge
+    beq         end_vpx_lf_v_edge
 
     vld1.8      {d2[]}, [r3]              ; duplicate *thresh
 
@@ -135,7 +135,7 @@
     vtrn.8      d7, d16
     vtrn.8      d17, d18
 
-    bl          vp9_loop_filter_neon
+    bl          vpx_loop_filter_neon
 
     sub         r0, r0, #2
 
@@ -154,11 +154,11 @@
     subne       r2, r0, #4                 ; move s pointer down by 4 columns
     bne         count_lf_v_loop
 
-end_vp9_lf_v_edge
+end_vpx_lf_v_edge
     pop         {pc}
-    ENDP        ; |vp9_lpf_vertical_4_neon|
+    ENDP        ; |vpx_lpf_vertical_4_neon|
 
-; void vp9_loop_filter_neon();
+; void vpx_loop_filter_neon();
 ; This is a helper function for the loopfilters. The invidual functions do the
 ; necessary load, transpose (if necessary) and store. The function does not use
 ; registers d8-d15.
@@ -182,7 +182,7 @@
 ; d5    op0
 ; d6    oq0
 ; d7    oq1
-|vp9_loop_filter_neon| PROC
+|vpx_loop_filter_neon| PROC
     ; filter_mask
     vabd.u8     d19, d3, d4                 ; m1 = abs(p3 - p2)
     vabd.u8     d20, d4, d5                 ; m2 = abs(p2 - p1)
@@ -272,6 +272,6 @@
     veor        d7, d20, d18                ; *oq1 = u^0x80
 
     bx          lr
-    ENDP        ; |vp9_loop_filter_neon|
+    ENDP        ; |vpx_loop_filter_neon|
 
     END

diff --git a/vp9/common/arm/neon/vp9_loopfilter_4_neon.c b/vpx_dsp/arm/loopfilter_4_neon.c
similarity index 89%
rename from vp9/common/arm/neon/vp9_loopfilter_4_neon.c
rename to vpx_dsp/arm/loopfilter_4_neon.c
index fd9db61..7ad411a 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_4_neon.c
+++ b/vpx_dsp/arm/loopfilter_4_neon.c

@@ -10,9 +10,9 @@
 
 #include <arm_neon.h>
 
-#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
-static INLINE void vp9_loop_filter_neon(
+static INLINE void loop_filter_neon(
         uint8x8_t dblimit,    // flimit
         uint8x8_t dlimit,     // limit
         uint8x8_t dthresh,    // thresh
@@ -110,19 +110,19 @@
     return;
 }
 
-void vp9_lpf_horizontal_4_neon(
-        unsigned char *src,
+void vpx_lpf_horizontal_4_neon(
+        uint8_t *src,
         int pitch,
-        unsigned char *blimit,
-        unsigned char *limit,
-        unsigned char *thresh,
+        const uint8_t *blimit,
+        const uint8_t *limit,
+        const uint8_t *thresh,
         int count) {
     int i;
     uint8_t *s, *psrc;
     uint8x8_t dblimit, dlimit, dthresh;
     uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
 
-    if (count == 0)  // end_vp9_lf_h_edge
+    if (count == 0)  // end_vpx_lf_h_edge
         return;
 
     dblimit = vld1_u8(blimit);
@@ -149,9 +149,9 @@
         s += pitch;
         d18u8 = vld1_u8(s);
 
-        vp9_loop_filter_neon(dblimit, dlimit, dthresh,
-                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
-                             &d4u8, &d5u8, &d6u8, &d7u8);
+        loop_filter_neon(dblimit, dlimit, dthresh,
+                         d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                         &d4u8, &d5u8, &d6u8, &d7u8);
 
         s -= (pitch * 5);
         vst1_u8(s, d4u8);
@@ -165,12 +165,12 @@
     return;
 }
 
-void vp9_lpf_vertical_4_neon(
-        unsigned char *src,
+void vpx_lpf_vertical_4_neon(
+        uint8_t *src,
         int pitch,
-        unsigned char *blimit,
-        unsigned char *limit,
-        unsigned char *thresh,
+        const uint8_t *blimit,
+        const uint8_t *limit,
+        const uint8_t *thresh,
         int count) {
     int i, pitch8;
     uint8_t *s;
@@ -181,7 +181,7 @@
     uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
     uint8x8x4_t d4Result;
 
-    if (count == 0)  // end_vp9_lf_h_edge
+    if (count == 0)  // end_vpx_lf_h_edge
         return;
 
     dblimit = vld1_u8(blimit);
@@ -244,9 +244,9 @@
         d17u8 = d2tmp11.val[0];
         d18u8 = d2tmp11.val[1];
 
-        vp9_loop_filter_neon(dblimit, dlimit, dthresh,
-                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
-                             &d4u8, &d5u8, &d6u8, &d7u8);
+        loop_filter_neon(dblimit, dlimit, dthresh,
+                         d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                         &d4u8, &d5u8, &d6u8, &d7u8);
 
         d4Result.val[0] = d4u8;
         d4Result.val[1] = d5u8;

diff --git a/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm b/vpx_dsp/arm/loopfilter_8_neon.asm
similarity index 94%
rename from vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm
rename to vpx_dsp/arm/loopfilter_8_neon.asm
index 91aaec0..e81734c 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm
+++ b/vpx_dsp/arm/loopfilter_8_neon.asm

@@ -8,18 +8,18 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_lpf_horizontal_8_neon|
-    EXPORT  |vp9_lpf_vertical_8_neon|
+    EXPORT  |vpx_lpf_horizontal_8_neon|
+    EXPORT  |vpx_lpf_vertical_8_neon|
     ARM
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
 ; works on 16 iterations at a time.
 ; TODO(fgalligan): See about removing the count code as this function is only
 ; called with a count of 1.
 ;
-; void vp9_lpf_horizontal_8_neon(uint8_t *s, int p,
+; void vpx_lpf_horizontal_8_neon(uint8_t *s, int p,
 ;                                const uint8_t *blimit,
 ;                                const uint8_t *limit,
 ;                                const uint8_t *thresh,
@@ -30,7 +30,7 @@
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
 ; sp+4  int count
-|vp9_lpf_horizontal_8_neon| PROC
+|vpx_lpf_horizontal_8_neon| PROC
     push        {r4-r5, lr}
 
     vld1.8      {d0[]}, [r2]               ; duplicate *blimit
@@ -39,7 +39,7 @@
     add         r1, r1, r1                 ; double pitch
 
     cmp         r12, #0
-    beq         end_vp9_mblf_h_edge
+    beq         end_vpx_mblf_h_edge
 
     vld1.8      {d1[]}, [r3]               ; duplicate *limit
     vld1.8      {d2[]}, [r2]               ; duplicate *thresh
@@ -60,7 +60,7 @@
     sub         r3, r3, r1, lsl #1
     sub         r2, r2, r1, lsl #2
 
-    bl          vp9_mbloop_filter_neon
+    bl          vpx_mbloop_filter_neon
 
     vst1.u8     {d0}, [r2@64], r1          ; store op2
     vst1.u8     {d1}, [r3@64], r1          ; store op1
@@ -73,12 +73,12 @@
     subs        r12, r12, #1
     bne         count_mblf_h_loop
 
-end_vp9_mblf_h_edge
+end_vpx_mblf_h_edge
     pop         {r4-r5, pc}
 
-    ENDP        ; |vp9_lpf_horizontal_8_neon|
+    ENDP        ; |vpx_lpf_horizontal_8_neon|
 
-; void vp9_lpf_vertical_8_neon(uint8_t *s,
+; void vpx_lpf_vertical_8_neon(uint8_t *s,
 ;                              int pitch,
 ;                              const uint8_t *blimit,
 ;                              const uint8_t *limit,
@@ -91,7 +91,7 @@
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
 ; sp+4  int count
-|vp9_lpf_vertical_8_neon| PROC
+|vpx_lpf_vertical_8_neon| PROC
     push        {r4-r5, lr}
 
     vld1.8      {d0[]}, [r2]              ; duplicate *blimit
@@ -101,7 +101,7 @@
     ldr         r3, [sp, #12]             ; load thresh
     sub         r2, r0, #4                ; move s pointer down by 4 columns
     cmp         r12, #0
-    beq         end_vp9_mblf_v_edge
+    beq         end_vpx_mblf_v_edge
 
     vld1.8      {d2[]}, [r3]              ; duplicate *thresh
 
@@ -134,7 +134,7 @@
     sub         r2, r0, #3
     add         r3, r0, #1
 
-    bl          vp9_mbloop_filter_neon
+    bl          vpx_mbloop_filter_neon
 
     ;store op2, op1, op0, oq0
     vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
@@ -161,11 +161,11 @@
     subne       r2, r0, #4                 ; move s pointer down by 4 columns
     bne         count_mblf_v_loop
 
-end_vp9_mblf_v_edge
+end_vpx_mblf_v_edge
     pop         {r4-r5, pc}
-    ENDP        ; |vp9_lpf_vertical_8_neon|
+    ENDP        ; |vpx_lpf_vertical_8_neon|
 
-; void vp9_mbloop_filter_neon();
+; void vpx_mbloop_filter_neon();
 ; This is a helper function for the loopfilters. The invidual functions do the
 ; necessary load, transpose (if necessary) and store. The function does not use
 ; registers d8-d15.
@@ -191,7 +191,7 @@
 ; d3    oq0
 ; d4    oq1
 ; d5    oq2
-|vp9_mbloop_filter_neon| PROC
+|vpx_mbloop_filter_neon| PROC
     ; filter_mask
     vabd.u8     d19, d3, d4                ; m1 = abs(p3 - p2)
     vabd.u8     d20, d4, d5                ; m2 = abs(p2 - p1)
@@ -446,6 +446,6 @@
 
     bx          lr
 
-    ENDP        ; |vp9_mbloop_filter_neon|
+    ENDP        ; |vpx_mbloop_filter_neon|
 
     END

diff --git a/vp9/common/arm/neon/vp9_loopfilter_8_neon.c b/vpx_dsp/arm/loopfilter_8_neon.c
similarity index 93%
rename from vp9/common/arm/neon/vp9_loopfilter_8_neon.c
rename to vpx_dsp/arm/loopfilter_8_neon.c
index 33068a8..a887e2e 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_8_neon.c
+++ b/vpx_dsp/arm/loopfilter_8_neon.c

@@ -10,9 +10,9 @@
 
 #include <arm_neon.h>
 
-#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
-static INLINE void vp9_mbloop_filter_neon(
+static INLINE void mbloop_filter_neon(
         uint8x8_t dblimit,   // mblimit
         uint8x8_t dlimit,    // limit
         uint8x8_t dthresh,   // thresh
@@ -263,12 +263,12 @@
     return;
 }
 
-void vp9_lpf_horizontal_8_neon(
-        unsigned char *src,
+void vpx_lpf_horizontal_8_neon(
+        uint8_t *src,
         int pitch,
-        unsigned char *blimit,
-        unsigned char *limit,
-        unsigned char *thresh,
+        const uint8_t *blimit,
+        const uint8_t *limit,
+        const uint8_t *thresh,
         int count) {
     int i;
     uint8_t *s, *psrc;
@@ -276,7 +276,7 @@
     uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
     uint8x8_t d16u8, d17u8, d18u8;
 
-    if (count == 0)  // end_vp9_mblf_h_edge
+    if (count == 0)  // end_vpx_mblf_h_edge
         return;
 
     dblimit = vld1_u8(blimit);
@@ -303,9 +303,9 @@
         s += pitch;
         d18u8 = vld1_u8(s);
 
-        vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
-                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
-                             &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+        mbloop_filter_neon(dblimit, dlimit, dthresh,
+                           d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                           &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
 
         s -= (pitch * 6);
         vst1_u8(s, d0u8);
@@ -323,12 +323,12 @@
     return;
 }
 
-void vp9_lpf_vertical_8_neon(
-        unsigned char *src,
+void vpx_lpf_vertical_8_neon(
+        uint8_t *src,
         int pitch,
-        unsigned char *blimit,
-        unsigned char *limit,
-        unsigned char *thresh,
+        const uint8_t *blimit,
+        const uint8_t *limit,
+        const uint8_t *thresh,
         int count) {
     int i;
     uint8_t *s;
@@ -403,9 +403,9 @@
         d17u8 = d2tmp11.val[0];
         d18u8 = d2tmp11.val[1];
 
-        vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
-                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
-                             &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+        mbloop_filter_neon(dblimit, dlimit, dthresh,
+                           d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                           &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
 
         d4Result.val[0] = d0u8;
         d4Result.val[1] = d1u8;

diff --git a/vp9/common/arm/neon/vp9_mb_lpf_neon.asm b/vpx_dsp/arm/loopfilter_mb_neon.asm
similarity index 96%
rename from vp9/common/arm/neon/vp9_mb_lpf_neon.asm
rename to vpx_dsp/arm/loopfilter_mb_neon.asm
index 5fe2bba..20d9cfb 100644
--- a/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
+++ b/vpx_dsp/arm/loopfilter_mb_neon.asm

@@ -8,13 +8,13 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_lpf_horizontal_16_neon|
-    EXPORT  |vp9_lpf_vertical_16_neon|
+    EXPORT  |vpx_lpf_horizontal_16_neon|
+    EXPORT  |vpx_lpf_vertical_16_neon|
     ARM
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; void vp9_lpf_horizontal_16_neon(uint8_t *s, int p,
+; void vpx_lpf_horizontal_16_neon(uint8_t *s, int p,
 ;                                 const uint8_t *blimit,
 ;                                 const uint8_t *limit,
 ;                                 const uint8_t *thresh
@@ -24,7 +24,7 @@
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-|vp9_lpf_horizontal_16_neon| PROC
+|vpx_lpf_horizontal_16_neon| PROC
     push        {r4-r8, lr}
     vpush       {d8-d15}
     ldr         r4, [sp, #88]              ; load thresh
@@ -54,7 +54,7 @@
     vld1.u8     {d14}, [r8@64], r1         ; q6
     vld1.u8     {d15}, [r8@64], r1         ; q7
 
-    bl          vp9_wide_mbfilter_neon
+    bl          vpx_wide_mbfilter_neon
 
     tst         r7, #1
     beq         h_mbfilter
@@ -115,9 +115,9 @@
     vpop        {d8-d15}
     pop         {r4-r8, pc}
 
-    ENDP        ; |vp9_lpf_horizontal_16_neon|
+    ENDP        ; |vpx_lpf_horizontal_16_neon|
 
-; void vp9_lpf_vertical_16_neon(uint8_t *s, int p,
+; void vpx_lpf_vertical_16_neon(uint8_t *s, int p,
 ;                               const uint8_t *blimit,
 ;                               const uint8_t *limit,
 ;                               const uint8_t *thresh)
@@ -126,7 +126,7 @@
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-|vp9_lpf_vertical_16_neon| PROC
+|vpx_lpf_vertical_16_neon| PROC
     push        {r4-r8, lr}
     vpush       {d8-d15}
     ldr         r4, [sp, #88]              ; load thresh
@@ -176,7 +176,7 @@
     vtrn.8      d12, d13
     vtrn.8      d14, d15
 
-    bl          vp9_wide_mbfilter_neon
+    bl          vpx_wide_mbfilter_neon
 
     tst         r7, #1
     beq         v_mbfilter
@@ -279,9 +279,9 @@
     vpop        {d8-d15}
     pop         {r4-r8, pc}
 
-    ENDP        ; |vp9_lpf_vertical_16_neon|
+    ENDP        ; |vpx_lpf_vertical_16_neon|
 
-; void vp9_wide_mbfilter_neon();
+; void vpx_wide_mbfilter_neon();
 ; This is a helper function for the loopfilters. The invidual functions do the
 ; necessary load, transpose (if necessary) and store.
 ;
@@ -305,7 +305,7 @@
 ; d13   q5
 ; d14   q6
 ; d15   q7
-|vp9_wide_mbfilter_neon| PROC
+|vpx_wide_mbfilter_neon| PROC
     mov         r7, #0
 
     ; filter_mask
@@ -601,6 +601,6 @@
     vbif        d3, d14, d17               ; oq6 |= q6 & ~(f2 & f & m)
 
     bx          lr
-    ENDP        ; |vp9_wide_mbfilter_neon|
+    ENDP        ; |vpx_wide_mbfilter_neon|
 
     END

diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c
similarity index 70%
rename from vp9/common/arm/neon/vp9_loopfilter_neon.c
rename to vpx_dsp/arm/loopfilter_neon.c
index 31fcc63..eff87d2 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_neon.c
+++ b/vpx_dsp/arm/loopfilter_neon.c

@@ -10,49 +10,49 @@
 
 #include <arm_neon.h>
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
-void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
+void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p,
                                   const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
-  vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
 }
 
 #if HAVE_NEON_ASM
-void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
+void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
                                     const uint8_t *blimit1,
                                     const uint8_t *limit1,
                                     const uint8_t *thresh1) {
-  vp9_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p,
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p,
                                   const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
-  vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
+void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p,
                                    const uint8_t *blimit,
                                    const uint8_t *limit,
                                    const uint8_t *thresh) {
-  vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
-  vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
 }
 #endif  // HAVE_NEON_ASM

diff --git a/vp9/decoder/vp9_reader.c b/vpx_dsp/bitreader.c
similarity index 89%
rename from vp9/decoder/vp9_reader.c
rename to vpx_dsp/bitreader.c
index 2c96f74..3eae922 100644
--- a/vp9/decoder/vp9_reader.c
+++ b/vpx_dsp/bitreader.c

@@ -7,13 +7,13 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include "./bitreader.h"
+#include "./prob.h"
 
 #include "vpx_ports/mem.h"
 #include "vpx_mem/vpx_mem.h"
 
-#include "vp9/decoder/vp9_reader.h"
-
-int vp9_reader_init(vp9_reader *r,
+int vpx_reader_init(vpx_reader *r,
                     const uint8_t *buffer,
                     size_t size,
                     vpx_decrypt_cb decrypt_cb,
@@ -28,12 +28,12 @@
     r->range = 255;
     r->decrypt_cb = decrypt_cb;
     r->decrypt_state = decrypt_state;
-    vp9_reader_fill(r);
-    return vp9_read_bit(r) != 0;  // marker bit
+    vpx_reader_fill(r);
+    return vpx_read_bit(r) != 0;  // marker bit
   }
 }
 
-void vp9_reader_fill(vp9_reader *r) {
+void vpx_reader_fill(vpx_reader *r) {
   const uint8_t *const buffer_end = r->buffer_end;
   const uint8_t *buffer = r->buffer;
   const uint8_t *buffer_start = buffer;
@@ -73,7 +73,7 @@
   r->count = count;
 }
 
-const uint8_t *vp9_reader_find_end(vp9_reader *r) {
+const uint8_t *vpx_reader_find_end(vpx_reader *r) {
   // Find the end of the coded buffer
   while (r->count > CHAR_BIT && r->count < BD_VALUE_SIZE) {
     r->count -= CHAR_BIT;

diff --git a/vp9/decoder/vp9_reader.h b/vpx_dsp/bitreader.h
similarity index 77%
rename from vp9/decoder/vp9_reader.h
rename to vpx_dsp/bitreader.h
index 4959985..e817c8b 100644
--- a/vp9/decoder/vp9_reader.h
+++ b/vpx_dsp/bitreader.h

@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_DECODER_VP9_READER_H_
-#define VP9_DECODER_VP9_READER_H_
+#ifndef VPX_DSP_BITREADER_H_
+#define VPX_DSP_BITREADER_H_
 
 #include <stddef.h>
 #include <limits.h>
@@ -18,8 +18,7 @@
 #include "vpx_ports/mem.h"
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_integer.h"
-
-#include "vp9/common/vp9_prob.h"
+#include "vpx_dsp/prob.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -44,19 +43,19 @@
   vpx_decrypt_cb decrypt_cb;
   void *decrypt_state;
   uint8_t clear_buffer[sizeof(BD_VALUE) + 1];
-} vp9_reader;
+} vpx_reader;
 
-int vp9_reader_init(vp9_reader *r,
+int vpx_reader_init(vpx_reader *r,
                     const uint8_t *buffer,
                     size_t size,
                     vpx_decrypt_cb decrypt_cb,
                     void *decrypt_state);
 
-void vp9_reader_fill(vp9_reader *r);
+void vpx_reader_fill(vpx_reader *r);
 
-const uint8_t *vp9_reader_find_end(vp9_reader *r);
+const uint8_t *vpx_reader_find_end(vpx_reader *r);
 
-static INLINE int vp9_reader_has_error(vp9_reader *r) {
+static INLINE int vpx_reader_has_error(vpx_reader *r) {
   // Check if we have reached the end of the buffer.
   //
   // Variable 'count' stores the number of bits in the 'value' buffer, minus
@@ -74,7 +73,7 @@
   return r->count > BD_VALUE_SIZE && r->count < LOTS_OF_BITS;
 }
 
-static INLINE int vp9_read(vp9_reader *r, int prob) {
+static INLINE int vpx_read(vpx_reader *r, int prob) {
   unsigned int bit = 0;
   BD_VALUE value;
   BD_VALUE bigsplit;
@@ -83,7 +82,7 @@
   unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT;
 
   if (r->count < 0)
-    vp9_reader_fill(r);
+    vpx_reader_fill(r);
 
   value = r->value;
   count = r->count;
@@ -99,7 +98,7 @@
   }
 
   {
-    register unsigned int shift = vp9_norm[range];
+    register unsigned int shift = vpx_norm[range];
     range <<= shift;
     value <<= shift;
     count -= shift;
@@ -111,24 +110,24 @@
   return bit;
 }
 
-static INLINE int vp9_read_bit(vp9_reader *r) {
-  return vp9_read(r, 128);  // vp9_prob_half
+static INLINE int vpx_read_bit(vpx_reader *r) {
+  return vpx_read(r, 128);  // vpx_prob_half
 }
 
-static INLINE int vp9_read_literal(vp9_reader *r, int bits) {
+static INLINE int vpx_read_literal(vpx_reader *r, int bits) {
   int literal = 0, bit;
 
   for (bit = bits - 1; bit >= 0; bit--)
-    literal |= vp9_read_bit(r) << bit;
+    literal |= vpx_read_bit(r) << bit;
 
   return literal;
 }
 
-static INLINE int vp9_read_tree(vp9_reader *r, const vp9_tree_index *tree,
-                                const vp9_prob *probs) {
-  vp9_tree_index i = 0;
+static INLINE int vpx_read_tree(vpx_reader *r, const vpx_tree_index *tree,
+                                const vpx_prob *probs) {
+  vpx_tree_index i = 0;
 
-  while ((i = tree[i + vp9_read(r, probs[i >> 1])]) > 0)
+  while ((i = tree[i + vpx_read(r, probs[i >> 1])]) > 0)
     continue;
 
   return -i;
@@ -138,4 +137,4 @@
 }  // extern "C"
 #endif
 
-#endif  // VP9_DECODER_VP9_READER_H_
+#endif  // VPX_DSP_BITREADER_H_

diff --git a/vp9/decoder/vp9_read_bit_buffer.c b/vpx_dsp/bitreader_buffer.c
similarity index 67%
rename from vp9/decoder/vp9_read_bit_buffer.c
rename to vpx_dsp/bitreader_buffer.c
index c3b38a9..fb04ee6 100644
--- a/vp9/decoder/vp9_read_bit_buffer.c
+++ b/vpx_dsp/bitreader_buffer.c

@@ -7,13 +7,13 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "vp9/decoder/vp9_read_bit_buffer.h"
+#include "./bitreader_buffer.h"
 
-size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) {
+size_t vpx_rb_bytes_read(struct vpx_read_bit_buffer *rb) {
   return (rb->bit_offset + 7) >> 3;
 }
 
-int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) {
+int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb) {
   const size_t off = rb->bit_offset;
   const size_t p = off >> 3;
   const int q = 7 - (int)(off & 0x7);
@@ -27,15 +27,15 @@
   }
 }
 
-int vp9_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits) {
+int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits) {
   int value = 0, bit;
   for (bit = bits - 1; bit >= 0; bit--)
-    value |= vp9_rb_read_bit(rb) << bit;
+    value |= vpx_rb_read_bit(rb) << bit;
   return value;
 }
 
-int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb,
+int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb,
                                int bits) {
-  const int value = vp9_rb_read_literal(rb, bits);
-  return vp9_rb_read_bit(rb) ? -value : value;
+  const int value = vpx_rb_read_literal(rb, bits);
+  return vpx_rb_read_bit(rb) ? -value : value;
 }

diff --git a/vpx_dsp/bitreader_buffer.h b/vpx_dsp/bitreader_buffer.h
new file mode 100644
index 0000000..03b156b
--- /dev/null
+++ b/vpx_dsp/bitreader_buffer.h

@@ -0,0 +1,45 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_BITREADER_BUFFER_H_
+#define VPX_DSP_BITREADER_BUFFER_H_
+
+#include <limits.h>
+
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*vpx_rb_error_handler)(void *data);
+
+struct vpx_read_bit_buffer {
+  const uint8_t *bit_buffer;
+  const uint8_t *bit_buffer_end;
+  size_t bit_offset;
+
+  void *error_handler_data;
+  vpx_rb_error_handler error_handler;
+};
+
+size_t vpx_rb_bytes_read(struct vpx_read_bit_buffer *rb);
+
+int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb);
+
+int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits);
+
+int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_BITREADER_BUFFER_H_

diff --git a/vp9/encoder/vp9_writer.c b/vpx_dsp/bitwriter.c
similarity index 76%
rename from vp9/encoder/vp9_writer.c
rename to vpx_dsp/bitwriter.c
index ff461f2..5b232e3 100644
--- a/vp9/encoder/vp9_writer.c
+++ b/vpx_dsp/bitwriter.c

@@ -9,23 +9,23 @@
  */
 
 #include <assert.h>
-#include "vp9/encoder/vp9_writer.h"
-#include "vp9/common/vp9_entropy.h"
 
-void vp9_start_encode(vp9_writer *br, uint8_t *source) {
+#include "./bitwriter.h"
+
+void vpx_start_encode(vpx_writer *br, uint8_t *source) {
   br->lowvalue = 0;
   br->range    = 255;
   br->count    = -24;
   br->buffer   = source;
   br->pos      = 0;
-  vp9_write_bit(br, 0);
+  vpx_write_bit(br, 0);
 }
 
-void vp9_stop_encode(vp9_writer *br) {
+void vpx_stop_encode(vpx_writer *br) {
   int i;
 
   for (i = 0; i < 32; i++)
-    vp9_write_bit(br, 0);
+    vpx_write_bit(br, 0);
 
   // Ensure there's no ambigous collision with any index marker bytes
   if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0)

diff --git a/vp9/encoder/vp9_writer.h b/vpx_dsp/bitwriter.h
similarity index 70%
rename from vp9/encoder/vp9_writer.h
rename to vpx_dsp/bitwriter.h
index e347ea4..f6ca9b9 100644
--- a/vp9/encoder/vp9_writer.h
+++ b/vpx_dsp/bitwriter.h

@@ -8,29 +8,29 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_WRITER_H_
-#define VP9_ENCODER_VP9_WRITER_H_
+#ifndef VPX_DSP_BITWRITER_H_
+#define VPX_DSP_BITWRITER_H_
 
 #include "vpx_ports/mem.h"
 
-#include "vp9/common/vp9_prob.h"
+#include "vpx_dsp/prob.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-typedef struct vp9_writer {
+typedef struct vpx_writer {
   unsigned int lowvalue;
   unsigned int range;
   int count;
   unsigned int pos;
   uint8_t *buffer;
-} vp9_writer;
+} vpx_writer;
 
-void vp9_start_encode(vp9_writer *bc, uint8_t *buffer);
-void vp9_stop_encode(vp9_writer *bc);
+void vpx_start_encode(vpx_writer *bc, uint8_t *buffer);
+void vpx_stop_encode(vpx_writer *bc);
 
-static INLINE void vp9_write(vp9_writer *br, int bit, int probability) {
+static INLINE void vpx_write(vpx_writer *br, int bit, int probability) {
   unsigned int split;
   int count = br->count;
   unsigned int range = br->range;
@@ -46,7 +46,7 @@
     range = br->range - split;
   }
 
-  shift = vp9_norm[range];
+  shift = vpx_norm[range];
 
   range <<= shift;
   count += shift;
@@ -78,21 +78,21 @@
   br->range = range;
 }
 
-static INLINE void vp9_write_bit(vp9_writer *w, int bit) {
-  vp9_write(w, bit, 128);  // vp9_prob_half
+static INLINE void vpx_write_bit(vpx_writer *w, int bit) {
+  vpx_write(w, bit, 128);  // vpx_prob_half
 }
 
-static INLINE void vp9_write_literal(vp9_writer *w, int data, int bits) {
+static INLINE void vpx_write_literal(vpx_writer *w, int data, int bits) {
   int bit;
 
   for (bit = bits - 1; bit >= 0; bit--)
-    vp9_write_bit(w, 1 & (data >> bit));
+    vpx_write_bit(w, 1 & (data >> bit));
 }
 
-#define vp9_write_prob(w, v) vp9_write_literal((w), (v), 8)
+#define vpx_write_prob(w, v) vpx_write_literal((w), (v), 8)
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_WRITER_H_
+#endif  // VPX_DSP_BITWRITER_H_

diff --git a/vp9/encoder/vp9_write_bit_buffer.c b/vpx_dsp/bitwriter_buffer.c
similarity index 75%
rename from vp9/encoder/vp9_write_bit_buffer.c
rename to vpx_dsp/bitwriter_buffer.c
index 6d55e84..0dfb859 100644
--- a/vp9/encoder/vp9_write_bit_buffer.c
+++ b/vpx_dsp/bitwriter_buffer.c

@@ -9,13 +9,14 @@
  */
 
 #include <limits.h>
-#include "vp9/encoder/vp9_write_bit_buffer.h"
 
-size_t vp9_wb_bytes_written(const struct vp9_write_bit_buffer *wb) {
+#include "./bitwriter_buffer.h"
+
+size_t vpx_wb_bytes_written(const struct vpx_write_bit_buffer *wb) {
   return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
 }
 
-void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit) {
+void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit) {
   const int off = (int)wb->bit_offset;
   const int p = off / CHAR_BIT;
   const int q = CHAR_BIT - 1 - off % CHAR_BIT;
@@ -28,8 +29,8 @@
   wb->bit_offset = off + 1;
 }
 
-void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb, int data, int bits) {
+void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits) {
   int bit;
   for (bit = bits - 1; bit >= 0; bit--)
-    vp9_wb_write_bit(wb, (data >> bit) & 1);
+    vpx_wb_write_bit(wb, (data >> bit) & 1);
 }

diff --git a/vp9/encoder/vp9_write_bit_buffer.h b/vpx_dsp/bitwriter_buffer.h
similarity index 62%
rename from vp9/encoder/vp9_write_bit_buffer.h
rename to vpx_dsp/bitwriter_buffer.h
index 59f9bbe..9397668 100644
--- a/vp9/encoder/vp9_write_bit_buffer.h
+++ b/vpx_dsp/bitwriter_buffer.h

@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_
-#define VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_
+#ifndef VPX_DSP_BITWRITER_BUFFER_H_
+#define VPX_DSP_BITWRITER_BUFFER_H_
 
 #include "vpx/vpx_integer.h"
 
@@ -17,20 +17,20 @@
 extern "C" {
 #endif
 
-struct vp9_write_bit_buffer {
+struct vpx_write_bit_buffer {
   uint8_t *bit_buffer;
   size_t bit_offset;
 };
 
-size_t vp9_wb_bytes_written(const struct vp9_write_bit_buffer *wb);
+size_t vpx_wb_bytes_written(const struct vpx_write_bit_buffer *wb);
 
-void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit);
+void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit);
 
-void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb, int data, int bits);
+void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits);
 
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_
+#endif  // VPX_DSP_BITWRITER_BUFFER_H_

diff --git a/vp9/common/vp9_loopfilter_filters.c b/vpx_dsp/loopfilter.c
similarity index 91%
rename from vp9/common/vp9_loopfilter_filters.c
rename to vpx_dsp/loopfilter.c
index 3cf4c32..dc8aca5 100644
--- a/vp9/common/vp9_loopfilter_filters.c
+++ b/vpx_dsp/loopfilter.c

@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -9,10 +9,8 @@
  */
 
 #include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
 
 static INLINE int8_t signed_char_clamp(int t) {
   return (int8_t)clamp(t, -128, 127);
@@ -117,7 +115,7 @@
   *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
 }
 
-void vp9_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
+void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
                             const uint8_t *blimit, const uint8_t *limit,
                             const uint8_t *thresh, int count) {
   int i;
@@ -134,15 +132,15 @@
   }
 }
 
-void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
-  vp9_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh,
                           int count) {
   int i;
@@ -159,12 +157,12 @@
   }
 }
 
-void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                const uint8_t *limit0, const uint8_t *thresh0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1) {
-  vp9_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
+  vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
                                   thresh1, 1);
 }
 
@@ -189,7 +187,7 @@
   }
 }
 
-void vp9_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh,
                             int count) {
   int i;
@@ -209,15 +207,15 @@
   }
 }
 
-void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
-  vp9_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh,
                           int count) {
   int i;
@@ -234,12 +232,12 @@
   }
 }
 
-void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                const uint8_t *limit0, const uint8_t *thresh0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1) {
-  vp9_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
+  vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
                                     thresh1, 1);
 }
 
@@ -294,7 +292,7 @@
   }
 }
 
-void vp9_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh,
                              int count) {
   int i;
@@ -343,12 +341,12 @@
   }
 }
 
-void vp9_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
   mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
 }
 
-void vp9_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh) {
   mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
 }
@@ -448,7 +446,7 @@
   *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
 }
 
-void vp9_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh, int count, int bd) {
   int i;
@@ -471,7 +469,7 @@
   }
 }
 
-void vp9_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
                                         const uint8_t *blimit0,
                                         const uint8_t *limit0,
                                         const uint8_t *thresh0,
@@ -479,11 +477,11 @@
                                         const uint8_t *limit1,
                                         const uint8_t *thresh1,
                                         int bd) {
-  vp9_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1, bd);
-  vp9_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+  vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
 }
 
-void vp9_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int count, int bd) {
   int i;
@@ -500,7 +498,7 @@
   }
 }
 
-void vp9_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch,
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch,
                                       const uint8_t *blimit0,
                                       const uint8_t *limit0,
                                       const uint8_t *thresh0,
@@ -508,8 +506,8 @@
                                       const uint8_t *limit1,
                                       const uint8_t *thresh1,
                                       int bd) {
-  vp9_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
-  vp9_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
+  vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
                               thresh1, 1, bd);
 }
 
@@ -534,7 +532,7 @@
   }
 }
 
-void vp9_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int count, int bd) {
   int i;
@@ -556,7 +554,7 @@
   }
 }
 
-void vp9_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
                                         const uint8_t *blimit0,
                                         const uint8_t *limit0,
                                         const uint8_t *thresh0,
@@ -564,11 +562,11 @@
                                         const uint8_t *limit1,
                                         const uint8_t *thresh1,
                                         int bd) {
-  vp9_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1, bd);
-  vp9_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+  vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
 }
 
-void vp9_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int count, int bd) {
   int i;
@@ -588,7 +586,7 @@
   }
 }
 
-void vp9_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch,
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch,
                                       const uint8_t *blimit0,
                                       const uint8_t *limit0,
                                       const uint8_t *thresh0,
@@ -596,8 +594,8 @@
                                       const uint8_t *limit1,
                                       const uint8_t *thresh1,
                                       int bd) {
-  vp9_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
-  vp9_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
+  vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
                               thresh1, 1, bd);
 }
 
@@ -664,7 +662,7 @@
   }
 }
 
-void vp9_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit, const uint8_t *thresh,
                                     int count, int bd) {
   int i;
@@ -729,13 +727,13 @@
   }
 }
 
-void vp9_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int bd) {
   highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
 }
 
-void vp9_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
                                        const uint8_t *blimit,
                                        const uint8_t *limit,
                                        const uint8_t *thresh,

diff --git a/vpx_dsp/mips/common_dspr2.h b/vpx_dsp/mips/common_dspr2.h
new file mode 100644
index 0000000..8278101
--- /dev/null
+++ b/vpx_dsp/mips/common_dspr2.h

@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_COMMON_MIPS_DSPR2_H_
+#define VPX_COMMON_MIPS_DSPR2_H_
+
+#include <assert.h>
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if HAVE_DSPR2
+#define CROP_WIDTH 512
+
+static INLINE void prefetch_load(const unsigned char *src) {
+  __asm__ __volatile__ (
+      "pref   0,  0(%[src])   \n\t"
+      :
+      : [src] "r" (src)
+  );
+}
+
+/* prefetch data for store */
+static INLINE void prefetch_store(unsigned char *dst) {
+  __asm__ __volatile__ (
+      "pref   1,  0(%[dst])   \n\t"
+      :
+      : [dst] "r" (dst)
+  );
+}
+
+static INLINE void prefetch_load_streamed(const unsigned char *src) {
+  __asm__ __volatile__ (
+      "pref   4,  0(%[src])   \n\t"
+      :
+      : [src] "r" (src)
+  );
+}
+
+/* prefetch data for store */
+static INLINE void prefetch_store_streamed(unsigned char *dst) {
+  __asm__ __volatile__ (
+      "pref   5,  0(%[dst])   \n\t"
+      :
+      : [dst] "r" (dst)
+  );
+}
+#endif  // #if HAVE_DSPR2
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_COMMON_MIPS_DSPR2_H_

diff --git a/vp9/common/mips/msa/vp9_loopfilter_16_msa.c b/vpx_dsp/mips/loopfilter_16_msa.c
similarity index 95%
rename from vp9/common/mips/msa/vp9_loopfilter_16_msa.c
rename to vpx_dsp/mips/loopfilter_16_msa.c
index aeaa48e..b7c9f7b 100644
--- a/vp9/common/mips/msa/vp9_loopfilter_16_msa.c
+++ b/vpx_dsp/mips/loopfilter_16_msa.c

@@ -9,9 +9,9 @@
  */
 
 #include "vpx_ports/mem.h"
-#include "vp9/common/mips/msa/vp9_loopfilter_msa.h"
+#include "vpx_dsp/mips/loopfilter_msa.h"
 
-int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
+int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
                                  uint8_t *filter48,
                                  const uint8_t *b_limit_ptr,
                                  const uint8_t *limit_ptr,
@@ -79,7 +79,7 @@
   }
 }
 
-void vp9_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
+void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
   v16u8 flat, flat2, filter8;
   v16i8 zero = { 0 };
   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
@@ -405,7 +405,7 @@
   }
 }
 
-void vp9_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
                                     const uint8_t *b_limit_ptr,
                                     const uint8_t *limit_ptr,
                                     const uint8_t *thresh_ptr,
@@ -415,15 +415,15 @@
 
   (void)count;
 
-  early_exit = vp9_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
+  early_exit = vpx_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
                                         limit_ptr, thresh_ptr);
 
   if (0 == early_exit) {
-    vp9_hz_lpf_t16_16w(src, pitch, filter48);
+    vpx_hz_lpf_t16_16w(src, pitch, filter48);
   }
 }
 
-void vp9_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
                                const uint8_t *b_limit_ptr,
                                const uint8_t *limit_ptr,
                                const uint8_t *thresh_ptr,
@@ -643,13 +643,13 @@
       }
     }
   } else {
-    vp9_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr,
+    vpx_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr,
                                    thresh_ptr, count);
   }
 }
 
-static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
-                                       uint8_t *output, int32_t out_pitch) {
+static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
+                                   uint8_t *output, int32_t out_pitch) {
   v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
   v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
@@ -673,8 +673,8 @@
   ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
 }
 
-static void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
-                                       uint8_t *output, int32_t out_pitch) {
+static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
+                                   uint8_t *output, int32_t out_pitch) {
   v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
 
@@ -685,8 +685,8 @@
   ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
 }
 
-static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch,
-                                uint8_t *output, int32_t out_pitch) {
+static void transpose_16x16(uint8_t *input, int32_t in_pitch,
+                            uint8_t *output, int32_t out_pitch) {
   v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
   v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
@@ -744,7 +744,7 @@
   ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
 }
 
-int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
+int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
                                 uint8_t *src_org, int32_t pitch_org,
                                 const uint8_t *b_limit_ptr,
                                 const uint8_t *limit_ptr,
@@ -812,7 +812,7 @@
   }
 }
 
-int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                           uint8_t *filter48) {
   v16i8 zero = { 0 };
   v16u8 filter8, flat, flat2;
@@ -1032,7 +1032,7 @@
   }
 }
 
-void vp9_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr) {
@@ -1040,23 +1040,23 @@
   DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
   uint8_t *filter48 = &transposed_input[16 * 16];
 
-  vp9_transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
+  transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
 
-  early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
+  early_exit = vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
                                        &filter48[0], src, pitch, b_limit_ptr,
                                        limit_ptr, thresh_ptr);
 
   if (0 == early_exit) {
-    early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
+    early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
                                    &filter48[0]);
 
     if (0 == early_exit) {
-      vp9_transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
+      transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
     }
   }
 }
 
-int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
+int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
                                  uint8_t *src_org, int32_t pitch,
                                  const uint8_t *b_limit_ptr,
                                  const uint8_t *limit_ptr,
@@ -1134,7 +1134,7 @@
   }
 }
 
-int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                            uint8_t *filter48) {
   v16u8 flat, flat2, filter8;
   v16i8 zero = { 0 };
@@ -1455,7 +1455,7 @@
   }
 }
 
-void vp9_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit_ptr,
                                   const uint8_t *limit_ptr,
                                   const uint8_t *thresh_ptr) {
@@ -1463,18 +1463,18 @@
   DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
   uint8_t *filter48 = &transposed_input[16 * 16];
 
-  vp9_transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
+  transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
 
-  early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
+  early_exit = vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
                                         &filter48[0], src, pitch, b_limit_ptr,
                                         limit_ptr, thresh_ptr);
 
   if (0 == early_exit) {
-    early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
+    early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
                                     &filter48[0]);
 
     if (0 == early_exit) {
-      vp9_transpose_16x16(transposed_input, 16, (src - 8), pitch);
+      transpose_16x16(transposed_input, 16, (src - 8), pitch);
     }
   }
 }

diff --git a/vp9/common/mips/msa/vp9_loopfilter_4_msa.c b/vpx_dsp/mips/loopfilter_4_msa.c
similarity index 94%
rename from vp9/common/mips/msa/vp9_loopfilter_4_msa.c
rename to vpx_dsp/mips/loopfilter_4_msa.c
index 7f69135..daf5f38 100644
--- a/vp9/common/mips/msa/vp9_loopfilter_4_msa.c
+++ b/vpx_dsp/mips/loopfilter_4_msa.c

@@ -8,9 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp9/common/mips/msa/vp9_loopfilter_msa.h"
+#include "vpx_dsp/mips/loopfilter_msa.h"
 
-void vp9_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
                               const uint8_t *b_limit_ptr,
                               const uint8_t *limit_ptr,
                               const uint8_t *thresh_ptr,
@@ -39,7 +39,7 @@
   SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
 }
 
-void vp9_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
                                    const uint8_t *b_limit0_ptr,
                                    const uint8_t *limit0_ptr,
                                    const uint8_t *thresh0_ptr,
@@ -71,7 +71,7 @@
   ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
 }
 
-void vp9_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
                             const uint8_t *b_limit_ptr,
                             const uint8_t *limit_ptr,
                             const uint8_t *thresh_ptr,
@@ -102,7 +102,7 @@
   ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
 }
 
-void vp9_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
                                  const uint8_t *b_limit0_ptr,
                                  const uint8_t *limit0_ptr,
                                  const uint8_t *thresh0_ptr,

diff --git a/vp9/common/mips/msa/vp9_loopfilter_8_msa.c b/vpx_dsp/mips/loopfilter_8_msa.c
similarity index 97%
rename from vp9/common/mips/msa/vp9_loopfilter_8_msa.c
rename to vpx_dsp/mips/loopfilter_8_msa.c
index 26a858d..00b6db5 100644
--- a/vp9/common/mips/msa/vp9_loopfilter_8_msa.c
+++ b/vpx_dsp/mips/loopfilter_8_msa.c

@@ -8,9 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp9/common/mips/msa/vp9_loopfilter_msa.h"
+#include "vpx_dsp/mips/loopfilter_msa.h"
 
-void vp9_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
                               const uint8_t *b_limit_ptr,
                               const uint8_t *limit_ptr,
                               const uint8_t *thresh_ptr,
@@ -83,7 +83,7 @@
   }
 }
 
-void vp9_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
                                    const uint8_t *b_limit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
@@ -158,7 +158,7 @@
   }
 }
 
-void vp9_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
                             const uint8_t *b_limit_ptr,
                             const uint8_t *limit_ptr,
                             const uint8_t *thresh_ptr,
@@ -237,7 +237,7 @@
   }
 }
 
-void vp9_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
                                  const uint8_t *b_limit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0,

diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c b/vpx_dsp/mips/loopfilter_filters_dspr2.c
similarity index 85%
rename from vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
rename to vpx_dsp/mips/loopfilter_filters_dspr2.c
index 3df7f4c..99a96d8 100644
--- a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_filters_dspr2.c

@@ -10,17 +10,16 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
 
 #if HAVE_DSPR2
-void vp9_lpf_horizontal_4_dspr2(unsigned char *s,
+void vpx_lpf_horizontal_4_dspr2(unsigned char *s,
                                 int pitch,
                                 const uint8_t *blimit,
                                 const uint8_t *limit,
@@ -50,7 +49,7 @@
   );
 
   /* prefetch data for store */
-  vp9_prefetch_store(s);
+  prefetch_store(s);
 
   /* loop filter designed to work using chars so that we can make maximum use
      of 8 bit simd instructions. */
@@ -88,14 +87,14 @@
           : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6)
       );
 
-      vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
-                                pm1, p0, p3, p4, p5, p6,
-                                thresh_vec, &hev, &mask);
+      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
+                            pm1, p0, p3, p4, p5, p6,
+                            thresh_vec, &hev, &mask);
 
       /* if mask == 0 do filtering is not needed */
       if (mask) {
         /* filtering */
-        vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
 
         __asm__ __volatile__ (
             "sw     %[p1],  (%[s1])    \n\t"
@@ -114,7 +113,7 @@
   }
 }
 
-void vp9_lpf_vertical_4_dspr2(unsigned char *s,
+void vpx_lpf_vertical_4_dspr2(unsigned char *s,
                               int pitch,
                               const uint8_t *blimit,
                               const uint8_t *limit,
@@ -144,7 +143,7 @@
   );
 
   /* prefetch data for store */
-  vp9_prefetch_store(s + pitch);
+  prefetch_store(s + pitch);
 
   for (i = 0; i < 2; i++) {
     s1 = s;
@@ -217,14 +216,14 @@
      * mask will be zero and filtering is not needed
      */
     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
-      vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
-                                p0, p3, p4, p5, p6, thresh_vec,
-                                &hev, &mask);
+      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
+                            p0, p3, p4, p5, p6, thresh_vec,
+                            &hev, &mask);
 
       /* if mask == 0 do filtering is not needed */
       if (mask) {
         /* filtering */
-        vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
 
         /* unpack processed 4x4 neighborhood
          * don't use transpose on output data
@@ -307,56 +306,56 @@
   }
 }
 
-void vp9_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */,
+void vpx_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */,
                                      const uint8_t *blimit0,
                                      const uint8_t *limit0,
                                      const uint8_t *thresh0,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1) {
-  vp9_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
+void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
                                      const uint8_t *blimit0,
                                      const uint8_t *limit0,
                                      const uint8_t *thresh0,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1) {
-  vp9_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
+void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
                                    const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
-  vp9_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
+void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
                                    const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
-  vp9_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
+  vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
                                        1);
 }
 
-void vp9_lpf_vertical_16_dual_dspr2(uint8_t *s, int p,
+void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p,
                                     const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh) {
-  vp9_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
-  vp9_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
 }
 #endif  // #if HAVE_DSPR2

diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h b/vpx_dsp/mips/loopfilter_filters_dspr2.h
similarity index 85%
rename from vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
rename to vpx_dsp/mips/loopfilter_filters_dspr2.h
index 675db65..4a1506b 100644
--- a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
+++ b/vpx_dsp/mips/loopfilter_filters_dspr2.h

@@ -13,10 +13,10 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_onyxc_int.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -24,10 +24,10 @@
 
 #if HAVE_DSPR2
 /* inputs & outputs are quad-byte vectors */
-static INLINE void vp9_filter_dspr2(uint32_t mask, uint32_t hev,
-                                    uint32_t *ps1, uint32_t *ps0,
-                                    uint32_t *qs0, uint32_t *qs1) {
-  int32_t   vp9_filter_l, vp9_filter_r;
+static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
+                                uint32_t *ps1, uint32_t *ps0,
+                                uint32_t *qs0, uint32_t *qs1) {
+  int32_t   vpx_filter_l, vpx_filter_r;
   int32_t   Filter1_l, Filter1_r, Filter2_l, Filter2_r;
   int32_t   subr_r, subr_l;
   uint32_t  t1, t2, HWM, t3;
@@ -73,34 +73,34 @@
   hev_r = hev_r & HWM;
 
   __asm__ __volatile__ (
-      /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */
-      "subq_s.ph    %[vp9_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
-      "subq_s.ph    %[vp9_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
+      /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
+      "subq_s.ph    %[vpx_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
+      "subq_s.ph    %[vpx_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
 
       /* qs0 - ps0 */
       "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
       "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
 
-      /* vp9_filter &= hev; */
-      "and          %[vp9_filter_l], %[vp9_filter_l], %[hev_l]        \n\t"
-      "and          %[vp9_filter_r], %[vp9_filter_r], %[hev_r]        \n\t"
+      /* vpx_filter &= hev; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[hev_l]        \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[hev_r]        \n\t"
 
-      /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0)); */
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
       "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
       "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
 
-      /* vp9_filter &= mask; */
-      "and          %[vp9_filter_l], %[vp9_filter_l], %[mask_l]       \n\t"
-      "and          %[vp9_filter_r], %[vp9_filter_r], %[mask_r]       \n\t"
+      /* vpx_filter &= mask; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[mask_l]       \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[mask_r]       \n\t"
 
-      : [vp9_filter_l] "=&r" (vp9_filter_l),
-        [vp9_filter_r] "=&r" (vp9_filter_r),
+      : [vpx_filter_l] "=&r" (vpx_filter_l),
+        [vpx_filter_r] "=&r" (vpx_filter_r),
         [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
         [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
       : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
@@ -113,13 +113,13 @@
 
   /* save bottom 3 bits so that we round one side +4 and the other +3 */
   __asm__ __volatile__ (
-      /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >>= 3; */
-      "addq_s.ph    %[Filter1_l],    %[vp9_filter_l], %[t2]           \n\t"
-      "addq_s.ph    %[Filter1_r],    %[vp9_filter_r], %[t2]           \n\t"
+      /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */
+      "addq_s.ph    %[Filter1_l],    %[vpx_filter_l], %[t2]           \n\t"
+      "addq_s.ph    %[Filter1_r],    %[vpx_filter_r], %[t2]           \n\t"
 
-      /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >>= 3; */
-      "addq_s.ph    %[Filter2_l],    %[vp9_filter_l], %[t1]           \n\t"
-      "addq_s.ph    %[Filter2_r],    %[vp9_filter_r], %[t1]           \n\t"
+      /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >>= 3; */
+      "addq_s.ph    %[Filter2_l],    %[vpx_filter_l], %[t1]           \n\t"
+      "addq_s.ph    %[Filter2_r],    %[vpx_filter_r], %[t1]           \n\t"
       "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
       "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
 
@@ -142,23 +142,23 @@
         [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
         [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
       : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
-        [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r)
+        [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r)
   );
 
   __asm__ __volatile__ (
-      /* (vp9_filter += 1) >>= 1 */
+      /* (vpx_filter += 1) >>= 1 */
       "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
       "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
 
-      /* vp9_filter &= ~hev; */
+      /* vpx_filter &= ~hev; */
       "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
       "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
 
-      /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */
+      /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */
       "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
       "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
 
-      /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */
+      /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */
       "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
       "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
 
@@ -196,12 +196,12 @@
   *qs1 = vqs1 ^ N128;
 }
 
-static INLINE void vp9_filter1_dspr2(uint32_t mask, uint32_t hev,
-                                     uint32_t ps1, uint32_t ps0,
-                                     uint32_t qs0, uint32_t qs1,
-                                     uint32_t *p1_f0, uint32_t *p0_f0,
-                                     uint32_t *q0_f0, uint32_t *q1_f0) {
-  int32_t   vp9_filter_l, vp9_filter_r;
+static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
+                                 uint32_t ps1, uint32_t ps0,
+                                 uint32_t qs0, uint32_t qs1,
+                                 uint32_t *p1_f0, uint32_t *p0_f0,
+                                 uint32_t *q0_f0, uint32_t *q1_f0) {
+  int32_t   vpx_filter_l, vpx_filter_r;
   int32_t   Filter1_l, Filter1_r, Filter2_l, Filter2_r;
   int32_t   subr_r, subr_l;
   uint32_t  t1, t2, HWM, t3;
@@ -247,34 +247,34 @@
   hev_r = hev_r & HWM;
 
   __asm__ __volatile__ (
-      /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */
-      "subq_s.ph    %[vp9_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
-      "subq_s.ph    %[vp9_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
+      /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
+      "subq_s.ph    %[vpx_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
+      "subq_s.ph    %[vpx_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
 
       /* qs0 - ps0 */
       "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
       "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
 
-      /* vp9_filter &= hev; */
-      "and          %[vp9_filter_l], %[vp9_filter_l], %[hev_l]        \n\t"
-      "and          %[vp9_filter_r], %[vp9_filter_r], %[hev_r]        \n\t"
+      /* vpx_filter &= hev; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[hev_l]        \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[hev_r]        \n\t"
 
-      /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0)); */
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
       "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
       "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
 
-      /* vp9_filter &= mask; */
-      "and          %[vp9_filter_l], %[vp9_filter_l], %[mask_l]       \n\t"
-      "and          %[vp9_filter_r], %[vp9_filter_r], %[mask_r]       \n\t"
+      /* vpx_filter &= mask; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[mask_l]       \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[mask_r]       \n\t"
 
-      : [vp9_filter_l] "=&r" (vp9_filter_l),
-        [vp9_filter_r] "=&r" (vp9_filter_r),
+      : [vpx_filter_l] "=&r" (vpx_filter_l),
+        [vpx_filter_r] "=&r" (vpx_filter_r),
         [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
         [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
       : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
@@ -286,13 +286,13 @@
 
   /* save bottom 3 bits so that we round one side +4 and the other +3 */
   __asm__ __volatile__ (
-      /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >>= 3; */
-      "addq_s.ph    %[Filter1_l],    %[vp9_filter_l], %[t2]           \n\t"
-      "addq_s.ph    %[Filter1_r],    %[vp9_filter_r], %[t2]           \n\t"
+      /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */
+      "addq_s.ph    %[Filter1_l],    %[vpx_filter_l], %[t2]           \n\t"
+      "addq_s.ph    %[Filter1_r],    %[vpx_filter_r], %[t2]           \n\t"
 
-      /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >>= 3; */
-      "addq_s.ph    %[Filter2_l],    %[vp9_filter_l], %[t1]           \n\t"
-      "addq_s.ph    %[Filter2_r],    %[vp9_filter_r], %[t1]           \n\t"
+      /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >>= 3; */
+      "addq_s.ph    %[Filter2_l],    %[vpx_filter_l], %[t1]           \n\t"
+      "addq_s.ph    %[Filter2_r],    %[vpx_filter_r], %[t1]           \n\t"
       "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
       "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
 
@@ -315,23 +315,23 @@
         [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
         [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
       : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
-        [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r)
+        [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r)
   );
 
   __asm__ __volatile__ (
-      /* (vp9_filter += 1) >>= 1 */
+      /* (vpx_filter += 1) >>= 1 */
       "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
       "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
 
-      /* vp9_filter &= ~hev; */
+      /* vpx_filter &= ~hev; */
       "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
       "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
 
-      /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */
+      /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */
       "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
       "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
 
-      /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */
+      /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */
       "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
       "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
 
@@ -369,10 +369,10 @@
   *q1_f0 = vqs1 ^ N128;
 }
 
-static INLINE void vp9_mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
-                                      uint32_t *op1, uint32_t *op0,
-                                      uint32_t *oq0, uint32_t *oq1,
-                                      uint32_t *oq2, uint32_t *oq3) {
+static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
+                                  uint32_t *op1, uint32_t *op0,
+                                  uint32_t *oq0, uint32_t *oq1,
+                                  uint32_t *oq2, uint32_t *oq3) {
   /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
   const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
   const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
@@ -446,14 +446,14 @@
   *oq2 = res_oq2;
 }
 
-static INLINE void vp9_mbfilter1_dspr2(uint32_t p3, uint32_t p2,
-                                       uint32_t p1, uint32_t p0,
-                                       uint32_t q0, uint32_t q1,
-                                       uint32_t q2, uint32_t q3,
-                                       uint32_t *op2_f1,
-                                       uint32_t *op1_f1, uint32_t *op0_f1,
-                                       uint32_t *oq0_f1, uint32_t *oq1_f1,
-                                       uint32_t *oq2_f1) {
+static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2,
+                                   uint32_t p1, uint32_t p0,
+                                   uint32_t q0, uint32_t q1,
+                                   uint32_t q2, uint32_t q3,
+                                   uint32_t *op2_f1,
+                                   uint32_t *op1_f1, uint32_t *op0_f1,
+                                   uint32_t *oq0_f1, uint32_t *oq1_f1,
+                                   uint32_t *oq2_f1) {
   /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
   uint32_t  res_op2, res_op1, res_op0;
   uint32_t  res_oq0, res_oq1, res_oq2;
@@ -524,14 +524,14 @@
   *oq2_f1 = res_oq2;
 }
 
-static INLINE void vp9_wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
-                                           uint32_t *op5, uint32_t *op4,
-                                           uint32_t *op3, uint32_t *op2,
-                                           uint32_t *op1, uint32_t *op0,
-                                           uint32_t *oq0, uint32_t *oq1,
-                                           uint32_t *oq2, uint32_t *oq3,
-                                           uint32_t *oq4, uint32_t *oq5,
-                                           uint32_t *oq6, uint32_t *oq7) {
+static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
+                                       uint32_t *op5, uint32_t *op4,
+                                       uint32_t *op3, uint32_t *op2,
+                                       uint32_t *op1, uint32_t *op0,
+                                       uint32_t *oq0, uint32_t *oq1,
+                                       uint32_t *oq2, uint32_t *oq3,
+                                       uint32_t *oq4, uint32_t *oq5,
+                                       uint32_t *oq6, uint32_t *oq7) {
   const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
   const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
   const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;

diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h b/vpx_dsp/mips/loopfilter_macros_dspr2.h
similarity index 99%
rename from vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
rename to vpx_dsp/mips/loopfilter_macros_dspr2.h
index ca01a6a..994ff18 100644
--- a/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
+++ b/vpx_dsp/mips/loopfilter_macros_dspr2.h

@@ -13,9 +13,9 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_onyxc_int.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
 
 #ifdef __cplusplus
 extern "C" {

diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h b/vpx_dsp/mips/loopfilter_masks_dspr2.h
similarity index 90%
rename from vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
rename to vpx_dsp/mips/loopfilter_masks_dspr2.h
index 5b0d9cc..e82dfb7 100644
--- a/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
+++ b/vpx_dsp/mips/loopfilter_masks_dspr2.h

@@ -13,9 +13,9 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_onyxc_int.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -24,13 +24,13 @@
 #if HAVE_DSPR2
 /* processing 4 pixels at the same time
  * compute hev and mask in the same function */
-static INLINE void vp9_filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
-                                             uint32_t p1, uint32_t p0,
-                                             uint32_t p3, uint32_t p2,
-                                             uint32_t q0, uint32_t q1,
-                                             uint32_t q2, uint32_t q3,
-                                             uint32_t thresh, uint32_t *hev,
-                                             uint32_t *mask) {
+static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
+                                         uint32_t p1, uint32_t p0,
+                                         uint32_t p3, uint32_t p2,
+                                         uint32_t q0, uint32_t q1,
+                                         uint32_t q2, uint32_t q3,
+                                         uint32_t thresh, uint32_t *hev,
+                                         uint32_t *mask) {
   uint32_t  c, r, r3, r_k;
   uint32_t  s1, s2, s3;
   uint32_t  ones = 0xFFFFFFFF;
@@ -129,16 +129,16 @@
   *mask = s2;
 }
 
-static INLINE void vp9_filter_hev_mask_flatmask4_dspr2(uint32_t limit,
-                                                       uint32_t flimit,
-                                                       uint32_t thresh,
-                                                       uint32_t p1, uint32_t p0,
-                                                       uint32_t p3, uint32_t p2,
-                                                       uint32_t q0, uint32_t q1,
-                                                       uint32_t q2, uint32_t q3,
-                                                       uint32_t *hev,
-                                                       uint32_t *mask,
-                                                       uint32_t *flat) {
+static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit,
+                                                   uint32_t flimit,
+                                                   uint32_t thresh,
+                                                   uint32_t p1, uint32_t p0,
+                                                   uint32_t p3, uint32_t p2,
+                                                   uint32_t q0, uint32_t q1,
+                                                   uint32_t q2, uint32_t q3,
+                                                   uint32_t *hev,
+                                                   uint32_t *mask,
+                                                   uint32_t *flat) {
   uint32_t  c, r, r3, r_k, r_flat;
   uint32_t  s1, s2, s3;
   uint32_t  ones = 0xFFFFFFFF;
@@ -279,12 +279,12 @@
   *flat = flat1;
 }
 
-static INLINE void vp9_flatmask5(uint32_t p4, uint32_t p3,
-                                 uint32_t p2, uint32_t p1,
-                                 uint32_t p0, uint32_t q0,
-                                 uint32_t q1, uint32_t q2,
-                                 uint32_t q3, uint32_t q4,
-                                 uint32_t *flat2) {
+static INLINE void flatmask5(uint32_t p4, uint32_t p3,
+                             uint32_t p2, uint32_t p1,
+                             uint32_t p0, uint32_t q0,
+                             uint32_t q1, uint32_t q2,
+                             uint32_t q3, uint32_t q4,
+                             uint32_t *flat2) {
   uint32_t  c, r, r_k, r_flat;
   uint32_t  ones = 0xFFFFFFFF;
   uint32_t  flat_thresh = 0x01010101;

diff --git a/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c b/vpx_dsp/mips/loopfilter_mb_dspr2.c
similarity index 90%
rename from vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
rename to vpx_dsp/mips/loopfilter_mb_dspr2.c
index 7cd0b63..4138f56 100644
--- a/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_mb_dspr2.c

@@ -10,17 +10,16 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
 
 #if HAVE_DSPR2
-void vp9_lpf_horizontal_8_dspr2(unsigned char *s,
+void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
                                 int pitch,
                                 const uint8_t *blimit,
                                 const uint8_t *limit,
@@ -53,7 +52,7 @@
   );
 
   /* prefetch data for store */
-  vp9_prefetch_store(s);
+  prefetch_store(s);
 
   for (i = 0; i < 2; i++) {
     sp3 = s - (pitch << 2);
@@ -81,13 +80,13 @@
           [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
     );
 
-    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
-                                        p1, p0, p3, p2, q0, q1, q2, q3,
-                                        &hev, &mask, &flat);
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
 
     if ((flat == 0) && (mask != 0)) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       __asm__ __volatile__ (
           "sw       %[p1_f0],   (%[sp1])    \n\t"
@@ -104,13 +103,13 @@
     } else if ((mask & flat) == 0xFFFFFFFF) {
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       COMBINE_LEFT_RIGHT_0TO2()
 
@@ -130,18 +129,18 @@
       );
     } else if ((flat != 0) && (mask != 0)) {
       /* filtering */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       if (mask & flat & 0x000000FF) {
         __asm__ __volatile__ (
@@ -319,7 +318,7 @@
   }
 }
 
-void vp9_lpf_vertical_8_dspr2(unsigned char *s,
+void vpx_lpf_vertical_8_dspr2(unsigned char *s,
                               int pitch,
                               const uint8_t *blimit,
                               const uint8_t *limit,
@@ -351,7 +350,7 @@
       : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
   );
 
-  vp9_prefetch_store(s + pitch);
+  prefetch_store(s + pitch);
 
   for (i = 0; i < 2; i++) {
     s1 = s;
@@ -451,39 +450,39 @@
         :
     );
 
-    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
-                                        p1, p0, p3, p2, q0, q1, q2, q3,
-                                        &hev, &mask, &flat);
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
 
     if ((flat == 0) && (mask != 0)) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
       STORE_F0()
     } else if ((mask & flat) == 0xFFFFFFFF) {
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       STORE_F1()
     } else if ((flat != 0) && (mask != 0)) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       if (mask & flat & 0x000000FF) {
         __asm__ __volatile__ (

diff --git a/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
similarity index 90%
rename from vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
rename to vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
index 6c94674..8a48650 100644
--- a/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c

@@ -10,17 +10,16 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
 
 #if HAVE_DSPR2
-void vp9_lpf_horizontal_16_dspr2(unsigned char *s,
+void vpx_lpf_horizontal_16_dspr2(unsigned char *s,
                                  int pitch,
                                  const uint8_t *blimit,
                                  const uint8_t *limit,
@@ -58,7 +57,7 @@
   );
 
   /* prefetch data for store */
-  vp9_prefetch_store(s);
+  prefetch_store(s);
 
   for (i = 0; i < (2 * count); i++) {
     sp7 = s - (pitch << 3);
@@ -110,17 +109,17 @@
           [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6), [sq7] "r" (sq7)
     );
 
-    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
-                                        p1, p0, p3, p2, q0, q1, q2, q3,
-                                        &hev, &mask, &flat);
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
 
-    vp9_flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+    flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
 
     /* f0 */
     if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
         ((flat2 != 0) && (flat == 0) && (mask != 0))) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       __asm__ __volatile__ (
           "sw       %[p1_f0],   (%[sp1])            \n\t"
@@ -139,17 +138,17 @@
       /* f2 */
       PACK_LEFT_0TO3()
       PACK_LEFT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
-                              &p3_l, &p2_l, &p1_l, &p0_l,
-                              &q0_l, &q1_l, &q2_l, &q3_l,
-                              &q4_l, &q5_l, &q6_l, &q7_l);
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
 
       PACK_RIGHT_0TO3()
       PACK_RIGHT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
-                              &p3_r, &p2_r, &p1_r, &p0_r,
-                              &q0_r, &q1_r, &q2_r, &q3_r,
-                              &q4_r, &q5_r, &q6_r, &q7_r);
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
 
       COMBINE_LEFT_RIGHT_0TO2()
       COMBINE_LEFT_RIGHT_3TO6()
@@ -189,13 +188,13 @@
       /* f1 */
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       COMBINE_LEFT_RIGHT_0TO2()
 
@@ -215,18 +214,18 @@
       );
     } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
       /* f0+f1 */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       if (mask & flat & 0x000000FF) {
         __asm__ __volatile__ (
@@ -399,36 +398,36 @@
     } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
       /* f0 + f1 + f2 */
       /* f0  function */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* f1  function */
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
-                          q0_l, q1_l, q2_l, q3_l,
-                          &p2_l_f1, &p1_l_f1, &p0_l_f1,
-                          &q0_l_f1, &q1_l_f1, &q2_l_f1);
+      mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
+                      q0_l, q1_l, q2_l, q3_l,
+                      &p2_l_f1, &p1_l_f1, &p0_l_f1,
+                      &q0_l_f1, &q1_l_f1, &q2_l_f1);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
-                          q0_r, q1_r, q2_r, q3_r,
-                          &p2_r_f1, &p1_r_f1, &p0_r_f1,
-                          &q0_r_f1, &q1_r_f1, &q2_r_f1);
+      mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
+                      q0_r, q1_r, q2_r, q3_r,
+                      &p2_r_f1, &p1_r_f1, &p0_r_f1,
+                      &q0_r_f1, &q1_r_f1, &q2_r_f1);
 
       /* f2  function */
       PACK_LEFT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
-                              &p3_l, &p2_l, &p1_l, &p0_l,
-                              &q0_l, &q1_l, &q2_l, &q3_l,
-                              &q4_l, &q5_l, &q6_l, &q7_l);
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
 
       PACK_RIGHT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
-                              &p3_r, &p2_r, &p1_r, &p0_r,
-                              &q0_r, &q1_r, &q2_r, &q3_r,
-                              &q4_r, &q5_r, &q6_r, &q7_r);
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
 
       if (mask & flat & flat2 & 0x000000FF) {
         __asm__ __volatile__ (

diff --git a/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c b/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c
similarity index 90%
rename from vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
rename to vpx_dsp/mips/loopfilter_mb_vert_dspr2.c
index 851fc6c..e580f01 100644
--- a/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c

@@ -10,17 +10,16 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
 
 #if HAVE_DSPR2
-void vp9_lpf_vertical_16_dspr2(uint8_t *s,
+void vpx_lpf_vertical_16_dspr2(uint8_t *s,
                                int pitch,
                                const uint8_t *blimit,
                                const uint8_t *limit,
@@ -55,7 +54,7 @@
       : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
   );
 
-  vp9_prefetch_store(s + pitch);
+  prefetch_store(s + pitch);
 
   for (i = 0; i < 2; i++) {
     s1 = s;
@@ -248,61 +247,61 @@
         :
     );
 
-    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
-                                        p1, p0, p3, p2, q0, q1, q2, q3,
-                                        &hev, &mask, &flat);
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
 
-    vp9_flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+    flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
 
     /* f0 */
     if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
         ((flat2 != 0) && (flat == 0) && (mask != 0))) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
       STORE_F0()
     } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) &&
                (mask == 0xFFFFFFFF)) {
       /* f2 */
       PACK_LEFT_0TO3()
       PACK_LEFT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
-                              &p3_l, &p2_l, &p1_l, &p0_l,
-                              &q0_l, &q1_l, &q2_l, &q3_l,
-                              &q4_l, &q5_l, &q6_l, &q7_l);
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
 
       PACK_RIGHT_0TO3()
       PACK_RIGHT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
-                              &p3_r, &p2_r, &p1_r, &p0_r,
-                              &q0_r, &q1_r, &q2_r, &q3_r,
-                              &q4_r, &q5_r, &q6_r, &q7_r);
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
 
       STORE_F2()
     } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
       /* f1 */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       STORE_F1()
     } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
       /* f0 + f1 */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       if (mask & flat & 0x000000FF) {
         __asm__ __volatile__ (
@@ -466,32 +465,32 @@
       }
     } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
       /* f0+f1+f2 */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       PACK_LEFT_0TO3()
-      vp9_mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
-                          q0_l, q1_l, q2_l, q3_l,
-                          &p2_l_f1, &p1_l_f1, &p0_l_f1,
-                          &q0_l_f1, &q1_l_f1, &q2_l_f1);
+      mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
+                      q0_l, q1_l, q2_l, q3_l,
+                      &p2_l_f1, &p1_l_f1, &p0_l_f1,
+                      &q0_l_f1, &q1_l_f1, &q2_l_f1);
 
       PACK_RIGHT_0TO3()
-      vp9_mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
-                          q0_r, q1_r, q2_r, q3_r,
-                          &p2_r_f1, &p1_r_f1, &p0_r_f1,
-                          &q0_r_f1, &q1_r_f1, &q2_r_f1);
+      mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
+                      q0_r, q1_r, q2_r, q3_r,
+                      &p2_r_f1, &p1_r_f1, &p0_r_f1,
+                      &q0_r_f1, &q1_r_f1, &q2_r_f1);
 
       PACK_LEFT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
-                              &p3_l, &p2_l, &p1_l, &p0_l,
-                              &q0_l, &q1_l, &q2_l, &q3_l,
-                              &q4_l, &q5_l, &q6_l, &q7_l);
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
 
       PACK_RIGHT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
-                              &p3_r, &p2_r, &p1_r, &p0_r,
-                              &q0_r, &q1_r, &q2_r, &q3_r,
-                              &q4_r, &q5_r, &q6_r, &q7_r);
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
 
       if (mask & flat & flat2 & 0x000000FF) {
         __asm__ __volatile__ (

diff --git a/vp9/common/mips/msa/vp9_loopfilter_msa.h b/vpx_dsp/mips/loopfilter_msa.h
similarity index 98%
rename from vp9/common/mips/msa/vp9_loopfilter_msa.h
rename to vpx_dsp/mips/loopfilter_msa.h
index bfbe870..62b1706 100644
--- a/vp9/common/mips/msa/vp9_loopfilter_msa.h
+++ b/vpx_dsp/mips/loopfilter_msa.h

@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_
-#define VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_
+#ifndef VPX_DSP_LOOPFILTER_MSA_H_
+#define VPX_DSP_LOOPFILTER_MSA_H_
 
 #include "vpx_dsp/mips/macros_msa.h"
 
@@ -243,4 +243,4 @@
   mask_out = limit_in < (v16u8)mask_out;                         \
   mask_out = __msa_xori_b(mask_out, 0xff);                       \
 }
-#endif  /* VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_ */
+#endif  /* VPX_DSP_LOOPFILTER_MSA_H_ */

diff --git a/vp9/common/vp9_prob.c b/vpx_dsp/prob.c
similarity index 83%
rename from vp9/common/vp9_prob.c
rename to vpx_dsp/prob.c
index 3b7b9bf..639d24d 100644
--- a/vp9/common/vp9_prob.c
+++ b/vpx_dsp/prob.c

@@ -8,9 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp9/common/vp9_prob.h"
+#include "./prob.h"
 
-const uint8_t vp9_norm[256] = {
+const uint8_t vpx_norm[256] = {
   0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@@ -30,10 +30,10 @@
 };
 
 static unsigned int tree_merge_probs_impl(unsigned int i,
-                                          const vp9_tree_index *tree,
-                                          const vp9_prob *pre_probs,
+                                          const vpx_tree_index *tree,
+                                          const vpx_prob *pre_probs,
                                           const unsigned int *counts,
-                                          vp9_prob *probs) {
+                                          vpx_prob *probs) {
   const int l = tree[i];
   const unsigned int left_count = (l <= 0)
                  ? counts[-l]
@@ -47,7 +47,7 @@
   return left_count + right_count;
 }
 
-void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
-                          const unsigned int *counts, vp9_prob *probs) {
+void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs,
+                          const unsigned int *counts, vpx_prob *probs) {
   tree_merge_probs_impl(0, tree, pre_probs, counts, probs);
 }

diff --git a/vp9/common/vp9_prob.h b/vpx_dsp/prob.h
similarity index 70%
rename from vp9/common/vp9_prob.h
rename to vpx_dsp/prob.h
index c69c62c..729f90a 100644
--- a/vp9/common/vp9_prob.h
+++ b/vpx_dsp/prob.h

@@ -8,64 +8,63 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_PROB_H_
-#define VP9_COMMON_VP9_PROB_H_
+#ifndef VPX_DSP_PROB_H_
+#define VPX_DSP_PROB_H_
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_common.h"
 
 #include "vpx_ports/mem.h"
 
-#include "vp9/common/vp9_common.h"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-typedef uint8_t vp9_prob;
+typedef uint8_t vpx_prob;
 
 #define MAX_PROB 255
 
-#define vp9_prob_half ((vp9_prob) 128)
+#define vpx_prob_half ((vpx_prob) 128)
 
-typedef int8_t vp9_tree_index;
+typedef int8_t vpx_tree_index;
 
 #define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2)
 
-#define vp9_complement(x) (255 - x)
+#define vpx_complement(x) (255 - x)
 
 #define MODE_MV_COUNT_SAT 20
 
 /* We build coding trees compactly in arrays.
-   Each node of the tree is a pair of vp9_tree_indices.
+   Each node of the tree is a pair of vpx_tree_indices.
    Array index often references a corresponding probability table.
    Index <= 0 means done encoding/decoding and value = -Index,
    Index > 0 means need another bit, specification at index.
    Nonnegative indices are always even;  processing begins at node 0. */
 
-typedef const vp9_tree_index vp9_tree[];
+typedef const vpx_tree_index vpx_tree[];
 
-static INLINE vp9_prob clip_prob(int p) {
+static INLINE vpx_prob clip_prob(int p) {
   return (p > 255) ? 255 : (p < 1) ? 1 : p;
 }
 
-static INLINE vp9_prob get_prob(int num, int den) {
+static INLINE vpx_prob get_prob(int num, int den) {
   return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den);
 }
 
-static INLINE vp9_prob get_binary_prob(int n0, int n1) {
+static INLINE vpx_prob get_binary_prob(int n0, int n1) {
   return get_prob(n0, n0 + n1);
 }
 
 /* This function assumes prob1 and prob2 are already within [1,255] range. */
-static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) {
+static INLINE vpx_prob weighted_prob(int prob1, int prob2, int factor) {
   return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
 }
 
-static INLINE vp9_prob merge_probs(vp9_prob pre_prob,
+static INLINE vpx_prob merge_probs(vpx_prob pre_prob,
                                    const unsigned int ct[2],
                                    unsigned int count_sat,
                                    unsigned int max_update_factor) {
-  const vp9_prob prob = get_binary_prob(ct[0], ct[1]);
+  const vpx_prob prob = get_binary_prob(ct[0], ct[1]);
   const unsigned int count = MIN(ct[0] + ct[1], count_sat);
   const unsigned int factor = max_update_factor * count / count_sat;
   return weighted_prob(pre_prob, prob, factor);
@@ -77,7 +76,7 @@
   70, 76, 83, 89, 96, 102, 108, 115, 121, 128
 };
 
-static INLINE vp9_prob mode_mv_merge_probs(vp9_prob pre_prob,
+static INLINE vpx_prob mode_mv_merge_probs(vpx_prob pre_prob,
                                            const unsigned int ct[2]) {
   const unsigned int den = ct[0] + ct[1];
   if (den == 0) {
@@ -85,20 +84,20 @@
   } else {
     const unsigned int count = MIN(den, MODE_MV_COUNT_SAT);
     const unsigned int factor = count_to_update_factor[count];
-    const vp9_prob prob =
+    const vpx_prob prob =
         clip_prob(((int64_t)(ct[0]) * 256 + (den >> 1)) / den);
     return weighted_prob(pre_prob, prob, factor);
   }
 }
 
-void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
-                          const unsigned int *counts, vp9_prob *probs);
+void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs,
+                          const unsigned int *counts, vpx_prob *probs);
 
 
-DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
+DECLARE_ALIGNED(16, extern const uint8_t, vpx_norm[256]);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_PROB_H_
+#endif  // VPX_DSP_PROB_H_

diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c
new file mode 100644
index 0000000..137f5bc
--- /dev/null
+++ b/vpx_dsp/quantize.c

@@ -0,0 +1,337 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/quantize.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp9_quantize_dc(const tran_low_t *coeff_ptr,
+                     int n_coeffs, int skip_block,
+                     const int16_t *round_ptr, const int16_t quant,
+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  const int rc = 0;
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int tmp, eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+    tmp = (tmp * quant) >> 16;
+    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
+    if (tmp)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+                            int n_coeffs, int skip_block,
+                            const int16_t *round_ptr, const int16_t quant,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  int eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    const int coeff = coeff_ptr[0];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp = abs_coeff + round_ptr[0];
+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16);
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
+    if (abs_qcoeff)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
+void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  const int n_coeffs = 1024;
+  const int rc = 0;
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int tmp, eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+                INT16_MIN, INT16_MAX);
+    tmp = (tmp * quant) >> 15;
+    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
+    if (tmp)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
+                                  int skip_block,
+                                  const int16_t *round_ptr,
+                                  const int16_t quant,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t dequant_ptr,
+                                  uint16_t *eob_ptr) {
+  const int n_coeffs = 1024;
+  int eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    const int coeff = coeff_ptr[0];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
+    if (abs_qcoeff)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
+void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                      int skip_block,
+                      const int16_t *zbin_ptr, const int16_t *round_ptr,
+                      const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                      const int16_t *dequant_ptr,
+                      uint16_t *eob_ptr,
+                      const int16_t *scan, const int16_t *iscan) {
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+        non_zero_count--;
+      else
+        break;
+    }
+
+    // Quantization pass: All coefficients with index >= zero_flag are
+    // skippable. Note: zero_flag can be zero.
+    for (i = 0; i < non_zero_count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      if (abs_coeff >= zbins[rc != 0]) {
+        int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+        tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+                  quant_shift_ptr[rc != 0]) >> 16;  // quantization
+        qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+        if (tmp)
+          eob = i;
+      }
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t *zbin_ptr,
+                             const int16_t *round_ptr, const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr,
+                             uint16_t *eob_ptr, const int16_t *scan,
+                             const int16_t *iscan) {
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+        non_zero_count--;
+      else
+        break;
+    }
+
+    // Quantization pass: All coefficients with index >= zero_flag are
+    // skippable. Note: zero_flag can be zero.
+    for (i = 0; i < non_zero_count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      if (abs_coeff >= zbins[rc != 0]) {
+        const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
+        const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+        const uint32_t abs_qcoeff =
+            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
+        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+        if (abs_qcoeff)
+          eob = i;
+      }
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
+void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            int skip_block,
+                            const int16_t *zbin_ptr, const int16_t *round_ptr,
+                            const int16_t *quant_ptr,
+                            const int16_t *quant_shift_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t *dequant_ptr,
+                            uint16_t *eob_ptr,
+                            const int16_t *scan, const int16_t *iscan) {
+  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+
+  int idx = 0;
+  int idx_arr[1024];
+  int i, eob = -1;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      int tmp;
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+      tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
+               quant_shift_ptr[rc != 0]) >> 15;
+
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+
+      if (tmp)
+        eob = idx_arr[i];
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
+                                   intptr_t n_coeffs, int skip_block,
+                                   const int16_t *zbin_ptr,
+                                   const int16_t *round_ptr,
+                                   const int16_t *quant_ptr,
+                                   const int16_t *quant_shift_ptr,
+                                   tran_low_t *qcoeff_ptr,
+                                   tran_low_t *dqcoeff_ptr,
+                                   const int16_t *dequant_ptr,
+                                   uint16_t *eob_ptr,
+                                   const int16_t *scan, const int16_t *iscan) {
+  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+
+  int idx = 0;
+  int idx_arr[1024];
+  int i, eob = -1;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int64_t tmp1 = abs_coeff
+                         + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+      if (abs_qcoeff)
+        eob = idx_arr[i];
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif

diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h
new file mode 100644
index 0000000..0ad1744
--- /dev/null
+++ b/vpx_dsp/quantize.h

@@ -0,0 +1,51 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_QUANTIZE_H_
+#define VPX_DSP_QUANTIZE_H_
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_quantize_dc(const tran_low_t *coeff_ptr,
+                     int n_coeffs, int skip_block,
+                     const int16_t *round_ptr, const int16_t quant_ptr,
+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                     const int16_t dequant_ptr, uint16_t *eob_ptr);
+void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant_ptr,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+                            int n_coeffs, int skip_block,
+                            const int16_t *round_ptr, const int16_t quant_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant_ptr, uint16_t *eob_ptr);
+void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
+                                  int skip_block,
+                                  const int16_t *round_ptr,
+                                  const int16_t quant_ptr,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t dequant_ptr,
+                                  uint16_t *eob_ptr);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_QUANTIZE_H_

diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 70a131c..2efc945 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk

@@ -9,9 +9,73 @@
 ##
 
 DSP_SRCS-yes += vpx_dsp.mk
+DSP_SRCS-yes += vpx_dsp_common.h
 
 DSP_SRCS-$(HAVE_MSA)    += mips/macros_msa.h
 
+# bit reader
+DSP_SRCS-yes += prob.h
+DSP_SRCS-yes += prob.c
+
+ifeq ($(CONFIG_ENCODERS),yes)
+DSP_SRCS-yes += bitwriter.h
+DSP_SRCS-yes += bitwriter.c
+DSP_SRCS-yes += bitwriter_buffer.c
+DSP_SRCS-yes += bitwriter_buffer.h
+endif
+
+ifeq ($(CONFIG_DECODERS),yes)
+DSP_SRCS-yes += bitreader.h
+DSP_SRCS-yes += bitreader.c
+DSP_SRCS-yes += bitreader_buffer.c
+DSP_SRCS-yes += bitreader_buffer.h
+endif
+
+# loop filters
+DSP_SRCS-yes += loopfilter.c
+
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/loopfilter_sse2.c
+DSP_SRCS-$(HAVE_AVX2)                += x86/loopfilter_avx2.c
+DSP_SRCS-$(HAVE_MMX)                 += x86/loopfilter_mmx.asm
+
+DSP_SRCS-$(HAVE_NEON)   += arm/loopfilter_neon.c
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes  += arm/loopfilter_mb_neon$(ASM)
+DSP_SRCS-yes  += arm/loopfilter_16_neon$(ASM)
+DSP_SRCS-yes  += arm/loopfilter_8_neon$(ASM)
+DSP_SRCS-yes  += arm/loopfilter_4_neon$(ASM)
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes   += arm/loopfilter_16_neon.c
+DSP_SRCS-yes   += arm/loopfilter_8_neon.c
+DSP_SRCS-yes   += arm/loopfilter_4_neon.c
+endif  # HAVE_NEON
+endif  # HAVE_NEON_ASM
+
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_msa.h
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_16_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_8_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_4_msa.c
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_loopfilter_sse2.c
+endif  # CONFIG_VP9_HIGHBITDEPTH
+
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+DSP_SRCS-yes            += quantize.c
+DSP_SRCS-yes            += quantize.h
+
+DSP_SRCS-$(HAVE_SSE2)   += x86/quantize_sse2.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
+endif
+ifeq ($(ARCH_X86_64),yes)
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm
+endif
+endif
+endif  # CONFIG_VP9_ENCODER
+
 ifeq ($(CONFIG_ENCODERS),yes)
 DSP_SRCS-yes            += sad.c
 DSP_SRCS-yes            += subtract.c

diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h
new file mode 100644
index 0000000..6793036
--- /dev/null
+++ b/vpx_dsp/vpx_dsp_common.h

@@ -0,0 +1,57 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_COMMON_H_
+#define VPX_DSP_COMMON_H_
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Note:
+// tran_low_t  is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+#else
+// Note:
+// tran_low_t  is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int32_t tran_high_t;
+typedef int16_t tran_low_t;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE uint8_t clip_pixel(int val) {
+  return (val > 255) ? 255 : (val < 0) ? 0 : val;
+}
+
+static INLINE int clamp(int value, int low, int high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE double fclamp(double value, double low, double high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_COMMON_H_

diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 94bf4bf..32b79d2 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -5,12 +5,18 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 
 EOF
 }
 forward_decls qw/vpx_dsp_forward_decls/;
 
-# Functions which use x86inc.asm instead of x86_abi_support.asm
+# x86inc.asm had specific constraints. break it out so it's easy to disable.
+# zero all the variables to avoid tricky else conditions.
+$mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc = $avx_x86inc =
+  $avx2_x86inc = '';
+$mmx_x86_64_x86inc = $sse_x86_64_x86inc = $sse2_x86_64_x86inc =
+  $ssse3_x86_64_x86inc = $avx_x86_64_x86inc = $avx2_x86_64_x86inc = '';
 if (vpx_config("CONFIG_USE_X86INC") eq "yes") {
   $mmx_x86inc = 'mmx';
   $sse_x86inc = 'sse';
@@ -18,23 +24,127 @@
   $ssse3_x86inc = 'ssse3';
   $avx_x86inc = 'avx';
   $avx2_x86inc = 'avx2';
-} else {
-  $mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc =
-  $avx_x86inc = $avx2_x86inc = '';
+  if ($opts{arch} eq "x86_64") {
+    $mmx_x86_64_x86inc = 'mmx';
+    $sse_x86_64_x86inc = 'sse';
+    $sse2_x86_64_x86inc = 'sse2';
+    $ssse3_x86_64_x86inc = 'ssse3';
+    $avx_x86_64_x86inc = 'avx';
+    $avx2_x86_64_x86inc = 'avx2';
+  }
 }
 
-# Functions which are 64 bit only.
+# functions that are 64 bit only.
+$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
 if ($opts{arch} eq "x86_64") {
   $mmx_x86_64 = 'mmx';
   $sse2_x86_64 = 'sse2';
   $ssse3_x86_64 = 'ssse3';
   $avx_x86_64 = 'avx';
   $avx2_x86_64 = 'avx2';
-} else {
-  $mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 =
-  $avx_x86_64 = $avx2_x86_64 = '';
 }
 
+#
+# Loopfilter
+#
+add_proto qw/void vpx_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_vertical_16 sse2 neon_asm msa/;
+$vpx_lpf_vertical_16_neon_asm=vpx_lpf_vertical_16_neon;
+
+add_proto qw/void vpx_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_vertical_16_dual sse2 neon_asm msa/;
+$vpx_lpf_vertical_16_dual_neon_asm=vpx_lpf_vertical_16_dual_neon;
+
+add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_vertical_8 sse2 neon msa/;
+
+add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm msa/;
+$vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
+
+add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_vertical_4 mmx neon msa/;
+
+add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_vertical_4_dual sse2 neon msa/;
+
+add_proto qw/void vpx_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon_asm msa/;
+$vpx_lpf_horizontal_16_neon_asm=vpx_lpf_horizontal_16_neon;
+
+add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_horizontal_8 sse2 neon msa/;
+
+add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm msa/;
+$vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
+
+add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_horizontal_4 mmx neon msa/;
+
+add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_horizontal_4_dual sse2 neon msa/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vpx_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_16 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_16_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_8 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_8_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_4 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_4_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_16 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_8 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_8_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_4 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_4_dual sse2/;
+}  # CONFIG_VP9_HIGHBITDEPTH
+
+#
+# Encoder functions.
+#
+if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b/;
+
+  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b_32x32/;
+
+  add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_highbd_quantize_b sse2/;
+
+  add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_highbd_quantize_b_32x32 sse2/;
+} else {
+  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b sse2/, "$ssse3_x86_64_x86inc";
+
+  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64_x86inc";
+}  # CONFIG_VP9_ENCODER
+}  # CONFIG_VP9_HIGHBITDEPTH
+
 if (vpx_config("CONFIG_ENCODERS") eq "yes") {
 #
 # Block subtraction

diff --git a/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c b/vpx_dsp/x86/highbd_loopfilter_sse2.c
similarity index 96%
rename from vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
rename to vpx_dsp/x86/highbd_loopfilter_sse2.c
index b40669c..c4fd5e1 100644
--- a/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
+++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c

@@ -10,9 +10,8 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_loopfilter.h"
 #include "vpx_ports/emmintrin_compat.h"
 
 static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
@@ -509,7 +508,7 @@
 }
 
 // TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
-void vp9_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
                                        const uint8_t *_blimit,
                                        const uint8_t *_limit,
                                        const uint8_t *_thresh,
@@ -520,7 +519,7 @@
     highbd_mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh, bd);
 }
 
-void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
                                       const uint8_t *_blimit,
                                       const uint8_t *_limit,
                                       const uint8_t *_thresh,
@@ -688,7 +687,7 @@
   filt = _mm_adds_epi16(filt, work_a);
   filt = _mm_adds_epi16(filt, work_a);
   filt = _mm_adds_epi16(filt, work_a);
-  // (vp9_filter + 3 * (qs0 - ps0)) & mask
+  // (vpx_filter + 3 * (qs0 - ps0)) & mask
   filt = signed_char_clamp_bd_sse2(filt, bd);
   filt = _mm_and_si128(filt, mask);
 
@@ -757,7 +756,7 @@
   _mm_store_si128((__m128i *)(s + 2 * p), q2);
 }
 
-void vp9_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p,
                                            const uint8_t *_blimit0,
                                            const uint8_t *_limit0,
                                            const uint8_t *_thresh0,
@@ -765,12 +764,12 @@
                                            const uint8_t *_limit1,
                                            const uint8_t *_thresh1,
                                            int bd) {
-  vp9_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
-  vp9_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1,
+  vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1,
                                    1, bd);
 }
 
-void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
                                       const uint8_t *_blimit,
                                       const uint8_t *_limit,
                                       const uint8_t *_thresh,
@@ -892,7 +891,7 @@
   filt = _mm_adds_epi16(filt, work_a);
   filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
 
-  // (vp9_filter + 3 * (qs0 - ps0)) & mask
+  // (vpx_filter + 3 * (qs0 - ps0)) & mask
   filt = _mm_and_si128(filt, mask);
 
   filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
@@ -937,7 +936,7 @@
   _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
 }
 
-void vp9_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p,
                                            const uint8_t *_blimit0,
                                            const uint8_t *_limit0,
                                            const uint8_t *_thresh0,
@@ -945,8 +944,8 @@
                                            const uint8_t *_limit1,
                                            const uint8_t *_thresh1,
                                            int bd) {
-  vp9_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
-  vp9_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, 1,
+  vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, 1,
                                    bd);
 }
 
@@ -1055,7 +1054,7 @@
   highbd_transpose(src1, in_p, dest1, out_p, 1);
 }
 
-void vp9_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
                                     const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh,
@@ -1072,7 +1071,7 @@
   highbd_transpose(src, p, dst, 8, 1);
 
   // Loop filtering
-  vp9_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
+  vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
                                    bd);
 
   src[0] = t_dst;
@@ -1082,7 +1081,7 @@
   highbd_transpose(src, 8, dst, p, 1);
 }
 
-void vp9_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p,
                                          const uint8_t *blimit0,
                                          const uint8_t *limit0,
                                          const uint8_t *thresh0,
@@ -1098,7 +1097,7 @@
   highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
 
   // Loop filtering
-  vp9_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+  vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
                                         thresh0, blimit1, limit1, thresh1, bd);
   src[0] = t_dst;
   src[1] = t_dst + 8;
@@ -1109,7 +1108,7 @@
   highbd_transpose(src, 16, dst, p, 2);
 }
 
-void vp9_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
                                     const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh,
@@ -1126,7 +1125,7 @@
   highbd_transpose(src, p, dst, 8, 1);
 
   // Loop filtering
-  vp9_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
+  vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
                                    bd);
 
   src[0] = t_dst;
@@ -1136,7 +1135,7 @@
   highbd_transpose(src, 8, dst, p, 1);
 }
 
-void vp9_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p,
                                          const uint8_t *blimit0,
                                          const uint8_t *limit0,
                                          const uint8_t *thresh0,
@@ -1152,7 +1151,7 @@
   highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
 
   // Loop filtering
-  vp9_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+  vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
                                         thresh0, blimit1, limit1, thresh1, bd);
   src[0] = t_dst;
   src[1] = t_dst + 8;
@@ -1164,7 +1163,7 @@
   highbd_transpose(src, 16, dst, p, 2);
 }
 
-void vp9_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
                                      const uint8_t *blimit,
                                      const uint8_t *limit,
                                      const uint8_t *thresh,
@@ -1193,7 +1192,7 @@
   highbd_transpose(src, 8, dst, p, 2);
 }
 
-void vp9_highbd_lpf_vertical_16_dual_sse2(uint16_t *s,
+void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s,
                                           int p,
                                           const uint8_t *blimit,
                                           const uint8_t *limit,

diff --git a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
similarity index 86%
rename from vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
rename to vpx_dsp/x86/highbd_quantize_intrin_sse2.c
index cbdd1c9..06c748d 100644
--- a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
+++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c

@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -10,8 +10,9 @@
 
 #include <emmintrin.h>
 
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_common.h"
 
 #if CONFIG_VP9_HIGHBITDEPTH
 // from vp9_idct.h: typedef int32_t tran_low_t;
@@ -85,13 +86,13 @@
       for (j = 0; j < 4; j++) {
         if (test & (1 << (4 * j))) {
           int k = 4 * i + j;
-          int64_t tmp = clamp(abs_coeff[j] + round_ptr[k != 0],
-                              INT32_MIN, INT32_MAX);
-          tmp = ((((tmp * quant_ptr[k != 0]) >> 16) + tmp) *
-                    quant_shift_ptr[k != 0]) >> 16;  // quantization
-          qcoeff_ptr[k] = (tmp ^ coeff_sign[j]) - coeff_sign[j];
+          const int64_t tmp1 = abs_coeff[j] + round_ptr[k != 0];
+          const int64_t tmp2 = ((tmp1 * quant_ptr[k != 0]) >> 16) + tmp1;
+          const uint32_t abs_qcoeff =
+              (uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16);
+          qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
           dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
-          if (tmp)
+          if (abs_qcoeff)
             eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
         }
       }
@@ -162,17 +163,15 @@
       const int rc = idx_arr[i];
       const int coeff = coeff_ptr[rc];
       const int coeff_sign = (coeff >> 31);
-      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      int64_t tmp = clamp(abs_coeff +
-                          ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
-                          INT32_MIN, INT32_MAX);
-      tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
-               quant_shift_ptr[rc != 0]) >> 15;
-
-      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int64_t tmp1 = abs_coeff
+                         + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+      qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
       dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-
-      if (tmp)
+      if (abs_qcoeff)
         eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
     }
   }

diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index fe35c1e..b45331c 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c

@@ -50,7 +50,7 @@
                                     high_variance_fn_t var_fn, int block_size) {
   int i, j;
   uint64_t sse_long = 0;
-  int64_t sum_long = 0;
+  int32_t sum_long = 0;
 
   for (i = 0; i < h; i += block_size) {
     for (j = 0; j < w; j += block_size) {
@@ -63,7 +63,7 @@
     }
   }
   *sum = ROUND_POWER_OF_TWO(sum_long, 2);
-  *sse = ROUND_POWER_OF_TWO(sse_long, 4);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
 }
 
 static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
@@ -72,7 +72,7 @@
                                     high_variance_fn_t var_fn, int block_size) {
   int i, j;
   uint64_t sse_long = 0;
-  int64_t sum_long = 0;
+  int32_t sum_long = 0;
 
   for (i = 0; i < h; i += block_size) {
     for (j = 0; j < w; j += block_size) {
@@ -85,7 +85,7 @@
     }
   }
   *sum = ROUND_POWER_OF_TWO(sum_long, 4);
-  *sse = ROUND_POWER_OF_TWO(sse_long, 8);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
 }
 
 
@@ -386,7 +386,7 @@
     } \
   } \
   se = ROUND_POWER_OF_TWO(se, 4); \
-  sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+  sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
   *sse_ptr = sse; \
   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
 }
@@ -555,7 +555,7 @@
     } \
   } \
   se = ROUND_POWER_OF_TWO(se, 4); \
-  sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+  sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
   *sse_ptr = sse; \
   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
 }

diff --git a/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/vpx_dsp/x86/loopfilter_avx2.c
similarity index 98%
rename from vp9/common/x86/vp9_loopfilter_intrin_avx2.c
rename to vpx_dsp/x86/loopfilter_avx2.c
index 770a65f..23a97dd 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
+++ b/vpx_dsp/x86/loopfilter_avx2.c

@@ -10,7 +10,7 @@
 
 #include <immintrin.h>  /* AVX2 */
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 
 static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
@@ -103,7 +103,7 @@
         filt = _mm_adds_epi8(filt, work_a);
         filt = _mm_adds_epi8(filt, work_a);
         filt = _mm_adds_epi8(filt, work_a);
-        /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+        /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
         filt = _mm_and_si128(filt, mask);
 
         filter1 = _mm_adds_epi8(filt, t4);
@@ -515,7 +515,7 @@
         filt = _mm_adds_epi8(filt, work_a);
         filt = _mm_adds_epi8(filt, work_a);
         filt = _mm_adds_epi8(filt, work_a);
-        /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+        /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
         filt = _mm_and_si128(filt, mask);
 
         filter1 = _mm_adds_epi8(filt, t4);
@@ -976,7 +976,7 @@
     }
 }
 
-void vp9_lpf_horizontal_16_avx2(unsigned char *s, int p,
+void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
         const unsigned char *_blimit, const unsigned char *_limit,
         const unsigned char *_thresh, int count) {
     if (count == 1)

diff --git a/vp9/common/x86/vp9_loopfilter_mmx.asm b/vpx_dsp/x86/loopfilter_mmx.asm
similarity index 98%
rename from vp9/common/x86/vp9_loopfilter_mmx.asm
rename to vpx_dsp/x86/loopfilter_mmx.asm
index f5f7d5a..b9c18b6 100644
--- a/vp9/common/x86/vp9_loopfilter_mmx.asm
+++ b/vpx_dsp/x86/loopfilter_mmx.asm

@@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"
 
 
-;void vp9_lpf_horizontal_4_mmx
+;void vpx_lpf_horizontal_4_mmx
 ;(
 ;    unsigned char *src_ptr,
 ;    int src_pixel_step,
@@ -21,8 +21,8 @@
 ;    const char *thresh,
 ;    int  count
 ;)
-global sym(vp9_lpf_horizontal_4_mmx) PRIVATE
-sym(vp9_lpf_horizontal_4_mmx):
+global sym(vpx_lpf_horizontal_4_mmx) PRIVATE
+sym(vpx_lpf_horizontal_4_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -224,7 +224,7 @@
     ret
 
 
-;void vp9_lpf_vertical_4_mmx
+;void vpx_lpf_vertical_4_mmx
 ;(
 ;    unsigned char *src_ptr,
 ;    int  src_pixel_step,
@@ -233,8 +233,8 @@
 ;    const char *thresh,
 ;    int count
 ;)
-global sym(vp9_lpf_vertical_4_mmx) PRIVATE
-sym(vp9_lpf_vertical_4_mmx):
+global sym(vpx_lpf_vertical_4_mmx) PRIVATE
+sym(vpx_lpf_vertical_4_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6

diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c
similarity index 95%
rename from vp9/common/x86/vp9_loopfilter_intrin_sse2.c
rename to vpx_dsp/x86/loopfilter_sse2.c
index e321dbe..ed10127 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vpx_dsp/x86/loopfilter_sse2.c

@@ -10,8 +10,8 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_loopfilter.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
 #include "vpx_ports/emmintrin_compat.h"
 
 static INLINE __m128i abs_diff(__m128i a, __m128i b) {
@@ -100,7 +100,7 @@
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
@@ -495,7 +495,7 @@
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
     filter1 = _mm_adds_epi8(filt, t4);
     filter2 = _mm_adds_epi8(filt, t3);
@@ -717,7 +717,7 @@
 }
 
 // TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
-void vp9_lpf_horizontal_16_sse2(unsigned char *s, int p,
+void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
                                 const unsigned char *_blimit,
                                 const unsigned char *_limit,
                                 const unsigned char *_thresh, int count) {
@@ -727,7 +727,7 @@
     mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
 }
 
-void vp9_lpf_horizontal_8_sse2(unsigned char *s, int p,
+void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh, int count) {
@@ -874,7 +874,7 @@
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
@@ -943,7 +943,7 @@
   }
 }
 
-void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
+void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
                                     const uint8_t *_blimit0,
                                     const uint8_t *_limit0,
                                     const uint8_t *_thresh0,
@@ -1115,7 +1115,7 @@
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
@@ -1190,7 +1190,7 @@
   }
 }
 
-void vp9_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
+void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
                                     const unsigned char *_blimit0,
                                     const unsigned char *_limit0,
                                     const unsigned char *_thresh0,
@@ -1286,7 +1286,7 @@
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
@@ -1333,43 +1333,47 @@
   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
   __m128i x8, x9, x10, x11, x12, x13, x14, x15;
 
-  // Read in 16 lines
-  x0 = _mm_loadl_epi64((__m128i *)in0);
-  x8 = _mm_loadl_epi64((__m128i *)in1);
-  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
-  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
-  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
-  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
-  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p));
-  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p));
-  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p));
-  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p));
-  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p));
-  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p));
-  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p));
-  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p));
-  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p));
-  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p));
+  // 2-way interleave w/hoisting of unpacks
+  x0 = _mm_loadl_epi64((__m128i *)in0);  // 1
+  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));  // 3
+  x0 = _mm_unpacklo_epi8(x0, x1);  // 1
 
-  x0 = _mm_unpacklo_epi8(x0, x1);
-  x1 = _mm_unpacklo_epi8(x2, x3);
-  x2 = _mm_unpacklo_epi8(x4, x5);
-  x3 = _mm_unpacklo_epi8(x6, x7);
+  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));  // 5
+  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p));  // 7
+  x1 = _mm_unpacklo_epi8(x2, x3);  // 2
 
-  x8 = _mm_unpacklo_epi8(x8, x9);
-  x9 = _mm_unpacklo_epi8(x10, x11);
-  x10 = _mm_unpacklo_epi8(x12, x13);
-  x11 = _mm_unpacklo_epi8(x14, x15);
+  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p));  // 9
+  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p));  // 11
+  x2 = _mm_unpacklo_epi8(x4, x5);  // 3
 
-  x4 = _mm_unpacklo_epi16(x0, x1);
-  x5 = _mm_unpacklo_epi16(x2, x3);
-  x12 = _mm_unpacklo_epi16(x8, x9);
-  x13 = _mm_unpacklo_epi16(x10, x11);
+  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p));  // 13
+  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p));  // 15
+  x3 = _mm_unpacklo_epi8(x6, x7);  // 4
+  x4 = _mm_unpacklo_epi16(x0, x1);  // 9
 
-  x6 = _mm_unpacklo_epi32(x4, x5);
-  x7 = _mm_unpackhi_epi32(x4, x5);
-  x14 = _mm_unpacklo_epi32(x12, x13);
-  x15 = _mm_unpackhi_epi32(x12, x13);
+  x8 = _mm_loadl_epi64((__m128i *)in1);  // 2
+  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));  // 4
+  x8 = _mm_unpacklo_epi8(x8, x9);  // 5
+  x5 = _mm_unpacklo_epi16(x2, x3);  // 10
+
+  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));  // 6
+  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p));  // 8
+  x9 = _mm_unpacklo_epi8(x10, x11);  // 6
+
+  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p));  // 10
+  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p));  // 12
+  x10 = _mm_unpacklo_epi8(x12, x13);  // 7
+  x12 = _mm_unpacklo_epi16(x8, x9);  // 11
+
+  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p));  // 14
+  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p));  // 16
+  x11 = _mm_unpacklo_epi8(x14, x15);  // 8
+  x13 = _mm_unpacklo_epi16(x10, x11);  // 12
+
+  x6 = _mm_unpacklo_epi32(x4, x5);  // 13
+  x7 = _mm_unpackhi_epi32(x4, x5);  // 14
+  x14 = _mm_unpacklo_epi32(x12, x13);  // 15
+  x15 = _mm_unpackhi_epi32(x12, x13);  // 16
 
   // Store first 4-line result
   _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
@@ -1405,33 +1409,36 @@
 
     x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
     x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
-    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
-    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
-    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
-    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
-    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
-    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
     // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
     x0 = _mm_unpacklo_epi8(x0, x1);
+
+    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
+    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
     // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
     x1 = _mm_unpacklo_epi8(x2, x3);
+
+    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
+    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
     // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
     x2 = _mm_unpacklo_epi8(x4, x5);
+
+    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
+    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
     // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
     x3 = _mm_unpacklo_epi8(x6, x7);
+
     // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
     x4 = _mm_unpacklo_epi16(x0, x1);
     // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
     x5 = _mm_unpacklo_epi16(x2, x3);
     // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
     x6 = _mm_unpacklo_epi32(x4, x5);
-    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-    x7 = _mm_unpackhi_epi32(x4, x5);
-
     _mm_storel_pd((double *)(out + 0*out_p),
                   _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
     _mm_storeh_pd((double *)(out + 1*out_p),
                   _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
+    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+    x7 = _mm_unpackhi_epi32(x4, x5);
     _mm_storel_pd((double *)(out + 2*out_p),
                   _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
     _mm_storeh_pd((double *)(out + 3*out_p),
@@ -1443,13 +1450,13 @@
     x5 = _mm_unpackhi_epi16(x2, x3);
     // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
     x6 = _mm_unpacklo_epi32(x4, x5);
-    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
-    x7 = _mm_unpackhi_epi32(x4, x5);
-
     _mm_storel_pd((double *)(out + 4*out_p),
                   _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
     _mm_storeh_pd((double *)(out + 5*out_p),
                   _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
+    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+    x7 = _mm_unpackhi_epi32(x4, x5);
+
     _mm_storel_pd((double *)(out + 6*out_p),
                   _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
     _mm_storeh_pd((double *)(out + 7*out_p),
@@ -1457,7 +1464,7 @@
   } while (++idx8x8 < num_8x8_to_transpose);
 }
 
-void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
@@ -1471,7 +1478,7 @@
   transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
 
   // Loop filtering
-  vp9_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+  vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                  blimit1, limit1, thresh1);
   src[0] = t_dst;
   src[1] = t_dst + 8;
@@ -1482,7 +1489,7 @@
   transpose(src, 16, dst, p, 2);
 }
 
-void vp9_lpf_vertical_8_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
                              const unsigned char *blimit,
                              const unsigned char *limit,
                              const unsigned char *thresh, int count) {
@@ -1498,7 +1505,7 @@
   transpose(src, p, dst, 8, 1);
 
   // Loop filtering
-  vp9_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);
+  vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);
 
   src[0] = t_dst;
   dst[0] = s - 4;
@@ -1507,7 +1514,7 @@
   transpose(src, 8, dst, p, 1);
 }
 
-void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
@@ -1521,7 +1528,7 @@
   transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
 
   // Loop filtering
-  vp9_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+  vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                  blimit1, limit1, thresh1);
   src[0] = t_dst;
   src[1] = t_dst + 8;
@@ -1533,7 +1540,7 @@
   transpose(src, 16, dst, p, 2);
 }
 
-void vp9_lpf_vertical_16_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
                               const unsigned char *blimit,
                               const unsigned char *limit,
                               const unsigned char *thresh) {
@@ -1561,7 +1568,7 @@
   transpose(src, 8, dst, p, 2);
 }
 
-void vp9_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh) {
   DECLARE_ALIGNED(16, unsigned char, t_dst[256]);

diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c
new file mode 100644
index 0000000..8d51aeb
--- /dev/null
+++ b/vpx_dsp/x86/quantize_sse2.c

@@ -0,0 +1,223 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
+                         int skip_block, const int16_t* zbin_ptr,
+                         const int16_t* round_ptr, const int16_t* quant_ptr,
+                         const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
+                         int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+                         uint16_t* eob_ptr,
+                         const int16_t* scan_ptr,
+                         const int16_t* iscan_ptr) {
+  __m128i zero;
+  (void)scan_ptr;
+
+  coeff_ptr += n_coeffs;
+  iscan_ptr += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+  zero = _mm_setzero_si128();
+  if (!skip_block) {
+    __m128i eob;
+    __m128i zbin;
+    __m128i round, quant, dequant, shift;
+    {
+      __m128i coeff0, coeff1;
+
+      // Setup global values
+      {
+        __m128i pw_1;
+        zbin = _mm_load_si128((const __m128i*)zbin_ptr);
+        round = _mm_load_si128((const __m128i*)round_ptr);
+        quant = _mm_load_si128((const __m128i*)quant_ptr);
+        pw_1 = _mm_set1_epi16(1);
+        zbin = _mm_sub_epi16(zbin, pw_1);
+        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+        shift = _mm_load_si128((const __m128i*)quant_shift_ptr);
+      }
+
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        __m128i cmp_mask0, cmp_mask1;
+        // Do DC and first 15 AC
+        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+        // Poor man's sign extract
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        round = _mm_unpackhi_epi64(round, round);
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        quant = _mm_unpackhi_epi64(quant, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
+        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+        shift = _mm_unpackhi_epi64(shift, shift);
+        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        // Mask out zbin threshold coeffs
+        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        dequant = _mm_unpackhi_epi64(dequant, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob = _mm_max_epi16(eob, eob1);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // AC only loop
+    while (n_coeffs < 0) {
+      __m128i coeff0, coeff1;
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        __m128i cmp_mask0, cmp_mask1;
+
+        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+        // Poor man's sign extract
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
+        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        // Mask out zbin threshold coeffs
+        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob0, eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob0 = _mm_max_epi16(eob0, eob1);
+        eob = _mm_max_epi16(eob, eob0);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // Accumulate EOB
+    {
+      __m128i eob_shuffled;
+      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      *eob_ptr = _mm_extract_epi16(eob, 1);
+    }
+  } else {
+    do {
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+      n_coeffs += 8 * 2;
+    } while (n_coeffs < 0);
+    *eob_ptr = 0;
+  }
+}

diff --git a/vpx_dsp/x86/quantize_ssse3_x86_64.asm b/vpx_dsp/x86/quantize_ssse3_x86_64.asm
new file mode 100644
index 0000000..3784d9d
--- /dev/null
+++ b/vpx_dsp/x86/quantize_ssse3_x86_64.asm

@@ -0,0 +1,216 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+; TODO(yunqingwang)fix quantize_b code for skip=1 case.
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                                shift, qcoeff, dqcoeff, dequant, \
+                                eob, scan, iscan
+  cmp                    dword skipm, 0
+  jne .blank
+
+  ; actual quantize loop - setup pointers, rounders, etc.
+  movifnidn                   coeffq, coeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, dequantmp
+  movifnidn                    zbinq, zbinmp
+  movifnidn                   roundq, roundmp
+  movifnidn                   quantq, quantmp
+  mova                            m0, [zbinq]              ; m0 = zbin
+  mova                            m1, [roundq]             ; m1 = round
+  mova                            m2, [quantq]             ; m2 = quant
+%ifidn %1, b_32x32
+  pcmpeqw                         m5, m5
+  psrlw                           m5, 15
+  paddw                           m0, m5
+  paddw                           m1, m5
+  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
+  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
+%endif
+  mova                            m3, [r2q]                ; m3 = dequant
+  psubw                           m0, [pw_1]
+  mov                             r2, shiftmp
+  mov                             r3, qcoeffmp
+  mova                            m4, [r2]                 ; m4 = shift
+  mov                             r4, dqcoeffmp
+  mov                             r5, iscanmp
+%ifidn %1, b_32x32
+  psllw                           m4, 1
+%endif
+  pxor                            m5, m5                   ; m5 = dedicated zero
+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+  lea                         coeffq, [  coeffq+ncoeffq*2]
+  lea                         iscanq, [  iscanq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  neg                        ncoeffq
+
+  ; get DC and first 15 AC coeffs
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  punpckhqdq                      m0, m0
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+  paddsw                          m6, m1                   ; m6 += round
+  punpckhqdq                      m1, m1
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
+  punpckhqdq                      m2, m2
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                           m8, m6                   ; m8 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
+  punpckhqdq                      m4, m4
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                          m8, m9                   ; m8 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                            m8, m7
+  pand                           m13, m12
+  mova        [qcoeffq+ncoeffq*2+ 0], m8
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                           m8, m8
+  pabsw                          m13, m13
+%endif
+  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
+  punpckhqdq                      m3, m3
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                           m8, 1
+  psrlw                          m13, 1
+  psignw                          m8, m9
+  psignw                         m13, m10
+%endif
+  mova       [dqcoeffq+ncoeffq*2+ 0], m8
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
+  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                   ; m6 = scan[i] + 1
+  psubw                          m11, m12                  ; m11 = scan[i] + 1
+  pandn                           m8, m6                   ; m8 = max(eob)
+  pandn                          m13, m11                  ; m13 = max(eob)
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jz .accumulate_eob
+
+.ac_only_loop:
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+%ifidn %1, b_32x32
+  pmovmskb                       r6d, m7
+  pmovmskb                       r2d, m12
+  or                              r6, r2
+  jz .skip_iter
+%endif
+  paddsw                          m6, m1                   ; m6 += round
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                          m14, m6                   ; m14 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                         m14, m9                   ; m14 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                           m14, m7
+  pand                           m13, m12
+  mova        [qcoeffq+ncoeffq*2+ 0], m14
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                          m14, m14
+  pabsw                          m13, m13
+%endif
+  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                          m14, 1
+  psrlw                          m13, 1
+  psignw                         m14, m9
+  psignw                         m13, m10
+%endif
+  mova       [dqcoeffq+ncoeffq*2+ 0], m14
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
+  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                   ; m6 = scan[i] + 1
+  psubw                          m11, m12                  ; m11 = scan[i] + 1
+  pandn                          m14, m6                   ; m14 = max(eob)
+  pandn                          m13, m11                  ; m13 = max(eob)
+  pmaxsw                          m8, m14
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+
+%ifidn %1, b_32x32
+  jmp .accumulate_eob
+.skip_iter:
+  mova        [qcoeffq+ncoeffq*2+ 0], m5
+  mova        [qcoeffq+ncoeffq*2+16], m5
+  mova       [dqcoeffq+ncoeffq*2+ 0], m5
+  mova       [dqcoeffq+ncoeffq*2+16], m5
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+%endif
+
+.accumulate_eob:
+  ; horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov                             r2, eobmp
+  pshufd                          m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0x1
+  pmaxsw                          m8, m7
+  pextrw                          r6, m8, 0
+  mov                             [r2], r6
+  RET
+
+  ; skip-block, i.e. just write all zeroes
+.blank:
+  mov                             r0, dqcoeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, qcoeffmp
+  mov                             r3, eobmp
+  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+  neg                        ncoeffq
+  pxor                            m7, m7
+.blank_loop:
+  mova       [dqcoeffq+ncoeffq*2+ 0], m7
+  mova       [dqcoeffq+ncoeffq*2+16], m7
+  mova        [qcoeffq+ncoeffq*2+ 0], m7
+  mova        [qcoeffq+ncoeffq*2+16], m7
+  add                        ncoeffq, mmsize
+  jl .blank_loop
+  mov                    word [eobq], 0
+  RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FN b, 7
+QUANTIZE_FN b_32x32, 7