Merge "Correctly report "Unsupported bitstream profile""

commit: 149822e399087f4dd8e4e6efce9ad8d259d5ccea [log] [tgz]
author: Yaowu Xu <yaowu@google.com> Mon Jul 20 22:49:53 2015 +0000
committer: Gerrit Code Review <noreply-gerritcodereview@google.com> Mon Jul 20 22:49:54 2015 +0000
tree: b7a25252810ce3e24990e54806f6ecb1fb797860
parent: add779e42593dac57a30da250862e2705e5e80dc [diff]
parent: 7c0c62df1d25e76c52afb411ad342a82ed5d1ff0 [diff]
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 66ca4bb..5a4b1ed 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc

@@ -901,14 +901,6 @@
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans16x16HT,
     ::testing::Values(
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 1, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 2, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 3, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 0, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 1, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 2, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 3, VPX_BITS_12),
         make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),

diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 4ee4ad4..08a5ff6 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc

@@ -531,14 +531,6 @@
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 1, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 2, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 3, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 0, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 1, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 2, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 3, VPX_BITS_12),
         make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
         make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),

diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index ba51309..96aaa23 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc

@@ -19,7 +19,7 @@
 #include "test/util.h"
 
 #include "./vpx_config.h"
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vpx/vpx_integer.h"
@@ -60,49 +60,49 @@
 void wrapper_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh,
                               int count, int bd) {
-  vp9_highbd_lpf_vertical_16_sse2(s, p, blimit, limit, thresh, bd);
+  vpx_highbd_lpf_vertical_16_sse2(s, p, blimit, limit, thresh, bd);
 }
 
 void wrapper_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh,
                            int count, int bd) {
-  vp9_highbd_lpf_vertical_16_c(s, p, blimit, limit, thresh, bd);
+  vpx_highbd_lpf_vertical_16_c(s, p, blimit, limit, thresh, bd);
 }
 
 void wrapper_vertical_16_dual_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int count, int bd) {
-  vp9_highbd_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh, bd);
+  vpx_highbd_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh, bd);
 }
 
 void wrapper_vertical_16_dual_c(uint16_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int count, int bd) {
-  vp9_highbd_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh, bd);
+  vpx_highbd_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh, bd);
 }
 #else
 void wrapper_vertical_16_sse2(uint8_t *s, int p, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh,
                               int count) {
-  vp9_lpf_vertical_16_sse2(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_sse2(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh,
                            int count) {
-  vp9_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_dual_sse2(uint8_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int count) {
-  vp9_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int count) {
-  vp9_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2
@@ -114,25 +114,25 @@
 void wrapper_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh,
                               int count) {
-  vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh,
                            int count) {
-  vp9_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int count) {
-  vp9_lpf_vertical_16_dual_neon(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dual_neon(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int count) {
-  vp9_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON_ASM
@@ -141,13 +141,13 @@
 void wrapper_vertical_16_msa(uint8_t *s, int p, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh,
                              int count) {
-  vp9_lpf_vertical_16_msa(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_msa(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh,
                            int count) {
-  vp9_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
 }
 #endif  // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
 
@@ -534,46 +534,46 @@
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
-                   &vp9_highbd_lpf_horizontal_4_c, 8, 1),
-        make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
-                   &vp9_highbd_lpf_vertical_4_c, 8, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
-                   &vp9_highbd_lpf_horizontal_8_c, 8, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 8, 2),
-        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
-                   &vp9_highbd_lpf_vertical_8_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
+                   &vpx_highbd_lpf_horizontal_4_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
+                   &vpx_highbd_lpf_vertical_4_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
+                   &vpx_highbd_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
+                   &vpx_highbd_lpf_vertical_8_c, 8, 1),
         make_tuple(&wrapper_vertical_16_sse2,
                    &wrapper_vertical_16_c, 8, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
-                   &vp9_highbd_lpf_horizontal_4_c, 10, 1),
-        make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
-                   &vp9_highbd_lpf_vertical_4_c, 10, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
-                   &vp9_highbd_lpf_horizontal_8_c, 10, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 10, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 10, 2),
-        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
-                   &vp9_highbd_lpf_vertical_8_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
+                   &vpx_highbd_lpf_horizontal_4_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
+                   &vpx_highbd_lpf_vertical_4_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
+                   &vpx_highbd_lpf_horizontal_8_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 10, 2),
+        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
+                   &vpx_highbd_lpf_vertical_8_c, 10, 1),
         make_tuple(&wrapper_vertical_16_sse2,
                    &wrapper_vertical_16_c, 10, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
-                   &vp9_highbd_lpf_horizontal_4_c, 12, 1),
-        make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
-                   &vp9_highbd_lpf_vertical_4_c, 12, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
-                   &vp9_highbd_lpf_horizontal_8_c, 12, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 12, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 12, 2),
-        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
-                   &vp9_highbd_lpf_vertical_8_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
+                   &vpx_highbd_lpf_horizontal_4_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
+                   &vpx_highbd_lpf_vertical_4_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
+                   &vpx_highbd_lpf_horizontal_8_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 12, 2),
+        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
+                   &vpx_highbd_lpf_vertical_8_c, 12, 1),
         make_tuple(&wrapper_vertical_16_sse2,
                    &wrapper_vertical_16_c, 12, 1),
         make_tuple(&wrapper_vertical_16_dual_sse2,
@@ -586,10 +586,10 @@
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vp9_lpf_horizontal_8_sse2, &vp9_lpf_horizontal_8_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8, 2),
-        make_tuple(&vp9_lpf_vertical_8_sse2, &vp9_lpf_vertical_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_8_sse2, &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_lpf_vertical_8_sse2, &vpx_lpf_vertical_8_c, 8, 1),
         make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8, 1)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif
@@ -598,8 +598,8 @@
 INSTANTIATE_TEST_CASE_P(
     AVX2, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vp9_lpf_horizontal_16_avx2, &vp9_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_16_avx2, &vp9_lpf_horizontal_16_c, 8,
+        make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8,
                    2)));
 #endif
 
@@ -608,42 +608,42 @@
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test9Param,
     ::testing::Values(
-        make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_4_dual_c, 8),
-        make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_8_dual_c, 8),
-        make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
-                   &vp9_highbd_lpf_vertical_4_dual_c, 8),
-        make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
-                   &vp9_highbd_lpf_vertical_8_dual_c, 8),
-        make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_4_dual_c, 10),
-        make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_8_dual_c, 10),
-        make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
-                   &vp9_highbd_lpf_vertical_4_dual_c, 10),
-        make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
-                   &vp9_highbd_lpf_vertical_8_dual_c, 10),
-        make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_4_dual_c, 12),
-        make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_8_dual_c, 12),
-        make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
-                   &vp9_highbd_lpf_vertical_4_dual_c, 12),
-        make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
-                   &vp9_highbd_lpf_vertical_8_dual_c, 12)));
+        make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_vertical_4_dual_sse2,
+                   &vpx_highbd_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2,
+                   &vpx_highbd_lpf_vertical_8_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_4_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_8_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_vertical_4_dual_sse2,
+                   &vpx_highbd_lpf_vertical_4_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2,
+                   &vpx_highbd_lpf_vertical_8_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_4_dual_c, 12),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_8_dual_c, 12),
+        make_tuple(&vpx_highbd_lpf_vertical_4_dual_sse2,
+                   &vpx_highbd_lpf_vertical_4_dual_c, 12),
+        make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2,
+                   &vpx_highbd_lpf_vertical_8_dual_c, 12)));
 #else
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test9Param,
     ::testing::Values(
-        make_tuple(&vp9_lpf_horizontal_4_dual_sse2,
-                   &vp9_lpf_horizontal_4_dual_c, 8),
-        make_tuple(&vp9_lpf_horizontal_8_dual_sse2,
-                   &vp9_lpf_horizontal_8_dual_c, 8),
-        make_tuple(&vp9_lpf_vertical_4_dual_sse2,
-                   &vp9_lpf_vertical_4_dual_c, 8),
-        make_tuple(&vp9_lpf_vertical_8_dual_sse2,
-                   &vp9_lpf_vertical_8_dual_c, 8)));
+        make_tuple(&vpx_lpf_horizontal_4_dual_sse2,
+                   &vpx_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_dual_sse2,
+                   &vpx_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_dual_sse2,
+                   &vpx_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_dual_sse2,
+                   &vpx_lpf_vertical_8_dual_c, 8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif
 
@@ -657,36 +657,36 @@
 #if HAVE_NEON_ASM
 // Using #if inside the macro is unsupported on MSVS but the tests are not
 // currently built for MSVS with ARM and NEON.
-        make_tuple(&vp9_lpf_horizontal_16_neon,
-                   &vp9_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_16_neon,
-                   &vp9_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_lpf_horizontal_16_neon,
+                   &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_neon,
+                   &vpx_lpf_horizontal_16_c, 8, 2),
         make_tuple(&wrapper_vertical_16_neon,
                    &wrapper_vertical_16_c, 8, 1),
         make_tuple(&wrapper_vertical_16_dual_neon,
                    &wrapper_vertical_16_dual_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_8_neon,
-                   &vp9_lpf_horizontal_8_c, 8, 1),
-        make_tuple(&vp9_lpf_vertical_8_neon,
-                   &vp9_lpf_vertical_8_c, 8, 1),
 #endif  // HAVE_NEON_ASM
-        make_tuple(&vp9_lpf_horizontal_4_neon,
-                   &vp9_lpf_horizontal_4_c, 8, 1),
-        make_tuple(&vp9_lpf_vertical_4_neon,
-                   &vp9_lpf_vertical_4_c, 8, 1)));
+        make_tuple(&vpx_lpf_horizontal_8_neon,
+                   &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_lpf_vertical_8_neon,
+                   &vpx_lpf_vertical_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_4_neon,
+                   &vpx_lpf_horizontal_4_c, 8, 1),
+        make_tuple(&vpx_lpf_vertical_4_neon,
+                   &vpx_lpf_vertical_4_c, 8, 1)));
 INSTANTIATE_TEST_CASE_P(
     NEON, Loop8Test9Param,
     ::testing::Values(
 #if HAVE_NEON_ASM
-        make_tuple(&vp9_lpf_horizontal_8_dual_neon,
-                   &vp9_lpf_horizontal_8_dual_c, 8),
-        make_tuple(&vp9_lpf_vertical_8_dual_neon,
-                   &vp9_lpf_vertical_8_dual_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_dual_neon,
+                   &vpx_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_dual_neon,
+                   &vpx_lpf_vertical_8_dual_c, 8),
 #endif  // HAVE_NEON_ASM
-        make_tuple(&vp9_lpf_horizontal_4_dual_neon,
-                   &vp9_lpf_horizontal_4_dual_c, 8),
-        make_tuple(&vp9_lpf_vertical_4_dual_neon,
-                   &vp9_lpf_vertical_4_dual_c, 8)));
+        make_tuple(&vpx_lpf_horizontal_4_dual_neon,
+                   &vpx_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_dual_neon,
+                   &vpx_lpf_vertical_4_dual_c, 8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON
 
@@ -694,23 +694,23 @@
 INSTANTIATE_TEST_CASE_P(
     MSA, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vp9_lpf_horizontal_8_msa, &vp9_lpf_horizontal_8_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_16_msa, &vp9_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_16_msa, &vp9_lpf_horizontal_16_c, 8, 2),
-        make_tuple(&vp9_lpf_vertical_8_msa, &vp9_lpf_vertical_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_8_msa, &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_lpf_vertical_8_msa, &vpx_lpf_vertical_8_c, 8, 1),
         make_tuple(&wrapper_vertical_16_msa, &wrapper_vertical_16_c, 8, 1)));
 
 INSTANTIATE_TEST_CASE_P(
     MSA, Loop8Test9Param,
     ::testing::Values(
-        make_tuple(&vp9_lpf_horizontal_4_dual_msa,
-                   &vp9_lpf_horizontal_4_dual_c, 8),
-        make_tuple(&vp9_lpf_horizontal_8_dual_msa,
-                   &vp9_lpf_horizontal_8_dual_c, 8),
-        make_tuple(&vp9_lpf_vertical_4_dual_msa,
-                   &vp9_lpf_vertical_4_dual_c, 8),
-        make_tuple(&vp9_lpf_vertical_8_dual_msa,
-                   &vp9_lpf_vertical_8_dual_c, 8)));
+        make_tuple(&vpx_lpf_horizontal_4_dual_msa,
+                   &vpx_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_dual_msa,
+                   &vpx_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_dual_msa,
+                   &vpx_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_dual_msa,
+                   &vpx_lpf_vertical_8_dual_c, 8)));
 #endif  // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
 
 }  // namespace

diff --git a/test/test.mk b/test/test.mk
index a8a365e..8ecc856 100644
--- a/test/test.mk
+++ b/test/test.mk

@@ -91,6 +91,7 @@
 ## shared library builds don't make these functions accessible.
 ##
 ifeq ($(CONFIG_SHARED),)
+LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += lpf_8_test.cc
 
 ## VP8
 ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
@@ -142,7 +143,6 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += lpf_8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc

diff --git a/test/vp9_boolcoder_test.cc b/test/vp9_boolcoder_test.cc
index c7f0cd8..b0431d8 100644
--- a/test/vp9_boolcoder_test.cc
+++ b/test/vp9_boolcoder_test.cc

@@ -14,11 +14,11 @@
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
-#include "vp9/decoder/vp9_reader.h"
-#include "vp9/encoder/vp9_writer.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/bitreader.h"
+#include "vpx_dsp/bitwriter.h"
 
 #include "test/acm_random.h"
-#include "vpx/vpx_integer.h"
 
 using libvpx_test::ACMRandom;
 

diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index 943c00b..0e09652 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc

@@ -19,7 +19,7 @@
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "./vpx_config.h"
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"

diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h
index 6ebea9f..cd07a56 100644
--- a/vp9/common/mips/dspr2/vp9_common_dspr2.h
+++ b/vp9/common/mips/dspr2/vp9_common_dspr2.h

@@ -15,14 +15,13 @@
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_common.h"
+#include "vpx_dsp/mips/common_dspr2.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 #if HAVE_DSPR2
-#define CROP_WIDTH 512
 extern uint8_t *vp9_ff_cropTbl;
 
 #define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input)                    ({   \
@@ -51,40 +50,6 @@
    );                                                                          \
   out;                                                                    })
 
-static INLINE void vp9_prefetch_load(const unsigned char *src) {
-  __asm__ __volatile__ (
-      "pref   0,  0(%[src])   \n\t"
-      :
-      : [src] "r" (src)
-  );
-}
-
-/* prefetch data for store */
-static INLINE void vp9_prefetch_store(unsigned char *dst) {
-  __asm__ __volatile__ (
-      "pref   1,  0(%[dst])   \n\t"
-      :
-      : [dst] "r" (dst)
-  );
-}
-
-static INLINE void vp9_prefetch_load_streamed(const unsigned char *src) {
-  __asm__ __volatile__ (
-      "pref   4,  0(%[src])   \n\t"
-      :
-      : [src] "r" (src)
-  );
-}
-
-/* prefetch data for store */
-static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) {
-  __asm__ __volatile__ (
-      "pref   5,  0(%[dst])   \n\t"
-      :
-      : [dst] "r" (dst)
-  );
-}
-
 void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
                                    int dest_stride);
 

diff --git a/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
index 91d62bc..aad7c45 100644
--- a/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c

@@ -44,7 +44,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < w; x += 4) {
       src_ptr = src + x;
@@ -148,8 +148,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
-    vp9_prefetch_store(dst + dst_stride + 32);
+    prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride + 32);
 
     for (x = 0; x < 64; x += 4) {
       src_ptr = src + x;
@@ -245,7 +245,7 @@
       : [pos] "r" (pos)
     );
 
-    vp9_prefetch_store(dst);
+    prefetch_store(dst);
 
     switch (w) {
       case 4:
@@ -257,7 +257,7 @@
                                      filter_y, w, h);
         break;
       case 64:
-        vp9_prefetch_store(dst + 32);
+        prefetch_store(dst + 32);
         convolve_bi_avg_vert_64_dspr2(src, src_stride,
                                       dst, dst_stride,
                                       filter_y, h);

diff --git a/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
index 148b20f..bc60e93 100644
--- a/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c

@@ -40,9 +40,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -135,9 +135,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -290,9 +290,9 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_store(dst_ptr + dst_stride);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
 
     for (c = 0; c < count; c++) {
       __asm__ __volatile__ (
@@ -539,11 +539,11 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
-    vp9_prefetch_store(dst_ptr + dst_stride);
-    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
 
     for (c = 0; c < 4; c++) {
       __asm__ __volatile__ (
@@ -781,9 +781,9 @@
     );
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src);
-    vp9_prefetch_load(src + 32);
-    vp9_prefetch_store(dst);
+    prefetch_load(src);
+    prefetch_load(src + 32);
+    prefetch_store(dst);
 
     switch (w) {
       case 4:
@@ -807,8 +807,8 @@
                                       filter_x, h, 2);
         break;
       case 64:
-        vp9_prefetch_load(src + 64);
-        vp9_prefetch_store(dst + 32);
+        prefetch_load(src + 64);
+        prefetch_store(dst + 32);
 
         convolve_bi_avg_horiz_64_dspr2(src, src_stride,
                                       dst, dst_stride,

diff --git a/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
index 92644f2..b714f9a 100644
--- a/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c

@@ -41,8 +41,8 @@
   for (y = h; y--;) {
     dst_ptr = dst;
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -132,8 +132,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
 
     dst_ptr = dst;
     odd_dst = (dst_ptr + dst_stride);
@@ -272,8 +272,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
 
     src = src_ptr;
     dst = dst_ptr;
@@ -504,9 +504,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
 
     src = src_ptr;
     dst = dst_ptr;
@@ -747,8 +747,8 @@
   );
 
   /* prefetch data to cache memory */
-  vp9_prefetch_load(src);
-  vp9_prefetch_load(src + 32);
+  prefetch_load(src);
+  prefetch_load(src + 32);
 
   switch (w) {
     case 4:
@@ -769,7 +769,7 @@
                                             (w/16));
       break;
     case 64:
-      vp9_prefetch_load(src + 32);
+      prefetch_load(src + 32);
       convolve_bi_horiz_64_transposed_dspr2(src, src_stride,
                                             dst, dst_stride,
                                             filter, h);

diff --git a/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
index 1debdb4..27ea100 100644
--- a/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c

@@ -39,9 +39,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],      0(%[src])                      \n\t"
@@ -122,9 +122,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],      0(%[src])                      \n\t"
@@ -252,9 +252,9 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_store(dst_ptr + dst_stride);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
 
     for (c = 0; c < count; c++) {
       __asm__ __volatile__ (
@@ -459,11 +459,11 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
-    vp9_prefetch_store(dst_ptr + dst_stride);
-    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
 
     for (c = 0; c < 4; c++) {
       __asm__ __volatile__ (
@@ -651,7 +651,7 @@
   if (16 == x_step_q4) {
     uint32_t pos = 38;
 
-    vp9_prefetch_load((const uint8_t *)filter_x);
+    prefetch_load((const uint8_t *)filter_x);
 
     /* bit positon for extract from acc */
     __asm__ __volatile__ (
@@ -661,9 +661,9 @@
     );
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src);
-    vp9_prefetch_load(src + 32);
-    vp9_prefetch_store(dst);
+    prefetch_load(src);
+    prefetch_load(src + 32);
+    prefetch_store(dst);
 
     switch (w) {
       case 4:
@@ -687,8 +687,8 @@
                                    filter_x, (int32_t)h, 2);
         break;
       case 64:
-        vp9_prefetch_load(src + 64);
-        vp9_prefetch_store(dst + 32);
+        prefetch_load(src + 64);
+        prefetch_store(dst + 32);
 
         convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride,
                                    dst, (int32_t)dst_stride,

diff --git a/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
index bf01f11..32f5fb6 100644
--- a/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c

@@ -44,7 +44,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < w; x += 4) {
       src_ptr = src + x;
@@ -141,7 +141,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < 64; x += 4) {
       src_ptr = src + x;
@@ -230,7 +230,7 @@
       : [pos] "r" (pos)
     );
 
-    vp9_prefetch_store(dst);
+    prefetch_store(dst);
 
     switch (w) {
       case 4 :
@@ -242,7 +242,7 @@
                                  filter_y, w, h);
         break;
       case 64 :
-        vp9_prefetch_store(dst + 32);
+        prefetch_store(dst + 32);
         convolve_bi_vert_64_dspr2(src, src_stride,
                                   dst, dst_stride,
                                   filter_y, h);

diff --git a/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
index 1742279..d9cbfe6 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c

@@ -49,7 +49,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < w; x += 4) {
       src_ptr = src + x;
@@ -210,8 +210,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
-    vp9_prefetch_store(dst + dst_stride + 32);
+    prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride + 32);
 
     for (x = 0; x < 64; x += 4) {
       src_ptr = src + x;
@@ -372,7 +372,7 @@
         : [pos] "r" (pos)
       );
 
-      vp9_prefetch_store(dst);
+      prefetch_store(dst);
 
       switch (w) {
         case 4:
@@ -384,7 +384,7 @@
                                     filter_y, w, h);
           break;
         case 64:
-          vp9_prefetch_store(dst + 32);
+          prefetch_store(dst + 32);
           convolve_avg_vert_64_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_y, h);
@@ -452,17 +452,17 @@
   uint32_t tp3, tp4, tn2;
 
   /* prefetch data to cache memory */
-  vp9_prefetch_load(src);
-  vp9_prefetch_load(src + 32);
-  vp9_prefetch_store(dst);
+  prefetch_load(src);
+  prefetch_load(src + 32);
+  prefetch_store(dst);
 
   switch (w) {
     case 4:
       /* 1 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -482,9 +482,9 @@
     case 8:
       /* 2 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -509,9 +509,9 @@
     case 16:
       /* 4 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -544,9 +544,9 @@
     case 32:
       /* 8 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -593,16 +593,16 @@
       }
       break;
     case 64:
-      vp9_prefetch_load(src + 64);
-      vp9_prefetch_store(dst + 32);
+      prefetch_load(src + 64);
+      prefetch_store(dst + 32);
 
       /* 16 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_load(src + src_stride + 64);
-        vp9_prefetch_store(dst + dst_stride);
-        vp9_prefetch_store(dst + dst_stride + 32);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_load(src + src_stride + 64);
+        prefetch_store(dst + dst_stride);
+        prefetch_store(dst + dst_stride + 32);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"

diff --git a/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
index 69da1cf..cdb8312 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c

@@ -43,9 +43,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -165,9 +165,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -357,9 +357,9 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_store(dst_ptr + dst_stride);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
 
     for (c = 0; c < count; c++) {
       __asm__ __volatile__ (
@@ -668,11 +668,11 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
-    vp9_prefetch_store(dst_ptr + dst_stride);
-    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
 
     for (c = 0; c < 4; c++) {
       __asm__ __volatile__ (
@@ -985,9 +985,9 @@
       );
 
       /* prefetch data to cache memory */
-      vp9_prefetch_load(src);
-      vp9_prefetch_load(src + 32);
-      vp9_prefetch_store(dst);
+      prefetch_load(src);
+      prefetch_load(src + 32);
+      prefetch_store(dst);
 
       switch (w) {
         case 4:
@@ -1011,8 +1011,8 @@
                                       filter_x, h, 2);
           break;
         case 64:
-          vp9_prefetch_load(src + 64);
-          vp9_prefetch_store(dst + 32);
+          prefetch_load(src + 64);
+          prefetch_store(dst + 32);
 
           convolve_avg_horiz_64_dspr2(src, src_stride,
                                       dst, dst_stride,

diff --git a/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
index 58b50d2..a1309d1 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c

@@ -60,8 +60,8 @@
   for (y = h; y--;) {
     dst_ptr = dst;
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -176,8 +176,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
 
     dst_ptr = dst;
     odd_dst = (dst_ptr + dst_stride);
@@ -355,8 +355,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
 
     src = src_ptr;
     dst = dst_ptr;
@@ -645,9 +645,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
 
     src = src_ptr;
     dst = dst_ptr;
@@ -993,8 +993,8 @@
     src -= (src_stride * 3 + 3);
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src);
-    vp9_prefetch_load(src + 32);
+    prefetch_load(src);
+    prefetch_load(src + 32);
 
     switch (w) {
       case 4:
@@ -1015,7 +1015,7 @@
                                            (w/16));
         break;
       case 64:
-        vp9_prefetch_load(src + 32);
+        prefetch_load(src + 32);
         convolve_horiz_64_transposed_dspr2(src, src_stride,
                                            temp, intermediate_height,
                                            filter_x, intermediate_height);
@@ -1078,9 +1078,9 @@
   int x, y;
 
   /* prefetch data to cache memory */
-  vp9_prefetch_load(src);
-  vp9_prefetch_load(src + 32);
-  vp9_prefetch_store(dst);
+  prefetch_load(src);
+  prefetch_load(src + 32);
+  prefetch_store(dst);
 
   switch (w) {
     case 4:
@@ -1089,9 +1089,9 @@
 
       /* 1 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         (%[src])      \n\t"
@@ -1112,9 +1112,9 @@
 
       /* 2 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -1137,9 +1137,9 @@
 
       /* 4 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -1169,9 +1169,9 @@
 
       /* 8 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -1209,16 +1209,16 @@
       uint32_t tp1, tp2, tp3, tp4;
       uint32_t tp5, tp6, tp7, tp8;
 
-      vp9_prefetch_load(src + 64);
-      vp9_prefetch_store(dst + 32);
+      prefetch_load(src + 64);
+      prefetch_store(dst + 32);
 
       /* 16 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_load(src + src_stride + 64);
-        vp9_prefetch_store(dst + dst_stride);
-        vp9_prefetch_store(dst + dst_stride + 32);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_load(src + src_stride + 64);
+        prefetch_store(dst + dst_stride);
+        prefetch_store(dst + dst_stride + 32);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"

diff --git a/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
index 0303896..d0e3095 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c

@@ -43,9 +43,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],      0(%[src])                      \n\t"
@@ -154,9 +154,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],      0(%[src])                      \n\t"
@@ -323,9 +323,9 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_store(dst_ptr + dst_stride);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
 
     for (c = 0; c < count; c++) {
       __asm__ __volatile__ (
@@ -593,11 +593,11 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
-    vp9_prefetch_store(dst_ptr + dst_stride);
-    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
 
     for (c = 0; c < 4; c++) {
       __asm__ __volatile__ (
@@ -859,7 +859,7 @@
     if (16 == x_step_q4) {
       uint32_t pos = 38;
 
-      vp9_prefetch_load((const uint8_t *)filter_x);
+      prefetch_load((const uint8_t *)filter_x);
       src -= 3;
 
       /* bit positon for extract from acc */
@@ -870,9 +870,9 @@
       );
 
       /* prefetch data to cache memory */
-      vp9_prefetch_load(src);
-      vp9_prefetch_load(src + 32);
-      vp9_prefetch_store(dst);
+      prefetch_load(src);
+      prefetch_load(src + 32);
+      prefetch_store(dst);
 
       switch (w) {
         case 4:
@@ -896,8 +896,8 @@
                                   filter_x, (int32_t)h, 2);
           break;
         case 64:
-          vp9_prefetch_load(src + 64);
-          vp9_prefetch_store(dst + 32);
+          prefetch_load(src + 64);
+          prefetch_store(dst + 32);
 
           convolve_horiz_64_dspr2(src, (int32_t)src_stride,
                                   dst, (int32_t)dst_stride,

diff --git a/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
index 0930bb3..98acb81 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c

@@ -49,7 +49,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < w; x += 4) {
       src_ptr = src + x;
@@ -203,8 +203,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
-    vp9_prefetch_store(dst + dst_stride + 32);
+    prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride + 32);
 
     for (x = 0; x < 64; x += 4) {
       src_ptr = src + x;
@@ -358,7 +358,7 @@
         : [pos] "r" (pos)
       );
 
-      vp9_prefetch_store(dst);
+      prefetch_store(dst);
 
       switch (w) {
         case 4 :
@@ -370,7 +370,7 @@
                                 filter_y, w, h);
           break;
         case 64 :
-          vp9_prefetch_store(dst + 32);
+          prefetch_store(dst + 32);
           convolve_vert_64_dspr2(src, src_stride,
                                  dst, dst_stride,
                                  filter_y, h);

diff --git a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
index 202d913..10a24f3 100644
--- a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c

@@ -34,7 +34,7 @@
 
   for (i = no_rows; i--; ) {
     /* prefetch row */
-    vp9_prefetch_load((const uint8_t *)(input + 16));
+    prefetch_load((const uint8_t *)(input + 16));
 
     __asm__ __volatile__ (
         "lh       %[load1],              0(%[input])                    \n\t"
@@ -421,14 +421,14 @@
   uint8_t *cm = vp9_ff_cropTbl;
 
   /* prefetch vp9_ff_cropTbl */
-  vp9_prefetch_load(vp9_ff_cropTbl);
-  vp9_prefetch_load(vp9_ff_cropTbl +  32);
-  vp9_prefetch_load(vp9_ff_cropTbl +  64);
-  vp9_prefetch_load(vp9_ff_cropTbl +  96);
-  vp9_prefetch_load(vp9_ff_cropTbl + 128);
-  vp9_prefetch_load(vp9_ff_cropTbl + 160);
-  vp9_prefetch_load(vp9_ff_cropTbl + 192);
-  vp9_prefetch_load(vp9_ff_cropTbl + 224);
+  prefetch_load(vp9_ff_cropTbl);
+  prefetch_load(vp9_ff_cropTbl +  32);
+  prefetch_load(vp9_ff_cropTbl +  64);
+  prefetch_load(vp9_ff_cropTbl +  96);
+  prefetch_load(vp9_ff_cropTbl + 128);
+  prefetch_load(vp9_ff_cropTbl + 160);
+  prefetch_load(vp9_ff_cropTbl + 192);
+  prefetch_load(vp9_ff_cropTbl + 224);
 
   for (i = 0; i < 16; ++i) {
     dest_pix = (dest + i);
@@ -1124,7 +1124,7 @@
 
       for (i = 0; i < 16; ++i) {
         /* prefetch row */
-        vp9_prefetch_load((const uint8_t *)(input + 16));
+        prefetch_load((const uint8_t *)(input + 16));
 
         iadst16(input, outptr);
         input += 16;
@@ -1144,7 +1144,7 @@
 
       for (i = 0; i < 16; ++i) {
         /* prefetch row */
-        vp9_prefetch_load((const uint8_t *)(input + 16));
+        prefetch_load((const uint8_t *)(input + 16));
 
         iadst16(input, outptr);
         input += 16;

diff --git a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
index 7ceebb6..a256145 100644
--- a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c

@@ -44,14 +44,14 @@
   uint8_t *cm = vp9_ff_cropTbl;
 
   /* prefetch vp9_ff_cropTbl */
-  vp9_prefetch_load(vp9_ff_cropTbl);
-  vp9_prefetch_load(vp9_ff_cropTbl +  32);
-  vp9_prefetch_load(vp9_ff_cropTbl +  64);
-  vp9_prefetch_load(vp9_ff_cropTbl +  96);
-  vp9_prefetch_load(vp9_ff_cropTbl + 128);
-  vp9_prefetch_load(vp9_ff_cropTbl + 160);
-  vp9_prefetch_load(vp9_ff_cropTbl + 192);
-  vp9_prefetch_load(vp9_ff_cropTbl + 224);
+  prefetch_load(vp9_ff_cropTbl);
+  prefetch_load(vp9_ff_cropTbl +  32);
+  prefetch_load(vp9_ff_cropTbl +  64);
+  prefetch_load(vp9_ff_cropTbl +  96);
+  prefetch_load(vp9_ff_cropTbl + 128);
+  prefetch_load(vp9_ff_cropTbl + 160);
+  prefetch_load(vp9_ff_cropTbl + 192);
+  prefetch_load(vp9_ff_cropTbl + 224);
 
   for (i = 0; i < 32; ++i) {
     dest_pix = dest + i;

diff --git a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
index 74a90b0..dd18831 100644
--- a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c

@@ -96,8 +96,8 @@
     }
 
     /* prefetch row */
-    vp9_prefetch_load((const uint8_t *)(input + 32));
-    vp9_prefetch_load((const uint8_t *)(input + 48));
+    prefetch_load((const uint8_t *)(input + 32));
+    prefetch_load((const uint8_t *)(input + 48));
 
     __asm__ __volatile__ (
         "lh       %[load1],             2(%[input])                     \n\t"

diff --git a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
index 280190a..4e31f9f 100644
--- a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c

@@ -115,14 +115,14 @@
   uint8_t   *cm = vp9_ff_cropTbl;
 
   /* prefetch vp9_ff_cropTbl */
-  vp9_prefetch_load(vp9_ff_cropTbl);
-  vp9_prefetch_load(vp9_ff_cropTbl +  32);
-  vp9_prefetch_load(vp9_ff_cropTbl +  64);
-  vp9_prefetch_load(vp9_ff_cropTbl +  96);
-  vp9_prefetch_load(vp9_ff_cropTbl + 128);
-  vp9_prefetch_load(vp9_ff_cropTbl + 160);
-  vp9_prefetch_load(vp9_ff_cropTbl + 192);
-  vp9_prefetch_load(vp9_ff_cropTbl + 224);
+  prefetch_load(vp9_ff_cropTbl);
+  prefetch_load(vp9_ff_cropTbl +  32);
+  prefetch_load(vp9_ff_cropTbl +  64);
+  prefetch_load(vp9_ff_cropTbl +  96);
+  prefetch_load(vp9_ff_cropTbl + 128);
+  prefetch_load(vp9_ff_cropTbl + 160);
+  prefetch_load(vp9_ff_cropTbl + 192);
+  prefetch_load(vp9_ff_cropTbl + 224);
 
   for (i = 0; i < 4; ++i) {
       dest_pix = (dest + i);

diff --git a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
index 04d2266..6898d56 100644
--- a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c

@@ -211,14 +211,14 @@
   uint8_t *cm = vp9_ff_cropTbl;
 
   /* prefetch vp9_ff_cropTbl */
-  vp9_prefetch_load(vp9_ff_cropTbl);
-  vp9_prefetch_load(vp9_ff_cropTbl +  32);
-  vp9_prefetch_load(vp9_ff_cropTbl +  64);
-  vp9_prefetch_load(vp9_ff_cropTbl +  96);
-  vp9_prefetch_load(vp9_ff_cropTbl + 128);
-  vp9_prefetch_load(vp9_ff_cropTbl + 160);
-  vp9_prefetch_load(vp9_ff_cropTbl + 192);
-  vp9_prefetch_load(vp9_ff_cropTbl + 224);
+  prefetch_load(vp9_ff_cropTbl);
+  prefetch_load(vp9_ff_cropTbl +  32);
+  prefetch_load(vp9_ff_cropTbl +  64);
+  prefetch_load(vp9_ff_cropTbl +  96);
+  prefetch_load(vp9_ff_cropTbl + 128);
+  prefetch_load(vp9_ff_cropTbl + 160);
+  prefetch_load(vp9_ff_cropTbl + 192);
+  prefetch_load(vp9_ff_cropTbl + 224);
 
   for (i = 0; i < 8; ++i) {
       dest_pix = (dest + i);

diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 44dfb4d..42c3a09 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h

@@ -56,20 +56,6 @@
       return (uint16_t)clamp(val, 0, 4095);
   }
 }
-
-// Note:
-// tran_low_t  is the datatype used for final transform coefficients.
-// tran_high_t is the datatype used for intermediate transform stages.
-typedef int64_t tran_high_t;
-typedef int32_t tran_low_t;
-
-#else
-
-// Note:
-// tran_low_t  is the datatype used for final transform coefficients.
-// tran_high_t is the datatype used for intermediate transform stages.
-typedef int32_t tran_high_t;
-typedef int16_t tran_low_t;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_DEBUG

diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 2fc97c3..55d2176 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h

@@ -12,10 +12,10 @@
 #define VP9_COMMON_VP9_ENTROPY_H_
 
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/prob.h"
 
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_prob.h"
 
 #ifdef __cplusplus
 extern "C" {

diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h
index 637f451..a9c6dc4 100644
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h

@@ -14,8 +14,9 @@
 
 #include "./vpx_config.h"
 
+#include "vpx_dsp/prob.h"
+
 #include "vp9/common/vp9_mv.h"
-#include "vp9/common/vp9_prob.h"
 
 #ifdef __cplusplus
 extern "C" {

diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 9816728..0915918 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c

@@ -9,6 +9,7 @@
  */
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_reconinter.h"
@@ -326,55 +327,55 @@
     if (mask & 1) {
       if ((mask_16x16_0 | mask_16x16_1) & 1) {
         if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          vp9_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                    lfi0->hev_thr);
         } else if (mask_16x16_0 & 1) {
-          vp9_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
                               lfi0->hev_thr);
         } else {
-          vp9_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
+          vpx_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
                               lfi1->lim, lfi1->hev_thr);
         }
       }
 
       if ((mask_8x8_0 | mask_8x8_1) & 1) {
         if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          vp9_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                   lfi1->hev_thr);
         } else if (mask_8x8_0 & 1) {
-          vp9_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+          vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
                              1);
         } else {
-          vp9_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
+          vpx_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
                              lfi1->hev_thr, 1);
         }
       }
 
       if ((mask_4x4_0 | mask_4x4_1) & 1) {
         if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          vp9_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                   lfi1->hev_thr);
         } else if (mask_4x4_0 & 1) {
-          vp9_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+          vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
                              1);
         } else {
-          vp9_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
+          vpx_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
                              lfi1->hev_thr, 1);
         }
       }
 
       if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
         if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
-          vp9_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                   lfi1->hev_thr);
         } else if (mask_4x4_int_0 & 1) {
-          vp9_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
                              lfi0->hev_thr, 1);
         } else {
-          vp9_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,
+          vpx_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,
                              lfi1->hev_thr, 1);
         }
       }
@@ -426,55 +427,55 @@
     if (mask & 1) {
       if ((mask_16x16_0 | mask_16x16_1) & 1) {
         if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          vp9_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                           lfi0->hev_thr, bd);
         } else if (mask_16x16_0 & 1) {
-          vp9_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
                                      lfi0->hev_thr, bd);
         } else {
-          vp9_highbd_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
+          vpx_highbd_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
                                      lfi1->lim, lfi1->hev_thr, bd);
         }
       }
 
       if ((mask_8x8_0 | mask_8x8_1) & 1) {
         if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          vp9_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                          lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                          lfi1->hev_thr, bd);
         } else if (mask_8x8_0 & 1) {
-          vp9_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
                                     lfi0->hev_thr, 1, bd);
         } else {
-          vp9_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
+          vpx_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
                                     lfi1->lim, lfi1->hev_thr, 1, bd);
         }
       }
 
       if ((mask_4x4_0 | mask_4x4_1) & 1) {
         if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          vp9_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                          lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                          lfi1->hev_thr, bd);
         } else if (mask_4x4_0 & 1) {
-          vp9_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
                                     lfi0->hev_thr, 1, bd);
         } else {
-          vp9_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
+          vpx_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
                                     lfi1->lim, lfi1->hev_thr, 1, bd);
         }
       }
 
       if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
         if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
-          vp9_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
                                          lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                          lfi1->hev_thr, bd);
         } else if (mask_4x4_int_0 & 1) {
-          vp9_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
                                     lfi0->hev_thr, 1, bd);
         } else {
-          vp9_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
+          vpx_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
                                     lfi1->lim, lfi1->hev_thr, 1, bd);
         }
       }
@@ -512,11 +513,11 @@
     if (mask & 1) {
       if (mask_16x16 & 1) {
         if ((mask_16x16 & 3) == 3) {
-          vp9_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
                                 lfi->hev_thr, 2);
           count = 2;
         } else {
-          vp9_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
                                 lfi->hev_thr, 1);
         }
       } else if (mask_8x8 & 1) {
@@ -524,28 +525,28 @@
           // Next block's thresholds.
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
-          vp9_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+          vpx_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, lfin->mblim, lfin->lim,
                                     lfin->hev_thr);
 
           if ((mask_4x4_int & 3) == 3) {
-            vp9_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
                                       lfi->lim, lfi->hev_thr, lfin->mblim,
                                       lfin->lim, lfin->hev_thr);
           } else {
             if (mask_4x4_int & 1)
-              vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+              vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                    lfi->hev_thr, 1);
             else if (mask_4x4_int & 2)
-              vp9_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+              vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
                                    lfin->lim, lfin->hev_thr, 1);
           }
           count = 2;
         } else {
-          vp9_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+          vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
           if (mask_4x4_int & 1)
-            vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+            vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                  lfi->hev_thr, 1);
         }
       } else if (mask_4x4 & 1) {
@@ -553,31 +554,31 @@
           // Next block's thresholds.
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
-          vp9_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+          vpx_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, lfin->mblim, lfin->lim,
                                     lfin->hev_thr);
           if ((mask_4x4_int & 3) == 3) {
-            vp9_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
                                       lfi->lim, lfi->hev_thr, lfin->mblim,
                                       lfin->lim, lfin->hev_thr);
           } else {
             if (mask_4x4_int & 1)
-              vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+              vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                    lfi->hev_thr, 1);
             else if (mask_4x4_int & 2)
-              vp9_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+              vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
                                    lfin->lim, lfin->hev_thr, 1);
           }
           count = 2;
         } else {
-          vp9_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
           if (mask_4x4_int & 1)
-            vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+            vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                  lfi->hev_thr, 1);
         }
       } else if (mask_4x4_int & 1) {
-        vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+        vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                              lfi->hev_thr, 1);
       }
     }
@@ -609,11 +610,11 @@
     if (mask & 1) {
       if (mask_16x16 & 1) {
         if ((mask_16x16 & 3) == 3) {
-          vp9_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
                                        lfi->hev_thr, 2, bd);
           count = 2;
         } else {
-          vp9_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
                                        lfi->hev_thr, 1, bd);
         }
       } else if (mask_8x8 & 1) {
@@ -621,31 +622,31 @@
           // Next block's thresholds.
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
-          vp9_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+          vpx_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
                                            lfi->hev_thr, lfin->mblim, lfin->lim,
                                            lfin->hev_thr, bd);
 
           if ((mask_4x4_int & 3) == 3) {
-            vp9_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
                                              lfi->lim, lfi->hev_thr,
                                              lfin->mblim, lfin->lim,
                                              lfin->hev_thr, bd);
           } else {
             if (mask_4x4_int & 1) {
-              vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+              vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
                                           lfi->lim, lfi->hev_thr, 1, bd);
             } else if (mask_4x4_int & 2) {
-              vp9_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+              vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
                                           lfin->lim, lfin->hev_thr, 1, bd);
             }
           }
           count = 2;
         } else {
-          vp9_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim,
+          vpx_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim,
                                       lfi->hev_thr, 1, bd);
 
           if (mask_4x4_int & 1) {
-            vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
                                         lfi->lim, lfi->hev_thr, 1, bd);
           }
         }
@@ -654,35 +655,35 @@
           // Next block's thresholds.
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
-          vp9_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+          vpx_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                            lfi->hev_thr, lfin->mblim, lfin->lim,
                                            lfin->hev_thr, bd);
           if ((mask_4x4_int & 3) == 3) {
-            vp9_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
                                              lfi->lim, lfi->hev_thr,
                                              lfin->mblim, lfin->lim,
                                              lfin->hev_thr, bd);
           } else {
             if (mask_4x4_int & 1) {
-              vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+              vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
                                           lfi->lim, lfi->hev_thr, 1, bd);
             } else if (mask_4x4_int & 2) {
-              vp9_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+              vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
                                           lfin->lim, lfin->hev_thr, 1, bd);
             }
           }
           count = 2;
         } else {
-          vp9_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
+          vpx_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
                                       lfi->hev_thr, 1, bd);
 
           if (mask_4x4_int & 1) {
-            vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
                                         lfi->lim, lfi->hev_thr, 1, bd);
           }
         }
       } else if (mask_4x4_int & 1) {
-        vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+        vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, 1, bd);
       }
     }
@@ -1093,15 +1094,15 @@
 
     if (mask & 1) {
       if (mask_16x16 & 1) {
-        vp9_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+        vpx_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
       } else if (mask_8x8 & 1) {
-        vp9_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+        vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
       } else if (mask_4x4 & 1) {
-        vp9_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+        vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
       }
     }
     if (mask_4x4_int & 1)
-      vp9_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+      vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
     s += 8;
     lfl += 1;
     mask_16x16 >>= 1;
@@ -1127,18 +1128,18 @@
 
     if (mask & 1) {
       if (mask_16x16 & 1) {
-        vp9_highbd_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim,
+        vpx_highbd_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim,
                                    lfi->hev_thr, bd);
       } else if (mask_8x8 & 1) {
-        vp9_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim,
+        vpx_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim,
                                   lfi->hev_thr, 1, bd);
       } else if (mask_4x4 & 1) {
-        vp9_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim,
+        vpx_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim,
                                 lfi->hev_thr, 1, bd);
       }
     }
     if (mask_4x4_int & 1)
-      vp9_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim,
+      vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim,
                                 lfi->hev_thr, 1, bd);
     s += 8;
     lfl += 1;

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 1c1d75f..102f2a0 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -220,49 +220,6 @@
 specialize qw/vp9_dc_128_predictor_32x32 msa neon/, "$sse2_x86inc";
 
 #
-# Loopfilter
-#
-add_proto qw/void vp9_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vp9_lpf_vertical_16 sse2 neon_asm dspr2 msa/;
-$vp9_lpf_vertical_16_neon_asm=vp9_lpf_vertical_16_neon;
-
-add_proto qw/void vp9_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vp9_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/;
-$vp9_lpf_vertical_16_dual_neon_asm=vp9_lpf_vertical_16_dual_neon;
-
-add_proto qw/void vp9_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_vertical_8 sse2 neon_asm dspr2 msa/;
-$vp9_lpf_vertical_8_neon_asm=vp9_lpf_vertical_8_neon;
-
-add_proto qw/void vp9_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
-$vp9_lpf_vertical_8_dual_neon_asm=vp9_lpf_vertical_8_dual_neon;
-
-add_proto qw/void vp9_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_vertical_4 mmx neon dspr2 msa/;
-
-add_proto qw/void vp9_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_vertical_4_dual sse2 neon dspr2 msa/;
-
-add_proto qw/void vp9_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon_asm dspr2 msa/;
-$vp9_lpf_horizontal_16_neon_asm=vp9_lpf_horizontal_16_neon;
-
-add_proto qw/void vp9_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_8 sse2 neon_asm dspr2 msa/;
-$vp9_lpf_horizontal_8_neon_asm=vp9_lpf_horizontal_8_neon;
-
-add_proto qw/void vp9_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
-$vp9_lpf_horizontal_8_dual_neon_asm=vp9_lpf_horizontal_8_dual_neon;
-
-add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2 msa/;
-
-add_proto qw/void vp9_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
-
-#
 # post proc
 #
 if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
@@ -668,42 +625,6 @@
   specialize qw/vp9_highbd_convolve8_avg_vert/, "$sse2_x86_64";
 
   #
-  # Loopfilter
-  #
-  add_proto qw/void vp9_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/vp9_highbd_lpf_vertical_16 sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/vp9_highbd_lpf_vertical_16_dual sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
-  specialize qw/vp9_highbd_lpf_vertical_8 sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/vp9_highbd_lpf_vertical_8_dual sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
-  specialize qw/vp9_highbd_lpf_vertical_4 sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/vp9_highbd_lpf_vertical_4_dual sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
-  specialize qw/vp9_highbd_lpf_horizontal_16 sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
-  specialize qw/vp9_highbd_lpf_horizontal_8 sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/vp9_highbd_lpf_horizontal_8_dual sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
-  specialize qw/vp9_highbd_lpf_horizontal_4 sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/vp9_highbd_lpf_horizontal_4_dual sse2/;
-
-  #
   # post proc
   #
   if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
@@ -860,12 +781,6 @@
   add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_fp_32x32/;
 
-  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_quantize_b/;
-
-  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_quantize_b_32x32/;
-
   add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_fdct8x8_quant/;
 } else {
@@ -881,12 +796,6 @@
   add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64_x86inc";
 
-  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_quantize_b sse2/, "$ssse3_x86_64_x86inc";
-
-  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64_x86inc";
-
   add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/;
 }
@@ -1014,12 +923,6 @@
   add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_highbd_quantize_fp_32x32/;
 
-  add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_highbd_quantize_b sse2/;
-
-  add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_highbd_quantize_b_32x32 sse2/;
-
   #
   # Structured Similarity (SSIM)
   #
@@ -1030,13 +933,13 @@
 
   # fdct functions
   add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_highbd_fht4x4 sse2/;
+  specialize qw/vp9_highbd_fht4x4/;
 
   add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_highbd_fht8x8 sse2/;
+  specialize qw/vp9_highbd_fht8x8/;
 
   add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_highbd_fht16x16 sse2/;
+  specialize qw/vp9_highbd_fht16x16/;
 
   add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp9_highbd_fwht4x4/;

diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h
index 95c9918..29b45c1 100644
--- a/vp9/common/vp9_seg_common.h
+++ b/vp9/common/vp9_seg_common.h

@@ -11,7 +11,7 @@
 #ifndef VP9_COMMON_VP9_SEG_COMMON_H_
 #define VP9_COMMON_VP9_SEG_COMMON_H_
 
-#include "vp9/common/vp9_prob.h"
+#include "vpx_dsp/prob.h"
 
 #ifdef __cplusplus
 extern "C" {

diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index b9983f4..fd662d5 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c

@@ -14,6 +14,8 @@
 #include "./vp9_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 
+#include "vpx_dsp/bitreader_buffer.h"
+#include "vpx_dsp/bitreader.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/mem_ops.h"
@@ -38,8 +40,6 @@
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_dsubexp.h"
-#include "vp9/decoder/vp9_read_bit_buffer.h"
-#include "vp9/decoder/vp9_reader.h"
 
 #define MAX_VP9_HEADER_SIZE 80
 

diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index d42a654..7b102ef 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c

@@ -21,7 +21,6 @@
 
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/decoder/vp9_decodeframe.h"
-#include "vp9/decoder/vp9_reader.h"
 
 static PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
   return (PREDICTION_MODE)vp9_read_tree(r, vp9_intra_mode_tree, p);

diff --git a/vp9/decoder/vp9_decodemv.h b/vp9/decoder/vp9_decodemv.h
index 53bac8c..01c2e29 100644
--- a/vp9/decoder/vp9_decodemv.h
+++ b/vp9/decoder/vp9_decodemv.h

@@ -11,8 +11,9 @@
 #ifndef VP9_DECODER_VP9_DECODEMV_H_
 #define VP9_DECODER_VP9_DECODEMV_H_
 
+#include "vpx_dsp/bitreader.h"
+
 #include "vp9/decoder/vp9_decoder.h"
-#include "vp9/decoder/vp9_reader.h"
 
 #ifdef __cplusplus
 extern "C" {

diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index af47f85..4eaf270 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h

@@ -14,6 +14,7 @@
 #include "./vpx_config.h"
 
 #include "vpx/vpx_codec.h"
+#include "vpx_dsp/bitreader.h"
 #include "vpx_scale/yv12config.h"
 #include "vpx_util/vpx_thread.h"
 
@@ -21,7 +22,6 @@
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_ppflags.h"
 #include "vp9/decoder/vp9_dthread.h"
-#include "vp9/decoder/vp9_reader.h"
 
 #ifdef __cplusplus
 extern "C" {

diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index cf0e48a..0902053 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h

@@ -12,8 +12,8 @@
 #ifndef VP9_DECODER_VP9_DETOKENIZE_H_
 #define VP9_DECODER_VP9_DETOKENIZE_H_
 
+#include "vpx_dsp/bitreader.h"
 #include "vp9/decoder/vp9_decoder.h"
-#include "vp9/decoder/vp9_reader.h"
 #include "vp9/common/vp9_scan.h"
 
 #ifdef __cplusplus

diff --git a/vp9/decoder/vp9_dsubexp.h b/vp9/decoder/vp9_dsubexp.h
index 436f434..dd00d21 100644
--- a/vp9/decoder/vp9_dsubexp.h
+++ b/vp9/decoder/vp9_dsubexp.h

@@ -12,7 +12,7 @@
 #ifndef VP9_DECODER_VP9_DSUBEXP_H_
 #define VP9_DECODER_VP9_DSUBEXP_H_
 
-#include "vp9/decoder/vp9_reader.h"
+#include "vpx_dsp/bitreader.h"
 
 #ifdef __cplusplus
 extern "C" {

diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 4ca4083..01e5e13 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c

@@ -13,6 +13,7 @@
 #include <limits.h>
 
 #include "vpx/vpx_encoder.h"
+#include "vpx_dsp/bitwriter_buffer.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem_ops.h"
 
@@ -32,7 +33,6 @@
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_subexp.h"
 #include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_write_bit_buffer.h"
 
 static const struct vp9_token intra_mode_encodings[INTRA_MODES] = {
   {0, 1}, {6, 3}, {28, 5}, {30, 5}, {58, 6}, {59, 6}, {126, 7}, {127, 7},

diff --git a/vp9/encoder/vp9_cost.c b/vp9/encoder/vp9_cost.c
index 1c3c3d2..01ee553 100644
--- a/vp9/encoder/vp9_cost.c
+++ b/vp9/encoder/vp9_cost.c

@@ -7,6 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <assert.h>
 
 #include "vp9/encoder/vp9_cost.h"
 

diff --git a/vp9/encoder/vp9_cost.h b/vp9/encoder/vp9_cost.h
index 6d2b940..375f21b 100644
--- a/vp9/encoder/vp9_cost.h
+++ b/vp9/encoder/vp9_cost.h

@@ -11,7 +11,7 @@
 #ifndef VP9_ENCODER_VP9_COST_H_
 #define VP9_ENCODER_VP9_COST_H_
 
-#include "vp9/common/vp9_prob.h"
+#include "vpx_dsp/prob.h"
 
 #ifdef __cplusplus
 extern "C" {

diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index f74cdd8..9c3c510 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c

@@ -13,6 +13,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 
+#include "vpx_dsp/quantize.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 
@@ -23,7 +24,6 @@
 #include "vp9/common/vp9_systemdependent.h"
 
 #include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rd.h"
 #include "vp9/encoder/vp9_tokenize.h"
 
@@ -823,7 +823,10 @@
         if (!x->skip_recode) {
           vpx_highbd_subtract_block(16, 16, src_diff, diff_stride,
                                     src, src_stride, dst, dst_stride, xd->bd);
-          vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
+          if (tx_type == DCT_DCT)
+            vp9_highbd_fdct16x16(src_diff, coeff, diff_stride);
+          else
+            vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
           vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                                 p->quant, p->quant_shift, qcoeff, dqcoeff,
                                 pd->dequant, eob,
@@ -845,7 +848,10 @@
         if (!x->skip_recode) {
           vpx_highbd_subtract_block(8, 8, src_diff, diff_stride,
                                     src, src_stride, dst, dst_stride, xd->bd);
-          vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
+          if (tx_type == DCT_DCT)
+            vp9_highbd_fdct8x8(src_diff, coeff, diff_stride);
+          else
+            vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
           vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
                                 p->quant, p->quant_shift, qcoeff, dqcoeff,
                                 pd->dequant, eob,

diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 8cfc8a3..781204d 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c

@@ -4307,7 +4307,7 @@
 #if CONFIG_INTERNAL_STATS
 
   if (oxcf->pass != 1) {
-    double samples;
+    double samples = 0.0;
     cpi->bytes += (int)(*size);
 
     if (cm->show_frame) {

diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 32c1f76..d53d95d 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c

@@ -9,7 +9,7 @@
  */
 
 #include <math.h>
-
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 
@@ -20,113 +20,6 @@
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rd.h"
 
-void vp9_quantize_dc(const tran_low_t *coeff_ptr,
-                     int n_coeffs, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant,
-                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  const int rc = 0;
-  const int coeff = coeff_ptr[rc];
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int tmp, eob = -1;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-    tmp = (tmp * quant) >> 16;
-    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
-    if (tmp)
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
-                            int n_coeffs, int skip_block,
-                            const int16_t *round_ptr, const int16_t quant,
-                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                            const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  int eob = -1;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    const int coeff = coeff_ptr[0];
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int64_t tmp = abs_coeff + round_ptr[0];
-    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16);
-    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
-    if (abs_qcoeff)
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-#endif
-
-void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  const int n_coeffs = 1024;
-  const int rc = 0;
-  const int coeff = coeff_ptr[rc];
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int tmp, eob = -1;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
-                INT16_MIN, INT16_MAX);
-    tmp = (tmp * quant) >> 15;
-    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
-    if (tmp)
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
-                                  int skip_block,
-                                  const int16_t *round_ptr,
-                                  const int16_t quant,
-                                  tran_low_t *qcoeff_ptr,
-                                  tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr,
-                                  uint16_t *eob_ptr) {
-  const int n_coeffs = 1024;
-  int eob = -1;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    const int coeff = coeff_ptr[0];
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
-    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);
-    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
-    if (abs_qcoeff)
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-#endif
-
 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        int skip_block,
                        const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -298,224 +191,6 @@
 }
 #endif
 
-void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                      int skip_block,
-                      const int16_t *zbin_ptr, const int16_t *round_ptr,
-                      const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
-                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                      const int16_t *dequant_ptr,
-                      uint16_t *eob_ptr,
-                      const int16_t *scan, const int16_t *iscan) {
-  int i, non_zero_count = (int)n_coeffs, eob = -1;
-  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
-  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = (int)n_coeffs - 1; i >= 0; i--) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-
-      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
-        non_zero_count--;
-      else
-        break;
-    }
-
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < non_zero_count; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-      if (abs_coeff >= zbins[rc != 0]) {
-        int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-        tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
-                  quant_shift_ptr[rc != 0]) >> 16;  // quantization
-        qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-
-        if (tmp)
-          eob = i;
-      }
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             int skip_block, const int16_t *zbin_ptr,
-                             const int16_t *round_ptr, const int16_t *quant_ptr,
-                             const int16_t *quant_shift_ptr,
-                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                             const int16_t *dequant_ptr,
-                             uint16_t *eob_ptr, const int16_t *scan,
-                             const int16_t *iscan) {
-  int i, non_zero_count = (int)n_coeffs, eob = -1;
-  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
-  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = (int)n_coeffs - 1; i >= 0; i--) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-
-      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
-        non_zero_count--;
-      else
-        break;
-    }
-
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < non_zero_count; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-      if (abs_coeff >= zbins[rc != 0]) {
-        const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
-        const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
-        const uint32_t abs_qcoeff =
-            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
-        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-        if (abs_qcoeff)
-          eob = i;
-      }
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-#endif
-
-void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            int skip_block,
-                            const int16_t *zbin_ptr, const int16_t *round_ptr,
-                            const int16_t *quant_ptr,
-                            const int16_t *quant_shift_ptr,
-                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                            const int16_t *dequant_ptr,
-                            uint16_t *eob_ptr,
-                            const int16_t *scan, const int16_t *iscan) {
-  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
-                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
-  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
-
-  int idx = 0;
-  int idx_arr[1024];
-  int i, eob = -1;
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-
-      // If the coefficient is out of the base ZBIN range, keep it for
-      // quantization.
-      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
-        idx_arr[idx++] = i;
-    }
-
-    // Quantization pass: only process the coefficients selected in
-    // pre-scan pass. Note: idx can be zero.
-    for (i = 0; i < idx; i++) {
-      const int rc = scan[idx_arr[i]];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      int tmp;
-      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-      abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
-      tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
-               quant_shift_ptr[rc != 0]) >> 15;
-
-      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-
-      if (tmp)
-        eob = idx_arr[i];
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
-                                   intptr_t n_coeffs, int skip_block,
-                                   const int16_t *zbin_ptr,
-                                   const int16_t *round_ptr,
-                                   const int16_t *quant_ptr,
-                                   const int16_t *quant_shift_ptr,
-                                   tran_low_t *qcoeff_ptr,
-                                   tran_low_t *dqcoeff_ptr,
-                                   const int16_t *dequant_ptr,
-                                   uint16_t *eob_ptr,
-                                   const int16_t *scan, const int16_t *iscan) {
-  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
-                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
-  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
-
-  int idx = 0;
-  int idx_arr[1024];
-  int i, eob = -1;
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-
-      // If the coefficient is out of the base ZBIN range, keep it for
-      // quantization.
-      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
-        idx_arr[idx++] = i;
-    }
-
-    // Quantization pass: only process the coefficients selected in
-    // pre-scan pass. Note: idx can be zero.
-    for (i = 0; i < idx; i++) {
-      const int rc = scan[idx_arr[i]];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      const int64_t tmp1 = abs_coeff
-                         + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
-      const uint32_t abs_qcoeff =
-          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
-      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-      if (abs_qcoeff)
-        eob = idx_arr[i];
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-#endif
-
 void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                 const int16_t *scan, const int16_t *iscan) {
   MACROBLOCKD *const xd = &x->e_mbd;

diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 55e5469..6132036 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h

@@ -37,34 +37,9 @@
   DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
 } QUANTS;
 
-void vp9_quantize_dc(const tran_low_t *coeff_ptr,
-                     int n_coeffs, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant_ptr,
-                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr);
-void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant_ptr,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr);
 void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                 const int16_t *scan, const int16_t *iscan);
 
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
-                            int n_coeffs, int skip_block,
-                            const int16_t *round_ptr, const int16_t quant_ptr,
-                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                            const int16_t dequant_ptr, uint16_t *eob_ptr);
-void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
-                                  int skip_block,
-                                  const int16_t *round_ptr,
-                                  const int16_t quant_ptr,
-                                  tran_low_t *qcoeff_ptr,
-                                  tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr,
-                                  uint16_t *eob_ptr);
-#endif
-
 struct VP9_COMP;
 struct VP9Common;
 

diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 8c6c633..2be2a64 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c

@@ -1810,13 +1810,12 @@
     cpi->resize_count = 0;
     return 0;
   }
-  // Resize based on average QP over some window.
+  // Resize based on average buffer underflow and QP over some window.
   // Ignore samples close to key frame, since QP is usually high after key.
   if (cpi->rc.frames_since_key > 2 * cpi->framerate) {
     const int window = (int)(5 * cpi->framerate);
     cpi->resize_avg_qp += cm->base_qindex;
-    if (cpi->rc.buffer_level < (int)(cpi->oxcf.drop_frames_water_mark *
-        rc->optimal_buffer_level / 100))
+    if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100))
       ++cpi->resize_buffer_underflow;
     ++cpi->resize_count;
     // Check for resize action every "window" frames.

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 09cff00..9eeed15 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -845,7 +845,10 @@
             int64_t unused;
             const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
             const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
-            vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type);
+            if (tx_type == DCT_DCT)
+              vp9_highbd_fdct4x4(src_diff, coeff, 8);
+            else
+              vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type);
             vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
             ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
                                  so->scan, so->neighbors,

diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c
index b345b16..98f5057 100644
--- a/vp9/encoder/vp9_subexp.c
+++ b/vp9/encoder/vp9_subexp.c

@@ -7,13 +7,12 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include "vpx_dsp/bitwriter.h"
 
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropy.h"
-
 #include "vp9/encoder/vp9_cost.h"
 #include "vp9/encoder/vp9_subexp.h"
-#include "vp9/encoder/vp9_writer.h"
 
 #define vp9_cost_upd256  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
 

diff --git a/vp9/encoder/vp9_subexp.h b/vp9/encoder/vp9_subexp.h
index 6fbb747..9776f61 100644
--- a/vp9/encoder/vp9_subexp.h
+++ b/vp9/encoder/vp9_subexp.h

@@ -16,7 +16,7 @@
 extern "C" {
 #endif
 
-#include "vp9/common/vp9_prob.h"
+#include "vpx_dsp/prob.h"
 
 struct vp9_writer;
 

diff --git a/vp9/encoder/vp9_treewriter.h b/vp9/encoder/vp9_treewriter.h
index 4a76d87..e46ae39 100644
--- a/vp9/encoder/vp9_treewriter.h
+++ b/vp9/encoder/vp9_treewriter.h

@@ -11,7 +11,7 @@
 #ifndef VP9_ENCODER_VP9_TREEWRITER_H_
 #define VP9_ENCODER_VP9_TREEWRITER_H_
 
-#include "vp9/encoder/vp9_writer.h"
+#include "vpx_dsp/bitwriter.h"
 
 #ifdef __cplusplus
 extern "C" {

diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index cff4fcb..9e13a68 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c

@@ -2266,108 +2266,6 @@
   store_output(&in1, output);
 }
 
-#if CONFIG_VP9_HIGHBITDEPTH
-/* These SSE2 versions of the FHT functions only actually use SSE2 in the
- * DCT_DCT case in all other cases, they revert to C code which is identical
- * to that used by the C versions of them.
- */
-
-void vp9_highbd_fht4x4_sse2(const int16_t *input, tran_low_t *output,
-                            int stride, int tx_type) {
-  if (tx_type == DCT_DCT) {
-    vp9_highbd_fdct4x4_sse2(input, output, stride);
-  } else {
-    tran_low_t out[4 * 4];
-    tran_low_t *outptr = &out[0];
-    int i, j;
-    tran_low_t temp_in[4], temp_out[4];
-    const transform_2d ht = FHT_4[tx_type];
-
-    // Columns
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j)
-        temp_in[j] = input[j * stride + i] * 16;
-      if (i == 0 && temp_in[0])
-        temp_in[0] += 1;
-      ht.cols(temp_in, temp_out);
-      for (j = 0; j < 4; ++j)
-        outptr[j * 4 + i] = temp_out[j];
-    }
-
-    // Rows
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j)
-        temp_in[j] = out[j + i * 4];
-      ht.rows(temp_in, temp_out);
-      for (j = 0; j < 4; ++j)
-        output[j + i * 4] = (temp_out[j] + 1) >> 2;
-    }
-  }
-}
-
-void vp9_highbd_fht8x8_sse2(const int16_t *input, tran_low_t *output,
-                            int stride, int tx_type) {
-  if (tx_type == DCT_DCT) {
-    vp9_highbd_fdct8x8_sse2(input, output, stride);
-  } else {
-    tran_low_t out[64];
-    tran_low_t *outptr = &out[0];
-    int i, j;
-    tran_low_t temp_in[8], temp_out[8];
-    const transform_2d ht = FHT_8[tx_type];
-
-    // Columns
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j)
-        temp_in[j] = input[j * stride + i] * 4;
-      ht.cols(temp_in, temp_out);
-      for (j = 0; j < 8; ++j)
-        outptr[j * 8 + i] = temp_out[j];
-    }
-
-    // Rows
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j)
-        temp_in[j] = out[j + i * 8];
-      ht.rows(temp_in, temp_out);
-      for (j = 0; j < 8; ++j)
-        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-    }
-  }
-}
-
-void vp9_highbd_fht16x16_sse2(const int16_t *input, tran_low_t *output,
-                              int stride, int tx_type) {
-  if (tx_type == DCT_DCT) {
-    vp9_highbd_fdct16x16_sse2(input, output, stride);
-  } else {
-    tran_low_t out[256];
-    tran_low_t *outptr = &out[0];
-    int i, j;
-    tran_low_t temp_in[16], temp_out[16];
-    const transform_2d ht = FHT_16[tx_type];
-
-    // Columns
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = input[j * stride + i] * 4;
-      ht.cols(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
-    }
-
-    // Rows
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = out[j + i * 16];
-      ht.rows(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        output[j + i * 16] = temp_out[j];
-    }
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 /*
  * The DCTnxn functions are defined using the macros below. The main code for
  * them is in separate files (vp9/encoder/x86/vp9_dct_sse2_impl.h &

diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c
index 71fdfd7..2071dfe 100644
--- a/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/vp9/encoder/x86/vp9_quantize_sse2.c

@@ -14,214 +14,6 @@
 #include "./vp9_rtcd.h"
 #include "vpx/vpx_integer.h"
 
-void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
-                         int skip_block, const int16_t* zbin_ptr,
-                         const int16_t* round_ptr, const int16_t* quant_ptr,
-                         const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
-                         int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
-                         uint16_t* eob_ptr,
-                         const int16_t* scan_ptr,
-                         const int16_t* iscan_ptr) {
-  __m128i zero;
-  (void)scan_ptr;
-
-  coeff_ptr += n_coeffs;
-  iscan_ptr += n_coeffs;
-  qcoeff_ptr += n_coeffs;
-  dqcoeff_ptr += n_coeffs;
-  n_coeffs = -n_coeffs;
-  zero = _mm_setzero_si128();
-  if (!skip_block) {
-    __m128i eob;
-    __m128i zbin;
-    __m128i round, quant, dequant, shift;
-    {
-      __m128i coeff0, coeff1;
-
-      // Setup global values
-      {
-        __m128i pw_1;
-        zbin = _mm_load_si128((const __m128i*)zbin_ptr);
-        round = _mm_load_si128((const __m128i*)round_ptr);
-        quant = _mm_load_si128((const __m128i*)quant_ptr);
-        pw_1 = _mm_set1_epi16(1);
-        zbin = _mm_sub_epi16(zbin, pw_1);
-        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
-        shift = _mm_load_si128((const __m128i*)quant_shift_ptr);
-      }
-
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-        __m128i cmp_mask0, cmp_mask1;
-        // Do DC and first 15 AC
-        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
-        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
-        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
-        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        round = _mm_unpackhi_epi64(round, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        quant = _mm_unpackhi_epi64(quant, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
-        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
-        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
-        shift = _mm_unpackhi_epi64(shift, shift);
-        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
-
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        // Mask out zbin threshold coeffs
-        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
-        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
-
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        dequant = _mm_unpackhi_epi64(dequant, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
-      }
-
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob = _mm_max_epi16(eob, eob1);
-      }
-      n_coeffs += 8 * 2;
-    }
-
-    // AC only loop
-    while (n_coeffs < 0) {
-      __m128i coeff0, coeff1;
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-        __m128i cmp_mask0, cmp_mask1;
-
-        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
-        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
-        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
-        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
-        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
-        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
-
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        // Mask out zbin threshold coeffs
-        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
-        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
-
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
-      }
-
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob0, eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob0 = _mm_max_epi16(eob0, eob1);
-        eob = _mm_max_epi16(eob, eob0);
-      }
-      n_coeffs += 8 * 2;
-    }
-
-    // Accumulate EOB
-    {
-      __m128i eob_shuffled;
-      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      *eob_ptr = _mm_extract_epi16(eob, 1);
-    }
-  } else {
-    do {
-      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
-      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
-      n_coeffs += 8 * 2;
-    } while (n_coeffs < 0);
-    *eob_ptr = 0;
-  }
-}
-
 void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
                           int skip_block, const int16_t* zbin_ptr,
                           const int16_t* round_ptr, const int16_t* quant_ptr,

diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
index 449d52b..ec2e87c 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

@@ -15,206 +15,6 @@
 
 SECTION .text
 
-; TODO(yunqingwang)fix quantize_b code for skip=1 case.
-%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
-                                shift, qcoeff, dqcoeff, dequant, \
-                                eob, scan, iscan
-  cmp                    dword skipm, 0
-  jne .blank
-
-  ; actual quantize loop - setup pointers, rounders, etc.
-  movifnidn                   coeffq, coeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, dequantmp
-  movifnidn                    zbinq, zbinmp
-  movifnidn                   roundq, roundmp
-  movifnidn                   quantq, quantmp
-  mova                            m0, [zbinq]              ; m0 = zbin
-  mova                            m1, [roundq]             ; m1 = round
-  mova                            m2, [quantq]             ; m2 = quant
-%ifidn %1, b_32x32
-  pcmpeqw                         m5, m5
-  psrlw                           m5, 15
-  paddw                           m0, m5
-  paddw                           m1, m5
-  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
-  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
-%endif
-  mova                            m3, [r2q]                ; m3 = dequant
-  psubw                           m0, [pw_1]
-  mov                             r2, shiftmp
-  mov                             r3, qcoeffmp
-  mova                            m4, [r2]                 ; m4 = shift
-  mov                             r4, dqcoeffmp
-  mov                             r5, iscanmp
-%ifidn %1, b_32x32
-  psllw                           m4, 1
-%endif
-  pxor                            m5, m5                   ; m5 = dedicated zero
-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
-  lea                         coeffq, [  coeffq+ncoeffq*2]
-  lea                         iscanq, [  iscanq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  neg                        ncoeffq
-
-  ; get DC and first 15 AC coeffs
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  punpckhqdq                      m0, m0
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                           m8, m6                   ; m8 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
-  punpckhqdq                      m4, m4
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                            m8, m7
-  pand                           m13, m12
-  mova        [qcoeffq+ncoeffq*2+ 0], m8
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%ifidn %1, b_32x32
-  pabsw                           m8, m8
-  pabsw                          m13, m13
-%endif
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                           m8, 1
-  psrlw                          m13, 1
-  psignw                          m8, m9
-  psignw                         m13, m10
-%endif
-  mova       [dqcoeffq+ncoeffq*2+ 0], m8
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m12                  ; m11 = scan[i] + 1
-  pandn                           m8, m6                   ; m8 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jz .accumulate_eob
-
-.ac_only_loop:
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-%ifidn %1, b_32x32
-  pmovmskb                       r6d, m7
-  pmovmskb                       r2d, m12
-  or                              r6, r2
-  jz .skip_iter
-%endif
-  paddsw                          m6, m1                   ; m6 += round
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                          m14, m6                   ; m14 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                         m14, m9                   ; m14 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                           m14, m7
-  pand                           m13, m12
-  mova        [qcoeffq+ncoeffq*2+ 0], m14
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%ifidn %1, b_32x32
-  pabsw                          m14, m14
-  pabsw                          m13, m13
-%endif
-  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                          m14, 1
-  psrlw                          m13, 1
-  psignw                         m14, m9
-  psignw                         m13, m10
-%endif
-  mova       [dqcoeffq+ncoeffq*2+ 0], m14
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m12                  ; m11 = scan[i] + 1
-  pandn                          m14, m6                   ; m14 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m14
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-
-%ifidn %1, b_32x32
-  jmp .accumulate_eob
-.skip_iter:
-  mova        [qcoeffq+ncoeffq*2+ 0], m5
-  mova        [qcoeffq+ncoeffq*2+16], m5
-  mova       [dqcoeffq+ncoeffq*2+ 0], m5
-  mova       [dqcoeffq+ncoeffq*2+16], m5
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-%endif
-
-.accumulate_eob:
-  ; horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  pextrw                          r6, m8, 0
-  mov                             [r2], r6
-  RET
-
-  ; skip-block, i.e. just write all zeroes
-.blank:
-  mov                             r0, dqcoeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, qcoeffmp
-  mov                             r3, eobmp
-  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  neg                        ncoeffq
-  pxor                            m7, m7
-.blank_loop:
-  mova       [dqcoeffq+ncoeffq*2+ 0], m7
-  mova       [dqcoeffq+ncoeffq*2+16], m7
-  mova        [qcoeffq+ncoeffq*2+ 0], m7
-  mova        [qcoeffq+ncoeffq*2+16], m7
-  add                        ncoeffq, mmsize
-  jl .blank_loop
-  mov                    word [eobq], 0
-  RET
-%endmacro
-
-INIT_XMM ssse3
-QUANTIZE_FN b, 7
-QUANTIZE_FN b_32x32, 7
-
 %macro QUANTIZE_FP 2
 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                 shift, qcoeff, dqcoeff, dequant, \

diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index bcab558..c8cf973 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk

@@ -38,8 +38,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_onyxc_int.h
 VP9_COMMON_SRCS-yes += common/vp9_pred_common.h
 VP9_COMMON_SRCS-yes += common/vp9_pred_common.c
-VP9_COMMON_SRCS-yes += common/vp9_prob.h
-VP9_COMMON_SRCS-yes += common/vp9_prob.c
 VP9_COMMON_SRCS-yes += common/vp9_quant_common.h
 VP9_COMMON_SRCS-yes += common/vp9_reconinter.h
 VP9_COMMON_SRCS-yes += common/vp9_reconintra.h
@@ -54,7 +52,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
 VP9_COMMON_SRCS-yes += common/vp9_tile_common.c
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c
-VP9_COMMON_SRCS-yes += common/vp9_loopfilter_filters.c
 VP9_COMMON_SRCS-yes += common/vp9_thread_common.c
 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c
 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h
@@ -69,14 +66,11 @@
 
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/convolve.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
-VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
@@ -95,7 +89,6 @@
 endif
 
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_loopfilter_intrin_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_8t_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_bilinear_sse2.asm
 ifeq ($(CONFIG_USE_X86INC),yes)
@@ -123,13 +116,6 @@
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans16_dspr2.c
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans32_cols_dspr2.c
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans32_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
 
 # common (msa)
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_horiz_msa.c
@@ -147,10 +133,6 @@
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct32x32_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct_msa.h
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_intra_predict_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_4_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_8_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_16_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_msa.h
 
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c
@@ -165,16 +147,12 @@
 endif
 endif
 
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_8_neon_asm$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
 
 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
 endif
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon.c
 
 # neon with assembly and intrinsics implementations. If both are available
 # prefer assembly.
@@ -193,7 +171,6 @@
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon_asm$(ASM)
 else
 ifeq ($(HAVE_NEON), yes)
@@ -211,11 +188,6 @@
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon.c
-# TODO(johannkoenig): re-enable when chromium build is fixed
-# # https://code.google.com/p/chromium/issues/detail?id=443839
-#VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_8_neon.c
 endif  # HAVE_NEON
 endif  # HAVE_NEON_ASM
 

diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 5b62c3ec..12d5d7c 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c

@@ -17,6 +17,7 @@
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
+#include "vpx_dsp/bitreader_buffer.h"
 #include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_alloccommon.h"
@@ -24,7 +25,6 @@
 
 #include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_decodeframe.h"
-#include "vp9/decoder/vp9_read_bit_buffer.h"
 
 #include "vp9/vp9_iface_common.h"
 

diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 94cc7ba..8836c1f 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk

@@ -37,10 +37,6 @@
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_fastssim.c
 VP9_CX_SRCS-yes += encoder/vp9_firstpass.c
 VP9_CX_SRCS-yes += encoder/vp9_block.h
-VP9_CX_SRCS-yes += encoder/vp9_writer.h
-VP9_CX_SRCS-yes += encoder/vp9_writer.c
-VP9_CX_SRCS-yes += encoder/vp9_write_bit_buffer.c
-VP9_CX_SRCS-yes += encoder/vp9_write_bit_buffer.h
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemb.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemv.h
@@ -104,7 +100,6 @@
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
 endif
 

diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index c105adb..0e9cf16 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk

@@ -21,10 +21,6 @@
 VP9_DX_SRCS-yes += decoder/vp9_decodeframe.c
 VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.c
-VP9_DX_SRCS-yes += decoder/vp9_reader.h
-VP9_DX_SRCS-yes += decoder/vp9_reader.c
-VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.c
-VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.h
 VP9_DX_SRCS-yes += decoder/vp9_decodemv.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.h
 VP9_DX_SRCS-yes += decoder/vp9_dthread.c

diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm b/vpx_dsp/arm/loopfilter_16_neon.asm
similarity index 94%
rename from vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm
rename to vpx_dsp/arm/loopfilter_16_neon.asm
index 5b8ec20..5a8fdd6 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm
+++ b/vpx_dsp/arm/loopfilter_16_neon.asm

@@ -8,12 +8,12 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_lpf_horizontal_4_dual_neon|
+    EXPORT  |vpx_lpf_horizontal_4_dual_neon|
     ARM
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-;void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
+;void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
 ;                                    const uint8_t *blimit0,
 ;                                    const uint8_t *limit0,
 ;                                    const uint8_t *thresh0,
@@ -29,7 +29,7 @@
 ; sp+8  const uint8_t *limit1,
 ; sp+12 const uint8_t *thresh1,
 
-|vp9_lpf_horizontal_4_dual_neon| PROC
+|vpx_lpf_horizontal_4_dual_neon| PROC
     push        {lr}
 
     ldr         r12, [sp, #4]              ; load thresh0
@@ -66,7 +66,7 @@
     sub         r2, r2, r1, lsl #1
     sub         r3, r3, r1, lsl #1
 
-    bl          vp9_loop_filter_neon_16
+    bl          vpx_loop_filter_neon_16
 
     vst1.u8     {q5}, [r2@64], r1          ; store op1
     vst1.u8     {q6}, [r3@64], r1          ; store op0
@@ -76,9 +76,9 @@
     vpop        {d8-d15}                   ; restore neon registers
 
     pop         {pc}
-    ENDP        ; |vp9_lpf_horizontal_4_dual_neon|
+    ENDP        ; |vpx_lpf_horizontal_4_dual_neon|
 
-; void vp9_loop_filter_neon_16();
+; void vpx_loop_filter_neon_16();
 ; This is a helper function for the loopfilters. The invidual functions do the
 ; necessary load, transpose (if necessary) and store. This function uses
 ; registers d8-d15, so the calling function must save those registers.
@@ -101,7 +101,7 @@
 ; q6    op0
 ; q7    oq0
 ; q8    oq1
-|vp9_loop_filter_neon_16| PROC
+|vpx_loop_filter_neon_16| PROC
 
     ; filter_mask
     vabd.u8     q11, q3, q4                 ; m1 = abs(p3 - p2)
@@ -194,6 +194,6 @@
     veor        q8, q12, q10                ; *oq1 = u^0x80
 
     bx          lr
-    ENDP        ; |vp9_loop_filter_neon_16|
+    ENDP        ; |vpx_loop_filter_neon_16|
 
     END

diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vpx_dsp/arm/loopfilter_16_neon.c
similarity index 93%
rename from vp9/common/arm/neon/vp9_loopfilter_16_neon.c
rename to vpx_dsp/arm/loopfilter_16_neon.c
index c69ee10..d24e6ad 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/vpx_dsp/arm/loopfilter_16_neon.c

@@ -10,11 +10,11 @@
 
 #include <arm_neon.h>
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
-static INLINE void vp9_loop_filter_neon_16(
+static INLINE void loop_filter_neon_16(
         uint8x16_t qblimit,  // blimit
         uint8x16_t qlimit,   // limit
         uint8x16_t qthresh,  // thresh
@@ -124,7 +124,7 @@
     return;
 }
 
-void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
+void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
@@ -163,9 +163,9 @@
     s += p;
     q10u8 = vld1q_u8(s);
 
-    vp9_loop_filter_neon_16(qblimit, qlimit, qthresh,
-                            q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8,
-                            &q5u8, &q6u8, &q7u8, &q8u8);
+    loop_filter_neon_16(qblimit, qlimit, qthresh,
+                        q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8,
+                        &q5u8, &q6u8, &q7u8, &q8u8);
 
     s -= (p * 5);
     vst1q_u8(s, q5u8);

diff --git a/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm b/vpx_dsp/arm/loopfilter_4_neon.asm
similarity index 91%
rename from vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm
rename to vpx_dsp/arm/loopfilter_4_neon.asm
index 7738e0d..e45e34c 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm
+++ b/vpx_dsp/arm/loopfilter_4_neon.asm

@@ -8,18 +8,18 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_lpf_horizontal_4_neon|
-    EXPORT  |vp9_lpf_vertical_4_neon|
+    EXPORT  |vpx_lpf_horizontal_4_neon|
+    EXPORT  |vpx_lpf_vertical_4_neon|
     ARM
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
 ; works on 16 iterations at a time.
 ; TODO(fgalligan): See about removing the count code as this function is only
 ; called with a count of 1.
 ;
-; void vp9_lpf_horizontal_4_neon(uint8_t *s,
+; void vpx_lpf_horizontal_4_neon(uint8_t *s,
 ;                                int p /* pitch */,
 ;                                const uint8_t *blimit,
 ;                                const uint8_t *limit,
@@ -32,7 +32,7 @@
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
 ; sp+4  int count
-|vp9_lpf_horizontal_4_neon| PROC
+|vpx_lpf_horizontal_4_neon| PROC
     push        {lr}
 
     vld1.8      {d0[]}, [r2]               ; duplicate *blimit
@@ -41,7 +41,7 @@
     add         r1, r1, r1                 ; double pitch
 
     cmp         r12, #0
-    beq         end_vp9_lf_h_edge
+    beq         end_vpx_lf_h_edge
 
     vld1.8      {d1[]}, [r3]               ; duplicate *limit
     vld1.8      {d2[]}, [r2]               ; duplicate *thresh
@@ -62,7 +62,7 @@
     sub         r2, r2, r1, lsl #1
     sub         r3, r3, r1, lsl #1
 
-    bl          vp9_loop_filter_neon
+    bl          vpx_loop_filter_neon
 
     vst1.u8     {d4}, [r2@64], r1          ; store op1
     vst1.u8     {d5}, [r3@64], r1          ; store op0
@@ -73,16 +73,16 @@
     subs        r12, r12, #1
     bne         count_lf_h_loop
 
-end_vp9_lf_h_edge
+end_vpx_lf_h_edge
     pop         {pc}
-    ENDP        ; |vp9_lpf_horizontal_4_neon|
+    ENDP        ; |vpx_lpf_horizontal_4_neon|
 
-; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
 ; works on 16 iterations at a time.
 ; TODO(fgalligan): See about removing the count code as this function is only
 ; called with a count of 1.
 ;
-; void vp9_lpf_vertical_4_neon(uint8_t *s,
+; void vpx_lpf_vertical_4_neon(uint8_t *s,
 ;                              int p /* pitch */,
 ;                              const uint8_t *blimit,
 ;                              const uint8_t *limit,
@@ -95,7 +95,7 @@
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
 ; sp+4  int count
-|vp9_lpf_vertical_4_neon| PROC
+|vpx_lpf_vertical_4_neon| PROC
     push        {lr}
 
     vld1.8      {d0[]}, [r2]              ; duplicate *blimit
@@ -105,7 +105,7 @@
     ldr         r3, [sp, #4]              ; load thresh
     sub         r2, r0, #4                ; move s pointer down by 4 columns
     cmp         r12, #0
-    beq         end_vp9_lf_v_edge
+    beq         end_vpx_lf_v_edge
 
     vld1.8      {d2[]}, [r3]              ; duplicate *thresh
 
@@ -135,7 +135,7 @@
     vtrn.8      d7, d16
     vtrn.8      d17, d18
 
-    bl          vp9_loop_filter_neon
+    bl          vpx_loop_filter_neon
 
     sub         r0, r0, #2
 
@@ -154,11 +154,11 @@
     subne       r2, r0, #4                 ; move s pointer down by 4 columns
     bne         count_lf_v_loop
 
-end_vp9_lf_v_edge
+end_vpx_lf_v_edge
     pop         {pc}
-    ENDP        ; |vp9_lpf_vertical_4_neon|
+    ENDP        ; |vpx_lpf_vertical_4_neon|
 
-; void vp9_loop_filter_neon();
+; void vpx_loop_filter_neon();
 ; This is a helper function for the loopfilters. The invidual functions do the
 ; necessary load, transpose (if necessary) and store. The function does not use
 ; registers d8-d15.
@@ -182,7 +182,7 @@
 ; d5    op0
 ; d6    oq0
 ; d7    oq1
-|vp9_loop_filter_neon| PROC
+|vpx_loop_filter_neon| PROC
     ; filter_mask
     vabd.u8     d19, d3, d4                 ; m1 = abs(p3 - p2)
     vabd.u8     d20, d4, d5                 ; m2 = abs(p2 - p1)
@@ -272,6 +272,6 @@
     veor        d7, d20, d18                ; *oq1 = u^0x80
 
     bx          lr
-    ENDP        ; |vp9_loop_filter_neon|
+    ENDP        ; |vpx_loop_filter_neon|
 
     END

diff --git a/vp9/common/arm/neon/vp9_loopfilter_4_neon.c b/vpx_dsp/arm/loopfilter_4_neon.c
similarity index 89%
rename from vp9/common/arm/neon/vp9_loopfilter_4_neon.c
rename to vpx_dsp/arm/loopfilter_4_neon.c
index fd9db61..7ad411a 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_4_neon.c
+++ b/vpx_dsp/arm/loopfilter_4_neon.c

@@ -10,9 +10,9 @@
 
 #include <arm_neon.h>
 
-#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
-static INLINE void vp9_loop_filter_neon(
+static INLINE void loop_filter_neon(
         uint8x8_t dblimit,    // flimit
         uint8x8_t dlimit,     // limit
         uint8x8_t dthresh,    // thresh
@@ -110,19 +110,19 @@
     return;
 }
 
-void vp9_lpf_horizontal_4_neon(
-        unsigned char *src,
+void vpx_lpf_horizontal_4_neon(
+        uint8_t *src,
         int pitch,
-        unsigned char *blimit,
-        unsigned char *limit,
-        unsigned char *thresh,
+        const uint8_t *blimit,
+        const uint8_t *limit,
+        const uint8_t *thresh,
         int count) {
     int i;
     uint8_t *s, *psrc;
     uint8x8_t dblimit, dlimit, dthresh;
     uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
 
-    if (count == 0)  // end_vp9_lf_h_edge
+    if (count == 0)  // end_vpx_lf_h_edge
         return;
 
     dblimit = vld1_u8(blimit);
@@ -149,9 +149,9 @@
         s += pitch;
         d18u8 = vld1_u8(s);
 
-        vp9_loop_filter_neon(dblimit, dlimit, dthresh,
-                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
-                             &d4u8, &d5u8, &d6u8, &d7u8);
+        loop_filter_neon(dblimit, dlimit, dthresh,
+                         d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                         &d4u8, &d5u8, &d6u8, &d7u8);
 
         s -= (pitch * 5);
         vst1_u8(s, d4u8);
@@ -165,12 +165,12 @@
     return;
 }
 
-void vp9_lpf_vertical_4_neon(
-        unsigned char *src,
+void vpx_lpf_vertical_4_neon(
+        uint8_t *src,
         int pitch,
-        unsigned char *blimit,
-        unsigned char *limit,
-        unsigned char *thresh,
+        const uint8_t *blimit,
+        const uint8_t *limit,
+        const uint8_t *thresh,
         int count) {
     int i, pitch8;
     uint8_t *s;
@@ -181,7 +181,7 @@
     uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
     uint8x8x4_t d4Result;
 
-    if (count == 0)  // end_vp9_lf_h_edge
+    if (count == 0)  // end_vpx_lf_h_edge
         return;
 
     dblimit = vld1_u8(blimit);
@@ -244,9 +244,9 @@
         d17u8 = d2tmp11.val[0];
         d18u8 = d2tmp11.val[1];
 
-        vp9_loop_filter_neon(dblimit, dlimit, dthresh,
-                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
-                             &d4u8, &d5u8, &d6u8, &d7u8);
+        loop_filter_neon(dblimit, dlimit, dthresh,
+                         d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                         &d4u8, &d5u8, &d6u8, &d7u8);
 
         d4Result.val[0] = d4u8;
         d4Result.val[1] = d5u8;

diff --git a/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm b/vpx_dsp/arm/loopfilter_8_neon.asm
similarity index 94%
rename from vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm
rename to vpx_dsp/arm/loopfilter_8_neon.asm
index 91aaec0..e81734c 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm
+++ b/vpx_dsp/arm/loopfilter_8_neon.asm

@@ -8,18 +8,18 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_lpf_horizontal_8_neon|
-    EXPORT  |vp9_lpf_vertical_8_neon|
+    EXPORT  |vpx_lpf_horizontal_8_neon|
+    EXPORT  |vpx_lpf_vertical_8_neon|
     ARM
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
 ; works on 16 iterations at a time.
 ; TODO(fgalligan): See about removing the count code as this function is only
 ; called with a count of 1.
 ;
-; void vp9_lpf_horizontal_8_neon(uint8_t *s, int p,
+; void vpx_lpf_horizontal_8_neon(uint8_t *s, int p,
 ;                                const uint8_t *blimit,
 ;                                const uint8_t *limit,
 ;                                const uint8_t *thresh,
@@ -30,7 +30,7 @@
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
 ; sp+4  int count
-|vp9_lpf_horizontal_8_neon| PROC
+|vpx_lpf_horizontal_8_neon| PROC
     push        {r4-r5, lr}
 
     vld1.8      {d0[]}, [r2]               ; duplicate *blimit
@@ -39,7 +39,7 @@
     add         r1, r1, r1                 ; double pitch
 
     cmp         r12, #0
-    beq         end_vp9_mblf_h_edge
+    beq         end_vpx_mblf_h_edge
 
     vld1.8      {d1[]}, [r3]               ; duplicate *limit
     vld1.8      {d2[]}, [r2]               ; duplicate *thresh
@@ -60,7 +60,7 @@
     sub         r3, r3, r1, lsl #1
     sub         r2, r2, r1, lsl #2
 
-    bl          vp9_mbloop_filter_neon
+    bl          vpx_mbloop_filter_neon
 
     vst1.u8     {d0}, [r2@64], r1          ; store op2
     vst1.u8     {d1}, [r3@64], r1          ; store op1
@@ -73,12 +73,12 @@
     subs        r12, r12, #1
     bne         count_mblf_h_loop
 
-end_vp9_mblf_h_edge
+end_vpx_mblf_h_edge
     pop         {r4-r5, pc}
 
-    ENDP        ; |vp9_lpf_horizontal_8_neon|
+    ENDP        ; |vpx_lpf_horizontal_8_neon|
 
-; void vp9_lpf_vertical_8_neon(uint8_t *s,
+; void vpx_lpf_vertical_8_neon(uint8_t *s,
 ;                              int pitch,
 ;                              const uint8_t *blimit,
 ;                              const uint8_t *limit,
@@ -91,7 +91,7 @@
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
 ; sp+4  int count
-|vp9_lpf_vertical_8_neon| PROC
+|vpx_lpf_vertical_8_neon| PROC
     push        {r4-r5, lr}
 
     vld1.8      {d0[]}, [r2]              ; duplicate *blimit
@@ -101,7 +101,7 @@
     ldr         r3, [sp, #12]             ; load thresh
     sub         r2, r0, #4                ; move s pointer down by 4 columns
     cmp         r12, #0
-    beq         end_vp9_mblf_v_edge
+    beq         end_vpx_mblf_v_edge
 
     vld1.8      {d2[]}, [r3]              ; duplicate *thresh
 
@@ -134,7 +134,7 @@
     sub         r2, r0, #3
     add         r3, r0, #1
 
-    bl          vp9_mbloop_filter_neon
+    bl          vpx_mbloop_filter_neon
 
     ;store op2, op1, op0, oq0
     vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
@@ -161,11 +161,11 @@
     subne       r2, r0, #4                 ; move s pointer down by 4 columns
     bne         count_mblf_v_loop
 
-end_vp9_mblf_v_edge
+end_vpx_mblf_v_edge
     pop         {r4-r5, pc}
-    ENDP        ; |vp9_lpf_vertical_8_neon|
+    ENDP        ; |vpx_lpf_vertical_8_neon|
 
-; void vp9_mbloop_filter_neon();
+; void vpx_mbloop_filter_neon();
 ; This is a helper function for the loopfilters. The invidual functions do the
 ; necessary load, transpose (if necessary) and store. The function does not use
 ; registers d8-d15.
@@ -191,7 +191,7 @@
 ; d3    oq0
 ; d4    oq1
 ; d5    oq2
-|vp9_mbloop_filter_neon| PROC
+|vpx_mbloop_filter_neon| PROC
     ; filter_mask
     vabd.u8     d19, d3, d4                ; m1 = abs(p3 - p2)
     vabd.u8     d20, d4, d5                ; m2 = abs(p2 - p1)
@@ -446,6 +446,6 @@
 
     bx          lr
 
-    ENDP        ; |vp9_mbloop_filter_neon|
+    ENDP        ; |vpx_mbloop_filter_neon|
 
     END

diff --git a/vp9/common/arm/neon/vp9_loopfilter_8_neon.c b/vpx_dsp/arm/loopfilter_8_neon.c
similarity index 93%
rename from vp9/common/arm/neon/vp9_loopfilter_8_neon.c
rename to vpx_dsp/arm/loopfilter_8_neon.c
index 33068a8..a887e2e 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_8_neon.c
+++ b/vpx_dsp/arm/loopfilter_8_neon.c

@@ -10,9 +10,9 @@
 
 #include <arm_neon.h>
 
-#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
-static INLINE void vp9_mbloop_filter_neon(
+static INLINE void mbloop_filter_neon(
         uint8x8_t dblimit,   // mblimit
         uint8x8_t dlimit,    // limit
         uint8x8_t dthresh,   // thresh
@@ -263,12 +263,12 @@
     return;
 }
 
-void vp9_lpf_horizontal_8_neon(
-        unsigned char *src,
+void vpx_lpf_horizontal_8_neon(
+        uint8_t *src,
         int pitch,
-        unsigned char *blimit,
-        unsigned char *limit,
-        unsigned char *thresh,
+        const uint8_t *blimit,
+        const uint8_t *limit,
+        const uint8_t *thresh,
         int count) {
     int i;
     uint8_t *s, *psrc;
@@ -276,7 +276,7 @@
     uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
     uint8x8_t d16u8, d17u8, d18u8;
 
-    if (count == 0)  // end_vp9_mblf_h_edge
+    if (count == 0)  // end_vpx_mblf_h_edge
         return;
 
     dblimit = vld1_u8(blimit);
@@ -303,9 +303,9 @@
         s += pitch;
         d18u8 = vld1_u8(s);
 
-        vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
-                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
-                             &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+        mbloop_filter_neon(dblimit, dlimit, dthresh,
+                           d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                           &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
 
         s -= (pitch * 6);
         vst1_u8(s, d0u8);
@@ -323,12 +323,12 @@
     return;
 }
 
-void vp9_lpf_vertical_8_neon(
-        unsigned char *src,
+void vpx_lpf_vertical_8_neon(
+        uint8_t *src,
         int pitch,
-        unsigned char *blimit,
-        unsigned char *limit,
-        unsigned char *thresh,
+        const uint8_t *blimit,
+        const uint8_t *limit,
+        const uint8_t *thresh,
         int count) {
     int i;
     uint8_t *s;
@@ -403,9 +403,9 @@
         d17u8 = d2tmp11.val[0];
         d18u8 = d2tmp11.val[1];
 
-        vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
-                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
-                             &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+        mbloop_filter_neon(dblimit, dlimit, dthresh,
+                           d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                           &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
 
         d4Result.val[0] = d0u8;
         d4Result.val[1] = d1u8;

diff --git a/vp9/common/arm/neon/vp9_mb_lpf_neon.asm b/vpx_dsp/arm/loopfilter_mb_neon.asm
similarity index 96%
rename from vp9/common/arm/neon/vp9_mb_lpf_neon.asm
rename to vpx_dsp/arm/loopfilter_mb_neon.asm
index 5fe2bba..20d9cfb 100644
--- a/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
+++ b/vpx_dsp/arm/loopfilter_mb_neon.asm

@@ -8,13 +8,13 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_lpf_horizontal_16_neon|
-    EXPORT  |vp9_lpf_vertical_16_neon|
+    EXPORT  |vpx_lpf_horizontal_16_neon|
+    EXPORT  |vpx_lpf_vertical_16_neon|
     ARM
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; void vp9_lpf_horizontal_16_neon(uint8_t *s, int p,
+; void vpx_lpf_horizontal_16_neon(uint8_t *s, int p,
 ;                                 const uint8_t *blimit,
 ;                                 const uint8_t *limit,
 ;                                 const uint8_t *thresh
@@ -24,7 +24,7 @@
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-|vp9_lpf_horizontal_16_neon| PROC
+|vpx_lpf_horizontal_16_neon| PROC
     push        {r4-r8, lr}
     vpush       {d8-d15}
     ldr         r4, [sp, #88]              ; load thresh
@@ -54,7 +54,7 @@
     vld1.u8     {d14}, [r8@64], r1         ; q6
     vld1.u8     {d15}, [r8@64], r1         ; q7
 
-    bl          vp9_wide_mbfilter_neon
+    bl          vpx_wide_mbfilter_neon
 
     tst         r7, #1
     beq         h_mbfilter
@@ -115,9 +115,9 @@
     vpop        {d8-d15}
     pop         {r4-r8, pc}
 
-    ENDP        ; |vp9_lpf_horizontal_16_neon|
+    ENDP        ; |vpx_lpf_horizontal_16_neon|
 
-; void vp9_lpf_vertical_16_neon(uint8_t *s, int p,
+; void vpx_lpf_vertical_16_neon(uint8_t *s, int p,
 ;                               const uint8_t *blimit,
 ;                               const uint8_t *limit,
 ;                               const uint8_t *thresh)
@@ -126,7 +126,7 @@
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-|vp9_lpf_vertical_16_neon| PROC
+|vpx_lpf_vertical_16_neon| PROC
     push        {r4-r8, lr}
     vpush       {d8-d15}
     ldr         r4, [sp, #88]              ; load thresh
@@ -176,7 +176,7 @@
     vtrn.8      d12, d13
     vtrn.8      d14, d15
 
-    bl          vp9_wide_mbfilter_neon
+    bl          vpx_wide_mbfilter_neon
 
     tst         r7, #1
     beq         v_mbfilter
@@ -279,9 +279,9 @@
     vpop        {d8-d15}
     pop         {r4-r8, pc}
 
-    ENDP        ; |vp9_lpf_vertical_16_neon|
+    ENDP        ; |vpx_lpf_vertical_16_neon|
 
-; void vp9_wide_mbfilter_neon();
+; void vpx_wide_mbfilter_neon();
 ; This is a helper function for the loopfilters. The invidual functions do the
 ; necessary load, transpose (if necessary) and store.
 ;
@@ -305,7 +305,7 @@
 ; d13   q5
 ; d14   q6
 ; d15   q7
-|vp9_wide_mbfilter_neon| PROC
+|vpx_wide_mbfilter_neon| PROC
     mov         r7, #0
 
     ; filter_mask
@@ -601,6 +601,6 @@
     vbif        d3, d14, d17               ; oq6 |= q6 & ~(f2 & f & m)
 
     bx          lr
-    ENDP        ; |vp9_wide_mbfilter_neon|
+    ENDP        ; |vpx_wide_mbfilter_neon|
 
     END

diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c
similarity index 70%
rename from vp9/common/arm/neon/vp9_loopfilter_neon.c
rename to vpx_dsp/arm/loopfilter_neon.c
index 31fcc63..eff87d2 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_neon.c
+++ b/vpx_dsp/arm/loopfilter_neon.c

@@ -10,49 +10,49 @@
 
 #include <arm_neon.h>
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
-void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
+void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p,
                                   const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
-  vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
 }
 
 #if HAVE_NEON_ASM
-void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
+void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
                                     const uint8_t *blimit1,
                                     const uint8_t *limit1,
                                     const uint8_t *thresh1) {
-  vp9_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p,
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p,
                                   const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
-  vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
+void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p,
                                    const uint8_t *blimit,
                                    const uint8_t *limit,
                                    const uint8_t *thresh) {
-  vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
-  vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
 }
 #endif  // HAVE_NEON_ASM

diff --git a/vp9/decoder/vp9_reader.c b/vpx_dsp/bitreader.c
similarity index 97%
rename from vp9/decoder/vp9_reader.c
rename to vpx_dsp/bitreader.c
index 2c96f74..bc7ab3a 100644
--- a/vp9/decoder/vp9_reader.c
+++ b/vpx_dsp/bitreader.c

@@ -7,12 +7,12 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include "./bitreader.h"
+#include "./prob.h"
 
 #include "vpx_ports/mem.h"
 #include "vpx_mem/vpx_mem.h"
 
-#include "vp9/decoder/vp9_reader.h"
-
 int vp9_reader_init(vp9_reader *r,
                     const uint8_t *buffer,
                     size_t size,

diff --git a/vp9/decoder/vp9_reader.h b/vpx_dsp/bitreader.h
similarity index 95%
rename from vp9/decoder/vp9_reader.h
rename to vpx_dsp/bitreader.h
index 4959985..cdd1306 100644
--- a/vp9/decoder/vp9_reader.h
+++ b/vpx_dsp/bitreader.h

@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_DECODER_VP9_READER_H_
-#define VP9_DECODER_VP9_READER_H_
+#ifndef VPX_DSP_BITREADER_H_
+#define VPX_DSP_BITREADER_H_
 
 #include <stddef.h>
 #include <limits.h>
@@ -18,8 +18,7 @@
 #include "vpx_ports/mem.h"
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_integer.h"
-
-#include "vp9/common/vp9_prob.h"
+#include "vpx_dsp/prob.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -138,4 +137,4 @@
 }  // extern "C"
 #endif
 
-#endif  // VP9_DECODER_VP9_READER_H_
+#endif  // VPX_DSP_BITREADER_H_

diff --git a/vp9/decoder/vp9_read_bit_buffer.c b/vpx_dsp/bitreader_buffer.c
similarity index 95%
rename from vp9/decoder/vp9_read_bit_buffer.c
rename to vpx_dsp/bitreader_buffer.c
index c3b38a9..b49a2e2 100644
--- a/vp9/decoder/vp9_read_bit_buffer.c
+++ b/vpx_dsp/bitreader_buffer.c

@@ -7,7 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "vp9/decoder/vp9_read_bit_buffer.h"
+#include "./bitreader_buffer.h"
 
 size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) {
   return (rb->bit_offset + 7) >> 3;

diff --git a/vp9/decoder/vp9_read_bit_buffer.h b/vpx_dsp/bitreader_buffer.h
similarity index 88%
rename from vp9/decoder/vp9_read_bit_buffer.h
rename to vpx_dsp/bitreader_buffer.h
index fc88bd7..5289f3f 100644
--- a/vp9/decoder/vp9_read_bit_buffer.h
+++ b/vpx_dsp/bitreader_buffer.h

@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_DECODER_VP9_READ_BIT_BUFFER_H_
-#define VP9_DECODER_VP9_READ_BIT_BUFFER_H_
+#ifndef VPX_DSP_BITREADER_BUFFER_H_
+#define VPX_DSP_BITREADER_BUFFER_H_
 
 #include <limits.h>
 
@@ -42,4 +42,4 @@
 }  // extern "C"
 #endif
 
-#endif  // VP9_DECODER_VP9_READ_BIT_BUFFER_H_
+#endif  // VPX_DSP_BITREADER_BUFFER_H_

diff --git a/vp9/encoder/vp9_writer.c b/vpx_dsp/bitwriter.c
similarity index 91%
rename from vp9/encoder/vp9_writer.c
rename to vpx_dsp/bitwriter.c
index ff461f2..1d34935 100644
--- a/vp9/encoder/vp9_writer.c
+++ b/vpx_dsp/bitwriter.c

@@ -9,8 +9,8 @@
  */
 
 #include <assert.h>
-#include "vp9/encoder/vp9_writer.h"
-#include "vp9/common/vp9_entropy.h"
+
+#include "./bitwriter.h"
 
 void vp9_start_encode(vp9_writer *br, uint8_t *source) {
   br->lowvalue = 0;

diff --git a/vp9/encoder/vp9_writer.h b/vpx_dsp/bitwriter.h
similarity index 93%
rename from vp9/encoder/vp9_writer.h
rename to vpx_dsp/bitwriter.h
index e347ea4..08acfa4 100644
--- a/vp9/encoder/vp9_writer.h
+++ b/vpx_dsp/bitwriter.h

@@ -8,12 +8,12 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_WRITER_H_
-#define VP9_ENCODER_VP9_WRITER_H_
+#ifndef VPX_DSP_BITWRITER_H_
+#define VPX_DSP_BITWRITER_H_
 
 #include "vpx_ports/mem.h"
 
-#include "vp9/common/vp9_prob.h"
+#include "vpx_dsp/prob.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -95,4 +95,4 @@
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_WRITER_H_
+#endif  // VPX_DSP_BITWRITER_H_

diff --git a/vp9/encoder/vp9_write_bit_buffer.c b/vpx_dsp/bitwriter_buffer.c
similarity index 95%
rename from vp9/encoder/vp9_write_bit_buffer.c
rename to vpx_dsp/bitwriter_buffer.c
index 6d55e84..3f7733d 100644
--- a/vp9/encoder/vp9_write_bit_buffer.c
+++ b/vpx_dsp/bitwriter_buffer.c

@@ -9,7 +9,8 @@
  */
 
 #include <limits.h>
-#include "vp9/encoder/vp9_write_bit_buffer.h"
+
+#include "./bitwriter_buffer.h"
 
 size_t vp9_wb_bytes_written(const struct vp9_write_bit_buffer *wb) {
   return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);

diff --git a/vp9/encoder/vp9_write_bit_buffer.h b/vpx_dsp/bitwriter_buffer.h
similarity index 85%
rename from vp9/encoder/vp9_write_bit_buffer.h
rename to vpx_dsp/bitwriter_buffer.h
index 59f9bbe..faa46c4 100644
--- a/vp9/encoder/vp9_write_bit_buffer.h
+++ b/vpx_dsp/bitwriter_buffer.h

@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_
-#define VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_
+#ifndef VPX_DSP_BITWRITER_BUFFER_H_
+#define VPX_DSP_BITWRITER_BUFFER_H_
 
 #include "vpx/vpx_integer.h"
 
@@ -33,4 +33,4 @@
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_
+#endif  // VPX_DSP_BITWRITER_BUFFER_H_

diff --git a/vp9/common/vp9_loopfilter_filters.c b/vpx_dsp/loopfilter.c
similarity index 91%
rename from vp9/common/vp9_loopfilter_filters.c
rename to vpx_dsp/loopfilter.c
index 4876f9e..dc8aca5 100644
--- a/vp9/common/vp9_loopfilter_filters.c
+++ b/vpx_dsp/loopfilter.c

@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -115,7 +115,7 @@
   *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
 }
 
-void vp9_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
+void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
                             const uint8_t *blimit, const uint8_t *limit,
                             const uint8_t *thresh, int count) {
   int i;
@@ -132,15 +132,15 @@
   }
 }
 
-void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
-  vp9_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh,
                           int count) {
   int i;
@@ -157,12 +157,12 @@
   }
 }
 
-void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                const uint8_t *limit0, const uint8_t *thresh0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1) {
-  vp9_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
+  vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
                                   thresh1, 1);
 }
 
@@ -187,7 +187,7 @@
   }
 }
 
-void vp9_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh,
                             int count) {
   int i;
@@ -207,15 +207,15 @@
   }
 }
 
-void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
-  vp9_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh,
                           int count) {
   int i;
@@ -232,12 +232,12 @@
   }
 }
 
-void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                const uint8_t *limit0, const uint8_t *thresh0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1) {
-  vp9_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
+  vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
                                     thresh1, 1);
 }
 
@@ -292,7 +292,7 @@
   }
 }
 
-void vp9_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh,
                              int count) {
   int i;
@@ -341,12 +341,12 @@
   }
 }
 
-void vp9_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
   mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
 }
 
-void vp9_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh) {
   mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
 }
@@ -446,7 +446,7 @@
   *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
 }
 
-void vp9_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh, int count, int bd) {
   int i;
@@ -469,7 +469,7 @@
   }
 }
 
-void vp9_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
                                         const uint8_t *blimit0,
                                         const uint8_t *limit0,
                                         const uint8_t *thresh0,
@@ -477,11 +477,11 @@
                                         const uint8_t *limit1,
                                         const uint8_t *thresh1,
                                         int bd) {
-  vp9_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1, bd);
-  vp9_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+  vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
 }
 
-void vp9_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int count, int bd) {
   int i;
@@ -498,7 +498,7 @@
   }
 }
 
-void vp9_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch,
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch,
                                       const uint8_t *blimit0,
                                       const uint8_t *limit0,
                                       const uint8_t *thresh0,
@@ -506,8 +506,8 @@
                                       const uint8_t *limit1,
                                       const uint8_t *thresh1,
                                       int bd) {
-  vp9_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
-  vp9_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
+  vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
                               thresh1, 1, bd);
 }
 
@@ -532,7 +532,7 @@
   }
 }
 
-void vp9_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int count, int bd) {
   int i;
@@ -554,7 +554,7 @@
   }
 }
 
-void vp9_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
                                         const uint8_t *blimit0,
                                         const uint8_t *limit0,
                                         const uint8_t *thresh0,
@@ -562,11 +562,11 @@
                                         const uint8_t *limit1,
                                         const uint8_t *thresh1,
                                         int bd) {
-  vp9_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1, bd);
-  vp9_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+  vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
 }
 
-void vp9_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int count, int bd) {
   int i;
@@ -586,7 +586,7 @@
   }
 }
 
-void vp9_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch,
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch,
                                       const uint8_t *blimit0,
                                       const uint8_t *limit0,
                                       const uint8_t *thresh0,
@@ -594,8 +594,8 @@
                                       const uint8_t *limit1,
                                       const uint8_t *thresh1,
                                       int bd) {
-  vp9_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
-  vp9_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
+  vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
                               thresh1, 1, bd);
 }
 
@@ -662,7 +662,7 @@
   }
 }
 
-void vp9_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit, const uint8_t *thresh,
                                     int count, int bd) {
   int i;
@@ -727,13 +727,13 @@
   }
 }
 
-void vp9_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int bd) {
   highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
 }
 
-void vp9_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
                                        const uint8_t *blimit,
                                        const uint8_t *limit,
                                        const uint8_t *thresh,

diff --git a/vpx_dsp/mips/common_dspr2.h b/vpx_dsp/mips/common_dspr2.h
new file mode 100644
index 0000000..8278101
--- /dev/null
+++ b/vpx_dsp/mips/common_dspr2.h

@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_COMMON_MIPS_DSPR2_H_
+#define VPX_COMMON_MIPS_DSPR2_H_
+
+#include <assert.h>
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if HAVE_DSPR2
+#define CROP_WIDTH 512
+
+static INLINE void prefetch_load(const unsigned char *src) {
+  __asm__ __volatile__ (
+      "pref   0,  0(%[src])   \n\t"
+      :
+      : [src] "r" (src)
+  );
+}
+
+/* prefetch data for store */
+static INLINE void prefetch_store(unsigned char *dst) {
+  __asm__ __volatile__ (
+      "pref   1,  0(%[dst])   \n\t"
+      :
+      : [dst] "r" (dst)
+  );
+}
+
+static INLINE void prefetch_load_streamed(const unsigned char *src) {
+  __asm__ __volatile__ (
+      "pref   4,  0(%[src])   \n\t"
+      :
+      : [src] "r" (src)
+  );
+}
+
+/* prefetch data for store */
+static INLINE void prefetch_store_streamed(unsigned char *dst) {
+  __asm__ __volatile__ (
+      "pref   5,  0(%[dst])   \n\t"
+      :
+      : [dst] "r" (dst)
+  );
+}
+#endif  // #if HAVE_DSPR2
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_COMMON_MIPS_DSPR2_H_

diff --git a/vp9/common/mips/msa/vp9_loopfilter_16_msa.c b/vpx_dsp/mips/loopfilter_16_msa.c
similarity index 95%
rename from vp9/common/mips/msa/vp9_loopfilter_16_msa.c
rename to vpx_dsp/mips/loopfilter_16_msa.c
index aeaa48e..b7c9f7b 100644
--- a/vp9/common/mips/msa/vp9_loopfilter_16_msa.c
+++ b/vpx_dsp/mips/loopfilter_16_msa.c

@@ -9,9 +9,9 @@
  */
 
 #include "vpx_ports/mem.h"
-#include "vp9/common/mips/msa/vp9_loopfilter_msa.h"
+#include "vpx_dsp/mips/loopfilter_msa.h"
 
-int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
+int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
                                  uint8_t *filter48,
                                  const uint8_t *b_limit_ptr,
                                  const uint8_t *limit_ptr,
@@ -79,7 +79,7 @@
   }
 }
 
-void vp9_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
+void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
   v16u8 flat, flat2, filter8;
   v16i8 zero = { 0 };
   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
@@ -405,7 +405,7 @@
   }
 }
 
-void vp9_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
                                     const uint8_t *b_limit_ptr,
                                     const uint8_t *limit_ptr,
                                     const uint8_t *thresh_ptr,
@@ -415,15 +415,15 @@
 
   (void)count;
 
-  early_exit = vp9_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
+  early_exit = vpx_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
                                         limit_ptr, thresh_ptr);
 
   if (0 == early_exit) {
-    vp9_hz_lpf_t16_16w(src, pitch, filter48);
+    vpx_hz_lpf_t16_16w(src, pitch, filter48);
   }
 }
 
-void vp9_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
                                const uint8_t *b_limit_ptr,
                                const uint8_t *limit_ptr,
                                const uint8_t *thresh_ptr,
@@ -643,13 +643,13 @@
       }
     }
   } else {
-    vp9_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr,
+    vpx_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr,
                                    thresh_ptr, count);
   }
 }
 
-static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
-                                       uint8_t *output, int32_t out_pitch) {
+static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
+                                   uint8_t *output, int32_t out_pitch) {
   v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
   v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
@@ -673,8 +673,8 @@
   ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
 }
 
-static void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
-                                       uint8_t *output, int32_t out_pitch) {
+static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
+                                   uint8_t *output, int32_t out_pitch) {
   v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
 
@@ -685,8 +685,8 @@
   ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
 }
 
-static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch,
-                                uint8_t *output, int32_t out_pitch) {
+static void transpose_16x16(uint8_t *input, int32_t in_pitch,
+                            uint8_t *output, int32_t out_pitch) {
   v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
   v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
@@ -744,7 +744,7 @@
   ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
 }
 
-int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
+int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
                                 uint8_t *src_org, int32_t pitch_org,
                                 const uint8_t *b_limit_ptr,
                                 const uint8_t *limit_ptr,
@@ -812,7 +812,7 @@
   }
 }
 
-int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                           uint8_t *filter48) {
   v16i8 zero = { 0 };
   v16u8 filter8, flat, flat2;
@@ -1032,7 +1032,7 @@
   }
 }
 
-void vp9_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr) {
@@ -1040,23 +1040,23 @@
   DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
   uint8_t *filter48 = &transposed_input[16 * 16];
 
-  vp9_transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
+  transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
 
-  early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
+  early_exit = vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
                                        &filter48[0], src, pitch, b_limit_ptr,
                                        limit_ptr, thresh_ptr);
 
   if (0 == early_exit) {
-    early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
+    early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
                                    &filter48[0]);
 
     if (0 == early_exit) {
-      vp9_transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
+      transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
     }
   }
 }
 
-int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
+int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
                                  uint8_t *src_org, int32_t pitch,
                                  const uint8_t *b_limit_ptr,
                                  const uint8_t *limit_ptr,
@@ -1134,7 +1134,7 @@
   }
 }
 
-int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                            uint8_t *filter48) {
   v16u8 flat, flat2, filter8;
   v16i8 zero = { 0 };
@@ -1455,7 +1455,7 @@
   }
 }
 
-void vp9_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit_ptr,
                                   const uint8_t *limit_ptr,
                                   const uint8_t *thresh_ptr) {
@@ -1463,18 +1463,18 @@
   DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
   uint8_t *filter48 = &transposed_input[16 * 16];
 
-  vp9_transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
+  transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
 
-  early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
+  early_exit = vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
                                         &filter48[0], src, pitch, b_limit_ptr,
                                         limit_ptr, thresh_ptr);
 
   if (0 == early_exit) {
-    early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
+    early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
                                     &filter48[0]);
 
     if (0 == early_exit) {
-      vp9_transpose_16x16(transposed_input, 16, (src - 8), pitch);
+      transpose_16x16(transposed_input, 16, (src - 8), pitch);
     }
   }
 }

diff --git a/vp9/common/mips/msa/vp9_loopfilter_4_msa.c b/vpx_dsp/mips/loopfilter_4_msa.c
similarity index 94%
rename from vp9/common/mips/msa/vp9_loopfilter_4_msa.c
rename to vpx_dsp/mips/loopfilter_4_msa.c
index 7f69135..daf5f38 100644
--- a/vp9/common/mips/msa/vp9_loopfilter_4_msa.c
+++ b/vpx_dsp/mips/loopfilter_4_msa.c

@@ -8,9 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp9/common/mips/msa/vp9_loopfilter_msa.h"
+#include "vpx_dsp/mips/loopfilter_msa.h"
 
-void vp9_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
                               const uint8_t *b_limit_ptr,
                               const uint8_t *limit_ptr,
                               const uint8_t *thresh_ptr,
@@ -39,7 +39,7 @@
   SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
 }
 
-void vp9_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
                                    const uint8_t *b_limit0_ptr,
                                    const uint8_t *limit0_ptr,
                                    const uint8_t *thresh0_ptr,
@@ -71,7 +71,7 @@
   ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
 }
 
-void vp9_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
                             const uint8_t *b_limit_ptr,
                             const uint8_t *limit_ptr,
                             const uint8_t *thresh_ptr,
@@ -102,7 +102,7 @@
   ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
 }
 
-void vp9_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
                                  const uint8_t *b_limit0_ptr,
                                  const uint8_t *limit0_ptr,
                                  const uint8_t *thresh0_ptr,

diff --git a/vp9/common/mips/msa/vp9_loopfilter_8_msa.c b/vpx_dsp/mips/loopfilter_8_msa.c
similarity index 97%
rename from vp9/common/mips/msa/vp9_loopfilter_8_msa.c
rename to vpx_dsp/mips/loopfilter_8_msa.c
index 26a858d..00b6db5 100644
--- a/vp9/common/mips/msa/vp9_loopfilter_8_msa.c
+++ b/vpx_dsp/mips/loopfilter_8_msa.c

@@ -8,9 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp9/common/mips/msa/vp9_loopfilter_msa.h"
+#include "vpx_dsp/mips/loopfilter_msa.h"
 
-void vp9_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
                               const uint8_t *b_limit_ptr,
                               const uint8_t *limit_ptr,
                               const uint8_t *thresh_ptr,
@@ -83,7 +83,7 @@
   }
 }
 
-void vp9_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
                                    const uint8_t *b_limit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
@@ -158,7 +158,7 @@
   }
 }
 
-void vp9_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
                             const uint8_t *b_limit_ptr,
                             const uint8_t *limit_ptr,
                             const uint8_t *thresh_ptr,
@@ -237,7 +237,7 @@
   }
 }
 
-void vp9_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
                                  const uint8_t *b_limit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0,

diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c b/vpx_dsp/mips/loopfilter_filters_dspr2.c
similarity index 85%
rename from vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
rename to vpx_dsp/mips/loopfilter_filters_dspr2.c
index 3df7f4c..99a96d8 100644
--- a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_filters_dspr2.c

@@ -10,17 +10,16 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
 
 #if HAVE_DSPR2
-void vp9_lpf_horizontal_4_dspr2(unsigned char *s,
+void vpx_lpf_horizontal_4_dspr2(unsigned char *s,
                                 int pitch,
                                 const uint8_t *blimit,
                                 const uint8_t *limit,
@@ -50,7 +49,7 @@
   );
 
   /* prefetch data for store */
-  vp9_prefetch_store(s);
+  prefetch_store(s);
 
   /* loop filter designed to work using chars so that we can make maximum use
      of 8 bit simd instructions. */
@@ -88,14 +87,14 @@
           : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6)
       );
 
-      vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
-                                pm1, p0, p3, p4, p5, p6,
-                                thresh_vec, &hev, &mask);
+      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
+                            pm1, p0, p3, p4, p5, p6,
+                            thresh_vec, &hev, &mask);
 
       /* if mask == 0 do filtering is not needed */
       if (mask) {
         /* filtering */
-        vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
 
         __asm__ __volatile__ (
             "sw     %[p1],  (%[s1])    \n\t"
@@ -114,7 +113,7 @@
   }
 }
 
-void vp9_lpf_vertical_4_dspr2(unsigned char *s,
+void vpx_lpf_vertical_4_dspr2(unsigned char *s,
                               int pitch,
                               const uint8_t *blimit,
                               const uint8_t *limit,
@@ -144,7 +143,7 @@
   );
 
   /* prefetch data for store */
-  vp9_prefetch_store(s + pitch);
+  prefetch_store(s + pitch);
 
   for (i = 0; i < 2; i++) {
     s1 = s;
@@ -217,14 +216,14 @@
      * mask will be zero and filtering is not needed
      */
     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
-      vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
-                                p0, p3, p4, p5, p6, thresh_vec,
-                                &hev, &mask);
+      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
+                            p0, p3, p4, p5, p6, thresh_vec,
+                            &hev, &mask);
 
       /* if mask == 0 do filtering is not needed */
       if (mask) {
         /* filtering */
-        vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
 
         /* unpack processed 4x4 neighborhood
          * don't use transpose on output data
@@ -307,56 +306,56 @@
   }
 }
 
-void vp9_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */,
+void vpx_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */,
                                      const uint8_t *blimit0,
                                      const uint8_t *limit0,
                                      const uint8_t *thresh0,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1) {
-  vp9_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
+void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
                                      const uint8_t *blimit0,
                                      const uint8_t *limit0,
                                      const uint8_t *thresh0,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1) {
-  vp9_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
+void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
                                    const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
-  vp9_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
+void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
                                    const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
-  vp9_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
+  vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
                                        1);
 }
 
-void vp9_lpf_vertical_16_dual_dspr2(uint8_t *s, int p,
+void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p,
                                     const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh) {
-  vp9_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
-  vp9_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
 }
 #endif  // #if HAVE_DSPR2

diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h b/vpx_dsp/mips/loopfilter_filters_dspr2.h
similarity index 85%
rename from vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
rename to vpx_dsp/mips/loopfilter_filters_dspr2.h
index 675db65..4a1506b 100644
--- a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
+++ b/vpx_dsp/mips/loopfilter_filters_dspr2.h

@@ -13,10 +13,10 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_onyxc_int.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -24,10 +24,10 @@
 
 #if HAVE_DSPR2
 /* inputs & outputs are quad-byte vectors */
-static INLINE void vp9_filter_dspr2(uint32_t mask, uint32_t hev,
-                                    uint32_t *ps1, uint32_t *ps0,
-                                    uint32_t *qs0, uint32_t *qs1) {
-  int32_t   vp9_filter_l, vp9_filter_r;
+static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
+                                uint32_t *ps1, uint32_t *ps0,
+                                uint32_t *qs0, uint32_t *qs1) {
+  int32_t   vpx_filter_l, vpx_filter_r;
   int32_t   Filter1_l, Filter1_r, Filter2_l, Filter2_r;
   int32_t   subr_r, subr_l;
   uint32_t  t1, t2, HWM, t3;
@@ -73,34 +73,34 @@
   hev_r = hev_r & HWM;
 
   __asm__ __volatile__ (
-      /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */
-      "subq_s.ph    %[vp9_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
-      "subq_s.ph    %[vp9_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
+      /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
+      "subq_s.ph    %[vpx_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
+      "subq_s.ph    %[vpx_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
 
       /* qs0 - ps0 */
       "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
       "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
 
-      /* vp9_filter &= hev; */
-      "and          %[vp9_filter_l], %[vp9_filter_l], %[hev_l]        \n\t"
-      "and          %[vp9_filter_r], %[vp9_filter_r], %[hev_r]        \n\t"
+      /* vpx_filter &= hev; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[hev_l]        \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[hev_r]        \n\t"
 
-      /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0)); */
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
       "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
       "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
 
-      /* vp9_filter &= mask; */
-      "and          %[vp9_filter_l], %[vp9_filter_l], %[mask_l]       \n\t"
-      "and          %[vp9_filter_r], %[vp9_filter_r], %[mask_r]       \n\t"
+      /* vpx_filter &= mask; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[mask_l]       \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[mask_r]       \n\t"
 
-      : [vp9_filter_l] "=&r" (vp9_filter_l),
-        [vp9_filter_r] "=&r" (vp9_filter_r),
+      : [vpx_filter_l] "=&r" (vpx_filter_l),
+        [vpx_filter_r] "=&r" (vpx_filter_r),
         [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
         [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
       : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
@@ -113,13 +113,13 @@
 
   /* save bottom 3 bits so that we round one side +4 and the other +3 */
   __asm__ __volatile__ (
-      /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >>= 3; */
-      "addq_s.ph    %[Filter1_l],    %[vp9_filter_l], %[t2]           \n\t"
-      "addq_s.ph    %[Filter1_r],    %[vp9_filter_r], %[t2]           \n\t"
+      /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */
+      "addq_s.ph    %[Filter1_l],    %[vpx_filter_l], %[t2]           \n\t"
+      "addq_s.ph    %[Filter1_r],    %[vpx_filter_r], %[t2]           \n\t"
 
-      /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >>= 3; */
-      "addq_s.ph    %[Filter2_l],    %[vp9_filter_l], %[t1]           \n\t"
-      "addq_s.ph    %[Filter2_r],    %[vp9_filter_r], %[t1]           \n\t"
+      /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >>= 3; */
+      "addq_s.ph    %[Filter2_l],    %[vpx_filter_l], %[t1]           \n\t"
+      "addq_s.ph    %[Filter2_r],    %[vpx_filter_r], %[t1]           \n\t"
       "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
       "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
 
@@ -142,23 +142,23 @@
         [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
         [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
       : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
-        [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r)
+        [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r)
   );
 
   __asm__ __volatile__ (
-      /* (vp9_filter += 1) >>= 1 */
+      /* (vpx_filter += 1) >>= 1 */
       "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
       "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
 
-      /* vp9_filter &= ~hev; */
+      /* vpx_filter &= ~hev; */
       "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
       "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
 
-      /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */
+      /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */
       "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
       "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
 
-      /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */
+      /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */
       "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
       "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
 
@@ -196,12 +196,12 @@
   *qs1 = vqs1 ^ N128;
 }
 
-static INLINE void vp9_filter1_dspr2(uint32_t mask, uint32_t hev,
-                                     uint32_t ps1, uint32_t ps0,
-                                     uint32_t qs0, uint32_t qs1,
-                                     uint32_t *p1_f0, uint32_t *p0_f0,
-                                     uint32_t *q0_f0, uint32_t *q1_f0) {
-  int32_t   vp9_filter_l, vp9_filter_r;
+static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
+                                 uint32_t ps1, uint32_t ps0,
+                                 uint32_t qs0, uint32_t qs1,
+                                 uint32_t *p1_f0, uint32_t *p0_f0,
+                                 uint32_t *q0_f0, uint32_t *q1_f0) {
+  int32_t   vpx_filter_l, vpx_filter_r;
   int32_t   Filter1_l, Filter1_r, Filter2_l, Filter2_r;
   int32_t   subr_r, subr_l;
   uint32_t  t1, t2, HWM, t3;
@@ -247,34 +247,34 @@
   hev_r = hev_r & HWM;
 
   __asm__ __volatile__ (
-      /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */
-      "subq_s.ph    %[vp9_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
-      "subq_s.ph    %[vp9_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
+      /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
+      "subq_s.ph    %[vpx_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
+      "subq_s.ph    %[vpx_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
 
       /* qs0 - ps0 */
       "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
       "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
 
-      /* vp9_filter &= hev; */
-      "and          %[vp9_filter_l], %[vp9_filter_l], %[hev_l]        \n\t"
-      "and          %[vp9_filter_r], %[vp9_filter_r], %[hev_r]        \n\t"
+      /* vpx_filter &= hev; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[hev_l]        \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[hev_r]        \n\t"
 
-      /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0)); */
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
       "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
       "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
 
-      /* vp9_filter &= mask; */
-      "and          %[vp9_filter_l], %[vp9_filter_l], %[mask_l]       \n\t"
-      "and          %[vp9_filter_r], %[vp9_filter_r], %[mask_r]       \n\t"
+      /* vpx_filter &= mask; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[mask_l]       \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[mask_r]       \n\t"
 
-      : [vp9_filter_l] "=&r" (vp9_filter_l),
-        [vp9_filter_r] "=&r" (vp9_filter_r),
+      : [vpx_filter_l] "=&r" (vpx_filter_l),
+        [vpx_filter_r] "=&r" (vpx_filter_r),
         [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
         [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
       : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
@@ -286,13 +286,13 @@
 
   /* save bottom 3 bits so that we round one side +4 and the other +3 */
   __asm__ __volatile__ (
-      /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >>= 3; */
-      "addq_s.ph    %[Filter1_l],    %[vp9_filter_l], %[t2]           \n\t"
-      "addq_s.ph    %[Filter1_r],    %[vp9_filter_r], %[t2]           \n\t"
+      /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */
+      "addq_s.ph    %[Filter1_l],    %[vpx_filter_l], %[t2]           \n\t"
+      "addq_s.ph    %[Filter1_r],    %[vpx_filter_r], %[t2]           \n\t"
 
-      /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >>= 3; */
-      "addq_s.ph    %[Filter2_l],    %[vp9_filter_l], %[t1]           \n\t"
-      "addq_s.ph    %[Filter2_r],    %[vp9_filter_r], %[t1]           \n\t"
+      /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >>= 3; */
+      "addq_s.ph    %[Filter2_l],    %[vpx_filter_l], %[t1]           \n\t"
+      "addq_s.ph    %[Filter2_r],    %[vpx_filter_r], %[t1]           \n\t"
       "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
       "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
 
@@ -315,23 +315,23 @@
         [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
         [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
       : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
-        [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r)
+        [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r)
   );
 
   __asm__ __volatile__ (
-      /* (vp9_filter += 1) >>= 1 */
+      /* (vpx_filter += 1) >>= 1 */
       "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
       "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
 
-      /* vp9_filter &= ~hev; */
+      /* vpx_filter &= ~hev; */
       "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
       "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
 
-      /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */
+      /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */
       "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
       "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
 
-      /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */
+      /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */
       "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
       "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
 
@@ -369,10 +369,10 @@
   *q1_f0 = vqs1 ^ N128;
 }
 
-static INLINE void vp9_mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
-                                      uint32_t *op1, uint32_t *op0,
-                                      uint32_t *oq0, uint32_t *oq1,
-                                      uint32_t *oq2, uint32_t *oq3) {
+static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
+                                  uint32_t *op1, uint32_t *op0,
+                                  uint32_t *oq0, uint32_t *oq1,
+                                  uint32_t *oq2, uint32_t *oq3) {
   /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
   const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
   const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
@@ -446,14 +446,14 @@
   *oq2 = res_oq2;
 }
 
-static INLINE void vp9_mbfilter1_dspr2(uint32_t p3, uint32_t p2,
-                                       uint32_t p1, uint32_t p0,
-                                       uint32_t q0, uint32_t q1,
-                                       uint32_t q2, uint32_t q3,
-                                       uint32_t *op2_f1,
-                                       uint32_t *op1_f1, uint32_t *op0_f1,
-                                       uint32_t *oq0_f1, uint32_t *oq1_f1,
-                                       uint32_t *oq2_f1) {
+static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2,
+                                   uint32_t p1, uint32_t p0,
+                                   uint32_t q0, uint32_t q1,
+                                   uint32_t q2, uint32_t q3,
+                                   uint32_t *op2_f1,
+                                   uint32_t *op1_f1, uint32_t *op0_f1,
+                                   uint32_t *oq0_f1, uint32_t *oq1_f1,
+                                   uint32_t *oq2_f1) {
   /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
   uint32_t  res_op2, res_op1, res_op0;
   uint32_t  res_oq0, res_oq1, res_oq2;
@@ -524,14 +524,14 @@
   *oq2_f1 = res_oq2;
 }
 
-static INLINE void vp9_wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
-                                           uint32_t *op5, uint32_t *op4,
-                                           uint32_t *op3, uint32_t *op2,
-                                           uint32_t *op1, uint32_t *op0,
-                                           uint32_t *oq0, uint32_t *oq1,
-                                           uint32_t *oq2, uint32_t *oq3,
-                                           uint32_t *oq4, uint32_t *oq5,
-                                           uint32_t *oq6, uint32_t *oq7) {
+static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
+                                       uint32_t *op5, uint32_t *op4,
+                                       uint32_t *op3, uint32_t *op2,
+                                       uint32_t *op1, uint32_t *op0,
+                                       uint32_t *oq0, uint32_t *oq1,
+                                       uint32_t *oq2, uint32_t *oq3,
+                                       uint32_t *oq4, uint32_t *oq5,
+                                       uint32_t *oq6, uint32_t *oq7) {
   const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
   const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
   const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;

diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h b/vpx_dsp/mips/loopfilter_macros_dspr2.h
similarity index 99%
rename from vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
rename to vpx_dsp/mips/loopfilter_macros_dspr2.h
index ca01a6a..994ff18 100644
--- a/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
+++ b/vpx_dsp/mips/loopfilter_macros_dspr2.h

@@ -13,9 +13,9 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_onyxc_int.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
 
 #ifdef __cplusplus
 extern "C" {

diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h b/vpx_dsp/mips/loopfilter_masks_dspr2.h
similarity index 90%
rename from vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
rename to vpx_dsp/mips/loopfilter_masks_dspr2.h
index 5b0d9cc..e82dfb7 100644
--- a/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
+++ b/vpx_dsp/mips/loopfilter_masks_dspr2.h

@@ -13,9 +13,9 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_onyxc_int.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -24,13 +24,13 @@
 #if HAVE_DSPR2
 /* processing 4 pixels at the same time
  * compute hev and mask in the same function */
-static INLINE void vp9_filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
-                                             uint32_t p1, uint32_t p0,
-                                             uint32_t p3, uint32_t p2,
-                                             uint32_t q0, uint32_t q1,
-                                             uint32_t q2, uint32_t q3,
-                                             uint32_t thresh, uint32_t *hev,
-                                             uint32_t *mask) {
+static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
+                                         uint32_t p1, uint32_t p0,
+                                         uint32_t p3, uint32_t p2,
+                                         uint32_t q0, uint32_t q1,
+                                         uint32_t q2, uint32_t q3,
+                                         uint32_t thresh, uint32_t *hev,
+                                         uint32_t *mask) {
   uint32_t  c, r, r3, r_k;
   uint32_t  s1, s2, s3;
   uint32_t  ones = 0xFFFFFFFF;
@@ -129,16 +129,16 @@
   *mask = s2;
 }
 
-static INLINE void vp9_filter_hev_mask_flatmask4_dspr2(uint32_t limit,
-                                                       uint32_t flimit,
-                                                       uint32_t thresh,
-                                                       uint32_t p1, uint32_t p0,
-                                                       uint32_t p3, uint32_t p2,
-                                                       uint32_t q0, uint32_t q1,
-                                                       uint32_t q2, uint32_t q3,
-                                                       uint32_t *hev,
-                                                       uint32_t *mask,
-                                                       uint32_t *flat) {
+static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit,
+                                                   uint32_t flimit,
+                                                   uint32_t thresh,
+                                                   uint32_t p1, uint32_t p0,
+                                                   uint32_t p3, uint32_t p2,
+                                                   uint32_t q0, uint32_t q1,
+                                                   uint32_t q2, uint32_t q3,
+                                                   uint32_t *hev,
+                                                   uint32_t *mask,
+                                                   uint32_t *flat) {
   uint32_t  c, r, r3, r_k, r_flat;
   uint32_t  s1, s2, s3;
   uint32_t  ones = 0xFFFFFFFF;
@@ -279,12 +279,12 @@
   *flat = flat1;
 }
 
-static INLINE void vp9_flatmask5(uint32_t p4, uint32_t p3,
-                                 uint32_t p2, uint32_t p1,
-                                 uint32_t p0, uint32_t q0,
-                                 uint32_t q1, uint32_t q2,
-                                 uint32_t q3, uint32_t q4,
-                                 uint32_t *flat2) {
+static INLINE void flatmask5(uint32_t p4, uint32_t p3,
+                             uint32_t p2, uint32_t p1,
+                             uint32_t p0, uint32_t q0,
+                             uint32_t q1, uint32_t q2,
+                             uint32_t q3, uint32_t q4,
+                             uint32_t *flat2) {
   uint32_t  c, r, r_k, r_flat;
   uint32_t  ones = 0xFFFFFFFF;
   uint32_t  flat_thresh = 0x01010101;

diff --git a/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c b/vpx_dsp/mips/loopfilter_mb_dspr2.c
similarity index 90%
rename from vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
rename to vpx_dsp/mips/loopfilter_mb_dspr2.c
index 7cd0b63..4138f56 100644
--- a/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_mb_dspr2.c

@@ -10,17 +10,16 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
 
 #if HAVE_DSPR2
-void vp9_lpf_horizontal_8_dspr2(unsigned char *s,
+void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
                                 int pitch,
                                 const uint8_t *blimit,
                                 const uint8_t *limit,
@@ -53,7 +52,7 @@
   );
 
   /* prefetch data for store */
-  vp9_prefetch_store(s);
+  prefetch_store(s);
 
   for (i = 0; i < 2; i++) {
     sp3 = s - (pitch << 2);
@@ -81,13 +80,13 @@
           [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
     );
 
-    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
-                                        p1, p0, p3, p2, q0, q1, q2, q3,
-                                        &hev, &mask, &flat);
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
 
     if ((flat == 0) && (mask != 0)) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       __asm__ __volatile__ (
           "sw       %[p1_f0],   (%[sp1])    \n\t"
@@ -104,13 +103,13 @@
     } else if ((mask & flat) == 0xFFFFFFFF) {
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       COMBINE_LEFT_RIGHT_0TO2()
 
@@ -130,18 +129,18 @@
       );
     } else if ((flat != 0) && (mask != 0)) {
       /* filtering */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       if (mask & flat & 0x000000FF) {
         __asm__ __volatile__ (
@@ -319,7 +318,7 @@
   }
 }
 
-void vp9_lpf_vertical_8_dspr2(unsigned char *s,
+void vpx_lpf_vertical_8_dspr2(unsigned char *s,
                               int pitch,
                               const uint8_t *blimit,
                               const uint8_t *limit,
@@ -351,7 +350,7 @@
       : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
   );
 
-  vp9_prefetch_store(s + pitch);
+  prefetch_store(s + pitch);
 
   for (i = 0; i < 2; i++) {
     s1 = s;
@@ -451,39 +450,39 @@
         :
     );
 
-    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
-                                        p1, p0, p3, p2, q0, q1, q2, q3,
-                                        &hev, &mask, &flat);
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
 
     if ((flat == 0) && (mask != 0)) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
       STORE_F0()
     } else if ((mask & flat) == 0xFFFFFFFF) {
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       STORE_F1()
     } else if ((flat != 0) && (mask != 0)) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       if (mask & flat & 0x000000FF) {
         __asm__ __volatile__ (

diff --git a/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
similarity index 90%
rename from vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
rename to vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
index 6c94674..8a48650 100644
--- a/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c

@@ -10,17 +10,16 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
 
 #if HAVE_DSPR2
-void vp9_lpf_horizontal_16_dspr2(unsigned char *s,
+void vpx_lpf_horizontal_16_dspr2(unsigned char *s,
                                  int pitch,
                                  const uint8_t *blimit,
                                  const uint8_t *limit,
@@ -58,7 +57,7 @@
   );
 
   /* prefetch data for store */
-  vp9_prefetch_store(s);
+  prefetch_store(s);
 
   for (i = 0; i < (2 * count); i++) {
     sp7 = s - (pitch << 3);
@@ -110,17 +109,17 @@
           [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6), [sq7] "r" (sq7)
     );
 
-    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
-                                        p1, p0, p3, p2, q0, q1, q2, q3,
-                                        &hev, &mask, &flat);
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
 
-    vp9_flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+    flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
 
     /* f0 */
     if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
         ((flat2 != 0) && (flat == 0) && (mask != 0))) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       __asm__ __volatile__ (
           "sw       %[p1_f0],   (%[sp1])            \n\t"
@@ -139,17 +138,17 @@
       /* f2 */
       PACK_LEFT_0TO3()
       PACK_LEFT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
-                              &p3_l, &p2_l, &p1_l, &p0_l,
-                              &q0_l, &q1_l, &q2_l, &q3_l,
-                              &q4_l, &q5_l, &q6_l, &q7_l);
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
 
       PACK_RIGHT_0TO3()
       PACK_RIGHT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
-                              &p3_r, &p2_r, &p1_r, &p0_r,
-                              &q0_r, &q1_r, &q2_r, &q3_r,
-                              &q4_r, &q5_r, &q6_r, &q7_r);
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
 
       COMBINE_LEFT_RIGHT_0TO2()
       COMBINE_LEFT_RIGHT_3TO6()
@@ -189,13 +188,13 @@
       /* f1 */
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       COMBINE_LEFT_RIGHT_0TO2()
 
@@ -215,18 +214,18 @@
       );
     } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
       /* f0+f1 */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       if (mask & flat & 0x000000FF) {
         __asm__ __volatile__ (
@@ -399,36 +398,36 @@
     } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
       /* f0 + f1 + f2 */
       /* f0  function */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* f1  function */
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
-                          q0_l, q1_l, q2_l, q3_l,
-                          &p2_l_f1, &p1_l_f1, &p0_l_f1,
-                          &q0_l_f1, &q1_l_f1, &q2_l_f1);
+      mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
+                      q0_l, q1_l, q2_l, q3_l,
+                      &p2_l_f1, &p1_l_f1, &p0_l_f1,
+                      &q0_l_f1, &q1_l_f1, &q2_l_f1);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
-                          q0_r, q1_r, q2_r, q3_r,
-                          &p2_r_f1, &p1_r_f1, &p0_r_f1,
-                          &q0_r_f1, &q1_r_f1, &q2_r_f1);
+      mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
+                      q0_r, q1_r, q2_r, q3_r,
+                      &p2_r_f1, &p1_r_f1, &p0_r_f1,
+                      &q0_r_f1, &q1_r_f1, &q2_r_f1);
 
       /* f2  function */
       PACK_LEFT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
-                              &p3_l, &p2_l, &p1_l, &p0_l,
-                              &q0_l, &q1_l, &q2_l, &q3_l,
-                              &q4_l, &q5_l, &q6_l, &q7_l);
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
 
       PACK_RIGHT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
-                              &p3_r, &p2_r, &p1_r, &p0_r,
-                              &q0_r, &q1_r, &q2_r, &q3_r,
-                              &q4_r, &q5_r, &q6_r, &q7_r);
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
 
       if (mask & flat & flat2 & 0x000000FF) {
         __asm__ __volatile__ (

diff --git a/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c b/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c
similarity index 90%
rename from vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
rename to vpx_dsp/mips/loopfilter_mb_vert_dspr2.c
index 851fc6c..e580f01 100644
--- a/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c

@@ -10,17 +10,16 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
 
 #if HAVE_DSPR2
-void vp9_lpf_vertical_16_dspr2(uint8_t *s,
+void vpx_lpf_vertical_16_dspr2(uint8_t *s,
                                int pitch,
                                const uint8_t *blimit,
                                const uint8_t *limit,
@@ -55,7 +54,7 @@
       : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
   );
 
-  vp9_prefetch_store(s + pitch);
+  prefetch_store(s + pitch);
 
   for (i = 0; i < 2; i++) {
     s1 = s;
@@ -248,61 +247,61 @@
         :
     );
 
-    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
-                                        p1, p0, p3, p2, q0, q1, q2, q3,
-                                        &hev, &mask, &flat);
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
 
-    vp9_flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+    flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
 
     /* f0 */
     if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
         ((flat2 != 0) && (flat == 0) && (mask != 0))) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
       STORE_F0()
     } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) &&
                (mask == 0xFFFFFFFF)) {
       /* f2 */
       PACK_LEFT_0TO3()
       PACK_LEFT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
-                              &p3_l, &p2_l, &p1_l, &p0_l,
-                              &q0_l, &q1_l, &q2_l, &q3_l,
-                              &q4_l, &q5_l, &q6_l, &q7_l);
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
 
       PACK_RIGHT_0TO3()
       PACK_RIGHT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
-                              &p3_r, &p2_r, &p1_r, &p0_r,
-                              &q0_r, &q1_r, &q2_r, &q3_r,
-                              &q4_r, &q5_r, &q6_r, &q7_r);
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
 
       STORE_F2()
     } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
       /* f1 */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       STORE_F1()
     } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
       /* f0 + f1 */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       if (mask & flat & 0x000000FF) {
         __asm__ __volatile__ (
@@ -466,32 +465,32 @@
       }
     } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
       /* f0+f1+f2 */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       PACK_LEFT_0TO3()
-      vp9_mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
-                          q0_l, q1_l, q2_l, q3_l,
-                          &p2_l_f1, &p1_l_f1, &p0_l_f1,
-                          &q0_l_f1, &q1_l_f1, &q2_l_f1);
+      mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
+                      q0_l, q1_l, q2_l, q3_l,
+                      &p2_l_f1, &p1_l_f1, &p0_l_f1,
+                      &q0_l_f1, &q1_l_f1, &q2_l_f1);
 
       PACK_RIGHT_0TO3()
-      vp9_mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
-                          q0_r, q1_r, q2_r, q3_r,
-                          &p2_r_f1, &p1_r_f1, &p0_r_f1,
-                          &q0_r_f1, &q1_r_f1, &q2_r_f1);
+      mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
+                      q0_r, q1_r, q2_r, q3_r,
+                      &p2_r_f1, &p1_r_f1, &p0_r_f1,
+                      &q0_r_f1, &q1_r_f1, &q2_r_f1);
 
       PACK_LEFT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
-                              &p3_l, &p2_l, &p1_l, &p0_l,
-                              &q0_l, &q1_l, &q2_l, &q3_l,
-                              &q4_l, &q5_l, &q6_l, &q7_l);
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
 
       PACK_RIGHT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
-                              &p3_r, &p2_r, &p1_r, &p0_r,
-                              &q0_r, &q1_r, &q2_r, &q3_r,
-                              &q4_r, &q5_r, &q6_r, &q7_r);
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
 
       if (mask & flat & flat2 & 0x000000FF) {
         __asm__ __volatile__ (

diff --git a/vp9/common/mips/msa/vp9_loopfilter_msa.h b/vpx_dsp/mips/loopfilter_msa.h
similarity index 98%
rename from vp9/common/mips/msa/vp9_loopfilter_msa.h
rename to vpx_dsp/mips/loopfilter_msa.h
index bfbe870..62b1706 100644
--- a/vp9/common/mips/msa/vp9_loopfilter_msa.h
+++ b/vpx_dsp/mips/loopfilter_msa.h

@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_
-#define VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_
+#ifndef VPX_DSP_LOOPFILTER_MSA_H_
+#define VPX_DSP_LOOPFILTER_MSA_H_
 
 #include "vpx_dsp/mips/macros_msa.h"
 
@@ -243,4 +243,4 @@
   mask_out = limit_in < (v16u8)mask_out;                         \
   mask_out = __msa_xori_b(mask_out, 0xff);                       \
 }
-#endif  /* VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_ */
+#endif  /* VPX_DSP_LOOPFILTER_MSA_H_ */

diff --git a/vp9/common/vp9_prob.c b/vpx_dsp/prob.c
similarity index 97%
rename from vp9/common/vp9_prob.c
rename to vpx_dsp/prob.c
index 3b7b9bf..8f44e18 100644
--- a/vp9/common/vp9_prob.c
+++ b/vpx_dsp/prob.c

@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp9/common/vp9_prob.h"
+#include "./prob.h"
 
 const uint8_t vp9_norm[256] = {
   0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,

diff --git a/vp9/common/vp9_prob.h b/vpx_dsp/prob.h
similarity index 95%
rename from vp9/common/vp9_prob.h
rename to vpx_dsp/prob.h
index c69c62c..07a367a 100644
--- a/vp9/common/vp9_prob.h
+++ b/vpx_dsp/prob.h

@@ -8,15 +8,14 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_PROB_H_
-#define VP9_COMMON_VP9_PROB_H_
+#ifndef VPX_DSP_PROB_H_
+#define VPX_DSP_PROB_H_
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_common.h"
 
 #include "vpx_ports/mem.h"
 
-#include "vp9/common/vp9_common.h"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -101,4 +100,4 @@
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_PROB_H_
+#endif  // VPX_DSP_PROB_H_

diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c
new file mode 100644
index 0000000..137f5bc
--- /dev/null
+++ b/vpx_dsp/quantize.c

@@ -0,0 +1,337 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/quantize.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp9_quantize_dc(const tran_low_t *coeff_ptr,
+                     int n_coeffs, int skip_block,
+                     const int16_t *round_ptr, const int16_t quant,
+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  const int rc = 0;
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int tmp, eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+    tmp = (tmp * quant) >> 16;
+    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
+    if (tmp)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+                            int n_coeffs, int skip_block,
+                            const int16_t *round_ptr, const int16_t quant,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  int eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    const int coeff = coeff_ptr[0];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp = abs_coeff + round_ptr[0];
+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16);
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
+    if (abs_qcoeff)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
+void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  const int n_coeffs = 1024;
+  const int rc = 0;
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int tmp, eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+                INT16_MIN, INT16_MAX);
+    tmp = (tmp * quant) >> 15;
+    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
+    if (tmp)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
+                                  int skip_block,
+                                  const int16_t *round_ptr,
+                                  const int16_t quant,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t dequant_ptr,
+                                  uint16_t *eob_ptr) {
+  const int n_coeffs = 1024;
+  int eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    const int coeff = coeff_ptr[0];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
+    if (abs_qcoeff)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
+void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                      int skip_block,
+                      const int16_t *zbin_ptr, const int16_t *round_ptr,
+                      const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                      const int16_t *dequant_ptr,
+                      uint16_t *eob_ptr,
+                      const int16_t *scan, const int16_t *iscan) {
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+        non_zero_count--;
+      else
+        break;
+    }
+
+    // Quantization pass: All coefficients with index >= zero_flag are
+    // skippable. Note: zero_flag can be zero.
+    for (i = 0; i < non_zero_count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      if (abs_coeff >= zbins[rc != 0]) {
+        int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+        tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+                  quant_shift_ptr[rc != 0]) >> 16;  // quantization
+        qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+        if (tmp)
+          eob = i;
+      }
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t *zbin_ptr,
+                             const int16_t *round_ptr, const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr,
+                             uint16_t *eob_ptr, const int16_t *scan,
+                             const int16_t *iscan) {
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+        non_zero_count--;
+      else
+        break;
+    }
+
+    // Quantization pass: All coefficients with index >= zero_flag are
+    // skippable. Note: zero_flag can be zero.
+    for (i = 0; i < non_zero_count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      if (abs_coeff >= zbins[rc != 0]) {
+        const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
+        const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+        const uint32_t abs_qcoeff =
+            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
+        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+        if (abs_qcoeff)
+          eob = i;
+      }
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
+void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            int skip_block,
+                            const int16_t *zbin_ptr, const int16_t *round_ptr,
+                            const int16_t *quant_ptr,
+                            const int16_t *quant_shift_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t *dequant_ptr,
+                            uint16_t *eob_ptr,
+                            const int16_t *scan, const int16_t *iscan) {
+  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+
+  int idx = 0;
+  int idx_arr[1024];
+  int i, eob = -1;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      int tmp;
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+      tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
+               quant_shift_ptr[rc != 0]) >> 15;
+
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+
+      if (tmp)
+        eob = idx_arr[i];
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
+                                   intptr_t n_coeffs, int skip_block,
+                                   const int16_t *zbin_ptr,
+                                   const int16_t *round_ptr,
+                                   const int16_t *quant_ptr,
+                                   const int16_t *quant_shift_ptr,
+                                   tran_low_t *qcoeff_ptr,
+                                   tran_low_t *dqcoeff_ptr,
+                                   const int16_t *dequant_ptr,
+                                   uint16_t *eob_ptr,
+                                   const int16_t *scan, const int16_t *iscan) {
+  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+
+  int idx = 0;
+  int idx_arr[1024];
+  int i, eob = -1;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int64_t tmp1 = abs_coeff
+                         + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+      if (abs_qcoeff)
+        eob = idx_arr[i];
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif

diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h
new file mode 100644
index 0000000..0ad1744
--- /dev/null
+++ b/vpx_dsp/quantize.h

@@ -0,0 +1,51 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_QUANTIZE_H_
+#define VPX_DSP_QUANTIZE_H_
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_quantize_dc(const tran_low_t *coeff_ptr,
+                     int n_coeffs, int skip_block,
+                     const int16_t *round_ptr, const int16_t quant_ptr,
+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                     const int16_t dequant_ptr, uint16_t *eob_ptr);
+void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant_ptr,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+                            int n_coeffs, int skip_block,
+                            const int16_t *round_ptr, const int16_t quant_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant_ptr, uint16_t *eob_ptr);
+void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
+                                  int skip_block,
+                                  const int16_t *round_ptr,
+                                  const int16_t quant_ptr,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t dequant_ptr,
+                                  uint16_t *eob_ptr);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_QUANTIZE_H_

diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 22d1fed..2efc945 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk

@@ -13,6 +13,69 @@
 
 DSP_SRCS-$(HAVE_MSA)    += mips/macros_msa.h
 
+# bit reader
+DSP_SRCS-yes += prob.h
+DSP_SRCS-yes += prob.c
+
+ifeq ($(CONFIG_ENCODERS),yes)
+DSP_SRCS-yes += bitwriter.h
+DSP_SRCS-yes += bitwriter.c
+DSP_SRCS-yes += bitwriter_buffer.c
+DSP_SRCS-yes += bitwriter_buffer.h
+endif
+
+ifeq ($(CONFIG_DECODERS),yes)
+DSP_SRCS-yes += bitreader.h
+DSP_SRCS-yes += bitreader.c
+DSP_SRCS-yes += bitreader_buffer.c
+DSP_SRCS-yes += bitreader_buffer.h
+endif
+
+# loop filters
+DSP_SRCS-yes += loopfilter.c
+
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/loopfilter_sse2.c
+DSP_SRCS-$(HAVE_AVX2)                += x86/loopfilter_avx2.c
+DSP_SRCS-$(HAVE_MMX)                 += x86/loopfilter_mmx.asm
+
+DSP_SRCS-$(HAVE_NEON)   += arm/loopfilter_neon.c
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes  += arm/loopfilter_mb_neon$(ASM)
+DSP_SRCS-yes  += arm/loopfilter_16_neon$(ASM)
+DSP_SRCS-yes  += arm/loopfilter_8_neon$(ASM)
+DSP_SRCS-yes  += arm/loopfilter_4_neon$(ASM)
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes   += arm/loopfilter_16_neon.c
+DSP_SRCS-yes   += arm/loopfilter_8_neon.c
+DSP_SRCS-yes   += arm/loopfilter_4_neon.c
+endif  # HAVE_NEON
+endif  # HAVE_NEON_ASM
+
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_msa.h
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_16_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_8_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_4_msa.c
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_loopfilter_sse2.c
+endif  # CONFIG_VP9_HIGHBITDEPTH
+
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+DSP_SRCS-yes            += quantize.c
+DSP_SRCS-yes            += quantize.h
+
+DSP_SRCS-$(HAVE_SSE2)   += x86/quantize_sse2.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
+endif
+ifeq ($(ARCH_X86_64),yes)
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm
+endif
+endif
+endif  # CONFIG_VP9_ENCODER
+
 ifeq ($(CONFIG_ENCODERS),yes)
 DSP_SRCS-yes            += sad.c
 DSP_SRCS-yes            += subtract.c

diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h
index c7bac95..6793036 100644
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h

@@ -24,6 +24,20 @@
 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
 #define MAX(x, y) (((x) > (y)) ? (x) : (y))
 
+#if CONFIG_VP9_HIGHBITDEPTH
+// Note:
+// tran_low_t  is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+#else
+// Note:
+// tran_low_t  is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int32_t tran_high_t;
+typedef int16_t tran_low_t;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static INLINE uint8_t clip_pixel(int val) {
   return (val > 255) ? 255 : (val < 0) ? 0 : val;
 }

diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 8e4e966..393539c 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -5,12 +5,18 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 
 EOF
 }
 forward_decls qw/vpx_dsp_forward_decls/;
 
-# Functions which use x86inc.asm instead of x86_abi_support.asm
+# x86inc.asm had specific constraints. break it out so it's easy to disable.
+# zero all the variables to avoid tricky else conditions.
+$mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc = $avx_x86inc =
+  $avx2_x86inc = '';
+$mmx_x86_64_x86inc = $sse_x86_64_x86inc = $sse2_x86_64_x86inc =
+  $ssse3_x86_64_x86inc = $avx_x86_64_x86inc = $avx2_x86_64_x86inc = '';
 if (vpx_config("CONFIG_USE_X86INC") eq "yes") {
   $mmx_x86inc = 'mmx';
   $sse_x86inc = 'sse';
@@ -18,23 +24,127 @@
   $ssse3_x86inc = 'ssse3';
   $avx_x86inc = 'avx';
   $avx2_x86inc = 'avx2';
-} else {
-  $mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc =
-  $avx_x86inc = $avx2_x86inc = '';
+  if ($opts{arch} eq "x86_64") {
+    $mmx_x86_64_x86inc = 'mmx';
+    $sse_x86_64_x86inc = 'sse';
+    $sse2_x86_64_x86inc = 'sse2';
+    $ssse3_x86_64_x86inc = 'ssse3';
+    $avx_x86_64_x86inc = 'avx';
+    $avx2_x86_64_x86inc = 'avx2';
+  }
 }
 
-# Functions which are 64 bit only.
+# functions that are 64 bit only.
+$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
 if ($opts{arch} eq "x86_64") {
   $mmx_x86_64 = 'mmx';
   $sse2_x86_64 = 'sse2';
   $ssse3_x86_64 = 'ssse3';
   $avx_x86_64 = 'avx';
   $avx2_x86_64 = 'avx2';
-} else {
-  $mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 =
-  $avx_x86_64 = $avx2_x86_64 = '';
 }
 
+#
+# Loopfilter
+#
+add_proto qw/void vpx_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_vertical_16 sse2 neon_asm msa/;
+$vpx_lpf_vertical_16_neon_asm=vpx_lpf_vertical_16_neon;
+
+add_proto qw/void vpx_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_vertical_16_dual sse2 neon_asm msa/;
+$vpx_lpf_vertical_16_dual_neon_asm=vpx_lpf_vertical_16_dual_neon;
+
+add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_vertical_8 sse2 neon msa/;
+
+add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm msa/;
+$vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
+
+add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_vertical_4 mmx neon msa/;
+
+add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_vertical_4_dual sse2 neon msa/;
+
+add_proto qw/void vpx_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon_asm msa/;
+$vpx_lpf_horizontal_16_neon_asm=vpx_lpf_horizontal_16_neon;
+
+add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_horizontal_8 sse2 neon msa/;
+
+add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm msa/;
+$vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
+
+add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_horizontal_4 mmx neon msa/;
+
+add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_horizontal_4_dual sse2 neon msa/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vpx_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_16 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_16_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_8 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_8_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_4 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_4_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_16 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_8 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_8_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_4 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_4_dual sse2/;
+}  # CONFIG_VP9_HIGHBITDEPTH
+
+#
+# Encoder functions.
+#
+if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b/;
+
+  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b_32x32/;
+
+  add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_highbd_quantize_b sse2/;
+
+  add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_highbd_quantize_b_32x32 sse2/;
+} else {
+  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b sse2/, "$ssse3_x86_64_x86inc";
+
+  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64_x86inc";
+}  # CONFIG_VP9_ENCODER
+}  # CONFIG_VP9_HIGHBITDEPTH
+
 if (vpx_config("CONFIG_ENCODERS") eq "yes") {
 #
 # Block subtraction

diff --git a/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c b/vpx_dsp/x86/highbd_loopfilter_sse2.c
similarity index 96%
rename from vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
rename to vpx_dsp/x86/highbd_loopfilter_sse2.c
index b40669c..c4fd5e1 100644
--- a/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
+++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c

@@ -10,9 +10,8 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_loopfilter.h"
 #include "vpx_ports/emmintrin_compat.h"
 
 static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
@@ -509,7 +508,7 @@
 }
 
 // TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
-void vp9_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
                                        const uint8_t *_blimit,
                                        const uint8_t *_limit,
                                        const uint8_t *_thresh,
@@ -520,7 +519,7 @@
     highbd_mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh, bd);
 }
 
-void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
                                       const uint8_t *_blimit,
                                       const uint8_t *_limit,
                                       const uint8_t *_thresh,
@@ -688,7 +687,7 @@
   filt = _mm_adds_epi16(filt, work_a);
   filt = _mm_adds_epi16(filt, work_a);
   filt = _mm_adds_epi16(filt, work_a);
-  // (vp9_filter + 3 * (qs0 - ps0)) & mask
+  // (vpx_filter + 3 * (qs0 - ps0)) & mask
   filt = signed_char_clamp_bd_sse2(filt, bd);
   filt = _mm_and_si128(filt, mask);
 
@@ -757,7 +756,7 @@
   _mm_store_si128((__m128i *)(s + 2 * p), q2);
 }
 
-void vp9_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p,
                                            const uint8_t *_blimit0,
                                            const uint8_t *_limit0,
                                            const uint8_t *_thresh0,
@@ -765,12 +764,12 @@
                                            const uint8_t *_limit1,
                                            const uint8_t *_thresh1,
                                            int bd) {
-  vp9_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
-  vp9_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1,
+  vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1,
                                    1, bd);
 }
 
-void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
                                       const uint8_t *_blimit,
                                       const uint8_t *_limit,
                                       const uint8_t *_thresh,
@@ -892,7 +891,7 @@
   filt = _mm_adds_epi16(filt, work_a);
   filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
 
-  // (vp9_filter + 3 * (qs0 - ps0)) & mask
+  // (vpx_filter + 3 * (qs0 - ps0)) & mask
   filt = _mm_and_si128(filt, mask);
 
   filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
@@ -937,7 +936,7 @@
   _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
 }
 
-void vp9_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p,
                                            const uint8_t *_blimit0,
                                            const uint8_t *_limit0,
                                            const uint8_t *_thresh0,
@@ -945,8 +944,8 @@
                                            const uint8_t *_limit1,
                                            const uint8_t *_thresh1,
                                            int bd) {
-  vp9_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
-  vp9_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, 1,
+  vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, 1,
                                    bd);
 }
 
@@ -1055,7 +1054,7 @@
   highbd_transpose(src1, in_p, dest1, out_p, 1);
 }
 
-void vp9_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
                                     const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh,
@@ -1072,7 +1071,7 @@
   highbd_transpose(src, p, dst, 8, 1);
 
   // Loop filtering
-  vp9_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
+  vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
                                    bd);
 
   src[0] = t_dst;
@@ -1082,7 +1081,7 @@
   highbd_transpose(src, 8, dst, p, 1);
 }
 
-void vp9_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p,
                                          const uint8_t *blimit0,
                                          const uint8_t *limit0,
                                          const uint8_t *thresh0,
@@ -1098,7 +1097,7 @@
   highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
 
   // Loop filtering
-  vp9_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+  vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
                                         thresh0, blimit1, limit1, thresh1, bd);
   src[0] = t_dst;
   src[1] = t_dst + 8;
@@ -1109,7 +1108,7 @@
   highbd_transpose(src, 16, dst, p, 2);
 }
 
-void vp9_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
                                     const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh,
@@ -1126,7 +1125,7 @@
   highbd_transpose(src, p, dst, 8, 1);
 
   // Loop filtering
-  vp9_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
+  vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
                                    bd);
 
   src[0] = t_dst;
@@ -1136,7 +1135,7 @@
   highbd_transpose(src, 8, dst, p, 1);
 }
 
-void vp9_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p,
                                          const uint8_t *blimit0,
                                          const uint8_t *limit0,
                                          const uint8_t *thresh0,
@@ -1152,7 +1151,7 @@
   highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
 
   // Loop filtering
-  vp9_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+  vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
                                         thresh0, blimit1, limit1, thresh1, bd);
   src[0] = t_dst;
   src[1] = t_dst + 8;
@@ -1164,7 +1163,7 @@
   highbd_transpose(src, 16, dst, p, 2);
 }
 
-void vp9_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
                                      const uint8_t *blimit,
                                      const uint8_t *limit,
                                      const uint8_t *thresh,
@@ -1193,7 +1192,7 @@
   highbd_transpose(src, 8, dst, p, 2);
 }
 
-void vp9_highbd_lpf_vertical_16_dual_sse2(uint16_t *s,
+void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s,
                                           int p,
                                           const uint8_t *blimit,
                                           const uint8_t *limit,

diff --git a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
similarity index 97%
rename from vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
rename to vpx_dsp/x86/highbd_quantize_intrin_sse2.c
index 0174cfe..06c748d 100644
--- a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
+++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c

@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -10,8 +10,9 @@
 
 #include <emmintrin.h>
 
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_common.h"
 
 #if CONFIG_VP9_HIGHBITDEPTH
 // from vp9_idct.h: typedef int32_t tran_low_t;

diff --git a/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/vpx_dsp/x86/loopfilter_avx2.c
similarity index 98%
rename from vp9/common/x86/vp9_loopfilter_intrin_avx2.c
rename to vpx_dsp/x86/loopfilter_avx2.c
index 770a65f..23a97dd 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
+++ b/vpx_dsp/x86/loopfilter_avx2.c

@@ -10,7 +10,7 @@
 
 #include <immintrin.h>  /* AVX2 */
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 
 static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
@@ -103,7 +103,7 @@
         filt = _mm_adds_epi8(filt, work_a);
         filt = _mm_adds_epi8(filt, work_a);
         filt = _mm_adds_epi8(filt, work_a);
-        /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+        /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
         filt = _mm_and_si128(filt, mask);
 
         filter1 = _mm_adds_epi8(filt, t4);
@@ -515,7 +515,7 @@
         filt = _mm_adds_epi8(filt, work_a);
         filt = _mm_adds_epi8(filt, work_a);
         filt = _mm_adds_epi8(filt, work_a);
-        /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+        /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
         filt = _mm_and_si128(filt, mask);
 
         filter1 = _mm_adds_epi8(filt, t4);
@@ -976,7 +976,7 @@
     }
 }
 
-void vp9_lpf_horizontal_16_avx2(unsigned char *s, int p,
+void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
         const unsigned char *_blimit, const unsigned char *_limit,
         const unsigned char *_thresh, int count) {
     if (count == 1)

diff --git a/vp9/common/x86/vp9_loopfilter_mmx.asm b/vpx_dsp/x86/loopfilter_mmx.asm
similarity index 98%
rename from vp9/common/x86/vp9_loopfilter_mmx.asm
rename to vpx_dsp/x86/loopfilter_mmx.asm
index f5f7d5a..b9c18b6 100644
--- a/vp9/common/x86/vp9_loopfilter_mmx.asm
+++ b/vpx_dsp/x86/loopfilter_mmx.asm

@@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"
 
 
-;void vp9_lpf_horizontal_4_mmx
+;void vpx_lpf_horizontal_4_mmx
 ;(
 ;    unsigned char *src_ptr,
 ;    int src_pixel_step,
@@ -21,8 +21,8 @@
 ;    const char *thresh,
 ;    int  count
 ;)
-global sym(vp9_lpf_horizontal_4_mmx) PRIVATE
-sym(vp9_lpf_horizontal_4_mmx):
+global sym(vpx_lpf_horizontal_4_mmx) PRIVATE
+sym(vpx_lpf_horizontal_4_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -224,7 +224,7 @@
     ret
 
 
-;void vp9_lpf_vertical_4_mmx
+;void vpx_lpf_vertical_4_mmx
 ;(
 ;    unsigned char *src_ptr,
 ;    int  src_pixel_step,
@@ -233,8 +233,8 @@
 ;    const char *thresh,
 ;    int count
 ;)
-global sym(vp9_lpf_vertical_4_mmx) PRIVATE
-sym(vp9_lpf_vertical_4_mmx):
+global sym(vpx_lpf_vertical_4_mmx) PRIVATE
+sym(vpx_lpf_vertical_4_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6

diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c
similarity index 97%
rename from vp9/common/x86/vp9_loopfilter_intrin_sse2.c
rename to vpx_dsp/x86/loopfilter_sse2.c
index fe8af54..ed10127 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vpx_dsp/x86/loopfilter_sse2.c

@@ -10,8 +10,8 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_loopfilter.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
 #include "vpx_ports/emmintrin_compat.h"
 
 static INLINE __m128i abs_diff(__m128i a, __m128i b) {
@@ -100,7 +100,7 @@
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
@@ -495,7 +495,7 @@
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
     filter1 = _mm_adds_epi8(filt, t4);
     filter2 = _mm_adds_epi8(filt, t3);
@@ -717,7 +717,7 @@
 }
 
 // TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
-void vp9_lpf_horizontal_16_sse2(unsigned char *s, int p,
+void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
                                 const unsigned char *_blimit,
                                 const unsigned char *_limit,
                                 const unsigned char *_thresh, int count) {
@@ -727,7 +727,7 @@
     mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
 }
 
-void vp9_lpf_horizontal_8_sse2(unsigned char *s, int p,
+void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh, int count) {
@@ -874,7 +874,7 @@
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
@@ -943,7 +943,7 @@
   }
 }
 
-void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
+void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
                                     const uint8_t *_blimit0,
                                     const uint8_t *_limit0,
                                     const uint8_t *_thresh0,
@@ -1115,7 +1115,7 @@
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
@@ -1190,7 +1190,7 @@
   }
 }
 
-void vp9_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
+void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
                                     const unsigned char *_blimit0,
                                     const unsigned char *_limit0,
                                     const unsigned char *_thresh0,
@@ -1286,7 +1286,7 @@
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
@@ -1464,7 +1464,7 @@
   } while (++idx8x8 < num_8x8_to_transpose);
 }
 
-void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
@@ -1478,7 +1478,7 @@
   transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
 
   // Loop filtering
-  vp9_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+  vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                  blimit1, limit1, thresh1);
   src[0] = t_dst;
   src[1] = t_dst + 8;
@@ -1489,7 +1489,7 @@
   transpose(src, 16, dst, p, 2);
 }
 
-void vp9_lpf_vertical_8_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
                              const unsigned char *blimit,
                              const unsigned char *limit,
                              const unsigned char *thresh, int count) {
@@ -1505,7 +1505,7 @@
   transpose(src, p, dst, 8, 1);
 
   // Loop filtering
-  vp9_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);
+  vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);
 
   src[0] = t_dst;
   dst[0] = s - 4;
@@ -1514,7 +1514,7 @@
   transpose(src, 8, dst, p, 1);
 }
 
-void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
@@ -1528,7 +1528,7 @@
   transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
 
   // Loop filtering
-  vp9_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+  vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                  blimit1, limit1, thresh1);
   src[0] = t_dst;
   src[1] = t_dst + 8;
@@ -1540,7 +1540,7 @@
   transpose(src, 16, dst, p, 2);
 }
 
-void vp9_lpf_vertical_16_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
                               const unsigned char *blimit,
                               const unsigned char *limit,
                               const unsigned char *thresh) {
@@ -1568,7 +1568,7 @@
   transpose(src, 8, dst, p, 2);
 }
 
-void vp9_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh) {
   DECLARE_ALIGNED(16, unsigned char, t_dst[256]);

diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c
new file mode 100644
index 0000000..8d51aeb
--- /dev/null
+++ b/vpx_dsp/x86/quantize_sse2.c

@@ -0,0 +1,223 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
+                         int skip_block, const int16_t* zbin_ptr,
+                         const int16_t* round_ptr, const int16_t* quant_ptr,
+                         const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
+                         int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+                         uint16_t* eob_ptr,
+                         const int16_t* scan_ptr,
+                         const int16_t* iscan_ptr) {
+  __m128i zero;
+  (void)scan_ptr;
+
+  coeff_ptr += n_coeffs;
+  iscan_ptr += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+  zero = _mm_setzero_si128();
+  if (!skip_block) {
+    __m128i eob;
+    __m128i zbin;
+    __m128i round, quant, dequant, shift;
+    {
+      __m128i coeff0, coeff1;
+
+      // Setup global values
+      {
+        __m128i pw_1;
+        zbin = _mm_load_si128((const __m128i*)zbin_ptr);
+        round = _mm_load_si128((const __m128i*)round_ptr);
+        quant = _mm_load_si128((const __m128i*)quant_ptr);
+        pw_1 = _mm_set1_epi16(1);
+        zbin = _mm_sub_epi16(zbin, pw_1);
+        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+        shift = _mm_load_si128((const __m128i*)quant_shift_ptr);
+      }
+
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        __m128i cmp_mask0, cmp_mask1;
+        // Do DC and first 15 AC
+        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+        // Poor man's sign extract
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        round = _mm_unpackhi_epi64(round, round);
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        quant = _mm_unpackhi_epi64(quant, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
+        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+        shift = _mm_unpackhi_epi64(shift, shift);
+        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        // Mask out zbin threshold coeffs
+        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        dequant = _mm_unpackhi_epi64(dequant, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob = _mm_max_epi16(eob, eob1);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // AC only loop
+    while (n_coeffs < 0) {
+      __m128i coeff0, coeff1;
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        __m128i cmp_mask0, cmp_mask1;
+
+        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+        // Poor man's sign extract
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
+        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        // Mask out zbin threshold coeffs
+        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob0, eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob0 = _mm_max_epi16(eob0, eob1);
+        eob = _mm_max_epi16(eob, eob0);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // Accumulate EOB
+    {
+      __m128i eob_shuffled;
+      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      *eob_ptr = _mm_extract_epi16(eob, 1);
+    }
+  } else {
+    do {
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+      n_coeffs += 8 * 2;
+    } while (n_coeffs < 0);
+    *eob_ptr = 0;
+  }
+}

diff --git a/vpx_dsp/x86/quantize_ssse3_x86_64.asm b/vpx_dsp/x86/quantize_ssse3_x86_64.asm
new file mode 100644
index 0000000..3784d9d
--- /dev/null
+++ b/vpx_dsp/x86/quantize_ssse3_x86_64.asm

@@ -0,0 +1,216 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+; TODO(yunqingwang)fix quantize_b code for skip=1 case.
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                                shift, qcoeff, dqcoeff, dequant, \
+                                eob, scan, iscan
+  cmp                    dword skipm, 0
+  jne .blank
+
+  ; actual quantize loop - setup pointers, rounders, etc.
+  movifnidn                   coeffq, coeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, dequantmp
+  movifnidn                    zbinq, zbinmp
+  movifnidn                   roundq, roundmp
+  movifnidn                   quantq, quantmp
+  mova                            m0, [zbinq]              ; m0 = zbin
+  mova                            m1, [roundq]             ; m1 = round
+  mova                            m2, [quantq]             ; m2 = quant
+%ifidn %1, b_32x32
+  pcmpeqw                         m5, m5
+  psrlw                           m5, 15
+  paddw                           m0, m5
+  paddw                           m1, m5
+  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
+  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
+%endif
+  mova                            m3, [r2q]                ; m3 = dequant
+  psubw                           m0, [pw_1]
+  mov                             r2, shiftmp
+  mov                             r3, qcoeffmp
+  mova                            m4, [r2]                 ; m4 = shift
+  mov                             r4, dqcoeffmp
+  mov                             r5, iscanmp
+%ifidn %1, b_32x32
+  psllw                           m4, 1
+%endif
+  pxor                            m5, m5                   ; m5 = dedicated zero
+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+  lea                         coeffq, [  coeffq+ncoeffq*2]
+  lea                         iscanq, [  iscanq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  neg                        ncoeffq
+
+  ; get DC and first 15 AC coeffs
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  punpckhqdq                      m0, m0
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+  paddsw                          m6, m1                   ; m6 += round
+  punpckhqdq                      m1, m1
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
+  punpckhqdq                      m2, m2
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                           m8, m6                   ; m8 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
+  punpckhqdq                      m4, m4
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                          m8, m9                   ; m8 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                            m8, m7
+  pand                           m13, m12
+  mova        [qcoeffq+ncoeffq*2+ 0], m8
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                           m8, m8
+  pabsw                          m13, m13
+%endif
+  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
+  punpckhqdq                      m3, m3
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                           m8, 1
+  psrlw                          m13, 1
+  psignw                          m8, m9
+  psignw                         m13, m10
+%endif
+  mova       [dqcoeffq+ncoeffq*2+ 0], m8
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
+  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                   ; m6 = scan[i] + 1
+  psubw                          m11, m12                  ; m11 = scan[i] + 1
+  pandn                           m8, m6                   ; m8 = max(eob)
+  pandn                          m13, m11                  ; m13 = max(eob)
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jz .accumulate_eob
+
+.ac_only_loop:
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+%ifidn %1, b_32x32
+  pmovmskb                       r6d, m7
+  pmovmskb                       r2d, m12
+  or                              r6, r2
+  jz .skip_iter
+%endif
+  paddsw                          m6, m1                   ; m6 += round
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                          m14, m6                   ; m14 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                         m14, m9                   ; m14 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                           m14, m7
+  pand                           m13, m12
+  mova        [qcoeffq+ncoeffq*2+ 0], m14
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                          m14, m14
+  pabsw                          m13, m13
+%endif
+  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                          m14, 1
+  psrlw                          m13, 1
+  psignw                         m14, m9
+  psignw                         m13, m10
+%endif
+  mova       [dqcoeffq+ncoeffq*2+ 0], m14
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
+  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                   ; m6 = scan[i] + 1
+  psubw                          m11, m12                  ; m11 = scan[i] + 1
+  pandn                          m14, m6                   ; m14 = max(eob)
+  pandn                          m13, m11                  ; m13 = max(eob)
+  pmaxsw                          m8, m14
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+
+%ifidn %1, b_32x32
+  jmp .accumulate_eob
+.skip_iter:
+  mova        [qcoeffq+ncoeffq*2+ 0], m5
+  mova        [qcoeffq+ncoeffq*2+16], m5
+  mova       [dqcoeffq+ncoeffq*2+ 0], m5
+  mova       [dqcoeffq+ncoeffq*2+16], m5
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+%endif
+
+.accumulate_eob:
+  ; horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov                             r2, eobmp
+  pshufd                          m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0x1
+  pmaxsw                          m8, m7
+  pextrw                          r6, m8, 0
+  mov                             [r2], r6
+  RET
+
+  ; skip-block, i.e. just write all zeroes
+.blank:
+  mov                             r0, dqcoeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, qcoeffmp
+  mov                             r3, eobmp
+  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+  neg                        ncoeffq
+  pxor                            m7, m7
+.blank_loop:
+  mova       [dqcoeffq+ncoeffq*2+ 0], m7
+  mova       [dqcoeffq+ncoeffq*2+16], m7
+  mova        [qcoeffq+ncoeffq*2+ 0], m7
+  mova        [qcoeffq+ncoeffq*2+16], m7
+  add                        ncoeffq, mmsize
+  jl .blank_loop
+  mov                    word [eobq], 0
+  RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FN b, 7
+QUANTIZE_FN b_32x32, 7
commit	149822e399087f4dd8e4e6efce9ad8d259d5ccea	[log] [tgz]
author	Yaowu Xu <yaowu@google.com>	Mon Jul 20 22:49:53 2015 +0000
committer	Gerrit Code Review <noreply-gerritcodereview@google.com>	Mon Jul 20 22:49:54 2015 +0000
tree	b7a25252810ce3e24990e54806f6ecb1fb797860
parent	add779e42593dac57a30da250862e2705e5e80dc [diff]
parent	7c0c62df1d25e76c52afb411ad342a82ed5d1ff0 [diff]