Merge "mips msa vp8 sixtap filter optimization"
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index db7e961..c1c5d77 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -902,116 +902,6 @@
   }
 }
 
-DECLARE_ALIGNED(256, const int16_t, kChangeFilters[16][8]) = {
-    { 0,   0,   0,   0,   0,   0,   0, 128},
-    { 0,   0,   0,   0,   0,   0, 128},
-    { 0,   0,   0,   0,   0, 128},
-    { 0,   0,   0,   0, 128},
-    { 0,   0,   0, 128},
-    { 0,   0, 128},
-    { 0, 128},
-    { 128},
-    { 0,   0,   0,   0,   0,   0,   0, 128},
-    { 0,   0,   0,   0,   0,   0, 128},
-    { 0,   0,   0,   0,   0, 128},
-    { 0,   0,   0,   0, 128},
-    { 0,   0,   0, 128},
-    { 0,   0, 128},
-    { 0, 128},
-    { 128}
-};
-
-/* This test exercises the horizontal and vertical filter functions. */
-TEST_P(ConvolveTest, ChangeFilterWorks) {
-  uint8_t* const in = input();
-  uint8_t* const out = output();
-
-  /* Assume that the first input sample is at the 8/16th position. */
-  const int kInitialSubPelOffset = 8;
-
-  /* Filters are 8-tap, so the first filter tap will be applied to the pixel
-   * at position -3 with respect to the current filtering position. Since
-   * kInitialSubPelOffset is set to 8, we first select sub-pixel filter 8,
-   * which is non-zero only in the last tap. So, applying the filter at the
-   * current input position will result in an output equal to the pixel at
-   * offset +4 (-3 + 7) with respect to the current filtering position.
-   */
-  const int kPixelSelected = 4;
-
-  /* Assume that each output pixel requires us to step on by 17/16th pixels in
-   * the input.
-   */
-  const int kInputPixelStep = 17;
-
-  /* The filters are setup in such a way that the expected output produces
-   * sets of 8 identical output samples. As the filter position moves to the
-   * next 1/16th pixel position the only active (=128) filter tap moves one
-   * position to the left, resulting in the same input pixel being replicated
-   * in to the output for 8 consecutive samples. After each set of 8 positions
-   * the filters select a different input pixel. kFilterPeriodAdjust below
-   * computes which input pixel is written to the output for a specified
-   * x or y position.
-   */
-
-  /* Test the horizontal filter. */
-  ASM_REGISTER_STATE_CHECK(
-      UUT_->h8_(in, kInputStride, out, kOutputStride,
-                kChangeFilters[kInitialSubPelOffset],
-                kInputPixelStep, NULL, 0, Width(), Height()));
-
-  for (int x = 0; x < Width(); ++x) {
-    const int kFilterPeriodAdjust = (x >> 3) << 3;
-    const int ref_x =
-        kPixelSelected + ((kInitialSubPelOffset
-            + kFilterPeriodAdjust * kInputPixelStep)
-                          >> SUBPEL_BITS);
-    ASSERT_EQ(lookup(in, ref_x), lookup(out, x))
-        << "x == " << x << "width = " << Width();
-  }
-
-  /* Test the vertical filter. */
-  ASM_REGISTER_STATE_CHECK(
-      UUT_->v8_(in, kInputStride, out, kOutputStride,
-                NULL, 0, kChangeFilters[kInitialSubPelOffset],
-                kInputPixelStep, Width(), Height()));
-
-  for (int y = 0; y < Height(); ++y) {
-    const int kFilterPeriodAdjust = (y >> 3) << 3;
-    const int ref_y =
-        kPixelSelected + ((kInitialSubPelOffset
-            + kFilterPeriodAdjust * kInputPixelStep)
-                          >> SUBPEL_BITS);
-    ASSERT_EQ(lookup(in, ref_y * kInputStride), lookup(out, y * kInputStride))
-        << "y == " << y;
-  }
-
-  /* Test the horizontal and vertical filters in combination. */
-  ASM_REGISTER_STATE_CHECK(
-      UUT_->hv8_(in, kInputStride, out, kOutputStride,
-                 kChangeFilters[kInitialSubPelOffset], kInputPixelStep,
-                 kChangeFilters[kInitialSubPelOffset], kInputPixelStep,
-                 Width(), Height()));
-
-  for (int y = 0; y < Height(); ++y) {
-    const int kFilterPeriodAdjustY = (y >> 3) << 3;
-    const int ref_y =
-        kPixelSelected + ((kInitialSubPelOffset
-            + kFilterPeriodAdjustY * kInputPixelStep)
-                          >> SUBPEL_BITS);
-    for (int x = 0; x < Width(); ++x) {
-      const int kFilterPeriodAdjustX = (x >> 3) << 3;
-      const int ref_x =
-          kPixelSelected + ((kInitialSubPelOffset
-              + kFilterPeriodAdjustX * kInputPixelStep)
-                            >> SUBPEL_BITS);
-
-      ASSERT_EQ(lookup(in, ref_y * kInputStride + ref_x),
-                lookup(out, y * kOutputStride + x))
-          << "x == " << x << ", y == " << y;
-    }
-  }
-}
-
 /* This test verifies that enough rows and columns are filtered for every
    possible initial fractional position and scaling step. */
 TEST_P(ConvolveTest, CheckScalingFiltering) {
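
Note: the removed ChangeFilterWorks test leaned on fixed-point sub-pel arithmetic that is easy to lose track of. A minimal standalone sketch of its reference-pixel computation, with every constant taken from the deleted test (SUBPEL_BITS is 4 in libvpx, i.e. 16 sub-pel positions):

// Sketch of the reference-pixel arithmetic from the deleted
// ChangeFilterWorks test. All constants come from the test itself.
#include <cstdio>

static const int kSubpelBits = 4;           // 1/16th-pel precision
static const int kInitialSubPelOffset = 8;  // first sample at 8/16th position
static const int kPixelSelected = 4;        // single 128 tap at -3 + 7
static const int kInputPixelStep = 17;      // step 17/16th pel per output pixel

static int RefPixel(int x) {
  // The lone 128 tap moves one position left per 1/16th-pel step, so the
  // same input pixel is replicated across 8 consecutive outputs.
  const int filter_period_adjust = (x >> 3) << 3;
  return kPixelSelected +
         ((kInitialSubPelOffset + filter_period_adjust * kInputPixelStep) >>
          kSubpelBits);
}

int main() {
  for (int x = 0; x < 16; ++x)
    printf("x = %2d -> ref_x = %d\n", x, RefPixel(x));
  return 0;
}
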
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 66ca4bb..0b2ecb5 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -19,6 +19,7 @@
 #include "test/util.h"
 
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"
@@ -901,14 +902,6 @@
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans16x16HT,
     ::testing::Values(
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 1, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 2, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 3, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 0, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 1, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 2, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 3, VPX_BITS_12),
         make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
@@ -929,7 +922,8 @@
                    &idct16x16_256_add_12_sse2, 3167, VPX_BITS_12)));
 #endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+// TODO(jingning) Re-enable the mips/msa unit test.
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE && 0
 INSTANTIATE_TEST_CASE_P(
     MSA, Trans16x16DCT,
     ::testing::Values(
diff --git a/test/decode_api_test.cc b/test/decode_api_test.cc
index 42ac13f..32be1f4 100644
--- a/test/decode_api_test.cc
+++ b/test/decode_api_test.cc
@@ -129,8 +129,13 @@
   vpx_codec_ctx_t dec;
   EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, NULL, 0));
   const uint32_t frame_size = static_cast<uint32_t>(video.frame_size());
+#if CONFIG_VP9_HIGHBITDEPTH
   EXPECT_EQ(VPX_CODEC_MEM_ERROR,
             vpx_codec_decode(&dec, video.cxdata(), frame_size, NULL, 0));
+#else
+  EXPECT_EQ(VPX_CODEC_UNSUP_BITSTREAM,
+            vpx_codec_decode(&dec, video.cxdata(), frame_size, NULL, 0));
+#endif
   vpx_codec_iter_t iter = NULL;
   EXPECT_EQ(NULL, vpx_codec_get_frame(&dec, &iter));
 
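
Note: the expected failure code now depends on the build: a high-bit-depth build reports VPX_CODEC_MEM_ERROR for this input, while an 8-bit build rejects it as VPX_CODEC_UNSUP_BITSTREAM. If the same expectation were needed in more than one test, a small helper would keep the #if in one place; ExpectedDecodeError below is a hypothetical name, and the codes and macro come from the hunk above.

// Hypothetical helper centralizing the config-dependent expectation
// shown in the hunk above.
static vpx_codec_err_t ExpectedDecodeError() {
#if CONFIG_VP9_HIGHBITDEPTH
  return VPX_CODEC_MEM_ERROR;
#else
  return VPX_CODEC_UNSUP_BITSTREAM;
#endif
}

// Usage:
//   EXPECT_EQ(ExpectedDecodeError(),
//             vpx_codec_decode(&dec, video.cxdata(), frame_size, NULL, 0));
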
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 4ee4ad4..b5733c1 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -19,6 +19,7 @@
 #include "test/util.h"
 
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
@@ -531,21 +532,14 @@
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 1, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 2, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 3, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 0, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 1, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 2, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 3, VPX_BITS_12),
         make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
         make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
         make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
 #endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+// TODO(jingning) Re-enable the mips/msa unit test.
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE && 0
 INSTANTIATE_TEST_CASE_P(
     MSA, Trans4x4DCT,
     ::testing::Values(
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 61bfe50..6348057 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -19,6 +19,7 @@
 #include "test/util.h"
 
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"
@@ -772,7 +773,8 @@
                    VPX_BITS_8)));
 #endif
 
-#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+// TODO(jingning) Re-enable the mips/msa unit test.
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE && 0
 INSTANTIATE_TEST_CASE_P(
     MSA, FwdTrans8x8DCT,
     ::testing::Values(
diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index 2cbbc6b..1b5ef5c 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -112,7 +112,9 @@
 
 const DecodeParam kVP9InvalidFileTests[] = {
   {1, "invalid-vp90-02-v2.webm"},
+#if CONFIG_VP9_HIGHBITDEPTH
   {1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf"},
+#endif
   {1, "invalid-vp90-03-v3.webm"},
   {1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf"},
   {1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf"},
diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index ba51309..96aaa23 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -19,7 +19,7 @@
 #include "test/util.h"
 
 #include "./vpx_config.h"
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vpx/vpx_integer.h"
@@ -60,49 +60,49 @@
 void wrapper_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh,
                               int count, int bd) {
-  vp9_highbd_lpf_vertical_16_sse2(s, p, blimit, limit, thresh, bd);
+  vpx_highbd_lpf_vertical_16_sse2(s, p, blimit, limit, thresh, bd);
 }
 
 void wrapper_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh,
                            int count, int bd) {
-  vp9_highbd_lpf_vertical_16_c(s, p, blimit, limit, thresh, bd);
+  vpx_highbd_lpf_vertical_16_c(s, p, blimit, limit, thresh, bd);
 }
 
 void wrapper_vertical_16_dual_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int count, int bd) {
-  vp9_highbd_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh, bd);
+  vpx_highbd_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh, bd);
 }
 
 void wrapper_vertical_16_dual_c(uint16_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int count, int bd) {
-  vp9_highbd_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh, bd);
+  vpx_highbd_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh, bd);
 }
 #else
 void wrapper_vertical_16_sse2(uint8_t *s, int p, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh,
                               int count) {
-  vp9_lpf_vertical_16_sse2(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_sse2(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh,
                            int count) {
-  vp9_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_dual_sse2(uint8_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int count) {
-  vp9_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int count) {
-  vp9_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2
@@ -114,25 +114,25 @@
 void wrapper_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh,
                               int count) {
-  vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh,
                            int count) {
-  vp9_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int count) {
-  vp9_lpf_vertical_16_dual_neon(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dual_neon(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int count) {
-  vp9_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON_ASM
@@ -141,13 +141,13 @@
 void wrapper_vertical_16_msa(uint8_t *s, int p, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh,
                              int count) {
-  vp9_lpf_vertical_16_msa(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_msa(s, p, blimit, limit, thresh);
 }
 
 void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh,
                            int count) {
-  vp9_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
 }
 #endif  // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
 
@@ -534,46 +534,46 @@
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
-                   &vp9_highbd_lpf_horizontal_4_c, 8, 1),
-        make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
-                   &vp9_highbd_lpf_vertical_4_c, 8, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
-                   &vp9_highbd_lpf_horizontal_8_c, 8, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 8, 2),
-        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
-                   &vp9_highbd_lpf_vertical_8_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
+                   &vpx_highbd_lpf_horizontal_4_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
+                   &vpx_highbd_lpf_vertical_4_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
+                   &vpx_highbd_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
+                   &vpx_highbd_lpf_vertical_8_c, 8, 1),
         make_tuple(&wrapper_vertical_16_sse2,
                    &wrapper_vertical_16_c, 8, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
-                   &vp9_highbd_lpf_horizontal_4_c, 10, 1),
-        make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
-                   &vp9_highbd_lpf_vertical_4_c, 10, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
-                   &vp9_highbd_lpf_horizontal_8_c, 10, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 10, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 10, 2),
-        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
-                   &vp9_highbd_lpf_vertical_8_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
+                   &vpx_highbd_lpf_horizontal_4_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
+                   &vpx_highbd_lpf_vertical_4_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
+                   &vpx_highbd_lpf_horizontal_8_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 10, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 10, 2),
+        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
+                   &vpx_highbd_lpf_vertical_8_c, 10, 1),
         make_tuple(&wrapper_vertical_16_sse2,
                    &wrapper_vertical_16_c, 10, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
-                   &vp9_highbd_lpf_horizontal_4_c, 12, 1),
-        make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
-                   &vp9_highbd_lpf_vertical_4_c, 12, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
-                   &vp9_highbd_lpf_horizontal_8_c, 12, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 12, 1),
-        make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
-                   &vp9_highbd_lpf_horizontal_16_c, 12, 2),
-        make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
-                   &vp9_highbd_lpf_vertical_8_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
+                   &vpx_highbd_lpf_horizontal_4_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
+                   &vpx_highbd_lpf_vertical_4_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
+                   &vpx_highbd_lpf_horizontal_8_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 12, 1),
+        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
+                   &vpx_highbd_lpf_horizontal_16_c, 12, 2),
+        make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
+                   &vpx_highbd_lpf_vertical_8_c, 12, 1),
         make_tuple(&wrapper_vertical_16_sse2,
                    &wrapper_vertical_16_c, 12, 1),
         make_tuple(&wrapper_vertical_16_dual_sse2,
@@ -586,10 +586,10 @@
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vp9_lpf_horizontal_8_sse2, &vp9_lpf_horizontal_8_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8, 2),
-        make_tuple(&vp9_lpf_vertical_8_sse2, &vp9_lpf_vertical_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_8_sse2, &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_lpf_vertical_8_sse2, &vpx_lpf_vertical_8_c, 8, 1),
         make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8, 1)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif
@@ -598,8 +598,8 @@
 INSTANTIATE_TEST_CASE_P(
     AVX2, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vp9_lpf_horizontal_16_avx2, &vp9_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_16_avx2, &vp9_lpf_horizontal_16_c, 8,
+        make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8,
                    2)));
 #endif
 
@@ -608,42 +608,42 @@
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test9Param,
     ::testing::Values(
-        make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_4_dual_c, 8),
-        make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_8_dual_c, 8),
-        make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
-                   &vp9_highbd_lpf_vertical_4_dual_c, 8),
-        make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
-                   &vp9_highbd_lpf_vertical_8_dual_c, 8),
-        make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_4_dual_c, 10),
-        make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_8_dual_c, 10),
-        make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
-                   &vp9_highbd_lpf_vertical_4_dual_c, 10),
-        make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
-                   &vp9_highbd_lpf_vertical_8_dual_c, 10),
-        make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_4_dual_c, 12),
-        make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
-                   &vp9_highbd_lpf_horizontal_8_dual_c, 12),
-        make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
-                   &vp9_highbd_lpf_vertical_4_dual_c, 12),
-        make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
-                   &vp9_highbd_lpf_vertical_8_dual_c, 12)));
+        make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_vertical_4_dual_sse2,
+                   &vpx_highbd_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2,
+                   &vpx_highbd_lpf_vertical_8_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_4_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_8_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_vertical_4_dual_sse2,
+                   &vpx_highbd_lpf_vertical_4_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2,
+                   &vpx_highbd_lpf_vertical_8_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_4_dual_c, 12),
+        make_tuple(&vpx_highbd_lpf_horizontal_8_dual_sse2,
+                   &vpx_highbd_lpf_horizontal_8_dual_c, 12),
+        make_tuple(&vpx_highbd_lpf_vertical_4_dual_sse2,
+                   &vpx_highbd_lpf_vertical_4_dual_c, 12),
+        make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2,
+                   &vpx_highbd_lpf_vertical_8_dual_c, 12)));
 #else
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test9Param,
     ::testing::Values(
-        make_tuple(&vp9_lpf_horizontal_4_dual_sse2,
-                   &vp9_lpf_horizontal_4_dual_c, 8),
-        make_tuple(&vp9_lpf_horizontal_8_dual_sse2,
-                   &vp9_lpf_horizontal_8_dual_c, 8),
-        make_tuple(&vp9_lpf_vertical_4_dual_sse2,
-                   &vp9_lpf_vertical_4_dual_c, 8),
-        make_tuple(&vp9_lpf_vertical_8_dual_sse2,
-                   &vp9_lpf_vertical_8_dual_c, 8)));
+        make_tuple(&vpx_lpf_horizontal_4_dual_sse2,
+                   &vpx_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_dual_sse2,
+                   &vpx_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_dual_sse2,
+                   &vpx_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_dual_sse2,
+                   &vpx_lpf_vertical_8_dual_c, 8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif
 
@@ -657,36 +657,36 @@
 #if HAVE_NEON_ASM
 // Using #if inside the macro is unsupported on MSVS but the tests are not
 // currently built for MSVS with ARM and NEON.
-        make_tuple(&vp9_lpf_horizontal_16_neon,
-                   &vp9_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_16_neon,
-                   &vp9_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_lpf_horizontal_16_neon,
+                   &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_neon,
+                   &vpx_lpf_horizontal_16_c, 8, 2),
         make_tuple(&wrapper_vertical_16_neon,
                    &wrapper_vertical_16_c, 8, 1),
         make_tuple(&wrapper_vertical_16_dual_neon,
                    &wrapper_vertical_16_dual_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_8_neon,
-                   &vp9_lpf_horizontal_8_c, 8, 1),
-        make_tuple(&vp9_lpf_vertical_8_neon,
-                   &vp9_lpf_vertical_8_c, 8, 1),
 #endif  // HAVE_NEON_ASM
-        make_tuple(&vp9_lpf_horizontal_4_neon,
-                   &vp9_lpf_horizontal_4_c, 8, 1),
-        make_tuple(&vp9_lpf_vertical_4_neon,
-                   &vp9_lpf_vertical_4_c, 8, 1)));
+        make_tuple(&vpx_lpf_horizontal_8_neon,
+                   &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_lpf_vertical_8_neon,
+                   &vpx_lpf_vertical_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_4_neon,
+                   &vpx_lpf_horizontal_4_c, 8, 1),
+        make_tuple(&vpx_lpf_vertical_4_neon,
+                   &vpx_lpf_vertical_4_c, 8, 1)));
 INSTANTIATE_TEST_CASE_P(
     NEON, Loop8Test9Param,
     ::testing::Values(
 #if HAVE_NEON_ASM
-        make_tuple(&vp9_lpf_horizontal_8_dual_neon,
-                   &vp9_lpf_horizontal_8_dual_c, 8),
-        make_tuple(&vp9_lpf_vertical_8_dual_neon,
-                   &vp9_lpf_vertical_8_dual_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_dual_neon,
+                   &vpx_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_dual_neon,
+                   &vpx_lpf_vertical_8_dual_c, 8),
 #endif  // HAVE_NEON_ASM
-        make_tuple(&vp9_lpf_horizontal_4_dual_neon,
-                   &vp9_lpf_horizontal_4_dual_c, 8),
-        make_tuple(&vp9_lpf_vertical_4_dual_neon,
-                   &vp9_lpf_vertical_4_dual_c, 8)));
+        make_tuple(&vpx_lpf_horizontal_4_dual_neon,
+                   &vpx_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_dual_neon,
+                   &vpx_lpf_vertical_4_dual_c, 8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON
 
@@ -694,23 +694,23 @@
 INSTANTIATE_TEST_CASE_P(
     MSA, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vp9_lpf_horizontal_8_msa, &vp9_lpf_horizontal_8_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_16_msa, &vp9_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vp9_lpf_horizontal_16_msa, &vp9_lpf_horizontal_16_c, 8, 2),
-        make_tuple(&vp9_lpf_vertical_8_msa, &vp9_lpf_vertical_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_8_msa, &vpx_lpf_horizontal_8_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 2),
+        make_tuple(&vpx_lpf_vertical_8_msa, &vpx_lpf_vertical_8_c, 8, 1),
         make_tuple(&wrapper_vertical_16_msa, &wrapper_vertical_16_c, 8, 1)));
 
 INSTANTIATE_TEST_CASE_P(
     MSA, Loop8Test9Param,
     ::testing::Values(
-        make_tuple(&vp9_lpf_horizontal_4_dual_msa,
-                   &vp9_lpf_horizontal_4_dual_c, 8),
-        make_tuple(&vp9_lpf_horizontal_8_dual_msa,
-                   &vp9_lpf_horizontal_8_dual_c, 8),
-        make_tuple(&vp9_lpf_vertical_4_dual_msa,
-                   &vp9_lpf_vertical_4_dual_c, 8),
-        make_tuple(&vp9_lpf_vertical_8_dual_msa,
-                   &vp9_lpf_vertical_8_dual_c, 8)));
+        make_tuple(&vpx_lpf_horizontal_4_dual_msa,
+                   &vpx_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_dual_msa,
+                   &vpx_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_dual_msa,
+                   &vpx_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_dual_msa,
+                   &vpx_lpf_vertical_8_dual_c, 8)));
 #endif  // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
 
 }  // namespace
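
Note: the wrapper functions in this file exist because the test fixture calls every kernel through one uniform signature (a trailing count, plus bd in the high-bit-depth build), while the vpx_lpf_vertical_16 kernels take neither. A sketch of the adapter shape, with a hypothetical name and the forwarded call taken from the hunks above:

// Sketch of the adapter pattern used throughout this file; the name is
// hypothetical, the forwarded call is the one exercised above.
void wrapper_vertical_16_example(uint8_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int count) {
  (void)count;  // vpx_lpf_vertical_16_c has no count parameter
  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
}
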
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 3e93ae8..8d19aff 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -19,6 +19,7 @@
 #include "test/util.h"
 
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_integer.h"
diff --git a/test/test.mk b/test/test.mk
index a8a365e..8ecc856 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -91,6 +91,7 @@
 ## shared library builds don't make these functions accessible.
 ##
 ifeq ($(CONFIG_SHARED),)
+LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += lpf_8_test.cc
 
 ## VP8
 ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
@@ -142,7 +143,6 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += lpf_8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 64095bc..74775bb 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -1994,5 +1994,47 @@
                       make_tuple(5, 6, subpel_variance32x64_msa, 0),
                       make_tuple(6, 5, subpel_variance64x32_msa, 0),
                       make_tuple(6, 6, subpel_variance64x64_msa, 0)));
+
+const SubpixAvgVarMxNFunc subpel_avg_variance64x64_msa =
+    vpx_sub_pixel_avg_variance64x64_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance64x32_msa =
+    vpx_sub_pixel_avg_variance64x32_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x64_msa =
+    vpx_sub_pixel_avg_variance32x64_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x32_msa =
+    vpx_sub_pixel_avg_variance32x32_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x16_msa =
+    vpx_sub_pixel_avg_variance32x16_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x32_msa =
+    vpx_sub_pixel_avg_variance16x32_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x16_msa =
+    vpx_sub_pixel_avg_variance16x16_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x8_msa =
+    vpx_sub_pixel_avg_variance16x8_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x16_msa =
+    vpx_sub_pixel_avg_variance8x16_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x8_msa =
+    vpx_sub_pixel_avg_variance8x8_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x4_msa =
+    vpx_sub_pixel_avg_variance8x4_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x8_msa =
+    vpx_sub_pixel_avg_variance4x8_msa;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x4_msa =
+    vpx_sub_pixel_avg_variance4x4_msa;
+INSTANTIATE_TEST_CASE_P(
+    MSA, VpxSubpelAvgVarianceTest,
+    ::testing::Values(make_tuple(6, 6, subpel_avg_variance64x64_msa, 0),
+                      make_tuple(6, 5, subpel_avg_variance64x32_msa, 0),
+                      make_tuple(5, 6, subpel_avg_variance32x64_msa, 0),
+                      make_tuple(5, 5, subpel_avg_variance32x32_msa, 0),
+                      make_tuple(5, 4, subpel_avg_variance32x16_msa, 0),
+                      make_tuple(4, 5, subpel_avg_variance16x32_msa, 0),
+                      make_tuple(4, 4, subpel_avg_variance16x16_msa, 0),
+                      make_tuple(4, 3, subpel_avg_variance16x8_msa, 0),
+                      make_tuple(3, 4, subpel_avg_variance8x16_msa, 0),
+                      make_tuple(3, 3, subpel_avg_variance8x8_msa, 0),
+                      make_tuple(3, 2, subpel_avg_variance8x4_msa, 0),
+                      make_tuple(2, 3, subpel_avg_variance4x8_msa, 0),
+                      make_tuple(2, 2, subpel_avg_variance4x4_msa, 0)));
 #endif  // HAVE_MSA
 }  // namespace
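
Note: in these tuples the first two arguments are log2(width) and log2(height), so make_tuple(6, 5, ...) pairs with the 64x32 kernel. A one-line compile-time check of that convention, stated as an assumption:

// Compile-time check of the (log2w, log2h) convention in the tuples above.
static_assert((1 << 6) == 64 && (1 << 5) == 32,
              "make_tuple(6, 5, ...) corresponds to the 64x32 kernel");
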
diff --git a/test/vp9_arf_freq_test.cc b/test/vp9_arf_freq_test.cc
index 92c236f..07968bc 100644
--- a/test/vp9_arf_freq_test.cc
+++ b/test/vp9_arf_freq_test.cc
@@ -21,8 +21,8 @@
 const unsigned int kFrames = 100;
 const int kBitrate = 500;
 
-#define ARF_NOT_SEEN   1000001
-#define ARF_SEEN_ONCE  1000000
+#define ARF_NOT_SEEN               1000001
+#define ARF_SEEN_ONCE              1000000
 
 typedef struct {
   const char *filename;
@@ -108,7 +108,7 @@
   }
 
   virtual void BeginPassHook(unsigned int) {
-    min_arf_ = ARF_NOT_SEEN;
+    min_run_ = ARF_NOT_SEEN;
     run_of_visible_frames_ = 0;
   }
 
@@ -137,15 +137,15 @@
     if (frames == 1) {
       run_of_visible_frames_++;
     } else if (frames == 2) {
-      if (min_arf_ == ARF_NOT_SEEN) {
-        min_arf_ = ARF_SEEN_ONCE;
-      } else if (min_arf_ == ARF_SEEN_ONCE ||
-                 run_of_visible_frames_ < min_arf_) {
-        min_arf_ = run_of_visible_frames_;
+      if (min_run_ == ARF_NOT_SEEN) {
+        min_run_ = ARF_SEEN_ONCE;
+      } else if (min_run_ == ARF_SEEN_ONCE ||
+                 run_of_visible_frames_ < min_run_) {
+        min_run_ = run_of_visible_frames_;
       }
       run_of_visible_frames_ = 1;
     } else {
-      min_arf_ = 0;
+      min_run_ = 0;
       run_of_visible_frames_ = 1;
     }
   }
@@ -166,8 +166,8 @@
     }
   }
 
-  int GetMinArfDistance() const {
-    return min_arf_;
+  int GetMinVisibleRun() const {
+    return min_run_;
   }
 
   int GetMinArfDistanceRequested() const {
@@ -185,7 +185,7 @@
 
  private:
   int min_arf_requested_;
-  int min_arf_;
+  int min_run_;
   int run_of_visible_frames_;
 };
 
@@ -214,9 +214,10 @@
   }
 
   ASSERT_NO_FATAL_FAILURE(RunLoop(video));
-  const int min_arf_dist = GetMinArfDistance();
+  const int min_run = GetMinVisibleRun();
   const int min_arf_dist_requested = GetMinArfDistanceRequested();
-  if (min_arf_dist != ARF_NOT_SEEN && min_arf_dist != ARF_SEEN_ONCE) {
+  if (min_run != ARF_NOT_SEEN && min_run != ARF_SEEN_ONCE) {
+    const int min_arf_dist = min_run + 1;
     EXPECT_GE(min_arf_dist, min_arf_dist_requested);
   }
   delete(video);
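
Note: the rename from min_arf_ to min_run_ makes the measured quantity explicit: the member tracks the shortest run of visible frames between two alt-ref events, so the alt-ref frame distance is that run plus one, hence the "+ 1" added above. A quick worked check of the off-by-one:

// Worked check of the adjustment above: with three visible frames between
// two alt-refs, the frame distance between the alt-refs is four.
#include <cassert>

int main() {
  const int min_run = 3;                 // shortest visible-frame run seen
  const int min_arf_dist = min_run + 1;  // matches the test's adjustment
  assert(min_arf_dist == 4);
  return 0;
}
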
diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc
index 856b4c1..09c2069 100644
--- a/test/vp9_avg_test.cc
+++ b/test/vp9_avg_test.cc
@@ -291,6 +291,12 @@
         make_tuple(16, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
         make_tuple(32, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
         make_tuple(64, &vp9_int_pro_row_neon, &vp9_int_pro_row_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, IntProColTest, ::testing::Values(
+        make_tuple(16, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
+        make_tuple(32, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
+        make_tuple(64, &vp9_int_pro_col_neon, &vp9_int_pro_col_c)));
 #endif
 
 #if HAVE_MSA
diff --git a/test/vp9_boolcoder_test.cc b/test/vp9_boolcoder_test.cc
index c7f0cd8..b917429 100644
--- a/test/vp9_boolcoder_test.cc
+++ b/test/vp9_boolcoder_test.cc
@@ -14,11 +14,11 @@
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
-#include "vp9/decoder/vp9_reader.h"
-#include "vp9/encoder/vp9_writer.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/bitreader.h"
+#include "vpx_dsp/bitwriter.h"
 
 #include "test/acm_random.h"
-#include "vpx/vpx_integer.h"
 
 using libvpx_test::ACMRandom;
 
@@ -50,9 +50,9 @@
         const int random_seed = 6432;
         const int kBufferSize = 10000;
         ACMRandom bit_rnd(random_seed);
-        vp9_writer bw;
+        vpx_writer bw;
         uint8_t bw_buffer[kBufferSize];
-        vp9_start_encode(&bw, bw_buffer);
+        vpx_start_encode(&bw, bw_buffer);
 
         int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
         for (int i = 0; i < kBitsToTest; ++i) {
@@ -61,16 +61,16 @@
           } else if (bit_method == 3) {
             bit = bit_rnd(2);
           }
-          vp9_write(&bw, bit, static_cast<int>(probas[i]));
+          vpx_write(&bw, bit, static_cast<int>(probas[i]));
         }
 
-        vp9_stop_encode(&bw);
+        vpx_stop_encode(&bw);
 
         // First bit should be zero
         GTEST_ASSERT_EQ(bw_buffer[0] & 0x80, 0);
 
-        vp9_reader br;
-        vp9_reader_init(&br, bw_buffer, kBufferSize, NULL, NULL);
+        vpx_reader br;
+        vpx_reader_init(&br, bw_buffer, kBufferSize, NULL, NULL);
         bit_rnd.Reset(random_seed);
         for (int i = 0; i < kBitsToTest; ++i) {
           if (bit_method == 2) {
@@ -78,7 +78,7 @@
           } else if (bit_method == 3) {
             bit = bit_rnd(2);
           }
-          GTEST_ASSERT_EQ(vp9_read(&br, probas[i]), bit)
+          GTEST_ASSERT_EQ(vpx_read(&br, probas[i]), bit)
               << "pos: " << i << " / " << kBitsToTest
               << " bit_method: " << bit_method
               << " method: " << method;
diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index 943c00b..0e09652 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -19,7 +19,7 @@
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "./vpx_config.h"
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"
diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h
index 6ebea9f..cd07a56 100644
--- a/vp9/common/mips/dspr2/vp9_common_dspr2.h
+++ b/vp9/common/mips/dspr2/vp9_common_dspr2.h
@@ -15,14 +15,13 @@
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_common.h"
+#include "vpx_dsp/mips/common_dspr2.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 #if HAVE_DSPR2
-#define CROP_WIDTH 512
 extern uint8_t *vp9_ff_cropTbl;
 
 #define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input)                    ({   \
@@ -51,40 +50,6 @@
    );                                                                          \
   out;                                                                    })
 
-static INLINE void vp9_prefetch_load(const unsigned char *src) {
-  __asm__ __volatile__ (
-      "pref   0,  0(%[src])   \n\t"
-      :
-      : [src] "r" (src)
-  );
-}
-
-/* prefetch data for store */
-static INLINE void vp9_prefetch_store(unsigned char *dst) {
-  __asm__ __volatile__ (
-      "pref   1,  0(%[dst])   \n\t"
-      :
-      : [dst] "r" (dst)
-  );
-}
-
-static INLINE void vp9_prefetch_load_streamed(const unsigned char *src) {
-  __asm__ __volatile__ (
-      "pref   4,  0(%[src])   \n\t"
-      :
-      : [src] "r" (src)
-  );
-}
-
-/* prefetch data for store */
-static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) {
-  __asm__ __volatile__ (
-      "pref   5,  0(%[dst])   \n\t"
-      :
-      : [dst] "r" (dst)
-  );
-}
-
 void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
                                    int dest_stride);
 
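
Note: the four prefetch helpers deleted above presumably move, minus the vp9_ prefix, into the shared vpx_dsp/mips/common_dspr2.h that this header now includes; the vp9_prefetch_* to prefetch_* renames in the following files fall out of that. For reference, a sketch of the relocated form, assuming the move kept the bodies intact; the MIPS pref hint selects the access pattern (0 load, 1 store, 4/5 streamed):

/* Sketch of the relocated helpers, assuming the move to
 * vpx_dsp/mips/common_dspr2.h kept the bodies shown above. */
static INLINE void prefetch_load(const unsigned char *src) {
  __asm__ __volatile__("pref   0,  0(%[src])   \n\t" : : [src] "r"(src));
}

/* prefetch data for store */
static INLINE void prefetch_store(unsigned char *dst) {
  __asm__ __volatile__("pref   1,  0(%[dst])   \n\t" : : [dst] "r"(dst));
}
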
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
index 91d62bc..aad7c45 100644
--- a/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
@@ -44,7 +44,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < w; x += 4) {
       src_ptr = src + x;
@@ -148,8 +148,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
-    vp9_prefetch_store(dst + dst_stride + 32);
+    prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride + 32);
 
     for (x = 0; x < 64; x += 4) {
       src_ptr = src + x;
@@ -245,7 +245,7 @@
       : [pos] "r" (pos)
     );
 
-    vp9_prefetch_store(dst);
+    prefetch_store(dst);
 
     switch (w) {
       case 4:
@@ -257,7 +257,7 @@
                                      filter_y, w, h);
         break;
       case 64:
-        vp9_prefetch_store(dst + 32);
+        prefetch_store(dst + 32);
         convolve_bi_avg_vert_64_dspr2(src, src_stride,
                                       dst, dst_stride,
                                       filter_y, h);
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
index 148b20f..bc60e93 100644
--- a/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
@@ -40,9 +40,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -135,9 +135,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -290,9 +290,9 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_store(dst_ptr + dst_stride);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
 
     for (c = 0; c < count; c++) {
       __asm__ __volatile__ (
@@ -539,11 +539,11 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
-    vp9_prefetch_store(dst_ptr + dst_stride);
-    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
 
     for (c = 0; c < 4; c++) {
       __asm__ __volatile__ (
@@ -781,9 +781,9 @@
     );
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src);
-    vp9_prefetch_load(src + 32);
-    vp9_prefetch_store(dst);
+    prefetch_load(src);
+    prefetch_load(src + 32);
+    prefetch_store(dst);
 
     switch (w) {
       case 4:
@@ -807,8 +807,8 @@
                                       filter_x, h, 2);
         break;
       case 64:
-        vp9_prefetch_load(src + 64);
-        vp9_prefetch_store(dst + 32);
+        prefetch_load(src + 64);
+        prefetch_store(dst + 32);
 
         convolve_bi_avg_horiz_64_dspr2(src, src_stride,
                                       dst, dst_stride,
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
index 92644f2..b714f9a 100644
--- a/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
@@ -41,8 +41,8 @@
   for (y = h; y--;) {
     dst_ptr = dst;
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -132,8 +132,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
 
     dst_ptr = dst;
     odd_dst = (dst_ptr + dst_stride);
@@ -272,8 +272,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
 
     src = src_ptr;
     dst = dst_ptr;
@@ -504,9 +504,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
 
     src = src_ptr;
     dst = dst_ptr;
@@ -747,8 +747,8 @@
   );
 
   /* prefetch data to cache memory */
-  vp9_prefetch_load(src);
-  vp9_prefetch_load(src + 32);
+  prefetch_load(src);
+  prefetch_load(src + 32);
 
   switch (w) {
     case 4:
@@ -769,7 +769,7 @@
                                             (w/16));
       break;
     case 64:
-      vp9_prefetch_load(src + 32);
+      prefetch_load(src + 32);
       convolve_bi_horiz_64_transposed_dspr2(src, src_stride,
                                             dst, dst_stride,
                                             filter, h);
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
index 1debdb4..27ea100 100644
--- a/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
@@ -39,9 +39,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],      0(%[src])                      \n\t"
@@ -122,9 +122,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],      0(%[src])                      \n\t"
@@ -252,9 +252,9 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_store(dst_ptr + dst_stride);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
 
     for (c = 0; c < count; c++) {
       __asm__ __volatile__ (
@@ -459,11 +459,11 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
-    vp9_prefetch_store(dst_ptr + dst_stride);
-    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
 
     for (c = 0; c < 4; c++) {
       __asm__ __volatile__ (
@@ -651,7 +651,7 @@
   if (16 == x_step_q4) {
     uint32_t pos = 38;
 
-    vp9_prefetch_load((const uint8_t *)filter_x);
+    prefetch_load((const uint8_t *)filter_x);
 
    /* bit position for extract from acc */
     __asm__ __volatile__ (
@@ -661,9 +661,9 @@
     );
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src);
-    vp9_prefetch_load(src + 32);
-    vp9_prefetch_store(dst);
+    prefetch_load(src);
+    prefetch_load(src + 32);
+    prefetch_store(dst);
 
     switch (w) {
       case 4:
@@ -687,8 +687,8 @@
                                    filter_x, (int32_t)h, 2);
         break;
       case 64:
-        vp9_prefetch_load(src + 64);
-        vp9_prefetch_store(dst + 32);
+        prefetch_load(src + 64);
+        prefetch_store(dst + 32);
 
         convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride,
                                    dst, (int32_t)dst_stride,
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
index bf01f11..32f5fb6 100644
--- a/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
@@ -44,7 +44,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < w; x += 4) {
       src_ptr = src + x;
@@ -141,7 +141,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < 64; x += 4) {
       src_ptr = src + x;
@@ -230,7 +230,7 @@
       : [pos] "r" (pos)
     );
 
-    vp9_prefetch_store(dst);
+    prefetch_store(dst);
 
     switch (w) {
       case 4 :
@@ -242,7 +242,7 @@
                                  filter_y, w, h);
         break;
       case 64 :
-        vp9_prefetch_store(dst + 32);
+        prefetch_store(dst + 32);
         convolve_bi_vert_64_dspr2(src, src_stride,
                                   dst, dst_stride,
                                   filter_y, h);
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
index 1742279..d9cbfe6 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
@@ -49,7 +49,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < w; x += 4) {
       src_ptr = src + x;
@@ -210,8 +210,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
-    vp9_prefetch_store(dst + dst_stride + 32);
+    prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride + 32);
 
     for (x = 0; x < 64; x += 4) {
       src_ptr = src + x;
@@ -372,7 +372,7 @@
         : [pos] "r" (pos)
       );
 
-      vp9_prefetch_store(dst);
+      prefetch_store(dst);
 
       switch (w) {
         case 4:
@@ -384,7 +384,7 @@
                                     filter_y, w, h);
           break;
         case 64:
-          vp9_prefetch_store(dst + 32);
+          prefetch_store(dst + 32);
           convolve_avg_vert_64_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_y, h);
@@ -452,17 +452,17 @@
   uint32_t tp3, tp4, tn2;
 
   /* prefetch data to cache memory */
-  vp9_prefetch_load(src);
-  vp9_prefetch_load(src + 32);
-  vp9_prefetch_store(dst);
+  prefetch_load(src);
+  prefetch_load(src + 32);
+  prefetch_store(dst);
 
   switch (w) {
     case 4:
       /* 1 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -482,9 +482,9 @@
     case 8:
       /* 2 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -509,9 +509,9 @@
     case 16:
       /* 4 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -544,9 +544,9 @@
     case 32:
       /* 8 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -593,16 +593,16 @@
       }
       break;
     case 64:
-      vp9_prefetch_load(src + 64);
-      vp9_prefetch_store(dst + 32);
+      prefetch_load(src + 64);
+      prefetch_store(dst + 32);
 
       /* 16 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_load(src + src_stride + 64);
-        vp9_prefetch_store(dst + dst_stride);
-        vp9_prefetch_store(dst + dst_stride + 32);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_load(src + src_stride + 64);
+        prefetch_store(dst + dst_stride);
+        prefetch_store(dst + dst_stride + 32);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
index 69da1cf..cdb8312 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
@@ -43,9 +43,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -165,9 +165,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -357,9 +357,9 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_store(dst_ptr + dst_stride);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
 
     for (c = 0; c < count; c++) {
       __asm__ __volatile__ (
@@ -668,11 +668,11 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
-    vp9_prefetch_store(dst_ptr + dst_stride);
-    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
 
     for (c = 0; c < 4; c++) {
       __asm__ __volatile__ (
@@ -985,9 +985,9 @@
       );
 
       /* prefetch data to cache memory */
-      vp9_prefetch_load(src);
-      vp9_prefetch_load(src + 32);
-      vp9_prefetch_store(dst);
+      prefetch_load(src);
+      prefetch_load(src + 32);
+      prefetch_store(dst);
 
       switch (w) {
         case 4:
@@ -1011,8 +1011,8 @@
                                       filter_x, h, 2);
           break;
         case 64:
-          vp9_prefetch_load(src + 64);
-          vp9_prefetch_store(dst + 32);
+          prefetch_load(src + 64);
+          prefetch_store(dst + 32);
 
           convolve_avg_horiz_64_dspr2(src, src_stride,
                                       dst, dst_stride,
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
index 58b50d2..a1309d1 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
@@ -60,8 +60,8 @@
   for (y = h; y--;) {
     dst_ptr = dst;
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],         0(%[src])                      \n\t"
@@ -176,8 +176,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
 
     dst_ptr = dst;
     odd_dst = (dst_ptr + dst_stride);
@@ -355,8 +355,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
 
     src = src_ptr;
     dst = dst_ptr;
@@ -645,9 +645,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
 
     src = src_ptr;
     dst = dst_ptr;
@@ -993,8 +993,8 @@
     src -= (src_stride * 3 + 3);
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src);
-    vp9_prefetch_load(src + 32);
+    prefetch_load(src);
+    prefetch_load(src + 32);
 
     switch (w) {
       case 4:
@@ -1015,7 +1015,7 @@
                                            (w/16));
         break;
       case 64:
-        vp9_prefetch_load(src + 32);
+        prefetch_load(src + 32);
         convolve_horiz_64_transposed_dspr2(src, src_stride,
                                            temp, intermediate_height,
                                            filter_x, intermediate_height);
@@ -1078,9 +1078,9 @@
   int x, y;
 
   /* prefetch data to cache memory */
-  vp9_prefetch_load(src);
-  vp9_prefetch_load(src + 32);
-  vp9_prefetch_store(dst);
+  prefetch_load(src);
+  prefetch_load(src + 32);
+  prefetch_store(dst);
 
   switch (w) {
     case 4:
@@ -1089,9 +1089,9 @@
 
       /* 1 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         (%[src])      \n\t"
@@ -1112,9 +1112,9 @@
 
       /* 2 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -1137,9 +1137,9 @@
 
       /* 4 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -1169,9 +1169,9 @@
 
       /* 8 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_store(dst + dst_stride);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_store(dst + dst_stride);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
@@ -1209,16 +1209,16 @@
       uint32_t tp1, tp2, tp3, tp4;
       uint32_t tp5, tp6, tp7, tp8;
 
-      vp9_prefetch_load(src + 64);
-      vp9_prefetch_store(dst + 32);
+      prefetch_load(src + 64);
+      prefetch_store(dst + 32);
 
       /* 16 word storage */
       for (y = h; y--; ) {
-        vp9_prefetch_load(src + src_stride);
-        vp9_prefetch_load(src + src_stride + 32);
-        vp9_prefetch_load(src + src_stride + 64);
-        vp9_prefetch_store(dst + dst_stride);
-        vp9_prefetch_store(dst + dst_stride + 32);
+        prefetch_load(src + src_stride);
+        prefetch_load(src + src_stride + 32);
+        prefetch_load(src + src_stride + 64);
+        prefetch_store(dst + dst_stride);
+        prefetch_store(dst + dst_stride + 32);
 
         __asm__ __volatile__ (
             "ulw              %[tp1],         0(%[src])      \n\t"
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
index 0303896..d0e3095 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
@@ -43,9 +43,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],      0(%[src])                      \n\t"
@@ -154,9 +154,9 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src + src_stride);
-    vp9_prefetch_load(src + src_stride + 32);
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_load(src + src_stride);
+    prefetch_load(src + src_stride + 32);
+    prefetch_store(dst + dst_stride);
 
     __asm__ __volatile__ (
         "ulw              %[tp1],      0(%[src])                      \n\t"
@@ -323,9 +323,9 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_store(dst_ptr + dst_stride);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_store(dst_ptr + dst_stride);
 
     for (c = 0; c < count; c++) {
       __asm__ __volatile__ (
@@ -593,11 +593,11 @@
     dst = dst_ptr;
 
     /* prefetch data to cache memory */
-    vp9_prefetch_load(src_ptr + src_stride);
-    vp9_prefetch_load(src_ptr + src_stride + 32);
-    vp9_prefetch_load(src_ptr + src_stride + 64);
-    vp9_prefetch_store(dst_ptr + dst_stride);
-    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+    prefetch_load(src_ptr + src_stride);
+    prefetch_load(src_ptr + src_stride + 32);
+    prefetch_load(src_ptr + src_stride + 64);
+    prefetch_store(dst_ptr + dst_stride);
+    prefetch_store(dst_ptr + dst_stride + 32);
 
     for (c = 0; c < 4; c++) {
       __asm__ __volatile__ (
@@ -859,7 +859,7 @@
     if (16 == x_step_q4) {
       uint32_t pos = 38;
 
-      vp9_prefetch_load((const uint8_t *)filter_x);
+      prefetch_load((const uint8_t *)filter_x);
       src -= 3;
 
       /* bit position for extract from acc */
@@ -870,9 +870,9 @@
       );
 
       /* prefetch data to cache memory */
-      vp9_prefetch_load(src);
-      vp9_prefetch_load(src + 32);
-      vp9_prefetch_store(dst);
+      prefetch_load(src);
+      prefetch_load(src + 32);
+      prefetch_store(dst);
 
       switch (w) {
         case 4:
@@ -896,8 +896,8 @@
                                   filter_x, (int32_t)h, 2);
           break;
         case 64:
-          vp9_prefetch_load(src + 64);
-          vp9_prefetch_store(dst + 32);
+          prefetch_load(src + 64);
+          prefetch_store(dst + 32);
 
           convolve_horiz_64_dspr2(src, (int32_t)src_stride,
                                   dst, (int32_t)dst_stride,
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
index 0930bb3..98acb81 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
@@ -49,7 +49,7 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride);
 
     for (x = 0; x < w; x += 4) {
       src_ptr = src + x;
@@ -203,8 +203,8 @@
 
   for (y = h; y--;) {
     /* prefetch data to cache memory */
-    vp9_prefetch_store(dst + dst_stride);
-    vp9_prefetch_store(dst + dst_stride + 32);
+    prefetch_store(dst + dst_stride);
+    prefetch_store(dst + dst_stride + 32);
 
     for (x = 0; x < 64; x += 4) {
       src_ptr = src + x;
@@ -358,7 +358,7 @@
         : [pos] "r" (pos)
       );
 
-      vp9_prefetch_store(dst);
+      prefetch_store(dst);
 
       switch (w) {
         case 4 :
@@ -370,7 +370,7 @@
                                 filter_y, w, h);
           break;
         case 64 :
-          vp9_prefetch_store(dst + 32);
+          prefetch_store(dst + 32);
           convolve_vert_64_dspr2(src, src_stride,
                                  dst, dst_stride,
                                  filter_y, h);
diff --git a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
index 202d913..10a24f3 100644
--- a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
@@ -34,7 +34,7 @@
 
   for (i = no_rows; i--; ) {
     /* prefetch row */
-    vp9_prefetch_load((const uint8_t *)(input + 16));
+    prefetch_load((const uint8_t *)(input + 16));
 
     __asm__ __volatile__ (
         "lh       %[load1],              0(%[input])                    \n\t"
@@ -421,14 +421,14 @@
   uint8_t *cm = vp9_ff_cropTbl;
 
   /* prefetch vp9_ff_cropTbl */
-  vp9_prefetch_load(vp9_ff_cropTbl);
-  vp9_prefetch_load(vp9_ff_cropTbl +  32);
-  vp9_prefetch_load(vp9_ff_cropTbl +  64);
-  vp9_prefetch_load(vp9_ff_cropTbl +  96);
-  vp9_prefetch_load(vp9_ff_cropTbl + 128);
-  vp9_prefetch_load(vp9_ff_cropTbl + 160);
-  vp9_prefetch_load(vp9_ff_cropTbl + 192);
-  vp9_prefetch_load(vp9_ff_cropTbl + 224);
+  prefetch_load(vp9_ff_cropTbl);
+  prefetch_load(vp9_ff_cropTbl +  32);
+  prefetch_load(vp9_ff_cropTbl +  64);
+  prefetch_load(vp9_ff_cropTbl +  96);
+  prefetch_load(vp9_ff_cropTbl + 128);
+  prefetch_load(vp9_ff_cropTbl + 160);
+  prefetch_load(vp9_ff_cropTbl + 192);
+  prefetch_load(vp9_ff_cropTbl + 224);
 
   for (i = 0; i < 16; ++i) {
     dest_pix = (dest + i);
@@ -1124,7 +1124,7 @@
 
       for (i = 0; i < 16; ++i) {
         /* prefetch row */
-        vp9_prefetch_load((const uint8_t *)(input + 16));
+        prefetch_load((const uint8_t *)(input + 16));
 
         iadst16(input, outptr);
         input += 16;
@@ -1144,7 +1144,7 @@
 
       for (i = 0; i < 16; ++i) {
         /* prefetch row */
-        vp9_prefetch_load((const uint8_t *)(input + 16));
+        prefetch_load((const uint8_t *)(input + 16));
 
         iadst16(input, outptr);
         input += 16;
diff --git a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
index 7ceebb6..a256145 100644
--- a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
@@ -44,14 +44,14 @@
   uint8_t *cm = vp9_ff_cropTbl;
 
   /* prefetch vp9_ff_cropTbl */
-  vp9_prefetch_load(vp9_ff_cropTbl);
-  vp9_prefetch_load(vp9_ff_cropTbl +  32);
-  vp9_prefetch_load(vp9_ff_cropTbl +  64);
-  vp9_prefetch_load(vp9_ff_cropTbl +  96);
-  vp9_prefetch_load(vp9_ff_cropTbl + 128);
-  vp9_prefetch_load(vp9_ff_cropTbl + 160);
-  vp9_prefetch_load(vp9_ff_cropTbl + 192);
-  vp9_prefetch_load(vp9_ff_cropTbl + 224);
+  prefetch_load(vp9_ff_cropTbl);
+  prefetch_load(vp9_ff_cropTbl +  32);
+  prefetch_load(vp9_ff_cropTbl +  64);
+  prefetch_load(vp9_ff_cropTbl +  96);
+  prefetch_load(vp9_ff_cropTbl + 128);
+  prefetch_load(vp9_ff_cropTbl + 160);
+  prefetch_load(vp9_ff_cropTbl + 192);
+  prefetch_load(vp9_ff_cropTbl + 224);
 
   for (i = 0; i < 32; ++i) {
     dest_pix = dest + i;
diff --git a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
index 74a90b0..dd18831 100644
--- a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
@@ -96,8 +96,8 @@
     }
 
     /* prefetch row */
-    vp9_prefetch_load((const uint8_t *)(input + 32));
-    vp9_prefetch_load((const uint8_t *)(input + 48));
+    prefetch_load((const uint8_t *)(input + 32));
+    prefetch_load((const uint8_t *)(input + 48));
 
     __asm__ __volatile__ (
         "lh       %[load1],             2(%[input])                     \n\t"
diff --git a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
index 280190a..4e31f9f 100644
--- a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
@@ -115,14 +115,14 @@
   uint8_t   *cm = vp9_ff_cropTbl;
 
   /* prefetch vp9_ff_cropTbl */
-  vp9_prefetch_load(vp9_ff_cropTbl);
-  vp9_prefetch_load(vp9_ff_cropTbl +  32);
-  vp9_prefetch_load(vp9_ff_cropTbl +  64);
-  vp9_prefetch_load(vp9_ff_cropTbl +  96);
-  vp9_prefetch_load(vp9_ff_cropTbl + 128);
-  vp9_prefetch_load(vp9_ff_cropTbl + 160);
-  vp9_prefetch_load(vp9_ff_cropTbl + 192);
-  vp9_prefetch_load(vp9_ff_cropTbl + 224);
+  prefetch_load(vp9_ff_cropTbl);
+  prefetch_load(vp9_ff_cropTbl +  32);
+  prefetch_load(vp9_ff_cropTbl +  64);
+  prefetch_load(vp9_ff_cropTbl +  96);
+  prefetch_load(vp9_ff_cropTbl + 128);
+  prefetch_load(vp9_ff_cropTbl + 160);
+  prefetch_load(vp9_ff_cropTbl + 192);
+  prefetch_load(vp9_ff_cropTbl + 224);
 
   for (i = 0; i < 4; ++i) {
       dest_pix = (dest + i);
diff --git a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
index 04d2266..6898d56 100644
--- a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
@@ -211,14 +211,14 @@
   uint8_t *cm = vp9_ff_cropTbl;
 
   /* prefetch vp9_ff_cropTbl */
-  vp9_prefetch_load(vp9_ff_cropTbl);
-  vp9_prefetch_load(vp9_ff_cropTbl +  32);
-  vp9_prefetch_load(vp9_ff_cropTbl +  64);
-  vp9_prefetch_load(vp9_ff_cropTbl +  96);
-  vp9_prefetch_load(vp9_ff_cropTbl + 128);
-  vp9_prefetch_load(vp9_ff_cropTbl + 160);
-  vp9_prefetch_load(vp9_ff_cropTbl + 192);
-  vp9_prefetch_load(vp9_ff_cropTbl + 224);
+  prefetch_load(vp9_ff_cropTbl);
+  prefetch_load(vp9_ff_cropTbl +  32);
+  prefetch_load(vp9_ff_cropTbl +  64);
+  prefetch_load(vp9_ff_cropTbl +  96);
+  prefetch_load(vp9_ff_cropTbl + 128);
+  prefetch_load(vp9_ff_cropTbl + 160);
+  prefetch_load(vp9_ff_cropTbl + 192);
+  prefetch_load(vp9_ff_cropTbl + 224);
 
   for (i = 0; i < 8; ++i) {
       dest_pix = (dest + i);
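
The eight prefetch_load calls above touch vp9_ff_cropTbl at 32-byte steps, pulling the 256-byte core of the clamp table into cache before the reconstruction loop indexes it. A hedged sketch of how such a table works; the CROP_WIDTH margin and the initialization shown are assumptions for illustration, not taken from this change:

/* Hedged sketch: cm[] maps a possibly out-of-range sum directly to a clipped
 * pixel, replacing two compares per sample with one table lookup. The margin
 * size (CROP_WIDTH) is an assumed value. */
#define CROP_WIDTH 512
static unsigned char crop_table[CROP_WIDTH + 256 + CROP_WIDTH];
static unsigned char *cm = crop_table + CROP_WIDTH;

static void init_crop_table(void) {
  int i;
  for (i = -CROP_WIDTH; i < 256 + CROP_WIDTH; ++i)
    cm[i] = (i < 0) ? 0 : (i > 255) ? 255 : (unsigned char)i;
}
/* usage in the reconstruction loops: dest_pix[0] = cm[dest_pix[0] + residual]; */
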
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 8054cdf..d776b44 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -166,7 +166,7 @@
   int up_available;
   int left_available;
 
-  const vp9_prob (*partition_probs)[PARTITION_TYPES - 1];
+  const vpx_prob (*partition_probs)[PARTITION_TYPES - 1];
 
   /* Distance of MB away from frame edges */
   int mb_to_left_edge;
@@ -262,7 +262,7 @@
   }
 }
 
-static INLINE const vp9_prob *get_y_mode_probs(const MODE_INFO *mi,
+static INLINE const vpx_prob *get_y_mode_probs(const MODE_INFO *mi,
                                                const MODE_INFO *above_mi,
                                                const MODE_INFO *left_mi,
                                                int block) {
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 9c2d779..42c3a09 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -16,6 +16,7 @@
 #include <assert.h>
 
 #include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_systemdependent.h"
@@ -24,9 +25,6 @@
 extern "C" {
 #endif
 
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))
-#define MAX(x, y) (((x) > (y)) ? (x) : (y))
-
 // Only need this for fixed-size arrays, for structs just assign.
 #define vp9_copy(dest, src) {            \
     assert(sizeof(dest) == sizeof(src)); \
@@ -42,18 +40,6 @@
 #define vp9_zero(dest) memset(&(dest), 0, sizeof(dest))
 #define vp9_zero_array(dest, n) memset(dest, 0, n * sizeof(*dest))
 
-static INLINE uint8_t clip_pixel(int val) {
-  return (val > 255) ? 255 : (val < 0) ? 0 : val;
-}
-
-static INLINE int clamp(int value, int low, int high) {
-  return value < low ? low : (value > high ? high : value);
-}
-
-static INLINE double fclamp(double value, double low, double high) {
-  return value < low ? low : (value > high ? high : value);
-}
-
 static INLINE int get_unsigned_bits(unsigned int num_values) {
   return num_values > 0 ? get_msb(num_values) + 1 : 0;
 }
@@ -70,20 +56,6 @@
       return (uint16_t)clamp(val, 0, 4095);
   }
 }
-
-// Note:
-// tran_low_t  is the datatype used for final transform coefficients.
-// tran_high_t is the datatype used for intermediate transform stages.
-typedef int64_t tran_high_t;
-typedef int32_t tran_low_t;
-
-#else
-
-// Note:
-// tran_low_t  is the datatype used for final transform coefficients.
-// tran_high_t is the datatype used for intermediate transform stages.
-typedef int32_t tran_high_t;
-typedef int16_t tran_low_t;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_DEBUG
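
The deleted MIN/MAX macros, clip helpers, and tran_low_t/tran_high_t typedefs are relocations, not removals: the newly added vpx_dsp/vpx_dsp_common.h include is expected to supply them. For reference, a sketch of the relocated pieces, reproduced from the lines removed above (it is assumed the new header keeps the same names and semantics):

#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#define MAX(x, y) (((x) > (y)) ? (x) : (y))

static INLINE uint8_t clip_pixel(int val) {
  return (val > 255) ? 255 : (val < 0) ? 0 : val;
}

static INLINE int clamp(int value, int low, int high) {
  return value < low ? low : (value > high ? high : value);
}

/* tran_low_t holds final transform coefficients and tran_high_t intermediate
 * stages; their widths depend on CONFIG_VP9_HIGHBITDEPTH exactly as in the
 * removed lines (int32_t/int64_t with high bit depth, int16_t/int32_t without). */
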
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index ad6c04b..579857b 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -16,7 +16,7 @@
 #include "vpx/vpx_integer.h"
 
 // Unconstrained Node Tree
-const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+const vpx_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
   2, 6,                                // 0 = LOW_VAL
   -TWO_TOKEN, 4,                       // 1 = TWO
   -THREE_TOKEN, -FOUR_TOKEN,           // 2 = THREE
@@ -27,30 +27,30 @@
   -CATEGORY5_TOKEN, -CATEGORY6_TOKEN   // 7 = CAT_FIVE
 };
 
-const vp9_prob vp9_cat1_prob[] = { 159 };
-const vp9_prob vp9_cat2_prob[] = { 165, 145 };
-const vp9_prob vp9_cat3_prob[] = { 173, 148, 140 };
-const vp9_prob vp9_cat4_prob[] = { 176, 155, 140, 135 };
-const vp9_prob vp9_cat5_prob[] = { 180, 157, 141, 134, 130 };
-const vp9_prob vp9_cat6_prob[] = {
+const vpx_prob vp9_cat1_prob[] = { 159 };
+const vpx_prob vp9_cat2_prob[] = { 165, 145 };
+const vpx_prob vp9_cat3_prob[] = { 173, 148, 140 };
+const vpx_prob vp9_cat4_prob[] = { 176, 155, 140, 135 };
+const vpx_prob vp9_cat5_prob[] = { 180, 157, 141, 134, 130 };
+const vpx_prob vp9_cat6_prob[] = {
     254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
 };
 #if CONFIG_VP9_HIGHBITDEPTH
-const vp9_prob vp9_cat1_prob_high10[] = { 159 };
-const vp9_prob vp9_cat2_prob_high10[] = { 165, 145 };
-const vp9_prob vp9_cat3_prob_high10[] = { 173, 148, 140 };
-const vp9_prob vp9_cat4_prob_high10[] = { 176, 155, 140, 135 };
-const vp9_prob vp9_cat5_prob_high10[] = { 180, 157, 141, 134, 130 };
-const vp9_prob vp9_cat6_prob_high10[] = {
+const vpx_prob vp9_cat1_prob_high10[] = { 159 };
+const vpx_prob vp9_cat2_prob_high10[] = { 165, 145 };
+const vpx_prob vp9_cat3_prob_high10[] = { 173, 148, 140 };
+const vpx_prob vp9_cat4_prob_high10[] = { 176, 155, 140, 135 };
+const vpx_prob vp9_cat5_prob_high10[] = { 180, 157, 141, 134, 130 };
+const vpx_prob vp9_cat6_prob_high10[] = {
     255, 255, 254, 254, 254, 252, 249, 243,
     230, 196, 177, 153, 140, 133, 130, 129
 };
-const vp9_prob vp9_cat1_prob_high12[] = { 159 };
-const vp9_prob vp9_cat2_prob_high12[] = { 165, 145 };
-const vp9_prob vp9_cat3_prob_high12[] = { 173, 148, 140 };
-const vp9_prob vp9_cat4_prob_high12[] = { 176, 155, 140, 135 };
-const vp9_prob vp9_cat5_prob_high12[] = { 180, 157, 141, 134, 130 };
-const vp9_prob vp9_cat6_prob_high12[] = {
+const vpx_prob vp9_cat1_prob_high12[] = { 159 };
+const vpx_prob vp9_cat2_prob_high12[] = { 165, 145 };
+const vpx_prob vp9_cat3_prob_high12[] = { 173, 148, 140 };
+const vpx_prob vp9_cat4_prob_high12[] = { 176, 155, 140, 135 };
+const vpx_prob vp9_cat5_prob_high12[] = { 180, 157, 141, 134, 130 };
+const vpx_prob vp9_cat6_prob_high12[] = {
     255, 255, 255, 255, 254, 254, 254, 252, 249,
     243, 230, 196, 177, 153, 140, 133, 130, 129
 };
@@ -147,7 +147,7 @@
 // by averaging :
 // vp9_pareto8_full[l][node] = (vp9_pareto8_full[l-1][node] +
 //                              vp9_pareto8_full[l+1][node] ) >> 1;
-const vp9_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES] = {
+const vpx_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES] = {
   {  3,  86, 128,   6,  86,  23,  88,  29},
   {  6,  86, 128,  11,  87,  42,  91,  52},
   {  9,  86, 129,  17,  88,  61,  94,  76},
@@ -742,14 +742,14 @@
   }
 };
 
-static void extend_to_full_distribution(vp9_prob *probs, vp9_prob p) {
+static void extend_to_full_distribution(vpx_prob *probs, vpx_prob p) {
   memcpy(probs, vp9_pareto8_full[p == 0 ? 0 : p - 1],
-         MODEL_NODES * sizeof(vp9_prob));
+         MODEL_NODES * sizeof(vpx_prob));
 }
 
-void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) {
+void vp9_model_to_full_probs(const vpx_prob *model, vpx_prob *full) {
   if (full != model)
-    memcpy(full, model, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+    memcpy(full, model, sizeof(vpx_prob) * UNCONSTRAINED_NODES);
   extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
 }
 
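The entropy-model plumbing above is unchanged apart from the type rename: a coefficient context stores only UNCONSTRAINED_NODES probabilities plus a pivot, and the remaining MODEL_NODES entries come from a row of vp9_pareto8_full. A hedged sketch of the expansion these functions perform, written out for clarity rather than copied verbatim:

/* Sketch: copy the model nodes, then append the Pareto row selected by the
 * pivot probability, guarding the pivot == 0 case explicitly. */
static void model_to_full_sketch(const vpx_prob *model, vpx_prob *full) {
  int i;
  const vpx_prob pivot = model[PIVOT_NODE];
  for (i = 0; i < UNCONSTRAINED_NODES; ++i) full[i] = model[i];
  for (i = 0; i < MODEL_NODES; ++i)
    full[UNCONSTRAINED_NODES + i] =
        vp9_pareto8_full[pivot == 0 ? 0 : pivot - 1][i];
}
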
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 2fc97c3..a1746bc 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -12,10 +12,10 @@
 #define VP9_COMMON_VP9_ENTROPY_H_
 
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/prob.h"
 
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
-#include "vp9/common/vp9_prob.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -76,8 +76,8 @@
 #define EOB_MODEL_TOKEN 3
 
 typedef struct {
-  const vp9_tree_index *tree;
-  const vp9_prob *prob;
+  const vpx_tree_index *tree;
+  const vpx_prob *prob;
   int len;
   int base_val;
   const int16_t *cost;
@@ -160,17 +160,17 @@
 #define PIVOT_NODE                  2   // which node is pivot
 
 #define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
-extern const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)];
-extern const vp9_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES];
+extern const vpx_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)];
+extern const vpx_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES];
 
-typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS]
+typedef vpx_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS]
                                       [COEFF_CONTEXTS][UNCONSTRAINED_NODES];
 
 typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS]
                                           [COEFF_CONTEXTS]
                                           [UNCONSTRAINED_NODES + 1];
 
-void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
+void vp9_model_to_full_probs(const vpx_prob *model, vpx_prob *full);
 
 typedef char ENTROPY_CONTEXT;
 
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 22d431b..670348b 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -13,7 +13,7 @@
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_seg_common.h"
 
-const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] = {
+const vpx_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] = {
   {  // above = dc
     { 137,  30,  42, 148, 151, 207,  70,  52,  91 },  // left = dc
     {  92,  45, 102, 136, 116, 180,  74,  90, 100 },  // left = v
@@ -127,7 +127,7 @@
   }
 };
 
-const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1] = {
+const vpx_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1] = {
   { 144,  11,  54, 157, 195, 130,  46,  58, 108 },  // y = dc
   { 118,  15, 123, 148, 131, 101,  44,  93, 131 },  // y = v
   { 113,  12,  23, 188, 226, 142,  26,  32, 125 },  // y = h
@@ -140,14 +140,14 @@
   { 102,  19,  66, 162, 182, 122,  35,  59, 128 }   // y = tm
 };
 
-static const vp9_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = {
+static const vpx_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = {
   {  65,  32,  18, 144, 162, 194,  41,  51,  98 },  // block_size < 8x8
   { 132,  68,  18, 165, 217, 196,  45,  40,  78 },  // block_size < 16x16
   { 173,  80,  19, 176, 240, 193,  64,  35,  46 },  // block_size < 32x32
   { 221, 135,  38, 194, 248, 121,  96,  85,  29 }   // block_size >= 32x32
 };
 
-static const vp9_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = {
+static const vpx_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = {
   { 120,   7,  76, 176, 208, 126,  28,  54, 103 },  // y = dc
   {  48,  12, 154, 155, 139,  90,  34, 117, 119 },  // y = v
   {  67,   6,  25, 204, 243, 158,  13,  21,  96 },  // y = h
@@ -160,7 +160,7 @@
   { 101,  21, 107, 181, 192, 103,  19,  67, 125 }   // y = tm
 };
 
-const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
+const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
                                      [PARTITION_TYPES - 1] = {
   // 8x8 -> 4x4
   { 158,  97,  94 },  // a/l both not split
@@ -184,7 +184,7 @@
   {  12,   3,   3 },  // a/l both split
 };
 
-static const vp9_prob default_partition_probs[PARTITION_CONTEXTS]
+static const vpx_prob default_partition_probs[PARTITION_CONTEXTS]
                                              [PARTITION_TYPES - 1] = {
   // 8x8 -> 4x4
   { 199, 122, 141 },  // a/l both not split
@@ -208,7 +208,7 @@
   {  10,   7,   6 },  // a/l both split
 };
 
-static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
+static const vpx_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
                                               [INTER_MODES - 1] = {
   {2,       173,   34},  // 0 = both zero mv
   {7,       145,   85},  // 1 = one zero mv + one a predicted mv
@@ -220,7 +220,7 @@
 };
 
 /* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
-const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
+const vpx_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
   -DC_PRED, 2,                      /* 0 = DC_NODE */
   -TM_PRED, 4,                      /* 1 = TM_NODE */
   -V_PRED, 6,                       /* 2 = V_NODE */
@@ -232,31 +232,31 @@
   -D153_PRED, -D207_PRED             /* 8 = D153_NODE */
 };
 
-const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
+const vpx_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
   -INTER_OFFSET(ZEROMV), 2,
   -INTER_OFFSET(NEARESTMV), 4,
   -INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV)
 };
 
-const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
+const vpx_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
   -PARTITION_NONE, 2,
   -PARTITION_HORZ, 4,
   -PARTITION_VERT, -PARTITION_SPLIT
 };
 
-static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
+static const vpx_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
   9, 102, 187, 225
 };
 
-static const vp9_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = {
+static const vpx_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = {
   239, 183, 119,  96,  41
 };
 
-static const vp9_prob default_comp_ref_p[REF_CONTEXTS] = {
+static const vpx_prob default_comp_ref_p[REF_CONTEXTS] = {
   50, 126, 123, 221, 226
 };
 
-static const vp9_prob default_single_ref_p[REF_CONTEXTS][2] = {
+static const vpx_prob default_single_ref_p[REF_CONTEXTS][2] = {
   {  33,  16 },
   {  77,  74 },
   { 142, 142 },
@@ -302,11 +302,11 @@
   ct_8x8p[0][1] = tx_count_8x8p[TX_8X8];
 }
 
-static const vp9_prob default_skip_probs[SKIP_CONTEXTS] = {
+static const vpx_prob default_skip_probs[SKIP_CONTEXTS] = {
   192, 128, 64
 };
 
-static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+static const vpx_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
                                                     [SWITCHABLE_FILTERS - 1] = {
   { 235, 162, },
   { 36, 255, },
@@ -328,7 +328,7 @@
   vp9_copy(fc->inter_mode_probs, default_inter_mode_probs);
 }
 
-const vp9_tree_index vp9_switchable_interp_tree
+const vpx_tree_index vp9_switchable_interp_tree
                          [TREE_SIZE(SWITCHABLE_FILTERS)] = {
   -EIGHTTAP, 2,
   -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
@@ -355,24 +355,24 @@
           pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]);
 
   for (i = 0; i < INTER_MODE_CONTEXTS; i++)
-    vp9_tree_merge_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
+    vpx_tree_merge_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
                 counts->inter_mode[i], fc->inter_mode_probs[i]);
 
   for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
-    vp9_tree_merge_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
+    vpx_tree_merge_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
                 counts->y_mode[i], fc->y_mode_prob[i]);
 
   for (i = 0; i < INTRA_MODES; ++i)
-    vp9_tree_merge_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
+    vpx_tree_merge_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
                          counts->uv_mode[i], fc->uv_mode_prob[i]);
 
   for (i = 0; i < PARTITION_CONTEXTS; i++)
-    vp9_tree_merge_probs(vp9_partition_tree, pre_fc->partition_prob[i],
+    vpx_tree_merge_probs(vp9_partition_tree, pre_fc->partition_prob[i],
                          counts->partition[i], fc->partition_prob[i]);
 
   if (cm->interp_filter == SWITCHABLE) {
     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-      vp9_tree_merge_probs(vp9_switchable_interp_tree,
+      vpx_tree_merge_probs(vp9_switchable_interp_tree,
                            pre_fc->switchable_interp_prob[i],
                            counts->switchable_interp[i],
                            fc->switchable_interp_prob[i]);
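
The tree tables above follow libvpx's usual encoding: a nonnegative entry is the offset of the next left/right node pair, and a negated value marks a leaf symbol. A hedged decoder-side sketch of walking such a table; read_bit() is a stand-in for the project's arithmetic bit reader, assumed here:

/* Sketch: descend the tree one coded bit at a time. probs[i >> 1] is the
 * probability attached to node pair i; leaves are stored negated. */
extern int read_bit(vpx_prob prob);  /* assumed reader primitive */

static int read_tree_symbol(const vpx_tree_index *tree, const vpx_prob *probs) {
  vpx_tree_index i = 0;
  while ((i = tree[i + read_bit(probs[i >> 1])]) > 0)
    continue;
  return -i;  /* undo the leaf negation */
}
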
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index 8c9e6a7..371738a 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -28,9 +28,9 @@
 struct VP9Common;
 
 struct tx_probs {
-  vp9_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1];
-  vp9_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2];
-  vp9_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3];
+  vpx_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1];
+  vpx_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2];
+  vpx_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3];
 };
 
 struct tx_counts {
@@ -41,19 +41,19 @@
 };
 
 typedef struct frame_contexts {
-  vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
-  vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
-  vp9_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
+  vpx_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
+  vpx_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
+  vpx_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
   vp9_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES];
-  vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+  vpx_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
                                  [SWITCHABLE_FILTERS - 1];
-  vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
-  vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
-  vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS];
-  vp9_prob single_ref_prob[REF_CONTEXTS][2];
-  vp9_prob comp_ref_prob[REF_CONTEXTS];
+  vpx_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
+  vpx_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
+  vpx_prob comp_inter_prob[COMP_INTER_CONTEXTS];
+  vpx_prob single_ref_prob[REF_CONTEXTS][2];
+  vpx_prob comp_ref_prob[REF_CONTEXTS];
   struct tx_probs tx_probs;
-  vp9_prob skip_probs[SKIP_CONTEXTS];
+  vpx_prob skip_probs[SKIP_CONTEXTS];
   nmv_context nmvc;
   int initialized;
 } FRAME_CONTEXT;
@@ -77,15 +77,15 @@
   nmv_context_counts mv;
 } FRAME_COUNTS;
 
-extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
-extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
+extern const vpx_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
+extern const vpx_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
                                         [INTRA_MODES - 1];
-extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
+extern const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
                                             [PARTITION_TYPES - 1];
-extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
-extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
-extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
-extern const vp9_tree_index vp9_switchable_interp_tree
+extern const vpx_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
+extern const vpx_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
+extern const vpx_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
+extern const vpx_tree_index vp9_switchable_interp_tree
                                 [TREE_SIZE(SWITCHABLE_FILTERS)];
 
 void vp9_setup_past_independence(struct VP9Common *cm);
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index 76cdb05..3acfe14 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -14,13 +14,13 @@
 // Integer pel reference mv threshold for use of high-precision 1/8 mv
 #define COMPANDED_MVREF_THRESH 8
 
-const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = {
+const vpx_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = {
   -MV_JOINT_ZERO, 2,
   -MV_JOINT_HNZVZ, 4,
   -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
 };
 
-const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
+const vpx_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
   -MV_CLASS_0, 2,
   -MV_CLASS_1, 4,
   6, 8,
@@ -33,11 +33,11 @@
   -MV_CLASS_9, -MV_CLASS_10,
 };
 
-const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
+const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
   -0, -1,
 };
 
-const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = {
+const vpx_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = {
   -0, 2,
   -1, 4,
   -2, -3
@@ -183,7 +183,7 @@
   const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc;
   const nmv_context_counts *counts = &cm->counts.mv;
 
-  vp9_tree_merge_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints,
+  vpx_tree_merge_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints,
                        fc->joints);
 
   for (i = 0; i < 2; ++i) {
@@ -192,19 +192,19 @@
     const nmv_component_counts *c = &counts->comps[i];
 
     comp->sign = mode_mv_merge_probs(pre_comp->sign, c->sign);
-    vp9_tree_merge_probs(vp9_mv_class_tree, pre_comp->classes, c->classes,
+    vpx_tree_merge_probs(vp9_mv_class_tree, pre_comp->classes, c->classes,
                          comp->classes);
-    vp9_tree_merge_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0,
+    vpx_tree_merge_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0,
                          comp->class0);
 
     for (j = 0; j < MV_OFFSET_BITS; ++j)
       comp->bits[j] = mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]);
 
     for (j = 0; j < CLASS0_SIZE; ++j)
-      vp9_tree_merge_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j],
+      vpx_tree_merge_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j],
                            c->class0_fp[j], comp->class0_fp[j]);
 
-    vp9_tree_merge_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
+    vpx_tree_merge_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
 
     if (allow_hp) {
       comp->class0_hp = mode_mv_merge_probs(pre_comp->class0_hp, c->class0_hp);
diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h
index 637f451..8c817bf 100644
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -14,8 +14,9 @@
 
 #include "./vpx_config.h"
 
+#include "vpx_dsp/prob.h"
+
 #include "vp9/common/vp9_mv.h"
-#include "vp9/common/vp9_prob.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -76,24 +77,24 @@
 #define MV_UPP   ((1 << MV_IN_USE_BITS) - 1)
 #define MV_LOW   (-(1 << MV_IN_USE_BITS))
 
-extern const vp9_tree_index vp9_mv_joint_tree[];
-extern const vp9_tree_index vp9_mv_class_tree[];
-extern const vp9_tree_index vp9_mv_class0_tree[];
-extern const vp9_tree_index vp9_mv_fp_tree[];
+extern const vpx_tree_index vp9_mv_joint_tree[];
+extern const vpx_tree_index vp9_mv_class_tree[];
+extern const vpx_tree_index vp9_mv_class0_tree[];
+extern const vpx_tree_index vp9_mv_fp_tree[];
 
 typedef struct {
-  vp9_prob sign;
-  vp9_prob classes[MV_CLASSES - 1];
-  vp9_prob class0[CLASS0_SIZE - 1];
-  vp9_prob bits[MV_OFFSET_BITS];
-  vp9_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1];
-  vp9_prob fp[MV_FP_SIZE - 1];
-  vp9_prob class0_hp;
-  vp9_prob hp;
+  vpx_prob sign;
+  vpx_prob classes[MV_CLASSES - 1];
+  vpx_prob class0[CLASS0_SIZE - 1];
+  vpx_prob bits[MV_OFFSET_BITS];
+  vpx_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1];
+  vpx_prob fp[MV_FP_SIZE - 1];
+  vpx_prob class0_hp;
+  vpx_prob hp;
 } nmv_component;
 
 typedef struct {
-  vp9_prob joints[MV_JOINTS - 1];
+  vpx_prob joints[MV_JOINTS - 1];
   nmv_component comps[2];
 } nmv_context;
 
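The adaptation calls renamed above (vpx_tree_merge_probs and mode_mv_merge_probs) blend the previous frame's probability for each node with a probability estimated from this frame's branch counts. A conceptual sketch of one node's update; the saturation count and maximum step below are assumed constants for illustration, not values taken from this change:

/* Sketch: estimate a probability from the counts, then move a bounded step
 * from the old value toward it; the step grows with the (saturated) count. */
static vpx_prob merge_prob_sketch(vpx_prob pre, const unsigned int ct[2]) {
  const unsigned int den = ct[0] + ct[1];
  const int raw = den ? (int)((255u * ct[0] + den / 2) / den) + 1 : 128;
  const vpx_prob est = (vpx_prob)(raw < 1 ? 1 : (raw > 255 ? 255 : raw));
  const unsigned int count = den < 20 ? den : 20;   /* assumed saturation  */
  const unsigned int factor = 128 * count / 20;     /* assumed max step    */
  return (vpx_prob)((pre * (256 - factor) + est * factor + 128) >> 8);
}
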
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 9816728..0915918 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -9,6 +9,7 @@
  */
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_reconinter.h"
@@ -326,55 +327,55 @@
     if (mask & 1) {
       if ((mask_16x16_0 | mask_16x16_1) & 1) {
         if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          vp9_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                    lfi0->hev_thr);
         } else if (mask_16x16_0 & 1) {
-          vp9_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
                               lfi0->hev_thr);
         } else {
-          vp9_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
+          vpx_lpf_vertical_16(s + 8 * pitch, pitch, lfi1->mblim,
                               lfi1->lim, lfi1->hev_thr);
         }
       }
 
       if ((mask_8x8_0 | mask_8x8_1) & 1) {
         if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          vp9_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                   lfi1->hev_thr);
         } else if (mask_8x8_0 & 1) {
-          vp9_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+          vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
                              1);
         } else {
-          vp9_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
+          vpx_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
                              lfi1->hev_thr, 1);
         }
       }
 
       if ((mask_4x4_0 | mask_4x4_1) & 1) {
         if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          vp9_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                   lfi1->hev_thr);
         } else if (mask_4x4_0 & 1) {
-          vp9_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+          vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
                              1);
         } else {
-          vp9_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
+          vpx_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
                              lfi1->hev_thr, 1);
         }
       }
 
       if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
         if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
-          vp9_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                   lfi1->hev_thr);
         } else if (mask_4x4_int_0 & 1) {
-          vp9_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
+          vpx_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
                              lfi0->hev_thr, 1);
         } else {
-          vp9_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,
+          vpx_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,
                              lfi1->hev_thr, 1);
         }
       }
@@ -426,55 +427,55 @@
     if (mask & 1) {
       if ((mask_16x16_0 | mask_16x16_1) & 1) {
         if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          vp9_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                           lfi0->hev_thr, bd);
         } else if (mask_16x16_0 & 1) {
-          vp9_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
                                      lfi0->hev_thr, bd);
         } else {
-          vp9_highbd_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
+          vpx_highbd_lpf_vertical_16(s + 8 * pitch, pitch, lfi1->mblim,
                                      lfi1->lim, lfi1->hev_thr, bd);
         }
       }
 
       if ((mask_8x8_0 | mask_8x8_1) & 1) {
         if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          vp9_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                          lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                          lfi1->hev_thr, bd);
         } else if (mask_8x8_0 & 1) {
-          vp9_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
                                     lfi0->hev_thr, 1, bd);
         } else {
-          vp9_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
+          vpx_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
                                     lfi1->lim, lfi1->hev_thr, 1, bd);
         }
       }
 
       if ((mask_4x4_0 | mask_4x4_1) & 1) {
         if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          vp9_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
                                          lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                          lfi1->hev_thr, bd);
         } else if (mask_4x4_0 & 1) {
-          vp9_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
                                     lfi0->hev_thr, 1, bd);
         } else {
-          vp9_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
+          vpx_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
                                     lfi1->lim, lfi1->hev_thr, 1, bd);
         }
       }
 
       if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
         if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
-          vp9_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
                                          lfi0->hev_thr, lfi1->mblim, lfi1->lim,
                                          lfi1->hev_thr, bd);
         } else if (mask_4x4_int_0 & 1) {
-          vp9_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
+          vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
                                     lfi0->hev_thr, 1, bd);
         } else {
-          vp9_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
+          vpx_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
                                     lfi1->lim, lfi1->hev_thr, 1, bd);
         }
       }
@@ -512,11 +513,11 @@
     if (mask & 1) {
       if (mask_16x16 & 1) {
         if ((mask_16x16 & 3) == 3) {
-          vp9_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
                                 lfi->hev_thr, 2);
           count = 2;
         } else {
-          vp9_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
                                 lfi->hev_thr, 1);
         }
       } else if (mask_8x8 & 1) {
@@ -524,28 +525,28 @@
           // Next block's thresholds.
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
-          vp9_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+          vpx_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, lfin->mblim, lfin->lim,
                                     lfin->hev_thr);
 
           if ((mask_4x4_int & 3) == 3) {
-            vp9_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
                                       lfi->lim, lfi->hev_thr, lfin->mblim,
                                       lfin->lim, lfin->hev_thr);
           } else {
             if (mask_4x4_int & 1)
-              vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+              vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                    lfi->hev_thr, 1);
             else if (mask_4x4_int & 2)
-              vp9_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+              vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
                                    lfin->lim, lfin->hev_thr, 1);
           }
           count = 2;
         } else {
-          vp9_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+          vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
           if (mask_4x4_int & 1)
-            vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+            vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                  lfi->hev_thr, 1);
         }
       } else if (mask_4x4 & 1) {
@@ -553,31 +554,31 @@
           // Next block's thresholds.
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
-          vp9_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+          vpx_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, lfin->mblim, lfin->lim,
                                     lfin->hev_thr);
           if ((mask_4x4_int & 3) == 3) {
-            vp9_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
                                       lfi->lim, lfi->hev_thr, lfin->mblim,
                                       lfin->lim, lfin->hev_thr);
           } else {
             if (mask_4x4_int & 1)
-              vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+              vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                    lfi->hev_thr, 1);
             else if (mask_4x4_int & 2)
-              vp9_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+              vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
                                    lfin->lim, lfin->hev_thr, 1);
           }
           count = 2;
         } else {
-          vp9_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
           if (mask_4x4_int & 1)
-            vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+            vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                  lfi->hev_thr, 1);
         }
       } else if (mask_4x4_int & 1) {
-        vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+        vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                              lfi->hev_thr, 1);
       }
     }
@@ -609,11 +610,11 @@
     if (mask & 1) {
       if (mask_16x16 & 1) {
         if ((mask_16x16 & 3) == 3) {
-          vp9_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
                                        lfi->hev_thr, 2, bd);
           count = 2;
         } else {
-          vp9_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
+          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
                                        lfi->hev_thr, 1, bd);
         }
       } else if (mask_8x8 & 1) {
@@ -621,31 +622,31 @@
           // Next block's thresholds.
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
-          vp9_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+          vpx_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
                                            lfi->hev_thr, lfin->mblim, lfin->lim,
                                            lfin->hev_thr, bd);
 
           if ((mask_4x4_int & 3) == 3) {
-            vp9_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
                                              lfi->lim, lfi->hev_thr,
                                              lfin->mblim, lfin->lim,
                                              lfin->hev_thr, bd);
           } else {
             if (mask_4x4_int & 1) {
-              vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+              vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
                                           lfi->lim, lfi->hev_thr, 1, bd);
             } else if (mask_4x4_int & 2) {
-              vp9_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+              vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
                                           lfin->lim, lfin->hev_thr, 1, bd);
             }
           }
           count = 2;
         } else {
-          vp9_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim,
+          vpx_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim,
                                       lfi->hev_thr, 1, bd);
 
           if (mask_4x4_int & 1) {
-            vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
                                         lfi->lim, lfi->hev_thr, 1, bd);
           }
         }
@@ -654,35 +655,35 @@
           // Next block's thresholds.
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
-          vp9_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+          vpx_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                            lfi->hev_thr, lfin->mblim, lfin->lim,
                                            lfin->hev_thr, bd);
           if ((mask_4x4_int & 3) == 3) {
-            vp9_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_highbd_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim,
                                              lfi->lim, lfi->hev_thr,
                                              lfin->mblim, lfin->lim,
                                              lfin->hev_thr, bd);
           } else {
             if (mask_4x4_int & 1) {
-              vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+              vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
                                           lfi->lim, lfi->hev_thr, 1, bd);
             } else if (mask_4x4_int & 2) {
-              vp9_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
+              vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
                                           lfin->lim, lfin->hev_thr, 1, bd);
             }
           }
           count = 2;
         } else {
-          vp9_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
+          vpx_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
                                       lfi->hev_thr, 1, bd);
 
           if (mask_4x4_int & 1) {
-            vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
+            vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
                                         lfi->lim, lfi->hev_thr, 1, bd);
           }
         }
       } else if (mask_4x4_int & 1) {
-        vp9_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
+        vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, 1, bd);
       }
     }
@@ -1093,15 +1094,15 @@
 
     if (mask & 1) {
       if (mask_16x16 & 1) {
-        vp9_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+        vpx_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
       } else if (mask_8x8 & 1) {
-        vp9_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+        vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
       } else if (mask_4x4 & 1) {
-        vp9_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+        vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
       }
     }
     if (mask_4x4_int & 1)
-      vp9_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+      vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
     s += 8;
     lfl += 1;
     mask_16x16 >>= 1;
@@ -1127,18 +1128,18 @@
 
     if (mask & 1) {
       if (mask_16x16 & 1) {
-        vp9_highbd_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim,
+        vpx_highbd_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim,
                                    lfi->hev_thr, bd);
       } else if (mask_8x8 & 1) {
-        vp9_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim,
+        vpx_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim,
                                   lfi->hev_thr, 1, bd);
       } else if (mask_4x4 & 1) {
-        vp9_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim,
+        vpx_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim,
                                 lfi->hev_thr, 1, bd);
       }
     }
     if (mask_4x4_int & 1)
-      vp9_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim,
+      vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim,
                                 lfi->hev_thr, 1, bd);
     s += 8;
     lfl += 1;
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index ca0dfc8..c373c02 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -345,7 +345,7 @@
   xd->partition_probs =
       frame_is_intra_only(cm) ?
           &vp9_kf_partition_probs[0] :
-          (const vp9_prob (*)[PARTITION_TYPES - 1])cm->fc->partition_prob;
+          (const vpx_prob (*)[PARTITION_TYPES - 1])cm->fc->partition_prob;
 }
 
 static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd,
@@ -373,7 +373,7 @@
   set_partition_probs(cm, xd);
 }
 
-static INLINE const vp9_prob* get_partition_probs(const MACROBLOCKD *xd,
+static INLINE const vpx_prob* get_partition_probs(const MACROBLOCKD *xd,
                                                   int ctx) {
   return xd->partition_probs[ctx];
 }
diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index 7616144..67b95db 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h
@@ -47,7 +47,7 @@
   return above_sip + left_sip;
 }
 
-static INLINE vp9_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg,
+static INLINE vpx_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg,
                                                 const MACROBLOCKD *xd) {
   return seg->pred_probs[vp9_get_pred_context_seg_id(xd)];
 }
@@ -60,7 +60,7 @@
   return above_skip + left_skip;
 }
 
-static INLINE vp9_prob vp9_get_skip_prob(const VP9_COMMON *cm,
+static INLINE vpx_prob vp9_get_skip_prob(const VP9_COMMON *cm,
                                          const MACROBLOCKD *xd) {
   return cm->fc->skip_probs[vp9_get_skip_context(xd)];
 }
@@ -69,14 +69,14 @@
 
 int vp9_get_intra_inter_context(const MACROBLOCKD *xd);
 
-static INLINE vp9_prob vp9_get_intra_inter_prob(const VP9_COMMON *cm,
+static INLINE vpx_prob vp9_get_intra_inter_prob(const VP9_COMMON *cm,
                                                 const MACROBLOCKD *xd) {
   return cm->fc->intra_inter_prob[vp9_get_intra_inter_context(xd)];
 }
 
 int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd);
 
-static INLINE vp9_prob vp9_get_reference_mode_prob(const VP9_COMMON *cm,
+static INLINE vpx_prob vp9_get_reference_mode_prob(const VP9_COMMON *cm,
                                                    const MACROBLOCKD *xd) {
   return cm->fc->comp_inter_prob[vp9_get_reference_mode_context(cm, xd)];
 }
@@ -84,7 +84,7 @@
 int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
                                     const MACROBLOCKD *xd);
 
-static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm,
+static INLINE vpx_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm,
                                                     const MACROBLOCKD *xd) {
   const int pred_context = vp9_get_pred_context_comp_ref_p(cm, xd);
   return cm->fc->comp_ref_prob[pred_context];
@@ -92,14 +92,14 @@
 
 int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
 
-static INLINE vp9_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm,
+static INLINE vpx_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm,
                                                        const MACROBLOCKD *xd) {
   return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0];
 }
 
 int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
 
-static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
+static INLINE vpx_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
                                                        const MACROBLOCKD *xd) {
   return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1];
 }
@@ -127,7 +127,7 @@
   return (above_ctx + left_ctx) > max_tx_size;
 }
 
-static INLINE const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
+static INLINE const vpx_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
                                            const struct tx_probs *tx_probs) {
   switch (max_tx_size) {
     case TX_8X8:
@@ -142,7 +142,7 @@
   }
 }
 
-static INLINE const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size,
+static INLINE const vpx_prob *get_tx_probs2(TX_SIZE max_tx_size,
                                             const MACROBLOCKD *xd,
                                             const struct tx_probs *tx_probs) {
   return get_tx_probs(max_tx_size, get_tx_size_context(xd), tx_probs);
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 1e9acb8..f969ff1 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -1059,20 +1059,19 @@
   }
 }
 
-void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
+void vp9_predict_intra_block(const MACROBLOCKD *xd, int bwl_in,
                              TX_SIZE tx_size, PREDICTION_MODE mode,
                              const uint8_t *ref, int ref_stride,
                              uint8_t *dst, int dst_stride,
                              int aoff, int loff, int plane) {
-  const int bwl = bwl_in - tx_size;
-  const int wmask = (1 << bwl) - 1;
-  const int have_top = (block_idx >> bwl) || xd->up_available;
-  const int have_left = (block_idx & wmask) || xd->left_available;
-  const int have_right = ((block_idx & wmask) != wmask);
+  const int bw = (1 << bwl_in);
+  const int txw = (1 << tx_size);
+  const int have_top = loff || xd->up_available;
+  const int have_left = aoff || xd->left_available;
+  const int have_right = (aoff + txw) < bw;
   const int x = aoff * 4;
   const int y = loff * 4;
 
-  assert(bwl >= 0);
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     build_intra_predictors_high(xd, ref, ref_stride, dst, dst_stride, mode,
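
(The refactor above replaces the block-index arithmetic with neighbor availability derived directly from the 4x4 offsets. The two formulations agree whenever aoff and loff are multiples of the transform width, which the decode loops guarantee; a quick sketch of the equivalence, as a hypothetical check rather than part of the patch:

    #include <assert.h>

    /* aoff/loff are the transform block's offsets in 4x4 units; both are
     * multiples of txw = 1 << tx_size, as the callers guarantee. */
    static void check_have_flags(int bwl_in, int tx_size, int aoff, int loff) {
      const int bwl = bwl_in - tx_size;           /* log2 tx blocks per row */
      const int wmask = (1 << bwl) - 1;
      const int block_idx = ((loff >> tx_size) << bwl) + (aoff >> tx_size);
      const int bw = 1 << bwl_in, txw = 1 << tx_size;

      assert(((block_idx >> bwl) != 0) == (loff != 0));              /* have_top */
      assert(((block_idx & wmask) != 0) == (aoff != 0));             /* have_left */
      assert(((block_idx & wmask) != wmask) == ((aoff + txw) < bw)); /* have_right */
    }

The xd->up_available / xd->left_available terms are unchanged on both sides, so they are omitted from the check.)
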
diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h
index da5e435..de45380 100644
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -20,7 +20,7 @@
 
 void vp9_init_intra_predictors(void);
 
-void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
+void vp9_predict_intra_block(const MACROBLOCKD *xd, int bwl_in,
                              TX_SIZE tx_size, PREDICTION_MODE mode,
                              const uint8_t *ref, int ref_stride,
                              uint8_t *dst, int dst_stride,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 538f1ed..f80d31e 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -220,49 +220,6 @@
 specialize qw/vp9_dc_128_predictor_32x32 msa neon/, "$sse2_x86inc";
 
 #
-# Loopfilter
-#
-add_proto qw/void vp9_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vp9_lpf_vertical_16 sse2 neon_asm dspr2 msa/;
-$vp9_lpf_vertical_16_neon_asm=vp9_lpf_vertical_16_neon;
-
-add_proto qw/void vp9_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vp9_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/;
-$vp9_lpf_vertical_16_dual_neon_asm=vp9_lpf_vertical_16_dual_neon;
-
-add_proto qw/void vp9_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_vertical_8 sse2 neon_asm dspr2 msa/;
-$vp9_lpf_vertical_8_neon_asm=vp9_lpf_vertical_8_neon;
-
-add_proto qw/void vp9_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
-$vp9_lpf_vertical_8_dual_neon_asm=vp9_lpf_vertical_8_dual_neon;
-
-add_proto qw/void vp9_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_vertical_4 mmx neon dspr2 msa/;
-
-add_proto qw/void vp9_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_vertical_4_dual sse2 neon dspr2 msa/;
-
-add_proto qw/void vp9_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon_asm dspr2 msa/;
-$vp9_lpf_horizontal_16_neon_asm=vp9_lpf_horizontal_16_neon;
-
-add_proto qw/void vp9_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_8 sse2 neon_asm dspr2 msa/;
-$vp9_lpf_horizontal_8_neon_asm=vp9_lpf_horizontal_8_neon;
-
-add_proto qw/void vp9_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
-$vp9_lpf_horizontal_8_dual_neon_asm=vp9_lpf_horizontal_8_dual_neon;
-
-add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2 msa/;
-
-add_proto qw/void vp9_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
-
-#
 # post proc
 #
 if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
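
(These prototypes are retired here because the loopfilter callers in this merge switch to the vpx_lpf_* names, as the vp9_loopfilter hunks above show. For anyone unfamiliar with the rtcd DSL: each add_proto/specialize pair expands to a function pointer plus per-ISA candidates, bound once at startup. A hypothetical sketch of the generated pattern in C; the names and the have_sse2 flag are illustrative only:

    #include <stdint.h>

    /* The _c suffix is the portable fallback; setup may rebind to SSE2. */
    static void lpf_horizontal_4_c(uint8_t *s, int pitch) { (void)s; (void)pitch; }
    static void lpf_horizontal_4_sse2(uint8_t *s, int pitch) { (void)s; (void)pitch; }

    void (*lpf_horizontal_4)(uint8_t *s, int pitch) = lpf_horizontal_4_c;

    void rtcd_setup(int have_sse2) {
      if (have_sse2) lpf_horizontal_4 = lpf_horizontal_4_sse2;
    }
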
@@ -668,42 +625,6 @@
   specialize qw/vp9_highbd_convolve8_avg_vert/, "$sse2_x86_64";
 
   #
-  # Loopfilter
-  #
-  add_proto qw/void vp9_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/vp9_highbd_lpf_vertical_16 sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-  specialize qw/vp9_highbd_lpf_vertical_16_dual sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
-  specialize qw/vp9_highbd_lpf_vertical_8 sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/vp9_highbd_lpf_vertical_8_dual sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
-  specialize qw/vp9_highbd_lpf_vertical_4 sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/vp9_highbd_lpf_vertical_4_dual sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
-  specialize qw/vp9_highbd_lpf_horizontal_16 sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
-  specialize qw/vp9_highbd_lpf_horizontal_8 sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/vp9_highbd_lpf_horizontal_8_dual sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
-  specialize qw/vp9_highbd_lpf_horizontal_4 sse2/;
-
-  add_proto qw/void vp9_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-  specialize qw/vp9_highbd_lpf_horizontal_4_dual sse2/;
-
-  #
   # post proc
   #
   if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
@@ -824,7 +745,7 @@
 specialize qw/vp9_int_pro_row sse2 neon/;
 
 add_proto qw/int16_t vp9_int_pro_col/, "uint8_t const *ref, const int width";
-specialize qw/vp9_int_pro_col sse2/;
+specialize qw/vp9_int_pro_col sse2 neon/;
 
 add_proto qw/int vp9_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
 specialize qw/vp9_vector_var sse2/;
@@ -860,12 +781,6 @@
   add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_fp_32x32/;
 
-  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_quantize_b/;
-
-  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_quantize_b_32x32/;
-
   add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_fdct8x8_quant/;
 } else {
@@ -881,12 +796,6 @@
   add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64_x86inc";
 
-  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_quantize_b sse2/, "$ssse3_x86_64_x86inc";
-
-  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64_x86inc";
-
   add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/;
 }
@@ -920,21 +829,12 @@
   add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp9_fdct4x4_1 sse2/;
 
-  add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fdct4x4 sse2/;
-
   add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp9_fdct8x8_1 sse2/;
 
-  add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fdct8x8 sse2/;
-
   add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp9_fdct16x16_1 sse2/;
 
-  add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fdct16x16 sse2/;
-
   add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp9_fdct32x32_1 sse2/;
 
@@ -959,21 +859,12 @@
   add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp9_fdct4x4_1 sse2/;
 
-  add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fdct4x4 sse2 msa/;
-
   add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp9_fdct8x8_1 sse2 neon msa/;
 
-  add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fdct8x8 sse2 neon msa/, "$ssse3_x86_64_x86inc";
-
   add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp9_fdct16x16_1 sse2 msa/;
 
-  add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fdct16x16 sse2 msa/;
-
   add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp9_fdct32x32_1 sse2 msa/;
 
@@ -1014,12 +905,6 @@
   add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_highbd_quantize_fp_32x32/;
 
-  add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_highbd_quantize_b sse2/;
-
-  add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_highbd_quantize_b_32x32 sse2/;
-
   #
   # Structured Similarity (SSIM)
   #
@@ -1030,32 +915,23 @@
 
   # fdct functions
   add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_highbd_fht4x4 sse2/;
+  specialize qw/vp9_highbd_fht4x4/;
 
   add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_highbd_fht8x8 sse2/;
+  specialize qw/vp9_highbd_fht8x8/;
 
   add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_highbd_fht16x16 sse2/;
+  specialize qw/vp9_highbd_fht16x16/;
 
   add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp9_highbd_fwht4x4/;
 
-  add_proto qw/void vp9_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_highbd_fdct4x4 sse2/;
-
   add_proto qw/void vp9_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp9_highbd_fdct8x8_1/;
 
-  add_proto qw/void vp9_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_highbd_fdct8x8 sse2/;
-
   add_proto qw/void vp9_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp9_highbd_fdct16x16_1/;
 
-  add_proto qw/void vp9_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_highbd_fdct16x16 sse2/;
-
   add_proto qw/void vp9_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp9_highbd_fdct32x32_1/;
 
diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c
index 471e238..c8ef618 100644
--- a/vp9/common/vp9_seg_common.c
+++ b/vp9/common/vp9_seg_common.c
@@ -54,7 +54,7 @@
   seg->feature_data[segment_id][feature_id] = seg_data;
 }
 
-const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = {
+const vpx_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = {
   2,  4,  6,  8, 10, 12,
   0, -1, -2, -3, -4, -5, -6, -7
 };
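
(Only the index type is renamed; the table still encodes a depth-3 binary tree over the eight segment ids: a positive entry is the offset of the next node pair, a non-positive entry is a negated leaf. A sketch of the walk a tree reader performs over such a table, with read_bool as a hypothetical stand-in for the arithmetic decoder:

    #include <stdint.h>

    typedef int8_t vpx_tree_index;

    /* Descend from the root; each internal node consumes one coded bool
     * whose probability lives at probs[i >> 1]. Leaves are stored negated. */
    static int tree_decode(const vpx_tree_index *tree, const uint8_t *probs,
                           int (*read_bool)(uint8_t prob)) {
      vpx_tree_index i = 0;
      while ((i = tree[i + read_bool(probs[i >> 1])]) > 0)
        continue;
      return -i;  /* segment id 0..7 for vp9_segment_tree */
    }
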
diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h
index 95c9918..5b75d8d 100644
--- a/vp9/common/vp9_seg_common.h
+++ b/vp9/common/vp9_seg_common.h
@@ -11,7 +11,7 @@
 #ifndef VP9_COMMON_VP9_SEG_COMMON_H_
 #define VP9_COMMON_VP9_SEG_COMMON_H_
 
-#include "vp9/common/vp9_prob.h"
+#include "vpx_dsp/prob.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -42,8 +42,8 @@
   uint8_t abs_delta;
   uint8_t temporal_update;
 
-  vp9_prob tree_probs[SEG_TREE_PROBS];
-  vp9_prob pred_probs[PREDICTION_PROBS];
+  vpx_prob tree_probs[SEG_TREE_PROBS];
+  vpx_prob pred_probs[PREDICTION_PROBS];
 
   int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX];
   unsigned int feature_mask[MAX_SEGMENTS];
@@ -76,7 +76,7 @@
   return seg->feature_data[segment_id][feature_id];
 }
 
-extern const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
+extern const vpx_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/common/x86/vp9_idct_sse2.asm b/vp9/common/x86/vp9_idct_sse2.asm
index 9619e37..69b68e6 100644
--- a/vp9/common/x86/vp9_idct_sse2.asm
+++ b/vp9/common/x86/vp9_idct_sse2.asm
@@ -11,38 +11,50 @@
 
 SECTION .text
 
+%macro REORDER_INPUTS 0
+  ; registers hold rows a c d b; reorder to a b c d
+  SWAP 1, 3, 2
+%endmacro
+
 %macro TRANSFORM_COLS 0
+  ; input:
+  ; m0 a
+  ; m1 b
+  ; m2 c
+  ; m3 d
   paddw           m0,        m2
   psubw           m3,        m1
-  psubw           m4,        m0,        m3
-  psraw           m4,        1
-  psubw           m5,        m4,        m1 ;b1
-  psubw           m4,        m2 ;c1
+
+  ; widen to 32 bits so the (a - d) subtract cannot overflow int16
+  punpcklwd       m4,        m0
+  punpcklwd       m5,        m3
+  psrad           m4,        16
+  psrad           m5,        16
+  psubd           m4,        m5
+  psrad           m4,        1
+  packssdw        m4,        m4             ; e
+
+  psubw           m5,        m4,        m1  ; b
+  psubw           m4,        m2             ; c
   psubw           m0,        m5
   paddw           m3,        m4
-                                ; m0 a0
-  SWAP            1,         4  ; m1 c1
-  SWAP            2,         3  ; m2 d1
-  SWAP            3,         5  ; m3 b1
+                                ; m0 a
+  SWAP            1,         5  ; m1 b
+  SWAP            2,         4  ; m2 c
+                                ; m3 d
 %endmacro
 
 %macro TRANSPOSE_4X4 0
-  SWAP            4, 0
-  SWAP            5, 2
-  punpcklwd       m4,        m1
-  pshufd          m0,        m4, 0x0e
-  punpcklwd       m5,        m3
-  pshufd          m2,        m5, 0x0e
-  SWAP            1, 4
-  SWAP            3, 0
-  punpckldq       m1,        m5
-  pshufd          m4,        m1, 0x0e
-  punpckldq       m3,        m2
-  pshufd          m0,        m3, 0x0e
-  SWAP            2, 3, 0, 1, 4
+  punpcklwd       m0,        m2
+  punpcklwd       m1,        m3
+  mova            m2,        m0
+  punpcklwd       m0,        m1
+  punpckhwd       m2,        m1
+  pshufd          m1,        m0, 0x0e
+  pshufd          m3,        m2, 0x0e
 %endmacro
 
-; transposes a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
+; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
 %macro TRANSPOSE_4X4_WIDE 0
   mova            m3, m0
   punpcklwd       m0, m1
@@ -68,16 +80,18 @@
 %endmacro
 
 INIT_XMM sse2
-cglobal iwht4x4_16_add, 3, 4, 7, input, output, stride
-  mova            m0,        [inputq +  0] ;a1
-  mova            m1,        [inputq + 16] ;c1
+cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
+  mova            m0,        [inputq +  0]
+  mova            m1,        [inputq + 16]
 
   psraw           m0,        2
   psraw           m1,        2
 
   TRANSPOSE_4X4_WIDE
+  REORDER_INPUTS
   TRANSFORM_COLS
   TRANSPOSE_4X4
+  REORDER_INPUTS
   TRANSFORM_COLS
 
   pxor            m4, m4
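
(The rewritten TRANSFORM_COLS matches the C reference butterfly for the inverse 4-point Walsh-Hadamard transform; the one subtlety is computing e = (a - d) >> 1 at 32-bit width, since a - d can overflow int16 for extreme lossless inputs. A C sketch of one column, with the corresponding instructions noted; cf. the reference vp9_iwht4x4_16_add_c:

    #include <stdint.h>

    /* One column of the inverse 4-point WHT, as TRANSFORM_COLS computes it. */
    static void iwht4_col(int16_t *a, int16_t *b, int16_t *c, int16_t *d) {
      int16_t e;
      *a = (int16_t)(*a + *c);                 /* paddw m0, m2 */
      *d = (int16_t)(*d - *b);                 /* psubw m3, m1 */
      e = (int16_t)(((int32_t)*a - *d) >> 1);  /* widened subtract + psrad */
      *b = (int16_t)(e - *b);                  /* psubw m5, m4, m1 */
      *c = (int16_t)(e - *c);                  /* psubw m4, m2 */
      *a = (int16_t)(*a - *b);                 /* psubw m0, m5 */
      *d = (int16_t)(*d + *c);                 /* paddw m3, m4 */
    }

The REORDER_INPUTS calls exist because the coefficients arrive in row order a c d b, which this butterfly expects as a b c d.)
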
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index b3cf3fd..c6d3bf1 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -14,6 +14,8 @@
 #include "./vp9_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 
+#include "vpx_dsp/bitreader_buffer.h"
+#include "vpx_dsp/bitreader.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/mem_ops.h"
@@ -38,8 +40,6 @@
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_dsubexp.h"
-#include "vp9/decoder/vp9_read_bit_buffer.h"
-#include "vp9/decoder/vp9_reader.h"
 
 #define MAX_VP9_HEADER_SIZE 80
 
@@ -74,19 +74,19 @@
   return len != 0 && len <= (size_t)(end - start);
 }
 
-static int decode_unsigned_max(struct vp9_read_bit_buffer *rb, int max) {
-  const int data = vp9_rb_read_literal(rb, get_unsigned_bits(max));
+static int decode_unsigned_max(struct vpx_read_bit_buffer *rb, int max) {
+  const int data = vpx_rb_read_literal(rb, get_unsigned_bits(max));
   return data > max ? max : data;
 }
 
-static TX_MODE read_tx_mode(vp9_reader *r) {
-  TX_MODE tx_mode = vp9_read_literal(r, 2);
+static TX_MODE read_tx_mode(vpx_reader *r) {
+  TX_MODE tx_mode = vpx_read_literal(r, 2);
   if (tx_mode == ALLOW_32X32)
-    tx_mode += vp9_read_bit(r);
+    tx_mode += vpx_read_bit(r);
   return tx_mode;
 }
 
-static void read_tx_mode_probs(struct tx_probs *tx_probs, vp9_reader *r) {
+static void read_tx_mode_probs(struct tx_probs *tx_probs, vpx_reader *r) {
   int i, j;
 
   for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
@@ -102,14 +102,14 @@
       vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]);
 }
 
-static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
+static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
   int i, j;
   for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
     for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
       vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
 }
 
-static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
+static void read_inter_mode_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
   int i, j;
   for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
     for (j = 0; j < INTER_MODES - 1; ++j)
@@ -117,9 +117,9 @@
 }
 
 static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm,
-                                                vp9_reader *r) {
+                                                vpx_reader *r) {
   if (is_compound_reference_allowed(cm)) {
-    return vp9_read_bit(r) ? (vp9_read_bit(r) ? REFERENCE_MODE_SELECT
+    return vpx_read_bit(r) ? (vpx_read_bit(r) ? REFERENCE_MODE_SELECT
                                               : COMPOUND_REFERENCE)
                            : SINGLE_REFERENCE;
   } else {
@@ -127,7 +127,7 @@
   }
 }
 
-static void read_frame_reference_mode_probs(VP9_COMMON *cm, vp9_reader *r) {
+static void read_frame_reference_mode_probs(VP9_COMMON *cm, vpx_reader *r) {
   FRAME_CONTEXT *const fc = cm->fc;
   int i;
 
@@ -146,14 +146,14 @@
       vp9_diff_update_prob(r, &fc->comp_ref_prob[i]);
 }
 
-static void update_mv_probs(vp9_prob *p, int n, vp9_reader *r) {
+static void update_mv_probs(vpx_prob *p, int n, vpx_reader *r) {
   int i;
   for (i = 0; i < n; ++i)
-    if (vp9_read(r, MV_UPDATE_PROB))
-      p[i] = (vp9_read_literal(r, 7) << 1) | 1;
+    if (vpx_read(r, MV_UPDATE_PROB))
+      p[i] = (vpx_read_literal(r, 7) << 1) | 1;
 }
 
-static void read_mv_probs(nmv_context *ctx, int allow_hp, vp9_reader *r) {
+static void read_mv_probs(nmv_context *ctx, int allow_hp, vpx_reader *r) {
   int i, j;
 
   update_mv_probs(ctx->joints, MV_JOINTS - 1, r);
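
(The MV probability update keeps its shape: when the per-field flag fires, a 7-bit literal is expanded to an odd 8-bit probability, so updated values always land in {1, 3, ..., 255} and can never hit the illegal 0. The mapping, as a sketch:

    #include <assert.h>
    #include <stdint.h>

    /* Expand the 7-bit bitstream value to the stored probability. */
    static uint8_t mv_prob_from_literal(int literal7) {
      assert(literal7 >= 0 && literal7 < 128);
      return (uint8_t)((literal7 << 1) | 1);  /* 0..127 -> 1, 3, ..., 255 */
    }
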
@@ -361,54 +361,38 @@
   }
 }
 
-static INLINE void dec_txfrm_block_to_raster_xy(int bwl,
-                                                TX_SIZE tx_size, int block,
-                                                int *x, int *y) {
-  const int tx_cols_log2 = bwl - tx_size;
-  const int tx_cols = 1 << tx_cols_log2;
-  const int raster_mb = block >> (tx_size << 1);
-  *x = (raster_mb & (tx_cols - 1)) << tx_size;
-  *y = (raster_mb >> tx_cols_log2) << tx_size;
-}
-
-struct intra_args {
-  MACROBLOCKD *xd;
-  vp9_reader *r;
-  int seg_id;
-};
-
-static void predict_and_reconstruct_intra_block(int plane, int block,
-                                                TX_SIZE tx_size, void *arg) {
-  struct intra_args *const args = (struct intra_args *)arg;
-  MACROBLOCKD *const xd = args->xd;
+static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd,
+                                                vpx_reader *r,
+                                                MB_MODE_INFO *const mbmi,
+                                                int plane,
+                                                int row, int col,
+                                                TX_SIZE tx_size) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  MODE_INFO *const mi = xd->mi[0];
-  const PREDICTION_MODE mode = (plane == 0) ? get_y_mode(mi, block)
-                                            : mi->mbmi.uv_mode;
-  int x, y;
+  PREDICTION_MODE mode = (plane == 0) ? mbmi->mode : mbmi->uv_mode;
   uint8_t *dst;
-  const int bwl = pd->n4_wl;
-  dec_txfrm_block_to_raster_xy(bwl, tx_size, block, &x, &y);
-  dst = &pd->dst.buf[4 * y * pd->dst.stride + 4 * x];
+  dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
 
-  vp9_predict_intra_block(xd, block >> (tx_size << 1),
-                          bwl, tx_size, mode,
+  if (mbmi->sb_type < BLOCK_8X8)
+    if (plane == 0)
+      mode = xd->mi[0]->bmi[(row << 1) + col].as_mode;
+
+  vp9_predict_intra_block(xd, pd->n4_wl, tx_size, mode,
                           dst, pd->dst.stride, dst, pd->dst.stride,
-                          x, y, plane);
+                          col, row, plane);
 
-  if (!mi->mbmi.skip) {
+  if (!mbmi->skip) {
     const TX_TYPE tx_type = (plane || xd->lossless) ?
         DCT_DCT : intra_mode_to_tx_type_lookup[mode];
     const scan_order *sc = (plane || xd->lossless) ?
         &vp9_default_scan_orders[tx_size] : &vp9_scan_orders[tx_size][tx_type];
-    const int eob = vp9_decode_block_tokens(xd, plane, sc, x, y, tx_size,
-                                            args->r, args->seg_id);
+    const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size,
+                                            r, mbmi->segment_id);
     inverse_transform_block_intra(xd, plane, tx_type, tx_size,
                                   dst, pd->dst.stride, eob);
   }
 }
 
-static int reconstruct_inter_block(MACROBLOCKD *const xd, vp9_reader *r,
+static int reconstruct_inter_block(MACROBLOCKD *const xd, vpx_reader *r,
                                    MB_MODE_INFO *const mbmi, int plane,
                                    int row, int col, TX_SIZE tx_size) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
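
(With the flat block index gone, predict_and_reconstruct_intra_block takes row/col directly; the only index arithmetic left is the sub-8x8 case, where each 4x4 luma unit carries its own mode in bmi[]. The 2x2 indexing, for reference:

    /* Sub-8x8 luma blocks store one mode per 4x4 unit in bmi[0..3], raster
     * order over the 2x2 grid, hence bmi[(row << 1) + col] above. */
    static int bmi_index(int row, int col) {
      return (row << 1) + col;  /* (0,0)->0 (0,1)->1 (1,0)->2 (1,1)->3 */
    }
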
@@ -776,58 +760,6 @@
   return MIN(mbmi->tx_size,  x);
 }
 
-// TODO(slavarnway): Eliminate the foreach_ functions in future commits.
-// NOTE: Jingning removed the foreach_ for recon inter in a previous commit.
-
-typedef void (*dec_foreach_transformed_block_visitor)(int plane, int block,
-                                                      TX_SIZE tx_size,
-                                                      void *arg);
-
-static void dec_foreach_transformed_block_in_plane(
-    const MACROBLOCKD *const xd,
-    int plane,
-    dec_foreach_transformed_block_visitor visit, void *arg) {
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const MB_MODE_INFO* mbmi = &xd->mi[0]->mbmi;
-  // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
-  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
-  // transform size varies per plane, look it up in a common way.
-  const TX_SIZE tx_size =
-      plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
-              : mbmi->tx_size;
-  const int num_4x4_w = pd->n4_w;
-  const int num_4x4_h = pd->n4_h;
-  const int step = 1 << (tx_size << 1);
-  int i = 0, r, c;
-
-  // If mb_to_right_edge is < 0 we are in a situation in which
-  // the current block size extends into the UMV and we won't
-  // visit the sub blocks that are wholly within the UMV.
-  const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
-      xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-  const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
-      xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-
-  // Keep track of the row and column of the blocks we use so that we know
-  // if we are in the unrestricted motion border.
-  for (r = 0; r < max_blocks_high; r += (1 << tx_size)) {
-    for (c = 0; c < num_4x4_w; c += (1 << tx_size)) {
-      // Skip visiting the sub blocks that are wholly within the UMV.
-      if (c < max_blocks_wide)
-        visit(plane, i, tx_size, arg);
-      i += step;
-    }
-  }
-}
-
-static void dec_foreach_transformed_block(const MACROBLOCKD* const xd,
-    dec_foreach_transformed_block_visitor visit, void *arg) {
-  int plane;
-
-  for (plane = 0; plane < MAX_MB_PLANE; ++plane)
-    dec_foreach_transformed_block_in_plane(xd, plane, visit, arg);
-}
-
 static INLINE void dec_reset_skip_context(MACROBLOCKD *xd) {
   int i;
   for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -880,7 +812,7 @@
 
 static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
                          int mi_row, int mi_col,
-                         vp9_reader *r, BLOCK_SIZE bsize,
+                         vpx_reader *r, BLOCK_SIZE bsize,
                          int bwl, int bhl) {
   VP9_COMMON *const cm = &pbi->common;
   const int less8x8 = bsize < BLOCK_8X8;
@@ -900,16 +832,33 @@
                          VPX_CODEC_CORRUPT_FRAME, "Invalid block size.");
   }
 
   vp9_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
 
   if (mbmi->skip) {
     dec_reset_skip_context(xd);
   }
 
   if (!is_inter_block(mbmi)) {
-    struct intra_args arg = {xd, r, mbmi->segment_id};
-    dec_foreach_transformed_block(xd,
-                                  predict_and_reconstruct_intra_block, &arg);
+    int plane;
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const TX_SIZE tx_size =
+          plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
+                  : mbmi->tx_size;
+      const int num_4x4_w = pd->n4_w;
+      const int num_4x4_h = pd->n4_h;
+      const int step = (1 << tx_size);
+      int row, col;
+      const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ?
+          0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+      const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
+          0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+      for (row = 0; row < max_blocks_high; row += step)
+        for (col = 0; col < max_blocks_wide; col += step)
+          predict_and_reconstruct_intra_block(xd, r, mbmi, plane,
+                                              row, col, tx_size);
+    }
   } else {
     // Prediction
     dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col);
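
(The inlined intra loop above carries over the UMV clamp from the removed foreach helper. mb_to_right_edge and mb_to_bottom_edge are, as elsewhere in the codebase, kept in 1/8-pel units and go negative when the block overhangs the frame, so shifting by (5 + subsampling) converts them into a negative count of 4x4 columns or rows to trim from the loop bound. A sketch of the width clamp:

    /* max_blocks_wide: number of 4x4 columns to visit in this plane.
     * mb_to_right_edge is in 1/8-pel units (negative = overhang), so
     * >> (5 + subsampling_x) yields 4x4 columns after subsampling. */
    static int max_blocks_wide(int n4_w, int mb_to_right_edge, int subsampling_x) {
      return n4_w + (mb_to_right_edge >= 0
                         ? 0
                         : mb_to_right_edge >> (5 + subsampling_x));
    }
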
@@ -944,7 +893,7 @@
     }
   }
 
-  xd->corrupted |= vp9_reader_has_error(r);
+  xd->corrupted |= vpx_reader_has_error(r);
 }
 
 static INLINE int dec_partition_plane_context(const MACROBLOCKD *xd,
@@ -974,19 +923,19 @@
 }
 
 static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                     vp9_reader *r,
+                                     vpx_reader *r,
                                      int has_rows, int has_cols, int bsl) {
   const int ctx = dec_partition_plane_context(xd, mi_row, mi_col, bsl);
-  const vp9_prob *const probs = get_partition_probs(xd, ctx);
+  const vpx_prob *const probs = get_partition_probs(xd, ctx);
   FRAME_COUNTS *counts = xd->counts;
   PARTITION_TYPE p;
 
   if (has_rows && has_cols)
-    p = (PARTITION_TYPE)vp9_read_tree(r, vp9_partition_tree, probs);
+    p = (PARTITION_TYPE)vpx_read_tree(r, vp9_partition_tree, probs);
   else if (!has_rows && has_cols)
-    p = vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
+    p = vpx_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
   else if (has_rows && !has_cols)
-    p = vp9_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
+    p = vpx_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
   else
     p = PARTITION_SPLIT;
 
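(The edge handling in read_partition falls straight out of the tree layout: probs[0] separates NONE from the rest, probs[1] HORZ from {VERT, SPLIT}, and probs[2] VERT from SPLIT, so when only one dimension is present a single bool with probs[1] or probs[2] decides between SPLIT and the one legal non-split type. For reference, a sketch of the shape of vp9_partition_tree, leaves negated:

    typedef int8_t vpx_tree_index;

    static const vpx_tree_index partition_tree[6] = {
      -0 /* PARTITION_NONE */,  2,
      -1 /* PARTITION_HORZ */,  4,
      -2 /* PARTITION_VERT */, -3 /* PARTITION_SPLIT */
    };
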
@@ -999,7 +948,7 @@
 // TODO(slavarnway): eliminate bsize and subsize in future commits
 static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd,
                              int mi_row, int mi_col,
-                             vp9_reader* r, BLOCK_SIZE bsize, int n4x4_l2) {
+                             vpx_reader* r, BLOCK_SIZE bsize, int n4x4_l2) {
   VP9_COMMON *const cm = &pbi->common;
   const int n8x8_l2 = n4x4_l2 - 1;
   const int num_8x8_wh = 1 << n8x8_l2;
@@ -1059,7 +1008,7 @@
                                 const uint8_t *data_end,
                                 size_t read_size,
                                 struct vpx_internal_error_info *error_info,
-                                vp9_reader *r,
+                                vpx_reader *r,
                                 vpx_decrypt_cb decrypt_cb,
                                 void *decrypt_state) {
   // Validate the calculated partition length. If the buffer
@@ -1069,16 +1018,16 @@
     vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
                        "Truncated packet or corrupt tile length");
 
-  if (vp9_reader_init(r, data, read_size, decrypt_cb, decrypt_state))
+  if (vpx_reader_init(r, data, read_size, decrypt_cb, decrypt_state))
     vpx_internal_error(error_info, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder %d", 1);
 }
 
 static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs,
-                                   vp9_reader *r) {
+                                   vpx_reader *r) {
   int i, j, k, l, m;
 
-  if (vp9_read_bit(r))
+  if (vpx_read_bit(r))
     for (i = 0; i < PLANE_TYPES; ++i)
       for (j = 0; j < REF_TYPES; ++j)
         for (k = 0; k < COEF_BANDS; ++k)
@@ -1088,7 +1037,7 @@
 }
 
 static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode,
-                            vp9_reader *r) {
+                            vpx_reader *r) {
     const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
     TX_SIZE tx_size;
     for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
@@ -1096,27 +1045,27 @@
 }
 
 static void setup_segmentation(struct segmentation *seg,
-                               struct vp9_read_bit_buffer *rb) {
+                               struct vpx_read_bit_buffer *rb) {
   int i, j;
 
   seg->update_map = 0;
   seg->update_data = 0;
 
-  seg->enabled = vp9_rb_read_bit(rb);
+  seg->enabled = vpx_rb_read_bit(rb);
   if (!seg->enabled)
     return;
 
   // Segmentation map update
-  seg->update_map = vp9_rb_read_bit(rb);
+  seg->update_map = vpx_rb_read_bit(rb);
   if (seg->update_map) {
     for (i = 0; i < SEG_TREE_PROBS; i++)
-      seg->tree_probs[i] = vp9_rb_read_bit(rb) ? vp9_rb_read_literal(rb, 8)
+      seg->tree_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8)
                                                : MAX_PROB;
 
-    seg->temporal_update = vp9_rb_read_bit(rb);
+    seg->temporal_update = vpx_rb_read_bit(rb);
     if (seg->temporal_update) {
       for (i = 0; i < PREDICTION_PROBS; i++)
-        seg->pred_probs[i] = vp9_rb_read_bit(rb) ? vp9_rb_read_literal(rb, 8)
+        seg->pred_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8)
                                                  : MAX_PROB;
     } else {
       for (i = 0; i < PREDICTION_PROBS; i++)
@@ -1125,21 +1074,21 @@
   }
 
   // Segmentation data update
-  seg->update_data = vp9_rb_read_bit(rb);
+  seg->update_data = vpx_rb_read_bit(rb);
   if (seg->update_data) {
-    seg->abs_delta = vp9_rb_read_bit(rb);
+    seg->abs_delta = vpx_rb_read_bit(rb);
 
     vp9_clearall_segfeatures(seg);
 
     for (i = 0; i < MAX_SEGMENTS; i++) {
       for (j = 0; j < SEG_LVL_MAX; j++) {
         int data = 0;
-        const int feature_enabled = vp9_rb_read_bit(rb);
+        const int feature_enabled = vpx_rb_read_bit(rb);
         if (feature_enabled) {
           vp9_enable_segfeature(seg, i, j);
           data = decode_unsigned_max(rb, vp9_seg_feature_data_max(j));
           if (vp9_is_segfeature_signed(j))
-            data = vp9_rb_read_bit(rb) ? -data : data;
+            data = vpx_rb_read_bit(rb) ? -data : data;
         }
         vp9_set_segdata(seg, i, j, data);
       }
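
(Feature data still goes through decode_unsigned_max(), which reads just enough bits for the field's range and clamps overshoot rather than erroring. The bit count matches get_unsigned_bits(): floor(log2(max)) + 1 for max > 0. A sketch:

    /* Bits needed to represent 0..max, as get_unsigned_bits() computes it;
     * e.g. max = 63 -> 6 bits. Values decoded above max clamp to max. */
    static int unsigned_bits(int max) {
      int n = 0;
      while (max > 0) { ++n; max >>= 1; }
      return n;
    }
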
@@ -1148,38 +1097,38 @@
 }
 
 static void setup_loopfilter(struct loopfilter *lf,
-                             struct vp9_read_bit_buffer *rb) {
-  lf->filter_level = vp9_rb_read_literal(rb, 6);
-  lf->sharpness_level = vp9_rb_read_literal(rb, 3);
+                             struct vpx_read_bit_buffer *rb) {
+  lf->filter_level = vpx_rb_read_literal(rb, 6);
+  lf->sharpness_level = vpx_rb_read_literal(rb, 3);
 
   // Read in loop filter deltas applied at the MB level based on mode or ref
   // frame.
   lf->mode_ref_delta_update = 0;
 
-  lf->mode_ref_delta_enabled = vp9_rb_read_bit(rb);
+  lf->mode_ref_delta_enabled = vpx_rb_read_bit(rb);
   if (lf->mode_ref_delta_enabled) {
-    lf->mode_ref_delta_update = vp9_rb_read_bit(rb);
+    lf->mode_ref_delta_update = vpx_rb_read_bit(rb);
     if (lf->mode_ref_delta_update) {
       int i;
 
       for (i = 0; i < MAX_REF_LF_DELTAS; i++)
-        if (vp9_rb_read_bit(rb))
-          lf->ref_deltas[i] = vp9_rb_read_signed_literal(rb, 6);
+        if (vpx_rb_read_bit(rb))
+          lf->ref_deltas[i] = vpx_rb_read_signed_literal(rb, 6);
 
       for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
-        if (vp9_rb_read_bit(rb))
-          lf->mode_deltas[i] = vp9_rb_read_signed_literal(rb, 6);
+        if (vpx_rb_read_bit(rb))
+          lf->mode_deltas[i] = vpx_rb_read_signed_literal(rb, 6);
     }
   }
 }
 
-static INLINE int read_delta_q(struct vp9_read_bit_buffer *rb) {
-  return vp9_rb_read_bit(rb) ? vp9_rb_read_signed_literal(rb, 4) : 0;
+static INLINE int read_delta_q(struct vpx_read_bit_buffer *rb) {
+  return vpx_rb_read_bit(rb) ? vpx_rb_read_signed_literal(rb, 4) : 0;
 }
 
 static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                               struct vp9_read_bit_buffer *rb) {
-  cm->base_qindex = vp9_rb_read_literal(rb, QINDEX_BITS);
+                               struct vpx_read_bit_buffer *rb) {
+  cm->base_qindex = vpx_rb_read_literal(rb, QINDEX_BITS);
   cm->y_dc_delta_q = read_delta_q(rb);
   cm->uv_dc_delta_q = read_delta_q(rb);
   cm->uv_ac_delta_q = read_delta_q(rb);
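
(read_delta_q() above follows the usual signed-literal convention: a presence flag, then a 4-bit magnitude, then a sign bit, so the decoded delta is 0 or any value in [-15, 15]. A sketch with read_bit as a hypothetical stand-in for the bit buffer; literals are read MSB first:

    /* Mirrors read_delta_q() / vpx_rb_read_signed_literal(). */
    static int read_delta_q_sketch(int (*read_bit)(void)) {
      int value = 0, i;
      if (!read_bit()) return 0;
      for (i = 0; i < 4; ++i) value = (value << 1) | read_bit();  /* 0..15 */
      return read_bit() ? -value : value;                         /* [-15, 15] */
    }
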
@@ -1221,19 +1170,19 @@
   }
 }
 
-static INTERP_FILTER read_interp_filter(struct vp9_read_bit_buffer *rb) {
+static INTERP_FILTER read_interp_filter(struct vpx_read_bit_buffer *rb) {
   const INTERP_FILTER literal_to_filter[] = { EIGHTTAP_SMOOTH,
                                               EIGHTTAP,
                                               EIGHTTAP_SHARP,
                                               BILINEAR };
-  return vp9_rb_read_bit(rb) ? SWITCHABLE
-                             : literal_to_filter[vp9_rb_read_literal(rb, 2)];
+  return vpx_rb_read_bit(rb) ? SWITCHABLE
+                             : literal_to_filter[vpx_rb_read_literal(rb, 2)];
 }
 
-static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
+static void setup_display_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
   cm->display_width = cm->width;
   cm->display_height = cm->height;
-  if (vp9_rb_read_bit(rb))
+  if (vpx_rb_read_bit(rb))
     vp9_read_frame_size(rb, &cm->display_width, &cm->display_height);
 }
 
@@ -1277,7 +1226,7 @@
   }
 }
 
-static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
+static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
   int width, height;
   BufferPool *const pool = cm->buffer_pool;
   vp9_read_frame_size(rb, &width, &height);
@@ -1316,13 +1265,13 @@
 }
 
 static void setup_frame_size_with_refs(VP9_COMMON *cm,
-                                       struct vp9_read_bit_buffer *rb) {
+                                       struct vpx_read_bit_buffer *rb) {
   int width, height;
   int found = 0, i;
   int has_valid_ref_frame = 0;
   BufferPool *const pool = cm->buffer_pool;
   for (i = 0; i < REFS_PER_FRAME; ++i) {
-    if (vp9_rb_read_bit(rb)) {
+    if (vpx_rb_read_bit(rb)) {
       YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
       width = buf->y_crop_width;
       height = buf->y_crop_height;
@@ -1388,14 +1337,14 @@
   pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
 }
 
-static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
+static void setup_tile_info(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
   int min_log2_tile_cols, max_log2_tile_cols, max_ones;
   vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
 
   // columns
   max_ones = max_log2_tile_cols - min_log2_tile_cols;
   cm->log2_tile_cols = min_log2_tile_cols;
-  while (max_ones-- && vp9_rb_read_bit(rb))
+  while (max_ones-- && vpx_rb_read_bit(rb))
     cm->log2_tile_cols++;
 
   if (cm->log2_tile_cols > 6)
@@ -1403,9 +1352,9 @@
                        "Invalid number of tile columns");
 
   // rows
-  cm->log2_tile_rows = vp9_rb_read_bit(rb);
+  cm->log2_tile_rows = vpx_rb_read_bit(rb);
   if (cm->log2_tile_rows)
-    cm->log2_tile_rows += vp9_rb_read_bit(rb);
+    cm->log2_tile_rows += vpx_rb_read_bit(rb);
 }
 
 typedef struct TileBuffer {
@@ -1607,7 +1556,7 @@
 
   if (pbi->frame_parallel_decode)
     vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX);
-  return vp9_reader_find_end(&tile_data->bit_reader);
+  return vpx_reader_find_end(&tile_data->bit_reader);
 }
 
 static int tile_worker_hook(TileWorkerData *const tile_data,
@@ -1786,7 +1735,7 @@
     if (final_worker > -1) {
       TileWorkerData *const tile_data =
           (TileWorkerData*)pbi->tile_workers[final_worker].data1;
-      bit_reader_end = vp9_reader_find_end(&tile_data->bit_reader);
+      bit_reader_end = vpx_reader_find_end(&tile_data->bit_reader);
       final_worker = -1;
     }
 
@@ -1809,9 +1758,9 @@
 }
 
 static void read_bitdepth_colorspace_sampling(
-    VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
+    VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
   if (cm->profile >= PROFILE_2) {
-    cm->bit_depth = vp9_rb_read_bit(rb) ? VPX_BITS_12 : VPX_BITS_10;
+    cm->bit_depth = vpx_rb_read_bit(rb) ? VPX_BITS_12 : VPX_BITS_10;
 #if CONFIG_VP9_HIGHBITDEPTH
     cm->use_highbitdepth = 1;
 #endif
@@ -1821,16 +1770,16 @@
     cm->use_highbitdepth = 0;
 #endif
   }
-  cm->color_space = vp9_rb_read_literal(rb, 3);
+  cm->color_space = vpx_rb_read_literal(rb, 3);
   if (cm->color_space != VPX_CS_SRGB) {
-    vp9_rb_read_bit(rb);  // [16,235] (including xvycc) vs [0,255] range
+    vpx_rb_read_bit(rb);  // [16,235] (including xvycc) vs [0,255] range
     if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
-      cm->subsampling_x = vp9_rb_read_bit(rb);
-      cm->subsampling_y = vp9_rb_read_bit(rb);
+      cm->subsampling_x = vpx_rb_read_bit(rb);
+      cm->subsampling_y = vpx_rb_read_bit(rb);
       if (cm->subsampling_x == 1 && cm->subsampling_y == 1)
         vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                            "4:2:0 color not supported in profile 1 or 3");
-      if (vp9_rb_read_bit(rb))
+      if (vpx_rb_read_bit(rb))
         vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                            "Reserved bit set");
     } else {
@@ -1841,7 +1790,7 @@
       // Note if colorspace is SRGB then 4:4:4 chroma sampling is assumed.
       // 4:2:2 or 4:4:0 chroma sampling is not allowed.
       cm->subsampling_y = cm->subsampling_x = 0;
-      if (vp9_rb_read_bit(rb))
+      if (vpx_rb_read_bit(rb))
         vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                            "Reserved bit set");
     } else {
@@ -1852,7 +1801,7 @@
 }
 
 static size_t read_uncompressed_header(VP9Decoder *pbi,
-                                       struct vp9_read_bit_buffer *rb) {
+                                       struct vpx_read_bit_buffer *rb) {
   VP9_COMMON *const cm = &pbi->common;
   BufferPool *const pool = cm->buffer_pool;
   RefCntBuffer *const frame_bufs = pool->frame_bufs;
@@ -1862,20 +1811,25 @@
   cm->last_frame_type = cm->frame_type;
   cm->last_intra_only = cm->intra_only;
 
-  if (vp9_rb_read_literal(rb, 2) != VP9_FRAME_MARKER)
+  if (vpx_rb_read_literal(rb, 2) != VP9_FRAME_MARKER)
       vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                          "Invalid frame marker");
 
   cm->profile = vp9_read_profile(rb);
-
+#if CONFIG_VP9_HIGHBITDEPTH
   if (cm->profile >= MAX_PROFILES)
     vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                        "Unsupported bitstream profile");
+#else
+  if (cm->profile >= PROFILE_2)
+    vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                       "Unsupported bitstream profile");
+#endif
 
-  cm->show_existing_frame = vp9_rb_read_bit(rb);
+  cm->show_existing_frame = vpx_rb_read_bit(rb);
   if (cm->show_existing_frame) {
     // Show an existing frame directly.
-    const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)];
+    const int frame_to_show = cm->ref_frame_map[vpx_rb_read_literal(rb, 3)];
     lock_buffer_pool(pool);
     if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
       unlock_buffer_pool(pool);
@@ -1897,9 +1851,9 @@
     return 0;
   }
 
-  cm->frame_type = (FRAME_TYPE) vp9_rb_read_bit(rb);
-  cm->show_frame = vp9_rb_read_bit(rb);
-  cm->error_resilient_mode = vp9_rb_read_bit(rb);
+  cm->frame_type = (FRAME_TYPE) vpx_rb_read_bit(rb);
+  cm->show_frame = vpx_rb_read_bit(rb);
+  cm->error_resilient_mode = vpx_rb_read_bit(rb);
 
   if (cm->frame_type == KEY_FRAME) {
     if (!vp9_read_sync_code(rb))
@@ -1920,10 +1874,10 @@
       pbi->need_resync = 0;
     }
   } else {
-    cm->intra_only = cm->show_frame ? 0 : vp9_rb_read_bit(rb);
+    cm->intra_only = cm->show_frame ? 0 : vpx_rb_read_bit(rb);
 
     cm->reset_frame_context = cm->error_resilient_mode ?
-        0 : vp9_rb_read_literal(rb, 2);
+        0 : vpx_rb_read_literal(rb, 2);
 
     if (cm->intra_only) {
       if (!vp9_read_sync_code(rb))
@@ -1944,26 +1898,26 @@
 #endif
       }
 
-      pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
+      pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES);
       setup_frame_size(cm, rb);
       if (pbi->need_resync) {
         memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
         pbi->need_resync = 0;
       }
     } else if (pbi->need_resync != 1) {  /* Skip if need resync */
-      pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
+      pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES);
       for (i = 0; i < REFS_PER_FRAME; ++i) {
-        const int ref = vp9_rb_read_literal(rb, REF_FRAMES_LOG2);
+        const int ref = vpx_rb_read_literal(rb, REF_FRAMES_LOG2);
         const int idx = cm->ref_frame_map[ref];
         RefBuffer *const ref_frame = &cm->frame_refs[i];
         ref_frame->idx = idx;
         ref_frame->buf = &frame_bufs[idx].buf;
-        cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb);
+        cm->ref_frame_sign_bias[LAST_FRAME + i] = vpx_rb_read_bit(rb);
       }
 
       setup_frame_size_with_refs(cm, rb);
 
-      cm->allow_high_precision_mv = vp9_rb_read_bit(rb);
+      cm->allow_high_precision_mv = vpx_rb_read_bit(rb);
       cm->interp_filter = read_interp_filter(rb);
 
       for (i = 0; i < REFS_PER_FRAME; ++i) {
@@ -1995,8 +1949,8 @@
   }
 
   if (!cm->error_resilient_mode) {
-    cm->refresh_frame_context = vp9_rb_read_bit(rb);
-    cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb);
+    cm->refresh_frame_context = vpx_rb_read_bit(rb);
+    cm->frame_parallel_decoding_mode = vpx_rb_read_bit(rb);
   } else {
     cm->refresh_frame_context = 0;
     cm->frame_parallel_decoding_mode = 1;
@@ -2004,7 +1958,7 @@
 
   // This flag will be overridden by the call to vp9_setup_past_independence
   // below, forcing the use of context 0 for those frame types.
-  cm->frame_context_idx = vp9_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);
+  cm->frame_context_idx = vpx_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);
 
   // Generate next_ref_frame_map.
   lock_buffer_pool(pool);
@@ -2039,7 +1993,7 @@
   setup_segmentation_dequant(cm);
 
   setup_tile_info(cm, rb);
-  sz = vp9_rb_read_literal(rb, 16);
+  sz = vpx_rb_read_literal(rb, 16);
 
   if (sz == 0)
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
@@ -2053,10 +2007,10 @@
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
   FRAME_CONTEXT *const fc = cm->fc;
-  vp9_reader r;
+  vpx_reader r;
   int k;
 
-  if (vp9_reader_init(&r, data, partition_size, pbi->decrypt_cb,
+  if (vpx_reader_init(&r, data, partition_size, pbi->decrypt_cb,
                       pbi->decrypt_state))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder 0");
@@ -2097,7 +2051,7 @@
     read_mv_probs(nmvc, cm->allow_high_precision_mv, &r);
   }
 
-  return vp9_reader_has_error(&r);
+  return vpx_reader_has_error(&r);
 }
 
 #ifdef NDEBUG
@@ -2137,9 +2091,9 @@
 }
 #endif  // NDEBUG
 
-static struct vp9_read_bit_buffer *init_read_bit_buffer(
+static struct vpx_read_bit_buffer *init_read_bit_buffer(
     VP9Decoder *pbi,
-    struct vp9_read_bit_buffer *rb,
+    struct vpx_read_bit_buffer *rb,
     const uint8_t *data,
     const uint8_t *data_end,
     uint8_t clear_data[MAX_VP9_HEADER_SIZE]) {
@@ -2160,23 +2114,23 @@
 
 //------------------------------------------------------------------------------
 
-int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb) {
-  return vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_0 &&
-         vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_1 &&
-         vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_2;
+int vp9_read_sync_code(struct vpx_read_bit_buffer *const rb) {
+  return vpx_rb_read_literal(rb, 8) == VP9_SYNC_CODE_0 &&
+         vpx_rb_read_literal(rb, 8) == VP9_SYNC_CODE_1 &&
+         vpx_rb_read_literal(rb, 8) == VP9_SYNC_CODE_2;
 }
 
-void vp9_read_frame_size(struct vp9_read_bit_buffer *rb,
+void vp9_read_frame_size(struct vpx_read_bit_buffer *rb,
                          int *width, int *height) {
-  *width = vp9_rb_read_literal(rb, 16) + 1;
-  *height = vp9_rb_read_literal(rb, 16) + 1;
+  *width = vpx_rb_read_literal(rb, 16) + 1;
+  *height = vpx_rb_read_literal(rb, 16) + 1;
 }
 
-BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb) {
-  int profile = vp9_rb_read_bit(rb);
-  profile |= vp9_rb_read_bit(rb) << 1;
+BITSTREAM_PROFILE vp9_read_profile(struct vpx_read_bit_buffer *rb) {
+  int profile = vpx_rb_read_bit(rb);
+  profile |= vpx_rb_read_bit(rb) << 1;
   if (profile > 2)
-    profile += vp9_rb_read_bit(rb);
+    profile += vpx_rb_read_bit(rb);
   return (BITSTREAM_PROFILE) profile;
 }
 
@@ -2185,7 +2139,7 @@
                       const uint8_t **p_data_end) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
-  struct vp9_read_bit_buffer rb;
+  struct vpx_read_bit_buffer rb;
   int context_updated = 0;
   uint8_t clear_data[MAX_VP9_HEADER_SIZE];
   const size_t first_partition_size = read_uncompressed_header(pbi,
@@ -2201,7 +2155,7 @@
     return;
   }
 
-  data += vp9_rb_bytes_read(&rb);
+  data += vpx_rb_bytes_read(&rb);
   if (!read_is_valid(data, first_partition_size, data_end))
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                        "Truncated packet or corrupt header length");
diff --git a/vp9/decoder/vp9_decodeframe.h b/vp9/decoder/vp9_decodeframe.h
index a876e7c..05af706 100644
--- a/vp9/decoder/vp9_decodeframe.h
+++ b/vp9/decoder/vp9_decodeframe.h
@@ -17,12 +17,12 @@
 #endif
 
 struct VP9Decoder;
-struct vp9_read_bit_buffer;
+struct vpx_read_bit_buffer;
 
-int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb);
-void vp9_read_frame_size(struct vp9_read_bit_buffer *rb,
+int vp9_read_sync_code(struct vpx_read_bit_buffer *const rb);
+void vp9_read_frame_size(struct vpx_read_bit_buffer *rb,
                          int *width, int *height);
-BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb);
+BITSTREAM_PROFILE vp9_read_profile(struct vpx_read_bit_buffer *rb);
 
 void vp9_decode_frame(struct VP9Decoder *pbi,
                       const uint8_t *data, const uint8_t *data_end,
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index d42a654..341e6d7 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -21,14 +21,13 @@
 
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/decoder/vp9_decodeframe.h"
-#include "vp9/decoder/vp9_reader.h"
 
-static PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
-  return (PREDICTION_MODE)vp9_read_tree(r, vp9_intra_mode_tree, p);
+static PREDICTION_MODE read_intra_mode(vpx_reader *r, const vpx_prob *p) {
+  return (PREDICTION_MODE)vpx_read_tree(r, vp9_intra_mode_tree, p);
 }
 
 static PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                         vp9_reader *r, int size_group) {
+                                         vpx_reader *r, int size_group) {
   const PREDICTION_MODE y_mode =
       read_intra_mode(r, cm->fc->y_mode_prob[size_group]);
   FRAME_COUNTS *counts = xd->counts;
@@ -38,7 +37,7 @@
 }
 
 static PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                          vp9_reader *r,
+                                          vpx_reader *r,
                                           PREDICTION_MODE y_mode) {
   const PREDICTION_MODE uv_mode = read_intra_mode(r,
                                          cm->fc->uv_mode_prob[y_mode]);
@@ -49,8 +48,8 @@
 }
 
 static PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                       vp9_reader *r, int ctx) {
-  const int mode = vp9_read_tree(r, vp9_inter_mode_tree,
+                                       vpx_reader *r, int ctx) {
+  const int mode = vpx_read_tree(r, vp9_inter_mode_tree,
                                  cm->fc->inter_mode_probs[ctx]);
   FRAME_COUNTS *counts = xd->counts;
   if (counts)
@@ -59,20 +58,20 @@
   return NEARESTMV + mode;
 }
 
-static int read_segment_id(vp9_reader *r, const struct segmentation *seg) {
-  return vp9_read_tree(r, vp9_segment_tree, seg->tree_probs);
+static int read_segment_id(vpx_reader *r, const struct segmentation *seg) {
+  return vpx_read_tree(r, vp9_segment_tree, seg->tree_probs);
 }
 
 static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                     TX_SIZE max_tx_size, vp9_reader *r) {
+                                     TX_SIZE max_tx_size, vpx_reader *r) {
   FRAME_COUNTS *counts = xd->counts;
   const int ctx = get_tx_size_context(xd);
-  const vp9_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc->tx_probs);
-  int tx_size = vp9_read(r, tx_probs[0]);
+  const vpx_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc->tx_probs);
+  int tx_size = vpx_read(r, tx_probs[0]);
   if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
-    tx_size += vp9_read(r, tx_probs[1]);
+    tx_size += vpx_read(r, tx_probs[1]);
     if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
-      tx_size += vp9_read(r, tx_probs[2]);
+      tx_size += vpx_read(r, tx_probs[2]);
   }
 
   if (counts)
@@ -81,7 +80,7 @@
 }
 
 static TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
-                            int allow_select, vp9_reader *r) {
+                            int allow_select, vpx_reader *r) {
   TX_MODE tx_mode = cm->tx_mode;
   BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
@@ -129,7 +128,7 @@
 
 static int read_intra_segment_id(VP9_COMMON *const cm, int mi_offset,
                                  int x_mis, int y_mis,
-                                 vp9_reader *r) {
+                                 vpx_reader *r) {
   struct segmentation *const seg = &cm->seg;
   int segment_id;
 
@@ -148,7 +147,7 @@
 }
 
 static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                                 int mi_row, int mi_col, vp9_reader *r) {
+                                 int mi_row, int mi_col, vpx_reader *r) {
   struct segmentation *const seg = &cm->seg;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   int predicted_segment_id, segment_id;
@@ -174,8 +173,8 @@
   }
 
   if (seg->temporal_update) {
-    const vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
-    mbmi->seg_id_predicted = vp9_read(r, pred_prob);
+    const vpx_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
+    mbmi->seg_id_predicted = vpx_read(r, pred_prob);
     segment_id = mbmi->seg_id_predicted ? predicted_segment_id
                                         : read_segment_id(r, seg);
   } else {
@@ -186,12 +185,12 @@
 }
 
 static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
-                     int segment_id, vp9_reader *r) {
+                     int segment_id, vpx_reader *r) {
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
     const int ctx = vp9_get_skip_context(xd);
-    const int skip = vp9_read(r, cm->fc->skip_probs[ctx]);
+    const int skip = vpx_read(r, cm->fc->skip_probs[ctx]);
     FRAME_COUNTS *counts = xd->counts;
     if (counts)
       ++counts->skip[ctx][skip];
@@ -201,7 +200,7 @@
 
 static void read_intra_frame_mode_info(VP9_COMMON *const cm,
                                        MACROBLOCKD *const xd,
-                                       int mi_row, int mi_col, vp9_reader *r) {
+                                       int mi_row, int mi_col, vpx_reader *r) {
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const MODE_INFO *above_mi = xd->above_mi;
@@ -249,16 +248,16 @@
   mbmi->uv_mode = read_intra_mode(r, vp9_kf_uv_mode_prob[mbmi->mode]);
 }
 
-static int read_mv_component(vp9_reader *r,
+static int read_mv_component(vpx_reader *r,
                              const nmv_component *mvcomp, int usehp) {
   int mag, d, fr, hp;
-  const int sign = vp9_read(r, mvcomp->sign);
-  const int mv_class = vp9_read_tree(r, vp9_mv_class_tree, mvcomp->classes);
+  const int sign = vpx_read(r, mvcomp->sign);
+  const int mv_class = vpx_read_tree(r, vp9_mv_class_tree, mvcomp->classes);
   const int class0 = mv_class == MV_CLASS_0;
 
   // Integer part
   if (class0) {
-    d = vp9_read_tree(r, vp9_mv_class0_tree, mvcomp->class0);
+    d = vpx_read_tree(r, vp9_mv_class0_tree, mvcomp->class0);
     mag = 0;
   } else {
     int i;
@@ -266,16 +265,16 @@
 
     d = 0;
     for (i = 0; i < n; ++i)
-      d |= vp9_read(r, mvcomp->bits[i]) << i;
+      d |= vpx_read(r, mvcomp->bits[i]) << i;
     mag = CLASS0_SIZE << (mv_class + 2);
   }
 
   // Fractional part
-  fr = vp9_read_tree(r, vp9_mv_fp_tree, class0 ? mvcomp->class0_fp[d]
+  fr = vpx_read_tree(r, vp9_mv_fp_tree, class0 ? mvcomp->class0_fp[d]
                                                : mvcomp->fp);
 
   // High precision part (if hp is not used, it defaults to 1)
-  hp = usehp ? vp9_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp)
+  hp = usehp ? vpx_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp)
              : 1;
 
   // Result
@@ -283,11 +282,11 @@
   return sign ? -mag : mag;
 }
 
-static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref,
+static INLINE void read_mv(vpx_reader *r, MV *mv, const MV *ref,
                            const nmv_context *ctx,
                            nmv_context_counts *counts, int allow_hp) {
   const MV_JOINT_TYPE joint_type =
-      (MV_JOINT_TYPE)vp9_read_tree(r, vp9_mv_joint_tree, ctx->joints);
+      (MV_JOINT_TYPE)vpx_read_tree(r, vp9_mv_joint_tree, ctx->joints);
   const int use_hp = allow_hp && vp9_use_mv_hp(ref);
   MV diff = {0, 0};
 
@@ -305,11 +304,11 @@
 
 static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm,
                                                 const MACROBLOCKD *xd,
-                                                vp9_reader *r) {
+                                                vpx_reader *r) {
   if (cm->reference_mode == REFERENCE_MODE_SELECT) {
     const int ctx = vp9_get_reference_mode_context(cm, xd);
     const REFERENCE_MODE mode =
-        (REFERENCE_MODE)vp9_read(r, cm->fc->comp_inter_prob[ctx]);
+        (REFERENCE_MODE)vpx_read(r, cm->fc->comp_inter_prob[ctx]);
     FRAME_COUNTS *counts = xd->counts;
     if (counts)
       ++counts->comp_inter[ctx][mode];
@@ -321,7 +320,7 @@
 
 // Read the reference frame
 static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                            vp9_reader *r,
+                            vpx_reader *r,
                             int segment_id, MV_REFERENCE_FRAME ref_frame[2]) {
   FRAME_CONTEXT *const fc = cm->fc;
   FRAME_COUNTS *counts = xd->counts;
@@ -336,19 +335,19 @@
     if (mode == COMPOUND_REFERENCE) {
       const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
       const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd);
-      const int bit = vp9_read(r, fc->comp_ref_prob[ctx]);
+      const int bit = vpx_read(r, fc->comp_ref_prob[ctx]);
       if (counts)
         ++counts->comp_ref[ctx][bit];
       ref_frame[idx] = cm->comp_fixed_ref;
       ref_frame[!idx] = cm->comp_var_ref[bit];
     } else if (mode == SINGLE_REFERENCE) {
       const int ctx0 = vp9_get_pred_context_single_ref_p1(xd);
-      const int bit0 = vp9_read(r, fc->single_ref_prob[ctx0][0]);
+      const int bit0 = vpx_read(r, fc->single_ref_prob[ctx0][0]);
       if (counts)
         ++counts->single_ref[ctx0][0][bit0];
       if (bit0) {
         const int ctx1 = vp9_get_pred_context_single_ref_p2(xd);
-        const int bit1 = vp9_read(r, fc->single_ref_prob[ctx1][1]);
+        const int bit1 = vpx_read(r, fc->single_ref_prob[ctx1][1]);
         if (counts)
           ++counts->single_ref[ctx1][1][bit1];
         ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
@@ -366,10 +365,10 @@
 
 static INLINE INTERP_FILTER read_switchable_interp_filter(
     VP9_COMMON *const cm, MACROBLOCKD *const xd,
-    vp9_reader *r) {
+    vpx_reader *r) {
   const int ctx = vp9_get_pred_context_switchable_interp(xd);
   const INTERP_FILTER type =
-      (INTERP_FILTER)vp9_read_tree(r, vp9_switchable_interp_tree,
+      (INTERP_FILTER)vpx_read_tree(r, vp9_switchable_interp_tree,
                                    cm->fc->switchable_interp_prob[ctx]);
   FRAME_COUNTS *counts = xd->counts;
   if (counts)
@@ -379,7 +378,7 @@
 
 static void read_intra_block_mode_info(VP9_COMMON *const cm,
                                        MACROBLOCKD *const xd, MODE_INFO *mi,
-                                       vp9_reader *r) {
+                                       vpx_reader *r) {
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mi->mbmi.sb_type;
   int i;
@@ -421,7 +420,7 @@
                             PREDICTION_MODE mode,
                             int_mv mv[2], int_mv ref_mv[2],
                             int_mv nearest_mv[2], int_mv near_mv[2],
-                            int is_compound, int allow_hp, vp9_reader *r) {
+                            int is_compound, int allow_hp, vpx_reader *r) {
   int i;
   int ret = 1;
 
@@ -462,12 +461,12 @@
 }
 
 static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                               int segment_id, vp9_reader *r) {
+                               int segment_id, vpx_reader *r) {
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
     return get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
   } else {
     const int ctx = vp9_get_intra_inter_context(xd);
-    const int is_inter = vp9_read(r, cm->fc->intra_inter_prob[ctx]);
+    const int is_inter = vpx_read(r, cm->fc->intra_inter_prob[ctx]);
     FRAME_COUNTS *counts = xd->counts;
     if (counts)
       ++counts->intra_inter[ctx][is_inter];
@@ -484,7 +483,7 @@
 static void read_inter_block_mode_info(VP9Decoder *const pbi,
                                        MACROBLOCKD *const xd,
                                        MODE_INFO *const mi,
-                                       int mi_row, int mi_col, vp9_reader *r) {
+                                       int mi_row, int mi_col, vpx_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
@@ -586,7 +585,7 @@
 
 static void read_inter_frame_mode_info(VP9Decoder *const pbi,
                                        MACROBLOCKD *const xd,
-                                       int mi_row, int mi_col, vp9_reader *r) {
+                                       int mi_row, int mi_col, vpx_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
@@ -605,8 +604,8 @@
     read_intra_block_mode_info(cm, xd, mi, r);
 }
 
-void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
-                        int mi_row, int mi_col, vp9_reader *r,
+void vpx_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
+                        int mi_row, int mi_col, vpx_reader *r,
                         int x_mis, int y_mis) {
   VP9_COMMON *const cm = &pbi->common;
   MODE_INFO *const mi = xd->mi[0];
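
read_mv_component() above decodes a sign, a magnitude class, an integer offset d, a 2-bit fraction fr, and a high-precision bit hp; the combining step itself sits in the elided "// Result" hunk. A sketch of how those fields plausibly assemble into a 1/8-pel magnitude (the CLASS0_SIZE value of 2 and the final combining formula are assumptions here, not taken from the diff; mv_component_sketch() is an illustrative name):

static int mv_component_sketch(int sign, int mv_class, int d, int fr, int hp) {
  /* Integer base: class 0 starts at 0, class n at CLASS0_SIZE << (n + 2),
   * with CLASS0_SIZE assumed to be 2. */
  int mag = (mv_class == 0) ? 0 : (2 << (mv_class + 2));
  /* Assumed combining step: offset d, fraction fr, high-precision bit hp. */
  mag += ((d << 3) | (fr << 1) | hp) + 1;
  return sign ? -mag : mag;
}
/* e.g. sign 0, class 0, d 1, fr 2, hp 1 -> (8 | 4 | 1) + 1 = 14,
 * i.e. 14/8 = 1.75 pixels. */
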
diff --git a/vp9/decoder/vp9_decodemv.h b/vp9/decoder/vp9_decodemv.h
index 53bac8c..75f568c 100644
--- a/vp9/decoder/vp9_decodemv.h
+++ b/vp9/decoder/vp9_decodemv.h
@@ -11,15 +11,16 @@
 #ifndef VP9_DECODER_VP9_DECODEMV_H_
 #define VP9_DECODER_VP9_DECODEMV_H_
 
+#include "vpx_dsp/bitreader.h"
+
 #include "vp9/decoder/vp9_decoder.h"
-#include "vp9/decoder/vp9_reader.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
-                        int mi_row, int mi_col, vp9_reader *r,
+void vpx_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
+                        int mi_row, int mi_col, vpx_reader *r,
                         int x_mis, int y_mis);
 
 #ifdef __cplusplus
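
For reference while reading the vp9_decodemv.c hunks above, read_selected_tx_size() reduces to a three-level binary decision tree. A sketch with a mock bit source in place of vpx_read() (the name is illustrative; assumes max_tx_size is TX_32X32 so every branch is reachable):

static int selected_tx_size_sketch(const int **bits) {
  int tx_size = *(*bits)++;   /* 0     -> TX_4X4   (0) */
  if (tx_size != 0) {
    tx_size += *(*bits)++;    /* 1,0   -> TX_8X8   (1) */
    if (tx_size != 1)
      tx_size += *(*bits)++;  /* 1,1,0 -> TX_16X16 (2); 1,1,1 -> TX_32X32 (3) */
  }
  return tx_size;
}
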
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index af47f85..915f9dc 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -14,6 +14,7 @@
 #include "./vpx_config.h"
 
 #include "vpx/vpx_codec.h"
+#include "vpx_dsp/bitreader.h"
 #include "vpx_scale/yv12config.h"
 #include "vpx_util/vpx_thread.h"
 
@@ -21,7 +22,6 @@
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_ppflags.h"
 #include "vp9/decoder/vp9_dthread.h"
-#include "vp9/decoder/vp9_reader.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -30,7 +30,7 @@
 // TODO(hkuang): combine this with TileWorkerData.
 typedef struct TileData {
   VP9_COMMON *cm;
-  vp9_reader bit_reader;
+  vpx_reader bit_reader;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
   /* dqcoeff is shared by all the planes, so planes must be decoded serially */
   DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
@@ -38,7 +38,7 @@
 
 typedef struct TileWorkerData {
   struct VP9Decoder *pbi;
-  vp9_reader bit_reader;
+  vpx_reader bit_reader;
   FRAME_COUNTS counts;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
   /* dqcoeff is shared by all the planes, so planes must be decoded serially */
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 5596c9e..e4412dc 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -38,10 +38,10 @@
        ++coef_counts[band][ctx][token];                     \
   } while (0)
 
-static INLINE int read_coeff(const vp9_prob *probs, int n, vp9_reader *r) {
+static INLINE int read_coeff(const vpx_prob *probs, int n, vpx_reader *r) {
   int i, val = 0;
   for (i = 0; i < n; ++i)
-    val = (val << 1) | vp9_read(r, probs[i]);
+    val = (val << 1) | vpx_read(r, probs[i]);
   return val;
 }
 
@@ -49,15 +49,15 @@
                         PLANE_TYPE type,
                         tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
                         int ctx, const int16_t *scan, const int16_t *nb,
-                        vp9_reader *r) {
+                        vpx_reader *r) {
   FRAME_COUNTS *counts = xd->counts;
   const int max_eob = 16 << (tx_size << 1);
   const FRAME_CONTEXT *const fc = xd->fc;
   const int ref = is_inter_block(&xd->mi[0]->mbmi);
   int band, c = 0;
-  const vp9_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+  const vpx_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
       fc->coef_probs[tx_size][type][ref];
-  const vp9_prob *prob;
+  const vpx_prob *prob;
   unsigned int (*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1];
   unsigned int (*eob_branch_count)[COEFF_CONTEXTS];
   uint8_t token_cache[32 * 32];
@@ -117,12 +117,12 @@
     prob = coef_probs[band][ctx];
     if (counts)
       ++eob_branch_count[band][ctx];
-    if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) {
+    if (!vpx_read(r, prob[EOB_CONTEXT_NODE])) {
       INCREMENT_COUNT(EOB_MODEL_TOKEN);
       break;
     }
 
-    while (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
+    while (!vpx_read(r, prob[ZERO_CONTEXT_NODE])) {
       INCREMENT_COUNT(ZERO_TOKEN);
       dqv = dq[1];
       token_cache[scan[c]] = 0;
@@ -134,13 +134,13 @@
       prob = coef_probs[band][ctx];
     }
 
-    if (!vp9_read(r, prob[ONE_CONTEXT_NODE])) {
+    if (!vpx_read(r, prob[ONE_CONTEXT_NODE])) {
       INCREMENT_COUNT(ONE_TOKEN);
       token = ONE_TOKEN;
       val = 1;
     } else {
       INCREMENT_COUNT(TWO_TOKEN);
-      token = vp9_read_tree(r, vp9_coef_con_tree,
+      token = vpx_read_tree(r, vp9_coef_con_tree,
                             vp9_pareto8_full[prob[PIVOT_NODE] - 1]);
       switch (token) {
         case TWO_TOKEN:
@@ -188,13 +188,13 @@
     v = (val * dqv) >> dq_shift;
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
 #if CONFIG_VP9_HIGHBITDEPTH
-    dqcoeff[scan[c]] = highbd_check_range((vp9_read_bit(r) ? -v : v),
+    dqcoeff[scan[c]] = highbd_check_range((vpx_read_bit(r) ? -v : v),
                                           xd->bd);
 #else
-    dqcoeff[scan[c]] = check_range(vp9_read_bit(r) ? -v : v);
+    dqcoeff[scan[c]] = check_range(vpx_read_bit(r) ? -v : v);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #else
-    dqcoeff[scan[c]] = vp9_read_bit(r) ? -v : v;
+    dqcoeff[scan[c]] = vpx_read_bit(r) ? -v : v;
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
     token_cache[scan[c]] = vp9_pt_energy_class[token];
     ++c;
@@ -253,7 +253,7 @@
 int vp9_decode_block_tokens(MACROBLOCKD *xd,
                             int plane, const scan_order *sc,
                             int x, int y,
-                            TX_SIZE tx_size, vp9_reader *r,
+                            TX_SIZE tx_size, vpx_reader *r,
                             int seg_id) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int16_t *const dequant = pd->seg_dequant[seg_id];
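
read_coeff() above assembles n probability-coded bits most-significant-first. A tiny sketch with an array standing in for the vpx_reader (next_bit() is a stand-in, not a libvpx call):

static int next_bit(const int **b) { return *(*b)++; }

static int read_coeff_sketch(const int **b, int n) {
  int i, val = 0;
  for (i = 0; i < n; ++i)
    val = (val << 1) | next_bit(b);  /* mirrors the read_coeff() loop */
  return val;
}
/* bits {1, 0, 1} with n = 3 yield 0b101 = 5 */
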
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index cf0e48a..d242d44 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -12,8 +12,8 @@
 #ifndef VP9_DECODER_VP9_DETOKENIZE_H_
 #define VP9_DECODER_VP9_DETOKENIZE_H_
 
+#include "vpx_dsp/bitreader.h"
 #include "vp9/decoder/vp9_decoder.h"
-#include "vp9/decoder/vp9_reader.h"
 #include "vp9/common/vp9_scan.h"
 
 #ifdef __cplusplus
@@ -23,7 +23,7 @@
 int vp9_decode_block_tokens(MACROBLOCKD *xd,
                             int plane, const scan_order *sc,
                             int x, int y,
-                            TX_SIZE tx_size, vp9_reader *r,
+                            TX_SIZE tx_size, vpx_reader *r,
                             int seg_id);
 
 #ifdef __cplusplus
diff --git a/vp9/decoder/vp9_dsubexp.c b/vp9/decoder/vp9_dsubexp.c
index b33c3b7..4fbc6db 100644
--- a/vp9/decoder/vp9_dsubexp.c
+++ b/vp9/decoder/vp9_dsubexp.c
@@ -21,11 +21,11 @@
   return (v & 1) ? m - ((v + 1) >> 1) : m + (v >> 1);
 }
 
-static int decode_uniform(vp9_reader *r) {
+static int decode_uniform(vpx_reader *r) {
   const int l = 8;
   const int m = (1 << l) - 191;
-  const int v = vp9_read_literal(r, l - 1);
-  return v < m ?  v : (v << 1) - m + vp9_read_bit(r);
+  const int v = vpx_read_literal(r, l - 1);
+  return v < m ?  v : (v << 1) - m + vpx_read_bit(r);
 }
 
 static int inv_remap_prob(int v, int m) {
@@ -58,19 +58,19 @@
   }
 }
 
-static int decode_term_subexp(vp9_reader *r) {
-  if (!vp9_read_bit(r))
-    return vp9_read_literal(r, 4);
-  if (!vp9_read_bit(r))
-    return vp9_read_literal(r, 4) + 16;
-  if (!vp9_read_bit(r))
-    return vp9_read_literal(r, 5) + 32;
+static int decode_term_subexp(vpx_reader *r) {
+  if (!vpx_read_bit(r))
+    return vpx_read_literal(r, 4);
+  if (!vpx_read_bit(r))
+    return vpx_read_literal(r, 4) + 16;
+  if (!vpx_read_bit(r))
+    return vpx_read_literal(r, 5) + 32;
   return decode_uniform(r) + 64;
 }
 
-void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p) {
-  if (vp9_read(r, DIFF_UPDATE_PROB)) {
+void vp9_diff_update_prob(vpx_reader *r, vpx_prob* p) {
+  if (vpx_read(r, DIFF_UPDATE_PROB)) {
     const int delp = decode_term_subexp(r);
-    *p = (vp9_prob)inv_remap_prob(delp, *p);
+    *p = (vpx_prob)inv_remap_prob(delp, *p);
   }
 }
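
decode_term_subexp() above splits a probability delta into four prefix-coded ranges, with decode_uniform() covering the tail. A self-contained restatement using a mock bit source (take() and literal() are stand-ins for vpx_read_bit() and vpx_read_literal()):

static int take(const int **b) { return *(*b)++; }

static int literal(const int **b, int n) {  /* n bits, MSB first */
  int v = 0;
  while (n--) v = (v << 1) | take(b);
  return v;
}

static int term_subexp_sketch(const int **b) {
  if (!take(b)) return literal(b, 4);       /*  0..15 in 1+4 bits */
  if (!take(b)) return literal(b, 4) + 16;  /* 16..31 in 2+4 bits */
  if (!take(b)) return literal(b, 5) + 32;  /* 32..63 in 3+5 bits */
  /* decode_uniform(): m = (1 << 8) - 191 = 65; v < 65 stands alone,
   * larger values take one extra bit, giving 64..254 overall. */
  {
    const int v = literal(b, 7);
    return (v < 65 ? v : (v << 1) - 65 + take(b)) + 64;
  }
}
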
diff --git a/vp9/decoder/vp9_dsubexp.h b/vp9/decoder/vp9_dsubexp.h
index 436f434..a8bcc70 100644
--- a/vp9/decoder/vp9_dsubexp.h
+++ b/vp9/decoder/vp9_dsubexp.h
@@ -12,13 +12,13 @@
 #ifndef VP9_DECODER_VP9_DSUBEXP_H_
 #define VP9_DECODER_VP9_DSUBEXP_H_
 
-#include "vp9/decoder/vp9_reader.h"
+#include "vpx_dsp/bitreader.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p);
+void vp9_diff_update_prob(vpx_reader *r, vpx_prob* p);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/decoder/vp9_read_bit_buffer.h b/vp9/decoder/vp9_read_bit_buffer.h
deleted file mode 100644
index fc88bd7..0000000
--- a/vp9/decoder/vp9_read_bit_buffer.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_DECODER_VP9_READ_BIT_BUFFER_H_
-#define VP9_DECODER_VP9_READ_BIT_BUFFER_H_
-
-#include <limits.h>
-
-#include "vpx/vpx_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef void (*vp9_rb_error_handler)(void *data);
-
-struct vp9_read_bit_buffer {
-  const uint8_t *bit_buffer;
-  const uint8_t *bit_buffer_end;
-  size_t bit_offset;
-
-  void *error_handler_data;
-  vp9_rb_error_handler error_handler;
-};
-
-size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb);
-
-int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb);
-
-int vp9_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits);
-
-int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb, int bits);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP9_DECODER_VP9_READ_BIT_BUFFER_H_
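
The header deleted above gives way to the vpx_dsp bit-buffer reader, which keeps the same struct shape as far as this diff shows. For orientation, a sketch of how a read-bit helper over this layout conventionally works in libvpx (MSB-first within each byte; treat the details as illustrative rather than a copy of the replacement):

#include <stddef.h>
#include <stdint.h>

static int rb_read_bit_sketch(const uint8_t *buf, size_t *bit_offset) {
  const size_t off = *bit_offset;
  const int bit = (buf[off >> 3] >> (7 - (int)(off & 7))) & 1;  /* MSB first */
  *bit_offset = off + 1;
  return bit;
}
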
diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c
index fecab57..40d7e87 100644
--- a/vp9/encoder/arm/neon/vp9_avg_neon.c
+++ b/vp9/encoder/arm/neon/vp9_avg_neon.c
@@ -100,3 +100,17 @@
   hbuf += 8;
   vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
 }
+
+int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width) {
+  int i;
+  uint16x8_t vec_sum = vdupq_n_u16(0);
+
+  for (i = 0; i < width; i += 16) {
+    const uint8x16_t vec_row = vld1q_u8(ref);
+    vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
+    vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
+    ref += 16;
+  }
+
+  return horizontal_add_u16x8(vec_sum);
+}
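
The new vp9_int_pro_col_neon() above folds rows of 16 bytes into a 16-bit accumulator via widening adds, then reduces horizontally. A scalar equivalent for reference (a sketch; int_pro_col_c_sketch() is an illustrative name, and width is assumed to be a multiple of 16, as the vector loop requires):

#include <stdint.h>

static int16_t int_pro_col_c_sketch(const uint8_t *ref, int width) {
  int i;
  uint16_t sum = 0;   /* 16-bit, like the NEON accumulator */
  for (i = 0; i < width; ++i)
    sum += ref[i];    /* scalar form of the vaddw_u8 widening add */
  return (int16_t)sum;
}
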
diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c
index a6d4797..97106ff 100644
--- a/vp9/encoder/arm/neon/vp9_dct_neon.c
+++ b/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -10,6 +10,7 @@
 
 #include <arm_neon.h>
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 
 #include "vp9/common/vp9_blockd.h"
@@ -49,193 +50,3 @@
                        quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
                        dequant_ptr, eob_ptr, scan_ptr, iscan_ptr);
 }
-
-void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
-  int i;
-  // stage 1
-  int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
-  int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
-  int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
-  int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
-  int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
-  int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
-  int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
-  int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
-  for (i = 0; i < 2; ++i) {
-    int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
-    const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
-    const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
-    const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
-    const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
-    const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
-    const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
-    const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
-    const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
-    // fdct4(step, step);
-    int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
-    int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
-    int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
-    int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
-    // fdct4(step, step);
-    int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-    int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-    int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-    int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-    int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
-    int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
-    int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
-    int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
-    v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
-    v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
-    v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
-    v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
-    v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
-    v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-      out_0 = vcombine_s16(a, c);  // 00 01 02 03 40 41 42 43
-      out_2 = vcombine_s16(e, g);  // 20 21 22 23 60 61 62 63
-      out_4 = vcombine_s16(b, d);  // 04 05 06 07 44 45 46 47
-      out_6 = vcombine_s16(f, h);  // 24 25 26 27 64 65 66 67
-    }
-    // Stage 2
-    v_x0 = vsubq_s16(v_s6, v_s5);
-    v_x1 = vaddq_s16(v_s6, v_s5);
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x8_t ab = vcombine_s16(a, b);
-      const int16x8_t cd = vcombine_s16(c, d);
-      // Stage 3
-      v_x0 = vaddq_s16(v_s4, ab);
-      v_x1 = vsubq_s16(v_s4, ab);
-      v_x2 = vsubq_s16(v_s7, cd);
-      v_x3 = vaddq_s16(v_s7, cd);
-    }
-    // Stage 4
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
-    v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
-    v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
-    v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
-    v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
-    v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
-    v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
-    v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
-    v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
-    v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
-    v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-      out_1 = vcombine_s16(a, c);  // 10 11 12 13 50 51 52 53
-      out_3 = vcombine_s16(e, g);  // 30 31 32 33 70 71 72 73
-      out_5 = vcombine_s16(b, d);  // 14 15 16 17 54 55 56 57
-      out_7 = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
-    }
-    // transpose 8x8
-    {
-      // 00 01 02 03 40 41 42 43
-      // 10 11 12 13 50 51 52 53
-      // 20 21 22 23 60 61 62 63
-      // 30 31 32 33 70 71 72 73
-      // 04 05 06 07 44 45 46 47
-      // 14 15 16 17 54 55 56 57
-      // 24 25 26 27 64 65 66 67
-      // 34 35 36 37 74 75 76 77
-      const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0),
-                                            vreinterpretq_s32_s16(out_2));
-      const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1),
-                                            vreinterpretq_s32_s16(out_3));
-      const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4),
-                                            vreinterpretq_s32_s16(out_6));
-      const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5),
-                                            vreinterpretq_s32_s16(out_7));
-      const int16x8x2_t r01_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
-                    vreinterpretq_s16_s32(r13_s32.val[0]));
-      const int16x8x2_t r23_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
-                    vreinterpretq_s16_s32(r13_s32.val[1]));
-      const int16x8x2_t r45_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
-                    vreinterpretq_s16_s32(r57_s32.val[0]));
-      const int16x8x2_t r67_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
-                    vreinterpretq_s16_s32(r57_s32.val[1]));
-      input_0 = r01_s16.val[0];
-      input_1 = r01_s16.val[1];
-      input_2 = r23_s16.val[0];
-      input_3 = r23_s16.val[1];
-      input_4 = r45_s16.val[0];
-      input_5 = r45_s16.val[1];
-      input_6 = r67_s16.val[0];
-      input_7 = r67_s16.val[1];
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      // 06 16 26 36 46 56 66 76
-      // 07 17 27 37 47 57 67 77
-    }
-  }  // for
-  {
-    // from vp9_dct_sse2.c
-    // Post-condition (division by two)
-    //    division of two 16 bits signed numbers using shifts
-    //    n / 2 = (n - (n >> 15)) >> 1
-    const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
-    const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
-    const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
-    const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
-    const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
-    const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
-    const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
-    const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
-    input_0 = vhsubq_s16(input_0, sign_in0);
-    input_1 = vhsubq_s16(input_1, sign_in1);
-    input_2 = vhsubq_s16(input_2, sign_in2);
-    input_3 = vhsubq_s16(input_3, sign_in3);
-    input_4 = vhsubq_s16(input_4, sign_in4);
-    input_5 = vhsubq_s16(input_5, sign_in5);
-    input_6 = vhsubq_s16(input_6, sign_in6);
-    input_7 = vhsubq_s16(input_7, sign_in7);
-    // store results
-    vst1q_s16(&final_output[0 * 8], input_0);
-    vst1q_s16(&final_output[1 * 8], input_1);
-    vst1q_s16(&final_output[2 * 8], input_2);
-    vst1q_s16(&final_output[3 * 8], input_3);
-    vst1q_s16(&final_output[4 * 8], input_4);
-    vst1q_s16(&final_output[5 * 8], input_5);
-    vst1q_s16(&final_output[6 * 8], input_6);
-    vst1q_s16(&final_output[7 * 8], input_7);
-  }
-}
-
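
The deleted post-condition above leaned on the shift identity n / 2 = (n - (n >> 15)) >> 1 for int16 values. A quick check of why it rounds toward zero, matching C division (assumes arithmetic right shift on signed values, which the NEON code also relies on):

#include <assert.h>
#include <stdint.h>

static int16_t half_toward_zero(int16_t n) {
  /* n >> 15 is 0 for n >= 0 and -1 for n < 0, so negative inputs get +1
   * before the halving shift, i.e. rounding toward zero. */
  return (int16_t)((n - (n >> 15)) >> 1);
}

int main(void) {
  assert(half_toward_zero(7) == 3);    /*  7 / 2 ==  3 */
  assert(half_toward_zero(-7) == -3);  /* -7 / 2 == -3, not -4 */
  return 0;
}
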
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 6270bf4..b619063 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -347,7 +347,10 @@
   // For video conference clips, if the background has high motion in the
   // current frame because of camera movement, set this frame as the golden
   // frame. Use 70% and 5% as the thresholds for golden frame refreshing.
-  if (cnt1 * 10 > (70 * rows * cols) && cnt2 * 20 < cnt1) {
+  // Also, force a golden frame update if this frame will change the
+  // resolution (resize_pending != 0).
+  if (cpi->resize_pending != 0 ||
+     (cnt1 * 10 > (70 * rows * cols) && cnt2 * 20 < cnt1)) {
     vp9_cyclic_refresh_set_golden_update(cpi);
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
 
@@ -562,4 +565,5 @@
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   memset(cr->map, 0, cm->mi_rows * cm->mi_cols);
   cr->sb_index = 0;
+  cpi->refresh_golden_frame = 1;
 }
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 4ca4083..f06bd56 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -13,6 +13,7 @@
 #include <limits.h>
 
 #include "vpx/vpx_encoder.h"
+#include "vpx_dsp/bitwriter_buffer.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem_ops.h"
 
@@ -32,7 +33,6 @@
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_subexp.h"
 #include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_write_bit_buffer.h"
 
 static const struct vp9_token intra_mode_encodings[INTRA_MODES] = {
   {0, 1}, {6, 3}, {28, 5}, {30, 5}, {58, 6}, {59, 6}, {126, 7}, {127, 7},
@@ -44,27 +44,27 @@
 static const struct vp9_token inter_mode_encodings[INTER_MODES] =
   {{2, 2}, {6, 3}, {0, 1}, {7, 3}};
 
-static void write_intra_mode(vp9_writer *w, PREDICTION_MODE mode,
-                             const vp9_prob *probs) {
+static void write_intra_mode(vpx_writer *w, PREDICTION_MODE mode,
+                             const vpx_prob *probs) {
   vp9_write_token(w, vp9_intra_mode_tree, probs, &intra_mode_encodings[mode]);
 }
 
-static void write_inter_mode(vp9_writer *w, PREDICTION_MODE mode,
-                             const vp9_prob *probs) {
+static void write_inter_mode(vpx_writer *w, PREDICTION_MODE mode,
+                             const vpx_prob *probs) {
   assert(is_inter_mode(mode));
   vp9_write_token(w, vp9_inter_mode_tree, probs,
                   &inter_mode_encodings[INTER_OFFSET(mode)]);
 }
 
-static void encode_unsigned_max(struct vp9_write_bit_buffer *wb,
+static void encode_unsigned_max(struct vpx_write_bit_buffer *wb,
                                 int data, int max) {
-  vp9_wb_write_literal(wb, data, get_unsigned_bits(max));
+  vpx_wb_write_literal(wb, data, get_unsigned_bits(max));
 }
 
-static void prob_diff_update(const vp9_tree_index *tree,
-                             vp9_prob probs[/*n - 1*/],
+static void prob_diff_update(const vpx_tree_index *tree,
+                             vpx_prob probs[/*n - 1*/],
                              const unsigned int counts[/*n - 1*/],
-                             int n, vp9_writer *w) {
+                             int n, vpx_writer *w) {
   int i;
   unsigned int branch_ct[32][2];
 
@@ -77,32 +77,32 @@
 }
 
 static void write_selected_tx_size(const VP9_COMMON *cm,
-                                   const MACROBLOCKD *xd, vp9_writer *w) {
+                                   const MACROBLOCKD *xd, vpx_writer *w) {
   TX_SIZE tx_size = xd->mi[0]->mbmi.tx_size;
   BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
-  const vp9_prob *const tx_probs = get_tx_probs2(max_tx_size, xd,
+  const vpx_prob *const tx_probs = get_tx_probs2(max_tx_size, xd,
                                                  &cm->fc->tx_probs);
-  vp9_write(w, tx_size != TX_4X4, tx_probs[0]);
+  vpx_write(w, tx_size != TX_4X4, tx_probs[0]);
   if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
-    vp9_write(w, tx_size != TX_8X8, tx_probs[1]);
+    vpx_write(w, tx_size != TX_8X8, tx_probs[1]);
     if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
-      vp9_write(w, tx_size != TX_16X16, tx_probs[2]);
+      vpx_write(w, tx_size != TX_16X16, tx_probs[2]);
   }
 }
 
 static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                      int segment_id, const MODE_INFO *mi, vp9_writer *w) {
+                      int segment_id, const MODE_INFO *mi, vpx_writer *w) {
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
     const int skip = mi->mbmi.skip;
-    vp9_write(w, skip, vp9_get_skip_prob(cm, xd));
+    vpx_write(w, skip, vp9_get_skip_prob(cm, xd));
     return skip;
   }
 }
 
-static void update_skip_probs(VP9_COMMON *cm, vp9_writer *w,
+static void update_skip_probs(VP9_COMMON *cm, vpx_writer *w,
                               FRAME_COUNTS *counts) {
   int k;
 
@@ -110,7 +110,7 @@
     vp9_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k]);
 }
 
-static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w,
+static void update_switchable_interp_probs(VP9_COMMON *cm, vpx_writer *w,
                                            FRAME_COUNTS *counts) {
   int j;
   for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
@@ -119,7 +119,7 @@
                      counts->switchable_interp[j], SWITCHABLE_FILTERS, w);
 }
 
-static void pack_mb_tokens(vp9_writer *w,
+static void pack_mb_tokens(vpx_writer *w,
                            TOKENEXTRA **tp, const TOKENEXTRA *const stop,
                            vpx_bit_depth_t bit_depth) {
   TOKENEXTRA *p = *tp;
@@ -179,12 +179,12 @@
 
         do {
           const int bb = (v >> --n) & 1;
-          vp9_write(w, bb, pb[i >> 1]);
+          vpx_write(w, bb, pb[i >> 1]);
           i = b->tree[i + bb];
         } while (n);
       }
 
-      vp9_write_bit(w, e & 1);
+      vpx_write_bit(w, e & 1);
     }
     ++p;
   }
@@ -192,7 +192,7 @@
   *tp = p + (p->token == EOSB_TOKEN);
 }
 
-static void write_segment_id(vp9_writer *w, const struct segmentation *seg,
+static void write_segment_id(vpx_writer *w, const struct segmentation *seg,
                              int segment_id) {
   if (seg->enabled && seg->update_map)
     vp9_write_tree(w, vp9_segment_tree, seg->tree_probs, segment_id, 3, 0);
@@ -200,7 +200,7 @@
 
 // This function encodes the reference frame
 static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                             vp9_writer *w) {
+                             vpx_writer *w) {
   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const int is_compound = has_second_ref(mbmi);
   const int segment_id = mbmi->segment_id;
@@ -215,27 +215,27 @@
     // does the feature use compound prediction or not
     // (if not specified at the frame/segment level)
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
-      vp9_write(w, is_compound, vp9_get_reference_mode_prob(cm, xd));
+      vpx_write(w, is_compound, vp9_get_reference_mode_prob(cm, xd));
     } else {
       assert(!is_compound == (cm->reference_mode == SINGLE_REFERENCE));
     }
 
     if (is_compound) {
-      vp9_write(w, mbmi->ref_frame[0] == GOLDEN_FRAME,
+      vpx_write(w, mbmi->ref_frame[0] == GOLDEN_FRAME,
                 vp9_get_pred_prob_comp_ref_p(cm, xd));
     } else {
       const int bit0 = mbmi->ref_frame[0] != LAST_FRAME;
-      vp9_write(w, bit0, vp9_get_pred_prob_single_ref_p1(cm, xd));
+      vpx_write(w, bit0, vp9_get_pred_prob_single_ref_p1(cm, xd));
       if (bit0) {
         const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME;
-        vp9_write(w, bit1, vp9_get_pred_prob_single_ref_p2(cm, xd));
+        vpx_write(w, bit1, vp9_get_pred_prob_single_ref_p2(cm, xd));
       }
     }
   }
 }
 
 static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
-                                vp9_writer *w) {
+                                vpx_writer *w) {
   VP9_COMMON *const cm = &cpi->common;
   const nmv_context *nmvc = &cm->fc->nmvc;
   const MACROBLOCK *const x = &cpi->td.mb;
@@ -254,8 +254,8 @@
   if (seg->update_map) {
     if (seg->temporal_update) {
       const int pred_flag = mbmi->seg_id_predicted;
-      vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
-      vp9_write(w, pred_flag, pred_prob);
+      vpx_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
+      vpx_write(w, pred_flag, pred_prob);
       if (!pred_flag)
         write_segment_id(w, seg, segment_id);
     } else {
@@ -266,7 +266,7 @@
   skip = write_skip(cm, xd, segment_id, mi, w);
 
   if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
-    vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd));
+    vpx_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd));
 
   if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
       !(is_inter && skip)) {
@@ -290,7 +290,7 @@
     write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mode]);
   } else {
     const int mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
-    const vp9_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx];
+    const vpx_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx];
     write_ref_frames(cm, xd, w);
 
     // If segment skip is not enabled, code the mode.
@@ -339,7 +339,7 @@
 }
 
 static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                              MODE_INFO **mi_8x8, vp9_writer *w) {
+                              MODE_INFO **mi_8x8, vpx_writer *w) {
   const struct segmentation *const seg = &cm->seg;
   const MODE_INFO *const mi = mi_8x8[0];
   const MODE_INFO *const above_mi = xd->above_mi;
@@ -375,7 +375,7 @@
 }
 
 static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
-                          vp9_writer *w, TOKENEXTRA **tok,
+                          vpx_writer *w, TOKENEXTRA **tok,
                           const TOKENEXTRA *const tok_end,
                           int mi_row, int mi_col) {
   const VP9_COMMON *const cm = &cpi->common;
@@ -405,9 +405,9 @@
 static void write_partition(const VP9_COMMON *const cm,
                             const MACROBLOCKD *const xd,
                             int hbs, int mi_row, int mi_col,
-                            PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) {
+                            PARTITION_TYPE p, BLOCK_SIZE bsize, vpx_writer *w) {
   const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
-  const vp9_prob *const probs = xd->partition_probs[ctx];
+  const vpx_prob *const probs = xd->partition_probs[ctx];
   const int has_rows = (mi_row + hbs) < cm->mi_rows;
   const int has_cols = (mi_col + hbs) < cm->mi_cols;
 
@@ -415,17 +415,17 @@
     vp9_write_token(w, vp9_partition_tree, probs, &partition_encodings[p]);
   } else if (!has_rows && has_cols) {
     assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
-    vp9_write(w, p == PARTITION_SPLIT, probs[1]);
+    vpx_write(w, p == PARTITION_SPLIT, probs[1]);
   } else if (has_rows && !has_cols) {
     assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
-    vp9_write(w, p == PARTITION_SPLIT, probs[2]);
+    vpx_write(w, p == PARTITION_SPLIT, probs[2]);
   } else {
     assert(p == PARTITION_SPLIT);
   }
 }
 
 static void write_modes_sb(VP9_COMP *cpi,
-                           const TileInfo *const tile, vp9_writer *w,
+                           const TileInfo *const tile, vpx_writer *w,
                            TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
                            int mi_row, int mi_col, BLOCK_SIZE bsize) {
   const VP9_COMMON *const cm = &cpi->common;
@@ -483,7 +483,7 @@
 }
 
 static void write_modes(VP9_COMP *cpi,
-                        const TileInfo *const tile, vp9_writer *w,
+                        const TileInfo *const tile, vpx_writer *w,
                         TOKENEXTRA **tok, const TOKENEXTRA *const tok_end) {
   const VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
@@ -528,12 +528,12 @@
   }
 }
 
-static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
+static void update_coef_probs_common(vpx_writer* const bc, VP9_COMP *cpi,
                                      TX_SIZE tx_size,
                                      vp9_coeff_stats *frame_branch_ct,
                                      vp9_coeff_probs_model *new_coef_probs) {
   vp9_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size];
-  const vp9_prob upd = DIFF_UPDATE_PROB;
+  const vpx_prob upd = DIFF_UPDATE_PROB;
   const int entropy_nodes_update = UNCONSTRAINED_NODES;
   int i, j, k, l, t;
   int stepsize = cpi->sf.coeff_prob_appx_step;
@@ -548,8 +548,8 @@
           for (k = 0; k < COEF_BANDS; ++k) {
             for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
               for (t = 0; t < entropy_nodes_update; ++t) {
-                vp9_prob newp = new_coef_probs[i][j][k][l][t];
-                const vp9_prob oldp = old_coef_probs[i][j][k][l][t];
+                vpx_prob newp = new_coef_probs[i][j][k][l][t];
+                const vpx_prob oldp = old_coef_probs[i][j][k][l][t];
                 int s;
                 int u = 0;
                 if (t == PIVOT_NODE)
@@ -575,19 +575,19 @@
       // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
       /* Is coef updated at all */
       if (update[1] == 0 || savings < 0) {
-        vp9_write_bit(bc, 0);
+        vpx_write_bit(bc, 0);
         return;
       }
-      vp9_write_bit(bc, 1);
+      vpx_write_bit(bc, 1);
       for (i = 0; i < PLANE_TYPES; ++i) {
         for (j = 0; j < REF_TYPES; ++j) {
           for (k = 0; k < COEF_BANDS; ++k) {
             for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
               // calc probs and branch cts for this frame only
               for (t = 0; t < entropy_nodes_update; ++t) {
-                vp9_prob newp = new_coef_probs[i][j][k][l][t];
-                vp9_prob *oldp = old_coef_probs[i][j][k][l] + t;
-                const vp9_prob upd = DIFF_UPDATE_PROB;
+                vpx_prob newp = new_coef_probs[i][j][k][l][t];
+                vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
+                const vpx_prob upd = DIFF_UPDATE_PROB;
                 int s;
                 int u = 0;
                 if (t == PIVOT_NODE)
@@ -600,7 +600,7 @@
                       *oldp, &newp, upd);
                 if (s > 0 && newp != *oldp)
                   u = 1;
-                vp9_write(bc, u, upd);
+                vpx_write(bc, u, upd);
                 if (u) {
                   /* send/use new probability */
                   vp9_write_prob_diff_update(bc, newp, *oldp);
@@ -623,8 +623,8 @@
             for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
               // calc probs and branch cts for this frame only
               for (t = 0; t < entropy_nodes_update; ++t) {
-                vp9_prob newp = new_coef_probs[i][j][k][l][t];
-                vp9_prob *oldp = old_coef_probs[i][j][k][l] + t;
+                vpx_prob newp = new_coef_probs[i][j][k][l][t];
+                vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
                 int s;
                 int u = 0;
 
@@ -648,11 +648,11 @@
                 if (u == 1 && updates == 1) {
                   int v;
                   // first update
-                  vp9_write_bit(bc, 1);
+                  vpx_write_bit(bc, 1);
                   for (v = 0; v < noupdates_before_first; ++v)
-                    vp9_write(bc, 0, upd);
+                    vpx_write(bc, 0, upd);
                 }
-                vp9_write(bc, u, upd);
+                vpx_write(bc, u, upd);
                 if (u) {
                   /* send/use new probability */
                   vp9_write_prob_diff_update(bc, newp, *oldp);
@@ -664,7 +664,7 @@
         }
       }
       if (updates == 0) {
-        vp9_write_bit(bc, 0);  // no updates
+        vpx_write_bit(bc, 0);  // no updates
       }
       return;
     }
@@ -673,7 +673,7 @@
   }
 }
 
-static void update_coef_probs(VP9_COMP *cpi, vp9_writer* w) {
+static void update_coef_probs(VP9_COMP *cpi, vpx_writer* w) {
   const TX_MODE tx_mode = cpi->common.tx_mode;
   const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
   TX_SIZE tx_size;
@@ -682,7 +682,7 @@
     vp9_coeff_probs_model frame_coef_probs[PLANE_TYPES];
     if (cpi->td.counts->tx.tx_totals[tx_size] <= 20 ||
         (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8)) {
-      vp9_write_bit(w, 0);
+      vpx_write_bit(w, 0);
     } else {
       build_tree_distribution(cpi, tx_size, frame_branch_ct,
                               frame_coef_probs);
@@ -693,75 +693,75 @@
 }
 
 static void encode_loopfilter(struct loopfilter *lf,
-                              struct vp9_write_bit_buffer *wb) {
+                              struct vpx_write_bit_buffer *wb) {
   int i;
 
   // Encode the loop filter level and type
-  vp9_wb_write_literal(wb, lf->filter_level, 6);
-  vp9_wb_write_literal(wb, lf->sharpness_level, 3);
+  vpx_wb_write_literal(wb, lf->filter_level, 6);
+  vpx_wb_write_literal(wb, lf->sharpness_level, 3);
 
   // Write out loop filter deltas applied at the MB level based on mode or
   // ref frame (if they are enabled).
-  vp9_wb_write_bit(wb, lf->mode_ref_delta_enabled);
+  vpx_wb_write_bit(wb, lf->mode_ref_delta_enabled);
 
   if (lf->mode_ref_delta_enabled) {
-    vp9_wb_write_bit(wb, lf->mode_ref_delta_update);
+    vpx_wb_write_bit(wb, lf->mode_ref_delta_update);
     if (lf->mode_ref_delta_update) {
       for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
         const int delta = lf->ref_deltas[i];
         const int changed = delta != lf->last_ref_deltas[i];
-        vp9_wb_write_bit(wb, changed);
+        vpx_wb_write_bit(wb, changed);
         if (changed) {
           lf->last_ref_deltas[i] = delta;
-          vp9_wb_write_literal(wb, abs(delta) & 0x3F, 6);
-          vp9_wb_write_bit(wb, delta < 0);
+          vpx_wb_write_literal(wb, abs(delta) & 0x3F, 6);
+          vpx_wb_write_bit(wb, delta < 0);
         }
       }
 
       for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
         const int delta = lf->mode_deltas[i];
         const int changed = delta != lf->last_mode_deltas[i];
-        vp9_wb_write_bit(wb, changed);
+        vpx_wb_write_bit(wb, changed);
         if (changed) {
           lf->last_mode_deltas[i] = delta;
-          vp9_wb_write_literal(wb, abs(delta) & 0x3F, 6);
-          vp9_wb_write_bit(wb, delta < 0);
+          vpx_wb_write_literal(wb, abs(delta) & 0x3F, 6);
+          vpx_wb_write_bit(wb, delta < 0);
         }
       }
     }
   }
 }
 
-static void write_delta_q(struct vp9_write_bit_buffer *wb, int delta_q) {
+static void write_delta_q(struct vpx_write_bit_buffer *wb, int delta_q) {
   if (delta_q != 0) {
-    vp9_wb_write_bit(wb, 1);
-    vp9_wb_write_literal(wb, abs(delta_q), 4);
-    vp9_wb_write_bit(wb, delta_q < 0);
+    vpx_wb_write_bit(wb, 1);
+    vpx_wb_write_literal(wb, abs(delta_q), 4);
+    vpx_wb_write_bit(wb, delta_q < 0);
   } else {
-    vp9_wb_write_bit(wb, 0);
+    vpx_wb_write_bit(wb, 0);
   }
 }
 
 static void encode_quantization(const VP9_COMMON *const cm,
-                                struct vp9_write_bit_buffer *wb) {
-  vp9_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS);
+                                struct vpx_write_bit_buffer *wb) {
+  vpx_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS);
   write_delta_q(wb, cm->y_dc_delta_q);
   write_delta_q(wb, cm->uv_dc_delta_q);
   write_delta_q(wb, cm->uv_ac_delta_q);
 }
 
 static void encode_segmentation(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                struct vp9_write_bit_buffer *wb) {
+                                struct vpx_write_bit_buffer *wb) {
   int i, j;
 
   const struct segmentation *seg = &cm->seg;
 
-  vp9_wb_write_bit(wb, seg->enabled);
+  vpx_wb_write_bit(wb, seg->enabled);
   if (!seg->enabled)
     return;
 
   // Segmentation map
-  vp9_wb_write_bit(wb, seg->update_map);
+  vpx_wb_write_bit(wb, seg->update_map);
   if (seg->update_map) {
     // Select the coding strategy (temporal or spatial)
     vp9_choose_segmap_coding_method(cm, xd);
@@ -769,40 +769,40 @@
     for (i = 0; i < SEG_TREE_PROBS; i++) {
       const int prob = seg->tree_probs[i];
       const int update = prob != MAX_PROB;
-      vp9_wb_write_bit(wb, update);
+      vpx_wb_write_bit(wb, update);
       if (update)
-        vp9_wb_write_literal(wb, prob, 8);
+        vpx_wb_write_literal(wb, prob, 8);
     }
 
     // Write out the chosen coding method.
-    vp9_wb_write_bit(wb, seg->temporal_update);
+    vpx_wb_write_bit(wb, seg->temporal_update);
     if (seg->temporal_update) {
       for (i = 0; i < PREDICTION_PROBS; i++) {
         const int prob = seg->pred_probs[i];
         const int update = prob != MAX_PROB;
-        vp9_wb_write_bit(wb, update);
+        vpx_wb_write_bit(wb, update);
         if (update)
-          vp9_wb_write_literal(wb, prob, 8);
+          vpx_wb_write_literal(wb, prob, 8);
       }
     }
   }
 
   // Segmentation data
-  vp9_wb_write_bit(wb, seg->update_data);
+  vpx_wb_write_bit(wb, seg->update_data);
   if (seg->update_data) {
-    vp9_wb_write_bit(wb, seg->abs_delta);
+    vpx_wb_write_bit(wb, seg->abs_delta);
 
     for (i = 0; i < MAX_SEGMENTS; i++) {
       for (j = 0; j < SEG_LVL_MAX; j++) {
         const int active = segfeature_active(seg, i, j);
-        vp9_wb_write_bit(wb, active);
+        vpx_wb_write_bit(wb, active);
         if (active) {
           const int data = get_segdata(seg, i, j);
           const int data_max = vp9_seg_feature_data_max(j);
 
           if (vp9_is_segfeature_signed(j)) {
             encode_unsigned_max(wb, abs(data), data_max);
-            vp9_wb_write_bit(wb, data < 0);
+            vpx_wb_write_bit(wb, data < 0);
           } else {
             encode_unsigned_max(wb, data, data_max);
           }
@@ -812,12 +812,12 @@
   }
 }
 
-static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w,
+static void encode_txfm_probs(VP9_COMMON *cm, vpx_writer *w,
                               FRAME_COUNTS *counts) {
   // Mode
-  vp9_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2);
+  vpx_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2);
   if (cm->tx_mode >= ALLOW_32X32)
-    vp9_write_bit(w, cm->tx_mode == TX_MODE_SELECT);
+    vpx_write_bit(w, cm->tx_mode == TX_MODE_SELECT);
 
   // Probabilities
   if (cm->tx_mode == TX_MODE_SELECT) {
@@ -850,12 +850,12 @@
 }
 
 static void write_interp_filter(INTERP_FILTER filter,
-                                struct vp9_write_bit_buffer *wb) {
+                                struct vpx_write_bit_buffer *wb) {
   const int filter_to_literal[] = { 1, 0, 2, 3 };
 
-  vp9_wb_write_bit(wb, filter == SWITCHABLE);
+  vpx_wb_write_bit(wb, filter == SWITCHABLE);
   if (filter != SWITCHABLE)
-    vp9_wb_write_literal(wb, filter_to_literal[filter], 2);
+    vpx_wb_write_literal(wb, filter_to_literal[filter], 2);
 }
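
(The { 1, 0, 2, 3 } table converts the encoder's INTERP_FILTER enum order
to the literal values the bitstream uses; a decoder applies the inverse
mapping, sketched below — literal_to_filter is a hypothetical name used
only for illustration.)

    static const INTERP_FILTER literal_to_filter[] = {
      EIGHTTAP_SMOOTH, EIGHTTAP, EIGHTTAP_SHARP, BILINEAR
    };
    /* filter == SWITCHABLE is carried by the leading bit alone; otherwise
       the 2-bit literal indexes this inverse table. */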
 
 static void fix_interp_filter(VP9_COMMON *cm, FRAME_COUNTS *counts) {
@@ -882,22 +882,22 @@
 }
 
 static void write_tile_info(const VP9_COMMON *const cm,
-                            struct vp9_write_bit_buffer *wb) {
+                            struct vpx_write_bit_buffer *wb) {
   int min_log2_tile_cols, max_log2_tile_cols, ones;
   vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
 
   // columns
   ones = cm->log2_tile_cols - min_log2_tile_cols;
   while (ones--)
-    vp9_wb_write_bit(wb, 1);
+    vpx_wb_write_bit(wb, 1);
 
   if (cm->log2_tile_cols < max_log2_tile_cols)
-    vp9_wb_write_bit(wb, 0);
+    vpx_wb_write_bit(wb, 0);
 
   // rows
-  vp9_wb_write_bit(wb, cm->log2_tile_rows != 0);
+  vpx_wb_write_bit(wb, cm->log2_tile_rows != 0);
   if (cm->log2_tile_rows != 0)
-    vp9_wb_write_bit(wb, cm->log2_tile_rows != 1);
+    vpx_wb_write_bit(wb, cm->log2_tile_rows != 1);
 }
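
(Tile columns are coded in unary relative to the minimum: one 1-bit per
step above min_log2_tile_cols, terminated by a 0-bit unless the maximum
was reached. Assuming vpx_rb_*-style read helpers, a matching reader
might be:)

    int log2_cols = min_log2_tile_cols;
    while (log2_cols < max_log2_tile_cols && vpx_rb_read_bit(rb))
      ++log2_cols;
    /* Rows: "0", "10" and "11" decode to log2_tile_rows 0, 1 and 2. */
    const int log2_rows = vpx_rb_read_bit(rb) ? 1 + vpx_rb_read_bit(rb) : 0;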
 
 static int get_refresh_mask(VP9_COMP *cpi) {
@@ -928,7 +928,7 @@
 
 static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
   VP9_COMMON *const cm = &cpi->common;
-  vp9_writer residual_bc;
+  vpx_writer residual_bc;
   int tile_row, tile_col;
   TOKENEXTRA *tok_end;
   size_t total_size = 0;
@@ -947,14 +947,14 @@
           cpi->tok_count[tile_row][tile_col];
 
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
-        vp9_start_encode(&residual_bc, data_ptr + total_size + 4);
+        vpx_start_encode(&residual_bc, data_ptr + total_size + 4);
       else
-        vp9_start_encode(&residual_bc, data_ptr + total_size);
+        vpx_start_encode(&residual_bc, data_ptr + total_size);
 
       write_modes(cpi, &cpi->tile_data[tile_idx].tile_info,
                   &residual_bc, &tok, tok_end);
       assert(tok == tok_end);
-      vp9_stop_encode(&residual_bc);
+      vpx_stop_encode(&residual_bc);
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
         // size of this tile
         mem_put_be32(data_ptr + total_size, residual_bc.pos);
@@ -969,26 +969,26 @@
 }
 
 static void write_display_size(const VP9_COMMON *cm,
-                               struct vp9_write_bit_buffer *wb) {
+                               struct vpx_write_bit_buffer *wb) {
   const int scaling_active = cm->width != cm->display_width ||
                              cm->height != cm->display_height;
-  vp9_wb_write_bit(wb, scaling_active);
+  vpx_wb_write_bit(wb, scaling_active);
   if (scaling_active) {
-    vp9_wb_write_literal(wb, cm->display_width - 1, 16);
-    vp9_wb_write_literal(wb, cm->display_height - 1, 16);
+    vpx_wb_write_literal(wb, cm->display_width - 1, 16);
+    vpx_wb_write_literal(wb, cm->display_height - 1, 16);
   }
 }
 
 static void write_frame_size(const VP9_COMMON *cm,
-                             struct vp9_write_bit_buffer *wb) {
-  vp9_wb_write_literal(wb, cm->width - 1, 16);
-  vp9_wb_write_literal(wb, cm->height - 1, 16);
+                             struct vpx_write_bit_buffer *wb) {
+  vpx_wb_write_literal(wb, cm->width - 1, 16);
+  vpx_wb_write_literal(wb, cm->height - 1, 16);
 
   write_display_size(cm, wb);
 }
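
(Width and height are stored minus one in 16 bits each, so each dimension
can range from 1 to 65536 pixels; a reader simply adds the bias back,
e.g. width = vpx_rb_read_literal(rb, 16) + 1, and likewise for height.)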
 
 static void write_frame_size_with_refs(VP9_COMP *cpi,
-                                       struct vp9_write_bit_buffer *wb) {
+                                       struct vpx_write_bit_buffer *wb) {
   VP9_COMMON *const cm = &cpi->common;
   int found = 0;
 
@@ -1011,40 +1011,40 @@
       found = cm->width == cfg->y_crop_width &&
               cm->height == cfg->y_crop_height;
     }
-    vp9_wb_write_bit(wb, found);
+    vpx_wb_write_bit(wb, found);
     if (found) {
       break;
     }
   }
 
   if (!found) {
-    vp9_wb_write_literal(wb, cm->width - 1, 16);
-    vp9_wb_write_literal(wb, cm->height - 1, 16);
+    vpx_wb_write_literal(wb, cm->width - 1, 16);
+    vpx_wb_write_literal(wb, cm->height - 1, 16);
   }
 
   write_display_size(cm, wb);
 }
 
-static void write_sync_code(struct vp9_write_bit_buffer *wb) {
-  vp9_wb_write_literal(wb, VP9_SYNC_CODE_0, 8);
-  vp9_wb_write_literal(wb, VP9_SYNC_CODE_1, 8);
-  vp9_wb_write_literal(wb, VP9_SYNC_CODE_2, 8);
+static void write_sync_code(struct vpx_write_bit_buffer *wb) {
+  vpx_wb_write_literal(wb, VP9_SYNC_CODE_0, 8);
+  vpx_wb_write_literal(wb, VP9_SYNC_CODE_1, 8);
+  vpx_wb_write_literal(wb, VP9_SYNC_CODE_2, 8);
 }
 
 static void write_profile(BITSTREAM_PROFILE profile,
-                          struct vp9_write_bit_buffer *wb) {
+                          struct vpx_write_bit_buffer *wb) {
   switch (profile) {
     case PROFILE_0:
-      vp9_wb_write_literal(wb, 0, 2);
+      vpx_wb_write_literal(wb, 0, 2);
       break;
     case PROFILE_1:
-      vp9_wb_write_literal(wb, 2, 2);
+      vpx_wb_write_literal(wb, 2, 2);
       break;
     case PROFILE_2:
-      vp9_wb_write_literal(wb, 1, 2);
+      vpx_wb_write_literal(wb, 1, 2);
       break;
     case PROFILE_3:
-      vp9_wb_write_literal(wb, 6, 3);
+      vpx_wb_write_literal(wb, 6, 3);
       break;
     default:
       assert(0);
@@ -1052,41 +1052,41 @@
 }
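
(The literal values above look arbitrary but spell out the profile bits
low-bit-first — vpx_wb_write_literal emits the most significant bit of
the literal first — so PROFILE_0 -> 00, PROFILE_1 -> 10, PROFILE_2 -> 01
and PROFILE_3 -> 110, the last with a trailing reserved zero. Assuming
vpx_rb_* read helpers, the matching reader is roughly:)

    int profile = vpx_rb_read_bit(rb);        /* low bit                  */
    profile |= vpx_rb_read_bit(rb) << 1;      /* high bit                 */
    if (profile > 2)
      vpx_rb_read_bit(rb);                    /* reserved bit (profile 3) */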
 
 static void write_bitdepth_colorspace_sampling(
-    VP9_COMMON *const cm, struct vp9_write_bit_buffer *wb) {
+    VP9_COMMON *const cm, struct vpx_write_bit_buffer *wb) {
   if (cm->profile >= PROFILE_2) {
     assert(cm->bit_depth > VPX_BITS_8);
-    vp9_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 0 : 1);
+    vpx_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 0 : 1);
   }
-  vp9_wb_write_literal(wb, cm->color_space, 3);
+  vpx_wb_write_literal(wb, cm->color_space, 3);
   if (cm->color_space != VPX_CS_SRGB) {
-    vp9_wb_write_bit(wb, 0);  // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+    vpx_wb_write_bit(wb, 0);  // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
     if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
       assert(cm->subsampling_x != 1 || cm->subsampling_y != 1);
-      vp9_wb_write_bit(wb, cm->subsampling_x);
-      vp9_wb_write_bit(wb, cm->subsampling_y);
-      vp9_wb_write_bit(wb, 0);  // unused
+      vpx_wb_write_bit(wb, cm->subsampling_x);
+      vpx_wb_write_bit(wb, cm->subsampling_y);
+      vpx_wb_write_bit(wb, 0);  // unused
     } else {
       assert(cm->subsampling_x == 1 && cm->subsampling_y == 1);
     }
   } else {
     assert(cm->profile == PROFILE_1 || cm->profile == PROFILE_3);
-    vp9_wb_write_bit(wb, 0);  // unused
+    vpx_wb_write_bit(wb, 0);  // unused
   }
 }
 
 static void write_uncompressed_header(VP9_COMP *cpi,
-                                      struct vp9_write_bit_buffer *wb) {
+                                      struct vpx_write_bit_buffer *wb) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
 
-  vp9_wb_write_literal(wb, VP9_FRAME_MARKER, 2);
+  vpx_wb_write_literal(wb, VP9_FRAME_MARKER, 2);
 
   write_profile(cm->profile, wb);
 
-  vp9_wb_write_bit(wb, 0);  // show_existing_frame
-  vp9_wb_write_bit(wb, cm->frame_type);
-  vp9_wb_write_bit(wb, cm->show_frame);
-  vp9_wb_write_bit(wb, cm->error_resilient_mode);
+  vpx_wb_write_bit(wb, 0);  // show_existing_frame
+  vpx_wb_write_bit(wb, cm->frame_type);
+  vpx_wb_write_bit(wb, cm->show_frame);
+  vpx_wb_write_bit(wb, cm->error_resilient_mode);
 
   if (cm->frame_type == KEY_FRAME) {
     write_sync_code(wb);
@@ -1102,10 +1102,10 @@
     // show_existing_frame flag which tells the decoder which frame we want to
     // show.
     if (!cm->show_frame)
-      vp9_wb_write_bit(wb, cm->intra_only);
+      vpx_wb_write_bit(wb, cm->intra_only);
 
     if (!cm->error_resilient_mode)
-      vp9_wb_write_literal(wb, cm->reset_frame_context, 2);
+      vpx_wb_write_literal(wb, cm->reset_frame_context, 2);
 
     if (cm->intra_only) {
       write_sync_code(wb);
@@ -1115,21 +1115,21 @@
         write_bitdepth_colorspace_sampling(cm, wb);
       }
 
-      vp9_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+      vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
       write_frame_size(cm, wb);
     } else {
       MV_REFERENCE_FRAME ref_frame;
-      vp9_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+      vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
       for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
         assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
-        vp9_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
+        vpx_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
                              REF_FRAMES_LOG2);
-        vp9_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
+        vpx_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
       }
 
       write_frame_size_with_refs(cpi, wb);
 
-      vp9_wb_write_bit(wb, cm->allow_high_precision_mv);
+      vpx_wb_write_bit(wb, cm->allow_high_precision_mv);
 
       fix_interp_filter(cm, cpi->td.counts);
       write_interp_filter(cm->interp_filter, wb);
@@ -1137,11 +1137,11 @@
   }
 
   if (!cm->error_resilient_mode) {
-    vp9_wb_write_bit(wb, cm->refresh_frame_context);
-    vp9_wb_write_bit(wb, cm->frame_parallel_decoding_mode);
+    vpx_wb_write_bit(wb, cm->refresh_frame_context);
+    vpx_wb_write_bit(wb, cm->frame_parallel_decoding_mode);
   }
 
-  vp9_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
+  vpx_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
 
   encode_loopfilter(&cm->lf, wb);
   encode_quantization(cm, wb);
@@ -1155,9 +1155,9 @@
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   FRAME_CONTEXT *const fc = cm->fc;
   FRAME_COUNTS *counts = cpi->td.counts;
-  vp9_writer header_bc;
+  vpx_writer header_bc;
 
-  vp9_start_encode(&header_bc, data);
+  vpx_start_encode(&header_bc, data);
 
   if (xd->lossless)
     cm->tx_mode = ONLY_4X4;
@@ -1185,9 +1185,9 @@
       const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
       const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
 
-      vp9_write_bit(&header_bc, use_compound_pred);
+      vpx_write_bit(&header_bc, use_compound_pred);
       if (use_compound_pred) {
-        vp9_write_bit(&header_bc, use_hybrid_pred);
+        vpx_write_bit(&header_bc, use_hybrid_pred);
         if (use_hybrid_pred)
           for (i = 0; i < COMP_INTER_CONTEXTS; i++)
             vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
@@ -1221,7 +1221,7 @@
                         &counts->mv);
   }
 
-  vp9_stop_encode(&header_bc);
+  vpx_stop_encode(&header_bc);
   assert(header_bc.pos <= 0xffff);
 
   return header_bc.pos;
@@ -1230,14 +1230,14 @@
 void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) {
   uint8_t *data = dest;
   size_t first_part_size, uncompressed_hdr_size;
-  struct vp9_write_bit_buffer wb = {data, 0};
-  struct vp9_write_bit_buffer saved_wb;
+  struct vpx_write_bit_buffer wb = {data, 0};
+  struct vpx_write_bit_buffer saved_wb;
 
   write_uncompressed_header(cpi, &wb);
   saved_wb = wb;
-  vp9_wb_write_literal(&wb, 0, 16);  // don't know in advance first part. size
+  vpx_wb_write_literal(&wb, 0, 16);  // first part size is not known in advance
 
-  uncompressed_hdr_size = vp9_wb_bytes_written(&wb);
+  uncompressed_hdr_size = vpx_wb_bytes_written(&wb);
   data += uncompressed_hdr_size;
 
   vp9_clear_system_state();
@@ -1245,7 +1245,7 @@
   first_part_size = write_compressed_header(cpi, data);
   data += first_part_size;
   // TODO(jbb): Figure out what to do if first_part_size > 16 bits.
-  vp9_wb_write_literal(&saved_wb, (int)first_part_size, 16);
+  vpx_wb_write_literal(&saved_wb, (int)first_part_size, 16);
 
   data += encode_tiles(cpi, data);
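
(The 16-bit placeholder trick works because struct vpx_write_bit_buffer
is a plain { buffer pointer, bit offset } pair, so "saved_wb = wb"
snapshots the position of the reserved slot before the placeholder is
written; the later write through &saved_wb then lands exactly on it. In
outline, with "size" standing in for the measured header size:)

    struct vpx_write_bit_buffer saved = wb;  /* remember patch position   */
    vpx_wb_write_literal(&wb, 0, 16);        /* reserve 16 bits           */
    /* ... emit the compressed header and measure its size ... */
    vpx_wb_write_literal(&saved, size, 16);  /* back-patch the real value */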
 
diff --git a/vp9/encoder/vp9_cost.c b/vp9/encoder/vp9_cost.c
index 1c3c3d2..e2fbb34 100644
--- a/vp9/encoder/vp9_cost.c
+++ b/vp9/encoder/vp9_cost.c
@@ -7,6 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <assert.h>
 
 #include "vp9/encoder/vp9_cost.h"
 
@@ -34,14 +35,14 @@
   22,   21,   19,   18,   16,   15,   13,   12,   10,   9,    7,    6,
   4,    3,    1,    1};
 
-static void cost(int *costs, vp9_tree tree, const vp9_prob *probs,
+static void cost(int *costs, vpx_tree tree, const vpx_prob *probs,
                  int i, int c) {
-  const vp9_prob prob = probs[i / 2];
+  const vpx_prob prob = probs[i / 2];
   int b;
 
   for (b = 0; b <= 1; ++b) {
     const int cc = c + vp9_cost_bit(prob, b);
-    const vp9_tree_index ii = tree[i + b];
+    const vpx_tree_index ii = tree[i + b];
 
     if (ii <= 0)
       costs[-ii] = cc;
@@ -50,11 +51,11 @@
   }
 }
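
(cost() walks the token tree recursively: in a vpx_tree, a non-positive
entry is a leaf whose token index is the negated value, while a positive
entry is the offset of the next internal node, and each step adds
vp9_cost_bit(prob, b) for the branch taken. A minimal sketch with a
hypothetical 3-symbol tree:)

    enum { TOKEN_A, TOKEN_B, TOKEN_C };
    /* Node 0 splits A vs. {B, C}; node 2 splits B vs. C. */
    static const vpx_tree_index my_tree[4] = {
      -TOKEN_A, 2,
      -TOKEN_B, -TOKEN_C
    };
    const vpx_prob probs[2] = { 128, 192 };  /* one prob per node (i / 2) */
    int token_costs[3];
    vp9_cost_tokens(token_costs, probs, my_tree);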
 
-void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree) {
+void vp9_cost_tokens(int *costs, const vpx_prob *probs, vpx_tree tree) {
   cost(costs, tree, probs, 0, 0);
 }
 
-void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree) {
+void vp9_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree) {
   assert(tree[0] <= 0 && tree[1] > 0);
 
   costs[-tree[0]] = vp9_cost_bit(probs[0], 0);
diff --git a/vp9/encoder/vp9_cost.h b/vp9/encoder/vp9_cost.h
index 6d2b940..eac74c4 100644
--- a/vp9/encoder/vp9_cost.h
+++ b/vp9/encoder/vp9_cost.h
@@ -11,7 +11,7 @@
 #ifndef VP9_ENCODER_VP9_COST_H_
 #define VP9_ENCODER_VP9_COST_H_
 
-#include "vp9/common/vp9_prob.h"
+#include "vpx_dsp/prob.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -21,20 +21,20 @@
 
 #define vp9_cost_zero(prob) (vp9_prob_cost[prob])
 
-#define vp9_cost_one(prob) vp9_cost_zero(vp9_complement(prob))
+#define vp9_cost_one(prob) vp9_cost_zero(vpx_complement(prob))
 
-#define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? vp9_complement(prob) \
+#define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? vpx_complement(prob) \
                                                     : (prob))
 
 static INLINE unsigned int cost_branch256(const unsigned int ct[2],
-                                          vp9_prob p) {
+                                          vpx_prob p) {
   return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p);
 }
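
(Judging from the table tail above — vp9_prob_cost[255] == 1, matching
-log2(255/256) * 256 ~= 1.4 — the costs are bit counts scaled by 256.
So for p = 192, i.e. P(zero) = 0.75: vp9_cost_zero(p) ~= 106,
vp9_cost_one(p) = vp9_cost_zero(255 - p) ~= 518, and counts ct = {30, 10}
give 30 * 106 + 10 * 518 = 8360, roughly 32.7 bits for the branch.)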
 
-static INLINE int treed_cost(vp9_tree tree, const vp9_prob *probs,
+static INLINE int treed_cost(vpx_tree tree, const vpx_prob *probs,
                              int bits, int len) {
   int cost = 0;
-  vp9_tree_index i = 0;
+  vpx_tree_index i = 0;
 
   do {
     const int bit = (bits >> --len) & 1;
@@ -45,8 +45,8 @@
   return cost;
 }
 
-void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree);
-void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree);
+void vp9_cost_tokens(int *costs, const vpx_prob *probs, vpx_tree tree);
+void vp9_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 414d2bb..5dcbd1f 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -13,22 +13,16 @@
 
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 
-#include "vpx_ports/mem.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_idct.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/encoder/vp9_dct.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/fwd_txfm.h"
 
-static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
-  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  // TODO(debargha, peter.derivaz): Find new bounds for this assert
-  // and make the bounds consts.
-  // assert(INT16_MIN <= rv && rv <= INT16_MAX);
-  return rv;
-}
-
-void vp9_fdct4(const tran_low_t *input, tran_low_t *output) {
+static void fdct4(const tran_low_t *input, tran_low_t *output) {
   tran_high_t step[4];
   tran_high_t temp1, temp2;
 
@@ -47,157 +41,7 @@
   output[3] = (tran_low_t)fdct_round_shift(temp2);
 }
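
(fdct_round_shift() is no longer defined locally; an equivalent shared
definition — ROUND_POWER_OF_TWO(input, DCT_CONST_BITS), i.e.
(input + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS — now comes in
through the vpx_dsp/fwd_txfm.h include added above.)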
 
-void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
-  int r, c;
-  tran_low_t sum = 0;
-  for (r = 0; r < 4; ++r)
-    for (c = 0; c < 4; ++c)
-      sum += input[r * stride + c];
-
-  output[0] = sum << 1;
-  output[1] = 0;
-}
-
-void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we transpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
-  // We need an intermediate buffer between passes.
-  tran_low_t intermediate[4 * 4];
-  const int16_t *in_pass0 = input;
-  const tran_low_t *in = NULL;
-  tran_low_t *out = intermediate;
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    tran_high_t input[4];      // canbe16
-    tran_high_t step[4];       // canbe16
-    tran_high_t temp1, temp2;  // needs32
-    int i;
-    for (i = 0; i < 4; ++i) {
-      // Load inputs.
-      if (0 == pass) {
-        input[0] = in_pass0[0 * stride] * 16;
-        input[1] = in_pass0[1 * stride] * 16;
-        input[2] = in_pass0[2 * stride] * 16;
-        input[3] = in_pass0[3 * stride] * 16;
-        if (i == 0 && input[0]) {
-          input[0] += 1;
-        }
-      } else {
-        input[0] = in[0 * 4];
-        input[1] = in[1 * 4];
-        input[2] = in[2 * 4];
-        input[3] = in[3 * 4];
-      }
-      // Transform.
-      step[0] = input[0] + input[3];
-      step[1] = input[1] + input[2];
-      step[2] = input[1] - input[2];
-      step[3] = input[0] - input[3];
-      temp1 = (step[0] + step[1]) * cospi_16_64;
-      temp2 = (step[0] - step[1]) * cospi_16_64;
-      out[0] = (tran_low_t)fdct_round_shift(temp1);
-      out[2] = (tran_low_t)fdct_round_shift(temp2);
-      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
-      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
-      out[1] = (tran_low_t)fdct_round_shift(temp1);
-      out[3] = (tran_low_t)fdct_round_shift(temp2);
-      // Do next column (which is a transposed row in second/horizontal pass)
-      in_pass0++;
-      in++;
-      out += 4;
-    }
-    // Setup in/out for next pass.
-    in = intermediate;
-    out = output;
-  }
-
-  {
-    int i, j;
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j)
-        output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
-    }
-  }
-}
-
-void vp9_fadst4(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t x0, x1, x2, x3;
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-  x0 = input[0];
-  x1 = input[1];
-  x2 = input[2];
-  x3 = input[3];
-
-  if (!(x0 | x1 | x2 | x3)) {
-    output[0] = output[1] = output[2] = output[3] = 0;
-    return;
-  }
-
-  s0 = sinpi_1_9 * x0;
-  s1 = sinpi_4_9 * x0;
-  s2 = sinpi_2_9 * x1;
-  s3 = sinpi_1_9 * x1;
-  s4 = sinpi_3_9 * x2;
-  s5 = sinpi_4_9 * x3;
-  s6 = sinpi_2_9 * x3;
-  s7 = x0 + x1 - x3;
-
-  x0 = s0 + s2 + s5;
-  x1 = sinpi_3_9 * s7;
-  x2 = s1 - s3 + s6;
-  x3 = s4;
-
-  s0 = x0 + x3;
-  s1 = x1;
-  s2 = x2 - x3;
-  s3 = x2 - x0 + x3;
-
-  // 1-D transform scaling factor is sqrt(2).
-  output[0] = (tran_low_t)fdct_round_shift(s0);
-  output[1] = (tran_low_t)fdct_round_shift(s1);
-  output[2] = (tran_low_t)fdct_round_shift(s2);
-  output[3] = (tran_low_t)fdct_round_shift(s3);
-}
-
-void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
-                  int stride, int tx_type) {
-  if (tx_type == DCT_DCT) {
-    vp9_fdct4x4_c(input, output, stride);
-  } else {
-    tran_low_t out[4 * 4];
-    int i, j;
-    tran_low_t temp_in[4], temp_out[4];
-    const transform_2d ht = FHT_4[tx_type];
-
-    // Columns
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j)
-        temp_in[j] = input[j * stride + i] * 16;
-      if (i == 0 && temp_in[0])
-        temp_in[0] += 1;
-      ht.cols(temp_in, temp_out);
-      for (j = 0; j < 4; ++j)
-        out[j * 4 + i] = temp_out[j];
-    }
-
-    // Rows
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j)
-        temp_in[j] = out[j + i * 4];
-      ht.rows(temp_in, temp_out);
-      for (j = 0; j < 4; ++j)
-        output[j + i * 4] = (temp_out[j] + 1) >> 2;
-    }
-  }
-}
-
-void vp9_fdct8(const tran_low_t *input, tran_low_t *output) {
+static void fdct8(const tran_low_t *input, tran_low_t *output) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
   tran_high_t t0, t1, t2, t3;                  // needs32
   tran_high_t x0, x1, x2, x3;                  // canbe16
@@ -249,550 +93,7 @@
   output[7] = (tran_low_t)fdct_round_shift(t3);
 }
 
-void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
-  int r, c;
-  tran_low_t sum = 0;
-  for (r = 0; r < 8; ++r)
-    for (c = 0; c < 8; ++c)
-      sum += input[r * stride + c];
-
-  output[0] = sum;
-  output[1] = 0;
-}
-
-void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
-  int i, j;
-  tran_low_t intermediate[64];
-
-  // Transform columns
-  {
-    tran_low_t *output = intermediate;
-    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
-    tran_high_t t0, t1, t2, t3;                  // needs32
-    tran_high_t x0, x1, x2, x3;                  // canbe16
-
-    int i;
-    for (i = 0; i < 8; i++) {
-      // stage 1
-      s0 = (input[0 * stride] + input[7 * stride]) * 4;
-      s1 = (input[1 * stride] + input[6 * stride]) * 4;
-      s2 = (input[2 * stride] + input[5 * stride]) * 4;
-      s3 = (input[3 * stride] + input[4 * stride]) * 4;
-      s4 = (input[3 * stride] - input[4 * stride]) * 4;
-      s5 = (input[2 * stride] - input[5 * stride]) * 4;
-      s6 = (input[1 * stride] - input[6 * stride]) * 4;
-      s7 = (input[0 * stride] - input[7 * stride]) * 4;
-
-      // fdct4(step, step);
-      x0 = s0 + s3;
-      x1 = s1 + s2;
-      x2 = s1 - s2;
-      x3 = s0 - s3;
-      t0 = (x0 + x1) * cospi_16_64;
-      t1 = (x0 - x1) * cospi_16_64;
-      t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
-      t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
-      output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
-      output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
-      output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
-      output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
-
-      // Stage 2
-      t0 = (s6 - s5) * cospi_16_64;
-      t1 = (s6 + s5) * cospi_16_64;
-      t2 = fdct_round_shift(t0);
-      t3 = fdct_round_shift(t1);
-
-      // Stage 3
-      x0 = s4 + t2;
-      x1 = s4 - t2;
-      x2 = s7 - t3;
-      x3 = s7 + t3;
-
-      // Stage 4
-      t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
-      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
-      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
-      output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
-      output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
-      output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
-      output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
-      input++;
-      output++;
-    }
-  }
-
-  // Rows
-  for (i = 0; i < 8; ++i) {
-    vp9_fdct8(&intermediate[i * 8], &final_output[i * 8]);
-    for (j = 0; j < 8; ++j)
-      final_output[j + i * 8] /= 2;
-  }
-}
-
-void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
-                         tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         int skip_block,
-                         const int16_t *zbin_ptr, const int16_t *round_ptr,
-                         const int16_t *quant_ptr,
-                         const int16_t *quant_shift_ptr,
-                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                         const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr,
-                         const int16_t *scan, const int16_t *iscan) {
-  int eob = -1;
-
-  int i, j;
-  tran_low_t intermediate[64];
-
-  // Transform columns
-  {
-    tran_low_t *output = intermediate;
-    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
-    tran_high_t t0, t1, t2, t3;                  // needs32
-    tran_high_t x0, x1, x2, x3;                  // canbe16
-
-    int i;
-    for (i = 0; i < 8; i++) {
-      // stage 1
-      s0 = (input[0 * stride] + input[7 * stride]) * 4;
-      s1 = (input[1 * stride] + input[6 * stride]) * 4;
-      s2 = (input[2 * stride] + input[5 * stride]) * 4;
-      s3 = (input[3 * stride] + input[4 * stride]) * 4;
-      s4 = (input[3 * stride] - input[4 * stride]) * 4;
-      s5 = (input[2 * stride] - input[5 * stride]) * 4;
-      s6 = (input[1 * stride] - input[6 * stride]) * 4;
-      s7 = (input[0 * stride] - input[7 * stride]) * 4;
-
-      // fdct4(step, step);
-      x0 = s0 + s3;
-      x1 = s1 + s2;
-      x2 = s1 - s2;
-      x3 = s0 - s3;
-      t0 = (x0 + x1) * cospi_16_64;
-      t1 = (x0 - x1) * cospi_16_64;
-      t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
-      t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
-      output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
-      output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
-      output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
-      output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
-
-      // Stage 2
-      t0 = (s6 - s5) * cospi_16_64;
-      t1 = (s6 + s5) * cospi_16_64;
-      t2 = fdct_round_shift(t0);
-      t3 = fdct_round_shift(t1);
-
-      // Stage 3
-      x0 = s4 + t2;
-      x1 = s4 - t2;
-      x2 = s7 - t3;
-      x3 = s7 + t3;
-
-      // Stage 4
-      t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
-      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
-      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
-      output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
-      output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
-      output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
-      output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
-      input++;
-      output++;
-    }
-  }
-
-  // Rows
-  for (i = 0; i < 8; ++i) {
-    vp9_fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
-    for (j = 0; j < 8; ++j)
-      coeff_ptr[j + i * 8] /= 2;
-  }
-
-  // TODO(jingning) Decide the need of these arguments after the
-  // quantization process is completed.
-  (void)zbin_ptr;
-  (void)quant_shift_ptr;
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-      tmp = (tmp * quant_ptr[rc != 0]) >> 16;
-
-      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-
-      if (tmp)
-        eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
-void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
-  int r, c;
-  tran_low_t sum = 0;
-  for (r = 0; r < 16; ++r)
-    for (c = 0; c < 16; ++c)
-      sum += input[r * stride + c];
-
-  output[0] = sum >> 1;
-  output[1] = 0;
-}
-
-void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we transpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
-  // We need an intermediate buffer between passes.
-  tran_low_t intermediate[256];
-  const int16_t *in_pass0 = input;
-  const tran_low_t *in = NULL;
-  tran_low_t *out = intermediate;
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    tran_high_t step1[8];      // canbe16
-    tran_high_t step2[8];      // canbe16
-    tran_high_t step3[8];      // canbe16
-    tran_high_t input[8];      // canbe16
-    tran_high_t temp1, temp2;  // needs32
-    int i;
-    for (i = 0; i < 16; i++) {
-      if (0 == pass) {
-        // Calculate input for the first 8 results.
-        input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;
-        input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;
-        input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;
-        input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
-        input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
-        input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
-        input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;
-        input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;
-        // Calculate input for the next 8 results.
-        step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;
-        step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;
-        step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
-        step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
-        step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
-        step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;
-        step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;
-        step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;
-      } else {
-        // Calculate input for the first 8 results.
-        input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
-        input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
-        input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
-        input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
-        input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
-        input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
-        input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
-        input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
-        // Calculate input for the next 8 results.
-        step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
-        step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
-        step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
-        step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
-        step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
-        step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
-        step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
-        step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
-      }
-      // Work on the first eight values; fdct8(input, even_results);
-      {
-        tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
-        tran_high_t t0, t1, t2, t3;                  // needs32
-        tran_high_t x0, x1, x2, x3;                  // canbe16
-
-        // stage 1
-        s0 = input[0] + input[7];
-        s1 = input[1] + input[6];
-        s2 = input[2] + input[5];
-        s3 = input[3] + input[4];
-        s4 = input[3] - input[4];
-        s5 = input[2] - input[5];
-        s6 = input[1] - input[6];
-        s7 = input[0] - input[7];
-
-        // fdct4(step, step);
-        x0 = s0 + s3;
-        x1 = s1 + s2;
-        x2 = s1 - s2;
-        x3 = s0 - s3;
-        t0 = (x0 + x1) * cospi_16_64;
-        t1 = (x0 - x1) * cospi_16_64;
-        t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
-        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
-        out[0] = (tran_low_t)fdct_round_shift(t0);
-        out[4] = (tran_low_t)fdct_round_shift(t2);
-        out[8] = (tran_low_t)fdct_round_shift(t1);
-        out[12] = (tran_low_t)fdct_round_shift(t3);
-
-        // Stage 2
-        t0 = (s6 - s5) * cospi_16_64;
-        t1 = (s6 + s5) * cospi_16_64;
-        t2 = fdct_round_shift(t0);
-        t3 = fdct_round_shift(t1);
-
-        // Stage 3
-        x0 = s4 + t2;
-        x1 = s4 - t2;
-        x2 = s7 - t3;
-        x3 = s7 + t3;
-
-        // Stage 4
-        t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
-        t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
-        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-        t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
-        out[2] = (tran_low_t)fdct_round_shift(t0);
-        out[6] = (tran_low_t)fdct_round_shift(t2);
-        out[10] = (tran_low_t)fdct_round_shift(t1);
-        out[14] = (tran_low_t)fdct_round_shift(t3);
-      }
-      // Work on the next eight values; step1 -> odd_results
-      {
-        // step 2
-        temp1 = (step1[5] - step1[2]) * cospi_16_64;
-        temp2 = (step1[4] - step1[3]) * cospi_16_64;
-        step2[2] = fdct_round_shift(temp1);
-        step2[3] = fdct_round_shift(temp2);
-        temp1 = (step1[4] + step1[3]) * cospi_16_64;
-        temp2 = (step1[5] + step1[2]) * cospi_16_64;
-        step2[4] = fdct_round_shift(temp1);
-        step2[5] = fdct_round_shift(temp2);
-        // step 3
-        step3[0] = step1[0] + step2[3];
-        step3[1] = step1[1] + step2[2];
-        step3[2] = step1[1] - step2[2];
-        step3[3] = step1[0] - step2[3];
-        step3[4] = step1[7] - step2[4];
-        step3[5] = step1[6] - step2[5];
-        step3[6] = step1[6] + step2[5];
-        step3[7] = step1[7] + step2[4];
-        // step 4
-        temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
-        temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
-        step2[1] = fdct_round_shift(temp1);
-        step2[2] = fdct_round_shift(temp2);
-        temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
-        temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
-        step2[5] = fdct_round_shift(temp1);
-        step2[6] = fdct_round_shift(temp2);
-        // step 5
-        step1[0] = step3[0] + step2[1];
-        step1[1] = step3[0] - step2[1];
-        step1[2] = step3[3] + step2[2];
-        step1[3] = step3[3] - step2[2];
-        step1[4] = step3[4] - step2[5];
-        step1[5] = step3[4] + step2[5];
-        step1[6] = step3[7] - step2[6];
-        step1[7] = step3[7] + step2[6];
-        // step 6
-        temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
-        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
-        out[1] = (tran_low_t)fdct_round_shift(temp1);
-        out[9] = (tran_low_t)fdct_round_shift(temp2);
-        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
-        temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
-        out[5] = (tran_low_t)fdct_round_shift(temp1);
-        out[13] = (tran_low_t)fdct_round_shift(temp2);
-        temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
-        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
-        out[3] = (tran_low_t)fdct_round_shift(temp1);
-        out[11] = (tran_low_t)fdct_round_shift(temp2);
-        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
-        temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
-        out[7] = (tran_low_t)fdct_round_shift(temp1);
-        out[15] = (tran_low_t)fdct_round_shift(temp2);
-      }
-      // Do next column (which is a transposed row in second/horizontal pass)
-      in++;
-      in_pass0++;
-      out += 16;
-    }
-    // Setup in/out for next pass.
-    in = intermediate;
-    out = output;
-  }
-}
-
-void vp9_fadst8(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-  tran_high_t x0 = input[7];
-  tran_high_t x1 = input[0];
-  tran_high_t x2 = input[5];
-  tran_high_t x3 = input[2];
-  tran_high_t x4 = input[3];
-  tran_high_t x5 = input[4];
-  tran_high_t x6 = input[1];
-  tran_high_t x7 = input[6];
-
-  // stage 1
-  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
-  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
-  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
-  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
-  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
-  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
-  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
-  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
-
-  x0 = fdct_round_shift(s0 + s4);
-  x1 = fdct_round_shift(s1 + s5);
-  x2 = fdct_round_shift(s2 + s6);
-  x3 = fdct_round_shift(s3 + s7);
-  x4 = fdct_round_shift(s0 - s4);
-  x5 = fdct_round_shift(s1 - s5);
-  x6 = fdct_round_shift(s2 - s6);
-  x7 = fdct_round_shift(s3 - s7);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
-  s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
-  s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;
-  s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;
-
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
-  x4 = fdct_round_shift(s4 + s6);
-  x5 = fdct_round_shift(s5 + s7);
-  x6 = fdct_round_shift(s4 - s6);
-  x7 = fdct_round_shift(s5 - s7);
-
-  // stage 3
-  s2 = cospi_16_64 * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (x6 - x7);
-
-  x2 = fdct_round_shift(s2);
-  x3 = fdct_round_shift(s3);
-  x6 = fdct_round_shift(s6);
-  x7 = fdct_round_shift(s7);
-
-  output[0] = (tran_low_t)x0;
-  output[1] = (tran_low_t)-x4;
-  output[2] = (tran_low_t)x6;
-  output[3] = (tran_low_t)-x2;
-  output[4] = (tran_low_t)x3;
-  output[5] = (tran_low_t)-x7;
-  output[6] = (tran_low_t)x5;
-  output[7] = (tran_low_t)-x1;
-}
-
-void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
-                  int stride, int tx_type) {
-  if (tx_type == DCT_DCT) {
-    vp9_fdct8x8_c(input, output, stride);
-  } else {
-    tran_low_t out[64];
-    int i, j;
-    tran_low_t temp_in[8], temp_out[8];
-    const transform_2d ht = FHT_8[tx_type];
-
-    // Columns
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j)
-        temp_in[j] = input[j * stride + i] * 4;
-      ht.cols(temp_in, temp_out);
-      for (j = 0; j < 8; ++j)
-        out[j * 8 + i] = temp_out[j];
-    }
-
-    // Rows
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j)
-        temp_in[j] = out[j + i * 8];
-      ht.rows(temp_in, temp_out);
-      for (j = 0; j < 8; ++j)
-        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-    }
-  }
-}
-
-/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
-   pixel. */
-void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
-  int i;
-  tran_high_t a1, b1, c1, d1, e1;
-  const int16_t *ip_pass0 = input;
-  const tran_low_t *ip = NULL;
-  tran_low_t *op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip_pass0[0 * stride];
-    b1 = ip_pass0[1 * stride];
-    c1 = ip_pass0[2 * stride];
-    d1 = ip_pass0[3 * stride];
-
-    a1 += b1;
-    d1 = d1 - c1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= c1;
-    d1 += b1;
-    op[0] = (tran_low_t)a1;
-    op[4] = (tran_low_t)c1;
-    op[8] = (tran_low_t)d1;
-    op[12] = (tran_low_t)b1;
-
-    ip_pass0++;
-    op++;
-  }
-  ip = output;
-  op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0];
-    b1 = ip[1];
-    c1 = ip[2];
-    d1 = ip[3];
-
-    a1 += b1;
-    d1 -= c1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= c1;
-    d1 += b1;
-    op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
-    op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
-    op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
-    op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
-
-    ip += 4;
-    op += 4;
-  }
-}
-
-// Rewrote to use same algorithm as others.
-void vp9_fdct16(const tran_low_t in[16], tran_low_t out[16]) {
+static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
   tran_high_t step1[8];      // canbe16
   tran_high_t step2[8];      // canbe16
   tran_high_t step3[8];      // canbe16
@@ -933,7 +234,118 @@
   out[15] = (tran_low_t)fdct_round_shift(temp2);
 }
 
-void vp9_fadst16(const tran_low_t *input, tran_low_t *output) {
+static void fadst4(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t x0, x1, x2, x3;
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  x0 = input[0];
+  x1 = input[1];
+  x2 = input[2];
+  x3 = input[3];
+
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_4_9 * x0;
+  s2 = sinpi_2_9 * x1;
+  s3 = sinpi_1_9 * x1;
+  s4 = sinpi_3_9 * x2;
+  s5 = sinpi_4_9 * x3;
+  s6 = sinpi_2_9 * x3;
+  s7 = x0 + x1 - x3;
+
+  x0 = s0 + s2 + s5;
+  x1 = sinpi_3_9 * s7;
+  x2 = s1 - s3 + s6;
+  x3 = s4;
+
+  s0 = x0 + x3;
+  s1 = x1;
+  s2 = x2 - x3;
+  s3 = x2 - x0 + x3;
+
+  // 1-D transform scaling factor is sqrt(2).
+  output[0] = (tran_low_t)fdct_round_shift(s0);
+  output[1] = (tran_low_t)fdct_round_shift(s1);
+  output[2] = (tran_low_t)fdct_round_shift(s2);
+  output[3] = (tran_low_t)fdct_round_shift(s3);
+}
+
+static void fadst8(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  tran_high_t x0 = input[7];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[5];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[3];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[1];
+  tran_high_t x7 = input[6];
+
+  // stage 1
+  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+
+  x0 = fdct_round_shift(s0 + s4);
+  x1 = fdct_round_shift(s1 + s5);
+  x2 = fdct_round_shift(s2 + s6);
+  x3 = fdct_round_shift(s3 + s7);
+  x4 = fdct_round_shift(s0 - s4);
+  x5 = fdct_round_shift(s1 - s5);
+  x6 = fdct_round_shift(s2 - s6);
+  x7 = fdct_round_shift(s3 - s7);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = cospi_8_64  * x4 + cospi_24_64 * x5;
+  s5 = cospi_24_64 * x4 - cospi_8_64  * x5;
+  s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;
+  s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = fdct_round_shift(s4 + s6);
+  x5 = fdct_round_shift(s5 + s7);
+  x6 = fdct_round_shift(s4 - s6);
+  x7 = fdct_round_shift(s5 - s7);
+
+  // stage 3
+  s2 = cospi_16_64 * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (x6 - x7);
+
+  x2 = fdct_round_shift(s2);
+  x3 = fdct_round_shift(s3);
+  x6 = fdct_round_shift(s6);
+  x7 = fdct_round_shift(s7);
+
+  output[0] = (tran_low_t)x0;
+  output[1] = (tran_low_t)-x4;
+  output[2] = (tran_low_t)x6;
+  output[3] = (tran_low_t)-x2;
+  output[4] = (tran_low_t)x3;
+  output[5] = (tran_low_t)-x7;
+  output[6] = (tran_low_t)x5;
+  output[7] = (tran_low_t)-x1;
+}
+
+static void fadst16(const tran_low_t *input, tran_low_t *output) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
   tran_high_t s9, s10, s11, s12, s13, s14, s15;
 
@@ -1096,6 +508,290 @@
   output[15] = (tran_low_t)-x1;
 }
 
+static const transform_2d FHT_4[] = {
+  { fdct4,  fdct4  },  // DCT_DCT  = 0
+  { fadst4, fdct4  },  // ADST_DCT = 1
+  { fdct4,  fadst4 },  // DCT_ADST = 2
+  { fadst4, fadst4 }   // ADST_ADST = 3
+};
+
+static const transform_2d FHT_8[] = {
+  { fdct8,  fdct8  },  // DCT_DCT  = 0
+  { fadst8, fdct8  },  // ADST_DCT = 1
+  { fdct8,  fadst8 },  // DCT_ADST = 2
+  { fadst8, fadst8 }   // ADST_ADST = 3
+};
+
+static const transform_2d FHT_16[] = {
+  { fdct16,  fdct16  },  // DCT_DCT  = 0
+  { fadst16, fdct16  },  // ADST_DCT = 1
+  { fdct16,  fadst16 },  // DCT_ADST = 2
+  { fadst16, fadst16 }   // ADST_ADST = 3
+};
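
(tx_type — DCT_DCT through ADST_ADST — indexes these tables directly; the
selected pair supplies the 1-D transforms for the two passes, e.g.:)

    const transform_2d ht = FHT_8[ADST_DCT];
    /* ht.cols == fadst8, applied to columns first; ht.rows == fdct8. */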
+
+void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
+  int r, c;
+  tran_low_t sum = 0;
+  for (r = 0; r < 4; ++r)
+    for (c = 0; c < 4; ++c)
+      sum += input[r * stride + c];
+
+  output[0] = sum << 1;
+  output[1] = 0;
+}
+
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
+                  int stride, int tx_type) {
+  if (tx_type == DCT_DCT) {
+    vp9_fdct4x4_c(input, output, stride);
+  } else {
+    tran_low_t out[4 * 4];
+    int i, j;
+    tran_low_t temp_in[4], temp_out[4];
+    const transform_2d ht = FHT_4[tx_type];
+
+    // Columns
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j)
+        temp_in[j] = input[j * stride + i] * 16;
+      if (i == 0 && temp_in[0])
+        temp_in[0] += 1;
+      ht.cols(temp_in, temp_out);
+      for (j = 0; j < 4; ++j)
+        out[j * 4 + i] = temp_out[j];
+    }
+
+    // Rows
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j)
+        temp_in[j] = out[j + i * 4];
+      ht.rows(temp_in, temp_out);
+      for (j = 0; j < 4; ++j)
+        output[j + i * 4] = (temp_out[j] + 1) >> 2;
+    }
+  }
+}
+
+void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
+  int r, c;
+  tran_low_t sum = 0;
+  for (r = 0; r < 8; ++r)
+    for (c = 0; c < 8; ++c)
+      sum += input[r * stride + c];
+
+  output[0] = sum;
+  output[1] = 0;
+}
+
+void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
+                         tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                         int skip_block,
+                         const int16_t *zbin_ptr, const int16_t *round_ptr,
+                         const int16_t *quant_ptr,
+                         const int16_t *quant_shift_ptr,
+                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                         const int16_t *dequant_ptr,
+                         uint16_t *eob_ptr,
+                         const int16_t *scan, const int16_t *iscan) {
+  int eob = -1;
+
+  int i, j;
+  tran_low_t intermediate[64];
+
+  // Transform columns
+  {
+    tran_low_t *output = intermediate;
+    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+    tran_high_t t0, t1, t2, t3;                  // needs32
+    tran_high_t x0, x1, x2, x3;                  // canbe16
+
+    int i;
+    for (i = 0; i < 8; i++) {
+      // stage 1
+      s0 = (input[0 * stride] + input[7 * stride]) * 4;
+      s1 = (input[1 * stride] + input[6 * stride]) * 4;
+      s2 = (input[2 * stride] + input[5 * stride]) * 4;
+      s3 = (input[3 * stride] + input[4 * stride]) * 4;
+      s4 = (input[3 * stride] - input[4 * stride]) * 4;
+      s5 = (input[2 * stride] - input[5 * stride]) * 4;
+      s6 = (input[1 * stride] - input[6 * stride]) * 4;
+      s7 = (input[0 * stride] - input[7 * stride]) * 4;
+
+      // fdct4(step, step);
+      x0 = s0 + s3;
+      x1 = s1 + s2;
+      x2 = s1 - s2;
+      x3 = s0 - s3;
+      t0 = (x0 + x1) * cospi_16_64;
+      t1 = (x0 - x1) * cospi_16_64;
+      t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
+      t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
+      output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
+      output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
+      output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
+      output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
+
+      // Stage 2
+      t0 = (s6 - s5) * cospi_16_64;
+      t1 = (s6 + s5) * cospi_16_64;
+      t2 = fdct_round_shift(t0);
+      t3 = fdct_round_shift(t1);
+
+      // Stage 3
+      x0 = s4 + t2;
+      x1 = s4 - t2;
+      x2 = s7 - t3;
+      x3 = s7 + t3;
+
+      // Stage 4
+      t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+      output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
+      output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
+      output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
+      output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
+      input++;
+      output++;
+    }
+  }
+
+  // Rows
+  for (i = 0; i < 8; ++i) {
+    fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
+    for (j = 0; j < 8; ++j)
+      coeff_ptr[j + i * 8] /= 2;
+  }
+
+  // TODO(jingning): decide whether these arguments are still needed once
+  // the quantization process is completed.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Quantization pass: quantize the coefficients in scan order, tracking
+    // in eob the scan index of the last non-zero quantized value.
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+      tmp = (tmp * quant_ptr[rc != 0]) >> 16;
+
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+      if (tmp)
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
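
(The quantization loop uses the standard two's-complement sign trick:
coeff_sign = coeff >> 31 is 0 for non-negative values and -1 for negative
ones, so (coeff ^ coeff_sign) - coeff_sign yields |coeff|, and the same
expression applied to the quantized magnitude restores the sign. For
example, coeff = -5: sign = -1, (-5 ^ -1) - (-1) = 4 + 1 = 5.)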
+
+void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
+  int r, c;
+  tran_low_t sum = 0;
+  for (r = 0; r < 16; ++r)
+    for (c = 0; c < 16; ++c)
+      sum += input[r * stride + c];
+
+  output[0] = sum >> 1;
+  output[1] = 0;
+}
+
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
+                  int stride, int tx_type) {
+  if (tx_type == DCT_DCT) {
+    vp9_fdct8x8_c(input, output, stride);
+  } else {
+    tran_low_t out[64];
+    int i, j;
+    tran_low_t temp_in[8], temp_out[8];
+    const transform_2d ht = FHT_8[tx_type];
+
+    // Columns
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j)
+        temp_in[j] = input[j * stride + i] * 4;
+      ht.cols(temp_in, temp_out);
+      for (j = 0; j < 8; ++j)
+        out[j * 8 + i] = temp_out[j];
+    }
+
+    // Rows
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j)
+        temp_in[j] = out[j + i * 8];
+      ht.rows(temp_in, temp_out);
+      for (j = 0; j < 8; ++j)
+        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+    }
+  }
+}
+
+/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
+   pixel. */
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+  int i;
+  tran_high_t a1, b1, c1, d1, e1;
+  const int16_t *ip_pass0 = input;
+  const tran_low_t *ip = NULL;
+  tran_low_t *op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip_pass0[0 * stride];
+    b1 = ip_pass0[1 * stride];
+    c1 = ip_pass0[2 * stride];
+    d1 = ip_pass0[3 * stride];
+
+    a1 += b1;
+    d1 = d1 - c1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= c1;
+    d1 += b1;
+    op[0] = (tran_low_t)a1;
+    op[4] = (tran_low_t)c1;
+    op[8] = (tran_low_t)d1;
+    op[12] = (tran_low_t)b1;
+
+    ip_pass0++;
+    op++;
+  }
+  ip = output;
+  op = output;
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0];
+    b1 = ip[1];
+    c1 = ip[2];
+    d1 = ip[3];
+
+    a1 += b1;
+    d1 -= c1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= c1;
+    d1 += b1;
+    op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
+    op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
+    op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
+    op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
+
+    ip += 4;
+    op += 4;
+  }
+}
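
(The "3.5 adds, 0.5 shifts per pixel" figure in the comment checks out:
each 4-pixel 1-D pass costs 7 additions/subtractions and 1 shift, and
every pixel is processed by two passes — columns, then rows — giving
2 * 7 / 4 = 3.5 adds and 2 * 1 / 4 = 0.5 shifts per pixel.)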
+
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
                     int stride, int tx_type) {
   if (tx_type == DCT_DCT) {
@@ -1531,11 +1227,6 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
-                          int stride) {
-  vp9_fdct4x4_c(input, output, stride);
-}
-
 void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output,
                          int stride, int tx_type) {
   vp9_fht4x4_c(input, output, stride, tx_type);
@@ -1546,21 +1237,11 @@
   vp9_fdct8x8_1_c(input, final_output, stride);
 }
 
-void vp9_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
-                          int stride) {
-  vp9_fdct8x8_c(input, final_output, stride);
-}
-
 void vp9_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
                               int stride) {
   vp9_fdct16x16_1_c(input, output, stride);
 }
 
-void vp9_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
-                            int stride) {
-  vp9_fdct16x16_c(input, output, stride);
-}
-
 void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output,
                          int stride, int tx_type) {
   vp9_fht8x8_c(input, output, stride, tx_type);
diff --git a/vp9/encoder/vp9_dct.h b/vp9/encoder/vp9_dct.h
index 49afcbb..6ce7e96 100644
--- a/vp9/encoder/vp9_dct.h
+++ b/vp9/encoder/vp9_dct.h
@@ -11,49 +11,12 @@
 #ifndef VP9_ENCODER_VP9_DCT_H_
 #define VP9_ENCODER_VP9_DCT_H_
 
-#include "vp9/common/vp9_idct.h"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void vp9_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-void vp9_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
-void vp9_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
-                            int stride);
-void vp9_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride);
-void vp9_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
-                               int stride);
-
-void vp9_fdct4(const tran_low_t *input, tran_low_t *output);
-void vp9_fadst4(const tran_low_t *input, tran_low_t *output);
-void vp9_fdct8(const tran_low_t *input, tran_low_t *output);
-void vp9_fadst8(const tran_low_t *input, tran_low_t *output);
-void vp9_fdct16(const tran_low_t in[16], tran_low_t out[16]);
-void vp9_fadst16(const tran_low_t *input, tran_low_t *output);
 void vp9_fdct32(const tran_high_t *input, tran_high_t *output, int round);
 
-static const transform_2d FHT_4[] = {
-  { vp9_fdct4,  vp9_fdct4  },  // DCT_DCT  = 0
-  { vp9_fadst4, vp9_fdct4  },  // ADST_DCT = 1
-  { vp9_fdct4,  vp9_fadst4 },  // DCT_ADST = 2
-  { vp9_fadst4, vp9_fadst4 }   // ADST_ADST = 3
-};
-
-static const transform_2d FHT_8[] = {
-  { vp9_fdct8,  vp9_fdct8  },  // DCT_DCT  = 0
-  { vp9_fadst8, vp9_fdct8  },  // ADST_DCT = 1
-  { vp9_fdct8,  vp9_fadst8 },  // DCT_ADST = 2
-  { vp9_fadst8, vp9_fadst8 }   // ADST_ADST = 3
-};
-
-static const transform_2d FHT_16[] = {
-  { vp9_fdct16,  vp9_fdct16  },  // DCT_DCT  = 0
-  { vp9_fadst16, vp9_fdct16  },  // ADST_DCT = 1
-  { vp9_fdct16,  vp9_fadst16 },  // DCT_ADST = 2
-  { vp9_fadst16, vp9_fadst16 }   // ADST_ADST = 3
-};
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 659ce72..9c3c510 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -13,6 +13,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 
+#include "vpx_dsp/quantize.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 
@@ -23,7 +24,6 @@
 #include "vp9/common/vp9_systemdependent.h"
 
 #include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rd.h"
 #include "vp9/encoder/vp9_tokenize.h"
 
@@ -795,7 +795,7 @@
       case TX_32X32:
         scan_order = &vp9_default_scan_orders[TX_32X32];
         mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-        vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode,
+        vp9_predict_intra_block(xd, bwl, TX_32X32, mode,
                                 x->skip_encode ? src : dst,
                                 x->skip_encode ? src_stride : dst_stride,
                                 dst, dst_stride, i, j, plane);
@@ -816,14 +816,17 @@
         tx_type = get_tx_type(pd->plane_type, xd);
         scan_order = &vp9_scan_orders[TX_16X16][tx_type];
         mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-        vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode,
+        vp9_predict_intra_block(xd, bwl, TX_16X16, mode,
                                 x->skip_encode ? src : dst,
                                 x->skip_encode ? src_stride : dst_stride,
                                 dst, dst_stride, i, j, plane);
         if (!x->skip_recode) {
           vpx_highbd_subtract_block(16, 16, src_diff, diff_stride,
                                     src, src_stride, dst, dst_stride, xd->bd);
-          vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
+          if (tx_type == DCT_DCT)
+            vp9_highbd_fdct16x16(src_diff, coeff, diff_stride);
+          else
+            vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
           vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                                 p->quant, p->quant_shift, qcoeff, dqcoeff,
                                 pd->dequant, eob,
@@ -838,14 +841,17 @@
         tx_type = get_tx_type(pd->plane_type, xd);
         scan_order = &vp9_scan_orders[TX_8X8][tx_type];
         mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-        vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode,
+        vp9_predict_intra_block(xd, bwl, TX_8X8, mode,
                                 x->skip_encode ? src : dst,
                                 x->skip_encode ? src_stride : dst_stride,
                                 dst, dst_stride, i, j, plane);
         if (!x->skip_recode) {
           vpx_highbd_subtract_block(8, 8, src_diff, diff_stride,
                                     src, src_stride, dst, dst_stride, xd->bd);
-          vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
+          if (tx_type == DCT_DCT)
+            vp9_highbd_fdct8x8(src_diff, coeff, diff_stride);
+          else
+            vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
           vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
                                 p->quant, p->quant_shift, qcoeff, dqcoeff,
                                 pd->dequant, eob,
@@ -860,7 +866,7 @@
         tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
         scan_order = &vp9_scan_orders[TX_4X4][tx_type];
         mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
-        vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
+        vp9_predict_intra_block(xd, bwl, TX_4X4, mode,
                                 x->skip_encode ? src : dst,
                                 x->skip_encode ? src_stride : dst_stride,
                                 dst, dst_stride, i, j, plane);
@@ -903,7 +909,7 @@
     case TX_32X32:
       scan_order = &vp9_default_scan_orders[TX_32X32];
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-      vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode,
+      vp9_predict_intra_block(xd, bwl, TX_32X32, mode,
                               x->skip_encode ? src : dst,
                               x->skip_encode ? src_stride : dst_stride,
                               dst, dst_stride, i, j, plane);
@@ -923,7 +929,7 @@
       tx_type = get_tx_type(pd->plane_type, xd);
       scan_order = &vp9_scan_orders[TX_16X16][tx_type];
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-      vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode,
+      vp9_predict_intra_block(xd, bwl, TX_16X16, mode,
                               x->skip_encode ? src : dst,
                               x->skip_encode ? src_stride : dst_stride,
                               dst, dst_stride, i, j, plane);
@@ -943,7 +949,7 @@
       tx_type = get_tx_type(pd->plane_type, xd);
       scan_order = &vp9_scan_orders[TX_8X8][tx_type];
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-      vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode,
+      vp9_predict_intra_block(xd, bwl, TX_8X8, mode,
                               x->skip_encode ? src : dst,
                               x->skip_encode ? src_stride : dst_stride,
                               dst, dst_stride, i, j, plane);
@@ -963,7 +969,7 @@
       tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
       scan_order = &vp9_scan_orders[TX_4X4][tx_type];
       mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
-      vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
+      vp9_predict_intra_block(xd, bwl, TX_4X4, mode,
                               x->skip_encode ? src : dst,
                               x->skip_encode ? src_stride : dst_stride,
                               dst, dst_stride, i, j, plane);
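
The encodemb hunks above add a fast path that routes tx_type == DCT_DCT to the plain forward DCT instead of the generic hybrid transform, since the hybrid path is only needed when at least one direction uses ADST and the dedicated fdct entry point may be mapped to a SIMD version by RTCD. A self-contained restatement of that dispatch; the declarations repeat the signatures used in the diff, and the wrapper itself is hypothetical.

#include <stdint.h>

typedef int32_t tran_low_t;
typedef enum { DCT_DCT, ADST_DCT, DCT_ADST, ADST_ADST } TX_TYPE;

/* Declarations as used in the diff (definitions live in libvpx). */
void vp9_highbd_fdct8x8(const int16_t *input, tran_low_t *output, int stride);
void vp9_highbd_fht8x8(const int16_t *input, tran_low_t *output, int stride,
                       int tx_type);

/* Hypothetical wrapper showing the pattern the hunks repeat per tx size. */
static void forward_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
                             int diff_stride, TX_TYPE tx_type) {
  if (tx_type == DCT_DCT)
    vp9_highbd_fdct8x8(src_diff, coeff, diff_stride);
  else
    vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
}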
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 10180f2..a1d77db 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -29,7 +29,7 @@
   vp9_tokens_from_tree(mv_fp_encodings, vp9_mv_fp_tree);
 }
 
-static void encode_mv_component(vp9_writer* w, int comp,
+static void encode_mv_component(vpx_writer* w, int comp,
                                 const nmv_component* mvcomp, int usehp) {
   int offset;
   const int sign = comp < 0;
@@ -42,7 +42,7 @@
   assert(comp != 0);
 
   // Sign
-  vp9_write(w, sign, mvcomp->sign);
+  vpx_write(w, sign, mvcomp->sign);
 
   // Class
   vp9_write_token(w, vp9_mv_class_tree, mvcomp->classes,
@@ -56,7 +56,7 @@
     int i;
     const int n = mv_class + CLASS0_BITS - 1;  // number of bits
     for (i = 0; i < n; ++i)
-      vp9_write(w, (d >> i) & 1, mvcomp->bits[i]);
+      vpx_write(w, (d >> i) & 1, mvcomp->bits[i]);
   }
 
   // Fractional bits
@@ -66,7 +66,7 @@
 
   // High precision bit
   if (usehp)
-    vp9_write(w, hp,
+    vpx_write(w, hp,
               mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp);
 }
 
@@ -133,23 +133,23 @@
   }
 }
 
-static int update_mv(vp9_writer *w, const unsigned int ct[2], vp9_prob *cur_p,
-                     vp9_prob upd_p) {
-  const vp9_prob new_p = get_binary_prob(ct[0], ct[1]) | 1;
+static int update_mv(vpx_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
+                     vpx_prob upd_p) {
+  const vpx_prob new_p = get_binary_prob(ct[0], ct[1]) | 1;
   const int update = cost_branch256(ct, *cur_p) + vp9_cost_zero(upd_p) >
                      cost_branch256(ct, new_p) + vp9_cost_one(upd_p) + 7 * 256;
-  vp9_write(w, update, upd_p);
+  vpx_write(w, update, upd_p);
   if (update) {
     *cur_p = new_p;
-    vp9_write_literal(w, new_p >> 1, 7);
+    vpx_write_literal(w, new_p >> 1, 7);
   }
   return update;
 }
 
-static void write_mv_update(const vp9_tree_index *tree,
-                            vp9_prob probs[/*n - 1*/],
+static void write_mv_update(const vpx_tree_index *tree,
+                            vpx_prob probs[/*n - 1*/],
                             const unsigned int counts[/*n - 1*/],
-                            int n, vp9_writer *w) {
+                            int n, vpx_writer *w) {
   int i;
   unsigned int branch_ct[32][2];
 
@@ -161,7 +161,7 @@
     update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB);
 }
 
-void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w,
+void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w,
                          nmv_context_counts *const counts) {
   int i, j;
   nmv_context *const mvc = &cm->fc->nmvc;
@@ -199,7 +199,7 @@
   }
 }
 
-void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w,
+void vp9_encode_mv(VP9_COMP* cpi, vpx_writer* w,
                    const MV* mv, const MV* ref,
                    const nmv_context* mvctx, int usehp) {
   const MV diff = {mv->row - ref->row,
diff --git a/vp9/encoder/vp9_encodemv.h b/vp9/encoder/vp9_encodemv.h
index e8ee5ab..5fb114c 100644
--- a/vp9/encoder/vp9_encodemv.h
+++ b/vp9/encoder/vp9_encodemv.h
@@ -20,10 +20,10 @@
 
 void vp9_entropy_mv_init(void);
 
-void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w,
+void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w,
                          nmv_context_counts *const counts);
 
-void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref,
+void vp9_encode_mv(VP9_COMP *cpi, vpx_writer* w, const MV* mv, const MV* ref,
                    const nmv_context* mvctx, int usehp);
 
 void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
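
For context on the encode_mv_component() rewrites above: a motion-vector component is written as a sign bit, a magnitude class, per-class integer bits, a 2-bit fraction, and an optional high-precision bit. The sketch below recomputes that decomposition standalone; the loop over class bases is a simplified restatement of vp9_get_mv_class(), and the constants mirror vp9_entropymv.h.

#include <stdio.h>

enum { CLASS0_SIZE = 2, MV_CLASSES = 11 };

/* Smallest (magnitude - 1) value belonging to class c. */
static int mv_class_base(int c) {
  return c ? CLASS0_SIZE << (c + 2) : 0;
}

static void decompose(int comp) {        /* comp in 1/8-pel units, nonzero */
  const int sign = comp < 0;
  const int mag = sign ? -comp : comp;
  const int z = mag - 1;                 /* magnitudes are coded from 1 */
  int c = 0, offset, d, fr, hp;
  while (c + 1 < MV_CLASSES && z >= mv_class_base(c + 1)) ++c;
  offset = z - mv_class_base(c);
  d = offset >> 3;                       /* integer-pel part */
  fr = (offset >> 1) & 3;                /* 1/2- and 1/4-pel bits */
  hp = offset & 1;                       /* 1/8-pel high-precision bit */
  printf("comp=%4d sign=%d class=%d int=%d frac=%d hp=%d\n",
         comp, sign, c, d, fr, hp);
}

int main(void) {
  decompose(-37);
  decompose(5);
  return 0;
}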
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 6c595b7..781204d 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -19,6 +19,7 @@
 #include "vpx/internal/vpx_psnr.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/vpx_timer.h"
+#include "vpx_scale/vpx_scale.h"
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_filter.h"
@@ -2627,9 +2628,10 @@
   const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi);
   int force_recode = 0;
 
-  if ((cpi->sf.recode_loop == ALLOW_RECODE) ||
+  if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+      (cpi->sf.recode_loop == ALLOW_RECODE) ||
       (frame_is_kfgfarf &&
-      (cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
+       (cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
     if (frame_is_kfgfarf &&
         (oxcf->resize_mode == RESIZE_DYNAMIC) &&
         scale_down(cpi, q)) {
@@ -3059,17 +3061,17 @@
       oxcf->rc_mode == VPX_CBR &&
       !cpi->use_svc &&
       oxcf->resize_mode == RESIZE_DYNAMIC) {
-      if (cpi->resize_state == 1) {
+      if (cpi->resize_pending == 1) {
         oxcf->scaled_frame_width =
             (cm->width * cpi->resize_scale_num) / cpi->resize_scale_den;
         oxcf->scaled_frame_height =
+            (cm->height * cpi->resize_scale_num) / cpi->resize_scale_den;
-      } else if (cpi->resize_state == -1) {
+      } else if (cpi->resize_pending == -1) {
         // Go back up to original size.
         oxcf->scaled_frame_width = oxcf->width;
         oxcf->scaled_frame_height = oxcf->height;
       }
-      if (cpi->resize_state != 0) {
+      if (cpi->resize_pending != 0) {
         // There has been a change in frame size.
         vp9_set_size_literal(cpi,
                              oxcf->scaled_frame_width,
@@ -3140,12 +3142,27 @@
 
   set_frame_size(cpi);
 
-  cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
-                                      &cpi->scaled_source);
-
-  if (cpi->unscaled_last_source != NULL)
-    cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
-                                             &cpi->scaled_last_source);
+  // For 1 pass CBR under dynamic resize mode: use faster scaling for source.
+  // Only for 2x2 scaling for now.
+  if (cpi->oxcf.pass == 0 &&
+      cpi->oxcf.rc_mode == VPX_CBR &&
+      cpi->oxcf.resize_mode == RESIZE_DYNAMIC &&
+      cpi->un_scaled_source->y_width == (cm->width << 1) &&
+      cpi->un_scaled_source->y_height == (cm->height << 1)) {
+    cpi->Source = vp9_scale_if_required_fast(cm,
+                                             cpi->un_scaled_source,
+                                             &cpi->scaled_source);
+    if (cpi->unscaled_last_source != NULL)
+      cpi->Last_Source = vp9_scale_if_required_fast(cm,
+                                                    cpi->unscaled_last_source,
+                                                    &cpi->scaled_last_source);
+  } else {
+    cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
+                                        &cpi->scaled_source);
+    if (cpi->unscaled_last_source != NULL)
+      cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
+                                               &cpi->scaled_last_source);
+  }
 
   if (frame_is_intra_only(cm) == 0) {
     vp9_scale_references(cpi);
@@ -3491,6 +3508,21 @@
   }
 }
 
+YV12_BUFFER_CONFIG *vp9_scale_if_required_fast(VP9_COMMON *cm,
+                                               YV12_BUFFER_CONFIG *unscaled,
+                                               YV12_BUFFER_CONFIG *scaled) {
+  if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
+      cm->mi_rows * MI_SIZE != unscaled->y_height) {
+    // For 2x2 scaling down.
+    vpx_scale_frame(unscaled, scaled, unscaled->y_buffer, 9, 2, 1,
+                    2, 1, 0);
+    vp9_extend_frame_borders(scaled);
+    return scaled;
+  } else {
+    return unscaled;
+  }
+}
+
 YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
                                           YV12_BUFFER_CONFIG *unscaled,
                                           YV12_BUFFER_CONFIG *scaled) {
@@ -4275,7 +4307,7 @@
 #if CONFIG_INTERNAL_STATS
 
   if (oxcf->pass != 1) {
-    double samples;
+    double samples = 0.0;
     cpi->bytes += (int)(*size);
 
     if (cm->show_frame) {
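
vp9_scale_if_required_fast() above delegates 2:1 downscaling in both directions to vpx_scale_frame(). As a rough picture of the data flow only, a plain 2x2 box-average downsample of one plane is sketched below; the library's scaler is filtered and operates on whole YV12 buffers, so this kernel is a stand-in, not the vpx_scale implementation.

#include <stdint.h>

static void downscale_2x2(const uint8_t *src, int src_stride,
                          uint8_t *dst, int dst_stride,
                          int dst_w, int dst_h) {
  for (int y = 0; y < dst_h; ++y) {
    const uint8_t *r0 = src + (2 * y) * src_stride;
    const uint8_t *r1 = r0 + src_stride;
    for (int x = 0; x < dst_w; ++x) {
      /* Average the 2x2 source neighborhood with rounding. */
      dst[y * dst_stride + x] =
          (uint8_t)((r0[2 * x] + r0[2 * x + 1] +
                     r1[2 * x] + r1[2 * x + 1] + 2) >> 2);
    }
  }
}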
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index f095cad..b76b6b7 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -55,7 +55,7 @@
   int nmvcosts[2][MV_VALS];
   int nmvcosts_hp[2][MV_VALS];
 
-  vp9_prob segment_pred_probs[PREDICTION_PROBS];
+  vpx_prob segment_pred_probs[PREDICTION_PROBS];
 
   unsigned char *last_frame_seg_map_copy;
 
@@ -614,6 +614,10 @@
 
 void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
 
+YV12_BUFFER_CONFIG *vp9_scale_if_required_fast(VP9_COMMON *cm,
+                                               YV12_BUFFER_CONFIG *unscaled,
+                                               YV12_BUFFER_CONFIG *scaled);
+
 YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
                                           YV12_BUFFER_CONFIG *unscaled,
                                           YV12_BUFFER_CONFIG *scaled);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 61279f8..5caf2cb 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -1883,7 +1883,7 @@
   double gf_group_error_left;
   int gf_arf_bits;
   const int is_key_frame = frame_is_intra_only(cm);
-  const int kf_or_arf_active = is_key_frame || rc->source_alt_ref_active;
+  const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active;
 
   // Reset the GF group data structures unless this is a key
   // frame in which case it will already have been done.
@@ -1903,7 +1903,7 @@
 
   // If this is a key frame or the overlay from a previous arf then
   // the error score / cost of this frame has already been accounted for.
-  if (is_key_frame || rc->source_alt_ref_active) {
+  if (arf_active_or_kf) {
     gf_group_err -= gf_first_frame_err;
 #if GROUP_ADAPTIVE_MAXQ
     gf_group_raw_error -= this_frame->coded_error;
@@ -1936,7 +1936,7 @@
       // bits to spare and are better with a smaller interval and smaller boost.
       // At high Q when there are few bits to spare we are better with a longer
       // interval to spread the cost of the GF.
-      active_max_gf_interval = rc->max_gf_interval - 4 + MIN(4, (int_lbq / 6));
+      active_max_gf_interval = 12 + MIN(4, (int_lbq / 6));
       if (active_max_gf_interval < active_min_gf_interval)
         active_max_gf_interval = active_min_gf_interval;
 
@@ -2001,11 +2001,11 @@
     // Break out conditions.
     if (
       // Break at active_max_gf_interval unless almost totally static.
-      ((i >= active_max_gf_interval + kf_or_arf_active) &&
-       (zero_motion_accumulator < 0.995)) ||
+      (i >= (active_max_gf_interval + arf_active_or_kf) &&
+            zero_motion_accumulator < 0.995) ||
       (
         // Don't break out with a very short interval.
-        (i >= active_min_gf_interval + kf_or_arf_active) &&
+        (i >= active_min_gf_interval + arf_active_or_kf) &&
         (!flash_detected) &&
         ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
          (abs_mv_in_out_accumulator > 3.0) ||
@@ -2043,10 +2043,7 @@
   }
 
   // Set the interval until the next gf.
-  if (is_key_frame || rc->source_alt_ref_pending)
-    rc->baseline_gf_interval = i - 1;
-  else
-    rc->baseline_gf_interval = i;
+  rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
 
   // Only encode alt reference frame in temporal base layer. So
   // baseline_gf_interval should be multiple of a temporal layer group
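
The baseline_gf_interval change above folds the key-frame/alt-ref branch into integer arithmetic; this relies on a C logical-or evaluating to exactly 0 or 1. A throwaway equivalence check:

#include <assert.h>

static int old_form(int i, int is_key_frame, int alt_ref_pending) {
  if (is_key_frame || alt_ref_pending)
    return i - 1;
  else
    return i;
}

static int new_form(int i, int is_key_frame, int alt_ref_pending) {
  /* The || expression is 0 or 1, so subtraction matches the branch. */
  return i - (is_key_frame || alt_ref_pending);
}

int main(void) {
  for (int i = 0; i < 32; ++i)
    for (int k = 0; k < 2; ++k)
      for (int a = 0; a < 2; ++a)
        assert(old_form(i, k, a) == new_form(i, k, a));
  return 0;
}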
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index d5eeb9c..acbd7dd 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -145,7 +145,7 @@
     unsigned int err;
 
     xd->mi[0]->mbmi.mode = mode;
-    vp9_predict_intra_block(xd, 0, 2, TX_16X16, mode,
+    vp9_predict_intra_block(xd, 2, TX_16X16, mode,
                             x->plane[0].src.buf, x->plane[0].src.stride,
                             xd->plane[0].dst.buf, xd->plane[0].dst.stride,
                             0, 0, 0);
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index e99cbc7..6d09dbe 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -902,8 +902,7 @@
   p->src.buf = &src_buf_base[4 * (j * src_stride + i)];
   pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)];
   // Use source buffer as an approximation for the fully reconstructed buffer.
-  vp9_predict_intra_block(xd, block >> (2 * tx_size),
-                          b_width_log2_lookup[plane_bsize],
+  vp9_predict_intra_block(xd, b_width_log2_lookup[plane_bsize],
                           tx_size, args->mode,
                           x->skip_encode ? p->src.buf : pd->dst.buf,
                           x->skip_encode ? src_stride : dst_stride,
@@ -1020,9 +1019,9 @@
 static void init_ref_frame_cost(VP9_COMMON *const cm,
                                 MACROBLOCKD *const xd,
                                 int ref_frame_cost[MAX_REF_FRAMES]) {
-  vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
-  vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
-  vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
+  vpx_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
+  vpx_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
+  vpx_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
 
   ref_frame_cost[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
   ref_frame_cost[LAST_FRAME] = ref_frame_cost[GOLDEN_FRAME] =
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 32c1f76..d53d95d 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -9,7 +9,7 @@
  */
 
 #include <math.h>
-
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 
@@ -20,113 +20,6 @@
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rd.h"
 
-void vp9_quantize_dc(const tran_low_t *coeff_ptr,
-                     int n_coeffs, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant,
-                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  const int rc = 0;
-  const int coeff = coeff_ptr[rc];
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int tmp, eob = -1;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-    tmp = (tmp * quant) >> 16;
-    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
-    if (tmp)
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
-                            int n_coeffs, int skip_block,
-                            const int16_t *round_ptr, const int16_t quant,
-                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                            const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  int eob = -1;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    const int coeff = coeff_ptr[0];
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int64_t tmp = abs_coeff + round_ptr[0];
-    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16);
-    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
-    if (abs_qcoeff)
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-#endif
-
-void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
-  const int n_coeffs = 1024;
-  const int rc = 0;
-  const int coeff = coeff_ptr[rc];
-  const int coeff_sign = (coeff >> 31);
-  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-  int tmp, eob = -1;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
-                INT16_MIN, INT16_MAX);
-    tmp = (tmp * quant) >> 15;
-    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
-    if (tmp)
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
-                                  int skip_block,
-                                  const int16_t *round_ptr,
-                                  const int16_t quant,
-                                  tran_low_t *qcoeff_ptr,
-                                  tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr,
-                                  uint16_t *eob_ptr) {
-  const int n_coeffs = 1024;
-  int eob = -1;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    const int coeff = coeff_ptr[0];
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
-    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);
-    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
-    if (abs_qcoeff)
-      eob = 0;
-  }
-  *eob_ptr = eob + 1;
-}
-#endif
-
 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        int skip_block,
                        const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -298,224 +191,6 @@
 }
 #endif
 
-void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                      int skip_block,
-                      const int16_t *zbin_ptr, const int16_t *round_ptr,
-                      const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
-                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                      const int16_t *dequant_ptr,
-                      uint16_t *eob_ptr,
-                      const int16_t *scan, const int16_t *iscan) {
-  int i, non_zero_count = (int)n_coeffs, eob = -1;
-  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
-  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = (int)n_coeffs - 1; i >= 0; i--) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-
-      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
-        non_zero_count--;
-      else
-        break;
-    }
-
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < non_zero_count; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-      if (abs_coeff >= zbins[rc != 0]) {
-        int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-        tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
-                  quant_shift_ptr[rc != 0]) >> 16;  // quantization
-        qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-
-        if (tmp)
-          eob = i;
-      }
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             int skip_block, const int16_t *zbin_ptr,
-                             const int16_t *round_ptr, const int16_t *quant_ptr,
-                             const int16_t *quant_shift_ptr,
-                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                             const int16_t *dequant_ptr,
-                             uint16_t *eob_ptr, const int16_t *scan,
-                             const int16_t *iscan) {
-  int i, non_zero_count = (int)n_coeffs, eob = -1;
-  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
-  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = (int)n_coeffs - 1; i >= 0; i--) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-
-      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
-        non_zero_count--;
-      else
-        break;
-    }
-
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < non_zero_count; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-      if (abs_coeff >= zbins[rc != 0]) {
-        const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
-        const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
-        const uint32_t abs_qcoeff =
-            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
-        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-        if (abs_qcoeff)
-          eob = i;
-      }
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-#endif
-
-void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            int skip_block,
-                            const int16_t *zbin_ptr, const int16_t *round_ptr,
-                            const int16_t *quant_ptr,
-                            const int16_t *quant_shift_ptr,
-                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                            const int16_t *dequant_ptr,
-                            uint16_t *eob_ptr,
-                            const int16_t *scan, const int16_t *iscan) {
-  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
-                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
-  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
-
-  int idx = 0;
-  int idx_arr[1024];
-  int i, eob = -1;
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-
-      // If the coefficient is out of the base ZBIN range, keep it for
-      // quantization.
-      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
-        idx_arr[idx++] = i;
-    }
-
-    // Quantization pass: only process the coefficients selected in
-    // pre-scan pass. Note: idx can be zero.
-    for (i = 0; i < idx; i++) {
-      const int rc = scan[idx_arr[i]];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      int tmp;
-      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-      abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
-      tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
-               quant_shift_ptr[rc != 0]) >> 15;
-
-      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-
-      if (tmp)
-        eob = idx_arr[i];
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
-                                   intptr_t n_coeffs, int skip_block,
-                                   const int16_t *zbin_ptr,
-                                   const int16_t *round_ptr,
-                                   const int16_t *quant_ptr,
-                                   const int16_t *quant_shift_ptr,
-                                   tran_low_t *qcoeff_ptr,
-                                   tran_low_t *dqcoeff_ptr,
-                                   const int16_t *dequant_ptr,
-                                   uint16_t *eob_ptr,
-                                   const int16_t *scan, const int16_t *iscan) {
-  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
-                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
-  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
-
-  int idx = 0;
-  int idx_arr[1024];
-  int i, eob = -1;
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-
-      // If the coefficient is out of the base ZBIN range, keep it for
-      // quantization.
-      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
-        idx_arr[idx++] = i;
-    }
-
-    // Quantization pass: only process the coefficients selected in
-    // pre-scan pass. Note: idx can be zero.
-    for (i = 0; i < idx; i++) {
-      const int rc = scan[idx_arr[i]];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      const int64_t tmp1 = abs_coeff
-                         + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
-      const uint32_t abs_qcoeff =
-          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
-      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-      if (abs_qcoeff)
-        eob = idx_arr[i];
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-#endif
-
 void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                 const int16_t *scan, const int16_t *iscan) {
   MACROBLOCKD *const xd = &x->e_mbd;
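
The bulk of the removals above relocate the B-quantizers to vpx_dsp. For reference, the per-coefficient kernel inside the removed vp9_quantize_b_c() works as restated below; this single-coefficient sketch keeps the same fixed-point steps but drops the scan/pre-scan logic.

#include <stdint.h>
#include <limits.h>

static int16_t clamp16(int v) {
  return (int16_t)(v < INT16_MIN ? INT16_MIN : (v > INT16_MAX ? INT16_MAX : v));
}

static void quantize_coeff(int32_t coeff, int is_dc,
                           const int16_t zbin[2], const int16_t round[2],
                           const int16_t quant[2], const int16_t qshift[2],
                           const int16_t dequant[2],
                           int32_t *qcoeff, int32_t *dqcoeff) {
  const int band = !is_dc;                       /* [0] = DC, [1] = AC */
  const int sign = coeff >> 31;                  /* 0 or -1 */
  const int abs_coeff = (coeff ^ sign) - sign;   /* branchless abs() */
  *qcoeff = *dqcoeff = 0;
  if (abs_coeff >= zbin[band]) {                 /* dead-zone check */
    int tmp = clamp16(abs_coeff + round[band]);
    /* Two-stage Q16 multiply, as in the removed quantization pass. */
    tmp = ((((tmp * quant[band]) >> 16) + tmp) * qshift[band]) >> 16;
    *qcoeff = (tmp ^ sign) - sign;               /* restore sign */
    *dqcoeff = *qcoeff * dequant[band];
  }
}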
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 55e5469..6132036 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -37,34 +37,9 @@
   DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
 } QUANTS;
 
-void vp9_quantize_dc(const tran_low_t *coeff_ptr,
-                     int n_coeffs, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant_ptr,
-                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr);
-void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant_ptr,
-                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr);
 void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                 const int16_t *scan, const int16_t *iscan);
 
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
-                            int n_coeffs, int skip_block,
-                            const int16_t *round_ptr, const int16_t quant_ptr,
-                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                            const int16_t dequant_ptr, uint16_t *eob_ptr);
-void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
-                                  int skip_block,
-                                  const int16_t *round_ptr,
-                                  const int16_t quant_ptr,
-                                  tran_low_t *qcoeff_ptr,
-                                  tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr,
-                                  uint16_t *eob_ptr);
-#endif
-
 struct VP9_COMP;
 struct VP9Common;
 
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index be09bca..2be2a64 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -281,11 +281,14 @@
   // Assume we do not need any constraint lower than 4K 20 fps
   static const double factor_safe = 3840 * 2160 * 20.0;
   const double factor = width * height * framerate;
+  const double default_interval =
+      MIN(MAX_GF_INTERVAL, MAX(MIN_GF_INTERVAL, (int)(framerate * 0.125)));
 
   if (factor <= factor_safe)
-    return MIN_GF_INTERVAL;
+    return (int)default_interval;
   else
-    return (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5);
+    return (int)MAX(default_interval,
+                    (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5));
   // Note this logic makes:
   // 4K24: 5
   // 4K30: 6
@@ -294,6 +297,7 @@
 
 int vp9_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
   int interval = MIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
+  interval += (interval & 0x01);  // Round to even value
   return MAX(interval, min_gf_interval);
 }
 
@@ -1351,9 +1355,11 @@
   }
 
   // Trigger the resizing of the next frame if it is scaled.
-  cpi->resize_pending =
-      rc->next_frame_size_selector != rc->frame_size_selector;
-  rc->frame_size_selector = rc->next_frame_size_selector;
+  if (oxcf->pass != 0) {
+    cpi->resize_pending =
+        rc->next_frame_size_selector != rc->frame_size_selector;
+    rc->frame_size_selector = rc->next_frame_size_selector;
+  }
 }
 
 void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
@@ -1628,9 +1634,9 @@
 
   vp9_rc_set_frame_target(cpi, target);
   if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC)
-    cpi->resize_state = vp9_resize_one_pass_cbr(cpi);
+    cpi->resize_pending = vp9_resize_one_pass_cbr(cpi);
   else
-    cpi->resize_state = 0;
+    cpi->resize_pending = 0;
 }
 
 int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
@@ -1693,7 +1699,6 @@
   if (rc->max_gf_interval == 0)
     rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
         cpi->framerate, rc->min_gf_interval);
-  rc->max_gf_interval += (rc->max_gf_interval & 0x01);
 
   // Extended interval for genuinely static scenes
   rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
@@ -1805,12 +1810,12 @@
     cpi->resize_count = 0;
     return 0;
   }
-  // Resize based on average QP over some window.
+  // Resize based on average buffer underflow and QP over some window.
   // Ignore samples close to key frame, since QP is usually high after key.
   if (cpi->rc.frames_since_key > 2 * cpi->framerate) {
     const int window = (int)(5 * cpi->framerate);
     cpi->resize_avg_qp += cm->base_qindex;
-    if (cpi->rc.buffer_level < 0)
+    if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100))
       ++cpi->resize_buffer_underflow;
     ++cpi->resize_count;
     // Check for resize action every "window" frames.
@@ -1821,11 +1826,13 @@
       // Resize back up if average QP is low, and we are currently in a resized
       // down state.
       if (cpi->resize_state == 0 &&
-          cpi->resize_buffer_underflow > (cpi->resize_count >> 3)) {
+          cpi->resize_buffer_underflow > (cpi->resize_count >> 2)) {
         resize_now = 1;
+        cpi->resize_state = 1;
       } else if (cpi->resize_state == 1 &&
                  avg_qp < 40 * cpi->rc.worst_quality / 100) {
         resize_now = -1;
+        cpi->resize_state = 0;
       }
       // Reset for next window measurement.
       cpi->resize_avg_qp = 0;
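
The ratectrl hunks above retune the 1-pass CBR dynamic-resize trigger: underflow is now counted whenever the buffer drops below 30% of its optimal level (not only below zero), a down-resize needs underflows in more than a quarter (was an eighth) of the window, and resize_state is updated at the decision point. A condensed sketch of that per-window decision, with the surrounding bookkeeping simplified:

#include <stdint.h>

typedef struct {
  int state;            /* 0 = full size, 1 = scaled down    */
  int avg_qp_sum;       /* accumulated base_qindex           */
  int underflow_count;  /* frames below the buffer threshold */
  int count;            /* frames in the current window      */
} resize_window;

/* Returns 1 to scale down, -1 to scale back up, 0 for no change. */
static int resize_decision(resize_window *rw, int window,
                           int64_t buffer_level, int64_t optimal_level,
                           int base_qindex, int worst_quality) {
  int resize_now = 0;
  rw->avg_qp_sum += base_qindex;
  if (buffer_level < 30 * optimal_level / 100)
    ++rw->underflow_count;
  if (++rw->count >= window) {
    const int avg_qp = rw->avg_qp_sum / rw->count;
    if (rw->state == 0 && rw->underflow_count > (rw->count >> 2)) {
      resize_now = 1;
      rw->state = 1;
    } else if (rw->state == 1 && avg_qp < 40 * worst_quality / 100) {
      resize_now = -1;
      rw->state = 0;
    }
    rw->avg_qp_sum = rw->underflow_count = rw->count = 0;
  }
  return resize_now;
}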
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index bc7cb34..3f6de42 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -93,7 +93,7 @@
       for (j = 0; j < REF_TYPES; ++j)
         for (k = 0; k < COEF_BANDS; ++k)
           for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
-            vp9_prob probs[ENTROPY_NODES];
+            vpx_prob probs[ENTROPY_NODES];
             vp9_model_to_full_probs(p[t][i][j][k][l], probs);
             vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs,
                             vp9_coef_tree);
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 9985f89..8ae6783 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -641,7 +641,7 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
+  vpx_prob skip_prob = vp9_get_skip_prob(cm, xd);
   int r[TX_SIZES][2], s[TX_SIZES];
   int64_t d[TX_SIZES], sse[TX_SIZES];
   int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
@@ -654,7 +654,7 @@
   int64_t best_rd = INT64_MAX;
   TX_SIZE best_tx = max_tx_size;
 
-  const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
+  const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
   assert(skip_prob > 0);
   s0 = vp9_cost_bit(skip_prob, 0);
   s1 = vp9_cost_bit(skip_prob, 1);
@@ -760,7 +760,8 @@
   return 0;
 }
 
-static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
+static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x,
+                                     int row, int col,
                                      PREDICTION_MODE *best_mode,
                                      const int *bmode_costs,
                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
@@ -770,18 +771,14 @@
   PREDICTION_MODE mode;
   MACROBLOCKD *const xd = &x->e_mbd;
   int64_t best_rd = rd_thresh;
-
   struct macroblock_plane *p = &x->plane[0];
   struct macroblockd_plane *pd = &xd->plane[0];
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
-  const uint8_t *src_init = &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, ib,
-                                                                src_stride)];
-  uint8_t *dst_init = &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, ib,
-                                                           dst_stride)];
+  const uint8_t *src_init = &p->src.buf[row * 4 * src_stride + col * 4];
+  uint8_t *dst_init = &pd->dst.buf[row * 4 * dst_stride + col * 4];
   ENTROPY_CONTEXT ta[2], tempa[2];
   ENTROPY_CONTEXT tl[2], templ[2];
-
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   int idx, idy;
@@ -790,8 +787,6 @@
   uint16_t best_dst16[8 * 8];
 #endif
 
-  assert(ib < 4);
-
   memcpy(ta, a, sizeof(ta));
   memcpy(tl, l, sizeof(tl));
   xd->mi[0]->mbmi.tx_size = TX_4X4;
@@ -819,7 +814,7 @@
 
       for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
         for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
-          const int block = ib + idy * 2 + idx;
+          const int block = (row + idy) * 2 + (col + idx);
           const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
           uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
           int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8,
@@ -827,11 +822,11 @@
                                                                   p->src_diff);
           tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
           xd->mi[0]->bmi[block].as_mode = mode;
-          vp9_predict_intra_block(xd, block, 1,
-                                  TX_4X4, mode,
+          vp9_predict_intra_block(xd, 1, TX_4X4, mode,
                                   x->skip_encode ? src : dst,
                                   x->skip_encode ? src_stride : dst_stride,
-                                  dst, dst_stride, idx, idy, 0);
+                                  dst, dst_stride,
+                                  col + idx, row + idy, 0);
           vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride,
                                     dst, dst_stride, xd->bd);
           if (xd->lossless) {
@@ -850,7 +845,10 @@
             int64_t unused;
             const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
             const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
-            vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type);
+            if (tx_type == DCT_DCT)
+              vp9_highbd_fdct4x4(src_diff, coeff, 8);
+            else
+              vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type);
             vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
             ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
                                  so->scan, so->neighbors,
@@ -920,18 +918,17 @@
 
     for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
       for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
-        const int block = ib + idy * 2 + idx;
+        const int block = (row + idy) * 2 + (col + idx);
         const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
         uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
         int16_t *const src_diff =
             vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
         tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
         xd->mi[0]->bmi[block].as_mode = mode;
-        vp9_predict_intra_block(xd, block, 1,
-                                TX_4X4, mode,
+        vp9_predict_intra_block(xd, 1, TX_4X4, mode,
                                 x->skip_encode ? src : dst,
                                 x->skip_encode ? src_stride : dst_stride,
-                                dst, dst_stride, idx, idy, 0);
+                                dst, dst_stride, col + idx, row + idy, 0);
         vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
 
         if (xd->lossless) {
@@ -1030,9 +1027,9 @@
         bmode_costs  = cpi->y_mode_costs[A][L];
       }
 
-      this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
-                                      t_above + idx, t_left + idy, &r, &ry, &d,
-                                      bsize, best_rd - total_rd);
+      this_rd = rd_pick_intra4x4block(cpi, mb, idy, idx, &best_mode,
+                                      bmode_costs, t_above + idx, t_left + idy,
+                                      &r, &ry, &d, bsize, best_rd - total_rd);
       if (this_rd >= best_rd - total_rd)
         return INT64_MAX;
 
@@ -2121,7 +2118,7 @@
                                      int segment_id,
                                      unsigned int *ref_costs_single,
                                      unsigned int *ref_costs_comp,
-                                     vp9_prob *comp_mode_p) {
+                                     vpx_prob *comp_mode_p) {
   int seg_ref_active = segfeature_active(&cm->seg, segment_id,
                                          SEG_LVL_REF_FRAME);
   if (seg_ref_active) {
@@ -2129,8 +2126,8 @@
     memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
     *comp_mode_p = 128;
   } else {
-    vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
-    vp9_prob comp_inter_p = 128;
+    vpx_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
+    vpx_prob comp_inter_p = 128;
 
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
       comp_inter_p = vp9_get_reference_mode_prob(cm, xd);
@@ -2142,8 +2139,8 @@
     ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
 
     if (cm->reference_mode != COMPOUND_REFERENCE) {
-      vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
-      vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
+      vpx_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
+      vpx_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
 
       if (cm->reference_mode == REFERENCE_MODE_SELECT)
@@ -2162,7 +2159,7 @@
       ref_costs_single[ALTREF_FRAME] = 512;
     }
     if (cm->reference_mode != SINGLE_REFERENCE) {
-      vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
+      vpx_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
 
       if (cm->reference_mode == REFERENCE_MODE_SELECT)
@@ -3006,7 +3003,7 @@
   int best_mode_skippable = 0;
   int midx, best_mode_index = -1;
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
-  vp9_prob comp_mode_p;
+  vpx_prob comp_mode_p;
   int64_t best_intra_rd = INT64_MAX;
   unsigned int best_pred_sse = UINT_MAX;
   PREDICTION_MODE best_intra_mode = DC_PRED;
@@ -3699,7 +3696,7 @@
   int64_t best_pred_diff[REFERENCE_MODES];
   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
-  vp9_prob comp_mode_p;
+  vpx_prob comp_mode_p;
   INTERP_FILTER best_filter = SWITCHABLE;
   int64_t this_rd = INT64_MAX;
   int rate2 = 0;
@@ -3815,7 +3812,7 @@
   MB_MODE_INFO best_mbmode;
   int ref_index, best_ref_index = 0;
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
-  vp9_prob comp_mode_p;
+  vpx_prob comp_mode_p;
   INTERP_FILTER tmp_best_filter = SWITCHABLE;
   int rate_uv_intra, rate_uv_tokenonly;
   int64_t dist_uv;
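
rd_pick_intra4x4block() above now takes (row, col) instead of the raster index ib; within an 8x8 the 4x4 sub-blocks are raster ordered two per row, so the two encodings carry the same information, as this small check illustrates:

#include <assert.h>

int main(void) {
  for (int ib = 0; ib < 4; ++ib) {
    const int row = ib >> 1;  /* ib / 2 */
    const int col = ib & 1;   /* ib % 2 */
    /* Matches block = (row + idy) * 2 + (col + idx) at idx = idy = 0. */
    assert(row * 2 + col == ib);
  }
  return 0;
}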
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index 1f0d4df..c5c50a2 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -49,7 +49,7 @@
 }
 
 // Based on set of segment counts calculate a probability tree
-static void calc_segtree_probs(int *segcounts, vp9_prob *segment_tree_probs) {
+static void calc_segtree_probs(int *segcounts, vpx_prob *segment_tree_probs) {
   // Work out probabilities of each segment
   const int c01 = segcounts[0] + segcounts[1];
   const int c23 = segcounts[2] + segcounts[3];
@@ -66,7 +66,7 @@
 }
 
 // Based on set of segment counts and probabilities calculate a cost estimate
-static int cost_segmap(int *segcounts, vp9_prob *probs) {
+static int cost_segmap(int *segcounts, vpx_prob *probs) {
   const int c01 = segcounts[0] + segcounts[1];
   const int c23 = segcounts[2] + segcounts[3];
   const int c45 = segcounts[4] + segcounts[5];
@@ -207,9 +207,9 @@
   int no_pred_segcounts[MAX_SEGMENTS] = { 0 };
   int t_unpred_seg_counts[MAX_SEGMENTS] = { 0 };
 
-  vp9_prob no_pred_tree[SEG_TREE_PROBS];
-  vp9_prob t_pred_tree[SEG_TREE_PROBS];
-  vp9_prob t_nopred_prob[PREDICTION_PROBS];
+  vpx_prob no_pred_tree[SEG_TREE_PROBS];
+  vpx_prob t_pred_tree[SEG_TREE_PROBS];
+  vpx_prob t_nopred_prob[PREDICTION_PROBS];
 
   // Set default state for the segment tree probabilities and the
   // temporal coding probabilities
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index e544f9b..bf06fd0 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -118,10 +118,11 @@
   if (speed >= 1) {
     if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
         vp9_internal_image_edge(cpi)) {
-      sf->use_square_partition_only = frame_is_boosted(cpi);
+      sf->use_square_partition_only = !frame_is_boosted(cpi);
     } else {
       sf->use_square_partition_only = !frame_is_intra_only(cm);
     }
+
     sf->less_rectangular_check  = 1;
 
     sf->use_rd_breakout = 1;
diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c
index b345b16..799f179 100644
--- a/vp9/encoder/vp9_subexp.c
+++ b/vp9/encoder/vp9_subexp.c
@@ -7,13 +7,12 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include "vpx_dsp/bitwriter.h"
 
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropy.h"
-
 #include "vp9/encoder/vp9_cost.h"
 #include "vp9/encoder/vp9_subexp.h"
-#include "vp9/encoder/vp9_writer.h"
 
 #define vp9_cost_upd256  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
 
@@ -79,50 +78,50 @@
   return i;
 }
 
-static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
+static int prob_diff_update_cost(vpx_prob newp, vpx_prob oldp) {
   int delp = remap_prob(newp, oldp);
   return update_bits[delp] * 256;
 }
 
-static void encode_uniform(vp9_writer *w, int v) {
+static void encode_uniform(vpx_writer *w, int v) {
   const int l = 8;
   const int m = (1 << l) - 191;
   if (v < m) {
-    vp9_write_literal(w, v, l - 1);
+    vpx_write_literal(w, v, l - 1);
   } else {
-    vp9_write_literal(w, m + ((v - m) >> 1), l - 1);
-    vp9_write_literal(w, (v - m) & 1, 1);
+    vpx_write_literal(w, m + ((v - m) >> 1), l - 1);
+    vpx_write_literal(w, (v - m) & 1, 1);
   }
 }
 
-static INLINE int write_bit_gte(vp9_writer *w, int word, int test) {
-  vp9_write_literal(w, word >= test, 1);
+static INLINE int write_bit_gte(vpx_writer *w, int word, int test) {
+  vpx_write_literal(w, word >= test, 1);
   return word >= test;
 }
 
-static void encode_term_subexp(vp9_writer *w, int word) {
+static void encode_term_subexp(vpx_writer *w, int word) {
   if (!write_bit_gte(w, word, 16)) {
-    vp9_write_literal(w, word, 4);
+    vpx_write_literal(w, word, 4);
   } else if (!write_bit_gte(w, word, 32)) {
-    vp9_write_literal(w, word - 16, 4);
+    vpx_write_literal(w, word - 16, 4);
   } else if (!write_bit_gte(w, word, 64)) {
-    vp9_write_literal(w, word - 32, 5);
+    vpx_write_literal(w, word - 32, 5);
   } else {
     encode_uniform(w, word - 64);
   }
 }
 
-void vp9_write_prob_diff_update(vp9_writer *w, vp9_prob newp, vp9_prob oldp) {
+void vp9_write_prob_diff_update(vpx_writer *w, vpx_prob newp, vpx_prob oldp) {
   const int delp = remap_prob(newp, oldp);
   encode_term_subexp(w, delp);
 }
 
 int vp9_prob_diff_update_savings_search(const unsigned int *ct,
-                                        vp9_prob oldp, vp9_prob *bestp,
-                                        vp9_prob upd) {
+                                        vpx_prob oldp, vpx_prob *bestp,
+                                        vpx_prob upd) {
   const int old_b = cost_branch256(ct, oldp);
   int bestsavings = 0;
-  vp9_prob newp, bestnewp = oldp;
+  vpx_prob newp, bestnewp = oldp;
   const int step = *bestp > oldp ? -1 : 1;
 
   for (newp = *bestp; newp != oldp; newp += step) {
@@ -139,15 +138,15 @@
 }
 
 int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const vp9_prob *oldp,
-                                              vp9_prob *bestp,
-                                              vp9_prob upd,
+                                              const vpx_prob *oldp,
+                                              vpx_prob *bestp,
+                                              vpx_prob upd,
                                               int stepsize) {
   int i, old_b, new_b, update_b, savings, bestsavings, step;
   int newp;
-  vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
+  vpx_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
   vp9_model_to_full_probs(oldp, oldplist);
-  memcpy(newplist, oldp, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+  memcpy(newplist, oldp, sizeof(vpx_prob) * UNCONSTRAINED_NODES);
   for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
     old_b += cost_branch256(ct + 2 * i, oldplist[i]);
   old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]);
@@ -197,18 +196,18 @@
   return bestsavings;
 }
 
-void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
+void vp9_cond_prob_diff_update(vpx_writer *w, vpx_prob *oldp,
                                const unsigned int ct[2]) {
-  const vp9_prob upd = DIFF_UPDATE_PROB;
-  vp9_prob newp = get_binary_prob(ct[0], ct[1]);
+  const vpx_prob upd = DIFF_UPDATE_PROB;
+  vpx_prob newp = get_binary_prob(ct[0], ct[1]);
   const int savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp,
                                                           upd);
   assert(newp >= 1);
   if (savings > 0) {
-    vp9_write(w, 1, upd);
+    vpx_write(w, 1, upd);
     vp9_write_prob_diff_update(w, newp, *oldp);
     *oldp = newp;
   } else {
-    vp9_write(w, 0, upd);
+    vpx_write(w, 0, upd);
   }
 }
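
encode_term_subexp() above emits a terminated subexponential code: up to three range-test bits select a 4-, 4-, or 5-bit field, falling through to a quasi-uniform code for values >= 64. The standalone restatement below prints bits into a string instead of driving a vpx_writer, so the codeword structure is visible:

#include <stdio.h>

static void put_bits(char **p, int v, int n) {
  while (n--) *(*p)++ = (char)('0' + ((v >> n) & 1));  /* MSB first */
}

static void encode_uniform_sketch(char **p, int v) {
  const int l = 8, m = (1 << l) - 191;  /* m = 65: short codes for v < m */
  if (v < m) {
    put_bits(p, v, l - 1);
  } else {
    put_bits(p, m + ((v - m) >> 1), l - 1);
    put_bits(p, (v - m) & 1, 1);
  }
}

static void encode_term_subexp_sketch(char *out, int word) {
  char *p = out;
  if (word < 16) {
    put_bits(&p, 0, 1); put_bits(&p, word, 4);
  } else if (word < 32) {
    put_bits(&p, 1, 1); put_bits(&p, 0, 1); put_bits(&p, word - 16, 4);
  } else if (word < 64) {
    put_bits(&p, 1, 1); put_bits(&p, 1, 1); put_bits(&p, 0, 1);
    put_bits(&p, word - 32, 5);
  } else {
    put_bits(&p, 1, 1); put_bits(&p, 1, 1); put_bits(&p, 1, 1);
    encode_uniform_sketch(&p, word - 64);
  }
  *p = '\0';
}

int main(void) {
  char buf[16];
  for (int w = 0; w < 256; w += 47) {
    encode_term_subexp_sketch(buf, w);
    printf("%3d -> %s\n", w, buf);
  }
  return 0;
}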
diff --git a/vp9/encoder/vp9_subexp.h b/vp9/encoder/vp9_subexp.h
index 6fbb747..b968232 100644
--- a/vp9/encoder/vp9_subexp.h
+++ b/vp9/encoder/vp9_subexp.h
@@ -16,25 +16,25 @@
 extern "C" {
 #endif
 
-#include "vp9/common/vp9_prob.h"
+#include "vpx_dsp/prob.h"
 
-struct vp9_writer;
+struct vpx_writer;
 
-void vp9_write_prob_diff_update(struct vp9_writer *w,
-                                vp9_prob newp, vp9_prob oldp);
+void vp9_write_prob_diff_update(struct vpx_writer *w,
+                                vpx_prob newp, vpx_prob oldp);
 
-void vp9_cond_prob_diff_update(struct vp9_writer *w, vp9_prob *oldp,
+void vp9_cond_prob_diff_update(struct vpx_writer *w, vpx_prob *oldp,
                                const unsigned int ct[2]);
 
 int vp9_prob_diff_update_savings_search(const unsigned int *ct,
-                                        vp9_prob oldp, vp9_prob *bestp,
-                                        vp9_prob upd);
+                                        vpx_prob oldp, vpx_prob *bestp,
+                                        vpx_prob upd);
 
 
 int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const vp9_prob *oldp,
-                                              vp9_prob *bestp,
-                                              vp9_prob upd,
+                                              const vpx_prob *oldp,
+                                              vpx_prob *bestp,
+                                              vpx_prob upd,
                                               int stepsize);
 
 #ifdef __cplusplus
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 181a99c..a1c076a 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -52,7 +52,7 @@
     / 2;
 
 // Array indices are identical to previously-existing CONTEXT_NODE indices
-const vp9_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+const vpx_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
   -EOB_TOKEN, 2,                       // 0  = EOB
   -ZERO_TOKEN, 4,                      // 1  = ZERO
   -ONE_TOKEN, 6,                       // 2  = ONE
@@ -66,12 +66,12 @@
   -CATEGORY5_TOKEN, -CATEGORY6_TOKEN   // 10 = CAT_FIVE
 };
 
-static const vp9_tree_index cat1[2] = {0, 0};
-static const vp9_tree_index cat2[4] = {2, 2, 0, 0};
-static const vp9_tree_index cat3[6] = {2, 2, 4, 4, 0, 0};
-static const vp9_tree_index cat4[8] = {2, 2, 4, 4, 6, 6, 0, 0};
-static const vp9_tree_index cat5[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
-static const vp9_tree_index cat6[28] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
+static const vpx_tree_index cat1[2] = {0, 0};
+static const vpx_tree_index cat2[4] = {2, 2, 0, 0};
+static const vpx_tree_index cat3[6] = {2, 2, 4, 4, 0, 0};
+static const vpx_tree_index cat4[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vpx_tree_index cat5[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vpx_tree_index cat6[28] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
     14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 0, 0};
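+// In the category trees above, both branches of every node lead to the same
+// next pair and every leaf is token 0; the trees exist only so the generic
+// tree machinery can attach one probability to each category extra bit.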
 
 static const int16_t zero_cost[] = {0};
@@ -367,20 +367,20 @@
 #endif
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static const vp9_tree_index cat1_high10[2] = {0, 0};
-static const vp9_tree_index cat2_high10[4] = {2, 2, 0, 0};
-static const vp9_tree_index cat3_high10[6] = {2, 2, 4, 4, 0, 0};
-static const vp9_tree_index cat4_high10[8] = {2, 2, 4, 4, 6, 6, 0, 0};
-static const vp9_tree_index cat5_high10[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
-static const vp9_tree_index cat6_high10[32] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
+static const vpx_tree_index cat1_high10[2] = {0, 0};
+static const vpx_tree_index cat2_high10[4] = {2, 2, 0, 0};
+static const vpx_tree_index cat3_high10[6] = {2, 2, 4, 4, 0, 0};
+static const vpx_tree_index cat4_high10[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vpx_tree_index cat5_high10[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vpx_tree_index cat6_high10[32] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
   12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28,
   30, 30, 0, 0};
-static const vp9_tree_index cat1_high12[2] = {0, 0};
-static const vp9_tree_index cat2_high12[4] = {2, 2, 0, 0};
-static const vp9_tree_index cat3_high12[6] = {2, 2, 4, 4, 0, 0};
-static const vp9_tree_index cat4_high12[8] = {2, 2, 4, 4, 6, 6, 0, 0};
-static const vp9_tree_index cat5_high12[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
-static const vp9_tree_index cat6_high12[36] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
+static const vpx_tree_index cat1_high12[2] = {0, 0};
+static const vpx_tree_index cat2_high12[4] = {2, 2, 0, 0};
+static const vpx_tree_index cat3_high12[6] = {2, 2, 4, 4, 0, 0};
+static const vpx_tree_index cat4_high12[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vpx_tree_index cat5_high12[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vpx_tree_index cat6_high12[36] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
   12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28,
   30, 30, 32, 32, 34, 34, 0, 0};
 #endif
@@ -457,7 +457,7 @@
                    aoff, loff);
 }
 
-static INLINE void add_token(TOKENEXTRA **t, const vp9_prob *context_tree,
+static INLINE void add_token(TOKENEXTRA **t, const vpx_prob *context_tree,
                              int32_t extra, uint8_t token,
                              uint8_t skip_eob_node,
                              unsigned int *counts) {
@@ -470,7 +470,7 @@
 }
 
 static INLINE void add_token_no_extra(TOKENEXTRA **t,
-                                      const vp9_prob *context_tree,
+                                      const vpx_prob *context_tree,
                                       uint8_t token,
                                       uint8_t skip_eob_node,
                                       unsigned int *counts) {
@@ -511,7 +511,7 @@
   const int ref = is_inter_block(mbmi);
   unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
       td->rd_counts.coef_counts[tx_size][type][ref];
-  vp9_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+  vpx_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
       cpi->common.fc->coef_probs[tx_size][type][ref];
   unsigned int (*const eob_branch)[COEFF_CONTEXTS] =
       td->counts->eob_branch[tx_size][type][ref];
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index 81cc2e1..11b78ba 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -35,14 +35,14 @@
 } TOKENVALUE;
 
 typedef struct {
-  const vp9_prob *context_tree;
+  const vpx_prob *context_tree;
   EXTRABIT extra;
   uint8_t token;
   uint8_t skip_eob_node;
 } TOKENEXTRA;
 
-extern const vp9_tree_index vp9_coef_tree[];
-extern const vp9_tree_index vp9_coef_con_tree[];
+extern const vpx_tree_index vp9_coef_tree[];
+extern const vpx_tree_index vp9_coef_con_tree[];
 extern const struct vp9_token vp9_coef_encodings[];
 
 int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
diff --git a/vp9/encoder/vp9_treewriter.c b/vp9/encoder/vp9_treewriter.c
index bb04b40..0fc078e 100644
--- a/vp9/encoder/vp9_treewriter.c
+++ b/vp9/encoder/vp9_treewriter.c
@@ -10,13 +10,13 @@
 
 #include "vp9/encoder/vp9_treewriter.h"
 
-static void tree2tok(struct vp9_token *tokens, const vp9_tree_index *tree,
+static void tree2tok(struct vp9_token *tokens, const vpx_tree_index *tree,
                      int i, int v, int l) {
   v += v;
   ++l;
 
   do {
-    const vp9_tree_index j = tree[i++];
+    const vpx_tree_index j = tree[i++];
     if (j <= 0) {
       tokens[-j].value = v;
       tokens[-j].len = l;
@@ -27,11 +27,11 @@
 }
 
 void vp9_tokens_from_tree(struct vp9_token *tokens,
-                          const vp9_tree_index *tree) {
+                          const vpx_tree_index *tree) {
   tree2tok(tokens, tree, 0, 0, 0);
 }
 
-static unsigned int convert_distribution(unsigned int i, vp9_tree tree,
+static unsigned int convert_distribution(unsigned int i, vpx_tree tree,
                                          unsigned int branch_ct[][2],
                                          const unsigned int num_events[]) {
   unsigned int left, right;
@@ -51,7 +51,7 @@
   return left + right;
 }
 
-void vp9_tree_probs_from_distribution(vp9_tree tree,
+void vp9_tree_probs_from_distribution(vpx_tree tree,
                                       unsigned int branch_ct[/* n-1 */][2],
                                       const unsigned int num_events[/* n */]) {
   convert_distribution(0, tree, branch_ct, num_events);
diff --git a/vp9/encoder/vp9_treewriter.h b/vp9/encoder/vp9_treewriter.h
index 4a76d87..0f89350 100644
--- a/vp9/encoder/vp9_treewriter.h
+++ b/vp9/encoder/vp9_treewriter.h
@@ -11,13 +11,13 @@
 #ifndef VP9_ENCODER_VP9_TREEWRITER_H_
 #define VP9_ENCODER_VP9_TREEWRITER_H_
 
-#include "vp9/encoder/vp9_writer.h"
+#include "vpx_dsp/bitwriter.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void vp9_tree_probs_from_distribution(vp9_tree tree,
+void vp9_tree_probs_from_distribution(vpx_tree tree,
                                       unsigned int branch_ct[ /* n - 1 */ ][2],
                                       const unsigned int num_events[ /* n */ ]);
 
@@ -26,20 +26,20 @@
   int len;
 };
 
-void vp9_tokens_from_tree(struct vp9_token*, const vp9_tree_index *);
+void vp9_tokens_from_tree(struct vp9_token*, const vpx_tree_index *);
 
-static INLINE void vp9_write_tree(vp9_writer *w, const vp9_tree_index *tree,
-                                  const vp9_prob *probs, int bits, int len,
-                                  vp9_tree_index i) {
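+// Writes the len low-order bits of bits MSB-first, descending the tree one
+// node per bit; probs[i >> 1] is the probability attached to node pair i.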
+static INLINE void vp9_write_tree(vpx_writer *w, const vpx_tree_index *tree,
+                                  const vpx_prob *probs, int bits, int len,
+                                  vpx_tree_index i) {
   do {
     const int bit = (bits >> --len) & 1;
-    vp9_write(w, bit, probs[i >> 1]);
+    vpx_write(w, bit, probs[i >> 1]);
     i = tree[i + bit];
   } while (len);
 }
 
-static INLINE void vp9_write_token(vp9_writer *w, const vp9_tree_index *tree,
-                                   const vp9_prob *probs,
+static INLINE void vp9_write_token(vpx_writer *w, const vpx_tree_index *tree,
+                                   const vpx_prob *probs,
                                    const struct vp9_token *token) {
   vp9_write_tree(w, tree, probs, token->value, token->len, 0);
 }
diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h b/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h
index 5074d31..02773d8 100644
--- a/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h
+++ b/vp9/encoder/x86/vp9_dct32x32_sse2_impl.h
@@ -10,9 +10,6 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_idct.h"  // for cospi constants
-#include "vp9/encoder/x86/vp9_dct_sse2.h"
 #include "vp9/encoder/vp9_dct.h"
 #include "vpx_ports/mem.h"
 
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index cff4fcb..c82f154 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -2266,108 +2266,6 @@
   store_output(&in1, output);
 }
 
-#if CONFIG_VP9_HIGHBITDEPTH
-/* These SSE2 versions of the FHT functions only actually use SSE2 in the
- * DCT_DCT case; in all other cases they revert to C code which is identical
- * to that used by the C versions.
- */
-
-void vp9_highbd_fht4x4_sse2(const int16_t *input, tran_low_t *output,
-                            int stride, int tx_type) {
-  if (tx_type == DCT_DCT) {
-    vp9_highbd_fdct4x4_sse2(input, output, stride);
-  } else {
-    tran_low_t out[4 * 4];
-    tran_low_t *outptr = &out[0];
-    int i, j;
-    tran_low_t temp_in[4], temp_out[4];
-    const transform_2d ht = FHT_4[tx_type];
-
-    // Columns
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j)
-        temp_in[j] = input[j * stride + i] * 16;
-      if (i == 0 && temp_in[0])
-        temp_in[0] += 1;
-      ht.cols(temp_in, temp_out);
-      for (j = 0; j < 4; ++j)
-        outptr[j * 4 + i] = temp_out[j];
-    }
-
-    // Rows
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j)
-        temp_in[j] = out[j + i * 4];
-      ht.rows(temp_in, temp_out);
-      for (j = 0; j < 4; ++j)
-        output[j + i * 4] = (temp_out[j] + 1) >> 2;
-    }
-  }
-}
-
-void vp9_highbd_fht8x8_sse2(const int16_t *input, tran_low_t *output,
-                            int stride, int tx_type) {
-  if (tx_type == DCT_DCT) {
-    vp9_highbd_fdct8x8_sse2(input, output, stride);
-  } else {
-    tran_low_t out[64];
-    tran_low_t *outptr = &out[0];
-    int i, j;
-    tran_low_t temp_in[8], temp_out[8];
-    const transform_2d ht = FHT_8[tx_type];
-
-    // Columns
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j)
-        temp_in[j] = input[j * stride + i] * 4;
-      ht.cols(temp_in, temp_out);
-      for (j = 0; j < 8; ++j)
-        outptr[j * 8 + i] = temp_out[j];
-    }
-
-    // Rows
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j)
-        temp_in[j] = out[j + i * 8];
-      ht.rows(temp_in, temp_out);
-      for (j = 0; j < 8; ++j)
-        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
-    }
-  }
-}
-
-void vp9_highbd_fht16x16_sse2(const int16_t *input, tran_low_t *output,
-                              int stride, int tx_type) {
-  if (tx_type == DCT_DCT) {
-    vp9_highbd_fdct16x16_sse2(input, output, stride);
-  } else {
-    tran_low_t out[256];
-    tran_low_t *outptr = &out[0];
-    int i, j;
-    tran_low_t temp_in[16], temp_out[16];
-    const transform_2d ht = FHT_16[tx_type];
-
-    // Columns
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = input[j * stride + i] * 4;
-      ht.cols(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
-    }
-
-    // Rows
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = out[j + i * 16];
-      ht.rows(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        output[j + i * 16] = temp_out[j];
-    }
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 /*
  * The DCTnxn functions are defined using the macros below. The main code for
  * them is in separate files (vp9/encoder/x86/vp9_dct_sse2_impl.h &
@@ -2377,14 +2275,6 @@
 
 #define DCT_HIGH_BIT_DEPTH 0
 
-#define FDCT4x4_2D vp9_fdct4x4_sse2
-#define FDCT8x8_2D vp9_fdct8x8_sse2
-#define FDCT16x16_2D vp9_fdct16x16_sse2
-#include "vp9/encoder/x86/vp9_dct_sse2_impl.h"
-#undef  FDCT4x4_2D
-#undef  FDCT8x8_2D
-#undef  FDCT16x16_2D
-
 #define FDCT32x32_2D vp9_fdct32x32_rd_sse2
 #define FDCT32x32_HIGH_PRECISION 0
 #include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h"
@@ -2404,14 +2294,6 @@
 
 #define DCT_HIGH_BIT_DEPTH 1
 
-#define FDCT4x4_2D vp9_highbd_fdct4x4_sse2
-#define FDCT8x8_2D vp9_highbd_fdct8x8_sse2
-#define FDCT16x16_2D vp9_highbd_fdct16x16_sse2
-#include "vp9/encoder/x86/vp9_dct_sse2_impl.h" // NOLINT
-#undef  FDCT4x4_2D
-#undef  FDCT8x8_2D
-#undef  FDCT16x16_2D
-
 #define FDCT32x32_2D vp9_highbd_fdct32x32_rd_sse2
 #define FDCT32x32_HIGH_PRECISION 0
 #include "vp9/encoder/x86/vp9_dct32x32_sse2_impl.h" // NOLINT
diff --git a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
index 3a29aba..29074e7 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
@@ -13,49 +13,9 @@
 ; of the macro definitions are originally derived from the ffmpeg project.
 ; The current version applies to x86 64-bit only.
 
-SECTION_RODATA
-
-pw_11585x2: times 8 dw 23170
-pd_8192:    times 4 dd 8192
-
-%macro TRANSFORM_COEFFS 2
-pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
-pw_%2_m%1:  dw  %2, -%1,  %2, -%1,  %2, -%1,  %2, -%1
-%endmacro
-
-TRANSFORM_COEFFS 11585,  11585
-TRANSFORM_COEFFS 15137,   6270
-TRANSFORM_COEFFS 16069,   3196
-TRANSFORM_COEFFS  9102,  13623
-
 SECTION .text
 
 %if ARCH_X86_64
-%macro SUM_SUB 3
-  psubw  m%3, m%1, m%2
-  paddw  m%1, m%2
-  SWAP    %2, %3
-%endmacro
-
-; butterfly operation
-%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
-  pmaddwd            m%1, m%3, %5
-  pmaddwd            m%2, m%3, %6
-  paddd              m%1,  %4
-  paddd              m%2,  %4
-  psrad              m%1,  14
-  psrad              m%2,  14
-%endmacro
-
-%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
-  punpckhwd          m%6, m%2, m%1
-  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_%4_%3], [pw_%3_m%4]
-  punpcklwd          m%2, m%1
-  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_%4_%3], [pw_%3_m%4]
-  packssdw           m%1, m%7
-  packssdw           m%2, m%6
-%endmacro
-
 ; matrix transpose
 %macro INTERLEAVE_2X 4
   punpckh%1          m%4, m%2, m%3
@@ -83,103 +43,6 @@
   SWAP  %4, %7
 %endmacro
 
-; 1D forward 8x8 DCT transform
-%macro FDCT8_1D 1
-  SUM_SUB            0,  7,  9
-  SUM_SUB            1,  6,  9
-  SUM_SUB            2,  5,  9
-  SUM_SUB            3,  4,  9
-
-  SUM_SUB            0,  3,  9
-  SUM_SUB            1,  2,  9
-  SUM_SUB            6,  5,  9
-%if %1 == 0
-  SUM_SUB            0,  1,  9
-%endif
-
-  BUTTERFLY_4X       2,  3,  6270,  15137,  m8,  9,  10
-
-  pmulhrsw           m6, m12
-  pmulhrsw           m5, m12
-%if %1 == 0
-  pmulhrsw           m0, m12
-  pmulhrsw           m1, m12
-%else
-  BUTTERFLY_4X       1,  0,  11585, 11585,  m8,  9,  10
-  SWAP               0,  1
-%endif
-
-  SUM_SUB            4,  5,  9
-  SUM_SUB            7,  6,  9
-  BUTTERFLY_4X       4,  7,  3196,  16069,  m8,  9,  10
-  BUTTERFLY_4X       5,  6,  13623,  9102,  m8,  9,  10
-  SWAP               1,  4
-  SWAP               3,  6
-%endmacro
-
-%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2
-  psraw              m%3, m%1, 15
-  psraw              m%4, m%2, 15
-  psubw              m%1, m%3
-  psubw              m%2, m%4
-  psraw              m%1, 1
-  psraw              m%2, 1
-%endmacro
-
-INIT_XMM ssse3
-cglobal fdct8x8, 3, 5, 13, input, output, stride
-
-  mova               m8, [pd_8192]
-  mova              m12, [pw_11585x2]
-  pxor              m11, m11
-
-  lea                r3, [2 * strideq]
-  lea                r4, [4 * strideq]
-  mova               m0, [inputq]
-  mova               m1, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m2, [inputq]
-  mova               m3, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m4, [inputq]
-  mova               m5, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m6, [inputq]
-  mova               m7, [inputq + r3]
-
-  ; left shift by 2 to increase forward transformation precision
-  psllw              m0, 2
-  psllw              m1, 2
-  psllw              m2, 2
-  psllw              m3, 2
-  psllw              m4, 2
-  psllw              m5, 2
-  psllw              m6, 2
-  psllw              m7, 2
-
-  ; column transform
-  FDCT8_1D  0
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  FDCT8_1D  1
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  DIVIDE_ROUND_2X   0, 1, 9, 10
-  DIVIDE_ROUND_2X   2, 3, 9, 10
-  DIVIDE_ROUND_2X   4, 5, 9, 10
-  DIVIDE_ROUND_2X   6, 7, 9, 10
-
-  mova              [outputq +   0], m0
-  mova              [outputq +  16], m1
-  mova              [outputq +  32], m2
-  mova              [outputq +  48], m3
-  mova              [outputq +  64], m4
-  mova              [outputq +  80], m5
-  mova              [outputq +  96], m6
-  mova              [outputq + 112], m7
-
-  RET
-
 %macro HMD8_1D 0
   psubw              m8, m0, m1
   psubw              m9, m2, m3
diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c
index 71fdfd7..2071dfe 100644
--- a/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -14,214 +14,6 @@
 #include "./vp9_rtcd.h"
 #include "vpx/vpx_integer.h"
 
-void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
-                         int skip_block, const int16_t* zbin_ptr,
-                         const int16_t* round_ptr, const int16_t* quant_ptr,
-                         const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
-                         int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
-                         uint16_t* eob_ptr,
-                         const int16_t* scan_ptr,
-                         const int16_t* iscan_ptr) {
-  __m128i zero;
-  (void)scan_ptr;
-
-  coeff_ptr += n_coeffs;
-  iscan_ptr += n_coeffs;
-  qcoeff_ptr += n_coeffs;
-  dqcoeff_ptr += n_coeffs;
-  n_coeffs = -n_coeffs;
-  zero = _mm_setzero_si128();
-  if (!skip_block) {
-    __m128i eob;
-    __m128i zbin;
-    __m128i round, quant, dequant, shift;
-    {
-      __m128i coeff0, coeff1;
-
-      // Setup global values
-      {
-        __m128i pw_1;
-        zbin = _mm_load_si128((const __m128i*)zbin_ptr);
-        round = _mm_load_si128((const __m128i*)round_ptr);
-        quant = _mm_load_si128((const __m128i*)quant_ptr);
-        pw_1 = _mm_set1_epi16(1);
-        zbin = _mm_sub_epi16(zbin, pw_1);
-        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
-        shift = _mm_load_si128((const __m128i*)quant_shift_ptr);
-      }
-
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-        __m128i cmp_mask0, cmp_mask1;
-        // Do DC and first 15 AC
-        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
-        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
-        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
-        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        round = _mm_unpackhi_epi64(round, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        quant = _mm_unpackhi_epi64(quant, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
-        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
-        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
-        shift = _mm_unpackhi_epi64(shift, shift);
-        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
-
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        // Mask out zbin threshold coeffs
-        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
-        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
-
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        dequant = _mm_unpackhi_epi64(dequant, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
-      }
-
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob = _mm_max_epi16(eob, eob1);
-      }
-      n_coeffs += 8 * 2;
-    }
-
-    // AC only loop
-    while (n_coeffs < 0) {
-      __m128i coeff0, coeff1;
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-        __m128i cmp_mask0, cmp_mask1;
-
-        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
-        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
-        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
-        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
-        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
-        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
-
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        // Mask out zbin threshold coeffs
-        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
-        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
-
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
-      }
-
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob0, eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob0 = _mm_max_epi16(eob0, eob1);
-        eob = _mm_max_epi16(eob, eob0);
-      }
-      n_coeffs += 8 * 2;
-    }
-
-    // Accumulate EOB
-    {
-      __m128i eob_shuffled;
-      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      *eob_ptr = _mm_extract_epi16(eob, 1);
-    }
-  } else {
-    do {
-      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
-      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
-      n_coeffs += 8 * 2;
-    } while (n_coeffs < 0);
-    *eob_ptr = 0;
-  }
-}
-
 void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
                           int skip_block, const int16_t* zbin_ptr,
                           const int16_t* round_ptr, const int16_t* quant_ptr,
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
index 449d52b..ec2e87c 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@@ -15,206 +15,6 @@
 
 SECTION .text
 
-; TODO(yunqingwang): fix quantize_b code for the skip=1 case.
-%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
-                                shift, qcoeff, dqcoeff, dequant, \
-                                eob, scan, iscan
-  cmp                    dword skipm, 0
-  jne .blank
-
-  ; actual quantize loop - setup pointers, rounders, etc.
-  movifnidn                   coeffq, coeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, dequantmp
-  movifnidn                    zbinq, zbinmp
-  movifnidn                   roundq, roundmp
-  movifnidn                   quantq, quantmp
-  mova                            m0, [zbinq]              ; m0 = zbin
-  mova                            m1, [roundq]             ; m1 = round
-  mova                            m2, [quantq]             ; m2 = quant
-%ifidn %1, b_32x32
-  pcmpeqw                         m5, m5
-  psrlw                           m5, 15
-  paddw                           m0, m5
-  paddw                           m1, m5
-  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
-  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
-%endif
-  mova                            m3, [r2q]                ; m3 = dequant
-  psubw                           m0, [pw_1]
-  mov                             r2, shiftmp
-  mov                             r3, qcoeffmp
-  mova                            m4, [r2]                 ; m4 = shift
-  mov                             r4, dqcoeffmp
-  mov                             r5, iscanmp
-%ifidn %1, b_32x32
-  psllw                           m4, 1
-%endif
-  pxor                            m5, m5                   ; m5 = dedicated zero
-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
-  lea                         coeffq, [  coeffq+ncoeffq*2]
-  lea                         iscanq, [  iscanq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  neg                        ncoeffq
-
-  ; get DC and first 15 AC coeffs
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  punpckhqdq                      m0, m0
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                           m8, m6                   ; m8 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
-  punpckhqdq                      m4, m4
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                            m8, m7
-  pand                           m13, m12
-  mova        [qcoeffq+ncoeffq*2+ 0], m8
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%ifidn %1, b_32x32
-  pabsw                           m8, m8
-  pabsw                          m13, m13
-%endif
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                           m8, 1
-  psrlw                          m13, 1
-  psignw                          m8, m9
-  psignw                         m13, m10
-%endif
-  mova       [dqcoeffq+ncoeffq*2+ 0], m8
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m12                  ; m11 = scan[i] + 1
-  pandn                           m8, m6                   ; m8 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jz .accumulate_eob
-
-.ac_only_loop:
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-%ifidn %1, b_32x32
-  pmovmskb                       r6d, m7
-  pmovmskb                       r2d, m12
-  or                              r6, r2
-  jz .skip_iter
-%endif
-  paddsw                          m6, m1                   ; m6 += round
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                          m14, m6                   ; m14 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                         m14, m9                   ; m14 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                           m14, m7
-  pand                           m13, m12
-  mova        [qcoeffq+ncoeffq*2+ 0], m14
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%ifidn %1, b_32x32
-  pabsw                          m14, m14
-  pabsw                          m13, m13
-%endif
-  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                          m14, 1
-  psrlw                          m13, 1
-  psignw                         m14, m9
-  psignw                         m13, m10
-%endif
-  mova       [dqcoeffq+ncoeffq*2+ 0], m14
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m12                  ; m11 = scan[i] + 1
-  pandn                          m14, m6                   ; m14 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m14
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-
-%ifidn %1, b_32x32
-  jmp .accumulate_eob
-.skip_iter:
-  mova        [qcoeffq+ncoeffq*2+ 0], m5
-  mova        [qcoeffq+ncoeffq*2+16], m5
-  mova       [dqcoeffq+ncoeffq*2+ 0], m5
-  mova       [dqcoeffq+ncoeffq*2+16], m5
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-%endif
-
-.accumulate_eob:
-  ; horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  pextrw                          r6, m8, 0
-  mov                             [r2], r6
-  RET
-
-  ; skip-block, i.e. just write all zeroes
-.blank:
-  mov                             r0, dqcoeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, qcoeffmp
-  mov                             r3, eobmp
-  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  neg                        ncoeffq
-  pxor                            m7, m7
-.blank_loop:
-  mova       [dqcoeffq+ncoeffq*2+ 0], m7
-  mova       [dqcoeffq+ncoeffq*2+16], m7
-  mova        [qcoeffq+ncoeffq*2+ 0], m7
-  mova        [qcoeffq+ncoeffq*2+16], m7
-  add                        ncoeffq, mmsize
-  jl .blank_loop
-  mov                    word [eobq], 0
-  RET
-%endmacro
-
-INIT_XMM ssse3
-QUANTIZE_FN b, 7
-QUANTIZE_FN b_32x32, 7
-
 %macro QUANTIZE_FP 2
 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                 shift, qcoeff, dqcoeff, dequant, \
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index bcab558..c8cf973 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -38,8 +38,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_onyxc_int.h
 VP9_COMMON_SRCS-yes += common/vp9_pred_common.h
 VP9_COMMON_SRCS-yes += common/vp9_pred_common.c
-VP9_COMMON_SRCS-yes += common/vp9_prob.h
-VP9_COMMON_SRCS-yes += common/vp9_prob.c
 VP9_COMMON_SRCS-yes += common/vp9_quant_common.h
 VP9_COMMON_SRCS-yes += common/vp9_reconinter.h
 VP9_COMMON_SRCS-yes += common/vp9_reconintra.h
@@ -54,7 +52,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
 VP9_COMMON_SRCS-yes += common/vp9_tile_common.c
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c
-VP9_COMMON_SRCS-yes += common/vp9_loopfilter_filters.c
 VP9_COMMON_SRCS-yes += common/vp9_thread_common.c
 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c
 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h
@@ -69,14 +66,11 @@
 
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/convolve.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
-VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
@@ -95,7 +89,6 @@
 endif
 
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_loopfilter_intrin_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_8t_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_bilinear_sse2.asm
 ifeq ($(CONFIG_USE_X86INC),yes)
@@ -123,13 +116,6 @@
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans16_dspr2.c
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans32_cols_dspr2.c
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans32_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
 
 # common (msa)
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_horiz_msa.c
@@ -147,10 +133,6 @@
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct32x32_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct_msa.h
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_intra_predict_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_4_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_8_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_16_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_msa.h
 
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c
@@ -165,16 +147,12 @@
 endif
 endif
 
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_8_neon_asm$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
 
 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
 endif
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon.c
 
 # neon with assembly and intrinsics implementations. If both are available
 # prefer assembly.
@@ -193,7 +171,6 @@
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon_asm$(ASM)
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon_asm$(ASM)
 else
 ifeq ($(HAVE_NEON), yes)
@@ -211,11 +188,6 @@
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c
 VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon.c
-# TODO(johannkoenig): re-enable when chromium build is fixed
-# # https://code.google.com/p/chromium/issues/detail?id=443839
-#VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_8_neon.c
 endif  # HAVE_NEON
 endif  # HAVE_NEON_ASM
 
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index d2d9288..f155b9a 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -173,9 +173,12 @@
   RANGE_CHECK(cfg,        g_pass,         VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
   RANGE_CHECK(extra_cfg, min_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
   RANGE_CHECK(extra_cfg, max_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
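+  // max_gf_interval == 0 leaves the default in place; an explicitly set
+  // value must be at least 2.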
+  if (extra_cfg->max_gf_interval > 0) {
+    RANGE_CHECK(extra_cfg, max_gf_interval, 2, (MAX_LAG_BUFFERS - 1));
+  }
   if (extra_cfg->min_gf_interval > 0 && extra_cfg->max_gf_interval > 0) {
     RANGE_CHECK(extra_cfg, max_gf_interval, extra_cfg->min_gf_interval,
                 (MAX_LAG_BUFFERS - 1));
   }
 
   if (cfg->rc_resize_allowed == 1) {
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 5b62c3ec..96ede3c 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -17,6 +17,7 @@
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
+#include "vpx_dsp/bitreader_buffer.h"
 #include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_alloccommon.h"
@@ -24,7 +25,6 @@
 
 #include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_decodeframe.h"
-#include "vp9/decoder/vp9_read_bit_buffer.h"
 
 #include "vp9/vp9_iface_common.h"
 
@@ -145,11 +145,11 @@
 }
 
 static int parse_bitdepth_colorspace_sampling(
-    BITSTREAM_PROFILE profile, struct vp9_read_bit_buffer *rb) {
+    BITSTREAM_PROFILE profile, struct vpx_read_bit_buffer *rb) {
   vpx_color_space_t color_space;
   if (profile >= PROFILE_2)
     rb->bit_offset += 1;  // Bit-depth 10 or 12.
-  color_space = (vpx_color_space_t)vp9_rb_read_literal(rb, 3);
+  color_space = (vpx_color_space_t)vpx_rb_read_literal(rb, 3);
   if (color_space != VPX_CS_SRGB) {
     rb->bit_offset += 1;  // [16,235] (including xvycc) vs [0,255] range.
     if (profile == PROFILE_1 || profile == PROFILE_3) {
@@ -191,8 +191,8 @@
   {
     int show_frame;
     int error_resilient;
-    struct vp9_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
-    const int frame_marker = vp9_rb_read_literal(&rb, 2);
+    struct vpx_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
+    const int frame_marker = vpx_rb_read_literal(&rb, 2);
     const BITSTREAM_PROFILE profile = vp9_read_profile(&rb);
 
     if (frame_marker != VP9_FRAME_MARKER)
@@ -204,17 +204,17 @@
     if ((profile >= 2 && data_sz <= 1) || data_sz < 1)
       return VPX_CODEC_UNSUP_BITSTREAM;
 
-    if (vp9_rb_read_bit(&rb)) {  // show an existing frame
-      vp9_rb_read_literal(&rb, 3);  // Frame buffer to show.
+    if (vpx_rb_read_bit(&rb)) {  // show an existing frame
+      vpx_rb_read_literal(&rb, 3);  // Frame buffer to show.
       return VPX_CODEC_OK;
     }
 
     if (data_sz <= 8)
       return VPX_CODEC_UNSUP_BITSTREAM;
 
-    si->is_kf = !vp9_rb_read_bit(&rb);
-    show_frame = vp9_rb_read_bit(&rb);
-    error_resilient = vp9_rb_read_bit(&rb);
+    si->is_kf = !vpx_rb_read_bit(&rb);
+    show_frame = vpx_rb_read_bit(&rb);
+    error_resilient = vpx_rb_read_bit(&rb);
 
     if (si->is_kf) {
       if (!vp9_read_sync_code(&rb))
@@ -224,7 +224,7 @@
         return VPX_CODEC_UNSUP_BITSTREAM;
       vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h);
     } else {
-      intra_only_flag = show_frame ? 0 : vp9_rb_read_bit(&rb);
+      intra_only_flag = show_frame ? 0 : vpx_rb_read_bit(&rb);
 
       rb.bit_offset += error_resilient ? 0 : 2;  // reset_frame_context
 
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 94cc7ba..50d52ed 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -37,10 +37,6 @@
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_fastssim.c
 VP9_CX_SRCS-yes += encoder/vp9_firstpass.c
 VP9_CX_SRCS-yes += encoder/vp9_block.h
-VP9_CX_SRCS-yes += encoder/vp9_writer.h
-VP9_CX_SRCS-yes += encoder/vp9_writer.c
-VP9_CX_SRCS-yes += encoder/vp9_write_bit_buffer.c
-VP9_CX_SRCS-yes += encoder/vp9_write_bit_buffer.h
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemb.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemv.h
@@ -104,7 +100,6 @@
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
 endif
 
@@ -125,7 +120,6 @@
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.h
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2_impl.h
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2_impl.h
 
 ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c
diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index c105adb..0e9cf16 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -21,10 +21,6 @@
 VP9_DX_SRCS-yes += decoder/vp9_decodeframe.c
 VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.c
-VP9_DX_SRCS-yes += decoder/vp9_reader.h
-VP9_DX_SRCS-yes += decoder/vp9_reader.c
-VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.c
-VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.h
 VP9_DX_SRCS-yes += decoder/vp9_decodemv.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.h
 VP9_DX_SRCS-yes += decoder/vp9_dthread.c
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 013c67a..0c6846b 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -733,9 +733,19 @@
 VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int)
 
 VPX_CTRL_USE_TYPE(VP9E_SET_MIN_GF_INTERVAL,  unsigned int)
+
+/*!\brief Sets the minimum interval between GF/ARF frames.
+ *
+ * TODO(debargha): add support for this control in ffmpeg.
+ */
 #define VPX_CTRL_VP9E_SET_MIN_GF_INTERVAL
 
 VPX_CTRL_USE_TYPE(VP9E_SET_MAX_GF_INTERVAL,  unsigned int)
+/*!\brief Sets the maximum interval between GF/ARF frames.
+ *
+ * TODO(debargha): add support for this control in ffmpeg.
+ */
 #define VPX_CTRL_VP9E_SET_MAX_GF_INTERVAL
 
 VPX_CTRL_USE_TYPE(VP9E_GET_ACTIVEMAP, vpx_active_map_t *)
diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fwd_txfm_neon.c
new file mode 100644
index 0000000..66faea5
--- /dev/null
+++ b/vpx_dsp/arm/fwd_txfm_neon.c
@@ -0,0 +1,202 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+#include "vp9/common/vp9_idct.h"
+
+void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
+  int i;
+  // stage 1
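+  // Load the eight input rows and pre-multiply by 4 (<< 2) to carry two
+  // extra bits of precision through the forward transform.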
+  int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+  int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+  int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+  int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+  int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+  int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+  int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+  int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+  for (i = 0; i < 2; ++i) {
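+    // Pass 0 transforms the columns; the transpose at the end of the loop
+    // body makes pass 1 the row transform and restores the row layout.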
+    int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
+    const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
+    const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
+    const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
+    const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
+    const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
+    const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
+    const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
+    const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
+    // fdct4(step, step);
+    int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
+    int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
+    int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
+    int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
+    // fdct4(step, step);
+    int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+    int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+    int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+    int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+    int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
+    int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
+    int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
+    int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
+    v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
+    v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
+    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
+    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
+    v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
+    v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
+    v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
+    v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
+    {
+      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+      out_0 = vcombine_s16(a, c);  // 00 01 02 03 40 41 42 43
+      out_2 = vcombine_s16(e, g);  // 20 21 22 23 60 61 62 63
+      out_4 = vcombine_s16(b, d);  // 04 05 06 07 44 45 46 47
+      out_6 = vcombine_s16(f, h);  // 24 25 26 27 64 65 66 67
+    }
+    // Stage 2
+    v_x0 = vsubq_s16(v_s6, v_s5);
+    v_x1 = vaddq_s16(v_s6, v_s5);
+    v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
+    v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
+    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
+    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
+    {
+      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+      const int16x8_t ab = vcombine_s16(a, b);
+      const int16x8_t cd = vcombine_s16(c, d);
+      // Stage 3
+      v_x0 = vaddq_s16(v_s4, ab);
+      v_x1 = vsubq_s16(v_s4, ab);
+      v_x2 = vsubq_s16(v_s7, cd);
+      v_x3 = vaddq_s16(v_s7, cd);
+    }
+    // Stage 4
+    v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
+    v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
+    v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
+    v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
+    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
+    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
+    v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
+    v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
+    v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
+    v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
+    v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
+    v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
+    v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
+    v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
+    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
+    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
+    {
+      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+      out_1 = vcombine_s16(a, c);  // 10 11 12 13 50 51 52 53
+      out_3 = vcombine_s16(e, g);  // 30 31 32 33 70 71 72 73
+      out_5 = vcombine_s16(b, d);  // 14 15 16 17 54 55 56 57
+      out_7 = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
+    }
+    // transpose 8x8
+    {
+      // 00 01 02 03 40 41 42 43
+      // 10 11 12 13 50 51 52 53
+      // 20 21 22 23 60 61 62 63
+      // 30 31 32 33 70 71 72 73
+      // 04 05 06 07 44 45 46 47
+      // 14 15 16 17 54 55 56 57
+      // 24 25 26 27 64 65 66 67
+      // 34 35 36 37 74 75 76 77
+      const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0),
+                                            vreinterpretq_s32_s16(out_2));
+      const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1),
+                                            vreinterpretq_s32_s16(out_3));
+      const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4),
+                                            vreinterpretq_s32_s16(out_6));
+      const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5),
+                                            vreinterpretq_s32_s16(out_7));
+      const int16x8x2_t r01_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
+                    vreinterpretq_s16_s32(r13_s32.val[0]));
+      const int16x8x2_t r23_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
+                    vreinterpretq_s16_s32(r13_s32.val[1]));
+      const int16x8x2_t r45_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
+                    vreinterpretq_s16_s32(r57_s32.val[0]));
+      const int16x8x2_t r67_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
+                    vreinterpretq_s16_s32(r57_s32.val[1]));
+      input_0 = r01_s16.val[0];
+      input_1 = r01_s16.val[1];
+      input_2 = r23_s16.val[0];
+      input_3 = r23_s16.val[1];
+      input_4 = r45_s16.val[0];
+      input_5 = r45_s16.val[1];
+      input_6 = r67_s16.val[0];
+      input_7 = r67_s16.val[1];
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      // 06 16 26 36 46 56 66 76
+      // 07 17 27 37 47 57 67 77
+    }
+  }  // for
+  {
+    // from vp9_dct_sse2.c
+    // Post-condition (division by two)
+    //    division by two of a 16-bit signed number using shifts:
+    //    n / 2 = (n - (n >> 15)) >> 1
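+    //    e.g. n = -5: (-5 >> 15) = -1 and (-5 - -1) >> 1 = -4 >> 1 = -2,
+    //    so the shift pair rounds toward zero rather than toward -infinity.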
+    const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
+    const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
+    const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
+    const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
+    const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
+    const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
+    const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
+    const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
+    input_0 = vhsubq_s16(input_0, sign_in0);
+    input_1 = vhsubq_s16(input_1, sign_in1);
+    input_2 = vhsubq_s16(input_2, sign_in2);
+    input_3 = vhsubq_s16(input_3, sign_in3);
+    input_4 = vhsubq_s16(input_4, sign_in4);
+    input_5 = vhsubq_s16(input_5, sign_in5);
+    input_6 = vhsubq_s16(input_6, sign_in6);
+    input_7 = vhsubq_s16(input_7, sign_in7);
+    // store results
+    vst1q_s16(&final_output[0 * 8], input_0);
+    vst1q_s16(&final_output[1 * 8], input_1);
+    vst1q_s16(&final_output[2 * 8], input_2);
+    vst1q_s16(&final_output[3 * 8], input_3);
+    vst1q_s16(&final_output[4 * 8], input_4);
+    vst1q_s16(&final_output[5 * 8], input_5);
+    vst1q_s16(&final_output[6 * 8], input_6);
+    vst1q_s16(&final_output[7 * 8], input_7);
+  }
+}
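A note on the post-condition block above: vshrq_n_s16(n, 15) broadcasts each lane's sign (-1 for negative values, 0 otherwise) and vhsubq_s16 is a halving subtract, so each lane computes (n - (n >> 15)) >> 1, a division by two that rounds toward zero like C integer division. A minimal scalar sketch of the same identity; the exhaustive check is illustrative, not part of the patch:

    #include <assert.h>
    #include <stdint.h>

    /* Scalar model of the NEON post-condition: sign is -1 for negative
     * inputs and 0 otherwise, so (n - sign) >> 1 truncates toward zero,
     * matching n / 2 in C. */
    static int16_t div2_round_to_zero(int16_t n) {
      const int16_t sign = n >> 15;
      return (int16_t)((n - sign) >> 1);
    }

    int main(void) {
      int n;
      for (n = -32768; n <= 32767; ++n)
        assert(div2_round_to_zero((int16_t)n) == n / 2);
      return 0;
    }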
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm b/vpx_dsp/arm/loopfilter_16_neon.asm
similarity index 94%
rename from vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm
rename to vpx_dsp/arm/loopfilter_16_neon.asm
index 5b8ec20..5a8fdd6 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm
+++ b/vpx_dsp/arm/loopfilter_16_neon.asm
@@ -8,12 +8,12 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_lpf_horizontal_4_dual_neon|
+    EXPORT  |vpx_lpf_horizontal_4_dual_neon|
     ARM
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-;void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
+;void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
 ;                                    const uint8_t *blimit0,
 ;                                    const uint8_t *limit0,
 ;                                    const uint8_t *thresh0,
@@ -29,7 +29,7 @@
 ; sp+8  const uint8_t *limit1,
 ; sp+12 const uint8_t *thresh1,
 
-|vp9_lpf_horizontal_4_dual_neon| PROC
+|vpx_lpf_horizontal_4_dual_neon| PROC
     push        {lr}
 
     ldr         r12, [sp, #4]              ; load thresh0
@@ -66,7 +66,7 @@
     sub         r2, r2, r1, lsl #1
     sub         r3, r3, r1, lsl #1
 
-    bl          vp9_loop_filter_neon_16
+    bl          vpx_loop_filter_neon_16
 
     vst1.u8     {q5}, [r2@64], r1          ; store op1
     vst1.u8     {q6}, [r3@64], r1          ; store op0
@@ -76,9 +76,9 @@
     vpop        {d8-d15}                   ; restore neon registers
 
     pop         {pc}
-    ENDP        ; |vp9_lpf_horizontal_4_dual_neon|
+    ENDP        ; |vpx_lpf_horizontal_4_dual_neon|
 
-; void vp9_loop_filter_neon_16();
+; void vpx_loop_filter_neon_16();
 ; This is a helper function for the loopfilters. The individual functions do the
 ; necessary load, transpose (if necessary) and store. This function uses
 ; registers d8-d15, so the calling function must save those registers.
@@ -101,7 +101,7 @@
 ; q6    op0
 ; q7    oq0
 ; q8    oq1
-|vp9_loop_filter_neon_16| PROC
+|vpx_loop_filter_neon_16| PROC
 
     ; filter_mask
     vabd.u8     q11, q3, q4                 ; m1 = abs(p3 - p2)
@@ -194,6 +194,6 @@
     veor        q8, q12, q10                ; *oq1 = u^0x80
 
     bx          lr
-    ENDP        ; |vp9_loop_filter_neon_16|
+    ENDP        ; |vpx_loop_filter_neon_16|
 
     END
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vpx_dsp/arm/loopfilter_16_neon.c
similarity index 93%
rename from vp9/common/arm/neon/vp9_loopfilter_16_neon.c
rename to vpx_dsp/arm/loopfilter_16_neon.c
index c69ee10..d24e6ad 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/vpx_dsp/arm/loopfilter_16_neon.c
@@ -10,11 +10,11 @@
 
 #include <arm_neon.h>
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
-static INLINE void vp9_loop_filter_neon_16(
+static INLINE void loop_filter_neon_16(
         uint8x16_t qblimit,  // blimit
         uint8x16_t qlimit,   // limit
         uint8x16_t qthresh,  // thresh
@@ -124,7 +124,7 @@
     return;
 }
 
-void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
+void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
@@ -163,9 +163,9 @@
     s += p;
     q10u8 = vld1q_u8(s);
 
-    vp9_loop_filter_neon_16(qblimit, qlimit, qthresh,
-                            q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8,
-                            &q5u8, &q6u8, &q7u8, &q8u8);
+    loop_filter_neon_16(qblimit, qlimit, qthresh,
+                        q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8,
+                        &q5u8, &q6u8, &q7u8, &q8u8);
 
     s -= (p * 5);
     vst1q_u8(s, q5u8);
diff --git a/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm b/vpx_dsp/arm/loopfilter_4_neon.asm
similarity index 91%
rename from vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm
rename to vpx_dsp/arm/loopfilter_4_neon.asm
index 7738e0d..e45e34c 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm
+++ b/vpx_dsp/arm/loopfilter_4_neon.asm
@@ -8,18 +8,18 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_lpf_horizontal_4_neon|
-    EXPORT  |vp9_lpf_vertical_4_neon|
+    EXPORT  |vpx_lpf_horizontal_4_neon|
+    EXPORT  |vpx_lpf_vertical_4_neon|
     ARM
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; Currently vpx only works on 8 iterations at a time. The vp8 loop filter
 ; works on 16 iterations at a time.
 ; TODO(fgalligan): See about removing the count code as this function is only
 ; called with a count of 1.
 ;
-; void vp9_lpf_horizontal_4_neon(uint8_t *s,
+; void vpx_lpf_horizontal_4_neon(uint8_t *s,
 ;                                int p /* pitch */,
 ;                                const uint8_t *blimit,
 ;                                const uint8_t *limit,
@@ -32,7 +32,7 @@
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
 ; sp+4  int count
-|vp9_lpf_horizontal_4_neon| PROC
+|vpx_lpf_horizontal_4_neon| PROC
     push        {lr}
 
     vld1.8      {d0[]}, [r2]               ; duplicate *blimit
@@ -41,7 +41,7 @@
     add         r1, r1, r1                 ; double pitch
 
     cmp         r12, #0
-    beq         end_vp9_lf_h_edge
+    beq         end_vpx_lf_h_edge
 
     vld1.8      {d1[]}, [r3]               ; duplicate *limit
     vld1.8      {d2[]}, [r2]               ; duplicate *thresh
@@ -62,7 +62,7 @@
     sub         r2, r2, r1, lsl #1
     sub         r3, r3, r1, lsl #1
 
-    bl          vp9_loop_filter_neon
+    bl          vpx_loop_filter_neon
 
     vst1.u8     {d4}, [r2@64], r1          ; store op1
     vst1.u8     {d5}, [r3@64], r1          ; store op0
@@ -73,16 +73,16 @@
     subs        r12, r12, #1
     bne         count_lf_h_loop
 
-end_vp9_lf_h_edge
+end_vpx_lf_h_edge
     pop         {pc}
-    ENDP        ; |vp9_lpf_horizontal_4_neon|
+    ENDP        ; |vpx_lpf_horizontal_4_neon|
 
-; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; Currently vpx only works on 8 iterations at a time. The vp8 loop filter
 ; works on 16 iterations at a time.
 ; TODO(fgalligan): See about removing the count code as this function is only
 ; called with a count of 1.
 ;
-; void vp9_lpf_vertical_4_neon(uint8_t *s,
+; void vpx_lpf_vertical_4_neon(uint8_t *s,
 ;                              int p /* pitch */,
 ;                              const uint8_t *blimit,
 ;                              const uint8_t *limit,
@@ -95,7 +95,7 @@
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
 ; sp+4  int count
-|vp9_lpf_vertical_4_neon| PROC
+|vpx_lpf_vertical_4_neon| PROC
     push        {lr}
 
     vld1.8      {d0[]}, [r2]              ; duplicate *blimit
@@ -105,7 +105,7 @@
     ldr         r3, [sp, #4]              ; load thresh
     sub         r2, r0, #4                ; move s pointer down by 4 columns
     cmp         r12, #0
-    beq         end_vp9_lf_v_edge
+    beq         end_vpx_lf_v_edge
 
     vld1.8      {d2[]}, [r3]              ; duplicate *thresh
 
@@ -135,7 +135,7 @@
     vtrn.8      d7, d16
     vtrn.8      d17, d18
 
-    bl          vp9_loop_filter_neon
+    bl          vpx_loop_filter_neon
 
     sub         r0, r0, #2
 
@@ -154,11 +154,11 @@
     subne       r2, r0, #4                 ; move s pointer down by 4 columns
     bne         count_lf_v_loop
 
-end_vp9_lf_v_edge
+end_vpx_lf_v_edge
     pop         {pc}
-    ENDP        ; |vp9_lpf_vertical_4_neon|
+    ENDP        ; |vpx_lpf_vertical_4_neon|
 
-; void vp9_loop_filter_neon();
+; void vpx_loop_filter_neon();
 ; This is a helper function for the loopfilters. The individual functions do the
 ; necessary load, transpose (if necessary) and store. The function does not use
 ; registers d8-d15.
@@ -182,7 +182,7 @@
 ; d5    op0
 ; d6    oq0
 ; d7    oq1
-|vp9_loop_filter_neon| PROC
+|vpx_loop_filter_neon| PROC
     ; filter_mask
     vabd.u8     d19, d3, d4                 ; m1 = abs(p3 - p2)
     vabd.u8     d20, d4, d5                 ; m2 = abs(p2 - p1)
@@ -272,6 +272,6 @@
     veor        d7, d20, d18                ; *oq1 = u^0x80
 
     bx          lr
-    ENDP        ; |vp9_loop_filter_neon|
+    ENDP        ; |vpx_loop_filter_neon|
 
     END
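For orientation, a call-site sketch for the renamed filter; the threshold values are illustrative assumptions (real callers derive blimit/limit/thresh from the loop-filter level tables), and count is always 1, per the TODO above:

    #include <stdint.h>
    #include "./vpx_dsp_rtcd.h"

    /* Filter the 8-pixel horizontal edge above `row`; values are made up. */
    static void filter_edge(uint8_t *frame, int stride, int row) {
      static const uint8_t blimit = 16, limit = 8, thresh = 4;
      vpx_lpf_horizontal_4_neon(frame + row * stride, stride,
                                &blimit, &limit, &thresh, 1);
    }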
diff --git a/vp9/common/arm/neon/vp9_loopfilter_4_neon.c b/vpx_dsp/arm/loopfilter_4_neon.c
similarity index 89%
rename from vp9/common/arm/neon/vp9_loopfilter_4_neon.c
rename to vpx_dsp/arm/loopfilter_4_neon.c
index fd9db61..7ad411a 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_4_neon.c
+++ b/vpx_dsp/arm/loopfilter_4_neon.c
@@ -10,9 +10,9 @@
 
 #include <arm_neon.h>
 
-#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
-static INLINE void vp9_loop_filter_neon(
+static INLINE void loop_filter_neon(
         uint8x8_t dblimit,    // flimit
         uint8x8_t dlimit,     // limit
         uint8x8_t dthresh,    // thresh
@@ -110,19 +110,19 @@
     return;
 }
 
-void vp9_lpf_horizontal_4_neon(
-        unsigned char *src,
+void vpx_lpf_horizontal_4_neon(
+        uint8_t *src,
         int pitch,
-        unsigned char *blimit,
-        unsigned char *limit,
-        unsigned char *thresh,
+        const uint8_t *blimit,
+        const uint8_t *limit,
+        const uint8_t *thresh,
         int count) {
     int i;
     uint8_t *s, *psrc;
     uint8x8_t dblimit, dlimit, dthresh;
     uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
 
-    if (count == 0)  // end_vp9_lf_h_edge
+    if (count == 0)  // end_vpx_lf_h_edge
         return;
 
     dblimit = vld1_u8(blimit);
@@ -149,9 +149,9 @@
         s += pitch;
         d18u8 = vld1_u8(s);
 
-        vp9_loop_filter_neon(dblimit, dlimit, dthresh,
-                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
-                             &d4u8, &d5u8, &d6u8, &d7u8);
+        loop_filter_neon(dblimit, dlimit, dthresh,
+                         d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                         &d4u8, &d5u8, &d6u8, &d7u8);
 
         s -= (pitch * 5);
         vst1_u8(s, d4u8);
@@ -165,12 +165,12 @@
     return;
 }
 
-void vp9_lpf_vertical_4_neon(
-        unsigned char *src,
+void vpx_lpf_vertical_4_neon(
+        uint8_t *src,
         int pitch,
-        unsigned char *blimit,
-        unsigned char *limit,
-        unsigned char *thresh,
+        const uint8_t *blimit,
+        const uint8_t *limit,
+        const uint8_t *thresh,
         int count) {
     int i, pitch8;
     uint8_t *s;
@@ -181,7 +181,7 @@
     uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
     uint8x8x4_t d4Result;
 
-    if (count == 0)  // end_vp9_lf_h_edge
+    if (count == 0)  // end_vpx_lf_h_edge
         return;
 
     dblimit = vld1_u8(blimit);
@@ -244,9 +244,9 @@
         d17u8 = d2tmp11.val[0];
         d18u8 = d2tmp11.val[1];
 
-        vp9_loop_filter_neon(dblimit, dlimit, dthresh,
-                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
-                             &d4u8, &d5u8, &d6u8, &d7u8);
+        loop_filter_neon(dblimit, dlimit, dthresh,
+                         d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                         &d4u8, &d5u8, &d6u8, &d7u8);
 
         d4Result.val[0] = d4u8;
         d4Result.val[1] = d5u8;
diff --git a/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm b/vpx_dsp/arm/loopfilter_8_neon.asm
similarity index 94%
rename from vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm
rename to vpx_dsp/arm/loopfilter_8_neon.asm
index 91aaec0..e81734c 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm
+++ b/vpx_dsp/arm/loopfilter_8_neon.asm
@@ -8,18 +8,18 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_lpf_horizontal_8_neon|
-    EXPORT  |vp9_lpf_vertical_8_neon|
+    EXPORT  |vpx_lpf_horizontal_8_neon|
+    EXPORT  |vpx_lpf_vertical_8_neon|
     ARM
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; Currently vpx only works on 8 iterations at a time. The vp8 loop filter
 ; works on 16 iterations at a time.
 ; TODO(fgalligan): See about removing the count code as this function is only
 ; called with a count of 1.
 ;
-; void vp9_lpf_horizontal_8_neon(uint8_t *s, int p,
+; void vpx_lpf_horizontal_8_neon(uint8_t *s, int p,
 ;                                const uint8_t *blimit,
 ;                                const uint8_t *limit,
 ;                                const uint8_t *thresh,
@@ -30,7 +30,7 @@
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
 ; sp+4  int count
-|vp9_lpf_horizontal_8_neon| PROC
+|vpx_lpf_horizontal_8_neon| PROC
     push        {r4-r5, lr}
 
     vld1.8      {d0[]}, [r2]               ; duplicate *blimit
@@ -39,7 +39,7 @@
     add         r1, r1, r1                 ; double pitch
 
     cmp         r12, #0
-    beq         end_vp9_mblf_h_edge
+    beq         end_vpx_mblf_h_edge
 
     vld1.8      {d1[]}, [r3]               ; duplicate *limit
     vld1.8      {d2[]}, [r2]               ; duplicate *thresh
@@ -60,7 +60,7 @@
     sub         r3, r3, r1, lsl #1
     sub         r2, r2, r1, lsl #2
 
-    bl          vp9_mbloop_filter_neon
+    bl          vpx_mbloop_filter_neon
 
     vst1.u8     {d0}, [r2@64], r1          ; store op2
     vst1.u8     {d1}, [r3@64], r1          ; store op1
@@ -73,12 +73,12 @@
     subs        r12, r12, #1
     bne         count_mblf_h_loop
 
-end_vp9_mblf_h_edge
+end_vpx_mblf_h_edge
     pop         {r4-r5, pc}
 
-    ENDP        ; |vp9_lpf_horizontal_8_neon|
+    ENDP        ; |vpx_lpf_horizontal_8_neon|
 
-; void vp9_lpf_vertical_8_neon(uint8_t *s,
+; void vpx_lpf_vertical_8_neon(uint8_t *s,
 ;                              int pitch,
 ;                              const uint8_t *blimit,
 ;                              const uint8_t *limit,
@@ -91,7 +91,7 @@
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
 ; sp+4  int count
-|vp9_lpf_vertical_8_neon| PROC
+|vpx_lpf_vertical_8_neon| PROC
     push        {r4-r5, lr}
 
     vld1.8      {d0[]}, [r2]              ; duplicate *blimit
@@ -101,7 +101,7 @@
     ldr         r3, [sp, #12]             ; load thresh
     sub         r2, r0, #4                ; move s pointer down by 4 columns
     cmp         r12, #0
-    beq         end_vp9_mblf_v_edge
+    beq         end_vpx_mblf_v_edge
 
     vld1.8      {d2[]}, [r3]              ; duplicate *thresh
 
@@ -134,7 +134,7 @@
     sub         r2, r0, #3
     add         r3, r0, #1
 
-    bl          vp9_mbloop_filter_neon
+    bl          vpx_mbloop_filter_neon
 
     ;store op2, op1, op0, oq0
     vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
@@ -161,11 +161,11 @@
     subne       r2, r0, #4                 ; move s pointer down by 4 columns
     bne         count_mblf_v_loop
 
-end_vp9_mblf_v_edge
+end_vpx_mblf_v_edge
     pop         {r4-r5, pc}
-    ENDP        ; |vp9_lpf_vertical_8_neon|
+    ENDP        ; |vpx_lpf_vertical_8_neon|
 
-; void vp9_mbloop_filter_neon();
+; void vpx_mbloop_filter_neon();
 ; This is a helper function for the loopfilters. The individual functions do the
 ; necessary load, transpose (if necessary) and store. The function does not use
 ; registers d8-d15.
@@ -191,7 +191,7 @@
 ; d3    oq0
 ; d4    oq1
 ; d5    oq2
-|vp9_mbloop_filter_neon| PROC
+|vpx_mbloop_filter_neon| PROC
     ; filter_mask
     vabd.u8     d19, d3, d4                ; m1 = abs(p3 - p2)
     vabd.u8     d20, d4, d5                ; m2 = abs(p2 - p1)
@@ -446,6 +446,6 @@
 
     bx          lr
 
-    ENDP        ; |vp9_mbloop_filter_neon|
+    ENDP        ; |vpx_mbloop_filter_neon|
 
     END
diff --git a/vp9/common/arm/neon/vp9_loopfilter_8_neon.c b/vpx_dsp/arm/loopfilter_8_neon.c
similarity index 93%
rename from vp9/common/arm/neon/vp9_loopfilter_8_neon.c
rename to vpx_dsp/arm/loopfilter_8_neon.c
index 33068a8..a887e2e 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_8_neon.c
+++ b/vpx_dsp/arm/loopfilter_8_neon.c
@@ -10,9 +10,9 @@
 
 #include <arm_neon.h>
 
-#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
-static INLINE void vp9_mbloop_filter_neon(
+static INLINE void mbloop_filter_neon(
         uint8x8_t dblimit,   // mblimit
         uint8x8_t dlimit,    // limit
         uint8x8_t dthresh,   // thresh
@@ -263,12 +263,12 @@
     return;
 }
 
-void vp9_lpf_horizontal_8_neon(
-        unsigned char *src,
+void vpx_lpf_horizontal_8_neon(
+        uint8_t *src,
         int pitch,
-        unsigned char *blimit,
-        unsigned char *limit,
-        unsigned char *thresh,
+        const uint8_t *blimit,
+        const uint8_t *limit,
+        const uint8_t *thresh,
         int count) {
     int i;
     uint8_t *s, *psrc;
@@ -276,7 +276,7 @@
     uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
     uint8x8_t d16u8, d17u8, d18u8;
 
-    if (count == 0)  // end_vp9_mblf_h_edge
+    if (count == 0)  // end_vpx_mblf_h_edge
         return;
 
     dblimit = vld1_u8(blimit);
@@ -303,9 +303,9 @@
         s += pitch;
         d18u8 = vld1_u8(s);
 
-        vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
-                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
-                             &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+        mbloop_filter_neon(dblimit, dlimit, dthresh,
+                           d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                           &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
 
         s -= (pitch * 6);
         vst1_u8(s, d0u8);
@@ -323,12 +323,12 @@
     return;
 }
 
-void vp9_lpf_vertical_8_neon(
-        unsigned char *src,
+void vpx_lpf_vertical_8_neon(
+        uint8_t *src,
         int pitch,
-        unsigned char *blimit,
-        unsigned char *limit,
-        unsigned char *thresh,
+        const uint8_t *blimit,
+        const uint8_t *limit,
+        const uint8_t *thresh,
         int count) {
     int i;
     uint8_t *s;
@@ -403,9 +403,9 @@
         d17u8 = d2tmp11.val[0];
         d18u8 = d2tmp11.val[1];
 
-        vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
-                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
-                             &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+        mbloop_filter_neon(dblimit, dlimit, dthresh,
+                           d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+                           &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
 
         d4Result.val[0] = d0u8;
         d4Result.val[1] = d1u8;
diff --git a/vp9/common/arm/neon/vp9_mb_lpf_neon.asm b/vpx_dsp/arm/loopfilter_mb_neon.asm
similarity index 96%
rename from vp9/common/arm/neon/vp9_mb_lpf_neon.asm
rename to vpx_dsp/arm/loopfilter_mb_neon.asm
index 5fe2bba..20d9cfb 100644
--- a/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
+++ b/vpx_dsp/arm/loopfilter_mb_neon.asm
@@ -8,13 +8,13 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_lpf_horizontal_16_neon|
-    EXPORT  |vp9_lpf_vertical_16_neon|
+    EXPORT  |vpx_lpf_horizontal_16_neon|
+    EXPORT  |vpx_lpf_vertical_16_neon|
     ARM
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; void vp9_lpf_horizontal_16_neon(uint8_t *s, int p,
+; void vpx_lpf_horizontal_16_neon(uint8_t *s, int p,
 ;                                 const uint8_t *blimit,
 ;                                 const uint8_t *limit,
 ;                                 const uint8_t *thresh
@@ -24,7 +24,7 @@
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-|vp9_lpf_horizontal_16_neon| PROC
+|vpx_lpf_horizontal_16_neon| PROC
     push        {r4-r8, lr}
     vpush       {d8-d15}
     ldr         r4, [sp, #88]              ; load thresh
@@ -54,7 +54,7 @@
     vld1.u8     {d14}, [r8@64], r1         ; q6
     vld1.u8     {d15}, [r8@64], r1         ; q7
 
-    bl          vp9_wide_mbfilter_neon
+    bl          vpx_wide_mbfilter_neon
 
     tst         r7, #1
     beq         h_mbfilter
@@ -115,9 +115,9 @@
     vpop        {d8-d15}
     pop         {r4-r8, pc}
 
-    ENDP        ; |vp9_lpf_horizontal_16_neon|
+    ENDP        ; |vpx_lpf_horizontal_16_neon|
 
-; void vp9_lpf_vertical_16_neon(uint8_t *s, int p,
+; void vpx_lpf_vertical_16_neon(uint8_t *s, int p,
 ;                               const uint8_t *blimit,
 ;                               const uint8_t *limit,
 ;                               const uint8_t *thresh)
@@ -126,7 +126,7 @@
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-|vp9_lpf_vertical_16_neon| PROC
+|vpx_lpf_vertical_16_neon| PROC
     push        {r4-r8, lr}
     vpush       {d8-d15}
     ldr         r4, [sp, #88]              ; load thresh
@@ -176,7 +176,7 @@
     vtrn.8      d12, d13
     vtrn.8      d14, d15
 
-    bl          vp9_wide_mbfilter_neon
+    bl          vpx_wide_mbfilter_neon
 
     tst         r7, #1
     beq         v_mbfilter
@@ -279,9 +279,9 @@
     vpop        {d8-d15}
     pop         {r4-r8, pc}
 
-    ENDP        ; |vp9_lpf_vertical_16_neon|
+    ENDP        ; |vpx_lpf_vertical_16_neon|
 
-; void vp9_wide_mbfilter_neon();
+; void vpx_wide_mbfilter_neon();
 ; This is a helper function for the loopfilters. The individual functions do the
 ; necessary load, transpose (if necessary) and store.
 ;
@@ -305,7 +305,7 @@
 ; d13   q5
 ; d14   q6
 ; d15   q7
-|vp9_wide_mbfilter_neon| PROC
+|vpx_wide_mbfilter_neon| PROC
     mov         r7, #0
 
     ; filter_mask
@@ -601,6 +601,6 @@
     vbif        d3, d14, d17               ; oq6 |= q6 & ~(f2 & f & m)
 
     bx          lr
-    ENDP        ; |vp9_wide_mbfilter_neon|
+    ENDP        ; |vpx_wide_mbfilter_neon|
 
     END
diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c
similarity index 70%
rename from vp9/common/arm/neon/vp9_loopfilter_neon.c
rename to vpx_dsp/arm/loopfilter_neon.c
index 31fcc63..eff87d2 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_neon.c
+++ b/vpx_dsp/arm/loopfilter_neon.c
@@ -10,49 +10,49 @@
 
 #include <arm_neon.h>
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
-void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
+void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p,
                                   const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
-  vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
 }
 
 #if HAVE_NEON_ASM
-void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
+void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
                                     const uint8_t *blimit1,
                                     const uint8_t *limit1,
                                     const uint8_t *thresh1) {
-  vp9_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p,
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p,
                                   const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
-  vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
+void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p,
                                    const uint8_t *blimit,
                                    const uint8_t *limit,
                                    const uint8_t *thresh) {
-  vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
-  vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
 }
 #endif  // HAVE_NEON_ASM
diff --git a/vp9/decoder/vp9_reader.c b/vpx_dsp/bitreader.c
similarity index 79%
rename from vp9/decoder/vp9_reader.c
rename to vpx_dsp/bitreader.c
index 9a46cd7..4420fad 100644
--- a/vp9/decoder/vp9_reader.c
+++ b/vpx_dsp/bitreader.c
@@ -9,23 +9,17 @@
  */
 #include <stdlib.h>
 
+#include <string.h>
+
+#include "./vpx_config.h"
+
+#include "vpx_dsp/bitreader.h"
+#include "vpx_dsp/prob.h"
 #include "vpx_ports/mem.h"
 #include "vpx_mem/vpx_mem.h"
-
-#include "./vpx_config.h"
-#include "vp9/decoder/vp9_reader.h"
-
 #include "vpx_util/endian_inl.h"
 
-#if CONFIG_BIG_ENDIAN
-#define BIGENDIFY64(X) (X)
-#define BIGENDIFY32(X) (X)
-#else
-#define BIGENDIFY64(X) BSwap64(X)
-#define BIGENDIFY32(X) BSwap32(X)
-#endif
-
-int vp9_reader_init(vp9_reader *r,
+int vpx_reader_init(vpx_reader *r,
                     const uint8_t *buffer,
                     size_t size,
                     vpx_decrypt_cb decrypt_cb,
@@ -40,12 +34,12 @@
     r->range = 255;
     r->decrypt_cb = decrypt_cb;
     r->decrypt_state = decrypt_state;
-    vp9_reader_fill(r);
-    return vp9_read_bit(r) != 0;  // marker bit
+    vpx_reader_fill(r);
+    return vpx_read_bit(r) != 0;  // marker bit
   }
 }
 
-void vp9_reader_fill(vp9_reader *r) {
+void vpx_reader_fill(vpx_reader *r) {
   const uint8_t *const buffer_end = r->buffer_end;
   const uint8_t *buffer = r->buffer;
   const uint8_t *buffer_start = buffer;
@@ -62,13 +56,16 @@
     buffer_start = r->clear_buffer;
   }
   if (bits_left > BD_VALUE_SIZE) {
-#if UINTPTR_MAX == 0xffffffffffffffff
-      BD_VALUE big_endian_values = BIGENDIFY64(*((const BD_VALUE *) buffer));
-#else
-      BD_VALUE big_endian_values = BIGENDIFY32(*((const BD_VALUE *) buffer));
-#endif
       const int bits = (shift & 0xfffffff8) + CHAR_BIT;
-      const BD_VALUE nv = big_endian_values >> (BD_VALUE_SIZE - bits);
+      BD_VALUE nv;
+      BD_VALUE big_endian_values;
+      memcpy(&big_endian_values, buffer, sizeof(BD_VALUE));
+#if SIZE_MAX == 0xffffffffffffffffULL
+      big_endian_values = HToBE64(big_endian_values);
+#else
+      big_endian_values = HToBE32(big_endian_values);
+#endif
+      nv = big_endian_values >> (BD_VALUE_SIZE - bits);
       count += bits;
       buffer += (bits >> 3);
       value = r->value | (nv << (shift & 0x7));
@@ -97,7 +94,7 @@
   r->count = count;
 }
 
-const uint8_t *vp9_reader_find_end(vp9_reader *r) {
+const uint8_t *vpx_reader_find_end(vpx_reader *r) {
   // Find the end of the coded buffer
   while (r->count > CHAR_BIT && r->count < BD_VALUE_SIZE) {
     r->count -= CHAR_BIT;
diff --git a/vp9/decoder/vp9_reader.h b/vpx_dsp/bitreader.h
similarity index 77%
rename from vp9/decoder/vp9_reader.h
rename to vpx_dsp/bitreader.h
index 4959985..e817c8b 100644
--- a/vp9/decoder/vp9_reader.h
+++ b/vpx_dsp/bitreader.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_DECODER_VP9_READER_H_
-#define VP9_DECODER_VP9_READER_H_
+#ifndef VPX_DSP_BITREADER_H_
+#define VPX_DSP_BITREADER_H_
 
 #include <stddef.h>
 #include <limits.h>
@@ -18,8 +18,7 @@
 #include "vpx_ports/mem.h"
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_integer.h"
-
-#include "vp9/common/vp9_prob.h"
+#include "vpx_dsp/prob.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -44,19 +43,19 @@
   vpx_decrypt_cb decrypt_cb;
   void *decrypt_state;
   uint8_t clear_buffer[sizeof(BD_VALUE) + 1];
-} vp9_reader;
+} vpx_reader;
 
-int vp9_reader_init(vp9_reader *r,
+int vpx_reader_init(vpx_reader *r,
                     const uint8_t *buffer,
                     size_t size,
                     vpx_decrypt_cb decrypt_cb,
                     void *decrypt_state);
 
-void vp9_reader_fill(vp9_reader *r);
+void vpx_reader_fill(vpx_reader *r);
 
-const uint8_t *vp9_reader_find_end(vp9_reader *r);
+const uint8_t *vpx_reader_find_end(vpx_reader *r);
 
-static INLINE int vp9_reader_has_error(vp9_reader *r) {
+static INLINE int vpx_reader_has_error(vpx_reader *r) {
   // Check if we have reached the end of the buffer.
   //
   // Variable 'count' stores the number of bits in the 'value' buffer, minus
@@ -74,7 +73,7 @@
   return r->count > BD_VALUE_SIZE && r->count < LOTS_OF_BITS;
 }
 
-static INLINE int vp9_read(vp9_reader *r, int prob) {
+static INLINE int vpx_read(vpx_reader *r, int prob) {
   unsigned int bit = 0;
   BD_VALUE value;
   BD_VALUE bigsplit;
@@ -83,7 +82,7 @@
   unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT;
 
   if (r->count < 0)
-    vp9_reader_fill(r);
+    vpx_reader_fill(r);
 
   value = r->value;
   count = r->count;
@@ -99,7 +98,7 @@
   }
 
   {
-    register unsigned int shift = vp9_norm[range];
+    register unsigned int shift = vpx_norm[range];
     range <<= shift;
     value <<= shift;
     count -= shift;
@@ -111,24 +110,24 @@
   return bit;
 }
 
-static INLINE int vp9_read_bit(vp9_reader *r) {
-  return vp9_read(r, 128);  // vp9_prob_half
+static INLINE int vpx_read_bit(vpx_reader *r) {
+  return vpx_read(r, 128);  // vpx_prob_half
 }
 
-static INLINE int vp9_read_literal(vp9_reader *r, int bits) {
+static INLINE int vpx_read_literal(vpx_reader *r, int bits) {
   int literal = 0, bit;
 
   for (bit = bits - 1; bit >= 0; bit--)
-    literal |= vp9_read_bit(r) << bit;
+    literal |= vpx_read_bit(r) << bit;
 
   return literal;
 }
 
-static INLINE int vp9_read_tree(vp9_reader *r, const vp9_tree_index *tree,
-                                const vp9_prob *probs) {
-  vp9_tree_index i = 0;
+static INLINE int vpx_read_tree(vpx_reader *r, const vpx_tree_index *tree,
+                                const vpx_prob *probs) {
+  vpx_tree_index i = 0;
 
-  while ((i = tree[i + vp9_read(r, probs[i >> 1])]) > 0)
+  while ((i = tree[i + vpx_read(r, probs[i >> 1])]) > 0)
     continue;
 
   return -i;
@@ -138,4 +137,4 @@
 }  // extern "C"
 #endif
 
-#endif  // VP9_DECODER_VP9_READER_H_
+#endif  // VPX_DSP_BITREADER_H_
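Decode-side usage of the renamed reader, as a sketch: a nonzero return from vpx_reader_init means the marker bit was set, which callers treat as an error, and NULL decrypt arguments disable decryption. The field widths here are made up for illustration:

    #include "vpx_dsp/bitreader.h"

    static int parse_example(const uint8_t *buf, size_t size) {
      vpx_reader r;
      int flag, field;
      if (vpx_reader_init(&r, buf, size, NULL, NULL))
        return -1;                        /* marker bit set: corrupt data */
      flag = vpx_read_bit(&r);            /* one bit at probability 128/256 */
      field = vpx_read_literal(&r, 4);    /* four raw bits, MSB first */
      (void)flag;
      (void)field;
      return vpx_reader_has_error(&r) ? -1 : 0;
    }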
diff --git a/vp9/decoder/vp9_read_bit_buffer.c b/vpx_dsp/bitreader_buffer.c
similarity index 67%
rename from vp9/decoder/vp9_read_bit_buffer.c
rename to vpx_dsp/bitreader_buffer.c
index c3b38a9..fb04ee6 100644
--- a/vp9/decoder/vp9_read_bit_buffer.c
+++ b/vpx_dsp/bitreader_buffer.c
@@ -7,13 +7,13 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "vp9/decoder/vp9_read_bit_buffer.h"
+#include "./bitreader_buffer.h"
 
-size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) {
+size_t vpx_rb_bytes_read(struct vpx_read_bit_buffer *rb) {
   return (rb->bit_offset + 7) >> 3;
 }
 
-int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) {
+int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb) {
   const size_t off = rb->bit_offset;
   const size_t p = off >> 3;
   const int q = 7 - (int)(off & 0x7);
@@ -27,15 +27,15 @@
   }
 }
 
-int vp9_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits) {
+int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits) {
   int value = 0, bit;
   for (bit = bits - 1; bit >= 0; bit--)
-    value |= vp9_rb_read_bit(rb) << bit;
+    value |= vpx_rb_read_bit(rb) << bit;
   return value;
 }
 
-int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb,
+int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb,
                                int bits) {
-  const int value = vp9_rb_read_literal(rb, bits);
-  return vp9_rb_read_bit(rb) ? -value : value;
+  const int value = vpx_rb_read_literal(rb, bits);
+  return vpx_rb_read_bit(rb) ? -value : value;
 }
diff --git a/vpx_dsp/bitreader_buffer.h b/vpx_dsp/bitreader_buffer.h
new file mode 100644
index 0000000..03b156b
--- /dev/null
+++ b/vpx_dsp/bitreader_buffer.h
@@ -0,0 +1,45 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_BITREADER_BUFFER_H_
+#define VPX_DSP_BITREADER_BUFFER_H_
+
+#include <limits.h>
+
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*vpx_rb_error_handler)(void *data);
+
+struct vpx_read_bit_buffer {
+  const uint8_t *bit_buffer;
+  const uint8_t *bit_buffer_end;
+  size_t bit_offset;
+
+  void *error_handler_data;
+  vpx_rb_error_handler error_handler;
+};
+
+size_t vpx_rb_bytes_read(struct vpx_read_bit_buffer *rb);
+
+int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb);
+
+int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits);
+
+int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_BITREADER_BUFFER_H_
diff --git a/vp9/encoder/vp9_writer.c b/vpx_dsp/bitwriter.c
similarity index 76%
rename from vp9/encoder/vp9_writer.c
rename to vpx_dsp/bitwriter.c
index ff461f2..5b232e3 100644
--- a/vp9/encoder/vp9_writer.c
+++ b/vpx_dsp/bitwriter.c
@@ -9,23 +9,23 @@
  */
 
 #include <assert.h>
-#include "vp9/encoder/vp9_writer.h"
-#include "vp9/common/vp9_entropy.h"
 
-void vp9_start_encode(vp9_writer *br, uint8_t *source) {
+#include "./bitwriter.h"
+
+void vpx_start_encode(vpx_writer *br, uint8_t *source) {
   br->lowvalue = 0;
   br->range    = 255;
   br->count    = -24;
   br->buffer   = source;
   br->pos      = 0;
-  vp9_write_bit(br, 0);
+  vpx_write_bit(br, 0);
 }
 
-void vp9_stop_encode(vp9_writer *br) {
+void vpx_stop_encode(vpx_writer *br) {
   int i;
 
   for (i = 0; i < 32; i++)
-    vp9_write_bit(br, 0);
+    vpx_write_bit(br, 0);
 
  // Ensure there's no ambiguous collision with any index marker bytes
   if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0)
diff --git a/vp9/encoder/vp9_writer.h b/vpx_dsp/bitwriter.h
similarity index 70%
rename from vp9/encoder/vp9_writer.h
rename to vpx_dsp/bitwriter.h
index e347ea4..f6ca9b9 100644
--- a/vp9/encoder/vp9_writer.h
+++ b/vpx_dsp/bitwriter.h
@@ -8,29 +8,29 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_WRITER_H_
-#define VP9_ENCODER_VP9_WRITER_H_
+#ifndef VPX_DSP_BITWRITER_H_
+#define VPX_DSP_BITWRITER_H_
 
 #include "vpx_ports/mem.h"
 
-#include "vp9/common/vp9_prob.h"
+#include "vpx_dsp/prob.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-typedef struct vp9_writer {
+typedef struct vpx_writer {
   unsigned int lowvalue;
   unsigned int range;
   int count;
   unsigned int pos;
   uint8_t *buffer;
-} vp9_writer;
+} vpx_writer;
 
-void vp9_start_encode(vp9_writer *bc, uint8_t *buffer);
-void vp9_stop_encode(vp9_writer *bc);
+void vpx_start_encode(vpx_writer *bc, uint8_t *buffer);
+void vpx_stop_encode(vpx_writer *bc);
 
-static INLINE void vp9_write(vp9_writer *br, int bit, int probability) {
+static INLINE void vpx_write(vpx_writer *br, int bit, int probability) {
   unsigned int split;
   int count = br->count;
   unsigned int range = br->range;
@@ -46,7 +46,7 @@
     range = br->range - split;
   }
 
-  shift = vp9_norm[range];
+  shift = vpx_norm[range];
 
   range <<= shift;
   count += shift;
@@ -78,21 +78,21 @@
   br->range = range;
 }
 
-static INLINE void vp9_write_bit(vp9_writer *w, int bit) {
-  vp9_write(w, bit, 128);  // vp9_prob_half
+static INLINE void vpx_write_bit(vpx_writer *w, int bit) {
+  vpx_write(w, bit, 128);  // vpx_prob_half
 }
 
-static INLINE void vp9_write_literal(vp9_writer *w, int data, int bits) {
+static INLINE void vpx_write_literal(vpx_writer *w, int data, int bits) {
   int bit;
 
   for (bit = bits - 1; bit >= 0; bit--)
-    vp9_write_bit(w, 1 & (data >> bit));
+    vpx_write_bit(w, 1 & (data >> bit));
 }
 
-#define vp9_write_prob(w, v) vp9_write_literal((w), (v), 8)
+#define vpx_write_prob(w, v) vpx_write_literal((w), (v), 8)
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_WRITER_H_
+#endif  // VPX_DSP_BITWRITER_H_
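The encode-side counterpart, as a sketch mirroring the reader example; the caller-provided buffer size and the 200/256 probability are illustrative assumptions:

    #include "vpx_dsp/bitwriter.h"

    static unsigned int emit_example(uint8_t *buf /* large enough */) {
      vpx_writer w;
      vpx_start_encode(&w, buf);
      vpx_write_bit(&w, 1);           /* one bit at probability 128/256 */
      vpx_write_literal(&w, 9, 4);    /* four raw bits, MSB first */
      vpx_write(&w, 0, 200);          /* a biased bit: P(zero) = 200/256 */
      vpx_stop_encode(&w);
      return w.pos;                   /* bytes produced */
    }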
diff --git a/vp9/encoder/vp9_write_bit_buffer.c b/vpx_dsp/bitwriter_buffer.c
similarity index 75%
rename from vp9/encoder/vp9_write_bit_buffer.c
rename to vpx_dsp/bitwriter_buffer.c
index 6d55e84..0dfb859 100644
--- a/vp9/encoder/vp9_write_bit_buffer.c
+++ b/vpx_dsp/bitwriter_buffer.c
@@ -9,13 +9,14 @@
  */
 
 #include <limits.h>
-#include "vp9/encoder/vp9_write_bit_buffer.h"
 
-size_t vp9_wb_bytes_written(const struct vp9_write_bit_buffer *wb) {
+#include "./bitwriter_buffer.h"
+
+size_t vpx_wb_bytes_written(const struct vpx_write_bit_buffer *wb) {
   return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
 }
 
-void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit) {
+void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit) {
   const int off = (int)wb->bit_offset;
   const int p = off / CHAR_BIT;
   const int q = CHAR_BIT - 1 - off % CHAR_BIT;
@@ -28,8 +29,8 @@
   wb->bit_offset = off + 1;
 }
 
-void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb, int data, int bits) {
+void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits) {
   int bit;
   for (bit = bits - 1; bit >= 0; bit--)
-    vp9_wb_write_bit(wb, (data >> bit) & 1);
+    vpx_wb_write_bit(wb, (data >> bit) & 1);
 }
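A round-trip sketch tying the raw bit writer to its reader counterpart; the error handler is left NULL, which is safe here because the read stays inside the buffer:

    #include "vpx_dsp/bitreader_buffer.h"
    #include "vpx_dsp/bitwriter_buffer.h"

    static int roundtrip_ok(void) {
      uint8_t buf[4] = { 0 };
      struct vpx_write_bit_buffer wb = { buf, 0 };
      struct vpx_read_bit_buffer rb = { buf, buf + sizeof(buf), 0, NULL, NULL };
      vpx_wb_write_literal(&wb, 0x2a, 6);           /* writes bits 101010 */
      return vpx_rb_read_literal(&rb, 6) == 0x2a;   /* 1 on success */
    }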
diff --git a/vp9/encoder/vp9_write_bit_buffer.h b/vpx_dsp/bitwriter_buffer.h
similarity index 62%
rename from vp9/encoder/vp9_write_bit_buffer.h
rename to vpx_dsp/bitwriter_buffer.h
index 59f9bbe..9397668 100644
--- a/vp9/encoder/vp9_write_bit_buffer.h
+++ b/vpx_dsp/bitwriter_buffer.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_
-#define VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_
+#ifndef VPX_DSP_BITWRITER_BUFFER_H_
+#define VPX_DSP_BITWRITER_BUFFER_H_
 
 #include "vpx/vpx_integer.h"
 
@@ -17,20 +17,20 @@
 extern "C" {
 #endif
 
-struct vp9_write_bit_buffer {
+struct vpx_write_bit_buffer {
   uint8_t *bit_buffer;
   size_t bit_offset;
 };
 
-size_t vp9_wb_bytes_written(const struct vp9_write_bit_buffer *wb);
+size_t vpx_wb_bytes_written(const struct vpx_write_bit_buffer *wb);
 
-void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit);
+void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit);
 
-void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb, int data, int bits);
+void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits);
 
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_
+#endif  // VPX_DSP_BITWRITER_BUFFER_H_
diff --git a/vpx_dsp/fwd_txfm.c b/vpx_dsp/fwd_txfm.c
new file mode 100644
index 0000000..bdb55ee
--- /dev/null
+++ b/vpx_dsp/fwd_txfm.c
@@ -0,0 +1,361 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/fwd_txfm.h"
+
+void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+  // The 2D transform is done with two passes which are actually pretty
+  // similar. In the first one, we transform the columns and transpose
+  // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we transpose the columns (that
+  // is the transposed rows) and transpose the results (so that it goes back
+  // in normal/row positions).
+  int pass;
+  // We need an intermediate buffer between passes.
+  tran_low_t intermediate[4 * 4];
+  const int16_t *in_pass0 = input;
+  const tran_low_t *in = NULL;
+  tran_low_t *out = intermediate;
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    tran_high_t input[4];      // canbe16
+    tran_high_t step[4];       // canbe16
+    tran_high_t temp1, temp2;  // needs32
+    int i;
+    for (i = 0; i < 4; ++i) {
+      // Load inputs.
+      if (0 == pass) {
+        input[0] = in_pass0[0 * stride] * 16;
+        input[1] = in_pass0[1 * stride] * 16;
+        input[2] = in_pass0[2 * stride] * 16;
+        input[3] = in_pass0[3 * stride] * 16;
+        if (i == 0 && input[0]) {
+          input[0] += 1;
+        }
+      } else {
+        input[0] = in[0 * 4];
+        input[1] = in[1 * 4];
+        input[2] = in[2 * 4];
+        input[3] = in[3 * 4];
+      }
+      // Transform.
+      step[0] = input[0] + input[3];
+      step[1] = input[1] + input[2];
+      step[2] = input[1] - input[2];
+      step[3] = input[0] - input[3];
+      temp1 = (step[0] + step[1]) * cospi_16_64;
+      temp2 = (step[0] - step[1]) * cospi_16_64;
+      out[0] = (tran_low_t)fdct_round_shift(temp1);
+      out[2] = (tran_low_t)fdct_round_shift(temp2);
+      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
+      out[1] = (tran_low_t)fdct_round_shift(temp1);
+      out[3] = (tran_low_t)fdct_round_shift(temp2);
+      // Do next column (which is a transposed row in second/horizontal pass)
+      in_pass0++;
+      in++;
+      out += 4;
+    }
+    // Set up in/out for next pass.
+    in = intermediate;
+    out = output;
+  }
+
+  {
+    int i, j;
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j)
+        output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
+    }
+  }
+}
+
+void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
+  int i, j;
+  tran_low_t intermediate[64];
+  int pass;
+  tran_low_t *output = intermediate;
+  const tran_low_t *in = NULL;
+
+  // Transform columns
+  for (pass = 0; pass < 2; ++pass) {
+    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+    tran_high_t t0, t1, t2, t3;                  // needs32
+    tran_high_t x0, x1, x2, x3;                  // canbe16
+
+    int i;
+    for (i = 0; i < 8; i++) {
+      // stage 1
+      if (pass == 0) {
+        s0 = (input[0 * stride] + input[7 * stride]) * 4;
+        s1 = (input[1 * stride] + input[6 * stride]) * 4;
+        s2 = (input[2 * stride] + input[5 * stride]) * 4;
+        s3 = (input[3 * stride] + input[4 * stride]) * 4;
+        s4 = (input[3 * stride] - input[4 * stride]) * 4;
+        s5 = (input[2 * stride] - input[5 * stride]) * 4;
+        s6 = (input[1 * stride] - input[6 * stride]) * 4;
+        s7 = (input[0 * stride] - input[7 * stride]) * 4;
+        ++input;
+      } else {
+        s0 = in[0 * 8] + in[7 * 8];
+        s1 = in[1 * 8] + in[6 * 8];
+        s2 = in[2 * 8] + in[5 * 8];
+        s3 = in[3 * 8] + in[4 * 8];
+        s4 = in[3 * 8] - in[4 * 8];
+        s5 = in[2 * 8] - in[5 * 8];
+        s6 = in[1 * 8] - in[6 * 8];
+        s7 = in[0 * 8] - in[7 * 8];
+        ++in;
+      }
+
+      // fdct4(step, step);
+      x0 = s0 + s3;
+      x1 = s1 + s2;
+      x2 = s1 - s2;
+      x3 = s0 - s3;
+      t0 = (x0 + x1) * cospi_16_64;
+      t1 = (x0 - x1) * cospi_16_64;
+      t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
+      t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
+      output[0] = (tran_low_t)fdct_round_shift(t0);
+      output[2] = (tran_low_t)fdct_round_shift(t2);
+      output[4] = (tran_low_t)fdct_round_shift(t1);
+      output[6] = (tran_low_t)fdct_round_shift(t3);
+
+      // Stage 2
+      t0 = (s6 - s5) * cospi_16_64;
+      t1 = (s6 + s5) * cospi_16_64;
+      t2 = fdct_round_shift(t0);
+      t3 = fdct_round_shift(t1);
+
+      // Stage 3
+      x0 = s4 + t2;
+      x1 = s4 - t2;
+      x2 = s7 - t3;
+      x3 = s7 + t3;
+
+      // Stage 4
+      t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+      output[1] = (tran_low_t)fdct_round_shift(t0);
+      output[3] = (tran_low_t)fdct_round_shift(t2);
+      output[5] = (tran_low_t)fdct_round_shift(t1);
+      output[7] = (tran_low_t)fdct_round_shift(t3);
+      output += 8;
+    }
+    in  = intermediate;
+    output = final_output;
+  }
+
+  // Rows
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      final_output[j + i * 8] /= 2;
+  }
+}
+
+void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
+  // The 2D transform is done with two passes which are actually pretty
+  // similar. In the first one, we transform the columns and transpose
+  // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we transpose the columns (that
+  // is the transposed rows) and transpose the results (so that it goes back
+  // in normal/row positions).
+  int pass;
+  // We need an intermediate buffer between passes.
+  tran_low_t intermediate[256];
+  const int16_t *in_pass0 = input;
+  const tran_low_t *in = NULL;
+  tran_low_t *out = intermediate;
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    tran_high_t step1[8];      // canbe16
+    tran_high_t step2[8];      // canbe16
+    tran_high_t step3[8];      // canbe16
+    tran_high_t input[8];      // canbe16
+    tran_high_t temp1, temp2;  // needs32
+    int i;
+    for (i = 0; i < 16; i++) {
+      if (0 == pass) {
+        // Calculate input for the first 8 results.
+        input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;
+        input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;
+        input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;
+        input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
+        input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
+        input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
+        input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;
+        input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;
+        // Calculate input for the next 8 results.
+        step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;
+        step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;
+        step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
+        step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
+        step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
+        step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;
+        step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;
+        step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;
+      } else {
+        // Calculate input for the first 8 results.
+        input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
+        input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
+        input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
+        input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
+        input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
+        input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
+        input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
+        input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
+        // Calculate input for the next 8 results.
+        step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
+        step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
+        step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
+        step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
+        step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
+        step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
+        step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
+        step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
+      }
+      // Work on the first eight values; fdct8(input, even_results);
+      {
+        tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+        tran_high_t t0, t1, t2, t3;                  // needs32
+        tran_high_t x0, x1, x2, x3;                  // canbe16
+
+        // stage 1
+        s0 = input[0] + input[7];
+        s1 = input[1] + input[6];
+        s2 = input[2] + input[5];
+        s3 = input[3] + input[4];
+        s4 = input[3] - input[4];
+        s5 = input[2] - input[5];
+        s6 = input[1] - input[6];
+        s7 = input[0] - input[7];
+
+        // fdct4(step, step);
+        x0 = s0 + s3;
+        x1 = s1 + s2;
+        x2 = s1 - s2;
+        x3 = s0 - s3;
+        t0 = (x0 + x1) * cospi_16_64;
+        t1 = (x0 - x1) * cospi_16_64;
+        t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
+        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
+        out[0] = (tran_low_t)fdct_round_shift(t0);
+        out[4] = (tran_low_t)fdct_round_shift(t2);
+        out[8] = (tran_low_t)fdct_round_shift(t1);
+        out[12] = (tran_low_t)fdct_round_shift(t3);
+
+        // Stage 2
+        t0 = (s6 - s5) * cospi_16_64;
+        t1 = (s6 + s5) * cospi_16_64;
+        t2 = fdct_round_shift(t0);
+        t3 = fdct_round_shift(t1);
+
+        // Stage 3
+        x0 = s4 + t2;
+        x1 = s4 - t2;
+        x2 = s7 - t3;
+        x3 = s7 + t3;
+
+        // Stage 4
+        t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
+        t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
+        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+        t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
+        out[2] = (tran_low_t)fdct_round_shift(t0);
+        out[6] = (tran_low_t)fdct_round_shift(t2);
+        out[10] = (tran_low_t)fdct_round_shift(t1);
+        out[14] = (tran_low_t)fdct_round_shift(t3);
+      }
+      // Work on the next eight values; step1 -> odd_results
+      {
+        // step 2
+        temp1 = (step1[5] - step1[2]) * cospi_16_64;
+        temp2 = (step1[4] - step1[3]) * cospi_16_64;
+        step2[2] = fdct_round_shift(temp1);
+        step2[3] = fdct_round_shift(temp2);
+        temp1 = (step1[4] + step1[3]) * cospi_16_64;
+        temp2 = (step1[5] + step1[2]) * cospi_16_64;
+        step2[4] = fdct_round_shift(temp1);
+        step2[5] = fdct_round_shift(temp2);
+        // step 3
+        step3[0] = step1[0] + step2[3];
+        step3[1] = step1[1] + step2[2];
+        step3[2] = step1[1] - step2[2];
+        step3[3] = step1[0] - step2[3];
+        step3[4] = step1[7] - step2[4];
+        step3[5] = step1[6] - step2[5];
+        step3[6] = step1[6] + step2[5];
+        step3[7] = step1[7] + step2[4];
+        // step 4
+        temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
+        temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
+        step2[1] = fdct_round_shift(temp1);
+        step2[2] = fdct_round_shift(temp2);
+        temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
+        temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
+        step2[5] = fdct_round_shift(temp1);
+        step2[6] = fdct_round_shift(temp2);
+        // step 5
+        step1[0] = step3[0] + step2[1];
+        step1[1] = step3[0] - step2[1];
+        step1[2] = step3[3] + step2[2];
+        step1[3] = step3[3] - step2[2];
+        step1[4] = step3[4] - step2[5];
+        step1[5] = step3[4] + step2[5];
+        step1[6] = step3[7] - step2[6];
+        step1[7] = step3[7] + step2[6];
+        // step 6
+        temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
+        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+        out[1] = (tran_low_t)fdct_round_shift(temp1);
+        out[9] = (tran_low_t)fdct_round_shift(temp2);
+        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+        temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
+        out[5] = (tran_low_t)fdct_round_shift(temp1);
+        out[13] = (tran_low_t)fdct_round_shift(temp2);
+        temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
+        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+        out[3] = (tran_low_t)fdct_round_shift(temp1);
+        out[11] = (tran_low_t)fdct_round_shift(temp2);
+        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+        temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
+        out[7] = (tran_low_t)fdct_round_shift(temp1);
+        out[15] = (tran_low_t)fdct_round_shift(temp2);
+      }
+      // Do next column (which is a transposed row in second/horizontal pass)
+      in++;
+      in_pass0++;
+      out += 16;
+    }
+    // Set up in/out for next pass.
+    in = intermediate;
+    out = output;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  vp9_fdct4x4_c(input, output, stride);
+}
+
+void vp9_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+                          int stride) {
+  vp9_fdct8x8_c(input, final_output, stride);
+}
+
+void vp9_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
+                            int stride) {
+  vp9_fdct16x16_c(input, output, stride);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/fwd_txfm.h b/vpx_dsp/fwd_txfm.h
new file mode 100644
index 0000000..fa54403
--- /dev/null
+++ b/vpx_dsp/fwd_txfm.h
@@ -0,0 +1,24 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_FWD_TXFM_H_
+#define VPX_DSP_FWD_TXFM_H_
+
+#include "vp9/common/vp9_idct.h"
+
+static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
+  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+  // TODO(debargha, peter.derivaz): Find new bounds for this assert
+  // and make the bounds consts.
+  // assert(INT16_MIN <= rv && rv <= INT16_MAX);
+  return rv;
+}
+
+#endif  // VPX_DSP_FWD_TXFM_H_
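Numerically, with DCT_CONST_BITS equal to 14 as defined in vp9_idct.h, fdct_round_shift adds half an LSB and shifts, turning a product with a Q14 cosine constant back into integer precision. A worked check, where 11585 is round(cos(pi/4) * 2^14); the local macros simply mirror the libvpx definitions:

    #include <stdio.h>

    #define DCT_CONST_BITS 14
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

    int main(void) {
      const int cospi_16_64 = 11585;  /* round(cos(pi/4) * (1 << 14)) */
      /* (100 * 11585 + 8192) >> 14 == 71, and 100 * cos(pi/4) ~= 70.71. */
      printf("%d\n", ROUND_POWER_OF_TWO(100 * cospi_16_64, DCT_CONST_BITS));
      return 0;
    }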
diff --git a/vp9/common/vp9_loopfilter_filters.c b/vpx_dsp/loopfilter.c
similarity index 91%
rename from vp9/common/vp9_loopfilter_filters.c
rename to vpx_dsp/loopfilter.c
index 3cf4c32..dc8aca5 100644
--- a/vp9/common/vp9_loopfilter_filters.c
+++ b/vpx_dsp/loopfilter.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -9,10 +9,8 @@
  */
 
 #include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
 
 static INLINE int8_t signed_char_clamp(int t) {
   return (int8_t)clamp(t, -128, 127);
@@ -117,7 +115,7 @@
   *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
 }
 
-void vp9_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
+void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
                             const uint8_t *blimit, const uint8_t *limit,
                             const uint8_t *thresh, int count) {
   int i;
@@ -134,15 +132,15 @@
   }
 }
 
-void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
-  vp9_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh,
                           int count) {
   int i;
@@ -159,12 +157,12 @@
   }
 }
 
-void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                const uint8_t *limit0, const uint8_t *thresh0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1) {
-  vp9_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
+  vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
                                   thresh1, 1);
 }
 
@@ -189,7 +187,7 @@
   }
 }
 
-void vp9_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh,
                             int count) {
   int i;
@@ -209,15 +207,15 @@
   }
 }
 
-void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
-  vp9_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh,
                           int count) {
   int i;
@@ -234,12 +232,12 @@
   }
 }
 
-void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                const uint8_t *limit0, const uint8_t *thresh0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1) {
-  vp9_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
+  vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
                                     thresh1, 1);
 }
 
@@ -294,7 +292,7 @@
   }
 }
 
-void vp9_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh,
                              int count) {
   int i;
@@ -343,12 +341,12 @@
   }
 }
 
-void vp9_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
   mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
 }
 
-void vp9_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh) {
   mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
 }
@@ -448,7 +446,7 @@
   *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
 }
 
-void vp9_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh, int count, int bd) {
   int i;
@@ -471,7 +469,7 @@
   }
 }
 
-void vp9_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
                                         const uint8_t *blimit0,
                                         const uint8_t *limit0,
                                         const uint8_t *thresh0,
@@ -479,11 +477,11 @@
                                         const uint8_t *limit1,
                                         const uint8_t *thresh1,
                                         int bd) {
-  vp9_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1, bd);
-  vp9_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+  vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
 }
 
-void vp9_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int count, int bd) {
   int i;
@@ -500,7 +498,7 @@
   }
 }
 
-void vp9_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch,
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch,
                                       const uint8_t *blimit0,
                                       const uint8_t *limit0,
                                       const uint8_t *thresh0,
@@ -508,8 +506,8 @@
                                       const uint8_t *limit1,
                                       const uint8_t *thresh1,
                                       int bd) {
-  vp9_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
-  vp9_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
+  vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
                               thresh1, 1, bd);
 }
 
@@ -534,7 +532,7 @@
   }
 }
 
-void vp9_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int count, int bd) {
   int i;
@@ -556,7 +554,7 @@
   }
 }
 
-void vp9_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
                                         const uint8_t *blimit0,
                                         const uint8_t *limit0,
                                         const uint8_t *thresh0,
@@ -564,11 +562,11 @@
                                         const uint8_t *limit1,
                                         const uint8_t *thresh1,
                                         int bd) {
-  vp9_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1, bd);
-  vp9_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+  vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
 }
 
-void vp9_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int count, int bd) {
   int i;
@@ -588,7 +586,7 @@
   }
 }
 
-void vp9_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch,
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch,
                                       const uint8_t *blimit0,
                                       const uint8_t *limit0,
                                       const uint8_t *thresh0,
@@ -596,8 +594,8 @@
                                       const uint8_t *limit1,
                                       const uint8_t *thresh1,
                                       int bd) {
-  vp9_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
-  vp9_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
+  vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
                               thresh1, 1, bd);
 }
 
@@ -664,7 +662,7 @@
   }
 }
 
-void vp9_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit, const uint8_t *thresh,
                                     int count, int bd) {
   int i;
@@ -729,13 +727,13 @@
   }
 }
 
-void vp9_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int bd) {
   highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
 }
 
-void vp9_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
                                        const uint8_t *blimit,
                                        const uint8_t *limit,
                                        const uint8_t *thresh,
diff --git a/vpx_dsp/mips/common_dspr2.h b/vpx_dsp/mips/common_dspr2.h
new file mode 100644
index 0000000..8278101
--- /dev/null
+++ b/vpx_dsp/mips/common_dspr2.h
@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_COMMON_MIPS_DSPR2_H_
+#define VPX_COMMON_MIPS_DSPR2_H_
+
+#include <assert.h>
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if HAVE_DSPR2
+#define CROP_WIDTH 512
+
+static INLINE void prefetch_load(const unsigned char *src) {
+  __asm__ __volatile__ (
+      "pref   0,  0(%[src])   \n\t"
+      :
+      : [src] "r" (src)
+  );
+}
+
+/* prefetch data for store */
+static INLINE void prefetch_store(unsigned char *dst) {
+  __asm__ __volatile__ (
+      "pref   1,  0(%[dst])   \n\t"
+      :
+      : [dst] "r" (dst)
+  );
+}
+
+static INLINE void prefetch_load_streamed(const unsigned char *src) {
+  __asm__ __volatile__ (
+      "pref   4,  0(%[src])   \n\t"
+      :
+      : [src] "r" (src)
+  );
+}
+
+/* prefetch data for store, streamed (data not reused after the access) */
+static INLINE void prefetch_store_streamed(unsigned char *dst) {
+  __asm__ __volatile__ (
+      "pref   5,  0(%[dst])   \n\t"
+      :
+      : [dst] "r" (dst)
+  );
+}
+#endif  // #if HAVE_DSPR2
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_COMMON_MIPS_DSPR2_H_
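
The four helpers in this new header map one-to-one onto MIPS32 "pref" hint codes: 0 (load), 1 (store), 4 (load_streamed) and 5 (store_streamed); the streamed hints mark data that will not be reused after the access. A minimal usage sketch (the copy loop is hypothetical, not part of this change):

    static void copy_row(const unsigned char *src, unsigned char *dst, int w) {
      int i;
      prefetch_load(src + 32);   /* hint 0: bytes will be read shortly */
      prefetch_store(dst + 32);  /* hint 1: bytes will be written shortly */
      for (i = 0; i < w; ++i)
        dst[i] = src[i];
    }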
diff --git a/vp9/common/mips/msa/vp9_loopfilter_16_msa.c b/vpx_dsp/mips/loopfilter_16_msa.c
similarity index 95%
rename from vp9/common/mips/msa/vp9_loopfilter_16_msa.c
rename to vpx_dsp/mips/loopfilter_16_msa.c
index aeaa48e..b7c9f7b 100644
--- a/vp9/common/mips/msa/vp9_loopfilter_16_msa.c
+++ b/vpx_dsp/mips/loopfilter_16_msa.c
@@ -9,9 +9,9 @@
  */
 
 #include "vpx_ports/mem.h"
-#include "vp9/common/mips/msa/vp9_loopfilter_msa.h"
+#include "vpx_dsp/mips/loopfilter_msa.h"
 
-int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
+int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
                                  uint8_t *filter48,
                                  const uint8_t *b_limit_ptr,
                                  const uint8_t *limit_ptr,
@@ -79,7 +79,7 @@
   }
 }
 
-void vp9_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
+void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
   v16u8 flat, flat2, filter8;
   v16i8 zero = { 0 };
   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
@@ -405,7 +405,7 @@
   }
 }
 
-void vp9_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
                                     const uint8_t *b_limit_ptr,
                                     const uint8_t *limit_ptr,
                                     const uint8_t *thresh_ptr,
@@ -415,15 +415,15 @@
 
   (void)count;
 
-  early_exit = vp9_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
+  early_exit = vpx_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
                                         limit_ptr, thresh_ptr);
 
   if (0 == early_exit) {
-    vp9_hz_lpf_t16_16w(src, pitch, filter48);
+    vpx_hz_lpf_t16_16w(src, pitch, filter48);
   }
 }
 
-void vp9_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
                                const uint8_t *b_limit_ptr,
                                const uint8_t *limit_ptr,
                                const uint8_t *thresh_ptr,
@@ -643,13 +643,13 @@
       }
     }
   } else {
-    vp9_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr,
+    vpx_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr,
                                    thresh_ptr, count);
   }
 }
 
-static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
-                                       uint8_t *output, int32_t out_pitch) {
+static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
+                                   uint8_t *output, int32_t out_pitch) {
   v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
   v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
@@ -673,8 +673,8 @@
   ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
 }
 
-static void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
-                                       uint8_t *output, int32_t out_pitch) {
+static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
+                                   uint8_t *output, int32_t out_pitch) {
   v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
 
@@ -685,8 +685,8 @@
   ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
 }
 
-static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch,
-                                uint8_t *output, int32_t out_pitch) {
+static void transpose_16x16(uint8_t *input, int32_t in_pitch,
+                            uint8_t *output, int32_t out_pitch) {
   v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
   v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
@@ -744,7 +744,7 @@
   ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
 }
 
-int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
+int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
                                 uint8_t *src_org, int32_t pitch_org,
                                 const uint8_t *b_limit_ptr,
                                 const uint8_t *limit_ptr,
@@ -812,7 +812,7 @@
   }
 }
 
-int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                           uint8_t *filter48) {
   v16i8 zero = { 0 };
   v16u8 filter8, flat, flat2;
@@ -1032,7 +1032,7 @@
   }
 }
 
-void vp9_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr) {
@@ -1040,23 +1040,23 @@
   DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
   uint8_t *filter48 = &transposed_input[16 * 16];
 
-  vp9_transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
+  transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
 
-  early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
+  early_exit = vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
                                        &filter48[0], src, pitch, b_limit_ptr,
                                        limit_ptr, thresh_ptr);
 
   if (0 == early_exit) {
-    early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
+    early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
                                    &filter48[0]);
 
     if (0 == early_exit) {
-      vp9_transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
+      transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
     }
   }
 }
 
-int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
+int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
                                  uint8_t *src_org, int32_t pitch,
                                  const uint8_t *b_limit_ptr,
                                  const uint8_t *limit_ptr,
@@ -1134,7 +1134,7 @@
   }
 }
 
-int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                            uint8_t *filter48) {
   v16u8 flat, flat2, filter8;
   v16i8 zero = { 0 };
@@ -1455,7 +1455,7 @@
   }
 }
 
-void vp9_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit_ptr,
                                   const uint8_t *limit_ptr,
                                   const uint8_t *thresh_ptr) {
@@ -1463,18 +1463,18 @@
   DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
   uint8_t *filter48 = &transposed_input[16 * 16];
 
-  vp9_transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
+  transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
 
-  early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
+  early_exit = vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
                                         &filter48[0], src, pitch, b_limit_ptr,
                                         limit_ptr, thresh_ptr);
 
   if (0 == early_exit) {
-    early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
+    early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
                                     &filter48[0]);
 
     if (0 == early_exit) {
-      vp9_transpose_16x16(transposed_input, 16, (src - 8), pitch);
+      transpose_16x16(transposed_input, 16, (src - 8), pitch);
     }
   }
 }
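
The vertical 16-wide filters above use a transpose/filter/transpose pattern: columns are awkward for MSA vectors, so the edge neighborhood is transposed into a 32-byte-aligned scratch buffer, the row-oriented filter stages run there, and the result is transposed back only when a stage actually modified pixels. Condensed shape of vpx_lpf_vertical_16_dual_msa, using only names from the code above:

    DECLARE_ALIGNED(32, uint8_t, scratch[16 * 24]);
    uint8_t *filter48 = &scratch[16 * 16];

    transpose_16x16(src - 8, pitch, scratch, 16);
    if (!vpx_vt_lpf_t4_and_t8_16w(scratch + 16 * 8, filter48, src, pitch,
                                  b_limit_ptr, limit_ptr, thresh_ptr) &&
        !vpx_vt_lpf_t16_16w(scratch + 16 * 8, src, pitch, filter48))
      transpose_16x16(scratch, 16, src - 8, pitch);  /* write filtered rows back */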
diff --git a/vp9/common/mips/msa/vp9_loopfilter_4_msa.c b/vpx_dsp/mips/loopfilter_4_msa.c
similarity index 94%
rename from vp9/common/mips/msa/vp9_loopfilter_4_msa.c
rename to vpx_dsp/mips/loopfilter_4_msa.c
index 7f69135..daf5f38 100644
--- a/vp9/common/mips/msa/vp9_loopfilter_4_msa.c
+++ b/vpx_dsp/mips/loopfilter_4_msa.c
@@ -8,9 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp9/common/mips/msa/vp9_loopfilter_msa.h"
+#include "vpx_dsp/mips/loopfilter_msa.h"
 
-void vp9_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
                               const uint8_t *b_limit_ptr,
                               const uint8_t *limit_ptr,
                               const uint8_t *thresh_ptr,
@@ -39,7 +39,7 @@
   SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
 }
 
-void vp9_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
                                    const uint8_t *b_limit0_ptr,
                                    const uint8_t *limit0_ptr,
                                    const uint8_t *thresh0_ptr,
@@ -71,7 +71,7 @@
   ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
 }
 
-void vp9_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
                             const uint8_t *b_limit_ptr,
                             const uint8_t *limit_ptr,
                             const uint8_t *thresh_ptr,
@@ -102,7 +102,7 @@
   ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
 }
 
-void vp9_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
                                  const uint8_t *b_limit0_ptr,
                                  const uint8_t *limit0_ptr,
                                  const uint8_t *thresh0_ptr,
diff --git a/vp9/common/mips/msa/vp9_loopfilter_8_msa.c b/vpx_dsp/mips/loopfilter_8_msa.c
similarity index 97%
rename from vp9/common/mips/msa/vp9_loopfilter_8_msa.c
rename to vpx_dsp/mips/loopfilter_8_msa.c
index 26a858d..00b6db5 100644
--- a/vp9/common/mips/msa/vp9_loopfilter_8_msa.c
+++ b/vpx_dsp/mips/loopfilter_8_msa.c
@@ -8,9 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp9/common/mips/msa/vp9_loopfilter_msa.h"
+#include "vpx_dsp/mips/loopfilter_msa.h"
 
-void vp9_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
                               const uint8_t *b_limit_ptr,
                               const uint8_t *limit_ptr,
                               const uint8_t *thresh_ptr,
@@ -83,7 +83,7 @@
   }
 }
 
-void vp9_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
                                    const uint8_t *b_limit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
@@ -158,7 +158,7 @@
   }
 }
 
-void vp9_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
                             const uint8_t *b_limit_ptr,
                             const uint8_t *limit_ptr,
                             const uint8_t *thresh_ptr,
@@ -237,7 +237,7 @@
   }
 }
 
-void vp9_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
+void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
                                  const uint8_t *b_limit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0,
diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c b/vpx_dsp/mips/loopfilter_filters_dspr2.c
similarity index 85%
rename from vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
rename to vpx_dsp/mips/loopfilter_filters_dspr2.c
index 3df7f4c..99a96d8 100644
--- a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_filters_dspr2.c
@@ -10,17 +10,16 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
 
 #if HAVE_DSPR2
-void vp9_lpf_horizontal_4_dspr2(unsigned char *s,
+void vpx_lpf_horizontal_4_dspr2(unsigned char *s,
                                 int pitch,
                                 const uint8_t *blimit,
                                 const uint8_t *limit,
@@ -50,7 +49,7 @@
   );
 
   /* prefetch data for store */
-  vp9_prefetch_store(s);
+  prefetch_store(s);
 
   /* loop filter designed to work using chars so that we can make maximum use
      of 8 bit simd instructions. */
@@ -88,14 +87,14 @@
           : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6)
       );
 
-      vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
-                                pm1, p0, p3, p4, p5, p6,
-                                thresh_vec, &hev, &mask);
+      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
+                            pm1, p0, p3, p4, p5, p6,
+                            thresh_vec, &hev, &mask);
 
       /* if mask == 0, filtering is not needed */
       if (mask) {
         /* filtering */
-        vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
 
         __asm__ __volatile__ (
             "sw     %[p1],  (%[s1])    \n\t"
@@ -114,7 +113,7 @@
   }
 }
 
-void vp9_lpf_vertical_4_dspr2(unsigned char *s,
+void vpx_lpf_vertical_4_dspr2(unsigned char *s,
                               int pitch,
                               const uint8_t *blimit,
                               const uint8_t *limit,
@@ -144,7 +143,7 @@
   );
 
   /* prefetch data for store */
-  vp9_prefetch_store(s + pitch);
+  prefetch_store(s + pitch);
 
   for (i = 0; i < 2; i++) {
     s1 = s;
@@ -217,14 +216,14 @@
      * mask will be zero and filtering is not needed
      */
     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
-      vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
-                                p0, p3, p4, p5, p6, thresh_vec,
-                                &hev, &mask);
+      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
+                            p0, p3, p4, p5, p6, thresh_vec,
+                            &hev, &mask);
 
       /* if mask == 0, filtering is not needed */
       if (mask) {
         /* filtering */
-        vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
 
         /* unpack processed 4x4 neighborhood
          * don't use transpose on output data
@@ -307,56 +306,56 @@
   }
 }
 
-void vp9_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */,
+void vpx_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */,
                                      const uint8_t *blimit0,
                                      const uint8_t *limit0,
                                      const uint8_t *thresh0,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1) {
-  vp9_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
+void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
                                      const uint8_t *blimit0,
                                      const uint8_t *limit0,
                                      const uint8_t *thresh0,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1) {
-  vp9_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
+void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
                                    const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
-  vp9_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
 }
 
-void vp9_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
+void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
                                    const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
-  vp9_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vp9_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
+  vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
+  vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
                                        1);
 }
 
-void vp9_lpf_vertical_16_dual_dspr2(uint8_t *s, int p,
+void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p,
                                     const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh) {
-  vp9_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
-  vp9_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
+  vpx_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
 }
 #endif  // #if HAVE_DSPR2
diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h b/vpx_dsp/mips/loopfilter_filters_dspr2.h
similarity index 85%
rename from vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
rename to vpx_dsp/mips/loopfilter_filters_dspr2.h
index 675db65..4a1506b 100644
--- a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
+++ b/vpx_dsp/mips/loopfilter_filters_dspr2.h
@@ -13,10 +13,10 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_onyxc_int.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -24,10 +24,10 @@
 
 #if HAVE_DSPR2
 /* inputs & outputs are quad-byte vectors */
-static INLINE void vp9_filter_dspr2(uint32_t mask, uint32_t hev,
-                                    uint32_t *ps1, uint32_t *ps0,
-                                    uint32_t *qs0, uint32_t *qs1) {
-  int32_t   vp9_filter_l, vp9_filter_r;
+static INLINE void filter_dspr2(uint32_t mask, uint32_t hev,
+                                uint32_t *ps1, uint32_t *ps0,
+                                uint32_t *qs0, uint32_t *qs1) {
+  int32_t   vpx_filter_l, vpx_filter_r;
   int32_t   Filter1_l, Filter1_r, Filter2_l, Filter2_r;
   int32_t   subr_r, subr_l;
   uint32_t  t1, t2, HWM, t3;
@@ -73,34 +73,34 @@
   hev_r = hev_r & HWM;
 
   __asm__ __volatile__ (
-      /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */
-      "subq_s.ph    %[vp9_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
-      "subq_s.ph    %[vp9_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
+      /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
+      "subq_s.ph    %[vpx_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
+      "subq_s.ph    %[vpx_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
 
       /* qs0 - ps0 */
       "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
       "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
 
-      /* vp9_filter &= hev; */
-      "and          %[vp9_filter_l], %[vp9_filter_l], %[hev_l]        \n\t"
-      "and          %[vp9_filter_r], %[vp9_filter_r], %[hev_r]        \n\t"
+      /* vpx_filter &= hev; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[hev_l]        \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[hev_r]        \n\t"
 
-      /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0)); */
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
       "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
       "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
 
-      /* vp9_filter &= mask; */
-      "and          %[vp9_filter_l], %[vp9_filter_l], %[mask_l]       \n\t"
-      "and          %[vp9_filter_r], %[vp9_filter_r], %[mask_r]       \n\t"
+      /* vpx_filter &= mask; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[mask_l]       \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[mask_r]       \n\t"
 
-      : [vp9_filter_l] "=&r" (vp9_filter_l),
-        [vp9_filter_r] "=&r" (vp9_filter_r),
+      : [vpx_filter_l] "=&r" (vpx_filter_l),
+        [vpx_filter_r] "=&r" (vpx_filter_r),
         [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
         [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
       : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
@@ -113,13 +113,13 @@
 
   /* save bottom 3 bits so that we round one side +4 and the other +3 */
   __asm__ __volatile__ (
-      /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >>= 3; */
-      "addq_s.ph    %[Filter1_l],    %[vp9_filter_l], %[t2]           \n\t"
-      "addq_s.ph    %[Filter1_r],    %[vp9_filter_r], %[t2]           \n\t"
+      /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */
+      "addq_s.ph    %[Filter1_l],    %[vpx_filter_l], %[t2]           \n\t"
+      "addq_s.ph    %[Filter1_r],    %[vpx_filter_r], %[t2]           \n\t"
 
-      /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >>= 3; */
-      "addq_s.ph    %[Filter2_l],    %[vp9_filter_l], %[t1]           \n\t"
-      "addq_s.ph    %[Filter2_r],    %[vp9_filter_r], %[t1]           \n\t"
+      /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >>= 3; */
+      "addq_s.ph    %[Filter2_l],    %[vpx_filter_l], %[t1]           \n\t"
+      "addq_s.ph    %[Filter2_r],    %[vpx_filter_r], %[t1]           \n\t"
       "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
       "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
 
@@ -142,23 +142,23 @@
         [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
         [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
       : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
-        [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r)
+        [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r)
   );
 
   __asm__ __volatile__ (
-      /* (vp9_filter += 1) >>= 1 */
+      /* (vpx_filter += 1) >>= 1 */
       "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
       "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
 
-      /* vp9_filter &= ~hev; */
+      /* vpx_filter &= ~hev; */
       "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
       "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
 
-      /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */
+      /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */
       "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
       "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
 
-      /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */
+      /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */
       "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
       "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
 
@@ -196,12 +196,12 @@
   *qs1 = vqs1 ^ N128;
 }
 
-static INLINE void vp9_filter1_dspr2(uint32_t mask, uint32_t hev,
-                                     uint32_t ps1, uint32_t ps0,
-                                     uint32_t qs0, uint32_t qs1,
-                                     uint32_t *p1_f0, uint32_t *p0_f0,
-                                     uint32_t *q0_f0, uint32_t *q1_f0) {
-  int32_t   vp9_filter_l, vp9_filter_r;
+static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev,
+                                 uint32_t ps1, uint32_t ps0,
+                                 uint32_t qs0, uint32_t qs1,
+                                 uint32_t *p1_f0, uint32_t *p0_f0,
+                                 uint32_t *q0_f0, uint32_t *q1_f0) {
+  int32_t   vpx_filter_l, vpx_filter_r;
   int32_t   Filter1_l, Filter1_r, Filter2_l, Filter2_r;
   int32_t   subr_r, subr_l;
   uint32_t  t1, t2, HWM, t3;
@@ -247,34 +247,34 @@
   hev_r = hev_r & HWM;
 
   __asm__ __volatile__ (
-      /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */
-      "subq_s.ph    %[vp9_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
-      "subq_s.ph    %[vp9_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
+      /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */
+      "subq_s.ph    %[vpx_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
+      "subq_s.ph    %[vpx_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
 
       /* qs0 - ps0 */
       "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
       "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
 
-      /* vp9_filter &= hev; */
-      "and          %[vp9_filter_l], %[vp9_filter_l], %[hev_l]        \n\t"
-      "and          %[vp9_filter_r], %[vp9_filter_r], %[hev_r]        \n\t"
+      /* vpx_filter &= hev; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[hev_l]        \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[hev_r]        \n\t"
 
-      /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      /* vpx_filter = vp8_signed_char_clamp(vpx_filter + 3 * (qs0 - ps0)); */
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
       "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
       "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
-      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "addq_s.ph    %[vpx_filter_l], %[vpx_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vpx_filter_r], %[vpx_filter_r], %[subr_r]       \n\t"
 
-      /* vp9_filter &= mask; */
-      "and          %[vp9_filter_l], %[vp9_filter_l], %[mask_l]       \n\t"
-      "and          %[vp9_filter_r], %[vp9_filter_r], %[mask_r]       \n\t"
+      /* vpx_filter &= mask; */
+      "and          %[vpx_filter_l], %[vpx_filter_l], %[mask_l]       \n\t"
+      "and          %[vpx_filter_r], %[vpx_filter_r], %[mask_r]       \n\t"
 
-      : [vp9_filter_l] "=&r" (vp9_filter_l),
-        [vp9_filter_r] "=&r" (vp9_filter_r),
+      : [vpx_filter_l] "=&r" (vpx_filter_l),
+        [vpx_filter_r] "=&r" (vpx_filter_r),
         [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
         [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
       : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
@@ -286,13 +286,13 @@
 
   /* save bottom 3 bits so that we round one side +4 and the other +3 */
   __asm__ __volatile__ (
-      /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >>= 3; */
-      "addq_s.ph    %[Filter1_l],    %[vp9_filter_l], %[t2]           \n\t"
-      "addq_s.ph    %[Filter1_r],    %[vp9_filter_r], %[t2]           \n\t"
+      /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */
+      "addq_s.ph    %[Filter1_l],    %[vpx_filter_l], %[t2]           \n\t"
+      "addq_s.ph    %[Filter1_r],    %[vpx_filter_r], %[t2]           \n\t"
 
-      /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >>= 3; */
-      "addq_s.ph    %[Filter2_l],    %[vp9_filter_l], %[t1]           \n\t"
-      "addq_s.ph    %[Filter2_r],    %[vp9_filter_r], %[t1]           \n\t"
+      /* Filter1 = vp8_signed_char_clamp(vpx_filter + 4) >>= 3; */
+      "addq_s.ph    %[Filter2_l],    %[vpx_filter_l], %[t1]           \n\t"
+      "addq_s.ph    %[Filter2_r],    %[vpx_filter_r], %[t1]           \n\t"
       "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
       "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
 
@@ -315,23 +315,23 @@
         [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
         [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
       : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
-        [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r)
+        [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r)
   );
 
   __asm__ __volatile__ (
-      /* (vp9_filter += 1) >>= 1 */
+      /* (vpx_filter += 1) >>= 1 */
       "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
       "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
 
-      /* vp9_filter &= ~hev; */
+      /* vpx_filter &= ~hev; */
       "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
       "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
 
-      /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */
+      /* vps1 = vp8_signed_char_clamp(ps1 + vpx_filter); */
       "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
       "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
 
-      /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */
+      /* vqs1 = vp8_signed_char_clamp(qs1 - vpx_filter); */
       "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
       "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
 
@@ -369,10 +369,10 @@
   *q1_f0 = vqs1 ^ N128;
 }
 
-static INLINE void vp9_mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
-                                      uint32_t *op1, uint32_t *op0,
-                                      uint32_t *oq0, uint32_t *oq1,
-                                      uint32_t *oq2, uint32_t *oq3) {
+static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
+                                  uint32_t *op1, uint32_t *op0,
+                                  uint32_t *oq0, uint32_t *oq1,
+                                  uint32_t *oq2, uint32_t *oq3) {
   /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
   const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
   const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
@@ -446,14 +446,14 @@
   *oq2 = res_oq2;
 }
 
-static INLINE void vp9_mbfilter1_dspr2(uint32_t p3, uint32_t p2,
-                                       uint32_t p1, uint32_t p0,
-                                       uint32_t q0, uint32_t q1,
-                                       uint32_t q2, uint32_t q3,
-                                       uint32_t *op2_f1,
-                                       uint32_t *op1_f1, uint32_t *op0_f1,
-                                       uint32_t *oq0_f1, uint32_t *oq1_f1,
-                                       uint32_t *oq2_f1) {
+static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2,
+                                   uint32_t p1, uint32_t p0,
+                                   uint32_t q0, uint32_t q1,
+                                   uint32_t q2, uint32_t q3,
+                                   uint32_t *op2_f1,
+                                   uint32_t *op1_f1, uint32_t *op0_f1,
+                                   uint32_t *oq0_f1, uint32_t *oq1_f1,
+                                   uint32_t *oq2_f1) {
   /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
   uint32_t  res_op2, res_op1, res_op0;
   uint32_t  res_oq0, res_oq1, res_oq2;
@@ -524,14 +524,14 @@
   *oq2_f1 = res_oq2;
 }
 
-static INLINE void vp9_wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
-                                           uint32_t *op5, uint32_t *op4,
-                                           uint32_t *op3, uint32_t *op2,
-                                           uint32_t *op1, uint32_t *op0,
-                                           uint32_t *oq0, uint32_t *oq1,
-                                           uint32_t *oq2, uint32_t *oq3,
-                                           uint32_t *oq4, uint32_t *oq5,
-                                           uint32_t *oq6, uint32_t *oq7) {
+static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
+                                       uint32_t *op5, uint32_t *op4,
+                                       uint32_t *op3, uint32_t *op2,
+                                       uint32_t *op1, uint32_t *op0,
+                                       uint32_t *oq0, uint32_t *oq1,
+                                       uint32_t *oq2, uint32_t *oq3,
+                                       uint32_t *oq4, uint32_t *oq5,
+                                       uint32_t *oq6, uint32_t *oq7) {
   const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
   const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
   const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
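
The inline assembly in this header vectorizes, two pixels per register half, the scalar 4-tap filter that its comments spell out step by step. Collected in one place as a reference sketch (signed_char_clamp is the helper from loopfilter.c earlier in this change; the pointer-based signature here is illustrative only):

    static INLINE void filter4_sketch(int8_t mask, int8_t hev, uint8_t *op1,
                                      uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
      const int8_t ps1 = (int8_t)(*op1 ^ 0x80), ps0 = (int8_t)(*op0 ^ 0x80);
      const int8_t qs0 = (int8_t)(*oq0 ^ 0x80), qs1 = (int8_t)(*oq1 ^ 0x80);
      int8_t filter, filter1, filter2;
      filter = signed_char_clamp(ps1 - qs1) & hev;
      filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
      /* round one side +4 and the other +3 */
      filter1 = signed_char_clamp(filter + 4) >> 3;
      filter2 = signed_char_clamp(filter + 3) >> 3;
      *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
      *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
      filter = ((filter1 + 1) >> 1) & ~hev;  /* outer taps skip high-variance edges */
      *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
      *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
    }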
diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h b/vpx_dsp/mips/loopfilter_macros_dspr2.h
similarity index 99%
rename from vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
rename to vpx_dsp/mips/loopfilter_macros_dspr2.h
index ca01a6a..994ff18 100644
--- a/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
+++ b/vpx_dsp/mips/loopfilter_macros_dspr2.h
@@ -13,9 +13,9 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_onyxc_int.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h b/vpx_dsp/mips/loopfilter_masks_dspr2.h
similarity index 90%
rename from vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
rename to vpx_dsp/mips/loopfilter_masks_dspr2.h
index 5b0d9cc..e82dfb7 100644
--- a/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
+++ b/vpx_dsp/mips/loopfilter_masks_dspr2.h
@@ -13,9 +13,9 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_onyxc_int.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -24,13 +24,13 @@
 #if HAVE_DSPR2
 /* processing 4 pixels at the same time
  * compute hev and mask in the same function */
-static INLINE void vp9_filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
-                                             uint32_t p1, uint32_t p0,
-                                             uint32_t p3, uint32_t p2,
-                                             uint32_t q0, uint32_t q1,
-                                             uint32_t q2, uint32_t q3,
-                                             uint32_t thresh, uint32_t *hev,
-                                             uint32_t *mask) {
+static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
+                                         uint32_t p1, uint32_t p0,
+                                         uint32_t p3, uint32_t p2,
+                                         uint32_t q0, uint32_t q1,
+                                         uint32_t q2, uint32_t q3,
+                                         uint32_t thresh, uint32_t *hev,
+                                         uint32_t *mask) {
   uint32_t  c, r, r3, r_k;
   uint32_t  s1, s2, s3;
   uint32_t  ones = 0xFFFFFFFF;
@@ -129,16 +129,16 @@
   *mask = s2;
 }
 
-static INLINE void vp9_filter_hev_mask_flatmask4_dspr2(uint32_t limit,
-                                                       uint32_t flimit,
-                                                       uint32_t thresh,
-                                                       uint32_t p1, uint32_t p0,
-                                                       uint32_t p3, uint32_t p2,
-                                                       uint32_t q0, uint32_t q1,
-                                                       uint32_t q2, uint32_t q3,
-                                                       uint32_t *hev,
-                                                       uint32_t *mask,
-                                                       uint32_t *flat) {
+static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit,
+                                                   uint32_t flimit,
+                                                   uint32_t thresh,
+                                                   uint32_t p1, uint32_t p0,
+                                                   uint32_t p3, uint32_t p2,
+                                                   uint32_t q0, uint32_t q1,
+                                                   uint32_t q2, uint32_t q3,
+                                                   uint32_t *hev,
+                                                   uint32_t *mask,
+                                                   uint32_t *flat) {
   uint32_t  c, r, r3, r_k, r_flat;
   uint32_t  s1, s2, s3;
   uint32_t  ones = 0xFFFFFFFF;
@@ -279,12 +279,12 @@
   *flat = flat1;
 }
 
-static INLINE void vp9_flatmask5(uint32_t p4, uint32_t p3,
-                                 uint32_t p2, uint32_t p1,
-                                 uint32_t p0, uint32_t q0,
-                                 uint32_t q1, uint32_t q2,
-                                 uint32_t q3, uint32_t q4,
-                                 uint32_t *flat2) {
+static INLINE void flatmask5(uint32_t p4, uint32_t p3,
+                             uint32_t p2, uint32_t p1,
+                             uint32_t p0, uint32_t q0,
+                             uint32_t q1, uint32_t q2,
+                             uint32_t q3, uint32_t q4,
+                             uint32_t *flat2) {
   uint32_t  c, r, r_k, r_flat;
   uint32_t  ones = 0xFFFFFFFF;
   uint32_t  flat_thresh = 0x01010101;
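
flatmask5 here and the flat output of filter_hev_mask_flatmask4_dspr2 evaluate, four pixels at a time, the same predicate as the scalar flat-mask helpers: a pixel counts as flat when every neighbor sits within 1 of the edge pixels p0/q0, which is what the per-byte constant flat_thresh = 0x01010101 encodes. A scalar sketch of the 4-wide case (threshold 1 assumed, mirroring flat_thresh above):

    static INLINE int8_t flat_mask4_sketch(uint8_t p3, uint8_t p2, uint8_t p1,
                                           uint8_t p0, uint8_t q0, uint8_t q1,
                                           uint8_t q2, uint8_t q3) {
      int8_t flat = 0;
      flat |= (abs(p1 - p0) > 1) * -1;
      flat |= (abs(q1 - q0) > 1) * -1;
      flat |= (abs(p2 - p0) > 1) * -1;
      flat |= (abs(q2 - q0) > 1) * -1;
      flat |= (abs(p3 - p0) > 1) * -1;
      flat |= (abs(q3 - q0) > 1) * -1;
      return ~flat;  /* all ones only when the whole neighborhood is flat */
    }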
diff --git a/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c b/vpx_dsp/mips/loopfilter_mb_dspr2.c
similarity index 90%
rename from vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
rename to vpx_dsp/mips/loopfilter_mb_dspr2.c
index 7cd0b63..4138f56 100644
--- a/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_mb_dspr2.c
@@ -10,17 +10,16 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
 
 #if HAVE_DSPR2
-void vp9_lpf_horizontal_8_dspr2(unsigned char *s,
+void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
                                 int pitch,
                                 const uint8_t *blimit,
                                 const uint8_t *limit,
@@ -53,7 +52,7 @@
   );
 
   /* prefetch data for store */
-  vp9_prefetch_store(s);
+  prefetch_store(s);
 
   for (i = 0; i < 2; i++) {
     sp3 = s - (pitch << 2);
@@ -81,13 +80,13 @@
           [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
     );
 
-    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
-                                        p1, p0, p3, p2, q0, q1, q2, q3,
-                                        &hev, &mask, &flat);
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
 
     if ((flat == 0) && (mask != 0)) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       __asm__ __volatile__ (
           "sw       %[p1_f0],   (%[sp1])    \n\t"
@@ -104,13 +103,13 @@
     } else if ((mask & flat) == 0xFFFFFFFF) {
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       COMBINE_LEFT_RIGHT_0TO2()
 
@@ -130,18 +129,18 @@
       );
     } else if ((flat != 0) && (mask != 0)) {
       /* filtering */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       if (mask & flat & 0x000000FF) {
         __asm__ __volatile__ (
@@ -319,7 +318,7 @@
   }
 }
 
-void vp9_lpf_vertical_8_dspr2(unsigned char *s,
+void vpx_lpf_vertical_8_dspr2(unsigned char *s,
                               int pitch,
                               const uint8_t *blimit,
                               const uint8_t *limit,
@@ -351,7 +350,7 @@
       : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
   );
 
-  vp9_prefetch_store(s + pitch);
+  prefetch_store(s + pitch);
 
   for (i = 0; i < 2; i++) {
     s1 = s;
@@ -451,39 +450,39 @@
         :
     );
 
-    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
-                                        p1, p0, p3, p2, q0, q1, q2, q3,
-                                        &hev, &mask, &flat);
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
 
     if ((flat == 0) && (mask != 0)) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
       STORE_F0()
     } else if ((mask & flat) == 0xFFFFFFFF) {
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       STORE_F1()
     } else if ((flat != 0) && (mask != 0)) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       if (mask & flat & 0x000000FF) {
         __asm__ __volatile__ (
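[Note, not part of the patch: the _8 loop filters above pick a filter strength per 4-byte vector. The narrow filter1_dspr2 output alone is stored when the edge mask fires but no lane is flat, mbfilter_dspr2 runs on all lanes when the whole vector is flat, and the remaining branch mixes the two per lane. A scalar sketch of that selection, with hypothetical helper names:

    /* Illustrative only; lane_byte()/store_*() are hypothetical helpers. */
    if (flat == 0 && mask != 0) {
      store_filter4_all();                    /* narrow correction only */
    } else if ((mask & flat) == 0xFFFFFFFF) {
      store_filter8_all();                    /* every lane is flat */
    } else if (flat != 0 && mask != 0) {
      for (int lane = 0; lane < 4; ++lane)    /* mixed flat/non-flat lanes */
        if (lane_byte(mask, lane))
          lane_byte(flat, lane) ? store_filter8(lane) : store_filter4(lane);
    }
]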
diff --git a/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
similarity index 90%
rename from vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
rename to vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
index 6c94674..8a48650 100644
--- a/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
@@ -10,17 +10,16 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
 
 #if HAVE_DSPR2
-void vp9_lpf_horizontal_16_dspr2(unsigned char *s,
+void vpx_lpf_horizontal_16_dspr2(unsigned char *s,
                                  int pitch,
                                  const uint8_t *blimit,
                                  const uint8_t *limit,
@@ -58,7 +57,7 @@
   );
 
   /* prefetch data for store */
-  vp9_prefetch_store(s);
+  prefetch_store(s);
 
   for (i = 0; i < (2 * count); i++) {
     sp7 = s - (pitch << 3);
@@ -110,17 +109,17 @@
           [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6), [sq7] "r" (sq7)
     );
 
-    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
-                                        p1, p0, p3, p2, q0, q1, q2, q3,
-                                        &hev, &mask, &flat);
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
 
-    vp9_flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+    flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
 
     /* f0 */
     if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
         ((flat2 != 0) && (flat == 0) && (mask != 0))) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       __asm__ __volatile__ (
           "sw       %[p1_f0],   (%[sp1])            \n\t"
@@ -139,17 +138,17 @@
       /* f2 */
       PACK_LEFT_0TO3()
       PACK_LEFT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
-                              &p3_l, &p2_l, &p1_l, &p0_l,
-                              &q0_l, &q1_l, &q2_l, &q3_l,
-                              &q4_l, &q5_l, &q6_l, &q7_l);
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
 
       PACK_RIGHT_0TO3()
       PACK_RIGHT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
-                              &p3_r, &p2_r, &p1_r, &p0_r,
-                              &q0_r, &q1_r, &q2_r, &q3_r,
-                              &q4_r, &q5_r, &q6_r, &q7_r);
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
 
       COMBINE_LEFT_RIGHT_0TO2()
       COMBINE_LEFT_RIGHT_3TO6()
@@ -189,13 +188,13 @@
       /* f1 */
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       COMBINE_LEFT_RIGHT_0TO2()
 
@@ -215,18 +214,18 @@
       );
     } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
       /* f0+f1 */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       if (mask & flat & 0x000000FF) {
         __asm__ __volatile__ (
@@ -399,36 +398,36 @@
     } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
       /* f0 + f1 + f2 */
       /* f0  function */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* f1  function */
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
-                          q0_l, q1_l, q2_l, q3_l,
-                          &p2_l_f1, &p1_l_f1, &p0_l_f1,
-                          &q0_l_f1, &q1_l_f1, &q2_l_f1);
+      mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
+                      q0_l, q1_l, q2_l, q3_l,
+                      &p2_l_f1, &p1_l_f1, &p0_l_f1,
+                      &q0_l_f1, &q1_l_f1, &q2_l_f1);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
-                          q0_r, q1_r, q2_r, q3_r,
-                          &p2_r_f1, &p1_r_f1, &p0_r_f1,
-                          &q0_r_f1, &q1_r_f1, &q2_r_f1);
+      mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
+                      q0_r, q1_r, q2_r, q3_r,
+                      &p2_r_f1, &p1_r_f1, &p0_r_f1,
+                      &q0_r_f1, &q1_r_f1, &q2_r_f1);
 
       /* f2  function */
       PACK_LEFT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
-                              &p3_l, &p2_l, &p1_l, &p0_l,
-                              &q0_l, &q1_l, &q2_l, &q3_l,
-                              &q4_l, &q5_l, &q6_l, &q7_l);
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
 
       PACK_RIGHT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
-                              &p3_r, &p2_r, &p1_r, &p0_r,
-                              &q0_r, &q1_r, &q2_r, &q3_r,
-                              &q4_r, &q5_r, &q6_r, &q7_r);
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
 
       if (mask & flat & flat2 & 0x000000FF) {
         __asm__ __volatile__ (
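[Note, not part of the patch: in the 16-wide ("mb") filters above, flatmask5 extends the p3..q3 flatness test (flat) out to p7..q7 (flat2), and the f0/f1/f2 branches escalate accordingly. A per-lane scalar sketch, with hypothetical helper names:

    /* Sketch only; apply_filter*() are hypothetical helpers. */
    if (mask) {
      if (flat && flat2) apply_filter16();   /* f0+f1+f2: widest smoothing */
      else if (flat)     apply_filter8();    /* f0+f1 */
      else               apply_filter4();    /* f0 only */
    }
]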
diff --git a/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c b/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c
similarity index 90%
rename from vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
rename to vpx_dsp/mips/loopfilter_mb_vert_dspr2.c
index 851fc6c..e580f01 100644
--- a/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c
@@ -10,17 +10,16 @@
 
 #include <stdlib.h>
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
-#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/mips/common_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
+#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
+#include "vpx_mem/vpx_mem.h"
 
 #if HAVE_DSPR2
-void vp9_lpf_vertical_16_dspr2(uint8_t *s,
+void vpx_lpf_vertical_16_dspr2(uint8_t *s,
                                int pitch,
                                const uint8_t *blimit,
                                const uint8_t *limit,
@@ -55,7 +54,7 @@
       : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
   );
 
-  vp9_prefetch_store(s + pitch);
+  prefetch_store(s + pitch);
 
   for (i = 0; i < 2; i++) {
     s1 = s;
@@ -248,61 +247,61 @@
         :
     );
 
-    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
-                                        p1, p0, p3, p2, q0, q1, q2, q3,
-                                        &hev, &mask, &flat);
+    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                    p1, p0, p3, p2, q0, q1, q2, q3,
+                                    &hev, &mask, &flat);
 
-    vp9_flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+    flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
 
     /* f0 */
     if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
         ((flat2 != 0) && (flat == 0) && (mask != 0))) {
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
       STORE_F0()
    } else if ((flat2 == 0xFFFFFFFF) && (flat == 0xFFFFFFFF) &&
                (mask == 0xFFFFFFFF)) {
       /* f2 */
       PACK_LEFT_0TO3()
       PACK_LEFT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
-                              &p3_l, &p2_l, &p1_l, &p0_l,
-                              &q0_l, &q1_l, &q2_l, &q3_l,
-                              &q4_l, &q5_l, &q6_l, &q7_l);
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
 
       PACK_RIGHT_0TO3()
       PACK_RIGHT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
-                              &p3_r, &p2_r, &p1_r, &p0_r,
-                              &q0_r, &q1_r, &q2_r, &q3_r,
-                              &q4_r, &q5_r, &q6_r, &q7_r);
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
 
       STORE_F2()
     } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
       /* f1 */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       STORE_F1()
     } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
       /* f0 + f1 */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       /* left 2 element operation */
       PACK_LEFT_0TO3()
-      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
-                         &q0_l, &q1_l, &q2_l, &q3_l);
+      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                     &q0_l, &q1_l, &q2_l, &q3_l);
 
       /* right 2 element operation */
       PACK_RIGHT_0TO3()
-      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
-                         &q0_r, &q1_r, &q2_r, &q3_r);
+      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                     &q0_r, &q1_r, &q2_r, &q3_r);
 
       if (mask & flat & 0x000000FF) {
         __asm__ __volatile__ (
@@ -466,32 +465,32 @@
       }
     } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
       /* f0+f1+f2 */
-      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
-                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
 
       PACK_LEFT_0TO3()
-      vp9_mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
-                          q0_l, q1_l, q2_l, q3_l,
-                          &p2_l_f1, &p1_l_f1, &p0_l_f1,
-                          &q0_l_f1, &q1_l_f1, &q2_l_f1);
+      mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
+                      q0_l, q1_l, q2_l, q3_l,
+                      &p2_l_f1, &p1_l_f1, &p0_l_f1,
+                      &q0_l_f1, &q1_l_f1, &q2_l_f1);
 
       PACK_RIGHT_0TO3()
-      vp9_mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
-                          q0_r, q1_r, q2_r, q3_r,
-                          &p2_r_f1, &p1_r_f1, &p0_r_f1,
-                          &q0_r_f1, &q1_r_f1, &q2_r_f1);
+      mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
+                      q0_r, q1_r, q2_r, q3_r,
+                      &p2_r_f1, &p1_r_f1, &p0_r_f1,
+                      &q0_r_f1, &q1_r_f1, &q2_r_f1);
 
       PACK_LEFT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
-                              &p3_l, &p2_l, &p1_l, &p0_l,
-                              &q0_l, &q1_l, &q2_l, &q3_l,
-                              &q4_l, &q5_l, &q6_l, &q7_l);
+      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                          &p3_l, &p2_l, &p1_l, &p0_l,
+                          &q0_l, &q1_l, &q2_l, &q3_l,
+                          &q4_l, &q5_l, &q6_l, &q7_l);
 
       PACK_RIGHT_4TO7()
-      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
-                              &p3_r, &p2_r, &p1_r, &p0_r,
-                              &q0_r, &q1_r, &q2_r, &q3_r,
-                              &q4_r, &q5_r, &q6_r, &q7_r);
+      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                          &p3_r, &p2_r, &p1_r, &p0_r,
+                          &q0_r, &q1_r, &q2_r, &q3_r,
+                          &q4_r, &q5_r, &q6_r, &q7_r);
 
       if (mask & flat & flat2 & 0x000000FF) {
         __asm__ __volatile__ (
diff --git a/vp9/common/mips/msa/vp9_loopfilter_msa.h b/vpx_dsp/mips/loopfilter_msa.h
similarity index 98%
rename from vp9/common/mips/msa/vp9_loopfilter_msa.h
rename to vpx_dsp/mips/loopfilter_msa.h
index bfbe870..62b1706 100644
--- a/vp9/common/mips/msa/vp9_loopfilter_msa.h
+++ b/vpx_dsp/mips/loopfilter_msa.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_
-#define VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_
+#ifndef VPX_DSP_LOOPFILTER_MSA_H_
+#define VPX_DSP_LOOPFILTER_MSA_H_
 
 #include "vpx_dsp/mips/macros_msa.h"
 
@@ -243,4 +243,4 @@
   mask_out = limit_in < (v16u8)mask_out;                         \
   mask_out = __msa_xori_b(mask_out, 0xff);                       \
 }
-#endif  /* VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_ */
+#endif  /* VPX_DSP_LOOPFILTER_MSA_H_ */
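[Note, not part of the patch: the next file adds MSA kernels for compound-prediction ("avg") variance. Each avg_sse_diff_* helper rounds-averages the source block with sec_pred (__msa_aver_u_b), then accumulates the SSE and the signed sum of differences against the reference; callers turn that into variance as sse - sum^2 / (width * height) via the VARIANCE_*WxH macros visible in the hunk context. A scalar sketch of the helper contract, matching the semantics of the vector code (the names here are illustrative, not from the patch):

    static uint32_t avg_sse_diff_ref(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     const uint8_t *sec_pred,
                                     int width, int height, int *diff) {
      uint32_t sse = 0;
      int sum = 0, x, y;
      for (y = 0; y < height; ++y) {
        for (x = 0; x < width; ++x) {
          const int avg = (src[x] + sec_pred[x] + 1) >> 1;  /* aver_u_b */
          const int d = avg - ref[x];
          sum += d;
          sse += d * d;
        }
        src += src_stride;
        ref += ref_stride;
        sec_pred += width;  /* second predictor is packed contiguously */
      }
      *diff = sum;
      return sse;
    }
]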
diff --git a/vpx_dsp/mips/sub_pixel_variance_msa.c b/vpx_dsp/mips/sub_pixel_variance_msa.c
index 5168192..798c343 100644
--- a/vpx_dsp/mips/sub_pixel_variance_msa.c
+++ b/vpx_dsp/mips/sub_pixel_variance_msa.c
@@ -41,6 +41,357 @@
 #define VARIANCE_LARGE_WxH(sse, diff, shift) \
   sse - (((int64_t)diff * diff) >> shift)
 
+static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
+                                        int32_t src_stride,
+                                        const uint8_t *ref_ptr,
+                                        int32_t ref_stride,
+                                        const uint8_t *sec_pred,
+                                        int32_t height,
+                                        int32_t *diff) {
+  int32_t ht_cnt;
+  uint32_t src0, src1, src2, src3;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 pred, src = { 0 };
+  v16u8 ref = { 0 };
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    LW4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+
+    INSERT_W4_UB(src0, src1, src2, src3, src);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+
+    src = __msa_aver_u_b(src, pred);
+    CALC_MSE_AVG_B(src, ref, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
+                                        int32_t src_stride,
+                                        const uint8_t *ref_ptr,
+                                        int32_t ref_stride,
+                                        const uint8_t *sec_pred,
+                                        int32_t height,
+                                        int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 pred0, pred1;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+
+    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+                src0, src1, ref0, ref1);
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
+                                         int32_t src_stride,
+                                         const uint8_t *ref_ptr,
+                                         int32_t ref_stride,
+                                         const uint8_t *sec_pred,
+                                         int32_t height,
+                                         int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src, ref, pred;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    src = __msa_aver_u_b(src, pred);
+    CALC_MSE_AVG_B(src, ref, var, avg);
+
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    src = __msa_aver_u_b(src, pred);
+    CALC_MSE_AVG_B(src, ref, var, avg);
+
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    src = __msa_aver_u_b(src, pred);
+    CALC_MSE_AVG_B(src, ref, var, avg);
+
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    ref = LD_UB(ref_ptr);
+    ref_ptr += ref_stride;
+    src = __msa_aver_u_b(src, pred);
+    CALC_MSE_AVG_B(src, ref, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
+                                         int32_t src_stride,
+                                         const uint8_t *ref_ptr,
+                                         int32_t ref_stride,
+                                         const uint8_t *sec_pred,
+                                         int32_t height,
+                                         int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, ref0, ref1, pred0, pred1;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
+                                       int32_t src_stride,
+                                       const uint8_t *ref_ptr,
+                                       int32_t ref_stride,
+                                       const uint8_t *sec_pred,
+                                       int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, ref0, ref1, pred0, pred1;
+  v8i16 avg0 = { 0 };
+  v8i16 avg1 = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = 16; ht_cnt--;) {
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB2(src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+  }
+
+  vec = __msa_hadd_s_w(avg0, avg0);
+  vec += __msa_hadd_s_w(avg1, avg1);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
+                                       int32_t src_stride,
+                                       const uint8_t *ref_ptr,
+                                       int32_t ref_stride,
+                                       const uint8_t *sec_pred,
+                                       int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 pred0, pred1, pred2, pred3;
+  v8i16 avg0 = { 0 };
+  v8i16 avg1 = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = 16; ht_cnt--;) {
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+                src0, src1, src2, src3);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src2, ref2, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src3, ref3, var, avg1);
+
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+                src0, src1, src2, src3);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src2, ref2, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src3, ref3, var, avg1);
+  }
+
+  vec = __msa_hadd_s_w(avg0, avg0);
+  vec += __msa_hadd_s_w(avg1, avg1);
+
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
+                                       int32_t src_stride,
+                                       const uint8_t *ref_ptr,
+                                       int32_t ref_stride,
+                                       const uint8_t *sec_pred,
+                                       int32_t *diff) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 pred0, pred1, pred2, pred3;
+  v8i16 avg0 = { 0 };
+  v8i16 avg1 = { 0 };
+  v8i16 avg2 = { 0 };
+  v8i16 avg3 = { 0 };
+  v4i32 vec, var = { 0 };
+
+  for (ht_cnt = 32; ht_cnt--;) {
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+                src0, src1, src2, src3);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+
+    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+    src_ptr += src_stride;
+    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+                src0, src1, src2, src3);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+  }
+
+  vec = __msa_hadd_s_w(avg0, avg0);
+  vec += __msa_hadd_s_w(avg1, avg1);
+  vec += __msa_hadd_s_w(avg2, avg2);
+  vec += __msa_hadd_s_w(avg3, avg3);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
 static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
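[Note, not part of the patch: the second hunk adds the sub_pixel_avg_* variants, where the source is first interpolated with a 2-tap bilinear filter, horizontally, vertically, or both, before the rounding average with sec_pred and the SSE/sum accumulation. A scalar model of one horizontal row (a sketch, assuming FILTER_BITS == 7 as in vpx_dsp and that filter holds the two bilinear taps, which sum to 1 << FILTER_BITS):

    static void bilinear_h_row(const uint8_t *src, uint8_t *out,
                               const uint8_t *filter, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        const int v = (src[x] * filter[0] + src[x + 1] * filter[1] +
                       (1 << (FILTER_BITS - 1))) >> FILTER_BITS; /* SRARI */
        out[x] = v > 255 ? 255 : v;  /* MIN_UH / SAT_UH clamp */
      }
    }

The vertical variants apply the same taps across rows, and the _hv variants feed the horizontal output into the vertical pass.]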
@@ -691,6 +1042,756 @@
   return sse;
 }
 
+static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    const uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    const uint8_t *sec_pred,
+                                                    const uint8_t *filter,
+                                                    int32_t height,
+                                                    int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 out, pred, filt0, ref = { 0 };
+  v16i8 src0, src1, src2, src3;
+  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 vec0, vec1, vec2, vec3;
+  v8u16 const255;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  const255 = (v8u16)__msa_ldi_h(255);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+    MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
+                src0, src1, src2, src3);
+    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
+    out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
+    out = __msa_aver_u_b(out, pred);
+    CALC_MSE_AVG_B(out, ref, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    const uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    const uint8_t *sec_pred,
+                                                    const uint8_t *filter,
+                                                    int32_t height,
+                                                    int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 out, pred, filt0;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16i8 src0, src1, src2, src3;
+  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 vec0, vec1, vec2, vec3;
+  v8u16 const255;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  const255 = (v8u16)__msa_ldi_h(255);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+    MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
+                src0, src1, src2, src3);
+    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
+
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    out = __msa_aver_u_b(out, pred);
+    CALC_MSE_AVG_B(out, ref0, var, avg);
+    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    out = __msa_aver_u_b(out, pred);
+    CALC_MSE_AVG_B(out, ref1, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t subpel_avg_ssediff_16w_h_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             const uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const uint8_t *sec_pred,
+                                             const uint8_t *filter,
+                                             int32_t height,
+                                             int32_t *diff,
+                                             int32_t width) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v16u8 dst0, dst1, dst2, dst3;
+  v16u8 tmp0, tmp1, tmp2, tmp3;
+  v16u8 pred0, pred1, pred2, pred3, filt0;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
+  v8u16 const255;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  const255 = (v8u16)__msa_ldi_h(255);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    dst += (4 * dst_stride);
+    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
+    sec_pred += (4 * width);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                out0, out1, out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                out4, out5, out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+    MIN_UH4_UH(out0, out1, out2, out3, const255);
+    MIN_UH4_UH(out4, out5, out6, out7, const255);
+    PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6,
+                tmp0, tmp1, tmp2, tmp3);
+    AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3,
+                tmp0, tmp1, tmp2, tmp3);
+
+    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
+    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
+    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
+    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     const uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     const uint8_t *sec_pred,
+                                                     const uint8_t *filter,
+                                                     int32_t height,
+                                                     int32_t *diff) {
+  return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
+                                      sec_pred, filter, height, diff, 16);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     const uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     const uint8_t *sec_pred,
+                                                     const uint8_t *filter,
+                                                     int32_t height,
+                                                     int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[2];
+
+  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+    sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
+                                        sec_pred, filter, height,
+                                        &diff0[loop_cnt], 32);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     const uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     const uint8_t *sec_pred,
+                                                     const uint8_t *filter,
+                                                     int32_t height,
+                                                     int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
+                                        sec_pred, filter, height,
+                                        &diff0[loop_cnt], 64);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    const uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    const uint8_t *sec_pred,
+                                                    const uint8_t *filter,
+                                                    int32_t height,
+                                                    int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 src10_r, src32_r, src21_r, src43_r;
+  v16u8 out, pred, ref = { 0 };
+  v16u8 src2110, src4332, filt0;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  v8u16 tmp0, tmp1;
+
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);
+    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+
+    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    out = __msa_aver_u_b(out, pred);
+    CALC_MSE_AVG_B(out, ref, var, avg);
+    src0 = src4;
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(const uint8_t *src,
+                                                    int32_t src_stride,
+                                                    const uint8_t *dst,
+                                                    int32_t dst_stride,
+                                                    const uint8_t *sec_pred,
+                                                    const uint8_t *filter,
+                                                    int32_t height,
+                                                    int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 pred0, pred1, filt0;
+  v8u16 vec0, vec1, vec2, vec3;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
+               vec0, vec1, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    src0 = src4;
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t subpel_avg_ssediff_16w_v_msa(const uint8_t *src,
+                                             int32_t src_stride,
+                                             const uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const uint8_t *sec_pred,
+                                             const uint8_t *filter,
+                                             int32_t height,
+                                             int32_t *diff,
+                                             int32_t width) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 pred0, pred1, pred2, pred3;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 out0, out1, out2, out3, filt0;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
+    sec_pred += (4 * width);
+
+    ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
+    ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+    src0 = src4;
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
+                out0, out1, out2, out3);
+
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+    CALC_MSE_AVG_B(out2, ref2, var, avg);
+    CALC_MSE_AVG_B(out3, ref3, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     const uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     const uint8_t *sec_pred,
+                                                     const uint8_t *filter,
+                                                     int32_t height,
+                                                     int32_t *diff) {
+  return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
+                                      sec_pred, filter, height, diff, 16);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     const uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     const uint8_t *sec_pred,
+                                                     const uint8_t *filter,
+                                                     int32_t height,
+                                                     int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[2];
+
+  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+    sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
+                                        sec_pred, filter, height,
+                                        &diff0[loop_cnt], 32);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(const uint8_t *src,
+                                                     int32_t src_stride,
+                                                     const uint8_t *dst,
+                                                     int32_t dst_stride,
+                                                     const uint8_t *sec_pred,
+                                                     const uint8_t *filter,
+                                                     int32_t height,
+                                                     int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
+                                        sec_pred, filter, height,
+                                        &diff0[loop_cnt], 64);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
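+/* The _hv kernels below chain both 2-tap passes: HORIZ_2TAP_FILT_UH yields
+ * the horizontally filtered rows, one row of state is carried across loop
+ * iterations (src0 = src4), and the vertical taps are applied to
+ * byte-interleaved pairs of consecutive filtered rows before the rounding
+ * average with sec_pred. */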
+static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
+  const uint8_t *src, int32_t src_stride,
+  const uint8_t *dst, int32_t dst_stride,
+  const uint8_t *sec_pred,
+  const uint8_t *filter_horiz, const uint8_t *filter_vert,
+  int32_t height, int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+  v16u8 filt_hz, filt_vt, vec0, vec1;
+  v16u8 out, pred, ref = { 0 };
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter_horiz);
+  filt_hz = (v16u8)__msa_fill_h(filtval);
+  filtval = LH(filter_vert);
+  filt_vt = (v16u8)__msa_fill_h(filtval);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    pred = LD_UB(sec_pred);
+    sec_pred += 16;
+    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    out = __msa_aver_u_b(out, pred);
+    CALC_MSE_AVG_B(out, ref, var, avg);
+    src0 = src4;
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
+  const uint8_t *src, int32_t src_stride,
+  const uint8_t *dst, int32_t dst_stride,
+  const uint8_t *sec_pred,
+  const uint8_t *filter_horiz, const uint8_t *filter_vert,
+  int32_t height, int32_t *diff) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 pred0, pred1, out0, out1;
+  v16u8 filt_hz, filt_vt, vec0;
+  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter_horiz);
+  filt_hz = (v16u8)__msa_fill_h(filtval);
+  filtval = LH(filter_vert);
+  filt_vt = (v16u8)__msa_fill_h(filtval);
+
+  src0 = LD_UB(src);
+  src += src_stride;
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    LD_UB2(sec_pred, 16, pred0, pred1);
+    sec_pred += 32;
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);
+
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t subpel_avg_ssediff_16w_hv_msa(const uint8_t *src,
+                                              int32_t src_stride,
+                                              const uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const uint8_t *sec_pred,
+                                              const uint8_t *filter_horiz,
+                                              const uint8_t *filter_vert,
+                                              int32_t height,
+                                              int32_t *diff,
+                                              int32_t width) {
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 pred0, pred1, pred2, pred3;
+  v16u8 out0, out1, out2, out3;
+  v16u8 filt_hz, filt_vt, vec0, vec1;
+  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter_horiz);
+  filt_hz = (v16u8)__msa_fill_h(filtval);
+  filtval = LH(filter_vert);
+  filt_vt = (v16u8)__msa_fill_h(filtval);
+
+  LD_UB2(src, 8, src0, src1);
+  src += src_stride;
+
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_UB4(src, src_stride, src0, src2, src4, src6);
+    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
+    sec_pred += (4 * width);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
+                out0, out1, out2, out3);
+
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+    CALC_MSE_AVG_B(out2, ref2, var, avg);
+    CALC_MSE_AVG_B(out3, ref3, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);
+
+  return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
+  const uint8_t *src, int32_t src_stride,
+  const uint8_t *dst, int32_t dst_stride,
+  const uint8_t *sec_pred,
+  const uint8_t *filter_horiz, const uint8_t *filter_vert,
+  int32_t height, int32_t *diff) {
+  return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
+                                       sec_pred, filter_horiz, filter_vert,
+                                       height, diff, 16);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
+  const uint8_t *src, int32_t src_stride,
+  const uint8_t *dst, int32_t dst_stride,
+  const uint8_t *sec_pred,
+  const uint8_t *filter_horiz, const uint8_t *filter_vert,
+  int32_t height, int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[2];
+
+  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
+                                         sec_pred, filter_horiz, filter_vert,
+                                         height, &diff0[loop_cnt], 32);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
+  const uint8_t *src, int32_t src_stride,
+  const uint8_t *dst, int32_t dst_stride,
+  const uint8_t *sec_pred,
+  const uint8_t *filter_horiz, const uint8_t *filter_vert,
+  int32_t height, int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
+                                         sec_pred, filter_horiz, filter_vert,
+                                         height, &diff0[loop_cnt], 64);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
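The 32- and 64-wide HV wrappers above tile the block into 16-pixel-wide columns and reuse the 16-wide kernel, with the extra width argument serving as the stride of the line-packed second predictor. The same tiling in generic form (illustrative name):

    static uint32_t tiled_sse_diff_hv(const uint8_t *src, int32_t src_stride,
                                      const uint8_t *dst, int32_t dst_stride,
                                      const uint8_t *sec_pred,
                                      const uint8_t *fh, const uint8_t *fv,
                                      int32_t height, int32_t width,
                                      int32_t *diff) {
      uint32_t sse = 0;
      int32_t col, d;
      *diff = 0;
      for (col = 0; col < width; col += 16) {
        sse += subpel_avg_ssediff_16w_hv_msa(src + col, src_stride, dst + col,
                                             dst_stride, sec_pred + col,
                                             fh, fv, height, &d, width);
        *diff += d;
      }
      return sse;
    }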
 #define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
 #define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
 #define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
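Assuming the usual libvpx definition of VARIANCE_WxH, the wrappers above reduce to the integer identity

    variance = sse - ((diff * diff) >> shift),  where shift = log2(w * h)

i.e. the sum of squared differences minus the squared difference sum scaled by the pixel count, so the shift argument is log2 of the block size: 4x4 -> 4, 4x8 and 8x4 -> 5, and so on.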
@@ -765,3 +1866,138 @@
 
 VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
 VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
+
+#define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht)                          \
+uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa(                         \
+  const uint8_t *src_ptr, int32_t src_stride,                                 \
+  int32_t xoffset, int32_t yoffset,                                           \
+  const uint8_t *ref_ptr, int32_t ref_stride,                                 \
+  uint32_t *sse, const uint8_t *sec_pred) {                                   \
+  int32_t diff;                                                               \
+  const uint8_t *h_filter = bilinear_filters_msa[xoffset];                    \
+  const uint8_t *v_filter = bilinear_filters_msa[yoffset];                    \
+                                                                              \
+  if (yoffset) {                                                              \
+    if (xoffset) {                                                            \
+      *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(src_ptr, src_stride,   \
+                                                       ref_ptr, ref_stride,   \
+                                                       sec_pred, h_filter,    \
+                                                       v_filter, ht, &diff);  \
+    } else {                                                                  \
+      *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(src_ptr, src_stride,    \
+                                                      ref_ptr, ref_stride,    \
+                                                      sec_pred, v_filter,     \
+                                                      ht, &diff);             \
+    }                                                                         \
+  } else {                                                                    \
+    if (xoffset) {                                                            \
+      *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(src_ptr, src_stride,    \
+                                                      ref_ptr, ref_stride,    \
+                                                      sec_pred, h_filter,     \
+                                                      ht, &diff);             \
+    } else {                                                                  \
+      *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride,                \
+                                          ref_ptr, ref_stride,                \
+                                          sec_pred, ht, &diff);               \
+    }                                                                         \
+  }                                                                           \
+                                                                              \
+  return VARIANCE_##wd##Wx##ht##H(*sse, diff);                                \
+}
+
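A usage sketch for one of the functions the macro above emits (buffer names are hypothetical; xoffset and yoffset index bilinear_filters_msa):

    uint32_t sse;
    const uint32_t var = vpx_sub_pixel_avg_variance16x16_msa(
        src, src_stride, 2 /* xoffset */, 3 /* yoffset */,
        ref, ref_stride, &sse, second_pred);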
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);
+
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);
+
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);
+
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
+VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);
+
+uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
+                                             int32_t src_stride,
+                                             int32_t xoffset,
+                                             int32_t yoffset,
+                                             const uint8_t *ref_ptr,
+                                             int32_t ref_stride,
+                                             uint32_t *sse,
+                                             const uint8_t *sec_pred) {
+  int32_t diff;
+  const uint8_t *h_filter = bilinear_filters_msa[xoffset];
+  const uint8_t *v_filter = bilinear_filters_msa[yoffset];
+
+  if (yoffset) {
+    if (xoffset) {
+      *sse = sub_pixel_avg_sse_diff_32width_hv_msa(src_ptr, src_stride,
+                                                   ref_ptr, ref_stride,
+                                                   sec_pred, h_filter,
+                                                   v_filter, 64, &diff);
+    } else {
+      *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride,
+                                                  ref_ptr, ref_stride,
+                                                  sec_pred, v_filter,
+                                                  64, &diff);
+    }
+  } else {
+    if (xoffset) {
+      *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride,
+                                                  ref_ptr, ref_stride,
+                                                  sec_pred, h_filter,
+                                                  64, &diff);
+    } else {
+      *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
+                                    sec_pred, &diff);
+    }
+  }
+
+  return VARIANCE_32Wx64H(*sse, diff);
+}
+
+#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht)                          \
+uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa(const uint8_t *src_ptr,     \
+                                                 int32_t src_stride,         \
+                                                 int32_t xoffset,            \
+                                                 int32_t yoffset,            \
+                                                 const uint8_t *ref_ptr,     \
+                                                 int32_t ref_stride,         \
+                                                 uint32_t *sse,              \
+                                                 const uint8_t *sec_pred) {  \
+  int32_t diff;                                                              \
+  const uint8_t *h_filter = bilinear_filters_msa[xoffset];                   \
+  const uint8_t *v_filter = bilinear_filters_msa[yoffset];                   \
+                                                                             \
+  if (yoffset) {                                                             \
+    if (xoffset) {                                                           \
+      *sse = sub_pixel_avg_sse_diff_64width_hv_msa(src_ptr, src_stride,      \
+                                                   ref_ptr, ref_stride,      \
+                                                   sec_pred, h_filter,       \
+                                                   v_filter, ht, &diff);     \
+    } else {                                                                 \
+      *sse = sub_pixel_avg_sse_diff_64width_v_msa(src_ptr, src_stride,       \
+                                                  ref_ptr, ref_stride,       \
+                                                  sec_pred, v_filter,        \
+                                                  ht, &diff);                \
+    }                                                                        \
+  } else {                                                                   \
+    if (xoffset) {                                                           \
+      *sse = sub_pixel_avg_sse_diff_64width_h_msa(src_ptr, src_stride,       \
+                                                  ref_ptr, ref_stride,       \
+                                                  sec_pred, h_filter,        \
+                                                  ht, &diff);                \
+    } else {                                                                 \
+      *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride,                 \
+                                        ref_ptr, ref_stride,                 \
+                                        sec_pred, &diff);                    \
+    }                                                                        \
+  }                                                                          \
+                                                                             \
+  return VARIANCE_64Wx##ht##H(*sse, diff);                                   \
+}
+
+VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
+VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);
diff --git a/vp9/common/vp9_prob.c b/vpx_dsp/prob.c
similarity index 83%
rename from vp9/common/vp9_prob.c
rename to vpx_dsp/prob.c
index 3b7b9bf..639d24d 100644
--- a/vp9/common/vp9_prob.c
+++ b/vpx_dsp/prob.c
@@ -8,9 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp9/common/vp9_prob.h"
+#include "./prob.h"
 
-const uint8_t vp9_norm[256] = {
+const uint8_t vpx_norm[256] = {
   0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@@ -30,10 +30,10 @@
 };
 
 static unsigned int tree_merge_probs_impl(unsigned int i,
-                                          const vp9_tree_index *tree,
-                                          const vp9_prob *pre_probs,
+                                          const vpx_tree_index *tree,
+                                          const vpx_prob *pre_probs,
                                           const unsigned int *counts,
-                                          vp9_prob *probs) {
+                                          vpx_prob *probs) {
   const int l = tree[i];
   const unsigned int left_count = (l <= 0)
                  ? counts[-l]
@@ -47,7 +47,7 @@
   return left_count + right_count;
 }
 
-void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
-                          const unsigned int *counts, vp9_prob *probs) {
+void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs,
+                          const unsigned int *counts, vpx_prob *probs) {
   tree_merge_probs_impl(0, tree, pre_probs, counts, probs);
 }
diff --git a/vp9/common/vp9_prob.h b/vpx_dsp/prob.h
similarity index 70%
rename from vp9/common/vp9_prob.h
rename to vpx_dsp/prob.h
index c69c62c..729f90a 100644
--- a/vp9/common/vp9_prob.h
+++ b/vpx_dsp/prob.h
@@ -8,64 +8,63 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_PROB_H_
-#define VP9_COMMON_VP9_PROB_H_
+#ifndef VPX_DSP_PROB_H_
+#define VPX_DSP_PROB_H_
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_common.h"
 
 #include "vpx_ports/mem.h"
 
-#include "vp9/common/vp9_common.h"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-typedef uint8_t vp9_prob;
+typedef uint8_t vpx_prob;
 
 #define MAX_PROB 255
 
-#define vp9_prob_half ((vp9_prob) 128)
+#define vpx_prob_half ((vpx_prob) 128)
 
-typedef int8_t vp9_tree_index;
+typedef int8_t vpx_tree_index;
 
 #define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2)
 
-#define vp9_complement(x) (255 - x)
+#define vpx_complement(x) (255 - x)
 
 #define MODE_MV_COUNT_SAT 20
 
 /* We build coding trees compactly in arrays.
-   Each node of the tree is a pair of vp9_tree_indices.
+   Each node of the tree is a pair of vpx_tree_indices.
    Array index often references a corresponding probability table.
    Index <= 0 means done encoding/decoding and value = -Index,
    Index > 0 means need another bit, specification at index.
    Nonnegative indices are always even;  processing begins at node 0. */
 
-typedef const vp9_tree_index vp9_tree[];
+typedef const vpx_tree_index vpx_tree[];
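An illustration of the layout described above, with hypothetical token names; leaves are stored negated and interior links are even indices:

    enum { TOKEN_A, TOKEN_B, TOKEN_C };
    static const vpx_tree_index my_tree[TREE_SIZE(3)] = {
      -TOKEN_A, 2,         /* node 0: bit 0 -> A, bit 1 -> node 2 */
      -TOKEN_B, -TOKEN_C,  /* node 2: bit 0 -> B, bit 1 -> C */
    };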
 
-static INLINE vp9_prob clip_prob(int p) {
+static INLINE vpx_prob clip_prob(int p) {
   return (p > 255) ? 255 : (p < 1) ? 1 : p;
 }
 
-static INLINE vp9_prob get_prob(int num, int den) {
+static INLINE vpx_prob get_prob(int num, int den) {
   return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den);
 }
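Worked example: get_prob(1, 4) = (1 * 256 + 2) / 4 = 64, i.e. a 1-in-4 event maps to 64/256. get_prob(0, 4) evaluates to 0, which clip_prob() raises to 1 so the coder never sees an impossible symbol, and a zero denominator returns the neutral 128.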
 
-static INLINE vp9_prob get_binary_prob(int n0, int n1) {
+static INLINE vpx_prob get_binary_prob(int n0, int n1) {
   return get_prob(n0, n0 + n1);
 }
 
 /* This function assumes prob1 and prob2 are already within [1,255] range. */
-static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) {
+static INLINE vpx_prob weighted_prob(int prob1, int prob2, int factor) {
   return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
 }
 
-static INLINE vp9_prob merge_probs(vp9_prob pre_prob,
+static INLINE vpx_prob merge_probs(vpx_prob pre_prob,
                                    const unsigned int ct[2],
                                    unsigned int count_sat,
                                    unsigned int max_update_factor) {
-  const vp9_prob prob = get_binary_prob(ct[0], ct[1]);
+  const vpx_prob prob = get_binary_prob(ct[0], ct[1]);
   const unsigned int count = MIN(ct[0] + ct[1], count_sat);
   const unsigned int factor = max_update_factor * count / count_sat;
   return weighted_prob(pre_prob, prob, factor);
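Worked example, assuming count_sat = 20 and max_update_factor = 128: with pre_prob = 128 and ct = {30, 10}, prob = get_prob(30, 40) = 192, count saturates at 20 so factor = 128, and the result is weighted_prob(128, 192, 128) = (128 * 128 + 192 * 128 + 128) >> 8 = 160, i.e. halfway between the old and observed probabilities.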
@@ -77,7 +76,7 @@
   70, 76, 83, 89, 96, 102, 108, 115, 121, 128
 };
 
-static INLINE vp9_prob mode_mv_merge_probs(vp9_prob pre_prob,
+static INLINE vpx_prob mode_mv_merge_probs(vpx_prob pre_prob,
                                            const unsigned int ct[2]) {
   const unsigned int den = ct[0] + ct[1];
   if (den == 0) {
@@ -85,20 +84,20 @@
   } else {
     const unsigned int count = MIN(den, MODE_MV_COUNT_SAT);
     const unsigned int factor = count_to_update_factor[count];
-    const vp9_prob prob =
+    const vpx_prob prob =
         clip_prob(((int64_t)(ct[0]) * 256 + (den >> 1)) / den);
     return weighted_prob(pre_prob, prob, factor);
   }
 }
 
-void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
-                          const unsigned int *counts, vp9_prob *probs);
+void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs,
+                          const unsigned int *counts, vpx_prob *probs);
 
 
-DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
+DECLARE_ALIGNED(16, extern const uint8_t, vpx_norm[256]);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_PROB_H_
+#endif  // VPX_DSP_PROB_H_
diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c
new file mode 100644
index 0000000..137f5bc
--- /dev/null
+++ b/vpx_dsp/quantize.c
@@ -0,0 +1,337 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/quantize.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp9_quantize_dc(const tran_low_t *coeff_ptr,
+                     int n_coeffs, int skip_block,
+                     const int16_t *round_ptr, const int16_t quant,
+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  const int rc = 0;
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int tmp, eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+    tmp = (tmp * quant) >> 16;
+    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
+    if (tmp)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+
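Worked example with hypothetical values: coeff_ptr[0] = -5000, round_ptr[0] = 8, quant = 21845, dequant_ptr = 4 gives abs_coeff = 5000, tmp = (5008 * 21845) >> 16 = 1669, qcoeff_ptr[0] = -1669, dqcoeff_ptr[0] = -6676, and *eob_ptr = 1.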
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+                            int n_coeffs, int skip_block,
+                            const int16_t *round_ptr, const int16_t quant,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  int eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    const int coeff = coeff_ptr[0];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp = abs_coeff + round_ptr[0];
+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16);
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
+    if (abs_qcoeff)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
+void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  const int n_coeffs = 1024;
+  const int rc = 0;
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int tmp, eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+                INT16_MIN, INT16_MAX);
+    tmp = (tmp * quant) >> 15;
+    qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
+    if (tmp)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
+                                  int skip_block,
+                                  const int16_t *round_ptr,
+                                  const int16_t quant,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t dequant_ptr,
+                                  uint16_t *eob_ptr) {
+  const int n_coeffs = 1024;
+  int eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    const int coeff = coeff_ptr[0];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
+    if (abs_qcoeff)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
+void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                      int skip_block,
+                      const int16_t *zbin_ptr, const int16_t *round_ptr,
+                      const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                      const int16_t *dequant_ptr,
+                      uint16_t *eob_ptr,
+                      const int16_t *scan, const int16_t *iscan) {
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+        non_zero_count--;
+      else
+        break;
+    }
+
+    // Quantization pass: all coefficients with index >= non_zero_count are
+    // skippable. Note: non_zero_count can be zero.
+    for (i = 0; i < non_zero_count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      if (abs_coeff >= zbins[rc != 0]) {
+        int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+        tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+                  quant_shift_ptr[rc != 0]) >> 16;  // quantization
+        qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+        if (tmp)
+          eob = i;
+      }
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
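Note on the two-pass structure above: the backwards pre-scan trims trailing coefficients that fall strictly inside the dead zone |coeff| < zbin, so the forward loop quantizes only the potentially non-zero prefix of the scan order. *eob_ptr is one past the last non-zero quantized coefficient: if only scan positions 0 and 5 survive, eob = 5 and *eob_ptr = 6; if nothing survives, *eob_ptr = 0.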
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t *zbin_ptr,
+                             const int16_t *round_ptr, const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr,
+                             uint16_t *eob_ptr, const int16_t *scan,
+                             const int16_t *iscan) {
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+        non_zero_count--;
+      else
+        break;
+    }
+
+    // Quantization pass: all coefficients with index >= non_zero_count are
+    // skippable. Note: non_zero_count can be zero.
+    for (i = 0; i < non_zero_count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+      if (abs_coeff >= zbins[rc != 0]) {
+        const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
+        const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+        const uint32_t abs_qcoeff =
+            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
+        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+        if (abs_qcoeff)
+          eob = i;
+      }
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
+void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            int skip_block,
+                            const int16_t *zbin_ptr, const int16_t *round_ptr,
+                            const int16_t *quant_ptr,
+                            const int16_t *quant_shift_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t *dequant_ptr,
+                            uint16_t *eob_ptr,
+                            const int16_t *scan, const int16_t *iscan) {
+  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+
+  int idx = 0;
+  int idx_arr[1024];
+  int i, eob = -1;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      int tmp;
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+      tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
+               quant_shift_ptr[rc != 0]) >> 15;
+
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+
+      if (tmp)
+        eob = idx_arr[i];
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
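The 32x32 variant works at one extra bit of precision: zbin and round are halved with ROUND_POWER_OF_TWO(x, 1), the final shift is 15 instead of 16, and the dequantized value is halved, presumably to match the different output scaling of the 32x32 forward transform.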
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
+                                   intptr_t n_coeffs, int skip_block,
+                                   const int16_t *zbin_ptr,
+                                   const int16_t *round_ptr,
+                                   const int16_t *quant_ptr,
+                                   const int16_t *quant_shift_ptr,
+                                   tran_low_t *qcoeff_ptr,
+                                   tran_low_t *dqcoeff_ptr,
+                                   const int16_t *dequant_ptr,
+                                   uint16_t *eob_ptr,
+                                   const int16_t *scan, const int16_t *iscan) {
+  const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+                        ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
+  const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
+
+  int idx = 0;
+  int idx_arr[1024];
+  int i, eob = -1;
+  (void)iscan;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+      const int64_t tmp1 = abs_coeff
+                         + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+      if (abs_qcoeff)
+        eob = idx_arr[i];
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h
new file mode 100644
index 0000000..0ad1744
--- /dev/null
+++ b/vpx_dsp/quantize.h
@@ -0,0 +1,51 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_QUANTIZE_H_
+#define VPX_DSP_QUANTIZE_H_
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_quantize_dc(const tran_low_t *coeff_ptr,
+                     int n_coeffs, int skip_block,
+                     const int16_t *round_ptr, const int16_t quant,
+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                     const int16_t dequant_ptr, uint16_t *eob_ptr);
+void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+                            int n_coeffs, int skip_block,
+                            const int16_t *round_ptr, const int16_t quant,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant_ptr, uint16_t *eob_ptr);
+void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
+                                  int skip_block,
+                                  const int16_t *round_ptr,
+                                  const int16_t quant,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t dequant_ptr,
+                                  uint16_t *eob_ptr);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_QUANTIZE_H_
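A usage sketch with hypothetical values, quantizing the DC coefficient of a 4x4 block:

    #include "vpx_dsp/quantize.h"

    tran_low_t coeff[16] = { -5000 };   /* DC only, AC terms zero */
    tran_low_t qcoeff[16], dqcoeff[16];
    const int16_t round[2] = { 8, 8 };  /* DC, AC rounding */
    uint16_t eob;
    vp9_quantize_dc(coeff, 16, 0 /* skip_block */, round, 21845 /* quant */,
                    qcoeff, dqcoeff, 4 /* dequant */, &eob);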
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 70a131c..07f2d6a 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -9,9 +9,86 @@
 ##
 
 DSP_SRCS-yes += vpx_dsp.mk
+DSP_SRCS-yes += vpx_dsp_common.h
 
 DSP_SRCS-$(HAVE_MSA)    += mips/macros_msa.h
 
+# bit reader
+DSP_SRCS-yes += prob.h
+DSP_SRCS-yes += prob.c
+
+ifeq ($(CONFIG_ENCODERS),yes)
+DSP_SRCS-yes += bitwriter.h
+DSP_SRCS-yes += bitwriter.c
+DSP_SRCS-yes += bitwriter_buffer.c
+DSP_SRCS-yes += bitwriter_buffer.h
+endif
+
+ifeq ($(CONFIG_DECODERS),yes)
+DSP_SRCS-yes += bitreader.h
+DSP_SRCS-yes += bitreader.c
+DSP_SRCS-yes += bitreader_buffer.c
+DSP_SRCS-yes += bitreader_buffer.h
+endif
+
+# loop filters
+DSP_SRCS-yes += loopfilter.c
+
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/loopfilter_sse2.c
+DSP_SRCS-$(HAVE_AVX2)                += x86/loopfilter_avx2.c
+DSP_SRCS-$(HAVE_MMX)                 += x86/loopfilter_mmx.asm
+
+DSP_SRCS-$(HAVE_NEON)   += arm/loopfilter_neon.c
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes  += arm/loopfilter_mb_neon$(ASM)
+DSP_SRCS-yes  += arm/loopfilter_16_neon$(ASM)
+DSP_SRCS-yes  += arm/loopfilter_8_neon$(ASM)
+DSP_SRCS-yes  += arm/loopfilter_4_neon$(ASM)
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes   += arm/loopfilter_16_neon.c
+DSP_SRCS-yes   += arm/loopfilter_8_neon.c
+DSP_SRCS-yes   += arm/loopfilter_4_neon.c
+endif  # HAVE_NEON
+endif  # HAVE_NEON_ASM
+
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_msa.h
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_16_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_8_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_4_msa.c
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_loopfilter_sse2.c
+endif  # CONFIG_VP9_HIGHBITDEPTH
+
+# forward transform
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+DSP_SRCS-yes            += fwd_txfm.c
+DSP_SRCS-yes            += fwd_txfm.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_impl_sse2.h
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSSE3)  += x86/fwd_txfm_ssse3.asm
+endif
+DSP_SRCS-$(HAVE_NEON)   += arm/fwd_txfm_neon.c
+endif  # CONFIG_VP9_ENCODER
+
+# quantization
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+DSP_SRCS-yes            += quantize.c
+DSP_SRCS-yes            += quantize.h
+
+DSP_SRCS-$(HAVE_SSE2)   += x86/quantize_sse2.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
+endif
+ifeq ($(ARCH_X86_64),yes)
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm
+endif
+endif
+endif  # CONFIG_VP9_ENCODER
+
 ifeq ($(CONFIG_ENCODERS),yes)
 DSP_SRCS-yes            += sad.c
 DSP_SRCS-yes            += subtract.c
diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h
new file mode 100644
index 0000000..6793036
--- /dev/null
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -0,0 +1,57 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_COMMON_H_
+#define VPX_DSP_COMMON_H_
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Note:
+// tran_low_t  is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+#else
+// Note:
+// tran_low_t  is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int32_t tran_high_t;
+typedef int16_t tran_low_t;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE uint8_t clip_pixel(int val) {
+  return (val > 255) ? 255 : (val < 0) ? 0 : val;
+}
+
+static INLINE int clamp(int value, int low, int high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE double fclamp(double value, double low, double high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_COMMON_H_
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 8e4e966..8a17440 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -5,12 +5,18 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 
 EOF
 }
 forward_decls qw/vpx_dsp_forward_decls/;
 
-# Functions which use x86inc.asm instead of x86_abi_support.asm
+# x86inc.asm has specific constraints, so break it out to make it easy to
+# disable. Zero all the variables to avoid tricky else conditions.
+$mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc = $avx_x86inc =
+  $avx2_x86inc = '';
+$mmx_x86_64_x86inc = $sse_x86_64_x86inc = $sse2_x86_64_x86inc =
+  $ssse3_x86_64_x86inc = $avx_x86_64_x86inc = $avx2_x86_64_x86inc = '';
 if (vpx_config("CONFIG_USE_X86INC") eq "yes") {
   $mmx_x86inc = 'mmx';
   $sse_x86inc = 'sse';
@@ -18,23 +24,165 @@
   $ssse3_x86inc = 'ssse3';
   $avx_x86inc = 'avx';
   $avx2_x86inc = 'avx2';
-} else {
-  $mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc =
-  $avx_x86inc = $avx2_x86inc = '';
+  if ($opts{arch} eq "x86_64") {
+    $mmx_x86_64_x86inc = 'mmx';
+    $sse_x86_64_x86inc = 'sse';
+    $sse2_x86_64_x86inc = 'sse2';
+    $ssse3_x86_64_x86inc = 'ssse3';
+    $avx_x86_64_x86inc = 'avx';
+    $avx2_x86_64_x86inc = 'avx2';
+  }
 }
 
-# Functions which are 64 bit only.
+# Functions that are 64-bit only.
+$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
 if ($opts{arch} eq "x86_64") {
   $mmx_x86_64 = 'mmx';
   $sse2_x86_64 = 'sse2';
   $ssse3_x86_64 = 'ssse3';
   $avx_x86_64 = 'avx';
   $avx2_x86_64 = 'avx2';
-} else {
-  $mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 =
-  $avx_x86_64 = $avx2_x86_64 = '';
 }
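rtcd drops empty specializations, so a line such as

    specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64_x86inc";

simply loses its SSSE3 variant when CONFIG_USE_X86INC is off or the target is not x86_64; zero-initializing every variable up front avoids repeating each specialize call inside else branches.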
 
+#
+# Loopfilter
+#
+add_proto qw/void vpx_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_vertical_16 sse2 neon_asm msa/;
+$vpx_lpf_vertical_16_neon_asm=vpx_lpf_vertical_16_neon;
+
+add_proto qw/void vpx_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_vertical_16_dual sse2 neon_asm msa/;
+$vpx_lpf_vertical_16_dual_neon_asm=vpx_lpf_vertical_16_dual_neon;
+
+add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_vertical_8 sse2 neon msa/;
+
+add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm msa/;
+$vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
+
+add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_vertical_4 mmx neon msa/;
+
+add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_vertical_4_dual sse2 neon msa/;
+
+add_proto qw/void vpx_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon_asm msa/;
+$vpx_lpf_horizontal_16_neon_asm=vpx_lpf_horizontal_16_neon;
+
+add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_horizontal_8 sse2 neon msa/;
+
+add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm msa/;
+$vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
+
+add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vpx_lpf_horizontal_4 mmx neon msa/;
+
+add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vpx_lpf_horizontal_4_dual sse2 neon msa/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vpx_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_16 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_16_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_8 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_8_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_4 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_vertical_4_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_16 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_8 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_8_dual sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_4 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_4_dual sse2/;
+}  # CONFIG_VP9_HIGHBITDEPTH
+
+#
+# Encoder functions.
+#
+
+#
+# Forward transform
+#
+if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct4x4 sse2/;
+
+  add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct8x8 sse2/;
+
+  add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct16x16 sse2/;
+
+  add_proto qw/void vp9_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_highbd_fdct4x4 sse2/;
+
+  add_proto qw/void vp9_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_highbd_fdct8x8 sse2/;
+
+  add_proto qw/void vp9_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_highbd_fdct16x16 sse2/;
+} else {
+  add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct4x4 sse2/;
+
+  add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64_x86inc";
+
+  add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct16x16 sse2/;
+}  # CONFIG_VP9_HIGHBITDEPTH
+}  # CONFIG_VP9_ENCODER
+
+#
+# Quantization
+#
+if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b/;
+
+  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b_32x32/;
+
+  add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_highbd_quantize_b sse2/;
+
+  add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_highbd_quantize_b_32x32 sse2/;
+} else {
+  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b sse2/, "$ssse3_x86_64_x86inc";
+
+  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64_x86inc";
+}  # CONFIG_VP9_HIGHBITDEPTH
+}  # CONFIG_VP9_ENCODER
+
 if (vpx_config("CONFIG_ENCODERS") eq "yes") {
 #
 # Block subtraction
@@ -526,43 +674,43 @@
   specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance32x32 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance16x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance8x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
 
 #
 # Specialty Subpixel
diff --git a/vp9/encoder/x86/vp9_dct_sse2_impl.h b/vpx_dsp/x86/fwd_txfm_impl_sse2.h
similarity index 98%
rename from vp9/encoder/x86/vp9_dct_sse2_impl.h
rename to vpx_dsp/x86/fwd_txfm_impl_sse2.h
index 86e9ecf..f67bb7a 100644
--- a/vp9/encoder/x86/vp9_dct_sse2_impl.h
+++ b/vpx_dsp/x86/fwd_txfm_impl_sse2.h
@@ -10,12 +10,15 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_idct.h"  // for cospi constants
 #include "vp9/encoder/vp9_dct.h"
 #include "vp9/encoder/x86/vp9_dct_sse2.h"
 #include "vpx_ports/mem.h"
 
+// TODO(jingning) The high bit-depth functions need rework for performance.
+// After we properly fix the high bit-depth function implementations, this
+// file's dependency should be substantially simplified.
 #if DCT_HIGH_BIT_DEPTH
 #define ADD_EPI16 _mm_adds_epi16
 #define SUB_EPI16 _mm_subs_epi16
diff --git a/vpx_dsp/x86/fwd_txfm_sse2.c b/vpx_dsp/x86/fwd_txfm_sse2.c
new file mode 100644
index 0000000..a868d22
--- /dev/null
+++ b/vpx_dsp/x86/fwd_txfm_sse2.c
@@ -0,0 +1,34 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+
+#define DCT_HIGH_BIT_DEPTH 0
+
+#define FDCT4x4_2D vp9_fdct4x4_sse2
+#define FDCT8x8_2D vp9_fdct8x8_sse2
+#define FDCT16x16_2D vp9_fdct16x16_sse2
+#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"
+#undef  FDCT4x4_2D
+#undef  FDCT8x8_2D
+#undef  FDCT16x16_2D
+#undef  DCT_HIGH_BIT_DEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define DCT_HIGH_BIT_DEPTH 1
+#define FDCT4x4_2D vp9_highbd_fdct4x4_sse2
+#define FDCT8x8_2D vp9_highbd_fdct8x8_sse2
+#define FDCT16x16_2D vp9_highbd_fdct16x16_sse2
+#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT
+#undef  FDCT4x4_2D
+#undef  FDCT8x8_2D
+#undef  FDCT16x16_2D
+#undef  DCT_HIGH_BIT_DEPTH
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/fwd_txfm_ssse3.asm b/vpx_dsp/x86/fwd_txfm_ssse3.asm
new file mode 100644
index 0000000..5f63546
--- /dev/null
+++ b/vpx_dsp/x86/fwd_txfm_ssse3.asm
@@ -0,0 +1,182 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+%include "third_party/x86inc/x86inc.asm"
+
+; This file provides an SSSE3 version of the forward transformation. Some
+; of the macro definitions were originally derived from the ffmpeg project.
+; The current version applies to x86 64-bit only.
+
+SECTION_RODATA
+
+pw_11585x2: times 8 dw 23170
+pd_8192:    times 4 dd 8192
+
+%macro TRANSFORM_COEFFS 2
+pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
+pw_%2_m%1:  dw  %2, -%1,  %2, -%1,  %2, -%1,  %2, -%1
+%endmacro
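+
+; Each TRANSFORM_COEFFS c1, c2 invocation emits the interleaved coefficient
+; rows pw_c1_c2 = {c1, c2, ...} and pw_c2_mc1 = {c2, -c1, ...} consumed by
+; BUTTERFLY_4X below. The constants are cos/sin values scaled by 2^14
+; (e.g. 11585 ~ 16384 * cos(pi/4)).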
+
+TRANSFORM_COEFFS 11585,  11585
+TRANSFORM_COEFFS 15137,   6270
+TRANSFORM_COEFFS 16069,   3196
+TRANSFORM_COEFFS  9102,  13623
+
+SECTION .text
+
+%if ARCH_X86_64
+%macro SUM_SUB 3
+  psubw  m%3, m%1, m%2
+  paddw  m%1, m%2
+  SWAP    %2, %3
+%endmacro
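+; After SUM_SUB a, b, t: register a holds a + b and register b holds a - b
+; (t is clobbered as scratch).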
+
+; butterfly operation
+%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
+  pmaddwd            m%1, m%3, %5
+  pmaddwd            m%2, m%3, %6
+  paddd              m%1,  %4
+  paddd              m%2,  %4
+  psrad              m%1,  14
+  psrad              m%2,  14
+%endmacro
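+; i.e. dstN = (madd(src, coefsN) + 8192) >> 14: a rounded Q14 fixed-point
+; multiply-accumulate (the round argument is pd_8192 at every call site).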
+
+%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
+  punpckhwd          m%6, m%2, m%1
+  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_%4_%3], [pw_%3_m%4]
+  punpcklwd          m%2, m%1
+  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_%4_%3], [pw_%3_m%4]
+  packssdw           m%1, m%7
+  packssdw           m%2, m%6
+%endmacro
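+; BUTTERFLY_4X a, b, c1, c2 rotates eight 16-bit lanes:
+;   a' = (c1*a + c2*b + 8192) >> 14,  b' = (c1*b - c2*a + 8192) >> 14
+; with the high word halves routed through the tmp registers and the low
+; halves computed in place.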
+
+; matrix transpose
+%macro INTERLEAVE_2X 4
+  punpckh%1          m%4, m%2, m%3
+  punpckl%1          m%2, m%3
+  SWAP               %3,  %4
+%endmacro
+
+%macro TRANSPOSE8X8 9
+  INTERLEAVE_2X  wd, %1, %2, %9
+  INTERLEAVE_2X  wd, %3, %4, %9
+  INTERLEAVE_2X  wd, %5, %6, %9
+  INTERLEAVE_2X  wd, %7, %8, %9
+
+  INTERLEAVE_2X  dq, %1, %3, %9
+  INTERLEAVE_2X  dq, %2, %4, %9
+  INTERLEAVE_2X  dq, %5, %7, %9
+  INTERLEAVE_2X  dq, %6, %8, %9
+
+  INTERLEAVE_2X  qdq, %1, %5, %9
+  INTERLEAVE_2X  qdq, %3, %7, %9
+  INTERLEAVE_2X  qdq, %2, %6, %9
+  INTERLEAVE_2X  qdq, %4, %8, %9
+
+  SWAP  %2, %5
+  SWAP  %4, %7
+%endmacro
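+; Standard 8x8 word transpose: three interleave passes (16-, 32-, then
+; 64-bit) plus two register swaps to restore row order.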
+
+; 1D forward 8x8 DCT transform
+%macro FDCT8_1D 1
+  SUM_SUB            0,  7,  9
+  SUM_SUB            1,  6,  9
+  SUM_SUB            2,  5,  9
+  SUM_SUB            3,  4,  9
+
+  SUM_SUB            0,  3,  9
+  SUM_SUB            1,  2,  9
+  SUM_SUB            6,  5,  9
+%if %1 == 0
+  SUM_SUB            0,  1,  9
+%endif
+
+  BUTTERFLY_4X       2,  3,  6270,  15137,  m8,  9,  10
+
+  pmulhrsw           m6, m12
+  pmulhrsw           m5, m12
+%if %1 == 0
+  pmulhrsw           m0, m12
+  pmulhrsw           m1, m12
+%else
+  BUTTERFLY_4X       1,  0,  11585, 11585,  m8,  9,  10
+  SWAP               0,  1
+%endif
+
+  SUM_SUB            4,  5,  9
+  SUM_SUB            7,  6,  9
+  BUTTERFLY_4X       4,  7,  3196,  16069,  m8,  9,  10
+  BUTTERFLY_4X       5,  6,  13623,  9102,  m8,  9,  10
+  SWAP               1,  4
+  SWAP               3,  6
+%endmacro
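+; The %1 flag distinguishes the two passes: the column pass (0) scales the
+; DC pair with the cheaper pmulhrsw (multiplying by pw_11585x2 is a rounded
+; multiply by cospi_16_64), while the row pass (1) uses the full 32-bit
+; rounded butterfly for the final outputs.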
+
+%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2
+  psraw              m%3, m%1, 15
+  psraw              m%4, m%2, 15
+  psubw              m%1, m%3
+  psubw              m%2, m%4
+  psraw              m%1, 1
+  psraw              m%2, 1
+%endmacro
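+; Computes (x + (x < 0)) >> 1: divide by two rounding toward zero, the final
+; scaling applied to the 8x8 forward transform output.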
+
+INIT_XMM ssse3
+cglobal fdct8x8, 3, 5, 13, input, output, stride
+
+  mova               m8, [pd_8192]
+  mova              m12, [pw_11585x2]
+  pxor              m11, m11
+
+  lea                r3, [2 * strideq]
+  lea                r4, [4 * strideq]
+  mova               m0, [inputq]
+  mova               m1, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m2, [inputq]
+  mova               m3, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m4, [inputq]
+  mova               m5, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m6, [inputq]
+  mova               m7, [inputq + r3]
+
+  ; left shift by 2 to increase forward transformation precision
+  psllw              m0, 2
+  psllw              m1, 2
+  psllw              m2, 2
+  psllw              m3, 2
+  psllw              m4, 2
+  psllw              m5, 2
+  psllw              m6, 2
+  psllw              m7, 2
+
+  ; column transform
+  FDCT8_1D  0
+  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  FDCT8_1D  1
+  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  DIVIDE_ROUND_2X   0, 1, 9, 10
+  DIVIDE_ROUND_2X   2, 3, 9, 10
+  DIVIDE_ROUND_2X   4, 5, 9, 10
+  DIVIDE_ROUND_2X   6, 7, 9, 10
+
+  mova              [outputq +   0], m0
+  mova              [outputq +  16], m1
+  mova              [outputq +  32], m2
+  mova              [outputq +  48], m3
+  mova              [outputq +  64], m4
+  mova              [outputq +  80], m5
+  mova              [outputq +  96], m6
+  mova              [outputq + 112], m7
+
+  RET
+%endif
diff --git a/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c b/vpx_dsp/x86/highbd_loopfilter_sse2.c
similarity index 96%
rename from vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
rename to vpx_dsp/x86/highbd_loopfilter_sse2.c
index b40669c..c4fd5e1 100644
--- a/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
+++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c
@@ -10,9 +10,8 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_loopfilter.h"
 #include "vpx_ports/emmintrin_compat.h"
 
 static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
@@ -509,7 +508,7 @@
 }
 
 // TODO(yunqingwang): remove count and call these two functions (8 or 16) directly.
-void vp9_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
                                        const uint8_t *_blimit,
                                        const uint8_t *_limit,
                                        const uint8_t *_thresh,
@@ -520,7 +519,7 @@
     highbd_mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh, bd);
 }
 
-void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
                                       const uint8_t *_blimit,
                                       const uint8_t *_limit,
                                       const uint8_t *_thresh,
@@ -688,7 +687,7 @@
   filt = _mm_adds_epi16(filt, work_a);
   filt = _mm_adds_epi16(filt, work_a);
   filt = _mm_adds_epi16(filt, work_a);
-  // (vp9_filter + 3 * (qs0 - ps0)) & mask
+  // (vpx_filter + 3 * (qs0 - ps0)) & mask
   filt = signed_char_clamp_bd_sse2(filt, bd);
   filt = _mm_and_si128(filt, mask);
 
@@ -757,7 +756,7 @@
   _mm_store_si128((__m128i *)(s + 2 * p), q2);
 }
 
-void vp9_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p,
                                            const uint8_t *_blimit0,
                                            const uint8_t *_limit0,
                                            const uint8_t *_thresh0,
@@ -765,12 +764,12 @@
                                            const uint8_t *_limit1,
                                            const uint8_t *_thresh1,
                                            int bd) {
-  vp9_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
-  vp9_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1,
+  vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1,
                                    1, bd);
 }
 
-void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
                                       const uint8_t *_blimit,
                                       const uint8_t *_limit,
                                       const uint8_t *_thresh,
@@ -892,7 +891,7 @@
   filt = _mm_adds_epi16(filt, work_a);
   filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
 
-  // (vp9_filter + 3 * (qs0 - ps0)) & mask
+  // (vpx_filter + 3 * (qs0 - ps0)) & mask
   filt = _mm_and_si128(filt, mask);
 
   filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
@@ -937,7 +936,7 @@
   _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
 }
 
-void vp9_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p,
                                            const uint8_t *_blimit0,
                                            const uint8_t *_limit0,
                                            const uint8_t *_thresh0,
@@ -945,8 +944,8 @@
                                            const uint8_t *_limit1,
                                            const uint8_t *_thresh1,
                                            int bd) {
-  vp9_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
-  vp9_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, 1,
+  vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
+  vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, 1,
                                    bd);
 }
 
@@ -1055,7 +1054,7 @@
   highbd_transpose(src1, in_p, dest1, out_p, 1);
 }
 
-void vp9_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
                                     const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh,
@@ -1072,7 +1071,7 @@
   highbd_transpose(src, p, dst, 8, 1);
 
   // Loop filtering
-  vp9_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
+  vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
                                    bd);
 
   src[0] = t_dst;
@@ -1082,7 +1081,7 @@
   highbd_transpose(src, 8, dst, p, 1);
 }
 
-void vp9_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p,
                                          const uint8_t *blimit0,
                                          const uint8_t *limit0,
                                          const uint8_t *thresh0,
@@ -1098,7 +1097,7 @@
   highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
 
   // Loop filtering
-  vp9_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+  vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
                                         thresh0, blimit1, limit1, thresh1, bd);
   src[0] = t_dst;
   src[1] = t_dst + 8;
@@ -1109,7 +1108,7 @@
   highbd_transpose(src, 16, dst, p, 2);
 }
 
-void vp9_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
                                     const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh,
@@ -1126,7 +1125,7 @@
   highbd_transpose(src, p, dst, 8, 1);
 
   // Loop filtering
-  vp9_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
+  vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
                                    bd);
 
   src[0] = t_dst;
@@ -1136,7 +1135,7 @@
   highbd_transpose(src, 8, dst, p, 1);
 }
 
-void vp9_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p,
                                          const uint8_t *blimit0,
                                          const uint8_t *limit0,
                                          const uint8_t *thresh0,
@@ -1152,7 +1151,7 @@
   highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
 
   // Loop filtering
-  vp9_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+  vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
                                         thresh0, blimit1, limit1, thresh1, bd);
   src[0] = t_dst;
   src[1] = t_dst + 8;
@@ -1164,7 +1163,7 @@
   highbd_transpose(src, 16, dst, p, 2);
 }
 
-void vp9_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
                                      const uint8_t *blimit,
                                      const uint8_t *limit,
                                      const uint8_t *thresh,
@@ -1193,7 +1192,7 @@
   highbd_transpose(src, 8, dst, p, 2);
 }
 
-void vp9_highbd_lpf_vertical_16_dual_sse2(uint16_t *s,
+void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s,
                                           int p,
                                           const uint8_t *blimit,
                                           const uint8_t *limit,
diff --git a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
similarity index 97%
rename from vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
rename to vpx_dsp/x86/highbd_quantize_intrin_sse2.c
index 0174cfe..06c748d 100644
--- a/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
+++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -10,8 +10,9 @@
 
 #include <emmintrin.h>
 
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_common.h"
 
 #if CONFIG_VP9_HIGHBITDEPTH
 // from vp9_idct.h: typedef int32_t tran_low_t;
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/vpx_dsp/x86/loopfilter_avx2.c
similarity index 98%
rename from vp9/common/x86/vp9_loopfilter_intrin_avx2.c
rename to vpx_dsp/x86/loopfilter_avx2.c
index 770a65f..23a97dd 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
+++ b/vpx_dsp/x86/loopfilter_avx2.c
@@ -10,7 +10,7 @@
 
 #include <immintrin.h>  /* AVX2 */
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 
 static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
@@ -103,7 +103,7 @@
         filt = _mm_adds_epi8(filt, work_a);
         filt = _mm_adds_epi8(filt, work_a);
         filt = _mm_adds_epi8(filt, work_a);
-        /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+        /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
         filt = _mm_and_si128(filt, mask);
 
         filter1 = _mm_adds_epi8(filt, t4);
@@ -515,7 +515,7 @@
         filt = _mm_adds_epi8(filt, work_a);
         filt = _mm_adds_epi8(filt, work_a);
         filt = _mm_adds_epi8(filt, work_a);
-        /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+        /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
         filt = _mm_and_si128(filt, mask);
 
         filter1 = _mm_adds_epi8(filt, t4);
@@ -976,7 +976,7 @@
     }
 }
 
-void vp9_lpf_horizontal_16_avx2(unsigned char *s, int p,
+void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
         const unsigned char *_blimit, const unsigned char *_limit,
         const unsigned char *_thresh, int count) {
     if (count == 1)
diff --git a/vp9/common/x86/vp9_loopfilter_mmx.asm b/vpx_dsp/x86/loopfilter_mmx.asm
similarity index 98%
rename from vp9/common/x86/vp9_loopfilter_mmx.asm
rename to vpx_dsp/x86/loopfilter_mmx.asm
index f5f7d5a..b9c18b6 100644
--- a/vp9/common/x86/vp9_loopfilter_mmx.asm
+++ b/vpx_dsp/x86/loopfilter_mmx.asm
@@ -12,7 +12,7 @@
 %include "vpx_ports/x86_abi_support.asm"
 
 
-;void vp9_lpf_horizontal_4_mmx
+;void vpx_lpf_horizontal_4_mmx
 ;(
 ;    unsigned char *src_ptr,
 ;    int src_pixel_step,
@@ -21,8 +21,8 @@
 ;    const char *thresh,
 ;    int  count
 ;)
-global sym(vp9_lpf_horizontal_4_mmx) PRIVATE
-sym(vp9_lpf_horizontal_4_mmx):
+global sym(vpx_lpf_horizontal_4_mmx) PRIVATE
+sym(vpx_lpf_horizontal_4_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -224,7 +224,7 @@
     ret
 
 
-;void vp9_lpf_vertical_4_mmx
+;void vpx_lpf_vertical_4_mmx
 ;(
 ;    unsigned char *src_ptr,
 ;    int  src_pixel_step,
@@ -233,8 +233,8 @@
 ;    const char *thresh,
 ;    int count
 ;)
-global sym(vp9_lpf_vertical_4_mmx) PRIVATE
-sym(vp9_lpf_vertical_4_mmx):
+global sym(vpx_lpf_vertical_4_mmx) PRIVATE
+sym(vpx_lpf_vertical_4_mmx):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c
similarity index 97%
rename from vp9/common/x86/vp9_loopfilter_intrin_sse2.c
rename to vpx_dsp/x86/loopfilter_sse2.c
index fe8af54..ed10127 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vpx_dsp/x86/loopfilter_sse2.c
@@ -10,8 +10,8 @@
 
 #include <emmintrin.h>  // SSE2
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_loopfilter.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
 #include "vpx_ports/emmintrin_compat.h"
 
 static INLINE __m128i abs_diff(__m128i a, __m128i b) {
@@ -100,7 +100,7 @@
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
@@ -495,7 +495,7 @@
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
     filter1 = _mm_adds_epi8(filt, t4);
     filter2 = _mm_adds_epi8(filt, t3);
@@ -717,7 +717,7 @@
 }
 
 // TODO(yunqingwang): remove count and call these two functions (8 or 16) directly.
-void vp9_lpf_horizontal_16_sse2(unsigned char *s, int p,
+void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
                                 const unsigned char *_blimit,
                                 const unsigned char *_limit,
                                 const unsigned char *_thresh, int count) {
@@ -727,7 +727,7 @@
     mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
 }
 
-void vp9_lpf_horizontal_8_sse2(unsigned char *s, int p,
+void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh, int count) {
@@ -874,7 +874,7 @@
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
@@ -943,7 +943,7 @@
   }
 }
 
-void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
+void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
                                     const uint8_t *_blimit0,
                                     const uint8_t *_limit0,
                                     const uint8_t *_thresh0,
@@ -1115,7 +1115,7 @@
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
@@ -1190,7 +1190,7 @@
   }
 }
 
-void vp9_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
+void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
                                     const unsigned char *_blimit0,
                                     const unsigned char *_limit0,
                                     const unsigned char *_thresh0,
@@ -1286,7 +1286,7 @@
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
-    // (vp9_filter + 3 * (qs0 - ps0)) & mask
+    // (vpx_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
 
     filter1 = _mm_adds_epi8(filt, t4);
@@ -1464,7 +1464,7 @@
   } while (++idx8x8 < num_8x8_to_transpose);
 }
 
-void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
@@ -1478,7 +1478,7 @@
   transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
 
   // Loop filtering
-  vp9_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+  vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                  blimit1, limit1, thresh1);
   src[0] = t_dst;
   src[1] = t_dst + 8;
@@ -1489,7 +1489,7 @@
   transpose(src, 16, dst, p, 2);
 }
 
-void vp9_lpf_vertical_8_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
                              const unsigned char *blimit,
                              const unsigned char *limit,
                              const unsigned char *thresh, int count) {
@@ -1505,7 +1505,7 @@
   transpose(src, p, dst, 8, 1);
 
   // Loop filtering
-  vp9_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);
+  vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);
 
   src[0] = t_dst;
   dst[0] = s - 4;
@@ -1514,7 +1514,7 @@
   transpose(src, 8, dst, p, 1);
 }
 
-void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
@@ -1528,7 +1528,7 @@
   transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
 
   // Loop filtering
-  vp9_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+  vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                  blimit1, limit1, thresh1);
   src[0] = t_dst;
   src[1] = t_dst + 8;
@@ -1540,7 +1540,7 @@
   transpose(src, 16, dst, p, 2);
 }
 
-void vp9_lpf_vertical_16_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
                               const unsigned char *blimit,
                               const unsigned char *limit,
                               const unsigned char *thresh) {
@@ -1568,7 +1568,7 @@
   transpose(src, 8, dst, p, 2);
 }
 
-void vp9_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh) {
   DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c
new file mode 100644
index 0000000..8d51aeb
--- /dev/null
+++ b/vpx_dsp/x86/quantize_sse2.c
@@ -0,0 +1,223 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
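+// A sketch of the scalar computation each vector block below performs
+// (illustrative only; see the C reference implementation):
+//   tmp     = abs(coeff) + round;
+//   tmp     = ((tmp * quant) >> 16) + tmp;
+//   qcoeff  = sign(coeff) * ((tmp * quant_shift) >> 16),
+//             zeroed unless abs(coeff) >= zbin;
+//   dqcoeff = qcoeff * dequant;
+// The DC entry of each parameter sits in lane 0; _mm_unpackhi_epi64
+// switches each parameter from its DC to its AC value as the first block
+// of eight coefficients is processed.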
+void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
+                         int skip_block, const int16_t* zbin_ptr,
+                         const int16_t* round_ptr, const int16_t* quant_ptr,
+                         const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
+                         int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+                         uint16_t* eob_ptr,
+                         const int16_t* scan_ptr,
+                         const int16_t* iscan_ptr) {
+  __m128i zero;
+  (void)scan_ptr;
+
+  coeff_ptr += n_coeffs;
+  iscan_ptr += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+  zero = _mm_setzero_si128();
+  if (!skip_block) {
+    __m128i eob;
+    __m128i zbin;
+    __m128i round, quant, dequant, shift;
+    {
+      __m128i coeff0, coeff1;
+
+      // Setup global values
+      {
+        __m128i pw_1;
+        zbin = _mm_load_si128((const __m128i*)zbin_ptr);
+        round = _mm_load_si128((const __m128i*)round_ptr);
+        quant = _mm_load_si128((const __m128i*)quant_ptr);
+        pw_1 = _mm_set1_epi16(1);
+        zbin = _mm_sub_epi16(zbin, pw_1);
+        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+        shift = _mm_load_si128((const __m128i*)quant_shift_ptr);
+      }
+
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        __m128i cmp_mask0, cmp_mask1;
+        // Do DC and first 15 AC
+        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+        // Poor man's sign extract: abs(x) = (x ^ (x >> 15)) - (x >> 15)
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        round = _mm_unpackhi_epi64(round, round);
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        quant = _mm_unpackhi_epi64(quant, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
+        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+        shift = _mm_unpackhi_epi64(shift, shift);
+        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        // Mask out zbin threshold coeffs
+        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        dequant = _mm_unpackhi_epi64(dequant, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob = _mm_max_epi16(eob, eob1);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // AC only loop
+    while (n_coeffs < 0) {
+      __m128i coeff0, coeff1;
+      {
+        __m128i coeff0_sign, coeff1_sign;
+        __m128i qcoeff0, qcoeff1;
+        __m128i qtmp0, qtmp1;
+        __m128i cmp_mask0, cmp_mask1;
+
+        coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+        coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+        // Poor man's sign extract: abs(x) = (x ^ (x >> 15)) - (x >> 15)
+        coeff0_sign = _mm_srai_epi16(coeff0, 15);
+        coeff1_sign = _mm_srai_epi16(coeff1, 15);
+        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
+        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+
+        // Reinsert signs
+        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
+        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+        // Mask out zbin threshold coeffs
+        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+      }
+
+      {
+        // Scan for eob
+        __m128i zero_coeff0, zero_coeff1;
+        __m128i nzero_coeff0, nzero_coeff1;
+        __m128i iscan0, iscan1;
+        __m128i eob0, eob1;
+        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+        // Add one to convert from indices to counts
+        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+        eob0 = _mm_max_epi16(eob0, eob1);
+        eob = _mm_max_epi16(eob, eob0);
+      }
+      n_coeffs += 8 * 2;
+    }
+
+    // Accumulate EOB: horizontal max of the eight 16-bit lanes via shuffles
+    {
+      __m128i eob_shuffled;
+      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+      eob = _mm_max_epi16(eob, eob_shuffled);
+      *eob_ptr = _mm_extract_epi16(eob, 1);
+    }
+  } else {
+    do {
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+      _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+      n_coeffs += 8 * 2;
+    } while (n_coeffs < 0);
+    *eob_ptr = 0;
+  }
+}
diff --git a/vpx_dsp/x86/quantize_ssse3_x86_64.asm b/vpx_dsp/x86/quantize_ssse3_x86_64.asm
new file mode 100644
index 0000000..3784d9d
--- /dev/null
+++ b/vpx_dsp/x86/quantize_ssse3_x86_64.asm
@@ -0,0 +1,216 @@
+;
+;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+; TODO(yunqingwang): fix the quantize_b code for the skip=1 case.
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                                shift, qcoeff, dqcoeff, dequant, \
+                                eob, scan, iscan
+  cmp                    dword skipm, 0
+  jne .blank
+
+  ; actual quantize loop - setup pointers, rounders, etc.
+  movifnidn                   coeffq, coeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, dequantmp
+  movifnidn                    zbinq, zbinmp
+  movifnidn                   roundq, roundmp
+  movifnidn                   quantq, quantmp
+  mova                            m0, [zbinq]              ; m0 = zbin
+  mova                            m1, [roundq]             ; m1 = round
+  mova                            m2, [quantq]             ; m2 = quant
+%ifidn %1, b_32x32
+  pcmpeqw                         m5, m5
+  psrlw                           m5, 15
+  paddw                           m0, m5
+  paddw                           m1, m5
+  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
+  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
+%endif
+  mova                            m3, [r2q]                ; m3 = dequant
+  psubw                           m0, [pw_1]
+  mov                             r2, shiftmp
+  mov                             r3, qcoeffmp
+  mova                            m4, [r2]                 ; m4 = shift
+  mov                             r4, dqcoeffmp
+  mov                             r5, iscanmp
+%ifidn %1, b_32x32
+  psllw                           m4, 1
+%endif
+  pxor                            m5, m5                   ; m5 = dedicated zero
+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+  lea                         coeffq, [  coeffq+ncoeffq*2]
+  lea                         iscanq, [  iscanq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  neg                        ncoeffq
+
+  ; get DC and first 15 AC coeffs
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  punpckhqdq                      m0, m0
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+  paddsw                          m6, m1                   ; m6 += round
+  punpckhqdq                      m1, m1
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
+  punpckhqdq                      m2, m2
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                           m8, m6                   ; m8 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
+  punpckhqdq                      m4, m4
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                          m8, m9                   ; m8 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                            m8, m7
+  pand                           m13, m12
+  mova        [qcoeffq+ncoeffq*2+ 0], m8
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                           m8, m8
+  pabsw                          m13, m13
+%endif
+  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
+  punpckhqdq                      m3, m3
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                           m8, 1
+  psrlw                          m13, 1
+  psignw                          m8, m9
+  psignw                         m13, m10
+%endif
+  mova       [dqcoeffq+ncoeffq*2+ 0], m8
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
+  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                   ; m6 = scan[i] + 1
+  psubw                          m11, m12                  ; m11 = scan[i] + 1
+  pandn                           m8, m6                   ; m8 = max(eob)
+  pandn                          m13, m11                  ; m13 = max(eob)
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jz .accumulate_eob
+
+.ac_only_loop:
+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
+  pabsw                           m6, m9                   ; m6 = abs(m9)
+  pabsw                          m11, m10                  ; m11 = abs(m10)
+  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
+  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+%ifidn %1, b_32x32
+  pmovmskb                       r6d, m7
+  pmovmskb                       r2d, m12
+  or                              r6, r2
+  jz .skip_iter
+%endif
+  paddsw                          m6, m1                   ; m6 += round
+  paddsw                         m11, m1                   ; m11 += round
+  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
+  paddw                          m14, m6                   ; m14 += m6
+  paddw                          m13, m11                  ; m13 += m11
+  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
+  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
+  psignw                         m14, m9                   ; m14 = reinsert sign
+  psignw                         m13, m10                  ; m13 = reinsert sign
+  pand                           m14, m7
+  pand                           m13, m12
+  mova        [qcoeffq+ncoeffq*2+ 0], m14
+  mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                          m14, m14
+  pabsw                          m13, m13
+%endif
+  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                          m14, 1
+  psrlw                          m13, 1
+  psignw                         m14, m9
+  psignw                         m13, m10
+%endif
+  mova       [dqcoeffq+ncoeffq*2+ 0], m14
+  mova       [dqcoeffq+ncoeffq*2+16], m13
+  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
+  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
+  psubw                           m6, m7                   ; m6 = scan[i] + 1
+  psubw                          m11, m12                  ; m11 = scan[i] + 1
+  pandn                          m14, m6                   ; m14 = max(eob)
+  pandn                          m13, m11                  ; m13 = max(eob)
+  pmaxsw                          m8, m14
+  pmaxsw                          m8, m13
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+
+%ifidn %1, b_32x32
+  jmp .accumulate_eob
+.skip_iter:
+  mova        [qcoeffq+ncoeffq*2+ 0], m5
+  mova        [qcoeffq+ncoeffq*2+16], m5
+  mova       [dqcoeffq+ncoeffq*2+ 0], m5
+  mova       [dqcoeffq+ncoeffq*2+16], m5
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+%endif
+
+.accumulate_eob:
+  ; horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov                             r2, eobmp
+  pshufd                          m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0xe
+  pmaxsw                          m8, m7
+  pshuflw                         m7, m8, 0x1
+  pmaxsw                          m8, m7
+  pextrw                          r6, m8, 0
+  mov                             [r2], r6
+  RET
+
+  ; skip-block, i.e. just write all zeroes
+.blank:
+  mov                             r0, dqcoeffmp
+  movifnidn                  ncoeffq, ncoeffmp
+  mov                             r2, qcoeffmp
+  mov                             r3, eobmp
+  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
+  neg                        ncoeffq
+  pxor                            m7, m7
+.blank_loop:
+  mova       [dqcoeffq+ncoeffq*2+ 0], m7
+  mova       [dqcoeffq+ncoeffq*2+16], m7
+  mova        [qcoeffq+ncoeffq*2+ 0], m7
+  mova        [qcoeffq+ncoeffq*2+16], m7
+  add                        ncoeffq, mmsize
+  jl .blank_loop
+  mov                    word [eobq], 0
+  RET
+%endmacro
+
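+; QUANTIZE_FN is instantiated twice: quantize_b and quantize_b_32x32. The
+; 32x32 variant halves zbin/round (rounding up), doubles the quantizer
+; shift, and halves dqcoeff to compensate for the larger transform's extra
+; scaling.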
+INIT_XMM ssse3
+QUANTIZE_FN b, 7
+QUANTIZE_FN b_32x32, 7
diff --git a/vpx_util/endian_inl.h b/vpx_util/endian_inl.h
index 91753a6..12cc720 100644
--- a/vpx_util/endian_inl.h
+++ b/vpx_util/endian_inl.h
@@ -44,9 +44,13 @@
 #if defined(WORDS_BIGENDIAN)
 #define HToLE32 BSwap32
 #define HToLE16 BSwap16
+#define HToBE64(x) (x)
+#define HToBE32(x) (x)
 #else
 #define HToLE32(x) (x)
 #define HToLE16(x) (x)
+#define HToBE64(x) BSwap64(x)
+#define HToBE32(x) BSwap32(x)
 #endif
 
 // clang-3.3 and gcc-4.3 have builtin functions for swap32/swap64