Merge "vp9_receive_compressed_data: remove unnecessary indent"
diff --git a/build/make/configure.sh b/build/make/configure.sh
index f361021..c6c8660 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1060,9 +1060,11 @@
                 CC=${CC:-icc}
                 LD=${LD:-icc}
                 setup_gnu_toolchain
-                add_cflags -use-msasm -use-asm
-                add_ldflags -i-static
-                enabled x86_64 && add_cflags -ipo -static -O3
+                add_cflags -use-msasm  # remove -use-msasm too?
+                # add -no-intel-extensions to suppress warning #10237
+                # refer to http://software.intel.com/en-us/forums/topic/280199
+                add_ldflags -i-static -no-intel-extensions
+                enabled x86_64 && add_cflags -ipo -static -O3 -no-prec-div
                 enabled x86_64 && AR=xiar
                 case ${tune_cpu} in
                     atom*)
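With the x86_64-linux-icc target, configure appends the flags above automatically. A minimal sketch of a build that exercises this path, assuming icc (and xiar on x86_64) are on PATH; paths are illustrative:

    # hypothetical out-of-tree ICC build; configure adds -use-msasm,
    # -i-static -no-intel-extensions and, on x86_64, -ipo -static -O3
    # -no-prec-div as patched above
    mkdir icc-build && cd icc-build
    ../libvpx/configure --target=x86_64-linux-icc
    make
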
diff --git a/examples.mk b/examples.mk
index 7b47ade..88327fe 100644
--- a/examples.mk
+++ b/examples.mk
@@ -40,9 +40,9 @@
 vpxenc.SRCS                 += vpx_ports/mem_ops.h
 vpxenc.SRCS                 += vpx_ports/mem_ops_aligned.h
 vpxenc.SRCS                 += vpx_ports/vpx_timer.h
-vpxenc.SRCS                 += libmkv/EbmlIDs.h
-vpxenc.SRCS                 += libmkv/EbmlWriter.c
-vpxenc.SRCS                 += libmkv/EbmlWriter.h
+vpxenc.SRCS                 += third_party/libmkv/EbmlIDs.h
+vpxenc.SRCS                 += third_party/libmkv/EbmlWriter.c
+vpxenc.SRCS                 += third_party/libmkv/EbmlWriter.h
 vpxenc.SRCS                 += $(LIBYUV_SRCS)
 vpxenc.GUID                  = 548DEC74-7A15-4B2B-AFC3-AA102E7C25C1
 vpxenc.DESCRIPTION           = Full featured encoder
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index f0b412d..abeb4bd 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -599,6 +599,28 @@
     make_tuple(32, 64, &convolve8_c),
     make_tuple(64, 64, &convolve8_c)));
 
+#if HAVE_SSE2
+const ConvolveFunctions convolve8_sse2(
+    vp9_convolve8_horiz_sse2, vp9_convolve8_avg_horiz_sse2,
+    vp9_convolve8_vert_sse2, vp9_convolve8_avg_vert_sse2,
+    vp9_convolve8_sse2, vp9_convolve8_avg_sse2);
+
+INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
+    make_tuple(4, 4, &convolve8_sse2),
+    make_tuple(8, 4, &convolve8_sse2),
+    make_tuple(4, 8, &convolve8_sse2),
+    make_tuple(8, 8, &convolve8_sse2),
+    make_tuple(16, 8, &convolve8_sse2),
+    make_tuple(8, 16, &convolve8_sse2),
+    make_tuple(16, 16, &convolve8_sse2),
+    make_tuple(32, 16, &convolve8_sse2),
+    make_tuple(16, 32, &convolve8_sse2),
+    make_tuple(32, 32, &convolve8_sse2),
+    make_tuple(64, 32, &convolve8_sse2),
+    make_tuple(32, 64, &convolve8_sse2),
+    make_tuple(64, 64, &convolve8_sse2)));
+#endif
+
 #if HAVE_SSSE3
 const ConvolveFunctions convolve8_ssse3(
     vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_ssse3,
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 0d19aa0..3d61d40 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -21,7 +21,7 @@
 extern "C" {
 #include "vp9/common/vp9_entropy.h"
 #include "./vp9_rtcd.h"
-void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *output, int pitch);
+void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch);
 }
 #include "vpx/vpx_integer.h"
 
@@ -258,9 +258,10 @@
 }
 
 typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride);
-typedef void (*idct_t)(int16_t *in, uint8_t *dst, int stride);
+typedef void (*idct_t)(const int16_t *in, uint8_t *dst, int stride);
 typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type);
-typedef void (*iht_t) (int16_t *in, uint8_t *dst, int stride, int tx_type);
+typedef void (*iht_t) (const int16_t *in, uint8_t *dst, int stride,
+                       int tx_type);
 
 void fdct16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
   vp9_short_fdct16x16_c(in, out, stride);
@@ -496,27 +497,27 @@
 INSTANTIATE_TEST_CASE_P(
     C, Trans16x16DCT,
     ::testing::Values(
-        make_tuple(&vp9_short_fdct16x16_c, &vp9_short_idct16x16_add_c, 0)));
+        make_tuple(&vp9_short_fdct16x16_c, &vp9_idct16x16_256_add_c, 0)));
 INSTANTIATE_TEST_CASE_P(
     C, Trans16x16HT,
     ::testing::Values(
-        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 0),
-        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 1),
-        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 2),
-        make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 3)));
+        make_tuple(&vp9_short_fht16x16_c, &vp9_iht16x16_256_add_c, 0),
+        make_tuple(&vp9_short_fht16x16_c, &vp9_iht16x16_256_add_c, 1),
+        make_tuple(&vp9_short_fht16x16_c, &vp9_iht16x16_256_add_c, 2),
+        make_tuple(&vp9_short_fht16x16_c, &vp9_iht16x16_256_add_c, 3)));
 
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans16x16DCT,
     ::testing::Values(
         make_tuple(&vp9_short_fdct16x16_sse2,
-                   &vp9_short_idct16x16_add_sse2, 0)));
+                   &vp9_idct16x16_256_add_sse2, 0)));
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans16x16HT,
     ::testing::Values(
-        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 0),
-        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 1),
-        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 2),
-        make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 3)));
+        make_tuple(&vp9_short_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0),
+        make_tuple(&vp9_short_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 1),
+        make_tuple(&vp9_short_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 2),
+        make_tuple(&vp9_short_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3)));
 #endif
 }  // namespace
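In the new naming scheme the _<N>_add infix gives the number of coefficients the variant handles (256 for a full 16x16 block, 10 when only the first few are nonzero), replacing the old short_ prefix, and the coefficient input is now const-qualified. For reference, a sketch of the renamed interface as the tests above use it; signatures assumed to match ./vp9_rtcd.h:

    #include <stdint.h>

    /* Full 16x16 inverse DCT: up to 256 nonzero coefficients, adds into dest. */
    void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest,
                                 int dest_stride);

    /* Sparse variant: only the first 10 coefficients may be nonzero. */
    void vp9_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest,
                                   int dest_stride);
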
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index f331886..f456abc 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -75,7 +75,7 @@
 }
 
 typedef void (*fwd_txfm_t)(int16_t *in, int16_t *out, int stride);
-typedef void (*inv_txfm_t)(int16_t *in, uint8_t *dst, int stride);
+typedef void (*inv_txfm_t)(const int16_t *in, uint8_t *dst, int stride);
 
 class Trans32x32Test : public PARAMS(fwd_txfm_t, inv_txfm_t, int) {
  public:
@@ -247,16 +247,16 @@
 INSTANTIATE_TEST_CASE_P(
     C, Trans32x32Test,
     ::testing::Values(
-        make_tuple(&vp9_short_fdct32x32_c, &vp9_short_idct32x32_add_c, 0),
-        make_tuple(&vp9_short_fdct32x32_rd_c, &vp9_short_idct32x32_add_c, 1)));
+        make_tuple(&vp9_short_fdct32x32_c, &vp9_idct32x32_1024_add_c, 0),
+        make_tuple(&vp9_short_fdct32x32_rd_c, &vp9_idct32x32_1024_add_c, 1)));
 
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans32x32Test,
     ::testing::Values(
         make_tuple(&vp9_short_fdct32x32_sse2,
-                   &vp9_short_idct32x32_add_sse2, 0),
+                   &vp9_idct32x32_1024_add_sse2, 0),
         make_tuple(&vp9_short_fdct32x32_rd_sse2,
-                   &vp9_short_idct32x32_add_sse2, 1)));
+                   &vp9_idct32x32_1024_add_sse2, 1)));
 #endif
 }  // namespace
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index ea40ca6..edc194d 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -31,7 +31,7 @@
 }
 void idct4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
                  int stride, int /*tx_type*/) {
-  vp9_short_idct4x4_add_c(out, dst, stride >> 1);
+  vp9_idct4x4_16_add_c(out, dst, stride >> 1);
 }
 void fht4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
             int stride, int tx_type) {
@@ -39,7 +39,7 @@
 }
 void iht4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
                 int stride, int tx_type) {
-  vp9_short_iht4x4_add_c(out, dst, stride >> 1, tx_type);
+  vp9_iht4x4_16_add_c(out, dst, stride >> 1, tx_type);
 }
 
 class FwdTrans4x4Test : public ::testing::TestWithParam<int> {
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 7edb4d0..728db6d 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -21,7 +21,7 @@
 extern "C" {
 #include "vp9/common/vp9_entropy.h"
 #include "./vp9_rtcd.h"
-void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *output, int pitch);
+void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *output, int pitch);
 }
 #include "vpx/vpx_integer.h"
 
@@ -29,9 +29,10 @@
 
 namespace {
 typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride);
-typedef void (*idct_t)(int16_t *in, uint8_t *dst, int stride);
+typedef void (*idct_t)(const int16_t *in, uint8_t *dst, int stride);
 typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type);
-typedef void (*iht_t) (int16_t *in, uint8_t *dst, int stride, int tx_type);
+typedef void (*iht_t) (const int16_t *in, uint8_t *dst, int stride,
+                       int tx_type);
 
 void fdct8x8_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
   vp9_short_fdct8x8_c(in, out, stride);
@@ -296,26 +297,26 @@
 INSTANTIATE_TEST_CASE_P(
     C, FwdTrans8x8DCT,
     ::testing::Values(
-        make_tuple(&vp9_short_fdct8x8_c, &vp9_short_idct8x8_add_c, 0)));
+        make_tuple(&vp9_short_fdct8x8_c, &vp9_idct8x8_64_add_c, 0)));
 INSTANTIATE_TEST_CASE_P(
     C, FwdTrans8x8HT,
     ::testing::Values(
-        make_tuple(&vp9_short_fht8x8_c, &vp9_short_iht8x8_add_c, 0),
-        make_tuple(&vp9_short_fht8x8_c, &vp9_short_iht8x8_add_c, 1),
-        make_tuple(&vp9_short_fht8x8_c, &vp9_short_iht8x8_add_c, 2),
-        make_tuple(&vp9_short_fht8x8_c, &vp9_short_iht8x8_add_c, 3)));
+        make_tuple(&vp9_short_fht8x8_c, &vp9_iht8x8_64_add_c, 0),
+        make_tuple(&vp9_short_fht8x8_c, &vp9_iht8x8_64_add_c, 1),
+        make_tuple(&vp9_short_fht8x8_c, &vp9_iht8x8_64_add_c, 2),
+        make_tuple(&vp9_short_fht8x8_c, &vp9_iht8x8_64_add_c, 3)));
 
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, FwdTrans8x8DCT,
     ::testing::Values(
-        make_tuple(&vp9_short_fdct8x8_sse2, &vp9_short_idct8x8_add_sse2, 0)));
+        make_tuple(&vp9_short_fdct8x8_sse2, &vp9_idct8x8_64_add_sse2, 0)));
 INSTANTIATE_TEST_CASE_P(
     SSE2, FwdTrans8x8HT,
     ::testing::Values(
-        make_tuple(&vp9_short_fht8x8_sse2, &vp9_short_iht8x8_add_sse2, 0),
-        make_tuple(&vp9_short_fht8x8_sse2, &vp9_short_iht8x8_add_sse2, 1),
-        make_tuple(&vp9_short_fht8x8_sse2, &vp9_short_iht8x8_add_sse2, 2),
-        make_tuple(&vp9_short_fht8x8_sse2, &vp9_short_iht8x8_add_sse2, 3)));
+        make_tuple(&vp9_short_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0),
+        make_tuple(&vp9_short_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1),
+        make_tuple(&vp9_short_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2),
+        make_tuple(&vp9_short_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3)));
 #endif
 }  // namespace
diff --git a/test/idct8x8_test.cc b/test/idct8x8_test.cc
index fc8129e..d8c61ff 100644
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -126,7 +126,7 @@
     reference_dct_2d(input, output_r);
     for (int j = 0; j < 64; ++j)
       coeff[j] = round(output_r[j]);
-    vp9_short_idct8x8_add_c(coeff, dst, 8);
+    vp9_idct8x8_64_add_c(coeff, dst, 8);
     for (int j = 0; j < 64; ++j) {
       const int diff = dst[j] - src[j];
       const int error = diff * diff;
diff --git a/test/resize_test.cc b/test/resize_test.cc
index d194dfd..e8c2c82 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -208,7 +208,7 @@
   virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
     if (!frame0_psnr_)
       frame0_psnr_ = pkt->data.psnr.psnr[0];
-    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 1.0);
+    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 1.5);
   }
 
   virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
diff --git a/libmkv/EbmlBufferWriter.c b/third_party/libmkv/EbmlBufferWriter.c
similarity index 100%
rename from libmkv/EbmlBufferWriter.c
rename to third_party/libmkv/EbmlBufferWriter.c
diff --git a/libmkv/EbmlBufferWriter.h b/third_party/libmkv/EbmlBufferWriter.h
similarity index 100%
rename from libmkv/EbmlBufferWriter.h
rename to third_party/libmkv/EbmlBufferWriter.h
diff --git a/libmkv/EbmlIDs.h b/third_party/libmkv/EbmlIDs.h
similarity index 100%
rename from libmkv/EbmlIDs.h
rename to third_party/libmkv/EbmlIDs.h
diff --git a/libmkv/EbmlWriter.c b/third_party/libmkv/EbmlWriter.c
similarity index 100%
rename from libmkv/EbmlWriter.c
rename to third_party/libmkv/EbmlWriter.c
diff --git a/libmkv/EbmlWriter.h b/third_party/libmkv/EbmlWriter.h
similarity index 100%
rename from libmkv/EbmlWriter.h
rename to third_party/libmkv/EbmlWriter.h
diff --git a/libmkv/Makefile b/third_party/libmkv/Makefile
similarity index 100%
rename from libmkv/Makefile
rename to third_party/libmkv/Makefile
diff --git a/libmkv/WebMElement.c b/third_party/libmkv/WebMElement.c
similarity index 100%
rename from libmkv/WebMElement.c
rename to third_party/libmkv/WebMElement.c
diff --git a/libmkv/WebMElement.h b/third_party/libmkv/WebMElement.h
similarity index 100%
rename from libmkv/WebMElement.h
rename to third_party/libmkv/WebMElement.h
diff --git a/libmkv/testlibmkv.c b/third_party/libmkv/testlibmkv.c
similarity index 100%
rename from libmkv/testlibmkv.c
rename to third_party/libmkv/testlibmkv.c
diff --git a/vp9/common/arm/neon/vp9_idct16x16_neon.c b/vp9/common/arm/neon/vp9_idct16x16_neon.c
index fb7b5cd..0b9fc09 100644
--- a/vp9/common/arm/neon/vp9_idct16x16_neon.c
+++ b/vp9/common/arm/neon/vp9_idct16x16_neon.c
@@ -11,31 +11,31 @@
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_common.h"
 
-extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input,
-                                               int16_t *output,
-                                               int output_stride);
-extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
-                                               int16_t *output,
-                                               int16_t *pass1Output,
-                                               int16_t skip_adding,
-                                               uint8_t *dest,
-                                               int dest_stride);
-extern void vp9_short_idct16x16_10_add_neon_pass1(int16_t *input,
-                                               int16_t *output,
-                                               int output_stride);
-extern void vp9_short_idct16x16_10_add_neon_pass2(int16_t *src,
-                                               int16_t *output,
-                                               int16_t *pass1Output,
-                                               int16_t skip_adding,
-                                               uint8_t *dest,
-                                               int dest_stride);
+void vp9_idct16x16_256_add_neon_pass1(const int16_t *input,
+                                      int16_t *output,
+                                      int output_stride);
+void vp9_idct16x16_256_add_neon_pass2(const int16_t *src,
+                                      int16_t *output,
+                                      int16_t *pass1Output,
+                                      int16_t skip_adding,
+                                      uint8_t *dest,
+                                      int dest_stride);
+void vp9_idct16x16_10_add_neon_pass1(const int16_t *input,
+                                     int16_t *output,
+                                     int output_stride);
+void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
+                                     int16_t *output,
+                                     int16_t *pass1Output,
+                                     int16_t skip_adding,
+                                     uint8_t *dest,
+                                     int dest_stride);
 
 /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
 extern void vp9_push_neon(int64_t *store);
 extern void vp9_pop_neon(int64_t *store);
 
-void vp9_short_idct16x16_add_neon(int16_t *input,
-                                  uint8_t *dest, int dest_stride) {
+void vp9_idct16x16_256_add_neon(const int16_t *input,
+                                uint8_t *dest, int dest_stride) {
   int64_t store_reg[8];
   int16_t pass1_output[16*16] = {0};
   int16_t row_idct_output[16*16] = {0};
@@ -46,12 +46,12 @@
   /* Parallel idct on the upper 8 rows */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8);
+  vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7
   // which will be saved into row_idct_output.
-  vp9_short_idct16x16_add_neon_pass2(input+1,
+  vp9_idct16x16_256_add_neon_pass2(input+1,
                                      row_idct_output,
                                      pass1_output,
                                      0,
@@ -61,12 +61,12 @@
   /* Parallel idct on the lower 8 rows */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8);
+  vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7
   // which will be saved into row_idct_output.
-  vp9_short_idct16x16_add_neon_pass2(input+8*16+1,
+  vp9_idct16x16_256_add_neon_pass2(input+8*16+1,
                                      row_idct_output+8,
                                      pass1_output,
                                      0,
@@ -76,12 +76,12 @@
   /* Parallel idct on the left 8 columns */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
+  vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
-  vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
+  vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
                                      row_idct_output,
                                      pass1_output,
                                      1,
@@ -91,12 +91,12 @@
   /* Parallel idct on the right 8 columns */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+  vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
-  vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
+  vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
                                      row_idct_output+8,
                                      pass1_output,
                                      1,
@@ -109,8 +109,8 @@
   return;
 }
 
-void vp9_short_idct16x16_10_add_neon(int16_t *input,
-                                  uint8_t *dest, int dest_stride) {
+void vp9_idct16x16_10_add_neon(const int16_t *input,
+                               uint8_t *dest, int dest_stride) {
   int64_t store_reg[8];
   int16_t pass1_output[16*16] = {0};
   int16_t row_idct_output[16*16] = {0};
@@ -121,12 +121,12 @@
   /* Parallel idct on the upper 8 rows */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_short_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
+  vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7
   // which will be saved into row_idct_output.
-  vp9_short_idct16x16_10_add_neon_pass2(input+1,
+  vp9_idct16x16_10_add_neon_pass2(input+1,
                                         row_idct_output,
                                         pass1_output,
                                         0,
@@ -138,12 +138,12 @@
   /* Parallel idct on the left 8 columns */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
+  vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
-  vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
+  vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
                                      row_idct_output,
                                      pass1_output,
                                      1,
@@ -153,12 +153,12 @@
   /* Parallel idct on the right 8 columns */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+  vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
-  vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
+  vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
                                      row_idct_output+8,
                                      pass1_output,
                                      1,
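The comment blocks above describe a separable transform: each pass1 handles the even input elements, each pass2 the odd ones, run first over the two 8-row halves and then over the two 8-column halves. A scalar sketch of that row/column decomposition, for orientation only (idct16_1d is a hypothetical 1-D kernel, not a libvpx function):

    #include <stdint.h>

    /* 2-D IDCT as two 1-D passes: every row, then every column. */
    static void idct16x16_2d_sketch(const int16_t in[256], int16_t out[256],
                                    void (*idct16_1d)(const int16_t *in,
                                                      int16_t *out)) {
      int16_t rows[256];
      int i, j;
      for (i = 0; i < 16; ++i)                     /* row pass */
        idct16_1d(in + i * 16, rows + i * 16);
      for (j = 0; j < 16; ++j) {                   /* column pass */
        int16_t col_in[16], col_out[16];
        for (i = 0; i < 16; ++i)
          col_in[i] = rows[i * 16 + j];
        idct16_1d(col_in, col_out);
        for (i = 0; i < 16; ++i)
          out[i * 16 + j] = col_out[i];
      }
    }
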
diff --git a/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
index cf5c8f7..b1fd21b 100644
--- a/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
@@ -8,21 +8,21 @@
 ;
 
 
-    EXPORT  |vp9_short_idct16x16_1_add_neon|
+    EXPORT  |vp9_idct16x16_1_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-;void vp9_short_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
 ;                                    int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
 
-|vp9_short_idct16x16_1_add_neon| PROC
+|vp9_idct16x16_1_add_neon| PROC
     ldrsh            r0, [r0]
 
     ; generate cospi_16_64 = 11585
@@ -193,6 +193,6 @@
     vst1.64          {d31}, [r12], r2
 
     bx               lr
-    ENDP             ; |vp9_short_idct16x16_1_add_neon|
+    ENDP             ; |vp9_idct16x16_1_add_neon|
 
     END
diff --git a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
index df2a052..a13c0d0 100644
--- a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
@@ -8,10 +8,10 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_short_idct16x16_add_neon_pass1|
-    EXPORT  |vp9_short_idct16x16_add_neon_pass2|
-    EXPORT  |vp9_short_idct16x16_10_add_neon_pass1|
-    EXPORT  |vp9_short_idct16x16_10_add_neon_pass2|
+    EXPORT  |vp9_idct16x16_256_add_neon_pass1|
+    EXPORT  |vp9_idct16x16_256_add_neon_pass2|
+    EXPORT  |vp9_idct16x16_10_add_neon_pass1|
+    EXPORT  |vp9_idct16x16_10_add_neon_pass2|
     ARM
     REQUIRE8
     PRESERVE8
@@ -36,7 +36,7 @@
     MEND
 
     AREA    Block, CODE, READONLY ; name this block of code
-;void |vp9_short_idct16x16_add_neon_pass1|(int16_t *input,
+;void |vp9_idct16x16_256_add_neon_pass1|(int16_t *input,
 ;                                          int16_t *output, int output_stride)
 ;
 ; r0  int16_t input
@@ -46,7 +46,7 @@
 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
 ; will be stored back into q8-q15 registers. This function will touch q0-q7
 ; registers and use them as buffer during calculation.
-|vp9_short_idct16x16_add_neon_pass1| PROC
+|vp9_idct16x16_256_add_neon_pass1| PROC
 
     ; TODO(hkuang): Find a better way to load the elements.
     ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
@@ -273,9 +273,9 @@
     vst1.64         {d31}, [r1], r2
 
     bx              lr
-    ENDP  ; |vp9_short_idct16x16_add_neon_pass1|
+    ENDP  ; |vp9_idct16x16_256_add_neon_pass1|
 
-;void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
+;void vp9_idct16x16_256_add_neon_pass2(int16_t *src,
 ;                                        int16_t *output,
 ;                                        int16_t *pass1Output,
 ;                                        int16_t skip_adding,
@@ -292,7 +292,7 @@
 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
 ; will be stored back into q8-q15 registers. This function will touch q0-q7
 ; registers and use them as buffer during calculation.
-|vp9_short_idct16x16_add_neon_pass2| PROC
+|vp9_idct16x16_256_add_neon_pass2| PROC
     push            {r3-r9}
 
     ; TODO(hkuang): Find a better way to load the elements.
@@ -784,9 +784,9 @@
 end_idct16x16_pass2
     pop             {r3-r9}
     bx              lr
-    ENDP  ; |vp9_short_idct16x16_add_neon_pass2|
+    ENDP  ; |vp9_idct16x16_256_add_neon_pass2|
 
-;void |vp9_short_idct16x16_10_add_neon_pass1|(int16_t *input,
+;void |vp9_idct16x16_10_add_neon_pass1|(int16_t *input,
 ;                                             int16_t *output, int output_stride)
 ;
 ; r0  int16_t input
@@ -796,7 +796,7 @@
 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
 ; will be stored back into q8-q15 registers. This function will touch q0-q7
 ; registers and use them as buffer during calculation.
-|vp9_short_idct16x16_10_add_neon_pass1| PROC
+|vp9_idct16x16_10_add_neon_pass1| PROC
 
     ; TODO(hkuang): Find a better way to load the elements.
     ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
@@ -905,9 +905,9 @@
     vst1.64         {d31}, [r1], r2
 
     bx              lr
-    ENDP  ; |vp9_short_idct16x16_10_add_neon_pass1|
+    ENDP  ; |vp9_idct16x16_10_add_neon_pass1|
 
-;void vp9_short_idct16x16_10_add_neon_pass2(int16_t *src,
+;void vp9_idct16x16_10_add_neon_pass2(int16_t *src,
 ;                                           int16_t *output,
 ;                                           int16_t *pass1Output,
 ;                                           int16_t skip_adding,
@@ -924,7 +924,7 @@
 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
 ; will be stored back into q8-q15 registers. This function will touch q0-q7
 ; registers and use them as buffer during calculation.
-|vp9_short_idct16x16_10_add_neon_pass2| PROC
+|vp9_idct16x16_10_add_neon_pass2| PROC
     push            {r3-r9}
 
     ; TODO(hkuang): Find a better way to load the elements.
@@ -1175,5 +1175,5 @@
 end_idct10_16x16_pass2
     pop             {r3-r9}
     bx              lr
-    ENDP  ; |vp9_short_idct16x16_10_add_neon_pass2|
+    ENDP  ; |vp9_idct16x16_10_add_neon_pass2|
     END
diff --git a/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
index b5a284b..f00d027 100644
--- a/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
@@ -43,7 +43,7 @@
 cospi_31_64 EQU   804
 
 
-    EXPORT  |vp9_short_idct32x32_add_neon|
+    EXPORT  |vp9_idct32x32_1024_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -288,7 +288,7 @@
     MEND
     ; --------------------------------------------------------------------------
 
-;void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
+;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
 ;
 ;   r0  int16_t *input,
 ;   r1  uint8_t *dest,
@@ -303,7 +303,7 @@
 ;   r9  dest + 15 * dest_stride, descending (14, 13, 12, ...)
 ;   r10 dest + 16 * dest_stride, ascending  (17, 18, 19, ...)
 
-|vp9_short_idct32x32_add_neon| PROC
+|vp9_idct32x32_1024_add_neon| PROC
     ; This function does one pass of idct32x32 transform.
     ;
     ; This is done by transposing the input and then doing a 1d transform on
@@ -1295,5 +1295,5 @@
     vpop {d8-d15}
     pop  {r4-r11}
     bx              lr
-    ENDP  ; |vp9_short_idct32x32_add_neon|
+    ENDP  ; |vp9_idct32x32_1024_add_neon|
     END
diff --git a/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
index 869ee5f..0d4a721 100644
--- a/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
@@ -8,21 +8,21 @@
 ;
 
 
-    EXPORT  |vp9_short_idct4x4_1_add_neon|
+    EXPORT  |vp9_idct4x4_1_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-;void vp9_short_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
 ;                                  int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
 
-|vp9_short_idct4x4_1_add_neon| PROC
+|vp9_idct4x4_1_add_neon| PROC
     ldrsh            r0, [r0]
 
     ; generate cospi_16_64 = 11585
@@ -63,6 +63,6 @@
     vst1.32          {d7[1]}, [r12]
 
     bx               lr
-    ENDP             ; |vp9_short_idct4x4_1_add_neon|
+    ENDP             ; |vp9_idct4x4_1_add_neon|
 
     END
diff --git a/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
index 640fb93..00283fc 100644
--- a/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
@@ -8,7 +8,7 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_short_idct4x4_add_neon|
+    EXPORT  |vp9_idct4x4_16_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -16,13 +16,13 @@
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
     AREA     Block, CODE, READONLY ; name this block of code
-;void vp9_short_idct4x4_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vp9_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
 
-|vp9_short_idct4x4_add_neon| PROC
+|vp9_idct4x4_16_add_neon| PROC
 
     ; The 2D transform is done with two passes which are actually pretty
     ; similar. We first transform the rows. This is done by transposing
@@ -185,6 +185,6 @@
     vst1.32 {d26[1]}, [r1], r2
     vst1.32 {d26[0]}, [r1]  ; no post-increment
     bx              lr
-    ENDP  ; |vp9_short_idct4x4_add_neon|
+    ENDP  ; |vp9_idct4x4_16_add_neon|
 
     END
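The header comment in this file describes the same separable idea implemented with transposes, so the 1-D kernel only ever walks rows. As a scalar sketch (transform_rows_1d is a hypothetical in-place 1-D pass over all four rows):

    #include <stdint.h>

    /* 4x4 2-D transform via two transpose + 1-D-rows passes. */
    static void transform_4x4_sketch(int16_t m[16],
                                     void (*transform_rows_1d)(int16_t *m)) {
      int16_t t[16];
      int r, c;
      for (r = 0; r < 4; ++r)          /* transpose: columns become rows */
        for (c = 0; c < 4; ++c)
          t[c * 4 + r] = m[r * 4 + c];
      transform_rows_1d(t);            /* first pass */
      for (r = 0; r < 4; ++r)          /* transpose back */
        for (c = 0; c < 4; ++c)
          m[c * 4 + r] = t[r * 4 + c];
      transform_rows_1d(m);            /* second pass */
    }
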
diff --git a/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
index 923804f..421d202 100644
--- a/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
@@ -8,21 +8,21 @@
 ;
 
 
-    EXPORT  |vp9_short_idct8x8_1_add_neon|
+    EXPORT  |vp9_idct8x8_1_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-;void vp9_short_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
 ;                                  int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
 
-|vp9_short_idct8x8_1_add_neon| PROC
+|vp9_idct8x8_1_add_neon| PROC
     ldrsh            r0, [r0]
 
     ; generate cospi_16_64 = 11585
@@ -83,6 +83,6 @@
     vst1.64          {d31}, [r12], r2
 
     bx               lr
-    ENDP             ; |vp9_short_idct8x8_1_add_neon|
+    ENDP             ; |vp9_idct8x8_1_add_neon|
 
     END
diff --git a/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
index c02251a..5476400 100644
--- a/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
@@ -8,8 +8,8 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_short_idct8x8_add_neon|
-    EXPORT  |vp9_short_idct8x8_10_add_neon|
+    EXPORT  |vp9_idct8x8_64_add_neon|
+    EXPORT  |vp9_idct8x8_10_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -198,13 +198,13 @@
     MEND
 
     AREA    Block, CODE, READONLY ; name this block of code
-;void vp9_short_idct8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vp9_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
 
-|vp9_short_idct8x8_add_neon| PROC
+|vp9_idct8x8_64_add_neon| PROC
     push            {r4-r9}
     vpush           {d8-d15}
     vld1.s16        {q8,q9}, [r0]!
@@ -308,15 +308,15 @@
     vpop            {d8-d15}
     pop             {r4-r9}
     bx              lr
-    ENDP  ; |vp9_short_idct8x8_add_neon|
+    ENDP  ; |vp9_idct8x8_64_add_neon|
 
-;void vp9_short_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vp9_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
 
-|vp9_short_idct8x8_10_add_neon| PROC
+|vp9_idct8x8_10_add_neon| PROC
     push            {r4-r9}
     vpush           {d8-d15}
     vld1.s16        {q8,q9}, [r0]!
@@ -514,6 +514,6 @@
     vpop            {d8-d15}
     pop             {r4-r9}
     bx              lr
-    ENDP  ; |vp9_short_idct8x8_10_add_neon|
+    ENDP  ; |vp9_idct8x8_10_add_neon|
 
     END
diff --git a/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm b/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
index 963ef35..2f326e2 100644
--- a/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
@@ -8,7 +8,7 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_short_iht4x4_add_neon|
+    EXPORT  |vp9_iht4x4_16_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -139,7 +139,7 @@
     MEND
 
     AREA     Block, CODE, READONLY ; name this block of code
-;void vp9_short_iht4x4_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest,
 ;                               int dest_stride, int tx_type)
 ;
 ; r0  int16_t input
@@ -147,7 +147,7 @@
 ; r2  int dest_stride
 ; r3  int tx_type)
 ; This function will only handle tx_type of 1,2,3.
-|vp9_short_iht4x4_add_neon| PROC
+|vp9_iht4x4_16_add_neon| PROC
 
     ; load the inputs into d16-d19
     vld1.s16    {q8,q9}, [r0]!
@@ -175,7 +175,7 @@
     ; then transform columns
     IADST4x4_1D
 
-    b end_vp9_short_iht4x4_add_neon
+    b end_vp9_iht4x4_16_add_neon
 
 idct_iadst
     ; generate constants
@@ -191,7 +191,7 @@
     ; then transform columns
     IDCT4x4_1D
 
-    b end_vp9_short_iht4x4_add_neon
+    b end_vp9_iht4x4_16_add_neon
 
 iadst_iadst
     ; generate constants
@@ -206,7 +206,7 @@
     ; then transform columns
     IADST4x4_1D
 
-end_vp9_short_iht4x4_add_neon
+end_vp9_iht4x4_16_add_neon
     ; ROUND_POWER_OF_TWO(temp_out[j], 4)
     vrshr.s16   q8, q8, #4
     vrshr.s16   q9, q9, #4
@@ -232,6 +232,6 @@
     vst1.32     {d26[1]}, [r1], r2
     vst1.32     {d26[0]}, [r1]  ; no post-increment
     bx          lr
-    ENDP  ; |vp9_short_iht4x4_add_neon|
+    ENDP  ; |vp9_iht4x4_16_add_neon|
 
     END
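ROUND_POWER_OF_TWO(temp_out[j], 4) in the comments above is the usual rounding right shift, which is what the vrshr.s16 #4 instructions implement; the macro, as defined in vp9/common/vp9_common.h:

    /* e.g. ROUND_POWER_OF_TWO(23, 4) == (23 + 8) >> 4 == 1 */
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
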
diff --git a/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm b/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
index bab9cb4..93d3af3 100644
--- a/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
@@ -8,7 +8,7 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_short_iht8x8_add_neon|
+    EXPORT  |vp9_iht8x8_64_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -559,7 +559,7 @@
 
 
     AREA     Block, CODE, READONLY ; name this block of code
-;void vp9_short_iht8x8_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest,
 ;                               int dest_stride, int tx_type)
 ;
 ; r0  int16_t input
@@ -567,7 +567,7 @@
 ; r2  int dest_stride
 ; r3  int tx_type)
 ; This function will only handle tx_type of 1,2,3.
-|vp9_short_iht8x8_add_neon| PROC
+|vp9_iht8x8_64_add_neon| PROC
 
     ; load the inputs into d16-d19
     vld1.s16        {q8,q9}, [r0]!
@@ -602,7 +602,7 @@
     ; then transform columns
     IADST8X8_1D
 
-    b end_vp9_short_iht8x8_add_neon
+    b end_vp9_iht8x8_64_add_neon
 
 idct_iadst
     ; generate IADST constants
@@ -620,7 +620,7 @@
     ; then transform columns
     IDCT8x8_1D
 
-    b end_vp9_short_iht8x8_add_neon
+    b end_vp9_iht8x8_64_add_neon
 
 iadst_iadst
     ; generate IADST constants
@@ -635,7 +635,7 @@
     ; then transform columns
     IADST8X8_1D
 
-end_vp9_short_iht8x8_add_neon
+end_vp9_iht8x8_64_add_neon
     pop            {r0-r10}
 
     ; ROUND_POWER_OF_TWO(temp_out[j], 5)
@@ -691,6 +691,6 @@
     vst1.64         {d6}, [r0], r2
     vst1.64         {d7}, [r0], r2
     bx          lr
-    ENDP  ; |vp9_short_iht8x8_add_neon|
+    ENDP  ; |vp9_iht8x8_64_add_neon|
 
     END
diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h
index d2fa4c1..dc88f16 100644
--- a/vp9/common/mips/dspr2/vp9_common_dspr2.h
+++ b/vp9/common/mips/dspr2/vp9_common_dspr2.h
@@ -81,5 +81,34 @@
   );
 }
 
+void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const int16_t *filter_x, int x_step_q4,
+                               const int16_t *filter_y, int y_step_q4,
+                               int w, int h);
+
+void vp9_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const int16_t *filter_x, int x_step_q4,
+                                   const int16_t *filter_y, int y_step_q4,
+                                   int w, int h);
+
+void vp9_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4,
+                                  int w, int h);
+
+void vp9_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter,
+                         int w, int h);
+
+void vp9_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h);
+
 #endif  // #if HAVE_DSPR2
 #endif  // VP9_COMMON_VP9_COMMON_DSPR2_H_
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
new file mode 100644
index 0000000..91d62bc
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
@@ -0,0 +1,281 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src,
+                                         int32_t src_stride,
+                                         uint8_t *dst,
+                                         int32_t dst_stride,
+                                         const int16_t *filter_y,
+                                         int32_t w,
+                                         int32_t h) {
+  int32_t       x, y;
+  const uint8_t *src_ptr;
+  uint8_t       *dst_ptr;
+  uint8_t       *cm = vp9_ff_cropTbl;
+  uint32_t      vector4a = 64;
+  uint32_t      load1, load2;
+  uint32_t      p1, p2;
+  uint32_t      scratch1, scratch2;
+  uint32_t      store1, store2;
+  int32_t       Temp1, Temp2;
+  const int16_t *filter = &filter_y[3];
+  uint32_t      filter45;
+
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_store(dst + dst_stride);
+
+    for (x = 0; x < w; x += 4) {
+      src_ptr = src + x;
+      dst_ptr = dst + x;
+
+      __asm__ __volatile__ (
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+
+          "mtlo             %[vector4a],  $ac0                            \n\t"
+          "mtlo             %[vector4a],  $ac1                            \n\t"
+          "mtlo             %[vector4a],  $ac2                            \n\t"
+          "mtlo             %[vector4a],  $ac3                            \n\t"
+          "mthi             $zero,        $ac0                            \n\t"
+          "mthi             $zero,        $ac1                            \n\t"
+          "mthi             $zero,        $ac2                            \n\t"
+          "mthi             $zero,        $ac3                            \n\t"
+
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
+
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
+          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
+
+          "extp             %[Temp1],     $ac0,           31              \n\t"
+          "extp             %[Temp2],     $ac1,           31              \n\t"
+
+          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
+          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
+          "extp             %[Temp1],     $ac2,           31              \n\t"
+
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
+          "extp             %[Temp2],     $ac3,           31              \n\t"
+          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
+
+          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
+          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
+          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
+
+          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
+
+          : [load1] "=&r" (load1), [load2] "=&r" (load2),
+            [p1] "=&r" (p1), [p2] "=&r" (p2),
+            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+            [store1] "=&r" (store1), [store2] "=&r" (store2),
+            [src_ptr] "+r" (src_ptr)
+          : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+            [src_stride] "r" (src_stride), [cm] "r" (cm),
+            [dst_ptr] "r" (dst_ptr)
+      );
+    }
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
+                                          int32_t src_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int16_t *filter_y,
+                                          int32_t h) {
+  int32_t       x, y;
+  const uint8_t *src_ptr;
+  uint8_t       *dst_ptr;
+  uint8_t       *cm = vp9_ff_cropTbl;
+  uint32_t      vector4a = 64;
+  uint32_t      load1, load2;
+  uint32_t      p1, p2;
+  uint32_t      scratch1, scratch2;
+  uint32_t      store1, store2;
+  int32_t       Temp1, Temp2;
+  const int16_t *filter = &filter_y[3];
+  uint32_t      filter45;
+
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_store(dst + dst_stride);
+    vp9_prefetch_store(dst + dst_stride + 32);
+
+    for (x = 0; x < 64; x += 4) {
+      src_ptr = src + x;
+      dst_ptr = dst + x;
+
+      __asm__ __volatile__ (
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+
+          "mtlo             %[vector4a],  $ac0                            \n\t"
+          "mtlo             %[vector4a],  $ac1                            \n\t"
+          "mtlo             %[vector4a],  $ac2                            \n\t"
+          "mtlo             %[vector4a],  $ac3                            \n\t"
+          "mthi             $zero,        $ac0                            \n\t"
+          "mthi             $zero,        $ac1                            \n\t"
+          "mthi             $zero,        $ac2                            \n\t"
+          "mthi             $zero,        $ac3                            \n\t"
+
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
+
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
+          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
+
+          "extp             %[Temp1],     $ac0,           31              \n\t"
+          "extp             %[Temp2],     $ac1,           31              \n\t"
+
+          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
+          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
+          "extp             %[Temp1],     $ac2,           31              \n\t"
+
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
+          "extp             %[Temp2],     $ac3,           31              \n\t"
+          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
+
+          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
+          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
+          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
+
+          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
+
+          : [load1] "=&r" (load1), [load2] "=&r" (load2),
+            [p1] "=&r" (p1), [p2] "=&r" (p2),
+            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+            [store1] "=&r" (store1), [store2] "=&r" (store2),
+            [src_ptr] "+r" (src_ptr)
+          : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+            [src_stride] "r" (src_stride), [cm] "r" (cm),
+            [dst_ptr] "r" (dst_ptr)
+      );
+    }
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4,
+                                  int w, int h) {
+  if (16 == y_step_q4) {
+    uint32_t pos = 38;
+
+    /* bit position for extract from acc */
+    __asm__ __volatile__ (
+      "wrdsp      %[pos],     1           \n\t"
+      :
+      : [pos] "r" (pos)
+    );
+
+    vp9_prefetch_store(dst);
+
+    switch (w) {
+      case 4:
+      case 8:
+      case 16:
+      case 32:
+        convolve_bi_avg_vert_4_dspr2(src, src_stride,
+                                     dst, dst_stride,
+                                     filter_y, w, h);
+        break;
+      case 64:
+        vp9_prefetch_store(dst + 32);
+        convolve_bi_avg_vert_64_dspr2(src, src_stride,
+                                      dst, dst_stride,
+                                      filter_y, h);
+        break;
+      default:
+        vp9_convolve8_avg_vert_c(src, src_stride,
+                                 dst, dst_stride,
+                                 filter_x, x_step_q4,
+                                 filter_y, y_step_q4,
+                                 w, h);
+        break;
+    }
+  } else {
+    vp9_convolve8_avg_vert_c(src, src_stride,
+                             dst, dst_stride,
+                             filter_x, x_step_q4,
+                             filter_y, y_step_q4,
+                             w, h);
+  }
+}
+#endif
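A note on the filter45 setup used throughout these new files: for the two-tap (bilinear) case only taps 3 and 4 of the 8-tap filter array are nonzero, and loading them as one 32-bit word lets each dpa.w.ph accumulate both taps at once. A scalar model of one output pixel of the avg vertical pass above (bi_avg_vert_pixel is a hypothetical helper; little-endian assumed):

    #include <stdint.h>

    static uint8_t bi_avg_vert_pixel(const uint8_t *src, int src_stride,
                                     const int16_t *filter_y,  /* 8 taps, Q7 */
                                     uint8_t dst_pixel) {
      int sum = 64                                    /* vector4a: Q7 rounding */
              + src[0]          * filter_y[3]         /* tap 3, current row */
              + src[src_stride] * filter_y[4];        /* tap 4, next row */
      sum >>= 7;                                      /* extp: drop Q7 fraction */
      if (sum < 0) sum = 0;                           /* vp9_ff_cropTbl clamp */
      if (sum > 255) sum = 255;
      return (uint8_t)((dst_pixel + sum + 1) >> 1);   /* addqh_r.w: average */
    }
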
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
new file mode 100644
index 0000000..148b20f
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
@@ -0,0 +1,833 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
+                                          int32_t src_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int16_t *filter_x0,
+                                          int32_t h) {
+  int32_t y;
+  uint8_t *cm = vp9_ff_cropTbl;
+  int32_t  Temp1, Temp2, Temp3, Temp4;
+  uint32_t vector4a = 64;
+  uint32_t tp1, tp2;
+  uint32_t p1, p2, p3;
+  uint32_t tn1, tn2;
+  const int16_t *filter = &filter_x0[3];
+  uint32_t      filter45;
+
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src + src_stride);
+    vp9_prefetch_load(src + src_stride + 32);
+    vp9_prefetch_store(dst + dst_stride);
+
+    __asm__ __volatile__ (
+        "ulw              %[tp1],         0(%[src])                      \n\t"
+        "ulw              %[tp2],         4(%[src])                      \n\t"
+
+        /* even 1. pixel */
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
+        "extp             %[Temp1],       $ac3,           31             \n\t"
+
+        /* even 2. pixel */
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "balign           %[tp2],         %[tp1],         3              \n\t"
+        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
+        "extp             %[Temp3],       $ac2,           31             \n\t"
+
+        "lbu              %[p2],          3(%[dst])                      \n\t"  /* load odd 2 */
+
+        /* odd 1. pixel */
+        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"  /* even 1 */
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "lbu              %[Temp1],       1(%[dst])                      \n\t"  /* load odd 1 */
+        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p3],          %[tp2]                         \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
+        "extp             %[Temp2],       $ac3,           31             \n\t"
+
+        "lbu              %[tn2],         0(%[dst])                      \n\t"  /* load even 1 */
+
+        /* odd 2. pixel */
+        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"  /* even 2 */
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"  /* odd 1 */
+        "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t"  /* average even 1 */
+        "dpa.w.ph         $ac2,           %[p3],          %[filter45]    \n\t"
+        "extp             %[Temp4],       $ac2,           31             \n\t"
+
+        "lbu              %[tp1],         2(%[dst])                      \n\t"  /* load even 2 */
+        "sb               %[tn2],         0(%[dst])                      \n\t"  /* store even 1 */
+
+        /* clamp */
+        "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t"  /* average odd 1 */
+        "lbux             %[p3],          %[Temp4](%[cm])                \n\t"  /* odd 2 */
+        "sb               %[Temp1],       1(%[dst])                      \n\t"  /* store odd 1 */
+
+        "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t"  /* average even 2 */
+        "sb               %[tp1],         2(%[dst])                      \n\t"  /* store even 2 */
+
+        "addqh_r.w        %[p2],          %[p2],          %[p3]          \n\t"  /* average odd 2 */
+        "sb               %[p2],          3(%[dst])                      \n\t"  /* store odd 2 */
+
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+    );
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
+                                          int32_t src_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int16_t *filter_x0,
+                                          int32_t h) {
+  int32_t y;
+  uint8_t *cm = vp9_ff_cropTbl;
+  uint32_t vector4a = 64;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t tp1, tp2, tp3, tp4;
+  uint32_t p1, p2, p3, p4;
+  uint32_t st0, st1;
+  const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;
+
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src + src_stride);
+    vp9_prefetch_load(src + src_stride + 32);
+    vp9_prefetch_store(dst + dst_stride);
+
+    __asm__ __volatile__ (
+        "ulw              %[tp1],         0(%[src])                      \n\t"
+        "ulw              %[tp2],         4(%[src])                      \n\t"
+
+        /* even 1. pixel */
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
+        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
+        "ulw              %[tp3],         8(%[src])                      \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
+        "extp             %[Temp1],       $ac3,           31             \n\t"
+        "lbu              %[Temp2],       0(%[dst])                      \n\t"
+        "lbu              %[tp4],         2(%[dst])                      \n\t"
+
+        /* even 2. pixel */
+        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
+        "extp             %[Temp3],       $ac2,           31             \n\t"
+
+        /* even 3. pixel */
+        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
+        "mtlo             %[vector4a],    $ac1                           \n\t"
+        "mthi             $zero,          $ac1                           \n\t"
+        "lbux             %[st1],         %[Temp3](%[cm])                \n\t"
+        "dpa.w.ph         $ac1,           %[p3],          %[filter45]    \n\t"
+        "extp             %[Temp1],       $ac1,           31             \n\t"
+
+        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
+        "addqh_r.w        %[tp4],         %[tp4],         %[st1]         \n\t"
+        "sb               %[Temp2],       0(%[dst])                      \n\t"
+        "sb               %[tp4],         2(%[dst])                      \n\t"
+
+        /* even 4. pixel */
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+
+        "balign           %[tp3],         %[tp2],         3              \n\t"
+        "balign           %[tp2],         %[tp1],         3              \n\t"
+
+        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
+        "lbu              %[Temp2],       4(%[dst])                      \n\t"
+        "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
+
+        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
+        "extp             %[Temp3],       $ac2,           31             \n\t"
+
+        /* odd 1. pixel */
+        "mtlo             %[vector4a],    $ac1                           \n\t"
+        "mthi             $zero,          $ac1                           \n\t"
+        "sb               %[Temp2],       4(%[dst])                      \n\t"
+        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
+        "preceu.ph.qbr    %[p3],          %[tp3]                         \n\t"
+        "preceu.ph.qbl    %[p4],          %[tp3]                         \n\t"
+        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
+        "extp             %[Temp2],       $ac3,           31             \n\t"
+
+        "lbu              %[tp1],         6(%[dst])                      \n\t"
+
+        /* odd 2. pixel */
+        "mtlo             %[vector4a],    $ac3                           \n\t"
+        "mthi             $zero,          $ac3                           \n\t"
+        "mtlo             %[vector4a],    $ac2                           \n\t"
+        "mthi             $zero,          $ac2                           \n\t"
+        "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
+        "dpa.w.ph         $ac1,           %[p2],          %[filter45]    \n\t"
+        "extp             %[Temp3],       $ac1,           31             \n\t"
+
+        "lbu              %[tp2],         1(%[dst])                      \n\t"
+        "lbu              %[tp3],         3(%[dst])                      \n\t"
+        "addqh_r.w        %[tp1],         %[tp1],         %[st0]         \n\t"
+
+        /* odd 3. pixel */
+        "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
+        "dpa.w.ph         $ac3,           %[p3],          %[filter45]    \n\t"
+        "addqh_r.w        %[tp2],         %[tp2],         %[st1]         \n\t"
+        "extp             %[Temp2],       $ac3,           31             \n\t"
+
+        "lbu              %[tp4],         5(%[dst])                      \n\t"
+
+        /* odd 4. pixel */
+        "sb               %[tp2],         1(%[dst])                      \n\t"
+        "sb               %[tp1],         6(%[dst])                      \n\t"
+        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
+        "extp             %[Temp1],       $ac2,           31             \n\t"
+
+        "lbu              %[tp1],         7(%[dst])                      \n\t"
+
+        /* clamp */
+        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
+        "addqh_r.w        %[tp3],         %[tp3],         %[p4]          \n\t"
+
+        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
+        "addqh_r.w        %[tp4],         %[tp4],         %[p2]          \n\t"
+
+        "lbux             %[p1],          %[Temp1](%[cm])                \n\t"
+        "addqh_r.w        %[tp1],         %[tp1],         %[p1]          \n\t"
+
+        /* store bytes */
+        "sb               %[tp3],         3(%[dst])                      \n\t"
+        "sb               %[tp4],         5(%[dst])                      \n\t"
+        "sb               %[tp1],         7(%[dst])                      \n\t"
+
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+          [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+          [st0] "=&r" (st0), [st1] "=&r" (st1),
+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+    );
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
+                                           int32_t src_stride,
+                                           uint8_t *dst_ptr,
+                                           int32_t dst_stride,
+                                           const int16_t *filter_x0,
+                                           int32_t h,
+                                           int32_t count) {
+  int32_t y, c;
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vp9_ff_cropTbl;
+  uint32_t vector_64 = 64;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2, qload3;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+  const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;
+
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    src = src_ptr;
+    dst = dst_ptr;
+
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src_ptr + src_stride);
+    vp9_prefetch_load(src_ptr + src_stride + 32);
+    vp9_prefetch_store(dst_ptr + dst_stride);
+
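+    /* Each pass of this inner loop filters one 16-pixel block; callers
+     * pass count = w / 16, so w == 32 runs it twice per row. */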
+    for (c = 0; c < count; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],    0(%[src])                    \n\t"
+          "ulw              %[qload2],    4(%[src])                    \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "ulw              %[qload3],    8(%[src])                    \n\t"
+          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
+          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "ulw              %[qload1],    12(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
+
+          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
+          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
+          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
+          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
+          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
+          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
+          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
+          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
+
+          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
+          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],    1(%[src])                   \n\t"
+          "ulw              %[qload2],    5(%[src])                    \n\t"
+
+          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
+          "ulw              %[qload3],    9(%[src])                    \n\t"
+          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
+          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
+
+          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
+          "ulw              %[qload1],    13(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
+          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
+          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
+          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
+
+          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
+          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
+
+          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
+          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
+          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
+          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
+
+          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
+
+          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
+
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
+
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
+
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
+
+          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
+          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
+          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+      );
+
+      src += 16;
+      dst += 16;
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
+                                           int32_t src_stride,
+                                           uint8_t *dst_ptr,
+                                           int32_t dst_stride,
+                                           const int16_t *filter_x0,
+                                           int32_t h) {
+  int32_t y, c;
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vp9_ff_cropTbl;
+  uint32_t vector_64 = 64;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2, qload3;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+  const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;
+
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    src = src_ptr;
+    dst = dst_ptr;
+
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src_ptr + src_stride);
+    vp9_prefetch_load(src_ptr + src_stride + 32);
+    vp9_prefetch_load(src_ptr + src_stride + 64);
+    vp9_prefetch_store(dst_ptr + dst_stride);
+    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+
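+    /* w == 64: four 16-pixel blocks per row; the body matches the
+     * 16-wide kernel above with the block count fixed at 4. */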
+    for (c = 0; c < 4; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],    0(%[src])                    \n\t"
+          "ulw              %[qload2],    4(%[src])                    \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "ulw              %[qload3],    8(%[src])                    \n\t"
+          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
+          "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "ulw              %[qload1],    12(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
+
+          "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
+          "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
+          "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
+          "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
+          "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
+          "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
+          "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
+          "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
+
+          "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
+          "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],    1(%[src])                   \n\t"
+          "ulw              %[qload2],    5(%[src])                    \n\t"
+
+          "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
+          "ulw              %[qload3],    9(%[src])                    \n\t"
+          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
+          "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
+
+          "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
+          "ulw              %[qload1],    13(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
+          "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
+          "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
+          "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
+
+          "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
+          "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
+
+          "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
+          "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
+          "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
+          "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
+
+          "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
+
+          "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
+
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
+          "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
+
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
+          "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
+
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
+          "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
+
+          "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
+          "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
+          "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+      );
+
+      src += 16;
+      dst += 16;
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
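+/* x_step_q4 == 16 means one full source pixel per output pixel (no
+ * horizontal scaling), which is the only case these DSPr2 kernels
+ * handle; any other step falls through to the generic C convolve. */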
+void vp9_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const int16_t *filter_x, int x_step_q4,
+                                   const int16_t *filter_y, int y_step_q4,
+                                   int w, int h) {
+  if (16 == x_step_q4) {
+    uint32_t pos = 38;
+
+    /* bit position for extract from acc (pos 38: a 32-bit extp returns acc >> 7) */
+    __asm__ __volatile__ (
+      "wrdsp      %[pos],     1           \n\t"
+      :
+      : [pos] "r" (pos)
+    );
+
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src);
+    vp9_prefetch_load(src + 32);
+    vp9_prefetch_store(dst);
+
+    switch (w) {
+      case 4:
+        convolve_bi_avg_horiz_4_dspr2(src, src_stride,
+                                      dst, dst_stride,
+                                      filter_x, h);
+        break;
+      case 8:
+        convolve_bi_avg_horiz_8_dspr2(src, src_stride,
+                                      dst, dst_stride,
+                                      filter_x, h);
+        break;
+      case 16:
+        convolve_bi_avg_horiz_16_dspr2(src, src_stride,
+                                       dst, dst_stride,
+                                       filter_x, h, 1);
+        break;
+      case 32:
+        convolve_bi_avg_horiz_16_dspr2(src, src_stride,
+                                       dst, dst_stride,
+                                       filter_x, h, 2);
+        break;
+      case 64:
+        vp9_prefetch_load(src + 64);
+        vp9_prefetch_store(dst + 32);
+
+        convolve_bi_avg_horiz_64_dspr2(src, src_stride,
+                                       dst, dst_stride,
+                                       filter_x, h);
+        break;
+      default:
+        vp9_convolve8_avg_horiz_c(src, src_stride,
+                                  dst, dst_stride,
+                                  filter_x, x_step_q4,
+                                  filter_y, y_step_q4,
+                                  w, h);
+        break;
+    }
+  } else {
+    vp9_convolve8_avg_horiz_c(src, src_stride,
+                              dst, dst_stride,
+                              filter_x, x_step_q4,
+                              filter_y, y_step_q4,
+                              w, h);
+  }
+}
+#endif
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
new file mode 100644
index 0000000..bc422bc
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
@@ -0,0 +1,784 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
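+/* The "transposed" helpers below store each horizontally filtered row
+ * down a destination column (dst advances by dst_stride per output
+ * pixel and by one byte per input row), i.e. the output is written
+ * transposed, presumably so a second horizontal pass can stand in for
+ * the vertical filter. */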
+static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src,

+                                                 int32_t src_stride,

+                                                 uint8_t *dst,

+                                                 int32_t dst_stride,

+                                                 const int16_t *filter_x0,

+                                                 int32_t h) {

+  int32_t       y;

+  uint8_t       *cm = vp9_ff_cropTbl;

+  uint8_t       *dst_ptr;

+  int32_t       Temp1, Temp2;

+  uint32_t      vector4a = 64;

+  uint32_t      tp1, tp2;

+  uint32_t      p1, p2;

+  const int16_t *filter = &filter_x0[3];

+  uint32_t      filter45;

+

+  filter45 = ((const int32_t *)filter)[0];

+

+  for (y = h; y--;) {

+    dst_ptr = dst;

+    /* prefetch data to cache memory */

+    vp9_prefetch_load(src + src_stride);

+    vp9_prefetch_load(src + src_stride + 32);

+

+    __asm__ __volatile__ (

+        "ulw              %[tp1],         0(%[src])                      \n\t"

+        "ulw              %[tp2],         4(%[src])                      \n\t"

+

+        /* even 1. pixel */

+        "mtlo             %[vector4a],    $ac3                           \n\t"

+        "mthi             $zero,          $ac3                           \n\t"

+        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"

+        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"

+        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"

+        "extp             %[Temp1],       $ac3,           31             \n\t"

+

+        /* even 2. pixel */

+        "mtlo             %[vector4a],    $ac2                           \n\t"

+        "mthi             $zero,          $ac2                           \n\t"

+        "balign           %[tp2],         %[tp1],         3              \n\t"

+        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"

+        "extp             %[Temp2],       $ac2,           31             \n\t"

+

+        /* odd 1. pixel */

+        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"

+        "mtlo             %[vector4a],    $ac3                           \n\t"

+        "mthi             $zero,          $ac3                           \n\t"

+        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"

+        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"

+        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"

+        "extp             %[Temp1],       $ac3,           31             \n\t"

+

+        /* odd 2. pixel */

+        "lbux             %[tp2],         %[Temp2](%[cm])                \n\t"

+        "mtlo             %[vector4a],    $ac2                           \n\t"

+        "mthi             $zero,          $ac2                           \n\t"

+        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"

+        "extp             %[Temp2],       $ac2,           31             \n\t"

+

+        /* clamp */

+        "lbux             %[p1],          %[Temp1](%[cm])                \n\t"

+        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"

+

+        /* store bytes */

+        "sb               %[tp1],         0(%[dst_ptr])                  \n\t"

+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"

+

+        "sb               %[p1],          0(%[dst_ptr])                  \n\t"

+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"

+

+        "sb               %[tp2],         0(%[dst_ptr])                  \n\t"

+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"

+

+        "sb               %[p2],          0(%[dst_ptr])                  \n\t"

+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"

+

+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),

+          [p1] "=&r" (p1), [p2] "=&r" (p2),

+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),

+          [dst_ptr] "+r" (dst_ptr)

+        : [filter45] "r" (filter45),[vector4a] "r" (vector4a),

+          [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)

+    );

+

+    /* Next row... */

+    src += src_stride;

+    dst += 1;

+  }

+}
+
+static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src,

+                                                 int32_t src_stride,

+                                                 uint8_t *dst,

+                                                 int32_t dst_stride,

+                                                 const int16_t *filter_x0,

+                                                 int32_t h) {

+  int32_t y;

+  uint8_t *cm = vp9_ff_cropTbl;

+  uint8_t *dst_ptr;

+  uint32_t vector4a = 64;

+  int32_t Temp1, Temp2, Temp3;

+  uint32_t tp1, tp2, tp3;

+  uint32_t p1, p2, p3, p4;

+  uint8_t *odd_dst;

+  uint32_t dst_pitch_2 = (dst_stride << 1);

+  const int16_t *filter = &filter_x0[3];

+  uint32_t      filter45;

+

+  filter45 = ((const int32_t *)filter)[0];

+

+  for (y = h; y--;) {

+    /* prefetch data to cache memory */

+    vp9_prefetch_load(src + src_stride);

+    vp9_prefetch_load(src + src_stride + 32);

+

+    dst_ptr = dst;

+    odd_dst = (dst_ptr + dst_stride);

+

+    __asm__ __volatile__ (

+        "ulw              %[tp1],         0(%[src])                       \n\t"

+        "ulw              %[tp2],         4(%[src])                       \n\t"

+

+        /* even 1. pixel */

+        "mtlo             %[vector4a],    $ac3                            \n\t"

+        "mthi             $zero,          $ac3                            \n\t"

+        "mtlo             %[vector4a],    $ac2                            \n\t"

+        "mthi             $zero,          $ac2                            \n\t"

+        "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"

+        "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"

+        "preceu.ph.qbr    %[p3],          %[tp2]                          \n\t"

+        "preceu.ph.qbl    %[p4],          %[tp2]                          \n\t"

+        "ulw              %[tp3],         8(%[src])                       \n\t"

+        "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"

+        "extp             %[Temp1],       $ac3,           31              \n\t"

+

+        /* even 2. pixel */

+        "dpa.w.ph         $ac2,           %[p2],          %[filter45]     \n\t"

+        "extp             %[Temp3],       $ac2,           31              \n\t"

+

+        /* even 3. pixel */

+        "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"

+        "mtlo             %[vector4a],    $ac1                            \n\t"

+        "mthi             $zero,          $ac1                            \n\t"

+        "balign           %[tp3],         %[tp2],         3              \n\t"

+        "balign           %[tp2],         %[tp1],         3              \n\t"

+        "dpa.w.ph         $ac1,           %[p3],          %[filter45]     \n\t"

+        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"

+        "extp             %[p3],          $ac1,           31              \n\t"

+

+        /* even 4. pixel */

+        "mtlo             %[vector4a],    $ac2                            \n\t"

+        "mthi             $zero,          $ac2                            \n\t"

+        "mtlo             %[vector4a],    $ac3                            \n\t"

+        "mthi             $zero,          $ac3                            \n\t"

+        "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"

+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"

+        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"

+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"

+

+        "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"

+        "extp             %[Temp3],       $ac2,           31              \n\t"

+

+        "lbux             %[Temp1],         %[p3](%[cm])                    \n\t"

+

+        /* odd 1. pixel */

+        "mtlo             %[vector4a],    $ac1                            \n\t"

+        "mthi             $zero,          $ac1                            \n\t"

+        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"

+        "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"

+        "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"

+        "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"

+        "sb               %[Temp1],       0(%[dst_ptr])                   \n\t"

+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"

+

+        "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"

+        "extp             %[Temp2],       $ac3,           31              \n\t"

+

+        /* odd 2. pixel */

+        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"

+        "mtlo             %[vector4a],    $ac3                            \n\t"

+        "mthi             $zero,          $ac3                            \n\t"

+        "mtlo             %[vector4a],    $ac2                            \n\t"

+        "mthi             $zero,          $ac2                            \n\t"

+        "dpa.w.ph         $ac1,           %[p2],          %[filter45]     \n\t"

+        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"

+        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"

+        "extp             %[Temp3],       $ac1,           31              \n\t"

+

+        /* odd 3. pixel */

+        "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"

+        "dpa.w.ph         $ac3,           %[p3],          %[filter45]     \n\t"

+        "extp             %[Temp2],       $ac3,           31              \n\t"

+

+        /* odd 4. pixel */

+        "sb               %[tp3],         0(%[odd_dst])                   \n\t"

+        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"

+        "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"

+        "extp             %[Temp1],       $ac2,           31              \n\t"

+

+        /* clamp */

+        "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"

+        "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"

+        "lbux             %[p1],          %[Temp1](%[cm])                 \n\t"

+

+        /* store bytes */

+        "sb               %[p4],          0(%[odd_dst])                   \n\t"

+        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"

+

+        "sb               %[p2],          0(%[odd_dst])                   \n\t"

+        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"

+

+        "sb               %[p1],          0(%[odd_dst])                   \n\t"

+

+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),

+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),

+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),

+          [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)

+        : [filter45] "r" (filter45),[vector4a] "r" (vector4a), [cm] "r" (cm),

+          [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)

+    );

+

+    /* Next row... */

+    src += src_stride;

+    dst += 1;

+  }

+}
+
+static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst_ptr,
+                                                  int32_t dst_stride,
+                                                  const int16_t *filter_x0,
+                                                  int32_t h,
+                                                  int32_t count) {
+  int32_t       c, y;
+  const uint8_t *src;
+  uint8_t       *dst;
+  uint8_t       *cm = vp9_ff_cropTbl;
+  uint32_t      vector_64 = 64;
+  int32_t       Temp1, Temp2, Temp3;
+  uint32_t      qload1, qload2;
+  uint32_t      p1, p2, p3, p4, p5;
+  uint32_t      st1, st2, st3;
+  uint32_t      dst_pitch_2 = (dst_stride << 1);
+  uint8_t       *odd_dst;
+  const int16_t *filter = &filter_x0[3];
+  uint32_t      filter45;
+
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src_ptr + src_stride);
+    vp9_prefetch_load(src_ptr + src_stride + 32);
+
+    src = src_ptr;
+    dst = dst_ptr;
+
+    odd_dst = (dst + dst_stride);
+
+    for (c = 0; c < count; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],        0(%[src])                       \n\t"
+          "ulw              %[qload2],        4(%[src])                       \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
+          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
+          "ulw              %[qload1],        8(%[src])                       \n\t"
+          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* even 1 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p5],            %[qload1]                       \n\t"
+          "ulw              %[qload2],        12(%[src])                      \n\t"
+          "dpa.w.ph         $ac2,             %[p2],          %[filter45]     \n\t" /* even 1 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p2],            %[qload2]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac3,             %[p3],          %[filter45]     \n\t" /* even 3 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbl    %[p3],            %[qload2]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p4],          %[filter45]     \n\t" /* even 4 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p1],          %[filter45]     \n\t" /* even 5 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload1],        20(%[src])                      \n\t"
+          "dpa.w.ph         $ac3,             %[p5],          %[filter45]     \n\t" /* even 6 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* even 7 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* even 8 */
+          "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],        1(%[src])                       \n\t"
+          "ulw              %[qload2],        5(%[src])                       \n\t"
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
+          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload2],        9(%[src])                       \n\t"
+          "dpa.w.ph         $ac3,             %[p1],          %[filter45]     \n\t" /* odd 1 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
+          "ulw              %[qload1],        13(%[src])                      \n\t"
+          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* odd 2 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* odd 3 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac3,             %[p4],          %[filter45]     \n\t" /* odd 4 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* odd 5 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "ulw              %[qload1],        21(%[src])                      \n\t"
+          "dpa.w.ph         $ac2,             %[p5],          %[filter45]     \n\t" /* odd 6 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac3,             %[p2],          %[filter45]     \n\t" /* odd 7 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter45]     \n\t" /* odd 8 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
+
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
+
+          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+            [cm] "r" (cm),
+            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+      );
+
+      src += 16;
+      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+      odd_dst = (dst + dst_stride);
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += 1;
+  }
+}
+
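For reference, the transposed kernels in this file write each filtered input
row into a destination column: even results go out through dst and odd results
through odd_dst, both stepping by dst_pitch_2 (2 * dst_stride), and after every
16 outputs dst jumps to dst_ptr + ((c + 1) * 16 * dst_stride). A minimal scalar
sketch of that store pattern (store_row_transposed is a hypothetical helper,
not part of this patch):

/* Scalar model of the transposed store: 16 results computed from one input
 * row land in one destination column, interleaved through two pointers. */
static void store_row_transposed(const uint8_t *out16, uint8_t *dst,
                                 int dst_stride) {
  uint8_t *even = dst;               /* column rows 0, 2, 4, ... */
  uint8_t *odd  = dst + dst_stride;  /* column rows 1, 3, 5, ... */
  int i;

  for (i = 0; i < 16; i += 2) {
    *even = out16[i];
    even += 2 * dst_stride;          /* dst_pitch_2 in the assembly */
    *odd = out16[i + 1];
    odd += 2 * dst_stride;
  }
}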
+static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
+                                                  int32_t src_stride,
+                                                  uint8_t *dst_ptr,
+                                                  int32_t dst_stride,
+                                                  const int16_t *filter_x0,
+                                                  int32_t h) {
+  int32_t       c, y;
+  const uint8_t *src;
+  uint8_t       *dst;
+  uint8_t       *cm = vp9_ff_cropTbl;
+  uint32_t      vector_64 = 64;
+  int32_t       Temp1, Temp2, Temp3;
+  uint32_t      qload1, qload2;
+  uint32_t      p1, p2, p3, p4, p5;
+  uint32_t      st1, st2, st3;
+  uint32_t      dst_pitch_2 = (dst_stride << 1);
+  uint8_t       *odd_dst;
+  const int16_t *filter = &filter_x0[3];
+  uint32_t      filter45;
+
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src_ptr + src_stride);
+    vp9_prefetch_load(src_ptr + src_stride + 32);
+    vp9_prefetch_load(src_ptr + src_stride + 64);
+
+    src = src_ptr;
+    dst = dst_ptr;
+
+    odd_dst = (dst + dst_stride);
+
+    for (c = 0; c < 4; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],        0(%[src])                       \n\t"
+          "ulw              %[qload2],        4(%[src])                       \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
+          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
+          "ulw              %[qload1],        8(%[src])                       \n\t"
+          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* even 1 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p5],            %[qload1]                       \n\t"
+          "ulw              %[qload2],        12(%[src])                      \n\t"
+          "dpa.w.ph         $ac2,             %[p2],          %[filter45]     \n\t" /* even 1 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p2],            %[qload2]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac3,             %[p3],          %[filter45]     \n\t" /* even 3 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbl    %[p3],            %[qload2]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p4],          %[filter45]     \n\t" /* even 4 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p1],          %[filter45]     \n\t" /* even 5 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload1],        20(%[src])                      \n\t"
+          "dpa.w.ph         $ac3,             %[p5],          %[filter45]     \n\t" /* even 6 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* even 7 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* even 8 */
+          "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],        1(%[src])                       \n\t"
+          "ulw              %[qload2],        5(%[src])                       \n\t"
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
+          "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
+          "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
+          "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
+          "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
+          "ulw              %[qload2],        9(%[src])                       \n\t"
+          "dpa.w.ph         $ac3,             %[p1],          %[filter45]     \n\t" /* odd 1 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
+          "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
+          "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
+          "ulw              %[qload1],        13(%[src])                      \n\t"
+          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* odd 2 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* odd 3 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac3,             %[p4],          %[filter45]     \n\t" /* odd 4 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
+          "mthi             $zero,            $ac2                            \n\t"
+          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* odd 5 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
+          "mthi             $zero,            $ac3                            \n\t"
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "ulw              %[qload1],        21(%[src])                      \n\t"
+          "dpa.w.ph         $ac2,             %[p5],          %[filter45]     \n\t" /* odd 6 */
+          "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
+          "mthi             $zero,            $ac1                            \n\t"
+          "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+          "dpa.w.ph         $ac3,             %[p2],          %[filter45]     \n\t" /* odd 7 */
+          "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,             %[p3],          %[filter45]     \n\t" /* odd 8 */
+          "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
+
+          "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
+          "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
+          "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
+
+          "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+          "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
+          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
+
+          "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+            [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+            [cm] "r" (cm),
+            [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+      );
+
+      src += 16;
+      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+      odd_dst = (dst + dst_stride);
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += 1;
+  }
+}
+
+void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter, int w, int h) {
+  int x, y;
+
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      int sum = 0;
+
+      sum += src[x] * filter[3];
+      sum += src[x + 1] * filter[4];
+
+      dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+
+    src += src_stride;
+    dst += 1;
+  }
+}
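The plain C fallback above makes the arithmetic of all the DSPr2 paths
explicit: in the bilinear case only taps filter[3] and filter[4] of the 8-tap
array are non-zero. The assembly kernels exploit this by loading both taps as
one 32-bit word (filter45 = ((const int32_t *)filter)[0]) so that a single
dpa.w.ph accumulates both products for a pixel. A scalar model of one such
step (dpa_w_ph_model is a hypothetical helper; it assumes the little-endian
lane order implied by the preceu.ph.qbr/qbl unpacking):

/* Model of "dpa.w.ph $ac, p, filter45": two 16x16 products summed into the
 * 64-bit accumulator; p holds two zero-extended source bytes, one per lane. */
static int64_t dpa_w_ph_model(int64_t acc, uint32_t p, uint32_t filter45) {
  int16_t f3 = (int16_t)(filter45 & 0xffff);  /* filter[3], low halfword */
  int16_t f4 = (int16_t)(filter45 >> 16);     /* filter[4], high halfword */
  int32_t p_lo = (int32_t)(p & 0xffff);       /* src[x], from preceu.ph.qbr */
  int32_t p_hi = (int32_t)(p >> 16);          /* src[x + 1] */

  return acc + p_lo * f3 + p_hi * f4;         /* src[x]*f3 + src[x+1]*f4 */
}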
+
+void vp9_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter,
+                         int w, int h) {
+  uint32_t pos = 38;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  /* prefetch data to cache memory */
+  vp9_prefetch_load(src);
+  vp9_prefetch_load(src + 32);
+
+  switch (w) {
+    case 4:
+      convolve_bi_horiz_4_transposed_dspr2(src, src_stride,
+                                           dst, dst_stride,
+                                           filter, h);
+      break;
+    case 8:
+      convolve_bi_horiz_8_transposed_dspr2(src, src_stride,
+                                           dst, dst_stride,
+                                           filter, h);
+      break;
+    case 16:
+    case 32:
+      convolve_bi_horiz_16_transposed_dspr2(src, src_stride,
+                                            dst, dst_stride,
+                                            filter, h,
+                                            (w/16));
+      break;
+    case 64:
+      vp9_prefetch_load(src + 32);
+      convolve_bi_horiz_64_transposed_dspr2(src, src_stride,
+                                            dst, dst_stride,
+                                            filter, h);
+      break;
+    default:
+      convolve_bi_horiz_transposed(src, src_stride,
+                                   dst, dst_stride,
+                                   filter, w, h);
+      break;
+  }
+}
+#endif
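vp9_convolve2_dspr2 programs the DSP control register once (wrdsp with
pos = 38) so that every "extp ..., 31" in the kernels extracts accumulator
bits 7..38, in effect a shift right by FILTER_BITS (7), while the mtlo of 64
preloads the rounding term. A scalar model of the whole mtlo/dpa.w.ph/extp
pipeline for one pixel (bi_filter_pixel_model is a hypothetical helper
written under those assumptions):

/* One bilinear output pixel, as the assembly computes it. */
static uint8_t bi_filter_pixel_model(const uint8_t *src,
                                     const int16_t *filter) {
  int32_t acc = 64;                                /* mtlo %[vector_64] */

  acc += src[0] * filter[3] + src[1] * filter[4];  /* dpa.w.ph */
  acc >>= 7;                                       /* extp ..., 31 (pos 38) */

  /* clamp to [0, 255]; the assembly uses a cropTbl lookup via lbux */
  if (acc < 0) acc = 0;
  if (acc > 255) acc = 255;
  return (uint8_t)acc;
}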
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
new file mode 100644
index 0000000..1debdb4
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
@@ -0,0 +1,713 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int16_t *filter_x0,
+                                      int32_t h) {
+  int32_t y;
+  uint8_t *cm = vp9_ff_cropTbl;
+  int32_t Temp1, Temp2, Temp3, Temp4;
+  uint32_t vector4a = 64;
+  uint32_t tp1, tp2;
+  uint32_t p1, p2;
+  const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;
+
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src + src_stride);
+    vp9_prefetch_load(src + src_stride + 32);
+    vp9_prefetch_store(dst + dst_stride);
+
+    __asm__ __volatile__ (
+        "ulw              %[tp1],      0(%[src])                      \n\t"
+        "ulw              %[tp2],      4(%[src])                      \n\t"
+
+        /* even 1. pixel */
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
+        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
+        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
+        "extp             %[Temp1],    $ac3,           31             \n\t"
+
+        /* even 2. pixel */
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "balign           %[tp2],      %[tp1],         3              \n\t"
+        "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
+        "extp             %[Temp3],    $ac2,           31             \n\t"
+
+        /* odd 1. pixel */
+        "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
+        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
+        "extp             %[Temp2],    $ac3,           31             \n\t"
+
+        /* odd 2. pixel */
+        "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
+        "extp             %[Temp4],    $ac2,           31             \n\t"
+
+        /* clamp */
+        "lbux             %[p1],       %[Temp2](%[cm])                \n\t"
+        "lbux             %[p2],       %[Temp4](%[cm])                \n\t"
+
+        /* store bytes */
+        "sb               %[tp1],      0(%[dst])                      \n\t"
+        "sb               %[p1],       1(%[dst])                      \n\t"
+        "sb               %[tp2],      2(%[dst])                      \n\t"
+        "sb               %[p2],       3(%[dst])                      \n\t"
+
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+          [p1] "=&r" (p1), [p2] "=&r" (p2),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+    );
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
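The "lbux %[st], %[Temp](%[cm])" loads in these kernels are unsigned byte
loads from cm + Temp, so the clamp to [0, 255] costs one load instead of
compares and branches. A sketch of how such a saturation table can be laid
out (CROP_WIDTH and init_crop_tbl here are illustrative assumptions;
vp9_ff_cropTbl itself is set up elsewhere in the dspr2 code):

/* A clamping table: indexing it with any value in [-CROP_WIDTH,
 * 256 + CROP_WIDTH) returns that value clipped to [0, 255]. */
#define CROP_WIDTH 512  /* assumed margin */

static uint8_t crop_tbl_storage[CROP_WIDTH + 256 + CROP_WIDTH];
static uint8_t *cm_model = &crop_tbl_storage[CROP_WIDTH];

static void init_crop_tbl(void) {
  int i;
  for (i = 0; i < CROP_WIDTH; i++)
    crop_tbl_storage[i] = 0;                        /* negative inputs -> 0 */
  for (i = 0; i < 256; i++)
    crop_tbl_storage[CROP_WIDTH + i] = (uint8_t)i;  /* in-range inputs */
  for (i = 0; i < CROP_WIDTH; i++)
    crop_tbl_storage[CROP_WIDTH + 256 + i] = 255;   /* overflow -> 255 */
}

/* usage: st = cm_model[Temp];  (same effect as the lbux instruction) */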
+
+static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int16_t *filter_x0,
+                                      int32_t h) {
+  int32_t y;
+  uint8_t *cm = vp9_ff_cropTbl;
+  uint32_t vector4a = 64;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t tp1, tp2, tp3;
+  uint32_t p1, p2, p3, p4;
+  uint32_t st0, st1;
+  const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;
+
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src + src_stride);
+    vp9_prefetch_load(src + src_stride + 32);
+    vp9_prefetch_store(dst + dst_stride);
+
+    __asm__ __volatile__ (
+        "ulw              %[tp1],      0(%[src])                      \n\t"
+        "ulw              %[tp2],      4(%[src])                      \n\t"
+
+        /* even 1. pixel */
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
+        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
+        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
+        "ulw              %[tp3],      8(%[src])                      \n\t"
+        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
+        "extp             %[Temp1],    $ac3,           31             \n\t"
+
+        /* even 2. pixel */
+        "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
+        "extp             %[Temp3],    $ac2,           31             \n\t"
+
+        /* even 3. pixel */
+        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
+        "mtlo             %[vector4a], $ac1                           \n\t"
+        "mthi             $zero,       $ac1                           \n\t"
+        "dpa.w.ph         $ac1,        %[p3],          %[filter45]    \n\t"
+        "extp             %[Temp1],    $ac1,           31             \n\t"
+
+        /* even 4. pixel */
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "sb               %[st0],      0(%[dst])                      \n\t"
+        "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
+
+        "balign           %[tp3],      %[tp2],         3              \n\t"
+        "balign           %[tp2],      %[tp1],         3              \n\t"
+
+        "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
+        "extp             %[Temp3],    $ac2,           31             \n\t"
+
+        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
+
+        /* odd 1. pixel */
+        "mtlo             %[vector4a], $ac1                           \n\t"
+        "mthi             $zero,       $ac1                           \n\t"
+        "sb               %[st1],      2(%[dst])                      \n\t"
+        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
+        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
+        "preceu.ph.qbr    %[p3],       %[tp3]                         \n\t"
+        "preceu.ph.qbl    %[p4],       %[tp3]                         \n\t"
+        "sb               %[st0],      4(%[dst])                      \n\t"
+        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
+        "extp             %[Temp2],    $ac3,           31             \n\t"
+
+        /* odd 2. pixel */
+        "mtlo             %[vector4a], $ac3                           \n\t"
+        "mthi             $zero,       $ac3                           \n\t"
+        "mtlo             %[vector4a], $ac2                           \n\t"
+        "mthi             $zero,       $ac2                           \n\t"
+        "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
+        "dpa.w.ph         $ac1,        %[p2],          %[filter45]    \n\t"
+        "extp             %[Temp3],    $ac1,           31             \n\t"
+
+        /* odd 3. pixel */
+        "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
+        "dpa.w.ph         $ac3,        %[p3],          %[filter45]    \n\t"
+        "extp             %[Temp2],    $ac3,           31             \n\t"
+
+        /* odd 4. pixel */
+        "sb               %[st1],      1(%[dst])                      \n\t"
+        "sb               %[st0],      6(%[dst])                      \n\t"
+        "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
+        "extp             %[Temp1],    $ac2,           31             \n\t"
+
+        /* clamp */
+        "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
+        "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
+        "lbux             %[p1],       %[Temp1](%[cm])                \n\t"
+
+        /* store bytes */
+        "sb               %[p4],       3(%[dst])                      \n\t"
+        "sb               %[p2],       5(%[dst])                      \n\t"
+        "sb               %[p1],       7(%[dst])                      \n\t"
+
+        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
+          [st0] "=&r" (st0), [st1] "=&r" (st1),
+          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+    );
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
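convolve_bi_horiz_8_dspr2 avoids extra unaligned loads for the odd-phase
pixels: "balign %[tp2], %[tp1], 3" splices the word at src + 1 out of two
words that are already in registers. A scalar model, assuming a little-endian
build (balign_3_model is a hypothetical helper):

/* balign rt, rs, 3 on little-endian: keep the low byte of rt as the new top
 * byte and shift rs down one byte, yielding the word one byte ahead. */
static uint32_t balign_3_model(uint32_t rt_next, uint32_t rs_prev) {
  return (rt_next << 24) | (rs_prev >> 8);
}

/* e.g. rs_prev = word at src + 0, rt_next = word at src + 4
 *      -> result = word at src + 1 */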
+
+static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
+                                       int32_t src_stride,
+                                       uint8_t *dst_ptr,
+                                       int32_t dst_stride,
+                                       const int16_t *filter_x0,
+                                       int32_t h,
+                                       int32_t count) {
+  int32_t y, c;
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vp9_ff_cropTbl;
+  uint32_t vector_64 = 64;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2, qload3;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+  const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;
+
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    src = src_ptr;
+    dst = dst_ptr;
+
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src_ptr + src_stride);
+    vp9_prefetch_load(src_ptr + src_stride + 32);
+    vp9_prefetch_store(dst_ptr + dst_stride);
+
+    for (c = 0; c < count; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],    0(%[src])                    \n\t"
+          "ulw              %[qload2],    4(%[src])                    \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "ulw              %[qload3],    8(%[src])                    \n\t"
+          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "ulw              %[qload1],    12(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
+          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],    1(%[src])                    \n\t"
+          "ulw              %[qload2],    5(%[src])                    \n\t"
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
+          "ulw              %[qload3],    9(%[src])                    \n\t"
+          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
+          "ulw              %[qload1],    13(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
+
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
+
+          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
+          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
+          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [p5] "=&r" (p5),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+      );
+
+      src += 16;
+      dst += 16;
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
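For readers following the assembly: each pass of the inner loop above emits 16 output pixels of a 2-tap filter. A scalar sketch of the same computation — illustrative only, not part of the patch, with clip_pixel standing in for the lbux clamp through vp9_ff_cropTbl:

    /* Sketch: scalar equivalent of the bilinear horizontal pass. */
    static uint8_t clip_pixel(int v) {
      return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    static void bi_horiz_ref(const uint8_t *src, int src_stride,
                             uint8_t *dst, int dst_stride,
                             const int16_t *filter_x, int w, int h) {
      int x, y;
      for (y = 0; y < h; ++y) {
        for (x = 0; x < w; ++x) {
          /* 64 is the FILTER_BITS == 7 rounding constant that the asm
             preloads into each accumulator via mtlo; filter45 packs
             filter_x[3] and filter_x[4], the only nonzero bilinear taps. */
          int sum = 64 + src[x] * filter_x[3] + src[x + 1] * filter_x[4];
          dst[x] = clip_pixel(sum >> 7);
        }
        src += src_stride;
        dst += dst_stride;
      }
    }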
+static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
+                                       int32_t src_stride,
+                                       uint8_t *dst_ptr,
+                                       int32_t dst_stride,
+                                       const int16_t *filter_x0,
+                                       int32_t h) {
+  int32_t y, c;
+  const uint8_t *src;
+  uint8_t *dst;
+  uint8_t *cm = vp9_ff_cropTbl;
+  uint32_t vector_64 = 64;
+  int32_t Temp1, Temp2, Temp3;
+  uint32_t qload1, qload2, qload3;
+  uint32_t p1, p2, p3, p4, p5;
+  uint32_t st1, st2, st3;
+  const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;
+
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    src = src_ptr;
+    dst = dst_ptr;
+
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src_ptr + src_stride);
+    vp9_prefetch_load(src_ptr + src_stride + 32);
+    vp9_prefetch_load(src_ptr + src_stride + 64);
+    vp9_prefetch_store(dst_ptr + dst_stride);
+    vp9_prefetch_store(dst_ptr + dst_stride + 32);
+
+    for (c = 0; c < 4; c++) {
+      __asm__ __volatile__ (
+          "ulw              %[qload1],    0(%[src])                    \n\t"
+          "ulw              %[qload2],    4(%[src])                    \n\t"
+
+          /* even 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "ulw              %[qload3],    8(%[src])                    \n\t"
+          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
+
+          /* even 2. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "ulw              %[qload1],    12(%[src])                   \n\t"
+          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
+
+          /* even 3. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
+          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
+
+          /* even 4. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
+          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
+
+          /* even 5. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
+          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
+
+          /* even 6. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
+          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
+
+          /* even 7. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
+
+          /* even 8. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
+          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
+
+          /* ODD pixels */
+          "ulw              %[qload1],    1(%[src])                    \n\t"
+          "ulw              %[qload2],    5(%[src])                    \n\t"
+
+          /* odd 1. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
+          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
+          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
+          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
+          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
+          "ulw              %[qload3],    9(%[src])                    \n\t"
+          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
+
+          /* odd 2. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
+          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
+          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
+          "ulw              %[qload1],    13(%[src])                   \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
+
+          /* odd 3. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
+          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
+          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
+
+          /* odd 4. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
+          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
+          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
+
+          /* odd 5. pixel */
+          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
+          "mthi             $zero,        $ac2                         \n\t"
+          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
+          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
+
+          /* odd 6. pixel */
+          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
+          "mthi             $zero,        $ac3                         \n\t"
+          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
+          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
+          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
+
+          /* odd 7. pixel */
+          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
+          "mthi             $zero,        $ac1                         \n\t"
+          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
+          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
+          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
+
+          /* odd 8. pixel */
+          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
+          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
+
+          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
+          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
+          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
+
+          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
+          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
+          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
+
+          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+            [p5] "=&r" (p5),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+      );
+
+      src += 16;
+      dst += 16;
+    }
+
+    /* Next row... */
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
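The 64-wide variant repeats the identical 16-pixel body four times per row. In both variants the three DSP accumulators are rotated so that, at any step, one pixel is being started (mtlo/mthi seed the rounding constant), a second is being accumulated (dpa.w.ph), and a third is being finished (extp, then lbux, then sb). A conceptual sketch of the schedule, not literal code:

    /* step k (conceptual): start acc[(k+2)%3], accumulate acc[(k+1)%3],
       extract/clamp/store from acc[k%3]; even output pixels come from
       loads at src+0, odd pixels from re-reads at src+1, stored to
       even/odd dst offsets respectively. */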
+void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const int16_t *filter_x, int x_step_q4,
+                               const int16_t *filter_y, int y_step_q4,
+                               int w, int h) {
+  if (16 == x_step_q4) {
+    uint32_t pos = 38;
+
+    vp9_prefetch_load((const uint8_t *)filter_x);
+
+    /* bit position for extract from acc */
+    __asm__ __volatile__ (
+      "wrdsp      %[pos],     1           \n\t"
+      :
+      : [pos] "r" (pos)
+    );
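+    /* Explanatory note (an assumption about MIPS DSP ASE semantics, not
+     * part of the original patch): with the DSPControl pos field set to
+     * 38, "extp %[dst], $acN, 31" in the filter loops extracts bits 38..7
+     * of the accumulator, i.e. dst = (int32_t)(acc >> 7). Since each
+     * accumulator is preseeded with 64, every sample becomes
+     * (64 + sum_of_taps) >> 7 -- the usual FILTER_BITS == 7 rounding --
+     * before the lbux clamp through the crop table. */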
+
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src);
+    vp9_prefetch_load(src + 32);
+    vp9_prefetch_store(dst);
+
+    switch (w) {
+      case 4:
+        convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride,
+                                  dst, (int32_t)dst_stride,
+                                  filter_x, (int32_t)h);
+        break;
+      case 8:
+        convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride,
+                                  dst, (int32_t)dst_stride,
+                                  filter_x, (int32_t)h);
+        break;
+      case 16:
+        convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
+                                   dst, (int32_t)dst_stride,
+                                   filter_x, (int32_t)h, 1);
+        break;
+      case 32:
+        convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
+                                   dst, (int32_t)dst_stride,
+                                   filter_x, (int32_t)h, 2);
+        break;
+      case 64:
+        vp9_prefetch_load(src + 64);
+        vp9_prefetch_store(dst + 32);
+
+        convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride,
+                                   dst, (int32_t)dst_stride,
+                                   filter_x, (int32_t)h);
+        break;
+      default:
+        vp9_convolve8_horiz_c(src, src_stride,
+                              dst, dst_stride,
+                              filter_x, x_step_q4,
+                              filter_y, y_step_q4,
+                              w, h);
+        break;
+    }
+  } else {
+    vp9_convolve8_horiz_c(src, src_stride,
+                          dst, dst_stride,
+                          filter_x, x_step_q4,
+                          filter_y, y_step_q4,
+                          w, h);
+  }
+}
+#endif
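A note on the 16 == x_step_q4 guard above (added for context, not part of the patch): filter steps are Q4 fixed point, so a step of 16 advances exactly one source pixel per output pixel — the unscaled case these DSPr2 kernels handle; any other step (scaling) falls through to vp9_convolve8_horiz_c. Roughly:

    /* Q4 step convention: source x used for output pixel n */
    src_x = (n * x_step_q4) >> 4;    /* x_step_q4 == 16  =>  src_x == n */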
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
new file mode 100644
index 0000000..8eb105c
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
@@ -0,0 +1,266 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_vert_4_dspr2(const uint8_t *src,
+                                     int32_t src_stride,
+                                     uint8_t *dst,
+                                     int32_t dst_stride,
+                                     const int16_t *filter_y,
+                                     int32_t w,
+                                     int32_t h) {
+  int32_t       x, y;
+  const uint8_t *src_ptr;
+  uint8_t       *dst_ptr;
+  uint8_t       *cm = vp9_ff_cropTbl;
+  uint32_t      vector4a = 64;
+  uint32_t      load1, load2;
+  uint32_t      p1, p2;
+  uint32_t      scratch1;
+  uint32_t      store1, store2;
+  int32_t       Temp1, Temp2;
+  const int16_t *filter = &filter_y[3];
+  uint32_t      filter45;
+
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_store(dst + dst_stride);
+
+    for (x = 0; x < w; x += 4) {
+      src_ptr = src + x;
+      dst_ptr = dst + x;
+
+      __asm__ __volatile__ (
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+
+          "mtlo             %[vector4a],  $ac0                            \n\t"
+          "mtlo             %[vector4a],  $ac1                            \n\t"
+          "mtlo             %[vector4a],  $ac2                            \n\t"
+          "mtlo             %[vector4a],  $ac3                            \n\t"
+          "mthi             $zero,        $ac0                            \n\t"
+          "mthi             $zero,        $ac1                            \n\t"
+          "mthi             $zero,        $ac2                            \n\t"
+          "mthi             $zero,        $ac3                            \n\t"
+
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+
+          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
+
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+
+          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
+          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
+
+          "extp             %[Temp1],     $ac0,           31              \n\t"
+          "extp             %[Temp2],     $ac1,           31              \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "extp             %[Temp1],     $ac2,           31              \n\t"
+
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "extp             %[Temp2],     $ac3,           31              \n\t"
+
+          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+
+          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
+
+          : [load1] "=&r" (load1), [load2] "=&r" (load2),
+            [p1] "=&r" (p1), [p2] "=&r" (p2),
+            [scratch1] "=&r" (scratch1),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+            [store1] "=&r" (store1), [store2] "=&r" (store2),
+            [src_ptr] "+r" (src_ptr)
+          : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+            [src_stride] "r" (src_stride),
+            [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+      );
+    }
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
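The vertical kernels mirror the horizontal ones, pairing each pixel with its neighbor one row below instead of one column to the right. A scalar sketch (illustrative only, clip_pixel again standing in for the crop-table clamp):

    /* Sketch: scalar equivalent of the bilinear vertical pass. */
    static void bi_vert_ref(const uint8_t *src, int src_stride,
                            uint8_t *dst, int dst_stride,
                            const int16_t *filter_y, int w, int h) {
      int x, y;
      for (y = 0; y < h; ++y) {
        for (x = 0; x < w; ++x) {
          int sum = 64 + src[x] * filter_y[3] +
                    src[x + src_stride] * filter_y[4];
          dst[x] = clip_pixel(sum >> 7);
        }
        src += src_stride;
        dst += dst_stride;
      }
    }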
+static void convolve_bi_vert_64_dspr2(const uint8_t *src,
+                                      int32_t src_stride,
+                                      uint8_t *dst,
+                                      int32_t dst_stride,
+                                      const int16_t *filter_y,
+                                      int32_t h) {
+  int32_t       x, y;
+  const uint8_t *src_ptr;
+  uint8_t       *dst_ptr;
+  uint8_t       *cm = vp9_ff_cropTbl;
+  uint32_t      vector4a = 64;
+  uint32_t      load1, load2;
+  uint32_t      p1, p2;
+  uint32_t      scratch1;
+  uint32_t      store1, store2;
+  int32_t       Temp1, Temp2;
+  const int16_t *filter = &filter_y[3];
+  uint32_t      filter45;
+
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_store(dst + dst_stride);
+
+    for (x = 0; x < 64; x += 4) {
+      src_ptr = src + x;
+      dst_ptr = dst + x;
+
+      __asm__ __volatile__ (
+          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
+          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
+          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
+
+          "mtlo             %[vector4a],  $ac0                            \n\t"
+          "mtlo             %[vector4a],  $ac1                            \n\t"
+          "mtlo             %[vector4a],  $ac2                            \n\t"
+          "mtlo             %[vector4a],  $ac3                            \n\t"
+          "mthi             $zero,        $ac0                            \n\t"
+          "mthi             $zero,        $ac1                            \n\t"
+          "mthi             $zero,        $ac2                            \n\t"
+          "mthi             $zero,        $ac3                            \n\t"
+
+          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
+
+          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
+          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
+
+          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
+          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
+
+          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
+          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
+
+          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
+          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
+
+          "extp             %[Temp1],     $ac0,           31              \n\t"
+          "extp             %[Temp2],     $ac1,           31              \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "extp             %[Temp1],     $ac2,           31              \n\t"
+
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+          "extp             %[Temp2],     $ac3,           31              \n\t"
+
+          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
+
+          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
+          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
+
+          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
+          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
+
+          : [load1] "=&r" (load1), [load2] "=&r" (load2),
+            [p1] "=&r" (p1), [p2] "=&r" (p2),
+            [scratch1] "=&r" (scratch1),
+            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+            [store1] "=&r" (store1), [store2] "=&r" (store2),
+            [src_ptr] "+r" (src_ptr)
+          : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+            [src_stride] "r" (src_stride),
+            [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+      );
+    }
+
+    /* Next row... */
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h) {
+  if (16 == y_step_q4) {
+    uint32_t pos = 38;
+
+    /* bit position for extract from acc */
+    __asm__ __volatile__ (
+      "wrdsp      %[pos],     1           \n\t"
+      :
+      : [pos] "r" (pos)
+    );
+
+    vp9_prefetch_store(dst);
+
+    switch (w) {
+      case 4:
+      case 8:
+      case 16:
+      case 32:
+        convolve_bi_vert_4_dspr2(src, src_stride,
+                                 dst, dst_stride,
+                                 filter_y, w, h);
+        break;
+      case 64:
+        vp9_prefetch_store(dst + 32);
+        convolve_bi_vert_64_dspr2(src, src_stride,
+                                  dst, dst_stride,
+                                  filter_y, h);
+        break;
+      default:
+        vp9_convolve8_vert_c(src, src_stride,
+                             dst, dst_stride,
+                             filter_x, x_step_q4,
+                             filter_y, y_step_q4,
+                             w, h);
+        break;
+    }
+  } else {
+    vp9_convolve8_vert_c(src, src_stride,
+                         dst, dst_stride,
+                         filter_x, x_step_q4,
+                         filter_y, y_step_q4,
+                         w, h);
+  }
+}
+#endif
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
index 0930ad1..da7f0fd 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
@@ -355,6 +355,12 @@
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, h);
+  } else if (((const int32_t *)filter_y)[0] == 0) {
+    vp9_convolve2_avg_vert_dspr2(src, src_stride,
+                                 dst, dst_stride,
+                                 filter_x, x_step_q4,
+                                 filter_y, y_step_q4,
+                                 w, h);
   } else {
     if (16 == y_step_q4) {
       uint32_t pos = 38;
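The dispatch test added here (and repeated in the hunks below) reads the first two 16-bit taps as one 32-bit word; the word is zero exactly when both taps are zero, which these patches use as a cheap marker that the 8-tap filter degenerates to the 2-tap bilinear case served by the new vp9_convolve2_* kernels:

    /* ((const int32_t *)filter)[0] == 0
         <=>  filter[0] == 0 && filter[1] == 0   (holds on any endianness,
              since a zero word has all-zero halfwords) */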
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
index 37c665b..69da1cf 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
@@ -965,6 +965,12 @@
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, h);
+  } else if (((const int32_t *)filter_x)[0] == 0) {
+    vp9_convolve2_avg_horiz_dspr2(src, src_stride,
+                                  dst, dst_stride,
+                                  filter_x, x_step_q4,
+                                  filter_y, y_step_q4,
+                                  w, h);
   } else {
     if (16 == x_step_q4) {
       uint32_t pos = 38;
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
index 2c48bd0..126e05a 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
@@ -930,6 +930,21 @@
   }
 }
 
+void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           int w, int h) {
+  int x, y;
+
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      dst[x * dst_stride] = src[x];
+    }
+
+    src += src_stride;
+    dst += 1;
+  }
+}
+
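A hypothetical standalone use of the new helper (buffer names are illustrative): it copies a w x h block while transposing it, so source row y, column x lands at destination column y, row x:

    /* Illustrative use: copy a 4x4 block transposed. */
    uint8_t in[4 * 4], out[4 * 4];
    int i;
    for (i = 0; i < 4 * 4; ++i) in[i] = (uint8_t)i;
    copy_horiz_transposed(in, 4,      /* src, src_stride */
                          out, 4,     /* dst, dst_stride */
                          4, 4);      /* w, h */
    /* now out[x * 4 + y] == in[y * 4 + x] for all 0 <= x, y < 4 */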
 void vp9_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const int16_t *filter_x, int x_step_q4,
@@ -966,20 +981,14 @@
 
   /* copy the src to dst */
   if (filter_x[3] == 0x80) {
-    int32_t y;
-    int32_t c;
-    const uint8_t *src_ptr = src - src_stride * 3;
-    uint8_t *dst_ptr = temp;
-
-    for (y = intermediate_height; y--;) {
-      for (c = 0; c < w; c++) {
-        dst_ptr[c * intermediate_height] = src_ptr[c];
-      }
-
-      /* next row... */
-      src_ptr += src_stride;
-      dst_ptr += 1;
-    }
+    copy_horiz_transposed(src - src_stride * 3, src_stride,
+                          temp, intermediate_height,
+                          w, intermediate_height);
+  } else if (((const int32_t *)filter_x)[0] == 0) {
+    vp9_convolve2_dspr2(src - src_stride * 3, src_stride,
+                        temp, intermediate_height,
+                        filter_x,
+                        w, intermediate_height);
   } else {
     src -= (src_stride * 3 + 3);
 
@@ -1021,20 +1030,14 @@
 
   /* copy the src to dst */
   if (filter_y[3] == 0x80) {
-    int32_t y;
-    int32_t c;
-    uint8_t *src_ptr = temp + 3;
-    uint8_t *dst_ptr = dst;
-
-    for (y = w; y--;) {
-      for (c = 0; c < h; c++) {
-        dst_ptr[c * dst_stride] = src_ptr[c];
-      }
-
-      /* next row... */
-      src_ptr += intermediate_height;
-      dst_ptr += 1;
-    }
+    copy_horiz_transposed(temp + 3, intermediate_height,
+                          dst, dst_stride,
+                          h, w);
+  } else if (((const int32_t *)filter_y)[0] == 0) {
+    vp9_convolve2_dspr2(temp + 3, intermediate_height,
+                        dst, dst_stride,
+                        filter_y,
+                        h, w);
   } else {
     switch (h) {
       case 4:
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
index 743d641..0303896 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
@@ -849,6 +849,12 @@
                       filter_x, x_step_q4,
                       filter_y, y_step_q4,
                       w, h);
+  } else if (((const int32_t *)filter_x)[0] == 0) {
+    vp9_convolve2_horiz_dspr2(src, src_stride,
+                              dst, dst_stride,
+                              filter_x, x_step_q4,
+                              filter_y, y_step_q4,
+                              w, h);
   } else {
     if (16 == x_step_q4) {
       uint32_t pos = 38;
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
index bdc7930..0930bb3 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
@@ -341,6 +341,12 @@
                       filter_x, x_step_q4,
                       filter_y, y_step_q4,
                       w, h);
+  } else if (((const int32_t *)filter_y)[0] == 0) {
+    vp9_convolve2_vert_dspr2(src, src_stride,
+                             dst, dst_stride,
+                             filter_x, x_step_q4,
+                             filter_y, y_step_q4,
+                             w, h);
   } else {
     if (16 == y_step_q4) {
       uint32_t pos = 38;
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index f0c653f..0f50f37 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -179,6 +179,7 @@
 }
 
 void vp9_initialize_common() {
+  vp9_init_neighbors();
   vp9_coef_tree_initialize();
   vp9_entropy_mode_init();
   vp9_entropy_mv_init();
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index f116c06..0538b37 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -221,9 +221,7 @@
 
   int lossless;
   /* Inverse transform function pointers. */
-  void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride);
-  void (*inv_txm4x4_add)(int16_t *input, uint8_t *dest, int stride);
-  void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob);
+  void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
 
   struct subpix_fn_table  subpix;
 
@@ -578,7 +576,7 @@
   }
 }
 
-static int get_tx_eob(struct segmentation *seg, int segment_id,
+static int get_tx_eob(const struct segmentation *seg, int segment_id,
                       TX_SIZE tx_size) {
   const int eob_max = 16 << (tx_size << 1);
   return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
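For reference, eob_max = 16 << (tx_size << 1) evaluates to 16, 64, 256 and 1024 for tx_size 0..3 (TX_4X4 through TX_32X32) — the full coefficient count of each transform size, returned unless the segment is flagged SEG_LVL_SKIP.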
diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c
index 1705402..a2d864c 100644
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@@ -7,13 +7,13 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "vp9/common/vp9_convolve.h"
 
 #include <assert.h>
 
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_convolve.h"
 #include "vp9/common/vp9_filter.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 72ea72e..8ebe0e5 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -52,222 +52,7 @@
   0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
 };
 
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]) = {
-  0,  4,  1,  5,
-  8,  2, 12,  9,
-  3,  6, 13, 10,
-  7, 14, 11, 15,
-};
 
-DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]) = {
-  0,  4,  8,  1,
-  12,  5,  9,  2,
-  13,  6, 10,  3,
-  7, 14, 11, 15,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = {
-  0,  1,  4,  2,
-  5,  3,  6,  8,
-  9,  7, 12, 10,
-  13, 11, 14, 15,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = {
-  0,  8,  1, 16,  9,  2, 17, 24,
-  10,  3, 18, 25, 32, 11,  4, 26,
-  33, 19, 40, 12, 34, 27,  5, 41,
-  20, 48, 13, 35, 42, 28, 21,  6,
-  49, 56, 36, 43, 29,  7, 14, 50,
-  57, 44, 22, 37, 15, 51, 58, 30,
-  45, 23, 52, 59, 38, 31, 60, 53,
-  46, 39, 61, 54, 47, 62, 55, 63,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = {
-  0,  8, 16,  1, 24,  9, 32, 17,
-  2, 40, 25, 10, 33, 18, 48,  3,
-  26, 41, 11, 56, 19, 34,  4, 49,
-  27, 42, 12, 35, 20, 57, 50, 28,
-  5, 43, 13, 36, 58, 51, 21, 44,
-  6, 29, 59, 37, 14, 52, 22,  7,
-  45, 60, 30, 15, 38, 53, 23, 46,
-  31, 61, 39, 54, 47, 62, 55, 63,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = {
-  0,  1,  2,  8,  9,  3, 16, 10,
-  4, 17, 11, 24,  5, 18, 25, 12,
-  19, 26, 32,  6, 13, 20, 33, 27,
-  7, 34, 40, 21, 28, 41, 14, 35,
-  48, 42, 29, 36, 49, 22, 43, 15,
-  56, 37, 50, 44, 30, 57, 23, 51,
-  58, 45, 38, 52, 31, 59, 53, 46,
-  60, 39, 61, 47, 54, 55, 62, 63,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = {
-  0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80,
-  50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52,
-  98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69,
-  100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, 101, 131, 160, 146,
-  55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, 176, 162, 87, 56, 25,
-  133, 118, 177, 148, 72, 103, 41, 163, 10, 192, 178, 88, 57, 134, 149, 119,
-  26, 164, 73, 104, 193, 42, 179, 208, 11, 135, 89, 165, 120, 150, 58, 194,
-  180, 27, 74, 209, 105, 151, 136, 43, 90, 224, 166, 195, 181, 121, 210, 59,
-  12, 152, 106, 167, 196, 75, 137, 225, 211, 240, 182, 122, 91, 28, 197, 13,
-  226, 168, 183, 153, 44, 212, 138, 107, 241, 60, 29, 123, 198, 184, 227, 169,
-  242, 76, 213, 154, 45, 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108,
-  77, 155, 30, 15, 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140,
-  230, 62, 216, 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141,
-  63, 232, 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142,
-  219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159,
-  251,
-  190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239,
-  255,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = {
-  0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81,
-  34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4,
-  67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21,
-  146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, 116, 193, 147, 85,
-  22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, 7, 148, 194, 86, 179,
-  225, 23, 133, 39, 164, 8, 102, 210, 241, 55, 195, 118, 149, 71, 180, 24,
-  87, 226, 134, 165, 211, 40, 103, 56, 72, 150, 196, 242, 119, 9, 181, 227,
-  88, 166, 25, 135, 41, 104, 212, 57, 151, 197, 120, 73, 243, 182, 136, 167,
-  213, 89, 10, 228, 105, 152, 198, 26, 42, 121, 183, 244, 168, 58, 137, 229,
-  74, 214, 90, 153, 199, 184, 11, 106, 245, 27, 122, 230, 169, 43, 215, 59,
-  200, 138, 185, 246, 75, 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170,
-  60, 247, 232, 76, 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202,
-  233, 171, 61, 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125,
-  62, 172, 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79,
-  126, 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205,
-  236,
-  159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239,
-  255,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = {
-  0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20,
-  49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52,
-  23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69,
-  25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, 41, 56, 114, 100,
-  13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, 116, 14, 87, 130, 102,
-  144, 73, 131, 117, 28, 58, 15, 88, 43, 145, 103, 132, 146, 118, 74, 160,
-  89, 133, 104, 29, 59, 147, 119, 44, 161, 148, 90, 105, 134, 162, 120, 176,
-  75, 135, 149, 30, 60, 163, 177, 45, 121, 91, 106, 164, 178, 150, 192, 136,
-  165, 179, 31, 151, 193, 76, 122, 61, 137, 194, 107, 152, 180, 208, 46, 166,
-  167, 195, 92, 181, 138, 209, 123, 153, 224, 196, 77, 168, 210, 182, 240, 108,
-  197, 62, 154, 225, 183, 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170,
-  124, 155, 199, 78, 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186,
-  156, 229, 243, 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110,
-  157, 245, 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111,
-  158,
-  188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220,
-  175,
-  190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254,
-  255,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = {
-  0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160,
-  129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193,
-  68, 131, 37, 100,
-  225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38,
-  258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321,
-  102, 352, 8, 197,
-  71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292,
-  135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293,
-  41, 417, 199, 136,
-  262, 387, 448, 325, 356, 10, 73, 418, 231, 168, 449, 294, 388, 105,
-  419, 263, 42, 200, 357, 450, 137, 480, 74, 326, 232, 11, 389, 169,
-  295, 420, 106, 451,
-  481, 358, 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421,
-  75, 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391,
-  453, 139, 44, 234,
-  484, 297, 360, 171, 76, 515, 545, 266, 329, 454, 13, 423, 203, 108,
-  546, 485, 576, 298, 235, 140, 361, 330, 172, 547, 45, 455, 267, 577,
-  486, 77, 204, 362,
-  608, 14, 299, 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173,
-  610, 363, 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17,
-  111, 238, 48, 143,
-  80, 175, 112, 207, 49, 18, 239, 81, 113, 19, 50, 82, 114, 51,
-  83, 115, 640, 516, 392, 268, 144, 20, 672, 641, 548, 517, 424,
-  393, 300, 269, 176, 145,
-  52, 21, 704, 673, 642, 580, 549, 518, 456, 425, 394, 332, 301,
-  270, 208, 177, 146, 84, 53, 22, 736, 705, 674, 643, 612, 581,
-  550, 519, 488, 457, 426, 395,
-  364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 737,
-  706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, 241,
-  210, 179, 117, 86, 55, 738, 707,
-  614, 583, 490, 459, 366, 335, 242, 211, 118, 87, 739, 615, 491,
-  367, 243, 119, 768, 644, 520, 396, 272, 148, 24, 800, 769, 676,
-  645, 552, 521, 428, 397, 304,
-  273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, 553,
-  522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, 26,
-  864, 833, 802, 771, 740, 709,
-  678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306,
-  275, 244, 213, 182, 151, 120, 89, 58, 27, 865, 834, 803, 741,
-  710, 679, 617, 586, 555, 493,
-  462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835,
-  742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867,
-  743, 619, 495, 371, 247, 123,
-  896, 772, 648, 524, 400, 276, 152, 28, 928, 897, 804, 773, 680,
-  649, 556, 525, 432, 401, 308, 277, 184, 153, 60, 29, 960, 929,
-  898, 836, 805, 774, 712, 681,
-  650, 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154,
-  92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682,
-  651, 620, 589, 558, 527,
-  496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124,
-  93, 62, 31, 993, 962, 931, 869, 838, 807, 745, 714, 683, 621, 590,
-  559, 497, 466, 435, 373,
-  342, 311, 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715,
-  622, 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623,
-  499, 375, 251, 127,
-  900, 776, 652, 528, 404, 280, 156, 932, 901, 808, 777, 684, 653, 560,
-  529, 436, 405, 312, 281, 188, 157, 964, 933, 902, 840, 809, 778, 716,
-  685, 654, 592, 561,
-  530, 468, 437, 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903,
-  872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469,
-  438, 407, 376, 345,
-  314, 283, 252, 221, 190, 159, 997, 966, 935, 873, 842, 811, 749, 718,
-  687, 625, 594, 563, 501, 470, 439, 377, 346, 315, 253, 222, 191, 998,
-  967, 874, 843, 750,
-  719, 626, 595, 502, 471, 378, 347, 254, 223, 999, 875, 751, 627, 503,
-  379, 255, 904, 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657,
-  564, 533, 440, 409,
-  316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, 596, 565, 534,
-  472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, 876, 845, 814, 783,
-  752, 721, 690, 659,
-  628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970,
-  939, 877, 846, 815, 753, 722, 691, 629, 598, 567, 505, 474, 443, 381,
-  350, 319, 1002, 971,
-  878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, 755, 631,
-  507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, 692, 661, 568,
-  537, 444, 413, 972,
-  941, 910, 848, 817, 786, 724, 693, 662, 600, 569, 538, 476, 445, 414,
-  1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601,
-  570, 539, 508, 477,
-  446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, 571,
-  509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, 479,
-  1007, 883, 759, 635, 511,
-  912, 788, 664, 540, 944, 913, 820, 789, 696, 665, 572, 541, 976, 945,
-  914, 852, 821, 790, 728, 697, 666, 604, 573, 542, 1008, 977, 946, 915,
-  884, 853, 822, 791,
-  760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823,
-  761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, 607,
-  1011, 887, 763, 639,
-  916, 792, 668, 948, 917, 824, 793, 700, 669, 980, 949, 918, 856, 825,
-  794, 732, 701, 670, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733,
-  702, 671, 1013, 982,
-  951, 889, 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015,
-  891, 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798,
-  1016, 985, 954, 923,
-  892, 861, 830, 799, 1017, 986, 955, 893, 862, 831, 1018, 987, 894, 863,
-  1019, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021,
-  990, 959, 1022, 991, 1023,
-};
 
 /* Array indices are identical to previously-existing CONTEXT_NODE indices */
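(The scan-order tables deleted in this hunk are not dropped from the library: judging from the #include "vp9/common/vp9_scan.h" added to vp9_entropy.h later in this patch, they move to a dedicated vp9_scan module.) Reading such a table: entry n is the raster-order index of the n-th coefficient visited, so in vp9_default_scan_4x4 = {0, 4, 1, 5, 8, ...} the second coefficient visited (n = 1) sits at raster position 4, i.e. row 1, column 0 of the 4x4 block.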
 
@@ -513,134 +298,7 @@
   vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
 }
 
-// Neighborhood 5-tuples for various scans and blocksizes,
-// in {top, left, topleft, topright, bottomleft} order
-// for each position in raster scan order.
-// -1 indicates the neighbor does not exist.
-DECLARE_ALIGNED(16, int16_t,
-                vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
-
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
-
-static int find_in_scan(const int16_t *scan, int l, int idx) {
-  int n, l2 = l * l;
-  for (n = 0; n < l2; n++) {
-    int rc = scan[n];
-    if (rc == idx)
-      return  n;
-  }
-  assert(0);
-  return -1;
-}
-static void init_scan_neighbors(const int16_t *scan,
-                                int16_t *iscan,
-                                int l, int16_t *neighbors) {
-  int l2 = l * l;
-  int n, i, j;
-
-  // dc doesn't use this type of prediction
-  neighbors[MAX_NEIGHBORS * 0 + 0] = 0;
-  neighbors[MAX_NEIGHBORS * 0 + 1] = 0;
-  iscan[0] = find_in_scan(scan, l, 0);
-  for (n = 1; n < l2; n++) {
-    int rc = scan[n];
-    iscan[n] = find_in_scan(scan, l, n);
-    i = rc / l;
-    j = rc % l;
-    if (i > 0 && j > 0) {
-      // col/row scan is used for adst/dct, and generally means that
-      // energy decreases to zero much faster in the dimension in
-      // which ADST is used compared to the direction in which DCT
-      // is used. Likewise, we find much higher correlation between
-      // coefficients within the direction in which DCT is used.
-      // Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff
-      // as a context. If ADST or DCT is used in both directions, we
-      // use the combination of the two as a context.
-      int a = (i - 1) * l + j;
-      int b =  i      * l + j - 1;
-      if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 ||
-          scan == vp9_col_scan_16x16) {
-        // in the col/row scan cases (as well as left/top edge cases), we set
-        // both contexts to the same value, so we can branchlessly do a+b+1>>1
-        // which automatically becomes a if a == b
-        neighbors[MAX_NEIGHBORS * n + 0] =
-        neighbors[MAX_NEIGHBORS * n + 1] = a;
-      } else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 ||
-                 scan == vp9_row_scan_16x16) {
-        neighbors[MAX_NEIGHBORS * n + 0] =
-        neighbors[MAX_NEIGHBORS * n + 1] = b;
-      } else {
-        neighbors[MAX_NEIGHBORS * n + 0] = a;
-        neighbors[MAX_NEIGHBORS * n + 1] = b;
-      }
-    } else if (i > 0) {
-      neighbors[MAX_NEIGHBORS * n + 0] =
-      neighbors[MAX_NEIGHBORS * n + 1] = (i - 1) * l + j;
-    } else {
-      assert(j > 0);
-      neighbors[MAX_NEIGHBORS * n + 0] =
-      neighbors[MAX_NEIGHBORS * n + 1] =  i      * l + j - 1;
-    }
-    assert(iscan[neighbors[MAX_NEIGHBORS * n + 0]] < n);
-  }
-  // one padding item so we don't have to add branches in code to handle
-  // calls to get_coef_context() for the token after the final dc token
-  neighbors[MAX_NEIGHBORS * l2 + 0] = 0;
-  neighbors[MAX_NEIGHBORS * l2 + 1] = 0;
-}
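A worked example of the neighbor derivation above, using vp9_default_scan_4x4: at n = 5 the scan yields rc = 2 (i = 0, j = 2), so only the left neighbor exists and both context slots get i * l + j - 1 = 1; for an interior coefficient such as rc = 5 (i = 1, j = 1) under the default scan, both the above neighbor a = (i - 1) * l + j = 1 and the left neighbor b = i * l + j - 1 = 4 are recorded.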
-
-void vp9_init_neighbors() {
-  init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4,
-                      vp9_default_scan_4x4_neighbors);
-  init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4,
-                      vp9_row_scan_4x4_neighbors);
-  init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4,
-                      vp9_col_scan_4x4_neighbors);
-  init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8,
-                      vp9_default_scan_8x8_neighbors);
-  init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8,
-                      vp9_row_scan_8x8_neighbors);
-  init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8,
-                      vp9_col_scan_8x8_neighbors);
-  init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16,
-                      vp9_default_scan_16x16_neighbors);
-  init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16,
-                      vp9_row_scan_16x16_neighbors);
-  init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16,
-                      vp9_col_scan_16x16_neighbors);
-  init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32,
-                      vp9_default_scan_32x32_neighbors);
-}
-
 void vp9_coef_tree_initialize() {
-  vp9_init_neighbors();
   init_bit_trees();
   vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
 }
@@ -657,10 +315,10 @@
 static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
                              unsigned int count_sat,
                              unsigned int update_factor) {
-  FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+  const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
 
   vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[tx_size];
-  vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[tx_size];
+  const vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[tx_size];
   vp9_coeff_count_model *coef_counts = cm->counts.coef[tx_size];
   unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
       cm->counts.eob_branch[tx_size];
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index ef9ea46..02178b5 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -12,9 +12,13 @@
 #define VP9_COMMON_VP9_ENTROPY_H_
 
 #include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_treecoder.h"
+
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/common/vp9_treecoder.h"
+
+#define DIFF_UPDATE_PROB 252
 
 /* Coefficient token alphabet */
 
@@ -36,6 +40,9 @@
 
 #define INTER_MODE_CONTEXTS     7
 
+extern DECLARE_ALIGNED(16, const uint8_t,
+                       vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
+
 extern const vp9_tree_index vp9_coef_tree[];
 
 #define DCT_EOB_MODEL_TOKEN     3      /* EOB       Extra Bits 0+0 */
@@ -44,7 +51,7 @@
 extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
 
 typedef struct {
-  vp9_tree_p tree;
+  vp9_tree_index *tree;
   const vp9_prob *prob;
   int len;
   int base_val;
@@ -96,64 +103,8 @@
 
 struct VP9Common;
 void vp9_default_coef_probs(struct VP9Common *cm);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);
 
-extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
-
-#define MAX_NEIGHBORS 2
-
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
-
-void vp9_coef_tree_initialize(void);
+void vp9_coef_tree_initialize();
 void vp9_adapt_coef_probs(struct VP9Common *cm);
 
 static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
@@ -183,13 +134,6 @@
     ? (COEF_BANDS-1) : band_translate[coef_index];
 }
 
-static INLINE int get_coef_context(const int16_t *neighbors,
-                                   uint8_t *token_cache,
-                                   int c) {
-  return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
-          token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
-}
-
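The removed helper averaged the token energies of the two recorded neighbors: ctx = (1 + token_cache[nb[2c]] + token_cache[nb[2c + 1]]) >> 1, so cached energies 2 and 3 give (1 + 2 + 3) >> 1 = 3, and the row/col/edge cases, where both slots hold the same index, collapse branchlessly to that single neighbor's energy — matching the comment in the deleted init_scan_neighbors.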
 // 128 lists of probabilities are stored for the following ONE node probs:
 // 1, 3, 5, 7, ..., 253, 255
 // In between probabilities are interpolated linearly
@@ -210,126 +154,6 @@
 
 void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
 
-static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) {
-  switch (tx_type) {
-    case ADST_DCT:
-      return vp9_row_scan_4x4;
-    case DCT_ADST:
-      return vp9_col_scan_4x4;
-    default:
-      return vp9_default_scan_4x4;
-  }
-}
-
-static INLINE void get_scan_nb_4x4(TX_TYPE tx_type,
-                                   const int16_t **scan, const int16_t **nb) {
-  switch (tx_type) {
-    case ADST_DCT:
-      *scan = vp9_row_scan_4x4;
-      *nb = vp9_row_scan_4x4_neighbors;
-      break;
-    case DCT_ADST:
-      *scan = vp9_col_scan_4x4;
-      *nb = vp9_col_scan_4x4_neighbors;
-      break;
-    default:
-      *scan = vp9_default_scan_4x4;
-      *nb = vp9_default_scan_4x4_neighbors;
-      break;
-  }
-}
-
-static INLINE const int16_t* get_iscan_4x4(TX_TYPE tx_type) {
-  switch (tx_type) {
-    case ADST_DCT:
-      return vp9_row_iscan_4x4;
-    case DCT_ADST:
-      return vp9_col_iscan_4x4;
-    default:
-      return vp9_default_iscan_4x4;
-  }
-}
-
-static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) {
-  switch (tx_type) {
-    case ADST_DCT:
-      return vp9_row_scan_8x8;
-    case DCT_ADST:
-      return vp9_col_scan_8x8;
-    default:
-      return vp9_default_scan_8x8;
-  }
-}
-
-static INLINE void get_scan_nb_8x8(TX_TYPE tx_type,
-                                   const int16_t **scan, const int16_t **nb) {
-  switch (tx_type) {
-    case ADST_DCT:
-      *scan = vp9_row_scan_8x8;
-      *nb = vp9_row_scan_8x8_neighbors;
-      break;
-    case DCT_ADST:
-      *scan = vp9_col_scan_8x8;
-      *nb = vp9_col_scan_8x8_neighbors;
-      break;
-    default:
-      *scan = vp9_default_scan_8x8;
-      *nb = vp9_default_scan_8x8_neighbors;
-      break;
-  }
-}
-
-static INLINE const int16_t* get_iscan_8x8(TX_TYPE tx_type) {
-  switch (tx_type) {
-    case ADST_DCT:
-      return vp9_row_iscan_8x8;
-    case DCT_ADST:
-      return vp9_col_iscan_8x8;
-    default:
-      return vp9_default_iscan_8x8;
-  }
-}
-
-static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) {
-  switch (tx_type) {
-    case ADST_DCT:
-      return vp9_row_scan_16x16;
-    case DCT_ADST:
-      return vp9_col_scan_16x16;
-    default:
-      return vp9_default_scan_16x16;
-  }
-}
-
-static INLINE void get_scan_nb_16x16(TX_TYPE tx_type,
-                                     const int16_t **scan, const int16_t **nb) {
-  switch (tx_type) {
-    case ADST_DCT:
-      *scan = vp9_row_scan_16x16;
-      *nb = vp9_row_scan_16x16_neighbors;
-      break;
-    case DCT_ADST:
-      *scan = vp9_col_scan_16x16;
-      *nb = vp9_col_scan_16x16_neighbors;
-      break;
-    default:
-      *scan = vp9_default_scan_16x16;
-      *nb = vp9_default_scan_16x16_neighbors;
-      break;
-  }
-}
-
-static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) {
-  switch (tx_type) {
-    case ADST_DCT:
-      return vp9_row_iscan_16x16;
-    case DCT_ADST:
-      return vp9_col_iscan_16x16;
-    default:
-      return vp9_default_iscan_16x16;
-  }
-}
-
 static int get_entropy_context(TX_SIZE tx_size,
                                ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
   ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
@@ -386,7 +210,4 @@
   }
 }
 
-
-enum { VP9_COEF_UPDATE_PROB = 252 };
-
 #endif  // VP9_COMMON_VP9_ENTROPY_H_
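
The neighbor-context helper removed from this header (get_coef_context, above) pairs with the scan and neighbor tables that move to vp9_scan.h below: the context for the c-th coefficient is the rounded mean of the token values already cached at its two scanned neighbors. A minimal sketch of the consumption pattern for a 4x4 block, where decode_token() is a hypothetical stand-in for the real token decoder:

    static void decode_block_sketch(TX_TYPE tx_type, int eob) {
      const int16_t *scan, *nb;
      uint8_t token_cache[16] = { 0 };
      int c;
      get_scan_nb_4x4(tx_type, &scan, &nb);
      for (c = 0; c < eob; ++c) {
        // Rounded average of the two previously coded neighbors.
        const int ctx = (1 + token_cache[nb[MAX_NEIGHBORS * c + 0]] +
                         token_cache[nb[MAX_NEIGHBORS * c + 1]]) >> 1;
        token_cache[scan[c]] = decode_token(ctx);  // hypothetical decoder call
      }
    }
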
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index e176796..56e6444 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -226,7 +226,7 @@
 };
 
 /* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
-const vp9_tree_index vp9_intra_mode_tree[INTRA_MODES * 2 - 2] = {
+const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
   -DC_PRED, 2,                      /* 0 = DC_NODE */
   -TM_PRED, 4,                      /* 1 = TM_NODE */
   -V_PRED, 6,                       /* 2 = V_NODE */
@@ -237,22 +237,20 @@
   -D63_PRED, 16,                    /* 7 = D63_NODE */
   -D153_PRED, -D207_PRED             /* 8 = D153_NODE */
 };
+struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
 
-const vp9_tree_index vp9_inter_mode_tree[6] = {
+const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
   -ZEROMV, 2,
   -NEARESTMV, 4,
   -NEARMV, -NEWMV
 };
+struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
 
-const vp9_tree_index vp9_partition_tree[6] = {
+const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
   -PARTITION_NONE, 2,
   -PARTITION_HORZ, 4,
   -PARTITION_VERT, -PARTITION_SPLIT
 };
-
-struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
-struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
-
 struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
 
 static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
@@ -338,7 +336,8 @@
   vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs);
 }
 
-const vp9_tree_index vp9_switchable_interp_tree[SWITCHABLE_FILTERS*2-2] = {
+const vp9_tree_index vp9_switchable_interp_tree
+                         [TREE_SIZE(SWITCHABLE_FILTERS)] = {
   -EIGHTTAP, 2,
   -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
 };
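
TREE_SIZE(n), as used above, presumably expands to 2 * (n) - 2: a full binary tree over n symbols has n - 1 internal nodes, each stored as a left/right pair of vp9_tree_index entries. A negative entry is a leaf holding the negated symbol; a non-negative entry is the offset of the next pair. A sketch of a reader over such a table, with read_bit() as an assumed boolean decoder:

    static int read_tree_sketch(const vp9_tree_index *tree,
                                const vp9_prob *probs) {
      vp9_tree_index i = 0;
      // probs[i >> 1] is the probability of the 0 (left) branch at pair i.
      while ((i = tree[i + read_bit(probs[i >> 1])]) > 0)
        continue;
      return -i;  // leaves store the negated symbol value
    }
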
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index ccade27..ab37b75 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -15,7 +15,6 @@
 #include "vp9/common/vp9_treecoder.h"
 
 #define TX_SIZE_CONTEXTS 2
-#define MODE_UPDATE_PROB  252
 #define SWITCHABLE_FILTERS 3   // number of switchable filters
 
 // #define MODE_STATS
@@ -38,19 +37,17 @@
 extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
                                         [INTRA_MODES - 1];
 
-extern const vp9_tree_index vp9_intra_mode_tree[];
-extern const vp9_tree_index vp9_inter_mode_tree[];
-
+extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
 extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
+
+extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
 extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
 
-// probability models for partition information
-extern const vp9_tree_index vp9_partition_tree[];
+extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
 extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
 
 extern const vp9_tree_index vp9_switchable_interp_tree
-                 [2 * (SWITCHABLE_FILTERS - 1)];
-
+                                [TREE_SIZE(SWITCHABLE_FILTERS)];
 extern struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];
 
 void vp9_entropy_mode_init();
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index a9e25b7..e851181 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -18,14 +18,14 @@
 /* Integer pel reference mv threshold for use of high-precision 1/8 mv */
 #define COMPANDED_MVREF_THRESH 8
 
-const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = {
+const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = {
   -MV_JOINT_ZERO, 2,
   -MV_JOINT_HNZVZ, 4,
   -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
 };
 struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
 
-const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
+const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
   -MV_CLASS_0, 2,
   -MV_CLASS_1, 4,
   6, 8,
@@ -39,12 +39,12 @@
 };
 struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
 
-const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2] = {
+const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
   -0, -1,
 };
 struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
 
-const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2] = {
+const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)] = {
   -0, 2,
   -1, 4,
   -2, -3
@@ -214,11 +214,11 @@
 void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
   int i, j;
 
-  FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+  const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
 
   nmv_context *ctx = &cm->fc.nmvc;
-  nmv_context *pre_ctx = &pre_fc->nmvc;
-  nmv_context_counts *cts = &cm->counts.mv;
+  const nmv_context *pre_ctx = &pre_fc->nmvc;
+  const nmv_context_counts *cts = &cm->counts.mv;
 
   adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints);
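
adapt_probs() blends each previous-frame probability with the probability implied by this frame's counts, which is why pre_fc, pre_ctx and cts can all become const here. The exact constants live elsewhere; the sketch below is only a rough model of that update, with clip_prob(), COUNT_SAT and MAX_UPDATE_FACTOR all assumed names:

    static vp9_prob adapt_prob_sketch(vp9_prob pre_prob,
                                      unsigned int ct0, unsigned int ct1) {
      const unsigned int den = ct0 + ct1;
      // Observed probability of the 0 branch, rounded; 128 if unseen.
      const vp9_prob obs =
          den ? clip_prob((int)((255 * (uint64_t)ct0 + den / 2) / den)) : 128;
      const unsigned int count = den < COUNT_SAT ? den : COUNT_SAT;
      const unsigned int factor = MAX_UPDATE_FACTOR * count / COUNT_SAT;
      // Weighted average: more observations pull harder toward obs.
      return (vp9_prob)((pre_prob * (256 - factor) + obs * factor + 128) >> 8);
    }
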
 
diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h
index 3b782ab..c42653d 100644
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -43,9 +43,6 @@
   return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ;
 }
 
-extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2];
-extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
-
 /* Symbols for coding magnitude class of nonzero components */
 #define MV_CLASSES     11
 typedef enum {
@@ -62,9 +59,6 @@
   MV_CLASS_10 = 10,    /* (1024,2048] integer pel */
 } MV_CLASS_TYPE;
 
-extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2];
-extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
-
 #define CLASS0_BITS    1  /* bits at integer precision for class 0 */
 #define CLASS0_SIZE    (1 << CLASS0_BITS)
 #define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2)
@@ -77,10 +71,16 @@
 #define MV_UPP   ((1 << MV_IN_USE_BITS) - 1)
 #define MV_LOW   (-(1 << MV_IN_USE_BITS))
 
-extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2];
+extern const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)];
+extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
+
+extern const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)];
+extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
+
+extern const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)];
 extern struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
 
-extern const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2];
+extern const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)];
 extern struct vp9_token vp9_mv_fp_encodings[4];
 
 typedef struct {
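
The four trees above split a motion vector into a joint (which components are nonzero) and, per component, a magnitude class, integer offset bits, a 1/4-pel fraction and an optional 1/8-pel bit. Assuming those fields are already decoded, the magnitude reassembles roughly as follows (illustrative only, in 1/8-pel units):

    static int mv_component_magnitude(int mv_class, int d, int fr, int hp) {
      // d: integer offset within the class, fr: 1/4-pel (0..3), hp: 1/8-pel.
      const int base =
          (mv_class == MV_CLASS_0) ? 0 : CLASS0_SIZE << (mv_class + 2);
      return base + ((d << 3) | (fr << 1) | hp) + 1;
    }
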
diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h
index 676b274..36d19a7 100644
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h
@@ -27,7 +27,7 @@
   SWITCHABLE = 4  /* should be the last one */
 } INTERPOLATIONFILTERTYPE;
 
-typedef const int16_t subpel_kernel[SUBPEL_TAPS];
+typedef int16_t subpel_kernel[SUBPEL_TAPS];
 
 struct subpix_fn_table {
   const subpel_kernel *filter_x;
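
Dropping const from the typedef is the real change here: const applied through an array typedef binds to the element type, so the old definition made every subpel_kernel read-only and made "const subpel_kernel *" doubly const. With the new form, constness is chosen per use:

    typedef int16_t subpel_kernel[SUBPEL_TAPS];

    const subpel_kernel *filter_x;  // pointer to read-only kernels, as before
    subpel_kernel scratch;          // a writable kernel is now expressible
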
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 99d84c9..52b039d 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -18,13 +18,13 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
    0.5 shifts per pixel. */
   int i;
   int16_t output[16];
   int a1, b1, c1, d1, e1;
-  int16_t *ip = input;
+  const int16_t *ip = input;
   int16_t *op = output;
 
   for (i = 0; i < 4; i++) {
@@ -60,21 +60,21 @@
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
-    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + b1);
-    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + c1);
-    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + d1);
+    dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
+    dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
+    dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
+    dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);
 
     ip++;
     dest++;
   }
 }
 
-void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
+void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
   int i;
   int a1, e1;
   int16_t tmp[4];
-  int16_t *ip = in;
+  const int16_t *ip = in;
   int16_t *op = tmp;
 
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -96,7 +96,7 @@
   }
 }
 
-void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
+static void idct4_1d(const int16_t *input, int16_t *output) {
   int16_t step[4];
   int temp1, temp2;
   // stage 1
@@ -116,7 +116,7 @@
   output[3] = step[0] - step[3];
 }
 
-void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int16_t out[4 * 4];
   int16_t *outptr = out;
   int i, j;
@@ -124,7 +124,7 @@
 
   // Rows
   for (i = 0; i < 4; ++i) {
-    vp9_idct4_1d(input, outptr);
+    idct4_1d(input, outptr);
     input += 4;
     outptr += 4;
   }
@@ -133,14 +133,14 @@
   for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j)
       temp_in[j] = out[j * 4 + i];
-    vp9_idct4_1d(temp_in, temp_out);
+    idct4_1d(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
-                                  + dest[j * dest_stride + i]);
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                  + dest[j * stride + i]);
   }
 }
 
-void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
   int i;
   int a1;
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
@@ -156,7 +156,7 @@
   }
 }
 
-static void idct8_1d(int16_t *input, int16_t *output) {
+static void idct8_1d(const int16_t *input, int16_t *output) {
   int16_t step1[8], step2[8];
   int temp1, temp2;
   // stage 1
@@ -174,7 +174,7 @@
   step1[6] = dct_const_round_shift(temp2);
 
   // stage 2 & stage 3 - even half
-  vp9_idct4_1d(step1, step1);
+  idct4_1d(step1, step1);
 
   // stage 2 - odd half
   step2[4] = step1[4] + step1[5];
@@ -201,7 +201,7 @@
   output[7] = step1[0] - step1[7];
 }
 
-void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int16_t out[8 * 8];
   int16_t *outptr = out;
   int i, j;
@@ -220,12 +220,12 @@
       temp_in[j] = out[j * 8 + i];
     idct8_1d(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
-      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
-                                  + dest[j * dest_stride + i]);
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                  + dest[j * stride + i]);
   }
 }
 
-void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int i, j;
   int a1;
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
@@ -234,11 +234,11 @@
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i)
       dest[i] = clip_pixel(dest[i] + a1);
-    dest += dest_stride;
+    dest += stride;
   }
 }
 
-static void iadst4_1d(int16_t *input, int16_t *output) {
+static void iadst4_1d(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
   int x0 = input[0];
@@ -280,13 +280,13 @@
   output[3] = dct_const_round_shift(s3);
 }
 
-void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride,
-                            int tx_type) {
+void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
+                         int tx_type) {
   const transform_2d IHT_4[] = {
-    { vp9_idct4_1d,  vp9_idct4_1d  },  // DCT_DCT  = 0
-    { iadst4_1d, vp9_idct4_1d  },      // ADST_DCT = 1
-    { vp9_idct4_1d,  iadst4_1d },      // DCT_ADST = 2
-    { iadst4_1d, iadst4_1d }           // ADST_ADST = 3
+    { idct4_1d,  idct4_1d  },  // DCT_DCT  = 0
+    { iadst4_1d, idct4_1d  },  // ADST_DCT = 1
+    { idct4_1d,  iadst4_1d },  // DCT_ADST = 2
+    { iadst4_1d, iadst4_1d }   // ADST_ADST = 3
   };
 
   int i, j;
@@ -307,11 +307,11 @@
       temp_in[j] = out[j * 4 + i];
     IHT_4[tx_type].cols(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
-                                  + dest[j * dest_stride + i]);
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                  + dest[j * stride + i]);
   }
 }
-static void iadst8_1d(int16_t *input, int16_t *output) {
+static void iadst8_1d(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
   int x0 = input[7];
@@ -395,8 +395,8 @@
   { iadst8_1d, iadst8_1d }   // ADST_ADST = 3
 };
 
-void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride,
-                            int tx_type) {
+void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
+                         int tx_type) {
   int i, j;
   int16_t out[8 * 8];
   int16_t *outptr = out;
@@ -416,12 +416,12 @@
       temp_in[j] = out[j * 8 + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
-      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
-                                  + dest[j * dest_stride + i]);  }
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                  + dest[j * stride + i]);
+  }
 }
 
-void vp9_short_idct8x8_10_add_c(int16_t *input, uint8_t *dest,
-                                int dest_stride) {
+void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int16_t out[8 * 8] = { 0 };
   int16_t *outptr = out;
   int i, j;
@@ -441,12 +441,12 @@
       temp_in[j] = out[j * 8 + i];
     idct8_1d(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
-      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
-                                  + dest[j * dest_stride + i]);
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                  + dest[j * stride + i]);
   }
 }
 
-static void idct16_1d(int16_t *input, int16_t *output) {
+static void idct16_1d(const int16_t *input, int16_t *output) {
   int16_t step1[16], step2[16];
   int temp1, temp2;
 
@@ -611,7 +611,7 @@
   output[15] = step2[0] - step2[15];
 }
 
-void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int16_t out[16 * 16];
   int16_t *outptr = out;
   int i, j;
@@ -630,12 +630,12 @@
       temp_in[j] = out[j * 16 + i];
     idct16_1d(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
-      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                  + dest[j * dest_stride + i]);
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * stride + i]);
   }
 }
 
-void iadst16_1d(int16_t *input, int16_t *output) {
+static void iadst16_1d(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
 
   int x0 = input[15];
@@ -813,8 +813,8 @@
   { iadst16_1d, iadst16_1d }   // ADST_ADST = 3
 };
 
-void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
-                              int tx_type) {
+void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
+                            int tx_type) {
   int i, j;
   int16_t out[16 * 16];
   int16_t *outptr = out;
@@ -834,12 +834,11 @@
       temp_in[j] = out[j * 16 + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
-      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                  + dest[j * dest_stride + i]);  }
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * stride + i]);
+  }
 }
 
-void vp9_short_idct16x16_10_add_c(int16_t *input, uint8_t *dest,
-                                  int dest_stride) {
+void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int16_t out[16 * 16] = { 0 };
   int16_t *outptr = out;
   int i, j;
@@ -859,13 +858,12 @@
       temp_in[j] = out[j*16 + i];
     idct16_1d(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
-      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                  + dest[j * dest_stride + i]);
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * stride + i]);
   }
 }
 
-void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest,
-                                 int dest_stride) {
+void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int i, j;
   int a1;
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
@@ -874,11 +872,11 @@
   for (j = 0; j < 16; ++j) {
     for (i = 0; i < 16; ++i)
       dest[i] = clip_pixel(dest[i] + a1);
-    dest += dest_stride;
+    dest += stride;
   }
 }
 
-static void idct32_1d(int16_t *input, int16_t *output) {
+static void idct32_1d(const int16_t *input, int16_t *output) {
   int16_t step1[32], step2[32];
   int temp1, temp2;
 
@@ -1245,7 +1243,7 @@
   output[31] = step1[0] - step1[31];
 }
 
-void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int16_t out[32 * 32];
   int16_t *outptr = out;
   int i, j;
@@ -1253,7 +1251,20 @@
 
   // Rows
   for (i = 0; i < 32; ++i) {
-    idct32_1d(input, outptr);
+    int16_t zero_coeff[16];
+    for (j = 0; j < 16; ++j)
+      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+    for (j = 0; j < 8; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    for (j = 0; j < 4; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    for (j = 0; j < 2; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+
+    if (zero_coeff[0] | zero_coeff[1])
+      idct32_1d(input, outptr);
+    else
+      vpx_memset(outptr, 0, sizeof(int16_t) * 32);
     input += 32;
     outptr += 32;
   }
@@ -1264,13 +1275,12 @@
       temp_in[j] = out[j * 32 + i];
     idct32_1d(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
-      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                  + dest[j * dest_stride + i]);
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * stride + i]);
   }
 }
 
-void vp9_short_idct32x32_1_add_c(int16_t *input, uint8_t *dest,
-                                 int dest_stride) {
+void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int i, j;
   int a1;
 
@@ -1281,28 +1291,27 @@
   for (j = 0; j < 32; ++j) {
     for (i = 0; i < 32; ++i)
       dest[i] = clip_pixel(dest[i] + a1);
-    dest += dest_stride;
+    dest += stride;
   }
 }
 
 // idct
-void vp9_idct_add(int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
   if (eob > 1)
-    vp9_short_idct4x4_add(input, dest, stride);
+    vp9_idct4x4_16_add(input, dest, stride);
   else
-    vp9_short_idct4x4_1_add(input, dest, stride);
+    vp9_idct4x4_1_add(input, dest, stride);
 }
 
 
-void vp9_idct_add_lossless(int16_t *input, uint8_t *dest, int stride,
-                             int eob) {
+void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
   if (eob > 1)
-    vp9_short_iwalsh4x4_add(input, dest, stride);
+    vp9_iwht4x4_16_add(input, dest, stride);
   else
-    vp9_short_iwalsh4x4_1_add_c(input, dest, stride);
+    vp9_iwht4x4_1_add(input, dest, stride);
 }
 
-void vp9_idct_add_8x8(int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
   // If eob is 1, input[0] is already the reconstructed DC value and needs
   // no dequantization; the DC coefficient is counted in eobs, so eobs >= 1.
 
@@ -1313,64 +1322,66 @@
   if (eob) {
     if (eob == 1)
       // DC only DCT coefficient
-      vp9_short_idct8x8_1_add(input, dest, stride);
+      vp9_idct8x8_1_add(input, dest, stride);
     else if (eob <= 10)
-      vp9_short_idct8x8_10_add(input, dest, stride);
+      vp9_idct8x8_10_add(input, dest, stride);
     else
-      vp9_short_idct8x8_add(input, dest, stride);
+      vp9_idct8x8_64_add(input, dest, stride);
   }
 }
 
-void vp9_idct_add_16x16(int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
+                       int eob) {
   /* The calculation can be simplified if there are not many non-zero dct
    * coefficients. Use eobs to separate different cases. */
   if (eob) {
     if (eob == 1)
       /* DC only DCT coefficient. */
-      vp9_short_idct16x16_1_add(input, dest, stride);
+      vp9_idct16x16_1_add(input, dest, stride);
     else if (eob <= 10)
-      vp9_short_idct16x16_10_add(input, dest, stride);
+      vp9_idct16x16_10_add(input, dest, stride);
     else
-      vp9_short_idct16x16_add(input, dest, stride);
+      vp9_idct16x16_256_add(input, dest, stride);
   }
 }
 
-void vp9_idct_add_32x32(int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+                       int eob) {
   if (eob) {
     if (eob == 1)
-      vp9_short_idct32x32_1_add(input, dest, stride);
+      vp9_idct32x32_1_add(input, dest, stride);
     else
-      vp9_short_idct32x32_add(input, dest, stride);
+      vp9_idct32x32_1024_add(input, dest, stride);
   }
 }
 
 // iht
-void vp9_iht_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride,
-                   int eob) {
+void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+                    int stride, int eob) {
   if (tx_type == DCT_DCT)
-    vp9_idct_add(input, dest, stride, eob);
+    vp9_idct4x4_add(input, dest, stride, eob);
   else
-    vp9_short_iht4x4_add(input, dest, stride, tx_type);
+    vp9_iht4x4_16_add(input, dest, stride, tx_type);
 }
 
-void vp9_iht_add_8x8(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
-                       int stride, int eob) {
+void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+                    int stride, int eob) {
   if (tx_type == DCT_DCT) {
-    vp9_idct_add_8x8(input, dest, stride, eob);
+    vp9_idct8x8_add(input, dest, stride, eob);
   } else {
     if (eob > 0) {
-      vp9_short_iht8x8_add(input, dest, stride, tx_type);
+      vp9_iht8x8_64_add(input, dest, stride, tx_type);
     }
   }
 }
 
-void vp9_iht_add_16x16(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
-                         int stride, int eob) {
+void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+                      int stride, int eob) {
   if (tx_type == DCT_DCT) {
-    vp9_idct_add_16x16(input, dest, stride, eob);
+    vp9_idct16x16_add(input, dest, stride, eob);
   } else {
     if (eob > 0) {
-      vp9_short_iht16x16_add(input, dest, stride, tx_type);
+      vp9_iht16x16_256_add(input, dest, stride, tx_type);
     }
   }
 }
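
The new row loop in vp9_idct32x32_1024_add_c skips the 32-point transform for all-zero rows using a branch-light OR reduction. The same reduction generalizes to any power-of-two row length; a sketch, assuming 4 <= n <= 32:

    static int row_is_nonzero(const int16_t *in, int n) {
      int16_t acc[16];  // holds at most n/2 partial ORs
      int j, m;
      for (j = 0; j < n / 2; ++j)          // first pass: n values -> n/2
        acc[j] = in[2 * j] | in[2 * j + 1];
      for (m = n / 2; m > 2; m /= 2)       // keep halving down to 2 partials
        for (j = 0; j < m / 2; ++j)
          acc[j] = acc[2 * j] | acc[2 * j + 1];
      return acc[0] | acc[1];              // nonzero iff any input is nonzero
    }
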
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index 0ef905c..2b3f35f 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -81,28 +81,27 @@
   return rv;
 }
 
-typedef void (*transform_1d)(int16_t*, int16_t*);
+typedef void (*transform_1d)(const int16_t*, int16_t*);
 
 typedef struct {
   transform_1d cols, rows;  // vertical and horizontal
 } transform_2d;
 
+void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
 
-void vp9_idct_add(int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct_add_lossless(int16_t *input, uint8_t *dest,
-                           int stride, int eob);
-void vp9_idct_add_8x8(int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct_add_16x16(int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct_add_32x32(int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
+                       int eob);
+void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+                       int eob);
 
-void vp9_iht_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
-                 int stride, int eob);
-
-void vp9_iht_add_8x8(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
-                     int stride, int eob);
-
-void vp9_iht_add_16x16(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
-                       int stride, int eob);
+void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+                    int stride, int eob);
+void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+                    int stride, int eob);
+void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+                      int stride, int eob);
 
 
 #endif  // VP9_COMMON_VP9_IDCT_H_
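
The *_add() wrappers declared here hide the eob-based variant selection (1 coefficient, at most 10 coefficients, or the full transform), so a decoder only routes on transform size. An illustrative caller, with dqcoeff and eob assumed to come from the dequantizer:

    static void inverse_transform_block_sketch(TX_SIZE tx_size,
                                               const int16_t *dqcoeff,
                                               uint8_t *dst, int stride,
                                               int eob) {
      switch (tx_size) {
        case TX_4X4:   vp9_idct4x4_add(dqcoeff, dst, stride, eob);   break;
        case TX_8X8:   vp9_idct8x8_add(dqcoeff, dst, stride, eob);   break;
        case TX_16X16: vp9_idct16x16_add(dqcoeff, dst, stride, eob); break;
        case TX_32X32: vp9_idct32x32_add(dqcoeff, dst, stride, eob); break;
        default: break;
      }
    }
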
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index b3b9e1d..2fabe2a 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -174,36 +174,17 @@
 static INLINE void foreach_predicted_block_in_plane(
     const MACROBLOCKD* const xd, BLOCK_SIZE bsize, int plane,
     foreach_predicted_block_visitor visit, void *arg) {
-  int i, x, y;
-
-  // block sizes in number of 4x4 blocks log 2 ("*_b")
-  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
-  // subsampled size of the block
   const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
   const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
 
-  // size of the predictor to use.
-  int pred_w, pred_h;
-
   if (xd->this_mi->mbmi.sb_type < BLOCK_8X8) {
+    int i = 0, x, y;
     assert(bsize == BLOCK_8X8);
-    pred_w = 0;
-    pred_h = 0;
+    for (y = 0; y < 1 << bhl; ++y)
+      for (x = 0; x < 1 << bwl; ++x)
+        visit(plane, i++, bsize, 0, 0, arg);
   } else {
-    pred_w = bwl;
-    pred_h = bhl;
-  }
-  assert(pred_w <= bwl);
-  assert(pred_h <= bhl);
-
-  // visit each subblock in raster order
-  i = 0;
-  for (y = 0; y < 1 << bhl; y += 1 << pred_h) {
-    for (x = 0; x < 1 << bwl; x += 1 << pred_w) {
-      visit(plane, i, bsize, pred_w, pred_h, arg);
-      i += 1 << pred_w;
-    }
-    i += (1 << (bwl + pred_h)) - (1 << bwl);
+    visit(plane, 0, bsize, bwl, bhl, arg);
   }
 }
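
After this rewrite, foreach_predicted_block_in_plane has exactly two shapes: sub-8x8 partitions visit every 4x4 unit with pred_w = pred_h = 0, and all other sizes get a single whole-block visit. A hypothetical visitor that tallies predicted pixels shows the contract (pred_w and pred_h are log2 sizes in 4x4 units):

    static void count_pixels(int plane, int block, BLOCK_SIZE bsize,
                             int pred_w, int pred_h, void *arg) {
      *(int *)arg += (4 << pred_w) * (4 << pred_h);
    }
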
 
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 61be7c6..21513d4 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -31,7 +31,7 @@
 # RECON
 #
 prototype void vp9_d207_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_4x4
+specialize vp9_d207_predictor_4x4 $ssse3_x86inc
 
 prototype void vp9_d45_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_d45_predictor_4x4 $ssse3_x86inc
@@ -49,7 +49,7 @@
 specialize vp9_d135_predictor_4x4
 
 prototype void vp9_d153_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d153_predictor_4x4
+specialize vp9_d153_predictor_4x4 $ssse3_x86inc
 
 prototype void vp9_v_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_v_predictor_4x4 $sse_x86inc
@@ -70,7 +70,7 @@
 specialize vp9_dc_128_predictor_4x4
 
 prototype void vp9_d207_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_8x8
+specialize vp9_d207_predictor_8x8 $ssse3_x86inc
 
 prototype void vp9_d45_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_d45_predictor_8x8 $ssse3_x86inc
@@ -88,7 +88,7 @@
 specialize vp9_d135_predictor_8x8
 
 prototype void vp9_d153_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d153_predictor_8x8
+specialize vp9_d153_predictor_8x8 $ssse3_x86inc
 
 prototype void vp9_v_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_v_predictor_8x8 $sse_x86inc
@@ -109,7 +109,7 @@
 specialize vp9_dc_128_predictor_8x8
 
 prototype void vp9_d207_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_16x16
+specialize vp9_d207_predictor_16x16 $ssse3_x86inc
 
 prototype void vp9_d45_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_d45_predictor_16x16 $ssse3_x86inc
@@ -127,7 +127,7 @@
 specialize vp9_d135_predictor_16x16
 
 prototype void vp9_d153_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d153_predictor_16x16
+specialize vp9_d153_predictor_16x16 $ssse3_x86inc
 
 prototype void vp9_v_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_v_predictor_16x16 $sse2_x86inc
@@ -148,7 +148,7 @@
 specialize vp9_dc_128_predictor_16x16
 
 prototype void vp9_d207_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_32x32
+specialize vp9_d207_predictor_32x32 $ssse3_x86inc
 
 prototype void vp9_d45_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_d45_predictor_32x32 $ssse3_x86inc
@@ -247,74 +247,72 @@
 specialize vp9_convolve_avg $sse2_x86inc neon dspr2
 
 prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8 ssse3 neon dspr2
+specialize vp9_convolve8 sse2 ssse3 neon dspr2
 
 prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_horiz ssse3 neon dspr2
+specialize vp9_convolve8_horiz sse2 ssse3 neon dspr2
 
 prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_vert ssse3 neon dspr2
+specialize vp9_convolve8_vert sse2 ssse3 neon dspr2
 
 prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg ssse3 neon dspr2
+specialize vp9_convolve8_avg sse2 ssse3 neon dspr2
 
 prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg_horiz ssse3 neon dspr2
+specialize vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2
 
 prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg_vert ssse3 neon dspr2
+specialize vp9_convolve8_avg_vert sse2 ssse3 neon dspr2
 
 #
 # dct
 #
-prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct4x4_1_add sse2 neon
+prototype void vp9_idct4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct4x4_1_add sse2 neon
 
-prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct4x4_add sse2 neon
+prototype void vp9_idct4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct4x4_16_add sse2 neon
 
-prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct8x8_1_add sse2 neon
+prototype void vp9_idct8x8_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct8x8_1_add sse2 neon
 
-prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct8x8_add sse2 neon
+prototype void vp9_idct8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct8x8_64_add sse2 neon
 
-prototype void vp9_short_idct8x8_10_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct8x8_10_add sse2 neon
+prototype void vp9_idct8x8_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct8x8_10_add sse2 neon
 
-prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct16x16_1_add sse2 neon
+prototype void vp9_idct16x16_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct16x16_1_add sse2 neon
 
-prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct16x16_add sse2 neon
+prototype void vp9_idct16x16_256_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct16x16_256_add sse2 neon
 
-prototype void vp9_short_idct16x16_10_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct16x16_10_add sse2 neon
+prototype void vp9_idct16x16_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct16x16_10_add sse2 neon
 
-prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct32x32_add sse2 neon
+prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct32x32_1024_add sse2 neon
 
-prototype void vp9_short_idct32x32_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct32x32_1_add sse2
+prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct32x32_1_add sse2
 
-prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_short_iht4x4_add sse2 neon
+prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_iht4x4_16_add sse2 neon
 
-prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_short_iht8x8_add sse2 neon
+prototype void vp9_iht8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_iht8x8_64_add sse2 neon
 
-prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
-specialize vp9_short_iht16x16_add sse2
+prototype void vp9_iht16x16_256_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_iht16x16_256_add sse2
 
-prototype void vp9_idct4_1d "int16_t *input, int16_t *output"
-specialize vp9_idct4_1d sse2
 # dct and add
 
-prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_iwalsh4x4_1_add
+prototype void vp9_iwht4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_iwht4x4_1_add
 
-prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_iwalsh4x4_add
+prototype void vp9_iwht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_iwht4x4_16_add
 
 #
 # Encoder functions below this point.
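
Each prototype line in this script becomes a dispatch symbol in the generated vp9_rtcd.h, and each specialize line lists the optimized variants that symbol may resolve to at runtime. Roughly, the generated output looks like this (illustrative, not the literal generator text):

    void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest,
                              int dest_stride);
    void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest,
                                 int dest_stride);
    RTCD_EXTERN void (*vp9_idct4x4_16_add)(const int16_t *input,
                                           uint8_t *dest, int dest_stride);
    // setup_rtcd_internal() points vp9_idct4x4_16_add at the sse2 version
    // when the CPU reports SSE2 support, else at the C version.
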
diff --git a/vp9/common/vp9_scan.c b/vp9/common/vp9_scan.c
new file mode 100644
index 0000000..f17da91
--- /dev/null
+++ b/vp9/common/vp9_scan.c
@@ -0,0 +1,357 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_scan.h"
+
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]) = {
+  0,  4,  1,  5,
+  8,  2, 12,  9,
+  3,  6, 13, 10,
+  7, 14, 11, 15,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]) = {
+  0,  4,  8,  1,
+  12,  5,  9,  2,
+  13,  6, 10,  3,
+  7, 14, 11, 15,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = {
+  0,  1,  4,  2,
+  5,  3,  6,  8,
+  9,  7, 12, 10,
+  13, 11, 14, 15,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = {
+  0,  8,  1, 16,  9,  2, 17, 24,
+  10,  3, 18, 25, 32, 11,  4, 26,
+  33, 19, 40, 12, 34, 27,  5, 41,
+  20, 48, 13, 35, 42, 28, 21,  6,
+  49, 56, 36, 43, 29,  7, 14, 50,
+  57, 44, 22, 37, 15, 51, 58, 30,
+  45, 23, 52, 59, 38, 31, 60, 53,
+  46, 39, 61, 54, 47, 62, 55, 63,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = {
+  0,  8, 16,  1, 24,  9, 32, 17,
+  2, 40, 25, 10, 33, 18, 48,  3,
+  26, 41, 11, 56, 19, 34,  4, 49,
+  27, 42, 12, 35, 20, 57, 50, 28,
+  5, 43, 13, 36, 58, 51, 21, 44,
+  6, 29, 59, 37, 14, 52, 22,  7,
+  45, 60, 30, 15, 38, 53, 23, 46,
+  31, 61, 39, 54, 47, 62, 55, 63,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = {
+  0,  1,  2,  8,  9,  3, 16, 10,
+  4, 17, 11, 24,  5, 18, 25, 12,
+  19, 26, 32,  6, 13, 20, 33, 27,
+  7, 34, 40, 21, 28, 41, 14, 35,
+  48, 42, 29, 36, 49, 22, 43, 15,
+  56, 37, 50, 44, 30, 57, 23, 51,
+  58, 45, 38, 52, 31, 59, 53, 46,
+  60, 39, 61, 47, 54, 55, 62, 63,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = {
+  0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80,
+  50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52,
+  98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69,
+  100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, 101, 131, 160, 146,
+  55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, 176, 162, 87, 56, 25,
+  133, 118, 177, 148, 72, 103, 41, 163, 10, 192, 178, 88, 57, 134, 149, 119,
+  26, 164, 73, 104, 193, 42, 179, 208, 11, 135, 89, 165, 120, 150, 58, 194,
+  180, 27, 74, 209, 105, 151, 136, 43, 90, 224, 166, 195, 181, 121, 210, 59,
+  12, 152, 106, 167, 196, 75, 137, 225, 211, 240, 182, 122, 91, 28, 197, 13,
+  226, 168, 183, 153, 44, 212, 138, 107, 241, 60, 29, 123, 198, 184, 227, 169,
+  242, 76, 213, 154, 45, 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108,
+  77, 155, 30, 15, 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140,
+  230, 62, 216, 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141,
+  63, 232, 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142,
+  219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159,
+  251,
+  190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239,
+  255,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = {
+  0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81,
+  34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4,
+  67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21,
+  146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, 116, 193, 147, 85,
+  22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, 7, 148, 194, 86, 179,
+  225, 23, 133, 39, 164, 8, 102, 210, 241, 55, 195, 118, 149, 71, 180, 24,
+  87, 226, 134, 165, 211, 40, 103, 56, 72, 150, 196, 242, 119, 9, 181, 227,
+  88, 166, 25, 135, 41, 104, 212, 57, 151, 197, 120, 73, 243, 182, 136, 167,
+  213, 89, 10, 228, 105, 152, 198, 26, 42, 121, 183, 244, 168, 58, 137, 229,
+  74, 214, 90, 153, 199, 184, 11, 106, 245, 27, 122, 230, 169, 43, 215, 59,
+  200, 138, 185, 246, 75, 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170,
+  60, 247, 232, 76, 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202,
+  233, 171, 61, 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125,
+  62, 172, 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79,
+  126, 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205,
+  236,
+  159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239,
+  255,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = {
+  0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20,
+  49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52,
+  23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69,
+  25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, 41, 56, 114, 100,
+  13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, 116, 14, 87, 130, 102,
+  144, 73, 131, 117, 28, 58, 15, 88, 43, 145, 103, 132, 146, 118, 74, 160,
+  89, 133, 104, 29, 59, 147, 119, 44, 161, 148, 90, 105, 134, 162, 120, 176,
+  75, 135, 149, 30, 60, 163, 177, 45, 121, 91, 106, 164, 178, 150, 192, 136,
+  165, 179, 31, 151, 193, 76, 122, 61, 137, 194, 107, 152, 180, 208, 46, 166,
+  167, 195, 92, 181, 138, 209, 123, 153, 224, 196, 77, 168, 210, 182, 240, 108,
+  197, 62, 154, 225, 183, 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170,
+  124, 155, 199, 78, 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186,
+  156, 229, 243, 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110,
+  157, 245, 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111,
+  158,
+  188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220,
+  175,
+  190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254,
+  255,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = {
+  0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160,
+  129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193,
+  68, 131, 37, 100,
+  225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38,
+  258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321,
+  102, 352, 8, 197,
+  71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292,
+  135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293,
+  41, 417, 199, 136,
+  262, 387, 448, 325, 356, 10, 73, 418, 231, 168, 449, 294, 388, 105,
+  419, 263, 42, 200, 357, 450, 137, 480, 74, 326, 232, 11, 389, 169,
+  295, 420, 106, 451,
+  481, 358, 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421,
+  75, 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391,
+  453, 139, 44, 234,
+  484, 297, 360, 171, 76, 515, 545, 266, 329, 454, 13, 423, 203, 108,
+  546, 485, 576, 298, 235, 140, 361, 330, 172, 547, 45, 455, 267, 577,
+  486, 77, 204, 362,
+  608, 14, 299, 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173,
+  610, 363, 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17,
+  111, 238, 48, 143,
+  80, 175, 112, 207, 49, 18, 239, 81, 113, 19, 50, 82, 114, 51,
+  83, 115, 640, 516, 392, 268, 144, 20, 672, 641, 548, 517, 424,
+  393, 300, 269, 176, 145,
+  52, 21, 704, 673, 642, 580, 549, 518, 456, 425, 394, 332, 301,
+  270, 208, 177, 146, 84, 53, 22, 736, 705, 674, 643, 612, 581,
+  550, 519, 488, 457, 426, 395,
+  364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 737,
+  706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, 241,
+  210, 179, 117, 86, 55, 738, 707,
+  614, 583, 490, 459, 366, 335, 242, 211, 118, 87, 739, 615, 491,
+  367, 243, 119, 768, 644, 520, 396, 272, 148, 24, 800, 769, 676,
+  645, 552, 521, 428, 397, 304,
+  273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, 553,
+  522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, 26,
+  864, 833, 802, 771, 740, 709,
+  678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306,
+  275, 244, 213, 182, 151, 120, 89, 58, 27, 865, 834, 803, 741,
+  710, 679, 617, 586, 555, 493,
+  462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835,
+  742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867,
+  743, 619, 495, 371, 247, 123,
+  896, 772, 648, 524, 400, 276, 152, 28, 928, 897, 804, 773, 680,
+  649, 556, 525, 432, 401, 308, 277, 184, 153, 60, 29, 960, 929,
+  898, 836, 805, 774, 712, 681,
+  650, 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154,
+  92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682,
+  651, 620, 589, 558, 527,
+  496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124,
+  93, 62, 31, 993, 962, 931, 869, 838, 807, 745, 714, 683, 621, 590,
+  559, 497, 466, 435, 373,
+  342, 311, 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715,
+  622, 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623,
+  499, 375, 251, 127,
+  900, 776, 652, 528, 404, 280, 156, 932, 901, 808, 777, 684, 653, 560,
+  529, 436, 405, 312, 281, 188, 157, 964, 933, 902, 840, 809, 778, 716,
+  685, 654, 592, 561,
+  530, 468, 437, 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903,
+  872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469,
+  438, 407, 376, 345,
+  314, 283, 252, 221, 190, 159, 997, 966, 935, 873, 842, 811, 749, 718,
+  687, 625, 594, 563, 501, 470, 439, 377, 346, 315, 253, 222, 191, 998,
+  967, 874, 843, 750,
+  719, 626, 595, 502, 471, 378, 347, 254, 223, 999, 875, 751, 627, 503,
+  379, 255, 904, 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657,
+  564, 533, 440, 409,
+  316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, 596, 565, 534,
+  472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, 876, 845, 814, 783,
+  752, 721, 690, 659,
+  628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970,
+  939, 877, 846, 815, 753, 722, 691, 629, 598, 567, 505, 474, 443, 381,
+  350, 319, 1002, 971,
+  878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, 755, 631,
+  507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, 692, 661, 568,
+  537, 444, 413, 972,
+  941, 910, 848, 817, 786, 724, 693, 662, 600, 569, 538, 476, 445, 414,
+  1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601,
+  570, 539, 508, 477,
+  446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, 571,
+  509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, 479,
+  1007, 883, 759, 635, 511,
+  912, 788, 664, 540, 944, 913, 820, 789, 696, 665, 572, 541, 976, 945,
+  914, 852, 821, 790, 728, 697, 666, 604, 573, 542, 1008, 977, 946, 915,
+  884, 853, 822, 791,
+  760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823,
+  761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, 607,
+  1011, 887, 763, 639,
+  916, 792, 668, 948, 917, 824, 793, 700, 669, 980, 949, 918, 856, 825,
+  794, 732, 701, 670, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733,
+  702, 671, 1013, 982,
+  951, 889, 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015,
+  891, 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798,
+  1016, 985, 954, 923,
+  892, 861, 830, 799, 1017, 986, 955, 893, 862, 831, 1018, 987, 894, 863,
+  1019, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021,
+  990, 959, 1022, 991, 1023,
+};
+
+// Neighborhood 2-tuples ({top, left}) for various scans and blocksizes,
+// for each position in scan order. The DC coefficient gets {0, 0}; at a
+// top or left edge, where one neighbor is missing, the available neighbor
+// is stored in both slots.
+DECLARE_ALIGNED(16, int16_t,
+                vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+                vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+                vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+                vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+                vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+                vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+                vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+                vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+                vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+                vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
+
+
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
+DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
+DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
+DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
+DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
+DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
+DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
+
+static int find_in_scan(const int16_t *scan, int l, int idx) {
+  int n, l2 = l * l;
+  for (n = 0; n < l2; n++) {
+    int rc = scan[n];
+    if (rc == idx)
+      return n;
+  }
+  assert(0);
+  return -1;
+}
+
+static void init_scan_neighbors(const int16_t *scan,
+                                int16_t *iscan,
+                                int l, int16_t *neighbors) {
+  int l2 = l * l;
+  int n, i, j;
+
+  // dc doesn't use this type of prediction
+  neighbors[MAX_NEIGHBORS * 0 + 0] = 0;
+  neighbors[MAX_NEIGHBORS * 0 + 1] = 0;
+  iscan[0] = find_in_scan(scan, l, 0);
+  for (n = 1; n < l2; n++) {
+    int rc = scan[n];
+    iscan[n] = find_in_scan(scan, l, n);
+    i = rc / l;
+    j = rc % l;
+    if (i > 0 && j > 0) {
+      // col/row scan is used for adst/dct, and generally means that
+      // energy decreases to zero much faster in the dimension in
+      // which ADST is used compared to the direction in which DCT
+      // is used. Likewise, we find much higher correlation between
+      // coefficients within the direction in which DCT is used.
+      // Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff
+      // as a context. If ADST or DCT is used in both directions, we
+      // use the combination of the two as a context.
+      int a = (i - 1) * l + j;
+      int b =  i      * l + j - 1;
+      if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 ||
+          scan == vp9_col_scan_16x16) {
+        // in the col/row scan cases (as well as left/top edge cases), we set
+        // both contexts to the same value, so we can branchlessly compute
+        // (a + b + 1) >> 1, which automatically becomes a if a == b
+        neighbors[MAX_NEIGHBORS * n + 0] =
+        neighbors[MAX_NEIGHBORS * n + 1] = a;
+      } else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 ||
+                 scan == vp9_row_scan_16x16) {
+        neighbors[MAX_NEIGHBORS * n + 0] =
+        neighbors[MAX_NEIGHBORS * n + 1] = b;
+      } else {
+        neighbors[MAX_NEIGHBORS * n + 0] = a;
+        neighbors[MAX_NEIGHBORS * n + 1] = b;
+      }
+    } else if (i > 0) {
+      neighbors[MAX_NEIGHBORS * n + 0] =
+      neighbors[MAX_NEIGHBORS * n + 1] = (i - 1) * l + j;
+    } else {
+      assert(j > 0);
+      neighbors[MAX_NEIGHBORS * n + 0] =
+      neighbors[MAX_NEIGHBORS * n + 1] =  i      * l + j - 1;
+    }
+    assert(iscan[neighbors[MAX_NEIGHBORS * n + 0]] < n);
+  }
+  // one padding item so we don't have to add branches in code to handle
+  // calls to get_coef_context() for the token after the final dc token
+  neighbors[MAX_NEIGHBORS * l2 + 0] = 0;
+  neighbors[MAX_NEIGHBORS * l2 + 1] = 0;
+}
+
+void vp9_init_neighbors() {
+  init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4,
+                      vp9_default_scan_4x4_neighbors);
+  init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4,
+                      vp9_row_scan_4x4_neighbors);
+  init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4,
+                      vp9_col_scan_4x4_neighbors);
+  init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8,
+                      vp9_default_scan_8x8_neighbors);
+  init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8,
+                      vp9_row_scan_8x8_neighbors);
+  init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8,
+                      vp9_col_scan_8x8_neighbors);
+  init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16,
+                      vp9_default_scan_16x16_neighbors);
+  init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16,
+                      vp9_row_scan_16x16_neighbors);
+  init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16,
+                      vp9_col_scan_16x16_neighbors);
+  init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32,
+                      vp9_default_scan_32x32_neighbors);
+}
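+
+// Assumption (not stated in this change): vp9_init_neighbors() is meant
+// to run once at codec initialization, before any scan/context lookup;
+// the tables it fills are deterministic, so repeated calls are harmless.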
diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h
new file mode 100644
index 0000000..a5c8463
--- /dev/null
+++ b/vp9/common/vp9_scan.h
@@ -0,0 +1,200 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_SCAN_H_
+#define VP9_COMMON_VP9_SCAN_H_
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#include "vp9/common/vp9_enums.h"
+
+#define MAX_NEIGHBORS 2
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
+
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
+
+void vp9_init_neighbors(void);
+
+static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) {
+  switch (tx_type) {
+    case ADST_DCT:
+      return vp9_row_scan_4x4;
+    case DCT_ADST:
+      return vp9_col_scan_4x4;
+    default:
+      return vp9_default_scan_4x4;
+  }
+}
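+
+// The pairing follows the rationale in init_scan_neighbors(): energy
+// decays faster along the ADST dimension, so ADST_DCT pairs with the
+// row scan and DCT_ADST with the column scan, while two-dimensional
+// DCT or ADST uses the default scan. The same mapping repeats for the
+// 8x8 and 16x16 helpers below.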
+
+static INLINE void get_scan_nb_4x4(TX_TYPE tx_type,
+                                   const int16_t **scan, const int16_t **nb) {
+  switch (tx_type) {
+    case ADST_DCT:
+      *scan = vp9_row_scan_4x4;
+      *nb = vp9_row_scan_4x4_neighbors;
+      break;
+    case DCT_ADST:
+      *scan = vp9_col_scan_4x4;
+      *nb = vp9_col_scan_4x4_neighbors;
+      break;
+    default:
+      *scan = vp9_default_scan_4x4;
+      *nb = vp9_default_scan_4x4_neighbors;
+      break;
+  }
+}
+
+static INLINE const int16_t* get_iscan_4x4(TX_TYPE tx_type) {
+  switch (tx_type) {
+    case ADST_DCT:
+      return vp9_row_iscan_4x4;
+    case DCT_ADST:
+      return vp9_col_iscan_4x4;
+    default:
+      return vp9_default_iscan_4x4;
+  }
+}
+
+static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) {
+  switch (tx_type) {
+    case ADST_DCT:
+      return vp9_row_scan_8x8;
+    case DCT_ADST:
+      return vp9_col_scan_8x8;
+    default:
+      return vp9_default_scan_8x8;
+  }
+}
+
+static INLINE void get_scan_nb_8x8(TX_TYPE tx_type,
+                                   const int16_t **scan, const int16_t **nb) {
+  switch (tx_type) {
+    case ADST_DCT:
+      *scan = vp9_row_scan_8x8;
+      *nb = vp9_row_scan_8x8_neighbors;
+      break;
+    case DCT_ADST:
+      *scan = vp9_col_scan_8x8;
+      *nb = vp9_col_scan_8x8_neighbors;
+      break;
+    default:
+      *scan = vp9_default_scan_8x8;
+      *nb = vp9_default_scan_8x8_neighbors;
+      break;
+  }
+}
+
+static INLINE const int16_t* get_iscan_8x8(TX_TYPE tx_type) {
+  switch (tx_type) {
+    case ADST_DCT:
+      return vp9_row_iscan_8x8;
+    case DCT_ADST:
+      return vp9_col_iscan_8x8;
+    default:
+      return vp9_default_iscan_8x8;
+  }
+}
+
+static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) {
+  switch (tx_type) {
+    case ADST_DCT:
+      return vp9_row_scan_16x16;
+    case DCT_ADST:
+      return vp9_col_scan_16x16;
+    default:
+      return vp9_default_scan_16x16;
+  }
+}
+
+static INLINE void get_scan_nb_16x16(TX_TYPE tx_type,
+                                     const int16_t **scan, const int16_t **nb) {
+  switch (tx_type) {
+    case ADST_DCT:
+      *scan = vp9_row_scan_16x16;
+      *nb = vp9_row_scan_16x16_neighbors;
+      break;
+    case DCT_ADST:
+      *scan = vp9_col_scan_16x16;
+      *nb = vp9_col_scan_16x16_neighbors;
+      break;
+    default:
+      *scan = vp9_default_scan_16x16;
+      *nb = vp9_default_scan_16x16_neighbors;
+      break;
+  }
+}
+
+static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) {
+  switch (tx_type) {
+    case ADST_DCT:
+      return vp9_row_iscan_16x16;
+    case DCT_ADST:
+      return vp9_col_iscan_16x16;
+    default:
+      return vp9_default_iscan_16x16;
+  }
+}
+
+static INLINE int get_coef_context(const int16_t *neighbors,
+                                   uint8_t *token_cache,
+                                   int c) {
+  return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
+          token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
+}
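+
+// Worked example: with both neighbors' token_cache entries equal to 2,
+// the context is (1 + 2 + 2) >> 1 == 2; with entries 1 and 2 it rounds
+// up to (1 + 1 + 2) >> 1 == 2. The +1 makes the two-neighbor average
+// round to nearest.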
+
+#endif  // VP9_COMMON_VP9_SCAN_H_
diff --git a/vp9/common/vp9_systemdependent.h b/vp9/common/vp9_systemdependent.h
index cc909e2..254a431 100644
--- a/vp9/common/vp9_systemdependent.h
+++ b/vp9/common/vp9_systemdependent.h
@@ -13,6 +13,7 @@
 
 #ifdef _MSC_VER
 #include <math.h>
+#define snprintf _snprintf
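+// Note: _snprintf is not a strict drop-in for C99 snprintf; it may leave
+// the buffer without a terminating NUL on truncation.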
 #endif
 
 #include "./vpx_config.h"
@@ -23,8 +24,8 @@
 #define vp9_clear_system_state()
 #endif
 
-#ifdef _MSC_VER
-// round is not defined in MSVC
+#if defined(_MSC_VER) && _MSC_VER < 1800
+// round is not defined in MSVC before VS2013.
 static int round(double x) {
   if (x < 0)
     return (int)ceil(x - 0.5);
diff --git a/vp9/common/vp9_treecoder.h b/vp9/common/vp9_treecoder.h
index 31182c3..4ba171f 100644
--- a/vp9/common/vp9_treecoder.h
+++ b/vp9/common/vp9_treecoder.h
@@ -21,6 +21,8 @@
 
 typedef int8_t vp9_tree_index;
 
+#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2)
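+// e.g. a full binary tree with 4 leaves has 3 internal nodes, each taking
+// two vp9_tree_index entries: TREE_SIZE(4) == 2 * 4 - 2 == 6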
+
 #define vp9_complement(x) (255 - x)
 
 /* We build coding trees compactly in arrays.
@@ -30,7 +32,7 @@
    Index > 0 means need another bit, specification at index.
    Nonnegative indices are always even;  processing begins at node 0. */
 
-typedef const vp9_tree_index vp9_tree[], *vp9_tree_p;
+typedef const vp9_tree_index vp9_tree[];
 
 struct vp9_token {
   int value;
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 3f1c198..ba9ceb2 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -36,90 +36,28 @@
   {   8,  8,  8,  8, 120, 120, 120, 120 }
 };
 
+typedef void filter8_1dfunction(
+  const unsigned char *src_ptr,
+  const unsigned int src_pitch,
+  unsigned char *output_ptr,
+  unsigned int out_pitch,
+  unsigned int output_height,
+  const short *filter
+);
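+
+// Using a function typedef, each asm routine below can be declared in a
+// single line; the typedef carries the full prototype.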
+
 #if HAVE_SSSE3
-void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
-                                   const unsigned int src_pitch,
-                                   unsigned char *output_ptr,
-                                   unsigned int out_pitch,
-                                   unsigned int output_height,
-                                   const short *filter);
-
-void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
-                                   const unsigned int src_pitch,
-                                   unsigned char *output_ptr,
-                                   unsigned int out_pitch,
-                                   unsigned int output_height,
-                                   const short *filter);
-
-void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
-                                   const unsigned int src_pitch,
-                                   unsigned char *output_ptr,
-                                   unsigned int out_pitch,
-                                   unsigned int output_height,
-                                   const short *filter);
-
-void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
-                                   const unsigned int src_pitch,
-                                   unsigned char *output_ptr,
-                                   unsigned int out_pitch,
-                                   unsigned int output_height,
-                                   const short *filter);
-
-void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr,
-                                   const unsigned int src_pitch,
-                                   unsigned char *output_ptr,
-                                   unsigned int out_pitch,
-                                   unsigned int output_height,
-                                   const short *filter);
-
-void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr,
-                                   const unsigned int src_pitch,
-                                   unsigned char *output_ptr,
-                                   unsigned int out_pitch,
-                                   unsigned int output_height,
-                                   const short *filter);
-
-void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr,
-                                       const unsigned int src_pitch,
-                                       unsigned char *output_ptr,
-                                       unsigned int out_pitch,
-                                       unsigned int output_height,
-                                       const short *filter);
-
-void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr,
-                                       const unsigned int src_pitch,
-                                       unsigned char *output_ptr,
-                                       unsigned int out_pitch,
-                                       unsigned int output_height,
-                                       const short *filter);
-
-void vp9_filter_block1d8_v8_avg_ssse3(const unsigned char *src_ptr,
-                                     const unsigned int src_pitch,
-                                     unsigned char *output_ptr,
-                                     unsigned int out_pitch,
-                                     unsigned int output_height,
-                                     const short *filter);
-
-void vp9_filter_block1d8_h8_avg_ssse3(const unsigned char *src_ptr,
-                                     const unsigned int src_pitch,
-                                     unsigned char *output_ptr,
-                                     unsigned int out_pitch,
-                                     unsigned int output_height,
-                                     const short *filter);
-
-void vp9_filter_block1d4_v8_avg_ssse3(const unsigned char *src_ptr,
-                                     const unsigned int src_pitch,
-                                     unsigned char *output_ptr,
-                                     unsigned int out_pitch,
-                                     unsigned int output_height,
-                                     const short *filter);
-
-void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr,
-                                     const unsigned int src_pitch,
-                                     unsigned char *output_ptr,
-                                     unsigned int out_pitch,
-                                     unsigned int output_height,
-                                     const short *filter);
+filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
 
 void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
@@ -317,3 +255,214 @@
   }
 }
 #endif
+
+#if HAVE_SSE2
+filter8_1dfunction vp9_filter_block1d16_v8_sse2;
+filter8_1dfunction vp9_filter_block1d16_h8_sse2;
+filter8_1dfunction vp9_filter_block1d8_v8_sse2;
+filter8_1dfunction vp9_filter_block1d8_h8_sse2;
+filter8_1dfunction vp9_filter_block1d4_v8_sse2;
+filter8_1dfunction vp9_filter_block1d4_h8_sse2;
+filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
+
+void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h) {
+  /* The asm handles only the unscaled case (x_step_q4 == 16); a center
+     tap of 128 marks the trivial copy filter, whose taps cannot be
+     compressed as the asm requires, so it takes the C path below. */
+  if (x_step_q4 == 16 && filter_x[3] != 128) {
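+    // Cover the block in 16-, 8-, then 4-pixel-wide column strips; each
+    // asm call filters all h rows of one strip.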
+    while (w >= 16) {
+      vp9_filter_block1d16_h8_sse2(src, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_x);
+      src += 16;
+      dst += 16;
+      w -= 16;
+    }
+    while (w >= 8) {
+      vp9_filter_block1d8_h8_sse2(src, src_stride,
+                                  dst, dst_stride,
+                                  h, filter_x);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    }
+    while (w >= 4) {
+      vp9_filter_block1d4_h8_sse2(src, src_stride,
+                                  dst, dst_stride,
+                                  h, filter_x);
+      src += 4;
+      dst += 4;
+      w -= 4;
+    }
+  }
+  if (w) {
+    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+                          filter_x, x_step_q4, filter_y, y_step_q4,
+                          w, h);
+  }
+}
+
+void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x, int x_step_q4,
+                             const int16_t *filter_y, int y_step_q4,
+                             int w, int h) {
+  if (y_step_q4 == 16 && filter_y[3] != 128) {
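+    // The 8-tap kernel is anchored at tap 3, so filtering starts three
+    // rows above the first output row (src - src_stride * 3).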
+    while (w >= 16) {
+      vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride,
+                                   dst, dst_stride,
+                                   h, filter_y);
+      src += 16;
+      dst += 16;
+      w -= 16;
+    }
+    while (w >= 8) {
+      vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride,
+                                  dst, dst_stride,
+                                  h, filter_y);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    }
+    while (w >= 4) {
+      vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride,
+                                  dst, dst_stride,
+                                  h, filter_y);
+      src += 4;
+      dst += 4;
+      w -= 4;
+    }
+  }
+  if (w) {
+    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+                         filter_x, x_step_q4, filter_y, y_step_q4,
+                         w, h);
+  }
+}
+
+void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4,
+                                  int w, int h) {
+  if (x_step_q4 == 16 && filter_x[3] != 128) {
+    while (w >= 16) {
+      vp9_filter_block1d16_h8_avg_sse2(src, src_stride,
+                                       dst, dst_stride,
+                                       h, filter_x);
+      src += 16;
+      dst += 16;
+      w -= 16;
+    }
+    while (w >= 8) {
+      vp9_filter_block1d8_h8_avg_sse2(src, src_stride,
+                                      dst, dst_stride,
+                                      h, filter_x);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    }
+    while (w >= 4) {
+      vp9_filter_block1d4_h8_avg_sse2(src, src_stride,
+                                      dst, dst_stride,
+                                      h, filter_x);
+      src += 4;
+      dst += 4;
+      w -= 4;
+    }
+  }
+  if (w) {
+    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h);
+  }
+}
+
+void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const int16_t *filter_x, int x_step_q4,
+                                 const int16_t *filter_y, int y_step_q4,
+                                 int w, int h) {
+  if (y_step_q4 == 16 && filter_y[3] != 128) {
+    while (w >= 16) {
+      vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride,
+                                       dst, dst_stride,
+                                       h, filter_y);
+      src += 16;
+      dst += 16;
+      w -= 16;
+    }
+    while (w >= 8) {
+      vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride,
+                                      dst, dst_stride,
+                                      h, filter_y);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    }
+    while (w >= 4) {
+      vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride,
+                                      dst, dst_stride,
+                                      h, filter_y);
+      src += 4;
+      dst += 4;
+      w -= 4;
+    }
+  }
+  if (w) {
+    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+                             filter_x, x_step_q4, filter_y, y_step_q4,
+                             w, h);
+  }
+}
+
+void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+                        uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *filter_x, int x_step_q4,
+                        const int16_t *filter_y, int y_step_q4,
+                        int w, int h) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);
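+  // Intermediate buffer for the two-pass filter: 64 columns by up to
+  // h + 7 rows (the vertical pass needs 7 extra rows of context).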
+
+  assert(w <= 64);
+  assert(h <= 64);
+  if (x_step_q4 == 16 && y_step_q4 == 16) {
+    vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
+                             filter_x, x_step_q4, filter_y, y_step_q4,
+                             w, h + 7);
+    vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
+                            filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+  } else {
+    vp9_convolve8_c(src, src_stride, dst, dst_stride,
+                    filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+  }
+}
+
+void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4,
+                            int w, int h) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);
+
+  assert(w <= 64);
+  assert(h <= 64);
+  if (x_step_q4 == 16 && y_step_q4 == 16) {
+    vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
+                             filter_x, x_step_q4, filter_y, y_step_q4,
+                             w, h + 7);
+    vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
+                                filter_x, x_step_q4, filter_y, y_step_q4,
+                                w, h);
+  } else {
+    vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                        filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+  }
+}
+#endif
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index d00993c..cfec36b 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -15,7 +15,7 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
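+// Naming convention: the numeric suffix gives the number of nonzero
+// coefficients the variant is specialized for (16 here; cf. the 256/10/1
+// variants of the 16x16 idct below).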
+void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
   const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
@@ -26,10 +26,10 @@
   __m128i input0, input1, input2, input3;
 
   // Rows
-  input0 = _mm_loadl_epi64((__m128i *)input);
-  input1 = _mm_loadl_epi64((__m128i *)(input + 4));
-  input2 = _mm_loadl_epi64((__m128i *)(input + 8));
-  input3 = _mm_loadl_epi64((__m128i *)(input + 12));
+  input0 = _mm_loadl_epi64((const __m128i *)input);
+  input1 = _mm_loadl_epi64((const __m128i *)(input + 4));
+  input2 = _mm_loadl_epi64((const __m128i *)(input + 8));
+  input3 = _mm_loadl_epi64((const __m128i *)(input + 12));
 
   // Construct i3, i1, i3, i1, i2, i0, i2, i0
   input0 = _mm_shufflelo_epi16(input0, 0xd8);
@@ -148,7 +148,7 @@
   RECON_AND_STORE4X4(dest, input3);
 }
 
-void vp9_short_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   __m128i dc_value;
   const __m128i zero = _mm_setzero_si128();
   int a;
@@ -165,41 +165,6 @@
   RECON_AND_STORE4X4(dest, dc_value);
 }
 
-void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
-                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
-                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
-                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
-  const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1);
-
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i in, temp;
-
-  // Load input data.
-  in = _mm_loadl_epi64((__m128i *)input);
-
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  in = _mm_shufflelo_epi16(in, 0xd8);
-  in = _mm_unpacklo_epi32(in, in);
-
-  // Stage 1
-  in = _mm_madd_epi16(in, c1);
-  in = _mm_add_epi32(in, rounding);
-  in = _mm_srai_epi32(in, DCT_CONST_BITS);
-  in = _mm_packs_epi32(in, zero);
-
-  // Stage 2
-  temp = _mm_shufflelo_epi16(in, 0x9c);
-  in = _mm_shufflelo_epi16(in, 0xc9);
-  in = _mm_unpacklo_epi64(temp, in);
-  in = _mm_madd_epi16(in, c2);
-  in = _mm_packs_epi32(in, zero);
-
-  // Store results
-  _mm_storel_epi64((__m128i *)output, in);
-}
-
 static INLINE void transpose_4x4(__m128i *res) {
   const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
   const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
@@ -210,7 +175,7 @@
   res[3] = _mm_unpackhi_epi64(res[2], res[2]);
 }
 
-void idct4_1d_sse2(__m128i *in) {
+static void idct4_1d_sse2(__m128i *in) {
   const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
@@ -249,7 +214,7 @@
   in[3] = _mm_sub_epi16(u[0], u[3]);
 }
 
-void iadst4_1d_sse2(__m128i *in) {
+static void iadst4_1d_sse2(__m128i *in) {
   const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
   const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
   const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
@@ -299,16 +264,16 @@
   in[3] = _mm_unpackhi_epi64(in[1], in[1]);
 }
 
-void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride,
-                               int tx_type) {
+void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+                            int tx_type) {
   __m128i in[4];
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
 
-  in[0] = _mm_loadl_epi64((__m128i *)input);
-  in[1] = _mm_loadl_epi64((__m128i *)(input + 4));
-  in[2] = _mm_loadl_epi64((__m128i *)(input + 8));
-  in[3] = _mm_loadl_epi64((__m128i *)(input + 12));
+  in[0] = _mm_loadl_epi64((const __m128i *)input);
+  in[1] = _mm_loadl_epi64((const __m128i *)(input + 4));
+  in[2] = _mm_loadl_epi64((const __m128i *)(input + 8));
+  in[3] = _mm_loadl_epi64((const __m128i *)(input + 12));
 
   switch (tx_type) {
     case 0:  // DCT_DCT
@@ -529,7 +494,7 @@
       dest += stride; \
   }
 
-void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<4);
@@ -549,14 +514,14 @@
   int i;
 
   // Load input data.
-  in0 = _mm_load_si128((__m128i *)input);
-  in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
-  in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
-  in3 = _mm_load_si128((__m128i *)(input + 8 * 3));
-  in4 = _mm_load_si128((__m128i *)(input + 8 * 4));
-  in5 = _mm_load_si128((__m128i *)(input + 8 * 5));
-  in6 = _mm_load_si128((__m128i *)(input + 8 * 6));
-  in7 = _mm_load_si128((__m128i *)(input + 8 * 7));
+  in0 = _mm_load_si128((const __m128i *)input);
+  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
 
   // 2-D
   for (i = 0; i < 2; i++) {
@@ -597,7 +562,7 @@
   RECON_AND_STORE(dest, in7);
 }
 
-void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   __m128i dc_value;
   const __m128i zero = _mm_setzero_si128();
   int a;
@@ -648,7 +613,7 @@
   res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
 }
 
-void idct8_1d_sse2(__m128i *in) {
+static void idct8_1d_sse2(__m128i *in) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
@@ -689,7 +654,7 @@
   in[7] = in7;
 }
 
-void iadst8_1d_sse2(__m128i *in) {
+static void iadst8_1d_sse2(__m128i *in) {
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
@@ -918,21 +883,21 @@
 }
 
 
-void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride,
-                               int tx_type) {
+void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+                            int tx_type) {
   __m128i in[8];
   const __m128i zero = _mm_setzero_si128();
   const __m128i final_rounding = _mm_set1_epi16(1<<4);
 
   // load input data
-  in[0] = _mm_load_si128((__m128i *)input);
-  in[1] = _mm_load_si128((__m128i *)(input + 8 * 1));
-  in[2] = _mm_load_si128((__m128i *)(input + 8 * 2));
-  in[3] = _mm_load_si128((__m128i *)(input + 8 * 3));
-  in[4] = _mm_load_si128((__m128i *)(input + 8 * 4));
-  in[5] = _mm_load_si128((__m128i *)(input + 8 * 5));
-  in[6] = _mm_load_si128((__m128i *)(input + 8 * 6));
-  in[7] = _mm_load_si128((__m128i *)(input + 8 * 7));
+  in[0] = _mm_load_si128((const __m128i *)input);
+  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+  in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+  in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+  in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+  in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
 
   switch (tx_type) {
     case 0:  // DCT_DCT
@@ -985,7 +950,7 @@
   RECON_AND_STORE(dest, in[7]);
 }
 
-void vp9_short_idct8x8_10_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<4);
@@ -1005,10 +970,10 @@
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
   // Rows. Load 4-row input data.
-  in0 = _mm_load_si128((__m128i *)input);
-  in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
-  in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
-  in3 = _mm_load_si128((__m128i *)(input + 8 * 3));
+  in0 = _mm_load_si128((const __m128i *)input);
+  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
 
   // 8x4 Transpose
   TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)
@@ -1263,7 +1228,8 @@
                            stp2_10, stp2_13, stp2_11, stp2_12) \
   }
 
-void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
+                                int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
   const __m128i zero = _mm_setzero_si128();
@@ -1318,22 +1284,22 @@
       if (i == 1) input += 128;
 
       // Load input data.
-      in0 = _mm_load_si128((__m128i *)input);
-      in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
-      in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
-      in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
-      in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
-      in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
-      in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
-      in11 = _mm_load_si128((__m128i *)(input + 8 * 7));
-      in4 = _mm_load_si128((__m128i *)(input + 8 * 8));
-      in12 = _mm_load_si128((__m128i *)(input + 8 * 9));
-      in5 = _mm_load_si128((__m128i *)(input + 8 * 10));
-      in13 = _mm_load_si128((__m128i *)(input + 8 * 11));
-      in6 = _mm_load_si128((__m128i *)(input + 8 * 12));
-      in14 = _mm_load_si128((__m128i *)(input + 8 * 13));
-      in7 = _mm_load_si128((__m128i *)(input + 8 * 14));
-      in15 = _mm_load_si128((__m128i *)(input + 8 * 15));
+      in0 = _mm_load_si128((const __m128i *)input);
+      in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+      in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+      in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+      in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+      in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+      in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+      in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
+      in4 = _mm_load_si128((const __m128i *)(input + 8 * 8));
+      in12 = _mm_load_si128((const __m128i *)(input + 8 * 9));
+      in5 = _mm_load_si128((const __m128i *)(input + 8 * 10));
+      in13 = _mm_load_si128((const __m128i *)(input + 8 * 11));
+      in6 = _mm_load_si128((const __m128i *)(input + 8 * 12));
+      in14 = _mm_load_si128((const __m128i *)(input + 8 * 13));
+      in7 = _mm_load_si128((const __m128i *)(input + 8 * 14));
+      in15 = _mm_load_si128((const __m128i *)(input + 8 * 15));
 
       TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
@@ -1470,7 +1436,7 @@
   }
 }
 
-void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   __m128i dc_value;
   const __m128i zero = _mm_setzero_si128();
   int a, i;
@@ -1519,7 +1485,7 @@
   res0[15] = tbuf[7];
 }
 
-void iadst16_1d_8col(__m128i *in) {
+static void iadst16_1d_8col(__m128i *in) {
   // perform 16x16 1-D ADST for 8 columns
   __m128i s[16], x[16], u[32], v[32];
   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -1989,7 +1955,7 @@
   in[15] = _mm_sub_epi16(kZero, s[1]);
 }
 
-void idct16_1d_8col(__m128i *in) {
+static void idct16_1d_8col(__m128i *in) {
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
@@ -2333,36 +2299,36 @@
   in[15] = _mm_sub_epi16(s[0], s[15]);
 }
 
-void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
+static void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
   array_transpose_16x16(in0, in1);
   idct16_1d_8col(in0);
   idct16_1d_8col(in1);
 }
 
-void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
+static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
   array_transpose_16x16(in0, in1);
   iadst16_1d_8col(in0);
   iadst16_1d_8col(in1);
 }
 
-static INLINE void load_buffer_8x16(int16_t *input, __m128i *in) {
-  in[0]  = _mm_load_si128((__m128i *)(input + 0 * 16));
-  in[1]  = _mm_load_si128((__m128i *)(input + 1 * 16));
-  in[2]  = _mm_load_si128((__m128i *)(input + 2 * 16));
-  in[3]  = _mm_load_si128((__m128i *)(input + 3 * 16));
-  in[4]  = _mm_load_si128((__m128i *)(input + 4 * 16));
-  in[5]  = _mm_load_si128((__m128i *)(input + 5 * 16));
-  in[6]  = _mm_load_si128((__m128i *)(input + 6 * 16));
-  in[7]  = _mm_load_si128((__m128i *)(input + 7 * 16));
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
+  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * 16));
+  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * 16));
+  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * 16));
+  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * 16));
+  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * 16));
+  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * 16));
+  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * 16));
+  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * 16));
 
-  in[8]  = _mm_load_si128((__m128i *)(input + 8 * 16));
-  in[9]  = _mm_load_si128((__m128i *)(input + 9 * 16));
-  in[10]  = _mm_load_si128((__m128i *)(input + 10 * 16));
-  in[11]  = _mm_load_si128((__m128i *)(input + 11 * 16));
-  in[12]  = _mm_load_si128((__m128i *)(input + 12 * 16));
-  in[13]  = _mm_load_si128((__m128i *)(input + 13 * 16));
-  in[14]  = _mm_load_si128((__m128i *)(input + 14 * 16));
-  in[15]  = _mm_load_si128((__m128i *)(input + 15 * 16));
+  in[8]  = _mm_load_si128((const __m128i *)(input + 8 * 16));
+  in[9]  = _mm_load_si128((const __m128i *)(input + 9 * 16));
+  in[10]  = _mm_load_si128((const __m128i *)(input + 10 * 16));
+  in[11]  = _mm_load_si128((const __m128i *)(input + 11 * 16));
+  in[12]  = _mm_load_si128((const __m128i *)(input + 12 * 16));
+  in[13]  = _mm_load_si128((const __m128i *)(input + 13 * 16));
+  in[14]  = _mm_load_si128((const __m128i *)(input + 14 * 16));
+  in[15]  = _mm_load_si128((const __m128i *)(input + 15 * 16));
 }
 
 static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
@@ -2421,8 +2387,8 @@
   RECON_AND_STORE(dest, in[15]);
 }
 
-void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride,
-                                 int tx_type) {
+void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+                               int tx_type) {
   __m128i in0[16], in1[16];
 
   load_buffer_8x16(input, in0);
@@ -2456,8 +2422,8 @@
   write_buffer_8x16(dest, in1, stride);
 }
 
-void vp9_short_idct16x16_10_add_sse2(int16_t *input, uint8_t *dest,
-                                     int stride) {
+void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
+                               int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
   const __m128i zero = _mm_setzero_si128();
@@ -2503,14 +2469,14 @@
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   int i;
   // 1-D idct. Load input data.
-  in0 = _mm_load_si128((__m128i *)input);
-  in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
-  in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
-  in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
-  in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
-  in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
-  in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
-  in11 = _mm_load_si128((__m128i *)(input + 8 * 7));
+  in0 = _mm_load_si128((const __m128i *)input);
+  in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+  in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+  in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+  in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+  in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+  in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
 
   TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
   TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);
@@ -2815,11 +2781,12 @@
 
 #define LOAD_DQCOEFF(reg, input) \
   {  \
-    reg = _mm_load_si128((__m128i *) input); \
+    reg = _mm_load_si128((const __m128i *) input); \
     input += 8; \
   }  \
 
-void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
+                                 int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
 
@@ -3550,7 +3517,7 @@
   }
 }  //NOLINT
 
-void vp9_short_idct32x32_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   __m128i dc_value;
   const __m128i zero = _mm_setzero_si128();
   int a, i;
diff --git a/vp9/common/x86/vp9_intrapred_ssse3.asm b/vp9/common/x86/vp9_intrapred_ssse3.asm
index c51d011..314d1a2 100644
--- a/vp9/common/x86/vp9_intrapred_ssse3.asm
+++ b/vp9/common/x86/vp9_intrapred_ssse3.asm
@@ -13,27 +13,23 @@
 SECTION_RODATA
 
 pb_1: times 16 db 1
-pw_2: times 8 dw 2
-pb_7m1: times 8 db 7, -1
-pb_15: times 16 db 15
-
 sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
 sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
 sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
 sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
 sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
 sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
-sh_b2w01234577: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 7, -1, 7, -1
-sh_b2w12345677: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1
-sh_b2w23456777: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1, 7, -1
-sh_b2w01234567: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1
-sh_b2w12345678: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1
-sh_b2w23456789: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1
-sh_b2w89abcdef: db 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1
-sh_b2w9abcdeff: db 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1
-sh_b2wabcdefff: db 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1, 15, -1
 sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
 sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
+sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0
+sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 
 SECTION .text
 
@@ -455,3 +451,590 @@
   jnz .loop
   RESTORE_GOT
   REP_RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  movd                m0, [leftq]               ; l1, l2, l3, l4
+  movd                m1, [aboveq-1]            ; tl, t1, t2, t3
+  punpckldq           m0, m1                    ; l1, l2, l3, l4, tl, t1, t2, t3
+  pshufb              m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3
+  psrldq              m1, m0, 1                 ; l3, l2, l1, tl, t1, t2, t3
+  psrldq              m2, m0, 2                 ; l2, l1, tl, t1, t2, t3
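+  ; X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 x, y, z, dst (macro defined elsewhere in
+  ; this file) computes, per byte, dst = (x + 2*y + z + 2) >> 2, i.e. the
+  ; standard 3-tap smoothing filter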
+  ; comments below are for a predictor like this
+  ; A1 B1 C1 D1
+  ; A2 B2 A1 B1
+  ; A3 B3 A2 B2
+  ; A4 B4 A3 B3
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3  ; 3-tap avg B4 B3 B2 B1 C1 D1
+  pavgb               m1, m0                    ; 2-tap avg A4 A3 A2 A1
+
+  punpcklqdq          m3, m1                    ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..
+
+  DEFINE_ARGS dst, stride, stride3
+  lea           stride3q, [strideq*3]
+  pshufb              m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
+  movd  [dstq+stride3q ], m3
+  psrldq              m3, 2                     ; A3 B3 A2 B2 A1 B1 C1 D1 ..
+  movd  [dstq+strideq*2], m3
+  psrldq              m3, 2                     ; A2 B2 A1 B1 C1 D1 ..
+  movd  [dstq+strideq  ], m3
+  psrldq              m3, 2                     ; A1 B1 C1 D1 ..
+  movd  [dstq          ], m3
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  movq                m0, [leftq]                     ; [0- 7] l1-8 [byte]
+  movhps              m0, [aboveq-1]                  ; [8-15] tl, t1-7 [byte]
+  pshufb              m1, m0, [GLOBAL(sh_b76543210)]  ; l8-1 [byte]
+  pshufb              m2, m0, [GLOBAL(sh_b65432108)]  ; l7-1,tl [byte]
+  pshufb              m3, m0, [GLOBAL(sh_b54321089)]  ; l6-1,tl,t1 [byte]
+  pshufb              m0, [GLOBAL(sh_b89abcdef)]      ; tl,t1-7 [byte]
+  psrldq              m4, m0, 1                       ; t1-7 [byte]
+  psrldq              m5, m0, 2                       ; t2-7 [byte]
+  ; comments below are for a predictor like this
+  ; A1 B1 C1 D1 E1 F1 G1 H1
+  ; A2 B2 A1 B1 C1 D1 E1 F1
+  ; A3 B3 A2 B2 A1 B1 C1 D1
+  ; A4 B4 A3 B3 A2 B2 A1 B1
+  ; A5 B5 A4 B4 A3 B3 A2 B2
+  ; A6 B6 A5 B5 A4 B4 A3 B3
+  ; A7 B7 A6 B6 A5 B5 A4 B4
+  ; A8 B8 A7 B7 A6 B6 A5 B5
+  pavgb               m6, m1, m2                ; 2-tap avg A8-A1
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7  ; 3-tap avg C-H1
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0  ; 3-tap avg B8-1
+
+  punpcklbw           m6, m0                    ; A-B8, A-B7 ... A-B2, A-B1
+
+  DEFINE_ARGS dst, stride, stride3
+  lea           stride3q, [strideq*3]
+
+  movhps [dstq+stride3q], m6                    ; A-B4, A-B3, A-B2, A-B1
+  palignr             m0, m7, m6, 10            ; A-B3, A-B2, A-B1, C-H1
+  movq  [dstq+strideq*2], m0
+  psrldq              m0, 2                     ; A-B2, A-B1, C-H1
+  movq  [dstq+strideq  ], m0
+  psrldq              m0, 2                     ; A-H1
+  movq  [dstq          ], m0
+  lea               dstq, [dstq+strideq*4]
+  movq  [dstq+stride3q ], m6                    ; A-B8, A-B7, A-B6, A-B5
+  psrldq              m6, 2                     ; A-B7, A-B6, A-B5, A-B4
+  movq  [dstq+strideq*2], m6
+  psrldq              m6, 2                     ; A-B6, A-B5, A-B4, A-B3
+  movq  [dstq+strideq  ], m6
+  psrldq              m6, 2                     ; A-B5, A-B4, A-B3, A-B2
+  movq  [dstq          ], m6
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  mova                m0, [leftq]
+  movu                m7, [aboveq-1]
+  ; comments below are for a predictor like this
+  ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
+  ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
+  ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
+  ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
+  ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
+  ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
+  ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
+  ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
+  ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
+  ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
+  ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
+  ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
+  ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
+  ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
+  ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
+  ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
+  pshufb              m6, m7, [GLOBAL(sh_bfedcba9876543210)]
+  palignr             m5, m0, m6, 15
+  palignr             m3, m0, m6, 14
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4          ; 3-tap avg B3-Bg
+  pshufb              m1, m0, [GLOBAL(sh_b123456789abcdeff)]
+  pavgb               m5, m0                            ; A1 - Ag
+
+  punpcklbw           m0, m4, m5                        ; A-B8 ... A-B1
+  punpckhbw           m4, m5                            ; A-B9 ... A-Bg
+
+  pshufb              m3, m7, [GLOBAL(sh_b123456789abcdeff)]
+  pshufb              m5, m7, [GLOBAL(sh_b23456789abcdefff)]
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1          ; 3-tap avg C1-P1
+
+  pshufb              m6, m0, [GLOBAL(sh_bfedcba9876543210)]
+  DEFINE_ARGS dst, stride, stride3
+  lea           stride3q, [strideq*3]
+  palignr             m2, m1, m6, 14
+  mova  [dstq          ], m2
+  palignr             m2, m1, m6, 12
+  mova  [dstq+strideq  ], m2
+  palignr             m2, m1, m6, 10
+  mova  [dstq+strideq*2], m2
+  palignr             m2, m1, m6, 8
+  mova  [dstq+stride3q ], m2
+  lea               dstq, [dstq+strideq*4]
+  palignr             m2, m1, m6, 6
+  mova  [dstq          ], m2
+  palignr             m2, m1, m6, 4
+  mova  [dstq+strideq  ], m2
+  palignr             m2, m1, m6, 2
+  mova  [dstq+strideq*2], m2
+  pshufb              m4, [GLOBAL(sh_bfedcba9876543210)]
+  mova  [dstq+stride3q ], m6
+  lea               dstq, [dstq+strideq*4]
+
+  palignr             m2, m6, m4, 14
+  mova  [dstq          ], m2
+  palignr             m2, m6, m4, 12
+  mova  [dstq+strideq  ], m2
+  palignr             m2, m6, m4, 10
+  mova  [dstq+strideq*2], m2
+  palignr             m2, m6, m4, 8
+  mova  [dstq+stride3q ], m2
+  lea               dstq, [dstq+strideq*4]
+  palignr             m2, m6, m4, 6
+  mova  [dstq          ], m2
+  palignr             m2, m6, m4, 4
+  mova  [dstq+strideq  ], m2
+  palignr             m2, m6, m4, 2
+  mova  [dstq+strideq*2], m2
+  mova  [dstq+stride3q ], m4
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+  mova                  m0, [leftq]
+  movu                  m7, [aboveq-1]
+  movu                  m1, [aboveq+15]
+
+  pshufb                m4, m1, [GLOBAL(sh_b123456789abcdeff)]
+  pshufb                m6, m1, [GLOBAL(sh_b23456789abcdefff)]
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2          ; 3-tap avg above [high]
+
+  palignr               m3, m1, m7, 1
+  palignr               m5, m1, m7, 2
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1          ; 3-tap avg above [low]
+
+  pshufb                m7, [GLOBAL(sh_bfedcba9876543210)]
+  palignr               m5, m0, m7, 15
+  palignr               m3, m0, m7, 14
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4          ; 3-tap avg B3-Bg
+  pavgb                 m5, m0                            ; A1 - Ag
+  punpcklbw             m6, m4, m5                        ; A-B8 ... A-B1
+  punpckhbw             m4, m5                            ; A-B9 ... A-Bg
+  pshufb                m6, [GLOBAL(sh_bfedcba9876543210)]
+  pshufb                m4, [GLOBAL(sh_bfedcba9876543210)]
+
+  DEFINE_ARGS dst, stride, stride3, left, line
+  lea             stride3q, [strideq*3]
+
+  palignr               m5, m2, m1, 14
+  palignr               m7, m1, m6, 14
+  mova  [dstq            ], m7
+  mova  [dstq+16         ], m5
+  palignr               m5, m2, m1, 12
+  palignr               m7, m1, m6, 12
+  mova  [dstq+strideq    ], m7
+  mova  [dstq+strideq+16 ], m5
+  palignr                m5, m2, m1, 10
+  palignr                m7, m1, m6, 10
+  mova  [dstq+strideq*2   ], m7
+  mova  [dstq+strideq*2+16], m5
+  palignr                m5, m2, m1, 8
+  palignr                m7, m1, m6, 8
+  mova  [dstq+stride3q    ], m7
+  mova  [dstq+stride3q+16 ], m5
+  lea                  dstq, [dstq+strideq*4]
+  palignr                m5, m2, m1, 6
+  palignr                m7, m1, m6, 6
+  mova  [dstq             ], m7
+  mova  [dstq+16          ], m5
+  palignr                m5, m2, m1, 4
+  palignr                m7, m1, m6, 4
+  mova  [dstq+strideq     ], m7
+  mova  [dstq+strideq+16  ], m5
+  palignr                m5, m2, m1, 2
+  palignr                m7, m1, m6, 2
+  mova  [dstq+strideq*2   ], m7
+  mova  [dstq+strideq*2+16], m5
+  mova  [dstq+stride3q    ], m6
+  mova  [dstq+stride3q+16 ], m1
+  lea                  dstq, [dstq+strideq*4]
+
+  palignr                m5, m1, m6, 14
+  palignr                m3, m6, m4, 14
+  mova  [dstq             ], m3
+  mova  [dstq+16          ], m5
+  palignr                m5, m1, m6, 12
+  palignr                m3, m6, m4, 12
+  mova  [dstq+strideq     ], m3
+  mova  [dstq+strideq+16  ], m5
+  palignr                m5, m1, m6, 10
+  palignr                m3, m6, m4, 10
+  mova  [dstq+strideq*2   ], m3
+  mova  [dstq+strideq*2+16], m5
+  palignr                m5, m1, m6, 8
+  palignr                m3, m6, m4, 8
+  mova  [dstq+stride3q    ], m3
+  mova  [dstq+stride3q+16 ], m5
+  lea                  dstq, [dstq+strideq*4]
+  palignr                m5, m1, m6, 6
+  palignr                m3, m6, m4, 6
+  mova  [dstq             ], m3
+  mova  [dstq+16          ], m5
+  palignr                m5, m1, m6, 4
+  palignr                m3, m6, m4, 4
+  mova  [dstq+strideq     ], m3
+  mova  [dstq+strideq+16  ], m5
+  palignr                m5, m1, m6, 2
+  palignr                m3, m6, m4, 2
+  mova  [dstq+strideq*2   ], m3
+  mova  [dstq+strideq*2+16], m5
+  mova  [dstq+stride3q    ], m4
+  mova  [dstq+stride3q+16 ], m6
+  lea               dstq, [dstq+strideq*4]
+
+  mova                   m7, [leftq]
+  mova                   m3, [leftq+16]
+  palignr                m5, m3, m7, 15
+  palignr                m0, m3, m7, 14
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2          ; 3-tap avg Bh -
+  pavgb                  m5, m3                            ; Ah -
+  punpcklbw              m3, m2, m5                        ; A-B8 ... A-B1
+  punpckhbw              m2, m5                            ; A-B9 ... A-Bg
+  pshufb                 m3, [GLOBAL(sh_bfedcba9876543210)]
+  pshufb                 m2, [GLOBAL(sh_bfedcba9876543210)]
+
+  palignr                m7, m6, m4, 14
+  palignr                m0, m4, m3, 14
+  mova  [dstq             ], m0
+  mova  [dstq+16          ], m7
+  palignr                m7, m6, m4, 12
+  palignr                m0, m4, m3, 12
+  mova  [dstq+strideq     ], m0
+  mova  [dstq+strideq+16  ], m7
+  palignr                m7, m6, m4, 10
+  palignr                m0, m4, m3, 10
+  mova  [dstq+strideq*2   ], m0
+  mova  [dstq+strideq*2+16], m7
+  palignr                m7, m6, m4, 8
+  palignr                m0, m4, m3, 8
+  mova  [dstq+stride3q    ], m0
+  mova  [dstq+stride3q+16 ], m7
+  lea                  dstq, [dstq+strideq*4]
+  palignr                m7, m6, m4, 6
+  palignr                m0, m4, m3, 6
+  mova  [dstq             ], m0
+  mova  [dstq+16          ], m7
+  palignr                m7, m6, m4, 4
+  palignr                m0, m4, m3, 4
+  mova  [dstq+strideq     ], m0
+  mova  [dstq+strideq+16  ], m7
+  palignr                m7, m6, m4, 2
+  palignr                m0, m4, m3, 2
+  mova  [dstq+strideq*2   ], m0
+  mova  [dstq+strideq*2+16], m7
+  mova  [dstq+stride3q    ], m3
+  mova  [dstq+stride3q+16 ], m4
+  lea                  dstq, [dstq+strideq*4]
+
+  palignr                m7, m4, m3, 14
+  palignr                m0, m3, m2, 14
+  mova  [dstq             ], m0
+  mova  [dstq+16          ], m7
+  palignr                m7, m4, m3, 12
+  palignr                m0, m3, m2, 12
+  mova  [dstq+strideq     ], m0
+  mova  [dstq+strideq+16  ], m7
+  palignr                m7, m4, m3, 10
+  palignr                m0, m3, m2, 10
+  mova  [dstq+strideq*2   ], m0
+  mova  [dstq+strideq*2+16], m7
+  palignr                m7, m4, m3, 8
+  palignr                m0, m3, m2, 8
+  mova  [dstq+stride3q    ], m0
+  mova  [dstq+stride3q+16 ], m7
+  lea                  dstq, [dstq+strideq*4]
+  palignr                m7, m4, m3, 6
+  palignr                m0, m3, m2, 6
+  mova  [dstq             ], m0
+  mova  [dstq+16          ], m7
+  palignr                m7, m4, m3, 4
+  palignr                m0, m3, m2, 4
+  mova  [dstq+strideq     ], m0
+  mova  [dstq+strideq+16  ], m7
+  palignr                m7, m4, m3, 2
+  palignr                m0, m3, m2, 2
+  mova  [dstq+strideq*2   ], m0
+  mova  [dstq+strideq*2+16], m7
+  mova  [dstq+stride3q    ], m2
+  mova  [dstq+stride3q+16 ], m3
+
+  RESTORE_GOT
+  RET
+
+INIT_MMX ssse3
+cglobal d207_predictor_4x4, 2, 5, 4, dst, stride, unused, left, goffset
+  GET_GOT     goffsetq
+  movifnidn        leftq, leftmp
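+  ; D207 builds every row from the left column alone: interleaved 2-tap
+  ; and 3-tap averages of successive left pixels, shifted down one pair
+  ; per row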
+  movd                m0, [leftq]                ; abcd [byte]
+  pshufb              m1, m0, [GLOBAL(sh_b1233)] ; bcdd [byte]
+  pshufb              m3, m0, [GLOBAL(sh_b2333)] ; cddd
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2
+  pavgb               m1, m0             ; ab, bc, cd, d [byte]
+
+  punpcklbw           m1, m2             ; ab, a2bc, bc, b2cd, cd, c3d, d, d
+  movd    [dstq        ], m1
+  psrlq               m1, 16             ; bc, b2cd, cd, c3d, d, d
+  movd    [dstq+strideq], m1
+  lea               dstq, [dstq+strideq*2]
+  psrlq               m1, 16             ; cd, c3d, d, d
+  movd    [dstq        ], m1
+  pshufw              m1, m1, q1111      ; d, d, d, d
+  movd    [dstq+strideq], m1
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d207_predictor_8x8, 2, 5, 4, dst, stride, stride3, left, goffset
+  GET_GOT     goffsetq
+  movifnidn        leftq, leftmp
+  movq                m3, [leftq]            ; abcdefgh [byte]
+  lea           stride3q, [strideq*3]
+
+  pshufb              m1, m3, [GLOBAL(sh_b2345677777777777)]
+  pshufb              m0, m3, [GLOBAL(sh_b0123456777777777)]
+  pshufb              m2, m3, [GLOBAL(sh_b1234567777777777)]
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3
+  pavgb               m0, m2
+  punpcklbw           m0, m3        ; interleaved output
+
+  movq  [dstq          ], m0
+  psrldq              m0, 2
+  movq  [dstq+strideq  ], m0
+  psrldq              m0, 2
+  movq  [dstq+strideq*2], m0
+  psrldq              m0, 2
+  movq  [dstq+stride3q ], m0
+  lea               dstq, [dstq+strideq*4]
+  pshufhw             m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh
+  psrldq              m0, 2
+  movq  [dstq          ], m0
+  psrldq              m0, 2
+  movq  [dstq+strideq  ], m0
+  psrldq              m0, 2
+  movq  [dstq+strideq*2], m0
+  psrldq              m0, 2
+  movq  [dstq+stride3q ], m0
+  RESTORE_GOT
+  RET
+
+INIT_XMM ssse3
+cglobal d207_predictor_16x16, 2, 5, 5, dst, stride, stride3, left, goffset
+  GET_GOT     goffsetq
+  lea           stride3q, [strideq*3]
+  movifnidn        leftq, leftmp
+  mova                m0, [leftq]            ; abcdefghijklmnop [byte]
+  pshufb              m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp
+  pshufb              m2, m0, [GLOBAL(sh_b23456789abcdefff)]
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
+  pavgb               m1, m0                 ; ab, bc, cd .. no, op, pp [byte]
+
+  punpckhbw           m4, m1, m3    ; interleaved input
+  punpcklbw           m1, m3        ; interleaved output
+  mova  [dstq          ], m1
+  palignr             m3, m4, m1, 2
+  mova  [dstq+strideq  ], m3
+  palignr             m3, m4, m1, 4
+  mova  [dstq+strideq*2], m3
+  palignr             m3, m4, m1, 6
+  mova  [dstq+stride3q ], m3
+  lea               dstq, [dstq+strideq*4]
+  palignr             m3, m4, m1, 8
+  mova  [dstq          ], m3
+  palignr             m3, m4, m1, 10
+  mova  [dstq+strideq  ], m3
+  palignr             m3, m4, m1, 12
+  mova  [dstq+strideq*2], m3
+  palignr             m3, m4, m1, 14
+  mova  [dstq+stride3q ], m3
+  DEFINE_ARGS dst, stride, stride3, line
+  mov              lined, 2
+  mova                m0, [GLOBAL(sh_b23456789abcdefff)]
+.loop:
+  lea               dstq, [dstq+strideq*4]
+  mova  [dstq          ], m4
+  pshufb              m4, m0
+  mova  [dstq+strideq  ], m4
+  pshufb              m4, m0
+  mova  [dstq+strideq*2], m4
+  pshufb              m4, m0
+  mova  [dstq+stride3q ], m4
+  pshufb              m4, m0
+  dec              lined
+  jnz .loop
+  RESTORE_GOT
+  REP_RET
+
+INIT_XMM ssse3
+cglobal d207_predictor_32x32, 2, 5, 8, dst, stride, stride3, left, goffset
+  GET_GOT     goffsetq
+  lea           stride3q, [strideq*3]
+  movifnidn        leftq, leftmp
+  mova                m1, [leftq]              ;  0-15 [byte]
+  mova                m2, [leftq+16]           ; 16-31 [byte]
+  pshufb              m0, m2, [GLOBAL(sh_b23456789abcdefff)]
+  pshufb              m4, m2, [GLOBAL(sh_b123456789abcdeff)]
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3
+  palignr             m6, m2, m1, 1
+  palignr             m5, m2, m1, 2
+  pavgb               m2, m4         ; high 16px even lines
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0
+  pavgb                   m1, m6         ; low 16px even lines
+
+  punpckhbw               m6, m1, m0               ; interleaved output 2
+  punpcklbw               m1, m0                   ; interleaved output 1
+
+  punpckhbw               m7, m2, m3               ; interleaved output 4
+  punpcklbw               m2, m3                   ; interleaved output 3
+
+  ; output 1st 8 lines (and half of 2nd 8 lines)
+  DEFINE_ARGS dst, stride, stride3, dst8
+  lea                  dst8q, [dstq+strideq*8]
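+  ; each D207 row is the row above shifted left by two pixels, so the
+  ; right 16 bytes of row r equal the left 16 bytes of row r+8; every
+  ; value below is therefore stored both at [dstq+...+16] and [dst8q+...]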
+  mova  [dstq              ], m1
+  mova  [dstq           +16], m6
+  mova  [dst8q             ], m6
+  palignr             m0, m6, m1, 2
+  palignr             m4, m2, m6, 2
+  mova  [dstq +strideq     ], m0
+  mova  [dstq +strideq  +16], m4
+  mova  [dst8q+strideq     ], m4
+  palignr             m0, m6, m1, 4
+  palignr             m4, m2, m6, 4
+  mova  [dstq +strideq*2   ], m0
+  mova  [dstq +strideq*2+16], m4
+  mova  [dst8q+strideq*2   ], m4
+  palignr             m0, m6, m1, 6
+  palignr             m4, m2, m6, 6
+  mova  [dstq +stride3q    ], m0
+  mova  [dstq +stride3q +16], m4
+  mova  [dst8q+stride3q    ], m4
+  lea               dstq, [dstq +strideq*4]
+  lea              dst8q, [dst8q+strideq*4]
+  palignr             m0, m6, m1, 8
+  palignr             m4, m2, m6, 8
+  mova  [dstq              ], m0
+  mova  [dstq           +16], m4
+  mova  [dst8q             ], m4
+  palignr             m0, m6, m1, 10
+  palignr             m4, m2, m6, 10
+  mova  [dstq +strideq     ], m0
+  mova  [dstq +strideq  +16], m4
+  mova  [dst8q+strideq     ], m4
+  palignr             m0, m6, m1, 12
+  palignr             m4, m2, m6, 12
+  mova  [dstq +strideq*2   ], m0
+  mova  [dstq +strideq*2+16], m4
+  mova  [dst8q+strideq*2   ], m4
+  palignr             m0, m6, m1, 14
+  palignr             m4, m2, m6, 14
+  mova  [dstq +stride3q    ], m0
+  mova  [dstq +stride3q +16], m4
+  mova  [dst8q+stride3q    ], m4
+  lea               dstq, [dstq+strideq*4]
+  lea              dst8q, [dst8q+strideq*4]
+
+  ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines
+  mova  [dstq           +16], m2
+  mova  [dst8q             ], m2
+  palignr             m4, m7, m2, 2
+  mova  [dstq +strideq  +16], m4
+  mova  [dst8q+strideq     ], m4
+  palignr             m4, m7, m2, 4
+  mova  [dstq +strideq*2+16], m4
+  mova  [dst8q+strideq*2   ], m4
+  palignr             m4, m7, m2, 6
+  mova  [dstq +stride3q +16], m4
+  mova  [dst8q+stride3q    ], m4
+  lea               dstq, [dstq+strideq*4]
+  lea              dst8q, [dst8q+strideq*4]
+  palignr             m4, m7, m2, 8
+  mova  [dstq           +16], m4
+  mova  [dst8q             ], m4
+  palignr             m4, m7, m2, 10
+  mova  [dstq +strideq  +16], m4
+  mova  [dst8q+strideq     ], m4
+  palignr             m4, m7, m2, 12
+  mova  [dstq +strideq*2+16], m4
+  mova  [dst8q+strideq*2   ], m4
+  palignr             m4, m7, m2, 14
+  mova  [dstq +stride3q +16], m4
+  mova  [dst8q+stride3q    ], m4
+  lea               dstq, [dstq+strideq*4]
+  lea              dst8q, [dst8q+strideq*4]
+
+  ; output 2nd half of 3rd 8 lines and half of 4th 8 lines
+  mova                m0, [GLOBAL(sh_b23456789abcdefff)]
+  mova  [dstq           +16], m7
+  mova  [dst8q             ], m7
+  pshufb              m7, m0
+  mova  [dstq +strideq  +16], m7
+  mova  [dst8q+strideq     ], m7
+  pshufb              m7, m0
+  mova  [dstq +strideq*2+16], m7
+  mova  [dst8q+strideq*2   ], m7
+  pshufb              m7, m0
+  mova  [dstq +stride3q +16], m7
+  mova  [dst8q+stride3q    ], m7
+  pshufb              m7, m0
+  lea               dstq, [dstq+strideq*4]
+  lea              dst8q, [dst8q+strideq*4]
+  mova  [dstq           +16], m7
+  mova  [dst8q             ], m7
+  pshufb              m7, m0
+  mova  [dstq +strideq  +16], m7
+  mova  [dst8q+strideq     ], m7
+  pshufb              m7, m0
+  mova  [dstq +strideq*2+16], m7
+  mova  [dst8q+strideq*2   ], m7
+  pshufb              m7, m0
+  mova  [dstq +stride3q +16], m7
+  mova  [dst8q+stride3q    ], m7
+  pshufb              m7, m0
+  lea               dstq, [dstq+strideq*4]
+
+  ; output last half of 4th 8 lines
+  mova  [dstq           +16], m7
+  mova  [dstq +strideq  +16], m7
+  mova  [dstq +strideq*2+16], m7
+  mova  [dstq +stride3q +16], m7
+  lea               dstq, [dstq+strideq*4]
+  mova  [dstq           +16], m7
+  mova  [dstq +strideq  +16], m7
+  mova  [dstq +strideq*2+16], m7
+  mova  [dstq +stride3q +16], m7
+
+  ; done!
+  RESTORE_GOT
+  RET
diff --git a/vp9/common/x86/vp9_subpixel_8t_sse2.asm b/vp9/common/x86/vp9_subpixel_8t_sse2.asm
new file mode 100644
index 0000000..9dc8d0a
--- /dev/null
+++ b/vp9/common/x86/vp9_subpixel_8t_sse2.asm
@@ -0,0 +1,987 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after the other taps to
+;avoid overflow.
+
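+;GET_FILTERS_4 expands the 8 filter taps at arg(5) into word-pair constants
+;on the stack: k0k1, k2k3, k5k4 and k6k7 each hold one tap broadcast in the
+;low qword and its partner in the high qword.  krd is 0x00400040 = 64 in
+;every word (the rounding term for the final >>7) and zero is an all-zero
+;register for byte<->word unpacking.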
+%macro GET_FILTERS_4 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm7, [rdx]                 ;load filters
+    pshuflw     xmm0, xmm7, 0b              ;k0
+    pshuflw     xmm1, xmm7, 01010101b       ;k1
+    pshuflw     xmm2, xmm7, 10101010b       ;k2
+    pshuflw     xmm3, xmm7, 11111111b       ;k3
+    psrldq      xmm7, 8
+    pshuflw     xmm4, xmm7, 0b              ;k4
+    pshuflw     xmm5, xmm7, 01010101b       ;k5
+    pshuflw     xmm6, xmm7, 10101010b       ;k6
+    pshuflw     xmm7, xmm7, 11111111b       ;k7
+
+    punpcklqdq  xmm0, xmm1
+    punpcklqdq  xmm2, xmm3
+    punpcklqdq  xmm5, xmm4
+    punpcklqdq  xmm6, xmm7
+
+    movdqa      k0k1, xmm0
+    movdqa      k2k3, xmm2
+    movdqa      k5k4, xmm5
+    movdqa      k6k7, xmm6
+
+    movq        xmm6, rcx
+    pshufd      xmm6, xmm6, 0
+    movdqa      krd, xmm6
+
+    pxor        xmm7, xmm7
+    movdqa      zero, xmm7
+%endm
+
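+;APPLY_FILTER_4 expects the 8 input rows (or horizontal offsets) in
+;xmm0-xmm7.  Pairs of inputs share a register so each pmullw applies two
+;taps at once; the partial sums are folded with saturating adds and the
+;centre taps (k3, k4) land last, per the overflow note above.  The total is
+;rounded, shifted right by 7 and packed back to 4 bytes.  A non-zero %1
+;additionally averages the result with the 4 bytes already at [rdi].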
+%macro APPLY_FILTER_4 1
+    punpckldq   xmm0, xmm1                  ;two rows in one register
+    punpckldq   xmm6, xmm7
+    punpckldq   xmm2, xmm3
+    punpckldq   xmm5, xmm4
+
+    punpcklbw   xmm0, zero                  ;unpack to word
+    punpcklbw   xmm6, zero
+    punpcklbw   xmm2, zero
+    punpcklbw   xmm5, zero
+
+    pmullw      xmm0, k0k1                  ;multiply the filter factors
+    pmullw      xmm6, k6k7
+    pmullw      xmm2, k2k3
+    pmullw      xmm5, k5k4
+
+    paddsw      xmm0, xmm6                  ;sum
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 8
+    paddsw      xmm0, xmm1
+    paddsw      xmm0, xmm2
+    psrldq      xmm2, 8
+    paddsw      xmm0, xmm5
+    psrldq      xmm5, 8
+    paddsw      xmm0, xmm2
+    paddsw      xmm0, xmm5
+
+    paddsw      xmm0, krd                   ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack to byte
+
+%if %1
+    movd        xmm1, [rdi]
+    pavgb       xmm0, xmm1
+%endif
+    movd        [rdi], xmm0
+%endm
+
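+;GET_FILTERS broadcasts each of the 8 taps at arg(5) into a full register
+;of words (k0..k7 on the stack), sets up the krd/zero constants as above,
+;and loads src_ptr/output_ptr into rsi/rdi.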
+%macro GET_FILTERS 0
+    mov         rdx, arg(5)                 ;filter ptr
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+    mov         rcx, 0x0400040
+
+    movdqa      xmm7, [rdx]                 ;load filters
+    pshuflw     xmm0, xmm7, 0b              ;k0
+    pshuflw     xmm1, xmm7, 01010101b       ;k1
+    pshuflw     xmm2, xmm7, 10101010b       ;k2
+    pshuflw     xmm3, xmm7, 11111111b       ;k3
+    pshufhw     xmm4, xmm7, 0b              ;k4
+    pshufhw     xmm5, xmm7, 01010101b       ;k5
+    pshufhw     xmm6, xmm7, 10101010b       ;k6
+    pshufhw     xmm7, xmm7, 11111111b       ;k7
+
+    punpcklwd   xmm0, xmm0
+    punpcklwd   xmm1, xmm1
+    punpcklwd   xmm2, xmm2
+    punpcklwd   xmm3, xmm3
+    punpckhwd   xmm4, xmm4
+    punpckhwd   xmm5, xmm5
+    punpckhwd   xmm6, xmm6
+    punpckhwd   xmm7, xmm7
+
+    movdqa      k0,   xmm0                  ;store filter factors on stack
+    movdqa      k1,   xmm1
+    movdqa      k2,   xmm2
+    movdqa      k3,   xmm3
+    movdqa      k4,   xmm4
+    movdqa      k5,   xmm5
+    movdqa      k6,   xmm6
+    movdqa      k7,   xmm7
+
+    movq        xmm6, rcx
+    pshufd      xmm6, xmm6, 0
+    movdqa      krd, xmm6                   ;rounding
+
+    pxor        xmm7, xmm7
+    movdqa      zero, xmm7
+%endm
+
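+;LOAD_VERT_8 fetches 8 bytes from each of 8 consecutive source rows at
+;column offset %1 (rax = src_pitch, rdx = src_pitch * 3).  It advances rsi
+;by one row as a side effect, which is what steps the callers' loops down
+;the image.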
+%macro LOAD_VERT_8 1
+    movq        xmm0, [rsi + %1]            ;0
+    movq        xmm1, [rsi + rax + %1]      ;1
+    movq        xmm6, [rsi + rdx * 2 + %1]  ;6
+    lea         rsi,  [rsi + rax]
+    movq        xmm7, [rsi + rdx * 2 + %1]  ;7
+    movq        xmm2, [rsi + rax + %1]      ;2
+    movq        xmm3, [rsi + rax * 2 + %1]  ;3
+    movq        xmm4, [rsi + rdx + %1]      ;4
+    movq        xmm5, [rsi + rax * 4 + %1]  ;5
+%endm
+
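+;APPLY_FILTER_8 multiplies rows 0..7 (xmm0..xmm7, one tap each), sums the
+;products with saturation, rounds, shifts right by 7 and packs the result
+;to 8 bytes at [rdi + %2].  A non-zero %1 averages the result with the
+;bytes already stored there.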
+%macro APPLY_FILTER_8 2
+    punpcklbw   xmm0, zero
+    punpcklbw   xmm1, zero
+    punpcklbw   xmm6, zero
+    punpcklbw   xmm7, zero
+    punpcklbw   xmm2, zero
+    punpcklbw   xmm5, zero
+    punpcklbw   xmm3, zero
+    punpcklbw   xmm4, zero
+
+    pmullw      xmm0, k0
+    pmullw      xmm1, k1
+    pmullw      xmm6, k6
+    pmullw      xmm7, k7
+    pmullw      xmm2, k2
+    pmullw      xmm5, k5
+    pmullw      xmm3, k3
+    pmullw      xmm4, k4
+
+    paddsw      xmm0, xmm1
+    paddsw      xmm0, xmm6
+    paddsw      xmm0, xmm7
+    paddsw      xmm0, xmm2
+    paddsw      xmm0, xmm5
+    paddsw      xmm0, xmm3
+    paddsw      xmm0, xmm4
+
+    paddsw      xmm0, krd                   ;rounding
+    psraw       xmm0, 7                     ;shift
+    packuswb    xmm0, xmm0                  ;pack back to byte
+%if %1
+    movq        xmm1, [rdi + %2]
+    pavgb       xmm0, xmm1
+%endif
+    movq        [rdi + %2], xmm0
+%endm
+
+;void vp9_filter_block1d4_v8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d4_v8_sse2) PRIVATE
+sym(vp9_filter_block1d4_v8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 6
+    %define k0k1 [rsp + 16 * 0]
+    %define k2k3 [rsp + 16 * 1]
+    %define k5k4 [rsp + 16 * 2]
+    %define k6k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define zero [rsp + 16 * 5]
+
+    GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
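+    ; the load sequence below mirrors LOAD_VERT_8 but with 4-byte loads;
+    ; rsi ends one row further down, so each iteration filters the next
+    ; output row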
+.loop:
+    movd        xmm0, [rsi]                 ;load src: row 0
+    movd        xmm1, [rsi + rax]           ;1
+    movd        xmm6, [rsi + rdx * 2]       ;6
+    lea         rsi,  [rsi + rax]
+    movd        xmm7, [rsi + rdx * 2]       ;7
+    movd        xmm2, [rsi + rax]           ;2
+    movd        xmm3, [rsi + rax * 2]       ;3
+    movd        xmm4, [rsi + rdx]           ;4
+    movd        xmm5, [rsi + rax * 4]       ;5
+
+    APPLY_FILTER_4 0
+
+    lea         rdi, [rdi + rbx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 6
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d8_v8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d8_v8_sse2) PRIVATE
+sym(vp9_filter_block1d8_v8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    LOAD_VERT_8 0
+    APPLY_FILTER_8 0, 0
+
+    lea         rdi, [rdi + rbx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d16_v8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int   src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int   out_pitch,
+;    unsigned int   output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d16_v8_sse2) PRIVATE
+sym(vp9_filter_block1d16_v8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    LOAD_VERT_8 0
+    APPLY_FILTER_8 0, 0
+    sub         rsi, rax
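+    ; LOAD_VERT_8 left rsi one row down; step back so the same 8 rows can
+    ; be reloaded at column offset 8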
+
+    LOAD_VERT_8 8
+    APPLY_FILTER_8 0, 8
+    add         rdi, rbx
+
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
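+;Identical to vp9_filter_block1d4_v8_sse2 above, except APPLY_FILTER_4 is
+;invoked with %1 = 1 so the result is averaged with the bytes already at
+;output_ptr.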
+global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d4_v8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 6
+    %define k0k1 [rsp + 16 * 0]
+    %define k2k3 [rsp + 16 * 1]
+    %define k5k4 [rsp + 16 * 2]
+    %define k6k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define zero [rsp + 16 * 5]
+
+    GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movd        xmm0, [rsi]                 ;load src: row 0
+    movd        xmm1, [rsi + rax]           ;1
+    movd        xmm6, [rsi + rdx * 2]       ;6
+    lea         rsi,  [rsi + rax]
+    movd        xmm7, [rsi + rdx * 2]       ;7
+    movd        xmm2, [rsi + rax]           ;2
+    movd        xmm3, [rsi + rax * 2]       ;3
+    movd        xmm4, [rsi + rdx]           ;4
+    movd        xmm5, [rsi + rax * 4]       ;5
+
+    APPLY_FILTER_4 1
+
+    lea         rdi, [rdi + rbx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 6
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
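+;Averaging variant of vp9_filter_block1d8_v8_sse2 (APPLY_FILTER_8 %1 = 1).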
+global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d8_v8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+.loop:
+    LOAD_VERT_8 0
+    APPLY_FILTER_8 1, 0
+
+    lea         rdi, [rdi + rbx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
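+;Averaging variant of vp9_filter_block1d16_v8_sse2 (APPLY_FILTER_8 %1 = 1).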
+global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d16_v8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
+    lea         rdx, [rax + rax * 2]
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+.loop:
+    LOAD_VERT_8 0
+    APPLY_FILTER_8 1, 0
+    sub         rsi, rax
+
+    LOAD_VERT_8 8
+    APPLY_FILTER_8 1, 8
+    add         rdi, rbx
+
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d4_h8_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d4_h8_sse2) PRIVATE
+sym(vp9_filter_block1d4_h8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 6
+    %define k0k1 [rsp + 16 * 0]
+    %define k2k3 [rsp + 16 * 1]
+    %define k5k4 [rsp + 16 * 2]
+    %define k6k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define zero [rsp + 16 * 5]
+
+    GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
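+    ; load 16 bytes starting 3 pixels left of the output position; the
+    ; psrldq shifts below line up the tap inputs at offsets -3..+4 for
+    ; each of the 4 output pixels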
+.loop:
+    movdqu      xmm0,   [rsi - 3]           ;load src
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm3, 3
+    psrldq      xmm5, 5
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_4 0
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 6
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d8_h8_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d8_h8_sse2) PRIVATE
+sym(vp9_filter_block1d8_h8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 3]           ;load src
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm5, 5
+    psrldq      xmm3, 3
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_8 0, 0
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_filter_block1d16_h8_sse2
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    short *filter
+;)
+global sym(vp9_filter_block1d16_h8_sse2) PRIVATE
+sym(vp9_filter_block1d16_h8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 3]           ;load src
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm5, 5
+    psrldq      xmm3, 3
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_8 0, 0
+
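+    ; second half: [rsi + 5] is [rsi + 8 - 3], the same -3 alignment for
+    ; output pixels 8..15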
+    movdqu      xmm0,   [rsi + 5]           ;load src
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm5, 5
+    psrldq      xmm3, 3
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_8 0, 8
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
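+;Averaging variant of vp9_filter_block1d4_h8_sse2 (APPLY_FILTER_4 %1 = 1).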
+global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d4_h8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 6
+    %define k0k1 [rsp + 16 * 0]
+    %define k2k3 [rsp + 16 * 1]
+    %define k5k4 [rsp + 16 * 2]
+    %define k6k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define zero [rsp + 16 * 5]
+
+    GET_FILTERS_4
+
+    mov         rsi, arg(0)                 ;src_ptr
+    mov         rdi, arg(2)                 ;output_ptr
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 3]           ;load src
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm3, 3
+    psrldq      xmm5, 5
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_4 1
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 6
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
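+;Averaging variant of vp9_filter_block1d8_h8_sse2 (APPLY_FILTER_8 %1 = 1).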
+global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d8_h8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 3]           ;load src
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm5, 5
+    psrldq      xmm3, 3
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_8 1, 0
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
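+;Averaging variant of vp9_filter_block1d16_h8_sse2 (APPLY_FILTER_8 %1 = 1).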
+global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE
+sym(vp9_filter_block1d16_h8_avg_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
+    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
+    movsxd      rcx, DWORD PTR arg(4)       ;output_height
+
+.loop:
+    movdqu      xmm0,   [rsi - 3]           ;load src
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm5, 5
+    psrldq      xmm3, 3
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_8 1, 0
+
+    movdqu      xmm0,   [rsi + 5]           ;load src
+
+    movdqa      xmm1, xmm0
+    movdqa      xmm6, xmm0
+    movdqa      xmm7, xmm0
+    movdqa      xmm2, xmm0
+    movdqa      xmm5, xmm0
+    movdqa      xmm3, xmm0
+    movdqa      xmm4, xmm0
+
+    psrldq      xmm1, 1
+    psrldq      xmm6, 6
+    psrldq      xmm7, 7
+    psrldq      xmm2, 2
+    psrldq      xmm5, 5
+    psrldq      xmm3, 3
+    psrldq      xmm4, 4
+
+    APPLY_FILTER_8 1, 8
+
+    lea         rsi, [rsi + rax]
+    lea         rdi, [rdi + rdx]
+    dec         rcx
+    jnz         .loop
+
+    add rsp, 16 * 10
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 224a724..d89d6b8 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -363,15 +363,14 @@
   int i, j;
   for (j = 0; j < SWITCHABLE_FILTERS + 1; ++j)
     for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB,
-                           &fc->switchable_interp_prob[j][i]);
+      vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
 }
 
 static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
   int i, j;
   for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
     for (j = 0; j < INTER_MODES - 1; ++j)
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &fc->inter_mode_probs[i][j]);
+      vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
 }
 
 static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
@@ -426,6 +425,45 @@
   mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode);
 }
 
+static INLINE void assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode,
+                             int_mv mv[2], int_mv best_mv[2],
+                             int_mv nearest_mv[2], int_mv near_mv[2],
+                             int is_compound, int allow_hp, vp9_reader *r) {
+  int i;
+
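+  // NEWMV decodes one (or, for compound prediction, two) motion vector
+  // residuals against best_mv; the remaining modes copy the nearest/near
+  // candidate or zero the vector without reading from the bitstream.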
+  switch (mode) {
+    case NEWMV:
+      read_mv(r, &mv[0].as_mv, &best_mv[0].as_mv,
+              &cm->fc.nmvc, &cm->counts.mv, allow_hp);
+      if (is_compound)
+        read_mv(r, &mv[1].as_mv, &best_mv[1].as_mv,
+                &cm->fc.nmvc, &cm->counts.mv, allow_hp);
+      break;
+    case NEARESTMV:
+      mv[0].as_int = nearest_mv[0].as_int;
+      if (is_compound)
+        mv[1].as_int = nearest_mv[1].as_int;
+      break;
+    case NEARMV:
+      mv[0].as_int = near_mv[0].as_int;
+      if (is_compound)
+        mv[1].as_int = near_mv[1].as_int;
+      break;
+    case ZEROMV:
+      mv[0].as_int = 0;
+      if (is_compound)
+        mv[1].as_int = 0;
+      break;
+    default:
+      assert(!"Invalid inter mode value.");
+  }
+
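+  // Every decoded component must lie strictly inside (MV_LOW, MV_UPP).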
+  for (i = 0; i < 1 + is_compound; ++i) {
+    assert(mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW);
+    assert(mv[i].as_mv.col < MV_UPP && mv[i].as_mv.col > MV_LOW);
+  }
+}
+
 static int read_is_inter_block(VP9D_COMP *pbi, int segment_id, vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
@@ -445,10 +483,7 @@
                                        int mi_row, int mi_col, vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
-  nmv_context *const nmvc = &cm->fc.nmvc;
   MB_MODE_INFO *const mbmi = &mi->mbmi;
-  int_mv *const mv0 = &mbmi->mv[0];
-  int_mv *const mv1 = &mbmi->mv[1];
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int allow_hp = xd->allow_high_precision_mv;
 
@@ -518,41 +553,12 @@
                                           mi_row, mi_col);
         }
 
-        switch (b_mode) {
-          case NEWMV:
-            read_mv(r, &block[0].as_mv, &best[0].as_mv, nmvc, &cm->counts.mv,
-                    allow_hp);
-            if (is_compound)
-              read_mv(r, &block[1].as_mv, &best[1].as_mv, nmvc, &cm->counts.mv,
-                      allow_hp);
-            break;
-          case NEARESTMV:
-            block[0].as_int = nearest[0].as_int;
-            if (is_compound)
-              block[1].as_int = nearest[1].as_int;
-            break;
-          case NEARMV:
-            block[0].as_int = nearmv[0].as_int;
-            if (is_compound)
-              block[1].as_int = nearmv[1].as_int;
-            break;
-          case ZEROMV:
-            block[0].as_int = 0;
-            if (is_compound)
-              block[1].as_int = 0;
-            break;
-          default:
-            assert(!"Invalid inter mode value");
-        }
-        mi->bmi[j].as_mv[0].as_int = block[0].as_int;
-        assert(block[0].as_mv.row < MV_UPP && block[0].as_mv.row > MV_LOW);
-        assert(block[0].as_mv.col < MV_UPP && block[0].as_mv.col > MV_LOW);
+        assign_mv(cm, b_mode, block, best, nearest, nearmv,
+                  is_compound, allow_hp, r);
 
-        if (is_compound) {
+        mi->bmi[j].as_mv[0].as_int = block[0].as_int;
+        if (is_compound)
           mi->bmi[j].as_mv[1].as_int = block[1].as_int;
-          assert(block[1].as_mv.row < MV_UPP && block[1].as_mv.row > MV_LOW);
-          assert(block[1].as_mv.col < MV_UPP && block[1].as_mv.col > MV_LOW);
-        }
 
         if (num_4x4_h == 2)
           mi->bmi[j + 2] = mi->bmi[j];
@@ -562,43 +568,12 @@
     }
 
     mi->mbmi.mode = b_mode;
-    mv0->as_int = mi->bmi[3].as_mv[0].as_int;
-    mv1->as_int = mi->bmi[3].as_mv[1].as_int;
+
+    mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+    mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
   } else {
-    switch (mbmi->mode) {
-      case NEARMV:
-        mv0->as_int = nearmv[0].as_int;
-        if (is_compound)
-          mv1->as_int = nearmv[1].as_int;
-        break;
-
-      case NEARESTMV:
-        mv0->as_int = nearest[0].as_int;
-        if (is_compound)
-          mv1->as_int = nearest[1].as_int;
-        break;
-
-      case ZEROMV:
-        mv0->as_int = 0;
-        if (is_compound)
-          mv1->as_int = 0;
-        break;
-
-      case NEWMV:
-        read_mv(r, &mv0->as_mv, &best[0].as_mv, nmvc, &cm->counts.mv, allow_hp);
-        if (is_compound)
-          read_mv(r, &mv1->as_mv, &best[1].as_mv, nmvc, &cm->counts.mv,
-                  allow_hp);
-        break;
-      default:
-        assert(!"Invalid inter mode value");
-    }
-    assert(mv0->as_mv.row < MV_UPP && mv0->as_mv.row > MV_LOW);
-    assert(mv0->as_mv.col < MV_UPP && mv0->as_mv.col > MV_LOW);
-    if (is_compound) {
-      assert(mv1->as_mv.row < MV_UPP && mv1->as_mv.row > MV_LOW);
-      assert(mv1->as_mv.col < MV_UPP && mv1->as_mv.col > MV_LOW);
-    }
+    assign_mv(cm, mbmi->mode, mbmi->mv, best, nearest, nearmv,
+              is_compound, allow_hp, r);
   }
 }
 
@@ -630,17 +605,17 @@
 
   if (cm->comp_pred_mode == HYBRID_PREDICTION)
     for (i = 0; i < COMP_INTER_CONTEXTS; i++)
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.comp_inter_prob[i]);
+      vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
 
   if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
     for (i = 0; i < REF_CONTEXTS; i++) {
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.single_ref_prob[i][0]);
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.single_ref_prob[i][1]);
+      vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
+      vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
     }
 
   if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
     for (i = 0; i < REF_CONTEXTS; i++)
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.comp_ref_prob[i]);
+      vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
 }
 
 void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) {
@@ -650,7 +625,7 @@
   // TODO(jkoleszar): does this clear more than MBSKIP_CONTEXTS? Maybe remove.
   // vpx_memset(cm->fc.mbskip_probs, 0, sizeof(cm->fc.mbskip_probs));
   for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-    vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.mbskip_probs[k]);
+    vp9_diff_update_prob(r, &cm->fc.mbskip_probs[k]);
 
   if (cm->frame_type != KEY_FRAME && !cm->intra_only) {
     nmv_context *const nmvc = &pbi->common.fc.nmvc;
@@ -663,18 +638,17 @@
       read_switchable_interp_probs(&cm->fc, r);
 
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.intra_inter_prob[i]);
+      vp9_diff_update_prob(r, &cm->fc.intra_inter_prob[i]);
 
     read_comp_pred(cm, r);
 
     for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
       for (i = 0; i < INTRA_MODES - 1; ++i)
-        vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.y_mode_prob[j][i]);
+        vp9_diff_update_prob(r, &cm->fc.y_mode_prob[j][i]);
 
     for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j)
       for (i = 0; i < PARTITION_TYPES - 1; ++i)
-        vp9_diff_update_prob(r, MODE_UPDATE_PROB,
-                             &cm->fc.partition_prob[INTER_FRAME][j][i]);
+        vp9_diff_update_prob(r, &cm->fc.partition_prob[INTER_FRAME][j][i]);
 
     read_mv_probs(r, nmvc, xd->allow_high_precision_mv);
   }
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 8b23c73..cc3422f 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -63,15 +63,15 @@
 
   for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
     for (j = 0; j < TX_SIZES - 3; ++j)
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &tx_probs->p8x8[i][j]);
+      vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]);
 
   for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
     for (j = 0; j < TX_SIZES - 2; ++j)
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &tx_probs->p16x16[i][j]);
+      vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]);
 
   for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
     for (j = 0; j < TX_SIZES - 1; ++j)
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &tx_probs->p32x32[i][j]);
+      vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]);
 }
 
 static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) {
@@ -101,19 +101,19 @@
         if (tx_type == DCT_DCT)
           xd->itxm_add(qcoeff, dst, stride, eob);
         else
-          vp9_iht_add(tx_type, qcoeff, dst, stride, eob);
+          vp9_iht4x4_add(tx_type, qcoeff, dst, stride, eob);
         break;
       case TX_8X8:
         tx_type = get_tx_type_8x8(pd->plane_type, xd);
-        vp9_iht_add_8x8(tx_type, qcoeff, dst, stride, eob);
+        vp9_iht8x8_add(tx_type, qcoeff, dst, stride, eob);
         break;
       case TX_16X16:
         tx_type = get_tx_type_16x16(pd->plane_type, xd);
-        vp9_iht_add_16x16(tx_type, qcoeff, dst, stride, eob);
+        vp9_iht16x16_add(tx_type, qcoeff, dst, stride, eob);
         break;
       case TX_32X32:
         tx_type = DCT_DCT;
-        vp9_idct_add_32x32(qcoeff, dst, stride, eob);
+        vp9_idct32x32_add(qcoeff, dst, stride, eob);
         break;
       default:
         assert(!"Invalid transform size");
@@ -253,7 +253,7 @@
     if (!less8x8) {
       assert(mbmi->sb_type == bsize);
       if (eobtotal == 0)
-        mbmi->skip_coeff = 1; // skip loopfilter
+        mbmi->skip_coeff = 1;  // skip loopfilter
     }
 
     set_ref(pbi, 0, mi_row, mi_col);
@@ -371,8 +371,7 @@
           for (l = 0; l < PREV_COEF_CONTEXTS; l++)
             if (k > 0 || l < 3)
               for (m = 0; m < UNCONSTRAINED_NODES; m++)
-                vp9_diff_update_prob(r, VP9_COEF_UPDATE_PROB,
-                                     &coef_probs[i][j][k][l][m]);
+                vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]);
 }
 
 static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode,
@@ -490,8 +489,7 @@
                  cm->uv_dc_delta_q == 0 &&
                  cm->uv_ac_delta_q == 0;
 
-  xd->itxm_add = xd->lossless ? vp9_idct_add_lossless
-                              : vp9_idct_add;
+  xd->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
 }
 
 static INTERPOLATIONFILTERTYPE read_interp_filter_type(
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 8fcf83e..a67945c 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -61,8 +61,6 @@
   254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0
 };
 
-DECLARE_ALIGNED(16, extern const uint8_t,
-                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
 #define INCREMENT_COUNT(token)               \
   do {                                       \
     coef_counts[type][ref][band][pt]         \
@@ -205,7 +203,6 @@
   if (c < seg_eob)
     coef_counts[type][ref][band][pt][DCT_EOB_MODEL_TOKEN]++;
 
-
   return c;
 }
 
diff --git a/vp9/decoder/vp9_dsubexp.c b/vp9/decoder/vp9_dsubexp.c
index 6f01cea..df044c4 100644
--- a/vp9/decoder/vp9_dsubexp.c
+++ b/vp9/decoder/vp9_dsubexp.c
@@ -99,8 +99,8 @@
   return word;
 }
 
-void vp9_diff_update_prob(vp9_reader *r, int update_prob, vp9_prob* p) {
-  if (vp9_read(r, update_prob)) {
+void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p) {
+  if (vp9_read(r, DIFF_UPDATE_PROB)) {
     const int delp = decode_term_subexp(r, SUBEXP_PARAM, 255);
     *p = (vp9_prob)inv_remap_prob(delp, *p);
   }
diff --git a/vp9/decoder/vp9_dsubexp.h b/vp9/decoder/vp9_dsubexp.h
index 21ac313..aeb9399 100644
--- a/vp9/decoder/vp9_dsubexp.h
+++ b/vp9/decoder/vp9_dsubexp.h
@@ -14,6 +14,6 @@
 
 #include "vp9/decoder/vp9_dboolhuff.h"
 
-void vp9_diff_update_prob(vp9_reader *r, int update_prob, vp9_prob* p);
+void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p);
 
 #endif  // VP9_DECODER_VP9_DSUBEXP_H_
diff --git a/vp9/decoder/vp9_thread.c b/vp9/decoder/vp9_thread.c
index dc3b681..5442ddf 100644
--- a/vp9/decoder/vp9_thread.c
+++ b/vp9/decoder/vp9_thread.c
@@ -29,7 +29,7 @@
 //------------------------------------------------------------------------------
 // simplistic pthread emulation layer
 
-#include <process.h>
+#include <process.h>  // NOLINT
 
 // _beginthreadex requires __stdcall
 #define THREADFN unsigned int __stdcall
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index f7778a4..428ca7e 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -22,7 +22,6 @@
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_entropy.h"
-#include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_treecoder.h"
 #include "vp9/common/vp9_systemdependent.h"
@@ -180,9 +179,8 @@
   vp9_tree_probs_from_distribution(tree, Pnew, bct, num_events, 0);
   n--;
 
-  for (i = 0; i < n; ++i) {
-    vp9_cond_prob_diff_update(w, &Pcur[i], MODE_UPDATE_PROB, bct[i]);
-  }
+  for (i = 0; i < n; ++i)
+    vp9_cond_prob_diff_update(w, &Pcur[i], bct[i]);
 }
 
 static void update_mbintra_mode_probs(VP9_COMP* const cpi,
@@ -228,8 +226,7 @@
   int k;
 
   for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-    vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k],
-                              MODE_UPDATE_PROB, cm->counts.mbskip[k]);
+    vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k], cm->counts.mbskip[k]);
 }
 
 static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) {
@@ -252,7 +249,7 @@
   for (j = 0; j <= SWITCHABLE_FILTERS; ++j) {
     for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) {
       vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i],
-                                MODE_UPDATE_PROB, branch_ct[j][i]);
+                                branch_ct[j][i]);
     }
   }
 #ifdef MODE_STATS
@@ -274,7 +271,7 @@
 
     for (j = 0; j < INTER_MODES - 1; ++j)
       vp9_cond_prob_diff_update(bc, &cm->fc.inter_mode_probs[i][j],
-                                MODE_UPDATE_PROB, branch_ct[j]);
+                                branch_ct[j]);
   }
 }
 
@@ -341,7 +338,7 @@
                             const vp9_prob *p) {
   assert(is_inter_mode(mode));
   write_token(w, vp9_inter_mode_tree, p,
-              &vp9_inter_mode_encodings[mode - NEARESTMV]);
+              &vp9_inter_mode_encodings[inter_mode_offset(mode)]);
 }
 
 
@@ -389,8 +386,8 @@
            mi->ref_frame[0]);
   }
 
-  // if using the prediction mdoel we have nothing further to do because
-  // the reference frame is fully coded by the segment
+  // If using the prediction model we have nothing further to do because
+  // the reference frame is fully coded by the segment.
 }
 
 static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
@@ -407,8 +404,6 @@
   const BLOCK_SIZE bsize = mi->sb_type;
   const int allow_hp = xd->allow_high_precision_mv;
 
-  x->partition_info = x->pi + (m - cm->mi);
-
 #ifdef ENTROPY_STATS
   active_section = 9;
 #endif
@@ -490,7 +485,7 @@
       for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
         for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
           const int j = idy * 2 + idx;
-          const MB_PREDICTION_MODE blockmode = x->partition_info->bmi[j].mode;
+          const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode;
           write_sb_mv_ref(bc, blockmode, mv_ref_p);
           ++cm->counts.inter_mode[mi->mode_context[rf]]
                                  [inter_mode_offset(blockmode)];
@@ -784,7 +779,7 @@
   vp9_coeff_probs_model *old_frame_coef_probs =
       cpi->common.fc.coef_probs[tx_size];
   vp9_coeff_stats *frame_branch_ct = cpi->frame_branch_ct[tx_size];
-  const vp9_prob upd = VP9_COEF_UPDATE_PROB;
+  const vp9_prob upd = DIFF_UPDATE_PROB;
   const int entropy_nodes_update = UNCONSTRAINED_NODES;
   int i, j, k, l, t;
   switch (cpi->sf.use_fast_coef_updates) {
@@ -839,7 +834,7 @@
               for (t = 0; t < entropy_nodes_update; ++t) {
                 vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
                 vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
-                const vp9_prob upd = VP9_COEF_UPDATE_PROB;
+                const vp9_prob upd = DIFF_UPDATE_PROB;
                 int s;
                 int u = 0;
                 if (l >= 3 && k == 0)
@@ -1122,26 +1117,23 @@
 
 
     for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-      tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i],
-                                     ct_8x8p);
+      tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i], ct_8x8p);
       for (j = 0; j < TX_SIZES - 3; j++)
-        vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j],
-                                  MODE_UPDATE_PROB, ct_8x8p[j]);
+        vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j], ct_8x8p[j]);
     }
 
     for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-      tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i],
-                                       ct_16x16p);
+      tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i], ct_16x16p);
       for (j = 0; j < TX_SIZES - 2; j++)
         vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j],
-                                  MODE_UPDATE_PROB, ct_16x16p[j]);
+                                  ct_16x16p[j]);
     }
 
     for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
       tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p);
       for (j = 0; j < TX_SIZES - 1; j++)
         vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j],
-                                  MODE_UPDATE_PROB, ct_32x32p[j]);
+                                  ct_32x32p[j]);
     }
 #ifdef MODE_STATS
     if (!cpi->dummy_packing)
@@ -1471,7 +1463,6 @@
 
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
       vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
-                                MODE_UPDATE_PROB,
                                 cpi->intra_inter_count[i]);
 
     if (cm->allow_comp_inter_inter) {
@@ -1485,7 +1476,6 @@
         if (use_hybrid_pred)
           for (i = 0; i < COMP_INTER_CONTEXTS; i++)
             vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
-                                      MODE_UPDATE_PROB,
                                       cpi->comp_inter_count[i]);
       }
     }
@@ -1493,10 +1483,8 @@
     if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
       for (i = 0; i < REF_CONTEXTS; i++) {
         vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
-                                  MODE_UPDATE_PROB,
                                   cpi->single_ref_count[i][0]);
         vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1],
-                                  MODE_UPDATE_PROB,
                                   cpi->single_ref_count[i][1]);
       }
     }
@@ -1504,7 +1492,6 @@
     if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
       for (i = 0; i < REF_CONTEXTS; i++)
         vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
-                                  MODE_UPDATE_PROB,
                                   cpi->comp_ref_count[i]);
 
     update_mbintra_mode_probs(cpi, &header_bc);
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 2e28a2e..9b57bc3 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -23,16 +23,9 @@
   int offset;
 } search_site;
 
-typedef struct {
-  struct {
-    MB_PREDICTION_MODE mode;
-  } bmi[4];
-} PARTITION_INFO;
-
 // Structure to hold snapshot of coding context during the mode picking process
 typedef struct {
   MODE_INFO mic;
-  PARTITION_INFO partition_info;
   unsigned char zcoeff_blk[256];
   int skip;
   int_mv best_ref_mv;
@@ -87,9 +80,6 @@
 
   MACROBLOCKD e_mbd;
   int skip_block;
-  PARTITION_INFO *partition_info; /* work pointer */
-  PARTITION_INFO *pi;   /* Corresponds to upper left visible macroblock */
-  PARTITION_INFO *pip;  /* Base of allocated array */
 
   search_site *ss;
   int ss_count;
@@ -145,6 +135,7 @@
 
   // note that token_costs is the cost when eob node is skipped
   vp9_coeff_cost token_costs[TX_SIZES];
+  uint8_t token_cache[1024];
 
   int optimize;
 
@@ -188,4 +179,23 @@
                          int y_blocks);
 };
 
+struct rdcost_block_args {
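+  // Scratch state for rate-distortion costing of transform blocks:
+  // t_above/t_left carry the entropy contexts, rate/dist/sse accumulate
+  // running totals, and best_rd bounds the search.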
+  MACROBLOCK *x;
+  ENTROPY_CONTEXT t_above[16];
+  ENTROPY_CONTEXT t_left[16];
+  TX_SIZE tx_size;
+  int bw;
+  int bh;
+  int rate;
+  int64_t dist;
+  int64_t sse;
+  int this_rate;
+  int64_t this_dist;
+  int64_t this_sse;
+  int64_t this_rd;
+  int64_t best_rd;
+  int skip;
+  const int16_t *scan, *nb;
+};
+
 #endif  // VP9_ENCODER_VP9_BLOCK_H_
diff --git a/vp9/encoder/vp9_boolhuff.c b/vp9/encoder/vp9_boolhuff.c
index 0f1aa59..32c136e 100644
--- a/vp9/encoder/vp9_boolhuff.c
+++ b/vp9/encoder/vp9_boolhuff.c
@@ -22,23 +22,28 @@
 #endif
 
 const unsigned int vp9_prob_cost[256] = {
-  2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
-  1023, 1000,  979,  959,  940,  922,  905,  889,  873,  858,  843,  829,  816,  803,  790,  778,
-  767,  755,  744,  733,  723,  713,  703,  693,  684,  675,  666,  657,  649,  641,  633,  625,
-  617,  609,  602,  594,  587,  580,  573,  567,  560,  553,  547,  541,  534,  528,  522,  516,
-  511,  505,  499,  494,  488,  483,  477,  472,  467,  462,  457,  452,  447,  442,  437,  433,
-  428,  424,  419,  415,  410,  406,  401,  397,  393,  389,  385,  381,  377,  373,  369,  365,
-  361,  357,  353,  349,  346,  342,  338,  335,  331,  328,  324,  321,  317,  314,  311,  307,
-  304,  301,  297,  294,  291,  288,  285,  281,  278,  275,  272,  269,  266,  263,  260,  257,
-  255,  252,  249,  246,  243,  240,  238,  235,  232,  229,  227,  224,  221,  219,  216,  214,
-  211,  208,  206,  203,  201,  198,  196,  194,  191,  189,  186,  184,  181,  179,  177,  174,
-  172,  170,  168,  165,  163,  161,  159,  156,  154,  152,  150,  148,  145,  143,  141,  139,
-  137,  135,  133,  131,  129,  127,  125,  123,  121,  119,  117,  115,  113,  111,  109,  107,
-  105,  103,  101,   99,   97,   95,   93,   92,   90,   88,   86,   84,   82,   81,   79,   77,
-  75,   73,   72,   70,   68,   66,   65,   63,   61,   60,   58,   56,   55,   53,   51,   50,
-  48,   46,   45,   43,   41,   40,   38,   37,   35,   33,   32,   30,   29,   27,   25,   24,
-  22,   21,   19,   18,   16,   15,   13,   12,   10,    9,    7,    6,    4,    3,    1,   1
-};
+  2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161,
+  1129, 1099, 1072, 1046, 1023, 1000, 979,  959,  940,  922,  905,  889,
+  873,  858,  843,  829,  816,  803,  790,  778,  767,  755,  744,  733,
+  723,  713,  703,  693,  684,  675,  666,  657,  649,  641,  633,  625,
+  617,  609,  602,  594,  587,  580,  573,  567,  560,  553,  547,  541,
+  534,  528,  522,  516,  511,  505,  499,  494,  488,  483,  477,  472,
+  467,  462,  457,  452,  447,  442,  437,  433,  428,  424,  419,  415,
+  410,  406,  401,  397,  393,  389,  385,  381,  377,  373,  369,  365,
+  361,  357,  353,  349,  346,  342,  338,  335,  331,  328,  324,  321,
+  317,  314,  311,  307,  304,  301,  297,  294,  291,  288,  285,  281,
+  278,  275,  272,  269,  266,  263,  260,  257,  255,  252,  249,  246,
+  243,  240,  238,  235,  232,  229,  227,  224,  221,  219,  216,  214,
+  211,  208,  206,  203,  201,  198,  196,  194,  191,  189,  186,  184,
+  181,  179,  177,  174,  172,  170,  168,  165,  163,  161,  159,  156,
+  154,  152,  150,  148,  145,  143,  141,  139,  137,  135,  133,  131,
+  129,  127,  125,  123,  121,  119,  117,  115,  113,  111,  109,  107,
+  105,  103,  101,  99,   97,   95,   93,   92,   90,   88,   86,   84,
+  82,   81,   79,   77,   75,   73,   72,   70,   68,   66,   65,   63,
+  61,   60,   58,   56,   55,   53,   51,   50,   48,   46,   45,   43,
+  41,   40,   38,   37,   35,   33,   32,   30,   29,   27,   25,   24,
+  22,   21,   19,   18,   16,   15,   13,   12,   10,   9,    7,    6,
+  4,    3,    1,    1};
 
 void vp9_start_encode(vp9_writer *br, uint8_t *source) {
   br->lowvalue = 0;
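
For reference, the reflowed vp9_prob_cost table holds, for each probability p in [1, 255], the cost in 1/256-bit units of coding a bit that takes its more-probable value with probability p/256. Every entry agrees with clamp(floor(256 * log2(256 / p)) - 1, 1, 2047); that is an observed closed form that reproduces the table, not necessarily the generator the authors used.

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      unsigned int cost[256];
      int p;
      cost[0] = 2047;  /* p == 0 never occurs; mirror the p == 1 sentinel */
      for (p = 1; p < 256; ++p) {
        int c = (int)floor(256.0 * log2(256.0 / p)) - 1;
        if (c < 1) c = 1;          /* coding a bit always costs something */
        if (c > 2047) c = 2047;    /* cap for very unlikely symbols */
        cost[p] = c;
      }
      for (p = 0; p < 256; ++p)
        printf("%u%s", cost[p], p == 255 ? "\n" : ", ");
      return 0;
    }
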
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 27e4cd0..a232a86 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -17,7 +17,7 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_idct.h"
 
-static void fdct4_1d(int16_t *input, int16_t *output) {
+static void fdct4(const int16_t *input, int16_t *output) {
   int16_t step[4];
   int temp1, temp2;
 
@@ -102,7 +102,7 @@
   }
 }
 
-static void fadst4_1d(int16_t *input, int16_t *output) {
+static void fadst4(const int16_t *input, int16_t *output) {
   int x0, x1, x2, x3;
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
@@ -143,10 +143,10 @@
 }
 
 static const transform_2d FHT_4[] = {
-  { fdct4_1d,  fdct4_1d  },  // DCT_DCT  = 0
-  { fadst4_1d, fdct4_1d  },  // ADST_DCT = 1
-  { fdct4_1d,  fadst4_1d },  // DCT_ADST = 2
-  { fadst4_1d, fadst4_1d }   // ADST_ADST = 3
+  { fdct4,  fdct4  },  // DCT_DCT  = 0
+  { fadst4, fdct4  },  // ADST_DCT = 1
+  { fdct4,  fadst4 },  // DCT_ADST = 2
+  { fadst4, fadst4 }   // ADST_ADST = 3
 };
 
 void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
@@ -183,7 +183,7 @@
     vp9_short_fdct4x4_c(input + 4, output + 16, pitch);
 }
 
-static void fdct8_1d(int16_t *input, int16_t *output) {
+static void fdct8(const int16_t *input, int16_t *output) {
   /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
   /*needs32*/ int t0, t1, t2, t3;
   /*canbe16*/ int x0, x1, x2, x3;
@@ -198,7 +198,7 @@
   s6 = input[1] - input[6];
   s7 = input[0] - input[7];
 
-  // fdct4_1d(step, step);
+  // fdct4(step, step);
   x0 = s0 + s3;
   x1 = s1 + s2;
   x2 = s1 - s2;
@@ -259,7 +259,7 @@
       s6 = (input[1 * stride] - input[6 * stride]) * 4;
       s7 = (input[0 * stride] - input[7 * stride]) * 4;
 
-      // fdct4_1d(step, step);
+      // fdct4(step, step);
       x0 = s0 + s3;
       x1 = s1 + s2;
       x2 = s1 - s2;
@@ -301,7 +301,7 @@
 
   // Rows
   for (i = 0; i < 8; ++i) {
-    fdct8_1d(&intermediate[i * 8], &final_output[i * 8]);
+    fdct8(&intermediate[i * 8], &final_output[i * 8]);
     for (j = 0; j < 8; ++j)
       final_output[j + i * 8] /= 2;
   }
@@ -368,7 +368,7 @@
         step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
         step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
       }
-      // Work on the first eight values; fdct8_1d(input, even_results);
+      // Work on the first eight values; fdct8(input, even_results);
       {
         /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
         /*needs32*/ int t0, t1, t2, t3;
@@ -384,7 +384,7 @@
         s6 = input[1] - input[6];
         s7 = input[0] - input[7];
 
-        // fdct4_1d(step, step);
+        // fdct4(step, step);
         x0 = s0 + s3;
         x1 = s1 + s2;
         x2 = s1 - s2;
@@ -486,7 +486,7 @@
   }
 }
 
-static void fadst8_1d(int16_t *input, int16_t *output) {
+static void fadst8(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
   int x0 = input[7];
@@ -558,10 +558,10 @@
 }
 
 static const transform_2d FHT_8[] = {
-  { fdct8_1d,  fdct8_1d  },  // DCT_DCT  = 0
-  { fadst8_1d, fdct8_1d  },  // ADST_DCT = 1
-  { fdct8_1d,  fadst8_1d },  // DCT_ADST = 2
-  { fadst8_1d, fadst8_1d }   // ADST_ADST = 3
+  { fdct8,  fdct8  },  // DCT_DCT  = 0
+  { fadst8, fdct8  },  // ADST_DCT = 1
+  { fdct8,  fadst8 },  // DCT_ADST = 2
+  { fadst8, fadst8 }   // ADST_ADST = 3
 };
 
 void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
@@ -654,7 +654,7 @@
 
 
 // Rewrote to use same algorithm as others.
-static void fdct16_1d(int16_t in[16], int16_t out[16]) {
+static void fdct16(const int16_t in[16], int16_t out[16]) {
   /*canbe16*/ int step1[8];
   /*canbe16*/ int step2[8];
   /*canbe16*/ int step3[8];
@@ -680,7 +680,7 @@
   step1[6] = in[1] - in[14];
   step1[7] = in[0] - in[15];
 
-  // fdct8_1d(step, step);
+  // fdct8(step, step);
   {
     /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
     /*needs32*/ int t0, t1, t2, t3;
@@ -696,7 +696,7 @@
     s6 = input[1] - input[6];
     s7 = input[0] - input[7];
 
-    // fdct4_1d(step, step);
+    // fdct4(step, step);
     x0 = s0 + s3;
     x1 = s1 + s2;
     x2 = s1 - s2;
@@ -795,7 +795,7 @@
   out[15] = dct_const_round_shift(temp2);
 }
 
-void fadst16_1d(int16_t *input, int16_t *output) {
+static void fadst16(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
 
   int x0 = input[15];
@@ -958,10 +958,10 @@
 }
 
 static const transform_2d FHT_16[] = {
-  { fdct16_1d,  fdct16_1d  },  // DCT_DCT  = 0
-  { fadst16_1d, fdct16_1d  },  // ADST_DCT = 1
-  { fdct16_1d,  fadst16_1d },  // DCT_ADST = 2
-  { fadst16_1d, fadst16_1d }   // ADST_ADST = 3
+  { fdct16,  fdct16  },  // DCT_DCT  = 0
+  { fadst16, fdct16  },  // ADST_DCT = 1
+  { fdct16,  fadst16 },  // DCT_ADST = 2
+  { fadst16, fadst16 }   // ADST_ADST = 3
 };
 
 void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
@@ -1003,7 +1003,7 @@
   return rv;
 }
 
-static void dct32_1d(int *input, int *output, int round) {
+static void dct32_1d(const int *input, int *output, int round) {
   int step[32];
   // Stage 1
   step[0] = input[0] + input[(32 - 1)];
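
With the _1d suffixes gone, fdct4/fadst4 and friends read as what they are: 1-D kernels that the FHT_* tables pair into separable 2-D hybrid transforms, one column pass followed by one row pass. A minimal 4x4 sketch of that structure follows; it leaves out the rounding and scaling the real vp9_short_fht4x4_c performs, and the cols/rows member names are assumed here.

    #include <stdint.h>

    typedef void (*transform_1d)(const int16_t *input, int16_t *output);

    typedef struct {
      transform_1d cols, rows;  /* vertical pass, then horizontal pass */
    } transform_2d;

    /* Separable 4x4 transform: run the column kernel down each column into
     * an intermediate block, then run the row kernel across each row. */
    static void fht4x4_sketch(const transform_2d *ht, const int16_t input[16],
                              int16_t output[16]) {
      int16_t temp_in[4], temp_out[4], intermediate[16];
      int i, j;

      for (i = 0; i < 4; ++i) {  /* columns */
        for (j = 0; j < 4; ++j)
          temp_in[j] = input[j * 4 + i];
        ht->cols(temp_in, temp_out);
        for (j = 0; j < 4; ++j)
          intermediate[j * 4 + i] = temp_out[j];
      }
      for (i = 0; i < 4; ++i)  /* rows */
        ht->rows(&intermediate[i * 4], &output[i * 4]);
    }
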
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 1fbdb72..631a276 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -22,6 +22,7 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_extend.h"
 #include "vp9/common/vp9_findnearmv.h"
+#include "vp9/common/vp9_idct.h"
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_quant_common.h"
@@ -381,7 +382,6 @@
   }
 
   if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) {
-    *x->partition_info = ctx->partition_info;
     mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
     mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
   }
@@ -492,9 +492,6 @@
   x->mb_activity_ptr = &cpi->mb_activity_map[idx_map];
   x->active_ptr = cpi->active_map + idx_map;
 
-  /* pointers to mode info contexts */
-  x->partition_info = x->pi + idx_str;
-
   xd->mi_8x8 = cm->mi_grid_visible + idx_str;
   xd->prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
 
@@ -1866,8 +1863,7 @@
     // printf("Switching to lossless\n");
     cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;
     cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
-    cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
+    cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add;
     cpi->mb.optimize = 0;
     cpi->common.lf.filter_level = 0;
     cpi->zbin_mode_boost_enabled = 0;
@@ -1876,8 +1872,7 @@
     // printf("Not lossless\n");
     cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;
     cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
-    cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;
+    cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add;
   }
 }
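
The two 4x4 inverse-transform pointers collapse into a single itxm_add with the eob count folded into the signature, so the lossless/lossy choice is made once here rather than per block. A sketch of the idea; the argument types are inferred from the call sites in this patch.

    #include <stdint.h>

    typedef void (*itxm_add_fn)(int16_t *input, uint8_t *dest, int stride,
                                int eob);

    /* Provided by the codec; each can take its own DC-only fast path when
     * eob <= 1, which matters for correctness in the lossless case. */
    void vp9_iwht4x4_add(int16_t *input, uint8_t *dest, int stride, int eob);
    void vp9_idct4x4_add(int16_t *input, uint8_t *dest, int stride, int eob);

    /* Select the 4x4 inverse transform once, when lossless is decided. */
    static itxm_add_fn select_itxm_add(int lossless) {
      return lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
    }
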
 
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index a610d63..a0a7bab 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -24,9 +24,6 @@
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_tokenize.h"
 
-DECLARE_ALIGNED(16, extern const uint8_t,
-                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
-
 void vp9_subtract_block_c(int rows, int cols,
                           int16_t *diff_ptr, ptrdiff_t diff_stride,
                           const uint8_t *src_ptr, ptrdiff_t src_stride,
@@ -43,15 +40,6 @@
   }
 }
 
-static void inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob,
-                                        int16_t *dqcoeff, uint8_t *dest,
-                                        int stride) {
-  if (eob <= 1)
-    xd->inv_txm4x4_1_add(dqcoeff, dest, stride);
-  else
-    xd->inv_txm4x4_add(dqcoeff, dest, stride);
-}
-
 static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   struct macroblock_plane *const p = &x->plane[plane];
   const MACROBLOCKD *const xd = &x->e_mbd;
@@ -80,8 +68,7 @@
   vp9_subtract_sbuv(x, bsize);
 }
 
-
-#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
+#define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)
 typedef struct vp9_token_state vp9_token_state;
 
 struct vp9_token_state {
@@ -92,7 +79,7 @@
   short         qc;
 };
 
-// TODO: experiments to find optimal multiple numbers
+// TODO(jimbankoski): experiment to find optimal RD numbers.
 #define Y1_RD_MULT 4
 #define UV_RD_MULT 2
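
Y1_RD_MULT and UV_RD_MULT weight the trellis rate-distortion multiplier per plane (luma counts double chroma), and RDTRUNC keeps the low 8 bits that the >>8 in the cost macro discards. A sketch of the usual tie-break pattern; the RDCOST form below is an assumption for illustration.

    #include <stdint.h>

    #define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)
    /* Assumed companion macro: weighted rate plus shifted distortion. */
    #define RDCOST(RM, DM, R, D) \
      ((((int64_t)(R) * (RM) + 128) >> 8) + ((int64_t)(D) << (DM)))

    /* When two trellis branches round to the same RD cost, compare the
     * fractional bits RDCOST threw away so the choice is deterministic. */
    static int prefer_branch1(int rdmult, int rddiv, int rate0, int error0,
                              int rate1, int error1) {
      int64_t rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
      int64_t rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
      if (rd_cost0 == rd_cost1) {
        rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
        rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
      }
      return rd_cost1 < rd_cost0;
    }
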
 
@@ -272,11 +259,10 @@
       best_index[i][1] = best;
       /* Finally, make this the new head of the trellis. */
       next = i;
-    }
-    /* There's no choice to make for a zero coefficient, so we don't
-     *  add a new trellis node, but we do need to update the costs.
-     */
-    else {
+    } else {
+      /* There's no choice to make for a zero coefficient, so we don't
+       *  add a new trellis node, but we do need to update the costs.
+       */
       band = get_coef_band(band_translate, i + 1);
       t0 = tokens[next][0].token;
       t1 = tokens[next][1].token;
@@ -456,20 +442,19 @@
 
   switch (tx_size) {
     case TX_32X32:
-      vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
+      vp9_idct32x32_1024_add(dqcoeff, dst, pd->dst.stride);
       break;
     case TX_16X16:
-      vp9_idct_add_16x16(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
+      vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
       break;
     case TX_8X8:
-      vp9_idct_add_8x8(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
+      vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
       break;
     case TX_4X4:
       // this is like vp9_short_idct4x4 but has a special case around eob<=1
       // which is significant (not just an optimization) for the lossless
       // case.
-      inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff,
-                                  dst, pd->dst.stride);
+      xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
       break;
     default:
       assert(!"Invalid transform size");
@@ -554,7 +539,7 @@
                            p->quant, p->quant_shift, qcoeff, dqcoeff,
                            pd->dequant, p->zbin_extra, eob, scan, iscan);
       if (!x->skip_encode && *eob)
-        vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
+        vp9_idct32x32_1024_add(dqcoeff, dst, pd->dst.stride);
       break;
     case TX_16X16:
       tx_type = get_tx_type_16x16(pd->plane_type, xd);
@@ -579,7 +564,7 @@
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
                      pd->dequant, p->zbin_extra, eob, scan, iscan);
       if (!x->skip_encode && *eob)
-        vp9_iht_add_16x16(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
+        vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
       break;
     case TX_8X8:
       tx_type = get_tx_type_8x8(pd->plane_type, xd);
@@ -604,7 +589,7 @@
                      p->quant_shift, qcoeff, dqcoeff,
                      pd->dequant, p->zbin_extra, eob, scan, iscan);
       if (!x->skip_encode && *eob)
-        vp9_iht_add_8x8(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
+        vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
       break;
     case TX_4X4:
       tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
@@ -636,9 +621,9 @@
           // this is like vp9_short_idct4x4 but has a special case around eob<=1
           // which is significant (not just an optimization) for the lossless
           // case.
-          inverse_transform_b_4x4_add(xd, *eob, dqcoeff, dst, pd->dst.stride);
+          xd->itxm_add(dqcoeff, dst, pd->dst.stride, *eob);
         else
-          vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type);
+          vp9_iht4x4_16_add(dqcoeff, dst, pd->dst.stride, tx_type);
       }
       break;
     default:
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 04a4172..9ebcc49 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -337,7 +337,7 @@
     for (idy = 0; idy < 2; idy += num_4x4_h) {
       for (idx = 0; idx < 2; idx += num_4x4_w) {
         const int i = idy * 2 + idx;
-        if (x->partition_info->bmi[i].mode == NEWMV)
+        if (mi->bmi[i].as_mode == NEWMV)
           inc_mvs(mi->bmi[i].as_mv, best_ref_mv, is_compound, &cpi->NMVcount);
       }
     }
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 4719313..b2becbb 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -8,8 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "math.h"
-#include "limits.h"
+#include <math.h>
+#include <limits.h>
+#include <stdio.h>
 #include "vp9/encoder/vp9_block.h"
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/encoder/vp9_variance.h"
@@ -23,7 +24,6 @@
 #include "vp9/common/vp9_systemdependent.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_scale/yv12config.h"
-#include <stdio.h>
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_ratectrl.h"
@@ -77,7 +77,8 @@
 }
 
 
-// Resets the first pass file to the given position using a relative seek from the current position
+// Resets the first pass file to the given position using a relative seek from
+// the current position.
 static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *position) {
   cpi->twopass.stats_in = position;
 }
@@ -250,8 +251,10 @@
   section->duration   /= section->count;
 }
 
-// Calculate a modified Error used in distributing bits between easier and harder frames
-static double calculate_modified_err(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+// Calculate a modified Error used in distributing bits between easier and
+// harder frames.
+static double calculate_modified_err(VP9_COMP *cpi,
+                                     FIRSTPASS_STATS *this_frame) {
   const FIRSTPASS_STATS *const stats = &cpi->twopass.total_stats;
   const double av_err = stats->ssim_weighted_pred_err / stats->count;
   const double this_err = this_frame->ssim_weighted_pred_err;
@@ -260,38 +263,43 @@
 }
 
 static const double weight_table[256] = {
-  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
-  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
-  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
-  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
-  0.020000, 0.031250, 0.062500, 0.093750, 0.125000, 0.156250, 0.187500, 0.218750,
-  0.250000, 0.281250, 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750,
-  0.500000, 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
-  0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, 0.968750,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
-  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
+  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+  0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.031250, 0.062500,
+  0.093750, 0.125000, 0.156250, 0.187500, 0.218750, 0.250000, 0.281250,
+  0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750, 0.500000,
+  0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
+  0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500,
+  0.968750, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+  1.000000, 1.000000, 1.000000, 1.000000
 };
 
 static double simple_weight(YV12_BUFFER_CONFIG *source) {
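
The reflowed weight_table maps an 8-bit luma level to the per-sample weight that simple_weight() accumulates: a 0.02 floor for levels up to 32, a linear ramp in 1/32 steps over 33..64, and full weight above that. This closed form reproduces every entry of the table above.

    #include <stdio.h>

    int main(void) {
      int i;
      for (i = 0; i < 256; ++i) {
        double w = (i <= 32) ? 0.02 : (i - 32) / 32.0;
        if (w > 1.0)
          w = 1.0;  /* levels >= 64 count fully */
        printf("%f%s", w, i == 255 ? "\n" : ", ");
      }
      return 0;
    }
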
@@ -300,7 +308,8 @@
   uint8_t *src = source->y_buffer;
   double sum_weights = 0.0;
 
-  // Loop throught the Y plane raw examining levels and creating a weight for the image
+  // Loop through the Y plane examining levels and creating a weight for
+  // the image.
   i = source->y_height;
   do {
     j = source->y_width;
@@ -340,7 +349,9 @@
   output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
 }
 
-static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset) {
+static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                             YV12_BUFFER_CONFIG *recon_buffer,
+                             int *best_motion_err, int recon_yoffset) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
   // Set up pointers for this macro block recon buffer
@@ -444,9 +455,9 @@
   while (n < further_steps) {
     n++;
 
-    if (num00)
+    if (num00) {
       num00--;
-    else {
+    } else {
       tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv,
                                         step_param + n, x->sadperbit16,
                                         &num00, &v_fn_ptr,
@@ -504,7 +515,6 @@
   setup_pre_planes(xd, 0, lst_yv12, 0, 0, NULL);
   setup_dst_planes(xd, new_yv12, 0, 0);
 
-  x->partition_info = x->pi;
   xd->mi_8x8 = cm->mi_grid_visible;
   // required for vp9_frame_init_quantizer
   xd->this_mi =
@@ -574,16 +584,20 @@
       // do intra 16x16 prediction
       this_error = vp9_encode_intra(x, use_dc_pred);
 
-      // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame)
-      // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate fot the mv.
-      // When the error score is very low this causes us to pick all or lots of INTRA modes and throw lots of key frames.
+      // intrapenalty below deals with situations where the intra and inter
+      // error scores are very low (e.g. a plain black frame).
+      // We do not have special cases in first pass for 0,0 and nearest etc.,
+      // so all inter modes carry an overhead cost estimate for the mv.
+      // When the error score is very low this causes us to pick all or lots
+      // of INTRA modes and throw lots of key frames.
       // This penalty adds a cost matching that of a 0,0 mv to the intra case.
       this_error += intrapenalty;
 
       // Cumulative intra error total
       intra_error += (int64_t)this_error;
 
-      // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+      // Set up limit values for motion vectors to prevent them extending
+      // outside the UMV borders.
       x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
       x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16)
                       + BORDER_MV_PIXELS_B16;
@@ -604,7 +618,8 @@
                                  &mv.as_mv, lst_yv12,
                                  &motion_error, recon_yoffset);
 
+        // If the current best reference mv is not centered on 0,0 then do a
+        // 0,0 based search as well.
+        // based search as well.
         if (best_ref_mv.as_int) {
           tmp_err = INT_MAX;
           first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv.as_mv,
@@ -644,9 +659,9 @@
             sr_coded_error += gf_motion_error;
           else
             sr_coded_error += this_error;
-        } else
+        } else {
           sr_coded_error += motion_error;
-
+        }
         /* Intra assumed best */
         best_ref_mv.as_int = 0;
 
@@ -718,9 +733,9 @@
             }
           }
         }
-      } else
+      } else {
         sr_coded_error += (int64_t)this_error;
-
+      }
       coded_error += (int64_t)this_error;
 
       // adjust to the next column of macroblocks
@@ -779,16 +794,19 @@
       fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount;
       fps.MVc = (double)sum_mvc / (double)mvcount;
       fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount;
-      fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) / (double)mvcount;
-      fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) / (double)mvcount;
+      fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) /
+                 (double)mvcount;
+      fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) /
+                 (double)mvcount;
       fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2);
       fps.new_mv_count = new_mv_count;
 
       fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs;
     }
 
-    // TODO:  handle the case when duration is set to 0, or something less
-    // than the full time between subsequent values of cpi->source_time_stamp.
+    // TODO(paulwilkins):  Handle the case when duration is set to 0, or
+    // something less than the full time between subsequent values of
+    // cpi->source_time_stamp.
     fps.duration = (double)(cpi->source->ts_end
                             - cpi->source->ts_start);
 
@@ -808,15 +826,16 @@
         2.0))) {
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
     cpi->twopass.sr_update_lag = 1;
-  } else
+  } else {
     cpi->twopass.sr_update_lag++;
-
+  }
   // swap frame pointers so last frame refers to the frame we just compressed
   swap_yv12(lst_yv12, new_yv12);
 
   vp9_extend_frame_borders(lst_yv12, cm->subsampling_x, cm->subsampling_y);
 
-  // Special case for the first frame. Copy into the GF buffer as a second reference.
+  // Special case for the first frame. Copy into the GF buffer as a second
+  // reference.
   if (cm->current_video_frame == 0)
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
 
@@ -824,7 +843,8 @@
   if (0) {
     char filename[512];
     FILE *recon_file;
-    sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+    snprintf(filename, sizeof(filename), "enc%04d.yuv",
+             (int)cm->current_video_frame);
 
     if (cm->current_video_frame == 0)
       recon_file = fopen(filename, "wb");
@@ -836,7 +856,6 @@
   }
 
   cm->current_video_frame++;
-
 }
 
 // Estimate a cost per mb attributable to overheads such as the coding of
@@ -879,7 +898,7 @@
            (av_intra * intra_cost)) * cpi->common.MBs) << 9;
 
   // return mv_cost + mode_cost;
-  // TODO PGW Fix overhead costs for extended Q range
+  // TODO(paulwilkins): Fix overhead costs for extended Q range.
 #endif
   return 0;
 }
@@ -1103,8 +1122,8 @@
   FIRSTPASS_STATS *start_pos;
 
   double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate;
-  double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
-                                      * cpi->oxcf.two_pass_vbrmin_section / 100);
+  double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth *
+                                      cpi->oxcf.two_pass_vbrmin_section / 100);
 
   if (two_pass_min_rate < lower_bounds_min_rate)
     two_pass_min_rate = lower_bounds_min_rate;
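
The rewrapped expression sets the two-pass floor to two_pass_vbrmin_section percent of the target bandwidth, raised if necessary to a per-frame overhead bound. A worked example, with an assumed FRAME_OVERHEAD_BITS of 200 since the constant's definition is outside this hunk:

    #define FRAME_OVERHEAD_BITS 200  /* assumed value, illustration only */

    static double two_pass_floor(double target_bandwidth, int vbrmin_section,
                                 double framerate) {
      double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * framerate;
      double two_pass_min_rate = target_bandwidth * vbrmin_section / 100;
      /* e.g. two_pass_floor(800000, 5, 30) == 40000 bps: the 5% section
       * floor dominates the 6000 bps overhead bound at this bitrate. */
      return two_pass_min_rate < lower_bounds_min_rate ? lower_bounds_min_rate
                                                       : two_pass_min_rate;
    }
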
@@ -1142,15 +1161,17 @@
   // This variable monitors how far behind the second ref update is lagging
   cpi->twopass.sr_update_lag = 1;
 
-  // Scan the first pass file and calculate an average Intra / Inter error score ratio for the sequence
+  // Scan the first pass file and calculate an average Intra / Inter error score
+  // ratio for the sequence.
   {
     double sum_iiratio = 0.0;
     double IIRatio;
 
-    start_pos = cpi->twopass.stats_in;               // Note starting "file" position
+    start_pos = cpi->twopass.stats_in;  // Note the starting "file" position.
 
     while (input_stats(cpi, &this_frame) != EOF) {
-      IIRatio = this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
+      IIRatio = this_frame.intra_error
+                / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
       IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio;
       sum_iiratio += IIRatio;
     }
@@ -1162,21 +1183,21 @@
     reset_fpf_position(cpi, start_pos);
   }
 
-  // Scan the first pass file and calculate a modified total error based upon the bias/power function
-  // used to allocate bits
+  // Scan the first pass file and calculate a modified total error based upon
+  // the bias/power function used to allocate bits.
   {
-    start_pos = cpi->twopass.stats_in;               // Note starting "file" position
+    start_pos = cpi->twopass.stats_in;  // Note the starting "file" position.
 
     cpi->twopass.modified_error_total = 0.0;
     cpi->twopass.modified_error_used = 0.0;
 
     while (input_stats(cpi, &this_frame) != EOF) {
-      cpi->twopass.modified_error_total += calculate_modified_err(cpi, &this_frame);
+      cpi->twopass.modified_error_total +=
+          calculate_modified_err(cpi, &this_frame);
     }
     cpi->twopass.modified_error_left = cpi->twopass.modified_error_total;
 
-    reset_fpf_position(cpi, start_pos);            // Reset file position
-
+    reset_fpf_position(cpi, start_pos);  // Reset the file position.
   }
 }
 
@@ -1322,7 +1343,6 @@
       (this_frame_mvc_ratio < this_frame->mvc_abs)
       ? (this_frame_mvc_ratio * motion_pct)
       : this_frame->mvc_abs * motion_pct;
-
   }
 }
 
@@ -1381,7 +1401,8 @@
     // Update the motion related elements to the boost calculation
     accumulate_frame_motion_stats(&this_frame,
                                   &this_frame_mv_in_out, &mv_in_out_accumulator,
-                                  &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+                                  &abs_mv_in_out_accumulator,
+                                  &mv_ratio_accumulator);
 
     // We want to discount the flash frame itself and the recovery
     // frame that follows as both will have poor scores.
@@ -1417,7 +1438,8 @@
     // Update the motion related elements to the boost calculation
     accumulate_frame_motion_stats(&this_frame,
                                   &this_frame_mv_in_out, &mv_in_out_accumulator,
-                                  &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+                                  &abs_mv_in_out_accumulator,
+                                  &mv_ratio_accumulator);
 
     // We want to discount the flash frame itself and the recovery
     // frame that follows as both will have poor scores.
@@ -1433,7 +1455,6 @@
 
     boost_score += (decay_accumulator *
                     calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out));
-
   }
   *b_boost = (int)boost_score;
 
@@ -1667,7 +1688,8 @@
     // Update the motion related elements to the boost calculation
     accumulate_frame_motion_stats(&next_frame,
                                   &this_frame_mv_in_out, &mv_in_out_accumulator,
-                                  &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+                                  &abs_mv_in_out_accumulator,
+                                  &mv_ratio_accumulator);
 
     // Cumulative effect of prediction quality decay
     if (!flash_detected) {
@@ -1710,8 +1732,7 @@
         ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
          (abs_mv_in_out_accumulator > 3.0) ||
          (mv_in_out_accumulator < -2.0) ||
-         ((boost_score - old_boost_score) < IIFACTOR))
-      )) {
+         ((boost_score - old_boost_score) < IIFACTOR)))) {
       boost_score = old_boost_score;
       break;
     }
@@ -1765,7 +1786,8 @@
        (mv_in_out_accumulator > -2.0)) &&
       (boost_score > 100)) {
     // Alternative boost calculation for alt ref
-    cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
+    cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
+                                    &b_boost);
     cpi->source_alt_ref_pending = 1;
 
 #if CONFIG_MULTIPLE_ARF
@@ -1842,9 +1864,9 @@
     cpi->twopass.gf_group_bits =
       (int64_t)(cpi->twopass.kf_group_bits *
                 (gf_group_err / cpi->twopass.kf_group_error_left));
-  } else
+  } else {
     cpi->twopass.gf_group_bits = 0;
-
+  }
   cpi->twopass.gf_group_bits =
     (cpi->twopass.gf_group_bits < 0)
     ? 0
@@ -1908,11 +1930,10 @@
 
       if (gf_bits > alt_gf_bits)
         gf_bits = alt_gf_bits;
-    }
-    // Else if it is harder than other frames in the group make sure it at
-    // least receives an allocation in keeping with its relative error
-    // score, otherwise it may be worse off than an "un-boosted" frame
-    else {
+    } else {
+      // If it is harder than other frames in the group make sure it at
+      // least receives an allocation in keeping with its relative error
+      // score, otherwise it may be worse off than an "un-boosted" frame.
       int alt_gf_bits = (int)((double)cpi->twopass.kf_group_bits *
                         mod_frame_err /
                         DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left));
@@ -2024,9 +2045,9 @@
 
   // Clip target size to 0 - max_bits (or cpi->twopass.gf_group_bits) at
   // the top end.
-  if (target_frame_size < 0)
+  if (target_frame_size < 0) {
     target_frame_size = 0;
-  else {
+  } else {
     if (target_frame_size > max_bits)
       target_frame_size = max_bits;
 
@@ -2249,16 +2270,17 @@
   if ((this_frame->pcnt_second_ref < 0.10) &&
       (next_frame->pcnt_second_ref < 0.10) &&
       ((this_frame->pcnt_inter < 0.05) ||
-       (
-         ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) &&
-         ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
-         ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
-          (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
-          ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5)
-         )
-       )
-      )
-     ) {
+       (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) &&
+        ((this_frame->intra_error /
+          DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
+        ((fabs(last_frame->coded_error - this_frame->coded_error) /
+              DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
+          .40) ||
+         (fabs(last_frame->intra_error - this_frame->intra_error) /
+              DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
+          .40) ||
+         ((next_frame->intra_error /
+           DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5))))) {
     int i;
     FIRSTPASS_STATS *start_pos;
 
@@ -2276,7 +2298,8 @@
 
     // Examine how well the key frame predicts subsequent frames
     for (i = 0; i < 16; i++) {
-      next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
+      next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error /
+                      DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
 
       if (next_iiratio > RMAX)
         next_iiratio = RMAX;
@@ -2285,7 +2308,8 @@
       if (local_next_frame.pcnt_inter > 0.85)
         decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
       else
-        decay_accumulator = decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
+        decay_accumulator =
+            decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
 
       // decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
 
@@ -2313,9 +2337,9 @@
 
     // If there is tolerable prediction for at least the next 3 frames then
     // break out else discard this potential key frame and move on
-    if (boost_score > 30.0 && (i > 3))
+    if (boost_score > 30.0 && (i > 3)) {
       is_viable_kf = 1;
-    else {
+    } else {
       // Reset the file position
       reset_fpf_position(cpi, start_pos);
 
@@ -2375,8 +2399,9 @@
     // Accumulate kf group error
     kf_group_err += calculate_modified_err(cpi, this_frame);
 
-    // These figures keep intra and coded error counts for all frames including key frames in the group.
-    // The effect of the key frame itself can be subtracted out using the first_frame data collected above
+    // These figures keep intra and coded error counts for all frames including
+    // key frames in the group. The effect of the key frame itself can be
+    // subtracted out using the first_frame data collected above.
     kf_group_intra_err += this_frame->intra_error;
     kf_group_coded_err += this_frame->coded_error;
 
@@ -2416,9 +2441,9 @@
       // forcekeyframeevery intervals then break out of the loop.
       if (cpi->twopass.frames_to_key >= 2 * (int)cpi->key_frame_frequency)
         break;
-    } else
+    } else {
       cpi->twopass.frames_to_key++;
-
+    }
     i++;
   }
 
@@ -2458,22 +2483,24 @@
     reset_fpf_position(cpi, current_pos);
 
     cpi->next_key_frame_forced = 1;
-  } else
+  } else {
     cpi->next_key_frame_forced = 0;
-
+  }
   // Special case for the last frame of the file
   if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) {
     // Accumulate kf group error
     kf_group_err += calculate_modified_err(cpi, this_frame);
 
-    // These figures keep intra and coded error counts for all frames including key frames in the group.
-    // The effect of the key frame itself can be subtracted out using the first_frame data collected above
+    // These figures keep intra and coded error counts for all frames including
+    // key frames in the group. The effect of the key frame itself can be
+    // subtracted out using the first_frame data collected above.
     kf_group_intra_err += this_frame->intra_error;
     kf_group_coded_err += this_frame->coded_error;
   }
 
   // Calculate the number of bits that should be assigned to the kf group.
-  if ((cpi->twopass.bits_left > 0) && (cpi->twopass.modified_error_left > 0.0)) {
+  if ((cpi->twopass.bits_left > 0) &&
+      (cpi->twopass.modified_error_left > 0.0)) {
     // Max for a single normal frame (not key frame)
     int max_bits = frame_max_bits(cpi);
 
@@ -2490,13 +2517,14 @@
     max_grp_bits = (int64_t)max_bits * (int64_t)cpi->twopass.frames_to_key;
     if (cpi->twopass.kf_group_bits > max_grp_bits)
       cpi->twopass.kf_group_bits = max_grp_bits;
-  } else
+  } else {
     cpi->twopass.kf_group_bits = 0;
-
+  }
   // Reset the first pass file position
   reset_fpf_position(cpi, start_position);
 
-  // determine how big to make this keyframe based on how well the subsequent frames use inter blocks
+  // Determine how big to make this keyframe based on how well the subsequent
+  // frames use inter blocks.
   decay_accumulator = 1.0;
   boost_score = 0.0;
   loop_decay_rate = 1.00;       // Starting decay rate
@@ -2569,7 +2597,7 @@
     if (kf_boost < (cpi->twopass.frames_to_key * 3))
       kf_boost = (cpi->twopass.frames_to_key * 3);
 
-    if (kf_boost < 300) // Min KF boost
+    if (kf_boost < 300)  // Min KF boost
       kf_boost = 300;
 
     // Make a note of baseline boost and the zero motion
@@ -2604,10 +2632,13 @@
       allocation_chunks /= divisor;
     }
 
-    cpi->twopass.kf_group_bits = (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
+    cpi->twopass.kf_group_bits =
+        (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
 
     // Calculate the number of bits to be spent on the key frame
-    cpi->twopass.kf_bits  = (int)((double)kf_boost * ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
+    cpi->twopass.kf_bits =
+        (int)((double)kf_boost *
+              ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
 
     // If the key frame is actually easier than the average for the
     // kf group (which does sometimes happen... eg a blank intro frame)
@@ -2625,11 +2656,10 @@
       if (cpi->twopass.kf_bits > alt_kf_bits) {
         cpi->twopass.kf_bits = alt_kf_bits;
       }
-    }
-    // Else if it is much harder than other frames in the group make sure
-    // it at least receives an allocation in keeping with its relative
-    // error score
-    else {
+    } else {
+      // If it is much harder than other frames in the group make sure
+      // it at least receives an allocation in keeping with its relative
+      // error score.
       alt_kf_bits =
         (int)((double)cpi->twopass.bits_left *
               (kf_mod_err /
@@ -2655,6 +2685,7 @@
   cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err);
 
   // Adjust the count of total modified error left.
-  // The count of bits left is adjusted elsewhere based on real coded frame sizes
+  // The count of bits left is adjusted elsewhere based on real coded frame
+  // sizes.
   cpi->twopass.modified_error_left -= kf_group_err;
 }
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 2296a66..c18d11e 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -10,6 +10,7 @@
 
 #ifndef VP9_ENCODER_VP9_FIRSTPASS_H_
 #define VP9_ENCODER_VP9_FIRSTPASS_H_
+#include "vp9/encoder/vp9_onyx_int.h"
 
 void vp9_init_first_pass(VP9_COMP *cpi);
 void vp9_first_pass(VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index 81445a9..c28c868 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -10,7 +10,7 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/encoder/vp9_lookahead.h"
 #include "vp9/common/vp9_extend.h"
@@ -77,7 +77,7 @@
         goto bail;
   }
   return ctx;
-bail:
+ bail:
   vp9_lookahead_destroy(ctx);
   return NULL;
 }
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 44eaa65..561c725 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -320,8 +320,8 @@
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
 
-  // TODO: Each subsequent iteration checks at least one point in
-  // common with the last iteration could be 2 ( if diag selected)
+  // TODO(jbb): Each subsequent iteration checks at least one point in
+  // common with the last iteration; it could be two if diagonal is selected.
   while (halfiters--) {
     // 1/2 pel
     FIRST_LEVEL_CHECKS;
@@ -332,8 +332,8 @@
     tc = bc;
   }
 
-  // TODO: Each subsequent iteration checks at least one point in common with
-  // the last iteration could be 2 ( if diag selected) 1/4 pel
+  // TODO(yaowu): Each subsequent iteration checks at least one point in
+  // common with the last iteration; it could be two if diagonal is selected.
 
   // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
   if (forced_stop != 2) {
@@ -1122,8 +1122,10 @@
                 + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
                                  mvjsadcost, mvsadcost, sad_per_bit);
 
-  // search_param determines the length of the initial step and hence the number of iterations
-  // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+  // search_param determines the length of the initial step and hence the
+  // number of iterations.
+  // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 =
+  // (MAX_FIRST_STEP/4) pel... etc.
   ss = &x->ss[search_param * x->searches_per_step];
   tot_steps = (x->ss_count / x->searches_per_step) - search_param;
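
Concretely, each increment of search_param halves the diamond's first step and drops one refinement level, which is what the index into ss[] and the tot_steps expression encode. A small sketch; the MAX_FIRST_STEP value is illustrative, as the real constant is defined elsewhere.

    #define MAX_FIRST_STEP 16  /* illustrative value only */

    /* Initial diamond step in whole pels: 16, 8, 4, ... as it grows. */
    static int first_step(int search_param) {
      return MAX_FIRST_STEP >> search_param;
    }

    /* Remaining iterations, mirroring the tot_steps computation above. */
    static int total_steps(int ss_count, int searches_per_step,
                           int search_param) {
      return ss_count / searches_per_step - search_param;
    }
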
 
@@ -1192,8 +1194,9 @@
         break;
       };
 #endif
-    } else if (best_address == in_what)
+    } else if (best_address == in_what) {
       (*num00)++;
+    }
   }
 
   this_mv.as_mv.row = best_mv->as_mv.row * 8;
@@ -1263,8 +1266,11 @@
                 + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
                                  mvjsadcost, mvsadcost, sad_per_bit);
 
-  // search_param determines the length of the initial step and hence the number of iterations
-  // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+  // search_param determines the length of the initial step and hence the
+  // number of iterations.
+  // 0 = initial step (MAX_FIRST_STEP) pel
+  // 1 = (MAX_FIRST_STEP/2) pel,
+  // 2 = (MAX_FIRST_STEP/4) pel...
   ss = &x->ss[search_param * x->searches_per_step];
   tot_steps = (x->ss_count / x->searches_per_step) - search_param;
 
@@ -1273,13 +1279,16 @@
   for (step = 0; step < tot_steps; step++) {
     int all_in = 1, t;
 
-    // To know if all neighbor points are within the bounds, 4 bounds checking are enough instead of
-    // checking 4 bounds for each points.
+    // all_in is true if every one of the points we are checking is within
+    // the bounds of the image.
     all_in &= ((best_mv->as_mv.row + ss[i].mv.row) > x->mv_row_min);
     all_in &= ((best_mv->as_mv.row + ss[i + 1].mv.row) < x->mv_row_max);
     all_in &= ((best_mv->as_mv.col + ss[i + 2].mv.col) > x->mv_col_min);
     all_in &= ((best_mv->as_mv.col + ss[i + 3].mv.col) < x->mv_col_max);
 
+    // If all the pixels are within the bounds we don't check whether the
+    // search point is valid in this loop, otherwise we check each point
+    // for validity.
     if (all_in) {
       unsigned int sad_array[4];
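
The rewritten comments describe the fast path this loop takes: four bounds checks cover an entire step, and when they all pass the four candidates are scored with one batched sdx4df call instead of four bounds-checked sdf calls. A sketch of that dispatch, with the function-pointer types simplified from the real variance table:

    #include <stdint.h>

    typedef unsigned int (*sad_fn)(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   unsigned int max_sad);
    typedef void (*sad4_fn)(const uint8_t *src, int src_stride,
                            const uint8_t *const refs[4], int ref_stride,
                            unsigned int sads[4]);

    /* Score four diamond candidates: batched when all are in bounds, one
     * at a time otherwise (the caller rejects out-of-bounds points). */
    static void score_step(int all_in, sad_fn sdf, sad4_fn sdx4df,
                           const uint8_t *what, int what_stride,
                           const uint8_t *const check[4], int in_what_stride,
                           unsigned int bestsad, unsigned int sad[4]) {
      if (all_in) {
        sdx4df(what, what_stride, check, in_what_stride, sad);
      } else {
        int j;
        for (j = 0; j < 4; ++j)
          sad[j] = sdf(what, what_stride, check[j], in_what_stride, bestsad);
      }
    }
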
 
@@ -1312,10 +1321,13 @@
         this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
         this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
 
-        if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
-            (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
+        if ((this_col_offset > x->mv_col_min) &&
+            (this_col_offset < x->mv_col_max) &&
+            (this_row_offset > x->mv_row_min) &&
+            (this_row_offset < x->mv_row_max)) {
           check_here = ss[i].offset + best_address;
-          thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+          thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+                                bestsad);
 
           if (thissad < bestsad) {
             this_mv.as_mv.row = this_row_offset;
@@ -1365,8 +1377,9 @@
         break;
       };
 #endif
-    } else if (best_address == in_what)
+    } else if (best_address == in_what) {
       (*num00)++;
+    }
   }
 
   this_mv.as_mv.row = best_mv->as_mv.row * 8;
@@ -1401,16 +1414,17 @@
   n = num00;
   num00 = 0;
 
-  /* If there won't be more n-step search, check to see if refining search is needed. */
+  /* If there won't be more n-step search, check to see if refining search is
+   * needed. */
   if (n > further_steps)
     do_refine = 0;
 
   while (n < further_steps) {
     n++;
 
-    if (num00)
+    if (num00) {
       num00--;
-    else {
+    } else {
       thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv,
                                         step_param + n, sadpb, &num00,
                                         fn_ptr, x->nmvjointcost, x->mvcost,
@@ -1504,7 +1518,8 @@
     check_here = r * mv_stride + in_what + col_min;
 
     for (c = col_min; c < col_max; c++) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+                            bestsad);
 
       this_mv.as_mv.col = c;
       thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
@@ -1621,7 +1636,8 @@
     }
 
     while (c < col_max) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+                            bestsad);
 
       if (thissad < bestsad) {
         this_mv.as_mv.col = c;
@@ -1639,7 +1655,6 @@
       check_here++;
       c++;
     }
-
   }
 
   this_mv.as_mv.row = best_mv->as_mv.row * 8;
@@ -1770,7 +1785,8 @@
     }
 
     while (c < col_max) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+                            bestsad);
 
       if (thissad < bestsad) {
         this_mv.as_mv.col = c;
@@ -1840,10 +1856,14 @@
       this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
       this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
 
-      if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
-          (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
-        check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
-        thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+      if ((this_col_offset > x->mv_col_min) &&
+          (this_col_offset < x->mv_col_max) &&
+          (this_row_offset > x->mv_row_min) &&
+          (this_row_offset < x->mv_row_max)) {
+        check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
+                     best_address;
+        thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+                              bestsad);
 
         if (thissad < bestsad) {
           this_mv.as_mv.row = this_row_offset;
@@ -1859,12 +1879,13 @@
       }
     }
 
-    if (best_site == -1)
+    if (best_site == -1) {
       break;
-    else {
+    } else {
       ref_mv->as_mv.row += neighbors[best_site].row;
       ref_mv->as_mv.col += neighbors[best_site].col;
-      best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
+      best_address += (neighbors[best_site].row) * in_what_stride +
+                      neighbors[best_site].col;
     }
   }
 
@@ -1927,7 +1948,8 @@
       block_offset[2] = best_address + 1;
       block_offset[3] = best_address + in_what_stride;
 
-      fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
+      fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+                     sad_array);
 
       for (j = 0; j < 4; j++) {
         if (sad_array[j] < bestsad) {
@@ -1947,10 +1969,14 @@
         this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
         this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
 
-        if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
-            (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
-          check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
-          thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+        if ((this_col_offset > x->mv_col_min) &&
+            (this_col_offset < x->mv_col_max) &&
+            (this_row_offset > x->mv_row_min) &&
+            (this_row_offset < x->mv_row_max)) {
+          check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
+                       best_address;
+          thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+                                bestsad);
 
           if (thissad < bestsad) {
             this_mv.as_mv.row = this_row_offset;
@@ -1967,12 +1993,13 @@
       }
     }
 
-    if (best_site == -1)
+    if (best_site == -1) {
       break;
-    else {
+    } else {
       ref_mv->as_mv.row += neighbors[best_site].row;
       ref_mv->as_mv.col += neighbors[best_site].col;
-      best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
+      best_address += (neighbors[best_site].row) * in_what_stride +
+                      neighbors[best_site].col;
     }
   }
 
diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c
index a5dfaed..b867d8b 100644
--- a/vp9/encoder/vp9_modecosts.c
+++ b/vp9/encoder/vp9_modecosts.c
@@ -17,7 +17,7 @@
 
 void vp9_init_mode_costs(VP9_COMP *c) {
   VP9_COMMON *const cm = &c->common;
-  const vp9_tree_p KT = vp9_intra_mode_tree;
+  const vp9_tree_index *KT = vp9_intra_mode_tree;
   int i, j;
 
   for (i = 0; i < INTRA_MODES; i++) {
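
Spelling KT as a vp9_tree_index pointer makes the data structure explicit: a flat binary tree in which non-negative entries index the next node pair and entries <= 0 encode a leaf as the negated token value. A hedged sketch of the cost walk such tables feed, reusing the vp9_prob_cost table from vp9_boolhuff.c; the 255 - p complement is the usual libvpx convention, assumed here.

    #include <stdint.h>

    typedef int8_t vp9_tree_index;
    typedef uint8_t vp9_prob;

    extern const unsigned int vp9_prob_cost[256];  /* from vp9_boolhuff.c */

    /* Cost, in 1/256-bit units, of coding bit b under probability p. */
    static unsigned int cost_bit(vp9_prob p, int b) {
      return vp9_prob_cost[b ? 255 - p : p];
    }

    /* Fill costs[] for every leaf reachable from node pair i, carrying the
     * accumulated cost c of the bits coded so far. */
    static void cost_tokens(int *costs, const vp9_tree_index *tree,
                            const vp9_prob *probs, int i, int c) {
      int b;
      for (b = 0; b <= 1; ++b) {
        const int cc = c + cost_bit(probs[i >> 1], b);
        const vp9_tree_index ii = tree[i + b];
        if (ii <= 0)
          costs[-ii] = cc;  /* leaf: token value is -ii */
        else
          cost_tokens(costs, tree, probs, ii, cc);
      }
    }
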
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index e7384ba..0833b4a 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -8,45 +8,34 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-#include "vpx_config.h"
-#include "vp9/common/vp9_filter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/vp9_reconinter.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/common/vp9_alloccommon.h"
-#include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/encoder/vp9_firstpass.h"
-#include "vp9/encoder/vp9_psnr.h"
-#include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_extend.h"
-#include "vp9/encoder/vp9_ratectrl.h"
-#include "vp9/common/vp9_quant_common.h"
-#include "vp9/common/vp9_tile_common.h"
-#include "vp9/encoder/vp9_segmentation.h"
-#include "./vp9_rtcd.h"
-#include "./vpx_scale_rtcd.h"
-#if CONFIG_VP9_POSTPROC
-#include "vp9/common/vp9_postproc.h"
-#endif
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/vpx_timer.h"
-
-#include "vp9/common/vp9_seg_common.h"
-#include "vp9/encoder/vp9_mbgraph.h"
-#include "vp9/common/vp9_pred_common.h"
-#include "vp9/encoder/vp9_rdopt.h"
-#include "vp9/encoder/vp9_bitstream.h"
-#include "vp9/encoder/vp9_picklpf.h"
-#include "vp9/common/vp9_mvref_common.h"
-#include "vp9/encoder/vp9_temporal_filter.h"
-
 #include <math.h>
 #include <stdio.h>
 #include <limits.h>
 
+#include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
+
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_filter.h"
+#include "vp9/common/vp9_idct.h"
+#if CONFIG_VP9_POSTPROC
+#include "vp9/common/vp9_postproc.h"
+#endif
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_tile_common.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_mbgraph.h"
+#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_picklpf.h"
+#include "vp9/encoder/vp9_psnr.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+
+#include "vpx_ports/vpx_timer.h"
+
 extern void print_tree_update_probs();
 
 static void set_default_lf_deltas(struct loopfilter *lf);
@@ -55,12 +45,12 @@
 
 #define SHARP_FILTER_QTHRESH 0          /* Q threshold for 8-tap sharp filter */
 
-#define ALTREF_HIGH_PRECISION_MV 1      /* whether to use high precision mv
-                                           for altref computation */
-#define HIGH_PRECISION_MV_QTHRESH 200   /* Q threshold for use of high precision
-                                           mv. Choose a very high value for
-                                           now so that HIGH_PRECISION is always
-                                           chosen */
+#define ALTREF_HIGH_PRECISION_MV 1      // Whether to use high precision mv
+                                        // for altref computation.
+#define HIGH_PRECISION_MV_QTHRESH 200   // Q threshold for high precision
+                                        // mv. Choose a very high value for
+                                        // now so that HIGH_PRECISION is always
+                                        // chosen.
 
 // Masks for partially or completely disabling split mode
 #define DISABLE_ALL_SPLIT         0x3F
@@ -69,8 +59,6 @@
 #define LAST_AND_INTRA_SPLIT_ONLY 0x1E
 
 #if CONFIG_INTERNAL_STATS
-#include "math.h"
-
 extern double vp9_calc_ssim(YV12_BUFFER_CONFIG *source,
                             YV12_BUFFER_CONFIG *dest, int lumamask,
                             double *weight);
@@ -113,7 +101,8 @@
 #endif
 
 #ifdef SPEEDSTATS
-unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0};
 #endif
 
 #if defined(SECTIONBITS_OUTPUT)
@@ -321,9 +310,6 @@
   cpi->mb_activity_map = 0;
   vpx_free(cpi->mb_norm_activity_map);
   cpi->mb_norm_activity_map = 0;
-
-  vpx_free(cpi->mb.pip);
-  cpi->mb.pip = 0;
 }
 
 // Computes a q delta (in "q index" terms) to get from a starting q value
@@ -402,7 +388,6 @@
 
       // Where relevant assume segment data is delta data
       seg->abs_delta = SEGMENT_DELTADATA;
-
     }
   } else if (seg->enabled) {
     // All other frames if segmentation has been enabled
@@ -752,8 +737,10 @@
   sf->mode_search_skip_flags = 0;
   sf->disable_split_var_thresh = 0;
   sf->disable_filter_search_var_thresh = 0;
-  sf->intra_y_mode_mask = ALL_INTRA_MODES;
-  sf->intra_uv_mode_mask = ALL_INTRA_MODES;
+  for (i = 0; i < TX_SIZES; i++) {
+    sf->intra_y_mode_mask[i] = ALL_INTRA_MODES;
+    sf->intra_uv_mode_mask[i] = ALL_INTRA_MODES;
+  }
   sf->use_rd_breakout = 0;
   sf->skip_encode_sb = 0;
   sf->use_uv_intra_rd_estimate = 0;
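The masks above are now arrays indexed by transform size, so each TX size can
prune its own set of intra modes. A minimal sketch of how such a bitmask gates
the mode search, assuming the VP9 enum ordering (DC_PRED = 0, V_PRED = 1,
H_PRED = 2); the helper itself is illustrative, not part of the patch:

enum { DC_PRED = 0, V_PRED, H_PRED };  /* first three intra modes */
#define INTRA_DC_H_V ((1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED))
#define TX_SIZES 4

static int intra_mode_allowed(const int mask[TX_SIZES], int tx_size, int mode) {
  /* A mode is searched only when its bit is set for this transform size. */
  return (mask[tx_size] >> mode) & 1;
}
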
@@ -770,7 +757,7 @@
 #endif
 
   switch (mode) {
-    case 0: // best quality mode
+    case 0:  // This is the best quality mode.
       break;
 
     case 1:
@@ -782,16 +769,19 @@
 #endif
       sf->use_avoid_tested_higherror = 1;
       sf->adaptive_rd_thresh = 1;
+      sf->recode_loop = (speed < 1);
 
       if (speed == 1) {
         sf->use_square_partition_only = !(cpi->common.frame_type == KEY_FRAME ||
                                           cpi->common.intra_only);
-        sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
-                                      cpi->common.intra_only)
-                                     ? USE_FULL_RD : USE_LARGESTALL);
+        sf->less_rectangular_check = 1;
+        sf->tx_size_search_method = (cpi->common.frame_type == KEY_FRAME ||
+                                     cpi->common.intra_only)
+                                     ? USE_FULL_RD : USE_LARGESTALL;
 
         if (MIN(cpi->common.width, cpi->common.height) >= 720)
-          sf->disable_split_mask = DISABLE_ALL_SPLIT;
+          sf->disable_split_mask = cpi->common.show_frame ?
+              DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
         else
           sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
 
@@ -799,19 +789,26 @@
         sf->adaptive_motion_search = 1;
         sf->auto_mv_step_size = 1;
         sf->adaptive_rd_thresh = 2;
+        sf->recode_loop = 2;
+        sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+        sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+        sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
       }
       if (speed == 2) {
         sf->use_square_partition_only = !(cpi->common.frame_type == KEY_FRAME ||
                                           cpi->common.intra_only);
+        sf->less_rectangular_check = 1;
         sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
                                       cpi->common.intra_only)
                                      ? USE_FULL_RD : USE_LARGESTALL);
 
         if (MIN(cpi->common.width, cpi->common.height) >= 720)
-          sf->disable_split_mask = DISABLE_ALL_SPLIT;
+          sf->disable_split_mask = cpi->common.show_frame ?
+              DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
         else
           sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
 
         sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
                                      FLAG_SKIP_INTRA_BESTINTER |
                                      FLAG_SKIP_COMP_BESTINTRA |
@@ -831,6 +828,10 @@
 
         sf->adaptive_rd_thresh = 2;
         sf->mode_skip_start = 11;
+        sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+        sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+        sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+        sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
       }
       if (speed == 3) {
         sf->use_square_partition_only = 1;
@@ -910,11 +911,9 @@
         sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
         sf->use_one_partition_size_always = 1;
         sf->always_this_block_size = BLOCK_16X16;
-        sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
-                                      cpi->common.intra_only ||
-                                      cpi->common.show_frame == 0) ?
-                                     USE_FULL_RD :
-                                     USE_LARGESTALL);
+        sf->tx_size_search_method = (cpi->common.frame_type == KEY_FRAME ||
+                                     cpi->common.intra_only) ?
+                                     USE_FULL_RD : USE_LARGESTALL;
         sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
                                      FLAG_SKIP_INTRA_BESTINTER |
                                      FLAG_SKIP_COMP_BESTINTRA |
@@ -933,14 +932,15 @@
         sf->subpel_iters_per_step = 1;
         sf->disable_split_var_thresh = 64;
         sf->disable_filter_search_var_thresh = 96;
-        sf->intra_y_mode_mask = INTRA_DC_ONLY;
-        sf->intra_uv_mode_mask = INTRA_DC_ONLY;
+        for (i = 0; i < TX_SIZES; i++) {
+          sf->intra_y_mode_mask[i] = INTRA_DC_ONLY;
+          sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
+        }
         sf->use_fast_coef_updates = 2;
         sf->adaptive_rd_thresh = 4;
         sf->mode_skip_start = 6;
       }
       break;
-
   }; /* switch */
 
   // Set rd thresholds based on mode and speed setting
@@ -997,20 +997,6 @@
                        "Failed to allocate altref buffer");
 }
 
-static int alloc_partition_data(VP9_COMP *cpi) {
-  vpx_free(cpi->mb.pip);
-
-  cpi->mb.pip = vpx_calloc(cpi->common.mode_info_stride *
-                           (cpi->common.mi_rows + MI_BLOCK_SIZE),
-                           sizeof(PARTITION_INFO));
-  if (!cpi->mb.pip)
-    return 1;
-
-  cpi->mb.pi = cpi->mb.pip + cpi->common.mode_info_stride + 1;
-
-  return 0;
-}
-
 void vp9_alloc_compressor_data(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
 
@@ -1018,10 +1004,6 @@
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffers");
 
-  if (alloc_partition_data(cpi))
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate partition data");
-
   if (vp9_alloc_frame_buffer(&cpi->last_frame_uf,
                              cm->width, cm->height,
                              cm->subsampling_x, cm->subsampling_y,
@@ -1093,10 +1075,6 @@
 }
 
 
-// TODO perhaps change number of steps expose to outside world when setting
-// max and min limits. Also this will likely want refining for the extended Q
-// range.
-//
 // Table that converts 0-63 Q range values passed in outside to the Qindex
 // range used internally.
 static const int q_trans[] = {
@@ -1123,11 +1101,14 @@
   if (framerate < 0.1)
     framerate = 30;
 
-  cpi->oxcf.framerate             = framerate;
-  cpi->output_framerate            = cpi->oxcf.framerate;
-  cpi->per_frame_bandwidth          = (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate);
-  cpi->av_per_frame_bandwidth        = (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate);
-  cpi->min_frame_bandwidth          = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+  cpi->oxcf.framerate = framerate;
+  cpi->output_framerate = cpi->oxcf.framerate;
+  cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth /
+                                   cpi->output_framerate);
+  cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth /
+                                      cpi->output_framerate);
+  cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth *
+                                   cpi->oxcf.two_pass_vbrmin_section / 100);
 
 
   cpi->min_frame_bandwidth = MAX(cpi->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
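Worked numbers for the bit-budget arithmetic above, using illustrative inputs
rather than values from the patch:

#include <stdio.h>

int main(void) {
  const double target_bandwidth = 500000.0;  /* bits per second */
  const double output_framerate = 30.0;      /* frames per second */
  const int two_pass_vbrmin_section = 5;     /* percent */

  const int per_frame_bandwidth = (int)(target_bandwidth / output_framerate);
  const int min_frame_bandwidth =
      per_frame_bandwidth * two_pass_vbrmin_section / 100;

  /* Prints: per-frame 16666 bits, floor 833 bits */
  printf("per-frame %d bits, floor %d bits\n",
         per_frame_bandwidth, min_frame_bandwidth);
  return 0;
}
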
@@ -1260,14 +1241,8 @@
   cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
 
   cpi->oxcf.lossless = oxcf->lossless;
-  if (cpi->oxcf.lossless) {
-    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_iwalsh4x4_1_add;
-    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_iwalsh4x4_add;
-  } else {
-    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_idct4x4_1_add;
-    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_idct4x4_add;
-  }
-
+  cpi->mb.e_mbd.itxm_add = cpi->oxcf.lossless ? vp9_iwht4x4_add
+                                              : vp9_idct4x4_add;
   cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
 
   cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
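The two per-mode assignments collapse into a single function pointer selected
by the lossless flag. A sketch of the pattern, with stub transforms standing
in for vp9_iwht4x4_add and vp9_idct4x4_add:

#include <stdint.h>

typedef void (*itxm_add_fn)(const int16_t *input, uint8_t *dest, int stride);

static void iwht4x4_add_stub(const int16_t *in, uint8_t *dst, int stride) {
  (void)in; (void)dst; (void)stride;  /* stand-in for vp9_iwht4x4_add */
}

static void idct4x4_add_stub(const int16_t *in, uint8_t *dst, int stride) {
  (void)in; (void)dst; (void)stride;  /* stand-in for vp9_idct4x4_add */
}

static itxm_add_fn pick_itxm_add(int lossless) {
  /* Lossless coding uses the Walsh-Hadamard transform, lossy the DCT. */
  return lossless ? iwht4x4_add_stub : idct4x4_add_stub;
}
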
@@ -1280,7 +1255,7 @@
   cm->reset_frame_context = 0;
 
   setup_features(cm);
-  cpi->mb.e_mbd.allow_high_precision_mv = 0;   // Default mv precision adaptation
+  cpi->mb.e_mbd.allow_high_precision_mv = 0;  // Default mv precision
   set_mvcost(&cpi->mb);
 
   {
@@ -1521,7 +1496,7 @@
   /*Initialize the feed-forward activity masking.*/
   cpi->activity_avg = 90 << 12;
 
-  cpi->frames_since_key = 8;        // Give a sensible default for the first frame.
+  cpi->frames_since_key = 8;  // Sensible default for first frame.
   cpi->key_frame_frequency = cpi->oxcf.key_freq;
   cpi->this_key_frame_forced = 0;
   cpi->next_key_frame_forced = 0;
@@ -1803,8 +1778,10 @@
       FILE *f = fopen("opsnr.stt", "a");
       double time_encoded = (cpi->last_end_time_stamp_seen
                              - cpi->first_time_stamp_ever) / 10000000.000;
-      double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data)   / 1000.000;
-      double dr = (double)cpi->bytes * (double) 8 / (double)1000  / time_encoded;
+      double total_encode_time = (cpi->time_receive_data +
+                                  cpi->time_compress_data) / 1000.000;
+      double dr = (double)cpi->bytes * (double)8 / (double)1000 /
+                  time_encoded;
 
       if (cpi->b_calculate_psnr) {
         YV12_BUFFER_CONFIG *lst_yv12 =
@@ -1824,20 +1801,15 @@
                 dr, cpi->total / cpi->count, total_psnr,
                 cpi->totalp / cpi->count, total_psnr2, total_ssim, total_ssimp,
                 total_encode_time);
-//         fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f %10ld\n",
-//                 dr, cpi->total / cpi->count, total_psnr,
-//                 cpi->totalp / cpi->count, total_psnr2, total_ssim,
-//                 total_encode_time, cpi->tot_recode_hits);
       }
 
       if (cpi->b_calculate_ssimg) {
         fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t  Time(ms)\n");
         fprintf(f, "%7.2f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
-                cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
-                cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time);
-//                fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f  %10ld\n", dr,
-//                        cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
-//                        cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time, cpi->tot_recode_hits);
+                cpi->total_ssimg_y / cpi->count,
+                cpi->total_ssimg_u / cpi->count,
+                cpi->total_ssimg_v / cpi->count,
+                cpi->total_ssimg_all / cpi->count, total_encode_time);
       }
 
       fclose(f);
@@ -1884,11 +1856,9 @@
                      "[INTRA_MODES] =\n{\n");
 
       for (i = 0; i < INTRA_MODES; i++) {
-
         fprintf(fmode, "    { // Above Mode :  %d\n", i);
 
         for (j = 0; j < INTRA_MODES; j++) {
-
           fprintf(fmode, "        {");
 
           for (k = 0; k < INTRA_MODES; k++) {
@@ -1899,11 +1869,9 @@
           }
 
           fprintf(fmode, "}, // left_mode %d\n", j);
-
         }
 
         fprintf(fmode, "    },\n");
-
       }
 
       fprintf(fmode, "};\n");
@@ -1937,14 +1905,14 @@
              (cpi->time_receive_data + cpi->time_compress_data) / 1000);
     }
 #endif
-
   }
 
   dealloc_compressor_data(cpi);
   vpx_free(cpi->mb.ss);
   vpx_free(cpi->tok);
 
-  for (i = 0; i < sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]); i++) {
+  for (i = 0; i < sizeof(cpi->mbgraph_stats) /
+                  sizeof(cpi->mbgraph_stats[0]); ++i) {
     vpx_free(cpi->mbgraph_stats[i].mb_stats);
   }
 
@@ -1971,7 +1939,6 @@
     fclose(kf_list);
 
 #endif
-
 }
 
 
@@ -2292,14 +2259,15 @@
     cpi->frames_since_golden = 0;
 
     // ******** Fixed Q test code only ************
-    // If we are going to use the ALT reference for the next group of frames set a flag to say so.
+    // If we are going to use the ALT reference for the next group of frames
+    // set a flag to say so.
     if (cpi->oxcf.fixed_q >= 0 &&
         cpi->oxcf.play_alternate && !cpi->refresh_alt_ref_frame) {
       cpi->source_alt_ref_pending = 1;
       cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
 
-      // TODO(ivan): for SVC encoder, GF automatic update is disabled by using a
-      // large GF_interval
+      // TODO(ivan): For SVC encoder, GF automatic update is disabled by using
+      // a large GF_interval.
       if (cpi->use_svc) {
         cpi->frames_till_gf_update_due = INT_MAX;
       }
@@ -2339,12 +2307,12 @@
   return i;
 }
 
-static void Pass1Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags) {
+static void Pass1Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest,
+                        unsigned int *frame_flags) {
   (void) size;
   (void) dest;
   (void) frame_flags;
 
-
   vp9_set_quantizer(cpi, find_fp_qindex());
   vp9_first_pass(cpi);
 }
@@ -2352,13 +2320,11 @@
 #define WRITE_RECON_BUFFER 0
 #if WRITE_RECON_BUFFER
 void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-
-  // write the frame
   FILE *yframe;
   int i;
   char filename[255];
 
-  sprintf(filename, "cx\\y%04d.raw", this_frame);
+  snprintf(filename, sizeof(filename), "cx\\y%04d.raw", this_frame);
   yframe = fopen(filename, "wb");
 
   for (i = 0; i < frame->y_height; i++)
@@ -2366,7 +2332,7 @@
            frame->y_width, 1, yframe);
 
   fclose(yframe);
-  sprintf(filename, "cx\\u%04d.raw", this_frame);
+  snprintf(filename, sizeof(filename), "cx\\u%04d.raw", this_frame);
   yframe = fopen(filename, "wb");
 
   for (i = 0; i < frame->uv_height; i++)
@@ -2374,7 +2340,7 @@
            frame->uv_width, 1, yframe);
 
   fclose(yframe);
-  sprintf(filename, "cx\\v%04d.raw", this_frame);
+  snprintf(filename, sizeof(filename), "cx\\v%04d.raw", this_frame);
   yframe = fopen(filename, "wb");
 
   for (i = 0; i < frame->uv_height; i++)
@@ -2396,8 +2362,10 @@
   for (i = 1; i < frame->y_height - 1; i++) {
     for (j = 1; j < frame->y_width - 1; j++) {
       /* Sobel hor and ver gradients */
-      int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) + (next[1] - next[-1]);
-      int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) + (prev[-1] - next[-1]);
+      int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) +
+              (next[1] - next[-1]);
+      int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) +
+              (prev[-1] - next[-1]);
       h = (h < 0 ? -h : h);
       v = (v < 0 ? -v : v);
       if (h > EDGE_THRESH || v > EDGE_THRESH)
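A self-contained sketch of the Sobel edge count this hunk reflows; the
EDGE_THRESH value and frame layout here are assumptions for illustration:

#include <stdlib.h>

#define EDGE_THRESH 128  /* assumed threshold */

static int count_edge_pels(const unsigned char *buf, int w, int h, int stride) {
  int i, j, num_edge_pels = 0;
  for (i = 1; i < h - 1; i++) {
    const unsigned char *prev = buf + (i - 1) * stride;
    const unsigned char *curr = buf + i * stride;
    const unsigned char *next = buf + (i + 1) * stride;
    for (j = 1; j < w - 1; j++) {
      /* Horizontal and vertical Sobel responses centred on (i, j). */
      const int v = 2 * (curr[j + 1] - curr[j - 1]) +
                    (prev[j + 1] - prev[j - 1]) + (next[j + 1] - next[j - 1]);
      const int g = 2 * (prev[j] - next[j]) +
                    (prev[j + 1] - next[j + 1]) + (prev[j - 1] - next[j - 1]);
      if (abs(g) > EDGE_THRESH || abs(v) > EDGE_THRESH)
        num_edge_pels++;
    }
  }
  return num_edge_pels;
}
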
@@ -2433,10 +2401,9 @@
     if (((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
         ((cpi->projected_frame_size < low_limit) && (q > minq))) {
       force_recode = 1;
-    }
-    // Special Constrained quality tests
-    else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-      // Undershoot and below auto cq level
+    } else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+      // Deal with frame undershoot and whether or not we are
+      // below the automatically set cq level.
       if (q > cpi->cq_target_quality &&
           cpi->projected_frame_size < ((cpi->this_frame_target * 7) >> 3)) {
         force_recode = 1;
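The recode decision above reduces to a range test plus q headroom; a hedged
sketch of the common case (the constrained-quality branch adds the 7/8
undershoot check shown in the hunk):

static int needs_recode(int projected_size, int low_limit, int high_limit,
                        int q, int minq, int maxq) {
  /* Recode on overshoot with room to raise q, or undershoot with room
   * to lower it. */
  return (projected_size > high_limit && q < maxq) ||
         (projected_size < low_limit && q > minq);
}
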
@@ -2597,152 +2564,75 @@
         }
 }
 
+#if 0 && CONFIG_INTERNAL_STATS
+static void output_frame_level_debug_stats(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
+  int recon_err;
 
-static void encode_frame_to_data_rate(VP9_COMP *cpi,
-                                      unsigned long *size,
-                                      unsigned char *dest,
-                                      unsigned int *frame_flags) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  TX_SIZE t;
-  int q;
-  int frame_over_shoot_limit;
-  int frame_under_shoot_limit;
+  vp9_clear_system_state();  // __asm emms;
 
-  int loop = 0;
-  int loop_count;
+  recon_err = vp9_calc_ss_err(cpi->Source,
+                              &cm->yv12_fb[cm->new_fb_idx]);
 
-  int q_low;
-  int q_high;
+  if (cpi->twopass.total_left_stats.coded_error != 0.0)
+    fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
+        "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
+        "%6d %6d %5d %5d %5d %8.2f %10d %10.3f"
+        "%10.3f %8d %10d %10d %10d\n",
+        cpi->common.current_video_frame, cpi->this_frame_target,
+        cpi->projected_frame_size, 0 /* loop_size_estimate */,
+        (cpi->projected_frame_size - cpi->this_frame_target),
+        (int)cpi->total_target_vs_actual,
+        (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
+        (int)cpi->total_actual_bits, cm->base_qindex,
+        vp9_convert_qindex_to_q(cm->base_qindex),
+        (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
+        vp9_convert_qindex_to_q(cpi->active_best_quality),
+        vp9_convert_qindex_to_q(cpi->active_worst_quality), cpi->avg_q,
+        vp9_convert_qindex_to_q(cpi->ni_av_qi),
+        vp9_convert_qindex_to_q(cpi->cq_target_quality),
+        cpi->refresh_last_frame, cpi->refresh_golden_frame,
+        cpi->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost,
+        cpi->twopass.est_max_qcorrection_factor, (int)cpi->twopass.bits_left,
+        cpi->twopass.total_left_stats.coded_error,
+        (double)cpi->twopass.bits_left /
+            (1 + cpi->twopass.total_left_stats.coded_error),
+        cpi->tot_recode_hits, recon_err, cpi->kf_boost, cpi->kf_zeromotion_pct);
 
-  int top_index;
-  int bottom_index;
-  int active_worst_qchanged = 0;
+  fclose(f);
 
-  int overshoot_seen = 0;
-  int undershoot_seen = 0;
+  if (0) {
+    FILE *const fmodes = fopen("Modes.stt", "a");
+    int i;
 
-  SPEED_FEATURES *sf = &cpi->sf;
-  unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height);
-  struct segmentation *seg = &cm->seg;
+    fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame,
+            cm->frame_type, cpi->refresh_golden_frame,
+            cpi->refresh_alt_ref_frame);
 
-  /* Scale the source buffer, if required */
-  if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width ||
-      cm->mi_rows * 8 != cpi->un_scaled_source->y_height) {
-    scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source);
-    cpi->Source = &cpi->scaled_source;
-  } else {
-    cpi->Source = cpi->un_scaled_source;
+    for (i = 0; i < MAX_MODES; ++i)
+      fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
+    for (i = 0; i < MAX_REFS; ++i)
+      fprintf(fmodes, "%5d ", cpi->sub8x8_mode_chosen_counts[i]);
+
+    fprintf(fmodes, "\n");
+
+    fclose(fmodes);
   }
+}
+#endif
 
-  scale_references(cpi);
-
-  // Clear down mmx registers to allow floating point in what follows
-  vp9_clear_system_state();
-
-
-  // For an alt ref frame in 2 pass we skip the call to the second
-  // pass function that sets the target bandwidth so must set it here
-  if (cpi->refresh_alt_ref_frame) {
-    // Per frame bit target for the alt ref frame
-    cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
-    // per second target bitrate
-    cpi->target_bandwidth = (int)(cpi->twopass.gf_bits *
-                                  cpi->output_framerate);
-  }
-
-  // Clear zbin over-quant value and mode boost values.
-  cpi->zbin_mode_boost = 0;
-
-  // Enable or disable mode based tweaking of the zbin
-  // For 2 Pass Only used where GF/ARF prediction quality
-  // is above a threshold
-  cpi->zbin_mode_boost = 0;
-
-  // if (cpi->oxcf.lossless)
-    cpi->zbin_mode_boost_enabled = 0;
-  // else
-  //   cpi->zbin_mode_boost_enabled = 1;
-
-  // Current default encoder behaviour for the altref sign bias
-    cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = cpi->source_alt_ref_active;
-
-  // Check to see if a key frame is signaled
-  // For two pass with auto key frame enabled cm->frame_type may already be set, but not for one pass.
-  if ((cm->current_video_frame == 0) ||
-      (cm->frame_flags & FRAMEFLAGS_KEY) ||
-      (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0))) {
-    // Key frame from VFW/auto-keyframe/first frame
-    cm->frame_type = KEY_FRAME;
-  }
-
-  // Set default state for segment based loop filter update flags
-  cm->lf.mode_ref_delta_update = 0;
-
-  // Initialize cpi->mv_step_param to default based on max resolution
-  cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def);
-  // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate.
-  if (sf->auto_mv_step_size) {
-    if ((cpi->common.frame_type == KEY_FRAME) || cpi->common.intra_only) {
-      // initialize max_mv_magnitude for use in the first INTER frame
-      // after a key/intra-only frame
-      cpi->max_mv_magnitude = max_mv_def;
-    } else {
-      if (cm->show_frame)
-        // allow mv_steps to correspond to twice the max mv magnitude found
-        // in the previous frame, capped by the default max_mv_magnitude based
-        // on resolution
-        cpi->mv_step_param = vp9_init_search_range(
-            cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
-      cpi->max_mv_magnitude = 0;
-    }
-  }
-
-  // Set various flags etc to special state if it is a key frame
-  if (cm->frame_type == KEY_FRAME) {
-    // Reset the loop filter deltas and segmentation map
-    setup_features(cm);
-
-    // If segmentation is enabled force a map update for key frames
-    if (seg->enabled) {
-      seg->update_map = 1;
-      seg->update_data = 1;
-    }
-
-    // The alternate reference frame cannot be active for a key frame
-    cpi->source_alt_ref_active = 0;
-
-    cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
-    cm->frame_parallel_decoding_mode =
-      (cpi->oxcf.frame_parallel_decoding_mode != 0);
-    if (cm->error_resilient_mode) {
-      cm->frame_parallel_decoding_mode = 1;
-      cm->reset_frame_context = 0;
-      cm->refresh_frame_context = 0;
-    }
-  }
-
-  // Configure experimental use of segmentation for enhanced coding of
-  // static regions if indicated.
-  // Only allowed for now in second pass of two pass (as requires lagged coding)
-  // and if the relevant speed feature flag is set.
-  if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) {
-    configure_static_seg_features(cpi);
-  }
-
-  // Decide how big to make the frame
-  vp9_pick_frame_size(cpi);
-
-  vp9_clear_system_state();
-
+static int pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
+                                      int *bottom_index, int *top_index) {
   // Set an active best quality and if necessary active worst quality
-  q = cpi->active_worst_quality;
+  int q = cpi->active_worst_quality;
+  VP9_COMMON *const cm = &cpi->common;
 
   if (cm->frame_type == KEY_FRAME) {
 #if !CONFIG_MULTIPLE_ARF
-      // Special case for key frames forced because we have reached
-      // the maximum key frame interval. Here force the Q to a range
-      // based on the ambient Q to reduce the risk of popping
+    // Handle the special case for key frames forced when we have reached
+    // the maximum key frame interval. Here force the Q to a range
+    // based on the ambient Q to reduce the risk of popping.
     if (cpi->this_key_frame_forced) {
       int delta_qindex;
       int qindex = cpi->last_boosted_qindex;
@@ -2786,12 +2676,13 @@
     cpi->active_best_quality = cpi->active_worst_quality
         + compute_qdelta(cpi, current_q, current_q * 0.3);
 #endif
-  } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
+  } else if (!cpi->is_src_frame_alt_ref &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
     int high = 2000;
     int low = 400;
 
     // Use the lower of cpi->active_worst_quality and recent
-    // average Q as basis for GF/ARF Q limit unless last frame was
+    // average Q as basis for GF/ARF best Q limit unless last frame was
     // a key frame.
     if (cpi->frames_since_key > 1 &&
         cpi->avg_frame_qindex < cpi->active_worst_quality) {
@@ -2832,14 +2723,10 @@
         }
       }
     } else {
-      if (!cpi->refresh_alt_ref_frame) {
-        cpi->active_best_quality = inter_minq[q];
-      } else {
         cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
                                                       low, high,
                                                       gf_low_motion_minq,
                                                       gf_high_motion_minq);
-      }
     }
   } else {
     if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
@@ -2855,7 +2742,7 @@
       cpi->active_best_quality = inter_minq[q];
 #endif
 
-      // For the constant/constrained quality mode we don't want
+      // For the constrained quality mode we don't want
       // q to fall below the cq level.
       if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
           (cpi->active_best_quality < cpi->cq_target_quality)) {
@@ -2883,16 +2770,171 @@
   if (cpi->active_worst_quality < cpi->active_best_quality)
     cpi->active_worst_quality = cpi->active_best_quality;
 
-  // Special case code to try and match quality with forced key frames
+  // Limit Q range for the adaptive loop.
+  if (cm->frame_type == KEY_FRAME && !cpi->this_key_frame_forced) {
+    *top_index = cpi->active_best_quality;
+  } else if (!cpi->is_src_frame_alt_ref &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+    *top_index =
+      (cpi->active_worst_quality + cpi->active_best_quality * 3) / 4;
+  } else {
+    *top_index = cpi->active_worst_quality;
+  }
+  *bottom_index = cpi->active_best_quality;
+
   if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
     q = cpi->active_best_quality;
+  // Special case code to try and match quality with forced key frames
   } else if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
     q = cpi->last_boosted_qindex;
   } else {
     // Determine initial Q to try
     q = vp9_regulate_q(cpi, cpi->this_frame_target);
+    if (q > *top_index)
+      q = *top_index;
   }
 
+  return q;
+}
+
+static void encode_frame_to_data_rate(VP9_COMP *cpi,
+                                      unsigned long *size,
+                                      unsigned char *dest,
+                                      unsigned int *frame_flags) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  TX_SIZE t;
+  int q;
+  int frame_over_shoot_limit;
+  int frame_under_shoot_limit;
+
+  int loop = 0;
+  int loop_count;
+
+  int q_low;
+  int q_high;
+
+  int top_index;
+  int bottom_index;
+  int active_worst_qchanged = 0;
+
+  int overshoot_seen = 0;
+  int undershoot_seen = 0;
+
+  SPEED_FEATURES *const sf = &cpi->sf;
+  unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height);
+  struct segmentation *const seg = &cm->seg;
+
+  /* Scale the source buffer, if required. */
+  if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width ||
+      cm->mi_rows * 8 != cpi->un_scaled_source->y_height) {
+    scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source);
+    cpi->Source = &cpi->scaled_source;
+  } else {
+    cpi->Source = cpi->un_scaled_source;
+  }
+  scale_references(cpi);
+
+  // Clear down mmx registers to allow floating point in what follows.
+  vp9_clear_system_state();
+
+  // For an alt ref frame in 2 pass we skip the call to the second
+  // pass function that sets the target bandwidth so we must set it here.
+  if (cpi->refresh_alt_ref_frame) {
+    // Set a per frame bit target for the alt ref frame.
+    cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
+    // Set a per second target bitrate.
+    cpi->target_bandwidth = (int)(cpi->twopass.gf_bits * cpi->output_framerate);
+  }
+
+  // Clear zbin over-quant value and mode boost values.
+  cpi->zbin_mode_boost = 0;
+
+  // Enable or disable mode based tweaking of the zbin.
+  // For 2 pass only used where GF/ARF prediction quality
+  // is above a threshold.
+  cpi->zbin_mode_boost = 0;
+  cpi->zbin_mode_boost_enabled = 0;
+
+  // Current default encoder behavior for the altref sign bias.
+  cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = cpi->source_alt_ref_active;
+
+  // Check to see if a key frame is signaled.
+  // For two pass with auto key frame enabled cm->frame_type may already be
+  // set, but not for one pass.
+  if ((cm->current_video_frame == 0) ||
+      (cm->frame_flags & FRAMEFLAGS_KEY) ||
+      (cpi->oxcf.auto_key && (cpi->frames_since_key %
+                              cpi->key_frame_frequency == 0))) {
+    // Set the frame type to key frame for forced key frames, when the
+    // maximum distance for automatic keyframe selection is exceeded, or for
+    // the first frame.
+    cm->frame_type = KEY_FRAME;
+  }
+
+  // Set default state for segment based loop filter update flags.
+  cm->lf.mode_ref_delta_update = 0;
+
+  // Initialize cpi->mv_step_param to default based on max resolution.
+  cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def);
+  // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate.
+  if (sf->auto_mv_step_size) {
+    if ((cpi->common.frame_type == KEY_FRAME) || cpi->common.intra_only) {
+      // Initialize max_mv_magnitude for use in the first INTER frame
+      // after a key/intra-only frame.
+      cpi->max_mv_magnitude = max_mv_def;
+    } else {
+      if (cm->show_frame)
+        // Allow mv_steps to correspond to twice the max mv magnitude found
+        // in the previous frame, capped by the default max_mv_magnitude based
+        // on resolution.
+        cpi->mv_step_param = vp9_init_search_range(
+            cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+      cpi->max_mv_magnitude = 0;
+    }
+  }
+
+  // Set various flags etc to special state if it is a key frame.
+  if (cm->frame_type == KEY_FRAME) {
+    // Reset the loop filter deltas and segmentation map.
+    setup_features(cm);
+
+    // If segmentation is enabled force a map update for key frames.
+    if (seg->enabled) {
+      seg->update_map = 1;
+      seg->update_data = 1;
+    }
+
+    // The alternate reference frame cannot be active for a key frame.
+    cpi->source_alt_ref_active = 0;
+
+    cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
+    cm->frame_parallel_decoding_mode =
+      (cpi->oxcf.frame_parallel_decoding_mode != 0);
+    if (cm->error_resilient_mode) {
+      cm->frame_parallel_decoding_mode = 1;
+      cm->reset_frame_context = 0;
+      cm->refresh_frame_context = 0;
+    }
+  }
+
+  // Configure experimental use of segmentation for enhanced coding of
+  // static regions if indicated.
+  // Only allowed in second pass of two pass (as requires lagged coding)
+  // and if the relevant speed feature flag is set.
+  if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) {
+    configure_static_seg_features(cpi);
+  }
+
+  // Decide how big to make the frame.
+  vp9_pick_frame_size(cpi);
+
+  vp9_clear_system_state();
+
+  q = pick_q_and_adjust_q_bounds(cpi, &bottom_index, &top_index);
+
+  q_high = top_index;
+  q_low  = bottom_index;
+
   vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,
                                 &frame_over_shoot_limit);
 
@@ -2915,16 +2957,9 @@
     q_high = q;
 
     printf("frame:%d q:%d\n", cm->current_video_frame, q);
-  } else {
-#endif
-    // Limit Q range for the adaptive loop.
-    bottom_index = cpi->active_best_quality;
-    top_index    = cpi->active_worst_quality;
-    q_low  = cpi->active_best_quality;
-    q_high = cpi->active_worst_quality;
-#if CONFIG_MULTIPLE_ARF
   }
 #endif
+
   loop_count = 0;
   vp9_zero(cpi->rd_tx_select_threshes);
 
@@ -2974,7 +3009,6 @@
     vp9_set_quantizer(cpi, q);
 
     if (loop_count == 0) {
-
       // Set up entropy depending on frame type.
       if (cm->frame_type == KEY_FRAME) {
         /* Choose which entropy context to use. When using a forward reference
@@ -3016,10 +3050,10 @@
       frame_over_shoot_limit = 1;
     active_worst_qchanged = 0;
 
-    // Special case handling for forced key frames
     if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
       loop = 0;
     } else {
+      // Special case handling for forced key frames
       if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
         int last_q = q;
         int kf_err = vp9_calc_ss_err(cpi->Source,
@@ -3261,9 +3295,11 @@
 
   // Keep a record of ambient average Q.
   if (cm->frame_type != KEY_FRAME)
-    cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
+    cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex +
+                            cm->base_qindex) >> 2;
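The ambient-Q tracker above is a fixed-point exponential moving average,
new = (3 * old + current + 2) / 4, where the +2 rounds to nearest:

static int update_avg_qindex(int avg_qindex, int base_qindex) {
  return (2 + 3 * avg_qindex + base_qindex) >> 2;  /* 3/4 old, 1/4 new */
}
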
 
-  // Keep a record from which we can calculate the average Q excluding GF updates and key frames
+  // Keep a record from which we can calculate the average Q excluding GF
+  // updates and key frames.
   if (cm->frame_type != KEY_FRAME &&
       !cpi->refresh_golden_frame &&
       !cpi->refresh_alt_ref_frame) {
@@ -3281,7 +3317,8 @@
   if (!cm->show_frame)
     cpi->bits_off_target -= cpi->projected_frame_size;
   else
-    cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size;
+    cpi->bits_off_target += cpi->av_per_frame_bandwidth -
+                            cpi->projected_frame_size;
 
   // Clip the buffer level at the maximum buffer size
   if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
@@ -3305,122 +3342,28 @@
   cpi->total_actual_bits += cpi->projected_frame_size;
 
   // Debug stats
-  cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
+  cpi->total_target_vs_actual += (cpi->this_frame_target -
+                                  cpi->projected_frame_size);
 
   cpi->buffer_level = cpi->bits_off_target;
 
-  // Update bits left to the kf and gf groups to account for overshoot or undershoot on these frames
+  // Update bits left to the kf and gf groups to account for overshoot or
+  // undershoot on these frames
   if (cm->frame_type == KEY_FRAME) {
-    cpi->twopass.kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+    cpi->twopass.kf_group_bits += cpi->this_frame_target -
+                                  cpi->projected_frame_size;
 
     cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0);
   } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
-    cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+    cpi->twopass.gf_group_bits += cpi->this_frame_target -
+                                  cpi->projected_frame_size;
 
     cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0);
   }
 
-  // Update the skip mb flag probabilities based on the distribution seen
-  // in this frame.
-  // update_base_skip_probs(cpi);
-
-#if 0  // CONFIG_INTERNAL_STATS
-  {
-    FILE *f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
-    int recon_err;
-
-    vp9_clear_system_state();  // __asm emms;
-
-    recon_err = vp9_calc_ss_err(cpi->Source,
-                                &cm->yv12_fb[cm->new_fb_idx]);
-
-    if (cpi->twopass.total_left_stats.coded_error != 0.0)
-      fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
-              "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
-              "%6d %6d %5d %5d %5d %8.2f %10d %10.3f"
-              "%10.3f %8d %10d %10d %10d\n",
-              cpi->common.current_video_frame, cpi->this_frame_target,
-              cpi->projected_frame_size, 0, //loop_size_estimate,
-              (cpi->projected_frame_size - cpi->this_frame_target),
-              (int)cpi->total_target_vs_actual,
-              (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
-              (int)cpi->total_actual_bits,
-              cm->base_qindex,
-              vp9_convert_qindex_to_q(cm->base_qindex),
-              (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
-              vp9_convert_qindex_to_q(cpi->active_best_quality),
-              vp9_convert_qindex_to_q(cpi->active_worst_quality),
-              cpi->avg_q,
-              vp9_convert_qindex_to_q(cpi->ni_av_qi),
-              vp9_convert_qindex_to_q(cpi->cq_target_quality),
-              cpi->refresh_last_frame,
-              cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame,
-              cm->frame_type, cpi->gfu_boost,
-              cpi->twopass.est_max_qcorrection_factor,
-              (int)cpi->twopass.bits_left,
-              cpi->twopass.total_left_stats.coded_error,
-              (double)cpi->twopass.bits_left /
-              cpi->twopass.total_left_stats.coded_error,
-              cpi->tot_recode_hits, recon_err, cpi->kf_boost,
-              cpi->kf_zeromotion_pct);
-    else
-      fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
-              "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
-              "%5d %5d %5d %8d %8d %8.2f %10d %10.3f"
-              "%8d %10d %10d %10d\n",
-              cpi->common.current_video_frame,
-              cpi->this_frame_target, cpi->projected_frame_size,
-              0, //loop_size_estimate,
-              (cpi->projected_frame_size - cpi->this_frame_target),
-              (int)cpi->total_target_vs_actual,
-              (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
-              (int)cpi->total_actual_bits,
-              cm->base_qindex,
-              vp9_convert_qindex_to_q(cm->base_qindex),
-              (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
-              vp9_convert_qindex_to_q(cpi->active_best_quality),
-              vp9_convert_qindex_to_q(cpi->active_worst_quality),
-              cpi->avg_q,
-              vp9_convert_qindex_to_q(cpi->ni_av_qi),
-              vp9_convert_qindex_to_q(cpi->cq_target_quality),
-              cpi->refresh_last_frame,
-              cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame,
-              cm->frame_type, cpi->gfu_boost,
-              cpi->twopass.est_max_qcorrection_factor,
-              (int)cpi->twopass.bits_left,
-              cpi->twopass.total_left_stats.coded_error,
-              cpi->tot_recode_hits, recon_err, cpi->kf_boost,
-              cpi->kf_zeromotion_pct);
-
-    fclose(f);
-
-    if (0) {
-      FILE *fmodes = fopen("Modes.stt", "a");
-      int i;
-
-      fprintf(fmodes, "%6d:%1d:%1d:%1d ",
-              cpi->common.current_video_frame,
-              cm->frame_type, cpi->refresh_golden_frame,
-              cpi->refresh_alt_ref_frame);
-
-      for (i = 0; i < MAX_MODES; ++i)
-        fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
-      for (i = 0; i < MAX_REFS; ++i)
-        fprintf(fmodes, "%5d ", cpi->sub8x8_mode_chosen_counts[i]);
-
-      fprintf(fmodes, "\n");
-
-      fclose(fmodes);
-    }
-  }
-
-#endif
-
 #if 0
-  // Debug stats for segment feature experiments.
-  print_seg_map(cpi);
+  output_frame_level_debug_stats(cpi);
 #endif
-
   // If this was a kf or Gf note the Q
   if ((cm->frame_type == KEY_FRAME)
       || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
@@ -3504,7 +3447,8 @@
 #endif
   }
 
-  // Clear the one shot update flags for segmentation map and mode/ref loop filter deltas.
+  // Clear the one shot update flags for segmentation map and mode/ref loop
+  // filter deltas.
   cm->seg.update_map = 0;
   cm->seg.update_data = 0;
   cm->lf.mode_ref_delta_update = 0;
@@ -3536,28 +3480,10 @@
   // restore prev_mi
   cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
   cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1;
-
-  #if 0
-  {
-    char filename[512];
-    FILE *recon_file;
-    sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
-    recon_file = fopen(filename, "wb");
-    fwrite(cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].buffer_alloc,
-           cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].frame_size,
-           1, recon_file);
-    fclose(recon_file);
-  }
-#endif
-#ifdef OUTPUT_YUV_REC
-  vp9_write_yuv_rec_frame(cm);
-#endif
-
 }
 
 static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
                         unsigned char *dest, unsigned int *frame_flags) {
-
   cpi->enable_encode_breakout = 1;
 
   if (!cpi->refresh_alt_ref_frame)
@@ -3574,12 +3500,14 @@
   if (!cpi->refresh_alt_ref_frame) {
     double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate;
     double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
-                                        * cpi->oxcf.two_pass_vbrmin_section / 100);
+                                        * cpi->oxcf.two_pass_vbrmin_section
+                                        / 100);
 
     if (two_pass_min_rate < lower_bounds_min_rate)
       two_pass_min_rate = lower_bounds_min_rate;
 
-    cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.framerate);
+    cpi->twopass.bits_left += (int64_t)(two_pass_min_rate /
+                                        cpi->oxcf.framerate);
   }
 }
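The floor applied above keeps the two-pass allocation from dropping below pure
frame overhead; a minimal sketch of the same clamp:

static double min_section_rate(double target_bandwidth, int vbrmin_pct,
                               double frame_overhead_bits, double framerate) {
  const double min_rate = target_bandwidth * vbrmin_pct / 100;
  const double lower_bound = frame_overhead_bits * framerate;
  return min_rate < lower_bound ? lower_bound : min_rate;  /* clamp upward */
}
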
 
@@ -3917,7 +3845,6 @@
     cpi->bytes += *size;
 
     if (cm->show_frame) {
-
       cpi->count++;
 
       if (cpi->b_calculate_psnr) {
@@ -4027,9 +3954,9 @@
                               vp9_ppflags_t *flags) {
   VP9_COMP *cpi = (VP9_COMP *) comp;
 
-  if (!cpi->common.show_frame)
+  if (!cpi->common.show_frame) {
     return -1;
-  else {
+  } else {
     int ret;
 #if CONFIG_VP9_POSTPROC
     ret = vp9_post_proc_frame(&cpi->common, dest, flags);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 2652929..f88ae8a 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -36,7 +36,7 @@
 #define DISABLE_RC_LONG_TERM_MEM 0
 #endif
 
-#define MODE_TEST_HIT_STATS
+// #define MODE_TEST_HIT_STATS
 
 // #define SPEEDSTATS 1
 #if CONFIG_MULTIPLE_ARF
@@ -230,6 +230,7 @@
 #define ALL_INTRA_MODES 0x3FF
 #define INTRA_DC_ONLY 0x01
 #define INTRA_DC_TM ((1 << TM_PRED) | (1 << DC_PRED))
+#define INTRA_DC_H_V ((1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED))
 #define INTRA_DC_TM_H_V (INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED))
 
 typedef enum {
@@ -285,8 +286,8 @@
   // A source variance threshold below which filter search is disabled
   // Choose a very large value (UINT_MAX) to use 8-tap always
   unsigned int disable_filter_search_var_thresh;
-  int intra_y_mode_mask;
-  int intra_uv_mode_mask;
+  int intra_y_mode_mask[TX_SIZES];
+  int intra_uv_mode_mask[TX_SIZES];
   int use_rd_breakout;
   int use_uv_intra_rd_estimate;
   int use_fast_lpf_pick;
@@ -314,6 +315,7 @@
   MACROBLOCK mb;
   VP9_COMMON common;
   VP9_CONFIG oxcf;
+  struct rdcost_block_args rdcost_stack;
 
   struct lookahead_ctx    *lookahead;
   struct lookahead_entry  *source;
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index 239fd6b..476ecaa 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -54,7 +54,8 @@
   src += srcoffset;
   dst += dstoffset;
 
-  // Loop through the Y plane raw and reconstruction data summing (square differences)
+  // Loop through the raw Y plane and reconstruction data summing the square
+  // differences.
   for (i = 0; i < linestocopy; i += 16) {
     for (j = 0; j < source->y_width; j += 16) {
       unsigned int sse;
@@ -72,20 +73,6 @@
 // Enforce a minimum filter level based upon baseline Q
 static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) {
   int min_filter_level;
-  /*int q = (int) vp9_convert_qindex_to_q(base_qindex);
-
-  if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame && !cpi->common.refresh_alt_ref_frame)
-      min_filter_level = 0;
-  else
-  {
-      if (q <= 10)
-          min_filter_level = 0;
-      else if (q <= 64)
-          min_filter_level = 1;
-      else
-          min_filter_level = (q >> 6);
-  }
-  */
   min_filter_level = 0;
 
   return min_filter_level;
@@ -93,11 +80,7 @@
 
 // Enforce a maximum filter level based upon baseline Q
 static int get_max_filter_level(VP9_COMP *cpi, int base_qindex) {
-  // PGW August 2006: Highest filter values almost always a bad idea
-
-  // jbb chg: 20100118 - not so any more with this overquant stuff allow high values
-  // with lots of intra coming in.
-  int max_filter_level = MAX_LOOP_FILTER;// * 3 / 4;
+  int max_filter_level = MAX_LOOP_FILTER;
   (void)base_qindex;
 
   if (cpi->twopass.section_intra_rating > 8)
@@ -128,7 +111,7 @@
   int filt_best;
   int filt_direction = 0;
 
-  int Bias = 0;                       // Bias against raising loop filter and in favour of lowering it
+  int Bias = 0;  // Bias against raising loop filter in favor of lowering it.
 
   //  Make a copy of the unfiltered / processed recon buffer
   vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
@@ -136,7 +119,8 @@
   lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0
                                                     : cpi->oxcf.Sharpness;
 
-  // Start the search at the previous frame filter level unless it is now out of range.
+  // Start the search at the previous frame filter level unless it is now out of
+  // range.
   filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
 
   // Define the initial step size
@@ -153,9 +137,8 @@
   vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
 
   while (filter_step > 0) {
-    Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; // PGW change 12/12/06 for small images
+    Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
 
-    // jbb chg: 20100118 - in sections with lots of new material coming in don't bias as much to a low filter value
     if (cpi->twopass.section_intra_rating < 20)
       Bias = Bias * cpi->twopass.section_intra_rating / 20;
 
@@ -163,8 +146,12 @@
     if (cpi->common.tx_mode != ONLY_4X4)
       Bias >>= 1;
 
-    filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step);
-    filt_low = ((filt_mid - filter_step) < min_filter_level) ? min_filter_level : (filt_mid - filter_step);
+    filt_high = ((filt_mid + filter_step) > max_filter_level)
+                    ? max_filter_level
+                    : (filt_mid + filter_step);
+    filt_low = ((filt_mid - filter_step) < min_filter_level)
+                   ? min_filter_level
+                   : (filt_mid - filter_step);
 
     if ((filt_direction <= 0) && (filt_low != filt_mid)) {
       // Get Low filter error score
@@ -176,7 +163,8 @@
       //  Re-instate the unfiltered frame
       vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
 
-      // If value is close to the best so far then bias towards a lower loop filter value.
+      // If value is close to the best so far then bias towards a lower loop
+      // filter value.
       if ((filt_err - Bias) < best_err) {
         // Was it actually better than the previous best?
         if (filt_err < best_err)
@@ -215,4 +203,3 @@
 
   lf->filter_level = filt_best;
 }
-
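For context, the search in this file probes one step either side of the
current filter level, keeps the better candidate, then halves the step. A
hedged sketch with the error metric abstracted as a callback (the Bias term
and direction tracking of vp9_pick_filter_level are omitted):

typedef long long (*filt_err_fn)(int filter_level, void *ctx);

static int search_filter_level(int filt_mid, int min_level, int max_level,
                               int first_step, filt_err_fn eval, void *ctx) {
  int filt_best = filt_mid;
  long long best_err = eval(filt_mid, ctx);
  int step;

  for (step = first_step; step > 0; step /= 2) {
    const int lo = filt_mid - step < min_level ? min_level : filt_mid - step;
    const int hi = filt_mid + step > max_level ? max_level : filt_mid + step;

    if (lo != filt_mid) {
      const long long err = eval(lo, ctx);
      if (err < best_err) {
        best_err = err;
        filt_best = lo;
      }
    }
    if (hi != filt_mid) {
      const long long err = eval(hi, ctx);
      if (err < best_err) {
        best_err = err;
        filt_best = hi;
      }
    }
    filt_mid = filt_best;
  }
  return filt_best;
}
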
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index bbcad17..224d1e4 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -59,9 +59,8 @@
 
 int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
                     double correction_factor) {
-
   const double q = vp9_convert_qindex_to_q(qindex);
-  int enumerator = frame_type == KEY_FRAME ? 4000000 : 2500000;
+  int enumerator = frame_type == KEY_FRAME ? 3300000 : 2250000;
 
   // q based adjustment to baseline enumerator
   enumerator += (int)(enumerator * q) >> 12;
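The retune drops the key/inter enumerators from 4000000/2500000 to
3300000/2250000 while keeping the mild q-dependent growth term. A sketch of
the whole calculation, assuming the correction factor is applied as the final
scale as in vp9_bits_per_mb:

static int bits_per_mb_sketch(int is_key_frame, double q, double correction) {
  int enumerator = is_key_frame ? 3300000 : 2250000;
  enumerator += (int)(enumerator * q) >> 12;  /* grow slowly with q */
  return (int)(0.5 + enumerator * correction / q);
}
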
@@ -192,11 +191,12 @@
     cpi->this_frame_target = cpi->per_frame_bandwidth;
   }
 
-  // Sanity check that the total sum of adjustments is not above the maximum allowed
-  // That is that having allowed for KF and GF penalties we have not pushed the
-  // current interframe target to low. If the adjustment we apply here is not capable of recovering
-  // all the extra bits we have spent in the KF or GF then the remainder will have to be recovered over
-  // a longer time span via other buffer / rate control mechanisms.
+  // Check that the total sum of adjustments is not above the maximum allowed.
+  // That is, having allowed for the KF and GF penalties, we have not pushed
+  // the current inter-frame target too low. If the adjustment we apply here is
+  // not capable of recovering all the extra bits we have spent in the KF or GF,
+  // then the remainder will have to be recovered over a longer time span via
+  // other buffer / rate control mechanisms.
   if (cpi->this_frame_target < min_frame_target)
     cpi->this_frame_target = min_frame_target;
 
@@ -265,12 +265,12 @@
                                                  rate_correction_factor);
 
   // Work out a size correction factor.
-  // if ( cpi->this_frame_target > 0 )
-  //  correction_factor = (100 * cpi->projected_frame_size) / cpi->this_frame_target;
   if (projected_size_based_on_q > 0)
-    correction_factor = (100 * cpi->projected_frame_size) / projected_size_based_on_q;
+    correction_factor =
+        (100 * cpi->projected_frame_size) / projected_size_based_on_q;
 
-  // More heavily damped adjustment used if we have been oscillating either side of target
+  // More heavily damped adjustment used if we have been oscillating either side
+  // of target.
   switch (damp_var) {
     case 0:
       adjustment_limit = 0.75;
@@ -287,27 +287,29 @@
   // if ( (correction_factor > 102) && (Q < cpi->active_worst_quality) )
   if (correction_factor > 102) {
     // We are not already at the worst allowable quality
-    correction_factor = (int)(100.5 + ((correction_factor - 100) * adjustment_limit));
-    rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+    correction_factor =
+        (int)(100.5 + ((correction_factor - 100) * adjustment_limit));
+    rate_correction_factor =
+        ((rate_correction_factor * correction_factor) / 100);
 
     // Keep rate_correction_factor within limits
     if (rate_correction_factor > MAX_BPB_FACTOR)
       rate_correction_factor = MAX_BPB_FACTOR;
-  }
-  // else if ( (correction_factor < 99) && (Q > cpi->active_best_quality) )
-  else if (correction_factor < 99) {
+  } else if (correction_factor < 99) {
     // We are not already at the best allowable quality
-    correction_factor = (int)(100.5 - ((100 - correction_factor) * adjustment_limit));
-    rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+    correction_factor =
+        (int)(100.5 - ((100 - correction_factor) * adjustment_limit));
+    rate_correction_factor =
+        ((rate_correction_factor * correction_factor) / 100);
 
     // Keep rate_correction_factor within limits
     if (rate_correction_factor < MIN_BPB_FACTOR)
       rate_correction_factor = MIN_BPB_FACTOR;
   }
 
-  if (cpi->common.frame_type == KEY_FRAME)
+  if (cpi->common.frame_type == KEY_FRAME) {
     cpi->key_frame_rate_correction_factor = rate_correction_factor;
-  else {
+  } else {
     if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
       cpi->gf_rate_correction_factor = rate_correction_factor;
     else
@@ -326,20 +328,24 @@
   double correction_factor;
 
   // Select the appropriate correction factor based upon type of frame.
-  if (cpi->common.frame_type == KEY_FRAME)
+  if (cpi->common.frame_type == KEY_FRAME) {
     correction_factor = cpi->key_frame_rate_correction_factor;
-  else {
+  } else {
     if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
       correction_factor = cpi->gf_rate_correction_factor;
     else
       correction_factor = cpi->rate_correction_factor;
   }
 
-  // Calculate required scaling factor based on target frame size and size of frame produced using previous Q
+  // Calculate required scaling factor based on target frame size and size of
+  // frame produced using previous Q.
   if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS))
-    target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs) << BPER_MB_NORMBITS;       // Case where we would overflow int
+    target_bits_per_mb =
+        (target_bits_per_frame / cpi->common.MBs)
+        << BPER_MB_NORMBITS;  // Case where we would overflow int
   else
-    target_bits_per_mb = (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
+    target_bits_per_mb =
+        (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
 
   i = cpi->active_best_quality;
 
@@ -405,7 +411,6 @@
     }
 
     av_key_frame_frequency /= total_weight;
-
   }
   return av_key_frame_frequency;
 }
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 26bbc82..ba521af 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -36,7 +36,7 @@
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_entropy.h"
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_common.h"
 
@@ -45,9 +45,6 @@
 /* Factor to weigh the rate for switchable interp filters */
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
 
-DECLARE_ALIGNED(16, extern const uint8_t,
-                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
-
 #define LAST_FRAME_MODE_MASK    0xFFEDCD60
 #define GOLDEN_FRAME_MODE_MASK  0xFFDA3BB0
 #define ALT_REF_MODE_MASK       0xFFC648D0
@@ -110,8 +107,13 @@
 static int rd_thresh_block_size_factor[BLOCK_SIZES] =
   {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};
 
-#define MAX_RD_THRESH_FACT 64
-#define RD_THRESH_INC 1
+#define RD_THRESH_MAX_FACT 64
+#define RD_THRESH_INC      1
+#define RD_THRESH_POW      1.25
+#define RD_MULT_EPB_RATIO  64
+
+#define MV_COST_WEIGHT      108
+#define MV_COST_WEIGHT_SUB  120
 
 static void fill_token_costs(vp9_coeff_cost *c,
                              vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
@@ -161,7 +163,17 @@
 
 static int compute_rd_mult(int qindex) {
   const int q = vp9_dc_quant(qindex, 0);
-  return (11 * q * q) >> 2;
+  // TODO(debargha): Adjust the function below
+  return (88 * q * q / 25);
+}
+
+static int compute_rd_thresh_factor(int qindex) {
+  int q;
+  // TODO(debargha): Adjust the function below
+  q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12);
+  if (q < 8)
+    q = 8;
+  return q;
 }
 
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
@@ -169,35 +181,9 @@
   cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
 }
 
-void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
+static void set_block_thresholds(VP9_COMP *cpi, int qindex) {
   int q, i, bsize;
-
-  vp9_clear_system_state();  // __asm emms;
-
-  // Further tests required to see if optimum is different
-  // for key frames, golden frames and arf frames.
-  // if (cpi->common.refresh_golden_frame ||
-  //     cpi->common.refresh_alt_ref_frame)
-  qindex = clamp(qindex, 0, MAXQ);
-
-  cpi->RDDIV = 100;
-  cpi->RDMULT = compute_rd_mult(qindex);
-  if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
-    if (cpi->twopass.next_iiratio > 31)
-      cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
-    else
-      cpi->RDMULT +=
-          (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
-  }
-  cpi->mb.errorperbit = cpi->RDMULT >> 6;
-  cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
-
-  vp9_set_speed_features(cpi);
-
-  q = (int)pow(vp9_dc_quant(qindex, 0) >> 2, 1.25);
-  q <<= 2;
-  if (q < 8)
-    q = 8;
+  q = compute_rd_thresh_factor(qindex);
 
   for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
     for (i = 0; i < MAX_MODES; ++i) {
@@ -226,6 +212,34 @@
       }
     }
   }
+}
+
+void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
+  int i;
+
+  vp9_clear_system_state();  // __asm emms;
+
+  // Further tests required to see if optimum is different
+  // for key frames, golden frames and arf frames.
+  // if (cpi->common.refresh_golden_frame ||
+  //     cpi->common.refresh_alt_ref_frame)
+  qindex = clamp(qindex, 0, MAXQ);
+
+  cpi->RDDIV = RDDIV_BITS;  // in bits (to multiply D by 128)
+  cpi->RDMULT = compute_rd_mult(qindex);
+  if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
+    if (cpi->twopass.next_iiratio > 31)
+      cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
+    else
+      cpi->RDMULT +=
+          (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
+  }
+  cpi->mb.errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO;
+  cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
+
+  vp9_set_speed_features(cpi);
+
+  set_block_thresholds(cpi, qindex);
 
   fill_token_costs(cpi->mb.token_costs, cpi->common.fc.coef_probs);
 
@@ -249,10 +263,10 @@
       MB_PREDICTION_MODE m;
 
       for (m = NEARESTMV; m < MB_MODE_COUNT; m++)
-        cpi->mb.inter_mode_cost[i][m - NEARESTMV] =
+        cpi->mb.inter_mode_cost[i][inter_mode_offset(m)] =
             cost_token(vp9_inter_mode_tree,
                        cpi->common.fc.inter_mode_probs[i],
-                       vp9_inter_mode_encodings + (m - NEARESTMV));
+                       vp9_inter_mode_encodings + inter_mode_offset(m));
     }
   }
 }
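The cost table is now addressed through inter_mode_offset() rather than
open-coded subtraction. A hedged sketch of what such a helper presumably
computes, assuming NEARESTMV is the first inter mode in the prediction-mode
enum:

enum { NEARESTMV = 10, NEARMV, ZEROMV, NEWMV };  /* assumed enum values */

static int inter_mode_offset(int mode) {
  return mode - NEARESTMV;  /* maps NEARESTMV..NEWMV onto 0..3 */
}
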
@@ -462,12 +476,12 @@
   { 1, 2, 3, 4, 11, 1024 - 21, 0 },
 };
 
-static INLINE int cost_coeffs(MACROBLOCK *mb,
+static INLINE int cost_coeffs(MACROBLOCK *x,
                               int plane, int block,
                               ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
                               TX_SIZE tx_size,
                               const int16_t *scan, const int16_t *nb) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
+  MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
   struct macroblockd_plane *pd = &xd->plane[plane];
   const PLANE_TYPE type = pd->plane_type;
@@ -476,9 +490,9 @@
   const int16_t *const qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
   const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
   unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
-                   mb->token_costs[tx_size][type][ref];
+                   x->token_costs[tx_size][type][ref];
   const ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L;
-  uint8_t token_cache[1024];
+  uint8_t *p_tok = x->token_cache;
   int pt = combine_entropy_contexts(above_ec, left_ec);
   int c, cost;
 
@@ -497,7 +511,7 @@
     int v = qcoeff_ptr[0];
     int prev_t = vp9_dct_value_tokens_ptr[v].token;
     cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
-    token_cache[0] = vp9_pt_energy_class[prev_t];
+    p_tok[0] = vp9_pt_energy_class[prev_t];
     ++token_costs;
 
     // ac tokens
@@ -507,9 +521,9 @@
 
       v = qcoeff_ptr[rc];
       t = vp9_dct_value_tokens_ptr[v].token;
-      pt = get_coef_context(nb, token_cache, c);
+      pt = get_coef_context(nb, p_tok, c);
       cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
-      token_cache[rc] = vp9_pt_energy_class[t];
+      p_tok[rc] = vp9_pt_energy_class[t];
       prev_t = t;
       if (!--band_left) {
         band_left = *band_count++;
@@ -519,7 +533,7 @@
 
     // eob token
     if (band_left) {
-      pt = get_coef_context(nb, token_cache, c);
+      pt = get_coef_context(nb, p_tok, c);
       cost += (*token_costs)[0][pt][DCT_EOB_TOKEN];
     }
   }
@@ -530,25 +544,6 @@
   return cost;
 }
 
-struct rdcost_block_args {
-  MACROBLOCK *x;
-  ENTROPY_CONTEXT t_above[16];
-  ENTROPY_CONTEXT t_left[16];
-  TX_SIZE tx_size;
-  int bw;
-  int bh;
-  int rate[256];
-  int64_t dist[256];
-  int64_t sse[256];
-  int this_rate;
-  int64_t this_dist;
-  int64_t this_sse;
-  int64_t this_rd;
-  int64_t best_rd;
-  int skip;
-  const int16_t *scan, *nb;
-};
-
 static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) {
   const int ss_txfrm_size = tx_size << 1;
   struct rdcost_block_args* args = arg;
@@ -560,17 +555,17 @@
   int shift = args->tx_size == TX_32X32 ? 0 : 2;
   int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  args->dist[block] = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
-                                &this_sse) >> shift;
-  args->sse[block]  = this_sse >> shift;
+  args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
+                               &this_sse) >> shift;
+  args->sse  = this_sse >> shift;
 
   if (x->skip_encode &&
       xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME) {
     // TODO(jingning): tune the model to better capture the distortion.
     int64_t p = (pd->dequant[1] * pd->dequant[1] *
-                    (1 << ss_txfrm_size)) >> shift;
-    args->dist[block] = p;
-    args->sse[block]  = p;
+                    (1 << ss_txfrm_size)) >> (shift + 2);
+    args->dist += (p >> 4);
+    args->sse  += p;
   }
 }
 
@@ -581,10 +576,9 @@
   int x_idx, y_idx;
   txfrm_block_to_raster_xy(plane_bsize, args->tx_size, block, &x_idx, &y_idx);
 
-  args->rate[block] = cost_coeffs(args->x, plane, block,
-                                  args->t_above + x_idx,
-                                  args->t_left + y_idx, args->tx_size,
-                                  args->scan, args->nb);
+  args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx,
+                           args->t_left + y_idx, args->tx_size,
+                           args->scan, args->nb);
 }
 
 static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -605,17 +599,17 @@
 
   dist_block(plane, block, tx_size, args);
   rate_block(plane, block, plane_bsize, tx_size, args);
-  rd1 = RDCOST(x->rdmult, x->rddiv, args->rate[block], args->dist[block]);
-  rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse[block]);
+  rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist);
+  rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse);
 
   // TODO(jingning): temporarily enabled only for luma component
   rd = MIN(rd1, rd2);
   if (plane == 0)
     x->zcoeff_blk[tx_size][block] = rd1 > rd2;
 
-  args->this_rate += args->rate[block];
-  args->this_dist += args->dist[block];
-  args->this_sse  += args->sse[block];
+  args->this_rate += args->rate;
+  args->this_dist += args->dist;
+  args->this_sse  += args->sse;
   args->this_rd += rd;
 
   if (args->this_rd > args->best_rd) {
@@ -657,7 +651,20 @@
   }
 }
 
+static void init_rdcost_stack(MACROBLOCK *x, TX_SIZE tx_size,
+                              const int num_4x4_w, const int num_4x4_h,
+                              const int64_t ref_rdcost,
+                              struct rdcost_block_args *arg) {
+  vpx_memset(arg, 0, sizeof(struct rdcost_block_args));
+  arg->x = x;
+  arg->tx_size = tx_size;
+  arg->bw = num_4x4_w;
+  arg->bh = num_4x4_h;
+  arg->best_rd = ref_rdcost;
+}
+
 static void txfm_rd_in_plane(MACROBLOCK *x,
+                             struct rdcost_block_args *rd_stack,
                              int *rate, int64_t *distortion,
                              int *skippable, int64_t *sse,
                              int64_t ref_best_rd, int plane,
@@ -669,30 +676,29 @@
   const int num_4x4_h = num_4x4_blocks_high_lookup[bs];
   const uint8_t *band_translate;  // just for the get_scan_and_band call
 
-  struct rdcost_block_args args = { x, { 0 }, { 0 }, tx_size,
-                                    num_4x4_w, num_4x4_h,
-                                    { 0 }, { 0 }, { 0 },
-                                    0, 0, 0, 0, ref_best_rd, 0 };
+  init_rdcost_stack(x, tx_size, num_4x4_w, num_4x4_h,
+                    ref_best_rd, rd_stack);
   if (plane == 0)
     xd->this_mi->mbmi.tx_size = tx_size;
 
-  vp9_get_entropy_contexts(tx_size, args.t_above, args.t_left,
+  vp9_get_entropy_contexts(tx_size, rd_stack->t_above, rd_stack->t_left,
                            pd->above_context, pd->left_context,
                            num_4x4_w, num_4x4_h);
 
-  get_scan_and_band(xd, tx_size, pd->plane_type, 0, &args.scan, &args.nb,
-                    &band_translate);
+  get_scan_and_band(xd, tx_size, pd->plane_type, 0, &rd_stack->scan,
+                    &rd_stack->nb, &band_translate);
 
-  foreach_transformed_block_in_plane(xd, bsize, plane, block_yrd_txfm, &args);
-  if (args.skip) {
+  foreach_transformed_block_in_plane(xd, bsize, plane,
+                                     block_yrd_txfm, rd_stack);
+  if (rd_stack->skip) {
     *rate       = INT_MAX;
     *distortion = INT64_MAX;
     *sse        = INT64_MAX;
     *skippable  = 0;
   } else {
-    *distortion = args.this_dist;
-    *rate       = args.this_rate;
-    *sse        = args.this_sse;
+    *distortion = rd_stack->this_dist;
+    *rate       = rd_stack->this_rate;
+    *sse        = rd_stack->this_sse;
     *skippable  = vp9_is_skippable_in_plane(xd, bsize, plane);
   }
 }
@@ -720,7 +726,7 @@
   } else {
     mbmi->tx_size = TX_4X4;
   }
-  txfm_rd_in_plane(x, rate, distortion, skip,
+  txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip,
                    &sse[mbmi->tx_size], ref_best_rd, 0, bs,
                    mbmi->tx_size);
   cpi->tx_stepdown_count[0]++;
@@ -904,8 +910,8 @@
 
   // Actually encode using the chosen mode if a model was used, but do not
   // update the r, d costs
-  txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->tx_size],
-                   ref_best_rd, 0, bs, mbmi->tx_size);
+  txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip,
+                   &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size);
 
   if (max_tx_size == TX_32X32 &&
       rd[TX_32X32][1] <= rd[TX_16X16][1] &&
@@ -932,6 +938,7 @@
   int64_t d[TX_SIZES], sse[TX_SIZES];
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
+  struct rdcost_block_args *rdcost_stack = &cpi->rdcost_stack;
 
   assert(bs == mbmi->sb_type);
   if (mbmi->ref_frame[0] > INTRA_FRAME)
@@ -967,14 +974,16 @@
                                   skip, sse, ref_best_rd, bs);
   } else {
     if (bs >= BLOCK_32X32)
-      txfm_rd_in_plane(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
-                       &sse[TX_32X32], ref_best_rd, 0, bs, TX_32X32);
+      txfm_rd_in_plane(x, rdcost_stack, &r[TX_32X32][0], &d[TX_32X32],
+                       &s[TX_32X32], &sse[TX_32X32],
+                       ref_best_rd, 0, bs, TX_32X32);
     if (bs >= BLOCK_16X16)
-      txfm_rd_in_plane(x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
-                       &sse[TX_16X16], ref_best_rd, 0, bs, TX_16X16);
-    txfm_rd_in_plane(x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
+      txfm_rd_in_plane(x, rdcost_stack, &r[TX_16X16][0], &d[TX_16X16],
+                       &s[TX_16X16], &sse[TX_16X16],
+                       ref_best_rd, 0, bs, TX_16X16);
+    txfm_rd_in_plane(x, rdcost_stack, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
                      &sse[TX_8X8], ref_best_rd, 0, bs, TX_8X8);
-    txfm_rd_in_plane(x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
+    txfm_rd_in_plane(x, rdcost_stack, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
                      &sse[TX_4X4], ref_best_rd, 0, bs, TX_4X4);
     choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
                              skip, txfm_cache, bs);
@@ -1044,7 +1053,7 @@
     int64_t this_rd;
     int ratey = 0;
 
-    if (!(cpi->sf.intra_y_mode_mask & (1 << mode)))
+    if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
       continue;
 
     // Only do the oblique modes if the best so far is
@@ -1100,11 +1109,11 @@
           goto next;
 
         if (tx_type != DCT_DCT)
-          vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
+          vp9_iht4x4_16_add(BLOCK_OFFSET(pd->dqcoeff, block),
                                dst, pd->dst.stride, tx_type);
         else
-          xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
-                             dst, pd->dst.stride);
+          xd->itxm_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, pd->dst.stride,
+                       16);
       }
     }
 
@@ -1236,7 +1245,7 @@
     MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride];
     MODE_INFO *left_mi = xd->mi_8x8[-1];
 
-    if (!(cpi->sf.intra_y_mode_mask & (1 << mode)))
+    if (!(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]] & (1 << mode)))
       continue;
 
     if (cpi->common.frame_type == KEY_FRAME) {
@@ -1284,7 +1293,7 @@
   return best_rd;
 }
 
-static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
+static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x,
                              int *rate, int64_t *distortion, int *skippable,
                              int64_t *sse, BLOCK_SIZE bsize,
                              int64_t ref_best_rd) {
@@ -1307,7 +1316,7 @@
   *skippable = 1;
 
   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-    txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
+    txfm_rd_in_plane(x, &cpi->rdcost_stack, &pnrate, &pndist, &pnskip, &pnsse,
                      ref_best_rd, plane, bsize, uv_txfm_size);
     if (pnrate == INT_MAX)
       goto term;
@@ -1339,14 +1348,15 @@
   // int mode_mask = (bsize <= BLOCK_8X8)
   //                ? ALL_INTRA_MODES : cpi->sf.intra_uv_mode_mask;
 
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     // if (!(mode_mask & (1 << mode)))
-    if (!(cpi->sf.intra_uv_mode_mask & (1 << mode)))
+    if (!(cpi->sf.intra_uv_mode_mask[max_uv_txsize_lookup[bsize]]
+          & (1 << mode)))
       continue;
 
     x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode;
 
-    super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
+    super_block_uvrd(cpi, x, &this_rate_tokenonly,
                      &this_distortion, &s, &this_sse, bsize, best_rd);
     if (this_rate_tokenonly == INT_MAX)
       continue;
@@ -1377,8 +1387,8 @@
   int64_t this_sse;
 
   x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED;
-  super_block_uvrd(&cpi->common, x, rate_tokenonly,
-                   distortion, skippable, &this_sse, bsize, INT64_MAX);
+  super_block_uvrd(cpi, x, rate_tokenonly, distortion,
+                   skippable, &this_sse, bsize, INT64_MAX);
   *rate = *rate_tokenonly +
           x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED];
   this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
@@ -1416,7 +1426,7 @@
   // Don't account for mode here if segment skip is enabled.
   if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
     assert(is_inter_mode(mode));
-    return x->inter_mode_cost[mode_context][mode - NEARESTMV];
+    return x->inter_mode_cost[mode_context][inter_mode_offset(mode)];
   } else {
     return 0;
   }
@@ -1466,12 +1476,12 @@
     case NEWMV:
       this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
       thismvcost  = vp9_mv_bit_cost(&this_mv->as_mv, &best_ref_mv->as_mv,
-                                    mvjcost, mvcost, 102);
+                                    mvjcost, mvcost, MV_COST_WEIGHT_SUB);
       if (has_second_rf) {
         this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
         thismvcost += vp9_mv_bit_cost(&this_second_mv->as_mv,
                                       &second_best_ref_mv->as_mv,
-                                      mvjcost, mvcost, 102);
+                                      mvjcost, mvcost, MV_COST_WEIGHT_SUB);
       }
       break;
     case NEARESTMV:
@@ -1502,7 +1512,8 @@
   if (has_second_rf)
     mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
 
-  x->partition_info->bmi[i].mode = m;
+  mic->bmi[i].as_mode = m;
+
   for (idy = 0; idy < num_4x4_blocks_high; ++idy)
     for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
       vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
@@ -1649,7 +1660,7 @@
                                     BEST_SEG_INFO *bsi_buf, int filter_idx,
                                     int_mv seg_mvs[4][MAX_REF_FRAMES],
                                     int mi_row, int mi_col) {
-  int i, j, br = 0, idx, idy;
+  int i, br = 0, idx, idy;
   int64_t bd = 0, block_sse = 0;
   MB_PREDICTION_MODE this_mode;
   MODE_INFO *mi = x->e_mbd.mi_8x8[0];
@@ -2011,15 +2022,6 @@
         bsi->segment_rd = INT64_MAX;
         return;
       }
-
-      for (j = 1; j < num_4x4_blocks_high; ++j)
-        vpx_memcpy(&x->partition_info->bmi[i + j * 2],
-                   &x->partition_info->bmi[i],
-                   sizeof(x->partition_info->bmi[i]));
-      for (j = 1; j < num_4x4_blocks_wide; ++j)
-        vpx_memcpy(&x->partition_info->bmi[i + j],
-                   &x->partition_info->bmi[i],
-                   sizeof(x->partition_info->bmi[i]));
     }
   } /* for each label */
 
@@ -2031,7 +2033,7 @@
 
   // update the coding decisions
   for (i = 0; i < 4; ++i)
-    bsi->modes[i] = x->partition_info->bmi[i].mode;
+    bsi->modes[i] = mi->bmi[i].as_mode;
 }
 
 static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
@@ -2076,7 +2078,7 @@
     if (has_second_ref(mbmi))
       mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
     xd->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
-    x->partition_info->bmi[i].mode = bsi->modes[i];
+    mi->bmi[i].as_mode = bsi->modes[i];
   }
 
   /*
@@ -2209,7 +2211,6 @@
 
 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
                          int mode_index,
-                         PARTITION_INFO *partition,
                          int_mv *ref_mv,
                          int_mv *second_ref_mv,
                          int64_t comp_pred_diff[NB_PREDICTION_TYPES],
@@ -2223,9 +2224,6 @@
   ctx->best_mode_index = mode_index;
   ctx->mic = *xd->this_mi;
 
-  if (partition)
-    ctx->partition_info = *partition;
-
   ctx->best_ref_mv.as_int = ref_mv->as_int;
   ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
 
@@ -2457,7 +2455,7 @@
                                  &dis, &sse);
   }
   *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv,
-                             x->nmvjointcost, x->mvcost, 96);
+                             x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
 
   if (cpi->sf.adaptive_motion_search && cpi->common.show_frame)
     x->pred_mv[ref].as_int = tmp_mv->as_int;
@@ -2618,10 +2616,10 @@
   }
   *rate_mv  = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
                               &mbmi->ref_mvs[refs[0]][0].as_mv,
-                              x->nmvjointcost, x->mvcost, 96);
+                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
   *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
                               &mbmi->ref_mvs[refs[1]][0].as_mv,
-                              x->nmvjointcost, x->mvcost, 96);
+                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
 
   vpx_free(second_pred);
 }
@@ -2674,10 +2672,10 @@
       } else {
         rate_mv  = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
                                    &mbmi->ref_mvs[refs[0]][0].as_mv,
-                                   x->nmvjointcost, x->mvcost, 96);
+                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
         rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
                                    &mbmi->ref_mvs[refs[1]][0].as_mv,
-                                   x->nmvjointcost, x->mvcost, 96);
+                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
       }
       if (frame_mv[refs[0]].as_int == INVALID_MV ||
           frame_mv[refs[1]].as_int == INVALID_MV)
@@ -3015,7 +3013,7 @@
     rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
     rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
 
-    super_block_uvrd(cm, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
+    super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
                      bsize, ref_best_rd - rdcosty);
     if (*rate_uv == INT_MAX) {
       *rate2 = INT_MAX;
@@ -3645,10 +3643,17 @@
         // values, which actually are bigger than this_rd itself. This can
         // cause negative best_filter_rd[] values, which is obviously silly.
         // Therefore, if filter_cache < ref, we do an adjusted calculation.
-        if (cpi->rd_filter_cache[i] >= ref)
+        if (cpi->rd_filter_cache[i] >= ref) {
           adj_rd = this_rd + cpi->rd_filter_cache[i] - ref;
-        else  // FIXME(rbultje) do this for comppred also
-          adj_rd = this_rd - (ref - cpi->rd_filter_cache[i]) * this_rd / ref;
+        } else {
+          // FIXME(rbultje) do this for comppred also
+          //
+          // To prevent out-of-range computation in
+          //    adj_rd = cpi->rd_filter_cache[i] * this_rd / ref
+          // cpi->rd_filter_cache[i] / ref is converted to a 256-based ratio.
+          int tmp = cpi->rd_filter_cache[i] * 256 / ref;
+          adj_rd = (this_rd * tmp) >> 8;
+        }
         best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
       }
     }
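
The scaled-ratio form above exists purely to keep the intermediate product inside int64_t. A self-contained sketch with invented magnitudes shows the difference:

/* Sketch only: rd values here are made up to illustrate the overflow.
 * cache * this_rd directly would exceed INT64_MAX (~9.2e18), while the
 * 256-based ratio keeps every intermediate well inside int64_t. */
#include <inttypes.h>
#include <stdio.h>

int main(void) {
  const int64_t cache   = 3000000000LL;  /* stands in for rd_filter_cache[i] */
  const int64_t ref     = 4000000000LL;  /* cache < ref in this branch */
  const int64_t this_rd = 5000000000LL;
  const int64_t tmp = cache * 256 / ref;      /* ratio scaled to [0, 256] */
  const int64_t adj_rd = (this_rd * tmp) >> 8;
  printf("tmp = %" PRId64 ", adj_rd = %" PRId64 "\n", tmp, adj_rd);
  return 0;
}
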
@@ -3734,9 +3739,9 @@
       } else {
         cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC;
         if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
-            (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT)) {
+            (cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT)) {
           cpi->rd_thresh_freq_fact[bsize][mode_index] =
-            cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT;
+            cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT;
         }
       }
     }
@@ -3783,7 +3788,6 @@
   set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
                     scale_factor);
   store_coding_context(x, ctx, best_mode_index,
-                       NULL,
                        &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
                        &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
                                       mbmi->ref_frame[1]][0],
@@ -3842,7 +3846,6 @@
                                              cpi->common.y_dc_delta_q);
   int_mv seg_mvs[4][MAX_REF_FRAMES];
   b_mode_info best_bmodes[4];
-  PARTITION_INFO best_partition;
   int best_skip2 = 0;
   unsigned char best_zcoeff_blk[256] = { 0 };
 
@@ -3910,6 +3913,32 @@
     ref_frame = vp9_ref_order[mode_index].ref_frame;
     second_ref_frame = vp9_ref_order[mode_index].second_ref_frame;
 
+    // Look at the reference frame of the best mode so far and set the
+    // skip mask to look at a subset of the remaining modes.
+    if (mode_index > 2 && cpi->sf.mode_skip_start < MAX_MODES) {
+      if (mode_index == 3) {
+        switch (vp9_ref_order[best_mode_index].ref_frame) {
+          case INTRA_FRAME:
+            cpi->mode_skip_mask = 0;
+            break;
+          case LAST_FRAME:
+            cpi->mode_skip_mask = 0x0010;
+            break;
+          case GOLDEN_FRAME:
+            cpi->mode_skip_mask = 0x0008;
+            break;
+          case ALTREF_FRAME:
+            cpi->mode_skip_mask = 0x0000;
+            break;
+          case NONE:
+          case MAX_REF_FRAMES:
+            assert(!"Invalid Reference frame");
+        }
+      }
+      if (cpi->mode_skip_mask & ((int64_t)1 << mode_index))
+        continue;
+    }
+
     // Skip if the current reference frame has been masked off
     if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
         (cpi->ref_frame_mask & (1 << ref_frame)))
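
For reference, the mask is consumed as one bit per mode_index. A toy loop with an invented mask value shows the gating; the mode count and ordering here are not the encoder's real mode table:

/* Toy illustration of the bitmask gating above (values hypothetical). */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int64_t mode_skip_mask = 0x0010;  /* as set when best ref == LAST */
  int mode_index;
  for (mode_index = 3; mode_index < 8; ++mode_index) {
    if (mode_skip_mask & ((int64_t)1 << mode_index)) {
      printf("mode %d skipped\n", mode_index);   /* mode 4 here */
      continue;
    }
    printf("mode %d evaluated\n", mode_index);
  }
  return 0;
}
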
@@ -4058,7 +4087,6 @@
                              &mbmi->ref_mvs[second_ref_frame][0] : NULL;
       b_mode_info tmp_best_bmodes[16];
       MB_MODE_INFO tmp_best_mbmode;
-      PARTITION_INFO tmp_best_partition;
       BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
       int pred_exists = 0;
       int uv_skippable;
@@ -4122,7 +4150,6 @@
               tmp_best_sse = total_sse;
               tmp_best_skippable = skippable;
               tmp_best_mbmode = *mbmi;
-              tmp_best_partition = *x->partition_info;
               for (i = 0; i < 4; i++)
                 tmp_best_bmodes[i] = xd->this_mi->bmi[i];
               pred_exists = 1;
@@ -4174,7 +4201,6 @@
         distortion = tmp_best_distortion;
         skippable = tmp_best_skippable;
         *mbmi = tmp_best_mbmode;
-        *x->partition_info = tmp_best_partition;
         for (i = 0; i < 4; i++)
           xd->this_mi->bmi[i] = tmp_best_bmodes[i];
       }
@@ -4202,7 +4228,7 @@
         // then dont bother looking at UV
         vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
                                         BLOCK_8X8);
-        super_block_uvrd(cm, x, &rate_uv, &distortion_uv, &uv_skippable,
+        super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
                          &uv_sse, BLOCK_8X8, tmp_best_rdu);
         if (rate_uv == INT_MAX)
           continue;
@@ -4302,7 +4328,6 @@
                    RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
         best_mbmode = *mbmi;
         best_skip2 = this_skip2;
-        best_partition = *x->partition_info;
         vpx_memcpy(best_zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
                    sizeof(best_zcoeff_blk));
 
@@ -4445,9 +4470,9 @@
       } else {
         cpi->rd_thresh_freq_sub8x8[bsize][mode_index] += RD_THRESH_INC;
         if (cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >
-            (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT)) {
+            (cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT)) {
           cpi->rd_thresh_freq_sub8x8[bsize][mode_index] =
-            cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT;
+            cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT;
         }
       }
     }
@@ -4460,15 +4485,8 @@
     for (i = 0; i < 4; i++)
       xd->this_mi->bmi[i].as_mode = best_bmodes[i].as_mode;
   } else {
-    for (i = 0; i < 4; i++)
-      xd->this_mi->bmi[i].as_mv[0].as_int =
-          best_bmodes[i].as_mv[0].as_int;
-
-    if (has_second_ref(mbmi))
-      for (i = 0; i < 4; i++)
-        xd->this_mi->bmi[i].as_mv[1].as_int = best_bmodes[i].as_mv[1].as_int;
-
-    *x->partition_info = best_partition;
+    for (i = 0; i < 4; ++i)
+      vpx_memcpy(&xd->this_mi->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
 
     mbmi->mv[0].as_int = xd->this_mi->bmi[3].as_mv[0].as_int;
     mbmi->mv[1].as_int = xd->this_mi->bmi[3].as_mv[1].as_int;
@@ -4511,7 +4529,6 @@
   set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
                     scale_factor);
   store_coding_context(x, ctx, best_mode_index,
-                       &best_partition,
                        &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
                        &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
                                       mbmi->ref_frame[1]][0],
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index c86ea27..aa4068d 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -12,8 +12,10 @@
 #ifndef VP9_ENCODER_VP9_RDOPT_H_
 #define VP9_ENCODER_VP9_RDOPT_H_
 
+#define RDDIV_BITS          7
+
 #define RDCOST(RM, DM, R, D) \
-  (((128 + ((int64_t)R) * (RM)) >> 8) + ((int64_t)DM) * (D))
+  (((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM))
 #define QIDX_SKIP_THRESH     115
 
 void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex);
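
With RDDIV now expressed in bits, the distortion term in RDCOST becomes a shift: RDDIV_BITS = 7 scales D by 128 instead of multiplying by the old RDDIV = 100. A small sketch with invented rate/distortion inputs compares the two forms:

/* Sketch only: invented rate/distortion values, comparing the previous
 * RDDIV = 100 multiply against the new RDDIV_BITS = 7 shift (x128). */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int rdmult = 1000, rate = 300;
  const int64_t dist = 2500;
  const int64_t rate_term = (128 + (int64_t)rate * rdmult) >> 8;
  const int64_t old_cost = rate_term + 100 * dist;   /* old: DM * D  */
  const int64_t new_cost = rate_term + (dist << 7);  /* new: D << DM */
  printf("old = %lld, new = %lld\n", (long long)old_cost, (long long)new_cost);
  return 0;
}
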
diff --git a/vp9/encoder/vp9_ssim.c b/vp9/encoder/vp9_ssim.c
index c155516..a5f18e6 100644
--- a/vp9/encoder/vp9_ssim.c
+++ b/vp9/encoder/vp9_ssim.c
@@ -42,8 +42,8 @@
   }
 }
 
-const static int64_t cc1 =  26634; // (64^2*(.01*255)^2
-const static int64_t cc2 = 239708; // (64^2*(.03*255)^2
+static const int64_t cc1 =  26634;  // (64^2*(.01*255)^2)
+static const int64_t cc2 = 239708;  // (64^2*(.03*255)^2)
 
 static double similarity(unsigned long sum_s, unsigned long sum_r,
                          unsigned long sum_sq_s, unsigned long sum_sq_r,
diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c
index 667b801..eb864d9 100644
--- a/vp9/encoder/vp9_subexp.c
+++ b/vp9/encoder/vp9_subexp.c
@@ -221,7 +221,8 @@
 }
 
 void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
-                               vp9_prob upd, unsigned int *ct) {
+                               unsigned int *ct) {
+  const vp9_prob upd = DIFF_UPDATE_PROB;
   vp9_prob newp = get_binary_prob(ct[0], ct[1]);
   const int savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp,
                                                           upd);
diff --git a/vp9/encoder/vp9_subexp.h b/vp9/encoder/vp9_subexp.h
index 7acdaf6..521c777 100644
--- a/vp9/encoder/vp9_subexp.h
+++ b/vp9/encoder/vp9_subexp.h
@@ -19,7 +19,7 @@
                                 vp9_prob newp, vp9_prob oldp);
 
 void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
-                               vp9_prob upd, unsigned int *ct);
+                               unsigned int *ct);
 
 int vp9_prob_diff_update_savings_search(const unsigned int *ct,
                                         vp9_prob oldp, vp9_prob *bestp,
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 4e095f2..08745b0 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -29,9 +29,6 @@
 extern vp9_coeff_stats tree_update_hist[TX_SIZES][BLOCK_TYPES];
 #endif  /* ENTROPY_STATS */
 
-DECLARE_ALIGNED(16, extern const uint8_t,
-                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
-
 static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
 const TOKENVALUE *vp9_dct_value_tokens_ptr;
 static int dct_value_cost[DCT_MAX_VALUE * 2];
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 6e686d6..61031e0 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -67,12 +67,6 @@
                                                    unsigned int *sse,
                                                    const uint8_t *second_pred);
 
-typedef void (*vp9_ssimpf_fn_t)(uint8_t *s, int sp, uint8_t *r,
-                                int rp, unsigned long *sum_s,
-                                unsigned long *sum_r, unsigned long *sum_sq_s,
-                                unsigned long *sum_sq_r,
-                                unsigned long *sum_sxr);
-
 typedef unsigned int (*vp9_getmbss_fn_t)(const short *);
 
 typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr,
diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2.c b/vp9/encoder/x86/vp9_dct32x32_sse2.c
index 95ae266..11eec7f 100644
--- a/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -27,24 +27,6 @@
   __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
   return _mm_unpacklo_epi64(buf0, buf1);
 }
-
-static INLINE __m128i k_cvtlo_epi16(__m128i a, __m128i mask16, __m128i kZero) {
-  // convert the lower 4 signed 16-bit integers into 4 signed 32-bit integers
-  __m128i sign_bit = _mm_and_si128(a, mask16);
-  __m128i b = _mm_unpacklo_epi16(a, kZero);
-  sign_bit = _mm_cmplt_epi16(sign_bit, kZero);
-  sign_bit = _mm_unpacklo_epi16(kZero, sign_bit);
-  return _mm_or_si128(sign_bit, b);
-}
-
-static INLINE __m128i k_cvthi_epi16(__m128i a, __m128i mask16, __m128i kZero) {
-  // convert the lower 4 signed 16-bit integers into 4 signed 32-bit integers
-  __m128i sign_bit = _mm_and_si128(a, mask16);
-  __m128i b = _mm_unpackhi_epi16(a, kZero);
-  sign_bit = _mm_cmplt_epi16(sign_bit, kZero);
-  sign_bit = _mm_unpackhi_epi16(kZero, sign_bit);
-  return _mm_or_si128(sign_bit, b);
-}
 #endif
 
 void FDCT32x32_2D(int16_t *input,
@@ -1159,28 +1141,43 @@
       } else {
         __m128i lstep1[64], lstep2[64], lstep3[64];
         __m128i u[32], v[32], sign[16];
-        const __m128i mask16 = _mm_set1_epi32(0x80008000);
         const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
         // start using 32-bit operations
         // stage 3
         {
           // expanding to 32-bit length prior to addition operations
-          lstep2[ 0] = k_cvtlo_epi16(step2[ 0], mask16, kZero);
-          lstep2[ 1] = k_cvthi_epi16(step2[ 0], mask16, kZero);
-          lstep2[ 2] = k_cvtlo_epi16(step2[ 1], mask16, kZero);
-          lstep2[ 3] = k_cvthi_epi16(step2[ 1], mask16, kZero);
-          lstep2[ 4] = k_cvtlo_epi16(step2[ 2], mask16, kZero);
-          lstep2[ 5] = k_cvthi_epi16(step2[ 2], mask16, kZero);
-          lstep2[ 6] = k_cvtlo_epi16(step2[ 3], mask16, kZero);
-          lstep2[ 7] = k_cvthi_epi16(step2[ 3], mask16, kZero);
-          lstep2[ 8] = k_cvtlo_epi16(step2[ 4], mask16, kZero);
-          lstep2[ 9] = k_cvthi_epi16(step2[ 4], mask16, kZero);
-          lstep2[10] = k_cvtlo_epi16(step2[ 5], mask16, kZero);
-          lstep2[11] = k_cvthi_epi16(step2[ 5], mask16, kZero);
-          lstep2[12] = k_cvtlo_epi16(step2[ 6], mask16, kZero);
-          lstep2[13] = k_cvthi_epi16(step2[ 6], mask16, kZero);
-          lstep2[14] = k_cvtlo_epi16(step2[ 7], mask16, kZero);
-          lstep2[15] = k_cvthi_epi16(step2[ 7], mask16, kZero);
+          lstep2[ 0] = _mm_unpacklo_epi16(step2[ 0], kZero);
+          lstep2[ 1] = _mm_unpackhi_epi16(step2[ 0], kZero);
+          lstep2[ 2] = _mm_unpacklo_epi16(step2[ 1], kZero);
+          lstep2[ 3] = _mm_unpackhi_epi16(step2[ 1], kZero);
+          lstep2[ 4] = _mm_unpacklo_epi16(step2[ 2], kZero);
+          lstep2[ 5] = _mm_unpackhi_epi16(step2[ 2], kZero);
+          lstep2[ 6] = _mm_unpacklo_epi16(step2[ 3], kZero);
+          lstep2[ 7] = _mm_unpackhi_epi16(step2[ 3], kZero);
+          lstep2[ 8] = _mm_unpacklo_epi16(step2[ 4], kZero);
+          lstep2[ 9] = _mm_unpackhi_epi16(step2[ 4], kZero);
+          lstep2[10] = _mm_unpacklo_epi16(step2[ 5], kZero);
+          lstep2[11] = _mm_unpackhi_epi16(step2[ 5], kZero);
+          lstep2[12] = _mm_unpacklo_epi16(step2[ 6], kZero);
+          lstep2[13] = _mm_unpackhi_epi16(step2[ 6], kZero);
+          lstep2[14] = _mm_unpacklo_epi16(step2[ 7], kZero);
+          lstep2[15] = _mm_unpackhi_epi16(step2[ 7], kZero);
+          lstep2[ 0] = _mm_madd_epi16(lstep2[ 0], kOne);
+          lstep2[ 1] = _mm_madd_epi16(lstep2[ 1], kOne);
+          lstep2[ 2] = _mm_madd_epi16(lstep2[ 2], kOne);
+          lstep2[ 3] = _mm_madd_epi16(lstep2[ 3], kOne);
+          lstep2[ 4] = _mm_madd_epi16(lstep2[ 4], kOne);
+          lstep2[ 5] = _mm_madd_epi16(lstep2[ 5], kOne);
+          lstep2[ 6] = _mm_madd_epi16(lstep2[ 6], kOne);
+          lstep2[ 7] = _mm_madd_epi16(lstep2[ 7], kOne);
+          lstep2[ 8] = _mm_madd_epi16(lstep2[ 8], kOne);
+          lstep2[ 9] = _mm_madd_epi16(lstep2[ 9], kOne);
+          lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
+          lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
+          lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
+          lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
+          lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
+          lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
 
           lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]);
           lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]);
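
The unpack-with-zero followed by _mm_madd_epi16 against kOne is a standard SSE2 sign-extension idiom, replacing the explicit sign-mask work the removed k_cvtlo_epi16/k_cvthi_epi16 helpers did. A minimal demonstration (kZero/kOne defined locally here):

/* Demonstration of the sign-extension idiom used above: unpack against
 * zero, then madd by 1 so each 32-bit lane ends up sign-extended. */
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  const __m128i kZero = _mm_setzero_si128();
  const __m128i kOne  = _mm_set1_epi16(1);
  const __m128i v = _mm_setr_epi16(-3, 7, -32768, 32767, 0, 0, 0, 0);
  __m128i lo = _mm_unpacklo_epi16(v, kZero);  /* low 16 bits hold the value */
  lo = _mm_madd_epi16(lo, kOne);              /* reinterpret low 16 as signed */
  int out[4];
  _mm_storeu_si128((__m128i *)out, lo);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  /* -3 7 -32768 32767 */
  return 0;
}
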
@@ -1231,42 +1228,75 @@
           lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
         }
         {
-          lstep2[40] = k_cvtlo_epi16(step2[20], mask16, kZero);
-          lstep2[41] = k_cvthi_epi16(step2[20], mask16, kZero);
-          lstep2[42] = k_cvtlo_epi16(step2[21], mask16, kZero);
-          lstep2[43] = k_cvthi_epi16(step2[21], mask16, kZero);
-          lstep2[44] = k_cvtlo_epi16(step2[22], mask16, kZero);
-          lstep2[45] = k_cvthi_epi16(step2[22], mask16, kZero);
-          lstep2[46] = k_cvtlo_epi16(step2[23], mask16, kZero);
-          lstep2[47] = k_cvthi_epi16(step2[23], mask16, kZero);
-          lstep2[48] = k_cvtlo_epi16(step2[24], mask16, kZero);
-          lstep2[49] = k_cvthi_epi16(step2[24], mask16, kZero);
-          lstep2[50] = k_cvtlo_epi16(step2[25], mask16, kZero);
-          lstep2[51] = k_cvthi_epi16(step2[25], mask16, kZero);
-          lstep2[52] = k_cvtlo_epi16(step2[26], mask16, kZero);
-          lstep2[53] = k_cvthi_epi16(step2[26], mask16, kZero);
-          lstep2[54] = k_cvtlo_epi16(step2[27], mask16, kZero);
-          lstep2[55] = k_cvthi_epi16(step2[27], mask16, kZero);
+          lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
+          lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
+          lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
+          lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
+          lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
+          lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
+          lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
+          lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
+          lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
+          lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
+          lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
+          lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
+          lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
+          lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
+          lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
+          lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
+          lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
+          lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
+          lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
+          lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
+          lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
+          lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
+          lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
+          lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
+          lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
+          lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
+          lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
+          lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
+          lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
+          lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
+          lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
+          lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
 
-          lstep1[32] = k_cvtlo_epi16(step1[16], mask16, kZero);
-          lstep1[33] = k_cvthi_epi16(step1[16], mask16, kZero);
-          lstep1[34] = k_cvtlo_epi16(step1[17], mask16, kZero);
-          lstep1[35] = k_cvthi_epi16(step1[17], mask16, kZero);
-          lstep1[36] = k_cvtlo_epi16(step1[18], mask16, kZero);
-          lstep1[37] = k_cvthi_epi16(step1[18], mask16, kZero);
-          lstep1[38] = k_cvtlo_epi16(step1[19], mask16, kZero);
-          lstep1[39] = k_cvthi_epi16(step1[19], mask16, kZero);
-          lstep1[56] = k_cvtlo_epi16(step1[28], mask16, kZero);
-          lstep1[57] = k_cvthi_epi16(step1[28], mask16, kZero);
-          lstep1[58] = k_cvtlo_epi16(step1[29], mask16, kZero);
-          lstep1[59] = k_cvthi_epi16(step1[29], mask16, kZero);
-          lstep1[60] = k_cvtlo_epi16(step1[30], mask16, kZero);
-          lstep1[61] = k_cvthi_epi16(step1[30], mask16, kZero);
-          lstep1[62] = k_cvtlo_epi16(step1[31], mask16, kZero);
-          lstep1[63] = k_cvthi_epi16(step1[31], mask16, kZero);
+          lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
+          lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
+          lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
+          lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
+          lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
+          lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
+          lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
+          lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
+          lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
+          lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
+          lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
+          lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
+          lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
+          lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
+          lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
+          lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
+          lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
+          lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
+          lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
+          lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
+          lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
+          lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
+          lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
+          lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
+          lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
+          lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
+          lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
+          lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
+          lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
+          lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
+          lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
+          lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
 
           lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
           lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
+
           lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
           lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
           lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
@@ -1302,14 +1332,22 @@
         // stage 4
         {
           // expanding to 32-bit length prior to addition operations
-          lstep2[16] = k_cvtlo_epi16(step2[ 8], mask16, kZero);
-          lstep2[17] = k_cvthi_epi16(step2[ 8], mask16, kZero);
-          lstep2[18] = k_cvtlo_epi16(step2[ 9], mask16, kZero);
-          lstep2[19] = k_cvthi_epi16(step2[ 9], mask16, kZero);
-          lstep2[28] = k_cvtlo_epi16(step2[14], mask16, kZero);
-          lstep2[29] = k_cvthi_epi16(step2[14], mask16, kZero);
-          lstep2[30] = k_cvtlo_epi16(step2[15], mask16, kZero);
-          lstep2[31] = k_cvthi_epi16(step2[15], mask16, kZero);
+          lstep2[16] = _mm_unpacklo_epi16(step2[ 8], kZero);
+          lstep2[17] = _mm_unpackhi_epi16(step2[ 8], kZero);
+          lstep2[18] = _mm_unpacklo_epi16(step2[ 9], kZero);
+          lstep2[19] = _mm_unpackhi_epi16(step2[ 9], kZero);
+          lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
+          lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
+          lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
+          lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
+          lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
+          lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
+          lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
+          lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
+          lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
+          lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
+          lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
+          lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
 
           lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]);
           lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]);
@@ -1337,41 +1375,41 @@
           lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
         }
         {
-          // to be continued...
-          //
-          const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
-          const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+        // to be continued...
+        //
+        const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+        const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
 
-          u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
-          u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
-          u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
-          u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
+        u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
+        u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
+        u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
+        u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
 
-          // TODO(jingning): manually inline k_madd_epi32_ to further hide
-          // instruction latency.
-          v[ 0] = k_madd_epi32(u[0], k32_p16_m16);
-          v[ 1] = k_madd_epi32(u[1], k32_p16_m16);
-          v[ 2] = k_madd_epi32(u[2], k32_p16_m16);
-          v[ 3] = k_madd_epi32(u[3], k32_p16_m16);
-          v[ 4] = k_madd_epi32(u[0], k32_p16_p16);
-          v[ 5] = k_madd_epi32(u[1], k32_p16_p16);
-          v[ 6] = k_madd_epi32(u[2], k32_p16_p16);
-          v[ 7] = k_madd_epi32(u[3], k32_p16_p16);
+        // TODO(jingning): manually inline k_madd_epi32_ to further hide
+        // instruction latency.
+        v[ 0] = k_madd_epi32(u[0], k32_p16_m16);
+        v[ 1] = k_madd_epi32(u[1], k32_p16_m16);
+        v[ 2] = k_madd_epi32(u[2], k32_p16_m16);
+        v[ 3] = k_madd_epi32(u[3], k32_p16_m16);
+        v[ 4] = k_madd_epi32(u[0], k32_p16_p16);
+        v[ 5] = k_madd_epi32(u[1], k32_p16_p16);
+        v[ 6] = k_madd_epi32(u[2], k32_p16_p16);
+        v[ 7] = k_madd_epi32(u[3], k32_p16_p16);
 
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
+        u[0] = k_packs_epi64(v[0], v[1]);
+        u[1] = k_packs_epi64(v[2], v[3]);
+        u[2] = k_packs_epi64(v[4], v[5]);
+        u[3] = k_packs_epi64(v[6], v[7]);
 
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+        v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+        v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+        v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+        v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
 
-          lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+        lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+        lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+        lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+        lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
         }
         {
           const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
@@ -2647,4 +2685,4 @@
       }
     }
   }
-}
+}  // NOLINT
diff --git a/vp9/encoder/x86/vp9_variance_mmx.c b/vp9/encoder/x86/vp9_variance_mmx.c
index d141560..a3d0114 100644
--- a/vp9/encoder/x86/vp9_variance_mmx.c
+++ b/vp9/encoder/x86/vp9_variance_mmx.c
@@ -8,12 +8,12 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "vp9/encoder/vp9_variance.h"
 #include "vp9/common/vp9_pragmas.h"
 #include "vpx_ports/mem.h"
 
-extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr);
+extern unsigned int vp9_get_mb_ss_mmx(const int16_t *src_ptr);
 extern unsigned int vp9_get8x8var_mmx
 (
   const unsigned char *src_ptr,
@@ -45,7 +45,6 @@
   vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
   *sse = var;
   return (var - (((unsigned int)avg * avg) >> 4));
-
 }
 
 unsigned int vp9_variance8x8_mmx(
@@ -61,7 +60,6 @@
   *sse = var;
 
   return (var - (((unsigned int)avg * avg) >> 6));
-
 }
 
 unsigned int vp9_mse16x16_mmx(
@@ -74,10 +72,14 @@
   int sum0, sum1, sum2, sum3;
 
 
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+                    &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
+                    &sse1, &sum1);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
+                    ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride,
+                    ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
 
   var = sse0 + sse1 + sse2 + sse3;
   *sse = var;
@@ -94,11 +96,14 @@
   unsigned int sse0, sse1, sse2, sse3, var;
   int sum0, sum1, sum2, sum3, avg;
 
-
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+                    &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
+                    &sse1, &sum1);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
+                    ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride,
+                    ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
 
   var = sse0 + sse1 + sse2 + sse3;
   avg = sum0 + sum1 + sum2 + sum3;
@@ -115,14 +120,15 @@
   unsigned int sse0, sse1, var;
   int sum0, sum1, avg;
 
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+                    &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
+                    &sse1, &sum1);
 
   var = sse0 + sse1;
   avg = sum0 + sum1;
   *sse = var;
   return (var - (((unsigned int)avg * avg) >> 7));
-
 }
 
 
@@ -135,13 +141,14 @@
   unsigned int sse0, sse1, var;
   int sum0, sum1, avg;
 
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+                    &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
+                    ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
 
   var = sse0 + sse1;
   avg = sum0 + sum1;
   *sse = var;
 
   return (var - (((unsigned int)avg * avg) >> 7));
-
 }
diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c
index cea934d..79e42c4 100644
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_variance_sse2.c
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_config.h"
+#include "./vpx_config.h"
 
 #include "vp9/encoder/vp9_variance.h"
 #include "vp9/common/vp9_pragmas.h"
@@ -26,7 +26,7 @@
 
 unsigned int vp9_get_mb_ss_sse2
 (
-  const short *src_ptr
+  const int16_t *src_ptr
 );
 unsigned int vp9_get16x16var_sse2
 (
@@ -250,7 +250,6 @@
   const unsigned char *ref_ptr,
   int  recon_stride,
   unsigned int *sse) {
-
   unsigned int sse0;
   int sum0;
   vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
@@ -407,12 +406,12 @@
 FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
 FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
 FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
-FN(16,  8, 16, 4, 3, opt1,); \
-FN(8,  16,  8, 3, 4, opt1,); \
-FN(8,   8,  8, 3, 3, opt1,); \
-FN(8,   4,  8, 3, 2, opt1,); \
-FN(4,   8,  4, 2, 3, opt2,); \
-FN(4,   4,  4, 2, 2, opt2,)
+FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
+FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
+FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
+FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
+FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
+FN(4,   4,  4, 2, 2, opt2, (unsigned int))
 
 FNS(sse2, sse);
 FNS(ssse3, ssse3);
@@ -487,12 +486,12 @@
 FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
 FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
 FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
-FN(16,  8, 16, 4, 3, opt1,); \
-FN(8,  16,  8, 3, 4, opt1,); \
-FN(8,   8,  8, 3, 3, opt1,); \
-FN(8,   4,  8, 3, 2, opt1,); \
-FN(4,   8,  4, 2, 3, opt2,); \
-FN(4,   4,  4, 2, 2, opt2,)
+FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
+FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
+FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
+FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
+FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
+FN(4,   4,  4, 2, 2, opt2, (unsigned int))
 
 FNS(sse2, sse);
 FNS(ssse3, ssse3);
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 10fa461..af6e665 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -68,6 +68,8 @@
 VP9_COMMON_SRCS-yes += common/vp9_treecoder.c
 VP9_COMMON_SRCS-yes += common/vp9_common_data.c
 VP9_COMMON_SRCS-yes += common/vp9_common_data.h
+VP9_COMMON_SRCS-yes += common/vp9_scan.c
+VP9_COMMON_SRCS-yes += common/vp9_scan.h
 
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
@@ -75,6 +77,7 @@
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
@@ -89,6 +92,11 @@
 
 # common (c)
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_common_dspr2.h
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve2_avg_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve2_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve2_vert_dspr2.c
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve8_avg_dspr2.c
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve8_dspr2.c
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 157752a..810fdf5 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -8,30 +8,30 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <stdlib.h>
+#include <string.h>
 
 #include "vpx/vpx_codec.h"
 #include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_version.h"
+#include "./vpx_version.h"
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vpx/vp8cx.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/common/vp9_onyx.h"
 #include "vp9/vp9_iface_common.h"
-#include <stdlib.h>
-#include <string.h>
 
 struct vp9_extracfg {
   struct vpx_codec_pkt_list *pkt_list;
-  int                         cpu_used;                    /** available cpu percentage in 1/16*/
-  unsigned int                enable_auto_alt_ref;           /** if encoder decides to uses alternate reference frame */
+  int                         cpu_used;  /* available cpu percentage in 1/16 */
+  unsigned int                enable_auto_alt_ref;
   unsigned int                noise_sensitivity;
   unsigned int                Sharpness;
   unsigned int                static_thresh;
   unsigned int                tile_columns;
   unsigned int                tile_rows;
-  unsigned int                arnr_max_frames;    /* alt_ref Noise Reduction Max Frame Count */
-  unsigned int                arnr_strength;    /* alt_ref Noise Reduction Strength */
-  unsigned int                arnr_type;        /* alt_ref filter type */
+  unsigned int                arnr_max_frames;
+  unsigned int                arnr_strength;
+  unsigned int                arnr_type;
   unsigned int                experimental;
   vp8e_tuning                 tuning;
   unsigned int                cq_level;         /* constrained quality level */
@@ -48,7 +48,7 @@
 static const struct extraconfig_map extracfg_map[] = {
   {
     0,
-    {
+    { // NOLINT
       NULL,
       0,                          /* cpu_used      */
       1,                          /* enable_auto_alt_ref */
@@ -85,7 +85,7 @@
   uint32_t                pending_frame_magnitude;
   vpx_image_t             preview_img;
   vp8_postproc_cfg_t      preview_ppcfg;
-  vpx_codec_pkt_list_decl(64) pkt_list;              // changed to accomendate the maximum number of lagged frames allowed
+  vpx_codec_pkt_list_decl(64) pkt_list;
   unsigned int                fixed_kf_cntr;
 };
 
@@ -120,26 +120,26 @@
 #define ERROR(str) do {\
     ctx->base.err_detail = str;\
     return VPX_CODEC_INVALID_PARAM;\
-  } while(0)
+  } while (0)
 
-#define RANGE_CHECK(p,memb,lo,hi) do {\
-    if(!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
+#define RANGE_CHECK(p, memb, lo, hi) do {\
+    if (!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
       ERROR(#memb " out of range ["#lo".."#hi"]");\
-  } while(0)
+  } while (0)
 
-#define RANGE_CHECK_HI(p,memb,hi) do {\
-    if(!((p)->memb <= (hi))) \
+#define RANGE_CHECK_HI(p, memb, hi) do {\
+    if (!((p)->memb <= (hi))) \
       ERROR(#memb " out of range [.."#hi"]");\
-  } while(0)
+  } while (0)
 
-#define RANGE_CHECK_LO(p,memb,lo) do {\
-    if(!((p)->memb >= (lo))) \
+#define RANGE_CHECK_LO(p, memb, lo) do {\
+    if (!((p)->memb >= (lo))) \
       ERROR(#memb " out of range ["#lo"..]");\
-  } while(0)
+  } while (0)
 
-#define RANGE_CHECK_BOOL(p,memb) do {\
-    if(!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
-  } while(0)
+#define RANGE_CHECK_BOOL(p, memb) do {\
+    if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
+  } while (0)
 
 static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
                                        const vpx_codec_enc_cfg_t *cfg,
@@ -247,7 +247,8 @@
   oxcf->width   = cfg.g_w;
   oxcf->height  = cfg.g_h;
   /* guess a frame rate if out of whack, use 30 */
-  oxcf->framerate             = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num);
+  oxcf->framerate = (double)(cfg.g_timebase.den)
+                    / (double)(cfg.g_timebase.num);
 
   if (oxcf->framerate > 180) {
     oxcf->framerate = 30;
@@ -266,11 +267,11 @@
   }
 
   if (cfg.g_pass == VPX_RC_FIRST_PASS) {
-    oxcf->allow_lag              = 0;
-    oxcf->lag_in_frames           = 0;
+    oxcf->allow_lag     = 0;
+    oxcf->lag_in_frames = 0;
   } else {
-    oxcf->allow_lag              = (cfg.g_lag_in_frames) > 0;
-    oxcf->lag_in_frames           = cfg.g_lag_in_frames;
+    oxcf->allow_lag     = (cfg.g_lag_in_frames) > 0;
+    oxcf->lag_in_frames = cfg.g_lag_in_frames;
   }
 
   // VBR only supported for now.
@@ -282,7 +283,7 @@
   else if (cfg.rc_end_usage == VPX_Q)
     oxcf->end_usage      = USAGE_CONSTANT_QUALITY;
 
-  oxcf->target_bandwidth        = cfg.rc_target_bitrate;
+  oxcf->target_bandwidth         = cfg.rc_target_bitrate;
   oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
 
   oxcf->best_allowed_q          = cfg.rc_min_quantizer;
@@ -297,7 +298,7 @@
   oxcf->starting_buffer_level   = cfg.rc_buf_initial_sz;
   oxcf->optimal_buffer_level    = cfg.rc_buf_optimal_sz;
 
-  oxcf->two_pass_vbrbias        = cfg.rc_2pass_vbr_bias_pct;
+  oxcf->two_pass_vbrbias         = cfg.rc_2pass_vbr_bias_pct;
   oxcf->two_pass_vbrmin_section  = cfg.rc_2pass_vbr_minsection_pct;
   oxcf->two_pass_vbrmax_section  = cfg.rc_2pass_vbr_maxsection_pct;
 
@@ -313,23 +314,23 @@
   oxcf->encode_breakout        =  vp8_cfg.static_thresh;
   oxcf->play_alternate         =  vp8_cfg.enable_auto_alt_ref;
   oxcf->noise_sensitivity      =  vp8_cfg.noise_sensitivity;
-  oxcf->Sharpness             =  vp8_cfg.Sharpness;
+  oxcf->Sharpness              =  vp8_cfg.Sharpness;
 
-  oxcf->two_pass_stats_in        =  cfg.rc_twopass_stats_in;
-  oxcf->output_pkt_list         =  vp8_cfg.pkt_list;
+  oxcf->two_pass_stats_in      =  cfg.rc_twopass_stats_in;
+  oxcf->output_pkt_list        =  vp8_cfg.pkt_list;
 
   oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames;
-  oxcf->arnr_strength =  vp8_cfg.arnr_strength;
-  oxcf->arnr_type =      vp8_cfg.arnr_type;
+  oxcf->arnr_strength   = vp8_cfg.arnr_strength;
+  oxcf->arnr_type       = vp8_cfg.arnr_type;
 
   oxcf->tuning = vp8_cfg.tuning;
 
   oxcf->tile_columns = vp8_cfg.tile_columns;
-  oxcf->tile_rows = vp8_cfg.tile_rows;
+  oxcf->tile_rows    = vp8_cfg.tile_rows;
 
   oxcf->lossless = vp8_cfg.lossless;
 
-  oxcf->error_resilient_mode = cfg.g_error_resilient;
+  oxcf->error_resilient_mode         = cfg.g_error_resilient;
   oxcf->frame_parallel_decoding_mode = vp8_cfg.frame_parallel_decoding_mode;
 
   oxcf->ss_number_layers = cfg.ss_number_layers;
@@ -498,7 +499,7 @@
      */
     for (i = 0;
          extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
-         i++);
+         i++) {}
 
     priv->vp8_cfg = extracfg_map[i].cfg;
     priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
@@ -553,7 +554,6 @@
 
 
 static vpx_codec_err_t vp9e_destroy(vpx_codec_alg_priv_t *ctx) {
-
   free(ctx->cx_data);
   vp9_remove_compressor(&ctx->cpi);
   free(ctx);
@@ -712,8 +712,10 @@
     lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
 
     /* vp8 use 10,000,000 ticks/second as time stamp */
-    dst_time_stamp    = pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
-    dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
+    dst_time_stamp = pts * 10000000 * ctx->cfg.g_timebase.num
+                     / ctx->cfg.g_timebase.den;
+    dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num
+                         / ctx->cfg.g_timebase.den;
 
     if (img != NULL) {
       res = image2yuvconfig(img, &sd);
@@ -839,8 +841,6 @@
           cx_data += size;
           cx_data_sz -= size;
         }
-
-        // printf("timestamp: %lld, duration: %d\n", pkt->data.frame.pts, pkt->data.frame.duration);
       }
     }
   }
@@ -867,15 +867,14 @@
     vp9_set_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type),
                           &sd);
     return VPX_CODEC_OK;
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
-
+  }
 }
 
 static vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx,
                                            int ctr_id,
                                            va_list args) {
-
   vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
 
   if (data) {
@@ -886,8 +885,9 @@
     vp9_copy_reference_enc(ctx->cpi,
                            ref_frame_to_vp9_reframe(frame->frame_type), &sd);
     return VPX_CODEC_OK;
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
+  }
 }
 
 static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,
@@ -916,8 +916,9 @@
   if (data) {
     ctx->preview_ppcfg = *((vp8_postproc_cfg_t *)data);
     return VPX_CODEC_OK;
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
+  }
 #else
   (void)ctx;
   (void)ctr_id;
@@ -928,7 +929,6 @@
 
 
 static vpx_image_t *vp9e_get_preview(vpx_codec_alg_priv_t *ctx) {
-
   YV12_BUFFER_CONFIG sd;
   vp9_ppflags_t flags = {0};
 
@@ -941,8 +941,9 @@
   if (0 == vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags)) {
     yuvconfig2image(&ctx->preview_img, &sd, NULL);
     return &ctx->preview_img;
-  } else
+  } else {
     return NULL;
+  }
 }
 
 static vpx_codec_err_t vp9e_update_entropy(vpx_codec_alg_priv_t *ctx,
@@ -951,7 +952,6 @@
   int update = va_arg(args, int);
   vp9_update_entropy(ctx->cpi, update);
   return VPX_CODEC_OK;
-
 }
 
 static vpx_codec_err_t vp9e_update_reference(vpx_codec_alg_priv_t *ctx,
@@ -983,8 +983,9 @@
       return VPX_CODEC_OK;
     else
       return VPX_CODEC_INVALID_PARAM;
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
+  }
 }
 
 
@@ -994,21 +995,20 @@
   vpx_active_map_t *data = va_arg(args, vpx_active_map_t *);
 
   if (data) {
-
     vpx_active_map_t *map = (vpx_active_map_t *)data;
 
     if (!vp9_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols))
       return VPX_CODEC_OK;
     else
       return VPX_CODEC_INVALID_PARAM;
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
+  }
 }
 
 static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx,
                                           int ctr_id,
                                           va_list args) {
-
   vpx_scaling_mode_t *data =  va_arg(args, vpx_scaling_mode_t *);
 
   if (data) {
@@ -1019,10 +1019,12 @@
 
     if (!res) {
       return VPX_CODEC_OK;
-    } else
+    } else {
       return VPX_CODEC_INVALID_PARAM;
-  } else
+    }
+  } else {
     return VPX_CODEC_INVALID_PARAM;
+  }
 }
 
 static vpx_codec_err_t vp9e_set_width(vpx_codec_alg_priv_t *ctx, int ctr_id,
@@ -1128,7 +1130,7 @@
 static vpx_codec_enc_cfg_map_t vp9e_usage_cfg_map[] = {
   {
     0,
-    {
+    {  // NOLINT
       0,                  /* g_usage */
       0,                  /* g_threads */
       0,                  /* g_profile */
@@ -1197,13 +1199,13 @@
   vp9e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
   NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
   NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
-  {
+  {  // NOLINT
     NOT_IMPLEMENTED,    /* vpx_codec_peek_si_fn_t    peek_si; */
     NOT_IMPLEMENTED,    /* vpx_codec_get_si_fn_t     get_si; */
     NOT_IMPLEMENTED,    /* vpx_codec_decode_fn_t     decode; */
     NOT_IMPLEMENTED,    /* vpx_codec_frame_get_fn_t  frame_get; */
   },
-  {
+  {  // NOLINT
     vp9e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    peek_si; */
     vp9e_encode,        /* vpx_codec_encode_fn_t      encode; */
     vp9e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t   frame_get; */
@@ -1226,13 +1228,13 @@
   vp9e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
   NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
   NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
-  {
+  {  // NOLINT
     NOT_IMPLEMENTED,    /* vpx_codec_peek_si_fn_t    peek_si; */
     NOT_IMPLEMENTED,    /* vpx_codec_get_si_fn_t     get_si; */
     NOT_IMPLEMENTED,    /* vpx_codec_decode_fn_t     decode; */
     NOT_IMPLEMENTED,    /* vpx_codec_frame_get_fn_t  frame_get; */
   },
-  {
+  {  // NOLINT
     vp9e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    peek_si; */
     vp9e_encode,        /* vpx_codec_encode_fn_t      encode; */
     vp9e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t   frame_get; */
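The RANGE_CHECK macros in the vp9_cx_iface.c hunks above deserve a brief gloss: each body is wrapped in do { ... } while (0) so the macro behaves as a single statement after an unbraced if/else, and RANGE_CHECK spells its lower-bound test as "(p)->memb == lo || (p)->memb > (lo)" rather than ">= (lo)" so that an unsigned field checked against a zero lower bound does not provoke "comparison is always true" compiler warnings. A minimal caller sketch follows; the field names and bounds are illustrative assumptions, not taken from this patch:

    /* Hypothetical validate_config()-style caller; fields and bounds are
     * illustrative only.  ERROR() stores the message in ctx->base.err_detail
     * and returns VPX_CODEC_INVALID_PARAM on failure. */
    static vpx_codec_err_t validate_config_sketch(vpx_codec_alg_priv_t *ctx,
                                                  const vpx_codec_enc_cfg_t *cfg) {
      RANGE_CHECK(cfg, g_w, 1, 65535);            /* frame width, pixels */
      RANGE_CHECK(cfg, g_h, 1, 65535);            /* frame height, pixels */
      RANGE_CHECK_HI(cfg, rc_min_quantizer, 63);  /* q index upper bound */
      return VPX_CODEC_OK;
    }
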
diff --git a/vpxenc.c b/vpxenc.c
index 71cf01f..d7c6c0e 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -45,8 +45,8 @@
 #include "vpx_ports/vpx_timer.h"
 #include "tools_common.h"
 #include "y4minput.h"
-#include "libmkv/EbmlWriter.h"
-#include "libmkv/EbmlIDs.h"
+#include "third_party/libmkv/EbmlWriter.h"
+#include "third_party/libmkv/EbmlIDs.h"
 #include "third_party/libyuv/include/libyuv/scale.h"
 
 /* Need special handling of these functions on Windows */
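
For reference, the dst_time_stamp / dst_end_time_stamp expressions split across lines in the vp9_cx_iface.c hunk above rescale the caller-supplied timebase onto the encoder's fixed 10,000,000 ticks-per-second clock. A self-contained worked example, assuming a made-up 1/30-second timebase:

    #include <stdio.h>

    /* Assumed inputs (illustrative only): g_timebase = 1/30 (i.e. 30 fps),
     * pts = 90 timebase units (3 seconds), duration = 1 unit (one frame). */
    int main(void) {
      const long long num = 1, den = 30;  /* cfg.g_timebase.num / .den */
      const long long pts = 90;
      const long long duration = 1;

      const long long dst_time_stamp = pts * 10000000 * num / den;
      const long long dst_end_time_stamp =
          (pts + duration) * 10000000 * num / den;

      /* Prints "30000000 .. 30333333": 3.0 s to ~3.033 s in 10 MHz ticks. */
      printf("%lld .. %lld\n", dst_time_stamp, dst_end_time_stamp);
      return 0;
    }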