Merge "vp9_filter: make all filter tables static"
diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c
index c22a04a..5a60976 100644
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@@ -630,7 +630,8 @@
 
   if (svc_ctx.speed != -1)
     vpx_codec_control(&codec, VP8E_SET_CPUUSED, svc_ctx.speed);
-  vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, 0);
+  if (svc_ctx.threads)
+    vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1));
 
   // Encode frames
   while (!end_of_stream) {
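
Note on the hunk above: VP9E_SET_TILE_COLUMNS takes the log2 of the tile-column
count, so passing (svc_ctx.threads >> 1) gives 2 tile columns for 2 threads and
4 for 4 threads, letting the tile-based threading make use of the configured
threads. A minimal sketch of the same mapping; the helper name is an
assumption, not part of the patch:

    /* Hypothetical helper mirroring the expression added above: derive the
     * log2 tile-column value passed to VP9E_SET_TILE_COLUMNS from the
     * configured thread count. */
    static int tile_columns_log2(int threads) {
      return threads >> 1;  /* 2 threads -> 2^1 = 2 columns, 4 -> 2^2 = 4 */
    }
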
diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c
index da2a7cf..484deb5 100644
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -608,7 +608,6 @@
   // Real time parameters.
   cfg.rc_dropframe_thresh = strtol(argv[9], NULL, 0);
   cfg.rc_end_usage = VPX_CBR;
-  cfg.rc_resize_allowed = 0;
   cfg.rc_min_quantizer = 2;
   cfg.rc_max_quantizer = 56;
   if (strncmp(encoder->name, "vp9", 3) == 0)
@@ -619,6 +618,9 @@
   cfg.rc_buf_optimal_sz = 600;
   cfg.rc_buf_sz = 1000;
 
+  // Disable dynamic resizing by default.
+  cfg.rc_resize_allowed = 0;
+
   // Use 1 thread as default.
   cfg.g_threads = 1;
 
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index b37d8e3..66ca4bb 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -933,14 +933,15 @@
 INSTANTIATE_TEST_CASE_P(
     MSA, Trans16x16DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct16x16_c,
+        make_tuple(&vp9_fdct16x16_msa,
                    &vp9_idct16x16_256_add_msa, 0, VPX_BITS_8)));
 INSTANTIATE_TEST_CASE_P(
     MSA, Trans16x16HT,
     ::testing::Values(
-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_msa, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_msa, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_msa, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_msa, 3, VPX_BITS_8)));
+        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 3,
+                   VPX_BITS_8)));
 #endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
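
For reference, the third tuple element in the instantiations above selects the
hybrid-transform type. My reading of the vp9 TX_TYPE ordering is sketched
below; treat the exact names as an assumption rather than part of this change:

    /* Assumed TX_TYPE ordering behind the 0..3 values used by Trans16x16HT. */
    enum { DCT_DCT = 0, ADST_DCT = 1, DCT_ADST = 2, ADST_ADST = 3 };
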
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 267dfb8..25059a5 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -386,7 +386,9 @@
 INSTANTIATE_TEST_CASE_P(
     MSA, Trans32x32Test,
     ::testing::Values(
-        make_tuple(&vp9_fdct32x32_c,
-                   &vp9_idct32x32_1024_add_msa, 0, VPX_BITS_8)));
+        make_tuple(&vp9_fdct32x32_msa,
+                   &vp9_idct32x32_1024_add_msa, 0, VPX_BITS_8),
+        make_tuple(&vp9_fdct32x32_rd_msa,
+                   &vp9_idct32x32_1024_add_msa, 1, VPX_BITS_8)));
 #endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index c61ffff..352cde2 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -16,6 +16,7 @@
 
 #include "./vp9_rtcd.h"
 #include "test/acm_random.h"
+#include "test/clear_system_state.h"
 #include "test/md5_helper.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
@@ -66,6 +67,7 @@
     for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
       pred_funcs[k](src, kBPS, above, left);
     }
+    libvpx_test::ClearSystemState();
     vpx_usec_timer_mark(&timer);
     const int elapsed_time =
         static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
@@ -206,9 +208,12 @@
 #endif  // HAVE_DSPR2
 
 #if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred4, NULL, NULL, NULL, NULL,
-                vp9_v_predictor_4x4_neon, vp9_h_predictor_4x4_neon, NULL, NULL,
-                NULL, NULL, NULL, NULL, vp9_tm_predictor_4x4_neon)
+INTRA_PRED_TEST(NEON, TestIntraPred4, vp9_dc_predictor_4x4_neon,
+                vp9_dc_left_predictor_4x4_neon, vp9_dc_top_predictor_4x4_neon,
+                vp9_dc_128_predictor_4x4_neon, vp9_v_predictor_4x4_neon,
+                vp9_h_predictor_4x4_neon, vp9_d45_predictor_4x4_neon,
+                vp9_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
+                vp9_tm_predictor_4x4_neon)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
@@ -354,14 +359,18 @@
 #if HAVE_SSSE3
 INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL,
                 vp9_h_predictor_32x32_ssse3, vp9_d45_predictor_32x32_ssse3,
-                NULL, NULL, NULL, vp9_d207_predictor_32x32_ssse3,
-                vp9_d63_predictor_32x32_ssse3, NULL)
+                NULL, NULL, vp9_d153_predictor_32x32_ssse3,
+                vp9_d207_predictor_32x32_ssse3, vp9_d63_predictor_32x32_ssse3,
+                NULL)
 #endif  // HAVE_SSSE3
 
 #if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred32, NULL, NULL, NULL, NULL,
-                vp9_v_predictor_32x32_neon, vp9_h_predictor_32x32_neon, NULL,
-                NULL, NULL, NULL, NULL, NULL, vp9_tm_predictor_32x32_neon)
+INTRA_PRED_TEST(NEON, TestIntraPred32, vp9_dc_predictor_32x32_neon,
+                vp9_dc_left_predictor_32x32_neon,
+                vp9_dc_top_predictor_32x32_neon,
+                vp9_dc_128_predictor_32x32_neon, vp9_v_predictor_32x32_neon,
+                vp9_h_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL, NULL,
+                vp9_tm_predictor_32x32_neon)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
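
The ClearSystemState() call added above runs right after the timed loop. As far
as I recall from test/clear_system_state.h, it reduces to resetting MMX state
on x86 so the assembly predictors cannot leave the FPU unusable for later code;
a rough sketch of that effect (other targets would be a no-op):

    /* Approximate effect of libvpx_test::ClearSystemState() on x86 targets. */
    static void clear_system_state_sketch(void) {
    #if defined(__i386__) || defined(__x86_64__)
      __asm__ volatile("emms");  /* clear MMX state so x87 code works again */
    #endif
    }
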
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 2d17119..670fe09 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -1106,12 +1106,12 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2
 
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
 typedef SubpelVarianceTest<SubpixVarMxNFunc> VP8SubpelVarianceTest;
 
 TEST_P(VP8SubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(VP8SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
-#endif  // CONFIG_VP8
+#endif  // CONFIG_VP8_ENCODER
 
 #if CONFIG_VP9_ENCODER
 typedef SubpelVarianceTest<SubpixVarMxNFunc> VP9SubpelVarianceTest;
@@ -1160,7 +1160,7 @@
                       make_tuple(6, 5, subpel_variance64x32_c, 0),
                       make_tuple(6, 6, subpel_variance64x64_c, 0)));
 
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
 const SubpixVarMxNFunc vp8_subpel_variance16x16_c =
     vp8_sub_pixel_variance16x16_c;
 const SubpixVarMxNFunc vp8_subpel_variance16x8_c = vp8_sub_pixel_variance16x8_c;
@@ -1174,7 +1174,7 @@
                       make_tuple(3, 4, vp8_subpel_variance8x16_c, 0),
                       make_tuple(4, 3, vp8_subpel_variance16x8_c, 0),
                       make_tuple(4, 4, vp8_subpel_variance16x16_c, 0)));
-#endif  // CONFIG_VP8
+#endif  // CONFIG_VP8_ENCODER
 
 const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c =
     vp9_sub_pixel_avg_variance4x4_c;
@@ -1460,7 +1460,7 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_VP9_ENCODER
 
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
 #if HAVE_MMX
 const SubpixVarMxNFunc subpel_variance16x16_mmx =
     vp8_sub_pixel_variance16x16_mmx;
@@ -1476,7 +1476,7 @@
                       make_tuple(3, 3, subpel_variance8x8_mmx, 0),
                       make_tuple(2, 2, subpel_variance4x4_mmx, 0)));
 #endif  // HAVE_MMX
-#endif  // CONFIG_VP8
+#endif  // CONFIG_VP8_ENCODER
 
 #if CONFIG_VP9_ENCODER
 #if HAVE_SSE2
@@ -1768,7 +1768,7 @@
 #endif  // HAVE_SSE2
 #endif  // CONFIG_VP9_ENCODER
 
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
 #if HAVE_SSE2
 const SubpixVarMxNFunc vp8_subpel_variance16x16_sse2 =
     vp8_sub_pixel_variance16x16_wmt;
@@ -1788,7 +1788,7 @@
                       make_tuple(4, 3, vp8_subpel_variance16x8_sse2, 0),
                       make_tuple(4, 4, vp8_subpel_variance16x16_sse2, 0)));
 #endif  // HAVE_SSE2
-#endif  // CONFIG_VP8
+#endif  // CONFIG_VP8_ENCODER
 
 #if CONFIG_VP9_ENCODER
 #if HAVE_SSSE3
@@ -1879,7 +1879,7 @@
 #endif  // HAVE_SSSE3
 #endif  // CONFIG_VP9_ENCODER
 
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
 #if HAVE_SSSE3
 const SubpixVarMxNFunc vp8_subpel_variance16x16_ssse3 =
     vp8_sub_pixel_variance16x16_ssse3;
@@ -1890,7 +1890,7 @@
     ::testing::Values(make_tuple(4, 3, vp8_subpel_variance16x8_ssse3, 0),
                       make_tuple(4, 4, vp8_subpel_variance16x16_ssse3, 0)));
 #endif  // HAVE_SSSE3
-#endif  // CONFIG_VP8
+#endif  // CONFIG_VP8_ENCODER
 
 #if HAVE_AVX2
 const VarianceMxNFunc mse16x16_avx2 = vpx_mse16x16_avx2;
@@ -1931,7 +1931,7 @@
 #endif  // CONFIG_VP9_ENCODER
 #endif  // HAVE_AVX2
 
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
 #if HAVE_MEDIA
 const SubpixVarMxNFunc subpel_variance16x16_media =
     vp8_sub_pixel_variance16x16_armv6;
@@ -1942,7 +1942,7 @@
     ::testing::Values(make_tuple(3, 3, subpel_variance8x8_media, 0),
                       make_tuple(4, 4, subpel_variance16x16_media, 0)));
 #endif  // HAVE_MEDIA
-#endif  // CONFIG_VP8
+#endif  // CONFIG_VP8_ENCODER
 
 #if HAVE_NEON
 const Get4x4SseFunc get4x4sse_cs_neon = vpx_get4x4sse_cs_neon;
@@ -1972,7 +1972,7 @@
                       make_tuple(3, 4, variance8x16_neon, 0),
                       make_tuple(3, 3, variance8x8_neon, 0)));
 
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
 #if HAVE_NEON_ASM
 const SubpixVarMxNFunc vp8_subpel_variance16x16_neon =
     vp8_sub_pixel_variance16x16_neon;
@@ -1980,7 +1980,7 @@
     NEON, VP8SubpelVarianceTest,
     ::testing::Values(make_tuple(4, 4, vp8_subpel_variance16x16_neon, 0)));
 #endif  // HAVE_NEON_ASM
-#endif  // CONFIG_VP8
+#endif  // CONFIG_VP8_ENCODER
 
 #if CONFIG_VP9_ENCODER
 const SubpixVarMxNFunc subpel_variance8x8_neon = vp9_sub_pixel_variance8x8_neon;
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c
index 499c42a..13c46a5 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.c
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.c
@@ -15,6 +15,75 @@
 #include "vpx/vpx_integer.h"
 
 //------------------------------------------------------------------------------
+// DC 4x4
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
+                          const uint8_t *above, const uint8_t *left,
+                          int do_above, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+
+  if (do_above) {
+    const uint8x8_t A = vld1_u8(above);  // top row
+    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    sum_top = vcombine_u16(p1, p1);
+  }
+
+  if (do_left) {
+    const uint8x8_t L = vld1_u8(left);  // left border
+    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    sum_left = vcombine_u16(p1, p1);
+  }
+
+  if (do_above && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 3);
+  } else if (do_above) {
+    dc0 = vrshrn_n_u16(sum_top, 2);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 2);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 4; ++i) {
+      vst1_lane_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
+    }
+  }
+}
+
+void vp9_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  dc_4x4(dst, stride, above, left, 1, 1);
+}
+
+void vp9_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  dc_4x4(dst, stride, NULL, left, 0, 1);
+}
+
+void vp9_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  dc_4x4(dst, stride, above, NULL, 1, 0);
+}
+
+void vp9_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+  dc_4x4(dst, stride, NULL, NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
 // DC 8x8
 
 // 'do_above' and 'do_left' facilitate branch removal when inlined.
@@ -161,6 +230,144 @@
   dc_16x16(dst, stride, NULL, NULL, 0, 0);
 }
 
+//------------------------------------------------------------------------------
+// DC 32x32
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left,
+                            int do_above, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+
+  if (do_above) {
+    const uint8x16_t A0 = vld1q_u8(above);  // top row
+    const uint8x16_t A1 = vld1q_u8(above + 16);
+    const uint16x8_t p0 = vpaddlq_u8(A0);  // cascading summation of the top
+    const uint16x8_t p1 = vpaddlq_u8(A1);
+    const uint16x8_t p2 = vaddq_u16(p0, p1);
+    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+    const uint16x4_t p4 = vpadd_u16(p3, p3);
+    const uint16x4_t p5 = vpadd_u16(p4, p4);
+    sum_top = vcombine_u16(p5, p5);
+  }
+
+  if (do_left) {
+    const uint8x16_t L0 = vld1q_u8(left);  // left row
+    const uint8x16_t L1 = vld1q_u8(left + 16);
+    const uint16x8_t p0 = vpaddlq_u8(L0);  // cascading summation of the left
+    const uint16x8_t p1 = vpaddlq_u8(L1);
+    const uint16x8_t p2 = vaddq_u16(p0, p1);
+    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+    const uint16x4_t p4 = vpadd_u16(p3, p3);
+    const uint16x4_t p5 = vpadd_u16(p4, p4);
+    sum_left = vcombine_u16(p5, p5);
+  }
+
+  if (do_above && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 6);
+  } else if (do_above) {
+    dc0 = vrshrn_n_u16(sum_top, 5);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 5);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 32; ++i) {
+      vst1q_u8(dst + i * stride, dc);
+      vst1q_u8(dst + i * stride + 16, dc);
+    }
+  }
+}
+
+void vp9_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  dc_32x32(dst, stride, above, left, 1, 1);
+}
+
+void vp9_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;
+  dc_32x32(dst, stride, NULL, left, 0, 1);
+}
+
+void vp9_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;
+  dc_32x32(dst, stride, above, NULL, 1, 0);
+}
+
+void vp9_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  dc_32x32(dst, stride, NULL, NULL, 0, 0);
+}
+
+// -----------------------------------------------------------------------------
+
+void vp9_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above));  // top row
+  const uint64x1_t A1 = vshr_n_u64(A0, 8);
+  const uint64x1_t A2 = vshr_n_u64(A0, 16);
+  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
+  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
+  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
+  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
+  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
+  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
+  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  (void)left;
+  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
+  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
+  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
+  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+  dst[3 * stride + 3] = above[7];
+}
+
+// -----------------------------------------------------------------------------
+
+void vp9_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
+  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
+  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
+  const uint32x2_t zero = vdup_n_u32(0);
+  const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
+  const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
+  const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
+  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
+  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
+  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
+  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
+  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
+  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
+  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
+  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
+  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
+  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
+  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
+  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
+  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+}
+
 #if !HAVE_NEON_ASM
 
 void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
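
For orientation, a scalar reference of what the new dc_4x4() path computes when
both borders are available; the (sum + 4) >> 3 below matches the rounding
narrowing shift vrshrn_n_u16(sum, 3) used above. This function is illustrative
only and not part of the patch:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Average the eight border pixels with rounding, then fill the 4x4 block. */
    static void dc_4x4_ref(uint8_t *dst, ptrdiff_t stride,
                           const uint8_t *above, const uint8_t *left) {
      int i, sum = 0;
      for (i = 0; i < 4; ++i) sum += above[i] + left[i];
      for (i = 0; i < 4; ++i) memset(dst + i * stride, (sum + 4) >> 3, 4);
    }
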
diff --git a/vp9/common/mips/msa/vp9_macros_msa.h b/vp9/common/mips/msa/vp9_macros_msa.h
index f1217d5a..863b2dc 100644
--- a/vp9/common/mips/msa/vp9_macros_msa.h
+++ b/vp9/common/mips/msa/vp9_macros_msa.h
@@ -244,6 +244,22 @@
   out3 = LW((psrc) + 3 * stride);                    \
 }
 
+/* Description : Load double words with stride
+   Arguments   : Inputs  - psrc    (source pointer to load from)
+                         - stride
+                 Outputs - out0, out1
+   Details     : Loads double word in 'out0' from (psrc)
+                 Loads double word in 'out1' from (psrc + stride)
+*/
+#define LD2(psrc, stride, out0, out1) {  \
+  out0 = LD((psrc));                     \
+  out1 = LD((psrc) + stride);            \
+}
+#define LD4(psrc, stride, out0, out1, out2, out3) {  \
+  LD2((psrc), stride, out0, out1);                   \
+  LD2((psrc) + 2 * stride, stride, out2, out3);      \
+}
+
 /* Description : Store 4 words with stride
    Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
    Details     : Stores word from 'in0' to (pdst)
@@ -364,6 +380,17 @@
   out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2);  \
 }
 
+/* Description : Load 2 vectors of signed word elements with stride
+   Arguments   : Inputs  - psrc    (source pointer to load from)
+                         - stride
+                 Outputs - out0, out1
+                 Return Type - signed word
+*/
+#define LD_SW2(psrc, stride, out0, out1) {  \
+  out0 = LD_SW((psrc));                     \
+  out1 = LD_SW((psrc) + stride);            \
+}
+
 /* Description : Store vectors of 16 byte elements with stride
    Arguments   : Inputs  - in0, in1, stride
                  Outputs - pdst    (destination pointer to store to)
@@ -482,6 +509,24 @@
   SD(out0_m, pdst);                       \
 }
 
+/* Description : Store as 8x2 byte block to destination memory from input vector
+   Arguments   : Inputs  - in, pdst, stride
+   Details     : Index 0 double word element from input vector 'in' is copied
+                 and stored to destination memory at (pdst)
+                 Index 1 double word element from input vector 'in' is copied
+                 and stored to destination memory at (pdst + stride)
+*/
+#define ST8x2_UB(in, pdst, stride) {        \
+  uint64_t out0_m, out1_m;                  \
+  uint8_t *pblk_8x2_m = (uint8_t *)(pdst);  \
+                                            \
+  out0_m = __msa_copy_u_d((v2i64)in, 0);    \
+  out1_m = __msa_copy_u_d((v2i64)in, 1);    \
+                                            \
+  SD(out0_m, pblk_8x2_m);                   \
+  SD(out1_m, pblk_8x2_m + stride);          \
+}
+
 /* Description : Store as 8x4 byte block to destination memory from input
                  vectors
    Arguments   : Inputs  - in0, in1, pdst, stride
@@ -675,6 +720,24 @@
 }
 #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
 
+/* Description : Dot product of word vector elements
+   Arguments   : Inputs  - mult0, mult1
+                           cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - signed double word
+   Details     : Signed word elements from mult0 are multiplied with
+                 signed word elements from cnst0 producing a result
+                 twice the size of input i.e. signed double word.
+                 The multiplication results of adjacent odd-even elements
+                 are then added together and stored to the out vector
+                 (2 signed double word results)
+*/
+#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
+  out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0);        \
+  out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1);        \
+}
+#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
+
 /* Description : Dot product & addition of byte vector elements
    Arguments   : Inputs  - mult0, mult1
                            cnst0, cnst1
@@ -743,6 +806,24 @@
   CLIP_SH2_0_255(in2, in3);                   \
 }
 
+/* Description : Addition of 4 signed word elements
+                 4 signed word elements of input vector are added together and
+                 the resulting integer sum is returned
+   Arguments   : Inputs  - in       (signed word vector)
+                 Outputs - sum_m    (i32 sum)
+                 Return Type - signed word
+*/
+#define HADD_SW_S32(in) ({                        \
+  v2i64 res0_m, res1_m;                           \
+  int32_t sum_m;                                  \
+                                                  \
+  res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in);  \
+  res1_m = __msa_splati_d(res0_m, 1);             \
+  res0_m = res0_m + res1_m;                       \
+  sum_m = __msa_copy_s_w((v4i32)res0_m, 0);       \
+  sum_m;                                          \
+})
+
 /* Description : Horizontal addition of unsigned byte vector elements
    Arguments   : Inputs  - in0, in1
                  Outputs - out0, out1
@@ -1039,8 +1120,8 @@
                  Outputs - in0, in1, in2, in3 (in place)
                  Return Type - unsigned halfword
    Details     : Each unsigned halfword element from 'in0' is saturated to the
-                 value generated with (sat_val+1) bit range
-                 Results are in placed to original vectors
+                 value generated with (sat_val+1) bit range.
+                 The results are stored in place
 */
 #define SAT_UH2(RTYPE, in0, in1, sat_val) {         \
   in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val);  \
@@ -1062,7 +1143,7 @@
                  Return Type - unsigned halfword
    Details     : Each unsigned halfword element from 'in0' is saturated to the
                  value generated with (sat_val+1) bit range
-                 Results are in placed to original vectors
+                 The results are stored in place
 */
 #define SAT_SH2(RTYPE, in0, in1, sat_val) {         \
   in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val);  \
@@ -1182,10 +1263,10 @@
                  Outputs - in0, in1 (in-place)
                  Return Type - as per RTYPE
    Details     : Each unsigned byte element from input vector 'in0' is
-                 logically xor'ed with 128 and result is in-place stored in
+                 logically xor'ed with 128 and the result is in-place stored in
                  'in0' vector
                  Each unsigned byte element from input vector 'in1' is
-                 logically xor'ed with 128 and result is in-place stored in
+                 logically xor'ed with 128 and the result is in-place stored in
                  'in1' vector
                  Similar for other pairs
 */
@@ -1237,13 +1318,28 @@
 }
 #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
 
+/* Description : Shift left all elements of vector (generic for all data types)
+   Arguments   : Inputs  - in0, in1, in2, in3, shift
+                 Outputs - in0, in1, in2, in3 (in place)
+                 Return Type - as per input vector RTYPE
+   Details     : Each element of vector 'in0' is left shifted by 'shift' and
+                 the result is in place written to 'in0'
+                 Similar for other pairs
+*/
+#define SLLI_4V(in0, in1, in2, in3, shift) {  \
+  in0 = in0 << shift;                         \
+  in1 = in1 << shift;                         \
+  in2 = in2 << shift;                         \
+  in3 = in3 << shift;                         \
+}
+
 /* Description : Arithmetic shift right all elements of vector
                  (generic for all data types)
    Arguments   : Inputs  - in0, in1, in2, in3, shift
                  Outputs - in0, in1, in2, in3 (in place)
                  Return Type - as per input vector RTYPE
    Details     : Each element of vector 'in0' is right shifted by 'shift' and
-                 result is in place written to 'in0'
+                 the result is in place written to 'in0'
                  Here, 'shift' is GP variable passed in
                  Similar for other pairs
 */
@@ -1362,6 +1458,24 @@
   ILVRL_B2_SH(zero_m, in, out0, out1);  \
 }
 
+/* Description : Sign extend halfword elements from input vector and return
+                 result in pair of vectors
+   Arguments   : Inputs  - in           (1 input halfword vector)
+                 Outputs - out0, out1   (sign extended 2 word vectors)
+                 Return Type - signed word
+   Details     : Sign bit of halfword elements from input vector 'in' is
+                 extracted and interleaved right with the same vector 'in' to
+                 generate 4 signed word elements in 'out0'
+                 Then interleaved left with the same vector 'in' to
+                 generate 4 signed word elements in 'out1'
+*/
+#define UNPCK_SH_SW(in, out0, out1) {    \
+  v8i16 tmp_m;                           \
+                                         \
+  tmp_m = __msa_clti_s_h((v8i16)in, 0);  \
+  ILVRL_H2_SW(tmp_m, in, out0, out1);    \
+}
+
 /* Description : Butterfly of 4 input vectors
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1, out2, out3
@@ -1393,6 +1507,34 @@
   out7 = in0 - in7;                                                    \
 }
 
+/* Description : Butterfly of 16 input vectors
+   Arguments   : Inputs  - in0 ...  in15
+                 Outputs - out0 .. out15
+   Details     : Butterfly operation
+*/
+#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                  \
+                     in8, in9,  in10, in11, in12, in13, in14, in15,           \
+                     out0, out1, out2, out3, out4, out5, out6, out7,          \
+                     out8, out9, out10, out11, out12, out13, out14, out15) {  \
+  out0 = in0 + in15;                                                          \
+  out1 = in1 + in14;                                                          \
+  out2 = in2 + in13;                                                          \
+  out3 = in3 + in12;                                                          \
+  out4 = in4 + in11;                                                          \
+  out5 = in5 + in10;                                                          \
+  out6 = in6 + in9;                                                           \
+  out7 = in7 + in8;                                                           \
+                                                                              \
+  out8 = in7 - in8;                                                           \
+  out9 = in6 - in9;                                                           \
+  out10 = in5 - in10;                                                         \
+  out11 = in4 - in11;                                                         \
+  out12 = in3 - in12;                                                         \
+  out13 = in2 - in13;                                                         \
+  out14 = in1 - in14;                                                         \
+  out15 = in0 - in15;                                                         \
+}
+
 /* Description : Transposes input 8x8 byte block
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                            (input 8x8 byte block)
@@ -1606,7 +1748,7 @@
                  Outputs - out_m
                  Return Type - unsigned byte
    Details     : Signed byte even elements from 'in0' and 'in1' are packed
-                 together in one vector and the resulted vector is xor'ed with
+                 together in one vector and the resulting vector is xor'ed with
                  128 to shift the range from signed to unsigned byte
 */
 #define PCKEV_XORI128_UB(in0, in1) ({                    \
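
A scalar view of the new HADD_SW_S32() reduction may help when reading the MSA
transform code that uses it: the four 32-bit lanes are summed, with the
intrinsic form going through a 64-bit intermediate and truncating back to 32
bits. Sketch only, not part of the patch:

    #include <stdint.h>

    /* Scalar equivalent of HADD_SW_S32(in): reduce a v4i32 to one int32 sum. */
    static int32_t hadd_sw_s32_ref(const int32_t in[4]) {
      return in[0] + in[1] + in[2] + in[3];
    }
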
diff --git a/vp9/common/mips/msa/vp9_mfqe_msa.c b/vp9/common/mips/msa/vp9_mfqe_msa.c
new file mode 100644
index 0000000..64cb9a8
--- /dev/null
+++ b/vp9/common/mips/msa/vp9_mfqe_msa.c
@@ -0,0 +1,137 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                    uint8_t *dst_ptr, int32_t dst_stride,
+                                    int32_t src_weight) {
+  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
+  int32_t row;
+  uint64_t src0_d, src1_d, dst0_d, dst1_d;
+  v16i8 src0 = { 0 };
+  v16i8 src1 = { 0 };
+  v16i8 dst0 = { 0 };
+  v16i8 dst1 = { 0 };
+  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
+
+  src_wt = __msa_fill_h(src_weight);
+  dst_wt = __msa_fill_h(dst_weight);
+
+  for (row = 2; row--;) {
+    LD2(src_ptr, src_stride, src0_d, src1_d);
+    src_ptr += (2 * src_stride);
+    LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
+    INSERT_D2_SB(src0_d, src1_d, src0);
+    INSERT_D2_SB(dst0_d, dst1_d, dst0);
+
+    LD2(src_ptr, src_stride, src0_d, src1_d);
+    src_ptr += (2 * src_stride);
+    LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
+    INSERT_D2_SB(src0_d, src1_d, src1);
+    INSERT_D2_SB(dst0_d, dst1_d, dst1);
+
+    UNPCK_UB_SH(src0, src_r, src_l);
+    UNPCK_UB_SH(dst0, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
+    ST8x2_UB(dst0, dst_ptr, dst_stride);
+    dst_ptr += (2 * dst_stride);
+
+    UNPCK_UB_SH(src1, src_r, src_l);
+    UNPCK_UB_SH(dst1, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
+    ST8x2_UB(dst1, dst_ptr, dst_stride);
+    dst_ptr += (2 * dst_stride);
+  }
+}
+
+static void filter_by_weight16x16_msa(const uint8_t *src_ptr,
+                                      int32_t src_stride,
+                                      uint8_t *dst_ptr,
+                                      int32_t dst_stride,
+                                      int32_t src_weight) {
+  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
+  int32_t row;
+  v16i8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
+
+  src_wt = __msa_fill_h(src_weight);
+  dst_wt = __msa_fill_h(dst_weight);
+
+  for (row = 4; row--;) {
+    LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);
+
+    UNPCK_UB_SH(src0, src_r, src_l);
+    UNPCK_UB_SH(dst0, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+
+    UNPCK_UB_SH(src1, src_r, src_l);
+    UNPCK_UB_SH(dst1, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+
+    UNPCK_UB_SH(src2, src_r, src_l);
+    UNPCK_UB_SH(dst2, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+
+    UNPCK_UB_SH(src3, src_r, src_l);
+    UNPCK_UB_SH(dst3, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+  }
+}
+
+void vp9_filter_by_weight8x8_msa(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride,
+                                 int src_weight) {
+  filter_by_weight8x8_msa(src, src_stride, dst, dst_stride, src_weight);
+}
+
+void vp9_filter_by_weight16x16_msa(const uint8_t *src, int src_stride,
+                                   uint8_t *dst, int dst_stride,
+                                   int src_weight) {
+  filter_by_weight16x16_msa(src, src_stride, dst, dst_stride, src_weight);
+}
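
Both MSA paths above vectorize the same per-pixel blend as the C version of
vp9_filter_by_weight: a weighted average of the source and destination pixels
with rounding. A scalar sketch of one pixel, with the precision passed in
explicitly since MFQE_PRECISION is defined elsewhere (vp9_mfqe.h); illustrative
only:

    #include <stdint.h>

    /* One pixel of the MFQE blend, dst_weight = (1 << precision) - src_weight. */
    static uint8_t mfqe_blend_pixel(uint8_t src, uint8_t dst,
                                    int src_weight, int precision) {
      const int dst_weight = (1 << precision) - src_weight;
      const int round = 1 << (precision - 1);
      return (uint8_t)((src * src_weight + dst * dst_weight + round) >> precision);
    }
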
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 53ae921..64d379c 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -161,6 +161,8 @@
   int up_available;
   int left_available;
 
+  const vp9_prob (*partition_probs)[PARTITION_TYPES - 1];
+
   /* Distance of MB away from frame edges */
   int mb_to_left_edge;
   int mb_to_right_edge;
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index a2584e8..ad6c04b 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -133,12 +133,6 @@
   0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
 };
 
-const vp9_tree_index vp9_coefmodel_tree[TREE_SIZE(UNCONSTRAINED_NODES + 1)] = {
-  -EOB_MODEL_TOKEN, 2,
-  -ZERO_TOKEN, 4,
-  -ONE_TOKEN, -TWO_TOKEN,
-};
-
 // Model obtained from a 2-sided zero-centerd distribuition derived
 // from a Pareto distribution. The cdf of the distribution is:
 // cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 4e02630..2fc97c3 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -74,7 +74,6 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #define EOB_MODEL_TOKEN 3
-extern const vp9_tree_index vp9_coefmodel_tree[];
 
 typedef struct {
   const vp9_tree_index *tree;
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 424451f..22d431b 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -314,7 +314,7 @@
   { 149, 144, },
 };
 
-void vp9_init_mode_probs(FRAME_CONTEXT *fc) {
+static void init_mode_probs(FRAME_CONTEXT *fc) {
   vp9_copy(fc->uv_mode_prob, default_if_uv_probs);
   vp9_copy(fc->y_mode_prob, default_if_y_probs);
   vp9_copy(fc->switchable_interp_prob, default_switchable_interp_prob);
@@ -444,7 +444,7 @@
   lf->last_sharpness_level = -1;
 
   vp9_default_coef_probs(cm);
-  vp9_init_mode_probs(cm->fc);
+  init_mode_probs(cm->fc);
   vp9_init_mv_probs(cm);
   cm->fc->initialized = 1;
 
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index a0619ec..8c9e6a7 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -90,8 +90,6 @@
 
 void vp9_setup_past_independence(struct VP9Common *cm);
 
-void vp9_init_mode_probs(FRAME_CONTEXT *fc);
-
 void vp9_adapt_mode_probs(struct VP9Common *cm);
 
 void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 3af2a41..1811d76 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -162,7 +162,8 @@
   int show_existing_frame;
 
   // Flag signaling that the frame is encoded using only INTRA modes.
-  int intra_only;
+  uint8_t intra_only;
+  uint8_t last_intra_only;
 
   int allow_high_precision_mv;
 
@@ -335,6 +336,18 @@
   return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2);
 }
 
+static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
+  return cm->frame_type == KEY_FRAME || cm->intra_only;
+}
+
+static INLINE void set_partition_probs(const VP9_COMMON *const cm,
+                                       MACROBLOCKD *const xd) {
+  xd->partition_probs =
+      frame_is_intra_only(cm) ?
+          &vp9_kf_partition_probs[0] :
+          (const vp9_prob (*)[PARTITION_TYPES - 1])cm->fc->partition_prob;
+}
+
 static INLINE void init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) {
   int i;
 
@@ -355,16 +368,13 @@
   xd->above_seg_context = cm->above_seg_context;
   xd->mi_stride = cm->mi_stride;
   xd->error_info = &cm->error;
+
+  set_partition_probs(cm, xd);
 }
 
-static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
-  return cm->frame_type == KEY_FRAME || cm->intra_only;
-}
-
-static INLINE const vp9_prob* get_partition_probs(const VP9_COMMON *cm,
+static INLINE const vp9_prob* get_partition_probs(const MACROBLOCKD *xd,
                                                   int ctx) {
-  return frame_is_intra_only(cm) ? vp9_kf_partition_probs[ctx]
-                                 : cm->fc->partition_prob[ctx];
+  return xd->partition_probs[ctx];
 }
 
 static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) {
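
With the change above, get_partition_probs() reads the table cached in the
MACROBLOCKD by set_partition_probs() instead of re-testing
frame_is_intra_only(cm) on every call. The member's type, a pointer to rows of
PARTITION_TYPES - 1 probabilities, indexes like the self-contained sketch
below; the names are illustrative and PARTITION_TYPES == 4 is my recollection,
stated as an assumption:

    #include <stdint.h>

    typedef uint8_t prob_t;  /* stand-in for vp9_prob */

    /* probs points to 3-entry rows; probs[ctx] yields one row, which is what
     * get_partition_probs(xd, ctx) returns after this change. */
    static const prob_t *row_for_context(const prob_t (*probs)[3], int ctx) {
      return probs[ctx];
    }
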
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 650f4ad..1e9acb8 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -533,8 +533,8 @@
 }
 intra_pred_no_4x4(d117)
 
-void vp9_d135_predictor_4x4(uint8_t *dst, ptrdiff_t stride,
-                            const uint8_t *above, const uint8_t *left) {
+void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
   const int I = left[0];
   const int J = left[1];
   const int K = left[2];
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 5035126..27cd3d0 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -60,7 +60,7 @@
 specialize qw/vp9_d207_predictor_4x4/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d45_predictor_4x4/, "$ssse3_x86inc";
+specialize qw/vp9_d45_predictor_4x4 neon/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc";
@@ -72,7 +72,7 @@
 specialize qw/vp9_d117_predictor_4x4/;
 
 add_proto qw/void vp9_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d135_predictor_4x4/;
+specialize qw/vp9_d135_predictor_4x4 neon/;
 
 add_proto qw/void vp9_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc";
@@ -84,16 +84,16 @@
 specialize qw/vp9_tm_predictor_4x4 neon dspr2 msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_4x4 dspr2 msa/, "$sse_x86inc";
+specialize qw/vp9_dc_predictor_4x4 dspr2 msa neon/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_4x4 msa/, "$sse_x86inc";
+specialize qw/vp9_dc_top_predictor_4x4 msa neon/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_4x4 msa/, "$sse_x86inc";
+specialize qw/vp9_dc_left_predictor_4x4 msa neon/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_4x4 msa/, "$sse_x86inc";
+specialize qw/vp9_dc_128_predictor_4x4 msa neon/, "$sse_x86inc";
 
 add_proto qw/void vp9_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc";
@@ -192,7 +192,7 @@
 specialize qw/vp9_d135_predictor_32x32/;
 
 add_proto qw/void vp9_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d153_predictor_32x32/;
+specialize qw/vp9_d153_predictor_32x32/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_v_predictor_32x32 neon msa/, "$sse2_x86inc";
@@ -201,16 +201,16 @@
 specialize qw/vp9_tm_predictor_32x32 neon msa/, "$sse2_x86_64";
 
 add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_32x32 msa/, "$sse2_x86inc";
+specialize qw/vp9_dc_predictor_32x32 msa neon/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_32x32 msa/, "$sse2_x86inc";
+specialize qw/vp9_dc_top_predictor_32x32 msa neon/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_32x32 msa/, "$sse2_x86inc";
+specialize qw/vp9_dc_left_predictor_32x32 msa neon/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_32x32 msa/, "$sse2_x86inc";
+specialize qw/vp9_dc_128_predictor_32x32 msa neon/, "$sse2_x86inc";
 
 #
 # Loopfilter
@@ -276,10 +276,10 @@
 $vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
 
 add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
-specialize qw/vp9_filter_by_weight16x16 sse2/;
+specialize qw/vp9_filter_by_weight16x16 sse2 msa/;
 
 add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
-specialize qw/vp9_filter_by_weight8x8 sse2/;
+specialize qw/vp9_filter_by_weight8x8 sse2 msa/;
 }
 
 #
@@ -1029,7 +1029,7 @@
   specialize qw/vp9_fht8x8 sse2/;
 
   add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_fht16x16 sse2/;
+  specialize qw/vp9_fht16x16 sse2 msa/;
 
   add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
@@ -1047,19 +1047,19 @@
   specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64";
 
   add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fdct16x16_1 sse2/;
+  specialize qw/vp9_fdct16x16_1 sse2 msa/;
 
   add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fdct16x16 sse2/;
+  specialize qw/vp9_fdct16x16 sse2 msa/;
 
   add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fdct32x32_1 sse2/;
+  specialize qw/vp9_fdct32x32_1 sse2 msa/;
 
   add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fdct32x32 sse2 avx2/;
+  specialize qw/vp9_fdct32x32 sse2 avx2 msa/;
 
   add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fdct32x32_rd sse2 avx2/;
+  specialize qw/vp9_fdct32x32_rd sse2 avx2 msa/;
 }
 
 #
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 30ca2d0..9311d8d 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -346,6 +346,357 @@
   *args->eobtotal += eob;
 }
 
+static void build_mc_border(const uint8_t *src, int src_stride,
+                            uint8_t *dst, int dst_stride,
+                            int x, int y, int b_w, int b_h, int w, int h) {
+  // Get a pointer to the start of the real data for this row.
+  const uint8_t *ref_row = src - x - y * src_stride;
+
+  if (y >= h)
+    ref_row += (h - 1) * src_stride;
+  else if (y > 0)
+    ref_row += y * src_stride;
+
+  do {
+    int right = 0, copy;
+    int left = x < 0 ? -x : 0;
+
+    if (left > b_w)
+      left = b_w;
+
+    if (x + b_w > w)
+      right = x + b_w - w;
+
+    if (right > b_w)
+      right = b_w;
+
+    copy = b_w - left - right;
+
+    if (left)
+      memset(dst, ref_row[0], left);
+
+    if (copy)
+      memcpy(dst + left, ref_row + x + left, copy);
+
+    if (right)
+      memset(dst + left + copy, ref_row[w - 1], right);
+
+    dst += dst_stride;
+    ++y;
+
+    if (y > 0 && y < h)
+      ref_row += src_stride;
+  } while (--b_h);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void high_build_mc_border(const uint8_t *src8, int src_stride,
+                                 uint16_t *dst, int dst_stride,
+                                 int x, int y, int b_w, int b_h,
+                                 int w, int h) {
+  // Get a pointer to the start of the real data for this row.
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *ref_row = src - x - y * src_stride;
+
+  if (y >= h)
+    ref_row += (h - 1) * src_stride;
+  else if (y > 0)
+    ref_row += y * src_stride;
+
+  do {
+    int right = 0, copy;
+    int left = x < 0 ? -x : 0;
+
+    if (left > b_w)
+      left = b_w;
+
+    if (x + b_w > w)
+      right = x + b_w - w;
+
+    if (right > b_w)
+      right = b_w;
+
+    copy = b_w - left - right;
+
+    if (left)
+      vpx_memset16(dst, ref_row[0], left);
+
+    if (copy)
+      memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t));
+
+    if (right)
+      vpx_memset16(dst + left + copy, ref_row[w - 1], right);
+
+    dst += dst_stride;
+    ++y;
+
+    if (y > 0 && y < h)
+      ref_row += src_stride;
+  } while (--b_h);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
+                               int x0, int y0, int b_w, int b_h,
+                               int frame_width, int frame_height,
+                               int border_offset,
+                               uint8_t *const dst, int dst_buf_stride,
+                               int subpel_x, int subpel_y,
+                               const InterpKernel *kernel,
+                               const struct scale_factors *sf,
+                               MACROBLOCKD *xd,
+                               int w, int h, int ref, int xs, int ys) {
+  DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
+  const uint8_t *buf_ptr;
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    high_build_mc_border(buf_ptr1, pre_buf_stride, mc_buf_high, b_w,
+                         x0, y0, b_w, b_h, frame_width, frame_height);
+    buf_ptr = CONVERT_TO_BYTEPTR(mc_buf_high) + border_offset;
+  } else {
+    build_mc_border(buf_ptr1, pre_buf_stride, (uint8_t *)mc_buf_high, b_w,
+                    x0, y0, b_w, b_h, frame_width, frame_height);
+    buf_ptr = ((uint8_t *)mc_buf_high) + border_offset;
+  }
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    high_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
+                         subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
+  } else {
+    inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
+                    subpel_y, sf, w, h, ref, kernel, xs, ys);
+  }
+}
+#else
+static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
+                               int x0, int y0, int b_w, int b_h,
+                               int frame_width, int frame_height,
+                               int border_offset,
+                               uint8_t *const dst, int dst_buf_stride,
+                               int subpel_x, int subpel_y,
+                               const InterpKernel *kernel,
+                               const struct scale_factors *sf,
+                               int w, int h, int ref, int xs, int ys) {
+  DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
+  const uint8_t *buf_ptr;
+
+  build_mc_border(buf_ptr1, pre_buf_stride, mc_buf, b_w,
+                  x0, y0, b_w, b_h, frame_width, frame_height);
+  buf_ptr = mc_buf + border_offset;
+
+  inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
+                  subpel_y, sf, w, h, ref, kernel, xs, ys);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
+                                       int plane, int bw, int bh, int x,
+                                       int y, int w, int h, int mi_x, int mi_y,
+                                       const InterpKernel *kernel,
+                                       const struct scale_factors *sf,
+                                       struct buf_2d *pre_buf,
+                                       struct buf_2d *dst_buf, const MV* mv,
+                                       RefCntBuffer *ref_frame_buf,
+                                       int is_scaled, int ref) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+  MV32 scaled_mv;
+  int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height,
+      buf_stride, subpel_x, subpel_y;
+  uint8_t *ref_frame, *buf_ptr;
+
+  // Get reference frame pointer, width and height.
+  if (plane == 0) {
+    frame_width = ref_frame_buf->buf.y_crop_width;
+    frame_height = ref_frame_buf->buf.y_crop_height;
+    ref_frame = ref_frame_buf->buf.y_buffer;
+  } else {
+    frame_width = ref_frame_buf->buf.uv_crop_width;
+    frame_height = ref_frame_buf->buf.uv_crop_height;
+    ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer
+                           : ref_frame_buf->buf.v_buffer;
+  }
+
+  if (is_scaled) {
+    const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, mv, bw, bh,
+                                               pd->subsampling_x,
+                                               pd->subsampling_y);
+    // Co-ordinate of containing block to pixel precision.
+    int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
+    int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+
+    // Co-ordinate of the block to 1/16th pixel precision.
+    x0_16 = (x_start + x) << SUBPEL_BITS;
+    y0_16 = (y_start + y) << SUBPEL_BITS;
+
+    // Co-ordinate of current block in reference frame
+    // to 1/16th pixel precision.
+    x0_16 = sf->scale_value_x(x0_16, sf);
+    y0_16 = sf->scale_value_y(y0_16, sf);
+
+    // Map the top left corner of the block into the reference frame.
+    x0 = sf->scale_value_x(x_start + x, sf);
+    y0 = sf->scale_value_y(y_start + y, sf);
+
+    // Scale the MV and incorporate the sub-pixel offset of the block
+    // in the reference frame.
+    scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+    xs = sf->x_step_q4;
+    ys = sf->y_step_q4;
+  } else {
+    // Co-ordinate of containing block to pixel precision.
+    x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
+    y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
+
+    // Co-ordinate of the block to 1/16th pixel precision.
+    x0_16 = x0 << SUBPEL_BITS;
+    y0_16 = y0 << SUBPEL_BITS;
+
+    scaled_mv.row = mv->row * (1 << (1 - pd->subsampling_y));
+    scaled_mv.col = mv->col * (1 << (1 - pd->subsampling_x));
+    xs = ys = 16;
+  }
+  subpel_x = scaled_mv.col & SUBPEL_MASK;
+  subpel_y = scaled_mv.row & SUBPEL_MASK;
+
+  // Calculate the top left corner of the best matching block in the
+  // reference frame.
+  x0 += scaled_mv.col >> SUBPEL_BITS;
+  y0 += scaled_mv.row >> SUBPEL_BITS;
+  x0_16 += scaled_mv.col;
+  y0_16 += scaled_mv.row;
+
+  // Get reference block pointer.
+  buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
+  buf_stride = pre_buf->stride;
+
+  // Do border extension if there is motion or the
+  // width/height is not a multiple of 8 pixels.
+  if (is_scaled || scaled_mv.col || scaled_mv.row ||
+      (frame_width & 0x7) || (frame_height & 0x7)) {
+    int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
+
+    // Get reference block bottom right horizontal coordinate.
+    int x1 = (x0_16 + (w - 1) * xs) >> SUBPEL_BITS;
+    int x_pad = 0, y_pad = 0;
+
+    if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
+      x0 -= VP9_INTERP_EXTEND - 1;
+      x1 += VP9_INTERP_EXTEND;
+      x_pad = 1;
+    }
+
+    if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
+      y0 -= VP9_INTERP_EXTEND - 1;
+      y1 += VP9_INTERP_EXTEND;
+      y_pad = 1;
+    }
+
+    // Wait until reference block is ready. Pad 7 more pixels as last 7
+    // pixels of each superblock row can be changed by next superblock row.
+    if (pbi->frame_parallel_decode)
+      vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+                           MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
+
+    // Skip border extension if block is inside the frame.
+    if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
+        y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
+      // Extend the border.
+      const uint8_t *const buf_ptr1 = ref_frame + y0 * buf_stride + x0;
+      const int b_w = x1 - x0 + 1;
+      const int b_h = y1 - y0 + 1;
+      const int border_offset = y_pad * 3 * b_w + x_pad * 3;
+
+      extend_and_predict(buf_ptr1, buf_stride, x0, y0, b_w, b_h,
+                         frame_width, frame_height, border_offset,
+                         dst, dst_buf->stride,
+                         subpel_x, subpel_y,
+                         kernel, sf,
+#if CONFIG_VP9_HIGHBITDEPTH
+                         xd,
+#endif
+                         w, h, ref, xs, ys);
+      return;
+    }
+  } else {
+    // Wait until reference block is ready. Pad 7 more pixels as last 7
+    // pixels of each superblock row can be changed by next superblock row.
+    if (pbi->frame_parallel_decode) {
+      const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
+      vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+                           MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
+    }
+  }
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+                         subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
+  } else {
+    inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+                    subpel_y, sf, w, h, ref, kernel, xs, ys);
+  }
+#else
+  inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+                  subpel_y, sf, w, h, ref, kernel, xs, ys);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
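
For orientation, a minimal scalar sketch of the sub-pixel split and border sizing performed by dec_build_inter_predictors above, assuming VP9's 1/16-pel interpolation precision (SUBPEL_BITS == 4) and VP9_INTERP_EXTEND == 4; the standalone program and its names are illustrative, not part of this change.

#include <stdio.h>

#define SUBPEL_BITS 4                        /* 1/16-pel phase, as above */
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
#define INTERP_EXTEND 4                      /* stands in for VP9_INTERP_EXTEND */

int main(void) {
  /* A column MV of 35 in 1/16-pel units: 2 whole pixels plus a 3/16 phase. */
  const int mv_col_q16 = 35;
  const int int_part = mv_col_q16 >> SUBPEL_BITS;  /* 2 */
  const int subpel_x = mv_col_q16 & SUBPEL_MASK;   /* 3 */

  /* With a sub-pel phase, the fetched reference patch grows by
   * INTERP_EXTEND - 1 pixels on the left/top and INTERP_EXTEND on the
   * right/bottom, mirroring the x0/x1 adjustment above. */
  const int w = 8;
  const int x0 = 0 - (INTERP_EXTEND - 1);
  const int x1 = (w - 1) + INTERP_EXTEND;
  const int b_w = x1 - x0 + 1;                     /* 8 + 7 = 15 */

  printf("int=%d frac=%d b_w=%d\n", int_part, subpel_x, b_w);
  return 0;
}
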
+
+static void dec_build_inter_predictors_sb(VP9Decoder *const pbi,
+                                          MACROBLOCKD *xd,
+                                          int mi_row, int mi_col,
+                                          BLOCK_SIZE bsize) {
+  int plane;
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+  const MODE_INFO *mi = xd->mi[0];
+  const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
+  const BLOCK_SIZE sb_type = mi->mbmi.sb_type;
+  const int is_compound = has_second_ref(&mi->mbmi);
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
+                                                        &xd->plane[plane]);
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    struct buf_2d *const dst_buf = &pd->dst;
+    const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+
+    const int bw = 4 * num_4x4_w;
+    const int bh = 4 * num_4x4_h;
+    int ref;
+
+    for (ref = 0; ref < 1 + is_compound; ++ref) {
+      const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+      struct buf_2d *const pre_buf = &pd->pre[ref];
+      const int idx = xd->block_refs[ref]->idx;
+      BufferPool *const pool = pbi->common.buffer_pool;
+      RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
+      const int is_scaled = vp9_is_scaled(sf);
+
+      if (sb_type < BLOCK_8X8) {
+        int i = 0, x, y;
+        assert(bsize == BLOCK_8X8);
+        for (y = 0; y < num_4x4_h; ++y) {
+          for (x = 0; x < num_4x4_w; ++x) {
+            const MV mv = average_split_mvs(pd, mi, ref, i++);
+            dec_build_inter_predictors(pbi, xd, plane, bw, bh,
+                                       4 * x, 4 * y, 4, 4, mi_x, mi_y, kernel,
+                                       sf, pre_buf, dst_buf, &mv,
+                                       ref_frame_buf, is_scaled, ref);
+          }
+        }
+      } else {
+        const MV mv = mi->mbmi.mv[ref].as_mv;
+        dec_build_inter_predictors(pbi, xd, plane, bw, bh,
+                                   0, 0, bw, bh, mi_x, mi_y, kernel,
+                                   sf, pre_buf, dst_buf, &mv, ref_frame_buf,
+                                   is_scaled, ref);
+      }
+    }
+  }
+}
+
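
One detail of the unscaled path in dec_build_inter_predictors above that is easy to miss: motion vectors arrive in 1/8 luma-pel units, and the `* (1 << (1 - subsampling))` step re-expresses them in 1/16 units of each plane's own pixel grid, doubling for luma and leaving the value unchanged for 4:2:0 chroma, where one chroma pixel spans two luma pixels. A small sketch with illustrative names only:

#include <stdio.h>

/* mv is in 1/8 luma-pel units, as stored by the decoder. */
static void show(int mv, int subsampling) {
  /* Same conversion as the unscaled path of dec_build_inter_predictors. */
  const int plane_q16 = mv * (1 << (1 - subsampling));
  printf("ss=%d -> %d/16 = %.4f plane pixels\n",
         subsampling, plane_q16, plane_q16 / 16.0);
}

int main(void) {
  show(5, 0);  /* luma:   10/16 = 0.6250 luma pixels                        */
  show(5, 1);  /* chroma:  5/16 = 0.3125 chroma pixels = 0.625 luma pixels  */
  return 0;
}
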
 static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                                  const TileInfo *const tile,
                                  BLOCK_SIZE bsize, int mi_row, int mi_col) {
@@ -405,7 +756,7 @@
                                   predict_and_reconstruct_intra_block, &arg);
   } else {
     // Prediction
-    vp9_dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col, bsize);
+    dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col, bsize);
 
     // Reconstruction
     if (!mbmi->skip) {
@@ -420,14 +771,11 @@
   xd->corrupted |= vp9_reader_has_error(r);
 }
 
-static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                     int hbs,
-                                     int mi_row, int mi_col, BLOCK_SIZE bsize,
-                                     vp9_reader *r) {
+static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                     BLOCK_SIZE bsize, vp9_reader *r,
+                                     int has_rows, int has_cols) {
   const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
-  const vp9_prob *const probs = get_partition_probs(cm, ctx);
-  const int has_rows = (mi_row + hbs) < cm->mi_rows;
-  const int has_cols = (mi_col + hbs) < cm->mi_cols;
+  const vp9_prob *const probs = get_partition_probs(xd, ctx);
   FRAME_COUNTS *counts = xd->counts;
   PARTITION_TYPE p;
 
@@ -454,11 +802,13 @@
   const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
+  const int has_rows = (mi_row + hbs) < cm->mi_rows;
+  const int has_cols = (mi_col + hbs) < cm->mi_cols;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
+  partition = read_partition(xd, mi_row, mi_col, bsize, r, has_rows, has_cols);
   subsize = get_subsize(bsize, partition);
   if (bsize == BLOCK_8X8) {
     decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
@@ -469,12 +819,12 @@
         break;
       case PARTITION_HORZ:
         decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
-        if (mi_row + hbs < cm->mi_rows)
+        if (has_rows)
           decode_block(pbi, xd, tile, mi_row + hbs, mi_col, r, subsize);
         break;
       case PARTITION_VERT:
         decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
-        if (mi_col + hbs < cm->mi_cols)
+        if (has_cols)
           decode_block(pbi, xd, tile, mi_row, mi_col + hbs, r, subsize);
         break;
       case PARTITION_SPLIT:
@@ -669,12 +1019,6 @@
                              : literal_to_filter[vp9_rb_read_literal(rb, 2)];
 }
 
-void vp9_read_frame_size(struct vp9_read_bit_buffer *rb,
-                         int *width, int *height) {
-  *width = vp9_rb_read_literal(rb, 16) + 1;
-  *height = vp9_rb_read_literal(rb, 16) + 1;
-}
-
 static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
   cm->display_width = cm->width;
   cm->display_height = cm->height;
@@ -1112,8 +1456,6 @@
   if (pbi->num_tile_workers == 0) {
     const int num_threads = pbi->max_threads & ~1;
     int i;
-    // TODO(jzern): Allocate one less worker, as in the current code we only
-    // use num_threads - 1 workers.
     CHECK_MEM_ERROR(cm, pbi->tile_workers,
                     vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
     // Ensure tile data offsets will be properly aligned. This may fail on
@@ -1253,20 +1595,6 @@
   vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");
 }
 
-int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb) {
-  return vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_0 &&
-         vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_1 &&
-         vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_2;
-}
-
-BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb) {
-  int profile = vp9_rb_read_bit(rb);
-  profile |= vp9_rb_read_bit(rb) << 1;
-  if (profile > 2)
-    profile += vp9_rb_read_bit(rb);
-  return (BITSTREAM_PROFILE) profile;
-}
-
 static void read_bitdepth_colorspace_sampling(
     VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
   if (cm->profile >= PROFILE_2) {
@@ -1319,6 +1647,7 @@
   size_t sz;
 
   cm->last_frame_type = cm->frame_type;
+  cm->last_intra_only = cm->intra_only;
 
   if (vp9_rb_read_literal(rb, 2) != VP9_FRAME_MARKER)
       vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
@@ -1595,12 +1924,12 @@
 }
 #endif  // NDEBUG
 
-static struct vp9_read_bit_buffer* init_read_bit_buffer(
+static struct vp9_read_bit_buffer *init_read_bit_buffer(
     VP9Decoder *pbi,
     struct vp9_read_bit_buffer *rb,
     const uint8_t *data,
     const uint8_t *data_end,
-    uint8_t *clear_data /* buffer size MAX_VP9_HEADER_SIZE */) {
+    uint8_t clear_data[MAX_VP9_HEADER_SIZE]) {
   rb->bit_offset = 0;
   rb->error_handler = error_handler;
   rb->error_handler_data = &pbi->common;
@@ -1616,12 +1945,34 @@
   return rb;
 }
 
+//------------------------------------------------------------------------------
+
+int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb) {
+  return vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_0 &&
+         vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_1 &&
+         vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_2;
+}
+
+void vp9_read_frame_size(struct vp9_read_bit_buffer *rb,
+                         int *width, int *height) {
+  *width = vp9_rb_read_literal(rb, 16) + 1;
+  *height = vp9_rb_read_literal(rb, 16) + 1;
+}
+
+BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb) {
+  int profile = vp9_rb_read_bit(rb);
+  profile |= vp9_rb_read_bit(rb) << 1;
+  if (profile > 2)
+    profile += vp9_rb_read_bit(rb);
+  return (BITSTREAM_PROFILE) profile;
+}
+
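
The profile syntax read by vp9_read_profile above is two bits, low bit first, plus a third bit only when both are set. A self-contained illustration; read_bit and the bit arrays are hypothetical stand-ins for the bit reader:

#include <stdio.h>

static int read_bit(const int **bits) { return *(*bits)++; }

static int read_profile(const int **bits) {
  int profile = read_bit(bits);        /* low bit  */
  profile |= read_bit(bits) << 1;      /* high bit */
  if (profile > 2)                     /* profile 3 carries an extra bit */
    profile += read_bit(bits);
  return profile;
}

int main(void) {
  const int p0[] = { 0, 0 }, p2[] = { 0, 1 }, p3[] = { 1, 1, 0 };
  const int *b;
  b = p0; printf("%d\n", read_profile(&b));  /* 0 */
  b = p2; printf("%d\n", read_profile(&b));  /* 2 */
  b = p3; printf("%d\n", read_profile(&b));  /* 3 */
  return 0;
}
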
 void vp9_decode_frame(VP9Decoder *pbi,
                       const uint8_t *data, const uint8_t *data_end,
                       const uint8_t **p_data_end) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
-  struct vp9_read_bit_buffer rb = { NULL, NULL, 0, NULL, 0};
+  struct vp9_read_bit_buffer rb;
   int context_updated = 0;
   uint8_t clear_data[MAX_VP9_HEADER_SIZE];
   const size_t first_partition_size = read_uncompressed_header(pbi,
@@ -1645,8 +1996,9 @@
   cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
                            cm->width == cm->last_width &&
                            cm->height == cm->last_height &&
-                           !cm->intra_only &&
-                           cm->last_show_frame;
+                           !cm->last_intra_only &&
+                           cm->last_show_frame &&
+                           (cm->last_frame_type != KEY_FRAME);
 
   vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
 
@@ -1725,353 +2077,3 @@
   if (cm->refresh_frame_context && !context_updated)
     cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
 }
-
-static void build_mc_border(const uint8_t *src, int src_stride,
-                            uint8_t *dst, int dst_stride,
-                            int x, int y, int b_w, int b_h, int w, int h) {
-  // Get a pointer to the start of the real data for this row.
-  const uint8_t *ref_row = src - x - y * src_stride;
-
-  if (y >= h)
-    ref_row += (h - 1) * src_stride;
-  else if (y > 0)
-    ref_row += y * src_stride;
-
-  do {
-    int right = 0, copy;
-    int left = x < 0 ? -x : 0;
-
-    if (left > b_w)
-      left = b_w;
-
-    if (x + b_w > w)
-      right = x + b_w - w;
-
-    if (right > b_w)
-      right = b_w;
-
-    copy = b_w - left - right;
-
-    if (left)
-      memset(dst, ref_row[0], left);
-
-    if (copy)
-      memcpy(dst + left, ref_row + x + left, copy);
-
-    if (right)
-      memset(dst + left + copy, ref_row[w - 1], right);
-
-    dst += dst_stride;
-    ++y;
-
-    if (y > 0 && y < h)
-      ref_row += src_stride;
-  } while (--b_h);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void high_build_mc_border(const uint8_t *src8, int src_stride,
-                                 uint16_t *dst, int dst_stride,
-                                 int x, int y, int b_w, int b_h,
-                                 int w, int h) {
-  // Get a pointer to the start of the real data for this row.
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  const uint16_t *ref_row = src - x - y * src_stride;
-
-  if (y >= h)
-    ref_row += (h - 1) * src_stride;
-  else if (y > 0)
-    ref_row += y * src_stride;
-
-  do {
-    int right = 0, copy;
-    int left = x < 0 ? -x : 0;
-
-    if (left > b_w)
-      left = b_w;
-
-    if (x + b_w > w)
-      right = x + b_w - w;
-
-    if (right > b_w)
-      right = b_w;
-
-    copy = b_w - left - right;
-
-    if (left)
-      vpx_memset16(dst, ref_row[0], left);
-
-    if (copy)
-      memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t));
-
-    if (right)
-      vpx_memset16(dst + left + copy, ref_row[w - 1], right);
-
-    dst += dst_stride;
-    ++y;
-
-    if (y > 0 && y < h)
-      ref_row += src_stride;
-  } while (--b_h);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
-                               int x0, int y0, int b_w, int b_h,
-                               int frame_width, int frame_height,
-                               int border_offset,
-                               uint8_t *const dst, int dst_buf_stride,
-                               int subpel_x, int subpel_y,
-                               const InterpKernel *kernel,
-                               const struct scale_factors *sf,
-                               MACROBLOCKD *xd,
-                               int w, int h, int ref, int xs, int ys) {
-  DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
-  const uint8_t *buf_ptr;
-
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    high_build_mc_border(buf_ptr1, pre_buf_stride, mc_buf_high, b_w,
-                         x0, y0, b_w, b_h, frame_width, frame_height);
-    buf_ptr = CONVERT_TO_BYTEPTR(mc_buf_high) + border_offset;
-  } else {
-    build_mc_border(buf_ptr1, pre_buf_stride, (uint8_t *)mc_buf_high, b_w,
-                    x0, y0, b_w, b_h, frame_width, frame_height);
-    buf_ptr = ((uint8_t *)mc_buf_high) + border_offset;
-  }
-
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    high_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
-                         subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
-  } else {
-    inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
-                    subpel_y, sf, w, h, ref, kernel, xs, ys);
-  }
-}
-#else
-static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
-                               int x0, int y0, int b_w, int b_h,
-                               int frame_width, int frame_height,
-                               int border_offset,
-                               uint8_t *const dst, int dst_buf_stride,
-                               int subpel_x, int subpel_y,
-                               const InterpKernel *kernel,
-                               const struct scale_factors *sf,
-                               int w, int h, int ref, int xs, int ys) {
-  DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
-  const uint8_t *buf_ptr;
-
-  build_mc_border(buf_ptr1, pre_buf_stride, mc_buf, b_w,
-                  x0, y0, b_w, b_h, frame_width, frame_height);
-  buf_ptr = mc_buf + border_offset;
-
-  inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
-                  subpel_y, sf, w, h, ref, kernel, xs, ys);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-static void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
-                                       int plane, int bw, int bh, int x,
-                                       int y, int w, int h, int mi_x, int mi_y,
-                                       const InterpKernel *kernel,
-                                       const struct scale_factors *sf,
-                                       struct buf_2d *pre_buf,
-                                       struct buf_2d *dst_buf, const MV* mv,
-                                       RefCntBuffer *ref_frame_buf,
-                                       int is_scaled, int ref) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
-  MV32 scaled_mv;
-  int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height,
-      buf_stride, subpel_x, subpel_y;
-  uint8_t *ref_frame, *buf_ptr;
-
-  // Get reference frame pointer, width and height.
-  if (plane == 0) {
-    frame_width = ref_frame_buf->buf.y_crop_width;
-    frame_height = ref_frame_buf->buf.y_crop_height;
-    ref_frame = ref_frame_buf->buf.y_buffer;
-  } else {
-    frame_width = ref_frame_buf->buf.uv_crop_width;
-    frame_height = ref_frame_buf->buf.uv_crop_height;
-    ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer
-                         : ref_frame_buf->buf.v_buffer;
-  }
-
-  if (is_scaled) {
-    const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, mv, bw, bh,
-                                               pd->subsampling_x,
-                                               pd->subsampling_y);
-    // Co-ordinate of containing block to pixel precision.
-    int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
-    int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
-
-    // Co-ordinate of the block to 1/16th pixel precision.
-    x0_16 = (x_start + x) << SUBPEL_BITS;
-    y0_16 = (y_start + y) << SUBPEL_BITS;
-
-    // Co-ordinate of current block in reference frame
-    // to 1/16th pixel precision.
-    x0_16 = sf->scale_value_x(x0_16, sf);
-    y0_16 = sf->scale_value_y(y0_16, sf);
-
-    // Map the top left corner of the block into the reference frame.
-    x0 = sf->scale_value_x(x_start + x, sf);
-    y0 = sf->scale_value_y(y_start + y, sf);
-
-    // Scale the MV and incorporate the sub-pixel offset of the block
-    // in the reference frame.
-    scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
-    xs = sf->x_step_q4;
-    ys = sf->y_step_q4;
-  } else {
-    // Co-ordinate of containing block to pixel precision.
-    x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
-    y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
-
-    // Co-ordinate of the block to 1/16th pixel precision.
-    x0_16 = x0 << SUBPEL_BITS;
-    y0_16 = y0 << SUBPEL_BITS;
-
-    scaled_mv.row = mv->row * (1 << (1 - pd->subsampling_y));
-    scaled_mv.col = mv->col * (1 << (1 - pd->subsampling_x));
-    xs = ys = 16;
-  }
-  subpel_x = scaled_mv.col & SUBPEL_MASK;
-  subpel_y = scaled_mv.row & SUBPEL_MASK;
-
-  // Calculate the top left corner of the best matching block in the
-  // reference frame.
-  x0 += scaled_mv.col >> SUBPEL_BITS;
-  y0 += scaled_mv.row >> SUBPEL_BITS;
-  x0_16 += scaled_mv.col;
-  y0_16 += scaled_mv.row;
-
-  // Get reference block pointer.
-  buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
-  buf_stride = pre_buf->stride;
-
-  // Do border extension if there is motion or the
-  // width/height is not a multiple of 8 pixels.
-  if (is_scaled || scaled_mv.col || scaled_mv.row ||
-      (frame_width & 0x7) || (frame_height & 0x7)) {
-    int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
-
-    // Get reference block bottom right horizontal coordinate.
-    int x1 = (x0_16 + (w - 1) * xs) >> SUBPEL_BITS;
-    int x_pad = 0, y_pad = 0;
-
-    if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
-      x0 -= VP9_INTERP_EXTEND - 1;
-      x1 += VP9_INTERP_EXTEND;
-      x_pad = 1;
-    }
-
-    if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
-      y0 -= VP9_INTERP_EXTEND - 1;
-      y1 += VP9_INTERP_EXTEND;
-      y_pad = 1;
-    }
-
-    // Wait until reference block is ready. Pad 7 more pixels as last 7
-    // pixels of each superblock row can be changed by next superblock row.
-    if (pbi->frame_parallel_decode)
-      vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
-                           MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
-
-    // Skip border extension if block is inside the frame.
-    if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
-        y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
-      // Extend the border.
-      const uint8_t *const buf_ptr1 = ref_frame + y0 * buf_stride + x0;
-      const int b_w = x1 - x0 + 1;
-      const int b_h = y1 - y0 + 1;
-      const int border_offset = y_pad * 3 * b_w + x_pad * 3;
-
-      extend_and_predict(buf_ptr1, buf_stride, x0, y0, b_w, b_h,
-                         frame_width, frame_height, border_offset,
-                         dst, dst_buf->stride,
-                         subpel_x, subpel_y,
-                         kernel, sf,
-#if CONFIG_VP9_HIGHBITDEPTH
-                         xd,
-#endif
-                         w, h, ref, xs, ys);
-      return;
-    }
-  } else {
-    // Wait until reference block is ready. Pad 7 more pixels as last 7
-    // pixels of each superblock row can be changed by next superblock row.
-     if (pbi->frame_parallel_decode) {
-       const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
-       vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
-                            MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
-     }
-  }
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
-                         subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
-  } else {
-    inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
-                    subpel_y, sf, w, h, ref, kernel, xs, ys);
-  }
-#else
-  inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
-                  subpel_y, sf, w, h, ref, kernel, xs, ys);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-}
-
-void vp9_dec_build_inter_predictors_sb(VP9Decoder *const pbi, MACROBLOCKD *xd,
-                                       int mi_row, int mi_col,
-                                       BLOCK_SIZE bsize) {
-  int plane;
-  const int mi_x = mi_col * MI_SIZE;
-  const int mi_y = mi_row * MI_SIZE;
-  const MODE_INFO *mi = xd->mi[0];
-  const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
-  const BLOCK_SIZE sb_type = mi->mbmi.sb_type;
-  const int is_compound = has_second_ref(&mi->mbmi);
-
-  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
-                                                        &xd->plane[plane]);
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-    struct buf_2d *const dst_buf = &pd->dst;
-    const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
-    const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
-
-    const int bw = 4 * num_4x4_w;
-    const int bh = 4 * num_4x4_h;
-    int ref;
-
-    for (ref = 0; ref < 1 + is_compound; ++ref) {
-      const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
-      struct buf_2d *const pre_buf = &pd->pre[ref];
-      const int idx = xd->block_refs[ref]->idx;
-      BufferPool *const pool = pbi->common.buffer_pool;
-      RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
-      const int is_scaled = vp9_is_scaled(sf);
-
-      if (sb_type < BLOCK_8X8) {
-        int i = 0, x, y;
-        assert(bsize == BLOCK_8X8);
-        for (y = 0; y < num_4x4_h; ++y) {
-          for (x = 0; x < num_4x4_w; ++x) {
-            const MV mv = average_split_mvs(pd, mi, ref, i++);
-            dec_build_inter_predictors(pbi, xd, plane, bw, bh,
-                                       4 * x, 4 * y, 4, 4, mi_x, mi_y, kernel,
-                                       sf, pre_buf, dst_buf, &mv,
-                                       ref_frame_buf, is_scaled, ref);
-          }
-        }
-      } else {
-        const MV mv = mi->mbmi.mv[ref].as_mv;
-        dec_build_inter_predictors(pbi, xd, plane, bw, bh,
-                                   0, 0, bw, bh, mi_x, mi_y, kernel,
-                                   sf, pre_buf, dst_buf, &mv, ref_frame_buf,
-                                   is_scaled, ref);
-      }
-    }
-  }
-}
diff --git a/vp9/decoder/vp9_decodeframe.h b/vp9/decoder/vp9_decodeframe.h
index 8410c54..a876e7c 100644
--- a/vp9/decoder/vp9_decodeframe.h
+++ b/vp9/decoder/vp9_decodeframe.h
@@ -16,24 +16,18 @@
 extern "C" {
 #endif
 
-struct VP9Common;
 struct VP9Decoder;
 struct vp9_read_bit_buffer;
 
-void vp9_init_dequantizer(struct VP9Common *cm);
-
-void vp9_decode_frame(struct VP9Decoder *pbi,
-                      const uint8_t *data, const uint8_t *data_end,
-                      const uint8_t **p_data_end);
-
 int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb);
 void vp9_read_frame_size(struct vp9_read_bit_buffer *rb,
                          int *width, int *height);
 BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb);
 
-void vp9_dec_build_inter_predictors_sb(struct VP9Decoder *const pbi,
-                                       MACROBLOCKD *xd, int mi_row, int mi_col,
-                                       BLOCK_SIZE bsize);
+void vp9_decode_frame(struct VP9Decoder *pbi,
+                      const uint8_t *data, const uint8_t *data_end,
+                      const uint8_t **p_data_end);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index d34926d..8a8d8dd 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -599,19 +599,20 @@
   MV_REF* frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
   int w, h;
 
-  if (frame_is_intra_only(cm))
+  if (frame_is_intra_only(cm)) {
     read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
-  else
+  } else {
     read_inter_frame_mode_info(pbi, xd, tile, mi_row, mi_col, r);
 
-  for (h = 0; h < y_mis; ++h) {
-    MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
-    for (w = 0; w < x_mis; ++w) {
-      MV_REF *const mv = frame_mv + w;
-      mv->ref_frame[0] = mi->mbmi.ref_frame[0];
-      mv->ref_frame[1] = mi->mbmi.ref_frame[1];
-      mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
-      mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+    for (h = 0; h < y_mis; ++h) {
+      MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+      for (w = 0; w < x_mis; ++w) {
+        MV_REF *const mv = frame_mv + w;
+        mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+        mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+        mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+        mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+      }
     }
   }
 }
diff --git a/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c b/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
new file mode 100644
index 0000000..a3ebfab
--- /dev/null
+++ b/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
@@ -0,0 +1,688 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
+
+static void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+                               int32_t src_stride) {
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+  v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30;
+  v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
+  v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64,
+                 -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
+  v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64,
+                   cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 };
+  v8i16 coeff2 = { -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64,
+                   0, 0, 0, 0 };
+
+  LD_SH16(input, src_stride,
+          in0, in1, in2, in3, in4, in5, in6, in7,
+          in8, in9, in10, in11, in12, in13, in14, in15);
+  SLLI_4V(in0, in1, in2, in3, 2);
+  SLLI_4V(in4, in5, in6, in7, 2);
+  SLLI_4V(in8, in9, in10, in11, 2);
+  SLLI_4V(in12, in13, in14, in15, 2);
+  ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3);
+  ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7);
+  VP9_FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+                    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+  ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32);
+  SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12);
+  SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8);
+
+  tmp_ptr += 16;
+
+  /* stp 1 */
+  ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4);
+  ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5);
+
+  cnst4 = __msa_splati_h(coeff, 0);
+  stp25 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4);
+
+  cnst5 = __msa_splati_h(coeff, 1);
+  cnst5 = __msa_ilvev_h(cnst5, cnst4);
+  stp22 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5);
+  stp24 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4);
+  stp23 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5);
+
+  /* stp2 */
+  BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
+  BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
+  ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4);
+  ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5);
+  SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1);
+  cnst0 = __msa_ilvev_h(cnst0, cnst1);
+  stp26 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0);
+
+  cnst0 = __msa_splati_h(coeff, 4);
+  cnst1 = __msa_ilvev_h(cnst1, cnst0);
+  stp21 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1);
+
+  BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
+  ILVRL_H2_SH(in15, in8, vec1, vec0);
+  SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1);
+  cnst0 = __msa_ilvev_h(cnst0, cnst1);
+
+  in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+  ST_SH(in8, tmp_ptr);
+
+  cnst0 = __msa_splati_h(coeff2, 0);
+  cnst0 = __msa_ilvev_h(cnst1, cnst0);
+  in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+  ST_SH(in8, tmp_ptr + 224);
+
+  ILVRL_H2_SH(in14, in9, vec1, vec0);
+  SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1);
+  cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+  in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
+  ST_SH(in8, tmp_ptr + 128);
+
+  cnst1 = __msa_splati_h(coeff2, 2);
+  cnst0 = __msa_ilvev_h(cnst0, cnst1);
+  in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+  ST_SH(in8, tmp_ptr + 96);
+
+  SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1);
+  cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+  stp25 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
+
+  cnst1 = __msa_splati_h(coeff, 3);
+  cnst1 = __msa_ilvev_h(cnst0, cnst1);
+  stp22 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
+
+  /* stp4 */
+  ADD2(stp34, stp25, stp33, stp22, in13, in10);
+
+  ILVRL_H2_SH(in13, in10, vec1, vec0);
+  SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1);
+  cnst0 = __msa_ilvev_h(cnst0, cnst1);
+  in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+  ST_SH(in8, tmp_ptr + 64);
+
+  cnst0 = __msa_splati_h(coeff2, 1);
+  cnst0 = __msa_ilvev_h(cnst1, cnst0);
+  in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+  ST_SH(in8, tmp_ptr + 160);
+
+  SUB2(stp34, stp25, stp33, stp22, in12, in11);
+  ILVRL_H2_SH(in12, in11, vec1, vec0);
+  SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1);
+  cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+  in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
+  ST_SH(in8, tmp_ptr + 192);
+
+  cnst1 = __msa_splati_h(coeff2, 3);
+  cnst0 = __msa_ilvev_h(cnst0, cnst1);
+  in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+  ST_SH(in8, tmp_ptr + 32);
+}
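
The leading ADD4/SUB4 pairs in fdct8x16_1d_column are the usual stage-1 butterflies of a 16-point DCT: sums a[i] + a[15 - i] feed the even (8-point) half and differences a[i] - a[15 - i] feed the odd half. A minimal scalar sketch of that split, with illustrative names:

#include <stdio.h>

/* Stage-1 split of a 16-point forward DCT (scalar sketch). */
static void fdct16_stage1_split(const int a[16], int even[8], int odd[8]) {
  int i;
  for (i = 0; i < 8; ++i) {
    even[i] = a[i] + a[15 - i];  /* feeds the even (8-point) half */
    odd[i]  = a[i] - a[15 - i];  /* feeds the odd half            */
  }
}

int main(void) {
  int a[16], even[8], odd[8], i;
  for (i = 0; i < 16; ++i) a[i] = i;
  fdct16_stage1_split(a, even, odd);
  printf("even[0]=%d odd[0]=%d\n", even[0], odd[0]);  /* 15 and -15 */
  return 0;
}
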
+
+static void fdct16x8_1d_row(int16_t *input, int16_t *output) {
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+
+  LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7);
+  LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+                     in8, in9, in10, in11, in12, in13, in14, in15);
+  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+  ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
+  ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11);
+  ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15);
+  SRA_4V(in0, in1, in2, in3, 2);
+  SRA_4V(in4, in5, in6, in7, 2);
+  SRA_4V(in8, in9, in10, in11, 2);
+  SRA_4V(in12, in13, in14, in15, 2);
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+               in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5,
+               tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
+  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16);
+  VP9_FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+                    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+  LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+  VP9_FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15,
+                   in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3,
+                     tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3);
+  ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16);
+  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7,
+                     tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7);
+  ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16);
+}
+
+void vp9_fdct16x16_msa(const int16_t *input, int16_t *output,
+                       int32_t src_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);
+
+  /* column transform */
+  for (i = 0; i < 2; ++i) {
+    fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride);
+  }
+
+  /* row transform */
+  for (i = 0; i < 2; ++i) {
+    fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
+  }
+}
+
+void vp9_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
+  out[1] = 0;
+
+  out[0] = VP9_LD_HADD(input, stride);
+  out[0] += VP9_LD_HADD(input + 8, stride);
+  out[0] += VP9_LD_HADD(input + 16 * 8, stride);
+  out[0] += VP9_LD_HADD(input + 16 * 8 + 8, stride);
+  out[0] >>= 1;
+}
+
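
vp9_fdct16x16_1_msa above sums the block in four 8x8 chunks and halves the total; a plain-C sketch of the same result follows (the helper name is illustrative, not the project's C reference):

#include <stdint.h>
#include <stdio.h>

/* DC-only 16x16 forward transform: sum of the residual block, halved. */
static void fdct16x16_1_ref(const int16_t *input, int16_t *out, int stride) {
  int sum = 0, r, c;
  for (r = 0; r < 16; ++r)
    for (c = 0; c < 16; ++c)
      sum += input[r * stride + c];
  out[0] = (int16_t)(sum >> 1);
  out[1] = 0;
}

int main(void) {
  int16_t block[16 * 16], out[2];
  int i;
  for (i = 0; i < 16 * 16; ++i) block[i] = 1;  /* all-ones residual */
  fdct16x16_1_ref(block, out, 16);
  printf("dc=%d\n", out[0]);                   /* 256 >> 1 = 128 */
  return 0;
}
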
+static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride,
+                                   const int32_t *const0, int16_t *int_buf) {
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
+  v4i32 k0, k1, k2, k3;
+
+  /* load input data */
+  r0 = LD_SH(input);
+  r15 = LD_SH(input + 15 * stride);
+  r7 = LD_SH(input + 7 * stride);
+  r8 = LD_SH(input + 8 * stride);
+  SLLI_4V(r0, r15, r7, r8, 2);
+
+  /* stage 1 */
+  LD_SW2(const0, 4, k0, k1);
+  LD_SW2(const0 + 8, 4, k2, k3);
+  VP9_MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+
+  r3 = LD_SH(input + 3 * stride);
+  r4 = LD_SH(input + 4 * stride);
+  r11 = LD_SH(input + 11 * stride);
+  r12 = LD_SH(input + 12 * stride);
+  SLLI_4V(r3, r4, r11, r12, 2);
+
+  LD_SW2(const0 + 4 * 4, 4, k0, k1);
+  LD_SW2(const0 + 4 * 6, 4, k2, k3);
+  VP9_MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+
+  /* stage 2 */
+  BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
+  ST_SH2(tp0, tp2, int_buf, 8);
+  ST_SH2(tp1, tp3, int_buf + 4 * 8, 8);
+
+  LD_SW2(const0 + 4 * 8, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 10);
+  VP9_MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+
+  ST_SH2(h0, h1, int_buf + 8 * 8, 8);
+  ST_SH2(h3, h2, int_buf + 12 * 8, 8);
+
+  r9 = LD_SH(input + 9 * stride);
+  r6 = LD_SH(input + 6 * stride);
+  r1 = LD_SH(input + stride);
+  r14 = LD_SH(input + 14 * stride);
+  SLLI_4V(r9, r6, r1, r14, 2);
+
+  LD_SW2(const0 + 4 * 11, 4, k0, k1);
+  LD_SW2(const0 + 4 * 13, 4, k2, k3);
+  VP9_MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
+
+  ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
+
+  r13 = LD_SH(input + 13 * stride);
+  r2 = LD_SH(input + 2 * stride);
+  r5 = LD_SH(input + 5 * stride);
+  r10 = LD_SH(input + 10 * stride);
+  SLLI_4V(r13, r2, r5, r10, 2);
+
+  LD_SW2(const0 + 4 * 15, 4, k0, k1);
+  LD_SW2(const0 + 4 * 17, 4, k2, k3);
+  VP9_MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
+
+  ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
+
+  BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
+  ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
+}
+
+static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0,
+                                   int16_t *out) {
+  int16_t *out_ptr = out + 128;
+  v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
+  v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
+  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+  v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+  v4i32 k0, k1, k2, k3;
+
+  LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15);
+  LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7);
+  LD_SW2(const0 + 4 * 19, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 21);
+  VP9_MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+
+  tp0 = LD_SH(int_buf + 4 * 8);
+  tp1 = LD_SH(int_buf + 5 * 8);
+  tp3 = LD_SH(int_buf + 10 * 8);
+  tp2 = LD_SH(int_buf + 14 * 8);
+  LD_SW2(const0 + 4 * 22, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 24);
+  VP9_MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
+  out4 = -out4;
+  ST_SH(out4, (out + 3 * 16));
+  ST_SH(out5, (out_ptr + 4 * 16));
+
+  h1 = LD_SH(int_buf + 9 * 8);
+  h3 = LD_SH(int_buf + 12 * 8);
+  VP9_MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+  out13 = -out13;
+  ST_SH(out12, (out + 2 * 16));
+  ST_SH(out13, (out_ptr + 5 * 16));
+
+  tp0 = LD_SH(int_buf);
+  tp1 = LD_SH(int_buf + 8);
+  tp2 = LD_SH(int_buf + 2 * 8);
+  tp3 = LD_SH(int_buf + 6 * 8);
+
+  BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
+  out1 = -out1;
+  ST_SH(out0, (out));
+  ST_SH(out1, (out_ptr + 7 * 16));
+
+  h0 = LD_SH(int_buf + 8 * 8);
+  h2 = LD_SH(int_buf + 13 * 8);
+
+  BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+  out8 = -out8;
+  ST_SH(out8, (out + 16));
+  ST_SH(out9, (out_ptr + 6 * 16));
+
+  /* stage 4 */
+  LD_SW2(const0 + 4 * 25, 4, k0, k1);
+  LD_SW2(const0 + 4 * 27, 4, k2, k3);
+  VP9_MADD_SHORT(h10, h11, k1, k2, out2, out3);
+  ST_SH(out2, (out + 7 * 16));
+  ST_SH(out3, (out_ptr));
+
+  VP9_MADD_SHORT(out6, out7, k0, k3, out6, out7);
+  ST_SH(out6, (out + 4 * 16));
+  ST_SH(out7, (out_ptr + 3 * 16));
+
+  VP9_MADD_SHORT(out10, out11, k0, k3, out10, out11);
+  ST_SH(out10, (out + 6 * 16));
+  ST_SH(out11, (out_ptr + 16));
+
+  VP9_MADD_SHORT(out14, out15, k1, k2, out14, out15);
+  ST_SH(out14, (out + 5 * 16));
+  ST_SH(out15, (out_ptr + 2 * 16));
+}
+
+static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) {
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+
+  /* load input data */
+  LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
+  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+                     r0, r1, r2, r3, r4, r5, r6, r7);
+  VP9_FDCT_POSTPROC_2V_NEG_H(r0, r1);
+  VP9_FDCT_POSTPROC_2V_NEG_H(r2, r3);
+  VP9_FDCT_POSTPROC_2V_NEG_H(r4, r5);
+  VP9_FDCT_POSTPROC_2V_NEG_H(r6, r7);
+  ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
+  out += 64;
+
+  LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
+  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+                     r8, r9, r10, r11, r12, r13, r14, r15);
+  VP9_FDCT_POSTPROC_2V_NEG_H(r8, r9);
+  VP9_FDCT_POSTPROC_2V_NEG_H(r10, r11);
+  VP9_FDCT_POSTPROC_2V_NEG_H(r12, r13);
+  VP9_FDCT_POSTPROC_2V_NEG_H(r14, r15);
+  ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
+  out += 64;
+
+  /* load input data */
+  input += 128;
+  LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
+  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+                     r0, r1, r2, r3, r4, r5, r6, r7);
+  VP9_FDCT_POSTPROC_2V_NEG_H(r0, r1);
+  VP9_FDCT_POSTPROC_2V_NEG_H(r2, r3);
+  VP9_FDCT_POSTPROC_2V_NEG_H(r4, r5);
+  VP9_FDCT_POSTPROC_2V_NEG_H(r6, r7);
+  ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
+  out += 64;
+
+  LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
+  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+                     r8, r9, r10, r11, r12, r13, r14, r15);
+  VP9_FDCT_POSTPROC_2V_NEG_H(r8, r9);
+  VP9_FDCT_POSTPROC_2V_NEG_H(r10, r11);
+  VP9_FDCT_POSTPROC_2V_NEG_H(r12, r13);
+  VP9_FDCT_POSTPROC_2V_NEG_H(r14, r15);
+  ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
+}
+
+static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0,
+                                   int16_t *int_buf) {
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
+  v4i32 k0, k1, k2, k3;
+
+  /* load input data */
+  r0 = LD_SH(input);
+  r7 = LD_SH(input + 7 * 8);
+  r8 = LD_SH(input + 8 * 8);
+  r15 = LD_SH(input + 15 * 8);
+
+  /* stage 1 */
+  LD_SW2(const0, 4, k0, k1);
+  LD_SW2(const0 + 4 * 2, 4, k2, k3);
+  VP9_MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+
+  r3 = LD_SH(input + 3 * 8);
+  r4 = LD_SH(input + 4 * 8);
+  r11 = LD_SH(input + 11 * 8);
+  r12 = LD_SH(input + 12 * 8);
+
+  LD_SW2(const0 + 4 * 4, 4, k0, k1);
+  LD_SW2(const0 + 4 * 6, 4, k2, k3);
+  VP9_MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+
+  /* stage 2 */
+  BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
+  ST_SH2(tp0, tp1, int_buf, 4 * 8);
+  ST_SH2(tp2, tp3, int_buf + 8, 4 * 8);
+
+  LD_SW2(const0 + 4 * 8, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 10);
+  VP9_MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+  ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8);
+  ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8);
+
+  r1 = LD_SH(input + 8);
+  r6 = LD_SH(input + 6 * 8);
+  r9 = LD_SH(input + 9 * 8);
+  r14 = LD_SH(input + 14 * 8);
+
+  LD_SW2(const0 + 4 * 11, 4, k0, k1);
+  LD_SW2(const0 + 4 * 13, 4, k2, k3);
+  VP9_MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
+  ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
+
+  r2 = LD_SH(input + 2 * 8);
+  r5 = LD_SH(input + 5 * 8);
+  r10 = LD_SH(input + 10 * 8);
+  r13 = LD_SH(input + 13 * 8);
+
+  LD_SW2(const0 + 4 * 15, 4, k0, k1);
+  LD_SW2(const0 + 4 * 17, 4, k2, k3);
+  VP9_MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
+  ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
+  BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
+  ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
+}
+
+static void fadst16_rows_step2_msa(int16_t *int_buf, const int32_t *const0,
+                                   int16_t *out) {
+  int16_t *out_ptr = out + 8;
+  v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
+  v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
+  v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+  v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+  v4i32 k0, k1, k2, k3;
+
+  g13 = LD_SH(int_buf + 3 * 8);
+  g15 = LD_SH(int_buf + 7 * 8);
+  g5 = LD_SH(int_buf + 11 * 8);
+  g7 = LD_SH(int_buf + 15 * 8);
+
+  LD_SW2(const0 + 4 * 19, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 21);
+  VP9_MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+
+  tp0 = LD_SH(int_buf + 4 * 8);
+  tp1 = LD_SH(int_buf + 5 * 8);
+  tp3 = LD_SH(int_buf + 10 * 8);
+  tp2 = LD_SH(int_buf + 14 * 8);
+
+  LD_SW2(const0 + 4 * 22, 4, k0, k1);
+  k2 = LD_SW(const0 + 4 * 24);
+  VP9_MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
+  out4 = -out4;
+  ST_SH(out4, (out + 3 * 16));
+  ST_SH(out5, (out_ptr + 4 * 16));
+
+  h1 = LD_SH(int_buf + 9 * 8);
+  h3 = LD_SH(int_buf + 12 * 8);
+  VP9_MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+  out13 = -out13;
+  ST_SH(out12, (out + 2 * 16));
+  ST_SH(out13, (out_ptr + 5 * 16));
+
+  tp0 = LD_SH(int_buf);
+  tp1 = LD_SH(int_buf + 8);
+  tp2 = LD_SH(int_buf + 2 * 8);
+  tp3 = LD_SH(int_buf + 6 * 8);
+
+  BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
+  out1 = -out1;
+  ST_SH(out0, (out));
+  ST_SH(out1, (out_ptr + 7 * 16));
+
+  h0 = LD_SH(int_buf + 8 * 8);
+  h2 = LD_SH(int_buf + 13 * 8);
+  BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+  out8 = -out8;
+  ST_SH(out8, (out + 16));
+  ST_SH(out9, (out_ptr + 6 * 16));
+
+  /* stage 4 */
+  LD_SW2(const0 + 4 * 25, 4, k0, k1);
+  LD_SW2(const0 + 4 * 27, 4, k2, k3);
+  VP9_MADD_SHORT(h10, h11, k1, k2, out2, out3);
+  ST_SH(out2, (out + 7 * 16));
+  ST_SH(out3, (out_ptr));
+
+  VP9_MADD_SHORT(out6, out7, k0, k3, out6, out7);
+  ST_SH(out6, (out + 4 * 16));
+  ST_SH(out7, (out_ptr + 3 * 16));
+
+  VP9_MADD_SHORT(out10, out11, k0, k3, out10, out11);
+  ST_SH(out10, (out + 6 * 16));
+  ST_SH(out11, (out_ptr + 16));
+
+  VP9_MADD_SHORT(out14, out15, k1, k2, out14, out15);
+  ST_SH(out14, (out + 5 * 16));
+  ST_SH(out15, (out_ptr + 2 * 16));
+}
+
+static void fadst16_transpose_msa(int16_t *input, int16_t *out) {
+  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+  v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+
+  /* load input data */
+  LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11,
+          l4, l12, l5, l13, l6, l14, l7, l15);
+  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+                     r0, r1, r2, r3, r4, r5, r6, r7);
+  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+                     r8, r9, r10, r11, r12, r13, r14, r15);
+  ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
+  ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
+  out += 16 * 8;
+
+  /* load input data */
+  input += 128;
+  LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11,
+          l4, l12, l5, l13, l6, l14, l7, l15);
+  TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+                     r0, r1, r2, r3, r4, r5, r6, r7);
+  TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+                     r8, r9, r10, r11, r12, r13, r14, r15);
+  ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
+  ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
+}
+
+static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) {
+  int16_t *temp = intermediate;
+  int16_t *out = output;
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11;
+  v8i16 in12, in13, in14, in15;
+
+  LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7);
+  temp = intermediate + 8;
+  LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+                     in8, in9, in10, in11, in12, in13, in14, in15);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in0, in1);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in2, in3);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in6, in7);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in8, in9);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in10, in11);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in12, in13);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in14, in15);
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+               in8, in9, in10, in11, in12, in13, in14, in15,
+               tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+               in8, in9, in10, in11, in12, in13, in14, in15);
+  temp = intermediate;
+  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16);
+  VP9_FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+                    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+  temp = intermediate;
+  LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+  VP9_FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15,
+                   in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3,
+                     tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3);
+  ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16);
+  TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7,
+                     tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7);
+  out = output + 8;
+  ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16);
+}
+
+void vp9_fht16x16_msa(const int16_t *input, int16_t *output,
+                      int32_t stride, int32_t tx_type) {
+  DECLARE_ALIGNED(32, int16_t, tmp[256]);
+  DECLARE_ALIGNED(32, int16_t, trans_buf[256]);
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[128]);
+  int32_t i;
+  int16_t *ptmpbuf = &tmp_buf[0];
+  int16_t *trans = &trans_buf[0];
+  const int32_t const_arr[29 * 4] = {
+    52707308, 52707308, 52707308, 52707308,
+    -1072430300, -1072430300, -1072430300, -1072430300,
+    795618043, 795618043, 795618043, 795618043,
+    -721080468, -721080468, -721080468, -721080468,
+    459094491, 459094491, 459094491, 459094491,
+    -970646691, -970646691, -970646691, -970646691,
+    1010963856, 1010963856, 1010963856, 1010963856,
+    -361743294, -361743294, -361743294, -361743294,
+    209469125, 209469125, 209469125, 209469125,
+    -1053094788, -1053094788, -1053094788, -1053094788,
+    1053160324, 1053160324, 1053160324, 1053160324,
+    639644520, 639644520, 639644520, 639644520,
+    -862444000, -862444000, -862444000, -862444000,
+    1062144356, 1062144356, 1062144356, 1062144356,
+    -157532337, -157532337, -157532337, -157532337,
+    260914709, 260914709, 260914709, 260914709,
+    -1041559667, -1041559667, -1041559667, -1041559667,
+    920985831, 920985831, 920985831, 920985831,
+    -551995675, -551995675, -551995675, -551995675,
+    596522295, 596522295, 596522295, 596522295,
+    892853362, 892853362, 892853362, 892853362,
+    -892787826, -892787826, -892787826, -892787826,
+    410925857, 410925857, 410925857, 410925857,
+    -992012162, -992012162, -992012162, -992012162,
+    992077698, 992077698, 992077698, 992077698,
+    759246145, 759246145, 759246145, 759246145,
+    -759180609, -759180609, -759180609, -759180609,
+    -759222975, -759222975, -759222975, -759222975,
+    759288511, 759288511, 759288511, 759288511 };
+
+  switch (tx_type) {
+    case DCT_DCT:
+      /* column transform */
+      for (i = 0; i < 2; ++i) {
+        fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
+      }
+
+      /* row transform */
+      for (i = 0; i < 2; ++i) {
+        fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
+      }
+      break;
+    case ADST_DCT:
+      /* column transform */
+      for (i = 0; i < 2; ++i) {
+        fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
+        fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
+      }
+
+      /* row transform */
+      for (i = 0; i < 2; ++i) {
+        postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
+      }
+      break;
+    case DCT_ADST:
+      /* column transform */
+      for (i = 0; i < 2; ++i) {
+        fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
+      }
+
+      fadst16_transpose_postproc_msa(tmp, trans);
+
+      /* row transform */
+      for (i = 0; i < 2; ++i) {
+        fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
+        fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
+      }
+
+      fadst16_transpose_msa(tmp, output);
+      break;
+    case ADST_ADST:
+      /* column transform */
+      for (i = 0; i < 2; ++i) {
+        fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
+        fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
+      }
+
+      fadst16_transpose_postproc_msa(tmp, trans);
+
+      /* row transform */
+      for (i = 0; i < 2; ++i) {
+        fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
+        fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
+      }
+
+      fadst16_transpose_msa(tmp, output);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
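
vp9_fht16x16_msa dispatches on tx_type so that the first half of the name selects the column (vertical) kernel and the second half the row kernel, with DCT_DCT falling back to the plain 16x16 fDCT. A compact scalar sketch of that separable structure; the 1-D kernels here are placeholders, not the MSA implementations:

#include <string.h>

enum { DCT_DCT, ADST_DCT, DCT_ADST, ADST_ADST };

typedef void (*tx1d_fn)(const int *in, int *out);  /* 16-point 1-D kernel */

/* Placeholder kernels; the real ones are fixed-point DCT/ADST butterflies. */
static void fdct16_1d(const int *in, int *out)  { memcpy(out, in, 16 * sizeof(*in)); }
static void fadst16_1d(const int *in, int *out) { memcpy(out, in, 16 * sizeof(*in)); }

static void fht16x16_sketch(const int *input, int *output, int stride,
                            int tx_type) {
  const tx1d_fn col =
      (tx_type == DCT_DCT || tx_type == DCT_ADST) ? fdct16_1d : fadst16_1d;
  const tx1d_fn row =
      (tx_type == DCT_DCT || tx_type == ADST_DCT) ? fdct16_1d : fadst16_1d;
  int tmp[16 * 16], colbuf[16], out1d[16];
  int i, j;

  for (i = 0; i < 16; ++i) {  /* 1-D transform down each column */
    for (j = 0; j < 16; ++j) colbuf[j] = input[j * stride + i];
    col(colbuf, out1d);
    for (j = 0; j < 16; ++j) tmp[j * 16 + i] = out1d[j];
  }
  for (i = 0; i < 16; ++i)    /* 1-D transform across each row */
    row(&tmp[i * 16], &output[i * 16]);
}

int main(void) {
  int in[16 * 16] = { 0 }, out[16 * 16];
  fht16x16_sketch(in, out, 16, ADST_DCT);
  return out[0];
}
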
diff --git a/vp9/encoder/mips/msa/vp9_fdct32x32_msa.c b/vp9/encoder/mips/msa/vp9_fdct32x32_msa.c
new file mode 100644
index 0000000..3a74023
--- /dev/null
+++ b/vp9/encoder/mips/msa/vp9_fdct32x32_msa.c
@@ -0,0 +1,956 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
+
+static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
+                                              int32_t src_stride,
+                                              int16_t *temp_buff) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 step0, step1, step2, step3;
+  v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+  v8i16 step0_1, step1_1, step2_1, step3_1;
+
+  /* 1st and 2nd set */
+  LD_SH4(input, src_stride, in0, in1, in2, in3);
+  LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7);
+  LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
+  LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
+  SLLI_4V(in0, in1, in2, in3, 2);
+  SLLI_4V(in4, in5, in6, in7, 2);
+  SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
+  SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
+  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,
+              step0, step1, step2, step3, in4, in5, in6, in7);
+  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+              step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
+  ST_SH4(step0, step1, step2, step3, temp_buff, 8);
+  ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8);
+  ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8);
+  ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8);
+
+  /* 3rd and 4th set */
+  LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3);
+  LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7);
+  LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
+  LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
+  SLLI_4V(in0, in1, in2, in3, 2);
+  SLLI_4V(in4, in5, in6, in7, 2);
+  SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
+  SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
+  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,
+              step0, step1, step2, step3, in4, in5, in6, in7);
+  BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+              step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
+  ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8);
+  ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8);
+  ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8);
+  ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8);
+}
+
+static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8i16 temp0, temp1;
+
+  /* fdct even */
+  LD_SH4(input, 8, in0, in1, in2, in3);
+  LD_SH4(input + 96, 8, in12, in13, in14, in15);
+  BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15,
+              vec0, vec1, vec2, vec3, in12, in13, in14, in15);
+  LD_SH4(input + 32, 8, in4, in5, in6, in7);
+  LD_SH4(input + 64, 8, in8, in9, in10, in11);
+  BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11,
+              vec4, vec5, vec6, vec7, in8, in9, in10, in11);
+
+  /* Stage 3 */
+  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
+  BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
+  VP9_DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+  VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp);
+  ST_SH(temp1, temp + 512);
+
+  VP9_DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+  VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 256);
+  ST_SH(temp1, temp + 768);
+
+  SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4);
+  VP9_DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+  VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+  VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 128);
+  ST_SH(temp1, temp + 896);
+
+  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+  VP9_DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+  VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 640);
+  ST_SH(temp1, temp + 384);
+
+  VP9_DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  VP9_DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+  VP9_DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  ADD2(in0, in1, in2, in3, vec0, vec7);
+  VP9_DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+  VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 64);
+  ST_SH(temp1, temp + 960);
+
+  SUB2(in0, in1, in2, in3, in0, in2);
+  VP9_DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+  VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 576);
+  ST_SH(temp1, temp + 448);
+
+  SUB2(in9, vec2, in14, vec5, vec2, vec5);
+  VP9_DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+  VP9_DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+  VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 320);
+  ST_SH(temp1, temp + 704);
+
+  ADD2(in3, in2, in0, in1, vec3, vec4);
+  VP9_DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+  VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  ST_SH(temp0, temp + 192);
+  ST_SH(temp1, temp + 832);
+}
+
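+/* Odd half of the 32-point column DCT; the 16 odd-indexed output rows are
+ * stored, and part of 'input' is reused as scratch for the differences. */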
+static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
+  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
+  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+
+  in20 = LD_SH(input + 32);
+  in21 = LD_SH(input + 40);
+  in26 = LD_SH(input + 80);
+  in27 = LD_SH(input + 88);
+
+  VP9_DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+  VP9_DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+  in18 = LD_SH(input + 16);
+  in19 = LD_SH(input + 24);
+  in28 = LD_SH(input + 96);
+  in29 = LD_SH(input + 104);
+
+  vec4 = in19 - in20;
+  ST_SH(vec4, input + 32);
+  vec4 = in18 - in21;
+  ST_SH(vec4, input + 40);
+  vec4 = in29 - in26;
+  ST_SH(vec4, input + 80);
+  vec4 = in28 - in27;
+  ST_SH(vec4, input + 88);
+
+  in21 = in18 + in21;
+  in20 = in19 + in20;
+  in27 = in28 + in27;
+  in26 = in29 + in26;
+
+  LD_SH4(input + 48, 8, in22, in23, in24, in25);
+  VP9_DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+  VP9_DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+  in16 = LD_SH(input);
+  in17 = LD_SH(input + 8);
+  in30 = LD_SH(input + 112);
+  in31 = LD_SH(input + 120);
+
+  vec4 = in17 - in22;
+  ST_SH(vec4, input + 16);
+  vec4 = in16 - in23;
+  ST_SH(vec4, input + 24);
+  vec4 = in31 - in24;
+  ST_SH(vec4, input + 96);
+  vec4 = in30 - in25;
+  ST_SH(vec4, input + 104);
+
+  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
+  VP9_DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+  VP9_DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
+  VP9_DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+  ADD2(in27, in26, in25, in24, in23, in20);
+  VP9_DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+  VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec5, temp_ptr);
+  ST_SH(vec4, temp_ptr + 960);
+
+  SUB2(in27, in26, in25, in24, in22, in21);
+  VP9_DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+  VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec5, temp_ptr + 448);
+  ST_SH(vec4, temp_ptr + 512);
+
+  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
+  VP9_DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
+  SUB2(in26, in27, in24, in25, in23, in20);
+  VP9_DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+  VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec4, temp_ptr + 704);
+  ST_SH(vec5, temp_ptr + 256);
+
+  ADD2(in26, in27, in24, in25, in22, in21);
+  VP9_DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+  VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec4, temp_ptr + 192);
+  ST_SH(vec5, temp_ptr + 768);
+
+  LD_SH4(input + 16, 8, in22, in23, in20, in21);
+  LD_SH4(input + 80, 8, in26, in27, in24, in25);
+  in16 = in20;
+  in17 = in21;
+  VP9_DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
+  VP9_DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
+  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
+  VP9_DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+  ADD2(in28, in29, in31, in30, in16, in19);
+  VP9_DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+  VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec5, temp_ptr + 832);
+  ST_SH(vec4, temp_ptr + 128);
+
+  SUB2(in28, in29, in31, in30, in17, in18);
+  VP9_DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+  VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec5, temp_ptr + 320);
+  ST_SH(vec4, temp_ptr + 640);
+  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
+  VP9_DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
+  SUB2(in29, in28, in30, in31, in16, in19);
+  VP9_DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+  VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec5, temp_ptr + 576);
+  ST_SH(vec4, temp_ptr + 384);
+
+  ADD2(in29, in28, in30, in31, in17, in18);
+  VP9_DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+  VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  ST_SH(vec5, temp_ptr + 64);
+  ST_SH(vec4, temp_ptr + 896);
+}
+
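+/* One column pass: stage-1 butterflies into tmp_buf, then the even and odd
+ * halves of the 32-point DCT into the 32x32 tmp_buf_big. */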
+static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride,
+                               int16_t *tmp_buf, int16_t *tmp_buf_big) {
+  fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf);
+  fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big);
+  fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32));
+}
+
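+/* Row pass, stage 1: transpose an 8x32 strip and apply the in[j] +/-
+ * in[31 - j] butterflies ahead of the 32-point row DCT. */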
+static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
+                                           int16_t *output) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+  v8i16 step0, step1, step2, step3, step4, step5, step6, step7;
+
+  LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7);
+  LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+                     in8, in9, in10, in11, in12, in13, in14, in15);
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+               in8, in9, in10, in11, in12, in13, in14, in15,
+               step0, step1, step2, step3, step4, step5, step6, step7,
+               in8, in9, in10, in11, in12, in13, in14, in15);
+  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8);
+  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8);
+
+  /* 2nd set */
+  LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
+  LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+                     in8, in9, in10, in11, in12, in13, in14, in15);
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+               in8, in9, in10, in11, in12, in13, in14, in15,
+               step0, step1, step2, step3, step4, step5, step6, step7,
+               in8, in9, in10, in11, in12, in13, in14, in15);
+  ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7,
+         (output + 8 * 8), 8);
+  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8);
+}
+
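+/* Even half of the 32-point row DCT, higher-precision variant: the first
+ * stage-3 group is kept in 32-bit lanes before rounding; used only for the
+ * first 8-row strip of vp9_fdct32x32_msa. */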
+static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
+                                    int16_t *out) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
+  v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
+  v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w;
+
+  /* fdct32 even */
+  /* stage 2 */
+  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+  LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+               in8, in9, in10, in11, in12, in13, in14, in15,
+               vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+               in8, in9, in10, in11, in12, in13, in14, in15);
+  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
+  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);
+
+  /* Stage 3 */
+  UNPCK_SH_SW(vec0, vec0_l, vec0_r);
+  UNPCK_SH_SW(vec1, vec1_l, vec1_r);
+  UNPCK_SH_SW(vec2, vec2_l, vec2_r);
+  UNPCK_SH_SW(vec3, vec3_l, vec3_r);
+  UNPCK_SH_SW(vec4, vec4_l, vec4_r);
+  UNPCK_SH_SW(vec5, vec5_l, vec5_r);
+  UNPCK_SH_SW(vec6, vec6_l, vec6_r);
+  UNPCK_SH_SW(vec7, vec7_l, vec7_r);
+  ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r,
+       tmp0_w, tmp1_w, tmp2_w, tmp3_w);
+  BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
+  ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l,
+       vec0_r, vec1_r, vec2_r, vec3_r);
+
+  tmp3_w = vec0_r + vec3_r;
+  vec0_r = vec0_r - vec3_r;
+  vec3_r = vec1_r + vec2_r;
+  vec1_r = vec1_r - vec2_r;
+
+  VP9_DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64,
+                        cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r);
+  VP9_FDCT32_POSTPROC_NEG_W(vec4_r);
+  VP9_FDCT32_POSTPROC_NEG_W(tmp3_w);
+  VP9_FDCT32_POSTPROC_NEG_W(vec6_r);
+  VP9_FDCT32_POSTPROC_NEG_W(vec3_r);
+  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+  ST_SH2(vec5, vec4, out, 8);
+
+  VP9_DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64,
+                        cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r);
+  VP9_FDCT32_POSTPROC_NEG_W(vec4_r);
+  VP9_FDCT32_POSTPROC_NEG_W(tmp3_w);
+  VP9_FDCT32_POSTPROC_NEG_W(vec6_r);
+  VP9_FDCT32_POSTPROC_NEG_W(vec3_r);
+  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+  ST_SH2(vec5, vec4, out + 16, 8);
+
+  LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
+  VP9_DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+  VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  ST_SH(in4, out + 32);
+  ST_SH(in5, out + 56);
+
+  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+  VP9_DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  ST_SH(in4, out + 40);
+  ST_SH(in5, out + 48);
+
+  LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+  VP9_DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  VP9_DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+  VP9_DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  ADD2(in0, in1, in2, in3, vec0, vec7);
+  VP9_DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  ST_SH(in4, out + 64);
+  ST_SH(in5, out + 120);
+
+  SUB2(in0, in1, in2, in3, in0, in2);
+  VP9_DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  ST_SH(in4, out + 72);
+  ST_SH(in5, out + 112);
+
+  SUB2(in9, vec2, in14, vec5, vec2, vec5);
+  VP9_DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+  VP9_DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  ST_SH(in4, out + 80);
+  ST_SH(in5, out + 104);
+
+  ADD2(in3, in2, in0, in1, vec3, vec4);
+  VP9_DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  ST_SH(in4, out + 96);
+  ST_SH(in5, out + 88);
+}
+
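+/* Even half of the 32-point row DCT, 16-bit path, with per-output
+ * (vec + 1 + (vec < 0)) >> 2 rounding. */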
+static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+  /* fdct32 even */
+  /* stage 2 */
+  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+               in8, in9, in10, in11, in12, in13, in14, in15,
+               vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+               in8, in9, in10, in11, in12, in13, in14, in15);
+
+  /* Stage 3 */
+  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
+  BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
+  VP9_DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+  VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out);
+  ST_SH(temp1, out + 8);
+
+  VP9_DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+  VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 16);
+  ST_SH(temp1, out + 24);
+
+  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
+  VP9_DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+  VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+  VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 32);
+  ST_SH(temp1, out + 56);
+
+  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+  VP9_DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+  VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 40);
+  ST_SH(temp1, out + 48);
+
+  VP9_DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  VP9_DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+  VP9_DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  ADD2(in0, in1, in2, in3, vec0, vec7);
+  VP9_DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+  VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 64);
+  ST_SH(temp1, out + 120);
+
+  SUB2(in0, in1, in2, in3, in0, in2);
+  VP9_DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+  VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 72);
+  ST_SH(temp1, out + 112);
+
+  SUB2(in9, vec2, in14, vec5, vec2, vec5);
+  VP9_DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+  VP9_DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+  VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 80);
+  ST_SH(temp1, out + 104);
+
+  ADD2(in3, in2, in0, in1, vec3, vec4);
+  VP9_DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+  VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  ST_SH(temp0, out + 96);
+  ST_SH(temp1, out + 88);
+}
+
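+/* Odd half of the 32-point row DCT; early differences are spilled to
+ * interm_ptr and reloaded for the second group of outputs. */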
+static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr,
+                                int16_t *out) {
+  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
+  v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+
+  in20 = LD_SH(temp + 32);
+  in21 = LD_SH(temp + 40);
+  in26 = LD_SH(temp + 80);
+  in27 = LD_SH(temp + 88);
+
+  VP9_DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+  VP9_DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+  in18 = LD_SH(temp + 16);
+  in19 = LD_SH(temp + 24);
+  in28 = LD_SH(temp + 96);
+  in29 = LD_SH(temp + 104);
+
+  vec4 = in19 - in20;
+  ST_SH(vec4, interm_ptr + 32);
+  vec4 = in18 - in21;
+  ST_SH(vec4, interm_ptr + 88);
+  vec4 = in28 - in27;
+  ST_SH(vec4, interm_ptr + 56);
+  vec4 = in29 - in26;
+  ST_SH(vec4, interm_ptr + 64);
+
+  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);
+
+  in22 = LD_SH(temp + 48);
+  in23 = LD_SH(temp + 56);
+  in24 = LD_SH(temp + 64);
+  in25 = LD_SH(temp + 72);
+
+  VP9_DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+  VP9_DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+  in16 = LD_SH(temp);
+  in17 = LD_SH(temp + 8);
+  in30 = LD_SH(temp + 112);
+  in31 = LD_SH(temp + 120);
+
+  vec4 = in17 - in22;
+  ST_SH(vec4, interm_ptr + 40);
+  vec4 = in30 - in25;
+  ST_SH(vec4, interm_ptr + 48);
+  vec4 = in31 - in24;
+  ST_SH(vec4, interm_ptr + 72);
+  vec4 = in16 - in23;
+  ST_SH(vec4, interm_ptr + 80);
+
+  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
+  VP9_DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+  VP9_DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+
+  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
+  VP9_DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+  ADD2(in27, in26, in25, in24, in23, in20);
+
+  VP9_DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+  VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec5, out);
+  ST_SH(vec4, out + 120);
+
+  SUB2(in27, in26, in25, in24, in22, in21);
+
+  VP9_DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+  VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec5, out + 112);
+  ST_SH(vec4, out + 8);
+
+  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
+  VP9_DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
+  SUB2(in26, in27, in24, in25, in23, in20);
+
+  VP9_DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+  VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec4, out + 16);
+  ST_SH(vec5, out + 104);
+
+  ADD2(in26, in27, in24, in25, in22, in21);
+  VP9_DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+  VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec4, out + 24);
+  ST_SH(vec5, out + 96);
+
+  in20 = LD_SH(interm_ptr + 32);
+  in21 = LD_SH(interm_ptr + 88);
+  in27 = LD_SH(interm_ptr + 56);
+  in26 = LD_SH(interm_ptr + 64);
+
+  in16 = in20;
+  in17 = in21;
+  VP9_DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
+  VP9_DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+  in22 = LD_SH(interm_ptr + 40);
+  in25 = LD_SH(interm_ptr + 48);
+  in24 = LD_SH(interm_ptr + 72);
+  in23 = LD_SH(interm_ptr + 80);
+
+  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
+  VP9_DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+  ADD2(in28, in29, in31, in30, in16, in19);
+  VP9_DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+  VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec5, out + 32);
+  ST_SH(vec4, out + 88);
+
+  SUB2(in28, in29, in31, in30, in17, in18);
+  VP9_DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+  VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec5, out + 40);
+  ST_SH(vec4, out + 80);
+
+  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
+  VP9_DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
+  SUB2(in29, in28, in30, in31, in16, in19);
+
+  VP9_DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+  VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec5, out + 72);
+  ST_SH(vec4, out + 48);
+
+  ADD2(in29, in28, in30, in31, in17, in18);
+
+  VP9_DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+  VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  ST_SH(vec4, out + 56);
+  ST_SH(vec5, out + 64);
+}
+
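+/* Transpose the four 8x8 tiles of a transformed 8x32 strip back to
+ * row-major order in the output buffer. */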
+static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+
+  /* 1st set */
+  in0 = LD_SH(temp);
+  in4 = LD_SH(temp + 32);
+  in2 = LD_SH(temp + 64);
+  in6 = LD_SH(temp + 96);
+  in1 = LD_SH(temp + 128);
+  in7 = LD_SH(temp + 152);
+  in3 = LD_SH(temp + 192);
+  in5 = LD_SH(temp + 216);
+
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+
+  /* 2nd set */
+  in0_1 = LD_SH(temp + 16);
+  in1_1 = LD_SH(temp + 232);
+  in2_1 = LD_SH(temp + 80);
+  in3_1 = LD_SH(temp + 168);
+  in4_1 = LD_SH(temp + 48);
+  in5_1 = LD_SH(temp + 176);
+  in6_1 = LD_SH(temp + 112);
+  in7_1 = LD_SH(temp + 240);
+
+  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32);
+  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+
+  /* 3rd set */
+  in0 = LD_SH(temp + 8);
+  in1 = LD_SH(temp + 136);
+  in2 = LD_SH(temp + 72);
+  in3 = LD_SH(temp + 200);
+  in4 = LD_SH(temp + 40);
+  in5 = LD_SH(temp + 208);
+  in6 = LD_SH(temp + 104);
+  in7 = LD_SH(temp + 144);
+
+  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+         output + 8, 32);
+  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+                     in0, in1, in2, in3, in4, in5, in6, in7);
+  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32);
+
+  /* 4th set */
+  in0_1 = LD_SH(temp + 24);
+  in1_1 = LD_SH(temp + 224);
+  in2_1 = LD_SH(temp + 88);
+  in3_1 = LD_SH(temp + 160);
+  in4_1 = LD_SH(temp + 56);
+  in5_1 = LD_SH(temp + 184);
+  in6_1 = LD_SH(temp + 120);
+  in7_1 = LD_SH(temp + 248);
+
+  TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+  ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+         output + 24, 32);
+}
+
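+/* Full row pass for one 8x32 strip: load/butterfly, even and odd halves,
+ * then transpose-store into the final coefficient layout. */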
+static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf,
+                            int16_t *output) {
+  fdct8x32_1d_row_load_butterfly(temp, temp_buf);
+  fdct8x32_1d_row_even(temp_buf, temp_buf);
+  fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
+  fdct8x32_1d_row_transpose_store(temp_buf, output);
+}
+
+static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf,
+                               int16_t *output) {
+  fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+  fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf);
+  fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128);
+  fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
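+/* 32x32 forward DCT: four column passes (8 columns each) fill tmp_buf_big,
+ * then four row passes (8 rows each) write the output; the first row strip
+ * uses the higher-precision _4x path. */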
+void vp9_fdct32x32_msa(const int16_t *input, int16_t *output,
+                       int32_t src_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+
+  /* column transform */
+  for (i = 0; i < 4; ++i) {
+    fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf,
+                       tmp_buf_big + (8 * i));
+  }
+
+  /* row transform */
+  fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output);
+
+  /* row transform */
+  for (i = 1; i < 4; ++i) {
+    fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256));
+  }
+}
+
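+/* DC-only 32x32 transform: sum the residual block as sixteen 8x8 tiles and
+ * scale by >> 3; out[1] is zeroed. */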
+void vp9_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
+  out[1] = 0;
+
+  out[0] = VP9_LD_HADD(input, stride);
+  out[0] += VP9_LD_HADD(input + 8, stride);
+  out[0] += VP9_LD_HADD(input + 16, stride);
+  out[0] += VP9_LD_HADD(input + 24, stride);
+  out[0] += VP9_LD_HADD(input + 32 * 8, stride);
+  out[0] += VP9_LD_HADD(input + 32 * 8 + 8, stride);
+  out[0] += VP9_LD_HADD(input + 32 * 8 + 16, stride);
+  out[0] += VP9_LD_HADD(input + 32 * 8 + 24, stride);
+  out[0] += VP9_LD_HADD(input + 32 * 16, stride);
+  out[0] += VP9_LD_HADD(input + 32 * 16 + 8, stride);
+  out[0] += VP9_LD_HADD(input + 32 * 16 + 16, stride);
+  out[0] += VP9_LD_HADD(input + 32 * 16 + 24, stride);
+  out[0] += VP9_LD_HADD(input + 32 * 24, stride);
+  out[0] += VP9_LD_HADD(input + 32 * 24 + 8, stride);
+  out[0] += VP9_LD_HADD(input + 32 * 24 + 16, stride);
+  out[0] += VP9_LD_HADD(input + 32 * 24 + 24, stride);
+  out[0] >>= 3;
+}
+
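+/* Even half of the row DCT for the reduced-range (_rd) variant: the 2-bit
+ * rounding is applied once, right after the stage-2 butterfly, instead of
+ * per final output, keeping later stages within 16-bit range. */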
+static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+  /* fdct32 even */
+  /* stage 2 */
+  LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+  LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+
+  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+               in8, in9, in10, in11, in12, in13, in14, in15,
+               vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+               in8, in9, in10, in11, in12, in13, in14, in15);
+  VP9_FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
+  VP9_FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
+  VP9_FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
+  VP9_FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in8, in9);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in10, in11);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in12, in13);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in14, in15);
+
+  /* Stage 3 */
+  ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
+
+  temp0 = in0 + in3;
+  in0 = in0 - in3;
+  in3 = in1 + in2;
+  in1 = in1 - in2;
+
+  VP9_DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
+  ST_SH(temp0, out);
+  ST_SH(temp1, out + 8);
+
+  VP9_DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+  ST_SH(temp0, out + 16);
+  ST_SH(temp1, out + 24);
+
+  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
+  VP9_DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+  VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+  ST_SH(temp0, out + 32);
+  ST_SH(temp1, out + 56);
+
+  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+  VP9_DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+  ST_SH(temp0, out + 40);
+  ST_SH(temp1, out + 48);
+
+  VP9_DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  VP9_DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+  VP9_DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  ADD2(in0, in1, in2, in3, vec0, vec7);
+  VP9_DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+  ST_SH(temp0, out + 64);
+  ST_SH(temp1, out + 120);
+
+  SUB2(in0, in1, in2, in3, in0, in2);
+  VP9_DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+  ST_SH(temp0, out + 72);
+  ST_SH(temp1, out + 112);
+
+  SUB2(in9, vec2, in14, vec5, vec2, vec5);
+  VP9_DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+  VP9_DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+  ST_SH(temp0, out + 80);
+  ST_SH(temp1, out + 104);
+
+  ADD2(in3, in2, in0, in1, vec3, vec4);
+  VP9_DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+  ST_SH(temp0, out + 96);
+  ST_SH(temp1, out + 88);
+}
+
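+/* Odd half of the _rd row DCT: the 2-bit rounding is applied to the inputs
+ * up front rather than to the final outputs. */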
+static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr,
+                                   int16_t *out) {
+  v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
+  v8i16 in24, in25, in26, in27, in28, in29, in30, in31;
+  v8i16 vec4, vec5;
+
+  in20 = LD_SH(temp + 32);
+  in21 = LD_SH(temp + 40);
+  in26 = LD_SH(temp + 80);
+  in27 = LD_SH(temp + 88);
+
+  VP9_DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+  VP9_DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+  VP9_FDCT_POSTPROC_2V_NEG_H(in20, in21);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in26, in27);
+
+  in18 = LD_SH(temp + 16);
+  in19 = LD_SH(temp + 24);
+  in28 = LD_SH(temp + 96);
+  in29 = LD_SH(temp + 104);
+
+  VP9_FDCT_POSTPROC_2V_NEG_H(in18, in19);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in28, in29);
+
+  vec4 = in19 - in20;
+  ST_SH(vec4, interm_ptr + 32);
+  vec4 = in18 - in21;
+  ST_SH(vec4, interm_ptr + 88);
+  vec4 = in29 - in26;
+  ST_SH(vec4, interm_ptr + 64);
+  vec4 = in28 - in27;
+  ST_SH(vec4, interm_ptr + 56);
+
+  ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);
+
+  in22 = LD_SH(temp + 48);
+  in23 = LD_SH(temp + 56);
+  in24 = LD_SH(temp + 64);
+  in25 = LD_SH(temp + 72);
+
+  VP9_DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+  VP9_DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in22, in23);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in24, in25);
+
+  in16 = LD_SH(temp);
+  in17 = LD_SH(temp + 8);
+  in30 = LD_SH(temp + 112);
+  in31 = LD_SH(temp + 120);
+
+  VP9_FDCT_POSTPROC_2V_NEG_H(in16, in17);
+  VP9_FDCT_POSTPROC_2V_NEG_H(in30, in31);
+
+  vec4 = in17 - in22;
+  ST_SH(vec4, interm_ptr + 40);
+  vec4 = in30 - in25;
+  ST_SH(vec4, interm_ptr + 48);
+  vec4 = in31 - in24;
+  ST_SH(vec4, interm_ptr + 72);
+  vec4 = in16 - in23;
+  ST_SH(vec4, interm_ptr + 80);
+
+  ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
+  VP9_DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+  VP9_DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+  ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
+  VP9_DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+  ADD2(in27, in26, in25, in24, in23, in20);
+  VP9_DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+  ST_SH(vec5, out);
+  ST_SH(vec4, out + 120);
+
+  SUB2(in27, in26, in25, in24, in22, in21);
+  VP9_DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+  ST_SH(vec5, out + 112);
+  ST_SH(vec4, out + 8);
+
+  SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
+  VP9_DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
+  SUB2(in26, in27, in24, in25, in23, in20);
+  VP9_DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+  ST_SH(vec4, out + 16);
+  ST_SH(vec5, out + 104);
+
+  ADD2(in26, in27, in24, in25, in22, in21);
+  VP9_DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+  ST_SH(vec4, out + 24);
+  ST_SH(vec5, out + 96);
+
+  in20 = LD_SH(interm_ptr + 32);
+  in21 = LD_SH(interm_ptr + 88);
+  in27 = LD_SH(interm_ptr + 56);
+  in26 = LD_SH(interm_ptr + 64);
+
+  in16 = in20;
+  in17 = in21;
+  VP9_DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
+  VP9_DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+  in22 = LD_SH(interm_ptr + 40);
+  in25 = LD_SH(interm_ptr + 48);
+  in24 = LD_SH(interm_ptr + 72);
+  in23 = LD_SH(interm_ptr + 80);
+
+  SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
+  VP9_DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+  in16 = in28 + in29;
+  in19 = in31 + in30;
+  VP9_DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+  ST_SH(vec5, out + 32);
+  ST_SH(vec4, out + 88);
+
+  SUB2(in28, in29, in31, in30, in17, in18);
+  VP9_DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+  ST_SH(vec5, out + 40);
+  ST_SH(vec4, out + 80);
+
+  ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
+  VP9_DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
+  SUB2(in29, in28, in30, in31, in16, in19);
+  VP9_DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+  ST_SH(vec5, out + 72);
+  ST_SH(vec4, out + 48);
+
+  ADD2(in29, in28, in30, in31, in17, in18);
+  VP9_DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+  ST_SH(vec4, out + 56);
+  ST_SH(vec5, out + 64);
+}
+
+static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf,
+                               int16_t *output) {
+  fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+  fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf);
+  fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128));
+  fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
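+/* 32x32 forward DCT, _rd variant: same column pass as vp9_fdct32x32_msa,
+ * with the reduced-range row pass applied to all four 8-row strips. */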
+void vp9_fdct32x32_rd_msa(const int16_t *input, int16_t *out,
+                          int32_t src_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+
+  /* column transform */
+  for (i = 0; i < 4; ++i) {
+    fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0],
+                       &tmp_buf_big[0] + (8 * i));
+  }
+
+  /* row transform */
+  for (i = 0; i < 4; ++i) {
+    fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0],
+                       out + (8 * i * 32));
+  }
+}
diff --git a/vp9/encoder/mips/msa/vp9_fdct_msa.h b/vp9/encoder/mips/msa/vp9_fdct_msa.h
new file mode 100644
index 0000000..99d299a
--- /dev/null
+++ b/vp9/encoder/mips/msa/vp9_fdct_msa.h
@@ -0,0 +1,333 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
+#define VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
+
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) {  \
+  v8i16 k0_m = __msa_fill_h(cnst0);                                  \
+  v4i32 s0_m, s1_m, s2_m, s3_m;                                      \
+                                                                     \
+  s0_m = (v4i32)__msa_fill_h(cnst1);                                 \
+  k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m);                           \
+                                                                     \
+  ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m);                            \
+  ILVRL_H2_SW(reg0, reg1, s3_m, s2_m);                               \
+  DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m);                   \
+  SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS);                           \
+  out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m);                    \
+                                                                     \
+  DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m);                   \
+  SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS);                           \
+  out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m);                    \
+}
+
+#define VP9_DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) ({   \
+  v8i16 dst_m;                                        \
+  v4i32 tp0_m, tp1_m;                                 \
+                                                      \
+  DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m);      \
+  SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS);          \
+  dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m);  \
+                                                      \
+  dst_m;                                              \
+})
+
+#define VP9_MADD_SHORT(m0, m1, c0, c1, res0, res1) {                \
+  v4i32 madd0_m, madd1_m, madd2_m, madd3_m;                         \
+  v8i16 madd_s0_m, madd_s1_m;                                       \
+                                                                    \
+  ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m);                        \
+  DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m,           \
+              c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m);  \
+  SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS);  \
+  PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1);      \
+}
+
+#define VP9_MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3,     \
+                    out0, out1, out2, out3) {                           \
+  v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m;                     \
+  v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m;                     \
+                                                                        \
+  ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m);                        \
+  ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m);                        \
+  DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m,               \
+              cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+  BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m,                           \
+              m4_m, m5_m, tmp3_m, tmp2_m);                              \
+  SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS);              \
+  PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1);                  \
+  DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m,               \
+              cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m);  \
+  BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m,                           \
+              m4_m, m5_m, tmp3_m, tmp2_m);                              \
+  SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS);              \
+  PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3);                  \
+}
+
+#define VP9_LD_HADD(psrc, stride) ({                                  \
+  v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m;       \
+  v4i32 vec_w_m;                                                      \
+                                                                      \
+  LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m);                 \
+  ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m);                     \
+  LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m);  \
+  ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m,        \
+       in4_m, in6_m, in0_m, in4_m);                                   \
+  in0_m += in4_m;                                                     \
+                                                                      \
+  vec_w_m = __msa_hadd_s_w(in0_m, in0_m);                             \
+  HADD_SW_S32(vec_w_m);                                               \
+})
+
+#define VP9_FDCT_POSTPROC_2V_NEG_H(vec0, vec1) {  \
+  v8i16 tp0_m, tp1_m;                             \
+  v8i16 one_m = __msa_ldi_h(1);                   \
+                                                  \
+  tp0_m = __msa_clti_s_h(vec0, 0);                \
+  tp1_m = __msa_clti_s_h(vec1, 0);                \
+  vec0 += 1;                                      \
+  vec1 += 1;                                      \
+  tp0_m = one_m & tp0_m;                          \
+  tp1_m = one_m & tp1_m;                          \
+  vec0 += tp0_m;                                  \
+  vec1 += tp1_m;                                  \
+  vec0 >>= 2;                                     \
+  vec1 >>= 2;                                     \
+}
+
+#define VP9_FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7,            \
+                          out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;                      \
+  v8i16 x0_m, x1_m, x2_m, x3_m;                                              \
+  v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64,      \
+                    cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 };     \
+                                                                             \
+  /* FDCT stage1 */                                                          \
+  BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,                        \
+              s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m);               \
+  BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m);               \
+  ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m);                            \
+  ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m);                            \
+  SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m);                                   \
+  x1_m = __msa_ilvev_h(x1_m, x0_m);                                          \
+  out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m);                        \
+                                                                             \
+  SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m);                                   \
+  x2_m = -x2_m;                                                              \
+  x2_m = __msa_ilvev_h(x3_m, x2_m);                                          \
+  out6 = VP9_DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m);                        \
+                                                                             \
+  out0 = VP9_DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m);                        \
+  x2_m = __msa_splati_h(coeff_m, 2);                                         \
+  x2_m = __msa_ilvev_h(x2_m, x3_m);                                          \
+  out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m);                        \
+                                                                             \
+  /* stage2 */                                                               \
+  ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m);                                       \
+                                                                             \
+  s6_m = VP9_DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m);                        \
+  s5_m = VP9_DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m);                        \
+                                                                             \
+  /* stage3 */                                                               \
+  BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m);               \
+                                                                             \
+  /* stage4 */                                                               \
+  ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m);                            \
+  ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m);                            \
+                                                                             \
+  SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m);                                   \
+  x1_m = __msa_ilvev_h(x0_m, x1_m);                                          \
+  out1 = VP9_DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m);                        \
+                                                                             \
+  SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m);                                   \
+  x2_m = __msa_ilvev_h(x3_m, x2_m);                                          \
+  out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                        \
+                                                                             \
+  x1_m = __msa_splati_h(coeff_m, 5);                                         \
+  x0_m = -x0_m;                                                              \
+  x0_m = __msa_ilvev_h(x1_m, x0_m);                                          \
+  out7 = VP9_DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m);                        \
+                                                                             \
+  x2_m = __msa_splati_h(coeff_m, 6);                                         \
+  x3_m = -x3_m;                                                              \
+  x2_m = __msa_ilvev_h(x2_m, x3_m);                                          \
+  out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m);                        \
+}
+
+#define VP9_FDCT8x16_ODD(input0, input1, input2, input3,           \
+                         input4, input5, input6, input7,           \
+                         out1, out3, out5, out7,                   \
+                         out9, out11, out13, out15) {              \
+  v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m;      \
+  v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m;      \
+  v8i16 stp36_m, stp37_m, vec0_m, vec1_m;                          \
+  v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m;                    \
+  v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m;                        \
+  v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64,         \
+                    cospi_24_64, -cospi_8_64, -cospi_24_64,        \
+                    cospi_12_64, cospi_20_64 };                    \
+  v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64,         \
+                     cospi_18_64, cospi_10_64, cospi_22_64,        \
+                     cospi_6_64, cospi_26_64 };                    \
+  v8i16 coeff2_m = { -cospi_2_64, -cospi_10_64, -cospi_18_64,      \
+                     -cospi_26_64, 0, 0, 0, 0 };                   \
+                                                                   \
+  /* stp 1 */                                                      \
+  ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m);      \
+  ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m);      \
+                                                                   \
+  cnst4_m = __msa_splati_h(coeff_m, 0);                            \
+  stp25_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m);    \
+                                                                   \
+  cnst5_m = __msa_splati_h(coeff_m, 1);                            \
+  cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m);                       \
+  stp22_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m);    \
+  stp24_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m);    \
+  stp23_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m);    \
+                                                                   \
+  /* stp2 */                                                       \
+  BUTTERFLY_4(input0, input1, stp22_m, stp23_m,                    \
+              stp30_m, stp31_m, stp32_m, stp33_m);                 \
+  BUTTERFLY_4(input7, input6, stp25_m, stp24_m,                    \
+              stp37_m, stp36_m, stp35_m, stp34_m);                 \
+                                                                   \
+  ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m);  \
+  ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m);  \
+                                                                   \
+  SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m);                   \
+  cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                       \
+  stp26_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m);    \
+                                                                   \
+  cnst0_m = __msa_splati_h(coeff_m, 4);                            \
+  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                       \
+  stp21_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m);    \
+                                                                   \
+  SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m);                   \
+  cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m);                       \
+  stp25_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m);    \
+                                                                   \
+  cnst0_m = __msa_splati_h(coeff_m, 3);                            \
+  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                       \
+  stp22_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m);    \
+                                                                   \
+  /* stp4 */                                                       \
+  BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m,                  \
+              vec6_m, vec2_m, vec4_m, vec5_m);                     \
+  BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m,                  \
+              stp21_m, stp23_m, stp24_m, stp31_m);                 \
+                                                                   \
+  ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m);                     \
+  SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m);                  \
+  cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                       \
+                                                                   \
+  out1 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);       \
+                                                                   \
+  cnst0_m = __msa_splati_h(coeff2_m, 0);                           \
+  cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                       \
+  out15 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);      \
+                                                                   \
+  ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m);                     \
+  SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m);                  \
+  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                       \
+                                                                   \
+  out9 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);       \
+                                                                   \
+  cnst1_m = __msa_splati_h(coeff2_m, 2);                           \
+  cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                       \
+  out7 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);       \
+                                                                   \
+  ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m);                   \
+  SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m);                  \
+  cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                       \
+  out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);       \
+                                                                   \
+  cnst0_m = __msa_splati_h(coeff2_m, 1);                           \
+  cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m);                       \
+  out11 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);      \
+                                                                   \
+  ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m);                   \
+  SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m);                  \
+  cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m);                       \
+                                                                   \
+  out13 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m);      \
+                                                                   \
+  cnst1_m = __msa_splati_h(coeff2_m, 3);                           \
+  cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m);                       \
+  out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m);       \
+}
+
+#define VP9_FDCT32_POSTPROC_NEG_W(vec) {  \
+  v4i32 temp_m;                           \
+  v4i32 one_m = __msa_ldi_w(1);           \
+                                          \
+  temp_m = __msa_clti_s_w(vec, 0);        \
+  vec += 1;                               \
+  temp_m = one_m & temp_m;                \
+  vec += temp_m;                          \
+  vec >>= 2;                              \
+}
+
+#define VP9_FDCT32_POSTPROC_2V_POS_H(vec0, vec1) {  \
+  v8i16 tp0_m, tp1_m;                               \
+  v8i16 one = __msa_ldi_h(1);                       \
+                                                    \
+  tp0_m = __msa_clei_s_h(vec0, 0);                  \
+  tp1_m = __msa_clei_s_h(vec1, 0);                  \
+  tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255);   \
+  tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255);   \
+  vec0 += 1;                                        \
+  vec1 += 1;                                        \
+  tp0_m = one & tp0_m;                              \
+  tp1_m = one & tp1_m;                              \
+  vec0 += tp0_m;                                    \
+  vec1 += tp1_m;                                    \
+  vec0 >>= 2;                                       \
+  vec1 >>= 2;                                       \
+}
+
+#define VP9_DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right,  \
+                              reg1_right, const0, const1,        \
+                              out0, out1, out2, out3) {          \
+  v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;          \
+  v2i64 tp0_m, tp1_m, tp2_m, tp3_m;                              \
+  v4i32 k0_m = __msa_fill_w((int32_t) const0);                   \
+                                                                 \
+  s0_m = __msa_fill_w((int32_t) const1);                         \
+  k0_m = __msa_ilvev_w(s0_m, k0_m);                              \
+                                                                 \
+  ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m);                \
+  ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m);                 \
+  ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m);              \
+  ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m);               \
+                                                                 \
+  DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m);             \
+  DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m);             \
+  tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS);                  \
+  tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS);                  \
+  tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS);                  \
+  tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS);                  \
+  out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m);              \
+  out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m);              \
+                                                                 \
+  DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m);             \
+  DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m);             \
+  tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS);                  \
+  tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS);                  \
+  tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS);                  \
+  tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS);                  \
+  out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m);              \
+  out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m);              \
+}
+#endif  /* VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ */
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index df70d48..78dced2 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -529,3 +529,10 @@
 int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
   return cr->rdmult;
 }
+
+void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  memset(cr->map, 0, cm->mi_rows * cm->mi_cols);
+  cr->sb_index = 0;
+}
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h
index 99bb98e..29d2a91 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -75,6 +75,8 @@
 
 int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
 
+void vp9_cyclic_refresh_reset_resize(struct VP9_COMP *const cpi);
+
 static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) {
   return segment_id == CR_SEGMENT_ID_BOOST1 ||
          segment_id == CR_SEGMENT_ID_BOOST2;
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c
index b9987c1..223c923 100644
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -29,6 +29,8 @@
   return (sum + 8) >> 4;
 }
 
+// src_diff: first pass, 9 bit, dynamic range [-255, 255]
+//           second pass, 12 bit, dynamic range [-2040, 2040]
 static void hadamard_col8(const int16_t *src_diff, int src_stride,
                           int16_t *coeff) {
   int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
@@ -65,15 +67,18 @@
   int16_t buffer[64];
   int16_t *tmp_buf = &buffer[0];
   for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(src_diff, src_stride, tmp_buf);
+    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
+                                                   // dynamic range [-255, 255]
     tmp_buf += 8;
     ++src_diff;
   }
 
   tmp_buf = &buffer[0];
   for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(tmp_buf, 8, coeff);
-    coeff += 8;
+    hadamard_col8(tmp_buf, 8, coeff);  // tmp_buf: 12 bit
+                                       // dynamic range [-2040, 2040]
+    coeff += 8;  // coeff: 15 bit
+                 // dynamic range [-16320, 16320]
     ++tmp_buf;
   }
 }
@@ -83,37 +88,42 @@
                           int16_t *coeff) {
   int idx;
   for (idx = 0; idx < 4; ++idx) {
+    // src_diff: 9 bit, dynamic range [-255, 255]
     int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
                                 + (idx & 0x01) * 8;
     vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
   }
 
+  // coeff: 15 bit, dynamic range [-16320, 16320]
   for (idx = 0; idx < 64; ++idx) {
     int16_t a0 = coeff[0];
     int16_t a1 = coeff[64];
     int16_t a2 = coeff[128];
     int16_t a3 = coeff[192];
 
-    int16_t b0 = a0 + a1;
-    int16_t b1 = a0 - a1;
-    int16_t b2 = a2 + a3;
-    int16_t b3 = a2 - a3;
+    int16_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
+    int16_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
+    int16_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
+    int16_t b3 = (a2 - a3) >> 1;
 
-    coeff[0]   = (b0 + b2) >> 1;
-    coeff[64]  = (b1 + b3) >> 1;
-    coeff[128] = (b0 - b2) >> 1;
-    coeff[192] = (b1 - b3) >> 1;
+    coeff[0]   = b0 + b2;  // 16 bit, [-32640, 32640]
+    coeff[64]  = b1 + b3;
+    coeff[128] = b0 - b2;
+    coeff[192] = b1 - b3;
 
     ++coeff;
   }
 }
 
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
 int16_t vp9_satd_c(const int16_t *coeff, int length) {
   int i;
   int satd = 0;
   for (i = 0; i < length; ++i)
     satd += abs(coeff[i]);
 
+  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
   return (int16_t)satd;
 }
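The dynamic-range comments above trace how magnitudes grow through the two Hadamard passes, and the 16x16 combine stage now halves the butterfly outputs before the final additions. The likely reason is 16-bit overflow: plain C promotes the int16_t operands to int, but the matching SSE2 kernel (updated later in this patch) adds in 16-bit lanes with no promotion. A small standalone sketch of the worst case, using the bounds from the comments (illustrative only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Worst-case 15-bit output magnitude of vp9_hadamard_8x8_c. */
  const int16_t a = 16320;
  /* Old order: add, then halve.  b + b can reach 65280, which only works in
   * C because the operands are promoted to int; a 16-bit SIMD lane wraps. */
  int16_t b_old = a + a;                    /* 32640 */
  int16_t out_old = (b_old + b_old) >> 1;   /* 65280 >> 1 via int promotion */
  /* New order: halve, then add.  Every intermediate stays within +/-32640. */
  int16_t b_new = (a + a) >> 1;             /* 16320 */
  int16_t out_new = b_new + b_new;          /* 32640 */
  printf("%d %d %d %d\n", b_old, out_old, b_new, out_new);
  return 0;
}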
 
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 092d265..1ebdd06 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -403,7 +403,7 @@
                             int hbs, int mi_row, int mi_col,
                             PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) {
   const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
-  const vp9_prob *const probs = get_partition_probs(cm, ctx);
+  const vp9_prob *const probs = xd->partition_probs[ctx];
   const int has_rows = (mi_row + hbs) < cm->mi_rows;
   const int has_cols = (mi_col + hbs) < cm->mi_cols;
 
@@ -481,9 +481,12 @@
 static void write_modes(VP9_COMP *cpi,
                         const TileInfo *const tile, vp9_writer *w,
                         TOKENEXTRA **tok, const TOKENEXTRA *const tok_end) {
+  const VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   int mi_row, mi_col;
 
+  set_partition_probs(cm, xd);
+
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
        mi_row += MI_BLOCK_SIZE) {
     vp9_zero(xd->left_seg_context);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index ddfe69f..dcddefc 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1564,7 +1564,7 @@
   }
 }
 
-const struct {
+static const struct {
   int row;
   int col;
 } coord_lookup[16] = {
@@ -2220,66 +2220,6 @@
   *max_block_size = max_size;
 }
 
-static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
-                                 MACROBLOCKD *const xd,
-                                 int mi_row, int mi_col,
-                                 BLOCK_SIZE *min_block_size,
-                                 BLOCK_SIZE *max_block_size) {
-  VP9_COMMON *const cm = &cpi->common;
-  MODE_INFO **mi_8x8 = xd->mi;
-  const int left_in_image = xd->left_available && mi_8x8[-1];
-  const int above_in_image = xd->up_available && mi_8x8[-xd->mi_stride];
-  int row8x8_remaining = tile->mi_row_end - mi_row;
-  int col8x8_remaining = tile->mi_col_end - mi_col;
-  int bh, bw;
-  BLOCK_SIZE min_size = BLOCK_32X32;
-  BLOCK_SIZE max_size = BLOCK_8X8;
-  int bsl = mi_width_log2_lookup[BLOCK_64X64];
-  const int search_range_ctrl = (((mi_row + mi_col) >> bsl) +
-                       get_chessboard_index(cm->current_video_frame)) & 0x1;
-  // Trap case where we do not have a prediction.
-  if (search_range_ctrl &&
-      (left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) {
-    int block;
-    MODE_INFO **mi;
-    BLOCK_SIZE sb_type;
-
-    // Find the min and max partition sizes used in the left SB64.
-    if (left_in_image) {
-      MODE_INFO *cur_mi;
-      mi = &mi_8x8[-1];
-      for (block = 0; block < MI_BLOCK_SIZE; ++block) {
-        cur_mi = mi[block * xd->mi_stride];
-        sb_type = cur_mi ? cur_mi->mbmi.sb_type : 0;
-        min_size = MIN(min_size, sb_type);
-        max_size = MAX(max_size, sb_type);
-      }
-    }
-    // Find the min and max partition sizes used in the above SB64.
-    if (above_in_image) {
-      mi = &mi_8x8[-xd->mi_stride * MI_BLOCK_SIZE];
-      for (block = 0; block < MI_BLOCK_SIZE; ++block) {
-        sb_type = mi[block] ? mi[block]->mbmi.sb_type : 0;
-        min_size = MIN(min_size, sb_type);
-        max_size = MAX(max_size, sb_type);
-      }
-    }
-
-    min_size = min_partition_size[min_size];
-    max_size = find_partition_size(max_size, row8x8_remaining, col8x8_remaining,
-                                   &bh, &bw);
-    min_size = MIN(min_size, max_size);
-    min_size = MAX(min_size, BLOCK_8X8);
-    max_size = MIN(max_size, BLOCK_32X32);
-  } else {
-    min_size = BLOCK_8X8;
-    max_size = BLOCK_32X32;
-  }
-
-  *min_block_size = min_size;
-  *max_block_size = max_size;
-}
-
 // TODO(jingning) refactor functions setting partition search range
 static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
                                 int mi_row, int mi_col, BLOCK_SIZE bsize,
@@ -3786,9 +3726,13 @@
   TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
   int tile_tok = 0;
 
-  if (cpi->tile_data == NULL) {
+  if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
+    if (cpi->tile_data != NULL)
+      vpx_free(cpi->tile_data);
     CHECK_MEM_ERROR(cm, cpi->tile_data,
         vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data)));
+    cpi->allocated_tiles = tile_cols * tile_rows;
+
     for (tile_row = 0; tile_row < tile_rows; ++tile_row)
       for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
         TileDataEnc *tile_data =
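The tile-data buffer above is now reallocated whenever a frame needs more tiles than were previously allocated, which can happen once dynamic resizing (added later in this patch) changes the tile layout between frames. A hedged sketch of the same grow-only pattern with generic names (ensure_capacity, buf, allocated, needed are illustrative, not the encoder's identifiers):

#include <stdlib.h>

/* Keep the buffer if it is already large enough; otherwise free it and
 * allocate the new size, remembering the new capacity. */
static int *ensure_capacity(int *buf, int *allocated, int needed) {
  if (buf == NULL || *allocated < needed) {
    free(buf);                          /* free(NULL) is a no-op */
    buf = malloc(needed * sizeof(*buf));
    if (buf != NULL)
      *allocated = needed;              /* the encoder uses CHECK_MEM_ERROR here */
  }
  return buf;
}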
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index ba38d64..d708b83 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1596,6 +1596,9 @@
                   sizeof(*cm->frame_contexts)));
 
   cpi->use_svc = 0;
+  cpi->resize_state = 0;
+  cpi->resize_avg_qp = 0;
+  cpi->resize_buffer_underflow = 0;
   cpi->common.buffer_pool = pool;
 
   init_config(cpi, oxcf);
@@ -3032,6 +3035,31 @@
                          oxcf->scaled_frame_height);
   }
 
+  if (oxcf->pass == 0 &&
+      oxcf->rc_mode == VPX_CBR &&
+      !cpi->use_svc &&
+      oxcf->resize_mode == RESIZE_DYNAMIC) {
+      if (cpi->resize_state == 1) {
+        oxcf->scaled_frame_width =
+            (cm->width * cpi->resize_scale_num) / cpi->resize_scale_den;
+        oxcf->scaled_frame_height =
+            (cm->height * cpi->resize_scale_num) / cpi->resize_scale_den;
+      } else if (cpi->resize_state == -1) {
+        // Go back up to original size.
+        oxcf->scaled_frame_width = oxcf->width;
+        oxcf->scaled_frame_height = oxcf->height;
+      }
+      if (cpi->resize_state != 0) {
+        // There has been a change in frame size.
+        vp9_set_size_literal(cpi,
+                             oxcf->scaled_frame_width,
+                             oxcf->scaled_frame_height);
+
+        // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+        set_mv_search_params(cpi);
+      }
+  }
+
   if ((oxcf->pass == 2) &&
       (!cpi->use_svc ||
           (is_two_pass_svc(cpi) &&
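With the scale factors set in vp9_resize_one_pass_cbr() further down in this patch (num/den = 1/2), a step down halves each dimension and a step back up restores the configured size. A quick arithmetic sketch, assuming a hypothetical 640x480 source:

/* Hypothetical numbers; the 1/2 factor comes from vp9_resize_one_pass_cbr(). */
int width = 640, height = 480;
int num = 1, den = 2;
int scaled_w = (width * num) / den;   /* 320 when resize_state == 1 */
int scaled_h = (height * num) / den;  /* 240 */
/* resize_state == -1 goes back to oxcf->width x oxcf->height (640x480 here). */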
@@ -3962,7 +3990,6 @@
 extern double vp9_get_blockiness(const unsigned char *img1, int img1_pitch,
                                  const unsigned char *img2, int img2_pitch,
                                  int width, int height);
-#endif
 
 static void adjust_image_stat(double y, double u, double v, double all,
                               ImageStat *s) {
@@ -3972,6 +3999,7 @@
   s->stat[ALL] += all;
   s->worst = MIN(s->worst, all);
 }
+#endif  // CONFIG_INTERNAL_STATS
 
 int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest,
@@ -4165,7 +4193,7 @@
           (is_two_pass_svc(cpi) &&
               cpi->svc.encode_empty_frame_state != ENCODING))) {
     vp9_rc_get_second_pass_params(cpi);
-  } else {
+  } else if (oxcf->pass == 1) {
     set_frame_size(cpi);
   }
 
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 6ce4a67..2b0da10 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -306,6 +306,7 @@
   YV12_BUFFER_CONFIG scaled_last_source;
 
   TileDataEnc *tile_data;
+  int allocated_tiles;  // Keep track of memory allocated for tiles.
 
   // For a still frame, this flag is set to 1 to skip partition search.
   int partition_search_skippable_frame;
@@ -478,6 +479,12 @@
 #endif
 
   int resize_pending;
+  int resize_state;
+  int resize_scale_num;
+  int resize_scale_den;
+  int resize_avg_qp;
+  int resize_buffer_underflow;
+  int resize_count;
 
   // VAR_BASED_PARTITION thresholds
   // 0 - threshold_64x64; 1 - threshold_32x32;
diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c
index 8700ccd..4ae3fbc 100644
--- a/vp9/encoder/vp9_ethread.c
+++ b/vp9/encoder/vp9_ethread.c
@@ -54,6 +54,18 @@
   return 0;
 }
 
+static int get_max_tile_cols(VP9_COMP *cpi) {
+  const int aligned_width = ALIGN_POWER_OF_TWO(cpi->oxcf.width, MI_SIZE_LOG2);
+  int mi_cols = aligned_width >> MI_SIZE_LOG2;
+  int min_log2_tile_cols, max_log2_tile_cols;
+  int log2_tile_cols;
+
+  vp9_get_tile_n_bits(mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
+  log2_tile_cols = clamp(cpi->oxcf.tile_columns,
+                   min_log2_tile_cols, max_log2_tile_cols);
+  return (1 << log2_tile_cols);
+}
+
 void vp9_encode_tiles_mt(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   const int tile_cols = 1 << cm->log2_tile_cols;
@@ -65,20 +77,30 @@
 
   // Only run once to create threads and allocate thread data.
   if (cpi->num_workers == 0) {
+    int allocated_workers = num_workers;
+
+    // When using SVC, allocate threads according to the highest-resolution
+    // layer.
+    if (cpi->use_svc) {
+      int max_tile_cols = get_max_tile_cols(cpi);
+      allocated_workers = MIN(cpi->oxcf.max_threads, max_tile_cols);
+    }
+
     CHECK_MEM_ERROR(cm, cpi->workers,
-                    vpx_malloc(num_workers * sizeof(*cpi->workers)));
+                    vpx_malloc(allocated_workers * sizeof(*cpi->workers)));
 
     CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
-                    vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
+                    vpx_calloc(allocated_workers,
+                    sizeof(*cpi->tile_thr_data)));
 
-    for (i = 0; i < num_workers; i++) {
+    for (i = 0; i < allocated_workers; i++) {
       VP9Worker *const worker = &cpi->workers[i];
       EncWorkerData *thread_data = &cpi->tile_thr_data[i];
 
       ++cpi->num_workers;
       winterface->init(worker);
 
-      if (i < num_workers - 1) {
+      if (i < allocated_workers - 1) {
         thread_data->cpi = cpi;
 
         // Allocate thread data.
@@ -154,7 +176,7 @@
     // Set the starting tile for each thread.
     thread_data->start = i;
 
-    if (i == num_workers - 1)
+    if (i == cpi->num_workers - 1)
       winterface->execute(worker);
     else
       winterface->launch(worker);
@@ -171,7 +193,7 @@
     EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
 
     // Accumulate counters.
-    if (i < num_workers - 1) {
+    if (i < cpi->num_workers - 1) {
       vp9_accumulate_frame_counts(cm, thread_data->td->counts, 0);
       accumulate_rd_opt(&cpi->td, thread_data->td);
     }
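For SVC the worker pool is now sized from the tile-column count of the widest layer rather than from the layer currently being encoded, so larger layers encoded later never need more workers than were created. A rough worked example; every number here is illustrative, not taken from the patch:

/* Hypothetical 1280-pixel-wide top spatial layer, oxcf.tile_columns == 2:
 *   mi_cols            = ALIGN_POWER_OF_TWO(1280, MI_SIZE_LOG2) >> MI_SIZE_LOG2
 *                      = 160 with the 8-pixel MI unit
 *   log2_tile_cols     = clamp(2, min_log2_tile_cols, max_log2_tile_cols)
 *   max_tile_cols      = 1 << log2_tile_cols        (4 if the clamp allows 2)
 *   allocated_workers  = MIN(oxcf.max_threads, max_tile_cols)
 * so encoding a smaller base layer first still leaves enough workers for
 * the top layer. */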
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 2342726..081b99f 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -286,20 +286,20 @@
   bestmv->row *= 8;                                                        \
   bestmv->col *= 8;
 
-static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd,
-                                              const MV *bestmv,
-                                              const MV *ref_mv,
-                                              int error_per_bit,
-                                              const vp9_variance_fn_ptr_t *vfp,
-                                              const uint8_t *const src,
-                                              const int src_stride,
-                                              const uint8_t *const y,
-                                              int y_stride,
-                                              const uint8_t *second_pred,
-                                              int w, int h, int offset,
-                                              int *mvjcost, int *mvcost[2],
-                                              unsigned int *sse1,
-                                              int *distortion) {
+static unsigned int setup_center_error(const MACROBLOCKD *xd,
+                                       const MV *bestmv,
+                                       const MV *ref_mv,
+                                       int error_per_bit,
+                                       const vp9_variance_fn_ptr_t *vfp,
+                                       const uint8_t *const src,
+                                       const int src_stride,
+                                       const uint8_t *const y,
+                                       int y_stride,
+                                       const uint8_t *second_pred,
+                                       int w, int h, int offset,
+                                       int *mvjcost, int *mvcost[2],
+                                       unsigned int *sse1,
+                                       int *distortion) {
   unsigned int besterr;
 #if CONFIG_VP9_HIGHBITDEPTH
   if (second_pred != NULL) {
@@ -610,7 +610,7 @@
   return besterr;
 }
 
-const MV search_step_table[12] = {
+static const MV search_step_table[12] = {
     // left, right, up, down
     {0, -4}, {0, 4}, {-4, 0}, {4, 0},
     {0, -2}, {0, 2}, {-2, 0}, {2, 0},
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 2479b6e..3eaa990 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -658,7 +658,8 @@
   block = 0;
   *rate = 0;
   *dist = 0;
-  *sse = (*sse << 6) >> shift;
+  if (*sse < INT64_MAX)
+    *sse = (*sse << 6) >> shift;
   for (r = 0; r < max_blocks_high; r += block_step) {
     for (c = 0; c < num_4x4_w; c += block_step) {
       if (c < max_blocks_wide) {
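The added check keeps *sse from being rescaled when it may still hold INT64_MAX as a saturated/sentinel value; left-shifting INT64_MAX by 6 is signed overflow. A minimal sketch of the hazard, assuming an int64_t that can carry INT64_MAX as a "not computed" marker (this is a reading of the change, not a statement of the encoder's internals):

#include <stdint.h>

/* Rescale a real distortion value but leave a saturated sentinel untouched. */
static int64_t rescale_sse(int64_t sse, int shift) {
  if (sse < INT64_MAX)
    return (sse << 6) >> shift;   /* shifting INT64_MAX left would overflow */
  return sse;
}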
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 32682fe..158581b 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1596,6 +1596,7 @@
     target = calc_pframe_target_size_one_pass_cbr(cpi);
 
   vp9_rc_set_frame_target(cpi, target);
+  cpi->resize_state = vp9_resize_one_pass_cbr(cpi);
 }
 
 int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
@@ -1756,3 +1757,92 @@
     vbr_rate_correction(cpi, &target_rate);
   vp9_rc_set_frame_target(cpi, target_rate);
 }
+
+// Check if we should resize, based on the average QP over recent frames.
+// For now, allow at most one resize step down, with a scaling factor of 2.
+int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int resize_now = 0;
+  cpi->resize_scale_num = 1;
+  cpi->resize_scale_den = 1;
+  // Don't resize on a key frame; just reset the counters.
+  if (cm->frame_type == KEY_FRAME) {
+    cpi->resize_avg_qp = 0;
+    cpi->resize_count = 0;
+    return 0;
+  }
+  // Resize based on the average QP over some window.
+  // Ignore samples close to a key frame, since QP is usually high after one.
+  if (cpi->rc.frames_since_key > 2 * cpi->framerate) {
+    const int window = 5 * cpi->framerate;
+    cpi->resize_avg_qp += cm->base_qindex;
+    if (cpi->rc.buffer_level < 0)
+      ++cpi->resize_buffer_underflow;
+    ++cpi->resize_count;
+    // Check for resize action every "window" frames.
+    if (cpi->resize_count == window) {
+      int avg_qp = cpi->resize_avg_qp / cpi->resize_count;
+      // Resize down if the buffer level has underflowed by a sufficient
+      // amount in the past window, and we are at the original resolution.
+      // Resize back up if average QP is low, and we are currently in a resized
+      // down state.
+      if (cpi->resize_state == 0 &&
+          cpi->resize_buffer_underflow > (cpi->resize_count >> 3)) {
+        resize_now = 1;
+      } else if (cpi->resize_state == 1 &&
+                 avg_qp < 40 * cpi->rc.worst_quality / 100) {
+        resize_now = -1;
+      }
+      // Reset for next window measurement.
+      cpi->resize_avg_qp = 0;
+      cpi->resize_count = 0;
+      cpi->resize_buffer_underflow = 0;
+    }
+  }
+  // If the decision is to resize, reset some quantities, and check if we
+  // should reduce the rate correction factor.
+  if (resize_now != 0) {
+    int target_bits_per_frame;
+    int active_worst_quality;
+    int qindex;
+    int tot_scale_change;
+    // For now, resize is by 1/2 x 1/2.
+    cpi->resize_scale_num = 1;
+    cpi->resize_scale_den = 2;
+    tot_scale_change = (cpi->resize_scale_den * cpi->resize_scale_den) /
+        (cpi->resize_scale_num * cpi->resize_scale_num);
+    // Reset buffer level to optimal, update target size.
+    rc->buffer_level = rc->optimal_buffer_level;
+    rc->bits_off_target = rc->optimal_buffer_level;
+    rc->this_frame_target = calc_pframe_target_size_one_pass_cbr(cpi);
+    // Reset cyclic refresh parameters.
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled)
+      vp9_cyclic_refresh_reset_resize(cpi);
+    // Get the projected qindex, based on the scaled target frame size (scaled
+    // so that target_bits_per_mb in vp9_rc_regulate_q will be the correct
+    // target).
+    target_bits_per_frame = (resize_now == 1) ?
+        rc->this_frame_target * tot_scale_change :
+        rc->this_frame_target / tot_scale_change;
+    active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
+    qindex = vp9_rc_regulate_q(cpi,
+                               target_bits_per_frame,
+                               rc->best_quality,
+                               active_worst_quality);
+    // If resizing down, check if the projected q index is close to
+    // worst_quality, and if so, reduce the rate correction factor (since we
+    // can likely afford a lower q for the resized frame).
+    if (resize_now == 1 &&
+        qindex > 90 * cpi->rc.worst_quality / 100) {
+      rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
+    }
+    // If resizing back up, check if the projected q index is too far above
+    // the current base_qindex, and if so, reduce the rate correction factor
+    // (since we prefer to keep q for the resized frame close to the previous q).
+    if (resize_now == -1 &&
+        qindex > 130 * cm->base_qindex / 100) {
+      rc->rate_correction_factors[INTER_NORMAL] *= 0.9;
+    }
+  }
+  return resize_now;
+}
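A worked pass through the thresholds above, with every input value hypothetical (30 fps, worst_quality qindex 224, base_qindex 60):

/* Hypothetical numbers only:
 *   sampling starts once frames_since_key > 2 * 30 = 60
 *   window                      = 5 * 30 = 150 frames
 *   resize down if underflows   > 150 >> 3 = 18 frames in the window
 *   resize back up if avg_qp    < 40 * 224 / 100 = 89
 *   tot_scale_change            = (2 * 2) / (1 * 1) = 4
 *   scaled target (down)        = this_frame_target * 4
 *   correction factor *= 0.85 if the projected qindex > 90 * 224 / 100 = 201
 *   correction factor *= 0.9  if (going back up) qindex > 130 * 60 / 100 = 78 */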
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index e12d200..a10836c 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -245,6 +245,8 @@
 
 void vp9_set_target_rate(struct VP9_COMP *cpi);
 
+int vp9_resize_one_pass_cbr(struct VP9_COMP *cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index bbcbfe9..90ee1e4 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -265,6 +265,7 @@
 void vp9_initialize_rd_consts(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   RD_OPT *const rd = &cpi->rd;
   int i;
 
@@ -280,6 +281,7 @@
                        cm->frame_type != KEY_FRAME) ? 0 : 1;
 
   set_block_thresholds(cm, rd);
+  set_partition_probs(cm, xd);
 
   if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME)
     fill_token_costs(x->token_costs, cm->fc->coef_probs);
@@ -287,7 +289,7 @@
   if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
       cm->frame_type == KEY_FRAME) {
     for (i = 0; i < PARTITION_CONTEXTS; ++i)
-      vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(cm, i),
+      vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(xd, i),
                       vp9_partition_tree);
   }
 
diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c
index bca5b13..f46cad8 100644
--- a/vp9/encoder/vp9_resize.c
+++ b/vp9/encoder/vp9_resize.c
@@ -29,7 +29,7 @@
 typedef int16_t interp_kernel[INTERP_TAPS];
 
 // Filters for interpolation (0.5-band) - note this also filters integer pels.
-const interp_kernel vp9_filteredinterp_filters500[(1 << SUBPEL_BITS)] = {
+static const interp_kernel filteredinterp_filters500[(1 << SUBPEL_BITS)] = {
   {-3,  0, 35, 64, 35,  0, -3, 0},
   {-3, -1, 34, 64, 36,  1, -3, 0},
   {-3, -1, 32, 64, 38,  1, -3, 0},
@@ -65,7 +65,7 @@
 };
 
 // Filters for interpolation (0.625-band) - note this also filters integer pels.
-const interp_kernel vp9_filteredinterp_filters625[(1 << SUBPEL_BITS)] = {
+static const interp_kernel filteredinterp_filters625[(1 << SUBPEL_BITS)] = {
   {-1, -8, 33, 80, 33, -8, -1, 0},
   {-1, -8, 30, 80, 35, -8, -1, 1},
   {-1, -8, 28, 80, 37, -7, -2, 1},
@@ -101,7 +101,7 @@
 };
 
 // Filters for interpolation (0.75-band) - note this also filters integer pels.
-const interp_kernel vp9_filteredinterp_filters750[(1 << SUBPEL_BITS)] = {
+static const interp_kernel filteredinterp_filters750[(1 << SUBPEL_BITS)] = {
   {2, -11,  25,  96,  25, -11,   2, 0},
   {2, -11,  22,  96,  28, -11,   2, 0},
   {2, -10,  19,  95,  31, -11,   2, 0},
@@ -137,7 +137,7 @@
 };
 
 // Filters for interpolation (0.875-band) - note this also filters integer pels.
-const interp_kernel vp9_filteredinterp_filters875[(1 << SUBPEL_BITS)] = {
+static const interp_kernel filteredinterp_filters875[(1 << SUBPEL_BITS)] = {
   {3,  -8,  13, 112,  13,  -8,   3, 0},
   {3,  -7,  10, 112,  17,  -9,   3, -1},
   {2,  -6,   7, 111,  21,  -9,   3, -1},
@@ -173,7 +173,7 @@
 };
 
 // Filters for interpolation (full-band) - no filtering for integer pixels
-const interp_kernel vp9_filteredinterp_filters1000[(1 << SUBPEL_BITS)] = {
+static const interp_kernel filteredinterp_filters1000[(1 << SUBPEL_BITS)] = {
   {0,   0,   0, 128,   0,   0,   0, 0},
   {0,   1,  -3, 128,   3,  -1,   0, 0},
   {-1,   2,  -6, 127,   7,  -2,   1, 0},
@@ -215,15 +215,15 @@
 static const interp_kernel *choose_interp_filter(int inlength, int outlength) {
   int outlength16 = outlength * 16;
   if (outlength16 >= inlength * 16)
-    return vp9_filteredinterp_filters1000;
+    return filteredinterp_filters1000;
   else if (outlength16 >= inlength * 13)
-    return vp9_filteredinterp_filters875;
+    return filteredinterp_filters875;
   else if (outlength16 >= inlength * 11)
-    return vp9_filteredinterp_filters750;
+    return filteredinterp_filters750;
   else if (outlength16 >= inlength * 9)
-    return vp9_filteredinterp_filters625;
+    return filteredinterp_filters625;
   else
-    return vp9_filteredinterp_filters500;
+    return filteredinterp_filters500;
 }
 
 static void interpolate(const uint8_t *const input, int inlength,
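The now-static tables are chosen by the downscaling ratio r = outlength / inlength; the 16x comparisons above reduce to the following bands (the 1280 -> 720 example is hypothetical):

/* r = outlength / inlength, compared as outlength * 16 against inlength * k:
 *   r >= 16/16 -> filteredinterp_filters1000 (full band)
 *   r >= 13/16 -> filteredinterp_filters875  (0.875 band)
 *   r >= 11/16 -> filteredinterp_filters750  (0.75 band)
 *   r >=  9/16 -> filteredinterp_filters625  (0.625 band)
 *   otherwise  -> filteredinterp_filters500  (0.5 band)
 * e.g. scaling 1280 -> 720 gives r = 9/16, so the 0.625-band filter is used. */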
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index cb1b0df..1b35ac9 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -152,7 +152,6 @@
     }
   } else {
     int layer_end;
-    float bitrate_alloc = 1.0;
 
     if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
       layer_end = svc->number_temporal_layers;
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index 56a91ed..4531d79 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -264,17 +264,18 @@
     __m128i b2 = _mm_add_epi16(coeff2, coeff3);
     __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
 
+    b0 = _mm_srai_epi16(b0, 1);
+    b1 = _mm_srai_epi16(b1, 1);
+    b2 = _mm_srai_epi16(b2, 1);
+    b3 = _mm_srai_epi16(b3, 1);
+
     coeff0 = _mm_add_epi16(b0, b2);
     coeff1 = _mm_add_epi16(b1, b3);
-    coeff0 = _mm_srai_epi16(coeff0, 1);
-    coeff1 = _mm_srai_epi16(coeff1, 1);
     _mm_store_si128((__m128i *)coeff, coeff0);
     _mm_store_si128((__m128i *)(coeff + 64), coeff1);
 
     coeff2 = _mm_sub_epi16(b0, b2);
     coeff3 = _mm_sub_epi16(b1, b3);
-    coeff2 = _mm_srai_epi16(coeff2, 1);
-    coeff3 = _mm_srai_epi16(coeff3, 1);
     _mm_store_si128((__m128i *)(coeff + 128), coeff2);
     _mm_store_si128((__m128i *)(coeff + 192), coeff3);
 
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index b01fdd1..6f091ee 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -152,6 +152,10 @@
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_16_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_msa.h
 
+ifeq ($(CONFIG_VP9_POSTPROC),yes)
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c
+endif
+
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.h
 ifeq ($(ARCH_X86_64), yes)
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 5415215..6074da2 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -152,4 +152,8 @@
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
 
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
+
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))