Merge changes Iedb5b6a3,Iaea98508,I36580cea,Ia0574320 * changes: vp9_decodeframe.h: remove unused prototype vp9_decodeframe: move public funcs to end of file vp9_decodeframe: reorder some functions vp9_decodeframe: hide vp9_dec_build_inter_predictors_sb

diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index cd562da..d2687b2 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc

@@ -16,6 +16,7 @@
 
 #include "./vp9_rtcd.h"
 #include "test/acm_random.h"
+#include "test/clear_system_state.h"
 #include "test/md5_helper.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
@@ -66,6 +67,7 @@
     for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
       pred_funcs[k](src, kBPS, above, left);
     }
+    libvpx_test::ClearSystemState();
     vpx_usec_timer_mark(&timer);
     const int elapsed_time =
         static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
@@ -211,6 +213,14 @@
                 NULL, NULL, NULL, NULL, vp9_tm_predictor_4x4_neon)
 #endif  // HAVE_NEON
 
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred4, vp9_dc_predictor_4x4_msa,
+                vp9_dc_left_predictor_4x4_msa, vp9_dc_top_predictor_4x4_msa,
+                vp9_dc_128_predictor_4x4_msa, vp9_v_predictor_4x4_msa,
+                vp9_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vp9_tm_predictor_4x4_msa)
+#endif  // HAVE_MSA
+
 // -----------------------------------------------------------------------------
 // 8x8
 
@@ -256,6 +266,14 @@
 
 #endif  // HAVE_NEON
 
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred8, vp9_dc_predictor_8x8_msa,
+                vp9_dc_left_predictor_8x8_msa, vp9_dc_top_predictor_8x8_msa,
+                vp9_dc_128_predictor_8x8_msa, vp9_v_predictor_8x8_msa,
+                vp9_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vp9_tm_predictor_8x8_msa)
+#endif  // HAVE_MSA
+
 // -----------------------------------------------------------------------------
 // 16x16
 
@@ -299,6 +317,14 @@
                 vp9_tm_predictor_16x16_neon)
 #endif  // HAVE_NEON
 
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred16, vp9_dc_predictor_16x16_msa,
+                vp9_dc_left_predictor_16x16_msa, vp9_dc_top_predictor_16x16_msa,
+                vp9_dc_128_predictor_16x16_msa, vp9_v_predictor_16x16_msa,
+                vp9_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vp9_tm_predictor_16x16_msa)
+#endif  // HAVE_MSA
+
 // -----------------------------------------------------------------------------
 // 32x32
 
@@ -340,4 +366,12 @@
                 NULL, NULL, NULL, NULL, NULL, vp9_tm_predictor_32x32_neon)
 #endif  // HAVE_NEON
 
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred32, vp9_dc_predictor_32x32_msa,
+                vp9_dc_left_predictor_32x32_msa, vp9_dc_top_predictor_32x32_msa,
+                vp9_dc_128_predictor_32x32_msa, vp9_v_predictor_32x32_msa,
+                vp9_h_predictor_32x32_msa, NULL, NULL, NULL, NULL, NULL,
+                NULL, vp9_tm_predictor_32x32_msa)
+#endif  // HAVE_MSA
+
 #include "test/test_libvpx.cc"

diff --git a/test/variance_test.cc b/test/variance_test.cc
index 2d17119..670fe09 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc

@@ -1106,12 +1106,12 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2
 
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
 typedef SubpelVarianceTest<SubpixVarMxNFunc> VP8SubpelVarianceTest;
 
 TEST_P(VP8SubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(VP8SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
-#endif  // CONFIG_VP8
+#endif  // CONFIG_VP8_ENCODER
 
 #if CONFIG_VP9_ENCODER
 typedef SubpelVarianceTest<SubpixVarMxNFunc> VP9SubpelVarianceTest;
@@ -1160,7 +1160,7 @@
                       make_tuple(6, 5, subpel_variance64x32_c, 0),
                       make_tuple(6, 6, subpel_variance64x64_c, 0)));
 
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
 const SubpixVarMxNFunc vp8_subpel_variance16x16_c =
     vp8_sub_pixel_variance16x16_c;
 const SubpixVarMxNFunc vp8_subpel_variance16x8_c = vp8_sub_pixel_variance16x8_c;
@@ -1174,7 +1174,7 @@
                       make_tuple(3, 4, vp8_subpel_variance8x16_c, 0),
                       make_tuple(4, 3, vp8_subpel_variance16x8_c, 0),
                       make_tuple(4, 4, vp8_subpel_variance16x16_c, 0)));
-#endif  // CONFIG_VP8
+#endif  // CONFIG_VP8_ENCODER
 
 const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c =
     vp9_sub_pixel_avg_variance4x4_c;
@@ -1460,7 +1460,7 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_VP9_ENCODER
 
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
 #if HAVE_MMX
 const SubpixVarMxNFunc subpel_variance16x16_mmx =
     vp8_sub_pixel_variance16x16_mmx;
@@ -1476,7 +1476,7 @@
                       make_tuple(3, 3, subpel_variance8x8_mmx, 0),
                       make_tuple(2, 2, subpel_variance4x4_mmx, 0)));
 #endif  // HAVE_MMX
-#endif  // CONFIG_VP8
+#endif  // CONFIG_VP8_ENCODER
 
 #if CONFIG_VP9_ENCODER
 #if HAVE_SSE2
@@ -1768,7 +1768,7 @@
 #endif  // HAVE_SSE2
 #endif  // CONFIG_VP9_ENCODER
 
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
 #if HAVE_SSE2
 const SubpixVarMxNFunc vp8_subpel_variance16x16_sse2 =
     vp8_sub_pixel_variance16x16_wmt;
@@ -1788,7 +1788,7 @@
                       make_tuple(4, 3, vp8_subpel_variance16x8_sse2, 0),
                       make_tuple(4, 4, vp8_subpel_variance16x16_sse2, 0)));
 #endif  // HAVE_SSE2
-#endif  // CONFIG_VP8
+#endif  // CONFIG_VP8_ENCODER
 
 #if CONFIG_VP9_ENCODER
 #if HAVE_SSSE3
@@ -1879,7 +1879,7 @@
 #endif  // HAVE_SSSE3
 #endif  // CONFIG_VP9_ENCODER
 
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
 #if HAVE_SSSE3
 const SubpixVarMxNFunc vp8_subpel_variance16x16_ssse3 =
     vp8_sub_pixel_variance16x16_ssse3;
@@ -1890,7 +1890,7 @@
     ::testing::Values(make_tuple(4, 3, vp8_subpel_variance16x8_ssse3, 0),
                       make_tuple(4, 4, vp8_subpel_variance16x16_ssse3, 0)));
 #endif  // HAVE_SSSE3
-#endif  // CONFIG_VP8
+#endif  // CONFIG_VP8_ENCODER
 
 #if HAVE_AVX2
 const VarianceMxNFunc mse16x16_avx2 = vpx_mse16x16_avx2;
@@ -1931,7 +1931,7 @@
 #endif  // CONFIG_VP9_ENCODER
 #endif  // HAVE_AVX2
 
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
 #if HAVE_MEDIA
 const SubpixVarMxNFunc subpel_variance16x16_media =
     vp8_sub_pixel_variance16x16_armv6;
@@ -1942,7 +1942,7 @@
     ::testing::Values(make_tuple(3, 3, subpel_variance8x8_media, 0),
                       make_tuple(4, 4, subpel_variance16x16_media, 0)));
 #endif  // HAVE_MEDIA
-#endif  // CONFIG_VP8
+#endif  // CONFIG_VP8_ENCODER
 
 #if HAVE_NEON
 const Get4x4SseFunc get4x4sse_cs_neon = vpx_get4x4sse_cs_neon;
@@ -1972,7 +1972,7 @@
                       make_tuple(3, 4, variance8x16_neon, 0),
                       make_tuple(3, 3, variance8x8_neon, 0)));
 
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
 #if HAVE_NEON_ASM
 const SubpixVarMxNFunc vp8_subpel_variance16x16_neon =
     vp8_sub_pixel_variance16x16_neon;
@@ -1980,7 +1980,7 @@
     NEON, VP8SubpelVarianceTest,
     ::testing::Values(make_tuple(4, 4, vp8_subpel_variance16x16_neon, 0)));
 #endif  // HAVE_NEON_ASM
-#endif  // CONFIG_VP8
+#endif  // CONFIG_VP8_ENCODER
 
 #if CONFIG_VP9_ENCODER
 const SubpixVarMxNFunc subpel_variance8x8_neon = vp9_sub_pixel_variance8x8_neon;

diff --git a/vp9/common/mips/msa/vp9_intra_predict_msa.c b/vp9/common/mips/msa/vp9_intra_predict_msa.c
new file mode 100644
index 0000000..2fc6105
--- /dev/null
+++ b/vp9/common/mips/msa/vp9_intra_predict_msa.c

@@ -0,0 +1,737 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) {  \
+  out0 = __msa_subs_u_h(out0, in0);                \
+  out1 = __msa_subs_u_h(out1, in1);                \
+}
+
+static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
+                                       int32_t dst_stride) {
+  uint32_t src_data;
+
+  src_data = LW(src);
+
+  SW4(src_data, src_data, src_data, src_data, dst, dst_stride);
+}
+
+static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst,
+                                       int32_t dst_stride) {
+  uint32_t row;
+  uint32_t src_data1, src_data2;
+
+  src_data1 = LW(src);
+  src_data2 = LW(src + 4);
+
+  for (row = 8; row--;) {
+    SW(src_data1, dst);
+    SW(src_data2, (dst + 4));
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst,
+                                         int32_t dst_stride) {
+  uint32_t row;
+  v16u8 src0;
+
+  src0 = LD_UB(src);
+
+  for (row = 16; row--;) {
+    ST_UB(src0, dst);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
+                                         int32_t dst_stride) {
+  uint32_t row;
+  v16u8 src1, src2;
+
+  src1 = LD_UB(src);
+  src2 = LD_UB(src + 16);
+
+  for (row = 32; row--;) {
+    ST_UB2(src1, src2, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst,
+                                        int32_t dst_stride) {
+  uint32_t out0, out1, out2, out3;
+
+  out0 = src[0] * 0x01010101;
+  out1 = src[1] * 0x01010101;
+  out2 = src[2] * 0x01010101;
+  out3 = src[3] * 0x01010101;
+
+  SW4(out0, out1, out2, out3, dst, dst_stride);
+}
+
+static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst,
+                                        int32_t dst_stride) {
+  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+  out0 = src[0] * 0x0101010101010101ull;
+  out1 = src[1] * 0x0101010101010101ull;
+  out2 = src[2] * 0x0101010101010101ull;
+  out3 = src[3] * 0x0101010101010101ull;
+  out4 = src[4] * 0x0101010101010101ull;
+  out5 = src[5] * 0x0101010101010101ull;
+  out6 = src[6] * 0x0101010101010101ull;
+  out7 = src[7] * 0x0101010101010101ull;
+
+  SD4(out0, out1, out2, out3, dst, dst_stride);
+  dst += (4 * dst_stride);
+  SD4(out4, out5, out6, out7, dst, dst_stride);
+}
+
+static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst,
+                                          int32_t dst_stride) {
+  uint32_t row;
+  uint8_t inp0, inp1, inp2, inp3;
+  v16u8 src0, src1, src2, src3;
+
+  for (row = 4; row--;) {
+    inp0 = src[0];
+    inp1 = src[1];
+    inp2 = src[2];
+    inp3 = src[3];
+    src += 4;
+
+    src0 = (v16u8)__msa_fill_b(inp0);
+    src1 = (v16u8)__msa_fill_b(inp1);
+    src2 = (v16u8)__msa_fill_b(inp2);
+    src3 = (v16u8)__msa_fill_b(inp3);
+
+    ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
+                                          int32_t dst_stride) {
+  uint32_t row;
+  uint8_t inp0, inp1, inp2, inp3;
+  v16u8 src0, src1, src2, src3;
+
+  for (row = 8; row--;) {
+    inp0 = src[0];
+    inp1 = src[1];
+    inp2 = src[2];
+    inp3 = src[3];
+    src += 4;
+
+    src0 = (v16u8)__msa_fill_b(inp0);
+    src1 = (v16u8)__msa_fill_b(inp1);
+    src2 = (v16u8)__msa_fill_b(inp2);
+    src3 = (v16u8)__msa_fill_b(inp3);
+
+    ST_UB2(src0, src0, dst, 16);
+    dst += dst_stride;
+    ST_UB2(src1, src1, dst, 16);
+    dst += dst_stride;
+    ST_UB2(src2, src2, dst, 16);
+    dst += dst_stride;
+    ST_UB2(src3, src3, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     uint8_t *dst, int32_t dst_stride) {
+  uint32_t val0, val1;
+  v16i8 store, src = { 0 };
+  v8u16 sum_h;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  val0 = LW(src_top);
+  val1 = LW(src_left);
+  INSERT_W2_SB(val0, val1, src);
+  sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src);
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
+  store = __msa_splati_b((v16i8)sum_w, 0);
+  val0 = __msa_copy_u_w((v4i32)store, 0);
+
+  SW4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst,
+                                        int32_t dst_stride) {
+  uint32_t val0;
+  v16i8 store, data = { 0 };
+  v8u16 sum_h;
+  v4u32 sum_w;
+
+  val0 = LW(src);
+  data = (v16i8)__msa_insert_w((v4i32)data, 0, val0);
+  sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data);
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2);
+  store = __msa_splati_b((v16i8)sum_w, 0);
+  val0 = __msa_copy_u_w((v4i32)store, 0);
+
+  SW4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
+  uint32_t out;
+  const v16i8 store = __msa_ldi_b(128);
+
+  out = __msa_copy_u_w((v4i32)store, 0);
+
+  SW4(out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
+                                     const uint8_t *src_left,
+                                     uint8_t *dst, int32_t dst_stride) {
+  uint64_t val0, val1;
+  v16i8 store;
+  v16u8 src = { 0 };
+  v8u16 sum_h;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  val0 = LD(src_top);
+  val1 = LD(src_left);
+  INSERT_D2_UB(val0, val1, src);
+  sum_h = __msa_hadd_u_h(src, src);
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
+  store = __msa_splati_b((v16i8)sum_w, 0);
+  val0 = __msa_copy_u_d((v2i64)store, 0);
+
+  SD4(val0, val0, val0, val0, dst, dst_stride);
+  dst += (4 * dst_stride);
+  SD4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst,
+                                        int32_t dst_stride) {
+  uint64_t val0;
+  v16i8 store;
+  v16u8 data = { 0 };
+  v8u16 sum_h;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  val0 = LD(src);
+  data = (v16u8)__msa_insert_d((v2i64)data, 0, val0);
+  sum_h = __msa_hadd_u_h(data, data);
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
+  store = __msa_splati_b((v16i8)sum_w, 0);
+  val0 = __msa_copy_u_d((v2i64)store, 0);
+
+  SD4(val0, val0, val0, val0, dst, dst_stride);
+  dst += (4 * dst_stride);
+  SD4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
+  uint64_t out;
+  const v16i8 store = __msa_ldi_b(128);
+
+  out = __msa_copy_u_d((v2i64)store, 0);
+
+  SD4(out, out, out, out, dst, dst_stride);
+  dst += (4 * dst_stride);
+  SD4(out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t dst_stride) {
+  v16u8 top, left, out;
+  v8u16 sum_h, sum_top, sum_left;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  top = LD_UB(src_top);
+  left = LD_UB(src_left);
+  HADD_UB2_UH(top, left, sum_top, sum_left);
+  sum_h = sum_top + sum_left;
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
+  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+  dst += (8 * dst_stride);
+  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst,
+                                          int32_t dst_stride) {
+  v16u8 data, out;
+  v8u16 sum_h;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  data = LD_UB(src);
+  sum_h = __msa_hadd_u_h(data, data);
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
+  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+  dst += (8 * dst_stride);
+  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
+  const v16u8 out = (v16u8)__msa_ldi_b(128);
+
+  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+  dst += (8 * dst_stride);
+  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t dst_stride) {
+  uint32_t row;
+  v16u8 top0, top1, left0, left1, out;
+  v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  LD_UB2(src_top, 16, top0, top1);
+  LD_UB2(src_left, 16, left0, left1);
+  HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
+  HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
+  sum_h = sum_top0 + sum_top1;
+  sum_h += sum_left0 + sum_left1;
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6);
+  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+  for (row = 16; row--;) {
+    ST_UB2(out, out, dst, 16);
+    dst += dst_stride;
+    ST_UB2(out, out, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst,
+                                          int32_t dst_stride) {
+  uint32_t row;
+  v16u8 data0, data1, out;
+  v8u16 sum_h, sum_data0, sum_data1;
+  v4u32 sum_w;
+  v2u64 sum_d;
+
+  LD_UB2(src, 16, data0, data1);
+  HADD_UB2_UH(data0, data1, sum_data0, sum_data1);
+  sum_h = sum_data0 + sum_data1;
+  sum_w = __msa_hadd_u_w(sum_h, sum_h);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+  sum_d = __msa_hadd_u_d(sum_w, sum_w);
+  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
+  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+  for (row = 16; row--;) {
+    ST_UB2(out, out, dst, 16);
+    dst += dst_stride;
+    ST_UB2(out, out, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
+  uint32_t row;
+  const v16u8 out = (v16u8)__msa_ldi_b(128);
+
+  for (row = 16; row--;) {
+    ST_UB2(out, out, dst, 16);
+    dst += dst_stride;
+    ST_UB2(out, out, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
+                                     const uint8_t *src_left,
+                                     uint8_t *dst, int32_t dst_stride) {
+  uint32_t val;
+  uint8_t top_left = src_top_ptr[-1];
+  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
+  v16u8 src0, src1, src2, src3;
+  v8u16 src_top_left, vec0, vec1, vec2, vec3;
+
+  src_top_left = (v8u16)__msa_fill_h(top_left);
+  val = LW(src_top_ptr);
+  src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val);
+
+  src_left0 = __msa_fill_b(src_left[0]);
+  src_left1 = __msa_fill_b(src_left[1]);
+  src_left2 = __msa_fill_b(src_left[2]);
+  src_left3 = __msa_fill_b(src_left[3]);
+
+  ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
+             src_left3, src_top, src0, src1, src2, src3);
+  HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
+  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
+  SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
+  PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
+  ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
+}
+
+static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
+                                     const uint8_t *src_left,
+                                     uint8_t *dst, int32_t dst_stride) {
+  uint64_t val;
+  uint8_t top_left = src_top_ptr[-1];
+  uint32_t loop_cnt;
+  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
+  v8u16 src_top_left, vec0, vec1, vec2, vec3;
+  v16u8 src0, src1, src2, src3;
+
+  val = LD(src_top_ptr);
+  src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val);
+  src_top_left = (v8u16)__msa_fill_h(top_left);
+
+  for (loop_cnt = 2; loop_cnt--;) {
+    src_left0 = __msa_fill_b(src_left[0]);
+    src_left1 = __msa_fill_b(src_left[1]);
+    src_left2 = __msa_fill_b(src_left[2]);
+    src_left3 = __msa_fill_b(src_left[3]);
+    src_left += 4;
+
+    ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
+               src_left3, src_top, src0, src1, src2, src3);
+    HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
+    SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+    dst += (4 * dst_stride);
+  }
+}
+
+static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t dst_stride) {
+  uint8_t top_left = src_top_ptr[-1];
+  uint32_t loop_cnt;
+  v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
+  v8u16 src_top_left, res_r, res_l;
+
+  src_top = LD_SB(src_top_ptr);
+  src_top_left = (v8u16)__msa_fill_h(top_left);
+
+  for (loop_cnt = 4; loop_cnt--;) {
+    src_left0 = __msa_fill_b(src_left[0]);
+    src_left1 = __msa_fill_b(src_left[1]);
+    src_left2 = __msa_fill_b(src_left[2]);
+    src_left3 = __msa_fill_b(src_left[3]);
+    src_left += 4;
+
+    ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
+    HADD_UB2_UH(res_r, res_l, res_r, res_l);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+
+    SAT_UH2_UH(res_r, res_l, 7);
+    PCKEV_ST_SB(res_r, res_l, dst);
+    dst += dst_stride;
+
+    ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
+    HADD_UB2_UH(res_r, res_l, res_r, res_l);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+    SAT_UH2_UH(res_r, res_l, 7);
+    PCKEV_ST_SB(res_r, res_l, dst);
+    dst += dst_stride;
+
+    ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
+    HADD_UB2_UH(res_r, res_l, res_r, res_l);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+    SAT_UH2_UH(res_r, res_l, 7);
+    PCKEV_ST_SB(res_r, res_l, dst);
+    dst += dst_stride;
+
+    ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
+    HADD_UB2_UH(res_r, res_l, res_r, res_l);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+    SAT_UH2_UH(res_r, res_l, 7);
+    PCKEV_ST_SB(res_r, res_l, dst);
+    dst += dst_stride;
+  }
+}
+
+static void intra_predict_tm_32x32_msa(const uint8_t *src_top,
+                                       const uint8_t *src_left,
+                                       uint8_t *dst, int32_t dst_stride) {
+  uint8_t top_left = src_top[-1];
+  uint32_t loop_cnt;
+  v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
+  v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;
+
+  LD_SB2(src_top, 16, src_top0, src_top1);
+  src_top_left = (v8u16)__msa_fill_h(top_left);
+
+  for (loop_cnt = 8; loop_cnt--;) {
+    src_left0 = __msa_fill_b(src_left[0]);
+    src_left1 = __msa_fill_b(src_left[1]);
+    src_left2 = __msa_fill_b(src_left[2]);
+    src_left3 = __msa_fill_b(src_left[3]);
+    src_left += 4;
+
+    ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
+    ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
+    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+    PCKEV_ST_SB(res_r0, res_l0, dst);
+    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+    dst += dst_stride;
+
+    ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
+    ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
+    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+    PCKEV_ST_SB(res_r0, res_l0, dst);
+    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+    dst += dst_stride;
+
+    ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
+    ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
+    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+    PCKEV_ST_SB(res_r0, res_l0, dst);
+    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+    dst += dst_stride;
+
+    ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
+    ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
+    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+    PCKEV_ST_SB(res_r0, res_l0, dst);
+    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+    dst += dst_stride;
+  }
+}
+
+void vp9_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                             const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_vert_4x4_msa(above, dst, y_stride);
+}
+
+void vp9_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                             const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_vert_8x8_msa(above, dst, y_stride);
+}
+
+void vp9_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_vert_16x16_msa(above, dst, y_stride);
+}
+
+void vp9_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_vert_32x32_msa(above, dst, y_stride);
+}
+
+void vp9_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                             const uint8_t *above, const uint8_t *left) {
+  (void)above;
+
+  intra_predict_horiz_4x4_msa(left, dst, y_stride);
+}
+
+void vp9_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                             const uint8_t *above, const uint8_t *left) {
+  (void)above;
+
+  intra_predict_horiz_8x8_msa(left, dst, y_stride);
+}
+
+void vp9_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)above;
+
+  intra_predict_horiz_16x16_msa(left, dst, y_stride);
+}
+
+void vp9_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                               const uint8_t *above, const uint8_t *left) {
+  (void)above;
+
+  intra_predict_horiz_32x32_msa(left, dst, y_stride);
+}
+
+void vp9_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_4x4_msa(above, left, dst, y_stride);
+}
+
+void vp9_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_8x8_msa(above, left, dst, y_stride);
+}
+
+void vp9_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_16x16_msa(above, left, dst, y_stride);
+}
+
+void vp9_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_32x32_msa(above, left, dst, y_stride);
+}
+
+void vp9_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_dc_tl_4x4_msa(above, dst, y_stride);
+}
+
+void vp9_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_dc_tl_8x8_msa(above, dst, y_stride);
+}
+
+void vp9_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_dc_tl_16x16_msa(above, dst, y_stride);
+}
+
+void vp9_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  intra_predict_dc_tl_32x32_msa(above, dst, y_stride);
+}
+
+void vp9_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;
+
+  intra_predict_dc_tl_4x4_msa(left, dst, y_stride);
+}
+
+void vp9_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;
+
+  intra_predict_dc_tl_8x8_msa(left, dst, y_stride);
+}
+
+void vp9_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+
+  intra_predict_dc_tl_16x16_msa(left, dst, y_stride);
+}
+
+void vp9_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+
+  intra_predict_dc_tl_32x32_msa(left, dst, y_stride);
+}
+
+void vp9_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+
+  intra_predict_128dc_4x4_msa(dst, y_stride);
+}
+
+void vp9_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+
+  intra_predict_128dc_8x8_msa(dst, y_stride);
+}
+
+void vp9_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+
+  intra_predict_128dc_16x16_msa(dst, y_stride);
+}
+
+void vp9_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;
+  (void)left;
+
+  intra_predict_128dc_32x32_msa(dst, y_stride);
+}
+
+void vp9_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  intra_predict_tm_4x4_msa(above, left, dst, y_stride);
+}
+
+void vp9_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  intra_predict_tm_8x8_msa(above, left, dst, y_stride);
+}
+
+void vp9_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  intra_predict_tm_16x16_msa(above, left, dst, y_stride);
+}
+
+void vp9_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  intra_predict_tm_32x32_msa(above, left, dst, y_stride);
+}

diff --git a/vp9/common/mips/msa/vp9_macros_msa.h b/vp9/common/mips/msa/vp9_macros_msa.h
index 3751e35..2043e13 100644
--- a/vp9/common/mips/msa/vp9_macros_msa.h
+++ b/vp9/common/mips/msa/vp9_macros_msa.h

@@ -244,6 +244,22 @@
   out3 = LW((psrc) + 3 * stride);                    \
 }
 
+/* Description : Load double words with stride
+   Arguments   : Inputs  - psrc    (source pointer to load from)
+                         - stride
+                 Outputs - out0, out1
+   Details     : Loads double word in 'out0' from (psrc)
+                 Loads double word in 'out1' from (psrc + stride)
+*/
+#define LD2(psrc, stride, out0, out1) {  \
+  out0 = LD((psrc));                     \
+  out1 = LD((psrc) + stride);            \
+}
+#define LD4(psrc, stride, out0, out1, out2, out3) {  \
+  LD2((psrc), stride, out0, out1);                   \
+  LD2((psrc) + 2 * stride, stride, out2, out3);      \
+}
+
 /* Description : Store 4 words with stride
    Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
    Details     : Stores word from 'in0' to (pdst)
@@ -482,6 +498,24 @@
   SD(out0_m, pdst);                       \
 }
 
+/* Description : Store as 8x2 byte block to destination memory from input vector
+   Arguments   : Inputs  - in, pdst, stride
+   Details     : Index 0 double word element from input vector 'in' is copied
+                 and stored to destination memory at (pdst)
+                 Index 1 double word element from input vector 'in' is copied
+                 and stored to destination memory at (pdst + stride)
+*/
+#define ST8x2_UB(in, pdst, stride) {        \
+  uint64_t out0_m, out1_m;                  \
+  uint8_t *pblk_8x2_m = (uint8_t *)(pdst);  \
+                                            \
+  out0_m = __msa_copy_u_d((v2i64)in, 0);    \
+  out1_m = __msa_copy_u_d((v2i64)in, 1);    \
+                                            \
+  SD(out0_m, pblk_8x2_m);                   \
+  SD(out1_m, pblk_8x2_m + stride);          \
+}
+
 /* Description : Store as 8x4 byte block to destination memory from input
                  vectors
    Arguments   : Inputs  - in0, in1, pdst, stride
@@ -743,6 +777,26 @@
   CLIP_SH2_0_255(in2, in3);                   \
 }
 
+/* Description : Horizontal addition of unsigned byte vector elements
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Each unsigned odd byte element from 'in0' is added to
+                 even unsigned byte element from 'in0' (pairwise) and the
+                 halfword result is stored in 'out0'
+*/
+#define HADD_UB2(RTYPE, in0, in1, out0, out1) {          \
+  out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0);  \
+  out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1);  \
+}
+#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
+
+#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  HADD_UB2(RTYPE, in0, in1, out0, out1);                               \
+  HADD_UB2(RTYPE, in2, in3, out2, out3);                               \
+}
+#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
+
 /* Description : Insert specified word elements from input vectors to 1
                  destination vector
    Arguments   : Inputs  - in0, in1, in2, in3 (4 input vectors)
@@ -755,6 +809,19 @@
 }
 #define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
 
+/* Description : Insert specified double word elements from input vectors to 1
+                 destination vector
+   Arguments   : Inputs  - in0, in1      (2 input vectors)
+                 Outputs - out           (output vector)
+                 Return Type - as per RTYPE
+*/
+#define INSERT_D2(RTYPE, in0, in1, out) {           \
+  out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0);  \
+  out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1);  \
+}
+#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
+#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+
 /* Description : Interleave even byte elements from vectors
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1

diff --git a/vp9/common/mips/msa/vp9_mfqe_msa.c b/vp9/common/mips/msa/vp9_mfqe_msa.c
new file mode 100644
index 0000000..64cb9a8
--- /dev/null
+++ b/vp9/common/mips/msa/vp9_mfqe_msa.c

@@ -0,0 +1,137 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+                                    uint8_t *dst_ptr, int32_t dst_stride,
+                                    int32_t src_weight) {
+  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
+  int32_t row;
+  uint64_t src0_d, src1_d, dst0_d, dst1_d;
+  v16i8 src0 = { 0 };
+  v16i8 src1 = { 0 };
+  v16i8 dst0 = { 0 };
+  v16i8 dst1 = { 0 };
+  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
+
+  src_wt = __msa_fill_h(src_weight);
+  dst_wt = __msa_fill_h(dst_weight);
+
+  for (row = 2; row--;) {
+    LD2(src_ptr, src_stride, src0_d, src1_d);
+    src_ptr += (2 * src_stride);
+    LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
+    INSERT_D2_SB(src0_d, src1_d, src0);
+    INSERT_D2_SB(dst0_d, dst1_d, dst0);
+
+    LD2(src_ptr, src_stride, src0_d, src1_d);
+    src_ptr += (2 * src_stride);
+    LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
+    INSERT_D2_SB(src0_d, src1_d, src1);
+    INSERT_D2_SB(dst0_d, dst1_d, dst1);
+
+    UNPCK_UB_SH(src0, src_r, src_l);
+    UNPCK_UB_SH(dst0, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
+    ST8x2_UB(dst0, dst_ptr, dst_stride);
+    dst_ptr += (2 * dst_stride);
+
+    UNPCK_UB_SH(src1, src_r, src_l);
+    UNPCK_UB_SH(dst1, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
+    ST8x2_UB(dst1, dst_ptr, dst_stride);
+    dst_ptr += (2 * dst_stride);
+  }
+}
+
+static void filter_by_weight16x16_msa(const uint8_t *src_ptr,
+                                      int32_t src_stride,
+                                      uint8_t *dst_ptr,
+                                      int32_t dst_stride,
+                                      int32_t src_weight) {
+  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
+  int32_t row;
+  v16i8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
+
+  src_wt = __msa_fill_h(src_weight);
+  dst_wt = __msa_fill_h(dst_weight);
+
+  for (row = 4; row--;) {
+    LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);
+
+    UNPCK_UB_SH(src0, src_r, src_l);
+    UNPCK_UB_SH(dst0, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+
+    UNPCK_UB_SH(src1, src_r, src_l);
+    UNPCK_UB_SH(dst1, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+
+    UNPCK_UB_SH(src2, src_r, src_l);
+    UNPCK_UB_SH(dst2, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+
+    UNPCK_UB_SH(src3, src_r, src_l);
+    UNPCK_UB_SH(dst3, dst_r, dst_l);
+    res_h_r = (src_r * src_wt);
+    res_h_r += (dst_r * dst_wt);
+    res_h_l = (src_l * src_wt);
+    res_h_l += (dst_l * dst_wt);
+    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+    dst_ptr += dst_stride;
+  }
+}
+
+void vp9_filter_by_weight8x8_msa(const uint8_t *src, int src_stride,
+                                 uint8_t *dst, int dst_stride,
+                                 int src_weight) {
+  filter_by_weight8x8_msa(src, src_stride, dst, dst_stride, src_weight);
+}
+
+void vp9_filter_by_weight16x16_msa(const uint8_t *src, int src_stride,
+                                   uint8_t *dst, int dst_stride,
+                                   int src_weight) {
+  filter_by_weight16x16_msa(src, src_stride, dst, dst_stride, src_weight);
+}

diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c
index afcdf22..b256d4a 100644
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c

@@ -12,7 +12,8 @@
 
 #include "vp9/common/vp9_filter.h"
 
-const InterpKernel vp9_bilinear_filters[SUBPEL_SHIFTS] = {
+DECLARE_ALIGNED(256, const InterpKernel,
+                vp9_bilinear_filters[SUBPEL_SHIFTS]) = {
   { 0, 0, 0, 128,   0, 0, 0, 0 },
   { 0, 0, 0, 120,   8, 0, 0, 0 },
   { 0, 0, 0, 112,  16, 0, 0, 0 },

diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 484e457..9816728 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c

@@ -267,8 +267,8 @@
 
   for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
     int lvl_seg = default_filt_lvl;
-    if (vp9_segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) {
-      const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
+    if (segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) {
+      const int data = get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
       lvl_seg = clamp(seg->abs_delta == SEGMENT_ABSDATA ?
                       data : default_filt_lvl + data,
                       0, MAX_LOOP_FILTER);

diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c
index 564a3eb..d83f3c1 100644
--- a/vp9/common/vp9_quant_common.c
+++ b/vp9/common/vp9_quant_common.c

@@ -266,8 +266,8 @@
 
 int vp9_get_qindex(const struct segmentation *seg, int segment_id,
                    int base_qindex) {
-  if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
-    const int data = vp9_get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
+  if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
+    const int data = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
     const int seg_qindex = seg->abs_delta == SEGMENT_ABSDATA ?
         data : base_qindex + data;
     return clamp(seg_qindex, 0, MAXQ);

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index bbe200d..2f262a6 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -66,7 +66,7 @@
 specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_4x4 neon dspr2/, "$ssse3_x86inc";
+specialize qw/vp9_h_predictor_4x4 neon dspr2 msa/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d117_predictor_4x4/;
@@ -78,22 +78,22 @@
 specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_4x4 neon/, "$sse_x86inc";
+specialize qw/vp9_v_predictor_4x4 neon msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_4x4 neon dspr2/, "$sse_x86inc";
+specialize qw/vp9_tm_predictor_4x4 neon dspr2 msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_4x4 dspr2/, "$sse_x86inc";
+specialize qw/vp9_dc_predictor_4x4 dspr2 msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_4x4/, "$sse_x86inc";
+specialize qw/vp9_dc_top_predictor_4x4 msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_4x4/, "$sse_x86inc";
+specialize qw/vp9_dc_left_predictor_4x4 msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_4x4/, "$sse_x86inc";
+specialize qw/vp9_dc_128_predictor_4x4 msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc";
@@ -105,7 +105,7 @@
 specialize qw/vp9_d63_predictor_8x8/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_8x8 neon dspr2/, "$ssse3_x86inc";
+specialize qw/vp9_h_predictor_8x8 neon dspr2 msa/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d117_predictor_8x8/;
@@ -117,22 +117,22 @@
 specialize qw/vp9_d153_predictor_8x8/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_8x8 neon/, "$sse_x86inc";
+specialize qw/vp9_v_predictor_8x8 neon msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_8x8 neon dspr2/, "$sse2_x86inc";
+specialize qw/vp9_tm_predictor_8x8 neon dspr2 msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_8x8 dspr2 neon/, "$sse_x86inc";
+specialize qw/vp9_dc_predictor_8x8 dspr2 neon msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_8x8 neon/, "$sse_x86inc";
+specialize qw/vp9_dc_top_predictor_8x8 neon msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_8x8 neon/, "$sse_x86inc";
+specialize qw/vp9_dc_left_predictor_8x8 neon msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_8x8 neon/, "$sse_x86inc";
+specialize qw/vp9_dc_128_predictor_8x8 neon msa/, "$sse_x86inc";
 
 add_proto qw/void vp9_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d207_predictor_16x16/, "$ssse3_x86inc";
@@ -144,7 +144,7 @@
 specialize qw/vp9_d63_predictor_16x16/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_16x16 neon dspr2/, "$ssse3_x86inc";
+specialize qw/vp9_h_predictor_16x16 neon dspr2 msa/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d117_predictor_16x16/;
@@ -156,22 +156,22 @@
 specialize qw/vp9_d153_predictor_16x16/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_16x16 neon/, "$sse2_x86inc";
+specialize qw/vp9_v_predictor_16x16 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_16x16 neon/, "$sse2_x86inc";
+specialize qw/vp9_tm_predictor_16x16 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_16x16 dspr2 neon/, "$sse2_x86inc";
+specialize qw/vp9_dc_predictor_16x16 dspr2 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_16x16 neon/, "$sse2_x86inc";
+specialize qw/vp9_dc_top_predictor_16x16 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_16x16 neon/, "$sse2_x86inc";
+specialize qw/vp9_dc_left_predictor_16x16 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_16x16 neon/, "$sse2_x86inc";
+specialize qw/vp9_dc_128_predictor_16x16 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d207_predictor_32x32/, "$ssse3_x86inc";
@@ -183,7 +183,7 @@
 specialize qw/vp9_d63_predictor_32x32/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_32x32 neon/, "$ssse3_x86inc";
+specialize qw/vp9_h_predictor_32x32 neon msa/, "$ssse3_x86inc";
 
 add_proto qw/void vp9_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vp9_d117_predictor_32x32/;
@@ -195,22 +195,22 @@
 specialize qw/vp9_d153_predictor_32x32/;
 
 add_proto qw/void vp9_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_32x32 neon/, "$sse2_x86inc";
+specialize qw/vp9_v_predictor_32x32 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_32x32 neon/, "$sse2_x86_64";
+specialize qw/vp9_tm_predictor_32x32 neon msa/, "$sse2_x86_64";
 
 add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_32x32/, "$sse2_x86inc";
+specialize qw/vp9_dc_predictor_32x32 msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_32x32/, "$sse2_x86inc";
+specialize qw/vp9_dc_top_predictor_32x32 msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_32x32/, "$sse2_x86inc";
+specialize qw/vp9_dc_left_predictor_32x32 msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_32x32/, "$sse2_x86inc";
+specialize qw/vp9_dc_128_predictor_32x32 msa/, "$sse2_x86inc";
 
 #
 # Loopfilter
@@ -276,10 +276,10 @@
 $vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
 
 add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
-specialize qw/vp9_filter_by_weight16x16 sse2/;
+specialize qw/vp9_filter_by_weight16x16 sse2 msa/;
 
 add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
-specialize qw/vp9_filter_by_weight8x8 sse2/;
+specialize qw/vp9_filter_by_weight8x8 sse2 msa/;
 }
 
 #

diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c
index 910200e..471e238 100644
--- a/vp9/common/vp9_seg_common.c
+++ b/vp9/common/vp9_seg_common.c

@@ -25,12 +25,6 @@
 // the coding mechanism is still subject to change so these provide a
 // convenient single point of change.
 
-int vp9_segfeature_active(const struct segmentation *seg, int segment_id,
-                          SEG_LVL_FEATURES feature_id) {
-  return seg->enabled &&
-         (seg->feature_mask[segment_id] & (1 << feature_id));
-}
-
 void vp9_clearall_segfeatures(struct segmentation *seg) {
   vp9_zero(seg->feature_data);
   vp9_zero(seg->feature_mask);
@@ -60,12 +54,6 @@
   seg->feature_data[segment_id][feature_id] = seg_data;
 }
 
-int vp9_get_segdata(const struct segmentation *seg, int segment_id,
-                    SEG_LVL_FEATURES feature_id) {
-  return seg->feature_data[segment_id][feature_id];
-}
-
-
 const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = {
   2,  4,  6,  8, 10, 12,
   0, -1, -2, -3, -4, -5, -6, -7

diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h
index ff2d66a..95c9918 100644
--- a/vp9/common/vp9_seg_common.h
+++ b/vp9/common/vp9_seg_common.h

@@ -49,9 +49,12 @@
   unsigned int feature_mask[MAX_SEGMENTS];
 };
 
-int vp9_segfeature_active(const struct segmentation *seg,
-                          int segment_id,
-                          SEG_LVL_FEATURES feature_id);
+static INLINE int segfeature_active(const struct segmentation *seg,
+                                    int segment_id,
+                                    SEG_LVL_FEATURES feature_id) {
+  return seg->enabled &&
+         (seg->feature_mask[segment_id] & (1 << feature_id));
+}
 
 void vp9_clearall_segfeatures(struct segmentation *seg);
 
@@ -68,9 +71,10 @@
                      SEG_LVL_FEATURES feature_id,
                      int seg_data);
 
-int vp9_get_segdata(const struct segmentation *seg,
-                    int segment_id,
-                    SEG_LVL_FEATURES feature_id);
+static INLINE int get_segdata(const struct segmentation *seg, int segment_id,
+                              SEG_LVL_FEATURES feature_id) {
+  return seg->feature_data[segment_id][feature_id];
+}
 
 extern const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
 

diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index bc03caf..d34926d 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c

@@ -177,7 +177,7 @@
 
 static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
                      int segment_id, vp9_reader *r) {
-  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
     const int ctx = vp9_get_skip_context(xd);
@@ -307,9 +307,9 @@
   FRAME_CONTEXT *const fc = cm->fc;
   FRAME_COUNTS *counts = xd->counts;
 
-  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
-    ref_frame[0] = (MV_REFERENCE_FRAME)vp9_get_segdata(&cm->seg, segment_id,
-                                                       SEG_LVL_REF_FRAME);
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id,
+                                                   SEG_LVL_REF_FRAME);
     ref_frame[1] = NONE;
   } else {
     const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
@@ -444,9 +444,8 @@
 
 static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                                int segment_id, vp9_reader *r) {
-  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
-    return vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) !=
-           INTRA_FRAME;
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    return get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
   } else {
     const int ctx = vp9_get_intra_inter_context(xd);
     const int is_inter = vp9_read(r, cm->fc->intra_inter_prob[ctx]);
@@ -493,7 +492,7 @@
                      mi_row, mi_col, fpm_sync, (void *)pbi, inter_mode_ctx);
   }
 
-  if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+  if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     mbmi->mode = ZEROMV;
     if (bsize < BLOCK_8X8) {
         vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,

diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 71c1e0b..df70d48 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c

@@ -95,19 +95,6 @@
     return 1;
 }
 
-static void adjust_cyclic_refresh_parameters(VP9_COMP *const cpi) {
-  const VP9_COMMON *const cm = &cpi->common;
-  const RATE_CONTROL *const rc = &cpi->rc;
-  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
-  // Adjust some parameters, currently only for low resolutions at low bitrates.
-  if (cm->width <= 352 &&
-      cm->height <= 288 &&
-      rc->avg_frame_bandwidth < 3400) {
-      cr->motion_thresh = 4;
-      cr->rate_boost_fac = 1.25;
-  }
-}
-
 // Check if this coding block, of size bsize, should be considered for refresh
 // (lower-qp coding). Decision can be based on various factors, such as
 // size of the coding block (i.e., below min_block size rejected), coding
@@ -435,18 +422,30 @@
   cr->sb_index = i;
 }
 
-// Set/update global/frame level cyclic refresh parameters.
+// Set cyclic refresh parameters.
 void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   cr->percent_refresh = 10;
+  cr->max_qdelta_perc = 50;
+  cr->time_for_refresh = 0;
   // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4)
-  // periods of the refresh cycle, after a key frame. This corresponds to ~40
-  // frames with cr->percent_refresh = 10.
-  if (rc->frames_since_key <  40)
+  // periods of the refresh cycle, after a key frame.
+  if (rc->frames_since_key <  4 * cr->percent_refresh)
     cr->rate_ratio_qdelta = 3.0;
   else
     cr->rate_ratio_qdelta = 2.0;
+  // Adjust some parameters for low resolutions at low bitrates.
+  if (cm->width <= 352 &&
+      cm->height <= 288 &&
+      rc->avg_frame_bandwidth < 3400) {
+    cr->motion_thresh = 4;
+    cr->rate_boost_fac = 1.25;
+  } else {
+    cr->motion_thresh = 32;
+    cr->rate_boost_fac = 1.7;
+  }
 }
 
 // Setup cyclic background refresh: set delta q and segmentation map.
@@ -475,9 +474,6 @@
     int qindex2;
     const double q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
     vp9_clear_system_state();
-    cr->max_qdelta_perc = 50;
-    cr->time_for_refresh = 0;
-    cr->rate_boost_fac = 1.7;
     // Set rate threshold to some multiple (set to 2 for now) of the target
     // rate (target is given by sb64_target_rate and scaled by 256).
     cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2;
@@ -485,9 +481,6 @@
     // q will not exceed 457, so (q * q) is within 32bit; see:
     // vp9_convert_qindex_to_q(), vp9_ac_quant(), ac_qlookup*[].
     cr->thresh_dist_sb = ((int64_t)(q * q)) << 2;
-    cr->motion_thresh = 32;
-
-    adjust_cyclic_refresh_parameters(cpi);
 
     // Set up segmentation.
     // Clear down the segment map.

diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c
index b9987c1..4373566 100644
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c

@@ -29,6 +29,8 @@
   return (sum + 8) >> 4;
 }
 
+// src_diff: first pass, 9 bit, dynamic range [-255, 255]
+//           second pass, 12 bit, dynamic range [-2040, 2040]
 static void hadamard_col8(const int16_t *src_diff, int src_stride,
                           int16_t *coeff) {
   int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
@@ -65,15 +67,18 @@
   int16_t buffer[64];
   int16_t *tmp_buf = &buffer[0];
   for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(src_diff, src_stride, tmp_buf);
+    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
+                                                   // dynamic range [-255, 255]
     tmp_buf += 8;
     ++src_diff;
   }
 
   tmp_buf = &buffer[0];
   for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(tmp_buf, 8, coeff);
-    coeff += 8;
+    hadamard_col8(tmp_buf, 8, coeff);  // tmp_buf: 12 bit
+                                       // dynamic range [-2040, 2040]
+    coeff += 8;  // coeff: 15 bit
+                 // dynamic range [-16320, 16320]
     ++tmp_buf;
   }
 }
@@ -83,26 +88,28 @@
                           int16_t *coeff) {
   int idx;
   for (idx = 0; idx < 4; ++idx) {
+    // src_diff: 9 bit, dynamic range [-255, 255]
     int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
                                 + (idx & 0x01) * 8;
     vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
   }
 
+  // coeff: 15 bit, dynamic range [-16320, 16320]
   for (idx = 0; idx < 64; ++idx) {
     int16_t a0 = coeff[0];
     int16_t a1 = coeff[64];
     int16_t a2 = coeff[128];
     int16_t a3 = coeff[192];
 
-    int16_t b0 = a0 + a1;
-    int16_t b1 = a0 - a1;
-    int16_t b2 = a2 + a3;
-    int16_t b3 = a2 - a3;
+    int16_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
+    int16_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
+    int16_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
+    int16_t b3 = (a2 - a3) >> 1;
 
-    coeff[0]   = (b0 + b2) >> 1;
-    coeff[64]  = (b1 + b3) >> 1;
-    coeff[128] = (b0 - b2) >> 1;
-    coeff[192] = (b1 - b3) >> 1;
+    coeff[0]   = b0 + b2;  // 16 bit, [-32640, 32640]
+    coeff[64]  = b1 + b3;
+    coeff[128] = b0 - b2;
+    coeff[192] = b1 - b3;
 
     ++coeff;
   }

diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index d20e067..092d265 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c

@@ -93,7 +93,7 @@
 
 static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                       int segment_id, const MODE_INFO *mi, vp9_writer *w) {
-  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
     const int skip = mi->mbmi.skip;
@@ -207,10 +207,10 @@
 
   // If segment level coding of this signal is disabled...
   // or the segment allows multiple reference frame options
-  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
     assert(!is_compound);
     assert(mbmi->ref_frame[0] ==
-               vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
+               get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
   } else {
     // does the feature use compound prediction or not
     // (if not specified at the frame/segment level)
@@ -264,7 +264,7 @@
 
   skip = write_skip(cm, xd, segment_id, mi, w);
 
-  if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+  if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
     vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd));
 
   if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
@@ -293,7 +293,7 @@
     write_ref_frames(cm, xd, w);
 
     // If segment skip is not enabled code the mode.
-    if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
+    if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
       if (bsize >= BLOCK_8X8) {
         write_inter_mode(w, mode, inter_probs);
       }
@@ -787,10 +787,10 @@
 
     for (i = 0; i < MAX_SEGMENTS; i++) {
       for (j = 0; j < SEG_LVL_MAX; j++) {
-        const int active = vp9_segfeature_active(seg, i, j);
+        const int active = segfeature_active(seg, i, j);
         vp9_wb_write_bit(wb, active);
         if (active) {
-          const int data = vp9_get_segdata(seg, i, j);
+          const int data = get_segdata(seg, i, j);
           const int data_max = vp9_seg_feature_data_max(j);
 
           if (vp9_is_segfeature_signed(j)) {

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 8864e0e..f5e3e98 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -1051,7 +1051,7 @@
   if (!output_enabled)
     return;
 
-  if (!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+  if (!segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     for (i = 0; i < TX_MODES; i++)
       rdc->tx_select_diff[i] += ctx->tx_rd_diff[i];
   }
@@ -1248,7 +1248,7 @@
     vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
   } else {
     if (bsize >= BLOCK_8X8) {
-      if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+      if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
         vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize,
                                            ctx, best_rd);
       else
@@ -1291,8 +1291,8 @@
   if (!frame_is_intra_only(cm)) {
     FRAME_COUNTS *const counts = td->counts;
     const int inter_block = is_inter_block(mbmi);
-    const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id,
-                                                     SEG_LVL_REF_FRAME);
+    const int seg_ref_active = segfeature_active(&cm->seg, mbmi->segment_id,
+                                                 SEG_LVL_REF_FRAME);
     if (!seg_ref_active) {
       counts->intra_inter[vp9_get_intra_inter_context(xd)][inter_block]++;
       // If the segment reference feature is enabled we have only a single
@@ -1317,7 +1317,7 @@
       }
     }
     if (inter_block &&
-        !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
       const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
       if (bsize >= BLOCK_8X8) {
         const PREDICTION_MODE mode = mbmi->mode;
@@ -2849,7 +2849,7 @@
       const uint8_t *const map = seg->update_map ? cpi->segmentation_map
                                                  : cm->last_frame_seg_map;
       int segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
-      seg_skip = vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+      seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
     }
 
     x->source_variance = UINT_MAX;
@@ -2909,7 +2909,7 @@
 static int check_dual_ref_flags(VP9_COMP *cpi) {
   const int ref_flags = cpi->ref_frame_flags;
 
-  if (vp9_segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) {
+  if (segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) {
     return 0;
   } else {
     return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG)
@@ -2984,7 +2984,7 @@
 
   if (cm->frame_type == KEY_FRAME)
     hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
-  else if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+  else if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
     set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize);
   else if (bsize >= BLOCK_8X8)
     vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col,
@@ -3599,7 +3599,7 @@
       const uint8_t *const map = seg->update_map ? cpi->segmentation_map
                                                  : cm->last_frame_seg_map;
       int segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
-      seg_skip = vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+      seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
       if (seg_skip) {
         partition_search_type = FIXED_PARTITION;
       }
@@ -3786,9 +3786,13 @@
   TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
   int tile_tok = 0;
 
-  if (cpi->tile_data == NULL) {
+  if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
+    if (cpi->tile_data != NULL)
+      vpx_free(cpi->tile_data);
     CHECK_MEM_ERROR(cm, cpi->tile_data,
         vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data)));
+    cpi->allocated_tiles = tile_cols * tile_rows;
+
     for (tile_row = 0; tile_row < tile_rows; ++tile_row)
       for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
         TileDataEnc *tile_data =
@@ -4157,8 +4161,8 @@
   MODE_INFO **mi_8x8 = xd->mi;
   MODE_INFO *mi = mi_8x8[0];
   MB_MODE_INFO *mbmi = &mi->mbmi;
-  const int seg_skip = vp9_segfeature_active(&cm->seg, mbmi->segment_id,
-                                             SEG_LVL_SKIP);
+  const int seg_skip = segfeature_active(&cm->seg, mbmi->segment_id,
+                                         SEG_LVL_SKIP);
   const int mis = cm->mi_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];

diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 6ce4a67..4d2a186 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h

@@ -306,6 +306,7 @@
   YV12_BUFFER_CONFIG scaled_last_source;
 
   TileDataEnc *tile_data;
+  int allocated_tiles;  // Keep track of memory allocated for tiles.
 
   // For a still frame, this flag is set to 1 to skip partition search.
   int partition_search_skippable_frame;

diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 60bff57..2479b6e 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c

@@ -1692,8 +1692,8 @@
 
     // If the segment reference frame feature is enabled....
     // then do nothing if the current ref frame is not allowed..
-    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
-        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
       continue;
 
     mbmi->ref_frame[0] = ref_frame;

diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index db5460b..e6e17c0 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c

@@ -678,7 +678,7 @@
     x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1];
   }
 
-  x->skip_block = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
+  x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
   x->q_index = qindex;
 
   x->errorperbit = rdmult >> 6;

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index eacc63f..162d4de 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -2120,8 +2120,8 @@
                                      unsigned int *ref_costs_single,
                                      unsigned int *ref_costs_comp,
                                      vp9_prob *comp_mode_p) {
-  int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
-                                             SEG_LVL_REF_FRAME);
+  int seg_ref_active = segfeature_active(&cm->seg, segment_id,
+                                         SEG_LVL_REF_FRAME);
   if (seg_ref_active) {
     memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
     memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
@@ -3007,8 +3007,8 @@
     }
     // If the segment reference frame feature is enabled....
     // then do nothing if the current ref frame is not allowed..
-    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
-        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
       ref_frame_skip_mask[0] |= (1 << ref_frame);
       ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
     }
@@ -3017,7 +3017,7 @@
   // Disable this drop out case if the ref frame
   // segment level feature is enabled for this segment. This is to
   // prevent the possibility that we end up unable to pick any mode.
-  if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+  if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
     // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
     // unless ARNR filtering is enabled in which case we want
     // an unfiltered alternative. We allow near/nearest as well
@@ -3196,7 +3196,7 @@
 
       // Do not allow compound prediction if the segment level reference frame
       // feature is in use as in this case there can only be one reference.
-      if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+      if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
         continue;
 
       if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
@@ -3638,7 +3638,7 @@
 
   rd_cost->rate = INT_MAX;
 
-  assert(vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
+  assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
 
   mbmi->mode = ZEROMV;
   mbmi->uv_mode = DC_PRED;
@@ -3850,7 +3850,7 @@
         continue;
       // Do not allow compound prediction if the segment level reference frame
       // feature is in use as in this case there can only be one reference.
-      if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+      if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
         continue;
 
       if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
@@ -3875,13 +3875,13 @@
 
     // If the segment reference frame feature is enabled....
     // then do nothing if the current ref frame is not allowed..
-    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
-        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
       continue;
     // Disable this drop out case if the ref frame
     // segment level feature is enabled for this segment. This is to
     // prevent the possibility that we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+    } else if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
       // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
       // unless ARNR filtering is enabled in which case we want
       // an unfiltered alternative. We allow near/nearest as well

diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 3592031..181a99c 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c

@@ -484,7 +484,7 @@
 static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
                              TX_SIZE tx_size) {
   const int eob_max = 16 << (tx_size << 1);
-  return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
+  return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }
 
 static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -615,8 +615,8 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const int ctx = vp9_get_skip_context(xd);
-  const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
-                                              SEG_LVL_SKIP);
+  const int skip_inc = !segfeature_active(&cm->seg, mbmi->segment_id,
+                                          SEG_LVL_SKIP);
   struct tokenize_b_args arg = {cpi, td, t};
   if (mbmi->skip) {
     if (!dry_run)

diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index 56a91ed..4531d79 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c

@@ -264,17 +264,18 @@
     __m128i b2 = _mm_add_epi16(coeff2, coeff3);
     __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
 
+    b0 = _mm_srai_epi16(b0, 1);
+    b1 = _mm_srai_epi16(b1, 1);
+    b2 = _mm_srai_epi16(b2, 1);
+    b3 = _mm_srai_epi16(b3, 1);
+
     coeff0 = _mm_add_epi16(b0, b2);
     coeff1 = _mm_add_epi16(b1, b3);
-    coeff0 = _mm_srai_epi16(coeff0, 1);
-    coeff1 = _mm_srai_epi16(coeff1, 1);
     _mm_store_si128((__m128i *)coeff, coeff0);
     _mm_store_si128((__m128i *)(coeff + 64), coeff1);
 
     coeff2 = _mm_sub_epi16(b0, b2);
     coeff3 = _mm_sub_epi16(b1, b3);
-    coeff2 = _mm_srai_epi16(coeff2, 1);
-    coeff3 = _mm_srai_epi16(coeff3, 1);
     _mm_store_si128((__m128i *)(coeff + 128), coeff2);
     _mm_store_si128((__m128i *)(coeff + 192), coeff3);
 

diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index d018699..6f091ee 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk

@@ -146,11 +146,16 @@
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct32x32_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct_msa.h
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_intra_predict_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_4_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_8_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_16_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_msa.h
 
+ifeq ($(CONFIG_VP9_POSTPROC),yes)
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c
+endif
+
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.h
 ifeq ($(ARCH_X86_64), yes)